git.blender.org/blender.git
author    Cian Jinks <cjinks99@gmail.com>  2021-09-22 17:09:31 +0300
committer Cian Jinks <cjinks99@gmail.com>  2021-09-22 17:09:31 +0300
commit    e734491048ef2436af41e272b8900f20785ecbe6 (patch)
tree      8cee3fc068c782c0ba8cb9a581e768968c565569
parent    f21cd0881948f6eaf16af0b354cd904df7407bda (diff)
parent    204b01a254ac2445fea217e5211b2ed6aef631ca (diff)

Merge branch 'master' into soc-2021-knife-tools
-rw-r--r--  CMakeLists.txt | 8
-rw-r--r--  build_files/cmake/Modules/FindOptiX.cmake | 17
-rw-r--r--  build_files/config/pipeline_config.yaml | 2
-rw-r--r--  extern/audaspace/CMakeLists.txt | 2
-rw-r--r--  extern/audaspace/include/util/RingBuffer.h | 97
-rw-r--r--  extern/audaspace/plugins/pulseaudio/PulseAudioDevice.cpp | 169
-rw-r--r--  extern/audaspace/plugins/pulseaudio/PulseAudioDevice.h | 76
-rw-r--r--  extern/audaspace/plugins/pulseaudio/PulseAudioSymbols.h | 11
-rw-r--r--  extern/audaspace/src/util/RingBuffer.cpp | 137
-rw-r--r--  extern/cuew/include/cuew.h | 3
-rw-r--r--  extern/json/README.blender | 5
-rw-r--r--  extern/json/include/json.hpp | 26640
-rw-r--r--  extern/tinygltf/README.blender | 5
-rw-r--r--  extern/tinygltf/tiny_gltf.h | 7760
-rw-r--r--  intern/cycles/CMakeLists.txt | 24
-rw-r--r--  intern/cycles/app/CMakeLists.txt | 18
-rw-r--r--  intern/cycles/app/cycles_standalone.cpp | 8
-rw-r--r--  intern/cycles/app/cycles_xml.cpp | 2
-rw-r--r--  intern/cycles/blender/CMakeLists.txt | 10
-rw-r--r--  intern/cycles/blender/addon/__init__.py | 9
-rw-r--r--  intern/cycles/blender/addon/engine.py | 132
-rw-r--r--  intern/cycles/blender/addon/presets.py | 50
-rw-r--r--  intern/cycles/blender/addon/properties.py | 499
-rw-r--r--  intern/cycles/blender/addon/ui.py | 480
-rw-r--r--  intern/cycles/blender/addon/version_update.py | 44
-rw-r--r--  intern/cycles/blender/blender_camera.cpp | 13
-rw-r--r--  intern/cycles/blender/blender_device.cpp | 11
-rw-r--r--  intern/cycles/blender/blender_geometry.cpp | 6
-rw-r--r--  intern/cycles/blender/blender_gpu_display.cpp | 787
-rw-r--r--  intern/cycles/blender/blender_gpu_display.h | 215
-rw-r--r--  intern/cycles/blender/blender_light.cpp | 18
-rw-r--r--  intern/cycles/blender/blender_object.cpp | 2
-rw-r--r--  intern/cycles/blender/blender_python.cpp | 262
-rw-r--r--  intern/cycles/blender/blender_session.cpp | 702
-rw-r--r--  intern/cycles/blender/blender_session.h | 59
-rw-r--r--  intern/cycles/blender/blender_shader.cpp | 33
-rw-r--r--  intern/cycles/blender/blender_sync.cpp | 468
-rw-r--r--  intern/cycles/blender/blender_sync.h | 35
-rw-r--r--  intern/cycles/blender/blender_viewport.cpp | 43
-rw-r--r--  intern/cycles/blender/blender_viewport.h | 5
-rw-r--r--  intern/cycles/bvh/bvh_build.cpp | 18
-rw-r--r--  intern/cycles/bvh/bvh_embree.cpp | 89
-rw-r--r--  intern/cycles/bvh/bvh_params.h | 21
-rw-r--r--  intern/cycles/cmake/external_libs.cmake | 3
-rw-r--r--  intern/cycles/device/CMakeLists.txt | 110
-rw-r--r--  intern/cycles/device/cpu/device.cpp | 64
-rw-r--r--  intern/cycles/device/cpu/device.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl) | 27
-rw-r--r--  intern/cycles/device/cpu/device_impl.cpp | 481
-rw-r--r--  intern/cycles/device/cpu/device_impl.h | 99
-rw-r--r--  intern/cycles/device/cpu/kernel.cpp | 61
-rw-r--r--  intern/cycles/device/cpu/kernel.h | 111
-rw-r--r--  intern/cycles/device/cpu/kernel_function.h | 124
-rw-r--r--  intern/cycles/device/cpu/kernel_thread_globals.cpp | 85
-rw-r--r--  intern/cycles/device/cpu/kernel_thread_globals.h | 57
-rw-r--r--  intern/cycles/device/cuda/device.cpp (renamed from intern/cycles/device/device_cuda.cpp) | 51
-rw-r--r--  intern/cycles/device/cuda/device.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl) | 29
-rw-r--r--  intern/cycles/device/cuda/device_cuda.h | 270
-rw-r--r--  intern/cycles/device/cuda/device_cuda_impl.cpp | 2714
-rw-r--r--  intern/cycles/device/cuda/device_impl.cpp | 1370
-rw-r--r--  intern/cycles/device/cuda/device_impl.h | 155
-rw-r--r--  intern/cycles/device/cuda/graphics_interop.cpp | 102
-rw-r--r--  intern/cycles/device/cuda/graphics_interop.h | 66
-rw-r--r--  intern/cycles/device/cuda/kernel.cpp | 69
-rw-r--r--  intern/cycles/device/cuda/kernel.h | 56
-rw-r--r--  intern/cycles/device/cuda/queue.cpp | 220
-rw-r--r--  intern/cycles/device/cuda/queue.h | 67
-rw-r--r--  intern/cycles/device/cuda/util.cpp | 61
-rw-r--r--  intern/cycles/device/cuda/util.h | 65
-rw-r--r--  intern/cycles/device/device.cpp | 476
-rw-r--r--  intern/cycles/device/device.h | 366
-rw-r--r--  intern/cycles/device/device_cpu.cpp | 1680
-rw-r--r--  intern/cycles/device/device_denoise.cpp | 88
-rw-r--r--  intern/cycles/device/device_denoise.h | 110
-rw-r--r--  intern/cycles/device/device_denoising.cpp | 353
-rw-r--r--  intern/cycles/device/device_denoising.h | 197
-rw-r--r--  intern/cycles/device/device_graphics_interop.cpp (renamed from intern/cycles/kernel/kernels/opencl/kernel_path_init.cl) | 11
-rw-r--r--  intern/cycles/device/device_graphics_interop.h | 55
-rw-r--r--  intern/cycles/device/device_intern.h | 58
-rw-r--r--  intern/cycles/device/device_kernel.cpp | 157
-rw-r--r--  intern/cycles/device/device_kernel.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl) | 25
-rw-r--r--  intern/cycles/device/device_memory.cpp | 7
-rw-r--r--  intern/cycles/device/device_memory.h | 136
-rw-r--r--  intern/cycles/device/device_multi.cpp | 826
-rw-r--r--  intern/cycles/device/device_network.cpp | 812
-rw-r--r--  intern/cycles/device/device_network.h | 490
-rw-r--r--  intern/cycles/device/device_opencl.cpp | 245
-rw-r--r--  intern/cycles/device/device_optix.cpp | 1936
-rw-r--r--  intern/cycles/device/device_queue.cpp | 87
-rw-r--r--  intern/cycles/device/device_queue.h | 113
-rw-r--r--  intern/cycles/device/device_split_kernel.cpp | 389
-rw-r--r--  intern/cycles/device/device_split_kernel.h | 145
-rw-r--r--  intern/cycles/device/device_task.cpp | 182
-rw-r--r--  intern/cycles/device/device_task.h | 188
-rw-r--r--  intern/cycles/device/dummy/device.cpp (renamed from intern/cycles/device/device_dummy.cpp) | 24
-rw-r--r--  intern/cycles/device/dummy/device.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl) | 21
-rw-r--r--  intern/cycles/device/multi/device.cpp | 423
-rw-r--r--  intern/cycles/device/multi/device.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl) | 21
-rw-r--r--  intern/cycles/device/opencl/device_opencl.h | 658
-rw-r--r--  intern/cycles/device/opencl/device_opencl_impl.cpp | 2113
-rw-r--r--  intern/cycles/device/opencl/memory_manager.cpp | 264
-rw-r--r--  intern/cycles/device/opencl/memory_manager.h | 105
-rw-r--r--  intern/cycles/device/opencl/opencl_util.cpp | 1326
-rw-r--r--  intern/cycles/device/optix/device.cpp | 105
-rw-r--r--  intern/cycles/device/optix/device.h | 35
-rw-r--r--  intern/cycles/device/optix/device_impl.cpp | 1573
-rw-r--r--  intern/cycles/device/optix/device_impl.h | 186
-rw-r--r--  intern/cycles/device/optix/queue.cpp | 144
-rw-r--r--  intern/cycles/device/optix/queue.h | 39
-rw-r--r--  intern/cycles/device/optix/util.h | 45
-rw-r--r--  intern/cycles/graph/node.cpp | 2
-rw-r--r--  intern/cycles/graph/node.h | 18
-rw-r--r--  intern/cycles/integrator/CMakeLists.txt | 76
-rw-r--r--  intern/cycles/integrator/adaptive_sampling.cpp | 71
-rw-r--r--  intern/cycles/integrator/adaptive_sampling.h | 55
-rw-r--r--  intern/cycles/integrator/denoiser.cpp | 204
-rw-r--r--  intern/cycles/integrator/denoiser.h | 135
-rw-r--r--  intern/cycles/integrator/denoiser_device.cpp | 106
-rw-r--r--  intern/cycles/integrator/denoiser_device.h | 40
-rw-r--r--  intern/cycles/integrator/denoiser_oidn.cpp | 628
-rw-r--r--  intern/cycles/integrator/denoiser_oidn.h | 47
-rw-r--r--  intern/cycles/integrator/denoiser_optix.cpp (renamed from intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl) | 26
-rw-r--r--  intern/cycles/integrator/denoiser_optix.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl) | 21
-rw-r--r--  intern/cycles/integrator/pass_accessor.cpp | 318
-rw-r--r--  intern/cycles/integrator/pass_accessor.h | 160
-rw-r--r--  intern/cycles/integrator/pass_accessor_cpu.cpp | 183
-rw-r--r--  intern/cycles/integrator/pass_accessor_cpu.h | 77
-rw-r--r--  intern/cycles/integrator/pass_accessor_gpu.cpp | 118
-rw-r--r--  intern/cycles/integrator/pass_accessor_gpu.h | 68
-rw-r--r--  intern/cycles/integrator/path_trace.cpp | 1144
-rw-r--r--  intern/cycles/integrator/path_trace.h | 324
-rw-r--r--  intern/cycles/integrator/path_trace_work.cpp | 203
-rw-r--r--  intern/cycles/integrator/path_trace_work.h | 194
-rw-r--r--  intern/cycles/integrator/path_trace_work_cpu.cpp | 281
-rw-r--r--  intern/cycles/integrator/path_trace_work_cpu.h | 82
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.cpp | 933
-rw-r--r--  intern/cycles/integrator/path_trace_work_gpu.h | 165
-rw-r--r--  intern/cycles/integrator/render_scheduler.cpp | 1187
-rw-r--r--  intern/cycles/integrator/render_scheduler.h | 466
-rw-r--r--  intern/cycles/integrator/shader_eval.cpp | 173
-rw-r--r--  intern/cycles/integrator/shader_eval.h | 61
-rw-r--r--  intern/cycles/integrator/tile.cpp | 108
-rw-r--r--  intern/cycles/integrator/tile.h | 56
-rw-r--r--  intern/cycles/integrator/work_balancer.cpp | 99
-rw-r--r--  intern/cycles/integrator/work_balancer.h | 42
-rw-r--r--  intern/cycles/integrator/work_tile_scheduler.cpp | 138
-rw-r--r--  intern/cycles/integrator/work_tile_scheduler.h | 98
-rw-r--r--  intern/cycles/kernel/CMakeLists.txt | 314
-rw-r--r--  intern/cycles/kernel/bvh/bvh.h | 32
-rw-r--r--  intern/cycles/kernel/bvh/bvh_embree.h | 21
-rw-r--r--  intern/cycles/kernel/bvh/bvh_local.h | 8
-rw-r--r--  intern/cycles/kernel/bvh/bvh_nodes.h | 10
-rw-r--r--  intern/cycles/kernel/bvh/bvh_shadow_all.h | 105
-rw-r--r--  intern/cycles/kernel/bvh/bvh_traversal.h | 26
-rw-r--r--  intern/cycles/kernel/bvh/bvh_types.h | 5
-rw-r--r--  intern/cycles/kernel/bvh/bvh_util.h | 110
-rw-r--r--  intern/cycles/kernel/bvh/bvh_volume.h | 13
-rw-r--r--  intern/cycles/kernel/bvh/bvh_volume_all.h | 14
-rw-r--r--  intern/cycles/kernel/closure/alloc.h | 2
-rw-r--r--  intern/cycles/kernel/closure/bsdf.h | 91
-rw-r--r--  intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h | 25
-rw-r--r--  intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h | 15
-rw-r--r--  intern/cycles/kernel/closure/bsdf_diffuse.h | 13
-rw-r--r--  intern/cycles/kernel/closure/bsdf_diffuse_ramp.h | 5
-rw-r--r--  intern/cycles/kernel/closure/bsdf_hair.h | 14
-rw-r--r--  intern/cycles/kernel/closure/bsdf_hair_principled.h | 25
-rw-r--r--  intern/cycles/kernel/closure/bsdf_microfacet.h | 31
-rw-r--r--  intern/cycles/kernel/closure/bsdf_microfacet_multi.h | 6
-rw-r--r--  intern/cycles/kernel/closure/bsdf_oren_nayar.h | 13
-rw-r--r--  intern/cycles/kernel/closure/bsdf_phong_ramp.h | 5
-rw-r--r--  intern/cycles/kernel/closure/bsdf_principled_diffuse.h | 15
-rw-r--r--  intern/cycles/kernel/closure/bsdf_principled_sheen.h | 7
-rw-r--r--  intern/cycles/kernel/closure/bsdf_reflection.h | 5
-rw-r--r--  intern/cycles/kernel/closure/bsdf_refraction.h | 5
-rw-r--r--  intern/cycles/kernel/closure/bsdf_toon.h | 14
-rw-r--r--  intern/cycles/kernel/closure/bsdf_transparent.h | 5
-rw-r--r--  intern/cycles/kernel/closure/bsdf_util.h | 5
-rw-r--r--  intern/cycles/kernel/closure/bssrdf.h | 406
-rw-r--r--  intern/cycles/kernel/closure/emissive.h | 2
-rw-r--r--  intern/cycles/kernel/closure/volume.h | 109
-rw-r--r--  intern/cycles/kernel/device/cpu/compat.h (renamed from intern/cycles/kernel/kernel_compat_cpu.h) | 59
-rw-r--r--  intern/cycles/kernel/device/cpu/globals.h | 61
-rw-r--r--  intern/cycles/kernel/device/cpu/image.h (renamed from intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h) | 9
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel.cpp) | 4
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel.h (renamed from intern/cycles/kernel/kernel.h) | 25
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_arch.h | 113
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_arch_impl.h | 235
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_avx.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_avx.cpp) | 4
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_avx2.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp) | 4
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse2.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp) | 4
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse3.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp) | 4
-rw-r--r--  intern/cycles/kernel/device/cpu/kernel_sse41.cpp (renamed from intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp) | 4
-rw-r--r--  intern/cycles/kernel/device/cuda/compat.h (renamed from intern/cycles/kernel/kernel_compat_cuda.h) | 139
-rw-r--r--  intern/cycles/kernel/device/cuda/config.h | 114
-rw-r--r--  intern/cycles/kernel/device/cuda/globals.h | 48
-rw-r--r--  intern/cycles/kernel/device/cuda/kernel.cu (renamed from intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl) | 18
-rw-r--r--  intern/cycles/kernel/device/gpu/image.h (renamed from intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h) | 55
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h | 843
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h | 83
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_prefix_sum.h | 46
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_reduce.h | 83
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_sorted_index.h | 49
-rw-r--r--  intern/cycles/kernel/device/optix/compat.h (renamed from intern/cycles/kernel/kernel_compat_optix.h) | 90
-rw-r--r--  intern/cycles/kernel/device/optix/globals.h | 59
-rw-r--r--  intern/cycles/kernel/device/optix/kernel.cu (renamed from intern/cycles/kernel/kernels/optix/kernel_optix.cu) | 168
-rw-r--r--  intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu | 29
-rw-r--r--  intern/cycles/kernel/filter/filter.h | 52
-rw-r--r--  intern/cycles/kernel/filter/filter_defines.h | 72
-rw-r--r--  intern/cycles/kernel/filter/filter_features.h | 156
-rw-r--r--  intern/cycles/kernel/filter/filter_features_sse.h | 118
-rw-r--r--  intern/cycles/kernel/filter/filter_kernel.h | 50
-rw-r--r--  intern/cycles/kernel/filter/filter_nlm_cpu.h | 254
-rw-r--r--  intern/cycles/kernel/filter/filter_nlm_gpu.h | 255
-rw-r--r--  intern/cycles/kernel/filter/filter_prefilter.h | 303
-rw-r--r--  intern/cycles/kernel/filter/filter_reconstruction.h | 140
-rw-r--r--  intern/cycles/kernel/filter/filter_transform.h | 120
-rw-r--r--  intern/cycles/kernel/filter/filter_transform_gpu.h | 129
-rw-r--r--  intern/cycles/kernel/filter/filter_transform_sse.h | 129
-rw-r--r--  intern/cycles/kernel/geom/geom.h | 3
-rw-r--r--  intern/cycles/kernel/geom/geom_attribute.h | 12
-rw-r--r--  intern/cycles/kernel/geom/geom_curve.h | 21
-rw-r--r--  intern/cycles/kernel/geom/geom_curve_intersect.h | 68
-rw-r--r--  intern/cycles/kernel/geom/geom_motion_curve.h | 12
-rw-r--r--  intern/cycles/kernel/geom/geom_motion_triangle.h | 12
-rw-r--r--  intern/cycles/kernel/geom/geom_motion_triangle_intersect.h | 76
-rw-r--r--  intern/cycles/kernel/geom/geom_motion_triangle_shader.h | 16
-rw-r--r--  intern/cycles/kernel/geom/geom_object.h | 243
-rw-r--r--  intern/cycles/kernel/geom/geom_patch.h | 20
-rw-r--r--  intern/cycles/kernel/geom/geom_primitive.h | 39
-rw-r--r--  intern/cycles/kernel/geom/geom_shader_data.h | 373
-rw-r--r--  intern/cycles/kernel/geom/geom_subd_triangle.h | 29
-rw-r--r--  intern/cycles/kernel/geom/geom_triangle.h | 37
-rw-r--r--  intern/cycles/kernel/geom/geom_triangle_intersect.h | 81
-rw-r--r--  intern/cycles/kernel/geom/geom_volume.h | 6
-rw-r--r--  intern/cycles/kernel/integrator/integrator_init_from_bake.h | 182
-rw-r--r--  intern/cycles/kernel/integrator/integrator_init_from_camera.h | 120
-rw-r--r--  intern/cycles/kernel/integrator/integrator_intersect_closest.h | 248
-rw-r--r--  intern/cycles/kernel/integrator/integrator_intersect_shadow.h | 144
-rw-r--r--  intern/cycles/kernel/integrator/integrator_intersect_subsurface.h (renamed from intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl) | 27
-rw-r--r--  intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h | 198
-rw-r--r--  intern/cycles/kernel/integrator/integrator_megakernel.h | 93
-rw-r--r--  intern/cycles/kernel/integrator/integrator_shade_background.h | 215
-rw-r--r--  intern/cycles/kernel/integrator/integrator_shade_light.h | 126
-rw-r--r--  intern/cycles/kernel/integrator/integrator_shade_shadow.h | 182
-rw-r--r--  intern/cycles/kernel/integrator/integrator_shade_surface.h | 502
-rw-r--r--  intern/cycles/kernel/integrator/integrator_shade_volume.h | 1019
-rw-r--r--  intern/cycles/kernel/integrator/integrator_state.h | 185
-rw-r--r--  intern/cycles/kernel/integrator/integrator_state_flow.h | 144
-rw-r--r--  intern/cycles/kernel/integrator/integrator_state_template.h | 163
-rw-r--r--  intern/cycles/kernel/integrator/integrator_state_util.h | 273
-rw-r--r--  intern/cycles/kernel/integrator/integrator_subsurface.h | 623
-rw-r--r--  intern/cycles/kernel/integrator/integrator_volume_stack.h | 223
-rw-r--r--  intern/cycles/kernel/kernel_accumulate.h | 972
-rw-r--r--  intern/cycles/kernel/kernel_adaptive_sampling.h | 274
-rw-r--r--  intern/cycles/kernel/kernel_bake.h | 514
-rw-r--r--  intern/cycles/kernel/kernel_camera.h | 72
-rw-r--r--  intern/cycles/kernel/kernel_color.h | 9
-rw-r--r--  intern/cycles/kernel/kernel_compat_opencl.h | 177
-rw-r--r--  intern/cycles/kernel/kernel_differential.h | 73
-rw-r--r--  intern/cycles/kernel/kernel_emission.h | 374
-rw-r--r--  intern/cycles/kernel/kernel_film.h | 567
-rw-r--r--  intern/cycles/kernel/kernel_globals.h | 248
-rw-r--r--  intern/cycles/kernel/kernel_id_passes.h | 35
-rw-r--r--  intern/cycles/kernel/kernel_jitter.h | 252
-rw-r--r--  intern/cycles/kernel/kernel_light.h | 406
-rw-r--r--  intern/cycles/kernel/kernel_light_background.h | 25
-rw-r--r--  intern/cycles/kernel/kernel_light_common.h | 6
-rw-r--r--  intern/cycles/kernel/kernel_lookup_table.h | 56
-rw-r--r--  intern/cycles/kernel/kernel_math.h | 5
-rw-r--r--  intern/cycles/kernel/kernel_montecarlo.h | 5
-rw-r--r--  intern/cycles/kernel/kernel_passes.h | 414
-rw-r--r--  intern/cycles/kernel/kernel_path.h | 709
-rw-r--r--  intern/cycles/kernel/kernel_path_branched.h | 556
-rw-r--r--  intern/cycles/kernel/kernel_path_common.h | 48
-rw-r--r--  intern/cycles/kernel/kernel_path_state.h | 383
-rw-r--r--  intern/cycles/kernel/kernel_path_subsurface.h | 139
-rw-r--r--  intern/cycles/kernel/kernel_path_surface.h | 360
-rw-r--r--  intern/cycles/kernel/kernel_path_volume.h | 260
-rw-r--r--  intern/cycles/kernel/kernel_profiling.h | 24
-rw-r--r--  intern/cycles/kernel/kernel_projection.h | 5
-rw-r--r--  intern/cycles/kernel/kernel_queues.h | 147
-rw-r--r--  intern/cycles/kernel/kernel_random.h | 228
-rw-r--r--  intern/cycles/kernel/kernel_shader.h | 1043
-rw-r--r--  intern/cycles/kernel/kernel_shadow.h | 466
-rw-r--r--  intern/cycles/kernel/kernel_shadow_catcher.h | 116
-rw-r--r--  intern/cycles/kernel/kernel_subsurface.h | 724
-rw-r--r--  intern/cycles/kernel/kernel_textures.h | 2
-rw-r--r--  intern/cycles/kernel/kernel_types.h | 1030
-rw-r--r--  intern/cycles/kernel/kernel_volume.h | 1440
-rw-r--r--  intern/cycles/kernel/kernel_work_stealing.h | 87
-rw-r--r--  intern/cycles/kernel/kernel_write_passes.h | 53
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter.cpp | 61
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_avx.cpp | 39
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_avx2.cpp | 40
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_cpu.h | 143
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h | 331
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_sse2.cpp | 34
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_sse3.cpp | 36
-rw-r--r--  intern/cycles/kernel/kernels/cpu/filter_sse41.cpp | 38
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_cpu.h | 100
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h | 232
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_split.cpp | 62
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp | 41
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp | 42
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp | 36
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp | 38
-rw-r--r--  intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp | 39
-rw-r--r--  intern/cycles/kernel/kernels/cuda/filter.cu | 413
-rw-r--r--  intern/cycles/kernel/kernels/cuda/kernel.cu | 232
-rw-r--r--  intern/cycles/kernel/kernels/cuda/kernel_config.h | 121
-rw-r--r--  intern/cycles/kernel/kernels/cuda/kernel_split.cu | 156
-rw-r--r--  intern/cycles/kernel/kernels/opencl/filter.cl | 321
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl | 23
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl | 23
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl | 23
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl | 23
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_background.cl | 35
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_bake.cl | 36
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_base.cl | 88
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_data_init.cl | 53
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_displace.cl | 36
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl | 26
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h | 358
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl | 26
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl | 24
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl | 24
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl | 26
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl | 27
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl | 24
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl | 24
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl | 34
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_split_function.h | 67
-rw-r--r--  intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl | 24
-rw-r--r--  intern/cycles/kernel/osl/background.cpp | 2
-rw-r--r--  intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp | 2
-rw-r--r--  intern/cycles/kernel/osl/bsdf_phong_ramp.cpp | 2
-rw-r--r--  intern/cycles/kernel/osl/emissive.cpp | 2
-rw-r--r--  intern/cycles/kernel/osl/osl_bssrdf.cpp | 40
-rw-r--r--  intern/cycles/kernel/osl/osl_closures.cpp | 8
-rw-r--r--  intern/cycles/kernel/osl/osl_services.cpp | 158
-rw-r--r--  intern/cycles/kernel/osl/osl_services.h | 16
-rw-r--r--  intern/cycles/kernel/osl/osl_shader.cpp | 40
-rw-r--r--  intern/cycles/kernel/osl/osl_shader.h | 26
-rw-r--r--  intern/cycles/kernel/shaders/node_principled_bsdf.osl | 31
-rw-r--r--  intern/cycles/kernel/shaders/node_subsurface_scattering.osl | 25
-rw-r--r--  intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h | 43
-rw-r--r--  intern/cycles/kernel/split/kernel_adaptive_filter_x.h | 30
-rw-r--r--  intern/cycles/kernel/split/kernel_adaptive_filter_y.h | 29
-rw-r--r--  intern/cycles/kernel/split/kernel_adaptive_stopping.h | 37
-rw-r--r--  intern/cycles/kernel/split/kernel_branched.h | 231
-rw-r--r--  intern/cycles/kernel/split/kernel_buffer_update.h | 154
-rw-r--r--  intern/cycles/kernel/split/kernel_data_init.h | 115
-rw-r--r--  intern/cycles/kernel/split/kernel_direct_lighting.h | 152
-rw-r--r--  intern/cycles/kernel/split/kernel_do_volume.h | 227
-rw-r--r--  intern/cycles/kernel/split/kernel_enqueue_inactive.h | 46
-rw-r--r--  intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h | 149
-rw-r--r--  intern/cycles/kernel/split/kernel_indirect_background.h | 69
-rw-r--r--  intern/cycles/kernel/split/kernel_indirect_subsurface.h | 67
-rw-r--r--  intern/cycles/kernel/split/kernel_lamp_emission.h | 67
-rw-r--r--  intern/cycles/kernel/split/kernel_next_iteration_setup.h | 258
-rw-r--r--  intern/cycles/kernel/split/kernel_path_init.h | 78
-rw-r--r--  intern/cycles/kernel/split/kernel_queue_enqueue.h | 87
-rw-r--r--  intern/cycles/kernel/split/kernel_scene_intersect.h | 83
-rw-r--r--  intern/cycles/kernel/split/kernel_shader_eval.h | 69
-rw-r--r--  intern/cycles/kernel/split/kernel_shader_setup.h | 74
-rw-r--r--  intern/cycles/kernel/split/kernel_shader_sort.h | 97
-rw-r--r--  intern/cycles/kernel/split/kernel_shadow_blocked_ao.h | 59
-rw-r--r--  intern/cycles/kernel/split/kernel_shadow_blocked_dl.h | 98
-rw-r--r--  intern/cycles/kernel/split/kernel_split_common.h | 106
-rw-r--r--  intern/cycles/kernel/split/kernel_split_data.h | 77
-rw-r--r--  intern/cycles/kernel/split/kernel_split_data_types.h | 180
-rw-r--r--  intern/cycles/kernel/split/kernel_subsurface_scatter.h | 264
-rw-r--r--  intern/cycles/kernel/svm/svm.h | 227
-rw-r--r--  intern/cycles/kernel/svm/svm_ao.h | 53
-rw-r--r--  intern/cycles/kernel/svm/svm_aov.h | 42
-rw-r--r--  intern/cycles/kernel/svm/svm_attribute.h | 57
-rw-r--r--  intern/cycles/kernel/svm/svm_bevel.h | 145
-rw-r--r--  intern/cycles/kernel/svm/svm_blackbody.h | 7
-rw-r--r--  intern/cycles/kernel/svm/svm_brick.h | 11
-rw-r--r--  intern/cycles/kernel/svm/svm_brightness.h | 2
-rw-r--r--  intern/cycles/kernel/svm/svm_bump.h | 16
-rw-r--r--  intern/cycles/kernel/svm/svm_camera.h | 12
-rw-r--r--  intern/cycles/kernel/svm/svm_checker.h | 5
-rw-r--r--  intern/cycles/kernel/svm/svm_clamp.h | 17
-rw-r--r--  intern/cycles/kernel/svm/svm_closure.h | 121
-rw-r--r--  intern/cycles/kernel/svm/svm_convert.h | 4
-rw-r--r--  intern/cycles/kernel/svm/svm_displace.h | 21
-rw-r--r--  intern/cycles/kernel/svm/svm_fresnel.h | 4
-rw-r--r--  intern/cycles/kernel/svm/svm_gamma.h | 2
-rw-r--r--  intern/cycles/kernel/svm/svm_geometry.h | 24
-rw-r--r--  intern/cycles/kernel/svm/svm_gradient.h | 2
-rw-r--r--  intern/cycles/kernel/svm/svm_hsv.h | 6
-rw-r--r--  intern/cycles/kernel/svm/svm_ies.h | 10
-rw-r--r--  intern/cycles/kernel/svm/svm_image.h | 26
-rw-r--r--  intern/cycles/kernel/svm/svm_invert.h | 2
-rw-r--r--  intern/cycles/kernel/svm/svm_light_path.h | 50
-rw-r--r--  intern/cycles/kernel/svm/svm_magic.h | 7
-rw-r--r--  intern/cycles/kernel/svm/svm_map_range.h | 19
-rw-r--r--  intern/cycles/kernel/svm/svm_mapping.h | 41
-rw-r--r--  intern/cycles/kernel/svm/svm_math.h | 30
-rw-r--r--  intern/cycles/kernel/svm/svm_mix.h | 17
-rw-r--r--  intern/cycles/kernel/svm/svm_musgrave.h | 19
-rw-r--r--  intern/cycles/kernel/svm/svm_noise.h | 10
-rw-r--r--  intern/cycles/kernel/svm/svm_noisetex.h | 19
-rw-r--r--  intern/cycles/kernel/svm/svm_normal.h | 17
-rw-r--r--  intern/cycles/kernel/svm/svm_ramp.h | 34
-rw-r--r--  intern/cycles/kernel/svm/svm_sepcomb_hsv.h | 34
-rw-r--r--  intern/cycles/kernel/svm/svm_sky.h | 33
-rw-r--r--  intern/cycles/kernel/svm/svm_tex_coord.h | 55
-rw-r--r--  intern/cycles/kernel/svm/svm_types.h | 43
-rw-r--r--  intern/cycles/kernel/svm/svm_value.h | 9
-rw-r--r--  intern/cycles/kernel/svm/svm_vector_rotate.h | 10
-rw-r--r--  intern/cycles/kernel/svm/svm_vector_transform.h | 8
-rw-r--r--  intern/cycles/kernel/svm/svm_vertex_color.h | 48
-rw-r--r--  intern/cycles/kernel/svm/svm_voronoi.h | 148
-rw-r--r--  intern/cycles/kernel/svm/svm_voxel.h | 11
-rw-r--r--  intern/cycles/kernel/svm/svm_wave.h | 9
-rw-r--r--  intern/cycles/kernel/svm/svm_wavelength.h | 4
-rw-r--r--  intern/cycles/kernel/svm/svm_white_noise.h | 13
-rw-r--r--  intern/cycles/kernel/svm/svm_wireframe.h | 18
-rw-r--r--  intern/cycles/render/CMakeLists.txt | 7
-rw-r--r--  intern/cycles/render/background.cpp | 12
-rw-r--r--  intern/cycles/render/background.h | 4
-rw-r--r--  intern/cycles/render/bake.cpp | 112
-rw-r--r--  intern/cycles/render/bake.h | 6
-rw-r--r--  intern/cycles/render/buffers.cpp | 674
-rw-r--r--  intern/cycles/render/buffers.h | 257
-rw-r--r--  intern/cycles/render/camera.cpp | 19
-rw-r--r--  intern/cycles/render/camera.h | 3
-rw-r--r--  intern/cycles/render/coverage.cpp | 155
-rw-r--r--  intern/cycles/render/coverage.h | 52
-rw-r--r--  intern/cycles/render/denoising.cpp | 31
-rw-r--r--  intern/cycles/render/denoising.h | 35
-rw-r--r--  intern/cycles/render/film.cpp | 726
-rw-r--r--  intern/cycles/render/film.h | 55
-rw-r--r--  intern/cycles/render/geometry.cpp | 14
-rw-r--r--  intern/cycles/render/gpu_display.cpp | 227
-rw-r--r--  intern/cycles/render/gpu_display.h | 247
-rw-r--r--  intern/cycles/render/graph.h | 15
-rw-r--r--  intern/cycles/render/integrator.cpp | 214
-rw-r--r--  intern/cycles/render/integrator.h | 36
-rw-r--r--  intern/cycles/render/jitter.cpp | 6
-rw-r--r--  intern/cycles/render/light.cpp | 140
-rw-r--r--  intern/cycles/render/light.h | 5
-rw-r--r--  intern/cycles/render/mesh_displace.cpp | 165
-rw-r--r--  intern/cycles/render/nodes.cpp | 80
-rw-r--r--  intern/cycles/render/nodes.h | 267
-rw-r--r--  intern/cycles/render/object.cpp | 20
-rw-r--r--  intern/cycles/render/osl.cpp | 58
-rw-r--r--  intern/cycles/render/pass.cpp | 427
-rw-r--r--  intern/cycles/render/pass.h | 106
-rw-r--r--  intern/cycles/render/scene.cpp | 189
-rw-r--r--  intern/cycles/render/scene.h | 48
-rw-r--r--  intern/cycles/render/session.cpp | 1294
-rw-r--r--  intern/cycles/render/session.h | 225
-rw-r--r--  intern/cycles/render/shader.cpp | 60
-rw-r--r--  intern/cycles/render/shader.h | 7
-rw-r--r--  intern/cycles/render/stats.cpp | 73
-rw-r--r--  intern/cycles/render/svm.cpp | 17
-rw-r--r--  intern/cycles/render/svm.h | 3
-rw-r--r--  intern/cycles/render/tile.cpp | 934
-rw-r--r--  intern/cycles/render/tile.h | 236
-rw-r--r--  intern/cycles/test/CMakeLists.txt | 5
-rw-r--r--  intern/cycles/test/integrator_adaptive_sampling_test.cpp | 116
-rw-r--r--  intern/cycles/test/integrator_render_scheduler_test.cpp | 37
-rw-r--r--  intern/cycles/test/integrator_tile_test.cpp | 47
-rw-r--r--  intern/cycles/test/render_graph_finalize_test.cpp | 2
-rw-r--r--  intern/cycles/test/util_math_test.cpp | 61
-rw-r--r--  intern/cycles/test/util_string_test.cpp | 36
-rw-r--r--  intern/cycles/util/util_atomic.h | 50
-rw-r--r--  intern/cycles/util/util_debug.cpp | 83
-rw-r--r--  intern/cycles/util/util_debug.h | 67
-rw-r--r--  intern/cycles/util/util_defines.h | 4
-rw-r--r--  intern/cycles/util/util_half.h | 46
-rw-r--r--  intern/cycles/util/util_logging.h | 1
-rw-r--r--  intern/cycles/util/util_math.h | 97
-rw-r--r--  intern/cycles/util/util_math_float2.h | 5
-rw-r--r--  intern/cycles/util/util_math_float3.h | 128
-rw-r--r--  intern/cycles/util/util_math_float4.h | 145
-rw-r--r--  intern/cycles/util/util_math_int2.h | 4
-rw-r--r--  intern/cycles/util/util_math_int3.h | 40
-rw-r--r--  intern/cycles/util/util_path.cpp | 184
-rw-r--r--  intern/cycles/util/util_path.h | 8
-rw-r--r--  intern/cycles/util/util_profiling.cpp | 8
-rw-r--r--  intern/cycles/util/util_profiling.h | 106
-rw-r--r--  intern/cycles/util/util_progress.h | 22
-rw-r--r--  intern/cycles/util/util_simd.h | 14
-rw-r--r--  intern/cycles/util/util_static_assert.h | 4
-rw-r--r--  intern/cycles/util/util_string.cpp | 36
-rw-r--r--  intern/cycles/util/util_string.h | 12
-rw-r--r--  intern/cycles/util/util_system.cpp | 9
-rw-r--r--  intern/cycles/util/util_system.h | 3
-rw-r--r--  intern/cycles/util/util_tbb.h | 1
-rw-r--r--  intern/cycles/util/util_texture.h | 2
-rw-r--r--  intern/cycles/util/util_transform.h | 34
-rw-r--r--  intern/cycles/util/util_types.h | 10
-rw-r--r--  intern/cycles/util/util_unique_ptr.h | 1
-rw-r--r--  release/scripts/modules/bpy/utils/__init__.py | 1
-rw-r--r--  release/scripts/modules/rna_manual_reference.py | 2
-rw-r--r--  release/scripts/presets/cycles/sampling/Final.py | 24
-rw-r--r--  release/scripts/presets/cycles/sampling/Preview.py | 24
-rw-r--r--  release/scripts/presets/cycles/viewport_sampling/Final.py | 11
-rw-r--r--  release/scripts/presets/cycles/viewport_sampling/Preview.py | 11
-rw-r--r--  release/scripts/presets/keyconfig/Blender.py | 41
-rw-r--r--  release/scripts/presets/keyconfig/keymap_data/blender_default.py | 1152
-rw-r--r--  release/scripts/startup/bl_operators/userpref.py | 8
-rw-r--r--  release/scripts/startup/bl_operators/wm.py | 11
-rw-r--r--  release/scripts/startup/bl_ui/properties_view_layer.py | 2
-rw-r--r--  release/scripts/startup/bl_ui/space_sequencer.py | 14
-rw-r--r--  release/scripts/startup/bl_ui/space_toolsystem_common.py | 16
-rw-r--r--  release/scripts/startup/bl_ui/space_toolsystem_toolbar.py | 55
-rw-r--r--  release/scripts/startup/bl_ui/space_view3d.py | 2
-rw-r--r--  release/scripts/startup/nodeitems_builtins.py | 10
-rw-r--r--  source/blender/blenkernel/BKE_blender_version.h | 2
-rw-r--r--  source/blender/blenkernel/BKE_curve_to_mesh.hh (renamed from source/blender/render/intern/initrender.h) | 27
-rw-r--r--  source/blender/blenkernel/BKE_geometry_set.hh | 4
-rw-r--r--  source/blender/blenkernel/BKE_gpencil_geom.h | 7
-rw-r--r--  source/blender/blenkernel/BKE_node.h | 8
-rw-r--r--  source/blender/blenkernel/BKE_spline.hh | 1
-rw-r--r--  source/blender/blenkernel/CMakeLists.txt | 2
-rw-r--r--  source/blender/blenkernel/intern/curve_eval.cc | 17
-rw-r--r--  source/blender/blenkernel/intern/curve_to_mesh_convert.cc | 739
-rw-r--r--  source/blender/blenkernel/intern/geometry_component_instances.cc | 53
-rw-r--r--  source/blender/blenkernel/intern/gpencil_geom.cc | 258
-rw-r--r--  source/blender/blenkernel/intern/layer.c | 1
-rw-r--r--  source/blender/blenkernel/intern/lib_override.c | 13
-rw-r--r--  source/blender/blenkernel/intern/mesh_convert.cc | 2
-rw-r--r--  source/blender/blenkernel/intern/node.cc | 8
-rw-r--r--  source/blender/blenkernel/intern/screen.c | 2
-rw-r--r--  source/blender/blenkernel/intern/spline_bezier.cc | 5
-rw-r--r--  source/blender/blenlib/BLI_array.hh | 15
-rw-r--r--  source/blender/blenlib/BLI_uuid.h | 15
-rw-r--r--  source/blender/blenlib/intern/uuid.cc | 20
-rw-r--r--  source/blender/blenlib/tests/BLI_uuid_test.cc | 28
-rw-r--r--  source/blender/blenloader/intern/versioning_270.c | 16
-rw-r--r--  source/blender/blenloader/intern/versioning_280.c | 4
-rw-r--r--  source/blender/blenloader/intern/versioning_290.c | 1
-rw-r--r--  source/blender/blenloader/intern/versioning_300.c | 145
-rw-r--r--  source/blender/blenloader/intern/versioning_cycles.c | 26
-rw-r--r--  source/blender/blenloader/intern/versioning_defaults.c | 12
-rw-r--r--  source/blender/compositor/nodes/COM_IDMaskNode.cc | 4
-rw-r--r--  source/blender/compositor/nodes/COM_ZCombineNode.cc | 4
-rw-r--r--  source/blender/draw/DRW_engine.h | 3
-rw-r--r--  source/blender/draw/engines/eevee/eevee_cryptomatte.c | 10
-rw-r--r--  source/blender/draw/engines/eevee/eevee_effects.c | 24
-rw-r--r--  source/blender/draw/engines/eevee/eevee_engine.c | 2
-rw-r--r--  source/blender/draw/engines/eevee/eevee_private.h | 1
-rw-r--r--  source/blender/draw/engines/eevee/shaders/ambient_occlusion_lib.glsl | 2
-rw-r--r--  source/blender/draw/engines/eevee/shaders/effect_downsample_frag.glsl | 13
-rw-r--r--  source/blender/draw/engines/eevee/shaders/effect_minmaxz_frag.glsl | 19
-rw-r--r--  source/blender/draw/engines/external/external_engine.c | 202
-rw-r--r--  source/blender/draw/engines/external/external_engine.h | 8
-rw-r--r--  source/blender/draw/engines/select/select_engine.c | 2
-rw-r--r--  source/blender/draw/engines/workbench/workbench_engine.c | 2
-rw-r--r--  source/blender/draw/intern/DRW_render.h | 1
-rw-r--r--  source/blender/draw/intern/draw_manager.c | 78
-rw-r--r--  source/blender/draw/intern/draw_manager_exec.c | 5
-rw-r--r--  source/blender/editors/animation/anim_deps.c | 2
-rw-r--r--  source/blender/editors/animation/anim_ops.c | 5
-rw-r--r--  source/blender/editors/armature/armature_intern.h | 1
-rw-r--r--  source/blender/editors/armature/armature_ops.c | 1
-rw-r--r--  source/blender/editors/armature/pose_slide.c | 102
-rw-r--r--  source/blender/editors/gpencil/gpencil_interpolate.c | 6
-rw-r--r--  source/blender/editors/interface/interface_eyedropper.c | 11
-rw-r--r--  source/blender/editors/interface/interface_style.c | 7
-rw-r--r--  source/blender/editors/interface/interface_template_search_menu.c | 40
-rw-r--r--  source/blender/editors/interface/interface_templates.c | 5
-rw-r--r--  source/blender/editors/interface/view2d_ops.c | 16
-rw-r--r--  source/blender/editors/mesh/editmesh_extrude_spin_gizmo.c | 4
-rw-r--r--  source/blender/editors/object/object_bake_api.c | 18
-rw-r--r--  source/blender/editors/render/render_preview.c | 9
-rw-r--r--  source/blender/editors/screen/area.c | 12
-rw-r--r--  source/blender/editors/space_file/filesel.c | 2
-rw-r--r--  source/blender/editors/space_node/node_draw.cc | 4
-rw-r--r--  source/blender/editors/space_node/node_intern.h | 1
-rw-r--r--  source/blender/editors/space_node/node_ops.c | 1
-rw-r--r--  source/blender/editors/space_node/node_view.cc | 90
-rw-r--r--  source/blender/editors/space_outliner/outliner_draw.c | 5
-rw-r--r--  source/blender/editors/space_outliner/outliner_tree.c | 13
-rw-r--r--  source/blender/editors/space_sequencer/sequencer_draw.c | 595
-rw-r--r--  source/blender/editors/space_sequencer/sequencer_edit.c | 6
-rw-r--r--  source/blender/editors/space_sequencer/sequencer_intern.h | 1
-rw-r--r--  source/blender/editors/space_sequencer/sequencer_select.c | 123
-rw-r--r--  source/blender/editors/space_sequencer/space_sequencer.c | 88
-rw-r--r--  source/blender/editors/space_view3d/view3d_gizmo_preselect.c | 4
-rw-r--r--  source/blender/editors/space_view3d/view3d_select.c | 4
-rw-r--r--  source/blender/editors/transform/CMakeLists.txt | 1
-rw-r--r--  source/blender/editors/transform/transform.c | 12
-rw-r--r--  source/blender/editors/transform/transform.h | 12
-rw-r--r--  source/blender/editors/transform/transform_convert.c | 18
-rw-r--r--  source/blender/editors/transform/transform_convert.h | 4
-rw-r--r--  source/blender/editors/transform/transform_convert_sequencer_image.c | 195
-rw-r--r--  source/blender/editors/transform/transform_draw_cursors.c | 2
-rw-r--r--  source/blender/editors/transform/transform_generics.c | 7
-rw-r--r--  source/blender/editors/transform/transform_gizmo_2d.c | 71
-rw-r--r--  source/blender/editors/transform/transform_mode.c | 2
-rw-r--r--  source/blender/editors/transform/transform_snap_sequencer.c | 4
-rw-r--r--  source/blender/editors/uvedit/uvedit_select.c | 12
-rw-r--r--  source/blender/freestyle/intern/blender_interface/BlenderStrokeRenderer.cpp | 12
-rw-r--r--  source/blender/functions/FN_cpp_type.hh | 11
-rw-r--r--  source/blender/functions/FN_cpp_type_make.hh | 1
-rw-r--r--  source/blender/functions/intern/multi_function_procedure.cc | 4
-rw-r--r--  source/blender/gpencil_modifiers/CMakeLists.txt | 3
-rw-r--r--  source/blender/gpencil_modifiers/MOD_gpencil_modifiertypes.h | 3
-rw-r--r--  source/blender/gpencil_modifiers/intern/MOD_gpencil_util.c | 3
-rw-r--r--  source/blender/gpencil_modifiers/intern/MOD_gpencillength.c | 136
-rw-r--r--  source/blender/gpencil_modifiers/intern/MOD_gpencilweight_angle.c | 260
-rw-r--r--  source/blender/gpencil_modifiers/intern/MOD_gpencilweight_proximity.c (renamed from source/blender/gpencil_modifiers/intern/MOD_gpencilweight.c) | 122
-rw-r--r--  source/blender/gpu/GPU_material.h | 5
-rw-r--r--  source/blender/gpu/intern/gpu_material.c | 114
-rw-r--r--  source/blender/gpu/intern/gpu_material_library.h | 2
-rw-r--r--  source/blender/gpu/shaders/material/gpu_shader_material_principled.glsl | 4
-rw-r--r--  source/blender/gpu/shaders/material/gpu_shader_material_subsurface_scattering.glsl | 14
-rw-r--r--  source/blender/makesdna/DNA_gpencil_modifier_defaults.h | 21
-rw-r--r--  source/blender/makesdna/DNA_gpencil_modifier_types.h | 56
-rw-r--r--  source/blender/makesdna/DNA_layer_types.h | 2
-rw-r--r--  source/blender/makesdna/DNA_node_types.h | 17
-rw-r--r--  source/blender/makesdna/DNA_scene_defaults.h | 2
-rw-r--r--  source/blender/makesdna/DNA_scene_types.h | 20
-rw-r--r--  source/blender/makesdna/DNA_sequence_types.h | 5
-rw-r--r--  source/blender/makesdna/DNA_space_types.h | 11
-rw-r--r--  source/blender/makesdna/DNA_uuid_types.h | 6
-rw-r--r--  source/blender/makesdna/DNA_view2d_types.h | 2
-rw-r--r--  source/blender/makesdna/DNA_workspace_types.h | 11
-rw-r--r--  source/blender/makesdna/intern/dna_defaults.c | 6
-rw-r--r--  source/blender/makesrna/intern/rna_gpencil_modifier.c | 309
-rw-r--r--  source/blender/makesrna/intern/rna_nodetree.c | 97
-rw-r--r--  source/blender/makesrna/intern/rna_render.c | 98
-rw-r--r--  source/blender/makesrna/intern/rna_scene.c | 77
-rw-r--r--  source/blender/makesrna/intern/rna_sequencer.c | 6
-rw-r--r--  source/blender/makesrna/intern/rna_space.c | 10
-rw-r--r--  source/blender/makesrna/intern/rna_workspace_api.c | 9
-rw-r--r--  source/blender/modifiers/intern/MOD_nodes.cc | 18
-rw-r--r--  source/blender/modifiers/intern/MOD_nodes_evaluator.cc | 8
-rw-r--r--  source/blender/nodes/CMakeLists.txt | 8
-rw-r--r--  source/blender/nodes/NOD_function.h | 3
-rw-r--r--  source/blender/nodes/NOD_geometry.h | 5
-rw-r--r--  source/blender/nodes/NOD_geometry_nodes_eval_log.hh | 1
-rw-r--r--  source/blender/nodes/NOD_static_types.h | 10
-rw-r--r--  source/blender/nodes/composite/nodes/node_composite_image.c | 45
-rw-r--r--  source/blender/nodes/function/nodes/node_fn_string_length.cc | 49
-rw-r--r--  source/blender/nodes/function/nodes/node_fn_string_substring.cc | 54
-rw-r--r--  source/blender/nodes/function/nodes/node_fn_value_to_string.cc | 51
-rw-r--r--  source/blender/nodes/geometry/nodes/node_geo_attribute_statistic.cc | 378
-rw-r--r--  source/blender/nodes/geometry/nodes/node_geo_curve_fill.cc | 44
-rw-r--r--  source/blender/nodes/geometry/nodes/node_geo_curve_parameter.cc | 206
-rw-r--r--  source/blender/nodes/geometry/nodes/node_geo_curve_sample.cc | 288
-rw-r--r--  source/blender/nodes/geometry/nodes/node_geo_curve_to_mesh.cc | 710
-rw-r--r--  source/blender/nodes/geometry/nodes/node_geo_input_normal.cc | 96
-rw-r--r--  source/blender/nodes/geometry/nodes/node_geo_input_tangent.cc | 174
-rw-r--r--  source/blender/nodes/geometry/nodes/node_geo_string_join.cc | 53
-rw-r--r--  source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c | 29
-rw-r--r--  source/blender/nodes/shader/nodes/node_shader_subsurface_scattering.c | 30
-rw-r--r--  source/blender/render/CMakeLists.txt | 1
-rw-r--r--  source/blender/render/RE_engine.h | 46
-rw-r--r--  source/blender/render/RE_pipeline.h | 3
-rw-r--r--  source/blender/render/intern/bake.c | 19
-rw-r--r--  source/blender/render/intern/engine.c | 282
-rw-r--r--  source/blender/render/intern/initrender.c | 91
-rw-r--r--  source/blender/render/intern/pipeline.c | 64
-rw-r--r--  source/blender/render/intern/render_result.c | 253
-rw-r--r--  source/blender/render/intern/render_result.h | 18
-rw-r--r--  source/blender/render/intern/render_types.h | 30
-rw-r--r--  source/blender/sequencer/SEQ_iterator.h | 8
-rw-r--r--  source/blender/sequencer/SEQ_render.h | 21
-rw-r--r--  source/blender/sequencer/SEQ_sequencer.h | 1
-rw-r--r--  source/blender/sequencer/SEQ_transform.h | 9
-rw-r--r--  source/blender/sequencer/SEQ_utils.h | 2
-rw-r--r--  source/blender/sequencer/intern/image_cache.c | 81
-rw-r--r--  source/blender/sequencer/intern/image_cache.h | 6
-rw-r--r--  source/blender/sequencer/intern/iterator.c | 120
-rw-r--r--  source/blender/sequencer/intern/render.c | 283
-rw-r--r--  source/blender/sequencer/intern/sequencer.c | 11
-rw-r--r--  source/blender/sequencer/intern/strip_transform.c | 98
-rw-r--r--  source/blender/sequencer/intern/utils.c | 1
-rw-r--r--  source/blender/windowmanager/WM_api.h | 31
-rw-r--r--  source/blender/windowmanager/gizmo/intern/wm_gizmo_group.c | 2
-rw-r--r--  source/blender/windowmanager/intern/wm_dragdrop.c | 14
-rw-r--r--  source/blender/windowmanager/intern/wm_event_system.c | 124
-rw-r--r--  source/blender/windowmanager/intern/wm_jobs.c | 2
-rw-r--r--  source/blender/windowmanager/intern/wm_keymap.c | 24
-rw-r--r--  source/blender/windowmanager/intern/wm_operator_utils.c | 18
-rw-r--r--  source/blender/windowmanager/intern/wm_toolsystem.c | 5
-rw-r--r--  source/blender/windowmanager/intern/wm_window.c | 13
-rw-r--r--  tests/performance/tests/cycles.py | 24
-rw-r--r--  tests/python/CMakeLists.txt | 1
685 files changed, 75538 insertions, 45142 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 47712f0ac1e..8e807b84e22 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -403,7 +403,7 @@ option(WITH_CYCLES_CUDA_BINARIES "Build Cycles CUDA binaries" OFF)
option(WITH_CYCLES_CUBIN_COMPILER "Build cubins with nvrtc based compiler instead of nvcc" OFF)
option(WITH_CYCLES_CUDA_BUILD_SERIAL "Build cubins one after another (useful on machines with limited RAM)" OFF)
mark_as_advanced(WITH_CYCLES_CUDA_BUILD_SERIAL)
-set(CYCLES_TEST_DEVICES CPU CACHE STRING "Run regression tests on the specified device types (CPU CUDA OPTIX OPENCL)" )
+set(CYCLES_TEST_DEVICES CPU CACHE STRING "Run regression tests on the specified device types (CPU CUDA OPTIX)" )
set(CYCLES_CUDA_BINARIES_ARCH sm_30 sm_35 sm_37 sm_50 sm_52 sm_60 sm_61 sm_70 sm_75 sm_86 compute_75 CACHE STRING "CUDA architectures to build binaries for")
mark_as_advanced(CYCLES_CUDA_BINARIES_ARCH)
unset(PLATFORM_DEFAULT)
@@ -418,12 +418,8 @@ mark_as_advanced(WITH_CYCLES_DEBUG_NAN)
mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)
option(WITH_CYCLES_DEVICE_CUDA "Enable Cycles CUDA compute support" ON)
-option(WITH_CYCLES_DEVICE_OPTIX "Enable Cycles OptiX support" OFF)
-option(WITH_CYCLES_DEVICE_OPENCL "Enable Cycles OpenCL compute support" ON)
-option(WITH_CYCLES_NETWORK "Enable Cycles compute over network support (EXPERIMENTAL and unfinished)" OFF)
+option(WITH_CYCLES_DEVICE_OPTIX "Enable Cycles OptiX support" ON)
mark_as_advanced(WITH_CYCLES_DEVICE_CUDA)
-mark_as_advanced(WITH_CYCLES_DEVICE_OPENCL)
-mark_as_advanced(WITH_CYCLES_NETWORK)
option(WITH_CUDA_DYNLOAD "Dynamically load CUDA libraries at runtime" ON)
mark_as_advanced(WITH_CUDA_DYNLOAD)
diff --git a/build_files/cmake/Modules/FindOptiX.cmake b/build_files/cmake/Modules/FindOptiX.cmake
index cfcdd9cd23b..67106740f57 100644
--- a/build_files/cmake/Modules/FindOptiX.cmake
+++ b/build_files/cmake/Modules/FindOptiX.cmake
@@ -33,11 +33,23 @@ FIND_PATH(OPTIX_INCLUDE_DIR
include
)
+IF(EXISTS "${OPTIX_INCLUDE_DIR}/optix.h")
+ FILE(STRINGS "${OPTIX_INCLUDE_DIR}/optix.h" _optix_version REGEX "^#define OPTIX_VERSION[ \t].*$")
+ STRING(REGEX MATCHALL "[0-9]+" _optix_version ${_optix_version})
+
+ MATH(EXPR _optix_version_major "${_optix_version} / 10000")
+ MATH(EXPR _optix_version_minor "(${_optix_version} % 10000) / 100")
+ MATH(EXPR _optix_version_patch "${_optix_version} % 100")
+
+ SET(OPTIX_VERSION "${_optix_version_major}.${_optix_version_minor}.${_optix_version_patch}")
+ENDIF()
+
# handle the QUIETLY and REQUIRED arguments and set OPTIX_FOUND to TRUE if
# all listed variables are TRUE
INCLUDE(FindPackageHandleStandardArgs)
-FIND_PACKAGE_HANDLE_STANDARD_ARGS(OptiX DEFAULT_MSG
- OPTIX_INCLUDE_DIR)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(OptiX
+ REQUIRED_VARS OPTIX_INCLUDE_DIR
+ VERSION_VAR OPTIX_VERSION)
IF(OPTIX_FOUND)
SET(OPTIX_INCLUDE_DIRS ${OPTIX_INCLUDE_DIR})
@@ -45,6 +57,7 @@ ENDIF()
MARK_AS_ADVANCED(
OPTIX_INCLUDE_DIR
+ OPTIX_VERSION
)
UNSET(_optix_SEARCH_DIRS)
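The version parsing above relies on optix.h encoding OPTIX_VERSION as a single integer, major * 10000 + minor * 100 + patch. A minimal C++ sketch of the same decoding, mirroring the MATH() expressions in the hunk (the 70300 sample value is ours, not part of the patch):

#include <cstdio>

int main()
{
    const int optix_version = 70300; // e.g. OptiX 7.3.0, as required below

    const int major = optix_version / 10000;
    const int minor = (optix_version % 10000) / 100;
    const int patch = optix_version % 100;

    std::printf("OptiX %d.%d.%d\n", major, minor, patch); // prints "OptiX 7.3.0"
    return 0;
}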
diff --git a/build_files/config/pipeline_config.yaml b/build_files/config/pipeline_config.yaml
index 5d1a24a30f1..8222f2ff0b9 100644
--- a/build_files/config/pipeline_config.yaml
+++ b/build_files/config/pipeline_config.yaml
@@ -55,7 +55,7 @@ buildbot:
cuda11:
version: '11.4.1'
optix:
- version: '7.1.0'
+ version: '7.3.0'
cmake:
default:
version: any
diff --git a/extern/audaspace/CMakeLists.txt b/extern/audaspace/CMakeLists.txt
index 552ff749512..8493fe3e67d 100644
--- a/extern/audaspace/CMakeLists.txt
+++ b/extern/audaspace/CMakeLists.txt
@@ -129,6 +129,7 @@ set(SRC
src/util/Barrier.cpp
src/util/Buffer.cpp
src/util/BufferReader.cpp
+ src/util/RingBuffer.cpp
src/util/StreamBuffer.cpp
src/util/ThreadPool.cpp
)
@@ -245,6 +246,7 @@ set(PUBLIC_HDR
include/util/BufferReader.h
include/util/ILockable.h
include/util/Math3D.h
+ include/util/RingBuffer.h
include/util/StreamBuffer.h
include/util/ThreadPool.h
)
diff --git a/extern/audaspace/include/util/RingBuffer.h b/extern/audaspace/include/util/RingBuffer.h
new file mode 100644
index 00000000000..67bd1cc8640
--- /dev/null
+++ b/extern/audaspace/include/util/RingBuffer.h
@@ -0,0 +1,97 @@
+/*******************************************************************************
+ * Copyright 2009-2021 Jörg Müller
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+
+#pragma once
+
+/**
+ * @file RingBuffer.h
+ * @ingroup util
+ * The RingBuffer class.
+ */
+
+#include "Audaspace.h"
+#include "Buffer.h"
+
+#include <cstddef>
+
+AUD_NAMESPACE_BEGIN
+
+/**
+ * This class is a simple ring buffer in RAM which is 32-byte aligned and provides
+ * functionality for concurrent reading and writing without locks.
+ */
+class AUD_API RingBuffer
+{
+private:
+ /// The buffer storing the actual data.
+ Buffer m_buffer;
+
+ /// The reading pointer.
+ volatile size_t m_read;
+
+ /// The writing pointer.
+ volatile size_t m_write;
+
+ // delete copy constructor and operator=
+ RingBuffer(const RingBuffer&) = delete;
+ RingBuffer& operator=(const RingBuffer&) = delete;
+
+public:
+ /**
+ * Creates a new ring buffer.
+ * \param size The size of the buffer in bytes.
+ */
+ RingBuffer(int size = 0);
+
+ /**
+ * Returns the pointer to the ring buffer in memory.
+ */
+ sample_t* getBuffer() const;
+
+ /**
+ * Returns the size of the ring buffer in bytes.
+ */
+ int getSize() const;
+
+ /// Returns the number of bytes that can currently be read from the buffer.
+ size_t getReadSize() const;
+
+ /// Returns the number of bytes that can currently be written to the buffer.
+ size_t getWriteSize() const;
+
+ /// Reads up to size bytes into target; returns the number of bytes read.
+ size_t read(data_t* target, size_t size);
+
+ /// Writes up to size bytes from source; returns the number of bytes written.
+ size_t write(data_t* source, size_t size);
+
+ /**
+ * Resets the ring buffer to a state where nothing has been written or read.
+ */
+ void reset();
+
+ /**
+ * Resizes the ring buffer.
+ * \param size The new size of the ring buffer, measured in bytes.
+ */
+ void resize(int size);
+
+ /**
+ * Makes sure the ring buffer has a minimum size.
+ * If size is <= the current size, nothing will happen.
+ * Otherwise the ring buffer is resized to the requested size.
+ * \param size The new minimum size of the ring buffer, measured in bytes.
+ */
+ void assureSize(int size);
+};
+
+AUD_NAMESPACE_END
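The RingBuffer implementation (src/util/RingBuffer.cpp) is cut off at the end of this excerpt, so here is a minimal sketch of the single-reader/single-writer ring buffer idea the header describes: the writer only advances m_write, the reader only advances m_read, and the available sizes fall out of the index difference modulo the capacity. All naming and index arithmetic below are our assumptions, not code copied from audaspace.

#include <algorithm>
#include <cstddef>
#include <vector>

class SpscRingBuffer
{
    std::vector<unsigned char> m_buffer;
    // In portable C++ these should be std::atomic with acquire/release
    // ordering; volatile alone (as in the header above) is not a fence.
    volatile size_t m_read = 0;  // advanced only by the reader
    volatile size_t m_write = 0; // advanced only by the writer

public:
    explicit SpscRingBuffer(size_t size) : m_buffer(size) {}

    size_t getReadSize() const
    {
        size_t r = m_read, w = m_write;
        return w >= r ? w - r : m_buffer.size() - r + w;
    }

    // One slot stays empty so that m_read == m_write always means "empty".
    size_t getWriteSize() const { return m_buffer.size() - getReadSize() - 1; }

    size_t write(const unsigned char* source, size_t size)
    {
        size = std::min(size, getWriteSize());
        for(size_t i = 0; i < size; i++)
            m_buffer[(m_write + i) % m_buffer.size()] = source[i];
        m_write = (m_write + size) % m_buffer.size(); // publish after copying
        return size;
    }

    size_t read(unsigned char* target, size_t size)
    {
        size = std::min(size, getReadSize());
        for(size_t i = 0; i < size; i++)
            target[i] = m_buffer[(m_read + i) % m_buffer.size()];
        m_read = (m_read + size) % m_buffer.size(); // free space after copying
        return size;
    }
};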
diff --git a/extern/audaspace/plugins/pulseaudio/PulseAudioDevice.cpp b/extern/audaspace/plugins/pulseaudio/PulseAudioDevice.cpp
index bf3fad82620..cddc411cfc6 100644
--- a/extern/audaspace/plugins/pulseaudio/PulseAudioDevice.cpp
+++ b/extern/audaspace/plugins/pulseaudio/PulseAudioDevice.cpp
@@ -23,95 +23,121 @@
AUD_NAMESPACE_BEGIN
-void PulseAudioDevice::PulseAudio_state_callback(pa_context *context, void *data)
+PulseAudioDevice::PulseAudioSynchronizer::PulseAudioSynchronizer(PulseAudioDevice *device) :
+ m_device(device)
{
- PulseAudioDevice* device = (PulseAudioDevice*)data;
+}
- std::lock_guard<ILockable> lock(*device);
+double PulseAudioDevice::PulseAudioSynchronizer::getPosition(std::shared_ptr<IHandle> handle)
+{
+ pa_usec_t latency;
+ int negative;
+ AUD_pa_stream_get_latency(m_device->m_stream, &latency, &negative);
- device->m_state = AUD_pa_context_get_state(context);
+ double delay = m_device->m_ring_buffer.getReadSize() / (AUD_SAMPLE_SIZE(m_device->m_specs) * m_device->m_specs.rate) + latency * 1.0e-6;
+
+ return handle->getPosition() - delay;
}
-void PulseAudioDevice::PulseAudio_request(pa_stream *stream, size_t total_bytes, void *data)
+void PulseAudioDevice::updateRingBuffer()
{
- PulseAudioDevice* device = (PulseAudioDevice*)data;
+ unsigned int samplesize = AUD_SAMPLE_SIZE(m_specs);
- void* buffer;
+ std::unique_lock<std::mutex> lock(m_mixingLock);
- while(total_bytes > 0)
+ Buffer buffer;
+
+ while(m_valid)
{
- size_t num_bytes = total_bytes;
+ size_t size = m_ring_buffer.getWriteSize();
- AUD_pa_stream_begin_write(stream, &buffer, &num_bytes);
+ size_t sample_count = size / samplesize;
- device->mix((data_t*)buffer, num_bytes / AUD_DEVICE_SAMPLE_SIZE(device->m_specs));
+ if(sample_count > 0)
+ {
+ size = sample_count * samplesize;
- AUD_pa_stream_write(stream, buffer, num_bytes, nullptr, 0, PA_SEEK_RELATIVE);
+ buffer.assureSize(size);
- total_bytes -= num_bytes;
+ mix(reinterpret_cast<data_t*>(buffer.getBuffer()), sample_count);
+
+ m_ring_buffer.write(reinterpret_cast<data_t*>(buffer.getBuffer()), size);
+ }
+
+ m_mixingCondition.wait(lock);
}
}
-void PulseAudioDevice::PulseAudio_underflow(pa_stream *stream, void *data)
+void PulseAudioDevice::PulseAudio_state_callback(pa_context *context, void *data)
{
PulseAudioDevice* device = (PulseAudioDevice*)data;
- DeviceSpecs specs = device->getSpecs();
+ device->m_state = AUD_pa_context_get_state(context);
- if(++device->m_underflows > 4 && device->m_buffersize < AUD_DEVICE_SAMPLE_SIZE(specs) * specs.rate * 2)
- {
- device->m_buffersize <<= 1;
- device->m_underflows = 0;
+ AUD_pa_threaded_mainloop_signal(device->m_mainloop, 0);
+}
- pa_buffer_attr buffer_attr;
+void PulseAudioDevice::PulseAudio_request(pa_stream *stream, size_t total_bytes, void *data)
+{
+ PulseAudioDevice* device = (PulseAudioDevice*)data;
- buffer_attr.fragsize = -1U;
- buffer_attr.maxlength = -1U;
- buffer_attr.minreq = -1U;
- buffer_attr.prebuf = -1U;
- buffer_attr.tlength = device->m_buffersize;
+ data_t* buffer;
- AUD_pa_stream_set_buffer_attr(stream, &buffer_attr, nullptr, nullptr);
- }
-}
+ size_t sample_size = AUD_DEVICE_SAMPLE_SIZE(device->m_specs);
-void PulseAudioDevice::runMixingThread()
-{
- for(;;)
+ while(total_bytes > 0)
{
+ size_t num_bytes = total_bytes;
+
+ AUD_pa_stream_begin_write(stream, reinterpret_cast<void**>(&buffer), &num_bytes);
+
+ size_t readsamples = device->m_ring_buffer.getReadSize();
+
+ readsamples = std::min(readsamples, size_t(num_bytes)) / sample_size;
+
+ device->m_ring_buffer.read(buffer, readsamples * sample_size);
+
+ if(readsamples * sample_size < num_bytes)
+ std::memset(buffer + readsamples * sample_size, 0, num_bytes - readsamples * sample_size);
+
+ if(device->m_mixingLock.try_lock())
{
- std::lock_guard<ILockable> lock(*this);
-
- if(shouldStop())
- {
- AUD_pa_stream_cork(m_stream, 1, nullptr, nullptr);
- AUD_pa_stream_flush(m_stream, nullptr, nullptr);
- doStop();
- return;
- }
+ device->m_mixingCondition.notify_all();
+ device->m_mixingLock.unlock();
}
- if(AUD_pa_stream_is_corked(m_stream))
- AUD_pa_stream_cork(m_stream, 0, nullptr, nullptr);
+ AUD_pa_stream_write(stream, reinterpret_cast<void*>(buffer), num_bytes, nullptr, 0, PA_SEEK_RELATIVE);
- // similar to AUD_pa_mainloop_iterate(m_mainloop, false, nullptr); except with a longer timeout
- AUD_pa_mainloop_prepare(m_mainloop, 1 << 14);
- AUD_pa_mainloop_poll(m_mainloop);
- AUD_pa_mainloop_dispatch(m_mainloop);
+ total_bytes -= num_bytes;
}
}
+void PulseAudioDevice::playing(bool playing)
+{
+ m_playback = playing;
+
+ AUD_pa_threaded_mainloop_lock(m_mainloop);
+ AUD_pa_stream_cork(m_stream, playing ? 0 : 1, nullptr, nullptr);
+ AUD_pa_threaded_mainloop_unlock(m_mainloop);
+}
+
PulseAudioDevice::PulseAudioDevice(std::string name, DeviceSpecs specs, int buffersize) :
+ m_synchronizer(this),
+ m_playback(false),
m_state(PA_CONTEXT_UNCONNECTED),
+ m_valid(true),
m_underflows(0)
{
- m_mainloop = AUD_pa_mainloop_new();
+ m_mainloop = AUD_pa_threaded_mainloop_new();
+
+ AUD_pa_threaded_mainloop_lock(m_mainloop);
- m_context = AUD_pa_context_new(AUD_pa_mainloop_get_api(m_mainloop), name.c_str());
+ m_context = AUD_pa_context_new(AUD_pa_threaded_mainloop_get_api(m_mainloop), name.c_str());
if(!m_context)
{
- AUD_pa_mainloop_free(m_mainloop);
+ AUD_pa_threaded_mainloop_unlock(m_mainloop);
+ AUD_pa_threaded_mainloop_free(m_mainloop);
AUD_THROW(DeviceException, "Could not connect to PulseAudio.");
}
@@ -120,21 +146,26 @@ PulseAudioDevice::PulseAudioDevice(std::string name, DeviceSpecs specs, int buff
AUD_pa_context_connect(m_context, nullptr, PA_CONTEXT_NOFLAGS, nullptr);
+ AUD_pa_threaded_mainloop_start(m_mainloop);
+
while(m_state != PA_CONTEXT_READY)
{
switch(m_state)
{
case PA_CONTEXT_FAILED:
case PA_CONTEXT_TERMINATED:
+ AUD_pa_threaded_mainloop_unlock(m_mainloop);
+ AUD_pa_threaded_mainloop_stop(m_mainloop);
+
AUD_pa_context_disconnect(m_context);
AUD_pa_context_unref(m_context);
- AUD_pa_mainloop_free(m_mainloop);
+ AUD_pa_threaded_mainloop_free(m_mainloop);
AUD_THROW(DeviceException, "Could not connect to PulseAudio.");
break;
default:
- AUD_pa_mainloop_iterate(m_mainloop, true, nullptr);
+ AUD_pa_threaded_mainloop_wait(m_mainloop);
break;
}
}
@@ -182,16 +213,18 @@ PulseAudioDevice::PulseAudioDevice(std::string name, DeviceSpecs specs, int buff
if(!m_stream)
{
+ AUD_pa_threaded_mainloop_unlock(m_mainloop);
+ AUD_pa_threaded_mainloop_stop(m_mainloop);
+
AUD_pa_context_disconnect(m_context);
AUD_pa_context_unref(m_context);
- AUD_pa_mainloop_free(m_mainloop);
+ AUD_pa_threaded_mainloop_free(m_mainloop);
AUD_THROW(DeviceException, "Could not create PulseAudio stream.");
}
AUD_pa_stream_set_write_callback(m_stream, PulseAudio_request, this);
- AUD_pa_stream_set_underflow_callback(m_stream, PulseAudio_underflow, this);
buffersize *= AUD_DEVICE_SAMPLE_SIZE(m_specs);
m_buffersize = buffersize;
@@ -204,31 +237,53 @@ PulseAudioDevice::PulseAudioDevice(std::string name, DeviceSpecs specs, int buff
buffer_attr.prebuf = -1U;
buffer_attr.tlength = buffersize;
- if(AUD_pa_stream_connect_playback(m_stream, nullptr, &buffer_attr, static_cast<pa_stream_flags_t>(PA_STREAM_START_CORKED | PA_STREAM_INTERPOLATE_TIMING | PA_STREAM_ADJUST_LATENCY | PA_STREAM_AUTO_TIMING_UPDATE), nullptr, nullptr) < 0)
+ m_ring_buffer.resize(buffersize);
+
+ if(AUD_pa_stream_connect_playback(m_stream, nullptr, &buffer_attr, static_cast<pa_stream_flags_t>(PA_STREAM_INTERPOLATE_TIMING | PA_STREAM_ADJUST_LATENCY | PA_STREAM_AUTO_TIMING_UPDATE), nullptr, nullptr) < 0)
{
+ AUD_pa_threaded_mainloop_unlock(m_mainloop);
+ AUD_pa_threaded_mainloop_stop(m_mainloop);
+
AUD_pa_context_disconnect(m_context);
AUD_pa_context_unref(m_context);
- AUD_pa_mainloop_free(m_mainloop);
+ AUD_pa_threaded_mainloop_free(m_mainloop);
AUD_THROW(DeviceException, "Could not connect PulseAudio stream.");
}
+ AUD_pa_threaded_mainloop_unlock(m_mainloop);
+
create();
+
+ m_mixingThread = std::thread(&PulseAudioDevice::updateRingBuffer, this);
}
PulseAudioDevice::~PulseAudioDevice()
{
- stopMixingThread();
+ m_valid = false;
+
+ m_mixingLock.lock();
+ m_mixingCondition.notify_all();
+ m_mixingLock.unlock();
+
+ m_mixingThread.join();
+
+ AUD_pa_threaded_mainloop_stop(m_mainloop);
AUD_pa_context_disconnect(m_context);
AUD_pa_context_unref(m_context);
- AUD_pa_mainloop_free(m_mainloop);
+ AUD_pa_threaded_mainloop_free(m_mainloop);
destroy();
}
+ISynchronizer *PulseAudioDevice::getSynchronizer()
+{
+ return &m_synchronizer;
+}
+
class PulseAudioDeviceFactory : public IDeviceFactory
{
private:
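The core of this change replaces ThreadedDevice's direct mainloop iteration with a producer/consumer handshake around the ring buffer: updateRingBuffer() (the mixing thread) refills the buffer and then blocks on m_mixingCondition, while PulseAudio_request() (the server's callback) drains it and wakes the mixer. Note the try_lock: the callback must never block on the mixing thread. A condensed sketch of just that handshake, with the device scaffolding stripped away:

#include <condition_variable>
#include <mutex>

std::mutex mixingLock;
std::condition_variable mixingCondition;
bool valid = true;

void mixingThreadBody() // cf. updateRingBuffer()
{
    std::unique_lock<std::mutex> lock(mixingLock);
    while(valid)
    {
        // ... mix() as many samples as the ring buffer has write space for ...
        mixingCondition.wait(lock); // sleep until the callback drains data
    }
}

void serverCallback() // cf. PulseAudio_request()
{
    // ... read from the ring buffer, zero-filling any shortfall ...

    // try_lock instead of lock: if the mixer currently holds the mutex it is
    // either mixing or about to sleep, and the next callback will retry, so
    // skipping the notify here is harmless and the callback never blocks.
    if(mixingLock.try_lock())
    {
        mixingCondition.notify_all();
        mixingLock.unlock();
    }
}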
diff --git a/extern/audaspace/plugins/pulseaudio/PulseAudioDevice.h b/extern/audaspace/plugins/pulseaudio/PulseAudioDevice.h
index 45b813a5755..57359110633 100644
--- a/extern/audaspace/plugins/pulseaudio/PulseAudioDevice.h
+++ b/extern/audaspace/plugins/pulseaudio/PulseAudioDevice.h
@@ -26,7 +26,11 @@
* The PulseAudioDevice class.
*/
-#include "devices/ThreadedDevice.h"
+#include "devices/SoftwareDevice.h"
+#include "util/RingBuffer.h"
+
+#include <condition_variable>
+#include <thread>
#include <pulse/pulseaudio.h>
@@ -35,18 +39,66 @@ AUD_NAMESPACE_BEGIN
/**
 * This device plays back through PulseAudio, the sound server.
*/
-class AUD_PLUGIN_API PulseAudioDevice : public ThreadedDevice
+class AUD_PLUGIN_API PulseAudioDevice : public SoftwareDevice
{
private:
- pa_mainloop* m_mainloop;
+ class PulseAudioSynchronizer : public DefaultSynchronizer
+ {
+ PulseAudioDevice* m_device;
+
+ public:
+ PulseAudioSynchronizer(PulseAudioDevice* device);
+
+ virtual double getPosition(std::shared_ptr<IHandle> handle);
+ };
+
+ /// Synchronizer.
+ PulseAudioSynchronizer m_synchronizer;
+
+ /**
+ * Whether there is currently playback.
+ */
+	 * Whether playback is currently active.
+
+ pa_threaded_mainloop* m_mainloop;
pa_context* m_context;
pa_stream* m_stream;
pa_context_state_t m_state;
+ /**
+ * The mixing ring buffer.
+ */
+ RingBuffer m_ring_buffer;
+
+ /**
+ * Whether the device is valid.
+ */
+ bool m_valid;
+
int m_buffersize;
uint32_t m_underflows;
/**
+ * The mixing thread.
+ */
+ std::thread m_mixingThread;
+
+ /**
+ * Mutex for mixing.
+ */
+ std::mutex m_mixingLock;
+
+ /**
+ * Condition for mixing.
+ */
+ std::condition_variable m_mixingCondition;
+
+ /**
+ * Updates the ring buffer.
+ */
+ AUD_LOCAL void updateRingBuffer();
+
+ /**
* Reports the state of the PulseAudio server connection.
* \param context The PulseAudio context.
* \param data The PulseAudio device.
@@ -61,23 +113,13 @@ private:
*/
AUD_LOCAL static void PulseAudio_request(pa_stream* stream, size_t total_bytes, void* data);
- /**
- * Reports an underflow from the PulseAudio server.
- * Automatically adjusts the latency if this happens too often.
- * @param stream The PulseAudio stream.
- * \param data The PulseAudio device.
- */
- AUD_LOCAL static void PulseAudio_underflow(pa_stream* stream, void* data);
-
- /**
- * Streaming thread main function.
- */
- AUD_LOCAL void runMixingThread();
-
// delete copy constructor and operator=
PulseAudioDevice(const PulseAudioDevice&) = delete;
PulseAudioDevice& operator=(const PulseAudioDevice&) = delete;
+protected:
+ virtual void playing(bool playing);
+
public:
/**
* Opens the PulseAudio audio device for playback.
@@ -93,6 +135,8 @@ public:
*/
virtual ~PulseAudioDevice();
+ virtual ISynchronizer* getSynchronizer();
+
/**
* Registers this plugin.
*/
diff --git a/extern/audaspace/plugins/pulseaudio/PulseAudioSymbols.h b/extern/audaspace/plugins/pulseaudio/PulseAudioSymbols.h
index 361aa518087..a33135b6e25 100644
--- a/extern/audaspace/plugins/pulseaudio/PulseAudioSymbols.h
+++ b/extern/audaspace/plugins/pulseaudio/PulseAudioSymbols.h
@@ -25,6 +25,7 @@ PULSEAUDIO_SYMBOL(pa_stream_begin_write);
PULSEAUDIO_SYMBOL(pa_stream_connect_playback);
PULSEAUDIO_SYMBOL(pa_stream_cork);
PULSEAUDIO_SYMBOL(pa_stream_flush);
+PULSEAUDIO_SYMBOL(pa_stream_get_latency);
PULSEAUDIO_SYMBOL(pa_stream_is_corked);
PULSEAUDIO_SYMBOL(pa_stream_new);
PULSEAUDIO_SYMBOL(pa_stream_set_buffer_attr);
@@ -39,3 +40,13 @@ PULSEAUDIO_SYMBOL(pa_mainloop_iterate);
PULSEAUDIO_SYMBOL(pa_mainloop_prepare);
PULSEAUDIO_SYMBOL(pa_mainloop_poll);
PULSEAUDIO_SYMBOL(pa_mainloop_dispatch);
+
+PULSEAUDIO_SYMBOL(pa_threaded_mainloop_free);
+PULSEAUDIO_SYMBOL(pa_threaded_mainloop_get_api);
+PULSEAUDIO_SYMBOL(pa_threaded_mainloop_lock);
+PULSEAUDIO_SYMBOL(pa_threaded_mainloop_new);
+PULSEAUDIO_SYMBOL(pa_threaded_mainloop_signal);
+PULSEAUDIO_SYMBOL(pa_threaded_mainloop_start);
+PULSEAUDIO_SYMBOL(pa_threaded_mainloop_stop);
+PULSEAUDIO_SYMBOL(pa_threaded_mainloop_unlock);
+PULSEAUDIO_SYMBOL(pa_threaded_mainloop_wait);
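
Each PULSEAUDIO_SYMBOL entry above is an X-macro hook: the plugin defines the macro once to declare AUD_-prefixed function pointers and once to resolve them from libpulse at load time. A minimal sketch of the pattern, with the symbol-lookup helper name assumed:

	// Declaration pass: every listed symbol becomes a function pointer,
	// e.g. AUD_pa_threaded_mainloop_new.
	#define PULSEAUDIO_SYMBOL(sym) decltype(&sym) AUD_##sym = nullptr;
	#include "PulseAudioSymbols.h"
	#undef PULSEAUDIO_SYMBOL

	// Resolution pass: look each name up in the dynamically opened library;
	// lookupSymbol() stands in for the real dlsym-style helper.
	#define PULSEAUDIO_SYMBOL(sym) \
		AUD_##sym = reinterpret_cast<decltype(&sym)>(lookupSymbol(handle, #sym));
	#include "PulseAudioSymbols.h"
	#undef PULSEAUDIO_SYMBOL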
diff --git a/extern/audaspace/src/util/RingBuffer.cpp b/extern/audaspace/src/util/RingBuffer.cpp
new file mode 100644
index 00000000000..3796684aa88
--- /dev/null
+++ b/extern/audaspace/src/util/RingBuffer.cpp
@@ -0,0 +1,137 @@
+/*******************************************************************************
+ * Copyright 2009-2021 Jörg Müller
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+
+#include "util/RingBuffer.h"
+
+#include <algorithm>
+#include <cstring>
+#include <cstdlib>
+
+#define ALIGNMENT 32
+#define ALIGN(a) (a + ALIGNMENT - ((long long)a & (ALIGNMENT-1)))
+
+AUD_NAMESPACE_BEGIN
+
+RingBuffer::RingBuffer(int size) :
+ m_buffer(size),
+ m_read(0),
+ m_write(0)
+{
+}
+
+sample_t* RingBuffer::getBuffer() const
+{
+ return m_buffer.getBuffer();
+}
+
+int RingBuffer::getSize() const
+{
+ return m_buffer.getSize();
+}
+
+size_t RingBuffer::getReadSize() const
+{
+ size_t read = m_read;
+ size_t write = m_write;
+
+ if(read > write)
+ return write + getSize() - read;
+ else
+ return write - read;
+}
+
+size_t RingBuffer::getWriteSize() const
+{
+ size_t read = m_read;
+ size_t write = m_write;
+
+ if(read > write)
+ return read - write - 1;
+ else
+ return read + getSize() - write - 1;
+}
+
+size_t RingBuffer::read(data_t* target, size_t size)
+{
+ size = std::min(size, getReadSize());
+
+ data_t* buffer = reinterpret_cast<data_t*>(m_buffer.getBuffer());
+
+ if(m_read + size > m_buffer.getSize())
+ {
+ size_t read_first = m_buffer.getSize() - m_read;
+ size_t read_second = size - read_first;
+
+ std::memcpy(target, buffer + m_read, read_first);
+ std::memcpy(target + read_first, buffer, read_second);
+
+ m_read = read_second;
+ }
+ else
+ {
+ std::memcpy(target, buffer + m_read, size);
+
+ m_read += size;
+ }
+
+ return size;
+}
+
+size_t RingBuffer::write(data_t* source, size_t size)
+{
+ size = std::min(size, getWriteSize());
+
+ data_t* buffer = reinterpret_cast<data_t*>(m_buffer.getBuffer());
+
+ if(m_write + size > m_buffer.getSize())
+ {
+ size_t write_first = m_buffer.getSize() - m_write;
+ size_t write_second = size - write_first;
+
+ std::memcpy(buffer + m_write, source, write_first);
+ std::memcpy(buffer, source + write_first, write_second);
+
+ m_write = write_second;
+ }
+ else
+ {
+ std::memcpy(buffer + m_write, source, size);
+
+ m_write += size;
+ }
+
+ return size;
+}
+
+void RingBuffer::reset()
+{
+ m_read = 0;
+ m_write = 0;
+}
+
+void RingBuffer::resize(int size)
+{
+ m_buffer.resize(size);
+ reset();
+}
+
+void RingBuffer::assureSize(int size)
+{
+ m_buffer.assureSize(size);
+ reset();
+}
+
+AUD_NAMESPACE_END
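
A short usage sketch of the single-producer/single-consumer semantics implemented above. Note that getWriteSize() always keeps one byte free, so a buffer of size N holds at most N - 1 readable bytes (this assumes aud::data_t is audaspace's byte type):

	aud::RingBuffer rb(8);                            // capacity 8, at most 7 readable

	unsigned char in[5] = {1, 2, 3, 4, 5};
	unsigned char out[5] = {0};

	rb.write(reinterpret_cast<aud::data_t*>(in), 5);  // returns 5
	rb.read(reinterpret_cast<aud::data_t*>(out), 5);  // returns 5; both indices now at 5

	rb.write(reinterpret_cast<aud::data_t*>(in), 5);  // wraps: 3 bytes at the end, 2 at the start
	size_t got = rb.read(reinterpret_cast<aud::data_t*>(out), 5);  // got == 5, data intact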
diff --git a/extern/cuew/include/cuew.h b/extern/cuew/include/cuew.h
index 0fa0f1291fa..a2142b8f2ba 100644
--- a/extern/cuew/include/cuew.h
+++ b/extern/cuew/include/cuew.h
@@ -645,7 +645,8 @@ typedef enum CUdevice_P2PAttribute_enum {
CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01,
CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02,
CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03,
- CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED = 0x04,
+ CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 0x04,
+ CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04,
} CUdevice_P2PAttribute;
typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void* userData);
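
The renamed enumerator matches the CUDA driver API, where it is queried through cuDeviceGetP2PAttribute(); cuew resolves the same entry point at runtime. A minimal sketch against the driver API, error handling elided:

	CUdevice dev0, dev1;
	int p2p_cuda_array = 0;

	cuInit(0);
	cuDeviceGet(&dev0, 0);
	cuDeviceGet(&dev1, 1);

	// Can dev0 access CUDA arrays on dev1 peer-to-peer?
	cuDeviceGetP2PAttribute(&p2p_cuda_array,
	                        CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
	                        dev0, dev1);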
diff --git a/extern/json/README.blender b/extern/json/README.blender
new file mode 100644
index 00000000000..b9d8b02d87e
--- /dev/null
+++ b/extern/json/README.blender
@@ -0,0 +1,5 @@
+Project: JSON
+URL: https://github.com/nlohmann/json/
+License: MIT License
+Upstream version: 3.10.2
+Local modifications: None
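
For reference, typical use of the vendored single-header library; the include path below assumes the extern/ layout added by this commit:

	#include "json.hpp"  // extern/json/include/json.hpp

	using json = nlohmann::json;

	json j = json::parse(R"({"samples": 128, "denoise": true})");
	int samples = j["samples"].get<int>();  // 128
	j["device"] = "OPTIX";
	std::string out = j.dump(2);            // pretty-printed, 2-space indent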
diff --git a/extern/json/include/json.hpp b/extern/json/include/json.hpp
new file mode 100644
index 00000000000..8959265daea
--- /dev/null
+++ b/extern/json/include/json.hpp
@@ -0,0 +1,26640 @@
+/*
+ __ _____ _____ _____
+ __| | __| | | | JSON for Modern C++
+| | |__ | | | | | | version 3.10.2
+|_____|_____|_____|_|___| https://github.com/nlohmann/json
+
+Licensed under the MIT License <http://opensource.org/licenses/MIT>.
+SPDX-License-Identifier: MIT
+Copyright (c) 2013-2019 Niels Lohmann <http://nlohmann.me>.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef INCLUDE_NLOHMANN_JSON_HPP_
+#define INCLUDE_NLOHMANN_JSON_HPP_
+
+#define NLOHMANN_JSON_VERSION_MAJOR 3
+#define NLOHMANN_JSON_VERSION_MINOR 10
+#define NLOHMANN_JSON_VERSION_PATCH 2
+
+#include <algorithm> // all_of, find, for_each
+#include <cstddef> // nullptr_t, ptrdiff_t, size_t
+#include <functional> // hash, less
+#include <initializer_list> // initializer_list
+#ifndef JSON_NO_IO
+ #include <iosfwd> // istream, ostream
+#endif // JSON_NO_IO
+#include <iterator> // random_access_iterator_tag
+#include <memory> // unique_ptr
+#include <numeric> // accumulate
+#include <string> // string, stoi, to_string
+#include <utility> // declval, forward, move, pair, swap
+#include <vector> // vector
+
+// #include <nlohmann/adl_serializer.hpp>
+
+
+#include <type_traits>
+#include <utility>
+
+// #include <nlohmann/detail/conversions/from_json.hpp>
+
+
+#include <algorithm> // transform
+#include <array> // array
+#include <forward_list> // forward_list
+#include <iterator> // inserter, front_inserter, end
+#include <map> // map
+#include <string> // string
+#include <tuple> // tuple, make_tuple
+#include <type_traits> // is_arithmetic, is_same, is_enum, underlying_type, is_convertible
+#include <unordered_map> // unordered_map
+#include <utility> // pair, declval
+#include <valarray> // valarray
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+
+#include <exception> // exception
+#include <stdexcept> // runtime_error
+#include <string> // to_string
+#include <vector> // vector
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+#include <array> // array
+#include <cstddef> // size_t
+#include <cstdint> // uint8_t
+#include <string> // string
+
+namespace nlohmann
+{
+namespace detail
+{
+///////////////////////////
+// JSON type enumeration //
+///////////////////////////
+
+/*!
+@brief the JSON type enumeration
+
+This enumeration collects the different JSON types. It is internally used to
+distinguish the stored values, and the functions @ref basic_json::is_null(),
+@ref basic_json::is_object(), @ref basic_json::is_array(),
+@ref basic_json::is_string(), @ref basic_json::is_boolean(),
+@ref basic_json::is_number() (with @ref basic_json::is_number_integer(),
+@ref basic_json::is_number_unsigned(), and @ref basic_json::is_number_float()),
+@ref basic_json::is_discarded(), @ref basic_json::is_primitive(), and
+@ref basic_json::is_structured() rely on it.
+
+@note There are three enumeration entries (number_integer, number_unsigned, and
+number_float), because the library distinguishes these three types for numbers:
+@ref basic_json::number_unsigned_t is used for unsigned integers,
+@ref basic_json::number_integer_t is used for signed integers, and
+@ref basic_json::number_float_t is used for floating-point numbers or to
+approximate integers which do not fit in the limits of their respective type.
+
+@sa see @ref basic_json::basic_json(const value_t value_type) -- create a JSON
+value with the default value for a given type
+
+@since version 1.0.0
+*/
+enum class value_t : std::uint8_t
+{
+ null, ///< null value
+ object, ///< object (unordered set of name/value pairs)
+ array, ///< array (ordered collection of values)
+ string, ///< string value
+ boolean, ///< boolean value
+ number_integer, ///< number value (signed integer)
+ number_unsigned, ///< number value (unsigned integer)
+ number_float, ///< number value (floating-point)
+ binary, ///< binary array (ordered collection of bytes)
+ discarded ///< discarded by the parser callback function
+};
+
+/*!
+@brief comparison operator for JSON types
+
+Returns an ordering that is similar to Python:
+- order: null < boolean < number < object < array < string < binary
+- furthermore, each type is not smaller than itself
+- discarded values are not comparable
+- binary is represented as a b"" string in Python and directly comparable to a
+ string; however, making a binary array directly comparable with a string would
+ be surprising behavior in a JSON file.
+
+@since version 1.0.0
+*/
+inline bool operator<(const value_t lhs, const value_t rhs) noexcept
+{
+ static constexpr std::array<std::uint8_t, 9> order = {{
+ 0 /* null */, 3 /* object */, 4 /* array */, 5 /* string */,
+ 1 /* boolean */, 2 /* integer */, 2 /* unsigned */, 2 /* float */,
+ 6 /* binary */
+ }
+ };
+
+ const auto l_index = static_cast<std::size_t>(lhs);
+ const auto r_index = static_cast<std::size_t>(rhs);
+ return l_index < order.size() && r_index < order.size() && order[l_index] < order[r_index];
+}
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/string_escape.hpp>
+
+
+#include <string>
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+#include <utility> // pair
+// #include <nlohmann/thirdparty/hedley/hedley.hpp>
+
+
+/* Hedley - https://nemequ.github.io/hedley
+ * Created by Evan Nemerson <evan@nemerson.com>
+ *
+ * To the extent possible under law, the author(s) have dedicated all
+ * copyright and related and neighboring rights to this software to
+ * the public domain worldwide. This software is distributed without
+ * any warranty.
+ *
+ * For details, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ * SPDX-License-Identifier: CC0-1.0
+ */
+
+#if !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < 15)
+#if defined(JSON_HEDLEY_VERSION)
+ #undef JSON_HEDLEY_VERSION
+#endif
+#define JSON_HEDLEY_VERSION 15
+
+#if defined(JSON_HEDLEY_STRINGIFY_EX)
+ #undef JSON_HEDLEY_STRINGIFY_EX
+#endif
+#define JSON_HEDLEY_STRINGIFY_EX(x) #x
+
+#if defined(JSON_HEDLEY_STRINGIFY)
+ #undef JSON_HEDLEY_STRINGIFY
+#endif
+#define JSON_HEDLEY_STRINGIFY(x) JSON_HEDLEY_STRINGIFY_EX(x)
+
+#if defined(JSON_HEDLEY_CONCAT_EX)
+ #undef JSON_HEDLEY_CONCAT_EX
+#endif
+#define JSON_HEDLEY_CONCAT_EX(a,b) a##b
+
+#if defined(JSON_HEDLEY_CONCAT)
+ #undef JSON_HEDLEY_CONCAT
+#endif
+#define JSON_HEDLEY_CONCAT(a,b) JSON_HEDLEY_CONCAT_EX(a,b)
+
+#if defined(JSON_HEDLEY_CONCAT3_EX)
+ #undef JSON_HEDLEY_CONCAT3_EX
+#endif
+#define JSON_HEDLEY_CONCAT3_EX(a,b,c) a##b##c
+
+#if defined(JSON_HEDLEY_CONCAT3)
+ #undef JSON_HEDLEY_CONCAT3
+#endif
+#define JSON_HEDLEY_CONCAT3(a,b,c) JSON_HEDLEY_CONCAT3_EX(a,b,c)
+
+#if defined(JSON_HEDLEY_VERSION_ENCODE)
+ #undef JSON_HEDLEY_VERSION_ENCODE
+#endif
+#define JSON_HEDLEY_VERSION_ENCODE(major,minor,revision) (((major) * 1000000) + ((minor) * 1000) + (revision))
+
+#if defined(JSON_HEDLEY_VERSION_DECODE_MAJOR)
+ #undef JSON_HEDLEY_VERSION_DECODE_MAJOR
+#endif
+#define JSON_HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000)
+
+#if defined(JSON_HEDLEY_VERSION_DECODE_MINOR)
+ #undef JSON_HEDLEY_VERSION_DECODE_MINOR
+#endif
+#define JSON_HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000)
+
+#if defined(JSON_HEDLEY_VERSION_DECODE_REVISION)
+ #undef JSON_HEDLEY_VERSION_DECODE_REVISION
+#endif
+#define JSON_HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000)
+
+#if defined(JSON_HEDLEY_GNUC_VERSION)
+ #undef JSON_HEDLEY_GNUC_VERSION
+#endif
+#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__)
+ #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__)
+#elif defined(__GNUC__)
+ #define JSON_HEDLEY_GNUC_VERSION JSON_HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0)
+#endif
+
+#if defined(JSON_HEDLEY_GNUC_VERSION_CHECK)
+ #undef JSON_HEDLEY_GNUC_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_GNUC_VERSION)
+ #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GNUC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_MSVC_VERSION)
+ #undef JSON_HEDLEY_MSVC_VERSION
+#endif
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL)
+ #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100)
+#elif defined(_MSC_FULL_VER) && !defined(__ICL)
+ #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10)
+#elif defined(_MSC_VER) && !defined(__ICL)
+ #define JSON_HEDLEY_MSVC_VERSION JSON_HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0)
+#endif
+
+#if defined(JSON_HEDLEY_MSVC_VERSION_CHECK)
+ #undef JSON_HEDLEY_MSVC_VERSION_CHECK
+#endif
+#if !defined(JSON_HEDLEY_MSVC_VERSION)
+ #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0)
+#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
+ #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch)))
+#elif defined(_MSC_VER) && (_MSC_VER >= 1200)
+ #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch)))
+#else
+ #define JSON_HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= ((major * 100) + (minor)))
+#endif
+
+#if defined(JSON_HEDLEY_INTEL_VERSION)
+ #undef JSON_HEDLEY_INTEL_VERSION
+#endif
+#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && !defined(__ICL)
+ #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE)
+#elif defined(__INTEL_COMPILER) && !defined(__ICL)
+ #define JSON_HEDLEY_INTEL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0)
+#endif
+
+#if defined(JSON_HEDLEY_INTEL_VERSION_CHECK)
+ #undef JSON_HEDLEY_INTEL_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_INTEL_VERSION)
+ #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_INTEL_CL_VERSION)
+ #undef JSON_HEDLEY_INTEL_CL_VERSION
+#endif
+#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && defined(__ICL)
+ #define JSON_HEDLEY_INTEL_CL_VERSION JSON_HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0)
+#endif
+
+#if defined(JSON_HEDLEY_INTEL_CL_VERSION_CHECK)
+ #undef JSON_HEDLEY_INTEL_CL_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_INTEL_CL_VERSION)
+ #define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_INTEL_CL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_PGI_VERSION)
+ #undef JSON_HEDLEY_PGI_VERSION
+#endif
+#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__)
+ #define JSON_HEDLEY_PGI_VERSION JSON_HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__)
+#endif
+
+#if defined(JSON_HEDLEY_PGI_VERSION_CHECK)
+ #undef JSON_HEDLEY_PGI_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_PGI_VERSION)
+ #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PGI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_SUNPRO_VERSION)
+ #undef JSON_HEDLEY_SUNPRO_VERSION
+#endif
+#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000)
+ #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10)
+#elif defined(__SUNPRO_C)
+ #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf)
+#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000)
+ #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10)
+#elif defined(__SUNPRO_CC)
+ #define JSON_HEDLEY_SUNPRO_VERSION JSON_HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf)
+#endif
+
+#if defined(JSON_HEDLEY_SUNPRO_VERSION_CHECK)
+ #undef JSON_HEDLEY_SUNPRO_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_SUNPRO_VERSION)
+ #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_SUNPRO_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION)
+ #undef JSON_HEDLEY_EMSCRIPTEN_VERSION
+#endif
+#if defined(__EMSCRIPTEN__)
+ #define JSON_HEDLEY_EMSCRIPTEN_VERSION JSON_HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__)
+#endif
+
+#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK)
+ #undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_EMSCRIPTEN_VERSION)
+ #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_EMSCRIPTEN_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_ARM_VERSION)
+ #undef JSON_HEDLEY_ARM_VERSION
+#endif
+#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION)
+ #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100)
+#elif defined(__CC_ARM) && defined(__ARMCC_VERSION)
+ #define JSON_HEDLEY_ARM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100)
+#endif
+
+#if defined(JSON_HEDLEY_ARM_VERSION_CHECK)
+ #undef JSON_HEDLEY_ARM_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_ARM_VERSION)
+ #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_ARM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_IBM_VERSION)
+ #undef JSON_HEDLEY_IBM_VERSION
+#endif
+#if defined(__ibmxl__)
+ #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__)
+#elif defined(__xlC__) && defined(__xlC_ver__)
+ #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff)
+#elif defined(__xlC__)
+ #define JSON_HEDLEY_IBM_VERSION JSON_HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0)
+#endif
+
+#if defined(JSON_HEDLEY_IBM_VERSION_CHECK)
+ #undef JSON_HEDLEY_IBM_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_IBM_VERSION)
+ #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IBM_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_VERSION)
+ #undef JSON_HEDLEY_TI_VERSION
+#endif
+#if \
+ defined(__TI_COMPILER_VERSION__) && \
+ ( \
+ defined(__TMS470__) || defined(__TI_ARM__) || \
+ defined(__MSP430__) || \
+ defined(__TMS320C2000__) \
+ )
+#if (__TI_COMPILER_VERSION__ >= 16000000)
+ #define JSON_HEDLEY_TI_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+#endif
+
+#if defined(JSON_HEDLEY_TI_VERSION_CHECK)
+ #undef JSON_HEDLEY_TI_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_VERSION)
+ #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL2000_VERSION)
+ #undef JSON_HEDLEY_TI_CL2000_VERSION
+#endif
+#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__)
+ #define JSON_HEDLEY_TI_CL2000_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL2000_VERSION_CHECK)
+ #undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_CL2000_VERSION)
+ #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL2000_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL430_VERSION)
+ #undef JSON_HEDLEY_TI_CL430_VERSION
+#endif
+#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__)
+ #define JSON_HEDLEY_TI_CL430_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL430_VERSION_CHECK)
+ #undef JSON_HEDLEY_TI_CL430_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_CL430_VERSION)
+ #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL430_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_ARMCL_VERSION)
+ #undef JSON_HEDLEY_TI_ARMCL_VERSION
+#endif
+#if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__))
+ #define JSON_HEDLEY_TI_ARMCL_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+
+#if defined(JSON_HEDLEY_TI_ARMCL_VERSION_CHECK)
+ #undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_ARMCL_VERSION)
+ #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_ARMCL_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL6X_VERSION)
+ #undef JSON_HEDLEY_TI_CL6X_VERSION
+#endif
+#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__)
+ #define JSON_HEDLEY_TI_CL6X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL6X_VERSION_CHECK)
+ #undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_CL6X_VERSION)
+ #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL6X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL7X_VERSION)
+ #undef JSON_HEDLEY_TI_CL7X_VERSION
+#endif
+#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__)
+ #define JSON_HEDLEY_TI_CL7X_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+
+#if defined(JSON_HEDLEY_TI_CL7X_VERSION_CHECK)
+ #undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_CL7X_VERSION)
+ #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CL7X_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TI_CLPRU_VERSION)
+ #undef JSON_HEDLEY_TI_CLPRU_VERSION
+#endif
+#if defined(__TI_COMPILER_VERSION__) && defined(__PRU__)
+ #define JSON_HEDLEY_TI_CLPRU_VERSION JSON_HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000))
+#endif
+
+#if defined(JSON_HEDLEY_TI_CLPRU_VERSION_CHECK)
+ #undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TI_CLPRU_VERSION)
+ #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TI_CLPRU_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_CRAY_VERSION)
+ #undef JSON_HEDLEY_CRAY_VERSION
+#endif
+#if defined(_CRAYC)
+ #if defined(_RELEASE_PATCHLEVEL)
+ #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL)
+ #else
+ #define JSON_HEDLEY_CRAY_VERSION JSON_HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0)
+ #endif
+#endif
+
+#if defined(JSON_HEDLEY_CRAY_VERSION_CHECK)
+ #undef JSON_HEDLEY_CRAY_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_CRAY_VERSION)
+ #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_CRAY_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_IAR_VERSION)
+ #undef JSON_HEDLEY_IAR_VERSION
+#endif
+#if defined(__IAR_SYSTEMS_ICC__)
+ #if __VER__ > 1000
+ #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000))
+ #else
+ #define JSON_HEDLEY_IAR_VERSION JSON_HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0)
+ #endif
+#endif
+
+#if defined(JSON_HEDLEY_IAR_VERSION_CHECK)
+ #undef JSON_HEDLEY_IAR_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_IAR_VERSION)
+ #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_IAR_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_TINYC_VERSION)
+ #undef JSON_HEDLEY_TINYC_VERSION
+#endif
+#if defined(__TINYC__)
+ #define JSON_HEDLEY_TINYC_VERSION JSON_HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100)
+#endif
+
+#if defined(JSON_HEDLEY_TINYC_VERSION_CHECK)
+ #undef JSON_HEDLEY_TINYC_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_TINYC_VERSION)
+ #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_TINYC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_DMC_VERSION)
+ #undef JSON_HEDLEY_DMC_VERSION
+#endif
+#if defined(__DMC__)
+ #define JSON_HEDLEY_DMC_VERSION JSON_HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf)
+#endif
+
+#if defined(JSON_HEDLEY_DMC_VERSION_CHECK)
+ #undef JSON_HEDLEY_DMC_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_DMC_VERSION)
+ #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_DMC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_COMPCERT_VERSION)
+ #undef JSON_HEDLEY_COMPCERT_VERSION
+#endif
+#if defined(__COMPCERT_VERSION__)
+ #define JSON_HEDLEY_COMPCERT_VERSION JSON_HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100)
+#endif
+
+#if defined(JSON_HEDLEY_COMPCERT_VERSION_CHECK)
+ #undef JSON_HEDLEY_COMPCERT_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_COMPCERT_VERSION)
+ #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_COMPCERT_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_PELLES_VERSION)
+ #undef JSON_HEDLEY_PELLES_VERSION
+#endif
+#if defined(__POCC__)
+ #define JSON_HEDLEY_PELLES_VERSION JSON_HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0)
+#endif
+
+#if defined(JSON_HEDLEY_PELLES_VERSION_CHECK)
+ #undef JSON_HEDLEY_PELLES_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_PELLES_VERSION)
+ #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_PELLES_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_MCST_LCC_VERSION)
+ #undef JSON_HEDLEY_MCST_LCC_VERSION
+#endif
+#if defined(__LCC__) && defined(__LCC_MINOR__)
+ #define JSON_HEDLEY_MCST_LCC_VERSION JSON_HEDLEY_VERSION_ENCODE(__LCC__ / 100, __LCC__ % 100, __LCC_MINOR__)
+#endif
+
+#if defined(JSON_HEDLEY_MCST_LCC_VERSION_CHECK)
+ #undef JSON_HEDLEY_MCST_LCC_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_MCST_LCC_VERSION)
+ #define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_MCST_LCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_GCC_VERSION)
+ #undef JSON_HEDLEY_GCC_VERSION
+#endif
+#if \
+ defined(JSON_HEDLEY_GNUC_VERSION) && \
+ !defined(__clang__) && \
+ !defined(JSON_HEDLEY_INTEL_VERSION) && \
+ !defined(JSON_HEDLEY_PGI_VERSION) && \
+ !defined(JSON_HEDLEY_ARM_VERSION) && \
+ !defined(JSON_HEDLEY_CRAY_VERSION) && \
+ !defined(JSON_HEDLEY_TI_VERSION) && \
+ !defined(JSON_HEDLEY_TI_ARMCL_VERSION) && \
+ !defined(JSON_HEDLEY_TI_CL430_VERSION) && \
+ !defined(JSON_HEDLEY_TI_CL2000_VERSION) && \
+ !defined(JSON_HEDLEY_TI_CL6X_VERSION) && \
+ !defined(JSON_HEDLEY_TI_CL7X_VERSION) && \
+ !defined(JSON_HEDLEY_TI_CLPRU_VERSION) && \
+ !defined(__COMPCERT__) && \
+ !defined(JSON_HEDLEY_MCST_LCC_VERSION)
+ #define JSON_HEDLEY_GCC_VERSION JSON_HEDLEY_GNUC_VERSION
+#endif
+
+#if defined(JSON_HEDLEY_GCC_VERSION_CHECK)
+ #undef JSON_HEDLEY_GCC_VERSION_CHECK
+#endif
+#if defined(JSON_HEDLEY_GCC_VERSION)
+ #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (JSON_HEDLEY_GCC_VERSION >= JSON_HEDLEY_VERSION_ENCODE(major, minor, patch))
+#else
+ #define JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0)
+#endif
+
+#if defined(JSON_HEDLEY_HAS_ATTRIBUTE)
+ #undef JSON_HEDLEY_HAS_ATTRIBUTE
+#endif
+#if \
+ defined(__has_attribute) && \
+ ( \
+ (!defined(JSON_HEDLEY_IAR_VERSION) || JSON_HEDLEY_IAR_VERSION_CHECK(8,5,9)) \
+ )
+# define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute)
+#else
+# define JSON_HEDLEY_HAS_ATTRIBUTE(attribute) (0)
+#endif
+
+#if defined(JSON_HEDLEY_GNUC_HAS_ATTRIBUTE)
+ #undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE
+#endif
+#if defined(__has_attribute)
+ #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute)
+#else
+ #define JSON_HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_GCC_HAS_ATTRIBUTE)
+ #undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE
+#endif
+#if defined(__has_attribute)
+ #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_HAS_ATTRIBUTE(attribute)
+#else
+ #define JSON_HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE)
+ #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE
+#endif
+#if \
+ defined(__has_cpp_attribute) && \
+ defined(__cplusplus) && \
+ (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0))
+ #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute)
+#else
+ #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0)
+#endif
+
+#if defined(JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS)
+ #undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS
+#endif
+#if !defined(__cplusplus) || !defined(__has_cpp_attribute)
+ #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0)
+#elif \
+ !defined(JSON_HEDLEY_PGI_VERSION) && \
+ !defined(JSON_HEDLEY_IAR_VERSION) && \
+ (!defined(JSON_HEDLEY_SUNPRO_VERSION) || JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \
+ (!defined(JSON_HEDLEY_MSVC_VERSION) || JSON_HEDLEY_MSVC_VERSION_CHECK(19,20,0))
+ #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute)
+#else
+ #define JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0)
+#endif
+
+#if defined(JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE)
+ #undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE
+#endif
+#if defined(__has_cpp_attribute) && defined(__cplusplus)
+ #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute)
+#else
+ #define JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE)
+ #undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE
+#endif
+#if defined(__has_cpp_attribute) && defined(__cplusplus)
+ #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute)
+#else
+ #define JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_HAS_BUILTIN)
+ #undef JSON_HEDLEY_HAS_BUILTIN
+#endif
+#if defined(__has_builtin)
+ #define JSON_HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin)
+#else
+ #define JSON_HEDLEY_HAS_BUILTIN(builtin) (0)
+#endif
+
+#if defined(JSON_HEDLEY_GNUC_HAS_BUILTIN)
+ #undef JSON_HEDLEY_GNUC_HAS_BUILTIN
+#endif
+#if defined(__has_builtin)
+ #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin)
+#else
+ #define JSON_HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_GCC_HAS_BUILTIN)
+ #undef JSON_HEDLEY_GCC_HAS_BUILTIN
+#endif
+#if defined(__has_builtin)
+ #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin)
+#else
+ #define JSON_HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_HAS_FEATURE)
+ #undef JSON_HEDLEY_HAS_FEATURE
+#endif
+#if defined(__has_feature)
+ #define JSON_HEDLEY_HAS_FEATURE(feature) __has_feature(feature)
+#else
+ #define JSON_HEDLEY_HAS_FEATURE(feature) (0)
+#endif
+
+#if defined(JSON_HEDLEY_GNUC_HAS_FEATURE)
+ #undef JSON_HEDLEY_GNUC_HAS_FEATURE
+#endif
+#if defined(__has_feature)
+ #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature)
+#else
+ #define JSON_HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_GCC_HAS_FEATURE)
+ #undef JSON_HEDLEY_GCC_HAS_FEATURE
+#endif
+#if defined(__has_feature)
+ #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature)
+#else
+ #define JSON_HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_HAS_EXTENSION)
+ #undef JSON_HEDLEY_HAS_EXTENSION
+#endif
+#if defined(__has_extension)
+ #define JSON_HEDLEY_HAS_EXTENSION(extension) __has_extension(extension)
+#else
+ #define JSON_HEDLEY_HAS_EXTENSION(extension) (0)
+#endif
+
+#if defined(JSON_HEDLEY_GNUC_HAS_EXTENSION)
+ #undef JSON_HEDLEY_GNUC_HAS_EXTENSION
+#endif
+#if defined(__has_extension)
+ #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension)
+#else
+ #define JSON_HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_GCC_HAS_EXTENSION)
+ #undef JSON_HEDLEY_GCC_HAS_EXTENSION
+#endif
+#if defined(__has_extension)
+ #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension)
+#else
+ #define JSON_HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE)
+ #undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE
+#endif
+#if defined(__has_declspec_attribute)
+ #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute)
+#else
+ #define JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0)
+#endif
+
+#if defined(JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE)
+ #undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE
+#endif
+#if defined(__has_declspec_attribute)
+ #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute)
+#else
+ #define JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE)
+ #undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE
+#endif
+#if defined(__has_declspec_attribute)
+ #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute)
+#else
+ #define JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_HAS_WARNING)
+ #undef JSON_HEDLEY_HAS_WARNING
+#endif
+#if defined(__has_warning)
+ #define JSON_HEDLEY_HAS_WARNING(warning) __has_warning(warning)
+#else
+ #define JSON_HEDLEY_HAS_WARNING(warning) (0)
+#endif
+
+#if defined(JSON_HEDLEY_GNUC_HAS_WARNING)
+ #undef JSON_HEDLEY_GNUC_HAS_WARNING
+#endif
+#if defined(__has_warning)
+ #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning)
+#else
+ #define JSON_HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GNUC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_GCC_HAS_WARNING)
+ #undef JSON_HEDLEY_GCC_HAS_WARNING
+#endif
+#if defined(__has_warning)
+ #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning)
+#else
+ #define JSON_HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if \
+ (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
+ defined(__clang__) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \
+ JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+ JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+ JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \
+ JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \
+ JSON_HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \
+ (JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR))
+ #define JSON_HEDLEY_PRAGMA(value) _Pragma(#value)
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0)
+ #define JSON_HEDLEY_PRAGMA(value) __pragma(value)
+#else
+ #define JSON_HEDLEY_PRAGMA(value)
+#endif
+
+#if defined(JSON_HEDLEY_DIAGNOSTIC_PUSH)
+ #undef JSON_HEDLEY_DIAGNOSTIC_PUSH
+#endif
+#if defined(JSON_HEDLEY_DIAGNOSTIC_POP)
+ #undef JSON_HEDLEY_DIAGNOSTIC_POP
+#endif
+#if defined(__clang__)
+ #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
+ #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop")
+#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
+ #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
+#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
+ #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
+#elif \
+ JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push))
+ #define JSON_HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop))
+#elif JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("push")
+ #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("pop")
+#elif \
+ JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push")
+ #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop")
+#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
+ #define JSON_HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
+#else
+ #define JSON_HEDLEY_DIAGNOSTIC_PUSH
+ #define JSON_HEDLEY_DIAGNOSTIC_POP
+#endif
+
+/* JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for
+ HEDLEY INTERNAL USE ONLY. API subject to change without notice. */
+#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_)
+ #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_
+#endif
+#if defined(__cplusplus)
+# if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat")
+# if JSON_HEDLEY_HAS_WARNING("-Wc++17-extensions")
+# if JSON_HEDLEY_HAS_WARNING("-Wc++1z-extensions")
+# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
+ JSON_HEDLEY_DIAGNOSTIC_PUSH \
+ _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
+ _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \
+ _Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \
+ xpr \
+ JSON_HEDLEY_DIAGNOSTIC_POP
+# else
+# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
+ JSON_HEDLEY_DIAGNOSTIC_PUSH \
+ _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
+ _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \
+ xpr \
+ JSON_HEDLEY_DIAGNOSTIC_POP
+# endif
+# else
+# define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \
+ JSON_HEDLEY_DIAGNOSTIC_PUSH \
+ _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \
+ xpr \
+ JSON_HEDLEY_DIAGNOSTIC_POP
+# endif
+# endif
+#endif
+#if !defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x
+#endif
+
+#if defined(JSON_HEDLEY_CONST_CAST)
+ #undef JSON_HEDLEY_CONST_CAST
+#endif
+#if defined(__cplusplus)
+# define JSON_HEDLEY_CONST_CAST(T, expr) (const_cast<T>(expr))
+#elif \
+ JSON_HEDLEY_HAS_WARNING("-Wcast-qual") || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+# define JSON_HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \
+ JSON_HEDLEY_DIAGNOSTIC_PUSH \
+ JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
+ ((T) (expr)); \
+ JSON_HEDLEY_DIAGNOSTIC_POP \
+ }))
+#else
+# define JSON_HEDLEY_CONST_CAST(T, expr) ((T) (expr))
+#endif
+
+#if defined(JSON_HEDLEY_REINTERPRET_CAST)
+ #undef JSON_HEDLEY_REINTERPRET_CAST
+#endif
+#if defined(__cplusplus)
+ #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast<T>(expr))
+#else
+ #define JSON_HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr))
+#endif
+
+#if defined(JSON_HEDLEY_STATIC_CAST)
+ #undef JSON_HEDLEY_STATIC_CAST
+#endif
+#if defined(__cplusplus)
+ #define JSON_HEDLEY_STATIC_CAST(T, expr) (static_cast<T>(expr))
+#else
+ #define JSON_HEDLEY_STATIC_CAST(T, expr) ((T) (expr))
+#endif
+
+#if defined(JSON_HEDLEY_CPP_CAST)
+ #undef JSON_HEDLEY_CPP_CAST
+#endif
+#if defined(__cplusplus)
+# if JSON_HEDLEY_HAS_WARNING("-Wold-style-cast")
+# define JSON_HEDLEY_CPP_CAST(T, expr) \
+ JSON_HEDLEY_DIAGNOSTIC_PUSH \
+ _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \
+ ((T) (expr)) \
+ JSON_HEDLEY_DIAGNOSTIC_POP
+# elif JSON_HEDLEY_IAR_VERSION_CHECK(8,3,0)
+# define JSON_HEDLEY_CPP_CAST(T, expr) \
+ JSON_HEDLEY_DIAGNOSTIC_PUSH \
+ _Pragma("diag_suppress=Pe137") \
+ JSON_HEDLEY_DIAGNOSTIC_POP
+# else
+# define JSON_HEDLEY_CPP_CAST(T, expr) ((T) (expr))
+# endif
+#else
+# define JSON_HEDLEY_CPP_CAST(T, expr) (expr)
+#endif
+
+#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED)
+ #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wdeprecated-declarations")
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"")
+#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)")
+#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:1478 1786))
+#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1216,1444,1445")
+#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444")
+#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996))
+#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444")
+#elif \
+ JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+ (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+ (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+ (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+ (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718")
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)")
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)")
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215")
+#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,90,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)")
+#else
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
+#endif
+
+#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS)
+ #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas")
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"")
+#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)")
+#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:161))
+#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675")
+#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"")
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068))
+#elif \
+ JSON_HEDLEY_TI_VERSION_CHECK(16,9,0) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163")
+#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163")
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161")
+#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 161")
+#else
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
+#endif
+
+#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES)
+ #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wunknown-attributes")
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"")
+#elif JSON_HEDLEY_GCC_VERSION_CHECK(4,6,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+#elif JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)")
+#elif JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:1292))
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030))
+#elif JSON_HEDLEY_PGI_VERSION_CHECK(20,7,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097,1098")
+#elif JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097")
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)")
+#elif \
+ JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173")
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097")
+#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097")
+#else
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
+#endif
+
+#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL)
+ #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wcast-qual")
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("clang diagnostic ignored \"-Wcast-qual\"")
+#elif JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("warning(disable:2203 2331)")
+#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("GCC diagnostic ignored \"-Wcast-qual\"")
+#else
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
+#endif
+
+#if defined(JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION)
+ #undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wunused-function")
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("clang diagnostic ignored \"-Wunused-function\"")
+#elif JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("GCC diagnostic ignored \"-Wunused-function\"")
+#elif JSON_HEDLEY_MSVC_VERSION_CHECK(1,0,0)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION __pragma(warning(disable:4505))
+#elif JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("diag_suppress 3142")
+#else
+ #define JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
+#endif
+
+#if defined(JSON_HEDLEY_DEPRECATED)
+ #undef JSON_HEDLEY_DEPRECATED
+#endif
+#if defined(JSON_HEDLEY_DEPRECATED_FOR)
+ #undef JSON_HEDLEY_DEPRECATED_FOR
+#endif
+#if \
+ JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+ #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since))
+ #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement))
+#elif \
+ (JSON_HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) && !defined(JSON_HEDLEY_IAR_VERSION)) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \
+ JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \
+ JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
+ JSON_HEDLEY_TI_VERSION_CHECK(18,1,0) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since)))
+ #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement)))
+#elif defined(__cplusplus) && (__cplusplus >= 201402L)
+ #define JSON_HEDLEY_DEPRECATED(since) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]])
+ #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]])
+#elif \
+ JSON_HEDLEY_HAS_ATTRIBUTE(deprecated) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+ JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+ (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+ (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+ (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+ (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
+ JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0)
+ #define JSON_HEDLEY_DEPRECATED(since) __attribute__((__deprecated__))
+ #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__))
+#elif \
+ JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
+ JSON_HEDLEY_PELLES_VERSION_CHECK(6,50,0) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+ #define JSON_HEDLEY_DEPRECATED(since) __declspec(deprecated)
+ #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated)
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+ #define JSON_HEDLEY_DEPRECATED(since) _Pragma("deprecated")
+ #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated")
+#else
+ #define JSON_HEDLEY_DEPRECATED(since)
+ #define JSON_HEDLEY_DEPRECATED_FOR(since, replacement)
+#endif
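+/* Usage sketch (illustrative, not from the upstream header): marking a
+   hypothetical function as deprecated since release 1.2 in favour of a
+   replacement:
+
+       JSON_HEDLEY_DEPRECATED_FOR(1.2, parse_v2)
+       void parse_v1(const char *s);
+
+   Where messages are supported, each call site then warns with
+   "Since 1.2; use parse_v2". */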
+
+#if defined(JSON_HEDLEY_UNAVAILABLE)
+ #undef JSON_HEDLEY_UNAVAILABLE
+#endif
+#if \
+ JSON_HEDLEY_HAS_ATTRIBUTE(warning) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(4,3,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since)))
+#else
+ #define JSON_HEDLEY_UNAVAILABLE(available_since)
+#endif
+
+#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT)
+ #undef JSON_HEDLEY_WARN_UNUSED_RESULT
+#endif
+#if defined(JSON_HEDLEY_WARN_UNUSED_RESULT_MSG)
+ #undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG
+#endif
+#if \
+ JSON_HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+ (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+ (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+ (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+ (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+ (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \
+ JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
+ #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__))
+#elif (JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L)
+ #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
+ #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]])
+#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard)
+ #define JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
+ #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
+#elif defined(_Check_return_) /* SAL */
+ #define JSON_HEDLEY_WARN_UNUSED_RESULT _Check_return_
+ #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_
+#else
+ #define JSON_HEDLEY_WARN_UNUSED_RESULT
+ #define JSON_HEDLEY_WARN_UNUSED_RESULT_MSG(msg)
+#endif
+
+#if defined(JSON_HEDLEY_SENTINEL)
+ #undef JSON_HEDLEY_SENTINEL
+#endif
+#if \
+ JSON_HEDLEY_HAS_ATTRIBUTE(sentinel) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position)))
+#else
+ #define JSON_HEDLEY_SENTINEL(position)
+#endif
+
+#if defined(JSON_HEDLEY_NO_RETURN)
+ #undef JSON_HEDLEY_NO_RETURN
+#endif
+#if JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+ #define JSON_HEDLEY_NO_RETURN __noreturn
+#elif \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__))
+#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+ #define JSON_HEDLEY_NO_RETURN _Noreturn
+#elif defined(__cplusplus) && (__cplusplus >= 201103L)
+ #define JSON_HEDLEY_NO_RETURN JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]])
+#elif \
+ JSON_HEDLEY_HAS_ATTRIBUTE(noreturn) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(3,2,0) || \
+ JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+ JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+ (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+ (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+ (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+ (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+ JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0)
+ #define JSON_HEDLEY_NO_RETURN __attribute__((__noreturn__))
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
+ #define JSON_HEDLEY_NO_RETURN _Pragma("does_not_return")
+#elif \
+ JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+ #define JSON_HEDLEY_NO_RETURN __declspec(noreturn)
+#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus)
+ #define JSON_HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;")
+#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0)
+ #define JSON_HEDLEY_NO_RETURN __attribute((noreturn))
+#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0)
+ #define JSON_HEDLEY_NO_RETURN __declspec(noreturn)
+#else
+ #define JSON_HEDLEY_NO_RETURN
+#endif
+
+#if defined(JSON_HEDLEY_NO_ESCAPE)
+ #undef JSON_HEDLEY_NO_ESCAPE
+#endif
+#if JSON_HEDLEY_HAS_ATTRIBUTE(noescape)
+ #define JSON_HEDLEY_NO_ESCAPE __attribute__((__noescape__))
+#else
+ #define JSON_HEDLEY_NO_ESCAPE
+#endif
+
+#if defined(JSON_HEDLEY_UNREACHABLE)
+ #undef JSON_HEDLEY_UNREACHABLE
+#endif
+#if defined(JSON_HEDLEY_UNREACHABLE_RETURN)
+ #undef JSON_HEDLEY_UNREACHABLE_RETURN
+#endif
+#if defined(JSON_HEDLEY_ASSUME)
+ #undef JSON_HEDLEY_ASSUME
+#endif
+#if \
+ JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+ #define JSON_HEDLEY_ASSUME(expr) __assume(expr)
+#elif JSON_HEDLEY_HAS_BUILTIN(__builtin_assume)
+ #define JSON_HEDLEY_ASSUME(expr) __builtin_assume(expr)
+#elif \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0)
+ #if defined(__cplusplus)
+ #define JSON_HEDLEY_ASSUME(expr) std::_nassert(expr)
+ #else
+ #define JSON_HEDLEY_ASSUME(expr) _nassert(expr)
+ #endif
+#endif
+#if \
+ (JSON_HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(JSON_HEDLEY_ARM_VERSION))) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(4,5,0) || \
+ JSON_HEDLEY_PGI_VERSION_CHECK(18,10,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(13,1,5) || \
+ JSON_HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_UNREACHABLE() __builtin_unreachable()
+#elif defined(JSON_HEDLEY_ASSUME)
+ #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0)
+#endif
+#if !defined(JSON_HEDLEY_ASSUME)
+ #if defined(JSON_HEDLEY_UNREACHABLE)
+ #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, ((expr) ? 1 : (JSON_HEDLEY_UNREACHABLE(), 1)))
+ #else
+ #define JSON_HEDLEY_ASSUME(expr) JSON_HEDLEY_STATIC_CAST(void, expr)
+ #endif
+#endif
+#if defined(JSON_HEDLEY_UNREACHABLE)
+ #if \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0)
+ #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (JSON_HEDLEY_STATIC_CAST(void, JSON_HEDLEY_ASSUME(0)), (value))
+ #else
+ #define JSON_HEDLEY_UNREACHABLE_RETURN(value) JSON_HEDLEY_UNREACHABLE()
+ #endif
+#else
+ #define JSON_HEDLEY_UNREACHABLE_RETURN(value) return (value)
+#endif
+#if !defined(JSON_HEDLEY_UNREACHABLE)
+ #define JSON_HEDLEY_UNREACHABLE() JSON_HEDLEY_ASSUME(0)
+#endif
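+/* Semantics sketch (illustrative): JSON_HEDLEY_ASSUME(expr) promises the
+   optimizer that `expr` is true, and JSON_HEDLEY_UNREACHABLE() marks code
+   that is never executed; as the fallbacks above show, each can be derived
+   from the other. Typical use in a hypothetical exhaustive switch:
+
+       switch (kind) {
+           case KIND_A: return 1;
+           case KIND_B: return 2;
+           default: JSON_HEDLEY_UNREACHABLE();
+       }
+
+   Reaching UNREACHABLE(), or passing a false expression to ASSUME, is
+   undefined behavior. */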
+
+JSON_HEDLEY_DIAGNOSTIC_PUSH
+#if JSON_HEDLEY_HAS_WARNING("-Wpedantic")
+ #pragma clang diagnostic ignored "-Wpedantic"
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus)
+ #pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
+#endif
+#if JSON_HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0)
+ #if defined(__clang__)
+ #pragma clang diagnostic ignored "-Wvariadic-macros"
+ #elif defined(JSON_HEDLEY_GCC_VERSION)
+ #pragma GCC diagnostic ignored "-Wvariadic-macros"
+ #endif
+#endif
+#if defined(JSON_HEDLEY_NON_NULL)
+ #undef JSON_HEDLEY_NON_NULL
+#endif
+#if \
+ JSON_HEDLEY_HAS_ATTRIBUTE(nonnull) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0)
+ #define JSON_HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__)))
+#else
+ #define JSON_HEDLEY_NON_NULL(...)
+#endif
+JSON_HEDLEY_DIAGNOSTIC_POP
+
+#if defined(JSON_HEDLEY_PRINTF_FORMAT)
+ #undef JSON_HEDLEY_PRINTF_FORMAT
+#endif
+#if defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO)
+ #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check)))
+#elif defined(__MINGW32__) && JSON_HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO)
+ #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check)))
+#elif \
+ JSON_HEDLEY_HAS_ATTRIBUTE(format) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(5,6,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+ JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+ (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+ (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+ (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+ (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check)))
+#elif JSON_HEDLEY_PELLES_VERSION_CHECK(6,0,0)
+ #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check))
+#else
+ #define JSON_HEDLEY_PRINTF_FORMAT(string_idx,first_to_check)
+#endif
+
+#if defined(JSON_HEDLEY_CONSTEXPR)
+ #undef JSON_HEDLEY_CONSTEXPR
+#endif
+#if defined(__cplusplus)
+ #if __cplusplus >= 201103L
+ #define JSON_HEDLEY_CONSTEXPR JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr)
+ #endif
+#endif
+#if !defined(JSON_HEDLEY_CONSTEXPR)
+ #define JSON_HEDLEY_CONSTEXPR
+#endif
+
+#if defined(JSON_HEDLEY_PREDICT)
+ #undef JSON_HEDLEY_PREDICT
+#endif
+#if defined(JSON_HEDLEY_LIKELY)
+ #undef JSON_HEDLEY_LIKELY
+#endif
+#if defined(JSON_HEDLEY_UNLIKELY)
+ #undef JSON_HEDLEY_UNLIKELY
+#endif
+#if defined(JSON_HEDLEY_UNPREDICTABLE)
+ #undef JSON_HEDLEY_UNPREDICTABLE
+#endif
+#if JSON_HEDLEY_HAS_BUILTIN(__builtin_unpredictable)
+ #define JSON_HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr))
+#endif
+#if \
+ (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && !defined(JSON_HEDLEY_PGI_VERSION)) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(9,0,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+# define JSON_HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability( (expr), (value), (probability))
+# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) __builtin_expect_with_probability(!!(expr), 1 , (probability))
+# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0 , (probability))
+# define JSON_HEDLEY_LIKELY(expr) __builtin_expect (!!(expr), 1 )
+# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect (!!(expr), 0 )
+#elif \
+ (JSON_HEDLEY_HAS_BUILTIN(__builtin_expect) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(3,0,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+ JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+ JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \
+ JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+# define JSON_HEDLEY_PREDICT(expr, expected, probability) \
+ (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (JSON_HEDLEY_STATIC_CAST(void, expected), (expr)))
+# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) \
+ (__extension__ ({ \
+ double hedley_probability_ = (probability); \
+ ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \
+ }))
+# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) \
+ (__extension__ ({ \
+ double hedley_probability_ = (probability); \
+ ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \
+ }))
+# define JSON_HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1)
+# define JSON_HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#else
+# define JSON_HEDLEY_PREDICT(expr, expected, probability) (JSON_HEDLEY_STATIC_CAST(void, expected), (expr))
+# define JSON_HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr))
+# define JSON_HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr))
+# define JSON_HEDLEY_LIKELY(expr) (!!(expr))
+# define JSON_HEDLEY_UNLIKELY(expr) (!!(expr))
+#endif
+#if !defined(JSON_HEDLEY_UNPREDICTABLE)
+ #define JSON_HEDLEY_UNPREDICTABLE(expr) JSON_HEDLEY_PREDICT(expr, 1, 0.5)
+#endif
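+/* Usage sketch (illustrative, hypothetical variables): branch hints with and
+   without an explicit probability:
+
+       if (JSON_HEDLEY_LIKELY(ptr != NULL)) { use(ptr); }
+       if (JSON_HEDLEY_PREDICT_FALSE(cache_miss, 0.98)) { refill(); }
+
+   The second line states that `cache_miss` is false ~98% of the time. Note
+   the 0.9/0.1 cutoffs in the fallback above: plain __builtin_expect is only
+   applied when the stated probability is strongly one-sided. */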
+
+#if defined(JSON_HEDLEY_MALLOC)
+ #undef JSON_HEDLEY_MALLOC
+#endif
+#if \
+ JSON_HEDLEY_HAS_ATTRIBUTE(malloc) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \
+ JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+ (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+ (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+ (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+ (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_MALLOC __attribute__((__malloc__))
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
+ #define JSON_HEDLEY_MALLOC _Pragma("returns_new_memory")
+#elif \
+ JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+ #define JSON_HEDLEY_MALLOC __declspec(restrict)
+#else
+ #define JSON_HEDLEY_MALLOC
+#endif
+
+#if defined(JSON_HEDLEY_PURE)
+ #undef JSON_HEDLEY_PURE
+#endif
+#if \
+ JSON_HEDLEY_HAS_ATTRIBUTE(pure) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(2,96,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+ JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+ (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+ (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+ (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+ (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+ JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+# define JSON_HEDLEY_PURE __attribute__((__pure__))
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
+# define JSON_HEDLEY_PURE _Pragma("does_not_write_global_data")
+#elif defined(__cplusplus) && \
+ ( \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \
+ )
+# define JSON_HEDLEY_PURE _Pragma("FUNC_IS_PURE;")
+#else
+# define JSON_HEDLEY_PURE
+#endif
+
+#if defined(JSON_HEDLEY_CONST)
+ #undef JSON_HEDLEY_CONST
+#endif
+#if \
+ JSON_HEDLEY_HAS_ATTRIBUTE(const) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(2,5,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+ JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+ (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+ (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+ (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+ (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+ JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_CONST __attribute__((__const__))
+#elif \
+ JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0)
+ #define JSON_HEDLEY_CONST _Pragma("no_side_effect")
+#else
+ #define JSON_HEDLEY_CONST JSON_HEDLEY_PURE
+#endif
+
+#if defined(JSON_HEDLEY_RESTRICT)
+ #undef JSON_HEDLEY_RESTRICT
+#endif
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus)
+ #define JSON_HEDLEY_RESTRICT restrict
+#elif \
+ JSON_HEDLEY_GCC_VERSION_CHECK(3,1,0) || \
+ JSON_HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+ JSON_HEDLEY_PGI_VERSION_CHECK(17,10,0) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \
+ JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0) || \
+ defined(__clang__) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_RESTRICT __restrict
+#elif JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus)
+ #define JSON_HEDLEY_RESTRICT _Restrict
+#else
+ #define JSON_HEDLEY_RESTRICT
+#endif
+
+#if defined(JSON_HEDLEY_INLINE)
+ #undef JSON_HEDLEY_INLINE
+#endif
+#if \
+ (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
+ (defined(__cplusplus) && (__cplusplus >= 199711L))
+ #define JSON_HEDLEY_INLINE inline
+#elif \
+ defined(JSON_HEDLEY_GCC_VERSION) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(6,2,0)
+ #define JSON_HEDLEY_INLINE __inline__
+#elif \
+ JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_INLINE __inline
+#else
+ #define JSON_HEDLEY_INLINE
+#endif
+
+#if defined(JSON_HEDLEY_ALWAYS_INLINE)
+ #undef JSON_HEDLEY_ALWAYS_INLINE
+#endif
+#if \
+ JSON_HEDLEY_HAS_ATTRIBUTE(always_inline) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+ JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+ (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+ (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+ (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+ (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
+ JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0)
+# define JSON_HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) JSON_HEDLEY_INLINE
+#elif \
+ JSON_HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+# define JSON_HEDLEY_ALWAYS_INLINE __forceinline
+#elif defined(__cplusplus) && \
+ ( \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \
+ )
+# define JSON_HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;")
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+# define JSON_HEDLEY_ALWAYS_INLINE _Pragma("inline=forced")
+#else
+# define JSON_HEDLEY_ALWAYS_INLINE JSON_HEDLEY_INLINE
+#endif
+
+#if defined(JSON_HEDLEY_NEVER_INLINE)
+ #undef JSON_HEDLEY_NEVER_INLINE
+#endif
+#if \
+ JSON_HEDLEY_HAS_ATTRIBUTE(noinline) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(4,0,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(10,1,0) || \
+ JSON_HEDLEY_TI_VERSION_CHECK(15,12,0) || \
+ (JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \
+ (JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \
+ (JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \
+ (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \
+ JSON_HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \
+ JSON_HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \
+ JSON_HEDLEY_IAR_VERSION_CHECK(8,10,0)
+ #define JSON_HEDLEY_NEVER_INLINE __attribute__((__noinline__))
+#elif \
+ JSON_HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+ #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline)
+#elif JSON_HEDLEY_PGI_VERSION_CHECK(10,2,0)
+ #define JSON_HEDLEY_NEVER_INLINE _Pragma("noinline")
+#elif JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus)
+ #define JSON_HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;")
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+ #define JSON_HEDLEY_NEVER_INLINE _Pragma("inline=never")
+#elif JSON_HEDLEY_COMPCERT_VERSION_CHECK(3,2,0)
+ #define JSON_HEDLEY_NEVER_INLINE __attribute((noinline))
+#elif JSON_HEDLEY_PELLES_VERSION_CHECK(9,0,0)
+ #define JSON_HEDLEY_NEVER_INLINE __declspec(noinline)
+#else
+ #define JSON_HEDLEY_NEVER_INLINE
+#endif
+
+#if defined(JSON_HEDLEY_PRIVATE)
+ #undef JSON_HEDLEY_PRIVATE
+#endif
+#if defined(JSON_HEDLEY_PUBLIC)
+ #undef JSON_HEDLEY_PUBLIC
+#endif
+#if defined(JSON_HEDLEY_IMPORT)
+ #undef JSON_HEDLEY_IMPORT
+#endif
+#if defined(_WIN32) || defined(__CYGWIN__)
+# define JSON_HEDLEY_PRIVATE
+# define JSON_HEDLEY_PUBLIC __declspec(dllexport)
+# define JSON_HEDLEY_IMPORT __declspec(dllimport)
+#else
+# if \
+ JSON_HEDLEY_HAS_ATTRIBUTE(visibility) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
+ JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
+ ( \
+ defined(__TI_EABI__) && \
+ ( \
+ (JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \
+ ) \
+ ) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+# define JSON_HEDLEY_PRIVATE __attribute__((__visibility__("hidden")))
+# define JSON_HEDLEY_PUBLIC __attribute__((__visibility__("default")))
+# else
+# define JSON_HEDLEY_PRIVATE
+# define JSON_HEDLEY_PUBLIC
+# endif
+# define JSON_HEDLEY_IMPORT extern
+#endif
+
+#if defined(JSON_HEDLEY_NO_THROW)
+ #undef JSON_HEDLEY_NO_THROW
+#endif
+#if \
+ JSON_HEDLEY_HAS_ATTRIBUTE(nothrow) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(3,3,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_NO_THROW __attribute__((__nothrow__))
+#elif \
+ JSON_HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0)
+ #define JSON_HEDLEY_NO_THROW __declspec(nothrow)
+#else
+ #define JSON_HEDLEY_NO_THROW
+#endif
+
+#if defined(JSON_HEDLEY_FALL_THROUGH)
+ #undef JSON_HEDLEY_FALL_THROUGH
+#endif
+#if \
+ JSON_HEDLEY_HAS_ATTRIBUTE(fallthrough) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(7,0,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_FALL_THROUGH __attribute__((__fallthrough__))
+#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough)
+ #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]])
+#elif JSON_HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough)
+ #define JSON_HEDLEY_FALL_THROUGH JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]])
+#elif defined(__fallthrough) /* SAL */
+ #define JSON_HEDLEY_FALL_THROUGH __fallthrough
+#else
+ #define JSON_HEDLEY_FALL_THROUGH
+#endif
+
+#if defined(JSON_HEDLEY_RETURNS_NON_NULL)
+ #undef JSON_HEDLEY_RETURNS_NON_NULL
+#endif
+#if \
+ JSON_HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__))
+#elif defined(_Ret_notnull_) /* SAL */
+ #define JSON_HEDLEY_RETURNS_NON_NULL _Ret_notnull_
+#else
+ #define JSON_HEDLEY_RETURNS_NON_NULL
+#endif
+
+#if defined(JSON_HEDLEY_ARRAY_PARAM)
+ #undef JSON_HEDLEY_ARRAY_PARAM
+#endif
+#if \
+ defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
+ !defined(__STDC_NO_VLA__) && \
+ !defined(__cplusplus) && \
+ !defined(JSON_HEDLEY_PGI_VERSION) && \
+ !defined(JSON_HEDLEY_TINYC_VERSION)
+ #define JSON_HEDLEY_ARRAY_PARAM(name) (name)
+#else
+ #define JSON_HEDLEY_ARRAY_PARAM(name)
+#endif
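+/* Usage sketch (illustrative): declaring a minimum length for an array
+   parameter in C99 while remaining valid C++ (which lacks VLA parameters):
+
+       void sum(int n, const int values[JSON_HEDLEY_ARRAY_PARAM(n)]);
+
+   This expands to `values[(n)]` under C99 with VLA support and to plain
+   `values[]` otherwise. */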
+
+#if defined(JSON_HEDLEY_IS_CONSTANT)
+ #undef JSON_HEDLEY_IS_CONSTANT
+#endif
+#if defined(JSON_HEDLEY_REQUIRE_CONSTEXPR)
+ #undef JSON_HEDLEY_REQUIRE_CONSTEXPR
+#endif
+/* JSON_HEDLEY_IS_CONSTEXPR_ is for
+ HEDLEY INTERNAL USE ONLY. API subject to change without notice. */
+#if defined(JSON_HEDLEY_IS_CONSTEXPR_)
+ #undef JSON_HEDLEY_IS_CONSTEXPR_
+#endif
+#if \
+ JSON_HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(4,1,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
+ JSON_HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \
+ (JSON_HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \
+ JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
+ JSON_HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10)
+ #define JSON_HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr)
+#endif
+#if !defined(__cplusplus)
+# if \
+ JSON_HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(13,1,0) || \
+ JSON_HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(5,4,0) || \
+ JSON_HEDLEY_TINYC_VERSION_CHECK(0,9,24)
+#if defined(__INTPTR_TYPE__)
+ #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*)
+#else
+ #include <stdint.h>
+ #define JSON_HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*)
+#endif
+# elif \
+ ( \
+ defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
+ !defined(JSON_HEDLEY_SUNPRO_VERSION) && \
+ !defined(JSON_HEDLEY_PGI_VERSION) && \
+ !defined(JSON_HEDLEY_IAR_VERSION)) || \
+ (JSON_HEDLEY_HAS_EXTENSION(c_generic_selections) && !defined(JSON_HEDLEY_IAR_VERSION)) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \
+ JSON_HEDLEY_IBM_VERSION_CHECK(12,1,0) || \
+ JSON_HEDLEY_ARM_VERSION_CHECK(5,3,0)
+#if defined(__INTPTR_TYPE__)
+ #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0)
+#else
+ #include <stdint.h>
+    #define JSON_HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0)
+#endif
+# elif \
+ defined(JSON_HEDLEY_GCC_VERSION) || \
+ defined(JSON_HEDLEY_INTEL_VERSION) || \
+ defined(JSON_HEDLEY_TINYC_VERSION) || \
+ defined(JSON_HEDLEY_TI_ARMCL_VERSION) || \
+ JSON_HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \
+ defined(JSON_HEDLEY_TI_CL2000_VERSION) || \
+ defined(JSON_HEDLEY_TI_CL6X_VERSION) || \
+ defined(JSON_HEDLEY_TI_CL7X_VERSION) || \
+ defined(JSON_HEDLEY_TI_CLPRU_VERSION) || \
+ defined(__clang__)
+# define JSON_HEDLEY_IS_CONSTEXPR_(expr) ( \
+ sizeof(void) != \
+ sizeof(*( \
+ 1 ? \
+ ((void*) ((expr) * 0L) ) : \
+                  ((struct { char v[sizeof(void) * 2]; } *) 1) \
+ ) \
+ ) \
+ )
+# endif
+#endif
+#if defined(JSON_HEDLEY_IS_CONSTEXPR_)
+ #if !defined(JSON_HEDLEY_IS_CONSTANT)
+ #define JSON_HEDLEY_IS_CONSTANT(expr) JSON_HEDLEY_IS_CONSTEXPR_(expr)
+ #endif
+ #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (JSON_HEDLEY_IS_CONSTEXPR_(expr) ? (expr) : (-1))
+#else
+ #if !defined(JSON_HEDLEY_IS_CONSTANT)
+ #define JSON_HEDLEY_IS_CONSTANT(expr) (0)
+ #endif
+ #define JSON_HEDLEY_REQUIRE_CONSTEXPR(expr) (expr)
+#endif
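+/* How the detection above works (sketch): in the conditional expression
+   `1 ? (void*) ((intptr) ((expr) * 0)) : (int*) 0`, a constant `expr` makes
+   the middle operand a null pointer constant, giving the whole expression
+   type `int*`; a non-constant `expr` gives `void*`. The _Generic / sizeof
+   machinery above merely distinguishes those two types, e.g.:
+
+       JSON_HEDLEY_IS_CONSTANT(4 * 2)   // 1 where detectable
+       JSON_HEDLEY_IS_CONSTANT(argc)    // 0
+
+   Without any usable mechanism IS_CONSTANT(expr) is simply (0), and
+   REQUIRE_CONSTEXPR(expr) passes `expr` through unchecked. */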
+
+#if defined(JSON_HEDLEY_BEGIN_C_DECLS)
+ #undef JSON_HEDLEY_BEGIN_C_DECLS
+#endif
+#if defined(JSON_HEDLEY_END_C_DECLS)
+ #undef JSON_HEDLEY_END_C_DECLS
+#endif
+#if defined(JSON_HEDLEY_C_DECL)
+ #undef JSON_HEDLEY_C_DECL
+#endif
+#if defined(__cplusplus)
+ #define JSON_HEDLEY_BEGIN_C_DECLS extern "C" {
+ #define JSON_HEDLEY_END_C_DECLS }
+ #define JSON_HEDLEY_C_DECL extern "C"
+#else
+ #define JSON_HEDLEY_BEGIN_C_DECLS
+ #define JSON_HEDLEY_END_C_DECLS
+ #define JSON_HEDLEY_C_DECL
+#endif
+
+#if defined(JSON_HEDLEY_STATIC_ASSERT)
+ #undef JSON_HEDLEY_STATIC_ASSERT
+#endif
+#if \
+ !defined(__cplusplus) && ( \
+ (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
+ (JSON_HEDLEY_HAS_FEATURE(c_static_assert) && !defined(JSON_HEDLEY_INTEL_CL_VERSION)) || \
+ JSON_HEDLEY_GCC_VERSION_CHECK(6,0,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \
+ defined(_Static_assert) \
+ )
+# define JSON_HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message)
+#elif \
+ (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
+ JSON_HEDLEY_MSVC_VERSION_CHECK(16,0,0) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+# define JSON_HEDLEY_STATIC_ASSERT(expr, message) JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message))
+#else
+# define JSON_HEDLEY_STATIC_ASSERT(expr, message)
+#endif
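+// Usage sketch (illustrative): a compile-time check that maps onto
+// _Static_assert / static_assert where available and silently becomes a
+// no-op on toolchains with neither:
+//
+//     JSON_HEDLEY_STATIC_ASSERT(sizeof(int) >= 4, "need at least 32-bit int");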
+
+#if defined(JSON_HEDLEY_NULL)
+ #undef JSON_HEDLEY_NULL
+#endif
+#if defined(__cplusplus)
+ #if __cplusplus >= 201103L
+ #define JSON_HEDLEY_NULL JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr)
+ #elif defined(NULL)
+ #define JSON_HEDLEY_NULL NULL
+ #else
+ #define JSON_HEDLEY_NULL JSON_HEDLEY_STATIC_CAST(void*, 0)
+ #endif
+#elif defined(NULL)
+ #define JSON_HEDLEY_NULL NULL
+#else
+ #define JSON_HEDLEY_NULL ((void*) 0)
+#endif
+
+#if defined(JSON_HEDLEY_MESSAGE)
+ #undef JSON_HEDLEY_MESSAGE
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas")
+# define JSON_HEDLEY_MESSAGE(msg) \
+ JSON_HEDLEY_DIAGNOSTIC_PUSH \
+ JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
+ JSON_HEDLEY_PRAGMA(message msg) \
+ JSON_HEDLEY_DIAGNOSTIC_POP
+#elif \
+ JSON_HEDLEY_GCC_VERSION_CHECK(4,4,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message msg)
+#elif JSON_HEDLEY_CRAY_VERSION_CHECK(5,0,0)
+# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(_CRI message msg)
+#elif JSON_HEDLEY_IAR_VERSION_CHECK(8,0,0)
+# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg))
+#elif JSON_HEDLEY_PELLES_VERSION_CHECK(2,0,0)
+# define JSON_HEDLEY_MESSAGE(msg) JSON_HEDLEY_PRAGMA(message(msg))
+#else
+# define JSON_HEDLEY_MESSAGE(msg)
+#endif
+
+#if defined(JSON_HEDLEY_WARNING)
+ #undef JSON_HEDLEY_WARNING
+#endif
+#if JSON_HEDLEY_HAS_WARNING("-Wunknown-pragmas")
+# define JSON_HEDLEY_WARNING(msg) \
+ JSON_HEDLEY_DIAGNOSTIC_PUSH \
+ JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
+ JSON_HEDLEY_PRAGMA(clang warning msg) \
+ JSON_HEDLEY_DIAGNOSTIC_POP
+#elif \
+ JSON_HEDLEY_GCC_VERSION_CHECK(4,8,0) || \
+ JSON_HEDLEY_PGI_VERSION_CHECK(18,4,0) || \
+ JSON_HEDLEY_INTEL_VERSION_CHECK(13,0,0)
+# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(GCC warning msg)
+#elif \
+ JSON_HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_PRAGMA(message(msg))
+#else
+# define JSON_HEDLEY_WARNING(msg) JSON_HEDLEY_MESSAGE(msg)
+#endif
+
+#if defined(JSON_HEDLEY_REQUIRE)
+ #undef JSON_HEDLEY_REQUIRE
+#endif
+#if defined(JSON_HEDLEY_REQUIRE_MSG)
+ #undef JSON_HEDLEY_REQUIRE_MSG
+#endif
+#if JSON_HEDLEY_HAS_ATTRIBUTE(diagnose_if)
+# if JSON_HEDLEY_HAS_WARNING("-Wgcc-compat")
+# define JSON_HEDLEY_REQUIRE(expr) \
+ JSON_HEDLEY_DIAGNOSTIC_PUSH \
+ _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \
+ __attribute__((diagnose_if(!(expr), #expr, "error"))) \
+ JSON_HEDLEY_DIAGNOSTIC_POP
+# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) \
+ JSON_HEDLEY_DIAGNOSTIC_PUSH \
+ _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \
+ __attribute__((diagnose_if(!(expr), msg, "error"))) \
+ JSON_HEDLEY_DIAGNOSTIC_POP
+# else
+# define JSON_HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error")))
+# define JSON_HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error")))
+# endif
+#else
+# define JSON_HEDLEY_REQUIRE(expr)
+# define JSON_HEDLEY_REQUIRE_MSG(expr,msg)
+#endif
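+/* Usage sketch (illustrative): with Clang's diagnose_if, a precondition on a
+   hypothetical function is diagnosed at call sites whose arguments are known
+   constants; on other compilers the macro expands to nothing:
+
+       JSON_HEDLEY_REQUIRE_MSG(idx >= 0, "idx must be non-negative")
+       int get_item(int idx);
+*/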
+
+#if defined(JSON_HEDLEY_FLAGS)
+ #undef JSON_HEDLEY_FLAGS
+#endif
+#if JSON_HEDLEY_HAS_ATTRIBUTE(flag_enum) && (!defined(__cplusplus) || JSON_HEDLEY_HAS_WARNING("-Wbitfield-enum-conversion"))
+ #define JSON_HEDLEY_FLAGS __attribute__((__flag_enum__))
+#else
+ #define JSON_HEDLEY_FLAGS
+#endif
+
+#if defined(JSON_HEDLEY_FLAGS_CAST)
+ #undef JSON_HEDLEY_FLAGS_CAST
+#endif
+#if JSON_HEDLEY_INTEL_VERSION_CHECK(19,0,0)
+# define JSON_HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \
+ JSON_HEDLEY_DIAGNOSTIC_PUSH \
+ _Pragma("warning(disable:188)") \
+ ((T) (expr)); \
+ JSON_HEDLEY_DIAGNOSTIC_POP \
+ }))
+#else
+# define JSON_HEDLEY_FLAGS_CAST(T, expr) JSON_HEDLEY_STATIC_CAST(T, expr)
+#endif
+
+#if defined(JSON_HEDLEY_EMPTY_BASES)
+ #undef JSON_HEDLEY_EMPTY_BASES
+#endif
+#if \
+ (JSON_HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !JSON_HEDLEY_MSVC_VERSION_CHECK(20,0,0)) || \
+ JSON_HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0)
+ #define JSON_HEDLEY_EMPTY_BASES __declspec(empty_bases)
+#else
+ #define JSON_HEDLEY_EMPTY_BASES
+#endif
+
+/* Remaining macros are deprecated. */
+
+#if defined(JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK)
+ #undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK
+#endif
+#if defined(__clang__)
+ #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0)
+#else
+ #define JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) JSON_HEDLEY_GCC_VERSION_CHECK(major,minor,patch)
+#endif
+
+#if defined(JSON_HEDLEY_CLANG_HAS_ATTRIBUTE)
+ #undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE
+#endif
+#define JSON_HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_ATTRIBUTE(attribute)
+
+#if defined(JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE)
+ #undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE
+#endif
+#define JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_CPP_ATTRIBUTE(attribute)
+
+#if defined(JSON_HEDLEY_CLANG_HAS_BUILTIN)
+ #undef JSON_HEDLEY_CLANG_HAS_BUILTIN
+#endif
+#define JSON_HEDLEY_CLANG_HAS_BUILTIN(builtin) JSON_HEDLEY_HAS_BUILTIN(builtin)
+
+#if defined(JSON_HEDLEY_CLANG_HAS_FEATURE)
+ #undef JSON_HEDLEY_CLANG_HAS_FEATURE
+#endif
+#define JSON_HEDLEY_CLANG_HAS_FEATURE(feature) JSON_HEDLEY_HAS_FEATURE(feature)
+
+#if defined(JSON_HEDLEY_CLANG_HAS_EXTENSION)
+ #undef JSON_HEDLEY_CLANG_HAS_EXTENSION
+#endif
+#define JSON_HEDLEY_CLANG_HAS_EXTENSION(extension) JSON_HEDLEY_HAS_EXTENSION(extension)
+
+#if defined(JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE)
+    #undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE
+#endif
+#define JSON_HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute)
+
+#if defined(JSON_HEDLEY_CLANG_HAS_WARNING)
+ #undef JSON_HEDLEY_CLANG_HAS_WARNING
+#endif
+#define JSON_HEDLEY_CLANG_HAS_WARNING(warning) JSON_HEDLEY_HAS_WARNING(warning)
+
+#endif /* !defined(JSON_HEDLEY_VERSION) || (JSON_HEDLEY_VERSION < X) */
+
+
+// This file contains all internal macro definitions
+// You MUST include macro_unscope.hpp at the end of json.hpp to undef all of them
+
+// exclude unsupported compilers
+#if !defined(JSON_SKIP_UNSUPPORTED_COMPILER_CHECK)
+ #if defined(__clang__)
+ #if (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) < 30400
+ #error "unsupported Clang version - see https://github.com/nlohmann/json#supported-compilers"
+ #endif
+ #elif defined(__GNUC__) && !(defined(__ICC) || defined(__INTEL_COMPILER))
+ #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) < 40800
+ #error "unsupported GCC version - see https://github.com/nlohmann/json#supported-compilers"
+ #endif
+ #endif
+#endif
+
+// C++ language standard detection
+// if the user manually specified the C++ version to use, this detection is skipped
+#if !defined(JSON_HAS_CPP_20) && !defined(JSON_HAS_CPP_17) && !defined(JSON_HAS_CPP_14) && !defined(JSON_HAS_CPP_11)
+ #if (defined(__cplusplus) && __cplusplus >= 202002L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
+ #define JSON_HAS_CPP_20
+ #define JSON_HAS_CPP_17
+ #define JSON_HAS_CPP_14
+ #elif (defined(__cplusplus) && __cplusplus >= 201703L) || (defined(_HAS_CXX17) && _HAS_CXX17 == 1) // fix for issue #464
+ #define JSON_HAS_CPP_17
+ #define JSON_HAS_CPP_14
+ #elif (defined(__cplusplus) && __cplusplus >= 201402L) || (defined(_HAS_CXX14) && _HAS_CXX14 == 1)
+ #define JSON_HAS_CPP_14
+ #endif
+    // the C++11 flag is always defined, because C++11 is the minimum required version
+ #define JSON_HAS_CPP_11
+#endif
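+// For reference: these flags can also be set from the build system (e.g.
+// -DJSON_HAS_CPP_17, together with the lower flags it implies such as
+// JSON_HAS_CPP_14 and JSON_HAS_CPP_11) to bypass the detection above entirely.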
+
+// disable documentation warnings on clang
+#if defined(__clang__)
+ #pragma clang diagnostic push
+ #pragma clang diagnostic ignored "-Wdocumentation"
+ #pragma clang diagnostic ignored "-Wdocumentation-unknown-command"
+#endif
+
+// allow disabling exceptions
+#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || defined(_CPPUNWIND)) && !defined(JSON_NOEXCEPTION)
+ #define JSON_THROW(exception) throw exception
+ #define JSON_TRY try
+ #define JSON_CATCH(exception) catch(exception)
+ #define JSON_INTERNAL_CATCH(exception) catch(exception)
+#else
+ #include <cstdlib>
+ #define JSON_THROW(exception) std::abort()
+ #define JSON_TRY if(true)
+ #define JSON_CATCH(exception) if(false)
+ #define JSON_INTERNAL_CATCH(exception) if(false)
+#endif
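+// For reference: compiling with exceptions disabled (e.g. -fno-exceptions) or
+// with -DJSON_NOEXCEPTION selects the branch above, turning throw sites into
+// std::abort() calls and try/catch into plain if(true)/if(false) blocks.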
+
+// override exception macros
+#if defined(JSON_THROW_USER)
+ #undef JSON_THROW
+ #define JSON_THROW JSON_THROW_USER
+#endif
+#if defined(JSON_TRY_USER)
+ #undef JSON_TRY
+ #define JSON_TRY JSON_TRY_USER
+#endif
+#if defined(JSON_CATCH_USER)
+ #undef JSON_CATCH
+ #define JSON_CATCH JSON_CATCH_USER
+ #undef JSON_INTERNAL_CATCH
+ #define JSON_INTERNAL_CATCH JSON_CATCH_USER
+#endif
+#if defined(JSON_INTERNAL_CATCH_USER)
+ #undef JSON_INTERNAL_CATCH
+ #define JSON_INTERNAL_CATCH JSON_INTERNAL_CATCH_USER
+#endif
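+/* Usage sketch (illustrative): a project can route library errors into its
+   own handler by defining these before including json.hpp; my_fatal() is a
+   hypothetical project function:
+
+       #define JSON_THROW_USER(exception) my_fatal((exception).what())
+       #define JSON_TRY_USER if(true)
+       #define JSON_CATCH_USER(exception) if(false)
+*/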
+
+// allow overriding assert
+#if !defined(JSON_ASSERT)
+ #include <cassert> // assert
+ #define JSON_ASSERT(x) assert(x)
+#endif
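+// Usage sketch (illustrative): defining JSON_ASSERT before including json.hpp
+// substitutes a custom check, e.g. a hypothetical handler:
+//
+//     #define JSON_ASSERT(x) my_check(x)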
+
+// allow access to some private functions (needed by the test suite)
+#if defined(JSON_TESTS_PRIVATE)
+ #define JSON_PRIVATE_UNLESS_TESTED public
+#else
+ #define JSON_PRIVATE_UNLESS_TESTED private
+#endif
+
+/*!
+@brief macro to concisely define a mapping between an enum and JSON
+@def NLOHMANN_JSON_SERIALIZE_ENUM
+@since version 3.4.0
+*/
+#define NLOHMANN_JSON_SERIALIZE_ENUM(ENUM_TYPE, ...) \
+ template<typename BasicJsonType> \
+ inline void to_json(BasicJsonType& j, const ENUM_TYPE& e) \
+ { \
+ static_assert(std::is_enum<ENUM_TYPE>::value, #ENUM_TYPE " must be an enum!"); \
+ static const std::pair<ENUM_TYPE, BasicJsonType> m[] = __VA_ARGS__; \
+ auto it = std::find_if(std::begin(m), std::end(m), \
+ [e](const std::pair<ENUM_TYPE, BasicJsonType>& ej_pair) -> bool \
+ { \
+ return ej_pair.first == e; \
+ }); \
+ j = ((it != std::end(m)) ? it : std::begin(m))->second; \
+ } \
+ template<typename BasicJsonType> \
+ inline void from_json(const BasicJsonType& j, ENUM_TYPE& e) \
+ { \
+ static_assert(std::is_enum<ENUM_TYPE>::value, #ENUM_TYPE " must be an enum!"); \
+ static const std::pair<ENUM_TYPE, BasicJsonType> m[] = __VA_ARGS__; \
+ auto it = std::find_if(std::begin(m), std::end(m), \
+ [&j](const std::pair<ENUM_TYPE, BasicJsonType>& ej_pair) -> bool \
+ { \
+ return ej_pair.second == j; \
+ }); \
+ e = ((it != std::end(m)) ? it : std::begin(m))->first; \
+ }
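+/* Usage sketch (illustrative, hypothetical enum): values absent from the map
+   fall back to the *first* pair in both directions, so listing a sensible
+   default first is advisable:
+
+       enum class Color { unknown, red, green };
+       NLOHMANN_JSON_SERIALIZE_ENUM(Color, {
+           {Color::unknown, nullptr},
+           {Color::red, "red"},
+           {Color::green, "green"},
+       })
+*/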
+
+// Ugly macros to avoid uglier copy-paste when specializing basic_json. They
+// may be removed in the future once the class is split.
+
+#define NLOHMANN_BASIC_JSON_TPL_DECLARATION \
+ template<template<typename, typename, typename...> class ObjectType, \
+ template<typename, typename...> class ArrayType, \
+ class StringType, class BooleanType, class NumberIntegerType, \
+ class NumberUnsignedType, class NumberFloatType, \
+ template<typename> class AllocatorType, \
+ template<typename, typename = void> class JSONSerializer, \
+ class BinaryType>
+
+#define NLOHMANN_BASIC_JSON_TPL \
+ basic_json<ObjectType, ArrayType, StringType, BooleanType, \
+ NumberIntegerType, NumberUnsignedType, NumberFloatType, \
+ AllocatorType, JSONSerializer, BinaryType>
+
+// Macros to simplify conversion from/to types
+
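+// How the machinery below works (sketch): NLOHMANN_JSON_GET_MACRO selects a
+// helper by argument count, so NLOHMANN_JSON_PASTE(f, a, b, c) picks
+// NLOHMANN_JSON_PASTE4 and expands to `f(a) f(b) f(c)`, applying `f` to each
+// argument in turn. The NLOHMANN_DEFINE_TYPE_* convenience macros defined
+// later in this header are built on top of this.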
+#define NLOHMANN_JSON_EXPAND( x ) x
+#define NLOHMANN_JSON_GET_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, _61, _62, _63, _64, NAME,...) NAME
+#define NLOHMANN_JSON_PASTE(...) NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_GET_MACRO(__VA_ARGS__, \
+ NLOHMANN_JSON_PASTE64, \
+ NLOHMANN_JSON_PASTE63, \
+ NLOHMANN_JSON_PASTE62, \
+ NLOHMANN_JSON_PASTE61, \
+ NLOHMANN_JSON_PASTE60, \
+ NLOHMANN_JSON_PASTE59, \
+ NLOHMANN_JSON_PASTE58, \
+ NLOHMANN_JSON_PASTE57, \
+ NLOHMANN_JSON_PASTE56, \
+ NLOHMANN_JSON_PASTE55, \
+ NLOHMANN_JSON_PASTE54, \
+ NLOHMANN_JSON_PASTE53, \
+ NLOHMANN_JSON_PASTE52, \
+ NLOHMANN_JSON_PASTE51, \
+ NLOHMANN_JSON_PASTE50, \
+ NLOHMANN_JSON_PASTE49, \
+ NLOHMANN_JSON_PASTE48, \
+ NLOHMANN_JSON_PASTE47, \
+ NLOHMANN_JSON_PASTE46, \
+ NLOHMANN_JSON_PASTE45, \
+ NLOHMANN_JSON_PASTE44, \
+ NLOHMANN_JSON_PASTE43, \
+ NLOHMANN_JSON_PASTE42, \
+ NLOHMANN_JSON_PASTE41, \
+ NLOHMANN_JSON_PASTE40, \
+ NLOHMANN_JSON_PASTE39, \
+ NLOHMANN_JSON_PASTE38, \
+ NLOHMANN_JSON_PASTE37, \
+ NLOHMANN_JSON_PASTE36, \
+ NLOHMANN_JSON_PASTE35, \
+ NLOHMANN_JSON_PASTE34, \
+ NLOHMANN_JSON_PASTE33, \
+ NLOHMANN_JSON_PASTE32, \
+ NLOHMANN_JSON_PASTE31, \
+ NLOHMANN_JSON_PASTE30, \
+ NLOHMANN_JSON_PASTE29, \
+ NLOHMANN_JSON_PASTE28, \
+ NLOHMANN_JSON_PASTE27, \
+ NLOHMANN_JSON_PASTE26, \
+ NLOHMANN_JSON_PASTE25, \
+ NLOHMANN_JSON_PASTE24, \
+ NLOHMANN_JSON_PASTE23, \
+ NLOHMANN_JSON_PASTE22, \
+ NLOHMANN_JSON_PASTE21, \
+ NLOHMANN_JSON_PASTE20, \
+ NLOHMANN_JSON_PASTE19, \
+ NLOHMANN_JSON_PASTE18, \
+ NLOHMANN_JSON_PASTE17, \
+ NLOHMANN_JSON_PASTE16, \
+ NLOHMANN_JSON_PASTE15, \
+ NLOHMANN_JSON_PASTE14, \
+ NLOHMANN_JSON_PASTE13, \
+ NLOHMANN_JSON_PASTE12, \
+ NLOHMANN_JSON_PASTE11, \
+ NLOHMANN_JSON_PASTE10, \
+ NLOHMANN_JSON_PASTE9, \
+ NLOHMANN_JSON_PASTE8, \
+ NLOHMANN_JSON_PASTE7, \
+ NLOHMANN_JSON_PASTE6, \
+ NLOHMANN_JSON_PASTE5, \
+ NLOHMANN_JSON_PASTE4, \
+ NLOHMANN_JSON_PASTE3, \
+ NLOHMANN_JSON_PASTE2, \
+ NLOHMANN_JSON_PASTE1)(__VA_ARGS__))
+#define NLOHMANN_JSON_PASTE2(func, v1) func(v1)
+#define NLOHMANN_JSON_PASTE3(func, v1, v2) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE2(func, v2)
+#define NLOHMANN_JSON_PASTE4(func, v1, v2, v3) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE3(func, v2, v3)
+#define NLOHMANN_JSON_PASTE5(func, v1, v2, v3, v4) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE4(func, v2, v3, v4)
+#define NLOHMANN_JSON_PASTE6(func, v1, v2, v3, v4, v5) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE5(func, v2, v3, v4, v5)
+#define NLOHMANN_JSON_PASTE7(func, v1, v2, v3, v4, v5, v6) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE6(func, v2, v3, v4, v5, v6)
+#define NLOHMANN_JSON_PASTE8(func, v1, v2, v3, v4, v5, v6, v7) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE7(func, v2, v3, v4, v5, v6, v7)
+#define NLOHMANN_JSON_PASTE9(func, v1, v2, v3, v4, v5, v6, v7, v8) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE8(func, v2, v3, v4, v5, v6, v7, v8)
+#define NLOHMANN_JSON_PASTE10(func, v1, v2, v3, v4, v5, v6, v7, v8, v9) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE9(func, v2, v3, v4, v5, v6, v7, v8, v9)
+#define NLOHMANN_JSON_PASTE11(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE10(func, v2, v3, v4, v5, v6, v7, v8, v9, v10)
+#define NLOHMANN_JSON_PASTE12(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE11(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11)
+#define NLOHMANN_JSON_PASTE13(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE12(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12)
+#define NLOHMANN_JSON_PASTE14(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE13(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13)
+#define NLOHMANN_JSON_PASTE15(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE14(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14)
+#define NLOHMANN_JSON_PASTE16(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE15(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15)
+#define NLOHMANN_JSON_PASTE17(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE16(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16)
+#define NLOHMANN_JSON_PASTE18(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE17(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17)
+#define NLOHMANN_JSON_PASTE19(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE18(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18)
+#define NLOHMANN_JSON_PASTE20(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE19(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19)
+#define NLOHMANN_JSON_PASTE21(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE20(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20)
+#define NLOHMANN_JSON_PASTE22(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE21(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21)
+#define NLOHMANN_JSON_PASTE23(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE22(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22)
+#define NLOHMANN_JSON_PASTE24(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE23(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23)
+#define NLOHMANN_JSON_PASTE25(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE24(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24)
+#define NLOHMANN_JSON_PASTE26(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE25(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25)
+#define NLOHMANN_JSON_PASTE27(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE26(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26)
+#define NLOHMANN_JSON_PASTE28(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE27(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27)
+#define NLOHMANN_JSON_PASTE29(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE28(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28)
+#define NLOHMANN_JSON_PASTE30(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE29(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29)
+#define NLOHMANN_JSON_PASTE31(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE30(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30)
+#define NLOHMANN_JSON_PASTE32(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE31(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31)
+#define NLOHMANN_JSON_PASTE33(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE32(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32)
+#define NLOHMANN_JSON_PASTE34(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE33(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33)
+#define NLOHMANN_JSON_PASTE35(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE34(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34)
+#define NLOHMANN_JSON_PASTE36(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE35(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35)
+#define NLOHMANN_JSON_PASTE37(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE36(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36)
+#define NLOHMANN_JSON_PASTE38(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE37(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37)
+#define NLOHMANN_JSON_PASTE39(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE38(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38)
+#define NLOHMANN_JSON_PASTE40(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE39(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39)
+#define NLOHMANN_JSON_PASTE41(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE40(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40)
+#define NLOHMANN_JSON_PASTE42(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE41(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41)
+#define NLOHMANN_JSON_PASTE43(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE42(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42)
+#define NLOHMANN_JSON_PASTE44(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE43(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43)
+#define NLOHMANN_JSON_PASTE45(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE44(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44)
+#define NLOHMANN_JSON_PASTE46(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE45(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45)
+#define NLOHMANN_JSON_PASTE47(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE46(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46)
+#define NLOHMANN_JSON_PASTE48(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE47(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47)
+#define NLOHMANN_JSON_PASTE49(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE48(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48)
+#define NLOHMANN_JSON_PASTE50(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE49(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49)
+#define NLOHMANN_JSON_PASTE51(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE50(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50)
+#define NLOHMANN_JSON_PASTE52(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE51(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51)
+#define NLOHMANN_JSON_PASTE53(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE52(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52)
+#define NLOHMANN_JSON_PASTE54(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE53(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53)
+#define NLOHMANN_JSON_PASTE55(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE54(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54)
+#define NLOHMANN_JSON_PASTE56(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE55(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55)
+#define NLOHMANN_JSON_PASTE57(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE56(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56)
+#define NLOHMANN_JSON_PASTE58(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE57(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57)
+#define NLOHMANN_JSON_PASTE59(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE58(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58)
+#define NLOHMANN_JSON_PASTE60(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE59(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59)
+#define NLOHMANN_JSON_PASTE61(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE60(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60)
+#define NLOHMANN_JSON_PASTE62(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE61(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61)
+#define NLOHMANN_JSON_PASTE63(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE62(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62)
+#define NLOHMANN_JSON_PASTE64(func, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63) NLOHMANN_JSON_PASTE2(func, v1) NLOHMANN_JSON_PASTE63(func, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63)
+
+#define NLOHMANN_JSON_TO(v1) nlohmann_json_j[#v1] = nlohmann_json_t.v1;
+#define NLOHMANN_JSON_FROM(v1) nlohmann_json_j.at(#v1).get_to(nlohmann_json_t.v1);
+
+/*!
+@brief macro to define to_json()/from_json() as friend functions inside a class
+@def NLOHMANN_DEFINE_TYPE_INTRUSIVE
+@since version 3.9.0
+*/
+#define NLOHMANN_DEFINE_TYPE_INTRUSIVE(Type, ...) \
+ friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
+ friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) }
+
+/*!
+@brief macro to define to_json()/from_json() at namespace scope, outside a class
+@def NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE
+@since version 3.9.0
+*/
+#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Type, ...) \
+ inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \
+ inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) }
+
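+// Illustrative usage sketch (editor's note; `person` is a hypothetical type,
+// not part of this header). NLOHMANN_DEFINE_TYPE_INTRUSIVE is placed inside a
+// class and can reach private members; NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE is
+// placed at namespace scope next to the class and needs public members:
+//
+//   struct person { std::string name; int age; };
+//   NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(person, name, age)
+//
+//   nlohmann::json j = person{"Ada", 36}; // uses the generated to_json()
+//   person p = j.get<person>();           // uses the generated from_json()
+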
+#ifndef JSON_USE_IMPLICIT_CONVERSIONS
+ #define JSON_USE_IMPLICIT_CONVERSIONS 1
+#endif
+
+#if JSON_USE_IMPLICIT_CONVERSIONS
+ #define JSON_EXPLICIT
+#else
+ #define JSON_EXPLICIT explicit
+#endif
+
+#ifndef JSON_DIAGNOSTICS
+ #define JSON_DIAGNOSTICS 0
+#endif
+
+
+namespace nlohmann
+{
+namespace detail
+{
+
+/*!
+@brief replace all occurrences of a substring by another string
+
+@param[in,out] s the string to manipulate; changed so that all
+ occurrences of @a f are replaced with @a t
+@param[in] f the substring to replace with @a t
+@param[in] t the string to replace @a f
+
+@pre The search string @a f must not be empty. **This precondition is
+enforced with an assertion.**
+
+@since version 2.0.0
+*/
+inline void replace_substring(std::string& s, const std::string& f,
+ const std::string& t)
+{
+ JSON_ASSERT(!f.empty());
+ for (auto pos = s.find(f); // find first occurrence of f
+ pos != std::string::npos; // make sure f was found
+ s.replace(pos, f.size(), t), // replace with t, and
+ pos = s.find(f, pos + t.size())) // find next occurrence of f
+ {}
+}
+
+/*!
+ * @brief string escaping as described in RFC 6901 (Sect. 4)
+ * @param[in] s string to escape
+ * @return escaped string
+ *
+ * Note the order of escaping "~" to "~0" and "/" to "~1" is important.
+ */
+inline std::string escape(std::string s)
+{
+ replace_substring(s, "~", "~0");
+ replace_substring(s, "/", "~1");
+ return s;
+}
+
+/*!
+ * @brief string unescaping as described in RFC 6901 (Sect. 4)
+ * @param[in] s string to unescape
+ * @return unescaped string
+ *
+ * Note the order of escaping "~1" to "/" and "~0" to "~" is important.
+ */
+static void unescape(std::string& s)
+{
+ replace_substring(s, "~1", "/");
+ replace_substring(s, "~0", "~");
+}
+
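+// Illustrative sketch (editor's note): round-tripping a JSON Pointer token
+// through the two helpers above.
+//
+//   std::string s = escape("a/b~c"); // "a~1b~0c"
+//   unescape(s);                     // back to "a/b~c"
+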
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/input/position_t.hpp>
+
+
+#include <cstddef> // size_t
+
+namespace nlohmann
+{
+namespace detail
+{
+/// struct to capture the start position of the current token
+struct position_t
+{
+ /// the total number of characters read
+ std::size_t chars_read_total = 0;
+ /// the number of characters read in the current line
+ std::size_t chars_read_current_line = 0;
+ /// the number of lines read
+ std::size_t lines_read = 0;
+
+ /// conversion to size_t to preserve SAX interface
+ constexpr operator size_t() const
+ {
+ return chars_read_total;
+ }
+};
+
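+// Illustrative sketch (editor's note): a parser can keep one position_t per
+// token; SAX callbacks that expect a plain byte offset still work through the
+// implicit conversion.
+//
+//   position_t pos;
+//   pos.chars_read_total = 42;
+//   std::size_t offset = pos; // 42, via operator size_t()
+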
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+////////////////
+// exceptions //
+////////////////
+
+/*!
+@brief general exception of the @ref basic_json class
+
+This class is an extension of `std::exception` objects with a member @a id for
+exception ids. It is used as the base class for all exceptions thrown by the
+@ref basic_json class. This class can hence be used as a "wildcard" to catch
+exceptions.
+
+Subclasses:
+- @ref parse_error for exceptions indicating a parse error
+- @ref invalid_iterator for exceptions indicating errors with iterators
+- @ref type_error for exceptions indicating executing a member function with
+ a wrong type
+- @ref out_of_range for exceptions indicating access out of the defined range
+- @ref other_error for exceptions indicating other library errors
+
+@internal
+@note To have nothrow-copy-constructible exceptions, we internally use
+ `std::runtime_error` which can cope with arbitrary-length error messages.
+ Intermediate strings are built with static functions and then passed to
+ the actual constructor.
+@endinternal
+
+@liveexample{The following code shows how arbitrary library exceptions can be
+caught.,exception}
+
+@since version 3.0.0
+*/
+class exception : public std::exception
+{
+ public:
+ /// returns the explanatory string
+ const char* what() const noexcept override
+ {
+ return m.what();
+ }
+
+ /// the id of the exception
+ const int id; // NOLINT(cppcoreguidelines-non-private-member-variables-in-classes)
+
+ protected:
+ JSON_HEDLEY_NON_NULL(3)
+ exception(int id_, const char* what_arg) : id(id_), m(what_arg) {}
+
+ static std::string name(const std::string& ename, int id_)
+ {
+ return "[json.exception." + ename + "." + std::to_string(id_) + "] ";
+ }
+
+ template<typename BasicJsonType>
+ static std::string diagnostics(const BasicJsonType& leaf_element)
+ {
+#if JSON_DIAGNOSTICS
+ std::vector<std::string> tokens;
+ for (const auto* current = &leaf_element; current->m_parent != nullptr; current = current->m_parent)
+ {
+ switch (current->m_parent->type())
+ {
+ case value_t::array:
+ {
+ for (std::size_t i = 0; i < current->m_parent->m_value.array->size(); ++i)
+ {
+ if (&current->m_parent->m_value.array->operator[](i) == current)
+ {
+ tokens.emplace_back(std::to_string(i));
+ break;
+ }
+ }
+ break;
+ }
+
+ case value_t::object:
+ {
+ for (const auto& element : *current->m_parent->m_value.object)
+ {
+ if (&element.second == current)
+ {
+ tokens.emplace_back(element.first.c_str());
+ break;
+ }
+ }
+ break;
+ }
+
+ case value_t::null: // LCOV_EXCL_LINE
+ case value_t::string: // LCOV_EXCL_LINE
+ case value_t::boolean: // LCOV_EXCL_LINE
+ case value_t::number_integer: // LCOV_EXCL_LINE
+ case value_t::number_unsigned: // LCOV_EXCL_LINE
+ case value_t::number_float: // LCOV_EXCL_LINE
+ case value_t::binary: // LCOV_EXCL_LINE
+ case value_t::discarded: // LCOV_EXCL_LINE
+ default: // LCOV_EXCL_LINE
+ break; // LCOV_EXCL_LINE
+ }
+ }
+
+ if (tokens.empty())
+ {
+ return "";
+ }
+
+ return "(" + std::accumulate(tokens.rbegin(), tokens.rend(), std::string{},
+ [](const std::string & a, const std::string & b)
+ {
+ return a + "/" + detail::escape(b);
+ }) + ") ";
+#else
+ static_cast<void>(leaf_element);
+ return "";
+#endif
+ }
+
+ private:
+ /// an exception object as storage for error messages
+ std::runtime_error m;
+};
+
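+// Illustrative sketch (editor's note): the common base class acts as a
+// wildcard for every library exception.
+//
+//   try
+//   {
+//       auto j = nlohmann::json::parse("[1, 2,"); // malformed input
+//   }
+//   catch (const nlohmann::json::exception& e)
+//   {
+//       std::cout << e.what() << " (id " << e.id << ")\n";
+//   }
+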
+/*!
+@brief exception indicating a parse error
+
+This exception is thrown by the library when a parse error occurs. Parse errors
+can occur during the deserialization of JSON text, CBOR, or MessagePack, as
+well as when using JSON Patch.
+
+Member @a byte holds the byte index of the last read character in the input
+file.
+
+Exceptions have ids 1xx.
+
+name / id | example message | description
+------------------------------ | --------------- | -------------------------
+json.exception.parse_error.101 | parse error at 2: unexpected end of input; expected string literal | This error indicates a syntax error while deserializing a JSON text. The error message describes the unexpected token (character) that was encountered, and the member @a byte indicates the error position.
+json.exception.parse_error.102 | parse error at 14: missing or wrong low surrogate | JSON uses the `\uxxxx` format to describe Unicode characters. Code points above 0xFFFF are split into two `\uxxxx` entries ("surrogate pairs"). This error indicates that the surrogate pair is incomplete or contains an invalid code point.
+json.exception.parse_error.103 | parse error: code points above 0x10FFFF are invalid | Unicode supports code points up to 0x10FFFF. Code points above 0x10FFFF are invalid.
+json.exception.parse_error.104 | parse error: JSON patch must be an array of objects | [RFC 6902](https://tools.ietf.org/html/rfc6902) requires a JSON Patch document to be a JSON document that represents an array of objects.
+json.exception.parse_error.105 | parse error: operation must have string member 'op' | An operation of a JSON Patch document must contain exactly one "op" member, whose value indicates the operation to perform. Its value must be one of "add", "remove", "replace", "move", "copy", or "test"; other values are errors.
+json.exception.parse_error.106 | parse error: array index '01' must not begin with '0' | An array index in a JSON Pointer ([RFC 6901](https://tools.ietf.org/html/rfc6901)) may be `0` or any number without a leading `0`.
+json.exception.parse_error.107 | parse error: JSON pointer must be empty or begin with '/' - was: 'foo' | A JSON Pointer must be a Unicode string containing a sequence of zero or more reference tokens, each prefixed by a `/` character.
+json.exception.parse_error.108 | parse error: escape character '~' must be followed with '0' or '1' | In a JSON Pointer, only `~0` and `~1` are valid escape sequences.
+json.exception.parse_error.109 | parse error: array index 'one' is not a number | A JSON Pointer array index must be a number.
+json.exception.parse_error.110 | parse error at 1: cannot read 2 bytes from vector | When parsing CBOR or MessagePack, the byte vector ends before the complete value has been read.
+json.exception.parse_error.112 | parse error at 1: error reading CBOR; last byte: 0xF8 | Not all types of CBOR or MessagePack are supported. This exception occurs if an unsupported byte was read.
+json.exception.parse_error.113 | parse error at 2: expected a CBOR string; last byte: 0x98 | While parsing a map key, a value that is not a string has been read.
+json.exception.parse_error.114 | parse error: Unsupported BSON record type 0x0F | The parsing of the corresponding BSON record type is not implemented (yet).
+json.exception.parse_error.115 | parse error at byte 5: syntax error while parsing UBJSON high-precision number: invalid number text: 1A | A UBJSON high-precision number could not be parsed.
+
+@note For an input with n bytes, 1 is the index of the first character and n+1
+ is the index of the terminating null byte or the end of file. This also
+ holds true when reading a byte vector (CBOR or MessagePack).
+
+@liveexample{The following code shows how a `parse_error` exception can be
+caught.,parse_error}
+
+@sa - @ref exception for the base class of the library exceptions
+@sa - @ref invalid_iterator for exceptions indicating errors with iterators
+@sa - @ref type_error for exceptions indicating executing a member function with
+ a wrong type
+@sa - @ref out_of_range for exceptions indicating access out of the defined range
+@sa - @ref other_error for exceptions indicating other library errors
+
+@since version 3.0.0
+*/
+class parse_error : public exception
+{
+ public:
+ /*!
+ @brief create a parse error exception
+ @param[in] id_ the id of the exception
+ @param[in] pos the position where the error occurred (or with
+ chars_read_total=0 if the position cannot be
+ determined)
+ @param[in] what_arg the explanatory string
+ @return parse_error object
+ */
+ template<typename BasicJsonType>
+ static parse_error create(int id_, const position_t& pos, const std::string& what_arg, const BasicJsonType& context)
+ {
+ std::string w = exception::name("parse_error", id_) + "parse error" +
+ position_string(pos) + ": " + exception::diagnostics(context) + what_arg;
+ return parse_error(id_, pos.chars_read_total, w.c_str());
+ }
+
+ template<typename BasicJsonType>
+ static parse_error create(int id_, std::size_t byte_, const std::string& what_arg, const BasicJsonType& context)
+ {
+ std::string w = exception::name("parse_error", id_) + "parse error" +
+ (byte_ != 0 ? (" at byte " + std::to_string(byte_)) : "") +
+ ": " + exception::diagnostics(context) + what_arg;
+ return parse_error(id_, byte_, w.c_str());
+ }
+
+ /*!
+ @brief byte index of the parse error
+
+ The byte index of the last read character in the input file.
+
+ @note For an input with n bytes, 1 is the index of the first character and
+ n+1 is the index of the terminating null byte or the end of file.
+ This also holds true when reading a byte vector (CBOR or MessagePack).
+ */
+ const std::size_t byte;
+
+ private:
+ parse_error(int id_, std::size_t byte_, const char* what_arg)
+ : exception(id_, what_arg), byte(byte_) {}
+
+ static std::string position_string(const position_t& pos)
+ {
+ return " at line " + std::to_string(pos.lines_read + 1) +
+ ", column " + std::to_string(pos.chars_read_current_line);
+ }
+};
+
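+// Illustrative sketch (editor's note): incomplete input throws parse_error 101;
+// the byte member points at the error position.
+//
+//   try
+//   {
+//       nlohmann::json::parse(R"({"key": )"); // truncated object
+//   }
+//   catch (const nlohmann::json::parse_error& e)
+//   {
+//       std::cout << e.what() << " at byte " << e.byte << "\n";
+//   }
+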
+/*!
+@brief exception indicating errors with iterators
+
+This exception is thrown if iterators passed to a library function do not match
+the expected semantics.
+
+Exceptions have ids 2xx.
+
+name / id | example message | description
+----------------------------------- | --------------- | -------------------------
+json.exception.invalid_iterator.201 | iterators are not compatible | The iterators passed to constructor @ref basic_json(InputIT first, InputIT last) are not compatible, meaning they do not belong to the same container. Therefore, the range (@a first, @a last) is invalid.
+json.exception.invalid_iterator.202 | iterator does not fit current value | In an erase or insert function, the passed iterator @a pos does not belong to the JSON value for which the function was called. It hence does not define a valid position for the deletion/insertion.
+json.exception.invalid_iterator.203 | iterators do not fit current value | Either iterator passed to function @ref erase(IteratorType first, IteratorType last) does not belong to the JSON value from which values shall be erased. It hence does not define a valid range to delete values from.
+json.exception.invalid_iterator.204 | iterators out of range | When an iterator range for a primitive type (number, boolean, or string) is passed to a constructor or an erase function, this range has to be exactly (@ref begin(), @ref end()), because this is the only way the single stored value is expressed. All other ranges are invalid.
+json.exception.invalid_iterator.205 | iterator out of range | When an iterator for a primitive type (number, boolean, or string) is passed to an erase function, the iterator has to be the @ref begin() iterator, because it is the only way to address the stored value. All other iterators are invalid.
+json.exception.invalid_iterator.206 | cannot construct with iterators from null | The iterators passed to constructor @ref basic_json(InputIT first, InputIT last) belong to a JSON null value and hence do not define a valid range.
+json.exception.invalid_iterator.207 | cannot use key() for non-object iterators | The key() member function can only be used on iterators belonging to a JSON object, because other types do not have a concept of a key.
+json.exception.invalid_iterator.208 | cannot use operator[] for object iterators | The operator[] to specify a concrete offset cannot be used on iterators belonging to a JSON object, because JSON objects are unordered.
+json.exception.invalid_iterator.209 | cannot use offsets with object iterators | The offset operators (+, -, +=, -=) cannot be used on iterators belonging to a JSON object, because JSON objects are unordered.
+json.exception.invalid_iterator.210 | iterators do not fit | The iterator range passed to the insert function is not compatible, meaning its iterators do not belong to the same container. Therefore, the range (@a first, @a last) is invalid.
+json.exception.invalid_iterator.211 | passed iterators may not belong to container | The iterator range passed to the insert function must not be a subrange of the container to insert to.
+json.exception.invalid_iterator.212 | cannot compare iterators of different containers | When two iterators are compared, they must belong to the same container.
+json.exception.invalid_iterator.213 | cannot compare order of object iterators | The order of object iterators cannot be compared, because JSON objects are unordered.
+json.exception.invalid_iterator.214 | cannot get value | Cannot get value for iterator: Either the iterator belongs to a null value or it is an iterator to a primitive type (number, boolean, or string), but the iterator is different from @ref begin().
+
+@liveexample{The following code shows how an `invalid_iterator` exception can be
+caught.,invalid_iterator}
+
+@sa - @ref exception for the base class of the library exceptions
+@sa - @ref parse_error for exceptions indicating a parse error
+@sa - @ref type_error for exceptions indicating executing a member function with
+ a wrong type
+@sa - @ref out_of_range for exceptions indicating access out of the defined range
+@sa - @ref other_error for exceptions indicating other library errors
+
+@since version 3.0.0
+*/
+class invalid_iterator : public exception
+{
+ public:
+ template<typename BasicJsonType>
+ static invalid_iterator create(int id_, const std::string& what_arg, const BasicJsonType& context)
+ {
+ std::string w = exception::name("invalid_iterator", id_) + exception::diagnostics(context) + what_arg;
+ return invalid_iterator(id_, w.c_str());
+ }
+
+ private:
+ JSON_HEDLEY_NON_NULL(3)
+ invalid_iterator(int id_, const char* what_arg)
+ : exception(id_, what_arg) {}
+};
+
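+// Illustrative sketch (editor's note): comparing iterators that belong to
+// different containers throws invalid_iterator.212.
+//
+//   nlohmann::json a = {1, 2, 3};
+//   nlohmann::json b = {4, 5};
+//   try { bool same = (a.begin() == b.begin()); }
+//   catch (const nlohmann::json::invalid_iterator& e) { /* id == 212 */ }
+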
+/*!
+@brief exception indicating executing a member function with a wrong type
+
+This exception is thrown in case of a type error; that is, a library function is
+executed on a JSON value whose type does not match the expected semantics.
+
+Exceptions have ids 3xx.
+
+name / id | example message | description
+----------------------------- | --------------- | -------------------------
+json.exception.type_error.301 | cannot create object from initializer list | To create an object from an initializer list, the initializer list must consist only of a list of pairs whose first element is a string. When this constraint is violated, an array is created instead.
+json.exception.type_error.302 | type must be object, but is array | During implicit or explicit value conversion, the JSON type must be compatible with the target type. For instance, a JSON string can only be converted into string types, but not into numbers or boolean types.
+json.exception.type_error.303 | incompatible ReferenceType for get_ref, actual type is object | To retrieve a reference to a value stored in a @ref basic_json object with @ref get_ref, the type of the reference must match the value type. For instance, for a JSON array, the @a ReferenceType must be @ref array_t &.
+json.exception.type_error.304 | cannot use at() with string | The @ref at() member functions can only be executed for certain JSON types.
+json.exception.type_error.305 | cannot use operator[] with string | The @ref operator[] member functions can only be executed for certain JSON types.
+json.exception.type_error.306 | cannot use value() with string | The @ref value() member functions can only be executed for certain JSON types.
+json.exception.type_error.307 | cannot use erase() with string | The @ref erase() member functions can only be executed for certain JSON types.
+json.exception.type_error.308 | cannot use push_back() with string | The @ref push_back() and @ref operator+= member functions can only be executed for certain JSON types.
+json.exception.type_error.309 | cannot use insert() with | The @ref insert() member functions can only be executed for certain JSON types.
+json.exception.type_error.310 | cannot use swap() with number | The @ref swap() member functions can only be executed for certain JSON types.
+json.exception.type_error.311 | cannot use emplace_back() with string | The @ref emplace_back() member function can only be executed for certain JSON types.
+json.exception.type_error.312 | cannot use update() with string | The @ref update() member functions can only be executed for certain JSON types.
+json.exception.type_error.313 | invalid value to unflatten | The @ref unflatten function converts an object whose keys are JSON Pointers back into an arbitrary nested JSON value. The JSON Pointers must not overlap, because then the resulting value would not be well defined.
+json.exception.type_error.314 | only objects can be unflattened | The @ref unflatten function only works for an object whose keys are JSON Pointers.
+json.exception.type_error.315 | values in object must be primitive | The @ref unflatten function only works for an object whose keys are JSON Pointers and whose values are primitive.
+json.exception.type_error.316 | invalid UTF-8 byte at index 10: 0x7E | The @ref dump function only works with UTF-8 encoded strings; that is, if you assign a `std::string` to a JSON value, make sure it is UTF-8 encoded.
+json.exception.type_error.317 | JSON value cannot be serialized to requested format | The dynamic type of the object cannot be represented in the requested serialization format (e.g. a raw `true` or `null` JSON object cannot be serialized to BSON).
+
+@liveexample{The following code shows how a `type_error` exception can be
+caught.,type_error}
+
+@sa - @ref exception for the base class of the library exceptions
+@sa - @ref parse_error for exceptions indicating a parse error
+@sa - @ref invalid_iterator for exceptions indicating errors with iterators
+@sa - @ref out_of_range for exceptions indicating access out of the defined range
+@sa - @ref other_error for exceptions indicating other library errors
+
+@since version 3.0.0
+*/
+class type_error : public exception
+{
+ public:
+ template<typename BasicJsonType>
+ static type_error create(int id_, const std::string& what_arg, const BasicJsonType& context)
+ {
+ std::string w = exception::name("type_error", id_) + exception::diagnostics(context) + what_arg;
+ return type_error(id_, w.c_str());
+ }
+
+ private:
+ JSON_HEDLEY_NON_NULL(3)
+ type_error(int id_, const char* what_arg) : exception(id_, what_arg) {}
+};
+
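+// Illustrative sketch (editor's note): calling at() on a string value throws
+// type_error.304 ("cannot use at() with string").
+//
+//   nlohmann::json j = "not an array";
+//   try { j.at(0); }
+//   catch (const nlohmann::json::type_error& e) { /* id == 304 */ }
+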
+/*!
+@brief exception indicating access out of the defined range
+
+This exception is thrown in case a library function is called on an input
+parameter that exceeds the expected range, for instance in case of array
+indices or nonexistent object keys.
+
+Exceptions have ids 4xx.
+
+name / id | example message | description
+------------------------------- | --------------- | -------------------------
+json.exception.out_of_range.401 | array index 3 is out of range | The provided array index @a i is larger than @a size-1.
+json.exception.out_of_range.402 | array index '-' (3) is out of range | The special array index `-` in a JSON Pointer never describes a valid element of the array, but the index past the end. That is, it can only be used to add elements at this position, but not to read it.
+json.exception.out_of_range.403 | key 'foo' not found | The provided key was not found in the JSON object.
+json.exception.out_of_range.404 | unresolved reference token 'foo' | A reference token in a JSON Pointer could not be resolved.
+json.exception.out_of_range.405 | JSON pointer has no parent | The JSON Patch operations 'remove' and 'add' cannot be applied to the root element of the JSON value.
+json.exception.out_of_range.406 | number overflow parsing '10E1000' | A parsed number could not be stored without changing it to NaN or INF.
+json.exception.out_of_range.407 | number overflow serializing '9223372036854775808' | UBJSON and BSON only support integer numbers up to 9223372036854775807. (until version 3.8.0)
+json.exception.out_of_range.408 | excessive array size: 8658170730974374167 | The size (following `#`) of an UBJSON array or object exceeds the maximal capacity.
+json.exception.out_of_range.409 | BSON key cannot contain code point U+0000 (at byte 2) | Key identifiers to be serialized to BSON cannot contain code point U+0000, since the key is stored as a zero-terminated C string.
+
+@liveexample{The following code shows how an `out_of_range` exception can be
+caught.,out_of_range}
+
+@sa - @ref exception for the base class of the library exceptions
+@sa - @ref parse_error for exceptions indicating a parse error
+@sa - @ref invalid_iterator for exceptions indicating errors with iterators
+@sa - @ref type_error for exceptions indicating executing a member function with
+ a wrong type
+@sa - @ref other_error for exceptions indicating other library errors
+
+@since version 3.0.0
+*/
+class out_of_range : public exception
+{
+ public:
+ template<typename BasicJsonType>
+ static out_of_range create(int id_, const std::string& what_arg, const BasicJsonType& context)
+ {
+ std::string w = exception::name("out_of_range", id_) + exception::diagnostics(context) + what_arg;
+ return out_of_range(id_, w.c_str());
+ }
+
+ private:
+ JSON_HEDLEY_NON_NULL(3)
+ out_of_range(int id_, const char* what_arg) : exception(id_, what_arg) {}
+};
+
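+// Illustrative sketch (editor's note): at() with a too-large index throws
+// out_of_range.401, unlike operator[], which would be undefined behaviour here.
+//
+//   nlohmann::json j = {1, 2, 3};
+//   try { j.at(3); }
+//   catch (const nlohmann::json::out_of_range& e) { /* id == 401 */ }
+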
+/*!
+@brief exception indicating other library errors
+
+This exception is thrown in case of errors that cannot be classified with the
+other exception types.
+
+Exceptions have ids 5xx.
+
+name / id | example message | description
+------------------------------ | --------------- | -------------------------
+json.exception.other_error.501 | unsuccessful: {"op":"test","path":"/baz", "value":"bar"} | A JSON Patch operation 'test' failed. The unsuccessful operation is also printed.
+
+@sa - @ref exception for the base class of the library exceptions
+@sa - @ref parse_error for exceptions indicating a parse error
+@sa - @ref invalid_iterator for exceptions indicating errors with iterators
+@sa - @ref type_error for exceptions indicating executing a member function with
+ a wrong type
+@sa - @ref out_of_range for exceptions indicating access out of the defined range
+
+@liveexample{The following code shows how an `other_error` exception can be
+caught.,other_error}
+
+@since version 3.0.0
+*/
+class other_error : public exception
+{
+ public:
+ template<typename BasicJsonType>
+ static other_error create(int id_, const std::string& what_arg, const BasicJsonType& context)
+ {
+ std::string w = exception::name("other_error", id_) + exception::diagnostics(context) + what_arg;
+ return other_error(id_, w.c_str());
+ }
+
+ private:
+ JSON_HEDLEY_NON_NULL(3)
+ other_error(int id_, const char* what_arg) : exception(id_, what_arg) {}
+};
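+
+// Illustrative sketch (editor's note): a failing JSON Patch "test" operation
+// surfaces as other_error.501.
+//
+//   nlohmann::json doc = {{"foo", "bar"}};
+//   nlohmann::json patch = nlohmann::json::array(
+//       {{{"op", "test"}, {"path", "/foo"}, {"value", "baz"}}});
+//   try { doc.patch(patch); }
+//   catch (const nlohmann::json::other_error& e) { /* id == 501 */ }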
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+
+#include <cstddef> // size_t
+#include <type_traits> // conditional, enable_if, false_type, integral_constant, is_constructible, is_integral, is_same, remove_cv, remove_reference, true_type
+#include <utility> // index_sequence, make_index_sequence, index_sequence_for
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+
+template<typename T>
+using uncvref_t = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
+
+#ifdef JSON_HAS_CPP_14
+
+// the following utilities are natively available in C++14
+using std::enable_if_t;
+using std::index_sequence;
+using std::make_index_sequence;
+using std::index_sequence_for;
+
+#else
+
+// alias templates to reduce boilerplate
+template<bool B, typename T = void>
+using enable_if_t = typename std::enable_if<B, T>::type;
+
+// The following code is taken from https://github.com/abseil/abseil-cpp/blob/10cb35e459f5ecca5b2ff107635da0bfa41011b4/absl/utility/utility.h
+// which is part of Google Abseil (https://github.com/abseil/abseil-cpp), licensed under the Apache License 2.0.
+
+//// START OF CODE FROM GOOGLE ABSEIL
+
+// integer_sequence
+//
+// Class template representing a compile-time integer sequence. An instantiation
+// of `integer_sequence<T, Ints...>` has a sequence of integers encoded in its
+// type through its template arguments (which is a common need when
+// working with C++11 variadic templates). `absl::integer_sequence` is designed
+// to be a drop-in replacement for C++14's `std::integer_sequence`.
+//
+// Example:
+//
+// template< class T, T... Ints >
+// void user_function(integer_sequence<T, Ints...>);
+//
+// int main()
+// {
+// // user_function's `T` will be deduced to `int` and `Ints...`
+// // will be deduced to `0, 1, 2, 3, 4`.
+// user_function(make_integer_sequence<int, 5>());
+// }
+template <typename T, T... Ints>
+struct integer_sequence
+{
+ using value_type = T;
+ static constexpr std::size_t size() noexcept
+ {
+ return sizeof...(Ints);
+ }
+};
+
+// index_sequence
+//
+// A helper template for an `integer_sequence` of `size_t`,
+// `absl::index_sequence` is designed to be a drop-in replacement for C++14's
+// `std::index_sequence`.
+template <size_t... Ints>
+using index_sequence = integer_sequence<size_t, Ints...>;
+
+namespace utility_internal
+{
+
+template <typename Seq, size_t SeqSize, size_t Rem>
+struct Extend;
+
+// Note that SeqSize == sizeof...(Ints). It's passed explicitly for efficiency.
+template <typename T, T... Ints, size_t SeqSize>
+struct Extend<integer_sequence<T, Ints...>, SeqSize, 0>
+{
+ using type = integer_sequence < T, Ints..., (Ints + SeqSize)... >;
+};
+
+template <typename T, T... Ints, size_t SeqSize>
+struct Extend<integer_sequence<T, Ints...>, SeqSize, 1>
+{
+ using type = integer_sequence < T, Ints..., (Ints + SeqSize)..., 2 * SeqSize >;
+};
+
+// Recursion helper for 'make_integer_sequence<T, N>'.
+// 'Gen<T, N>::type' is an alias for 'integer_sequence<T, 0, 1, ... N-1>'.
+template <typename T, size_t N>
+struct Gen
+{
+ using type =
+ typename Extend < typename Gen < T, N / 2 >::type, N / 2, N % 2 >::type;
+};
+
+template <typename T>
+struct Gen<T, 0>
+{
+ using type = integer_sequence<T>;
+};
+
+} // namespace utility_internal
+
+// Compile-time sequences of integers
+
+// make_integer_sequence
+//
+// This template alias is equivalent to
+// `integer_sequence<int, 0, 1, ..., N-1>`, and is designed to be a drop-in
+// replacement for C++14's `std::make_integer_sequence`.
+template <typename T, T N>
+using make_integer_sequence = typename utility_internal::Gen<T, N>::type;
+
+// make_index_sequence
+//
+// This template alias is equivalent to `index_sequence<0, 1, ..., N-1>`,
+// and is designed to be a drop-in replacement for C++14's
+// `std::make_index_sequence`.
+template <size_t N>
+using make_index_sequence = make_integer_sequence<size_t, N>;
+
+// index_sequence_for
+//
+// Converts a typename pack into an index sequence of the same length, and
+// is designed to be a drop-in replacement for C++14's
+// `std::index_sequence_for()`
+template <typename... Ts>
+using index_sequence_for = make_index_sequence<sizeof...(Ts)>;
+
+//// END OF CODE FROM GOOGLE ABSEIL
+
+#endif
+
+// dispatch utility (taken from ranges-v3)
+template<unsigned N> struct priority_tag : priority_tag < N - 1 > {};
+template<> struct priority_tag<0> {};
+
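+// Illustrative sketch (editor's note; hypothetical names): because
+// priority_tag<N> derives from priority_tag<N - 1>, overload resolution
+// prefers the highest tag that still yields a viable overload.
+//
+//   template<typename T>
+//   auto size_impl(const T& t, priority_tag<1>) -> decltype(t.size()) { return t.size(); }
+//   template<typename T>
+//   std::size_t size_impl(const T&, priority_tag<0>) { return 0; }
+//   template<typename T>
+//   std::size_t size_of(const T& t) { return size_impl(t, priority_tag<1> {}); }
+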
+// taken from ranges-v3
+template<typename T>
+struct static_const
+{
+ static constexpr T value{};
+};
+
+template<typename T>
+constexpr T static_const<T>::value;
+
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/meta/identity_tag.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+// dispatching helper struct
+template <class T> struct identity_tag {};
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+
+#include <limits> // numeric_limits
+#include <type_traits> // false_type, is_constructible, is_integral, is_same, true_type
+#include <utility> // declval
+#include <tuple> // tuple
+
+// #include <nlohmann/detail/iterators/iterator_traits.hpp>
+
+
+#include <iterator> // random_access_iterator_tag
+
+// #include <nlohmann/detail/meta/void_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+template<typename ...Ts> struct make_void
+{
+ using type = void;
+};
+template<typename ...Ts> using void_t = typename make_void<Ts...>::type;
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+template<typename It, typename = void>
+struct iterator_types {};
+
+template<typename It>
+struct iterator_types <
+ It,
+ void_t<typename It::difference_type, typename It::value_type, typename It::pointer,
+ typename It::reference, typename It::iterator_category >>
+{
+ using difference_type = typename It::difference_type;
+ using value_type = typename It::value_type;
+ using pointer = typename It::pointer;
+ using reference = typename It::reference;
+ using iterator_category = typename It::iterator_category;
+};
+
+// This is required as some compilers implement std::iterator_traits in a way that
+// doesn't work with SFINAE. See https://github.com/nlohmann/json/issues/1341.
+template<typename T, typename = void>
+struct iterator_traits
+{
+};
+
+template<typename T>
+struct iterator_traits < T, enable_if_t < !std::is_pointer<T>::value >>
+ : iterator_types<T>
+{
+};
+
+template<typename T>
+struct iterator_traits<T*, enable_if_t<std::is_object<T>::value>>
+{
+ using iterator_category = std::random_access_iterator_tag;
+ using value_type = T;
+ using difference_type = ptrdiff_t;
+ using pointer = T*;
+ using reference = T&;
+};
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/meta/detected.hpp>
+
+
+#include <type_traits>
+
+// #include <nlohmann/detail/meta/void_t.hpp>
+
+
+// https://en.cppreference.com/w/cpp/experimental/is_detected
+namespace nlohmann
+{
+namespace detail
+{
+struct nonesuch
+{
+ nonesuch() = delete;
+ ~nonesuch() = delete;
+ nonesuch(nonesuch const&) = delete;
+ nonesuch(nonesuch const&&) = delete;
+ void operator=(nonesuch const&) = delete;
+ void operator=(nonesuch&&) = delete;
+};
+
+template<class Default,
+ class AlwaysVoid,
+ template<class...> class Op,
+ class... Args>
+struct detector
+{
+ using value_t = std::false_type;
+ using type = Default;
+};
+
+template<class Default, template<class...> class Op, class... Args>
+struct detector<Default, void_t<Op<Args...>>, Op, Args...>
+{
+ using value_t = std::true_type;
+ using type = Op<Args...>;
+};
+
+template<template<class...> class Op, class... Args>
+using is_detected = typename detector<nonesuch, void, Op, Args...>::value_t;
+
+template<template<class...> class Op, class... Args>
+struct is_detected_lazy : is_detected<Op, Args...> { };
+
+template<template<class...> class Op, class... Args>
+using detected_t = typename detector<nonesuch, void, Op, Args...>::type;
+
+template<class Default, template<class...> class Op, class... Args>
+using detected_or = detector<Default, void, Op, Args...>;
+
+template<class Default, template<class...> class Op, class... Args>
+using detected_or_t = typename detected_or<Default, Op, Args...>::type;
+
+template<class Expected, template<class...> class Op, class... Args>
+using is_detected_exact = std::is_same<Expected, detected_t<Op, Args...>>;
+
+template<class To, template<class...> class Op, class... Args>
+using is_detected_convertible =
+ std::is_convertible<detected_t<Op, Args...>, To>;
+} // namespace detail
+} // namespace nlohmann
+
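+// Illustrative sketch (editor's note; `value_type_alias` is a hypothetical
+// alias): the detection idiom above turns "does T have a member type
+// value_type?" into a compile-time constant.
+//
+//   template<typename T>
+//   using value_type_alias = typename T::value_type;
+//
+//   static_assert(nlohmann::detail::is_detected<value_type_alias, std::vector<int>>::value, "");
+//   static_assert(!nlohmann::detail::is_detected<value_type_alias, int>::value, "");
+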
+// #include <nlohmann/json_fwd.hpp>
+#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_
+#define INCLUDE_NLOHMANN_JSON_FWD_HPP_
+
+#include <cstdint> // int64_t, uint64_t
+#include <map> // map
+#include <memory> // allocator
+#include <string> // string
+#include <vector> // vector
+
+/*!
+@brief namespace for Niels Lohmann
+@see https://github.com/nlohmann
+@since version 1.0.0
+*/
+namespace nlohmann
+{
+/*!
+@brief default JSONSerializer template argument
+
+This serializer ignores the template arguments and uses ADL
+([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl))
+for serialization.
+*/
+template<typename T = void, typename SFINAE = void>
+struct adl_serializer;
+
+template<template<typename U, typename V, typename... Args> class ObjectType =
+ std::map,
+ template<typename U, typename... Args> class ArrayType = std::vector,
+ class StringType = std::string, class BooleanType = bool,
+ class NumberIntegerType = std::int64_t,
+ class NumberUnsignedType = std::uint64_t,
+ class NumberFloatType = double,
+ template<typename U> class AllocatorType = std::allocator,
+ template<typename T, typename SFINAE = void> class JSONSerializer =
+ adl_serializer,
+ class BinaryType = std::vector<std::uint8_t>>
+class basic_json;
+
+/*!
+@brief JSON Pointer
+
+A JSON pointer defines a string syntax for identifying a specific value
+within a JSON document. It can be used with functions `at` and
+`operator[]`. Furthermore, JSON pointers are the base for JSON patches.
+
+@sa [RFC 6901](https://tools.ietf.org/html/rfc6901)
+
+@since version 2.0.0
+*/
+template<typename BasicJsonType>
+class json_pointer;
+
+/*!
+@brief default JSON class
+
+This type is the default specialization of the @ref basic_json class which
+uses the standard template types.
+
+@since version 1.0.0
+*/
+using json = basic_json<>;
+
+template<class Key, class T, class IgnoredLess, class Allocator>
+struct ordered_map;
+
+/*!
+@brief ordered JSON class
+
+This type preserves the insertion order of object keys.
+
+@since version 3.9.0
+*/
+using ordered_json = basic_json<nlohmann::ordered_map>;
+
+} // namespace nlohmann
+
+#endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_
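+
+// Illustrative sketch (editor's note): json sorts object keys (std::map), while
+// ordered_json preserves insertion order when serializing.
+//
+//   nlohmann::ordered_json j;
+//   j["zebra"] = 1;
+//   j["apple"] = 2;
+//   // j.dump() == R"({"zebra":1,"apple":2})"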
+
+
+namespace nlohmann
+{
+/*!
+@brief detail namespace with internal helper functions
+
+This namespace collects functions that should not be exposed,
+implementations of some @ref basic_json methods, and meta-programming helpers.
+
+@since version 2.1.0
+*/
+namespace detail
+{
+/////////////
+// helpers //
+/////////////
+
+// Note to maintainers:
+//
+// Every trait in this file expects a non CV-qualified type.
+// The only exceptions are in the 'aliases for detected' section
+// (i.e. those of the form: decltype(T::member_function(std::declval<T>())))
+//
+// In this case, T has to be properly CV-qualified to constrain the function arguments
+// (e.g. to_json(BasicJsonType&, const T&))
+
+template<typename> struct is_basic_json : std::false_type {};
+
+NLOHMANN_BASIC_JSON_TPL_DECLARATION
+struct is_basic_json<NLOHMANN_BASIC_JSON_TPL> : std::true_type {};
+
+//////////////////////
+// json_ref helpers //
+//////////////////////
+
+template<typename>
+class json_ref;
+
+template<typename>
+struct is_json_ref : std::false_type {};
+
+template<typename T>
+struct is_json_ref<json_ref<T>> : std::true_type {};
+
+//////////////////////////
+// aliases for detected //
+//////////////////////////
+
+template<typename T>
+using mapped_type_t = typename T::mapped_type;
+
+template<typename T>
+using key_type_t = typename T::key_type;
+
+template<typename T>
+using value_type_t = typename T::value_type;
+
+template<typename T>
+using difference_type_t = typename T::difference_type;
+
+template<typename T>
+using pointer_t = typename T::pointer;
+
+template<typename T>
+using reference_t = typename T::reference;
+
+template<typename T>
+using iterator_category_t = typename T::iterator_category;
+
+template<typename T>
+using iterator_t = typename T::iterator;
+
+template<typename T, typename... Args>
+using to_json_function = decltype(T::to_json(std::declval<Args>()...));
+
+template<typename T, typename... Args>
+using from_json_function = decltype(T::from_json(std::declval<Args>()...));
+
+template<typename T, typename U>
+using get_template_function = decltype(std::declval<T>().template get<U>());
+
+// trait checking if JSONSerializer<T>::from_json(json const&, udt&) exists
+template<typename BasicJsonType, typename T, typename = void>
+struct has_from_json : std::false_type {};
+
+// trait checking if j.get<T> is valid
+// use this trait instead of std::is_constructible or std::is_convertible:
+// both rely on, or make use of, implicit conversions, and thus fail when T
+// has several constructors/operator= (see https://github.com/nlohmann/json/issues/958)
+template <typename BasicJsonType, typename T>
+struct is_getable
+{
+ static constexpr bool value = is_detected<get_template_function, const BasicJsonType&, T>::value;
+};
+
+template<typename BasicJsonType, typename T>
+struct has_from_json < BasicJsonType, T, enable_if_t < !is_basic_json<T>::value >>
+{
+ using serializer = typename BasicJsonType::template json_serializer<T, void>;
+
+ static constexpr bool value =
+ is_detected_exact<void, from_json_function, serializer,
+ const BasicJsonType&, T&>::value;
+};
+
+// This trait checks if JSONSerializer<T>::from_json(json const&) exists.
+// This overload is used for non-default-constructible user-defined types.
+template<typename BasicJsonType, typename T, typename = void>
+struct has_non_default_from_json : std::false_type {};
+
+template<typename BasicJsonType, typename T>
+struct has_non_default_from_json < BasicJsonType, T, enable_if_t < !is_basic_json<T>::value >>
+{
+ using serializer = typename BasicJsonType::template json_serializer<T, void>;
+
+ static constexpr bool value =
+ is_detected_exact<T, from_json_function, serializer,
+ const BasicJsonType&>::value;
+};
+
+// This trait checks if BasicJsonType::json_serializer<T>::to_json exists
+// Do not evaluate the trait when T is a basic_json type, to avoid template instantiation infinite recursion.
+template<typename BasicJsonType, typename T, typename = void>
+struct has_to_json : std::false_type {};
+
+template<typename BasicJsonType, typename T>
+struct has_to_json < BasicJsonType, T, enable_if_t < !is_basic_json<T>::value >>
+{
+ using serializer = typename BasicJsonType::template json_serializer<T, void>;
+
+ static constexpr bool value =
+ is_detected_exact<void, to_json_function, serializer, BasicJsonType&,
+ T>::value;
+};
+
+
+///////////////////
+// is_ functions //
+///////////////////
+
+// https://en.cppreference.com/w/cpp/types/conjunction
+template<class...> struct conjunction : std::true_type { };
+template<class B1> struct conjunction<B1> : B1 { };
+template<class B1, class... Bn>
+struct conjunction<B1, Bn...>
+: std::conditional<bool(B1::value), conjunction<Bn...>, B1>::type {};
+
+// https://en.cppreference.com/w/cpp/types/negation
+template<class B> struct negation : std::integral_constant < bool, !B::value > { };
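+
+// A sketch of the reimplemented traits (they behave like std::conjunction and
+// std::negation, but without requiring C++17):
+//
+//     static_assert(conjunction<std::is_integral<int>,
+//                               std::is_signed<int>>::value, "");
+//     static_assert(negation<std::false_type>::value, "");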
+
+// Reimplementation of is_constructible and is_default_constructible, which were broken for
+// std::pair and std::tuple until the LWG 2367 fix (see https://cplusplus.github.io/LWG/lwg-defects.html#2367).
+// Without the fix, those traits cause compile errors in e.g. clang 3.5 or gcc 4.9.
+template <typename T>
+struct is_default_constructible : std::is_default_constructible<T> {};
+
+template <typename T1, typename T2>
+struct is_default_constructible<std::pair<T1, T2>>
+ : conjunction<is_default_constructible<T1>, is_default_constructible<T2>> {};
+
+template <typename T1, typename T2>
+struct is_default_constructible<const std::pair<T1, T2>>
+ : conjunction<is_default_constructible<T1>, is_default_constructible<T2>> {};
+
+template <typename... Ts>
+struct is_default_constructible<std::tuple<Ts...>>
+ : conjunction<is_default_constructible<Ts>...> {};
+
+template <typename... Ts>
+struct is_default_constructible<const std::tuple<Ts...>>
+ : conjunction<is_default_constructible<Ts>...> {};
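+
+// With the specializations above the trait recurses element-wise, so
+// (a sketch; `no_default` is illustrative):
+//
+//     struct no_default { no_default() = delete; };
+//     static_assert(is_default_constructible<std::pair<int, int>>::value, "");
+//     static_assert(!is_default_constructible<std::tuple<int, no_default>>::value, "");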
+
+
+template <typename T, typename... Args>
+struct is_constructible : std::is_constructible<T, Args...> {};
+
+template <typename T1, typename T2>
+struct is_constructible<std::pair<T1, T2>> : is_default_constructible<std::pair<T1, T2>> {};
+
+template <typename T1, typename T2>
+struct is_constructible<const std::pair<T1, T2>> : is_default_constructible<const std::pair<T1, T2>> {};
+
+template <typename... Ts>
+struct is_constructible<std::tuple<Ts...>> : is_default_constructible<std::tuple<Ts...>> {};
+
+template <typename... Ts>
+struct is_constructible<const std::tuple<Ts...>> : is_default_constructible<const std::tuple<Ts...>> {};
+
+
+template<typename T, typename = void>
+struct is_iterator_traits : std::false_type {};
+
+template<typename T>
+struct is_iterator_traits<iterator_traits<T>>
+{
+ private:
+ using traits = iterator_traits<T>;
+
+ public:
+ static constexpr auto value =
+ is_detected<value_type_t, traits>::value &&
+ is_detected<difference_type_t, traits>::value &&
+ is_detected<pointer_t, traits>::value &&
+ is_detected<iterator_category_t, traits>::value &&
+ is_detected<reference_t, traits>::value;
+};
+
+// The following implementation of is_complete_type is taken from
+// https://blogs.msdn.microsoft.com/vcblog/2015/12/02/partial-support-for-expression-sfinae-in-vs-2015-update-1/
+// and is written by Xiang Fan who agreed to using it in this library.
+
+template<typename T, typename = void>
+struct is_complete_type : std::false_type {};
+
+template<typename T>
+struct is_complete_type<T, decltype(void(sizeof(T)))> : std::true_type {};
+
+template<typename BasicJsonType, typename CompatibleObjectType,
+ typename = void>
+struct is_compatible_object_type_impl : std::false_type {};
+
+template<typename BasicJsonType, typename CompatibleObjectType>
+struct is_compatible_object_type_impl <
+ BasicJsonType, CompatibleObjectType,
+ enable_if_t < is_detected<mapped_type_t, CompatibleObjectType>::value&&
+ is_detected<key_type_t, CompatibleObjectType>::value >>
+{
+ using object_t = typename BasicJsonType::object_t;
+
+ // macOS's is_constructible does not play well with nonesuch...
+ static constexpr bool value =
+ is_constructible<typename object_t::key_type,
+ typename CompatibleObjectType::key_type>::value &&
+ is_constructible<typename object_t::mapped_type,
+ typename CompatibleObjectType::mapped_type>::value;
+};
+
+template<typename BasicJsonType, typename CompatibleObjectType>
+struct is_compatible_object_type
+ : is_compatible_object_type_impl<BasicJsonType, CompatibleObjectType> {};
+
+template<typename BasicJsonType, typename ConstructibleObjectType,
+ typename = void>
+struct is_constructible_object_type_impl : std::false_type {};
+
+template<typename BasicJsonType, typename ConstructibleObjectType>
+struct is_constructible_object_type_impl <
+ BasicJsonType, ConstructibleObjectType,
+ enable_if_t < is_detected<mapped_type_t, ConstructibleObjectType>::value&&
+ is_detected<key_type_t, ConstructibleObjectType>::value >>
+{
+ using object_t = typename BasicJsonType::object_t;
+
+ static constexpr bool value =
+ (is_default_constructible<ConstructibleObjectType>::value &&
+ (std::is_move_assignable<ConstructibleObjectType>::value ||
+ std::is_copy_assignable<ConstructibleObjectType>::value) &&
+ (is_constructible<typename ConstructibleObjectType::key_type,
+ typename object_t::key_type>::value &&
+ std::is_same <
+ typename object_t::mapped_type,
+ typename ConstructibleObjectType::mapped_type >::value)) ||
+ (has_from_json<BasicJsonType,
+ typename ConstructibleObjectType::mapped_type>::value ||
+ has_non_default_from_json <
+ BasicJsonType,
+ typename ConstructibleObjectType::mapped_type >::value);
+};
+
+template<typename BasicJsonType, typename ConstructibleObjectType>
+struct is_constructible_object_type
+ : is_constructible_object_type_impl<BasicJsonType,
+ ConstructibleObjectType> {};
+
+template<typename BasicJsonType, typename CompatibleStringType,
+ typename = void>
+struct is_compatible_string_type_impl : std::false_type {};
+
+template<typename BasicJsonType, typename CompatibleStringType>
+struct is_compatible_string_type_impl <
+ BasicJsonType, CompatibleStringType,
+ enable_if_t<is_detected_exact<typename BasicJsonType::string_t::value_type,
+ value_type_t, CompatibleStringType>::value >>
+{
+ static constexpr auto value =
+ is_constructible<typename BasicJsonType::string_t, CompatibleStringType>::value;
+};
+
+template<typename BasicJsonType, typename ConstructibleStringType>
+struct is_compatible_string_type
+ : is_compatible_string_type_impl<BasicJsonType, ConstructibleStringType> {};
+
+template<typename BasicJsonType, typename ConstructibleStringType,
+ typename = void>
+struct is_constructible_string_type_impl : std::false_type {};
+
+template<typename BasicJsonType, typename ConstructibleStringType>
+struct is_constructible_string_type_impl <
+ BasicJsonType, ConstructibleStringType,
+ enable_if_t<is_detected_exact<typename BasicJsonType::string_t::value_type,
+ value_type_t, ConstructibleStringType>::value >>
+{
+ static constexpr auto value =
+ is_constructible<ConstructibleStringType,
+ typename BasicJsonType::string_t>::value;
+};
+
+template<typename BasicJsonType, typename ConstructibleStringType>
+struct is_constructible_string_type
+ : is_constructible_string_type_impl<BasicJsonType, ConstructibleStringType> {};
+
+template<typename BasicJsonType, typename CompatibleArrayType, typename = void>
+struct is_compatible_array_type_impl : std::false_type {};
+
+template<typename BasicJsonType, typename CompatibleArrayType>
+struct is_compatible_array_type_impl <
+ BasicJsonType, CompatibleArrayType,
+ enable_if_t < is_detected<value_type_t, CompatibleArrayType>::value&&
+ is_detected<iterator_t, CompatibleArrayType>::value&&
+// This is needed because json_reverse_iterator has a ::iterator type...
+// Therefore it is detected as a CompatibleArrayType.
+// The real fix would be to have an Iterable concept.
+ !is_iterator_traits <
+ iterator_traits<CompatibleArrayType >>::value >>
+{
+ static constexpr bool value =
+ is_constructible<BasicJsonType,
+ typename CompatibleArrayType::value_type>::value;
+};
+
+template<typename BasicJsonType, typename CompatibleArrayType>
+struct is_compatible_array_type
+ : is_compatible_array_type_impl<BasicJsonType, CompatibleArrayType> {};
+
+template<typename BasicJsonType, typename ConstructibleArrayType, typename = void>
+struct is_constructible_array_type_impl : std::false_type {};
+
+template<typename BasicJsonType, typename ConstructibleArrayType>
+struct is_constructible_array_type_impl <
+ BasicJsonType, ConstructibleArrayType,
+ enable_if_t<std::is_same<ConstructibleArrayType,
+ typename BasicJsonType::value_type>::value >>
+ : std::true_type {};
+
+template<typename BasicJsonType, typename ConstructibleArrayType>
+struct is_constructible_array_type_impl <
+ BasicJsonType, ConstructibleArrayType,
+ enable_if_t < !std::is_same<ConstructibleArrayType,
+ typename BasicJsonType::value_type>::value&&
+ is_default_constructible<ConstructibleArrayType>::value&&
+(std::is_move_assignable<ConstructibleArrayType>::value ||
+ std::is_copy_assignable<ConstructibleArrayType>::value)&&
+is_detected<value_type_t, ConstructibleArrayType>::value&&
+is_detected<iterator_t, ConstructibleArrayType>::value&&
+is_complete_type <
+detected_t<value_type_t, ConstructibleArrayType >>::value >>
+{
+ static constexpr bool value =
+        // This is needed because json_reverse_iterator has a ::iterator type;
+        // furthermore, std::back_insert_iterator (and other iterators) have a
+        // base class `iterator`... Therefore it is detected as a
+        // ConstructibleArrayType. The real fix would be to have an Iterable
+        // concept.
+ !is_iterator_traits<iterator_traits<ConstructibleArrayType>>::value &&
+
+ (std::is_same<typename ConstructibleArrayType::value_type,
+ typename BasicJsonType::array_t::value_type>::value ||
+ has_from_json<BasicJsonType,
+ typename ConstructibleArrayType::value_type>::value ||
+ has_non_default_from_json <
+ BasicJsonType, typename ConstructibleArrayType::value_type >::value);
+};
+
+template<typename BasicJsonType, typename ConstructibleArrayType>
+struct is_constructible_array_type
+ : is_constructible_array_type_impl<BasicJsonType, ConstructibleArrayType> {};
+
+template<typename RealIntegerType, typename CompatibleNumberIntegerType,
+ typename = void>
+struct is_compatible_integer_type_impl : std::false_type {};
+
+template<typename RealIntegerType, typename CompatibleNumberIntegerType>
+struct is_compatible_integer_type_impl <
+ RealIntegerType, CompatibleNumberIntegerType,
+ enable_if_t < std::is_integral<RealIntegerType>::value&&
+ std::is_integral<CompatibleNumberIntegerType>::value&&
+ !std::is_same<bool, CompatibleNumberIntegerType>::value >>
+{
+ // is there an assert somewhere on overflows?
+ using RealLimits = std::numeric_limits<RealIntegerType>;
+ using CompatibleLimits = std::numeric_limits<CompatibleNumberIntegerType>;
+
+ static constexpr auto value =
+ is_constructible<RealIntegerType,
+ CompatibleNumberIntegerType>::value &&
+ CompatibleLimits::is_integer &&
+ RealLimits::is_signed == CompatibleLimits::is_signed;
+};
+
+template<typename RealIntegerType, typename CompatibleNumberIntegerType>
+struct is_compatible_integer_type
+ : is_compatible_integer_type_impl<RealIntegerType,
+ CompatibleNumberIntegerType> {};
+
+template<typename BasicJsonType, typename CompatibleType, typename = void>
+struct is_compatible_type_impl: std::false_type {};
+
+template<typename BasicJsonType, typename CompatibleType>
+struct is_compatible_type_impl <
+ BasicJsonType, CompatibleType,
+ enable_if_t<is_complete_type<CompatibleType>::value >>
+{
+ static constexpr bool value =
+ has_to_json<BasicJsonType, CompatibleType>::value;
+};
+
+template<typename BasicJsonType, typename CompatibleType>
+struct is_compatible_type
+ : is_compatible_type_impl<BasicJsonType, CompatibleType> {};
+
+template<typename T1, typename T2>
+struct is_constructible_tuple : std::false_type {};
+
+template<typename T1, typename... Args>
+struct is_constructible_tuple<T1, std::tuple<Args...>> : conjunction<is_constructible<T1, Args>...> {};
+
+// a naive helper to check if a type is an ordered_map (exploits the fact that
+// ordered_map inherits capacity() from std::vector)
+template <typename T>
+struct is_ordered_map
+{
+ using one = char;
+
+ struct two
+ {
+ char x[2]; // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
+ };
+
+ template <typename C> static one test( decltype(&C::capacity) ) ;
+ template <typename C> static two test(...);
+
+ enum { value = sizeof(test<T>(nullptr)) == sizeof(char) }; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
+};
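+
+// A sketch of the detection: the first test() overload is viable only if
+// &C::capacity is well-formed, which holds for ordered_map via its
+// std::vector base but not for std::map:
+//
+//     static_assert(is_ordered_map<nlohmann::ordered_map<std::string, int>>::value, "");
+//     static_assert(!is_ordered_map<std::map<std::string, int>>::value, "");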
+
+// to avoid useless casts (see https://github.com/nlohmann/json/issues/2893#issuecomment-889152324)
+template < typename T, typename U, enable_if_t < !std::is_same<T, U>::value, int > = 0 >
+T conditional_static_cast(U value)
+{
+ return static_cast<T>(value);
+}
+
+template<typename T, typename U, enable_if_t<std::is_same<T, U>::value, int> = 0>
+T conditional_static_cast(U value)
+{
+ return value;
+}
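+
+// A sketch of the dispatch: the cast is emitted only when T and U actually
+// differ, which avoids -Wuseless-cast style warnings when T == U:
+//
+//     auto a = conditional_static_cast<std::size_t>(42);  // int -> std::size_t, casts
+//     auto b = conditional_static_cast<int>(42);          // int -> int, returned as-is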
+
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename std::nullptr_t& n)
+{
+ if (JSON_HEDLEY_UNLIKELY(!j.is_null()))
+ {
+ JSON_THROW(type_error::create(302, "type must be null, but is " + std::string(j.type_name()), j));
+ }
+ n = nullptr;
+}
+
+// overloads for basic_json template parameters
+template < typename BasicJsonType, typename ArithmeticType,
+ enable_if_t < std::is_arithmetic<ArithmeticType>::value&&
+ !std::is_same<ArithmeticType, typename BasicJsonType::boolean_t>::value,
+ int > = 0 >
+void get_arithmetic_value(const BasicJsonType& j, ArithmeticType& val)
+{
+ switch (static_cast<value_t>(j))
+ {
+ case value_t::number_unsigned:
+ {
+ val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_unsigned_t*>());
+ break;
+ }
+ case value_t::number_integer:
+ {
+ val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_integer_t*>());
+ break;
+ }
+ case value_t::number_float:
+ {
+ val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_float_t*>());
+ break;
+ }
+
+ case value_t::null:
+ case value_t::object:
+ case value_t::array:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ JSON_THROW(type_error::create(302, "type must be number, but is " + std::string(j.type_name()), j));
+ }
+}
+
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename BasicJsonType::boolean_t& b)
+{
+ if (JSON_HEDLEY_UNLIKELY(!j.is_boolean()))
+ {
+ JSON_THROW(type_error::create(302, "type must be boolean, but is " + std::string(j.type_name()), j));
+ }
+ b = *j.template get_ptr<const typename BasicJsonType::boolean_t*>();
+}
+
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename BasicJsonType::string_t& s)
+{
+ if (JSON_HEDLEY_UNLIKELY(!j.is_string()))
+ {
+ JSON_THROW(type_error::create(302, "type must be string, but is " + std::string(j.type_name()), j));
+ }
+ s = *j.template get_ptr<const typename BasicJsonType::string_t*>();
+}
+
+template <
+ typename BasicJsonType, typename ConstructibleStringType,
+ enable_if_t <
+ is_constructible_string_type<BasicJsonType, ConstructibleStringType>::value&&
+ !std::is_same<typename BasicJsonType::string_t,
+ ConstructibleStringType>::value,
+ int > = 0 >
+void from_json(const BasicJsonType& j, ConstructibleStringType& s)
+{
+ if (JSON_HEDLEY_UNLIKELY(!j.is_string()))
+ {
+ JSON_THROW(type_error::create(302, "type must be string, but is " + std::string(j.type_name()), j));
+ }
+
+ s = *j.template get_ptr<const typename BasicJsonType::string_t*>();
+}
+
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename BasicJsonType::number_float_t& val)
+{
+ get_arithmetic_value(j, val);
+}
+
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename BasicJsonType::number_unsigned_t& val)
+{
+ get_arithmetic_value(j, val);
+}
+
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename BasicJsonType::number_integer_t& val)
+{
+ get_arithmetic_value(j, val);
+}
+
+template<typename BasicJsonType, typename EnumType,
+ enable_if_t<std::is_enum<EnumType>::value, int> = 0>
+void from_json(const BasicJsonType& j, EnumType& e)
+{
+ typename std::underlying_type<EnumType>::type val;
+ get_arithmetic_value(j, val);
+ e = static_cast<EnumType>(val);
+}
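+
+// A sketch of the enum path (type names are illustrative): the value is read
+// through the enum's underlying integer type and then cast back:
+//
+//     enum class color : int { red = 0, green = 1 };
+//     nlohmann::json j = 1;
+//     auto c = j.get<color>();  // color::green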
+
+// forward_list doesn't have an insert method
+template<typename BasicJsonType, typename T, typename Allocator,
+ enable_if_t<is_getable<BasicJsonType, T>::value, int> = 0>
+void from_json(const BasicJsonType& j, std::forward_list<T, Allocator>& l)
+{
+ if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
+ {
+ JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j));
+ }
+ l.clear();
+ std::transform(j.rbegin(), j.rend(),
+ std::front_inserter(l), [](const BasicJsonType & i)
+ {
+ return i.template get<T>();
+ });
+}
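+
+// A sketch of why the reverse traversal matters: front_inserter prepends, so
+// consuming the array back-to-front preserves the original element order:
+//
+//     nlohmann::json j = {1, 2, 3};
+//     auto l = j.get<std::forward_list<int>>();  // 1, 2, 3 (not reversed)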
+
+// valarray doesn't have an insert method
+template<typename BasicJsonType, typename T,
+ enable_if_t<is_getable<BasicJsonType, T>::value, int> = 0>
+void from_json(const BasicJsonType& j, std::valarray<T>& l)
+{
+ if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
+ {
+ JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j));
+ }
+ l.resize(j.size());
+ std::transform(j.begin(), j.end(), std::begin(l),
+ [](const BasicJsonType & elem)
+ {
+ return elem.template get<T>();
+ });
+}
+
+template<typename BasicJsonType, typename T, std::size_t N>
+auto from_json(const BasicJsonType& j, T (&arr)[N]) // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
+-> decltype(j.template get<T>(), void())
+{
+ for (std::size_t i = 0; i < N; ++i)
+ {
+ arr[i] = j.at(i).template get<T>();
+ }
+}
+
+template<typename BasicJsonType>
+void from_json_array_impl(const BasicJsonType& j, typename BasicJsonType::array_t& arr, priority_tag<3> /*unused*/)
+{
+ arr = *j.template get_ptr<const typename BasicJsonType::array_t*>();
+}
+
+template<typename BasicJsonType, typename T, std::size_t N>
+auto from_json_array_impl(const BasicJsonType& j, std::array<T, N>& arr,
+ priority_tag<2> /*unused*/)
+-> decltype(j.template get<T>(), void())
+{
+ for (std::size_t i = 0; i < N; ++i)
+ {
+ arr[i] = j.at(i).template get<T>();
+ }
+}
+
+template<typename BasicJsonType, typename ConstructibleArrayType,
+ enable_if_t<
+ std::is_assignable<ConstructibleArrayType&, ConstructibleArrayType>::value,
+ int> = 0>
+auto from_json_array_impl(const BasicJsonType& j, ConstructibleArrayType& arr, priority_tag<1> /*unused*/)
+-> decltype(
+ arr.reserve(std::declval<typename ConstructibleArrayType::size_type>()),
+ j.template get<typename ConstructibleArrayType::value_type>(),
+ void())
+{
+ using std::end;
+
+ ConstructibleArrayType ret;
+ ret.reserve(j.size());
+ std::transform(j.begin(), j.end(),
+ std::inserter(ret, end(ret)), [](const BasicJsonType & i)
+ {
+        // get<BasicJsonType>() returns *this; it won't call a from_json
+        // method when value_type is BasicJsonType
+ return i.template get<typename ConstructibleArrayType::value_type>();
+ });
+ arr = std::move(ret);
+}
+
+template<typename BasicJsonType, typename ConstructibleArrayType,
+ enable_if_t<
+ std::is_assignable<ConstructibleArrayType&, ConstructibleArrayType>::value,
+ int> = 0>
+void from_json_array_impl(const BasicJsonType& j, ConstructibleArrayType& arr,
+ priority_tag<0> /*unused*/)
+{
+ using std::end;
+
+ ConstructibleArrayType ret;
+ std::transform(
+ j.begin(), j.end(), std::inserter(ret, end(ret)),
+ [](const BasicJsonType & i)
+ {
+        // get<BasicJsonType>() returns *this; it won't call a from_json
+        // method when value_type is BasicJsonType
+ return i.template get<typename ConstructibleArrayType::value_type>();
+ });
+ arr = std::move(ret);
+}
+
+template < typename BasicJsonType, typename ConstructibleArrayType,
+ enable_if_t <
+ is_constructible_array_type<BasicJsonType, ConstructibleArrayType>::value&&
+ !is_constructible_object_type<BasicJsonType, ConstructibleArrayType>::value&&
+ !is_constructible_string_type<BasicJsonType, ConstructibleArrayType>::value&&
+ !std::is_same<ConstructibleArrayType, typename BasicJsonType::binary_t>::value&&
+ !is_basic_json<ConstructibleArrayType>::value,
+ int > = 0 >
+auto from_json(const BasicJsonType& j, ConstructibleArrayType& arr)
+-> decltype(from_json_array_impl(j, arr, priority_tag<3> {}),
+j.template get<typename ConstructibleArrayType::value_type>(),
+void())
+{
+ if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
+ {
+ JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j));
+ }
+
+ from_json_array_impl(j, arr, priority_tag<3> {});
+}
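+
+// A sketch of the priority_tag dispatch used above: priority_tag<3> derives
+// from priority_tag<2> and so on, so overload resolution prefers the
+// highest-numbered implementation that is well-formed and falls back via
+// SFINAE otherwise:
+//
+//     from_json_array_impl(j, arr, priority_tag<3> {});  // tries 3, 2, 1, 0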
+
+template < typename BasicJsonType, typename T, std::size_t... Idx >
+std::array<T, sizeof...(Idx)> from_json_inplace_array_impl(BasicJsonType&& j,
+ identity_tag<std::array<T, sizeof...(Idx)>> /*unused*/, index_sequence<Idx...> /*unused*/)
+{
+ return { { std::forward<BasicJsonType>(j).at(Idx).template get<T>()... } };
+}
+
+template < typename BasicJsonType, typename T, std::size_t N >
+auto from_json(BasicJsonType&& j, identity_tag<std::array<T, N>> tag)
+-> decltype(from_json_inplace_array_impl(std::forward<BasicJsonType>(j), tag, make_index_sequence<N> {}))
+{
+ if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
+ {
+ JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j));
+ }
+
+ return from_json_inplace_array_impl(std::forward<BasicJsonType>(j), tag, make_index_sequence<N> {});
+}
+
+template<typename BasicJsonType>
+void from_json(const BasicJsonType& j, typename BasicJsonType::binary_t& bin)
+{
+ if (JSON_HEDLEY_UNLIKELY(!j.is_binary()))
+ {
+ JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(j.type_name()), j));
+ }
+
+ bin = *j.template get_ptr<const typename BasicJsonType::binary_t*>();
+}
+
+template<typename BasicJsonType, typename ConstructibleObjectType,
+ enable_if_t<is_constructible_object_type<BasicJsonType, ConstructibleObjectType>::value, int> = 0>
+void from_json(const BasicJsonType& j, ConstructibleObjectType& obj)
+{
+ if (JSON_HEDLEY_UNLIKELY(!j.is_object()))
+ {
+ JSON_THROW(type_error::create(302, "type must be object, but is " + std::string(j.type_name()), j));
+ }
+
+ ConstructibleObjectType ret;
+ const auto* inner_object = j.template get_ptr<const typename BasicJsonType::object_t*>();
+ using value_type = typename ConstructibleObjectType::value_type;
+ std::transform(
+ inner_object->begin(), inner_object->end(),
+ std::inserter(ret, ret.begin()),
+ [](typename BasicJsonType::object_t::value_type const & p)
+ {
+ return value_type(p.first, p.second.template get<typename ConstructibleObjectType::mapped_type>());
+ });
+ obj = std::move(ret);
+}
+
+// overload for arithmetic types, not chosen for basic_json template arguments
+// (BooleanType, etc.); note: is it really necessary to provide explicit
+// overloads for boolean_t etc. in case of a custom BooleanType which is not
+// an arithmetic type?
+template < typename BasicJsonType, typename ArithmeticType,
+ enable_if_t <
+ std::is_arithmetic<ArithmeticType>::value&&
+ !std::is_same<ArithmeticType, typename BasicJsonType::number_unsigned_t>::value&&
+ !std::is_same<ArithmeticType, typename BasicJsonType::number_integer_t>::value&&
+ !std::is_same<ArithmeticType, typename BasicJsonType::number_float_t>::value&&
+ !std::is_same<ArithmeticType, typename BasicJsonType::boolean_t>::value,
+ int > = 0 >
+void from_json(const BasicJsonType& j, ArithmeticType& val)
+{
+ switch (static_cast<value_t>(j))
+ {
+ case value_t::number_unsigned:
+ {
+ val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_unsigned_t*>());
+ break;
+ }
+ case value_t::number_integer:
+ {
+ val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_integer_t*>());
+ break;
+ }
+ case value_t::number_float:
+ {
+ val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::number_float_t*>());
+ break;
+ }
+ case value_t::boolean:
+ {
+ val = static_cast<ArithmeticType>(*j.template get_ptr<const typename BasicJsonType::boolean_t*>());
+ break;
+ }
+
+ case value_t::null:
+ case value_t::object:
+ case value_t::array:
+ case value_t::string:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ JSON_THROW(type_error::create(302, "type must be number, but is " + std::string(j.type_name()), j));
+ }
+}
+
+template<typename BasicJsonType, typename... Args, std::size_t... Idx>
+std::tuple<Args...> from_json_tuple_impl_base(BasicJsonType&& j, index_sequence<Idx...> /*unused*/)
+{
+ return std::make_tuple(std::forward<BasicJsonType>(j).at(Idx).template get<Args>()...);
+}
+
+template < typename BasicJsonType, class A1, class A2 >
+std::pair<A1, A2> from_json_tuple_impl(BasicJsonType&& j, identity_tag<std::pair<A1, A2>> /*unused*/, priority_tag<0> /*unused*/)
+{
+ return {std::forward<BasicJsonType>(j).at(0).template get<A1>(),
+ std::forward<BasicJsonType>(j).at(1).template get<A2>()};
+}
+
+template<typename BasicJsonType, typename A1, typename A2>
+void from_json_tuple_impl(BasicJsonType&& j, std::pair<A1, A2>& p, priority_tag<1> /*unused*/)
+{
+ p = from_json_tuple_impl(std::forward<BasicJsonType>(j), identity_tag<std::pair<A1, A2>> {}, priority_tag<0> {});
+}
+
+template<typename BasicJsonType, typename... Args>
+std::tuple<Args...> from_json_tuple_impl(BasicJsonType&& j, identity_tag<std::tuple<Args...>> /*unused*/, priority_tag<2> /*unused*/)
+{
+ return from_json_tuple_impl_base<BasicJsonType, Args...>(std::forward<BasicJsonType>(j), index_sequence_for<Args...> {});
+}
+
+template<typename BasicJsonType, typename... Args>
+void from_json_tuple_impl(BasicJsonType&& j, std::tuple<Args...>& t, priority_tag<3> /*unused*/)
+{
+ t = from_json_tuple_impl_base<BasicJsonType, Args...>(std::forward<BasicJsonType>(j), index_sequence_for<Args...> {});
+}
+
+template<typename BasicJsonType, typename TupleRelated>
+auto from_json(BasicJsonType&& j, TupleRelated&& t)
+-> decltype(from_json_tuple_impl(std::forward<BasicJsonType>(j), std::forward<TupleRelated>(t), priority_tag<3> {}))
+{
+ if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
+ {
+ JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j));
+ }
+
+ return from_json_tuple_impl(std::forward<BasicJsonType>(j), std::forward<TupleRelated>(t), priority_tag<3> {});
+}
+
+template < typename BasicJsonType, typename Key, typename Value, typename Compare, typename Allocator,
+ typename = enable_if_t < !std::is_constructible <
+ typename BasicJsonType::string_t, Key >::value >>
+void from_json(const BasicJsonType& j, std::map<Key, Value, Compare, Allocator>& m)
+{
+ if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
+ {
+ JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j));
+ }
+ m.clear();
+ for (const auto& p : j)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!p.is_array()))
+ {
+ JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(p.type_name()), j));
+ }
+ m.emplace(p.at(0).template get<Key>(), p.at(1).template get<Value>());
+ }
+}
+
+template < typename BasicJsonType, typename Key, typename Value, typename Hash, typename KeyEqual, typename Allocator,
+ typename = enable_if_t < !std::is_constructible <
+ typename BasicJsonType::string_t, Key >::value >>
+void from_json(const BasicJsonType& j, std::unordered_map<Key, Value, Hash, KeyEqual, Allocator>& m)
+{
+ if (JSON_HEDLEY_UNLIKELY(!j.is_array()))
+ {
+ JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(j.type_name()), j));
+ }
+ m.clear();
+ for (const auto& p : j)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!p.is_array()))
+ {
+ JSON_THROW(type_error::create(302, "type must be array, but is " + std::string(p.type_name()), j));
+ }
+ m.emplace(p.at(0).template get<Key>(), p.at(1).template get<Value>());
+ }
+}
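+
+// A sketch of the wire format both overloads above expect: maps with
+// non-string keys round-trip through an array of [key, value] pairs rather
+// than a JSON object:
+//
+//     nlohmann::json j = { {1, "one"}, {2, "two"} };  // [[1,"one"],[2,"two"]]
+//     auto m = j.get<std::map<int, std::string>>();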
+
+struct from_json_fn
+{
+ template<typename BasicJsonType, typename T>
+ auto operator()(const BasicJsonType& j, T&& val) const
+ noexcept(noexcept(from_json(j, std::forward<T>(val))))
+ -> decltype(from_json(j, std::forward<T>(val)))
+ {
+ return from_json(j, std::forward<T>(val));
+ }
+};
+} // namespace detail
+
+/// namespace to hold default `from_json` function
+/// to see why this is required:
+/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4381.html
+namespace // NOLINT(cert-dcl59-cpp,fuchsia-header-anon-namespaces,google-build-namespaces)
+{
+constexpr const auto& from_json = detail::static_const<detail::from_json_fn>::value; // NOLINT(misc-definitions-in-headers)
+} // namespace
+} // namespace nlohmann
+
+// #include <nlohmann/detail/conversions/to_json.hpp>
+
+
+#include <algorithm> // copy
+#include <iterator> // begin, end
+#include <string> // string
+#include <tuple> // tuple, get
+#include <type_traits> // is_same, is_constructible, is_floating_point, is_enum, underlying_type
+#include <utility> // move, forward, declval, pair
+#include <valarray> // valarray
+#include <vector> // vector
+
+// #include <nlohmann/detail/iterators/iteration_proxy.hpp>
+
+
+#include <cstddef> // size_t
+#include <iterator> // input_iterator_tag
+#include <string> // string, to_string
+#include <tuple> // tuple_size, get, tuple_element
+#include <utility> // move
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+template<typename string_type>
+void int_to_string( string_type& target, std::size_t value )
+{
+    // bring std::to_string into scope so that ADL can also find to_string overloads for user-provided string types
+ using std::to_string;
+ target = to_string(value);
+}
+template<typename IteratorType> class iteration_proxy_value
+{
+ public:
+ using difference_type = std::ptrdiff_t;
+ using value_type = iteration_proxy_value;
+ using pointer = value_type * ;
+ using reference = value_type & ;
+ using iterator_category = std::input_iterator_tag;
+ using string_type = typename std::remove_cv< typename std::remove_reference<decltype( std::declval<IteratorType>().key() ) >::type >::type;
+
+ private:
+ /// the iterator
+ IteratorType anchor;
+ /// an index for arrays (used to create key names)
+ std::size_t array_index = 0;
+ /// last stringified array index
+ mutable std::size_t array_index_last = 0;
+ /// a string representation of the array index
+ mutable string_type array_index_str = "0";
+ /// an empty string (to return a reference for primitive values)
+ const string_type empty_str{};
+
+ public:
+ explicit iteration_proxy_value(IteratorType it) noexcept
+ : anchor(std::move(it))
+ {}
+
+ /// dereference operator (needed for range-based for)
+ iteration_proxy_value& operator*()
+ {
+ return *this;
+ }
+
+ /// increment operator (needed for range-based for)
+ iteration_proxy_value& operator++()
+ {
+ ++anchor;
+ ++array_index;
+
+ return *this;
+ }
+
+ /// equality operator (needed for InputIterator)
+ bool operator==(const iteration_proxy_value& o) const
+ {
+ return anchor == o.anchor;
+ }
+
+ /// inequality operator (needed for range-based for)
+ bool operator!=(const iteration_proxy_value& o) const
+ {
+ return anchor != o.anchor;
+ }
+
+ /// return key of the iterator
+ const string_type& key() const
+ {
+ JSON_ASSERT(anchor.m_object != nullptr);
+
+ switch (anchor.m_object->type())
+ {
+ // use integer array index as key
+ case value_t::array:
+ {
+ if (array_index != array_index_last)
+ {
+ int_to_string( array_index_str, array_index );
+ array_index_last = array_index;
+ }
+ return array_index_str;
+ }
+
+ // use key from the object
+ case value_t::object:
+ return anchor.key();
+
+ // use an empty key for all primitive types
+ case value_t::null:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ return empty_str;
+ }
+ }
+
+ /// return value of the iterator
+ typename IteratorType::reference value() const
+ {
+ return anchor.value();
+ }
+};
+
+/// proxy class for the items() function
+template<typename IteratorType> class iteration_proxy
+{
+ private:
+ /// the container to iterate
+ typename IteratorType::reference container;
+
+ public:
+ /// construct iteration proxy from a container
+ explicit iteration_proxy(typename IteratorType::reference cont) noexcept
+ : container(cont) {}
+
+ /// return iterator begin (needed for range-based for)
+ iteration_proxy_value<IteratorType> begin() noexcept
+ {
+ return iteration_proxy_value<IteratorType>(container.begin());
+ }
+
+ /// return iterator end (needed for range-based for)
+ iteration_proxy_value<IteratorType> end() noexcept
+ {
+ return iteration_proxy_value<IteratorType>(container.end());
+ }
+};
+// Structured Bindings Support
+// For further reference see https://blog.tartanllama.xyz/structured-bindings/
+// And see https://github.com/nlohmann/json/pull/1391
+template<std::size_t N, typename IteratorType, enable_if_t<N == 0, int> = 0>
+auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.key())
+{
+ return i.key();
+}
+// Structured Bindings Support
+// For further reference see https://blog.tartanllama.xyz/structured-bindings/
+// And see https://github.com/nlohmann/json/pull/1391
+template<std::size_t N, typename IteratorType, enable_if_t<N == 1, int> = 0>
+auto get(const nlohmann::detail::iteration_proxy_value<IteratorType>& i) -> decltype(i.value())
+{
+ return i.value();
+}
+} // namespace detail
+} // namespace nlohmann
+
+// The addition to the std namespace is required to add
+// structured bindings support to the iteration_proxy_value class.
+// For further reference see https://blog.tartanllama.xyz/structured-bindings/
+// and see https://github.com/nlohmann/json/pull/1391
+namespace std
+{
+#if defined(__clang__)
+ // Fix: https://github.com/nlohmann/json/issues/1401
+ #pragma clang diagnostic push
+ #pragma clang diagnostic ignored "-Wmismatched-tags"
+#endif
+template<typename IteratorType>
+class tuple_size<::nlohmann::detail::iteration_proxy_value<IteratorType>>
+ : public std::integral_constant<std::size_t, 2> {};
+
+template<std::size_t N, typename IteratorType>
+class tuple_element<N, ::nlohmann::detail::iteration_proxy_value<IteratorType >>
+{
+ public:
+ using type = decltype(
+ get<N>(std::declval <
+ ::nlohmann::detail::iteration_proxy_value<IteratorType >> ()));
+};
+#if defined(__clang__)
+ #pragma clang diagnostic pop
+#endif
+} // namespace std
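+
+// A sketch of what the tuple_size/tuple_element machinery above enables:
+//
+//     nlohmann::json j = {{"one", 1}, {"two", 2}};
+//     for (auto& [key, value] : j.items())  // C++17 structured bindings
+//     {
+//         // key: object key (or stringified array index); value: the element
+//     }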
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+//////////////////
+// constructors //
+//////////////////
+
+/*
+ * Note all external_constructor<>::construct functions need to call
+ * j.m_value.destroy(j.m_type) to avoid a memory leak in case j contains an
+ * allocated value (e.g., a string). See bug issue
+ * https://github.com/nlohmann/json/issues/2865 for more information.
+ */
+
+template<value_t> struct external_constructor;
+
+template<>
+struct external_constructor<value_t::boolean>
+{
+ template<typename BasicJsonType>
+ static void construct(BasicJsonType& j, typename BasicJsonType::boolean_t b) noexcept
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::boolean;
+ j.m_value = b;
+ j.assert_invariant();
+ }
+};
+
+template<>
+struct external_constructor<value_t::string>
+{
+ template<typename BasicJsonType>
+ static void construct(BasicJsonType& j, const typename BasicJsonType::string_t& s)
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::string;
+ j.m_value = s;
+ j.assert_invariant();
+ }
+
+ template<typename BasicJsonType>
+ static void construct(BasicJsonType& j, typename BasicJsonType::string_t&& s)
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::string;
+ j.m_value = std::move(s);
+ j.assert_invariant();
+ }
+
+ template < typename BasicJsonType, typename CompatibleStringType,
+ enable_if_t < !std::is_same<CompatibleStringType, typename BasicJsonType::string_t>::value,
+ int > = 0 >
+ static void construct(BasicJsonType& j, const CompatibleStringType& str)
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::string;
+ j.m_value.string = j.template create<typename BasicJsonType::string_t>(str);
+ j.assert_invariant();
+ }
+};
+
+template<>
+struct external_constructor<value_t::binary>
+{
+ template<typename BasicJsonType>
+ static void construct(BasicJsonType& j, const typename BasicJsonType::binary_t& b)
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::binary;
+ j.m_value = typename BasicJsonType::binary_t(b);
+ j.assert_invariant();
+ }
+
+ template<typename BasicJsonType>
+ static void construct(BasicJsonType& j, typename BasicJsonType::binary_t&& b)
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::binary;
+ j.m_value = typename BasicJsonType::binary_t(std::move(b));
+ j.assert_invariant();
+ }
+};
+
+template<>
+struct external_constructor<value_t::number_float>
+{
+ template<typename BasicJsonType>
+ static void construct(BasicJsonType& j, typename BasicJsonType::number_float_t val) noexcept
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::number_float;
+ j.m_value = val;
+ j.assert_invariant();
+ }
+};
+
+template<>
+struct external_constructor<value_t::number_unsigned>
+{
+ template<typename BasicJsonType>
+ static void construct(BasicJsonType& j, typename BasicJsonType::number_unsigned_t val) noexcept
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::number_unsigned;
+ j.m_value = val;
+ j.assert_invariant();
+ }
+};
+
+template<>
+struct external_constructor<value_t::number_integer>
+{
+ template<typename BasicJsonType>
+ static void construct(BasicJsonType& j, typename BasicJsonType::number_integer_t val) noexcept
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::number_integer;
+ j.m_value = val;
+ j.assert_invariant();
+ }
+};
+
+template<>
+struct external_constructor<value_t::array>
+{
+ template<typename BasicJsonType>
+ static void construct(BasicJsonType& j, const typename BasicJsonType::array_t& arr)
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::array;
+ j.m_value = arr;
+ j.set_parents();
+ j.assert_invariant();
+ }
+
+ template<typename BasicJsonType>
+ static void construct(BasicJsonType& j, typename BasicJsonType::array_t&& arr)
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::array;
+ j.m_value = std::move(arr);
+ j.set_parents();
+ j.assert_invariant();
+ }
+
+ template < typename BasicJsonType, typename CompatibleArrayType,
+ enable_if_t < !std::is_same<CompatibleArrayType, typename BasicJsonType::array_t>::value,
+ int > = 0 >
+ static void construct(BasicJsonType& j, const CompatibleArrayType& arr)
+ {
+ using std::begin;
+ using std::end;
+
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::array;
+ j.m_value.array = j.template create<typename BasicJsonType::array_t>(begin(arr), end(arr));
+ j.set_parents();
+ j.assert_invariant();
+ }
+
+ template<typename BasicJsonType>
+ static void construct(BasicJsonType& j, const std::vector<bool>& arr)
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::array;
+ j.m_value = value_t::array;
+ j.m_value.array->reserve(arr.size());
+ for (const bool x : arr)
+ {
+ j.m_value.array->push_back(x);
+ j.set_parent(j.m_value.array->back());
+ }
+ j.assert_invariant();
+ }
+
+ template<typename BasicJsonType, typename T,
+ enable_if_t<std::is_convertible<T, BasicJsonType>::value, int> = 0>
+ static void construct(BasicJsonType& j, const std::valarray<T>& arr)
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::array;
+ j.m_value = value_t::array;
+ j.m_value.array->resize(arr.size());
+ if (arr.size() > 0)
+ {
+ std::copy(std::begin(arr), std::end(arr), j.m_value.array->begin());
+ }
+ j.set_parents();
+ j.assert_invariant();
+ }
+};
+
+template<>
+struct external_constructor<value_t::object>
+{
+ template<typename BasicJsonType>
+ static void construct(BasicJsonType& j, const typename BasicJsonType::object_t& obj)
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::object;
+ j.m_value = obj;
+ j.set_parents();
+ j.assert_invariant();
+ }
+
+ template<typename BasicJsonType>
+ static void construct(BasicJsonType& j, typename BasicJsonType::object_t&& obj)
+ {
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::object;
+ j.m_value = std::move(obj);
+ j.set_parents();
+ j.assert_invariant();
+ }
+
+ template < typename BasicJsonType, typename CompatibleObjectType,
+ enable_if_t < !std::is_same<CompatibleObjectType, typename BasicJsonType::object_t>::value, int > = 0 >
+ static void construct(BasicJsonType& j, const CompatibleObjectType& obj)
+ {
+ using std::begin;
+ using std::end;
+
+ j.m_value.destroy(j.m_type);
+ j.m_type = value_t::object;
+ j.m_value.object = j.template create<typename BasicJsonType::object_t>(begin(obj), end(obj));
+ j.set_parents();
+ j.assert_invariant();
+ }
+};
+
+/////////////
+// to_json //
+/////////////
+
+template<typename BasicJsonType, typename T,
+ enable_if_t<std::is_same<T, typename BasicJsonType::boolean_t>::value, int> = 0>
+void to_json(BasicJsonType& j, T b) noexcept
+{
+ external_constructor<value_t::boolean>::construct(j, b);
+}
+
+template<typename BasicJsonType, typename CompatibleString,
+ enable_if_t<std::is_constructible<typename BasicJsonType::string_t, CompatibleString>::value, int> = 0>
+void to_json(BasicJsonType& j, const CompatibleString& s)
+{
+ external_constructor<value_t::string>::construct(j, s);
+}
+
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, typename BasicJsonType::string_t&& s)
+{
+ external_constructor<value_t::string>::construct(j, std::move(s));
+}
+
+template<typename BasicJsonType, typename FloatType,
+ enable_if_t<std::is_floating_point<FloatType>::value, int> = 0>
+void to_json(BasicJsonType& j, FloatType val) noexcept
+{
+ external_constructor<value_t::number_float>::construct(j, static_cast<typename BasicJsonType::number_float_t>(val));
+}
+
+template<typename BasicJsonType, typename CompatibleNumberUnsignedType,
+ enable_if_t<is_compatible_integer_type<typename BasicJsonType::number_unsigned_t, CompatibleNumberUnsignedType>::value, int> = 0>
+void to_json(BasicJsonType& j, CompatibleNumberUnsignedType val) noexcept
+{
+ external_constructor<value_t::number_unsigned>::construct(j, static_cast<typename BasicJsonType::number_unsigned_t>(val));
+}
+
+template<typename BasicJsonType, typename CompatibleNumberIntegerType,
+ enable_if_t<is_compatible_integer_type<typename BasicJsonType::number_integer_t, CompatibleNumberIntegerType>::value, int> = 0>
+void to_json(BasicJsonType& j, CompatibleNumberIntegerType val) noexcept
+{
+ external_constructor<value_t::number_integer>::construct(j, static_cast<typename BasicJsonType::number_integer_t>(val));
+}
+
+template<typename BasicJsonType, typename EnumType,
+ enable_if_t<std::is_enum<EnumType>::value, int> = 0>
+void to_json(BasicJsonType& j, EnumType e) noexcept
+{
+ using underlying_type = typename std::underlying_type<EnumType>::type;
+ external_constructor<value_t::number_integer>::construct(j, static_cast<underlying_type>(e));
+}
+
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, const std::vector<bool>& e)
+{
+ external_constructor<value_t::array>::construct(j, e);
+}
+
+template < typename BasicJsonType, typename CompatibleArrayType,
+ enable_if_t < is_compatible_array_type<BasicJsonType,
+ CompatibleArrayType>::value&&
+ !is_compatible_object_type<BasicJsonType, CompatibleArrayType>::value&&
+ !is_compatible_string_type<BasicJsonType, CompatibleArrayType>::value&&
+ !std::is_same<typename BasicJsonType::binary_t, CompatibleArrayType>::value&&
+ !is_basic_json<CompatibleArrayType>::value,
+ int > = 0 >
+void to_json(BasicJsonType& j, const CompatibleArrayType& arr)
+{
+ external_constructor<value_t::array>::construct(j, arr);
+}
+
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, const typename BasicJsonType::binary_t& bin)
+{
+ external_constructor<value_t::binary>::construct(j, bin);
+}
+
+template<typename BasicJsonType, typename T,
+ enable_if_t<std::is_convertible<T, BasicJsonType>::value, int> = 0>
+void to_json(BasicJsonType& j, const std::valarray<T>& arr)
+{
+    external_constructor<value_t::array>::construct(j, arr);  // std::move on the const argument would be a no-op
+}
+
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, typename BasicJsonType::array_t&& arr)
+{
+ external_constructor<value_t::array>::construct(j, std::move(arr));
+}
+
+template < typename BasicJsonType, typename CompatibleObjectType,
+ enable_if_t < is_compatible_object_type<BasicJsonType, CompatibleObjectType>::value&& !is_basic_json<CompatibleObjectType>::value, int > = 0 >
+void to_json(BasicJsonType& j, const CompatibleObjectType& obj)
+{
+ external_constructor<value_t::object>::construct(j, obj);
+}
+
+template<typename BasicJsonType>
+void to_json(BasicJsonType& j, typename BasicJsonType::object_t&& obj)
+{
+ external_constructor<value_t::object>::construct(j, std::move(obj));
+}
+
+template <
+ typename BasicJsonType, typename T, std::size_t N,
+ enable_if_t < !std::is_constructible<typename BasicJsonType::string_t,
+ const T(&)[N]>::value, // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
+ int > = 0 >
+void to_json(BasicJsonType& j, const T(&arr)[N]) // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
+{
+ external_constructor<value_t::array>::construct(j, arr);
+}
+
+template < typename BasicJsonType, typename T1, typename T2, enable_if_t < std::is_constructible<BasicJsonType, T1>::value&& std::is_constructible<BasicJsonType, T2>::value, int > = 0 >
+void to_json(BasicJsonType& j, const std::pair<T1, T2>& p)
+{
+ j = { p.first, p.second };
+}
+
+// for https://github.com/nlohmann/json/pull/1134
+template<typename BasicJsonType, typename T,
+ enable_if_t<std::is_same<T, iteration_proxy_value<typename BasicJsonType::iterator>>::value, int> = 0>
+void to_json(BasicJsonType& j, const T& b)
+{
+ j = { {b.key(), b.value()} };
+}
+
+template<typename BasicJsonType, typename Tuple, std::size_t... Idx>
+void to_json_tuple_impl(BasicJsonType& j, const Tuple& t, index_sequence<Idx...> /*unused*/)
+{
+ j = { std::get<Idx>(t)... };
+}
+
+template<typename BasicJsonType, typename T, enable_if_t<is_constructible_tuple<BasicJsonType, T>::value, int > = 0>
+void to_json(BasicJsonType& j, const T& t)
+{
+ to_json_tuple_impl(j, t, make_index_sequence<std::tuple_size<T>::value> {});
+}
+
+struct to_json_fn
+{
+ template<typename BasicJsonType, typename T>
+ auto operator()(BasicJsonType& j, T&& val) const noexcept(noexcept(to_json(j, std::forward<T>(val))))
+ -> decltype(to_json(j, std::forward<T>(val)), void())
+ {
+ return to_json(j, std::forward<T>(val));
+ }
+};
+} // namespace detail
+
+/// namespace to hold default `to_json` function
+/// to see why this is required:
+/// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/n4381.html
+namespace // NOLINT(cert-dcl59-cpp,fuchsia-header-anon-namespaces,google-build-namespaces)
+{
+constexpr const auto& to_json = detail::static_const<detail::to_json_fn>::value; // NOLINT(misc-definitions-in-headers)
+} // namespace
+} // namespace nlohmann
+
+// #include <nlohmann/detail/meta/identity_tag.hpp>
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+
+namespace nlohmann
+{
+
+template<typename ValueType, typename>
+struct adl_serializer
+{
+ /*!
+ @brief convert a JSON value to any value type
+
+ This function is usually called by the `get()` function of the
+ @ref basic_json class (either explicit or via conversion operators).
+
+ @note This function is chosen for default-constructible value types.
+
+ @param[in] j JSON value to read from
+ @param[in,out] val value to write to
+ */
+ template<typename BasicJsonType, typename TargetType = ValueType>
+ static auto from_json(BasicJsonType && j, TargetType& val) noexcept(
+ noexcept(::nlohmann::from_json(std::forward<BasicJsonType>(j), val)))
+ -> decltype(::nlohmann::from_json(std::forward<BasicJsonType>(j), val), void())
+ {
+ ::nlohmann::from_json(std::forward<BasicJsonType>(j), val);
+ }
+
+ /*!
+ @brief convert a JSON value to any value type
+
+ This function is usually called by the `get()` function of the
+ @ref basic_json class (either explicit or via conversion operators).
+
+ @note This function is chosen for value types which are not default-constructible.
+
+ @param[in] j JSON value to read from
+
+ @return copy of the JSON value, converted to @a ValueType
+ */
+ template<typename BasicJsonType, typename TargetType = ValueType>
+ static auto from_json(BasicJsonType && j) noexcept(
+ noexcept(::nlohmann::from_json(std::forward<BasicJsonType>(j), detail::identity_tag<TargetType> {})))
+ -> decltype(::nlohmann::from_json(std::forward<BasicJsonType>(j), detail::identity_tag<TargetType> {}))
+ {
+ return ::nlohmann::from_json(std::forward<BasicJsonType>(j), detail::identity_tag<TargetType> {});
+ }
+
+ /*!
+ @brief convert any value type to a JSON value
+
+ This function is usually called by the constructors of the @ref basic_json
+ class.
+
+ @param[in,out] j JSON value to write to
+ @param[in] val value to read from
+ */
+ template<typename BasicJsonType, typename TargetType = ValueType>
+ static auto to_json(BasicJsonType& j, TargetType && val) noexcept(
+ noexcept(::nlohmann::to_json(j, std::forward<TargetType>(val))))
+ -> decltype(::nlohmann::to_json(j, std::forward<TargetType>(val)), void())
+ {
+ ::nlohmann::to_json(j, std::forward<TargetType>(val));
+ }
+};
+} // namespace nlohmann
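+
+// A sketch of a user-side specialization this primary template is designed to
+// fall back to (type and member names are illustrative):
+//
+//     struct length
+//     {
+//         explicit length(double m) : metres(m) {}
+//         double metres;
+//     };
+//
+//     namespace nlohmann
+//     {
+//     template<> struct adl_serializer<length>
+//     {
+//         // the returning overload is chosen because length is not
+//         // default-constructible
+//         static length from_json(const json& j)
+//         {
+//             return length(j.get<double>());
+//         }
+//         static void to_json(json& j, const length& l)
+//         {
+//             j = l.metres;
+//         }
+//     };
+//     } // namespace nlohmann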
+
+// #include <nlohmann/byte_container_with_subtype.hpp>
+
+
+#include <cstdint> // uint8_t, uint64_t
+#include <tuple> // tie
+#include <utility> // move
+
+namespace nlohmann
+{
+
+/*!
+@brief an internal type for a container-backed binary type
+
+This type extends the template parameter @a BinaryType provided to `basic_json`
+with a subtype used by BSON and MessagePack. This type exists so that the user
+does not have to specify a type themselves with a specific naming scheme in
+order to override the binary type.
+
+@tparam BinaryType container to store bytes (`std::vector<std::uint8_t>` by
+ default)
+
+@since version 3.8.0; changed type of subtypes to std::uint64_t in 3.10.0.
+*/
+template<typename BinaryType>
+class byte_container_with_subtype : public BinaryType
+{
+ public:
+ /// the type of the underlying container
+ using container_type = BinaryType;
+ /// the type of the subtype
+ using subtype_type = std::uint64_t;
+
+ byte_container_with_subtype() noexcept(noexcept(container_type()))
+ : container_type()
+ {}
+
+ byte_container_with_subtype(const container_type& b) noexcept(noexcept(container_type(b)))
+ : container_type(b)
+ {}
+
+ byte_container_with_subtype(container_type&& b) noexcept(noexcept(container_type(std::move(b))))
+ : container_type(std::move(b))
+ {}
+
+ byte_container_with_subtype(const container_type& b, subtype_type subtype_) noexcept(noexcept(container_type(b)))
+ : container_type(b)
+ , m_subtype(subtype_)
+ , m_has_subtype(true)
+ {}
+
+ byte_container_with_subtype(container_type&& b, subtype_type subtype_) noexcept(noexcept(container_type(std::move(b))))
+ : container_type(std::move(b))
+ , m_subtype(subtype_)
+ , m_has_subtype(true)
+ {}
+
+ bool operator==(const byte_container_with_subtype& rhs) const
+ {
+ return std::tie(static_cast<const BinaryType&>(*this), m_subtype, m_has_subtype) ==
+ std::tie(static_cast<const BinaryType&>(rhs), rhs.m_subtype, rhs.m_has_subtype);
+ }
+
+ bool operator!=(const byte_container_with_subtype& rhs) const
+ {
+ return !(rhs == *this);
+ }
+
+ /*!
+ @brief sets the binary subtype
+
+ Sets the binary subtype of the value, also flags a binary JSON value as
+ having a subtype, which has implications for serialization.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @sa see @ref subtype() -- return the binary subtype
+ @sa see @ref clear_subtype() -- clears the binary subtype
+ @sa see @ref has_subtype() -- returns whether or not the binary value has a
+ subtype
+
+ @since version 3.8.0
+ */
+ void set_subtype(subtype_type subtype_) noexcept
+ {
+ m_subtype = subtype_;
+ m_has_subtype = true;
+ }
+
+ /*!
+ @brief return the binary subtype
+
+ Returns the numerical subtype of the value if it has a subtype. If it does
+ not have a subtype, this function will return subtype_type(-1) as a sentinel
+ value.
+
+ @return the numerical subtype of the binary value
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @sa see @ref set_subtype() -- sets the binary subtype
+ @sa see @ref clear_subtype() -- clears the binary subtype
+ @sa see @ref has_subtype() -- returns whether or not the binary value has a
+ subtype
+
+ @since version 3.8.0; fixed return value to properly return
+ subtype_type(-1) as documented in version 3.10.0
+ */
+ constexpr subtype_type subtype() const noexcept
+ {
+ return m_has_subtype ? m_subtype : subtype_type(-1);
+ }
+
+ /*!
+ @brief return whether the value has a subtype
+
+ @return whether the value has a subtype
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @sa see @ref subtype() -- return the binary subtype
+ @sa see @ref set_subtype() -- sets the binary subtype
+ @sa see @ref clear_subtype() -- clears the binary subtype
+
+ @since version 3.8.0
+ */
+ constexpr bool has_subtype() const noexcept
+ {
+ return m_has_subtype;
+ }
+
+ /*!
+ @brief clears the binary subtype
+
+ Clears the binary subtype and flags the value as not having a subtype, which
+ has implications for serialization; for instance MessagePack will prefer the
+ bin family over the ext family.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @sa see @ref subtype() -- return the binary subtype
+ @sa see @ref set_subtype() -- sets the binary subtype
+ @sa see @ref has_subtype() -- returns whether or not the binary value has a
+ subtype
+
+ @since version 3.8.0
+ */
+ void clear_subtype() noexcept
+ {
+ m_subtype = 0;
+ m_has_subtype = false;
+ }
+
+ private:
+ subtype_type m_subtype = 0;
+ bool m_has_subtype = false;
+};
+
+} // namespace nlohmann
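+
+// A sketch of the subtype API from the caller's side (values are illustrative):
+//
+//     auto j = nlohmann::json::binary({0xCA, 0xFE}, 42);  // binary value with subtype 42
+//     j.get_binary().has_subtype();    // true
+//     j.get_binary().subtype();        // 42
+//     j.get_binary().clear_subtype();  // subtype() now returns subtype_type(-1)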
+
+// #include <nlohmann/detail/conversions/from_json.hpp>
+
+// #include <nlohmann/detail/conversions/to_json.hpp>
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/hash.hpp>
+
+
+#include <cstdint> // uint8_t
+#include <cstddef> // size_t
+#include <functional> // hash
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+
+// combine two hash values (same scheme as boost::hash_combine; 0x9e3779b9 is derived from the golden ratio)
+inline std::size_t combine(std::size_t seed, std::size_t h) noexcept
+{
+ seed ^= h + 0x9e3779b9 + (seed << 6U) + (seed >> 2U);
+ return seed;
+}
+
+/*!
+@brief hash a JSON value
+
+The hash function relies on std::hash where possible. Furthermore, the type of
+the JSON value is taken into account so that, for example, null, 0, 0U, and
+false produce different hash values.
+
+@tparam BasicJsonType basic_json specialization
+@param j JSON value to hash
+@return hash value of j
+*/
+template<typename BasicJsonType>
+std::size_t hash(const BasicJsonType& j)
+{
+ using string_t = typename BasicJsonType::string_t;
+ using number_integer_t = typename BasicJsonType::number_integer_t;
+ using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+ using number_float_t = typename BasicJsonType::number_float_t;
+
+ const auto type = static_cast<std::size_t>(j.type());
+ switch (j.type())
+ {
+ case BasicJsonType::value_t::null:
+ case BasicJsonType::value_t::discarded:
+ {
+ return combine(type, 0);
+ }
+
+ case BasicJsonType::value_t::object:
+ {
+ auto seed = combine(type, j.size());
+ for (const auto& element : j.items())
+ {
+ const auto h = std::hash<string_t> {}(element.key());
+ seed = combine(seed, h);
+ seed = combine(seed, hash(element.value()));
+ }
+ return seed;
+ }
+
+ case BasicJsonType::value_t::array:
+ {
+ auto seed = combine(type, j.size());
+ for (const auto& element : j)
+ {
+ seed = combine(seed, hash(element));
+ }
+ return seed;
+ }
+
+ case BasicJsonType::value_t::string:
+ {
+ const auto h = std::hash<string_t> {}(j.template get_ref<const string_t&>());
+ return combine(type, h);
+ }
+
+ case BasicJsonType::value_t::boolean:
+ {
+ const auto h = std::hash<bool> {}(j.template get<bool>());
+ return combine(type, h);
+ }
+
+ case BasicJsonType::value_t::number_integer:
+ {
+ const auto h = std::hash<number_integer_t> {}(j.template get<number_integer_t>());
+ return combine(type, h);
+ }
+
+ case BasicJsonType::value_t::number_unsigned:
+ {
+ const auto h = std::hash<number_unsigned_t> {}(j.template get<number_unsigned_t>());
+ return combine(type, h);
+ }
+
+ case BasicJsonType::value_t::number_float:
+ {
+ const auto h = std::hash<number_float_t> {}(j.template get<number_float_t>());
+ return combine(type, h);
+ }
+
+ case BasicJsonType::value_t::binary:
+ {
+ auto seed = combine(type, j.get_binary().size());
+ const auto h = std::hash<bool> {}(j.get_binary().has_subtype());
+ seed = combine(seed, h);
+ seed = combine(seed, static_cast<std::size_t>(j.get_binary().subtype()));
+ for (const auto byte : j.get_binary())
+ {
+ seed = combine(seed, std::hash<std::uint8_t> {}(byte));
+ }
+ return seed;
+ }
+
+ default: // LCOV_EXCL_LINE
+ JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
+ return 0; // LCOV_EXCL_LINE
+ }
+}
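+
+// Usage sketch (illustrative): the type tag mixed into the seed keeps values
+// of different types apart (modulo ordinary hash collisions):
+//
+// nlohmann::json a = 0;     // number_integer
+// nlohmann::json b = false; // boolean
+// const auto ha = nlohmann::detail::hash(a); // combine(type, hash(0))
+// const auto hb = nlohmann::detail::hash(b); // combine(type, hash(false))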
+
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/input/binary_reader.hpp>
+
+
+#include <algorithm> // generate_n
+#include <array> // array
+#include <cmath> // ldexp
+#include <cstddef> // size_t
+#include <cstdint> // uint8_t, uint16_t, uint32_t, uint64_t
+#include <cstdio> // snprintf
+#include <cstring> // memcpy
+#include <iterator> // back_inserter
+#include <limits> // numeric_limits
+#include <string> // char_traits, string
+#include <utility> // make_pair, move
+#include <vector> // vector
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/input/input_adapters.hpp>
+
+
+#include <array> // array
+#include <cstddef> // size_t
+#include <cstring> // strlen
+#include <iterator> // begin, end, iterator_traits, random_access_iterator_tag, distance, next
+#include <memory> // shared_ptr, make_shared, addressof
+#include <numeric> // accumulate
+#include <string> // string, char_traits
+#include <type_traits> // enable_if, is_base_of, is_pointer, is_integral, remove_pointer
+#include <utility> // pair, declval
+
+#ifndef JSON_NO_IO
+ #include <cstdio> // FILE *
+ #include <istream> // istream
+#endif // JSON_NO_IO
+
+// #include <nlohmann/detail/iterators/iterator_traits.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+/// the supported input formats
+enum class input_format_t { json, cbor, msgpack, ubjson, bson };
+
+////////////////////
+// input adapters //
+////////////////////
+
+#ifndef JSON_NO_IO
+/*!
+Input adapter for stdio file access. This adapter reads only one byte at a
+time and does not use any buffer; it is a very low-level adapter.
+*/
+class file_input_adapter
+{
+ public:
+ using char_type = char;
+
+ JSON_HEDLEY_NON_NULL(2)
+ explicit file_input_adapter(std::FILE* f) noexcept
+ : m_file(f)
+ {}
+
+ // make class move-only
+ file_input_adapter(const file_input_adapter&) = delete;
+ file_input_adapter(file_input_adapter&&) noexcept = default;
+ file_input_adapter& operator=(const file_input_adapter&) = delete;
+ file_input_adapter& operator=(file_input_adapter&&) = delete;
+ ~file_input_adapter() = default;
+
+ std::char_traits<char>::int_type get_character() noexcept
+ {
+ return std::fgetc(m_file);
+ }
+
+ private:
+ /// the file pointer to read from
+ std::FILE* m_file;
+};
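+
+// Usage sketch (illustrative; error handling omitted, file name hypothetical):
+//
+// std::FILE* f = std::fopen("data.json", "rb");
+// file_input_adapter adapter(f);
+// auto c = adapter.get_character(); // one byte per call via std::fgetc
+// std::fclose(f);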
+
+
+/*!
+Input adapter for a (caching) istream. Ignores a UTF Byte Order Mark at
+beginning of input. Does not support changing the underlying std::streambuf
+in mid-input. Maintains underlying std::istream and std::streambuf to support
+subsequent use of standard std::istream operations to process any input
+characters following those used in parsing the JSON input. Clears the
+std::istream flags; any input errors (e.g., EOF) will be detected by the first
+subsequent call for input from the std::istream.
+*/
+class input_stream_adapter
+{
+ public:
+ using char_type = char;
+
+ ~input_stream_adapter()
+ {
+ // clear stream flags; we use underlying streambuf I/O, do not
+ // maintain ifstream flags, except eof
+ if (is != nullptr)
+ {
+ is->clear(is->rdstate() & std::ios::eofbit);
+ }
+ }
+
+ explicit input_stream_adapter(std::istream& i)
+ : is(&i), sb(i.rdbuf())
+ {}
+
+ // delete because of pointer members
+ input_stream_adapter(const input_stream_adapter&) = delete;
+ input_stream_adapter& operator=(input_stream_adapter&) = delete;
+ input_stream_adapter& operator=(input_stream_adapter&&) = delete;
+
+ input_stream_adapter(input_stream_adapter&& rhs) noexcept
+ : is(rhs.is), sb(rhs.sb)
+ {
+ rhs.is = nullptr;
+ rhs.sb = nullptr;
+ }
+
+ // std::istream/std::streambuf use std::char_traits<char>::to_int_type to
+ // ensure that std::char_traits<char>::eof() and the character 0xFF do not
+ // end up as the same value, e.g. 0xFFFFFFFF.
+ std::char_traits<char>::int_type get_character()
+ {
+ auto res = sb->sbumpc();
+ // set eof manually, as we don't use the istream interface.
+ if (JSON_HEDLEY_UNLIKELY(res == std::char_traits<char>::eof()))
+ {
+ is->clear(is->rdstate() | std::ios::eofbit);
+ }
+ return res;
+ }
+
+ private:
+ /// the associated input stream
+ std::istream* is = nullptr;
+ std::streambuf* sb = nullptr;
+};
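+
+// Usage sketch (illustrative): because only the bytes needed for parsing are
+// consumed, trailing input stays available on the stream:
+//
+// std::istringstream iss("[1,2,3] tail"); // <sstream> assumed
+// input_stream_adapter adapter(iss);
+// // ... drive a parser with adapter; "tail" can still be read from iss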
+#endif // JSON_NO_IO
+
+// General-purpose iterator-based adapter. It might not be as fast as
+// theoretically possible for some containers, but it is extremely versatile.
+template<typename IteratorType>
+class iterator_input_adapter
+{
+ public:
+ using char_type = typename std::iterator_traits<IteratorType>::value_type;
+
+ iterator_input_adapter(IteratorType first, IteratorType last)
+ : current(std::move(first)), end(std::move(last))
+ {}
+
+ typename std::char_traits<char_type>::int_type get_character()
+ {
+ if (JSON_HEDLEY_LIKELY(current != end))
+ {
+ auto result = std::char_traits<char_type>::to_int_type(*current);
+ std::advance(current, 1);
+ return result;
+ }
+
+ return std::char_traits<char_type>::eof();
+ }
+
+ private:
+ IteratorType current;
+ IteratorType end;
+
+ template<typename BaseInputAdapter, size_t T>
+ friend struct wide_string_input_helper;
+
+ bool empty() const
+ {
+ return current == end;
+ }
+};
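+
+// Usage sketch (illustrative):
+//
+// const std::string s = "{\"k\":1}";
+// iterator_input_adapter<std::string::const_iterator> adapter(s.begin(), s.end());
+// while (adapter.get_character() != std::char_traits<char>::eof())
+// {
+//     // consume one byte at a time
+// }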
+
+
+template<typename BaseInputAdapter, size_t T>
+struct wide_string_input_helper;
+
+template<typename BaseInputAdapter>
+struct wide_string_input_helper<BaseInputAdapter, 4>
+{
+ // UTF-32
+ static void fill_buffer(BaseInputAdapter& input,
+ std::array<std::char_traits<char>::int_type, 4>& utf8_bytes,
+ size_t& utf8_bytes_index,
+ size_t& utf8_bytes_filled)
+ {
+ utf8_bytes_index = 0;
+
+ if (JSON_HEDLEY_UNLIKELY(input.empty()))
+ {
+ utf8_bytes[0] = std::char_traits<char>::eof();
+ utf8_bytes_filled = 1;
+ }
+ else
+ {
+ // get the current character
+ const auto wc = input.get_character();
+
+ // UTF-32 to UTF-8 encoding
+ if (wc < 0x80)
+ {
+ utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
+ utf8_bytes_filled = 1;
+ }
+ else if (wc <= 0x7FF)
+ {
+ utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xC0u | ((static_cast<unsigned int>(wc) >> 6u) & 0x1Fu));
+ utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
+ utf8_bytes_filled = 2;
+ }
+ else if (wc <= 0xFFFF)
+ {
+ utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xE0u | ((static_cast<unsigned int>(wc) >> 12u) & 0x0Fu));
+ utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 6u) & 0x3Fu));
+ utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
+ utf8_bytes_filled = 3;
+ }
+ else if (wc <= 0x10FFFF)
+ {
+ utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xF0u | ((static_cast<unsigned int>(wc) >> 18u) & 0x07u));
+ utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 12u) & 0x3Fu));
+ utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 6u) & 0x3Fu));
+ utf8_bytes[3] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
+ utf8_bytes_filled = 4;
+ }
+ else
+ {
+ // unknown character
+ utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
+ utf8_bytes_filled = 1;
+ }
+ }
+ }
+};
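+
+// Worked example (UTF-32 -> UTF-8): wc = U+20AC (EURO SIGN) takes the
+// wc <= 0xFFFF branch and yields three bytes:
+//   0xE0 | (0x20AC >> 12)         = 0xE2
+//   0x80 | ((0x20AC >> 6) & 0x3F) = 0x82
+//   0x80 | (0x20AC & 0x3F)        = 0xAC
+// i.e. the standard UTF-8 encoding E2 82 AC.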
+
+template<typename BaseInputAdapter>
+struct wide_string_input_helper<BaseInputAdapter, 2>
+{
+ // UTF-16
+ static void fill_buffer(BaseInputAdapter& input,
+ std::array<std::char_traits<char>::int_type, 4>& utf8_bytes,
+ size_t& utf8_bytes_index,
+ size_t& utf8_bytes_filled)
+ {
+ utf8_bytes_index = 0;
+
+ if (JSON_HEDLEY_UNLIKELY(input.empty()))
+ {
+ utf8_bytes[0] = std::char_traits<char>::eof();
+ utf8_bytes_filled = 1;
+ }
+ else
+ {
+ // get the current character
+ const auto wc = input.get_character();
+
+ // UTF-16 to UTF-8 encoding
+ if (wc < 0x80)
+ {
+ utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
+ utf8_bytes_filled = 1;
+ }
+ else if (wc <= 0x7FF)
+ {
+ utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xC0u | ((static_cast<unsigned int>(wc) >> 6u)));
+ utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
+ utf8_bytes_filled = 2;
+ }
+ else if (0xD800 > wc || wc >= 0xE000)
+ {
+ utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xE0u | ((static_cast<unsigned int>(wc) >> 12u)));
+ utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((static_cast<unsigned int>(wc) >> 6u) & 0x3Fu));
+ utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | (static_cast<unsigned int>(wc) & 0x3Fu));
+ utf8_bytes_filled = 3;
+ }
+ else
+ {
+ if (JSON_HEDLEY_UNLIKELY(!input.empty()))
+ {
+ const auto wc2 = static_cast<unsigned int>(input.get_character());
+ const auto charcode = 0x10000u + (((static_cast<unsigned int>(wc) & 0x3FFu) << 10u) | (wc2 & 0x3FFu));
+ utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(0xF0u | (charcode >> 18u));
+ utf8_bytes[1] = static_cast<std::char_traits<char>::int_type>(0x80u | ((charcode >> 12u) & 0x3Fu));
+ utf8_bytes[2] = static_cast<std::char_traits<char>::int_type>(0x80u | ((charcode >> 6u) & 0x3Fu));
+ utf8_bytes[3] = static_cast<std::char_traits<char>::int_type>(0x80u | (charcode & 0x3Fu));
+ utf8_bytes_filled = 4;
+ }
+ else
+ {
+ utf8_bytes[0] = static_cast<std::char_traits<char>::int_type>(wc);
+ utf8_bytes_filled = 1;
+ }
+ }
+ }
+ }
+};
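+
+// Worked example (UTF-16 surrogate pair): U+1F600 arrives as wc = 0xD83D and
+// wc2 = 0xDE00. Then charcode = 0x10000 + (((0xD83D & 0x3FF) << 10) |
+// (0xDE00 & 0x3FF)) = 0x1F600, which the final branch emits as the four
+// UTF-8 bytes F0 9F 98 80.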
+
+// Wraps another input adapter to convert wide character types into individual bytes.
+template<typename BaseInputAdapter, typename WideCharType>
+class wide_string_input_adapter
+{
+ public:
+ using char_type = char;
+
+ wide_string_input_adapter(BaseInputAdapter base)
+ : base_adapter(base) {}
+
+ typename std::char_traits<char>::int_type get_character() noexcept
+ {
+ // check if buffer needs to be filled
+ if (utf8_bytes_index == utf8_bytes_filled)
+ {
+ fill_buffer<sizeof(WideCharType)>();
+
+ JSON_ASSERT(utf8_bytes_filled > 0);
+ JSON_ASSERT(utf8_bytes_index == 0);
+ }
+
+ // use buffer
+ JSON_ASSERT(utf8_bytes_filled > 0);
+ JSON_ASSERT(utf8_bytes_index < utf8_bytes_filled);
+ return utf8_bytes[utf8_bytes_index++];
+ }
+
+ private:
+ BaseInputAdapter base_adapter;
+
+ template<size_t T>
+ void fill_buffer()
+ {
+ wide_string_input_helper<BaseInputAdapter, T>::fill_buffer(base_adapter, utf8_bytes, utf8_bytes_index, utf8_bytes_filled);
+ }
+
+ /// a buffer for UTF-8 bytes
+ std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};
+
+ /// index into the utf8_bytes array for the next valid byte
+ std::size_t utf8_bytes_index = 0;
+ /// number of valid bytes in the utf8_bytes array
+ std::size_t utf8_bytes_filled = 0;
+};
+
+
+template<typename IteratorType, typename Enable = void>
+struct iterator_input_adapter_factory
+{
+ using iterator_type = IteratorType;
+ using char_type = typename std::iterator_traits<iterator_type>::value_type;
+ using adapter_type = iterator_input_adapter<iterator_type>;
+
+ static adapter_type create(IteratorType first, IteratorType last)
+ {
+ return adapter_type(std::move(first), std::move(last));
+ }
+};
+
+template<typename T>
+struct is_iterator_of_multibyte
+{
+ using value_type = typename std::iterator_traits<T>::value_type;
+ enum
+ {
+ value = sizeof(value_type) > 1
+ };
+};
+
+template<typename IteratorType>
+struct iterator_input_adapter_factory<IteratorType, enable_if_t<is_iterator_of_multibyte<IteratorType>::value>>
+{
+ using iterator_type = IteratorType;
+ using char_type = typename std::iterator_traits<iterator_type>::value_type;
+ using base_adapter_type = iterator_input_adapter<iterator_type>;
+ using adapter_type = wide_string_input_adapter<base_adapter_type, char_type>;
+
+ static adapter_type create(IteratorType first, IteratorType last)
+ {
+ return adapter_type(base_adapter_type(std::move(first), std::move(last)));
+ }
+};
+
+// General purpose iterator-based input
+template<typename IteratorType>
+typename iterator_input_adapter_factory<IteratorType>::adapter_type input_adapter(IteratorType first, IteratorType last)
+{
+ using factory_type = iterator_input_adapter_factory<IteratorType>;
+ return factory_type::create(first, last);
+}
+
+// Convenience shorthand from container to iterator.
+// Enables ADL on begin(container) and end(container).
+// The using declarations are enclosed in a namespace so they do not leak into
+// the outer scope.
+
+namespace container_input_adapter_factory_impl
+{
+
+using std::begin;
+using std::end;
+
+template<typename ContainerType, typename Enable = void>
+struct container_input_adapter_factory {};
+
+template<typename ContainerType>
+struct container_input_adapter_factory<ContainerType,
+ void_t<decltype(begin(std::declval<ContainerType>()), end(std::declval<ContainerType>()))>>
+{
+ using adapter_type = decltype(input_adapter(begin(std::declval<ContainerType>()), end(std::declval<ContainerType>())));
+
+ static adapter_type create(const ContainerType& container)
+ {
+ return input_adapter(begin(container), end(container));
+ }
+};
+
+} // namespace container_input_adapter_factory_impl
+
+template<typename ContainerType>
+typename container_input_adapter_factory_impl::container_input_adapter_factory<ContainerType>::adapter_type input_adapter(const ContainerType& container)
+{
+ return container_input_adapter_factory_impl::container_input_adapter_factory<ContainerType>::create(container);
+}
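+
+// Usage sketch (illustrative): any container whose begin()/end() are found
+// via ADL works, e.g. a std::vector<char> holding raw JSON text:
+//
+// std::vector<char> buf {'[', '1', ']'};
+// auto adapter = input_adapter(buf);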
+
+#ifndef JSON_NO_IO
+// Special cases with fast paths
+inline file_input_adapter input_adapter(std::FILE* file)
+{
+ return file_input_adapter(file);
+}
+
+inline input_stream_adapter input_adapter(std::istream& stream)
+{
+ return input_stream_adapter(stream);
+}
+
+inline input_stream_adapter input_adapter(std::istream&& stream)
+{
+ return input_stream_adapter(stream);
+}
+#endif // JSON_NO_IO
+
+using contiguous_bytes_input_adapter = decltype(input_adapter(std::declval<const char*>(), std::declval<const char*>()));
+
+// Null-terminated strings, and the like.
+template < typename CharT,
+ typename std::enable_if <
+ std::is_pointer<CharT>::value&&
+ !std::is_array<CharT>::value&&
+ std::is_integral<typename std::remove_pointer<CharT>::type>::value&&
+ sizeof(typename std::remove_pointer<CharT>::type) == 1,
+ int >::type = 0 >
+contiguous_bytes_input_adapter input_adapter(CharT b)
+{
+ auto length = std::strlen(reinterpret_cast<const char*>(b));
+ const auto* ptr = reinterpret_cast<const char*>(b);
+ return input_adapter(ptr, ptr + length);
+}
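+
+// Usage sketch (illustrative):
+//
+// const char* text = "{\"pi\": 3.141}";
+// auto adapter = input_adapter(text); // length determined via std::strlen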
+
+template<typename T, std::size_t N>
+auto input_adapter(T (&array)[N]) -> decltype(input_adapter(array, array + N)) // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
+{
+ return input_adapter(array, array + N);
+}
+
+// This class only handles inputs of contiguous_bytes_input_adapter type.
+// It is required so that expressions like {ptr, len} can be implicitly
+// converted to the correct adapter.
+class span_input_adapter
+{
+ public:
+ template < typename CharT,
+ typename std::enable_if <
+ std::is_pointer<CharT>::value&&
+ std::is_integral<typename std::remove_pointer<CharT>::type>::value&&
+ sizeof(typename std::remove_pointer<CharT>::type) == 1,
+ int >::type = 0 >
+ span_input_adapter(CharT b, std::size_t l)
+ : ia(reinterpret_cast<const char*>(b), reinterpret_cast<const char*>(b) + l) {}
+
+ template<class IteratorType,
+ typename std::enable_if<
+ std::is_same<typename iterator_traits<IteratorType>::iterator_category, std::random_access_iterator_tag>::value,
+ int>::type = 0>
+ span_input_adapter(IteratorType first, IteratorType last)
+ : ia(input_adapter(first, last)) {}
+
+ contiguous_bytes_input_adapter&& get()
+ {
+ return std::move(ia); // NOLINT(hicpp-move-const-arg,performance-move-const-arg)
+ }
+
+ private:
+ contiguous_bytes_input_adapter ia;
+};
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/input/json_sax.hpp>
+
+
+#include <cstddef>
+#include <string> // string
+#include <utility> // move
+#include <vector> // vector
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+
+/*!
+@brief SAX interface
+
+This class describes the SAX interface used by @ref nlohmann::json::sax_parse.
+Each function is called in different situations while the input is parsed. The
+boolean return value informs the parser whether to continue processing the
+input.
+*/
+template<typename BasicJsonType>
+struct json_sax
+{
+ using number_integer_t = typename BasicJsonType::number_integer_t;
+ using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+ using number_float_t = typename BasicJsonType::number_float_t;
+ using string_t = typename BasicJsonType::string_t;
+ using binary_t = typename BasicJsonType::binary_t;
+
+ /*!
+ @brief a null value was read
+ @return whether parsing should proceed
+ */
+ virtual bool null() = 0;
+
+ /*!
+ @brief a boolean value was read
+ @param[in] val boolean value
+ @return whether parsing should proceed
+ */
+ virtual bool boolean(bool val) = 0;
+
+ /*!
+ @brief an integer number was read
+ @param[in] val integer value
+ @return whether parsing should proceed
+ */
+ virtual bool number_integer(number_integer_t val) = 0;
+
+ /*!
+ @brief an unsigned integer number was read
+ @param[in] val unsigned integer value
+ @return whether parsing should proceed
+ */
+ virtual bool number_unsigned(number_unsigned_t val) = 0;
+
+ /*!
+ @brief a floating-point number was read
+ @param[in] val floating-point value
+ @param[in] s raw token value
+ @return whether parsing should proceed
+ */
+ virtual bool number_float(number_float_t val, const string_t& s) = 0;
+
+ /*!
+ @brief a string was read
+ @param[in] val string value
+ @return whether parsing should proceed
+ @note It is safe to move the passed string.
+ */
+ virtual bool string(string_t& val) = 0;
+
+ /*!
+ @brief a binary string was read
+ @param[in] val binary value
+ @return whether parsing should proceed
+ @note It is safe to move the passed binary.
+ */
+ virtual bool binary(binary_t& val) = 0;
+
+ /*!
+ @brief the beginning of an object was read
+ @param[in] elements number of object elements or -1 if unknown
+ @return whether parsing should proceed
+ @note binary formats may report the number of elements
+ */
+ virtual bool start_object(std::size_t elements) = 0;
+
+ /*!
+ @brief an object key was read
+ @param[in] val object key
+ @return whether parsing should proceed
+ @note It is safe to move the passed string.
+ */
+ virtual bool key(string_t& val) = 0;
+
+ /*!
+ @brief the end of an object was read
+ @return whether parsing should proceed
+ */
+ virtual bool end_object() = 0;
+
+ /*!
+ @brief the beginning of an array was read
+ @param[in] elements number of array elements or -1 if unknown
+ @return whether parsing should proceed
+ @note binary formats may report the number of elements
+ */
+ virtual bool start_array(std::size_t elements) = 0;
+
+ /*!
+ @brief the end of an array was read
+ @return whether parsing should proceed
+ */
+ virtual bool end_array() = 0;
+
+ /*!
+ @brief a parse error occurred
+ @param[in] position the position in the input where the error occurs
+ @param[in] last_token the last read token
+ @param[in] ex an exception object describing the error
+ @return whether parsing should proceed (must return false)
+ */
+ virtual bool parse_error(std::size_t position,
+ const std::string& last_token,
+ const detail::exception& ex) = 0;
+
+ json_sax() = default;
+ json_sax(const json_sax&) = default;
+ json_sax(json_sax&&) noexcept = default;
+ json_sax& operator=(const json_sax&) = default;
+ json_sax& operator=(json_sax&&) noexcept = default;
+ virtual ~json_sax() = default;
+};
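+
+// Sketch of a handler (illustrative; `event_counter` is hypothetical, and a
+// real handler must override every pure virtual member of json_sax):
+//
+// struct event_counter : nlohmann::json_sax<nlohmann::json>
+// {
+//     std::size_t events = 0;
+//     bool null() override { ++events; return true; }
+//     // analogous overrides for the remaining events are omitted here;
+//     // parse_error() must return false
+// };
+//
+// // then, assuming the usual public API: json::sax_parse("[1,2]", &handler);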
+
+
+namespace detail
+{
+/*!
+@brief SAX implementation to create a JSON value from SAX events
+
+This class implements the @ref json_sax interface and processes the SAX events
+to create a JSON value, which effectively makes it a DOM parser. The structure
+or hierarchy of the JSON value is managed by the stack `ref_stack`, which
+contains a pointer to the respective array or object for each recursion depth.
+
+After successful parsing, the value that is passed by reference to the
+constructor contains the parsed value.
+
+@tparam BasicJsonType the JSON type
+*/
+template<typename BasicJsonType>
+class json_sax_dom_parser
+{
+ public:
+ using number_integer_t = typename BasicJsonType::number_integer_t;
+ using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+ using number_float_t = typename BasicJsonType::number_float_t;
+ using string_t = typename BasicJsonType::string_t;
+ using binary_t = typename BasicJsonType::binary_t;
+
+ /*!
+ @param[in,out] r reference to a JSON value that is manipulated while
+ parsing
+ @param[in] allow_exceptions_ whether parse errors yield exceptions
+ */
+ explicit json_sax_dom_parser(BasicJsonType& r, const bool allow_exceptions_ = true)
+ : root(r), allow_exceptions(allow_exceptions_)
+ {}
+
+ // make class move-only
+ json_sax_dom_parser(const json_sax_dom_parser&) = delete;
+ json_sax_dom_parser(json_sax_dom_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
+ json_sax_dom_parser& operator=(const json_sax_dom_parser&) = delete;
+ json_sax_dom_parser& operator=(json_sax_dom_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
+ ~json_sax_dom_parser() = default;
+
+ bool null()
+ {
+ handle_value(nullptr);
+ return true;
+ }
+
+ bool boolean(bool val)
+ {
+ handle_value(val);
+ return true;
+ }
+
+ bool number_integer(number_integer_t val)
+ {
+ handle_value(val);
+ return true;
+ }
+
+ bool number_unsigned(number_unsigned_t val)
+ {
+ handle_value(val);
+ return true;
+ }
+
+ bool number_float(number_float_t val, const string_t& /*unused*/)
+ {
+ handle_value(val);
+ return true;
+ }
+
+ bool string(string_t& val)
+ {
+ handle_value(val);
+ return true;
+ }
+
+ bool binary(binary_t& val)
+ {
+ handle_value(std::move(val));
+ return true;
+ }
+
+ bool start_object(std::size_t len)
+ {
+ ref_stack.push_back(handle_value(BasicJsonType::value_t::object));
+
+ if (JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size()))
+ {
+ JSON_THROW(out_of_range::create(408, "excessive object size: " + std::to_string(len), *ref_stack.back()));
+ }
+
+ return true;
+ }
+
+ bool key(string_t& val)
+ {
+ // add null at given key and store the reference for later
+ object_element = &(ref_stack.back()->m_value.object->operator[](val));
+ return true;
+ }
+
+ bool end_object()
+ {
+ ref_stack.back()->set_parents();
+ ref_stack.pop_back();
+ return true;
+ }
+
+ bool start_array(std::size_t len)
+ {
+ ref_stack.push_back(handle_value(BasicJsonType::value_t::array));
+
+ if (JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size()))
+ {
+ JSON_THROW(out_of_range::create(408, "excessive array size: " + std::to_string(len), *ref_stack.back()));
+ }
+
+ return true;
+ }
+
+ bool end_array()
+ {
+ ref_stack.back()->set_parents();
+ ref_stack.pop_back();
+ return true;
+ }
+
+ template<class Exception>
+ bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/,
+ const Exception& ex)
+ {
+ errored = true;
+ static_cast<void>(ex);
+ if (allow_exceptions)
+ {
+ JSON_THROW(ex);
+ }
+ return false;
+ }
+
+ constexpr bool is_errored() const
+ {
+ return errored;
+ }
+
+ private:
+ /*!
+ @invariant If the ref stack is empty, then the passed value will be the new
+ root.
+ @invariant If the ref stack contains a value, then it is an array or an
+ object to which we can add elements
+ */
+ template<typename Value>
+ JSON_HEDLEY_RETURNS_NON_NULL
+ BasicJsonType* handle_value(Value&& v)
+ {
+ if (ref_stack.empty())
+ {
+ root = BasicJsonType(std::forward<Value>(v));
+ return &root;
+ }
+
+ JSON_ASSERT(ref_stack.back()->is_array() || ref_stack.back()->is_object());
+
+ if (ref_stack.back()->is_array())
+ {
+ ref_stack.back()->m_value.array->emplace_back(std::forward<Value>(v));
+ return &(ref_stack.back()->m_value.array->back());
+ }
+
+ JSON_ASSERT(ref_stack.back()->is_object());
+ JSON_ASSERT(object_element);
+ *object_element = BasicJsonType(std::forward<Value>(v));
+ return object_element;
+ }
+
+ /// the parsed JSON value
+ BasicJsonType& root;
+ /// stack to model hierarchy of values
+ std::vector<BasicJsonType*> ref_stack {};
+ /// helper to hold the reference for the next object element
+ BasicJsonType* object_element = nullptr;
+ /// whether a syntax error occurred
+ bool errored = false;
+ /// whether to throw exceptions in case of errors
+ const bool allow_exceptions = true;
+};
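+
+// Usage sketch (illustrative): this is essentially the DOM-building path of
+// json::parse; SAX events fill the value passed to the constructor:
+//
+// json result;
+// json_sax_dom_parser<json> builder(result, /*allow_exceptions=*/true);
+// // a parser then calls builder.start_object(), builder.key(), ... and
+// // `result` holds the parsed document afterwards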
+
+template<typename BasicJsonType>
+class json_sax_dom_callback_parser
+{
+ public:
+ using number_integer_t = typename BasicJsonType::number_integer_t;
+ using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+ using number_float_t = typename BasicJsonType::number_float_t;
+ using string_t = typename BasicJsonType::string_t;
+ using binary_t = typename BasicJsonType::binary_t;
+ using parser_callback_t = typename BasicJsonType::parser_callback_t;
+ using parse_event_t = typename BasicJsonType::parse_event_t;
+
+ json_sax_dom_callback_parser(BasicJsonType& r,
+ const parser_callback_t cb,
+ const bool allow_exceptions_ = true)
+ : root(r), callback(cb), allow_exceptions(allow_exceptions_)
+ {
+ keep_stack.push_back(true);
+ }
+
+ // make class move-only
+ json_sax_dom_callback_parser(const json_sax_dom_callback_parser&) = delete;
+ json_sax_dom_callback_parser(json_sax_dom_callback_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
+ json_sax_dom_callback_parser& operator=(const json_sax_dom_callback_parser&) = delete;
+ json_sax_dom_callback_parser& operator=(json_sax_dom_callback_parser&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
+ ~json_sax_dom_callback_parser() = default;
+
+ bool null()
+ {
+ handle_value(nullptr);
+ return true;
+ }
+
+ bool boolean(bool val)
+ {
+ handle_value(val);
+ return true;
+ }
+
+ bool number_integer(number_integer_t val)
+ {
+ handle_value(val);
+ return true;
+ }
+
+ bool number_unsigned(number_unsigned_t val)
+ {
+ handle_value(val);
+ return true;
+ }
+
+ bool number_float(number_float_t val, const string_t& /*unused*/)
+ {
+ handle_value(val);
+ return true;
+ }
+
+ bool string(string_t& val)
+ {
+ handle_value(val);
+ return true;
+ }
+
+ bool binary(binary_t& val)
+ {
+ handle_value(std::move(val));
+ return true;
+ }
+
+ bool start_object(std::size_t len)
+ {
+ // check callback for object start
+ const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::object_start, discarded);
+ keep_stack.push_back(keep);
+
+ auto val = handle_value(BasicJsonType::value_t::object, true);
+ ref_stack.push_back(val.second);
+
+ // check object limit
+ if (ref_stack.back() && JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size()))
+ {
+ JSON_THROW(out_of_range::create(408, "excessive object size: " + std::to_string(len), *ref_stack.back()));
+ }
+
+ return true;
+ }
+
+ bool key(string_t& val)
+ {
+ BasicJsonType k = BasicJsonType(val);
+
+ // check callback for key
+ const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::key, k);
+ key_keep_stack.push_back(keep);
+
+ // add discarded value at given key and store the reference for later
+ if (keep && ref_stack.back())
+ {
+ object_element = &(ref_stack.back()->m_value.object->operator[](val) = discarded);
+ }
+
+ return true;
+ }
+
+ bool end_object()
+ {
+ if (ref_stack.back())
+ {
+ if (!callback(static_cast<int>(ref_stack.size()) - 1, parse_event_t::object_end, *ref_stack.back()))
+ {
+ // discard object
+ *ref_stack.back() = discarded;
+ }
+ else
+ {
+ ref_stack.back()->set_parents();
+ }
+ }
+
+ JSON_ASSERT(!ref_stack.empty());
+ JSON_ASSERT(!keep_stack.empty());
+ ref_stack.pop_back();
+ keep_stack.pop_back();
+
+ if (!ref_stack.empty() && ref_stack.back() && ref_stack.back()->is_structured())
+ {
+ // remove discarded value
+ for (auto it = ref_stack.back()->begin(); it != ref_stack.back()->end(); ++it)
+ {
+ if (it->is_discarded())
+ {
+ ref_stack.back()->erase(it);
+ break;
+ }
+ }
+ }
+
+ return true;
+ }
+
+ bool start_array(std::size_t len)
+ {
+ const bool keep = callback(static_cast<int>(ref_stack.size()), parse_event_t::array_start, discarded);
+ keep_stack.push_back(keep);
+
+ auto val = handle_value(BasicJsonType::value_t::array, true);
+ ref_stack.push_back(val.second);
+
+ // check array limit
+ if (ref_stack.back() && JSON_HEDLEY_UNLIKELY(len != std::size_t(-1) && len > ref_stack.back()->max_size()))
+ {
+ JSON_THROW(out_of_range::create(408, "excessive array size: " + std::to_string(len), *ref_stack.back()));
+ }
+
+ return true;
+ }
+
+ bool end_array()
+ {
+ bool keep = true;
+
+ if (ref_stack.back())
+ {
+ keep = callback(static_cast<int>(ref_stack.size()) - 1, parse_event_t::array_end, *ref_stack.back());
+ if (keep)
+ {
+ ref_stack.back()->set_parents();
+ }
+ else
+ {
+ // discard array
+ *ref_stack.back() = discarded;
+ }
+ }
+
+ JSON_ASSERT(!ref_stack.empty());
+ JSON_ASSERT(!keep_stack.empty());
+ ref_stack.pop_back();
+ keep_stack.pop_back();
+
+ // remove discarded value
+ if (!keep && !ref_stack.empty() && ref_stack.back()->is_array())
+ {
+ ref_stack.back()->m_value.array->pop_back();
+ }
+
+ return true;
+ }
+
+ template<class Exception>
+ bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/,
+ const Exception& ex)
+ {
+ errored = true;
+ static_cast<void>(ex);
+ if (allow_exceptions)
+ {
+ JSON_THROW(ex);
+ }
+ return false;
+ }
+
+ constexpr bool is_errored() const
+ {
+ return errored;
+ }
+
+ private:
+ /*!
+ @param[in] v value to add to the JSON value we build during parsing
+ @param[in] skip_callback whether we should skip calling the callback
+ function; this is required after start_array() and
+ start_object() SAX events, because otherwise we would call the
+ callback function with an empty array or object, respectively.
+
+ @invariant If the ref stack is empty, then the passed value will be the new
+ root.
+ @invariant If the ref stack contains a value, then it is an array or an
+ object to which we can add elements
+
+ @return pair of boolean (whether value should be kept) and pointer (to the
+ passed value in the ref_stack hierarchy; nullptr if not kept)
+ */
+ template<typename Value>
+ std::pair<bool, BasicJsonType*> handle_value(Value&& v, const bool skip_callback = false)
+ {
+ JSON_ASSERT(!keep_stack.empty());
+
+ // do not handle this value if we know it would be added to a discarded
+ // container
+ if (!keep_stack.back())
+ {
+ return {false, nullptr};
+ }
+
+ // create value
+ auto value = BasicJsonType(std::forward<Value>(v));
+
+ // check callback
+ const bool keep = skip_callback || callback(static_cast<int>(ref_stack.size()), parse_event_t::value, value);
+
+ // do not handle this value if we just learnt it shall be discarded
+ if (!keep)
+ {
+ return {false, nullptr};
+ }
+
+ if (ref_stack.empty())
+ {
+ root = std::move(value);
+ return {true, &root};
+ }
+
+ // skip this value if we already decided to skip the parent
+ // (https://github.com/nlohmann/json/issues/971#issuecomment-413678360)
+ if (!ref_stack.back())
+ {
+ return {false, nullptr};
+ }
+
+ // we now only expect arrays and objects
+ JSON_ASSERT(ref_stack.back()->is_array() || ref_stack.back()->is_object());
+
+ // array
+ if (ref_stack.back()->is_array())
+ {
+ ref_stack.back()->m_value.array->emplace_back(std::move(value));
+ return {true, &(ref_stack.back()->m_value.array->back())};
+ }
+
+ // object
+ JSON_ASSERT(ref_stack.back()->is_object());
+ // check if we should store an element for the current key
+ JSON_ASSERT(!key_keep_stack.empty());
+ const bool store_element = key_keep_stack.back();
+ key_keep_stack.pop_back();
+
+ if (!store_element)
+ {
+ return {false, nullptr};
+ }
+
+ JSON_ASSERT(object_element);
+ *object_element = std::move(value);
+ return {true, object_element};
+ }
+
+ /// the parsed JSON value
+ BasicJsonType& root;
+ /// stack to model hierarchy of values
+ std::vector<BasicJsonType*> ref_stack {};
+ /// stack to manage which values to keep
+ std::vector<bool> keep_stack {};
+ /// stack to manage which object keys to keep
+ std::vector<bool> key_keep_stack {};
+ /// helper to hold the reference for the next object element
+ BasicJsonType* object_element = nullptr;
+ /// whether a syntax error occurred
+ bool errored = false;
+ /// callback function
+ const parser_callback_t callback = nullptr;
+ /// whether to throw exceptions in case of errors
+ const bool allow_exceptions = true;
+ /// a discarded value for the callback
+ BasicJsonType discarded = BasicJsonType::value_t::discarded;
+};
+
+template<typename BasicJsonType>
+class json_sax_acceptor
+{
+ public:
+ using number_integer_t = typename BasicJsonType::number_integer_t;
+ using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+ using number_float_t = typename BasicJsonType::number_float_t;
+ using string_t = typename BasicJsonType::string_t;
+ using binary_t = typename BasicJsonType::binary_t;
+
+ bool null()
+ {
+ return true;
+ }
+
+ bool boolean(bool /*unused*/)
+ {
+ return true;
+ }
+
+ bool number_integer(number_integer_t /*unused*/)
+ {
+ return true;
+ }
+
+ bool number_unsigned(number_unsigned_t /*unused*/)
+ {
+ return true;
+ }
+
+ bool number_float(number_float_t /*unused*/, const string_t& /*unused*/)
+ {
+ return true;
+ }
+
+ bool string(string_t& /*unused*/)
+ {
+ return true;
+ }
+
+ bool binary(binary_t& /*unused*/)
+ {
+ return true;
+ }
+
+ bool start_object(std::size_t /*unused*/ = std::size_t(-1))
+ {
+ return true;
+ }
+
+ bool key(string_t& /*unused*/)
+ {
+ return true;
+ }
+
+ bool end_object()
+ {
+ return true;
+ }
+
+ bool start_array(std::size_t /*unused*/ = std::size_t(-1))
+ {
+ return true;
+ }
+
+ bool end_array()
+ {
+ return true;
+ }
+
+ bool parse_error(std::size_t /*unused*/, const std::string& /*unused*/, const detail::exception& /*unused*/)
+ {
+ return false;
+ }
+};
+} // namespace detail
+
+} // namespace nlohmann
+
+// #include <nlohmann/detail/input/lexer.hpp>
+
+
+#include <array> // array
+#include <clocale> // localeconv
+#include <cstddef> // size_t
+#include <cstdio> // snprintf
+#include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
+#include <initializer_list> // initializer_list
+#include <string> // char_traits, string
+#include <utility> // move
+#include <vector> // vector
+
+// #include <nlohmann/detail/input/input_adapters.hpp>
+
+// #include <nlohmann/detail/input/position_t.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+///////////
+// lexer //
+///////////
+
+template<typename BasicJsonType>
+class lexer_base
+{
+ public:
+ /// token types for the parser
+ enum class token_type
+ {
+ uninitialized, ///< indicating the scanner is uninitialized
+ literal_true, ///< the `true` literal
+ literal_false, ///< the `false` literal
+ literal_null, ///< the `null` literal
+ value_string, ///< a string -- use get_string() for actual value
+ value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value
+ value_integer, ///< a signed integer -- use get_number_integer() for actual value
+ value_float, ///< a floating-point number -- use get_number_float() for actual value
+ begin_array, ///< the character for array begin `[`
+ begin_object, ///< the character for object begin `{`
+ end_array, ///< the character for array end `]`
+ end_object, ///< the character for object end `}`
+ name_separator, ///< the name separator `:`
+ value_separator, ///< the value separator `,`
+ parse_error, ///< indicating a parse error
+ end_of_input, ///< indicating the end of the input buffer
+ literal_or_value ///< a literal or the begin of a value (only for diagnostics)
+ };
+
+ /// return name of values of type token_type (only used for errors)
+ JSON_HEDLEY_RETURNS_NON_NULL
+ JSON_HEDLEY_CONST
+ static const char* token_type_name(const token_type t) noexcept
+ {
+ switch (t)
+ {
+ case token_type::uninitialized:
+ return "<uninitialized>";
+ case token_type::literal_true:
+ return "true literal";
+ case token_type::literal_false:
+ return "false literal";
+ case token_type::literal_null:
+ return "null literal";
+ case token_type::value_string:
+ return "string literal";
+ case token_type::value_unsigned:
+ case token_type::value_integer:
+ case token_type::value_float:
+ return "number literal";
+ case token_type::begin_array:
+ return "'['";
+ case token_type::begin_object:
+ return "'{'";
+ case token_type::end_array:
+ return "']'";
+ case token_type::end_object:
+ return "'}'";
+ case token_type::name_separator:
+ return "':'";
+ case token_type::value_separator:
+ return "','";
+ case token_type::parse_error:
+ return "<parse error>";
+ case token_type::end_of_input:
+ return "end of input";
+ case token_type::literal_or_value:
+ return "'[', '{', or a literal";
+ // LCOV_EXCL_START
+ default: // catch non-enum values
+ return "unknown token";
+ // LCOV_EXCL_STOP
+ }
+ }
+};
+/*!
+@brief lexical analysis
+
+This class organizes the lexical analysis during JSON deserialization.
+*/
+template<typename BasicJsonType, typename InputAdapterType>
+class lexer : public lexer_base<BasicJsonType>
+{
+ using number_integer_t = typename BasicJsonType::number_integer_t;
+ using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+ using number_float_t = typename BasicJsonType::number_float_t;
+ using string_t = typename BasicJsonType::string_t;
+ using char_type = typename InputAdapterType::char_type;
+ using char_int_type = typename std::char_traits<char_type>::int_type;
+
+ public:
+ using token_type = typename lexer_base<BasicJsonType>::token_type;
+
+ explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) noexcept
+ : ia(std::move(adapter))
+ , ignore_comments(ignore_comments_)
+ , decimal_point_char(static_cast<char_int_type>(get_decimal_point()))
+ {}
+
+ // delete because of pointer members
+ lexer(const lexer&) = delete;
+ lexer(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
+ lexer& operator=(lexer&) = delete;
+ lexer& operator=(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
+ ~lexer() = default;
+
+ private:
+ /////////////////////
+ // locales
+ /////////////////////
+
+ /// return the locale-dependent decimal point
+ JSON_HEDLEY_PURE
+ static char get_decimal_point() noexcept
+ {
+ const auto* loc = localeconv();
+ JSON_ASSERT(loc != nullptr);
+ return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point);
+ }
+
+ /////////////////////
+ // scan functions
+ /////////////////////
+
+ /*!
+ @brief get codepoint from 4 hex characters following `\u`
+
+ For input "\u c1 c2 c3 c4" the codepoint is:
+ (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
+ = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
+
+ Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
+ must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
+ conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
+ between the ASCII value of the character and the desired integer value.
+
+ @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
+ non-hex character)
+ */
+ int get_codepoint()
+ {
+ // this function only makes sense after reading `\u`
+ JSON_ASSERT(current == 'u');
+ int codepoint = 0;
+
+ const auto factors = { 12u, 8u, 4u, 0u };
+ for (const auto factor : factors)
+ {
+ get();
+
+ if (current >= '0' && current <= '9')
+ {
+ codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor);
+ }
+ else if (current >= 'A' && current <= 'F')
+ {
+ codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor);
+ }
+ else if (current >= 'a' && current <= 'f')
+ {
+ codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor);
+ }
+ else
+ {
+ return -1;
+ }
+ }
+
+ JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF);
+ return codepoint;
+ }
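+
+ // Worked example: for the input "\u00E9" the digits '0', '0', 'E', '9' map
+ // to 0x0, 0x0, 0xE, 0x9, so the result is
+ // (0x0 << 12) + (0x0 << 8) + (0xE << 4) + (0x9 << 0) = 0x00E9.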
+
+ /*!
+ @brief check if the next byte(s) are inside a given range
+
+ Adds the current byte and, for each passed range, reads a new byte and
+ checks if it is inside the range. If a violation was detected, set up an
+ error message and return false. Otherwise, return true.
+
+ @param[in] ranges list of integers; interpreted as list of pairs of
+ inclusive lower and upper bound, respectively
+
+ @pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
+ 1, 2, or 3 pairs. This precondition is enforced by an assertion.
+
+ @return true if and only if no range violation was detected
+ */
+ bool next_byte_in_range(std::initializer_list<char_int_type> ranges)
+ {
+ JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6);
+ add(current);
+
+ for (auto range = ranges.begin(); range != ranges.end(); ++range)
+ {
+ get();
+ if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range)))
+ {
+ add(current);
+ }
+ else
+ {
+ error_message = "invalid string: ill-formed UTF-8 byte";
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ /*!
+ @brief scan a string literal
+
+ This function scans a string according to Sect. 7 of RFC 8259. While
+ scanning, escape sequences are decoded and the bytes are copied into the
+ buffer token_buffer. When the function returns successfully, token_buffer is
+ *not* null-terminated (as it may contain \0 bytes), and token_buffer.size()
+ is the number of bytes in the string.
+
+ @return token_type::value_string if string could be successfully scanned,
+ token_type::parse_error otherwise
+
+ @note In case of errors, variable error_message contains a textual
+ description.
+ */
+ token_type scan_string()
+ {
+ // reset token_buffer (ignore opening quote)
+ reset();
+
+ // we entered the function by reading an open quote
+ JSON_ASSERT(current == '\"');
+
+ while (true)
+ {
+ // get next character
+ switch (get())
+ {
+ // end of file while parsing string
+ case std::char_traits<char_type>::eof():
+ {
+ error_message = "invalid string: missing closing quote";
+ return token_type::parse_error;
+ }
+
+ // closing quote
+ case '\"':
+ {
+ return token_type::value_string;
+ }
+
+ // escapes
+ case '\\':
+ {
+ switch (get())
+ {
+ // quotation mark
+ case '\"':
+ add('\"');
+ break;
+ // reverse solidus
+ case '\\':
+ add('\\');
+ break;
+ // solidus
+ case '/':
+ add('/');
+ break;
+ // backspace
+ case 'b':
+ add('\b');
+ break;
+ // form feed
+ case 'f':
+ add('\f');
+ break;
+ // line feed
+ case 'n':
+ add('\n');
+ break;
+ // carriage return
+ case 'r':
+ add('\r');
+ break;
+ // tab
+ case 't':
+ add('\t');
+ break;
+
+ // unicode escapes
+ case 'u':
+ {
+ const int codepoint1 = get_codepoint();
+ int codepoint = codepoint1; // start with codepoint1
+
+ if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
+ {
+ error_message = "invalid string: '\\u' must be followed by 4 hex digits";
+ return token_type::parse_error;
+ }
+
+ // check if code point is a high surrogate
+ if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
+ {
+ // expect next \uxxxx entry
+ if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u'))
+ {
+ const int codepoint2 = get_codepoint();
+
+ if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
+ {
+ error_message = "invalid string: '\\u' must be followed by 4 hex digits";
+ return token_type::parse_error;
+ }
+
+ // check if codepoint2 is a low surrogate
+ if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
+ {
+ // overwrite codepoint
+ codepoint = static_cast<int>(
+ // the high surrogate contributes the upper bits (shifted left by 10)
+ (static_cast<unsigned int>(codepoint1) << 10u)
+ // the low surrogate contributes the lower bits
+ + static_cast<unsigned int>(codepoint2)
+ // both halves still carry their 0xD800/0xDC00 offsets, and the
+ // 0x10000 base is missing; all three corrections fold into:
+ // (0xD800 << 10) + 0xDC00 - 0x10000 = 0x35FDC00
+ - 0x35FDC00u);
+ }
+ else
+ {
+ error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
+ return token_type::parse_error;
+ }
+ }
+ else
+ {
+ error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
+ return token_type::parse_error;
+ }
+ }
+ else
+ {
+ if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))
+ {
+ error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
+ return token_type::parse_error;
+ }
+ }
+
+ // result of the above calculation yields a proper codepoint
+ JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);
+
+ // translate codepoint into bytes
+ if (codepoint < 0x80)
+ {
+ // 1-byte characters: 0xxxxxxx (ASCII)
+ add(static_cast<char_int_type>(codepoint));
+ }
+ else if (codepoint <= 0x7FF)
+ {
+ // 2-byte characters: 110xxxxx 10xxxxxx
+ add(static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
+ add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
+ }
+ else if (codepoint <= 0xFFFF)
+ {
+ // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
+ add(static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
+ add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
+ add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
+ }
+ else
+ {
+ // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ add(static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
+ add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
+ add(static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
+ add(static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
+ }
+
+ break;
+ }
+
+ // other characters after escape
+ default:
+ error_message = "invalid string: forbidden character after backslash";
+ return token_type::parse_error;
+ }
+
+ break;
+ }
+
+ // invalid control characters
+ case 0x00:
+ {
+ error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
+ return token_type::parse_error;
+ }
+
+ case 0x01:
+ {
+ error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
+ return token_type::parse_error;
+ }
+
+ case 0x02:
+ {
+ error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
+ return token_type::parse_error;
+ }
+
+ case 0x03:
+ {
+ error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
+ return token_type::parse_error;
+ }
+
+ case 0x04:
+ {
+ error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
+ return token_type::parse_error;
+ }
+
+ case 0x05:
+ {
+ error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
+ return token_type::parse_error;
+ }
+
+ case 0x06:
+ {
+ error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
+ return token_type::parse_error;
+ }
+
+ case 0x07:
+ {
+ error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
+ return token_type::parse_error;
+ }
+
+ case 0x08:
+ {
+ error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
+ return token_type::parse_error;
+ }
+
+ case 0x09:
+ {
+ error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
+ return token_type::parse_error;
+ }
+
+ case 0x0A:
+ {
+ error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
+ return token_type::parse_error;
+ }
+
+ case 0x0B:
+ {
+ error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
+ return token_type::parse_error;
+ }
+
+ case 0x0C:
+ {
+ error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
+ return token_type::parse_error;
+ }
+
+ case 0x0D:
+ {
+ error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
+ return token_type::parse_error;
+ }
+
+ case 0x0E:
+ {
+ error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
+ return token_type::parse_error;
+ }
+
+ case 0x0F:
+ {
+ error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
+ return token_type::parse_error;
+ }
+
+ case 0x10:
+ {
+ error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
+ return token_type::parse_error;
+ }
+
+ case 0x11:
+ {
+ error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
+ return token_type::parse_error;
+ }
+
+ case 0x12:
+ {
+ error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
+ return token_type::parse_error;
+ }
+
+ case 0x13:
+ {
+ error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
+ return token_type::parse_error;
+ }
+
+ case 0x14:
+ {
+ error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
+ return token_type::parse_error;
+ }
+
+ case 0x15:
+ {
+ error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
+ return token_type::parse_error;
+ }
+
+ case 0x16:
+ {
+ error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
+ return token_type::parse_error;
+ }
+
+ case 0x17:
+ {
+ error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
+ return token_type::parse_error;
+ }
+
+ case 0x18:
+ {
+ error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
+ return token_type::parse_error;
+ }
+
+ case 0x19:
+ {
+ error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
+ return token_type::parse_error;
+ }
+
+ case 0x1A:
+ {
+ error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
+ return token_type::parse_error;
+ }
+
+ case 0x1B:
+ {
+ error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
+ return token_type::parse_error;
+ }
+
+ case 0x1C:
+ {
+ error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
+ return token_type::parse_error;
+ }
+
+ case 0x1D:
+ {
+ error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
+ return token_type::parse_error;
+ }
+
+ case 0x1E:
+ {
+ error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
+ return token_type::parse_error;
+ }
+
+ case 0x1F:
+ {
+ error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
+ return token_type::parse_error;
+ }
+
+ // U+0020..U+007F (except U+0022 (quote) and U+005C (reverse solidus))
+ case 0x20:
+ case 0x21:
+ case 0x23:
+ case 0x24:
+ case 0x25:
+ case 0x26:
+ case 0x27:
+ case 0x28:
+ case 0x29:
+ case 0x2A:
+ case 0x2B:
+ case 0x2C:
+ case 0x2D:
+ case 0x2E:
+ case 0x2F:
+ case 0x30:
+ case 0x31:
+ case 0x32:
+ case 0x33:
+ case 0x34:
+ case 0x35:
+ case 0x36:
+ case 0x37:
+ case 0x38:
+ case 0x39:
+ case 0x3A:
+ case 0x3B:
+ case 0x3C:
+ case 0x3D:
+ case 0x3E:
+ case 0x3F:
+ case 0x40:
+ case 0x41:
+ case 0x42:
+ case 0x43:
+ case 0x44:
+ case 0x45:
+ case 0x46:
+ case 0x47:
+ case 0x48:
+ case 0x49:
+ case 0x4A:
+ case 0x4B:
+ case 0x4C:
+ case 0x4D:
+ case 0x4E:
+ case 0x4F:
+ case 0x50:
+ case 0x51:
+ case 0x52:
+ case 0x53:
+ case 0x54:
+ case 0x55:
+ case 0x56:
+ case 0x57:
+ case 0x58:
+ case 0x59:
+ case 0x5A:
+ case 0x5B:
+ case 0x5D:
+ case 0x5E:
+ case 0x5F:
+ case 0x60:
+ case 0x61:
+ case 0x62:
+ case 0x63:
+ case 0x64:
+ case 0x65:
+ case 0x66:
+ case 0x67:
+ case 0x68:
+ case 0x69:
+ case 0x6A:
+ case 0x6B:
+ case 0x6C:
+ case 0x6D:
+ case 0x6E:
+ case 0x6F:
+ case 0x70:
+ case 0x71:
+ case 0x72:
+ case 0x73:
+ case 0x74:
+ case 0x75:
+ case 0x76:
+ case 0x77:
+ case 0x78:
+ case 0x79:
+ case 0x7A:
+ case 0x7B:
+ case 0x7C:
+ case 0x7D:
+ case 0x7E:
+ case 0x7F:
+ {
+ add(current);
+ break;
+ }
+
+ // U+0080..U+07FF: bytes C2..DF 80..BF
+ case 0xC2:
+ case 0xC3:
+ case 0xC4:
+ case 0xC5:
+ case 0xC6:
+ case 0xC7:
+ case 0xC8:
+ case 0xC9:
+ case 0xCA:
+ case 0xCB:
+ case 0xCC:
+ case 0xCD:
+ case 0xCE:
+ case 0xCF:
+ case 0xD0:
+ case 0xD1:
+ case 0xD2:
+ case 0xD3:
+ case 0xD4:
+ case 0xD5:
+ case 0xD6:
+ case 0xD7:
+ case 0xD8:
+ case 0xD9:
+ case 0xDA:
+ case 0xDB:
+ case 0xDC:
+ case 0xDD:
+ case 0xDE:
+ case 0xDF:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
+ {
+ return token_type::parse_error;
+ }
+ break;
+ }
+
+ // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
+ case 0xE0:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
+ {
+ return token_type::parse_error;
+ }
+ break;
+ }
+
+ // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
+ // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
+ case 0xE1:
+ case 0xE2:
+ case 0xE3:
+ case 0xE4:
+ case 0xE5:
+ case 0xE6:
+ case 0xE7:
+ case 0xE8:
+ case 0xE9:
+ case 0xEA:
+ case 0xEB:
+ case 0xEC:
+ case 0xEE:
+ case 0xEF:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
+ {
+ return token_type::parse_error;
+ }
+ break;
+ }
+
+ // U+D000..U+D7FF: bytes ED 80..9F 80..BF
+ case 0xED:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
+ {
+ return token_type::parse_error;
+ }
+ break;
+ }
+
+ // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
+ case 0xF0:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
+ {
+ return token_type::parse_error;
+ }
+ break;
+ }
+
+ // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
+ case 0xF1:
+ case 0xF2:
+ case 0xF3:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
+ {
+ return token_type::parse_error;
+ }
+ break;
+ }
+
+ // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
+ case 0xF4:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
+ {
+ return token_type::parse_error;
+ }
+ break;
+ }
+
+ // remaining bytes (80..C1 and F5..FF) are ill-formed
+ default:
+ {
+ error_message = "invalid string: ill-formed UTF-8 byte";
+ return token_type::parse_error;
+ }
+ }
+ }
+ }
+
+ /*!
+ * @brief scan a comment
+ * @return whether comment could be scanned successfully
+ */
+ bool scan_comment()
+ {
+ switch (get())
+ {
+ // single-line comments skip input until a newline or EOF is read
+ case '/':
+ {
+ while (true)
+ {
+ switch (get())
+ {
+ case '\n':
+ case '\r':
+ case std::char_traits<char_type>::eof():
+ case '\0':
+ return true;
+
+ default:
+ break;
+ }
+ }
+ }
+
+ // multi-line comments skip input until */ is read
+ case '*':
+ {
+ while (true)
+ {
+ switch (get())
+ {
+ case std::char_traits<char_type>::eof():
+ case '\0':
+ {
+ error_message = "invalid comment; missing closing '*/'";
+ return false;
+ }
+
+ case '*':
+ {
+ switch (get())
+ {
+ case '/':
+ return true;
+
+ default:
+ {
+ unget();
+ continue;
+ }
+ }
+ }
+
+ default:
+ continue;
+ }
+ }
+ }
+
+ // unexpected character after reading '/'
+ default:
+ {
+ error_message = "invalid comment; expecting '/' or '*' after '/'";
+ return false;
+ }
+ }
+ }
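+
+    // A note on usage (editorial sketch, not part of json.hpp): scan_comment()
+    // is only reached when comments are enabled, i.e. via the ignore_comments
+    // parameter of parse(). With the nlohmann::json alias:
+    //
+    //     json j = json::parse(R"({"a": 1 // line comment
+    //                             })",
+    //                          nullptr,  // no parser callback
+    //                          true,     // allow exceptions
+    //                          true);    // ignore_comments
+    //
+    // With ignore_comments == false (the default), the same input is rejected
+    // by scan() as an invalid literal.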
+
+ JSON_HEDLEY_NON_NULL(2)
+ static void strtof(float& f, const char* str, char** endptr) noexcept
+ {
+ f = std::strtof(str, endptr);
+ }
+
+ JSON_HEDLEY_NON_NULL(2)
+ static void strtof(double& f, const char* str, char** endptr) noexcept
+ {
+ f = std::strtod(str, endptr);
+ }
+
+ JSON_HEDLEY_NON_NULL(2)
+ static void strtof(long double& f, const char* str, char** endptr) noexcept
+ {
+ f = std::strtold(str, endptr);
+ }
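+
+    // The three overloads above let scan_number() call a single name,
+    // `strtof(value_float, ...)`, and have overload resolution on the first
+    // argument (number_float_t is float, double, or long double) pick
+    // std::strtof, std::strtod, or std::strtold at compile time. A minimal
+    // stand-alone sketch of the same idiom (hypothetical names):
+    //
+    //     #include <cstdlib>
+    //     static void to_float(float& f, const char* s, char** e)       { f = std::strtof(s, e); }
+    //     static void to_float(double& f, const char* s, char** e)      { f = std::strtod(s, e); }
+    //     static void to_float(long double& f, const char* s, char** e) { f = std::strtold(s, e); }
+    //
+    //     template<typename FloatType>
+    //     FloatType parse_float(const char* s)
+    //     {
+    //         FloatType f{};
+    //         char* end = nullptr;
+    //         to_float(f, s, &end);  // overload selected by FloatType
+    //         return f;
+    //     }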
+
+ /*!
+ @brief scan a number literal
+
+ This function scans a string according to Sect. 6 of RFC 8259.
+
+ The function is realized with a deterministic finite state machine derived
+ from the grammar described in RFC 8259. Starting in state "init", the
+    input is read and used to determine the next state. Only state "done"
+ accepts the number. State "error" is a trap state to model errors. In the
+ table below, "anything" means any character but the ones listed before.
+
+ state | 0 | 1-9 | e E | + | - | . | anything
+ ---------|----------|----------|----------|---------|---------|----------|-----------
+ init | zero | any1 | [error] | [error] | minus | [error] | [error]
+ minus | zero | any1 | [error] | [error] | [error] | [error] | [error]
+ zero | done | done | exponent | done | done | decimal1 | done
+ any1 | any1 | any1 | exponent | done | done | decimal1 | done
+ decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error]
+ decimal2 | decimal2 | decimal2 | exponent | done | done | done | done
+ exponent | any2 | any2 | [error] | sign | sign | [error] | [error]
+ sign | any2 | any2 | [error] | [error] | [error] | [error] | [error]
+ any2 | any2 | any2 | done | done | done | done | done
+
+ The state machine is realized with one label per state (prefixed with
+ "scan_number_") and `goto` statements between them. The state machine
+ contains cycles, but any cycle can be left when EOF is read. Therefore,
+ the function is guaranteed to terminate.
+
+ During scanning, the read bytes are stored in token_buffer. This string is
+ then converted to a signed integer, an unsigned integer, or a
+ floating-point number.
+
+ @return token_type::value_unsigned, token_type::value_integer, or
+ token_type::value_float if number could be successfully scanned,
+ token_type::parse_error otherwise
+
+ @note The scanner is independent of the current locale. Internally, the
+ locale's decimal point is used instead of `.` to work with the
+ locale-dependent converters.
+ */
+ token_type scan_number() // lgtm [cpp/use-of-goto]
+ {
+ // reset token_buffer to store the number's bytes
+ reset();
+
+ // the type of the parsed number; initially set to unsigned; will be
+ // changed if minus sign, decimal point or exponent is read
+ token_type number_type = token_type::value_unsigned;
+
+ // state (init): we just found out we need to scan a number
+ switch (current)
+ {
+ case '-':
+ {
+ add(current);
+ goto scan_number_minus;
+ }
+
+ case '0':
+ {
+ add(current);
+ goto scan_number_zero;
+ }
+
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ {
+ add(current);
+ goto scan_number_any1;
+ }
+
+ // all other characters are rejected outside scan_number()
+ default: // LCOV_EXCL_LINE
+ JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
+ }
+
+scan_number_minus:
+ // state: we just parsed a leading minus sign
+ number_type = token_type::value_integer;
+ switch (get())
+ {
+ case '0':
+ {
+ add(current);
+ goto scan_number_zero;
+ }
+
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ {
+ add(current);
+ goto scan_number_any1;
+ }
+
+ default:
+ {
+ error_message = "invalid number; expected digit after '-'";
+ return token_type::parse_error;
+ }
+ }
+
+scan_number_zero:
+    // state: we just parsed a zero (maybe with a leading minus sign)
+ switch (get())
+ {
+ case '.':
+ {
+ add(decimal_point_char);
+ goto scan_number_decimal1;
+ }
+
+ case 'e':
+ case 'E':
+ {
+ add(current);
+ goto scan_number_exponent;
+ }
+
+ default:
+ goto scan_number_done;
+ }
+
+scan_number_any1:
+    // state: we just parsed a digit 0-9 (maybe with a leading minus sign)
+ switch (get())
+ {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ {
+ add(current);
+ goto scan_number_any1;
+ }
+
+ case '.':
+ {
+ add(decimal_point_char);
+ goto scan_number_decimal1;
+ }
+
+ case 'e':
+ case 'E':
+ {
+ add(current);
+ goto scan_number_exponent;
+ }
+
+ default:
+ goto scan_number_done;
+ }
+
+scan_number_decimal1:
+ // state: we just parsed a decimal point
+ number_type = token_type::value_float;
+ switch (get())
+ {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ {
+ add(current);
+ goto scan_number_decimal2;
+ }
+
+ default:
+ {
+ error_message = "invalid number; expected digit after '.'";
+ return token_type::parse_error;
+ }
+ }
+
+scan_number_decimal2:
+    // we just parsed at least one digit after a decimal point
+ switch (get())
+ {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ {
+ add(current);
+ goto scan_number_decimal2;
+ }
+
+ case 'e':
+ case 'E':
+ {
+ add(current);
+ goto scan_number_exponent;
+ }
+
+ default:
+ goto scan_number_done;
+ }
+
+scan_number_exponent:
+ // we just parsed an exponent
+ number_type = token_type::value_float;
+ switch (get())
+ {
+ case '+':
+ case '-':
+ {
+ add(current);
+ goto scan_number_sign;
+ }
+
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ {
+ add(current);
+ goto scan_number_any2;
+ }
+
+ default:
+ {
+ error_message =
+ "invalid number; expected '+', '-', or digit after exponent";
+ return token_type::parse_error;
+ }
+ }
+
+scan_number_sign:
+ // we just parsed an exponent sign
+ switch (get())
+ {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ {
+ add(current);
+ goto scan_number_any2;
+ }
+
+ default:
+ {
+ error_message = "invalid number; expected digit after exponent sign";
+ return token_type::parse_error;
+ }
+ }
+
+scan_number_any2:
+    // we just parsed a digit after the exponent or exponent sign
+ switch (get())
+ {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ {
+ add(current);
+ goto scan_number_any2;
+ }
+
+ default:
+ goto scan_number_done;
+ }
+
+scan_number_done:
+ // unget the character after the number (we only read it to know that
+ // we are done scanning a number)
+ unget();
+
+ char* endptr = nullptr; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
+ errno = 0;
+
+ // try to parse integers first and fall back to floats
+ if (number_type == token_type::value_unsigned)
+ {
+ const auto x = std::strtoull(token_buffer.data(), &endptr, 10);
+
+ // we checked the number format before
+ JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
+
+ if (errno == 0)
+ {
+ value_unsigned = static_cast<number_unsigned_t>(x);
+ if (value_unsigned == x)
+ {
+ return token_type::value_unsigned;
+ }
+ }
+ }
+ else if (number_type == token_type::value_integer)
+ {
+ const auto x = std::strtoll(token_buffer.data(), &endptr, 10);
+
+ // we checked the number format before
+ JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
+
+ if (errno == 0)
+ {
+ value_integer = static_cast<number_integer_t>(x);
+ if (value_integer == x)
+ {
+ return token_type::value_integer;
+ }
+ }
+ }
+
+ // this code is reached if we parse a floating-point number or if an
+ // integer conversion above failed
+ strtof(value_float, token_buffer.data(), &endptr);
+
+ // we checked the number format before
+ JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
+
+ return token_type::value_float;
+ }
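+
+    // Worked examples for the state machine above (editorial):
+    //   "42"    init -> any1 -> done                         => value_unsigned
+    //   "-7"    init -> minus -> any1 -> done                => value_integer
+    //   "3.14"  init -> any1 -> decimal1 -> decimal2 -> done => value_float
+    //   "1e9"   init -> any1 -> exponent -> any2 -> done     => value_float
+    //   "18446744073709551616" (2^64) scans as unsigned, but std::strtoull
+    //   sets ERANGE, so the conversion above falls through to strtof and the
+    //   token is returned as value_float.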
+
+ /*!
+ @param[in] literal_text the literal text to expect
+ @param[in] length the length of the passed literal text
+ @param[in] return_type the token type to return on success
+ */
+ JSON_HEDLEY_NON_NULL(2)
+ token_type scan_literal(const char_type* literal_text, const std::size_t length,
+ token_type return_type)
+ {
+ JSON_ASSERT(std::char_traits<char_type>::to_char_type(current) == literal_text[0]);
+ for (std::size_t i = 1; i < length; ++i)
+ {
+ if (JSON_HEDLEY_UNLIKELY(std::char_traits<char_type>::to_char_type(get()) != literal_text[i]))
+ {
+ error_message = "invalid literal";
+ return token_type::parse_error;
+ }
+ }
+ return return_type;
+ }
+
+ /////////////////////
+ // input management
+ /////////////////////
+
+ /// reset token_buffer; current character is beginning of token
+ void reset() noexcept
+ {
+ token_buffer.clear();
+ token_string.clear();
+ token_string.push_back(std::char_traits<char_type>::to_char_type(current));
+ }
+
+    /*!
+    @brief get next character from the input
+
+    This function provides the interface to the used input adapter. It does
+    not throw in case the input reached EOF, but returns a
+    `std::char_traits<char_type>::eof()` in that case. Stores the scanned characters
+ for use in error messages.
+
+ @return character read from the input
+ */
+ char_int_type get()
+ {
+ ++position.chars_read_total;
+ ++position.chars_read_current_line;
+
+ if (next_unget)
+ {
+ // just reset the next_unget variable and work with current
+ next_unget = false;
+ }
+ else
+ {
+ current = ia.get_character();
+ }
+
+ if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
+ {
+ token_string.push_back(std::char_traits<char_type>::to_char_type(current));
+ }
+
+ if (current == '\n')
+ {
+ ++position.lines_read;
+ position.chars_read_current_line = 0;
+ }
+
+ return current;
+ }
+
+ /*!
+ @brief unget current character (read it again on next get)
+
+ We implement unget by setting variable next_unget to true. The input is not
+ changed - we just simulate ungetting by modifying chars_read_total,
+ chars_read_current_line, and token_string. The next call to get() will
+ behave as if the unget character is read again.
+ */
+ void unget()
+ {
+ next_unget = true;
+
+ --position.chars_read_total;
+
+ // in case we "unget" a newline, we have to also decrement the lines_read
+ if (position.chars_read_current_line == 0)
+ {
+ if (position.lines_read > 0)
+ {
+ --position.lines_read;
+ }
+ }
+ else
+ {
+ --position.chars_read_current_line;
+ }
+
+ if (JSON_HEDLEY_LIKELY(current != std::char_traits<char_type>::eof()))
+ {
+ JSON_ASSERT(!token_string.empty());
+ token_string.pop_back();
+ }
+ }
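+
+    // Example (editorial): one character of pushback is all the lexer needs.
+    // For the input "12,", scan_number() reads '1', '2', then ',' to learn
+    // the number has ended and calls unget(); the next scan() re-reads ','
+    // and returns token_type::value_separator.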
+
+ /// add a character to token_buffer
+ void add(char_int_type c)
+ {
+ token_buffer.push_back(static_cast<typename string_t::value_type>(c));
+ }
+
+ public:
+ /////////////////////
+ // value getters
+ /////////////////////
+
+ /// return integer value
+ constexpr number_integer_t get_number_integer() const noexcept
+ {
+ return value_integer;
+ }
+
+ /// return unsigned integer value
+ constexpr number_unsigned_t get_number_unsigned() const noexcept
+ {
+ return value_unsigned;
+ }
+
+ /// return floating-point value
+ constexpr number_float_t get_number_float() const noexcept
+ {
+ return value_float;
+ }
+
+ /// return current string value (implicitly resets the token; useful only once)
+ string_t& get_string()
+ {
+ return token_buffer;
+ }
+
+ /////////////////////
+ // diagnostics
+ /////////////////////
+
+ /// return position of last read token
+ constexpr position_t get_position() const noexcept
+ {
+ return position;
+ }
+
+ /// return the last read token (for errors only). Will never contain EOF
+ /// (an arbitrary value that is not a valid char value, often -1), because
+ /// 255 may legitimately occur. May contain NUL, which should be escaped.
+ std::string get_token_string() const
+ {
+ // escape control characters
+ std::string result;
+ for (const auto c : token_string)
+ {
+ if (static_cast<unsigned char>(c) <= '\x1F')
+ {
+ // escape control characters
+ std::array<char, 9> cs{{}};
+ (std::snprintf)(cs.data(), cs.size(), "<U+%.4X>", static_cast<unsigned char>(c)); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
+ result += cs.data();
+ }
+ else
+ {
+ // add character as is
+ result.push_back(static_cast<std::string::value_type>(c));
+ }
+ }
+
+ return result;
+ }
+
+ /// return syntax error message
+ JSON_HEDLEY_RETURNS_NON_NULL
+ constexpr const char* get_error_message() const noexcept
+ {
+ return error_message;
+ }
+
+ /////////////////////
+ // actual scanner
+ /////////////////////
+
+ /*!
+ @brief skip the UTF-8 byte order mark
+ @return true iff there is no BOM or the correct BOM has been skipped
+ */
+ bool skip_bom()
+ {
+ if (get() == 0xEF)
+ {
+            // check that we completely parsed the BOM
+ return get() == 0xBB && get() == 0xBF;
+ }
+
+        // the first character is not the beginning of the BOM; unget it to
+        // process it later
+ unget();
+ return true;
+ }
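+
+    // Examples (editorial): input EF BB BF 7B ... -> BOM consumed, scanning
+    // continues at '{'. Input starting with '{' -> 0x7B != 0xEF, so unget()
+    // makes '{' available again. Input EF BB 41 -> skip_bom() returns false
+    // and scan() reports "invalid BOM; must be 0xEF 0xBB 0xBF if given".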
+
+ void skip_whitespace()
+ {
+ do
+ {
+ get();
+ }
+ while (current == ' ' || current == '\t' || current == '\n' || current == '\r');
+ }
+
+ token_type scan()
+ {
+ // initially, skip the BOM
+ if (position.chars_read_total == 0 && !skip_bom())
+ {
+ error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
+ return token_type::parse_error;
+ }
+
+ // read next character and ignore whitespace
+ skip_whitespace();
+
+ // ignore comments
+ while (ignore_comments && current == '/')
+ {
+ if (!scan_comment())
+ {
+ return token_type::parse_error;
+ }
+
+ // skip following whitespace
+ skip_whitespace();
+ }
+
+ switch (current)
+ {
+ // structural characters
+ case '[':
+ return token_type::begin_array;
+ case ']':
+ return token_type::end_array;
+ case '{':
+ return token_type::begin_object;
+ case '}':
+ return token_type::end_object;
+ case ':':
+ return token_type::name_separator;
+ case ',':
+ return token_type::value_separator;
+
+ // literals
+ case 't':
+ {
+ std::array<char_type, 4> true_literal = {{char_type('t'), char_type('r'), char_type('u'), char_type('e')}};
+ return scan_literal(true_literal.data(), true_literal.size(), token_type::literal_true);
+ }
+ case 'f':
+ {
+ std::array<char_type, 5> false_literal = {{char_type('f'), char_type('a'), char_type('l'), char_type('s'), char_type('e')}};
+ return scan_literal(false_literal.data(), false_literal.size(), token_type::literal_false);
+ }
+ case 'n':
+ {
+ std::array<char_type, 4> null_literal = {{char_type('n'), char_type('u'), char_type('l'), char_type('l')}};
+ return scan_literal(null_literal.data(), null_literal.size(), token_type::literal_null);
+ }
+
+ // string
+ case '\"':
+ return scan_string();
+
+ // number
+ case '-':
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ return scan_number();
+
+ // end of input (the null byte is needed when parsing from
+ // string literals)
+ case '\0':
+ case std::char_traits<char_type>::eof():
+ return token_type::end_of_input;
+
+ // error
+ default:
+ error_message = "invalid literal";
+ return token_type::parse_error;
+ }
+ }
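+
+    // Walk-through (editorial): scanning the input `[null, 1.5]` produces the
+    // token sequence begin_array, literal_null, value_separator, value_float,
+    // end_array, end_of_input; the whitespace is consumed by skip_whitespace()
+    // at the start of each scan() call.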
+
+ private:
+ /// input adapter
+ InputAdapterType ia;
+
+ /// whether comments should be ignored (true) or signaled as errors (false)
+ const bool ignore_comments = false;
+
+ /// the current character
+ char_int_type current = std::char_traits<char_type>::eof();
+
+ /// whether the next get() call should just return current
+ bool next_unget = false;
+
+ /// the start position of the current token
+ position_t position {};
+
+ /// raw input token string (for error messages)
+ std::vector<char_type> token_string {};
+
+ /// buffer for variable-length tokens (numbers, strings)
+ string_t token_buffer {};
+
+    /// a description of the most recent lexer error
+ const char* error_message = "";
+
+ // number values
+ number_integer_t value_integer = 0;
+ number_unsigned_t value_unsigned = 0;
+ number_float_t value_float = 0;
+
+ /// the decimal point
+ const char_int_type decimal_point_char = '.';
+};
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/is_sax.hpp>
+
+
+#include <cstddef> // size_t
+#include <utility> // declval
+#include <string> // string
+
+// #include <nlohmann/detail/meta/detected.hpp>
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+template<typename T>
+using null_function_t = decltype(std::declval<T&>().null());
+
+template<typename T>
+using boolean_function_t =
+ decltype(std::declval<T&>().boolean(std::declval<bool>()));
+
+template<typename T, typename Integer>
+using number_integer_function_t =
+ decltype(std::declval<T&>().number_integer(std::declval<Integer>()));
+
+template<typename T, typename Unsigned>
+using number_unsigned_function_t =
+ decltype(std::declval<T&>().number_unsigned(std::declval<Unsigned>()));
+
+template<typename T, typename Float, typename String>
+using number_float_function_t = decltype(std::declval<T&>().number_float(
+ std::declval<Float>(), std::declval<const String&>()));
+
+template<typename T, typename String>
+using string_function_t =
+ decltype(std::declval<T&>().string(std::declval<String&>()));
+
+template<typename T, typename Binary>
+using binary_function_t =
+ decltype(std::declval<T&>().binary(std::declval<Binary&>()));
+
+template<typename T>
+using start_object_function_t =
+ decltype(std::declval<T&>().start_object(std::declval<std::size_t>()));
+
+template<typename T, typename String>
+using key_function_t =
+ decltype(std::declval<T&>().key(std::declval<String&>()));
+
+template<typename T>
+using end_object_function_t = decltype(std::declval<T&>().end_object());
+
+template<typename T>
+using start_array_function_t =
+ decltype(std::declval<T&>().start_array(std::declval<std::size_t>()));
+
+template<typename T>
+using end_array_function_t = decltype(std::declval<T&>().end_array());
+
+template<typename T, typename Exception>
+using parse_error_function_t = decltype(std::declval<T&>().parse_error(
+ std::declval<std::size_t>(), std::declval<const std::string&>(),
+ std::declval<const Exception&>()));
+
+template<typename SAX, typename BasicJsonType>
+struct is_sax
+{
+ private:
+ static_assert(is_basic_json<BasicJsonType>::value,
+ "BasicJsonType must be of type basic_json<...>");
+
+ using number_integer_t = typename BasicJsonType::number_integer_t;
+ using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+ using number_float_t = typename BasicJsonType::number_float_t;
+ using string_t = typename BasicJsonType::string_t;
+ using binary_t = typename BasicJsonType::binary_t;
+ using exception_t = typename BasicJsonType::exception;
+
+ public:
+ static constexpr bool value =
+ is_detected_exact<bool, null_function_t, SAX>::value &&
+ is_detected_exact<bool, boolean_function_t, SAX>::value &&
+ is_detected_exact<bool, number_integer_function_t, SAX, number_integer_t>::value &&
+ is_detected_exact<bool, number_unsigned_function_t, SAX, number_unsigned_t>::value &&
+ is_detected_exact<bool, number_float_function_t, SAX, number_float_t, string_t>::value &&
+ is_detected_exact<bool, string_function_t, SAX, string_t>::value &&
+ is_detected_exact<bool, binary_function_t, SAX, binary_t>::value &&
+ is_detected_exact<bool, start_object_function_t, SAX>::value &&
+ is_detected_exact<bool, key_function_t, SAX, string_t>::value &&
+ is_detected_exact<bool, end_object_function_t, SAX>::value &&
+ is_detected_exact<bool, start_array_function_t, SAX>::value &&
+ is_detected_exact<bool, end_array_function_t, SAX>::value &&
+ is_detected_exact<bool, parse_error_function_t, SAX, exception_t>::value;
+};
+
+template<typename SAX, typename BasicJsonType>
+struct is_sax_static_asserts
+{
+ private:
+ static_assert(is_basic_json<BasicJsonType>::value,
+ "BasicJsonType must be of type basic_json<...>");
+
+ using number_integer_t = typename BasicJsonType::number_integer_t;
+ using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+ using number_float_t = typename BasicJsonType::number_float_t;
+ using string_t = typename BasicJsonType::string_t;
+ using binary_t = typename BasicJsonType::binary_t;
+ using exception_t = typename BasicJsonType::exception;
+
+ public:
+ static_assert(is_detected_exact<bool, null_function_t, SAX>::value,
+ "Missing/invalid function: bool null()");
+ static_assert(is_detected_exact<bool, boolean_function_t, SAX>::value,
+ "Missing/invalid function: bool boolean(bool)");
+ static_assert(
+ is_detected_exact<bool, number_integer_function_t, SAX,
+ number_integer_t>::value,
+ "Missing/invalid function: bool number_integer(number_integer_t)");
+ static_assert(
+ is_detected_exact<bool, number_unsigned_function_t, SAX,
+ number_unsigned_t>::value,
+ "Missing/invalid function: bool number_unsigned(number_unsigned_t)");
+ static_assert(is_detected_exact<bool, number_float_function_t, SAX,
+ number_float_t, string_t>::value,
+ "Missing/invalid function: bool number_float(number_float_t, const string_t&)");
+ static_assert(
+ is_detected_exact<bool, string_function_t, SAX, string_t>::value,
+ "Missing/invalid function: bool string(string_t&)");
+ static_assert(
+ is_detected_exact<bool, binary_function_t, SAX, binary_t>::value,
+ "Missing/invalid function: bool binary(binary_t&)");
+ static_assert(is_detected_exact<bool, start_object_function_t, SAX>::value,
+ "Missing/invalid function: bool start_object(std::size_t)");
+ static_assert(is_detected_exact<bool, key_function_t, SAX, string_t>::value,
+ "Missing/invalid function: bool key(string_t&)");
+ static_assert(is_detected_exact<bool, end_object_function_t, SAX>::value,
+ "Missing/invalid function: bool end_object()");
+ static_assert(is_detected_exact<bool, start_array_function_t, SAX>::value,
+ "Missing/invalid function: bool start_array(std::size_t)");
+ static_assert(is_detected_exact<bool, end_array_function_t, SAX>::value,
+ "Missing/invalid function: bool end_array()");
+ static_assert(
+ is_detected_exact<bool, parse_error_function_t, SAX, exception_t>::value,
+ "Missing/invalid function: bool parse_error(std::size_t, const "
+ "std::string&, const exception&)");
+};
+} // namespace detail
+} // namespace nlohmann
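+
+// Editorial sketch (not part of json.hpp): a minimal SAX consumer satisfying
+// the interface asserted above, written for the default nlohmann::json type
+// aliases (number_integer_t = std::int64_t, number_unsigned_t = std::uint64_t,
+// number_float_t = double, string_t = std::string). Each handler returns true
+// to continue parsing:
+//
+//     struct counting_sax
+//     {
+//         std::size_t events = 0;
+//         bool null() { ++events; return true; }
+//         bool boolean(bool) { ++events; return true; }
+//         bool number_integer(std::int64_t) { ++events; return true; }
+//         bool number_unsigned(std::uint64_t) { ++events; return true; }
+//         bool number_float(double, const std::string&) { ++events; return true; }
+//         bool string(std::string&) { ++events; return true; }
+//         bool binary(nlohmann::json::binary_t&) { ++events; return true; }
+//         bool start_object(std::size_t) { ++events; return true; }
+//         bool key(std::string&) { ++events; return true; }
+//         bool end_object() { ++events; return true; }
+//         bool start_array(std::size_t) { ++events; return true; }
+//         bool end_array() { ++events; return true; }
+//         bool parse_error(std::size_t, const std::string&,
+//                          const nlohmann::json::exception&) { return false; }
+//     };
+//
+//     counting_sax sax;
+//     bool ok = nlohmann::json::sax_parse(R"({"a": [1, 2.5, "x"]})", &sax);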
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+
+/// how to treat CBOR tags
+enum class cbor_tag_handler_t
+{
+ error, ///< throw a parse_error exception in case of a tag
+ ignore, ///< ignore tags
+ store ///< store tags as binary type
+};
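+
+// Editorial example: this enum is the tag_handler parameter of from_cbor().
+// For the CBOR bytes {0xC6, 0x18, 0x2A} (tag 6 wrapping the integer 42):
+//
+//     std::vector<std::uint8_t> bytes = {0xC6, 0x18, 0x2A};
+//     // default handler (error): throws parse_error.112 "invalid byte: 0xC6"
+//     auto j1 = nlohmann::json::from_cbor(bytes);
+//     // ignore: the tag is skipped and the wrapped value is parsed
+//     auto j2 = nlohmann::json::from_cbor(bytes, true, true,
+//                   nlohmann::json::cbor_tag_handler_t::ignore);  // j2 == 42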
+
+/*!
+@brief determine system byte order
+
+@return true if and only if the system's byte order is little-endian
+
+@note from https://stackoverflow.com/a/1001328/266378
+*/
+static inline bool little_endianess(int num = 1) noexcept
+{
+ return *reinterpret_cast<char*>(&num) == 1;
+}
+
+
+///////////////////
+// binary reader //
+///////////////////
+
+/*!
+@brief deserialization of CBOR, MessagePack, and UBJSON values
+*/
+template<typename BasicJsonType, typename InputAdapterType, typename SAX = json_sax_dom_parser<BasicJsonType>>
+class binary_reader
+{
+ using number_integer_t = typename BasicJsonType::number_integer_t;
+ using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+ using number_float_t = typename BasicJsonType::number_float_t;
+ using string_t = typename BasicJsonType::string_t;
+ using binary_t = typename BasicJsonType::binary_t;
+ using json_sax_t = SAX;
+ using char_type = typename InputAdapterType::char_type;
+ using char_int_type = typename std::char_traits<char_type>::int_type;
+
+ public:
+ /*!
+ @brief create a binary reader
+
+ @param[in] adapter input adapter to read from
+ */
+ explicit binary_reader(InputAdapterType&& adapter) noexcept : ia(std::move(adapter))
+ {
+ (void)detail::is_sax_static_asserts<SAX, BasicJsonType> {};
+ }
+
+ // make class move-only
+ binary_reader(const binary_reader&) = delete;
+ binary_reader(binary_reader&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
+ binary_reader& operator=(const binary_reader&) = delete;
+ binary_reader& operator=(binary_reader&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
+ ~binary_reader() = default;
+
+ /*!
+ @param[in] format the binary format to parse
+ @param[in] sax_ a SAX event processor
+    @param[in] strict whether to expect the input to be consumed completely
+ @param[in] tag_handler how to treat CBOR tags
+
+ @return whether parsing was successful
+ */
+ JSON_HEDLEY_NON_NULL(3)
+ bool sax_parse(const input_format_t format,
+ json_sax_t* sax_,
+ const bool strict = true,
+ const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+ {
+ sax = sax_;
+ bool result = false;
+
+ switch (format)
+ {
+ case input_format_t::bson:
+ result = parse_bson_internal();
+ break;
+
+ case input_format_t::cbor:
+ result = parse_cbor_internal(true, tag_handler);
+ break;
+
+ case input_format_t::msgpack:
+ result = parse_msgpack_internal();
+ break;
+
+ case input_format_t::ubjson:
+ result = parse_ubjson_internal();
+ break;
+
+ case input_format_t::json: // LCOV_EXCL_LINE
+ default: // LCOV_EXCL_LINE
+ JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
+ }
+
+ // strict mode: next byte must be EOF
+ if (result && strict)
+ {
+ if (format == input_format_t::ubjson)
+ {
+ get_ignore_noop();
+ }
+ else
+ {
+ get();
+ }
+
+ if (JSON_HEDLEY_UNLIKELY(current != std::char_traits<char_type>::eof()))
+ {
+ return sax->parse_error(chars_read, get_token_string(),
+ parse_error::create(110, chars_read, exception_message(format, "expected end of input; last byte: 0x" + get_token_string(), "value"), BasicJsonType()));
+ }
+ }
+
+ return result;
+ }
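+
+    // Editorial note: with strict == true, any byte after the first complete
+    // value triggers parse_error.110 above; with strict == false the reader
+    // stops after that value, so e.g.
+    // nlohmann::json::from_cbor(bytes, /*strict=*/false) accepts trailing data.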
+
+ private:
+ //////////
+ // BSON //
+ //////////
+
+ /*!
+    @brief Reads in a BSON object and passes it to the SAX parser.
+    @return whether a valid BSON value was passed to the SAX parser
+ */
+ bool parse_bson_internal()
+ {
+ std::int32_t document_size{};
+ get_number<std::int32_t, true>(input_format_t::bson, document_size);
+
+ if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1))))
+ {
+ return false;
+ }
+
+ if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_list(/*is_array*/false)))
+ {
+ return false;
+ }
+
+ return sax->end_object();
+ }
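+
+    // Editorial worked example: the BSON document for {"a": 1} (int32) is
+    //
+    //     0C 00 00 00    document size (12, little-endian, includes itself)
+    //     10 61 00       element type 0x10 (int32), key "a" with NUL terminator
+    //     01 00 00 00    value 1
+    //     00             end-of-document marker
+    //
+    //     std::vector<std::uint8_t> doc = {0x0C,0,0,0, 0x10,'a',0, 1,0,0,0, 0};
+    //     auto j = nlohmann::json::from_bson(doc);  // j == {"a": 1}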
+
+ /*!
+ @brief Parses a C-style string from the BSON input.
+ @param[in,out] result A reference to the string variable where the read
+ string is to be stored.
+ @return `true` if the \x00-byte indicating the end of the string was
+            encountered before the EOF; `false` indicates an unexpected EOF.
+ */
+ bool get_bson_cstr(string_t& result)
+ {
+ auto out = std::back_inserter(result);
+ while (true)
+ {
+ get();
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::bson, "cstring")))
+ {
+ return false;
+ }
+ if (current == 0x00)
+ {
+ return true;
+ }
+ *out++ = static_cast<typename string_t::value_type>(current);
+ }
+ }
+
+ /*!
+ @brief Parses a zero-terminated string of length @a len from the BSON
+ input.
+ @param[in] len The length (including the zero-byte at the end) of the
+ string to be read.
+ @param[in,out] result A reference to the string variable where the read
+ string is to be stored.
+ @tparam NumberType The type of the length @a len
+ @pre len >= 1
+ @return `true` if the string was successfully parsed
+ */
+ template<typename NumberType>
+ bool get_bson_string(const NumberType len, string_t& result)
+ {
+ if (JSON_HEDLEY_UNLIKELY(len < 1))
+ {
+ auto last_token = get_token_string();
+ return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::bson, "string length must be at least 1, is " + std::to_string(len), "string"), BasicJsonType()));
+ }
+
+ return get_string(input_format_t::bson, len - static_cast<NumberType>(1), result) && get() != std::char_traits<char_type>::eof();
+ }
+
+ /*!
+ @brief Parses a byte array input of length @a len from the BSON input.
+ @param[in] len The length of the byte array to be read.
+ @param[in,out] result A reference to the binary variable where the read
+ array is to be stored.
+ @tparam NumberType The type of the length @a len
+ @pre len >= 0
+ @return `true` if the byte array was successfully parsed
+ */
+ template<typename NumberType>
+ bool get_bson_binary(const NumberType len, binary_t& result)
+ {
+ if (JSON_HEDLEY_UNLIKELY(len < 0))
+ {
+ auto last_token = get_token_string();
+ return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::bson, "byte array length cannot be negative, is " + std::to_string(len), "binary"), BasicJsonType()));
+ }
+
+ // All BSON binary values have a subtype
+ std::uint8_t subtype{};
+ get_number<std::uint8_t>(input_format_t::bson, subtype);
+ result.set_subtype(subtype);
+
+ return get_binary(input_format_t::bson, len, result);
+ }
+
+ /*!
+ @brief Read a BSON document element of the given @a element_type.
+    @param[in] element_type The BSON element type, cf. http://bsonspec.org/spec.html
+ @param[in] element_type_parse_position The position in the input stream,
+ where the `element_type` was read.
+ @warning Not all BSON element types are supported yet. An unsupported
+ @a element_type will give rise to a parse_error.114:
+ Unsupported BSON record type 0x...
+ @return whether a valid BSON-object/array was passed to the SAX parser
+ */
+ bool parse_bson_element_internal(const char_int_type element_type,
+ const std::size_t element_type_parse_position)
+ {
+ switch (element_type)
+ {
+ case 0x01: // double
+ {
+ double number{};
+ return get_number<double, true>(input_format_t::bson, number) && sax->number_float(static_cast<number_float_t>(number), "");
+ }
+
+ case 0x02: // string
+ {
+ std::int32_t len{};
+ string_t value;
+ return get_number<std::int32_t, true>(input_format_t::bson, len) && get_bson_string(len, value) && sax->string(value);
+ }
+
+ case 0x03: // object
+ {
+ return parse_bson_internal();
+ }
+
+ case 0x04: // array
+ {
+ return parse_bson_array();
+ }
+
+ case 0x05: // binary
+ {
+ std::int32_t len{};
+ binary_t value;
+ return get_number<std::int32_t, true>(input_format_t::bson, len) && get_bson_binary(len, value) && sax->binary(value);
+ }
+
+ case 0x08: // boolean
+ {
+ return sax->boolean(get() != 0);
+ }
+
+ case 0x0A: // null
+ {
+ return sax->null();
+ }
+
+ case 0x10: // int32
+ {
+ std::int32_t value{};
+ return get_number<std::int32_t, true>(input_format_t::bson, value) && sax->number_integer(value);
+ }
+
+ case 0x12: // int64
+ {
+ std::int64_t value{};
+ return get_number<std::int64_t, true>(input_format_t::bson, value) && sax->number_integer(value);
+ }
+
+ default: // anything else not supported (yet)
+ {
+ std::array<char, 3> cr{{}};
+ (std::snprintf)(cr.data(), cr.size(), "%.2hhX", static_cast<unsigned char>(element_type)); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
+ return sax->parse_error(element_type_parse_position, std::string(cr.data()), parse_error::create(114, element_type_parse_position, "Unsupported BSON record type 0x" + std::string(cr.data()), BasicJsonType()));
+ }
+ }
+ }
+
+ /*!
+ @brief Read a BSON element list (as specified in the BSON-spec)
+
+ The same binary layout is used for objects and arrays, hence it must be
+ indicated with the argument @a is_array which one is expected
+ (true --> array, false --> object).
+
+ @param[in] is_array Determines if the element list being read is to be
+ treated as an object (@a is_array == false), or as an
+ array (@a is_array == true).
+ @return whether a valid BSON-object/array was passed to the SAX parser
+ */
+ bool parse_bson_element_list(const bool is_array)
+ {
+ string_t key;
+
+ while (auto element_type = get())
+ {
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::bson, "element list")))
+ {
+ return false;
+ }
+
+ const std::size_t element_type_parse_position = chars_read;
+ if (JSON_HEDLEY_UNLIKELY(!get_bson_cstr(key)))
+ {
+ return false;
+ }
+
+ if (!is_array && !sax->key(key))
+ {
+ return false;
+ }
+
+ if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_internal(element_type, element_type_parse_position)))
+ {
+ return false;
+ }
+
+ // get_bson_cstr only appends
+ key.clear();
+ }
+
+ return true;
+ }
+
+ /*!
+    @brief Reads an array from the BSON input and passes it to the SAX parser.
+ @return whether a valid BSON-array was passed to the SAX parser
+ */
+ bool parse_bson_array()
+ {
+ std::int32_t document_size{};
+ get_number<std::int32_t, true>(input_format_t::bson, document_size);
+
+ if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1))))
+ {
+ return false;
+ }
+
+ if (JSON_HEDLEY_UNLIKELY(!parse_bson_element_list(/*is_array*/true)))
+ {
+ return false;
+ }
+
+ return sax->end_array();
+ }
+
+ //////////
+ // CBOR //
+ //////////
+
+ /*!
+ @param[in] get_char whether a new character should be retrieved from the
+ input (true) or whether the last read character should
+ be considered instead (false)
+ @param[in] tag_handler how CBOR tags should be treated
+
+ @return whether a valid CBOR value was passed to the SAX parser
+ */
+ bool parse_cbor_internal(const bool get_char,
+ const cbor_tag_handler_t tag_handler)
+ {
+ switch (get_char ? get() : current)
+ {
+ // EOF
+ case std::char_traits<char_type>::eof():
+ return unexpect_eof(input_format_t::cbor, "value");
+
+ // Integer 0x00..0x17 (0..23)
+ case 0x00:
+ case 0x01:
+ case 0x02:
+ case 0x03:
+ case 0x04:
+ case 0x05:
+ case 0x06:
+ case 0x07:
+ case 0x08:
+ case 0x09:
+ case 0x0A:
+ case 0x0B:
+ case 0x0C:
+ case 0x0D:
+ case 0x0E:
+ case 0x0F:
+ case 0x10:
+ case 0x11:
+ case 0x12:
+ case 0x13:
+ case 0x14:
+ case 0x15:
+ case 0x16:
+ case 0x17:
+ return sax->number_unsigned(static_cast<number_unsigned_t>(current));
+
+ case 0x18: // Unsigned integer (one-byte uint8_t follows)
+ {
+ std::uint8_t number{};
+ return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
+ }
+
+ case 0x19: // Unsigned integer (two-byte uint16_t follows)
+ {
+ std::uint16_t number{};
+ return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
+ }
+
+ case 0x1A: // Unsigned integer (four-byte uint32_t follows)
+ {
+ std::uint32_t number{};
+ return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
+ }
+
+ case 0x1B: // Unsigned integer (eight-byte uint64_t follows)
+ {
+ std::uint64_t number{};
+ return get_number(input_format_t::cbor, number) && sax->number_unsigned(number);
+ }
+
+ // Negative integer -1-0x00..-1-0x17 (-1..-24)
+ case 0x20:
+ case 0x21:
+ case 0x22:
+ case 0x23:
+ case 0x24:
+ case 0x25:
+ case 0x26:
+ case 0x27:
+ case 0x28:
+ case 0x29:
+ case 0x2A:
+ case 0x2B:
+ case 0x2C:
+ case 0x2D:
+ case 0x2E:
+ case 0x2F:
+ case 0x30:
+ case 0x31:
+ case 0x32:
+ case 0x33:
+ case 0x34:
+ case 0x35:
+ case 0x36:
+ case 0x37:
+ return sax->number_integer(static_cast<std::int8_t>(0x20 - 1 - current));
+
+ case 0x38: // Negative integer (one-byte uint8_t follows)
+ {
+ std::uint8_t number{};
+ return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1) - number);
+ }
+
+ case 0x39: // Negative integer -1-n (two-byte uint16_t follows)
+ {
+ std::uint16_t number{};
+ return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1) - number);
+ }
+
+ case 0x3A: // Negative integer -1-n (four-byte uint32_t follows)
+ {
+ std::uint32_t number{};
+ return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1) - number);
+ }
+
+ case 0x3B: // Negative integer -1-n (eight-byte uint64_t follows)
+ {
+ std::uint64_t number{};
+ return get_number(input_format_t::cbor, number) && sax->number_integer(static_cast<number_integer_t>(-1)
+ - static_cast<number_integer_t>(number));
+ }
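+
+            // Editorial worked examples for major type 1 (value = -1 - n):
+            //   byte  0x20           -> 0x20 - 1 - 0x20 = -1
+            //   byte  0x37           -> 0x20 - 1 - 0x37 = -24
+            //   bytes 0x38 0x18      -> n = 24,  value = -25
+            //   bytes 0x39 0x03 0xE7 -> n = 999, value = -1000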
+
+ // Binary data (0x00..0x17 bytes follow)
+ case 0x40:
+ case 0x41:
+ case 0x42:
+ case 0x43:
+ case 0x44:
+ case 0x45:
+ case 0x46:
+ case 0x47:
+ case 0x48:
+ case 0x49:
+ case 0x4A:
+ case 0x4B:
+ case 0x4C:
+ case 0x4D:
+ case 0x4E:
+ case 0x4F:
+ case 0x50:
+ case 0x51:
+ case 0x52:
+ case 0x53:
+ case 0x54:
+ case 0x55:
+ case 0x56:
+ case 0x57:
+ case 0x58: // Binary data (one-byte uint8_t for n follows)
+ case 0x59: // Binary data (two-byte uint16_t for n follow)
+ case 0x5A: // Binary data (four-byte uint32_t for n follow)
+ case 0x5B: // Binary data (eight-byte uint64_t for n follow)
+ case 0x5F: // Binary data (indefinite length)
+ {
+ binary_t b;
+ return get_cbor_binary(b) && sax->binary(b);
+ }
+
+ // UTF-8 string (0x00..0x17 bytes follow)
+ case 0x60:
+ case 0x61:
+ case 0x62:
+ case 0x63:
+ case 0x64:
+ case 0x65:
+ case 0x66:
+ case 0x67:
+ case 0x68:
+ case 0x69:
+ case 0x6A:
+ case 0x6B:
+ case 0x6C:
+ case 0x6D:
+ case 0x6E:
+ case 0x6F:
+ case 0x70:
+ case 0x71:
+ case 0x72:
+ case 0x73:
+ case 0x74:
+ case 0x75:
+ case 0x76:
+ case 0x77:
+ case 0x78: // UTF-8 string (one-byte uint8_t for n follows)
+ case 0x79: // UTF-8 string (two-byte uint16_t for n follow)
+ case 0x7A: // UTF-8 string (four-byte uint32_t for n follow)
+ case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow)
+ case 0x7F: // UTF-8 string (indefinite length)
+ {
+ string_t s;
+ return get_cbor_string(s) && sax->string(s);
+ }
+
+ // array (0x00..0x17 data items follow)
+ case 0x80:
+ case 0x81:
+ case 0x82:
+ case 0x83:
+ case 0x84:
+ case 0x85:
+ case 0x86:
+ case 0x87:
+ case 0x88:
+ case 0x89:
+ case 0x8A:
+ case 0x8B:
+ case 0x8C:
+ case 0x8D:
+ case 0x8E:
+ case 0x8F:
+ case 0x90:
+ case 0x91:
+ case 0x92:
+ case 0x93:
+ case 0x94:
+ case 0x95:
+ case 0x96:
+ case 0x97:
+ return get_cbor_array(static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x1Fu), tag_handler);
+
+ case 0x98: // array (one-byte uint8_t for n follows)
+ {
+ std::uint8_t len{};
+ return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast<std::size_t>(len), tag_handler);
+ }
+
+ case 0x99: // array (two-byte uint16_t for n follow)
+ {
+ std::uint16_t len{};
+ return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast<std::size_t>(len), tag_handler);
+ }
+
+ case 0x9A: // array (four-byte uint32_t for n follow)
+ {
+ std::uint32_t len{};
+ return get_number(input_format_t::cbor, len) && get_cbor_array(static_cast<std::size_t>(len), tag_handler);
+ }
+
+ case 0x9B: // array (eight-byte uint64_t for n follow)
+ {
+ std::uint64_t len{};
+ return get_number(input_format_t::cbor, len) && get_cbor_array(detail::conditional_static_cast<std::size_t>(len), tag_handler);
+ }
+
+ case 0x9F: // array (indefinite length)
+ return get_cbor_array(std::size_t(-1), tag_handler);
+
+ // map (0x00..0x17 pairs of data items follow)
+ case 0xA0:
+ case 0xA1:
+ case 0xA2:
+ case 0xA3:
+ case 0xA4:
+ case 0xA5:
+ case 0xA6:
+ case 0xA7:
+ case 0xA8:
+ case 0xA9:
+ case 0xAA:
+ case 0xAB:
+ case 0xAC:
+ case 0xAD:
+ case 0xAE:
+ case 0xAF:
+ case 0xB0:
+ case 0xB1:
+ case 0xB2:
+ case 0xB3:
+ case 0xB4:
+ case 0xB5:
+ case 0xB6:
+ case 0xB7:
+ return get_cbor_object(static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x1Fu), tag_handler);
+
+ case 0xB8: // map (one-byte uint8_t for n follows)
+ {
+ std::uint8_t len{};
+ return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast<std::size_t>(len), tag_handler);
+ }
+
+ case 0xB9: // map (two-byte uint16_t for n follow)
+ {
+ std::uint16_t len{};
+ return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast<std::size_t>(len), tag_handler);
+ }
+
+ case 0xBA: // map (four-byte uint32_t for n follow)
+ {
+ std::uint32_t len{};
+ return get_number(input_format_t::cbor, len) && get_cbor_object(static_cast<std::size_t>(len), tag_handler);
+ }
+
+ case 0xBB: // map (eight-byte uint64_t for n follow)
+ {
+ std::uint64_t len{};
+ return get_number(input_format_t::cbor, len) && get_cbor_object(detail::conditional_static_cast<std::size_t>(len), tag_handler);
+ }
+
+ case 0xBF: // map (indefinite length)
+ return get_cbor_object(std::size_t(-1), tag_handler);
+
+ case 0xC6: // tagged item
+ case 0xC7:
+ case 0xC8:
+ case 0xC9:
+ case 0xCA:
+ case 0xCB:
+ case 0xCC:
+ case 0xCD:
+ case 0xCE:
+ case 0xCF:
+ case 0xD0:
+ case 0xD1:
+ case 0xD2:
+ case 0xD3:
+ case 0xD4:
+            case 0xD8: // tagged item (1 byte follows)
+ case 0xD9: // tagged item (2 bytes follow)
+ case 0xDA: // tagged item (4 bytes follow)
+ case 0xDB: // tagged item (8 bytes follow)
+ {
+ switch (tag_handler)
+ {
+ case cbor_tag_handler_t::error:
+ {
+ auto last_token = get_token_string();
+ return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::cbor, "invalid byte: 0x" + last_token, "value"), BasicJsonType()));
+ }
+
+ case cbor_tag_handler_t::ignore:
+ {
+ // ignore binary subtype
+ switch (current)
+ {
+ case 0xD8:
+ {
+ std::uint8_t subtype_to_ignore{};
+ get_number(input_format_t::cbor, subtype_to_ignore);
+ break;
+ }
+ case 0xD9:
+ {
+ std::uint16_t subtype_to_ignore{};
+ get_number(input_format_t::cbor, subtype_to_ignore);
+ break;
+ }
+ case 0xDA:
+ {
+ std::uint32_t subtype_to_ignore{};
+ get_number(input_format_t::cbor, subtype_to_ignore);
+ break;
+ }
+ case 0xDB:
+ {
+ std::uint64_t subtype_to_ignore{};
+ get_number(input_format_t::cbor, subtype_to_ignore);
+ break;
+ }
+ default:
+ break;
+ }
+ return parse_cbor_internal(true, tag_handler);
+ }
+
+ case cbor_tag_handler_t::store:
+ {
+ binary_t b;
+ // use binary subtype and store in binary container
+ switch (current)
+ {
+ case 0xD8:
+ {
+ std::uint8_t subtype{};
+ get_number(input_format_t::cbor, subtype);
+ b.set_subtype(detail::conditional_static_cast<typename binary_t::subtype_type>(subtype));
+ break;
+ }
+ case 0xD9:
+ {
+ std::uint16_t subtype{};
+ get_number(input_format_t::cbor, subtype);
+ b.set_subtype(detail::conditional_static_cast<typename binary_t::subtype_type>(subtype));
+ break;
+ }
+ case 0xDA:
+ {
+ std::uint32_t subtype{};
+ get_number(input_format_t::cbor, subtype);
+ b.set_subtype(detail::conditional_static_cast<typename binary_t::subtype_type>(subtype));
+ break;
+ }
+ case 0xDB:
+ {
+ std::uint64_t subtype{};
+ get_number(input_format_t::cbor, subtype);
+ b.set_subtype(detail::conditional_static_cast<typename binary_t::subtype_type>(subtype));
+ break;
+ }
+ default:
+ return parse_cbor_internal(true, tag_handler);
+ }
+ get();
+ return get_cbor_binary(b) && sax->binary(b);
+ }
+
+ default: // LCOV_EXCL_LINE
+ JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
+ return false; // LCOV_EXCL_LINE
+ }
+ }
+
+ case 0xF4: // false
+ return sax->boolean(false);
+
+ case 0xF5: // true
+ return sax->boolean(true);
+
+ case 0xF6: // null
+ return sax->null();
+
+ case 0xF9: // Half-Precision Float (two-byte IEEE 754)
+ {
+ const auto byte1_raw = get();
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number")))
+ {
+ return false;
+ }
+ const auto byte2_raw = get();
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "number")))
+ {
+ return false;
+ }
+
+ const auto byte1 = static_cast<unsigned char>(byte1_raw);
+ const auto byte2 = static_cast<unsigned char>(byte2_raw);
+
+ // code from RFC 7049, Appendix D, Figure 3:
+ // As half-precision floating-point numbers were only added
+ // to IEEE 754 in 2008, today's programming platforms often
+ // still only have limited support for them. It is very
+ // easy to include at least decoding support for them even
+ // without such support. An example of a small decoder for
+ // half-precision floating-point numbers in the C language
+ // is shown in Fig. 3.
+ const auto half = static_cast<unsigned int>((byte1 << 8u) + byte2);
+ const double val = [&half]
+ {
+ const int exp = (half >> 10u) & 0x1Fu;
+ const unsigned int mant = half & 0x3FFu;
+                    JSON_ASSERT(0 <= exp && exp <= 32);
+ JSON_ASSERT(mant <= 1024);
+ switch (exp)
+ {
+ case 0:
+ return std::ldexp(mant, -24);
+ case 31:
+ return (mant == 0)
+ ? std::numeric_limits<double>::infinity()
+ : std::numeric_limits<double>::quiet_NaN();
+ default:
+ return std::ldexp(mant + 1024, exp - 25);
+ }
+ }();
+ return sax->number_float((half & 0x8000u) != 0
+ ? static_cast<number_float_t>(-val)
+ : static_cast<number_float_t>(val), "");
+ }
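+
+            // Editorial worked examples for the half-precision decoder above:
+            //   0x3C00: exp = 15, mant = 0 -> ldexp(1024, -10)           =  1.0
+            //   0xC000: exp = 16, mant = 0, sign bit set -> -ldexp(1024, -9) = -2.0
+            //   0x7C00: exp = 31, mant = 0 -> +infinity
+            //   0x0001: exp = 0,  mant = 1 -> ldexp(1, -24), the smallest subnormal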
+
+ case 0xFA: // Single-Precision Float (four-byte IEEE 754)
+ {
+ float number{};
+ return get_number(input_format_t::cbor, number) && sax->number_float(static_cast<number_float_t>(number), "");
+ }
+
+ case 0xFB: // Double-Precision Float (eight-byte IEEE 754)
+ {
+ double number{};
+ return get_number(input_format_t::cbor, number) && sax->number_float(static_cast<number_float_t>(number), "");
+ }
+
+ default: // anything else (0xFF is handled inside the other types)
+ {
+ auto last_token = get_token_string();
+ return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::cbor, "invalid byte: 0x" + last_token, "value"), BasicJsonType()));
+ }
+ }
+ }
+
+ /*!
+ @brief reads a CBOR string
+
+ This function first reads starting bytes to determine the expected
+ string length and then copies this number of bytes into a string.
+ Additionally, CBOR's strings with indefinite lengths are supported.
+
+ @param[out] result created string
+
+ @return whether string creation completed
+ */
+ bool get_cbor_string(string_t& result)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "string")))
+ {
+ return false;
+ }
+
+ switch (current)
+ {
+ // UTF-8 string (0x00..0x17 bytes follow)
+ case 0x60:
+ case 0x61:
+ case 0x62:
+ case 0x63:
+ case 0x64:
+ case 0x65:
+ case 0x66:
+ case 0x67:
+ case 0x68:
+ case 0x69:
+ case 0x6A:
+ case 0x6B:
+ case 0x6C:
+ case 0x6D:
+ case 0x6E:
+ case 0x6F:
+ case 0x70:
+ case 0x71:
+ case 0x72:
+ case 0x73:
+ case 0x74:
+ case 0x75:
+ case 0x76:
+ case 0x77:
+ {
+ return get_string(input_format_t::cbor, static_cast<unsigned int>(current) & 0x1Fu, result);
+ }
+
+ case 0x78: // UTF-8 string (one-byte uint8_t for n follows)
+ {
+ std::uint8_t len{};
+ return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result);
+ }
+
+ case 0x79: // UTF-8 string (two-byte uint16_t for n follow)
+ {
+ std::uint16_t len{};
+ return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result);
+ }
+
+ case 0x7A: // UTF-8 string (four-byte uint32_t for n follow)
+ {
+ std::uint32_t len{};
+ return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result);
+ }
+
+ case 0x7B: // UTF-8 string (eight-byte uint64_t for n follow)
+ {
+ std::uint64_t len{};
+ return get_number(input_format_t::cbor, len) && get_string(input_format_t::cbor, len, result);
+ }
+
+ case 0x7F: // UTF-8 string (indefinite length)
+ {
+ while (get() != 0xFF)
+ {
+ string_t chunk;
+ if (!get_cbor_string(chunk))
+ {
+ return false;
+ }
+ result.append(chunk);
+ }
+ return true;
+ }
+
+ default:
+ {
+ auto last_token = get_token_string();
+ return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::cbor, "expected length specification (0x60-0x7B) or indefinite string type (0x7F); last byte: 0x" + last_token, "string"), BasicJsonType()));
+ }
+ }
+ }
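+
+    // Editorial worked example (indefinite length): the CBOR bytes
+    // 0x7F 0x61 0x61 0x61 0x62 0xFF hold two one-byte chunks "a" and "b"
+    // inside an indefinite-length string, which decode to "ab":
+    //
+    //     auto j = nlohmann::json::from_cbor(
+    //         std::vector<std::uint8_t>{0x7F, 0x61, 0x61, 0x61, 0x62, 0xFF});
+    //     // j == "ab"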
+
+ /*!
+ @brief reads a CBOR byte array
+
+ This function first reads starting bytes to determine the expected
+ byte array length and then copies this number of bytes into the byte array.
+ Additionally, CBOR's byte arrays with indefinite lengths are supported.
+
+ @param[out] result created byte array
+
+ @return whether byte array creation completed
+ */
+ bool get_cbor_binary(binary_t& result)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::cbor, "binary")))
+ {
+ return false;
+ }
+
+ switch (current)
+ {
+ // Binary data (0x00..0x17 bytes follow)
+ case 0x40:
+ case 0x41:
+ case 0x42:
+ case 0x43:
+ case 0x44:
+ case 0x45:
+ case 0x46:
+ case 0x47:
+ case 0x48:
+ case 0x49:
+ case 0x4A:
+ case 0x4B:
+ case 0x4C:
+ case 0x4D:
+ case 0x4E:
+ case 0x4F:
+ case 0x50:
+ case 0x51:
+ case 0x52:
+ case 0x53:
+ case 0x54:
+ case 0x55:
+ case 0x56:
+ case 0x57:
+ {
+ return get_binary(input_format_t::cbor, static_cast<unsigned int>(current) & 0x1Fu, result);
+ }
+
+ case 0x58: // Binary data (one-byte uint8_t for n follows)
+ {
+ std::uint8_t len{};
+ return get_number(input_format_t::cbor, len) &&
+ get_binary(input_format_t::cbor, len, result);
+ }
+
+ case 0x59: // Binary data (two-byte uint16_t for n follow)
+ {
+ std::uint16_t len{};
+ return get_number(input_format_t::cbor, len) &&
+ get_binary(input_format_t::cbor, len, result);
+ }
+
+ case 0x5A: // Binary data (four-byte uint32_t for n follow)
+ {
+ std::uint32_t len{};
+ return get_number(input_format_t::cbor, len) &&
+ get_binary(input_format_t::cbor, len, result);
+ }
+
+ case 0x5B: // Binary data (eight-byte uint64_t for n follow)
+ {
+ std::uint64_t len{};
+ return get_number(input_format_t::cbor, len) &&
+ get_binary(input_format_t::cbor, len, result);
+ }
+
+ case 0x5F: // Binary data (indefinite length)
+ {
+ while (get() != 0xFF)
+ {
+ binary_t chunk;
+ if (!get_cbor_binary(chunk))
+ {
+ return false;
+ }
+ result.insert(result.end(), chunk.begin(), chunk.end());
+ }
+ return true;
+ }
+
+ default:
+ {
+ auto last_token = get_token_string();
+ return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::cbor, "expected length specification (0x40-0x5B) or indefinite binary array type (0x5F); last byte: 0x" + last_token, "binary"), BasicJsonType()));
+ }
+ }
+ }
+
+ /*!
+ @param[in] len the length of the array or std::size_t(-1) for an
+ array of indefinite size
+ @param[in] tag_handler how CBOR tags should be treated
+ @return whether array creation completed
+ */
+ bool get_cbor_array(const std::size_t len,
+ const cbor_tag_handler_t tag_handler)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->start_array(len)))
+ {
+ return false;
+ }
+
+ if (len != std::size_t(-1))
+ {
+ for (std::size_t i = 0; i < len; ++i)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler)))
+ {
+ return false;
+ }
+ }
+ }
+ else
+ {
+ while (get() != 0xFF)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(false, tag_handler)))
+ {
+ return false;
+ }
+ }
+ }
+
+ return sax->end_array();
+ }
+
+ /*!
+ @param[in] len the length of the object or std::size_t(-1) for an
+ object of indefinite size
+ @param[in] tag_handler how CBOR tags should be treated
+ @return whether object creation completed
+ */
+ bool get_cbor_object(const std::size_t len,
+ const cbor_tag_handler_t tag_handler)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->start_object(len)))
+ {
+ return false;
+ }
+
+ if (len != 0)
+ {
+ string_t key;
+ if (len != std::size_t(-1))
+ {
+ for (std::size_t i = 0; i < len; ++i)
+ {
+ get();
+ if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key)))
+ {
+ return false;
+ }
+
+ if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler)))
+ {
+ return false;
+ }
+ key.clear();
+ }
+ }
+ else
+ {
+ while (get() != 0xFF)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!get_cbor_string(key) || !sax->key(key)))
+ {
+ return false;
+ }
+
+ if (JSON_HEDLEY_UNLIKELY(!parse_cbor_internal(true, tag_handler)))
+ {
+ return false;
+ }
+ key.clear();
+ }
+ }
+ }
+
+ return sax->end_object();
+ }
+
+ /////////////
+ // MsgPack //
+ /////////////
+
+ /*!
+ @return whether a valid MessagePack value was passed to the SAX parser
+ */
+ bool parse_msgpack_internal()
+ {
+ switch (get())
+ {
+ // EOF
+ case std::char_traits<char_type>::eof():
+ return unexpect_eof(input_format_t::msgpack, "value");
+
+ // positive fixint
+ case 0x00:
+ case 0x01:
+ case 0x02:
+ case 0x03:
+ case 0x04:
+ case 0x05:
+ case 0x06:
+ case 0x07:
+ case 0x08:
+ case 0x09:
+ case 0x0A:
+ case 0x0B:
+ case 0x0C:
+ case 0x0D:
+ case 0x0E:
+ case 0x0F:
+ case 0x10:
+ case 0x11:
+ case 0x12:
+ case 0x13:
+ case 0x14:
+ case 0x15:
+ case 0x16:
+ case 0x17:
+ case 0x18:
+ case 0x19:
+ case 0x1A:
+ case 0x1B:
+ case 0x1C:
+ case 0x1D:
+ case 0x1E:
+ case 0x1F:
+ case 0x20:
+ case 0x21:
+ case 0x22:
+ case 0x23:
+ case 0x24:
+ case 0x25:
+ case 0x26:
+ case 0x27:
+ case 0x28:
+ case 0x29:
+ case 0x2A:
+ case 0x2B:
+ case 0x2C:
+ case 0x2D:
+ case 0x2E:
+ case 0x2F:
+ case 0x30:
+ case 0x31:
+ case 0x32:
+ case 0x33:
+ case 0x34:
+ case 0x35:
+ case 0x36:
+ case 0x37:
+ case 0x38:
+ case 0x39:
+ case 0x3A:
+ case 0x3B:
+ case 0x3C:
+ case 0x3D:
+ case 0x3E:
+ case 0x3F:
+ case 0x40:
+ case 0x41:
+ case 0x42:
+ case 0x43:
+ case 0x44:
+ case 0x45:
+ case 0x46:
+ case 0x47:
+ case 0x48:
+ case 0x49:
+ case 0x4A:
+ case 0x4B:
+ case 0x4C:
+ case 0x4D:
+ case 0x4E:
+ case 0x4F:
+ case 0x50:
+ case 0x51:
+ case 0x52:
+ case 0x53:
+ case 0x54:
+ case 0x55:
+ case 0x56:
+ case 0x57:
+ case 0x58:
+ case 0x59:
+ case 0x5A:
+ case 0x5B:
+ case 0x5C:
+ case 0x5D:
+ case 0x5E:
+ case 0x5F:
+ case 0x60:
+ case 0x61:
+ case 0x62:
+ case 0x63:
+ case 0x64:
+ case 0x65:
+ case 0x66:
+ case 0x67:
+ case 0x68:
+ case 0x69:
+ case 0x6A:
+ case 0x6B:
+ case 0x6C:
+ case 0x6D:
+ case 0x6E:
+ case 0x6F:
+ case 0x70:
+ case 0x71:
+ case 0x72:
+ case 0x73:
+ case 0x74:
+ case 0x75:
+ case 0x76:
+ case 0x77:
+ case 0x78:
+ case 0x79:
+ case 0x7A:
+ case 0x7B:
+ case 0x7C:
+ case 0x7D:
+ case 0x7E:
+ case 0x7F:
+ return sax->number_unsigned(static_cast<number_unsigned_t>(current));
+
+ // fixmap
+ case 0x80:
+ case 0x81:
+ case 0x82:
+ case 0x83:
+ case 0x84:
+ case 0x85:
+ case 0x86:
+ case 0x87:
+ case 0x88:
+ case 0x89:
+ case 0x8A:
+ case 0x8B:
+ case 0x8C:
+ case 0x8D:
+ case 0x8E:
+ case 0x8F:
+ return get_msgpack_object(static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x0Fu));
+
+ // fixarray
+ case 0x90:
+ case 0x91:
+ case 0x92:
+ case 0x93:
+ case 0x94:
+ case 0x95:
+ case 0x96:
+ case 0x97:
+ case 0x98:
+ case 0x99:
+ case 0x9A:
+ case 0x9B:
+ case 0x9C:
+ case 0x9D:
+ case 0x9E:
+ case 0x9F:
+ return get_msgpack_array(static_cast<std::size_t>(static_cast<unsigned int>(current) & 0x0Fu));
+
+ // fixstr
+ case 0xA0:
+ case 0xA1:
+ case 0xA2:
+ case 0xA3:
+ case 0xA4:
+ case 0xA5:
+ case 0xA6:
+ case 0xA7:
+ case 0xA8:
+ case 0xA9:
+ case 0xAA:
+ case 0xAB:
+ case 0xAC:
+ case 0xAD:
+ case 0xAE:
+ case 0xAF:
+ case 0xB0:
+ case 0xB1:
+ case 0xB2:
+ case 0xB3:
+ case 0xB4:
+ case 0xB5:
+ case 0xB6:
+ case 0xB7:
+ case 0xB8:
+ case 0xB9:
+ case 0xBA:
+ case 0xBB:
+ case 0xBC:
+ case 0xBD:
+ case 0xBE:
+ case 0xBF:
+ case 0xD9: // str 8
+ case 0xDA: // str 16
+ case 0xDB: // str 32
+ {
+ string_t s;
+ return get_msgpack_string(s) && sax->string(s);
+ }
+
+ case 0xC0: // nil
+ return sax->null();
+
+ case 0xC2: // false
+ return sax->boolean(false);
+
+ case 0xC3: // true
+ return sax->boolean(true);
+
+ case 0xC4: // bin 8
+ case 0xC5: // bin 16
+ case 0xC6: // bin 32
+ case 0xC7: // ext 8
+ case 0xC8: // ext 16
+ case 0xC9: // ext 32
+ case 0xD4: // fixext 1
+ case 0xD5: // fixext 2
+ case 0xD6: // fixext 4
+ case 0xD7: // fixext 8
+ case 0xD8: // fixext 16
+ {
+ binary_t b;
+ return get_msgpack_binary(b) && sax->binary(b);
+ }
+
+ case 0xCA: // float 32
+ {
+ float number{};
+ return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast<number_float_t>(number), "");
+ }
+
+ case 0xCB: // float 64
+ {
+ double number{};
+ return get_number(input_format_t::msgpack, number) && sax->number_float(static_cast<number_float_t>(number), "");
+ }
+
+ case 0xCC: // uint 8
+ {
+ std::uint8_t number{};
+ return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
+ }
+
+ case 0xCD: // uint 16
+ {
+ std::uint16_t number{};
+ return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
+ }
+
+ case 0xCE: // uint 32
+ {
+ std::uint32_t number{};
+ return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
+ }
+
+ case 0xCF: // uint 64
+ {
+ std::uint64_t number{};
+ return get_number(input_format_t::msgpack, number) && sax->number_unsigned(number);
+ }
+
+ case 0xD0: // int 8
+ {
+ std::int8_t number{};
+ return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
+ }
+
+ case 0xD1: // int 16
+ {
+ std::int16_t number{};
+ return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
+ }
+
+ case 0xD2: // int 32
+ {
+ std::int32_t number{};
+ return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
+ }
+
+ case 0xD3: // int 64
+ {
+ std::int64_t number{};
+ return get_number(input_format_t::msgpack, number) && sax->number_integer(number);
+ }
+
+ case 0xDC: // array 16
+ {
+ std::uint16_t len{};
+ return get_number(input_format_t::msgpack, len) && get_msgpack_array(static_cast<std::size_t>(len));
+ }
+
+ case 0xDD: // array 32
+ {
+ std::uint32_t len{};
+ return get_number(input_format_t::msgpack, len) && get_msgpack_array(static_cast<std::size_t>(len));
+ }
+
+ case 0xDE: // map 16
+ {
+ std::uint16_t len{};
+ return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast<std::size_t>(len));
+ }
+
+ case 0xDF: // map 32
+ {
+ std::uint32_t len{};
+ return get_number(input_format_t::msgpack, len) && get_msgpack_object(static_cast<std::size_t>(len));
+ }
+
+ // negative fixint
+ case 0xE0:
+ case 0xE1:
+ case 0xE2:
+ case 0xE3:
+ case 0xE4:
+ case 0xE5:
+ case 0xE6:
+ case 0xE7:
+ case 0xE8:
+ case 0xE9:
+ case 0xEA:
+ case 0xEB:
+ case 0xEC:
+ case 0xED:
+ case 0xEE:
+ case 0xEF:
+ case 0xF0:
+ case 0xF1:
+ case 0xF2:
+ case 0xF3:
+ case 0xF4:
+ case 0xF5:
+ case 0xF6:
+ case 0xF7:
+ case 0xF8:
+ case 0xF9:
+ case 0xFA:
+ case 0xFB:
+ case 0xFC:
+ case 0xFD:
+ case 0xFE:
+ case 0xFF:
+ return sax->number_integer(static_cast<std::int8_t>(current));
+
+ default: // anything else
+ {
+ auto last_token = get_token_string();
+ return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::msgpack, "invalid byte: 0x" + last_token, "value"), BasicJsonType()));
+ }
+ }
+ }
+
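+    // Illustrative behavior of the single-byte dispatch above, via the
+    // public json::from_msgpack API declared later in this header (a
+    // sketch, not normative):
+    //
+    //   std::vector<std::uint8_t> v = {0x2A};       // positive fixint
+    //   assert(nlohmann::json::from_msgpack(v) == 42);
+    //   v = {0xFF};                                 // negative fixint
+    //   assert(nlohmann::json::from_msgpack(v) == -1);
+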
+ /*!
+ @brief reads a MessagePack string
+
+ This function first reads starting bytes to determine the expected
+ string length and then copies this number of bytes into a string.
+
+ @param[out] result created string
+
+ @return whether string creation completed
+ */
+ bool get_msgpack_string(string_t& result)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::msgpack, "string")))
+ {
+ return false;
+ }
+
+ switch (current)
+ {
+ // fixstr
+ case 0xA0:
+ case 0xA1:
+ case 0xA2:
+ case 0xA3:
+ case 0xA4:
+ case 0xA5:
+ case 0xA6:
+ case 0xA7:
+ case 0xA8:
+ case 0xA9:
+ case 0xAA:
+ case 0xAB:
+ case 0xAC:
+ case 0xAD:
+ case 0xAE:
+ case 0xAF:
+ case 0xB0:
+ case 0xB1:
+ case 0xB2:
+ case 0xB3:
+ case 0xB4:
+ case 0xB5:
+ case 0xB6:
+ case 0xB7:
+ case 0xB8:
+ case 0xB9:
+ case 0xBA:
+ case 0xBB:
+ case 0xBC:
+ case 0xBD:
+ case 0xBE:
+ case 0xBF:
+ {
+ return get_string(input_format_t::msgpack, static_cast<unsigned int>(current) & 0x1Fu, result);
+ }
+
+ case 0xD9: // str 8
+ {
+ std::uint8_t len{};
+ return get_number(input_format_t::msgpack, len) && get_string(input_format_t::msgpack, len, result);
+ }
+
+ case 0xDA: // str 16
+ {
+ std::uint16_t len{};
+ return get_number(input_format_t::msgpack, len) && get_string(input_format_t::msgpack, len, result);
+ }
+
+ case 0xDB: // str 32
+ {
+ std::uint32_t len{};
+ return get_number(input_format_t::msgpack, len) && get_string(input_format_t::msgpack, len, result);
+ }
+
+ default:
+ {
+ auto last_token = get_token_string();
+ return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::msgpack, "expected length specification (0xA0-0xBF, 0xD9-0xDB); last byte: 0x" + last_token, "string"), BasicJsonType()));
+ }
+ }
+ }
+
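+    // For example, a fixstr stores its length in the low five bits of the
+    // type byte, so {0xA3, 'f', 'o', 'o'} decodes to "foo" (a sketch via
+    // the public from_msgpack API declared later in this header):
+    //
+    //   std::vector<std::uint8_t> v = {0xA3, 'f', 'o', 'o'};
+    //   assert(nlohmann::json::from_msgpack(v) == "foo");
+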
+ /*!
+ @brief reads a MessagePack byte array
+
+ This function first reads starting bytes to determine the expected
+ byte array length and then copies this number of bytes into a byte array.
+
+ @param[out] result created byte array
+
+ @return whether byte array creation completed
+ */
+ bool get_msgpack_binary(binary_t& result)
+ {
+ // helper function to set the subtype
+ auto assign_and_return_true = [&result](std::int8_t subtype)
+ {
+ result.set_subtype(static_cast<std::uint8_t>(subtype));
+ return true;
+ };
+
+ switch (current)
+ {
+ case 0xC4: // bin 8
+ {
+ std::uint8_t len{};
+ return get_number(input_format_t::msgpack, len) &&
+ get_binary(input_format_t::msgpack, len, result);
+ }
+
+ case 0xC5: // bin 16
+ {
+ std::uint16_t len{};
+ return get_number(input_format_t::msgpack, len) &&
+ get_binary(input_format_t::msgpack, len, result);
+ }
+
+ case 0xC6: // bin 32
+ {
+ std::uint32_t len{};
+ return get_number(input_format_t::msgpack, len) &&
+ get_binary(input_format_t::msgpack, len, result);
+ }
+
+ case 0xC7: // ext 8
+ {
+ std::uint8_t len{};
+ std::int8_t subtype{};
+ return get_number(input_format_t::msgpack, len) &&
+ get_number(input_format_t::msgpack, subtype) &&
+ get_binary(input_format_t::msgpack, len, result) &&
+ assign_and_return_true(subtype);
+ }
+
+ case 0xC8: // ext 16
+ {
+ std::uint16_t len{};
+ std::int8_t subtype{};
+ return get_number(input_format_t::msgpack, len) &&
+ get_number(input_format_t::msgpack, subtype) &&
+ get_binary(input_format_t::msgpack, len, result) &&
+ assign_and_return_true(subtype);
+ }
+
+ case 0xC9: // ext 32
+ {
+ std::uint32_t len{};
+ std::int8_t subtype{};
+ return get_number(input_format_t::msgpack, len) &&
+ get_number(input_format_t::msgpack, subtype) &&
+ get_binary(input_format_t::msgpack, len, result) &&
+ assign_and_return_true(subtype);
+ }
+
+ case 0xD4: // fixext 1
+ {
+ std::int8_t subtype{};
+ return get_number(input_format_t::msgpack, subtype) &&
+ get_binary(input_format_t::msgpack, 1, result) &&
+ assign_and_return_true(subtype);
+ }
+
+ case 0xD5: // fixext 2
+ {
+ std::int8_t subtype{};
+ return get_number(input_format_t::msgpack, subtype) &&
+ get_binary(input_format_t::msgpack, 2, result) &&
+ assign_and_return_true(subtype);
+ }
+
+ case 0xD6: // fixext 4
+ {
+ std::int8_t subtype{};
+ return get_number(input_format_t::msgpack, subtype) &&
+ get_binary(input_format_t::msgpack, 4, result) &&
+ assign_and_return_true(subtype);
+ }
+
+ case 0xD7: // fixext 8
+ {
+ std::int8_t subtype{};
+ return get_number(input_format_t::msgpack, subtype) &&
+ get_binary(input_format_t::msgpack, 8, result) &&
+ assign_and_return_true(subtype);
+ }
+
+ case 0xD8: // fixext 16
+ {
+ std::int8_t subtype{};
+ return get_number(input_format_t::msgpack, subtype) &&
+ get_binary(input_format_t::msgpack, 16, result) &&
+ assign_and_return_true(subtype);
+ }
+
+ default: // LCOV_EXCL_LINE
+ return false; // LCOV_EXCL_LINE
+ }
+ }
+
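+    // For example, a fixext 1 value is the type byte 0xD4, one subtype
+    // byte, and one payload byte; it decodes to a binary value carrying
+    // that subtype (a sketch via the public from_msgpack API):
+    //
+    //   std::vector<std::uint8_t> v = {0xD4, 0x01, 0xCA};
+    //   auto j = nlohmann::json::from_msgpack(v);
+    //   assert(j.is_binary() && j.get_binary().subtype() == 1);
+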
+ /*!
+ @param[in] len the length of the array
+ @return whether array creation completed
+ */
+ bool get_msgpack_array(const std::size_t len)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->start_array(len)))
+ {
+ return false;
+ }
+
+ for (std::size_t i = 0; i < len; ++i)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!parse_msgpack_internal()))
+ {
+ return false;
+ }
+ }
+
+ return sax->end_array();
+ }
+
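+    // For example, a fixarray embeds its length in the low four bits of
+    // the type byte, so {0x92, 0x01, 0x02} decodes to [1, 2] (a sketch
+    // via the public from_msgpack API):
+    //
+    //   std::vector<std::uint8_t> v = {0x92, 0x01, 0x02};
+    //   assert(nlohmann::json::from_msgpack(v) == nlohmann::json::array({1, 2}));
+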
+ /*!
+ @param[in] len the length of the object
+ @return whether object creation completed
+ */
+ bool get_msgpack_object(const std::size_t len)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->start_object(len)))
+ {
+ return false;
+ }
+
+ string_t key;
+ for (std::size_t i = 0; i < len; ++i)
+ {
+ get();
+ if (JSON_HEDLEY_UNLIKELY(!get_msgpack_string(key) || !sax->key(key)))
+ {
+ return false;
+ }
+
+ if (JSON_HEDLEY_UNLIKELY(!parse_msgpack_internal()))
+ {
+ return false;
+ }
+ key.clear();
+ }
+
+ return sax->end_object();
+ }
+
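+    // For example, a one-entry fixmap {0x81, 0xA1, 'a', 0x01} decodes to
+    // the object {"a": 1} (a sketch via the public from_msgpack API):
+    //
+    //   std::vector<std::uint8_t> v = {0x81, 0xA1, 'a', 0x01};
+    //   assert(nlohmann::json::from_msgpack(v)["a"] == 1);
+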
+ ////////////
+ // UBJSON //
+ ////////////
+
+ /*!
+ @param[in] get_char whether a new character should be retrieved from the
+ input (true, default) or whether the last read
+ character should be considered instead
+
+ @return whether a valid UBJSON value was passed to the SAX parser
+ */
+ bool parse_ubjson_internal(const bool get_char = true)
+ {
+ return get_ubjson_value(get_char ? get_ignore_noop() : current);
+ }
+
+ /*!
+ @brief reads a UBJSON string
+
+ This function is either called after reading the 'S' byte explicitly
+ indicating a string, or in case of an object key where the 'S' byte can be
+ left out.
+
+ @param[out] result created string
+ @param[in] get_char whether a new character should be retrieved from the
+ input (true, default) or whether the last read
+ character should be considered instead
+
+ @return whether string creation completed
+ */
+ bool get_ubjson_string(string_t& result, const bool get_char = true)
+ {
+ if (get_char)
+ {
+ get(); // TODO(niels): may we ignore N here?
+ }
+
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "value")))
+ {
+ return false;
+ }
+
+ switch (current)
+ {
+ case 'U':
+ {
+ std::uint8_t len{};
+ return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result);
+ }
+
+ case 'i':
+ {
+ std::int8_t len{};
+ return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result);
+ }
+
+ case 'I':
+ {
+ std::int16_t len{};
+ return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result);
+ }
+
+ case 'l':
+ {
+ std::int32_t len{};
+ return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result);
+ }
+
+ case 'L':
+ {
+ std::int64_t len{};
+ return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result);
+ }
+
+ default:
+ auto last_token = get_token_string();
+ return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "expected length type specification (U, i, I, l, L); last byte: 0x" + last_token, "string"), BasicJsonType()));
+ }
+ }
+
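+    // For example, a UBJSON string is the marker 'S', a length type and
+    // value, then the payload, so {'S', 'i', 3, 'f', 'o', 'o'} decodes to
+    // "foo" (a sketch via the public from_ubjson API declared later):
+    //
+    //   std::vector<std::uint8_t> v = {'S', 'i', 3, 'f', 'o', 'o'};
+    //   assert(nlohmann::json::from_ubjson(v) == "foo");
+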
+ /*!
+ @param[out] result determined size
+ @return whether size determination completed
+ */
+ bool get_ubjson_size_value(std::size_t& result)
+ {
+ switch (get_ignore_noop())
+ {
+ case 'U':
+ {
+ std::uint8_t number{};
+ if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
+ {
+ return false;
+ }
+ result = static_cast<std::size_t>(number);
+ return true;
+ }
+
+ case 'i':
+ {
+ std::int8_t number{};
+ if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
+ {
+ return false;
+ }
+ result = static_cast<std::size_t>(number); // NOLINT(bugprone-signed-char-misuse,cert-str34-c): number is not a char
+ return true;
+ }
+
+ case 'I':
+ {
+ std::int16_t number{};
+ if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
+ {
+ return false;
+ }
+ result = static_cast<std::size_t>(number);
+ return true;
+ }
+
+ case 'l':
+ {
+ std::int32_t number{};
+ if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
+ {
+ return false;
+ }
+ result = static_cast<std::size_t>(number);
+ return true;
+ }
+
+ case 'L':
+ {
+ std::int64_t number{};
+ if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
+ {
+ return false;
+ }
+ result = static_cast<std::size_t>(number);
+ return true;
+ }
+
+ default:
+ {
+ auto last_token = get_token_string();
+ return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "expected length type specification (U, i, I, l, L) after '#'; last byte: 0x" + last_token, "size"), BasicJsonType()));
+ }
+ }
+ }
+
+ /*!
+ @brief determine the type and size for a container
+
+ In the optimized UBJSON format, a type and a size can be provided to allow
+ for a more compact representation.
+
+ @param[out] result pair of the size and the type
+
+ @return whether pair creation completed
+ */
+ bool get_ubjson_size_type(std::pair<std::size_t, char_int_type>& result)
+ {
+ result.first = string_t::npos; // size
+ result.second = 0; // type
+
+ get_ignore_noop();
+
+ if (current == '$')
+ {
+            result.second = get();  // must not ignore 'N', because 'N' may be the type
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "type")))
+ {
+ return false;
+ }
+
+ get_ignore_noop();
+ if (JSON_HEDLEY_UNLIKELY(current != '#'))
+ {
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "value")))
+ {
+ return false;
+ }
+ auto last_token = get_token_string();
+ return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::ubjson, "expected '#' after type information; last byte: 0x" + last_token, "size"), BasicJsonType()));
+ }
+
+ return get_ubjson_size_value(result.first);
+ }
+
+ if (current == '#')
+ {
+ return get_ubjson_size_value(result.first);
+ }
+
+ return true;
+ }
+
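+    // For example, the optimized array "[$i#i" followed by 0x02 0x01 0x02
+    // announces the element type ('$' 'i') and the count ('#' 'i' 2), so
+    // the two int8 payload bytes follow without per-element markers
+    // (a sketch via the public from_ubjson API):
+    //
+    //   std::vector<std::uint8_t> v = {'[', '$', 'i', '#', 'i', 2, 1, 2};
+    //   assert(nlohmann::json::from_ubjson(v) == nlohmann::json::array({1, 2}));
+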
+ /*!
+ @param prefix the previously read or set type prefix
+ @return whether value creation completed
+ */
+ bool get_ubjson_value(const char_int_type prefix)
+ {
+ switch (prefix)
+ {
+ case std::char_traits<char_type>::eof(): // EOF
+ return unexpect_eof(input_format_t::ubjson, "value");
+
+ case 'T': // true
+ return sax->boolean(true);
+ case 'F': // false
+ return sax->boolean(false);
+
+ case 'Z': // null
+ return sax->null();
+
+ case 'U':
+ {
+ std::uint8_t number{};
+ return get_number(input_format_t::ubjson, number) && sax->number_unsigned(number);
+ }
+
+ case 'i':
+ {
+ std::int8_t number{};
+ return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
+ }
+
+ case 'I':
+ {
+ std::int16_t number{};
+ return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
+ }
+
+ case 'l':
+ {
+ std::int32_t number{};
+ return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
+ }
+
+ case 'L':
+ {
+ std::int64_t number{};
+ return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
+ }
+
+ case 'd':
+ {
+ float number{};
+ return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast<number_float_t>(number), "");
+ }
+
+ case 'D':
+ {
+ double number{};
+ return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast<number_float_t>(number), "");
+ }
+
+ case 'H':
+ {
+ return get_ubjson_high_precision_number();
+ }
+
+ case 'C': // char
+ {
+ get();
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "char")))
+ {
+ return false;
+ }
+ if (JSON_HEDLEY_UNLIKELY(current > 127))
+ {
+ auto last_token = get_token_string();
+ return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format_t::ubjson, "byte after 'C' must be in range 0x00..0x7F; last byte: 0x" + last_token, "char"), BasicJsonType()));
+ }
+ string_t s(1, static_cast<typename string_t::value_type>(current));
+ return sax->string(s);
+ }
+
+ case 'S': // string
+ {
+ string_t s;
+ return get_ubjson_string(s) && sax->string(s);
+ }
+
+ case '[': // array
+ return get_ubjson_array();
+
+ case '{': // object
+ return get_ubjson_object();
+
+ default: // anything else
+ {
+ auto last_token = get_token_string();
+ return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format_t::ubjson, "invalid byte: 0x" + last_token, "value"), BasicJsonType()));
+ }
+ }
+ }
+
+ /*!
+ @return whether array creation completed
+ */
+ bool get_ubjson_array()
+ {
+ std::pair<std::size_t, char_int_type> size_and_type;
+ if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type)))
+ {
+ return false;
+ }
+
+ if (size_and_type.first != string_t::npos)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->start_array(size_and_type.first)))
+ {
+ return false;
+ }
+
+ if (size_and_type.second != 0)
+ {
+ if (size_and_type.second != 'N')
+ {
+ for (std::size_t i = 0; i < size_and_type.first; ++i)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second)))
+ {
+ return false;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (std::size_t i = 0; i < size_and_type.first; ++i)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal()))
+ {
+ return false;
+ }
+ }
+ }
+ }
+ else
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1))))
+ {
+ return false;
+ }
+
+ while (current != ']')
+ {
+ if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal(false)))
+ {
+ return false;
+ }
+ get_ignore_noop();
+ }
+ }
+
+ return sax->end_array();
+ }
+
+ /*!
+ @return whether object creation completed
+ */
+ bool get_ubjson_object()
+ {
+ std::pair<std::size_t, char_int_type> size_and_type;
+ if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type)))
+ {
+ return false;
+ }
+
+ string_t key;
+ if (size_and_type.first != string_t::npos)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->start_object(size_and_type.first)))
+ {
+ return false;
+ }
+
+ if (size_and_type.second != 0)
+ {
+ for (std::size_t i = 0; i < size_and_type.first; ++i)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key)))
+ {
+ return false;
+ }
+ if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second)))
+ {
+ return false;
+ }
+ key.clear();
+ }
+ }
+ else
+ {
+ for (std::size_t i = 0; i < size_and_type.first; ++i)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key) || !sax->key(key)))
+ {
+ return false;
+ }
+ if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal()))
+ {
+ return false;
+ }
+ key.clear();
+ }
+ }
+ }
+ else
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1))))
+ {
+ return false;
+ }
+
+ while (current != '}')
+ {
+ if (JSON_HEDLEY_UNLIKELY(!get_ubjson_string(key, false) || !sax->key(key)))
+ {
+ return false;
+ }
+ if (JSON_HEDLEY_UNLIKELY(!parse_ubjson_internal()))
+ {
+ return false;
+ }
+ get_ignore_noop();
+ key.clear();
+ }
+ }
+
+ return sax->end_object();
+ }
+
+ // Note, no reader for UBJSON binary types is implemented because they do
+ // not exist
+
+ bool get_ubjson_high_precision_number()
+ {
+ // get size of following number string
+ std::size_t size{};
+ auto res = get_ubjson_size_value(size);
+ if (JSON_HEDLEY_UNLIKELY(!res))
+ {
+ return res;
+ }
+
+ // get number string
+ std::vector<char> number_vector;
+ for (std::size_t i = 0; i < size; ++i)
+ {
+ get();
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number")))
+ {
+ return false;
+ }
+ number_vector.push_back(static_cast<char>(current));
+ }
+
+ // parse number string
+ using ia_type = decltype(detail::input_adapter(number_vector));
+ auto number_lexer = detail::lexer<BasicJsonType, ia_type>(detail::input_adapter(number_vector), false);
+ const auto result_number = number_lexer.scan();
+ const auto number_string = number_lexer.get_token_string();
+ const auto result_remainder = number_lexer.scan();
+
+ using token_type = typename detail::lexer_base<BasicJsonType>::token_type;
+
+ if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input))
+ {
+ return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"), BasicJsonType()));
+ }
+
+ switch (result_number)
+ {
+ case token_type::value_integer:
+ return sax->number_integer(number_lexer.get_number_integer());
+ case token_type::value_unsigned:
+ return sax->number_unsigned(number_lexer.get_number_unsigned());
+ case token_type::value_float:
+ return sax->number_float(number_lexer.get_number_float(), std::move(number_string));
+ case token_type::uninitialized:
+ case token_type::literal_true:
+ case token_type::literal_false:
+ case token_type::literal_null:
+ case token_type::value_string:
+ case token_type::begin_array:
+ case token_type::begin_object:
+ case token_type::end_array:
+ case token_type::end_object:
+ case token_type::name_separator:
+ case token_type::value_separator:
+ case token_type::parse_error:
+ case token_type::end_of_input:
+ case token_type::literal_or_value:
+ default:
+ return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read, exception_message(input_format_t::ubjson, "invalid number text: " + number_lexer.get_token_string(), "high-precision number"), BasicJsonType()));
+ }
+ }
+
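+    // For example, a high-precision number is the marker 'H', a length,
+    // and the number written as text, which is re-parsed by the JSON
+    // lexer, so {'H', 'i', 4, '3', '.', '1', '4'} decodes to 3.14
+    // (a sketch via the public from_ubjson API):
+    //
+    //   std::vector<std::uint8_t> v = {'H', 'i', 4, '3', '.', '1', '4'};
+    //   assert(nlohmann::json::from_ubjson(v) == 3.14);
+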
+ ///////////////////////
+ // Utility functions //
+ ///////////////////////
+
+ /*!
+ @brief get next character from the input
+
+    This function provides the interface to the used input adapter. It does
+    not throw if the input reaches EOF, but returns a negative value,
+    `std::char_traits<char_type>::eof()`, in that case.
+
+ @return character read from the input
+ */
+ char_int_type get()
+ {
+ ++chars_read;
+ return current = ia.get_character();
+ }
+
+ /*!
+ @return character read from the input after ignoring all 'N' entries
+ */
+ char_int_type get_ignore_noop()
+ {
+ do
+ {
+ get();
+ }
+ while (current == 'N');
+
+ return current;
+ }
+
+ /*
+ @brief read a number from the input
+
+ @tparam NumberType the type of the number
+ @param[in] format the current format (for diagnostics)
+ @param[out] result number of type @a NumberType
+
+ @return whether conversion completed
+
+    @note This function needs to respect the system's endianness, because
+ bytes in CBOR, MessagePack, and UBJSON are stored in network order
+ (big endian) and therefore need reordering on little endian systems.
+ */
+ template<typename NumberType, bool InputIsLittleEndian = false>
+ bool get_number(const input_format_t format, NumberType& result)
+ {
+ // step 1: read input into array with system's byte order
+ std::array<std::uint8_t, sizeof(NumberType)> vec{};
+ for (std::size_t i = 0; i < sizeof(NumberType); ++i)
+ {
+ get();
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "number")))
+ {
+ return false;
+ }
+
+ // reverse byte order prior to conversion if necessary
+ if (is_little_endian != InputIsLittleEndian)
+ {
+ vec[sizeof(NumberType) - i - 1] = static_cast<std::uint8_t>(current);
+ }
+ else
+ {
+ vec[i] = static_cast<std::uint8_t>(current); // LCOV_EXCL_LINE
+ }
+ }
+
+ // step 2: convert array into number of type T and return
+ std::memcpy(&result, vec.data(), sizeof(NumberType));
+ return true;
+ }
+
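+    // A minimal standalone sketch of the same big-endian idea (a
+    // hypothetical helper, not part of this header): the byte sequence
+    // {0x12, 0x34} denotes the uint16 0x1234 on any host endianness.
+    //
+    //   std::uint16_t read_be16(const std::uint8_t* p)
+    //   {
+    //       return static_cast<std::uint16_t>((p[0] << 8) | p[1]);
+    //   }
+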
+ /*!
+ @brief create a string by reading characters from the input
+
+    @tparam NumberType the type of the length @a len
+ @param[in] format the current format (for diagnostics)
+ @param[in] len number of characters to read
+ @param[out] result string created by reading @a len bytes
+
+ @return whether string creation completed
+
+    @note We cannot reserve @a len bytes for the result, because @a len
+ may be too large. Usually, @ref unexpect_eof() detects the end of
+ the input before we run out of string memory.
+ */
+ template<typename NumberType>
+ bool get_string(const input_format_t format,
+ const NumberType len,
+ string_t& result)
+ {
+ bool success = true;
+ for (NumberType i = 0; i < len; i++)
+ {
+ get();
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "string")))
+ {
+ success = false;
+ break;
+ }
+ result.push_back(static_cast<typename string_t::value_type>(current));
+ }
+ return success;
+ }
+
+ /*!
+ @brief create a byte array by reading bytes from the input
+
+    @tparam NumberType the type of the length @a len
+ @param[in] format the current format (for diagnostics)
+ @param[in] len number of bytes to read
+ @param[out] result byte array created by reading @a len bytes
+
+ @return whether byte array creation completed
+
+    @note We cannot reserve @a len bytes for the result, because @a len
+ may be too large. Usually, @ref unexpect_eof() detects the end of
+ the input before we run out of memory.
+ */
+ template<typename NumberType>
+ bool get_binary(const input_format_t format,
+ const NumberType len,
+ binary_t& result)
+ {
+ bool success = true;
+ for (NumberType i = 0; i < len; i++)
+ {
+ get();
+ if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(format, "binary")))
+ {
+ success = false;
+ break;
+ }
+ result.push_back(static_cast<std::uint8_t>(current));
+ }
+ return success;
+ }
+
+ /*!
+ @param[in] format the current format (for diagnostics)
+ @param[in] context further context information (for diagnostics)
+ @return whether the last read character is not EOF
+ */
+ JSON_HEDLEY_NON_NULL(3)
+ bool unexpect_eof(const input_format_t format, const char* context) const
+ {
+ if (JSON_HEDLEY_UNLIKELY(current == std::char_traits<char_type>::eof()))
+ {
+ return sax->parse_error(chars_read, "<end of file>",
+ parse_error::create(110, chars_read, exception_message(format, "unexpected end of input", context), BasicJsonType()));
+ }
+ return true;
+ }
+
+ /*!
+ @return a string representation of the last read byte
+ */
+ std::string get_token_string() const
+ {
+ std::array<char, 3> cr{{}};
+ (std::snprintf)(cr.data(), cr.size(), "%.2hhX", static_cast<unsigned char>(current)); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
+ return std::string{cr.data()};
+ }
+
+ /*!
+ @param[in] format the current format
+ @param[in] detail a detailed error message
+ @param[in] context further context information
+ @return a message string to use in the parse_error exceptions
+ */
+ std::string exception_message(const input_format_t format,
+ const std::string& detail,
+ const std::string& context) const
+ {
+ std::string error_msg = "syntax error while parsing ";
+
+ switch (format)
+ {
+ case input_format_t::cbor:
+ error_msg += "CBOR";
+ break;
+
+ case input_format_t::msgpack:
+ error_msg += "MessagePack";
+ break;
+
+ case input_format_t::ubjson:
+ error_msg += "UBJSON";
+ break;
+
+ case input_format_t::bson:
+ error_msg += "BSON";
+ break;
+
+ case input_format_t::json: // LCOV_EXCL_LINE
+ default: // LCOV_EXCL_LINE
+ JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
+ }
+
+ return error_msg + " " + context + ": " + detail;
+ }
+
+ private:
+ /// input adapter
+ InputAdapterType ia;
+
+ /// the current character
+ char_int_type current = std::char_traits<char_type>::eof();
+
+ /// the number of characters read
+ std::size_t chars_read = 0;
+
+    /// whether we can assume little endianness
+ const bool is_little_endian = little_endianess();
+
+ /// the SAX parser
+ json_sax_t* sax = nullptr;
+};
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/input/input_adapters.hpp>
+
+// #include <nlohmann/detail/input/lexer.hpp>
+
+// #include <nlohmann/detail/input/parser.hpp>
+
+
+#include <cmath> // isfinite
+#include <cstdint> // uint8_t
+#include <functional> // function
+#include <string> // string
+#include <utility> // move
+#include <vector> // vector
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/input/input_adapters.hpp>
+
+// #include <nlohmann/detail/input/json_sax.hpp>
+
+// #include <nlohmann/detail/input/lexer.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/is_sax.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+////////////
+// parser //
+////////////
+
+enum class parse_event_t : std::uint8_t
+{
+ /// the parser read `{` and started to process a JSON object
+ object_start,
+ /// the parser read `}` and finished processing a JSON object
+ object_end,
+ /// the parser read `[` and started to process a JSON array
+ array_start,
+ /// the parser read `]` and finished processing a JSON array
+ array_end,
+ /// the parser read a key of a value in an object
+ key,
+ /// the parser finished reading a JSON value
+ value
+};
+
+template<typename BasicJsonType>
+using parser_callback_t =
+ std::function<bool(int /*depth*/, parse_event_t /*event*/, BasicJsonType& /*parsed*/)>;
+
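+// Illustrative use of the callback via the public json::parse overload
+// declared later in this header: returning false for a key event
+// discards that key/value pair (a sketch, not normative):
+//
+//   nlohmann::json::parser_callback_t cb =
+//       [](int /*depth*/, nlohmann::json::parse_event_t event,
+//          nlohmann::json& parsed)
+//   {
+//       return !(event == nlohmann::json::parse_event_t::key
+//                && parsed == "password");
+//   };
+//   auto j = nlohmann::json::parse(R"({"user":"x","password":"y"})", cb);
+//   // j == {"user":"x"}
+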
+/*!
+@brief syntax analysis
+
+This class implements a recursive descent parser.
+*/
+template<typename BasicJsonType, typename InputAdapterType>
+class parser
+{
+ using number_integer_t = typename BasicJsonType::number_integer_t;
+ using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+ using number_float_t = typename BasicJsonType::number_float_t;
+ using string_t = typename BasicJsonType::string_t;
+ using lexer_t = lexer<BasicJsonType, InputAdapterType>;
+ using token_type = typename lexer_t::token_type;
+
+ public:
+ /// a parser reading from an input adapter
+ explicit parser(InputAdapterType&& adapter,
+ const parser_callback_t<BasicJsonType> cb = nullptr,
+ const bool allow_exceptions_ = true,
+ const bool skip_comments = false)
+ : callback(cb)
+ , m_lexer(std::move(adapter), skip_comments)
+ , allow_exceptions(allow_exceptions_)
+ {
+ // read first token
+ get_token();
+ }
+
+ /*!
+ @brief public parser interface
+
+ @param[in] strict whether to expect the last token to be EOF
+ @param[in,out] result parsed JSON value
+
+ @throw parse_error.101 in case of an unexpected token
+    @throw parse_error.102 if to_unicode fails or a surrogate error occurs
+    @throw parse_error.103 if to_unicode fails
+ */
+ void parse(const bool strict, BasicJsonType& result)
+ {
+ if (callback)
+ {
+ json_sax_dom_callback_parser<BasicJsonType> sdp(result, callback, allow_exceptions);
+ sax_parse_internal(&sdp);
+
+ // in strict mode, input must be completely read
+ if (strict && (get_token() != token_type::end_of_input))
+ {
+ sdp.parse_error(m_lexer.get_position(),
+ m_lexer.get_token_string(),
+ parse_error::create(101, m_lexer.get_position(),
+ exception_message(token_type::end_of_input, "value"), BasicJsonType()));
+ }
+
+ // in case of an error, return discarded value
+ if (sdp.is_errored())
+ {
+ result = value_t::discarded;
+ return;
+ }
+
+ // set top-level value to null if it was discarded by the callback
+ // function
+ if (result.is_discarded())
+ {
+ result = nullptr;
+ }
+ }
+ else
+ {
+ json_sax_dom_parser<BasicJsonType> sdp(result, allow_exceptions);
+ sax_parse_internal(&sdp);
+
+ // in strict mode, input must be completely read
+ if (strict && (get_token() != token_type::end_of_input))
+ {
+ sdp.parse_error(m_lexer.get_position(),
+ m_lexer.get_token_string(),
+ parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input, "value"), BasicJsonType()));
+ }
+
+ // in case of an error, return discarded value
+ if (sdp.is_errored())
+ {
+ result = value_t::discarded;
+ return;
+ }
+ }
+
+ result.assert_invariant();
+ }
+
+ /*!
+ @brief public accept interface
+
+ @param[in] strict whether to expect the last token to be EOF
+ @return whether the input is a proper JSON text
+ */
+ bool accept(const bool strict = true)
+ {
+ json_sax_acceptor<BasicJsonType> sax_acceptor;
+ return sax_parse(&sax_acceptor, strict);
+ }
+
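+    // Illustrative behavior via the public static json::accept wrapper
+    // declared later in this header (a sketch, not normative):
+    //
+    //   assert(nlohmann::json::accept("[1,2,3]") == true);
+    //   assert(nlohmann::json::accept("[1,2,3") == false);  // missing ']'
+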
+ template<typename SAX>
+ JSON_HEDLEY_NON_NULL(2)
+ bool sax_parse(SAX* sax, const bool strict = true)
+ {
+ (void)detail::is_sax_static_asserts<SAX, BasicJsonType> {};
+ const bool result = sax_parse_internal(sax);
+
+ // strict mode: next byte must be EOF
+ if (result && strict && (get_token() != token_type::end_of_input))
+ {
+ return sax->parse_error(m_lexer.get_position(),
+ m_lexer.get_token_string(),
+ parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_of_input, "value"), BasicJsonType()));
+ }
+
+ return result;
+ }
+
+ private:
+ template<typename SAX>
+ JSON_HEDLEY_NON_NULL(2)
+ bool sax_parse_internal(SAX* sax)
+ {
+ // stack to remember the hierarchy of structured values we are parsing
+ // true = array; false = object
+ std::vector<bool> states;
+ // value to avoid a goto (see comment where set to true)
+ bool skip_to_state_evaluation = false;
+
+ while (true)
+ {
+ if (!skip_to_state_evaluation)
+ {
+ // invariant: get_token() was called before each iteration
+ switch (last_token)
+ {
+ case token_type::begin_object:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->start_object(std::size_t(-1))))
+ {
+ return false;
+ }
+
+ // closing } -> we are done
+ if (get_token() == token_type::end_object)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->end_object()))
+ {
+ return false;
+ }
+ break;
+ }
+
+ // parse key
+ if (JSON_HEDLEY_UNLIKELY(last_token != token_type::value_string))
+ {
+ return sax->parse_error(m_lexer.get_position(),
+ m_lexer.get_token_string(),
+ parse_error::create(101, m_lexer.get_position(), exception_message(token_type::value_string, "object key"), BasicJsonType()));
+ }
+ if (JSON_HEDLEY_UNLIKELY(!sax->key(m_lexer.get_string())))
+ {
+ return false;
+ }
+
+ // parse separator (:)
+ if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator))
+ {
+ return sax->parse_error(m_lexer.get_position(),
+ m_lexer.get_token_string(),
+ parse_error::create(101, m_lexer.get_position(), exception_message(token_type::name_separator, "object separator"), BasicJsonType()));
+ }
+
+ // remember we are now inside an object
+ states.push_back(false);
+
+ // parse values
+ get_token();
+ continue;
+ }
+
+ case token_type::begin_array:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->start_array(std::size_t(-1))))
+ {
+ return false;
+ }
+
+ // closing ] -> we are done
+ if (get_token() == token_type::end_array)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->end_array()))
+ {
+ return false;
+ }
+ break;
+ }
+
+ // remember we are now inside an array
+ states.push_back(true);
+
+ // parse values (no need to call get_token)
+ continue;
+ }
+
+ case token_type::value_float:
+ {
+ const auto res = m_lexer.get_number_float();
+
+ if (JSON_HEDLEY_UNLIKELY(!std::isfinite(res)))
+ {
+ return sax->parse_error(m_lexer.get_position(),
+ m_lexer.get_token_string(),
+ out_of_range::create(406, "number overflow parsing '" + m_lexer.get_token_string() + "'", BasicJsonType()));
+ }
+
+ if (JSON_HEDLEY_UNLIKELY(!sax->number_float(res, m_lexer.get_string())))
+ {
+ return false;
+ }
+
+ break;
+ }
+
+ case token_type::literal_false:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->boolean(false)))
+ {
+ return false;
+ }
+ break;
+ }
+
+ case token_type::literal_null:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->null()))
+ {
+ return false;
+ }
+ break;
+ }
+
+ case token_type::literal_true:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->boolean(true)))
+ {
+ return false;
+ }
+ break;
+ }
+
+ case token_type::value_integer:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->number_integer(m_lexer.get_number_integer())))
+ {
+ return false;
+ }
+ break;
+ }
+
+ case token_type::value_string:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->string(m_lexer.get_string())))
+ {
+ return false;
+ }
+ break;
+ }
+
+ case token_type::value_unsigned:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->number_unsigned(m_lexer.get_number_unsigned())))
+ {
+ return false;
+ }
+ break;
+ }
+
+ case token_type::parse_error:
+ {
+ // using "uninitialized" to avoid "expected" message
+ return sax->parse_error(m_lexer.get_position(),
+ m_lexer.get_token_string(),
+ parse_error::create(101, m_lexer.get_position(), exception_message(token_type::uninitialized, "value"), BasicJsonType()));
+ }
+
+ case token_type::uninitialized:
+ case token_type::end_array:
+ case token_type::end_object:
+ case token_type::name_separator:
+ case token_type::value_separator:
+ case token_type::end_of_input:
+ case token_type::literal_or_value:
+ default: // the last token was unexpected
+ {
+ return sax->parse_error(m_lexer.get_position(),
+ m_lexer.get_token_string(),
+ parse_error::create(101, m_lexer.get_position(), exception_message(token_type::literal_or_value, "value"), BasicJsonType()));
+ }
+ }
+ }
+ else
+ {
+ skip_to_state_evaluation = false;
+ }
+
+ // we reached this line after we successfully parsed a value
+ if (states.empty())
+ {
+ // empty stack: we reached the end of the hierarchy: done
+ return true;
+ }
+
+ if (states.back()) // array
+ {
+ // comma -> next value
+ if (get_token() == token_type::value_separator)
+ {
+ // parse a new value
+ get_token();
+ continue;
+ }
+
+ // closing ]
+ if (JSON_HEDLEY_LIKELY(last_token == token_type::end_array))
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->end_array()))
+ {
+ return false;
+ }
+
+                    // We are done with this array. Before we can parse a
+                    // new value, we need to evaluate the new state first.
+                    // By setting skip_to_state_evaluation to true, the next
+                    // loop iteration skips the token switch and jumps
+                    // straight to this state evaluation.
+ JSON_ASSERT(!states.empty());
+ states.pop_back();
+ skip_to_state_evaluation = true;
+ continue;
+ }
+
+ return sax->parse_error(m_lexer.get_position(),
+ m_lexer.get_token_string(),
+ parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_array, "array"), BasicJsonType()));
+ }
+
+ // states.back() is false -> object
+
+ // comma -> next value
+ if (get_token() == token_type::value_separator)
+ {
+ // parse key
+ if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::value_string))
+ {
+ return sax->parse_error(m_lexer.get_position(),
+ m_lexer.get_token_string(),
+ parse_error::create(101, m_lexer.get_position(), exception_message(token_type::value_string, "object key"), BasicJsonType()));
+ }
+
+ if (JSON_HEDLEY_UNLIKELY(!sax->key(m_lexer.get_string())))
+ {
+ return false;
+ }
+
+ // parse separator (:)
+ if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator))
+ {
+ return sax->parse_error(m_lexer.get_position(),
+ m_lexer.get_token_string(),
+ parse_error::create(101, m_lexer.get_position(), exception_message(token_type::name_separator, "object separator"), BasicJsonType()));
+ }
+
+ // parse values
+ get_token();
+ continue;
+ }
+
+ // closing }
+ if (JSON_HEDLEY_LIKELY(last_token == token_type::end_object))
+ {
+ if (JSON_HEDLEY_UNLIKELY(!sax->end_object()))
+ {
+ return false;
+ }
+
+                // We are done with this object. Before we can parse a
+                // new value, we need to evaluate the new state first.
+                // By setting skip_to_state_evaluation to true, the next
+                // loop iteration skips the token switch and jumps
+                // straight to this state evaluation.
+ JSON_ASSERT(!states.empty());
+ states.pop_back();
+ skip_to_state_evaluation = true;
+ continue;
+ }
+
+ return sax->parse_error(m_lexer.get_position(),
+ m_lexer.get_token_string(),
+ parse_error::create(101, m_lexer.get_position(), exception_message(token_type::end_object, "object"), BasicJsonType()));
+ }
+ }
+
+ /// get next token from lexer
+ token_type get_token()
+ {
+ return last_token = m_lexer.scan();
+ }
+
+ std::string exception_message(const token_type expected, const std::string& context)
+ {
+ std::string error_msg = "syntax error ";
+
+ if (!context.empty())
+ {
+ error_msg += "while parsing " + context + " ";
+ }
+
+ error_msg += "- ";
+
+ if (last_token == token_type::parse_error)
+ {
+ error_msg += std::string(m_lexer.get_error_message()) + "; last read: '" +
+ m_lexer.get_token_string() + "'";
+ }
+ else
+ {
+ error_msg += "unexpected " + std::string(lexer_t::token_type_name(last_token));
+ }
+
+ if (expected != token_type::uninitialized)
+ {
+ error_msg += "; expected " + std::string(lexer_t::token_type_name(expected));
+ }
+
+ return error_msg;
+ }
+
+ private:
+ /// callback function
+ const parser_callback_t<BasicJsonType> callback = nullptr;
+ /// the type of the last read token
+ token_type last_token = token_type::uninitialized;
+ /// the lexer
+ lexer_t m_lexer;
+ /// whether to throw exceptions in case of errors
+ const bool allow_exceptions = true;
+};
+
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/iterators/internal_iterator.hpp>
+
+
+// #include <nlohmann/detail/iterators/primitive_iterator.hpp>
+
+
+#include <cstddef> // ptrdiff_t
+#include <limits> // numeric_limits
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+/*
+@brief an iterator for primitive JSON types
+
+This class models an iterator for primitive JSON types (boolean, number,
+string). Its only purpose is to allow the iterator/const_iterator classes
+to "iterate" over primitive values. Internally, the iterator is modeled by
+a `difference_type` variable. The value begin_value (`0`) models the begin
+position; end_value (`1`) models past-the-end.
+*/
+class primitive_iterator_t
+{
+ private:
+ using difference_type = std::ptrdiff_t;
+ static constexpr difference_type begin_value = 0;
+ static constexpr difference_type end_value = begin_value + 1;
+
+ JSON_PRIVATE_UNLESS_TESTED:
+ /// iterator as signed integer type
+ difference_type m_it = (std::numeric_limits<std::ptrdiff_t>::min)();
+
+ public:
+ constexpr difference_type get_value() const noexcept
+ {
+ return m_it;
+ }
+
+ /// set iterator to a defined beginning
+ void set_begin() noexcept
+ {
+ m_it = begin_value;
+ }
+
+ /// set iterator to a defined past the end
+ void set_end() noexcept
+ {
+ m_it = end_value;
+ }
+
+ /// return whether the iterator can be dereferenced
+ constexpr bool is_begin() const noexcept
+ {
+ return m_it == begin_value;
+ }
+
+ /// return whether the iterator is at end
+ constexpr bool is_end() const noexcept
+ {
+ return m_it == end_value;
+ }
+
+ friend constexpr bool operator==(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
+ {
+ return lhs.m_it == rhs.m_it;
+ }
+
+ friend constexpr bool operator<(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
+ {
+ return lhs.m_it < rhs.m_it;
+ }
+
+ primitive_iterator_t operator+(difference_type n) noexcept
+ {
+ auto result = *this;
+ result += n;
+ return result;
+ }
+
+ friend constexpr difference_type operator-(primitive_iterator_t lhs, primitive_iterator_t rhs) noexcept
+ {
+ return lhs.m_it - rhs.m_it;
+ }
+
+ primitive_iterator_t& operator++() noexcept
+ {
+ ++m_it;
+ return *this;
+ }
+
+ primitive_iterator_t const operator++(int) noexcept // NOLINT(readability-const-return-type)
+ {
+ auto result = *this;
+ ++m_it;
+ return result;
+ }
+
+ primitive_iterator_t& operator--() noexcept
+ {
+ --m_it;
+ return *this;
+ }
+
+ primitive_iterator_t const operator--(int) noexcept // NOLINT(readability-const-return-type)
+ {
+ auto result = *this;
+ --m_it;
+ return result;
+ }
+
+ primitive_iterator_t& operator+=(difference_type n) noexcept
+ {
+ m_it += n;
+ return *this;
+ }
+
+ primitive_iterator_t& operator-=(difference_type n) noexcept
+ {
+ m_it -= n;
+ return *this;
+ }
+};
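+
+// An observable consequence of this modeling, via the public container
+// interface declared later in this header (a sketch): a primitive JSON
+// value behaves like a range of size one.
+//
+//   nlohmann::json j = 42;
+//   assert(std::distance(j.begin(), j.end()) == 1);
+//   assert(*j.begin() == 42);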
+} // namespace detail
+} // namespace nlohmann
+
+
+namespace nlohmann
+{
+namespace detail
+{
+/*!
+@brief an iterator value
+
+@note This structure could easily be a union, but MSVC currently does not allow
+union members with complex constructors, see https://github.com/nlohmann/json/pull/105.
+*/
+template<typename BasicJsonType> struct internal_iterator
+{
+ /// iterator for JSON objects
+ typename BasicJsonType::object_t::iterator object_iterator {};
+ /// iterator for JSON arrays
+ typename BasicJsonType::array_t::iterator array_iterator {};
+ /// generic iterator for all other types
+ primitive_iterator_t primitive_iterator {};
+};
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/iterators/iter_impl.hpp>
+
+
+#include <iterator> // iterator, random_access_iterator_tag, bidirectional_iterator_tag, advance, next
+#include <type_traits> // conditional, is_const, remove_const
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/iterators/internal_iterator.hpp>
+
+// #include <nlohmann/detail/iterators/primitive_iterator.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+// forward declare, to be able to friend it later on
+template<typename IteratorType> class iteration_proxy;
+template<typename IteratorType> class iteration_proxy_value;
+
+/*!
+@brief a template for a bidirectional iterator for the @ref basic_json class
+This class implements both iterators (iterator and const_iterator) for the
+@ref basic_json class.
+@note An iterator is called *initialized* when a pointer to a JSON value has
+ been set (e.g., by a constructor or a copy assignment). If the iterator is
+ default-constructed, it is *uninitialized* and most methods are undefined.
+ **The library uses assertions to detect calls on uninitialized iterators.**
+@requirement The class satisfies the following concept requirements:
+- [BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator):
+  The iterator can be moved in both directions (i.e.,
+  incremented and decremented).
+@since version 1.0.0, simplified in version 2.0.9, changed to bidirectional
+       iterators in version 3.0.0 (see https://github.com/nlohmann/json/issues/593)
+*/
+template<typename BasicJsonType>
+class iter_impl
+{
+ /// the iterator with BasicJsonType of different const-ness
+ using other_iter_impl = iter_impl<typename std::conditional<std::is_const<BasicJsonType>::value, typename std::remove_const<BasicJsonType>::type, const BasicJsonType>::type>;
+ /// allow basic_json to access private members
+ friend other_iter_impl;
+ friend BasicJsonType;
+ friend iteration_proxy<iter_impl>;
+ friend iteration_proxy_value<iter_impl>;
+
+ using object_t = typename BasicJsonType::object_t;
+ using array_t = typename BasicJsonType::array_t;
+ // make sure BasicJsonType is basic_json or const basic_json
+ static_assert(is_basic_json<typename std::remove_const<BasicJsonType>::type>::value,
+ "iter_impl only accepts (const) basic_json");
+
+ public:
+
+ /// The std::iterator class template (used as a base class to provide typedefs) is deprecated in C++17.
+ /// The C++ Standard has never required user-defined iterators to derive from std::iterator.
+ /// A user-defined iterator should provide publicly accessible typedefs named
+ /// iterator_category, value_type, difference_type, pointer, and reference.
+ /// Note that value_type is required to be non-const, even for constant iterators.
+ using iterator_category = std::bidirectional_iterator_tag;
+
+ /// the type of the values when the iterator is dereferenced
+ using value_type = typename BasicJsonType::value_type;
+ /// a type to represent differences between iterators
+ using difference_type = typename BasicJsonType::difference_type;
+ /// defines a pointer to the type iterated over (value_type)
+ using pointer = typename std::conditional<std::is_const<BasicJsonType>::value,
+ typename BasicJsonType::const_pointer,
+ typename BasicJsonType::pointer>::type;
+ /// defines a reference to the type iterated over (value_type)
+ using reference =
+ typename std::conditional<std::is_const<BasicJsonType>::value,
+ typename BasicJsonType::const_reference,
+ typename BasicJsonType::reference>::type;
+
+ iter_impl() = default;
+ ~iter_impl() = default;
+ iter_impl(iter_impl&&) noexcept = default;
+ iter_impl& operator=(iter_impl&&) noexcept = default;
+
+ /*!
+ @brief constructor for a given JSON instance
+ @param[in] object pointer to a JSON object for this iterator
+ @pre object != nullptr
+ @post The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ explicit iter_impl(pointer object) noexcept : m_object(object)
+ {
+ JSON_ASSERT(m_object != nullptr);
+
+ switch (m_object->m_type)
+ {
+ case value_t::object:
+ {
+ m_it.object_iterator = typename object_t::iterator();
+ break;
+ }
+
+ case value_t::array:
+ {
+ m_it.array_iterator = typename array_t::iterator();
+ break;
+ }
+
+ case value_t::null:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ m_it.primitive_iterator = primitive_iterator_t();
+ break;
+ }
+ }
+ }
+
+ /*!
+ @note The conventional copy constructor and copy assignment are implicitly
+ defined. Combined with the following converting constructor and
+ assignment, they support: (1) copy from iterator to iterator, (2)
+ copy from const iterator to const iterator, and (3) conversion from
+ iterator to const iterator. However conversion from const iterator
+ to iterator is not defined.
+ */
+
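+    // For example (a sketch using the public iterator aliases declared
+    // later in this header), case (3) compiles while the reverse does not:
+    //
+    //   nlohmann::json j = {1, 2, 3};
+    //   nlohmann::json::iterator it = j.begin();
+    //   nlohmann::json::const_iterator cit = it;  // (3) iterator -> const_iterator
+    //   // nlohmann::json::iterator it2 = cit;    // not provided: does not compile
+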
+ /*!
+ @brief const copy constructor
+ @param[in] other const iterator to copy from
+ @note This copy constructor had to be defined explicitly to circumvent a bug
+ occurring on msvc v19.0 compiler (VS 2015) debug build. For more
+ information refer to: https://github.com/nlohmann/json/issues/1608
+ */
+ iter_impl(const iter_impl<const BasicJsonType>& other) noexcept
+ : m_object(other.m_object), m_it(other.m_it)
+ {}
+
+ /*!
+ @brief converting assignment
+ @param[in] other const iterator to copy from
+ @return const/non-const iterator
+ @note It is not checked whether @a other is initialized.
+ */
+ iter_impl& operator=(const iter_impl<const BasicJsonType>& other) noexcept
+ {
+ if (&other != this)
+ {
+ m_object = other.m_object;
+ m_it = other.m_it;
+ }
+ return *this;
+ }
+
+ /*!
+ @brief converting constructor
+ @param[in] other non-const iterator to copy from
+ @note It is not checked whether @a other is initialized.
+ */
+ iter_impl(const iter_impl<typename std::remove_const<BasicJsonType>::type>& other) noexcept
+ : m_object(other.m_object), m_it(other.m_it)
+ {}
+
+ /*!
+ @brief converting assignment
+ @param[in] other non-const iterator to copy from
+ @return const/non-const iterator
+ @note It is not checked whether @a other is initialized.
+ */
+ iter_impl& operator=(const iter_impl<typename std::remove_const<BasicJsonType>::type>& other) noexcept // NOLINT(cert-oop54-cpp)
+ {
+ m_object = other.m_object;
+ m_it = other.m_it;
+ return *this;
+ }
+
+ JSON_PRIVATE_UNLESS_TESTED:
+ /*!
+ @brief set the iterator to the first value
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ void set_begin() noexcept
+ {
+ JSON_ASSERT(m_object != nullptr);
+
+ switch (m_object->m_type)
+ {
+ case value_t::object:
+ {
+ m_it.object_iterator = m_object->m_value.object->begin();
+ break;
+ }
+
+ case value_t::array:
+ {
+ m_it.array_iterator = m_object->m_value.array->begin();
+ break;
+ }
+
+ case value_t::null:
+ {
+ // set to end so begin()==end() is true: null is empty
+ m_it.primitive_iterator.set_end();
+ break;
+ }
+
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ m_it.primitive_iterator.set_begin();
+ break;
+ }
+ }
+ }
+
+ /*!
+ @brief set the iterator past the last value
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ void set_end() noexcept
+ {
+ JSON_ASSERT(m_object != nullptr);
+
+ switch (m_object->m_type)
+ {
+ case value_t::object:
+ {
+ m_it.object_iterator = m_object->m_value.object->end();
+ break;
+ }
+
+ case value_t::array:
+ {
+ m_it.array_iterator = m_object->m_value.array->end();
+ break;
+ }
+
+ case value_t::null:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ m_it.primitive_iterator.set_end();
+ break;
+ }
+ }
+ }
+
+ public:
+ /*!
+ @brief return a reference to the value pointed to by the iterator
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ reference operator*() const
+ {
+ JSON_ASSERT(m_object != nullptr);
+
+ switch (m_object->m_type)
+ {
+ case value_t::object:
+ {
+ JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end());
+ return m_it.object_iterator->second;
+ }
+
+ case value_t::array:
+ {
+ JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end());
+ return *m_it.array_iterator;
+ }
+
+ case value_t::null:
+ JSON_THROW(invalid_iterator::create(214, "cannot get value", *m_object));
+
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.is_begin()))
+ {
+ return *m_object;
+ }
+
+ JSON_THROW(invalid_iterator::create(214, "cannot get value", *m_object));
+ }
+ }
+ }
+
+ /*!
+ @brief dereference the iterator
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ pointer operator->() const
+ {
+ JSON_ASSERT(m_object != nullptr);
+
+ switch (m_object->m_type)
+ {
+ case value_t::object:
+ {
+ JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end());
+ return &(m_it.object_iterator->second);
+ }
+
+ case value_t::array:
+ {
+ JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end());
+ return &*m_it.array_iterator;
+ }
+
+ case value_t::null:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.is_begin()))
+ {
+ return m_object;
+ }
+
+ JSON_THROW(invalid_iterator::create(214, "cannot get value", *m_object));
+ }
+ }
+ }
+
+ /*!
+ @brief post-increment (it++)
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ iter_impl const operator++(int) // NOLINT(readability-const-return-type)
+ {
+ auto result = *this;
+ ++(*this);
+ return result;
+ }
+
+ /*!
+ @brief pre-increment (++it)
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ iter_impl& operator++()
+ {
+ JSON_ASSERT(m_object != nullptr);
+
+ switch (m_object->m_type)
+ {
+ case value_t::object:
+ {
+ std::advance(m_it.object_iterator, 1);
+ break;
+ }
+
+ case value_t::array:
+ {
+ std::advance(m_it.array_iterator, 1);
+ break;
+ }
+
+ case value_t::null:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ ++m_it.primitive_iterator;
+ break;
+ }
+ }
+
+ return *this;
+ }
+
+ /*!
+ @brief post-decrement (it--)
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ iter_impl const operator--(int) // NOLINT(readability-const-return-type)
+ {
+ auto result = *this;
+ --(*this);
+ return result;
+ }
+
+ /*!
+ @brief pre-decrement (--it)
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ iter_impl& operator--()
+ {
+ JSON_ASSERT(m_object != nullptr);
+
+ switch (m_object->m_type)
+ {
+ case value_t::object:
+ {
+ std::advance(m_it.object_iterator, -1);
+ break;
+ }
+
+ case value_t::array:
+ {
+ std::advance(m_it.array_iterator, -1);
+ break;
+ }
+
+ case value_t::null:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ --m_it.primitive_iterator;
+ break;
+ }
+ }
+
+ return *this;
+ }
+
+ /*!
+ @brief comparison: equal
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ template < typename IterImpl, detail::enable_if_t < (std::is_same<IterImpl, iter_impl>::value || std::is_same<IterImpl, other_iter_impl>::value), std::nullptr_t > = nullptr >
+ bool operator==(const IterImpl& other) const
+ {
+ // if objects are not the same, the comparison is undefined
+ if (JSON_HEDLEY_UNLIKELY(m_object != other.m_object))
+ {
+ JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers", *m_object));
+ }
+
+ JSON_ASSERT(m_object != nullptr);
+
+ switch (m_object->m_type)
+ {
+ case value_t::object:
+ return (m_it.object_iterator == other.m_it.object_iterator);
+
+ case value_t::array:
+ return (m_it.array_iterator == other.m_it.array_iterator);
+
+ case value_t::null:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ return (m_it.primitive_iterator == other.m_it.primitive_iterator);
+ }
+ }
+
+ /*!
+ @brief comparison: not equal
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ template < typename IterImpl, detail::enable_if_t < (std::is_same<IterImpl, iter_impl>::value || std::is_same<IterImpl, other_iter_impl>::value), std::nullptr_t > = nullptr >
+ bool operator!=(const IterImpl& other) const
+ {
+ return !operator==(other);
+ }
+
+ /*!
+ @brief comparison: smaller
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ bool operator<(const iter_impl& other) const
+ {
+ // if objects are not the same, the comparison is undefined
+ if (JSON_HEDLEY_UNLIKELY(m_object != other.m_object))
+ {
+ JSON_THROW(invalid_iterator::create(212, "cannot compare iterators of different containers", *m_object));
+ }
+
+ JSON_ASSERT(m_object != nullptr);
+
+ switch (m_object->m_type)
+ {
+ case value_t::object:
+ JSON_THROW(invalid_iterator::create(213, "cannot compare order of object iterators", *m_object));
+
+ case value_t::array:
+ return (m_it.array_iterator < other.m_it.array_iterator);
+
+ case value_t::null:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ return (m_it.primitive_iterator < other.m_it.primitive_iterator);
+ }
+ }
+
+ /*!
+ @brief comparison: less than or equal
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ bool operator<=(const iter_impl& other) const
+ {
+        return !other.operator<(*this);
+ }
+
+ /*!
+ @brief comparison: greater than
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ bool operator>(const iter_impl& other) const
+ {
+ return !operator<=(other);
+ }
+
+ /*!
+ @brief comparison: greater than or equal
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ bool operator>=(const iter_impl& other) const
+ {
+ return !operator<(other);
+ }
+
+ /*!
+ @brief add to iterator
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ iter_impl& operator+=(difference_type i)
+ {
+ JSON_ASSERT(m_object != nullptr);
+
+ switch (m_object->m_type)
+ {
+ case value_t::object:
+ JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", *m_object));
+
+ case value_t::array:
+ {
+ std::advance(m_it.array_iterator, i);
+ break;
+ }
+
+ case value_t::null:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ m_it.primitive_iterator += i;
+ break;
+ }
+ }
+
+ return *this;
+ }
+
+ /*!
+ @brief subtract from iterator
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ iter_impl& operator-=(difference_type i)
+ {
+ return operator+=(-i);
+ }
+
+ /*!
+ @brief add to iterator
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ iter_impl operator+(difference_type i) const
+ {
+ auto result = *this;
+ result += i;
+ return result;
+ }
+
+ /*!
+ @brief addition of distance and iterator
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ friend iter_impl operator+(difference_type i, const iter_impl& it)
+ {
+ auto result = it;
+ result += i;
+ return result;
+ }
+
+ /*!
+ @brief subtract from iterator
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ iter_impl operator-(difference_type i) const
+ {
+ auto result = *this;
+ result -= i;
+ return result;
+ }
+
+ /*!
+ @brief return difference
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ difference_type operator-(const iter_impl& other) const
+ {
+ JSON_ASSERT(m_object != nullptr);
+
+ switch (m_object->m_type)
+ {
+ case value_t::object:
+ JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", *m_object));
+
+ case value_t::array:
+ return m_it.array_iterator - other.m_it.array_iterator;
+
+ case value_t::null:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ return m_it.primitive_iterator - other.m_it.primitive_iterator;
+ }
+ }
+
+ /*!
+ @brief access to successor
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ reference operator[](difference_type n) const
+ {
+ JSON_ASSERT(m_object != nullptr);
+
+ switch (m_object->m_type)
+ {
+ case value_t::object:
+ JSON_THROW(invalid_iterator::create(208, "cannot use operator[] for object iterators", *m_object));
+
+ case value_t::array:
+ return *std::next(m_it.array_iterator, n);
+
+ case value_t::null:
+ JSON_THROW(invalid_iterator::create(214, "cannot get value", *m_object));
+
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ if (JSON_HEDLEY_LIKELY(m_it.primitive_iterator.get_value() == -n))
+ {
+ return *m_object;
+ }
+
+ JSON_THROW(invalid_iterator::create(214, "cannot get value", *m_object));
+ }
+ }
+ }
+
+ /*!
+ @brief return the key of an object iterator
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ const typename object_t::key_type& key() const
+ {
+ JSON_ASSERT(m_object != nullptr);
+
+ if (JSON_HEDLEY_LIKELY(m_object->is_object()))
+ {
+ return m_it.object_iterator->first;
+ }
+
+ JSON_THROW(invalid_iterator::create(207, "cannot use key() for non-object iterators", *m_object));
+ }
+
+ /*!
+ @brief return the value of an iterator
+ @pre The iterator is initialized; i.e. `m_object != nullptr`.
+ */
+ reference value() const
+ {
+ return operator*();
+ }
+
+ JSON_PRIVATE_UNLESS_TESTED:
+ /// associated JSON instance
+ pointer m_object = nullptr;
+ /// the actual iterator of the associated instance
+ internal_iterator<typename std::remove_const<BasicJsonType>::type> m_it {};
+};
+} // namespace detail
+} // namespace nlohmann
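+
+// Usage sketch (illustrative only, assuming the public nlohmann::json
+// interface that is built on iter_impl):
+//
+//     nlohmann::json j = {{"one", 1}, {"two", 2}};
+//     for (auto it = j.begin(); it != j.end(); ++it)
+//     {
+//         // key() is only valid for object iterators (throws 207 otherwise)
+//         std::cout << it.key() << " -> " << it.value() << '\n';
+//     }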
+
+// #include <nlohmann/detail/iterators/iteration_proxy.hpp>
+
+// #include <nlohmann/detail/iterators/json_reverse_iterator.hpp>
+
+
+#include <cstddef> // ptrdiff_t
+#include <iterator> // reverse_iterator
+#include <utility> // declval
+
+namespace nlohmann
+{
+namespace detail
+{
+//////////////////////
+// reverse_iterator //
+//////////////////////
+
+/*!
+@brief a template for a reverse iterator class
+
+@tparam Base the base iterator type to reverse. Valid types are @ref
+iterator (to create @ref reverse_iterator) and @ref const_iterator (to
+create @ref const_reverse_iterator).
+
+@requirement The class satisfies the following concept requirements:
+- [BidirectionalIterator](https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator):
+  The iterator can be moved in both directions (i.e. incremented and
+  decremented).
+- [OutputIterator](https://en.cppreference.com/w/cpp/named_req/OutputIterator):
+ It is possible to write to the pointed-to element (only if @a Base is
+ @ref iterator).
+
+@since version 1.0.0
+*/
+template<typename Base>
+class json_reverse_iterator : public std::reverse_iterator<Base>
+{
+ public:
+ using difference_type = std::ptrdiff_t;
+ /// shortcut to the reverse iterator adapter
+ using base_iterator = std::reverse_iterator<Base>;
+ /// the reference type for the pointed-to element
+ using reference = typename Base::reference;
+
+ /// create reverse iterator from iterator
+ explicit json_reverse_iterator(const typename base_iterator::iterator_type& it) noexcept
+ : base_iterator(it) {}
+
+ /// create reverse iterator from base class
+ explicit json_reverse_iterator(const base_iterator& it) noexcept : base_iterator(it) {}
+
+ /// post-increment (it++)
+ json_reverse_iterator const operator++(int) // NOLINT(readability-const-return-type)
+ {
+ return static_cast<json_reverse_iterator>(base_iterator::operator++(1));
+ }
+
+ /// pre-increment (++it)
+ json_reverse_iterator& operator++()
+ {
+ return static_cast<json_reverse_iterator&>(base_iterator::operator++());
+ }
+
+ /// post-decrement (it--)
+ json_reverse_iterator const operator--(int) // NOLINT(readability-const-return-type)
+ {
+ return static_cast<json_reverse_iterator>(base_iterator::operator--(1));
+ }
+
+ /// pre-decrement (--it)
+ json_reverse_iterator& operator--()
+ {
+ return static_cast<json_reverse_iterator&>(base_iterator::operator--());
+ }
+
+ /// add to iterator
+ json_reverse_iterator& operator+=(difference_type i)
+ {
+ return static_cast<json_reverse_iterator&>(base_iterator::operator+=(i));
+ }
+
+ /// add to iterator
+ json_reverse_iterator operator+(difference_type i) const
+ {
+ return static_cast<json_reverse_iterator>(base_iterator::operator+(i));
+ }
+
+ /// subtract from iterator
+ json_reverse_iterator operator-(difference_type i) const
+ {
+ return static_cast<json_reverse_iterator>(base_iterator::operator-(i));
+ }
+
+ /// return difference
+ difference_type operator-(const json_reverse_iterator& other) const
+ {
+ return base_iterator(*this) - base_iterator(other);
+ }
+
+ /// access to successor
+ reference operator[](difference_type n) const
+ {
+ return *(this->operator+(n));
+ }
+
+ /// return the key of an object iterator
+ auto key() const -> decltype(std::declval<Base>().key())
+ {
+ auto it = --this->base();
+ return it.key();
+ }
+
+ /// return the value of an iterator
+ reference value() const
+ {
+ auto it = --this->base();
+        return it.operator*();
+ }
+};
+} // namespace detail
+} // namespace nlohmann
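+
+// Usage sketch (illustrative only, assuming the public nlohmann::json
+// interface, where rbegin()/rend() return json_reverse_iterator):
+//
+//     nlohmann::json arr = {1, 2, 3};
+//     for (auto rit = arr.rbegin(); rit != arr.rend(); ++rit)
+//     {
+//         std::cout << *rit << '\n'; // prints 3, then 2, then 1
+//     }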
+
+// #include <nlohmann/detail/iterators/primitive_iterator.hpp>
+
+// #include <nlohmann/detail/json_pointer.hpp>
+
+
+#include <algorithm> // all_of
+#include <cctype> // isdigit
+#include <limits> // max
+#include <numeric> // accumulate
+#include <string> // string
+#include <utility> // move
+#include <vector> // vector
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/string_escape.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+template<typename BasicJsonType>
+class json_pointer
+{
+ // allow basic_json to access private members
+ NLOHMANN_BASIC_JSON_TPL_DECLARATION
+ friend class basic_json;
+
+ public:
+ /*!
+ @brief create JSON pointer
+
+ Create a JSON pointer according to the syntax described in
+ [Section 3 of RFC6901](https://tools.ietf.org/html/rfc6901#section-3).
+
+ @param[in] s string representing the JSON pointer; if omitted, the empty
+ string is assumed which references the whole JSON value
+
+ @throw parse_error.107 if the given JSON pointer @a s is nonempty and does
+ not begin with a slash (`/`); see example below
+
+ @throw parse_error.108 if a tilde (`~`) in the given JSON pointer @a s is
+ not followed by `0` (representing `~`) or `1` (representing `/`); see
+ example below
+
+    @liveexample{The example shows the construction of several valid JSON
+    pointers as well as the exceptional behavior.,json_pointer}
+
+ @since version 2.0.0
+ */
+ explicit json_pointer(const std::string& s = "")
+ : reference_tokens(split(s))
+ {}
+
+ /*!
+ @brief return a string representation of the JSON pointer
+
+ @invariant For each JSON pointer `ptr`, it holds:
+ @code {.cpp}
+ ptr == json_pointer(ptr.to_string());
+ @endcode
+
+ @return a string representation of the JSON pointer
+
+ @liveexample{The example shows the result of `to_string`.,json_pointer__to_string}
+
+ @since version 2.0.0
+ */
+ std::string to_string() const
+ {
+ return std::accumulate(reference_tokens.begin(), reference_tokens.end(),
+ std::string{},
+ [](const std::string & a, const std::string & b)
+ {
+ return a + "/" + detail::escape(b);
+ });
+ }
+
+ /// @copydoc to_string()
+ operator std::string() const
+ {
+ return to_string();
+ }
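+
+    // Usage sketch (illustrative only): the round trip documented above,
+    // using the public nlohmann::json::json_pointer alias:
+    //
+    //     nlohmann::json::json_pointer ptr("/foo/0");
+    //     assert(ptr == nlohmann::json::json_pointer(ptr.to_string()));
+    //     std::string s = ptr; // implicit conversion, s == "/foo/0"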
+
+ /*!
+ @brief append another JSON pointer at the end of this JSON pointer
+
+ @param[in] ptr JSON pointer to append
+ @return JSON pointer with @a ptr appended
+
+ @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add}
+
+ @complexity Linear in the length of @a ptr.
+
+ @sa see @ref operator/=(std::string) to append a reference token
+ @sa see @ref operator/=(std::size_t) to append an array index
+ @sa see @ref operator/(const json_pointer&, const json_pointer&) for a binary operator
+
+ @since version 3.6.0
+ */
+ json_pointer& operator/=(const json_pointer& ptr)
+ {
+ reference_tokens.insert(reference_tokens.end(),
+ ptr.reference_tokens.begin(),
+ ptr.reference_tokens.end());
+ return *this;
+ }
+
+ /*!
+ @brief append an unescaped reference token at the end of this JSON pointer
+
+ @param[in] token reference token to append
+ @return JSON pointer with @a token appended without escaping @a token
+
+ @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add}
+
+ @complexity Amortized constant.
+
+ @sa see @ref operator/=(const json_pointer&) to append a JSON pointer
+ @sa see @ref operator/=(std::size_t) to append an array index
+ @sa see @ref operator/(const json_pointer&, std::size_t) for a binary operator
+
+ @since version 3.6.0
+ */
+ json_pointer& operator/=(std::string token)
+ {
+ push_back(std::move(token));
+ return *this;
+ }
+
+ /*!
+ @brief append an array index at the end of this JSON pointer
+
+ @param[in] array_idx array index to append
+ @return JSON pointer with @a array_idx appended
+
+ @liveexample{The example shows the usage of `operator/=`.,json_pointer__operator_add}
+
+ @complexity Amortized constant.
+
+ @sa see @ref operator/=(const json_pointer&) to append a JSON pointer
+ @sa see @ref operator/=(std::string) to append a reference token
+ @sa see @ref operator/(const json_pointer&, std::string) for a binary operator
+
+ @since version 3.6.0
+ */
+ json_pointer& operator/=(std::size_t array_idx)
+ {
+ return *this /= std::to_string(array_idx);
+ }
+
+ /*!
+ @brief create a new JSON pointer by appending the right JSON pointer at the end of the left JSON pointer
+
+ @param[in] lhs JSON pointer
+ @param[in] rhs JSON pointer
+ @return a new JSON pointer with @a rhs appended to @a lhs
+
+ @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary}
+
+ @complexity Linear in the length of @a lhs and @a rhs.
+
+ @sa see @ref operator/=(const json_pointer&) to append a JSON pointer
+
+ @since version 3.6.0
+ */
+ friend json_pointer operator/(const json_pointer& lhs,
+ const json_pointer& rhs)
+ {
+ return json_pointer(lhs) /= rhs;
+ }
+
+ /*!
+ @brief create a new JSON pointer by appending the unescaped token at the end of the JSON pointer
+
+ @param[in] ptr JSON pointer
+ @param[in] token reference token
+ @return a new JSON pointer with unescaped @a token appended to @a ptr
+
+ @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary}
+
+ @complexity Linear in the length of @a ptr.
+
+ @sa see @ref operator/=(std::string) to append a reference token
+
+ @since version 3.6.0
+ */
+ friend json_pointer operator/(const json_pointer& ptr, std::string token) // NOLINT(performance-unnecessary-value-param)
+ {
+ return json_pointer(ptr) /= std::move(token);
+ }
+
+ /*!
+ @brief create a new JSON pointer by appending the array-index-token at the end of the JSON pointer
+
+ @param[in] ptr JSON pointer
+ @param[in] array_idx array index
+ @return a new JSON pointer with @a array_idx appended to @a ptr
+
+ @liveexample{The example shows the usage of `operator/`.,json_pointer__operator_add_binary}
+
+ @complexity Linear in the length of @a ptr.
+
+ @sa see @ref operator/=(std::size_t) to append an array index
+
+ @since version 3.6.0
+ */
+ friend json_pointer operator/(const json_pointer& ptr, std::size_t array_idx)
+ {
+ return json_pointer(ptr) /= array_idx;
+ }
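+
+    // Usage sketch (illustrative only): the three operator/ overloads above
+    // compose pointers from pointers, unescaped tokens, and array indices:
+    //
+    //     using ptr_t = nlohmann::json::json_pointer;
+    //     ptr_t base("/foo");
+    //     ptr_t p = base / "bar" / 0;     // "/foo/bar/0"
+    //     ptr_t q = base / ptr_t("/baz"); // "/foo/baz"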
+
+ /*!
+ @brief returns the parent of this JSON pointer
+
+ @return parent of this JSON pointer; in case this JSON pointer is the root,
+ the root itself is returned
+
+ @complexity Linear in the length of the JSON pointer.
+
+ @liveexample{The example shows the result of `parent_pointer` for different
+ JSON Pointers.,json_pointer__parent_pointer}
+
+ @since version 3.6.0
+ */
+ json_pointer parent_pointer() const
+ {
+ if (empty())
+ {
+ return *this;
+ }
+
+ json_pointer res = *this;
+ res.pop_back();
+ return res;
+ }
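+
+    // Usage sketch (illustrative only):
+    //
+    //     nlohmann::json::json_pointer p("/foo/bar");
+    //     p.parent_pointer(); // "/foo"
+    //     nlohmann::json::json_pointer root("");
+    //     root.parent_pointer(); // "" (the root is its own parent)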
+
+ /*!
+ @brief remove last reference token
+
+ @pre not `empty()`
+
+ @liveexample{The example shows the usage of `pop_back`.,json_pointer__pop_back}
+
+ @complexity Constant.
+
+ @throw out_of_range.405 if JSON pointer has no parent
+
+ @since version 3.6.0
+ */
+ void pop_back()
+ {
+ if (JSON_HEDLEY_UNLIKELY(empty()))
+ {
+ JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent", BasicJsonType()));
+ }
+
+ reference_tokens.pop_back();
+ }
+
+ /*!
+ @brief return last reference token
+
+ @pre not `empty()`
+ @return last reference token
+
+ @liveexample{The example shows the usage of `back`.,json_pointer__back}
+
+ @complexity Constant.
+
+ @throw out_of_range.405 if JSON pointer has no parent
+
+ @since version 3.6.0
+ */
+ const std::string& back() const
+ {
+ if (JSON_HEDLEY_UNLIKELY(empty()))
+ {
+ JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent", BasicJsonType()));
+ }
+
+ return reference_tokens.back();
+ }
+
+ /*!
+ @brief append an unescaped token at the end of the reference pointer
+
+ @param[in] token token to add
+
+ @complexity Amortized constant.
+
+ @liveexample{The example shows the result of `push_back` for different
+ JSON Pointers.,json_pointer__push_back}
+
+ @since version 3.6.0
+ */
+ void push_back(const std::string& token)
+ {
+ reference_tokens.push_back(token);
+ }
+
+ /// @copydoc push_back(const std::string&)
+ void push_back(std::string&& token)
+ {
+ reference_tokens.push_back(std::move(token));
+ }
+
+ /*!
+ @brief return whether pointer points to the root document
+
+ @return true iff the JSON pointer points to the root document
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+ @liveexample{The example shows the result of `empty` for different JSON
+ Pointers.,json_pointer__empty}
+
+ @since version 3.6.0
+ */
+ bool empty() const noexcept
+ {
+ return reference_tokens.empty();
+ }
+
+ private:
+ /*!
+ @param[in] s reference token to be converted into an array index
+
+ @return integer representation of @a s
+
+ @throw parse_error.106 if an array index begins with '0'
+    @throw parse_error.109 if an array index does not begin with a digit
+ @throw out_of_range.404 if string @a s could not be converted to an integer
+ @throw out_of_range.410 if an array index exceeds size_type
+ */
+ static typename BasicJsonType::size_type array_index(const std::string& s)
+ {
+ using size_type = typename BasicJsonType::size_type;
+
+ // error condition (cf. RFC 6901, Sect. 4)
+ if (JSON_HEDLEY_UNLIKELY(s.size() > 1 && s[0] == '0'))
+ {
+ JSON_THROW(detail::parse_error::create(106, 0, "array index '" + s + "' must not begin with '0'", BasicJsonType()));
+ }
+
+ // error condition (cf. RFC 6901, Sect. 4)
+ if (JSON_HEDLEY_UNLIKELY(s.size() > 1 && !(s[0] >= '1' && s[0] <= '9')))
+ {
+ JSON_THROW(detail::parse_error::create(109, 0, "array index '" + s + "' is not a number", BasicJsonType()));
+ }
+
+ std::size_t processed_chars = 0;
+ unsigned long long res = 0; // NOLINT(runtime/int)
+ JSON_TRY
+ {
+ res = std::stoull(s, &processed_chars);
+ }
+ JSON_CATCH(std::out_of_range&)
+ {
+ JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + s + "'", BasicJsonType()));
+ }
+
+ // check if the string was completely read
+ if (JSON_HEDLEY_UNLIKELY(processed_chars != s.size()))
+ {
+ JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + s + "'", BasicJsonType()));
+ }
+
+ // only triggered on special platforms (like 32bit), see also
+ // https://github.com/nlohmann/json/pull/2203
+ if (res >= static_cast<unsigned long long>((std::numeric_limits<size_type>::max)())) // NOLINT(runtime/int)
+ {
+ JSON_THROW(detail::out_of_range::create(410, "array index " + s + " exceeds size_type", BasicJsonType())); // LCOV_EXCL_LINE
+ }
+
+ return static_cast<size_type>(res);
+ }
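+
+    // Behavior sketch (illustrative only, exercised through the public
+    // at() interface, which routes array tokens through array_index):
+    //
+    //     nlohmann::json j = {{"arr", {10, 20}}};
+    //     j.at(nlohmann::json::json_pointer("/arr/1"));  // 20
+    //     j.at(nlohmann::json::json_pointer("/arr/01")); // throws parse_error.106
+    //     j.at(nlohmann::json::json_pointer("/arr/ab")); // throws parse_error.109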
+
+ JSON_PRIVATE_UNLESS_TESTED:
+ json_pointer top() const
+ {
+ if (JSON_HEDLEY_UNLIKELY(empty()))
+ {
+ JSON_THROW(detail::out_of_range::create(405, "JSON pointer has no parent", BasicJsonType()));
+ }
+
+ json_pointer result = *this;
+ result.reference_tokens = {reference_tokens[0]};
+ return result;
+ }
+
+ private:
+ /*!
+ @brief create and return a reference to the pointed to value
+
+ @complexity Linear in the number of reference tokens.
+
+ @throw parse_error.109 if array index is not a number
+ @throw type_error.313 if value cannot be unflattened
+ */
+ BasicJsonType& get_and_create(BasicJsonType& j) const
+ {
+ auto* result = &j;
+
+ // in case no reference tokens exist, return a reference to the JSON value
+ // j which will be overwritten by a primitive value
+ for (const auto& reference_token : reference_tokens)
+ {
+ switch (result->type())
+ {
+ case detail::value_t::null:
+ {
+ if (reference_token == "0")
+ {
+ // start a new array if reference token is 0
+ result = &result->operator[](0);
+ }
+ else
+ {
+ // start a new object otherwise
+ result = &result->operator[](reference_token);
+ }
+ break;
+ }
+
+ case detail::value_t::object:
+ {
+ // create an entry in the object
+ result = &result->operator[](reference_token);
+ break;
+ }
+
+ case detail::value_t::array:
+ {
+ // create an entry in the array
+ result = &result->operator[](array_index(reference_token));
+ break;
+ }
+
+ /*
+ The following code is only reached if there exists a reference
+ token _and_ the current value is primitive. In this case, we have
+ an error situation, because primitive values may only occur as
+ single value; that is, with an empty list of reference tokens.
+ */
+ case detail::value_t::string:
+ case detail::value_t::boolean:
+ case detail::value_t::number_integer:
+ case detail::value_t::number_unsigned:
+ case detail::value_t::number_float:
+ case detail::value_t::binary:
+ case detail::value_t::discarded:
+ default:
+ JSON_THROW(detail::type_error::create(313, "invalid value to unflatten", j));
+ }
+ }
+
+ return *result;
+ }
+
+ /*!
+ @brief return a reference to the pointed to value
+
+ @note This version does not throw if a value is not present, but tries to
+ create nested values instead. For instance, calling this function
+ with pointer `"/this/that"` on a null value is equivalent to calling
+ `operator[]("this").operator[]("that")` on that value, effectively
+ changing the null value to an object.
+
+ @param[in] ptr a JSON value
+
+ @return reference to the JSON value pointed to by the JSON pointer
+
+ @complexity Linear in the length of the JSON pointer.
+
+ @throw parse_error.106 if an array index begins with '0'
+ @throw parse_error.109 if an array index was not a number
+ @throw out_of_range.404 if the JSON pointer can not be resolved
+ */
+ BasicJsonType& get_unchecked(BasicJsonType* ptr) const
+ {
+ for (const auto& reference_token : reference_tokens)
+ {
+ // convert null values to arrays or objects before continuing
+ if (ptr->is_null())
+ {
+ // check if reference token is a number
+ const bool nums =
+ std::all_of(reference_token.begin(), reference_token.end(),
+ [](const unsigned char x)
+ {
+ return std::isdigit(x);
+ });
+
+ // change value to array for numbers or "-" or to object otherwise
+ *ptr = (nums || reference_token == "-")
+ ? detail::value_t::array
+ : detail::value_t::object;
+ }
+
+ switch (ptr->type())
+ {
+ case detail::value_t::object:
+ {
+ // use unchecked object access
+ ptr = &ptr->operator[](reference_token);
+ break;
+ }
+
+ case detail::value_t::array:
+ {
+ if (reference_token == "-")
+ {
+ // explicitly treat "-" as index beyond the end
+ ptr = &ptr->operator[](ptr->m_value.array->size());
+ }
+ else
+ {
+ // convert array index to number; unchecked access
+ ptr = &ptr->operator[](array_index(reference_token));
+ }
+ break;
+ }
+
+ case detail::value_t::null:
+ case detail::value_t::string:
+ case detail::value_t::boolean:
+ case detail::value_t::number_integer:
+ case detail::value_t::number_unsigned:
+ case detail::value_t::number_float:
+ case detail::value_t::binary:
+ case detail::value_t::discarded:
+ default:
+ JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'", *ptr));
+ }
+ }
+
+ return *ptr;
+ }
+
+ /*!
+ @throw parse_error.106 if an array index begins with '0'
+ @throw parse_error.109 if an array index was not a number
+ @throw out_of_range.402 if the array index '-' is used
+ @throw out_of_range.404 if the JSON pointer can not be resolved
+ */
+ BasicJsonType& get_checked(BasicJsonType* ptr) const
+ {
+ for (const auto& reference_token : reference_tokens)
+ {
+ switch (ptr->type())
+ {
+ case detail::value_t::object:
+ {
+ // note: at performs range check
+ ptr = &ptr->at(reference_token);
+ break;
+ }
+
+ case detail::value_t::array:
+ {
+ if (JSON_HEDLEY_UNLIKELY(reference_token == "-"))
+ {
+ // "-" always fails the range check
+ JSON_THROW(detail::out_of_range::create(402,
+ "array index '-' (" + std::to_string(ptr->m_value.array->size()) +
+ ") is out of range", *ptr));
+ }
+
+ // note: at performs range check
+ ptr = &ptr->at(array_index(reference_token));
+ break;
+ }
+
+ case detail::value_t::null:
+ case detail::value_t::string:
+ case detail::value_t::boolean:
+ case detail::value_t::number_integer:
+ case detail::value_t::number_unsigned:
+ case detail::value_t::number_float:
+ case detail::value_t::binary:
+ case detail::value_t::discarded:
+ default:
+ JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'", *ptr));
+ }
+ }
+
+ return *ptr;
+ }
+
+ /*!
+ @brief return a const reference to the pointed to value
+
+ @param[in] ptr a JSON value
+
+ @return const reference to the JSON value pointed to by the JSON
+ pointer
+
+ @throw parse_error.106 if an array index begins with '0'
+ @throw parse_error.109 if an array index was not a number
+ @throw out_of_range.402 if the array index '-' is used
+ @throw out_of_range.404 if the JSON pointer can not be resolved
+ */
+ const BasicJsonType& get_unchecked(const BasicJsonType* ptr) const
+ {
+ for (const auto& reference_token : reference_tokens)
+ {
+ switch (ptr->type())
+ {
+ case detail::value_t::object:
+ {
+ // use unchecked object access
+ ptr = &ptr->operator[](reference_token);
+ break;
+ }
+
+ case detail::value_t::array:
+ {
+ if (JSON_HEDLEY_UNLIKELY(reference_token == "-"))
+ {
+ // "-" cannot be used for const access
+ JSON_THROW(detail::out_of_range::create(402, "array index '-' (" + std::to_string(ptr->m_value.array->size()) + ") is out of range", *ptr));
+ }
+
+ // use unchecked array access
+ ptr = &ptr->operator[](array_index(reference_token));
+ break;
+ }
+
+ case detail::value_t::null:
+ case detail::value_t::string:
+ case detail::value_t::boolean:
+ case detail::value_t::number_integer:
+ case detail::value_t::number_unsigned:
+ case detail::value_t::number_float:
+ case detail::value_t::binary:
+ case detail::value_t::discarded:
+ default:
+ JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'", *ptr));
+ }
+ }
+
+ return *ptr;
+ }
+
+ /*!
+ @throw parse_error.106 if an array index begins with '0'
+ @throw parse_error.109 if an array index was not a number
+ @throw out_of_range.402 if the array index '-' is used
+ @throw out_of_range.404 if the JSON pointer can not be resolved
+ */
+ const BasicJsonType& get_checked(const BasicJsonType* ptr) const
+ {
+ for (const auto& reference_token : reference_tokens)
+ {
+ switch (ptr->type())
+ {
+ case detail::value_t::object:
+ {
+ // note: at performs range check
+ ptr = &ptr->at(reference_token);
+ break;
+ }
+
+ case detail::value_t::array:
+ {
+ if (JSON_HEDLEY_UNLIKELY(reference_token == "-"))
+ {
+ // "-" always fails the range check
+ JSON_THROW(detail::out_of_range::create(402,
+ "array index '-' (" + std::to_string(ptr->m_value.array->size()) +
+ ") is out of range", *ptr));
+ }
+
+ // note: at performs range check
+ ptr = &ptr->at(array_index(reference_token));
+ break;
+ }
+
+ case detail::value_t::null:
+ case detail::value_t::string:
+ case detail::value_t::boolean:
+ case detail::value_t::number_integer:
+ case detail::value_t::number_unsigned:
+ case detail::value_t::number_float:
+ case detail::value_t::binary:
+ case detail::value_t::discarded:
+ default:
+ JSON_THROW(detail::out_of_range::create(404, "unresolved reference token '" + reference_token + "'", *ptr));
+ }
+ }
+
+ return *ptr;
+ }
+
+ /*!
+ @throw parse_error.106 if an array index begins with '0'
+ @throw parse_error.109 if an array index was not a number
+ */
+ bool contains(const BasicJsonType* ptr) const
+ {
+ for (const auto& reference_token : reference_tokens)
+ {
+ switch (ptr->type())
+ {
+ case detail::value_t::object:
+ {
+ if (!ptr->contains(reference_token))
+ {
+ // we did not find the key in the object
+ return false;
+ }
+
+ ptr = &ptr->operator[](reference_token);
+ break;
+ }
+
+ case detail::value_t::array:
+ {
+ if (JSON_HEDLEY_UNLIKELY(reference_token == "-"))
+ {
+ // "-" always fails the range check
+ return false;
+ }
+ if (JSON_HEDLEY_UNLIKELY(reference_token.size() == 1 && !("0" <= reference_token && reference_token <= "9")))
+ {
+ // invalid char
+ return false;
+ }
+ if (JSON_HEDLEY_UNLIKELY(reference_token.size() > 1))
+ {
+ if (JSON_HEDLEY_UNLIKELY(!('1' <= reference_token[0] && reference_token[0] <= '9')))
+ {
+ // first char should be between '1' and '9'
+ return false;
+ }
+ for (std::size_t i = 1; i < reference_token.size(); i++)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!('0' <= reference_token[i] && reference_token[i] <= '9')))
+ {
+ // other char should be between '0' and '9'
+ return false;
+ }
+ }
+ }
+
+ const auto idx = array_index(reference_token);
+ if (idx >= ptr->size())
+ {
+ // index out of range
+ return false;
+ }
+
+ ptr = &ptr->operator[](idx);
+ break;
+ }
+
+ case detail::value_t::null:
+ case detail::value_t::string:
+ case detail::value_t::boolean:
+ case detail::value_t::number_integer:
+ case detail::value_t::number_unsigned:
+ case detail::value_t::number_float:
+ case detail::value_t::binary:
+ case detail::value_t::discarded:
+ default:
+ {
+ // we do not expect primitive values if there is still a
+ // reference token to process
+ return false;
+ }
+ }
+ }
+
+ // no reference token left means we found a primitive value
+ return true;
+ }
+
+ /*!
+ @brief split the string input to reference tokens
+
+ @note This function is only called by the json_pointer constructor.
+ All exceptions below are documented there.
+
+    @throw parse_error.107 if a nonempty pointer does not begin with '/'
+ @throw parse_error.108 if character '~' is not followed by '0' or '1'
+ */
+ static std::vector<std::string> split(const std::string& reference_string)
+ {
+ std::vector<std::string> result;
+
+ // special case: empty reference string -> no reference tokens
+ if (reference_string.empty())
+ {
+ return result;
+ }
+
+ // check if nonempty reference string begins with slash
+ if (JSON_HEDLEY_UNLIKELY(reference_string[0] != '/'))
+ {
+ JSON_THROW(detail::parse_error::create(107, 1, "JSON pointer must be empty or begin with '/' - was: '" + reference_string + "'", BasicJsonType()));
+ }
+
+ // extract the reference tokens:
+ // - slash: position of the last read slash (or end of string)
+ // - start: position after the previous slash
+ for (
+ // search for the first slash after the first character
+ std::size_t slash = reference_string.find_first_of('/', 1),
+ // set the beginning of the first reference token
+ start = 1;
+ // we can stop if start == 0 (if slash == std::string::npos)
+ start != 0;
+ // set the beginning of the next reference token
+ // (will eventually be 0 if slash == std::string::npos)
+ start = (slash == std::string::npos) ? 0 : slash + 1,
+ // find next slash
+ slash = reference_string.find_first_of('/', start))
+ {
+ // use the text between the beginning of the reference token
+ // (start) and the last slash (slash).
+ auto reference_token = reference_string.substr(start, slash - start);
+
+ // check reference tokens are properly escaped
+ for (std::size_t pos = reference_token.find_first_of('~');
+ pos != std::string::npos;
+ pos = reference_token.find_first_of('~', pos + 1))
+ {
+ JSON_ASSERT(reference_token[pos] == '~');
+
+ // ~ must be followed by 0 or 1
+ if (JSON_HEDLEY_UNLIKELY(pos == reference_token.size() - 1 ||
+ (reference_token[pos + 1] != '0' &&
+ reference_token[pos + 1] != '1')))
+ {
+ JSON_THROW(detail::parse_error::create(108, 0, "escape character '~' must be followed with '0' or '1'", BasicJsonType()));
+ }
+ }
+
+ // finally, store the reference token
+ detail::unescape(reference_token);
+ result.push_back(reference_token);
+ }
+
+ return result;
+ }
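+
+    // Behavior sketch (illustrative only): "~1" unescapes to '/' and "~0"
+    // to '~' inside a reference token (RFC 6901):
+    //
+    //     nlohmann::json j;
+    //     j[nlohmann::json::json_pointer("/a~1b")] = 1; // creates key "a/b"
+    //     j[nlohmann::json::json_pointer("/m~0n")] = 2; // creates key "m~n"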
+
+ private:
+ /*!
+ @param[in] reference_string the reference string to the current value
+ @param[in] value the value to consider
+ @param[in,out] result the result object to insert values to
+
+ @note Empty objects or arrays are flattened to `null`.
+ */
+ static void flatten(const std::string& reference_string,
+ const BasicJsonType& value,
+ BasicJsonType& result)
+ {
+ switch (value.type())
+ {
+ case detail::value_t::array:
+ {
+ if (value.m_value.array->empty())
+ {
+ // flatten empty array as null
+ result[reference_string] = nullptr;
+ }
+ else
+ {
+ // iterate array and use index as reference string
+ for (std::size_t i = 0; i < value.m_value.array->size(); ++i)
+ {
+ flatten(reference_string + "/" + std::to_string(i),
+ value.m_value.array->operator[](i), result);
+ }
+ }
+ break;
+ }
+
+ case detail::value_t::object:
+ {
+ if (value.m_value.object->empty())
+ {
+ // flatten empty object as null
+ result[reference_string] = nullptr;
+ }
+ else
+ {
+ // iterate object and use keys as reference string
+ for (const auto& element : *value.m_value.object)
+ {
+ flatten(reference_string + "/" + detail::escape(element.first), element.second, result);
+ }
+ }
+ break;
+ }
+
+ case detail::value_t::null:
+ case detail::value_t::string:
+ case detail::value_t::boolean:
+ case detail::value_t::number_integer:
+ case detail::value_t::number_unsigned:
+ case detail::value_t::number_float:
+ case detail::value_t::binary:
+ case detail::value_t::discarded:
+ default:
+ {
+ // add primitive value with its reference string
+ result[reference_string] = value;
+ break;
+ }
+ }
+ }
+
+ /*!
+ @param[in] value flattened JSON
+
+ @return unflattened JSON
+
+ @throw parse_error.109 if array index is not a number
+ @throw type_error.314 if value is not an object
+ @throw type_error.315 if object values are not primitive
+ @throw type_error.313 if value cannot be unflattened
+ */
+ static BasicJsonType
+ unflatten(const BasicJsonType& value)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!value.is_object()))
+ {
+ JSON_THROW(detail::type_error::create(314, "only objects can be unflattened", value));
+ }
+
+ BasicJsonType result;
+
+ // iterate the JSON object values
+ for (const auto& element : *value.m_value.object)
+ {
+ if (JSON_HEDLEY_UNLIKELY(!element.second.is_primitive()))
+ {
+ JSON_THROW(detail::type_error::create(315, "values in object must be primitive", element.second));
+ }
+
+            // assign value to reference pointed to by JSON pointer; note that if
+ // the JSON pointer is "" (i.e., points to the whole value), function
+ // get_and_create returns a reference to result itself. An assignment
+ // will then create a primitive value.
+ json_pointer(element.first).get_and_create(result) = element.second;
+ }
+
+ return result;
+ }
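+
+    // Behavior sketch (illustrative only, via the public flatten() and
+    // unflatten() members that wrap these helpers):
+    //
+    //     nlohmann::json j = {{"a", {{"b", 1}}}, {"c", {2, 3}}};
+    //     nlohmann::json f = j.flatten();
+    //     // f == {{"/a/b", 1}, {"/c/0", 2}, {"/c/1", 3}}
+    //     assert(f.unflatten() == j);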
+
+ /*!
+ @brief compares two JSON pointers for equality
+
+ @param[in] lhs JSON pointer to compare
+ @param[in] rhs JSON pointer to compare
+ @return whether @a lhs is equal to @a rhs
+
+ @complexity Linear in the length of the JSON pointer
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+ */
+ friend bool operator==(json_pointer const& lhs,
+ json_pointer const& rhs) noexcept
+ {
+ return lhs.reference_tokens == rhs.reference_tokens;
+ }
+
+ /*!
+ @brief compares two JSON pointers for inequality
+
+ @param[in] lhs JSON pointer to compare
+ @param[in] rhs JSON pointer to compare
+    @return whether @a lhs is not equal to @a rhs
+
+ @complexity Linear in the length of the JSON pointer
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+ */
+ friend bool operator!=(json_pointer const& lhs,
+ json_pointer const& rhs) noexcept
+ {
+ return !(lhs == rhs);
+ }
+
+ /// the reference tokens
+ std::vector<std::string> reference_tokens;
+};
+} // namespace nlohmann
+
+// #include <nlohmann/detail/json_ref.hpp>
+
+
+#include <initializer_list>
+#include <utility>
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+template<typename BasicJsonType>
+class json_ref
+{
+ public:
+ using value_type = BasicJsonType;
+
+ json_ref(value_type&& value)
+ : owned_value(std::move(value))
+ {}
+
+ json_ref(const value_type& value)
+ : value_ref(&value)
+ {}
+
+ json_ref(std::initializer_list<json_ref> init)
+ : owned_value(init)
+ {}
+
+ template <
+ class... Args,
+ enable_if_t<std::is_constructible<value_type, Args...>::value, int> = 0 >
+ json_ref(Args && ... args)
+ : owned_value(std::forward<Args>(args)...)
+ {}
+
+ // class should be movable only
+ json_ref(json_ref&&) noexcept = default;
+ json_ref(const json_ref&) = delete;
+ json_ref& operator=(const json_ref&) = delete;
+ json_ref& operator=(json_ref&&) = delete;
+ ~json_ref() = default;
+
+ value_type moved_or_copied() const
+ {
+ if (value_ref == nullptr)
+ {
+ return std::move(owned_value);
+ }
+ return *value_ref;
+ }
+
+ value_type const& operator*() const
+ {
+ return value_ref ? *value_ref : owned_value;
+ }
+
+ value_type const* operator->() const
+ {
+        return &**this;
+ }
+
+ private:
+ mutable value_type owned_value = nullptr;
+ value_type const* value_ref = nullptr;
+};
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/string_escape.hpp>
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/meta/type_traits.hpp>
+
+// #include <nlohmann/detail/output/binary_writer.hpp>
+
+
+#include <algorithm> // reverse
+#include <array> // array
+#include <cmath> // isnan, isinf
+#include <cstdint> // uint8_t, uint16_t, uint32_t, uint64_t
+#include <cstring> // memcpy
+#include <limits> // numeric_limits
+#include <string> // string
+#include <utility> // move
+
+// #include <nlohmann/detail/input/binary_reader.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/output/output_adapters.hpp>
+
+
+#include <algorithm> // copy
+#include <cstddef> // size_t
+#include <iterator> // back_inserter
+#include <memory> // shared_ptr, make_shared
+#include <string> // basic_string
+#include <vector> // vector
+
+#ifndef JSON_NO_IO
+ #include <ios> // streamsize
+ #include <ostream> // basic_ostream
+#endif // JSON_NO_IO
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+/// abstract output adapter interface
+template<typename CharType> struct output_adapter_protocol
+{
+ virtual void write_character(CharType c) = 0;
+ virtual void write_characters(const CharType* s, std::size_t length) = 0;
+ virtual ~output_adapter_protocol() = default;
+
+ output_adapter_protocol() = default;
+ output_adapter_protocol(const output_adapter_protocol&) = default;
+ output_adapter_protocol(output_adapter_protocol&&) noexcept = default;
+ output_adapter_protocol& operator=(const output_adapter_protocol&) = default;
+ output_adapter_protocol& operator=(output_adapter_protocol&&) noexcept = default;
+};
+
+/// a type to simplify interfaces
+template<typename CharType>
+using output_adapter_t = std::shared_ptr<output_adapter_protocol<CharType>>;
+
+/// output adapter for byte vectors
+template<typename CharType>
+class output_vector_adapter : public output_adapter_protocol<CharType>
+{
+ public:
+ explicit output_vector_adapter(std::vector<CharType>& vec) noexcept
+ : v(vec)
+ {}
+
+ void write_character(CharType c) override
+ {
+ v.push_back(c);
+ }
+
+ JSON_HEDLEY_NON_NULL(2)
+ void write_characters(const CharType* s, std::size_t length) override
+ {
+ std::copy(s, s + length, std::back_inserter(v));
+ }
+
+ private:
+ std::vector<CharType>& v;
+};
+
+#ifndef JSON_NO_IO
+/// output adapter for output streams
+template<typename CharType>
+class output_stream_adapter : public output_adapter_protocol<CharType>
+{
+ public:
+ explicit output_stream_adapter(std::basic_ostream<CharType>& s) noexcept
+ : stream(s)
+ {}
+
+ void write_character(CharType c) override
+ {
+ stream.put(c);
+ }
+
+ JSON_HEDLEY_NON_NULL(2)
+ void write_characters(const CharType* s, std::size_t length) override
+ {
+ stream.write(s, static_cast<std::streamsize>(length));
+ }
+
+ private:
+ std::basic_ostream<CharType>& stream;
+};
+#endif // JSON_NO_IO
+
+/// output adapter for basic_string
+template<typename CharType, typename StringType = std::basic_string<CharType>>
+class output_string_adapter : public output_adapter_protocol<CharType>
+{
+ public:
+ explicit output_string_adapter(StringType& s) noexcept
+ : str(s)
+ {}
+
+ void write_character(CharType c) override
+ {
+ str.push_back(c);
+ }
+
+ JSON_HEDLEY_NON_NULL(2)
+ void write_characters(const CharType* s, std::size_t length) override
+ {
+ str.append(s, length);
+ }
+
+ private:
+ StringType& str;
+};
+
+template<typename CharType, typename StringType = std::basic_string<CharType>>
+class output_adapter
+{
+ public:
+ output_adapter(std::vector<CharType>& vec)
+ : oa(std::make_shared<output_vector_adapter<CharType>>(vec)) {}
+
+#ifndef JSON_NO_IO
+ output_adapter(std::basic_ostream<CharType>& s)
+ : oa(std::make_shared<output_stream_adapter<CharType>>(s)) {}
+#endif // JSON_NO_IO
+
+ output_adapter(StringType& s)
+ : oa(std::make_shared<output_string_adapter<CharType, StringType>>(s)) {}
+
+ operator output_adapter_t<CharType>()
+ {
+ return oa;
+ }
+
+ private:
+ output_adapter_t<CharType> oa = nullptr;
+};
+} // namespace detail
+} // namespace nlohmann
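+
+// Usage sketch (illustrative only; these adapters live in the internal
+// detail namespace, shown here with a std::string sink):
+//
+//     std::string sink;
+//     nlohmann::detail::output_adapter<char> adapter(sink);
+//     nlohmann::detail::output_adapter_t<char> oa = adapter;
+//     oa->write_character('[');
+//     oa->write_characters("1,2]", 4);
+//     // sink == "[1,2]"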
+
+
+namespace nlohmann
+{
+namespace detail
+{
+///////////////////
+// binary writer //
+///////////////////
+
+/*!
+@brief serialization to CBOR and MessagePack values
+*/
+template<typename BasicJsonType, typename CharType>
+class binary_writer
+{
+ using string_t = typename BasicJsonType::string_t;
+ using binary_t = typename BasicJsonType::binary_t;
+ using number_float_t = typename BasicJsonType::number_float_t;
+
+ public:
+ /*!
+ @brief create a binary writer
+
+ @param[in] adapter output adapter to write to
+ */
+ explicit binary_writer(output_adapter_t<CharType> adapter) : oa(std::move(adapter))
+ {
+ JSON_ASSERT(oa);
+ }
+
+ /*!
+ @param[in] j JSON value to serialize
+ @pre j.type() == value_t::object
+ */
+ void write_bson(const BasicJsonType& j)
+ {
+ switch (j.type())
+ {
+ case value_t::object:
+ {
+ write_bson_object(*j.m_value.object);
+ break;
+ }
+
+ case value_t::null:
+ case value_t::array:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ JSON_THROW(type_error::create(317, "to serialize to BSON, top-level type must be object, but is " + std::string(j.type_name()), j));
+ }
+ }
+ }
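+
+    // Behavior sketch (illustrative only, via the public to_bson interface):
+    //
+    //     nlohmann::json obj = {{"key", 1}};
+    //     auto ok = nlohmann::json::to_bson(obj); // top level is an object
+    //     nlohmann::json arr = {1, 2, 3};
+    //     // nlohmann::json::to_bson(arr);        // would throw type_error.317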
+
+ /*!
+ @param[in] j JSON value to serialize
+ */
+ void write_cbor(const BasicJsonType& j)
+ {
+ switch (j.type())
+ {
+ case value_t::null:
+ {
+ oa->write_character(to_char_type(0xF6));
+ break;
+ }
+
+ case value_t::boolean:
+ {
+ oa->write_character(j.m_value.boolean
+ ? to_char_type(0xF5)
+ : to_char_type(0xF4));
+ break;
+ }
+
+ case value_t::number_integer:
+ {
+ if (j.m_value.number_integer >= 0)
+ {
+ // CBOR does not differentiate between positive signed
+                        // integers and unsigned integers. Therefore, we use the
+ // code from the value_t::number_unsigned case here.
+ if (j.m_value.number_integer <= 0x17)
+ {
+ write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_integer <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ oa->write_character(to_char_type(0x18));
+ write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_integer <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ oa->write_character(to_char_type(0x19));
+ write_number(static_cast<std::uint16_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_integer <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ oa->write_character(to_char_type(0x1A));
+ write_number(static_cast<std::uint32_t>(j.m_value.number_integer));
+ }
+ else
+ {
+ oa->write_character(to_char_type(0x1B));
+ write_number(static_cast<std::uint64_t>(j.m_value.number_integer));
+ }
+ }
+ else
+ {
+ // The conversions below encode the sign in the first
+ // byte, and the value is converted to a positive number.
+ const auto positive_number = -1 - j.m_value.number_integer;
+ if (j.m_value.number_integer >= -24)
+ {
+ write_number(static_cast<std::uint8_t>(0x20 + positive_number));
+ }
+ else if (positive_number <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ oa->write_character(to_char_type(0x38));
+ write_number(static_cast<std::uint8_t>(positive_number));
+ }
+ else if (positive_number <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ oa->write_character(to_char_type(0x39));
+ write_number(static_cast<std::uint16_t>(positive_number));
+ }
+ else if (positive_number <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ oa->write_character(to_char_type(0x3A));
+ write_number(static_cast<std::uint32_t>(positive_number));
+ }
+ else
+ {
+ oa->write_character(to_char_type(0x3B));
+ write_number(static_cast<std::uint64_t>(positive_number));
+ }
+ }
+ break;
+ }
+
+ case value_t::number_unsigned:
+ {
+ if (j.m_value.number_unsigned <= 0x17)
+ {
+ write_number(static_cast<std::uint8_t>(j.m_value.number_unsigned));
+ }
+ else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ oa->write_character(to_char_type(0x18));
+ write_number(static_cast<std::uint8_t>(j.m_value.number_unsigned));
+ }
+ else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ oa->write_character(to_char_type(0x19));
+ write_number(static_cast<std::uint16_t>(j.m_value.number_unsigned));
+ }
+ else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ oa->write_character(to_char_type(0x1A));
+ write_number(static_cast<std::uint32_t>(j.m_value.number_unsigned));
+ }
+ else
+ {
+ oa->write_character(to_char_type(0x1B));
+ write_number(static_cast<std::uint64_t>(j.m_value.number_unsigned));
+ }
+ break;
+ }
+
+ case value_t::number_float:
+ {
+ if (std::isnan(j.m_value.number_float))
+ {
+ // NaN is 0xf97e00 in CBOR
+ oa->write_character(to_char_type(0xF9));
+ oa->write_character(to_char_type(0x7E));
+ oa->write_character(to_char_type(0x00));
+ }
+ else if (std::isinf(j.m_value.number_float))
+ {
+ // Infinity is 0xf97c00, -Infinity is 0xf9fc00
+ oa->write_character(to_char_type(0xf9));
+ oa->write_character(j.m_value.number_float > 0 ? to_char_type(0x7C) : to_char_type(0xFC));
+ oa->write_character(to_char_type(0x00));
+ }
+ else
+ {
+ write_compact_float(j.m_value.number_float, detail::input_format_t::cbor);
+ }
+ break;
+ }
+
+ case value_t::string:
+ {
+ // step 1: write control byte and the string length
+ const auto N = j.m_value.string->size();
+ if (N <= 0x17)
+ {
+ write_number(static_cast<std::uint8_t>(0x60 + N));
+ }
+ else if (N <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ oa->write_character(to_char_type(0x78));
+ write_number(static_cast<std::uint8_t>(N));
+ }
+ else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ oa->write_character(to_char_type(0x79));
+ write_number(static_cast<std::uint16_t>(N));
+ }
+ else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ oa->write_character(to_char_type(0x7A));
+ write_number(static_cast<std::uint32_t>(N));
+ }
+ // LCOV_EXCL_START
+ else if (N <= (std::numeric_limits<std::uint64_t>::max)())
+ {
+ oa->write_character(to_char_type(0x7B));
+ write_number(static_cast<std::uint64_t>(N));
+ }
+ // LCOV_EXCL_STOP
+
+ // step 2: write the string
+ oa->write_characters(
+ reinterpret_cast<const CharType*>(j.m_value.string->c_str()),
+ j.m_value.string->size());
+ break;
+ }
+
+ case value_t::array:
+ {
+ // step 1: write control byte and the array size
+ const auto N = j.m_value.array->size();
+ if (N <= 0x17)
+ {
+ write_number(static_cast<std::uint8_t>(0x80 + N));
+ }
+ else if (N <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ oa->write_character(to_char_type(0x98));
+ write_number(static_cast<std::uint8_t>(N));
+ }
+ else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ oa->write_character(to_char_type(0x99));
+ write_number(static_cast<std::uint16_t>(N));
+ }
+ else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ oa->write_character(to_char_type(0x9A));
+ write_number(static_cast<std::uint32_t>(N));
+ }
+ // LCOV_EXCL_START
+ else if (N <= (std::numeric_limits<std::uint64_t>::max)())
+ {
+ oa->write_character(to_char_type(0x9B));
+ write_number(static_cast<std::uint64_t>(N));
+ }
+ // LCOV_EXCL_STOP
+
+ // step 2: write each element
+ for (const auto& el : *j.m_value.array)
+ {
+ write_cbor(el);
+ }
+ break;
+ }
+
+ case value_t::binary:
+ {
+ if (j.m_value.binary->has_subtype())
+ {
+ if (j.m_value.binary->subtype() <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ write_number(static_cast<std::uint8_t>(0xd8));
+ write_number(static_cast<std::uint8_t>(j.m_value.binary->subtype()));
+ }
+ else if (j.m_value.binary->subtype() <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ write_number(static_cast<std::uint8_t>(0xd9));
+ write_number(static_cast<std::uint16_t>(j.m_value.binary->subtype()));
+ }
+ else if (j.m_value.binary->subtype() <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ write_number(static_cast<std::uint8_t>(0xda));
+ write_number(static_cast<std::uint32_t>(j.m_value.binary->subtype()));
+ }
+ else if (j.m_value.binary->subtype() <= (std::numeric_limits<std::uint64_t>::max)())
+ {
+ write_number(static_cast<std::uint8_t>(0xdb));
+ write_number(static_cast<std::uint64_t>(j.m_value.binary->subtype()));
+ }
+ }
+
+ // step 1: write control byte and the binary array size
+ const auto N = j.m_value.binary->size();
+ if (N <= 0x17)
+ {
+ write_number(static_cast<std::uint8_t>(0x40 + N));
+ }
+ else if (N <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ oa->write_character(to_char_type(0x58));
+ write_number(static_cast<std::uint8_t>(N));
+ }
+ else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ oa->write_character(to_char_type(0x59));
+ write_number(static_cast<std::uint16_t>(N));
+ }
+ else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ oa->write_character(to_char_type(0x5A));
+ write_number(static_cast<std::uint32_t>(N));
+ }
+ // LCOV_EXCL_START
+ else if (N <= (std::numeric_limits<std::uint64_t>::max)())
+ {
+ oa->write_character(to_char_type(0x5B));
+ write_number(static_cast<std::uint64_t>(N));
+ }
+ // LCOV_EXCL_STOP
+
+ // step 2: write each element
+ oa->write_characters(
+ reinterpret_cast<const CharType*>(j.m_value.binary->data()),
+ N);
+
+ break;
+ }
+
+ case value_t::object:
+ {
+ // step 1: write control byte and the object size
+ const auto N = j.m_value.object->size();
+ if (N <= 0x17)
+ {
+ write_number(static_cast<std::uint8_t>(0xA0 + N));
+ }
+ else if (N <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ oa->write_character(to_char_type(0xB8));
+ write_number(static_cast<std::uint8_t>(N));
+ }
+ else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ oa->write_character(to_char_type(0xB9));
+ write_number(static_cast<std::uint16_t>(N));
+ }
+ else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ oa->write_character(to_char_type(0xBA));
+ write_number(static_cast<std::uint32_t>(N));
+ }
+ // LCOV_EXCL_START
+ else if (N <= (std::numeric_limits<std::uint64_t>::max)())
+ {
+ oa->write_character(to_char_type(0xBB));
+ write_number(static_cast<std::uint64_t>(N));
+ }
+ // LCOV_EXCL_STOP
+
+ // step 2: write each element
+ for (const auto& el : *j.m_value.object)
+ {
+ write_cbor(el.first);
+ write_cbor(el.second);
+ }
+ break;
+ }
+
+ case value_t::discarded:
+ default:
+ break;
+ }
+ }
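+
+    // Behavior sketch (illustrative only, via the public to_cbor interface;
+    // values up to 0x17 pack into the initial byte, larger ones get a
+    // width-prefix byte as implemented above):
+    //
+    //     auto a = nlohmann::json::to_cbor(nlohmann::json(23));  // {0x17}
+    //     auto b = nlohmann::json::to_cbor(nlohmann::json(24));  // {0x18, 0x18}
+    //     auto c = nlohmann::json::to_cbor(nlohmann::json(-25)); // {0x38, 0x18}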
+
+ /*!
+ @param[in] j JSON value to serialize
+ */
+ void write_msgpack(const BasicJsonType& j)
+ {
+ switch (j.type())
+ {
+ case value_t::null: // nil
+ {
+ oa->write_character(to_char_type(0xC0));
+ break;
+ }
+
+ case value_t::boolean: // true and false
+ {
+ oa->write_character(j.m_value.boolean
+ ? to_char_type(0xC3)
+ : to_char_type(0xC2));
+ break;
+ }
+
+ case value_t::number_integer:
+ {
+ if (j.m_value.number_integer >= 0)
+ {
+ // MessagePack does not differentiate between positive
+                    // signed integers and unsigned integers. Therefore, we use
+ // the code from the value_t::number_unsigned case here.
+ if (j.m_value.number_unsigned < 128)
+ {
+ // positive fixnum
+ write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ // uint 8
+ oa->write_character(to_char_type(0xCC));
+ write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ // uint 16
+ oa->write_character(to_char_type(0xCD));
+ write_number(static_cast<std::uint16_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ // uint 32
+ oa->write_character(to_char_type(0xCE));
+ write_number(static_cast<std::uint32_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
+ {
+ // uint 64
+ oa->write_character(to_char_type(0xCF));
+ write_number(static_cast<std::uint64_t>(j.m_value.number_integer));
+ }
+ }
+ else
+ {
+ if (j.m_value.number_integer >= -32)
+ {
+ // negative fixnum
+ write_number(static_cast<std::int8_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_integer >= (std::numeric_limits<std::int8_t>::min)() &&
+ j.m_value.number_integer <= (std::numeric_limits<std::int8_t>::max)())
+ {
+ // int 8
+ oa->write_character(to_char_type(0xD0));
+ write_number(static_cast<std::int8_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_integer >= (std::numeric_limits<std::int16_t>::min)() &&
+ j.m_value.number_integer <= (std::numeric_limits<std::int16_t>::max)())
+ {
+ // int 16
+ oa->write_character(to_char_type(0xD1));
+ write_number(static_cast<std::int16_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_integer >= (std::numeric_limits<std::int32_t>::min)() &&
+ j.m_value.number_integer <= (std::numeric_limits<std::int32_t>::max)())
+ {
+ // int 32
+ oa->write_character(to_char_type(0xD2));
+ write_number(static_cast<std::int32_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_integer >= (std::numeric_limits<std::int64_t>::min)() &&
+ j.m_value.number_integer <= (std::numeric_limits<std::int64_t>::max)())
+ {
+ // int 64
+ oa->write_character(to_char_type(0xD3));
+ write_number(static_cast<std::int64_t>(j.m_value.number_integer));
+ }
+ }
+ break;
+ }
+
+ case value_t::number_unsigned:
+ {
+ if (j.m_value.number_unsigned < 128)
+ {
+ // positive fixnum
+ write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ // uint 8
+ oa->write_character(to_char_type(0xCC));
+ write_number(static_cast<std::uint8_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ // uint 16
+ oa->write_character(to_char_type(0xCD));
+ write_number(static_cast<std::uint16_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ // uint 32
+ oa->write_character(to_char_type(0xCE));
+ write_number(static_cast<std::uint32_t>(j.m_value.number_integer));
+ }
+ else if (j.m_value.number_unsigned <= (std::numeric_limits<std::uint64_t>::max)())
+ {
+ // uint 64
+ oa->write_character(to_char_type(0xCF));
+ write_number(static_cast<std::uint64_t>(j.m_value.number_integer));
+ }
+ break;
+ }
+
+ case value_t::number_float:
+ {
+ write_compact_float(j.m_value.number_float, detail::input_format_t::msgpack);
+ break;
+ }
+
+ case value_t::string:
+ {
+ // step 1: write control byte and the string length
+ const auto N = j.m_value.string->size();
+ if (N <= 31)
+ {
+ // fixstr
+ write_number(static_cast<std::uint8_t>(0xA0 | N));
+ }
+ else if (N <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ // str 8
+ oa->write_character(to_char_type(0xD9));
+ write_number(static_cast<std::uint8_t>(N));
+ }
+ else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ // str 16
+ oa->write_character(to_char_type(0xDA));
+ write_number(static_cast<std::uint16_t>(N));
+ }
+ else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ // str 32
+ oa->write_character(to_char_type(0xDB));
+ write_number(static_cast<std::uint32_t>(N));
+ }
+
+ // step 2: write the string
+ oa->write_characters(
+ reinterpret_cast<const CharType*>(j.m_value.string->c_str()),
+ j.m_value.string->size());
+ break;
+ }
+
+ case value_t::array:
+ {
+ // step 1: write control byte and the array size
+ const auto N = j.m_value.array->size();
+ if (N <= 15)
+ {
+ // fixarray
+ write_number(static_cast<std::uint8_t>(0x90 | N));
+ }
+ else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ // array 16
+ oa->write_character(to_char_type(0xDC));
+ write_number(static_cast<std::uint16_t>(N));
+ }
+ else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ // array 32
+ oa->write_character(to_char_type(0xDD));
+ write_number(static_cast<std::uint32_t>(N));
+ }
+
+ // step 2: write each element
+ for (const auto& el : *j.m_value.array)
+ {
+ write_msgpack(el);
+ }
+ break;
+ }
+
+ case value_t::binary:
+ {
+ // step 0: check whether the binary value carries a subtype; if so,
+ // the ext/fixext families are used instead of the bin family
+ const bool use_ext = j.m_value.binary->has_subtype();
+
+ // step 1: write control byte and the byte string length
+ const auto N = j.m_value.binary->size();
+ if (N <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ std::uint8_t output_type{};
+ bool fixed = true;
+ if (use_ext)
+ {
+ switch (N)
+ {
+ case 1:
+ output_type = 0xD4; // fixext 1
+ break;
+ case 2:
+ output_type = 0xD5; // fixext 2
+ break;
+ case 4:
+ output_type = 0xD6; // fixext 4
+ break;
+ case 8:
+ output_type = 0xD7; // fixext 8
+ break;
+ case 16:
+ output_type = 0xD8; // fixext 16
+ break;
+ default:
+ output_type = 0xC7; // ext 8
+ fixed = false;
+ break;
+ }
+
+ }
+ else
+ {
+ output_type = 0xC4; // bin 8
+ fixed = false;
+ }
+
+ oa->write_character(to_char_type(output_type));
+ if (!fixed)
+ {
+ write_number(static_cast<std::uint8_t>(N));
+ }
+ }
+ else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ std::uint8_t output_type = use_ext
+ ? 0xC8 // ext 16
+ : 0xC5; // bin 16
+
+ oa->write_character(to_char_type(output_type));
+ write_number(static_cast<std::uint16_t>(N));
+ }
+ else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ std::uint8_t output_type = use_ext
+ ? 0xC9 // ext 32
+ : 0xC6; // bin 32
+
+ oa->write_character(to_char_type(output_type));
+ write_number(static_cast<std::uint32_t>(N));
+ }
+
+ // step 1.5: if this is an ext type, write the subtype
+ if (use_ext)
+ {
+ write_number(static_cast<std::int8_t>(j.m_value.binary->subtype()));
+ }
+
+ // step 2: write the byte string
+ oa->write_characters(
+ reinterpret_cast<const CharType*>(j.m_value.binary->data()),
+ N);
+
+ break;
+ }
+
+ case value_t::object:
+ {
+ // step 1: write control byte and the object size
+ const auto N = j.m_value.object->size();
+ if (N <= 15)
+ {
+ // fixmap
+ write_number(static_cast<std::uint8_t>(0x80 | (N & 0xF)));
+ }
+ else if (N <= (std::numeric_limits<std::uint16_t>::max)())
+ {
+ // map 16
+ oa->write_character(to_char_type(0xDE));
+ write_number(static_cast<std::uint16_t>(N));
+ }
+ else if (N <= (std::numeric_limits<std::uint32_t>::max)())
+ {
+ // map 32
+ oa->write_character(to_char_type(0xDF));
+ write_number(static_cast<std::uint32_t>(N));
+ }
+
+ // step 2: write each element
+ for (const auto& el : *j.m_value.object)
+ {
+ write_msgpack(el.first);
+ write_msgpack(el.second);
+ }
+ break;
+ }
+
+ case value_t::discarded:
+ default:
+ break;
+ }
+ }
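+
+ // Example (illustrative, follows from the branches above): the JSON array
+ // [1, 300, "abc"] serializes to the MessagePack bytes
+ //   0x93 0x01 0xCD 0x01 0x2C 0xA3 'a' 'b' 'c'
+ // i.e. fixarray(3), positive fixnum 1, uint 16 for 300, fixstr(3) "abc".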
+
+ /*!
+ @param[in] j JSON value to serialize
+ @param[in] use_count whether to use '#' prefixes (optimized format)
+ @param[in] use_type whether to use '$' prefixes (optimized format)
+ @param[in] add_prefix whether prefixes need to be used for this value
+ */
+ void write_ubjson(const BasicJsonType& j, const bool use_count,
+ const bool use_type, const bool add_prefix = true)
+ {
+ switch (j.type())
+ {
+ case value_t::null:
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('Z'));
+ }
+ break;
+ }
+
+ case value_t::boolean:
+ {
+ if (add_prefix)
+ {
+ oa->write_character(j.m_value.boolean
+ ? to_char_type('T')
+ : to_char_type('F'));
+ }
+ break;
+ }
+
+ case value_t::number_integer:
+ {
+ write_number_with_ubjson_prefix(j.m_value.number_integer, add_prefix);
+ break;
+ }
+
+ case value_t::number_unsigned:
+ {
+ write_number_with_ubjson_prefix(j.m_value.number_unsigned, add_prefix);
+ break;
+ }
+
+ case value_t::number_float:
+ {
+ write_number_with_ubjson_prefix(j.m_value.number_float, add_prefix);
+ break;
+ }
+
+ case value_t::string:
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('S'));
+ }
+ write_number_with_ubjson_prefix(j.m_value.string->size(), true);
+ oa->write_characters(
+ reinterpret_cast<const CharType*>(j.m_value.string->c_str()),
+ j.m_value.string->size());
+ break;
+ }
+
+ case value_t::array:
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('['));
+ }
+
+ bool prefix_required = true;
+ if (use_type && !j.m_value.array->empty())
+ {
+ JSON_ASSERT(use_count);
+ const CharType first_prefix = ubjson_prefix(j.front());
+ const bool same_prefix = std::all_of(j.begin() + 1, j.end(),
+ [this, first_prefix](const BasicJsonType & v)
+ {
+ return ubjson_prefix(v) == first_prefix;
+ });
+
+ if (same_prefix)
+ {
+ prefix_required = false;
+ oa->write_character(to_char_type('$'));
+ oa->write_character(first_prefix);
+ }
+ }
+
+ if (use_count)
+ {
+ oa->write_character(to_char_type('#'));
+ write_number_with_ubjson_prefix(j.m_value.array->size(), true);
+ }
+
+ for (const auto& el : *j.m_value.array)
+ {
+ write_ubjson(el, use_count, use_type, prefix_required);
+ }
+
+ if (!use_count)
+ {
+ oa->write_character(to_char_type(']'));
+ }
+
+ break;
+ }
+
+ case value_t::binary:
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('['));
+ }
+
+ if (use_type && !j.m_value.binary->empty())
+ {
+ JSON_ASSERT(use_count);
+ oa->write_character(to_char_type('$'));
+ oa->write_character(to_char_type('U'));
+ }
+
+ if (use_count)
+ {
+ oa->write_character(to_char_type('#'));
+ write_number_with_ubjson_prefix(j.m_value.binary->size(), true);
+ }
+
+ if (use_type)
+ {
+ oa->write_characters(
+ reinterpret_cast<const CharType*>(j.m_value.binary->data()),
+ j.m_value.binary->size());
+ }
+ else
+ {
+ for (size_t i = 0; i < j.m_value.binary->size(); ++i)
+ {
+ oa->write_character(to_char_type('U'));
+ oa->write_character(j.m_value.binary->data()[i]);
+ }
+ }
+
+ if (!use_count)
+ {
+ oa->write_character(to_char_type(']'));
+ }
+
+ break;
+ }
+
+ case value_t::object:
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('{'));
+ }
+
+ bool prefix_required = true;
+ if (use_type && !j.m_value.object->empty())
+ {
+ JSON_ASSERT(use_count);
+ const CharType first_prefix = ubjson_prefix(j.front());
+ const bool same_prefix = std::all_of(j.begin(), j.end(),
+ [this, first_prefix](const BasicJsonType & v)
+ {
+ return ubjson_prefix(v) == first_prefix;
+ });
+
+ if (same_prefix)
+ {
+ prefix_required = false;
+ oa->write_character(to_char_type('$'));
+ oa->write_character(first_prefix);
+ }
+ }
+
+ if (use_count)
+ {
+ oa->write_character(to_char_type('#'));
+ write_number_with_ubjson_prefix(j.m_value.object->size(), true);
+ }
+
+ for (const auto& el : *j.m_value.object)
+ {
+ write_number_with_ubjson_prefix(el.first.size(), true);
+ oa->write_characters(
+ reinterpret_cast<const CharType*>(el.first.c_str()),
+ el.first.size());
+ write_ubjson(el.second, use_count, use_type, prefix_required);
+ }
+
+ if (!use_count)
+ {
+ oa->write_character(to_char_type('}'));
+ }
+
+ break;
+ }
+
+ case value_t::discarded:
+ default:
+ break;
+ }
+ }
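+
+ // Example (illustrative): with use_count and use_type enabled, the array
+ // [1, 2, 3] is written in the optimized UBJSON container format as
+ //   '[' '$' 'i' '#' 'i' 0x03 0x01 0x02 0x03
+ // The shared int8 type follows '$', the count follows '#', the elements
+ // are written without per-element prefixes, and the closing ']' is
+ // omitted because a count was given.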
+
+ private:
+ //////////
+ // BSON //
+ //////////
+
+ /*!
+ @return The size of a BSON document entry header, including the id marker
+ and the entry name size (and its null-terminator).
+ */
+ static std::size_t calc_bson_entry_header_size(const string_t& name, const BasicJsonType& j)
+ {
+ const auto it = name.find(static_cast<typename string_t::value_type>(0));
+ if (JSON_HEDLEY_UNLIKELY(it != BasicJsonType::string_t::npos))
+ {
+ JSON_THROW(out_of_range::create(409, "BSON key cannot contain code point U+0000 (at byte " + std::to_string(it) + ")", j));
+ static_cast<void>(j); // keep j "used" when JSON_THROW discards it (exceptions disabled)
+ }
+
+ return /*id*/ 1ul + name.size() + /*zero-terminator*/1u;
+ }
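+
+ // Example (illustrative): for the key "a" the entry header occupies
+ // 1 (element type id) + 1 ('a') + 1 (null terminator) = 3 bytes.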
+
+ /*!
+ @brief Writes the given @a element_type and @a name to the output adapter
+ */
+ void write_bson_entry_header(const string_t& name,
+ const std::uint8_t element_type)
+ {
+ oa->write_character(to_char_type(element_type)); // element type
+ oa->write_characters(
+ reinterpret_cast<const CharType*>(name.c_str()),
+ name.size() + 1u);
+ }
+
+ /*!
+ @brief Writes a BSON element with key @a name and boolean value @a value
+ */
+ void write_bson_boolean(const string_t& name,
+ const bool value)
+ {
+ write_bson_entry_header(name, 0x08);
+ oa->write_character(value ? to_char_type(0x01) : to_char_type(0x00));
+ }
+
+ /*!
+ @brief Writes a BSON element with key @a name and double value @a value
+ */
+ void write_bson_double(const string_t& name,
+ const double value)
+ {
+ write_bson_entry_header(name, 0x01);
+ write_number<double, true>(value);
+ }
+
+ /*!
+ @return The size of the BSON-encoded string in @a value
+ */
+ static std::size_t calc_bson_string_size(const string_t& value)
+ {
+ return sizeof(std::int32_t) + value.size() + 1ul;
+ }
+
+ /*!
+ @brief Writes a BSON element with key @a name and string value @a value
+ */
+ void write_bson_string(const string_t& name,
+ const string_t& value)
+ {
+ write_bson_entry_header(name, 0x02);
+
+ write_number<std::int32_t, true>(static_cast<std::int32_t>(value.size() + 1ul));
+ oa->write_characters(
+ reinterpret_cast<const CharType*>(value.c_str()),
+ value.size() + 1);
+ }
+
+ /*!
+ @brief Writes a BSON element with key @a name and null value
+ */
+ void write_bson_null(const string_t& name)
+ {
+ write_bson_entry_header(name, 0x0A);
+ }
+
+ /*!
+ @return The size of the BSON-encoded integer @a value
+ */
+ static std::size_t calc_bson_integer_size(const std::int64_t value)
+ {
+ return (std::numeric_limits<std::int32_t>::min)() <= value && value <= (std::numeric_limits<std::int32_t>::max)()
+ ? sizeof(std::int32_t)
+ : sizeof(std::int64_t);
+ }
+
+ /*!
+ @brief Writes a BSON element with key @a name and integer @a value
+ */
+ void write_bson_integer(const string_t& name,
+ const std::int64_t value)
+ {
+ if ((std::numeric_limits<std::int32_t>::min)() <= value && value <= (std::numeric_limits<std::int32_t>::max)())
+ {
+ write_bson_entry_header(name, 0x10); // int32
+ write_number<std::int32_t, true>(static_cast<std::int32_t>(value));
+ }
+ else
+ {
+ write_bson_entry_header(name, 0x12); // int64
+ write_number<std::int64_t, true>(static_cast<std::int64_t>(value));
+ }
+ }
+
+ /*!
+ @return The size of the BSON-encoded unsigned integer @a value
+ */
+ static constexpr std::size_t calc_bson_unsigned_size(const std::uint64_t value) noexcept
+ {
+ return (value <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
+ ? sizeof(std::int32_t)
+ : sizeof(std::int64_t);
+ }
+
+ /*!
+ @brief Writes a BSON element with key @a name and the unsigned value of @a j
+ */
+ void write_bson_unsigned(const string_t& name,
+ const BasicJsonType& j)
+ {
+ if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
+ {
+ write_bson_entry_header(name, 0x10 /* int32 */);
+ write_number<std::int32_t, true>(static_cast<std::int32_t>(j.m_value.number_unsigned));
+ }
+ else if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
+ {
+ write_bson_entry_header(name, 0x12 /* int64 */);
+ write_number<std::int64_t, true>(static_cast<std::int64_t>(j.m_value.number_unsigned));
+ }
+ else
+ {
+ JSON_THROW(out_of_range::create(407, "integer number " + std::to_string(j.m_value.number_unsigned) + " cannot be represented by BSON as it does not fit int64", j));
+ }
+ }
+
+ /*!
+ @brief Writes a BSON element with key @a name and object @a value
+ */
+ void write_bson_object_entry(const string_t& name,
+ const typename BasicJsonType::object_t& value)
+ {
+ write_bson_entry_header(name, 0x03); // object
+ write_bson_object(value);
+ }
+
+ /*!
+ @return The size of the BSON-encoded array @a value
+ */
+ static std::size_t calc_bson_array_size(const typename BasicJsonType::array_t& value)
+ {
+ std::size_t array_index = 0ul;
+
+ const std::size_t embedded_document_size = std::accumulate(std::begin(value), std::end(value), std::size_t(0), [&array_index](std::size_t result, const typename BasicJsonType::array_t::value_type & el)
+ {
+ return result + calc_bson_element_size(std::to_string(array_index++), el);
+ });
+
+ return sizeof(std::int32_t) + embedded_document_size + 1ul;
+ }
+
+ /*!
+ @return The size of the BSON-encoded binary array @a value
+ */
+ static std::size_t calc_bson_binary_size(const typename BasicJsonType::binary_t& value)
+ {
+ return sizeof(std::int32_t) + value.size() + 1ul;
+ }
+
+ /*!
+ @brief Writes a BSON element with key @a name and array @a value
+ */
+ void write_bson_array(const string_t& name,
+ const typename BasicJsonType::array_t& value)
+ {
+ write_bson_entry_header(name, 0x04); // array
+ write_number<std::int32_t, true>(static_cast<std::int32_t>(calc_bson_array_size(value)));
+
+ std::size_t array_index = 0ul;
+
+ for (const auto& el : value)
+ {
+ write_bson_element(std::to_string(array_index++), el);
+ }
+
+ oa->write_character(to_char_type(0x00));
+ }
+
+ /*!
+ @brief Writes a BSON element with key @a name and binary value @a value
+ */
+ void write_bson_binary(const string_t& name,
+ const binary_t& value)
+ {
+ write_bson_entry_header(name, 0x05);
+
+ write_number<std::int32_t, true>(static_cast<std::int32_t>(value.size()));
+ write_number(value.has_subtype() ? static_cast<std::uint8_t>(value.subtype()) : std::uint8_t(0x00));
+
+ oa->write_characters(reinterpret_cast<const CharType*>(value.data()), value.size());
+ }
+
+ /*!
+ @brief Calculates the size necessary to serialize the JSON value @a j with its @a name
+ @return The calculated size for the BSON document entry for @a j with the given @a name.
+ */
+ static std::size_t calc_bson_element_size(const string_t& name,
+ const BasicJsonType& j)
+ {
+ const auto header_size = calc_bson_entry_header_size(name, j);
+ switch (j.type())
+ {
+ case value_t::object:
+ return header_size + calc_bson_object_size(*j.m_value.object);
+
+ case value_t::array:
+ return header_size + calc_bson_array_size(*j.m_value.array);
+
+ case value_t::binary:
+ return header_size + calc_bson_binary_size(*j.m_value.binary);
+
+ case value_t::boolean:
+ return header_size + 1ul;
+
+ case value_t::number_float:
+ return header_size + 8ul;
+
+ case value_t::number_integer:
+ return header_size + calc_bson_integer_size(j.m_value.number_integer);
+
+ case value_t::number_unsigned:
+ return header_size + calc_bson_unsigned_size(j.m_value.number_unsigned);
+
+ case value_t::string:
+ return header_size + calc_bson_string_size(*j.m_value.string);
+
+ case value_t::null:
+ return header_size + 0ul;
+
+ // LCOV_EXCL_START
+ case value_t::discarded:
+ default:
+ JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert)
+ return 0ul;
+ // LCOV_EXCL_STOP
+ }
+ }
+
+ /*!
+ @brief Serializes the JSON value @a j to BSON and associates it with the
+ key @a name.
+ @param name The name to associate with the JSON entity @a j within the
+ current BSON document
+ */
+ void write_bson_element(const string_t& name,
+ const BasicJsonType& j)
+ {
+ switch (j.type())
+ {
+ case value_t::object:
+ return write_bson_object_entry(name, *j.m_value.object);
+
+ case value_t::array:
+ return write_bson_array(name, *j.m_value.array);
+
+ case value_t::binary:
+ return write_bson_binary(name, *j.m_value.binary);
+
+ case value_t::boolean:
+ return write_bson_boolean(name, j.m_value.boolean);
+
+ case value_t::number_float:
+ return write_bson_double(name, j.m_value.number_float);
+
+ case value_t::number_integer:
+ return write_bson_integer(name, j.m_value.number_integer);
+
+ case value_t::number_unsigned:
+ return write_bson_unsigned(name, j);
+
+ case value_t::string:
+ return write_bson_string(name, *j.m_value.string);
+
+ case value_t::null:
+ return write_bson_null(name);
+
+ // LCOV_EXCL_START
+ case value_t::discarded:
+ default:
+ JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert)
+ return;
+ // LCOV_EXCL_STOP
+ }
+ }
+
+ /*!
+ @brief Calculates the size of the BSON serialization of the given
+ JSON object @a value.
+ @param[in] value JSON value to serialize
+ @pre value.type() == value_t::object
+ */
+ static std::size_t calc_bson_object_size(const typename BasicJsonType::object_t& value)
+ {
+ std::size_t document_size = std::accumulate(value.begin(), value.end(), std::size_t(0),
+ [](size_t result, const typename BasicJsonType::object_t::value_type & el)
+ {
+ return result += calc_bson_element_size(el.first, el.second);
+ });
+
+ return sizeof(std::int32_t) + document_size + 1ul;
+ }
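+
+ // Example (illustrative): for {"a": 1} the document size is
+ //   4 (int32 length) + 7 (element: 0x10 'a' 0x00 + int32 value) + 1 (trailing 0x00) = 12,
+ // matching the canonical BSON encoding 0C 00 00 00 10 61 00 01 00 00 00 00.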
+
+ /*!
+ @param[in] value JSON value to serialize
+ @pre value.type() == value_t::object
+ */
+ void write_bson_object(const typename BasicJsonType::object_t& value)
+ {
+ write_number<std::int32_t, true>(static_cast<std::int32_t>(calc_bson_object_size(value)));
+
+ for (const auto& el : value)
+ {
+ write_bson_element(el.first, el.second);
+ }
+
+ oa->write_character(to_char_type(0x00));
+ }
+
+ //////////
+ // CBOR //
+ //////////
+
+ static constexpr CharType get_cbor_float_prefix(float /*unused*/)
+ {
+ return to_char_type(0xFA); // Single-Precision Float
+ }
+
+ static constexpr CharType get_cbor_float_prefix(double /*unused*/)
+ {
+ return to_char_type(0xFB); // Double-Precision Float
+ }
+
+ /////////////
+ // MsgPack //
+ /////////////
+
+ static constexpr CharType get_msgpack_float_prefix(float /*unused*/)
+ {
+ return to_char_type(0xCA); // float 32
+ }
+
+ static constexpr CharType get_msgpack_float_prefix(double /*unused*/)
+ {
+ return to_char_type(0xCB); // float 64
+ }
+
+ ////////////
+ // UBJSON //
+ ////////////
+
+ // UBJSON: write number (floating point)
+ template<typename NumberType, typename std::enable_if<
+ std::is_floating_point<NumberType>::value, int>::type = 0>
+ void write_number_with_ubjson_prefix(const NumberType n,
+ const bool add_prefix)
+ {
+ if (add_prefix)
+ {
+ oa->write_character(get_ubjson_float_prefix(n));
+ }
+ write_number(n);
+ }
+
+ // UBJSON: write number (unsigned integer)
+ template<typename NumberType, typename std::enable_if<
+ std::is_unsigned<NumberType>::value, int>::type = 0>
+ void write_number_with_ubjson_prefix(const NumberType n,
+ const bool add_prefix)
+ {
+ if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int8_t>::max)()))
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('i')); // int8
+ }
+ write_number(static_cast<std::uint8_t>(n));
+ }
+ else if (n <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('U')); // uint8
+ }
+ write_number(static_cast<std::uint8_t>(n));
+ }
+ else if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int16_t>::max)()))
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('I')); // int16
+ }
+ write_number(static_cast<std::int16_t>(n));
+ }
+ else if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('l')); // int32
+ }
+ write_number(static_cast<std::int32_t>(n));
+ }
+ else if (n <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('L')); // int64
+ }
+ write_number(static_cast<std::int64_t>(n));
+ }
+ else
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('H')); // high-precision number
+ }
+
+ const auto number = BasicJsonType(n).dump();
+ write_number_with_ubjson_prefix(number.size(), true);
+ for (std::size_t i = 0; i < number.size(); ++i)
+ {
+ oa->write_character(to_char_type(static_cast<std::uint8_t>(number[i])));
+ }
+ }
+ }
+
+ // UBJSON: write number (signed integer)
+ template < typename NumberType, typename std::enable_if <
+ std::is_signed<NumberType>::value&&
+ !std::is_floating_point<NumberType>::value, int >::type = 0 >
+ void write_number_with_ubjson_prefix(const NumberType n,
+ const bool add_prefix)
+ {
+ if ((std::numeric_limits<std::int8_t>::min)() <= n && n <= (std::numeric_limits<std::int8_t>::max)())
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('i')); // int8
+ }
+ write_number(static_cast<std::int8_t>(n));
+ }
+ else if (static_cast<std::int64_t>((std::numeric_limits<std::uint8_t>::min)()) <= n && n <= static_cast<std::int64_t>((std::numeric_limits<std::uint8_t>::max)()))
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('U')); // uint8
+ }
+ write_number(static_cast<std::uint8_t>(n));
+ }
+ else if ((std::numeric_limits<std::int16_t>::min)() <= n && n <= (std::numeric_limits<std::int16_t>::max)())
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('I')); // int16
+ }
+ write_number(static_cast<std::int16_t>(n));
+ }
+ else if ((std::numeric_limits<std::int32_t>::min)() <= n && n <= (std::numeric_limits<std::int32_t>::max)())
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('l')); // int32
+ }
+ write_number(static_cast<std::int32_t>(n));
+ }
+ else if ((std::numeric_limits<std::int64_t>::min)() <= n && n <= (std::numeric_limits<std::int64_t>::max)())
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('L')); // int64
+ }
+ write_number(static_cast<std::int64_t>(n));
+ }
+ // LCOV_EXCL_START
+ else
+ {
+ if (add_prefix)
+ {
+ oa->write_character(to_char_type('H')); // high-precision number
+ }
+
+ const auto number = BasicJsonType(n).dump();
+ write_number_with_ubjson_prefix(number.size(), true);
+ for (std::size_t i = 0; i < number.size(); ++i)
+ {
+ oa->write_character(to_char_type(static_cast<std::uint8_t>(number[i])));
+ }
+ }
+ // LCOV_EXCL_STOP
+ }
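+
+ // Example (illustrative): n = 1000 falls through the int8 and uint8 checks
+ // and is written as 'I' followed by the two big-endian bytes 0x03 0xE8;
+ // n = -1 fits int8 and is written as 'i' 0xFF.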
+
+ /*!
+ @brief determine the type prefix of container values
+ */
+ CharType ubjson_prefix(const BasicJsonType& j) const noexcept
+ {
+ switch (j.type())
+ {
+ case value_t::null:
+ return 'Z';
+
+ case value_t::boolean:
+ return j.m_value.boolean ? 'T' : 'F';
+
+ case value_t::number_integer:
+ {
+ if ((std::numeric_limits<std::int8_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int8_t>::max)())
+ {
+ return 'i';
+ }
+ if ((std::numeric_limits<std::uint8_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::uint8_t>::max)())
+ {
+ return 'U';
+ }
+ if ((std::numeric_limits<std::int16_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int16_t>::max)())
+ {
+ return 'I';
+ }
+ if ((std::numeric_limits<std::int32_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int32_t>::max)())
+ {
+ return 'l';
+ }
+ if ((std::numeric_limits<std::int64_t>::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits<std::int64_t>::max)())
+ {
+ return 'L';
+ }
+ // anything else is treated as high-precision number
+ return 'H'; // LCOV_EXCL_LINE
+ }
+
+ case value_t::number_unsigned:
+ {
+ if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int8_t>::max)()))
+ {
+ return 'i';
+ }
+ if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::uint8_t>::max)()))
+ {
+ return 'U';
+ }
+ if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int16_t>::max)()))
+ {
+ return 'I';
+ }
+ if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int32_t>::max)()))
+ {
+ return 'l';
+ }
+ if (j.m_value.number_unsigned <= static_cast<std::uint64_t>((std::numeric_limits<std::int64_t>::max)()))
+ {
+ return 'L';
+ }
+ // anything else is treated as high-precision number
+ return 'H'; // LCOV_EXCL_LINE
+ }
+
+ case value_t::number_float:
+ return get_ubjson_float_prefix(j.m_value.number_float);
+
+ case value_t::string:
+ return 'S';
+
+ case value_t::array: // fallthrough
+ case value_t::binary:
+ return '[';
+
+ case value_t::object:
+ return '{';
+
+ case value_t::discarded:
+ default: // discarded values
+ return 'N';
+ }
+ }
+
+ static constexpr CharType get_ubjson_float_prefix(float /*unused*/)
+ {
+ return 'd'; // float 32
+ }
+
+ static constexpr CharType get_ubjson_float_prefix(double /*unused*/)
+ {
+ return 'D'; // float 64
+ }
+
+ ///////////////////////
+ // Utility functions //
+ ///////////////////////
+
+ /*
+ @brief write a number to the output
+ @param[in] n number of type @a NumberType
+ @tparam NumberType the type of the number
+ @tparam OutputIsLittleEndian Set to true if output data is
+ required to be little endian
+
+ @note This function needs to respect the system's endianness, because bytes
+ in CBOR, MessagePack, and UBJSON are stored in network order (big
+ endian) and therefore need reordering on little-endian systems.
+ */
+ template<typename NumberType, bool OutputIsLittleEndian = false>
+ void write_number(const NumberType n)
+ {
+ // step 1: write number to array of length NumberType
+ std::array<CharType, sizeof(NumberType)> vec{};
+ std::memcpy(vec.data(), &n, sizeof(NumberType));
+
+ // step 2: write array to output (with possible reordering)
+ if (is_little_endian != OutputIsLittleEndian)
+ {
+ // reverse byte order prior to conversion if necessary
+ std::reverse(vec.begin(), vec.end());
+ }
+
+ oa->write_characters(vec.data(), sizeof(NumberType));
+ }
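+
+ // Example (illustrative): on a little-endian host,
+ // write_number<std::uint32_t>(0x11223344) memcpy's the bytes 44 33 22 11,
+ // reverses them (OutputIsLittleEndian defaults to false), and emits the
+ // network-order sequence 11 22 33 44.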
+
+ void write_compact_float(const number_float_t n, detail::input_format_t format)
+ {
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+ if (static_cast<double>(n) >= static_cast<double>(std::numeric_limits<float>::lowest()) &&
+ static_cast<double>(n) <= static_cast<double>((std::numeric_limits<float>::max)()) &&
+ static_cast<double>(static_cast<float>(n)) == static_cast<double>(n))
+ {
+ oa->write_character(format == detail::input_format_t::cbor
+ ? get_cbor_float_prefix(static_cast<float>(n))
+ : get_msgpack_float_prefix(static_cast<float>(n)));
+ write_number(static_cast<float>(n));
+ }
+ else
+ {
+ oa->write_character(format == detail::input_format_t::cbor
+ ? get_cbor_float_prefix(n)
+ : get_msgpack_float_prefix(n));
+ write_number(n);
+ }
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+ }
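+
+ // Example (illustrative): 0.5 survives the double -> float -> double round
+ // trip, so it is emitted as float32 (prefix 0xCA for MessagePack, 0xFA for
+ // CBOR) plus 4 bytes; 0.1 does not, so it is emitted as float64 (prefix
+ // 0xCB or 0xFB) plus 8 bytes.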
+
+ public:
+ // The following to_char_type functions implement the conversion
+ // between uint8_t and CharType. In case CharType is not unsigned,
+ // such a conversion is required to allow values greater than 127.
+ // See <https://github.com/nlohmann/json/issues/1286> for a discussion.
+ template < typename C = CharType,
+ enable_if_t < std::is_signed<C>::value && std::is_signed<char>::value > * = nullptr >
+ static constexpr CharType to_char_type(std::uint8_t x) noexcept
+ {
+ return *reinterpret_cast<char*>(&x);
+ }
+
+ template < typename C = CharType,
+ enable_if_t < std::is_signed<C>::value && std::is_unsigned<char>::value > * = nullptr >
+ static CharType to_char_type(std::uint8_t x) noexcept
+ {
+ static_assert(sizeof(std::uint8_t) == sizeof(CharType), "size of CharType must be equal to std::uint8_t");
+ static_assert(std::is_trivial<CharType>::value, "CharType must be trivial");
+ CharType result;
+ std::memcpy(&result, &x, sizeof(x));
+ return result;
+ }
+
+ template<typename C = CharType,
+ enable_if_t<std::is_unsigned<C>::value>* = nullptr>
+ static constexpr CharType to_char_type(std::uint8_t x) noexcept
+ {
+ return x;
+ }
+
+ template < typename InputCharType, typename C = CharType,
+ enable_if_t <
+ std::is_signed<C>::value &&
+ std::is_signed<char>::value &&
+ std::is_same<char, typename std::remove_cv<InputCharType>::type>::value
+ > * = nullptr >
+ static constexpr CharType to_char_type(InputCharType x) noexcept
+ {
+ return x;
+ }
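+
+ // Example (illustrative): with CharType = char on a platform where char is
+ // signed, to_char_type(0xD0) preserves the bit pattern 0xD0 and yields the
+ // char value -48, which the output adapter writes back as the byte 0xD0.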
+
+ private:
+ /// whether we can assume little endianness
+ const bool is_little_endian = little_endianess();
+
+ /// the output
+ output_adapter_t<CharType> oa = nullptr;
+};
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/output/output_adapters.hpp>
+
+// #include <nlohmann/detail/output/serializer.hpp>
+
+
+#include <algorithm> // reverse, remove, fill, find, none_of
+#include <array> // array
+#include <clocale> // localeconv, lconv
+#include <cmath> // labs, isfinite, isnan, signbit
+#include <cstddef> // size_t, ptrdiff_t
+#include <cstdint> // uint8_t
+#include <cstdio> // snprintf
+#include <limits> // numeric_limits
+#include <string> // string, char_traits
+#include <type_traits> // is_same
+#include <utility> // move
+
+// #include <nlohmann/detail/conversions/to_chars.hpp>
+
+
+#include <array> // array
+#include <cmath> // signbit, isfinite
+#include <cstdint> // intN_t, uintN_t
+#include <cstring> // memcpy, memmove
+#include <limits> // numeric_limits
+#include <type_traits> // conditional
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+
+/*!
+@brief implements the Grisu2 algorithm for binary to decimal floating-point
+conversion.
+
+This implementation is a slightly modified version of the reference
+implementation which may be obtained from
+http://florian.loitsch.com/publications (bench.tar.gz).
+
+The code is distributed under the MIT license, Copyright (c) 2009 Florian Loitsch.
+
+For a detailed description of the algorithm see:
+
+[1] Loitsch, "Printing Floating-Point Numbers Quickly and Accurately with
+ Integers", Proceedings of the ACM SIGPLAN 2010 Conference on Programming
+ Language Design and Implementation, PLDI 2010
+[2] Burger, Dybvig, "Printing Floating-Point Numbers Quickly and Accurately",
+ Proceedings of the ACM SIGPLAN 1996 Conference on Programming Language
+ Design and Implementation, PLDI 1996
+*/
+namespace dtoa_impl
+{
+
+template<typename Target, typename Source>
+Target reinterpret_bits(const Source source)
+{
+ static_assert(sizeof(Target) == sizeof(Source), "size mismatch");
+
+ Target target;
+ std::memcpy(&target, &source, sizeof(Source));
+ return target;
+}
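+
+// Example (illustrative): reinterpret_bits<std::uint32_t>(1.0f) yields
+// 0x3F800000, the IEEE-754 single-precision bit pattern of 1.0
+// (sign 0, biased exponent 127, fraction 0).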
+
+struct diyfp // f * 2^e
+{
+ static constexpr int kPrecision = 64; // = q
+
+ std::uint64_t f = 0;
+ int e = 0;
+
+ constexpr diyfp(std::uint64_t f_, int e_) noexcept : f(f_), e(e_) {}
+
+ /*!
+ @brief returns x - y
+ @pre x.e == y.e and x.f >= y.f
+ */
+ static diyfp sub(const diyfp& x, const diyfp& y) noexcept
+ {
+ JSON_ASSERT(x.e == y.e);
+ JSON_ASSERT(x.f >= y.f);
+
+ return {x.f - y.f, x.e};
+ }
+
+ /*!
+ @brief returns x * y
+ @note The result is rounded. (Only the upper q bits are returned.)
+ */
+ static diyfp mul(const diyfp& x, const diyfp& y) noexcept
+ {
+ static_assert(kPrecision == 64, "internal error");
+
+ // Computes:
+ // f = round((x.f * y.f) / 2^q)
+ // e = x.e + y.e + q
+
+ // Emulate the 64-bit * 64-bit multiplication:
+ //
+ // p = u * v
+ // = (u_lo + 2^32 u_hi) (v_lo + 2^32 v_hi)
+ // = (u_lo v_lo ) + 2^32 ((u_lo v_hi ) + (u_hi v_lo )) + 2^64 (u_hi v_hi )
+ // = (p0 ) + 2^32 ((p1 ) + (p2 )) + 2^64 (p3 )
+ // = (p0_lo + 2^32 p0_hi) + 2^32 ((p1_lo + 2^32 p1_hi) + (p2_lo + 2^32 p2_hi)) + 2^64 (p3 )
+ // = (p0_lo ) + 2^32 (p0_hi + p1_lo + p2_lo ) + 2^64 (p1_hi + p2_hi + p3)
+ // = (p0_lo ) + 2^32 (Q ) + 2^64 (H )
+ // = (p0_lo ) + 2^32 (Q_lo + 2^32 Q_hi ) + 2^64 (H )
+ //
+ // (Since Q might be larger than 2^32 - 1)
+ //
+ // = (p0_lo + 2^32 Q_lo) + 2^64 (Q_hi + H)
+ //
+ // (Q_hi + H does not overflow a 64-bit int)
+ //
+ // = p_lo + 2^64 p_hi
+
+ const std::uint64_t u_lo = x.f & 0xFFFFFFFFu;
+ const std::uint64_t u_hi = x.f >> 32u;
+ const std::uint64_t v_lo = y.f & 0xFFFFFFFFu;
+ const std::uint64_t v_hi = y.f >> 32u;
+
+ const std::uint64_t p0 = u_lo * v_lo;
+ const std::uint64_t p1 = u_lo * v_hi;
+ const std::uint64_t p2 = u_hi * v_lo;
+ const std::uint64_t p3 = u_hi * v_hi;
+
+ const std::uint64_t p0_hi = p0 >> 32u;
+ const std::uint64_t p1_lo = p1 & 0xFFFFFFFFu;
+ const std::uint64_t p1_hi = p1 >> 32u;
+ const std::uint64_t p2_lo = p2 & 0xFFFFFFFFu;
+ const std::uint64_t p2_hi = p2 >> 32u;
+
+ std::uint64_t Q = p0_hi + p1_lo + p2_lo;
+
+ // The full product might now be computed as
+ //
+ // p_hi = p3 + p2_hi + p1_hi + (Q >> 32)
+ // p_lo = p0_lo + (Q << 32)
+ //
+ // But in this particular case here, the full p_lo is not required.
+ // Effectively we only need to add the highest bit in p_lo to p_hi (and
+ // Q_hi + 1 does not overflow).
+
+ Q += std::uint64_t{1} << (64u - 32u - 1u); // round, ties up
+
+ const std::uint64_t h = p3 + p2_hi + p1_hi + (Q >> 32u);
+
+ return {h, x.e + y.e + 64};
+ }
+
+ /*!
+ @brief normalize x such that the significand is >= 2^(q-1)
+ @pre x.f != 0
+ */
+ static diyfp normalize(diyfp x) noexcept
+ {
+ JSON_ASSERT(x.f != 0);
+
+ while ((x.f >> 63u) == 0)
+ {
+ x.f <<= 1u;
+ x.e--;
+ }
+
+ return x;
+ }
+
+ /*!
+ @brief normalize x such that the result has the exponent target_exponent
+ @pre x.e >= target_exponent and the upper x.e - target_exponent bits of x.f must be zero.
+ */
+ static diyfp normalize_to(const diyfp& x, const int target_exponent) noexcept
+ {
+ const int delta = x.e - target_exponent;
+
+ JSON_ASSERT(delta >= 0);
+ JSON_ASSERT(((x.f << delta) >> delta) == x.f);
+
+ return {x.f << delta, target_exponent};
+ }
+};
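+
+// Example (illustrative): normalize(diyfp(1, 0)) shifts the significand up
+// 63 times and returns diyfp(1ULL << 63, -63), which still represents
+// 1 * 2^0 = 2^63 * 2^-63.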
+
+struct boundaries
+{
+ diyfp w;
+ diyfp minus;
+ diyfp plus;
+};
+
+/*!
+Compute the (normalized) diyfp representing the input number 'value' and its
+boundaries.
+
+@pre value must be finite and positive
+*/
+template<typename FloatType>
+boundaries compute_boundaries(FloatType value)
+{
+ JSON_ASSERT(std::isfinite(value));
+ JSON_ASSERT(value > 0);
+
+ // Convert the IEEE representation into a diyfp.
+ //
+ // If v is denormal:
+ // value = 0.F * 2^(1 - bias) = ( F) * 2^(1 - bias - (p-1))
+ // If v is normalized:
+ // value = 1.F * 2^(E - bias) = (2^(p-1) + F) * 2^(E - bias - (p-1))
+
+ static_assert(std::numeric_limits<FloatType>::is_iec559,
+ "internal error: dtoa_short requires an IEEE-754 floating-point implementation");
+
+ constexpr int kPrecision = std::numeric_limits<FloatType>::digits; // = p (includes the hidden bit)
+ constexpr int kBias = std::numeric_limits<FloatType>::max_exponent - 1 + (kPrecision - 1);
+ constexpr int kMinExp = 1 - kBias;
+ constexpr std::uint64_t kHiddenBit = std::uint64_t{1} << (kPrecision - 1); // = 2^(p-1)
+
+ using bits_type = typename std::conditional<kPrecision == 24, std::uint32_t, std::uint64_t >::type;
+
+ const auto bits = static_cast<std::uint64_t>(reinterpret_bits<bits_type>(value));
+ const std::uint64_t E = bits >> (kPrecision - 1);
+ const std::uint64_t F = bits & (kHiddenBit - 1);
+
+ const bool is_denormal = E == 0;
+ const diyfp v = is_denormal
+ ? diyfp(F, kMinExp)
+ : diyfp(F + kHiddenBit, static_cast<int>(E) - kBias);
+
+ // Compute the boundaries m- and m+ of the floating-point value
+ // v = f * 2^e.
+ //
+ // Determine v- and v+, the floating-point predecessor and successor of v,
+ // respectively.
+ //
+ // v- = v - 2^e if f != 2^(p-1) or e == e_min (A)
+ // = v - 2^(e-1) if f == 2^(p-1) and e > e_min (B)
+ //
+ // v+ = v + 2^e
+ //
+ // Let m- = (v- + v) / 2 and m+ = (v + v+) / 2. All real numbers _strictly_
+ // between m- and m+ round to v, regardless of how the input rounding
+ // algorithm breaks ties.
+ //
+ // ---+-------------+-------------+-------------+-------------+--- (A)
+ // v- m- v m+ v+
+ //
+ // -----------------+------+------+-------------+-------------+--- (B)
+ // v- m- v m+ v+
+
+ const bool lower_boundary_is_closer = F == 0 && E > 1;
+ const diyfp m_plus = diyfp(2 * v.f + 1, v.e - 1);
+ const diyfp m_minus = lower_boundary_is_closer
+ ? diyfp(4 * v.f - 1, v.e - 2) // (B)
+ : diyfp(2 * v.f - 1, v.e - 1); // (A)
+
+ // Determine the normalized w+ = m+.
+ const diyfp w_plus = diyfp::normalize(m_plus);
+
+ // Determine w- = m- such that e_(w-) = e_(w+).
+ const diyfp w_minus = diyfp::normalize_to(m_minus, w_plus.e);
+
+ return {diyfp::normalize(v), w_minus, w_plus};
+}
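+
+// Example (illustrative): for value = 1.0 (double), E = 1023 and F = 0, so
+// v = diyfp(2^52, -52). Because F == 0, the predecessor of v is only half a
+// ulp away and branch (B) applies: m- = diyfp(4 * 2^52 - 1, -54), while
+// m+ = diyfp(2 * 2^52 + 1, -53).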
+
+// Given normalized diyfp w, Grisu needs to find a (normalized) cached
+// power-of-ten c, such that the exponent of the product c * w = f * 2^e lies
+// within a certain range [alpha, gamma] (Definition 3.2 from [1])
+//
+// alpha <= e = e_c + e_w + q <= gamma
+//
+// or
+//
+// f_c * f_w * 2^alpha <= f_c 2^(e_c) * f_w 2^(e_w) * 2^q
+// <= f_c * f_w * 2^gamma
+//
+// Since c and w are normalized, i.e. 2^(q-1) <= f < 2^q, this implies
+//
+// 2^(q-1) * 2^(q-1) * 2^alpha <= c * w * 2^q < 2^q * 2^q * 2^gamma
+//
+// or
+//
+// 2^(q - 2 + alpha) <= c * w < 2^(q + gamma)
+//
+// The choice of (alpha,gamma) determines the size of the table and the form of
+// the digit generation procedure. Using (alpha,gamma)=(-60,-32) works out well
+// in practice:
+//
+// The idea is to cut the number c * w = f * 2^e into two parts, which can be
+// processed independently: An integral part p1, and a fractional part p2:
+//
+// f * 2^e = ( (f div 2^-e) * 2^-e + (f mod 2^-e) ) * 2^e
+// = (f div 2^-e) + (f mod 2^-e) * 2^e
+// = p1 + p2 * 2^e
+//
+// The conversion of p1 into decimal form requires a series of divisions and
+// modulos by (a power of) 10. These operations are faster for 32-bit than for
+// 64-bit integers, so p1 should ideally fit into a 32-bit integer. This can be
+// achieved by choosing
+//
+// -e >= 32 or e <= -32 := gamma
+//
+// In order to convert the fractional part
+//
+// p2 * 2^e = p2 / 2^-e = d[-1] / 10^1 + d[-2] / 10^2 + ...
+//
+// into decimal form, the fraction is repeatedly multiplied by 10 and the digits
+// d[-i] are extracted in order:
+//
+// (10 * p2) div 2^-e = d[-1]
+// (10 * p2) mod 2^-e = d[-2] / 10^1 + ...
+//
+// The multiplication by 10 must not overflow. It is sufficient to choose
+//
+// 10 * p2 < 16 * p2 = 2^4 * p2 <= 2^64.
+//
+// Since p2 = f mod 2^-e < 2^-e,
+//
+// -e <= 60 or e >= -60 := alpha
+
+constexpr int kAlpha = -60;
+constexpr int kGamma = -32;
+
+struct cached_power // c = f * 2^e ~= 10^k
+{
+ std::uint64_t f;
+ int e;
+ int k;
+};
+
+/*!
+For a normalized diyfp w = f * 2^e, this function returns a (normalized) cached
+power-of-ten c = f_c * 2^e_c, such that the exponent of the product w * c
+satisfies (Definition 3.2 from [1])
+
+ alpha <= e_c + e + q <= gamma.
+*/
+inline cached_power get_cached_power_for_binary_exponent(int e)
+{
+ // Now
+ //
+ // alpha <= e_c + e + q <= gamma (1)
+ // ==> f_c * 2^alpha <= c * 2^e * 2^q
+ //
+ // and since the c's are normalized, 2^(q-1) <= f_c,
+ //
+ // ==> 2^(q - 1 + alpha) <= c * 2^(e + q)
+ // ==> 2^(alpha - e - 1) <= c
+ //
+ // If c were an exact power of ten, i.e. c = 10^k, one may determine k as
+ //
+ // k = ceil( log_10( 2^(alpha - e - 1) ) )
+ // = ceil( (alpha - e - 1) * log_10(2) )
+ //
+ // From the paper:
+ // "In theory the result of the procedure could be wrong since c is rounded,
+ // and the computation itself is approximated [...]. In practice, however,
+ // this simple function is sufficient."
+ //
+ // For IEEE double precision floating-point numbers converted into
+ // normalized diyfp's w = f * 2^e, with q = 64,
+ //
+ // e >= -1022 (min IEEE exponent)
+ // -52 (p - 1)
+ // -52 (p - 1, possibly normalize denormal IEEE numbers)
+ // -11 (normalize the diyfp)
+ // = -1137
+ //
+ // and
+ //
+ // e <= +1023 (max IEEE exponent)
+ // -52 (p - 1)
+ // -11 (normalize the diyfp)
+ // = 960
+ //
+ // This binary exponent range [-1137,960] results in a decimal exponent
+ // range [-307,324]. One does not need to store a cached power for each
+ // k in this range. For each such k it suffices to find a cached power
+ // such that the exponent of the product lies in [alpha,gamma].
+ // This implies that the difference of the decimal exponents of adjacent
+ // table entries must be less than or equal to
+ //
+ // floor( (gamma - alpha) * log_10(2) ) = 8.
+ //
+ // (A smaller distance gamma-alpha would require a larger table.)
+
+ // NB:
+ // Actually this function returns c, such that -60 <= e_c + e + 64 <= -34.
+
+ constexpr int kCachedPowersMinDecExp = -300;
+ constexpr int kCachedPowersDecStep = 8;
+
+ static constexpr std::array<cached_power, 79> kCachedPowers =
+ {
+ {
+ { 0xAB70FE17C79AC6CA, -1060, -300 },
+ { 0xFF77B1FCBEBCDC4F, -1034, -292 },
+ { 0xBE5691EF416BD60C, -1007, -284 },
+ { 0x8DD01FAD907FFC3C, -980, -276 },
+ { 0xD3515C2831559A83, -954, -268 },
+ { 0x9D71AC8FADA6C9B5, -927, -260 },
+ { 0xEA9C227723EE8BCB, -901, -252 },
+ { 0xAECC49914078536D, -874, -244 },
+ { 0x823C12795DB6CE57, -847, -236 },
+ { 0xC21094364DFB5637, -821, -228 },
+ { 0x9096EA6F3848984F, -794, -220 },
+ { 0xD77485CB25823AC7, -768, -212 },
+ { 0xA086CFCD97BF97F4, -741, -204 },
+ { 0xEF340A98172AACE5, -715, -196 },
+ { 0xB23867FB2A35B28E, -688, -188 },
+ { 0x84C8D4DFD2C63F3B, -661, -180 },
+ { 0xC5DD44271AD3CDBA, -635, -172 },
+ { 0x936B9FCEBB25C996, -608, -164 },
+ { 0xDBAC6C247D62A584, -582, -156 },
+ { 0xA3AB66580D5FDAF6, -555, -148 },
+ { 0xF3E2F893DEC3F126, -529, -140 },
+ { 0xB5B5ADA8AAFF80B8, -502, -132 },
+ { 0x87625F056C7C4A8B, -475, -124 },
+ { 0xC9BCFF6034C13053, -449, -116 },
+ { 0x964E858C91BA2655, -422, -108 },
+ { 0xDFF9772470297EBD, -396, -100 },
+ { 0xA6DFBD9FB8E5B88F, -369, -92 },
+ { 0xF8A95FCF88747D94, -343, -84 },
+ { 0xB94470938FA89BCF, -316, -76 },
+ { 0x8A08F0F8BF0F156B, -289, -68 },
+ { 0xCDB02555653131B6, -263, -60 },
+ { 0x993FE2C6D07B7FAC, -236, -52 },
+ { 0xE45C10C42A2B3B06, -210, -44 },
+ { 0xAA242499697392D3, -183, -36 },
+ { 0xFD87B5F28300CA0E, -157, -28 },
+ { 0xBCE5086492111AEB, -130, -20 },
+ { 0x8CBCCC096F5088CC, -103, -12 },
+ { 0xD1B71758E219652C, -77, -4 },
+ { 0x9C40000000000000, -50, 4 },
+ { 0xE8D4A51000000000, -24, 12 },
+ { 0xAD78EBC5AC620000, 3, 20 },
+ { 0x813F3978F8940984, 30, 28 },
+ { 0xC097CE7BC90715B3, 56, 36 },
+ { 0x8F7E32CE7BEA5C70, 83, 44 },
+ { 0xD5D238A4ABE98068, 109, 52 },
+ { 0x9F4F2726179A2245, 136, 60 },
+ { 0xED63A231D4C4FB27, 162, 68 },
+ { 0xB0DE65388CC8ADA8, 189, 76 },
+ { 0x83C7088E1AAB65DB, 216, 84 },
+ { 0xC45D1DF942711D9A, 242, 92 },
+ { 0x924D692CA61BE758, 269, 100 },
+ { 0xDA01EE641A708DEA, 295, 108 },
+ { 0xA26DA3999AEF774A, 322, 116 },
+ { 0xF209787BB47D6B85, 348, 124 },
+ { 0xB454E4A179DD1877, 375, 132 },
+ { 0x865B86925B9BC5C2, 402, 140 },
+ { 0xC83553C5C8965D3D, 428, 148 },
+ { 0x952AB45CFA97A0B3, 455, 156 },
+ { 0xDE469FBD99A05FE3, 481, 164 },
+ { 0xA59BC234DB398C25, 508, 172 },
+ { 0xF6C69A72A3989F5C, 534, 180 },
+ { 0xB7DCBF5354E9BECE, 561, 188 },
+ { 0x88FCF317F22241E2, 588, 196 },
+ { 0xCC20CE9BD35C78A5, 614, 204 },
+ { 0x98165AF37B2153DF, 641, 212 },
+ { 0xE2A0B5DC971F303A, 667, 220 },
+ { 0xA8D9D1535CE3B396, 694, 228 },
+ { 0xFB9B7CD9A4A7443C, 720, 236 },
+ { 0xBB764C4CA7A44410, 747, 244 },
+ { 0x8BAB8EEFB6409C1A, 774, 252 },
+ { 0xD01FEF10A657842C, 800, 260 },
+ { 0x9B10A4E5E9913129, 827, 268 },
+ { 0xE7109BFBA19C0C9D, 853, 276 },
+ { 0xAC2820D9623BF429, 880, 284 },
+ { 0x80444B5E7AA7CF85, 907, 292 },
+ { 0xBF21E44003ACDD2D, 933, 300 },
+ { 0x8E679C2F5E44FF8F, 960, 308 },
+ { 0xD433179D9C8CB841, 986, 316 },
+ { 0x9E19DB92B4E31BA9, 1013, 324 },
+ }
+ };
+
+ // This computation gives exactly the same results for k as
+ // k = ceil((kAlpha - e - 1) * 0.30102999566398114)
+ // for |e| <= 1500, but doesn't require floating-point operations.
+ // NB: log_10(2) ~= 78913 / 2^18
+ JSON_ASSERT(e >= -1500);
+ JSON_ASSERT(e <= 1500);
+ const int f = kAlpha - e - 1;
+ const int k = (f * 78913) / (1 << 18) + static_cast<int>(f > 0);
+
+ const int index = (-kCachedPowersMinDecExp + k + (kCachedPowersDecStep - 1)) / kCachedPowersDecStep;
+ JSON_ASSERT(index >= 0);
+ JSON_ASSERT(static_cast<std::size_t>(index) < kCachedPowers.size());
+
+ const cached_power cached = kCachedPowers[static_cast<std::size_t>(index)];
+ JSON_ASSERT(kAlpha <= cached.e + e + 64);
+ JSON_ASSERT(kGamma >= cached.e + e + 64);
+
+ return cached;
+}
+
+/*!
+For n != 0, returns k, such that pow10 := 10^(k-1) <= n < 10^k.
+For n == 0, returns 1 and sets pow10 := 1.
+*/
+inline int find_largest_pow10(const std::uint32_t n, std::uint32_t& pow10)
+{
+ // LCOV_EXCL_START
+ if (n >= 1000000000)
+ {
+ pow10 = 1000000000;
+ return 10;
+ }
+ // LCOV_EXCL_STOP
+ if (n >= 100000000)
+ {
+ pow10 = 100000000;
+ return 9;
+ }
+ if (n >= 10000000)
+ {
+ pow10 = 10000000;
+ return 8;
+ }
+ if (n >= 1000000)
+ {
+ pow10 = 1000000;
+ return 7;
+ }
+ if (n >= 100000)
+ {
+ pow10 = 100000;
+ return 6;
+ }
+ if (n >= 10000)
+ {
+ pow10 = 10000;
+ return 5;
+ }
+ if (n >= 1000)
+ {
+ pow10 = 1000;
+ return 4;
+ }
+ if (n >= 100)
+ {
+ pow10 = 100;
+ return 3;
+ }
+ if (n >= 10)
+ {
+ pow10 = 10;
+ return 2;
+ }
+
+ pow10 = 1;
+ return 1;
+}
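+
+// Example (illustrative): find_largest_pow10(4711, pow10) sets pow10 = 1000
+// and returns 4, since 10^3 <= 4711 < 10^4.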
+
+inline void grisu2_round(char* buf, int len, std::uint64_t dist, std::uint64_t delta,
+ std::uint64_t rest, std::uint64_t ten_k)
+{
+ JSON_ASSERT(len >= 1);
+ JSON_ASSERT(dist <= delta);
+ JSON_ASSERT(rest <= delta);
+ JSON_ASSERT(ten_k > 0);
+
+ // <--------------------------- delta ---->
+ // <---- dist --------->
+ // --------------[------------------+-------------------]--------------
+ // M- w M+
+ //
+ // ten_k
+ // <------>
+ // <---- rest ---->
+ // --------------[------------------+----+--------------]--------------
+ // w V
+ // = buf * 10^k
+ //
+ // ten_k represents a unit-in-the-last-place in the decimal representation
+ // stored in buf.
+ // Decrement buf by ten_k while this takes buf closer to w.
+
+ // The tests are written in this order to avoid overflow in unsigned
+ // integer arithmetic.
+
+ while (rest < dist
+ && delta - rest >= ten_k
+ && (rest + ten_k < dist || dist - rest > rest + ten_k - dist))
+ {
+ JSON_ASSERT(buf[len - 1] != '0');
+ buf[len - 1]--;
+ rest += ten_k;
+ }
+}
+
+/*!
+Generates V = buffer * 10^decimal_exponent, such that M- <= V <= M+.
+M- and M+ must be normalized and share the same exponent -60 <= e <= -32.
+*/
+inline void grisu2_digit_gen(char* buffer, int& length, int& decimal_exponent,
+ diyfp M_minus, diyfp w, diyfp M_plus)
+{
+ static_assert(kAlpha >= -60, "internal error");
+ static_assert(kGamma <= -32, "internal error");
+
+ // Generates the digits (and the exponent) of a decimal floating-point
+ // number V = buffer * 10^decimal_exponent in the range [M-, M+]. The diyfp's
+ // w, M- and M+ share the same exponent e, which satisfies alpha <= e <= gamma.
+ //
+ // <--------------------------- delta ---->
+ // <---- dist --------->
+ // --------------[------------------+-------------------]--------------
+ // M- w M+
+ //
+ // Grisu2 generates the digits of M+ from left to right and stops as soon as
+ // V is in [M-,M+].
+
+ JSON_ASSERT(M_plus.e >= kAlpha);
+ JSON_ASSERT(M_plus.e <= kGamma);
+
+ std::uint64_t delta = diyfp::sub(M_plus, M_minus).f; // (significand of (M+ - M-), implicit exponent is e)
+ std::uint64_t dist = diyfp::sub(M_plus, w ).f; // (significand of (M+ - w ), implicit exponent is e)
+
+ // Split M+ = f * 2^e into two parts p1 and p2 (note: e < 0):
+ //
+ // M+ = f * 2^e
+ // = ((f div 2^-e) * 2^-e + (f mod 2^-e)) * 2^e
+ // = ((p1 ) * 2^-e + (p2 )) * 2^e
+ // = p1 + p2 * 2^e
+
+ const diyfp one(std::uint64_t{1} << -M_plus.e, M_plus.e);
+
+ auto p1 = static_cast<std::uint32_t>(M_plus.f >> -one.e); // p1 = f div 2^-e (Since -e >= 32, p1 fits into a 32-bit int.)
+ std::uint64_t p2 = M_plus.f & (one.f - 1); // p2 = f mod 2^-e
+
+ // 1)
+ //
+ // Generate the digits of the integral part p1 = d[n-1]...d[1]d[0]
+
+ JSON_ASSERT(p1 > 0);
+
+ std::uint32_t pow10{};
+ const int k = find_largest_pow10(p1, pow10);
+
+ // 10^(k-1) <= p1 < 10^k, pow10 = 10^(k-1)
+ //
+ // p1 = (p1 div 10^(k-1)) * 10^(k-1) + (p1 mod 10^(k-1))
+ // = (d[k-1] ) * 10^(k-1) + (p1 mod 10^(k-1))
+ //
+ // M+ = p1 + p2 * 2^e
+ // = d[k-1] * 10^(k-1) + (p1 mod 10^(k-1)) + p2 * 2^e
+ // = d[k-1] * 10^(k-1) + ((p1 mod 10^(k-1)) * 2^-e + p2) * 2^e
+ // = d[k-1] * 10^(k-1) + ( rest) * 2^e
+ //
+ // Now generate the digits d[n] of p1 from left to right (n = k-1,...,0)
+ //
+ // p1 = d[k-1]...d[n] * 10^n + d[n-1]...d[0]
+ //
+ // but stop as soon as
+ //
+ // rest * 2^e = (d[n-1]...d[0] * 2^-e + p2) * 2^e <= delta * 2^e
+
+ int n = k;
+ while (n > 0)
+ {
+ // Invariants:
+ // M+ = buffer * 10^n + (p1 + p2 * 2^e) (buffer = 0 for n = k)
+ // pow10 = 10^(n-1) <= p1 < 10^n
+ //
+ const std::uint32_t d = p1 / pow10; // d = p1 div 10^(n-1)
+ const std::uint32_t r = p1 % pow10; // r = p1 mod 10^(n-1)
+ //
+ // M+ = buffer * 10^n + (d * 10^(n-1) + r) + p2 * 2^e
+ // = (buffer * 10 + d) * 10^(n-1) + (r + p2 * 2^e)
+ //
+ JSON_ASSERT(d <= 9);
+ buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
+ //
+ // M+ = buffer * 10^(n-1) + (r + p2 * 2^e)
+ //
+ p1 = r;
+ n--;
+ //
+ // M+ = buffer * 10^n + (p1 + p2 * 2^e)
+ // pow10 = 10^n
+ //
+
+ // Now check if enough digits have been generated.
+ // Compute
+ //
+ // p1 + p2 * 2^e = (p1 * 2^-e + p2) * 2^e = rest * 2^e
+ //
+ // Note:
+ // Since rest and delta share the same exponent e, it suffices to
+ // compare the significands.
+ const std::uint64_t rest = (std::uint64_t{p1} << -one.e) + p2;
+ if (rest <= delta)
+ {
+ // V = buffer * 10^n, with M- <= V <= M+.
+
+ decimal_exponent += n;
+
+ // We may now just stop. But instead look if the buffer could be
+ // decremented to bring V closer to w.
+ //
+ // pow10 = 10^n is now 1 ulp in the decimal representation V.
+ // The rounding procedure works with diyfp's with an implicit
+ // exponent of e.
+ //
+ // 10^n = (10^n * 2^-e) * 2^e = ulp * 2^e
+ //
+ const std::uint64_t ten_n = std::uint64_t{pow10} << -one.e;
+ grisu2_round(buffer, length, dist, delta, rest, ten_n);
+
+ return;
+ }
+
+ pow10 /= 10;
+ //
+ // pow10 = 10^(n-1) <= p1 < 10^n
+ // Invariants restored.
+ }
+
+ // 2)
+ //
+ // The digits of the integral part have been generated:
+ //
+ // M+ = d[k-1]...d[1]d[0] + p2 * 2^e
+ // = buffer + p2 * 2^e
+ //
+ // Now generate the digits of the fractional part p2 * 2^e.
+ //
+ // Note:
+ // No decimal point is generated: the exponent is adjusted instead.
+ //
+ // p2 actually represents the fraction
+ //
+ // p2 * 2^e
+ // = p2 / 2^-e
+ // = d[-1] / 10^1 + d[-2] / 10^2 + ...
+ //
+ // Now generate the digits d[-m] of p1 from left to right (m = 1,2,...)
+ //
+ // p2 * 2^e = d[-1]d[-2]...d[-m] * 10^-m
+ // + 10^-m * (d[-m-1] / 10^1 + d[-m-2] / 10^2 + ...)
+ //
+ // using
+ //
+ // 10^m * p2 = ((10^m * p2) div 2^-e) * 2^-e + ((10^m * p2) mod 2^-e)
+ // = ( d) * 2^-e + ( r)
+ //
+ // or
+ // 10^m * p2 * 2^e = d + r * 2^e
+ //
+ // i.e.
+ //
+ // M+ = buffer + p2 * 2^e
+ // = buffer + 10^-m * (d + r * 2^e)
+ // = (buffer * 10^m + d) * 10^-m + 10^-m * r * 2^e
+ //
+ // and stop as soon as 10^-m * r * 2^e <= delta * 2^e
+
+ JSON_ASSERT(p2 > delta);
+
+ int m = 0;
+ for (;;)
+ {
+ // Invariant:
+ // M+ = buffer * 10^-m + 10^-m * (d[-m-1] / 10 + d[-m-2] / 10^2 + ...) * 2^e
+ // = buffer * 10^-m + 10^-m * (p2 ) * 2^e
+ // = buffer * 10^-m + 10^-m * (1/10 * (10 * p2) ) * 2^e
+ // = buffer * 10^-m + 10^-m * (1/10 * ((10*p2 div 2^-e) * 2^-e + (10*p2 mod 2^-e)) * 2^e
+ //
+ JSON_ASSERT(p2 <= (std::numeric_limits<std::uint64_t>::max)() / 10);
+ p2 *= 10;
+ const std::uint64_t d = p2 >> -one.e; // d = (10 * p2) div 2^-e
+ const std::uint64_t r = p2 & (one.f - 1); // r = (10 * p2) mod 2^-e
+ //
+ // M+ = buffer * 10^-m + 10^-m * (1/10 * (d * 2^-e + r) * 2^e
+ // = buffer * 10^-m + 10^-m * (1/10 * (d + r * 2^e))
+ // = (buffer * 10 + d) * 10^(-m-1) + 10^(-m-1) * r * 2^e
+ //
+ JSON_ASSERT(d <= 9);
+ buffer[length++] = static_cast<char>('0' + d); // buffer := buffer * 10 + d
+ //
+ // M+ = buffer * 10^(-m-1) + 10^(-m-1) * r * 2^e
+ //
+ p2 = r;
+ m++;
+ //
+ // M+ = buffer * 10^-m + 10^-m * p2 * 2^e
+ // Invariant restored.
+
+ // Check if enough digits have been generated.
+ //
+ // 10^-m * p2 * 2^e <= delta * 2^e
+ // p2 * 2^e <= 10^m * delta * 2^e
+ // p2 <= 10^m * delta
+ delta *= 10;
+ dist *= 10;
+ if (p2 <= delta)
+ {
+ break;
+ }
+ }
+
+ // V = buffer * 10^-m, with M- <= V <= M+.
+
+ decimal_exponent -= m;
+
+ // 1 ulp in the decimal representation is now 10^-m.
+ // Since delta and dist are now scaled by 10^m, we need to do the
+ // same with ulp in order to keep the units in sync.
+ //
+ // 10^m * 10^-m = 1 = 2^-e * 2^e = ten_m * 2^e
+ //
+ const std::uint64_t ten_m = one.f;
+ grisu2_round(buffer, length, dist, delta, p2, ten_m);
+
+ // By construction this algorithm generates the shortest possible decimal
+ // number (Loitsch, Theorem 6.2) which rounds back to w.
+ // For an input number of precision p, at least
+ //
+ // N = 1 + ceil(p * log_10(2))
+ //
+ // decimal digits are sufficient to identify all binary floating-point
+ // numbers (Matula, "In-and-Out conversions").
+ // This implies that the algorithm does not produce more than N decimal
+ // digits.
+ //
+ // N = 17 for p = 53 (IEEE double precision)
+ // N = 9 for p = 24 (IEEE single precision)
+}
+
+/*!
+v = buf * 10^decimal_exponent
+len is the length of the buffer (number of decimal digits)
+The buffer must be large enough, i.e. >= max_digits10.
+*/
+JSON_HEDLEY_NON_NULL(1)
+inline void grisu2(char* buf, int& len, int& decimal_exponent,
+ diyfp m_minus, diyfp v, diyfp m_plus)
+{
+ JSON_ASSERT(m_plus.e == m_minus.e);
+ JSON_ASSERT(m_plus.e == v.e);
+
+ // --------(-----------------------+-----------------------)-------- (A)
+ // m- v m+
+ //
+ // --------------------(-----------+-----------------------)-------- (B)
+ // m- v m+
+ //
+ // First scale v (and m- and m+) such that the exponent is in the range
+ // [alpha, gamma].
+
+ const cached_power cached = get_cached_power_for_binary_exponent(m_plus.e);
+
+ const diyfp c_minus_k(cached.f, cached.e); // = c ~= 10^-k
+
+ // The exponent of the products is = v.e + c_minus_k.e + q and is in the range [alpha,gamma]
+ const diyfp w = diyfp::mul(v, c_minus_k);
+ const diyfp w_minus = diyfp::mul(m_minus, c_minus_k);
+ const diyfp w_plus = diyfp::mul(m_plus, c_minus_k);
+
+ // ----(---+---)---------------(---+---)---------------(---+---)----
+ // w- w w+
+ // = c*m- = c*v = c*m+
+ //
+ // diyfp::mul rounds its result and c_minus_k is approximated too. w, w- and
+ // w+ are now off by a small amount.
+ // In fact:
+ //
+ // w - v * 10^k < 1 ulp
+ //
+ // To account for this inaccuracy, add resp. subtract 1 ulp.
+ //
+ // --------+---[---------------(---+---)---------------]---+--------
+ // w- M- w M+ w+
+ //
+ // Now any number in [M-, M+] (bounds included) will round to w when input,
+ // regardless of how the input rounding algorithm breaks ties.
+ //
+ // And digit_gen generates the shortest possible such number in [M-, M+].
+ // Note that this does not mean that Grisu2 always generates the shortest
+ // possible number in the interval (m-, m+).
+ const diyfp M_minus(w_minus.f + 1, w_minus.e);
+ const diyfp M_plus (w_plus.f - 1, w_plus.e );
+
+ decimal_exponent = -cached.k; // = -(-k) = k
+
+ grisu2_digit_gen(buf, len, decimal_exponent, M_minus, w, M_plus);
+}
+
+/*!
+v = buf * 10^decimal_exponent
+len is the length of the buffer (number of decimal digits)
+The buffer must be large enough, i.e. >= max_digits10.
+*/
+template<typename FloatType>
+JSON_HEDLEY_NON_NULL(1)
+void grisu2(char* buf, int& len, int& decimal_exponent, FloatType value)
+{
+ static_assert(diyfp::kPrecision >= std::numeric_limits<FloatType>::digits + 3,
+ "internal error: not enough precision");
+
+ JSON_ASSERT(std::isfinite(value));
+ JSON_ASSERT(value > 0);
+
+ // If the neighbors (and boundaries) of 'value' are always computed for double-precision
+ // numbers, all floats can be recovered using strtod (and strtof). However, the resulting
+ // decimal representations are not exactly "short".
+ //
+ // The documentation for 'std::to_chars' (https://en.cppreference.com/w/cpp/utility/to_chars)
+ // says "value is converted to a string as if by std::sprintf in the default ("C") locale"
+ // and since sprintf promotes floats to doubles, I think this is exactly what 'std::to_chars'
+ // does.
+ // On the other hand, the documentation for 'std::to_chars' requires that "parsing the
+ // representation using the corresponding std::from_chars function recovers value exactly". That
+ // indicates that single precision floating-point numbers should be recovered using
+ // 'std::strtof'.
+ //
+ // NB: If the neighbors are computed for single-precision numbers, there is a single float
+ // (7.0385307e-26f) which can't be recovered using strtod. The resulting double precision
+ // value is off by 1 ulp.
+#if 0
+ const boundaries w = compute_boundaries(static_cast<double>(value));
+#else
+ const boundaries w = compute_boundaries(value);
+#endif
+
+ grisu2(buf, len, decimal_exponent, w.minus, w.w, w.plus);
+}
+
+/*!
+@brief appends a decimal representation of e to buf
+@return a pointer to the element following the exponent.
+@pre -1000 < e < 1000
+*/
+JSON_HEDLEY_NON_NULL(1)
+JSON_HEDLEY_RETURNS_NON_NULL
+inline char* append_exponent(char* buf, int e)
+{
+ JSON_ASSERT(e > -1000);
+ JSON_ASSERT(e < 1000);
+
+ if (e < 0)
+ {
+ e = -e;
+ *buf++ = '-';
+ }
+ else
+ {
+ *buf++ = '+';
+ }
+
+ auto k = static_cast<std::uint32_t>(e);
+ if (k < 10)
+ {
+ // Always print at least two digits in the exponent.
+ // This is for compatibility with printf("%g").
+ *buf++ = '0';
+ *buf++ = static_cast<char>('0' + k);
+ }
+ else if (k < 100)
+ {
+ *buf++ = static_cast<char>('0' + k / 10);
+ k %= 10;
+ *buf++ = static_cast<char>('0' + k);
+ }
+ else
+ {
+ *buf++ = static_cast<char>('0' + k / 100);
+ k %= 100;
+ *buf++ = static_cast<char>('0' + k / 10);
+ k %= 10;
+ *buf++ = static_cast<char>('0' + k);
+ }
+
+ return buf;
+}
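+
+// Example (illustrative): append_exponent(buf, 7) writes "+07" (two digits
+// minimum, matching printf("%g")), and append_exponent(buf, -308) writes "-308".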
+
+/*!
+@brief prettify v = buf * 10^decimal_exponent
+
+If v is in the range [10^min_exp, 10^max_exp) it will be printed in fixed-point
+notation. Otherwise it will be printed in exponential notation.
+
+@pre min_exp < 0
+@pre max_exp > 0
+*/
+JSON_HEDLEY_NON_NULL(1)
+JSON_HEDLEY_RETURNS_NON_NULL
+inline char* format_buffer(char* buf, int len, int decimal_exponent,
+ int min_exp, int max_exp)
+{
+ JSON_ASSERT(min_exp < 0);
+ JSON_ASSERT(max_exp > 0);
+
+ const int k = len;
+ const int n = len + decimal_exponent;
+
+ // v = buf * 10^(n-k)
+ // k is the length of the buffer (number of decimal digits)
+ // n is the position of the decimal point relative to the start of the buffer.
+
+ if (k <= n && n <= max_exp)
+ {
+ // digits[000]
+ // len <= max_exp + 2
+
+ std::memset(buf + k, '0', static_cast<size_t>(n) - static_cast<size_t>(k));
+ // Make it look like a floating-point number (#362, #378)
+ buf[n + 0] = '.';
+ buf[n + 1] = '0';
+ return buf + (static_cast<size_t>(n) + 2);
+ }
+
+ if (0 < n && n <= max_exp)
+ {
+ // dig.its
+ // len <= max_digits10 + 1
+
+ JSON_ASSERT(k > n);
+
+ std::memmove(buf + (static_cast<size_t>(n) + 1), buf + n, static_cast<size_t>(k) - static_cast<size_t>(n));
+ buf[n] = '.';
+ return buf + (static_cast<size_t>(k) + 1U);
+ }
+
+ if (min_exp < n && n <= 0)
+ {
+ // 0.[000]digits
+ // len <= 2 + (-min_exp - 1) + max_digits10
+
+ std::memmove(buf + (2 + static_cast<size_t>(-n)), buf, static_cast<size_t>(k));
+ buf[0] = '0';
+ buf[1] = '.';
+ std::memset(buf + 2, '0', static_cast<size_t>(-n));
+ return buf + (2U + static_cast<size_t>(-n) + static_cast<size_t>(k));
+ }
+
+ if (k == 1)
+ {
+ // dE+123
+ // len <= 1 + 5
+
+ buf += 1;
+ }
+ else
+ {
+ // d.igitsE+123
+ // len <= max_digits10 + 1 + 5
+
+ std::memmove(buf + 2, buf + 1, static_cast<size_t>(k) - 1);
+ buf[1] = '.';
+ buf += 1 + static_cast<size_t>(k);
+ }
+
+ *buf++ = 'e';
+ return append_exponent(buf, n - 1);
+}
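+
+// Worked examples for the four cases above (illustrative; recall k = len and
+// n = len + decimal_exponent, with min_exp = -4 and max_exp = 15 for double):
+//
+//     buf = "12",   decimal_exponent =  1  ->  "120.0"     (digits[000].0)
+//     buf = "1234", decimal_exponent = -2  ->  "12.34"     (dig.its)
+//     buf = "1234", decimal_exponent = -5  ->  "0.01234"   (0.[000]digits)
+//     buf = "1234", decimal_exponent = 20  ->  "1.234e+23" (d.igitsE+123)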
+
+} // namespace dtoa_impl
+
+/*!
+@brief generates a decimal representation of the floating-point number value in [first, last).
+
+The format of the resulting decimal representation is similar to printf's %g
+format. Returns an iterator pointing past-the-end of the decimal representation.
+
+@note The input number must be finite, i.e. NaN's and Inf's are not supported.
+@note The buffer must be large enough.
+@note The result is NOT null-terminated.
+*/
+template<typename FloatType>
+JSON_HEDLEY_NON_NULL(1, 2)
+JSON_HEDLEY_RETURNS_NON_NULL
+char* to_chars(char* first, const char* last, FloatType value)
+{
+ static_cast<void>(last); // maybe unused - fix warning
+ JSON_ASSERT(std::isfinite(value));
+
+ // Use signbit(value) instead of (value < 0) since signbit works for -0.
+ if (std::signbit(value))
+ {
+ value = -value;
+ *first++ = '-';
+ }
+
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+ if (value == 0) // +-0
+ {
+ *first++ = '0';
+ // Make it look like a floating-point number (#362, #378)
+ *first++ = '.';
+ *first++ = '0';
+ return first;
+ }
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+ JSON_ASSERT(last - first >= std::numeric_limits<FloatType>::max_digits10);
+
+ // Compute v = buffer * 10^decimal_exponent.
+ // The decimal digits are stored in the buffer, which needs to be interpreted
+ // as an unsigned decimal integer.
+ // len is the length of the buffer, i.e. the number of decimal digits.
+ int len = 0;
+ int decimal_exponent = 0;
+ dtoa_impl::grisu2(first, len, decimal_exponent, value);
+
+ JSON_ASSERT(len <= std::numeric_limits<FloatType>::max_digits10);
+
+ // Format the buffer like printf("%.*g", prec, value)
+ constexpr int kMinExp = -4;
+ // Use digits10 here to increase compatibility with version 2.
+ constexpr int kMaxExp = std::numeric_limits<FloatType>::digits10;
+
+ JSON_ASSERT(last - first >= kMaxExp + 2);
+ JSON_ASSERT(last - first >= 2 + (-kMinExp - 1) + std::numeric_limits<FloatType>::max_digits10);
+ JSON_ASSERT(last - first >= std::numeric_limits<FloatType>::max_digits10 + 6);
+
+ return dtoa_impl::format_buffer(first, len, decimal_exponent, kMinExp, kMaxExp);
+}
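+
+// Typical use (illustrative sketch):
+//
+//     char buf[32];
+//     char* end = to_chars(buf, buf + sizeof(buf), 0.1);
+//     // std::string(buf, end) == "0.1" -- the shortest representation
+//     // that parses back to exactly the same double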
+
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/exceptions.hpp>
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+// #include <nlohmann/detail/meta/cpp_future.hpp>
+
+// #include <nlohmann/detail/output/binary_writer.hpp>
+
+// #include <nlohmann/detail/output/output_adapters.hpp>
+
+// #include <nlohmann/detail/value_t.hpp>
+
+
+namespace nlohmann
+{
+namespace detail
+{
+///////////////////
+// serialization //
+///////////////////
+
+/// how to treat decoding errors
+enum class error_handler_t
+{
+ strict, ///< throw a type_error exception in case of invalid UTF-8
+ replace, ///< replace invalid UTF-8 sequences with U+FFFD
+ ignore ///< ignore invalid UTF-8 sequences
+};
+
+template<typename BasicJsonType>
+class serializer
+{
+ using string_t = typename BasicJsonType::string_t;
+ using number_float_t = typename BasicJsonType::number_float_t;
+ using number_integer_t = typename BasicJsonType::number_integer_t;
+ using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+ using binary_char_t = typename BasicJsonType::binary_t::value_type;
+ static constexpr std::uint8_t UTF8_ACCEPT = 0;
+ static constexpr std::uint8_t UTF8_REJECT = 1;
+
+ public:
+ /*!
+ @param[in] s output stream to serialize to
+ @param[in] ichar indentation character to use
+ @param[in] error_handler_ how to react on decoding errors
+ */
+ serializer(output_adapter_t<char> s, const char ichar,
+ error_handler_t error_handler_ = error_handler_t::strict)
+ : o(std::move(s))
+ , loc(std::localeconv())
+ , thousands_sep(loc->thousands_sep == nullptr ? '\0' : std::char_traits<char>::to_char_type(* (loc->thousands_sep)))
+ , decimal_point(loc->decimal_point == nullptr ? '\0' : std::char_traits<char>::to_char_type(* (loc->decimal_point)))
+ , indent_char(ichar)
+ , indent_string(512, indent_char)
+ , error_handler(error_handler_)
+ {}
+
+ // delete because of pointer members
+ serializer(const serializer&) = delete;
+ serializer& operator=(const serializer&) = delete;
+ serializer(serializer&&) = delete;
+ serializer& operator=(serializer&&) = delete;
+ ~serializer() = default;
+
+ /*!
+ @brief internal implementation of the serialization function
+
+ This function is called by the public member function dump and organizes
+ the serialization internally. The indentation level is propagated as
+ additional parameter. In case of arrays and objects, the function is
+ called recursively.
+
+ - strings and object keys are escaped using `escape_string()`
+ - integer numbers are converted implicitly via `operator<<`
+ - floating-point numbers are converted to a string using `"%g"` format
+ - binary values are serialized as objects containing the subtype and the
+ byte array
+
+ @param[in] val value to serialize
+ @param[in] pretty_print whether the output shall be pretty-printed
+ @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters
+ in the output are escaped with `\uXXXX` sequences, and the result consists
+ of ASCII characters only.
+ @param[in] indent_step the indent level
+ @param[in] current_indent the current indent level (only used internally)
+ */
+ void dump(const BasicJsonType& val,
+ const bool pretty_print,
+ const bool ensure_ascii,
+ const unsigned int indent_step,
+ const unsigned int current_indent = 0)
+ {
+ switch (val.m_type)
+ {
+ case value_t::object:
+ {
+ if (val.m_value.object->empty())
+ {
+ o->write_characters("{}", 2);
+ return;
+ }
+
+ if (pretty_print)
+ {
+ o->write_characters("{\n", 2);
+
+ // variable to hold indentation for recursive calls
+ const auto new_indent = current_indent + indent_step;
+ if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent))
+ {
+ indent_string.resize(indent_string.size() * 2, ' ');
+ }
+
+ // first n-1 elements
+ auto i = val.m_value.object->cbegin();
+ for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i)
+ {
+ o->write_characters(indent_string.c_str(), new_indent);
+ o->write_character('\"');
+ dump_escaped(i->first, ensure_ascii);
+ o->write_characters("\": ", 3);
+ dump(i->second, true, ensure_ascii, indent_step, new_indent);
+ o->write_characters(",\n", 2);
+ }
+
+ // last element
+ JSON_ASSERT(i != val.m_value.object->cend());
+ JSON_ASSERT(std::next(i) == val.m_value.object->cend());
+ o->write_characters(indent_string.c_str(), new_indent);
+ o->write_character('\"');
+ dump_escaped(i->first, ensure_ascii);
+ o->write_characters("\": ", 3);
+ dump(i->second, true, ensure_ascii, indent_step, new_indent);
+
+ o->write_character('\n');
+ o->write_characters(indent_string.c_str(), current_indent);
+ o->write_character('}');
+ }
+ else
+ {
+ o->write_character('{');
+
+ // first n-1 elements
+ auto i = val.m_value.object->cbegin();
+ for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i)
+ {
+ o->write_character('\"');
+ dump_escaped(i->first, ensure_ascii);
+ o->write_characters("\":", 2);
+ dump(i->second, false, ensure_ascii, indent_step, current_indent);
+ o->write_character(',');
+ }
+
+ // last element
+ JSON_ASSERT(i != val.m_value.object->cend());
+ JSON_ASSERT(std::next(i) == val.m_value.object->cend());
+ o->write_character('\"');
+ dump_escaped(i->first, ensure_ascii);
+ o->write_characters("\":", 2);
+ dump(i->second, false, ensure_ascii, indent_step, current_indent);
+
+ o->write_character('}');
+ }
+
+ return;
+ }
+
+ case value_t::array:
+ {
+ if (val.m_value.array->empty())
+ {
+ o->write_characters("[]", 2);
+ return;
+ }
+
+ if (pretty_print)
+ {
+ o->write_characters("[\n", 2);
+
+ // variable to hold indentation for recursive calls
+ const auto new_indent = current_indent + indent_step;
+ if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent))
+ {
+ indent_string.resize(indent_string.size() * 2, ' ');
+ }
+
+ // first n-1 elements
+ for (auto i = val.m_value.array->cbegin();
+ i != val.m_value.array->cend() - 1; ++i)
+ {
+ o->write_characters(indent_string.c_str(), new_indent);
+ dump(*i, true, ensure_ascii, indent_step, new_indent);
+ o->write_characters(",\n", 2);
+ }
+
+ // last element
+ JSON_ASSERT(!val.m_value.array->empty());
+ o->write_characters(indent_string.c_str(), new_indent);
+ dump(val.m_value.array->back(), true, ensure_ascii, indent_step, new_indent);
+
+ o->write_character('\n');
+ o->write_characters(indent_string.c_str(), current_indent);
+ o->write_character(']');
+ }
+ else
+ {
+ o->write_character('[');
+
+ // first n-1 elements
+ for (auto i = val.m_value.array->cbegin();
+ i != val.m_value.array->cend() - 1; ++i)
+ {
+ dump(*i, false, ensure_ascii, indent_step, current_indent);
+ o->write_character(',');
+ }
+
+ // last element
+ JSON_ASSERT(!val.m_value.array->empty());
+ dump(val.m_value.array->back(), false, ensure_ascii, indent_step, current_indent);
+
+ o->write_character(']');
+ }
+
+ return;
+ }
+
+ case value_t::string:
+ {
+ o->write_character('\"');
+ dump_escaped(*val.m_value.string, ensure_ascii);
+ o->write_character('\"');
+ return;
+ }
+
+ case value_t::binary:
+ {
+ if (pretty_print)
+ {
+ o->write_characters("{\n", 2);
+
+ // variable to hold indentation for recursive calls
+ const auto new_indent = current_indent + indent_step;
+ if (JSON_HEDLEY_UNLIKELY(indent_string.size() < new_indent))
+ {
+ indent_string.resize(indent_string.size() * 2, ' ');
+ }
+
+ o->write_characters(indent_string.c_str(), new_indent);
+
+ o->write_characters("\"bytes\": [", 10);
+
+ if (!val.m_value.binary->empty())
+ {
+ for (auto i = val.m_value.binary->cbegin();
+ i != val.m_value.binary->cend() - 1; ++i)
+ {
+ dump_integer(*i);
+ o->write_characters(", ", 2);
+ }
+ dump_integer(val.m_value.binary->back());
+ }
+
+ o->write_characters("],\n", 3);
+ o->write_characters(indent_string.c_str(), new_indent);
+
+ o->write_characters("\"subtype\": ", 11);
+ if (val.m_value.binary->has_subtype())
+ {
+ dump_integer(val.m_value.binary->subtype());
+ }
+ else
+ {
+ o->write_characters("null", 4);
+ }
+ o->write_character('\n');
+ o->write_characters(indent_string.c_str(), current_indent);
+ o->write_character('}');
+ }
+ else
+ {
+ o->write_characters("{\"bytes\":[", 10);
+
+ if (!val.m_value.binary->empty())
+ {
+ for (auto i = val.m_value.binary->cbegin();
+ i != val.m_value.binary->cend() - 1; ++i)
+ {
+ dump_integer(*i);
+ o->write_character(',');
+ }
+ dump_integer(val.m_value.binary->back());
+ }
+
+ o->write_characters("],\"subtype\":", 12);
+ if (val.m_value.binary->has_subtype())
+ {
+ dump_integer(val.m_value.binary->subtype());
+ o->write_character('}');
+ }
+ else
+ {
+ o->write_characters("null}", 5);
+ }
+ }
+ return;
+ }
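+
+ // Compact example (illustrative): a binary value with bytes {0xDE, 0xAD}
+ // and subtype 42 serializes to {"bytes":[222,173],"subtype":42}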
+
+ case value_t::boolean:
+ {
+ if (val.m_value.boolean)
+ {
+ o->write_characters("true", 4);
+ }
+ else
+ {
+ o->write_characters("false", 5);
+ }
+ return;
+ }
+
+ case value_t::number_integer:
+ {
+ dump_integer(val.m_value.number_integer);
+ return;
+ }
+
+ case value_t::number_unsigned:
+ {
+ dump_integer(val.m_value.number_unsigned);
+ return;
+ }
+
+ case value_t::number_float:
+ {
+ dump_float(val.m_value.number_float);
+ return;
+ }
+
+ case value_t::discarded:
+ {
+ o->write_characters("<discarded>", 11);
+ return;
+ }
+
+ case value_t::null:
+ {
+ o->write_characters("null", 4);
+ return;
+ }
+
+ default: // LCOV_EXCL_LINE
+ JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
+ }
+ }
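+
+ // Pretty-printed example (illustrative): dumping {"k": [1, 2]} with
+ // pretty_print == true and indent_step == 2 produces
+ //
+ //     {
+ //       "k": [
+ //         1,
+ //         2
+ //       ]
+ //     }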
+
+ JSON_PRIVATE_UNLESS_TESTED:
+ /*!
+ @brief dump escaped string
+
+ Escape a string by replacing certain special characters with a sequence of
+ an escape character (backslash) and another character, and by replacing
+ other control characters with a sequence of "\u" followed by a four-digit
+ hex representation. The escaped string is written to output stream @a o.
+
+ @param[in] s the string to escape
+ @param[in] ensure_ascii whether to escape non-ASCII characters with
+ \uXXXX sequences
+
+ @complexity Linear in the length of string @a s.
+ */
+ void dump_escaped(const string_t& s, const bool ensure_ascii)
+ {
+ std::uint32_t codepoint{};
+ std::uint8_t state = UTF8_ACCEPT;
+ std::size_t bytes = 0; // number of bytes written to string_buffer
+
+ // number of bytes written at the point of the last valid byte
+ std::size_t bytes_after_last_accept = 0;
+ std::size_t undumped_chars = 0;
+
+ for (std::size_t i = 0; i < s.size(); ++i)
+ {
+ const auto byte = static_cast<std::uint8_t>(s[i]);
+
+ switch (decode(state, codepoint, byte))
+ {
+ case UTF8_ACCEPT: // decode found a new code point
+ {
+ switch (codepoint)
+ {
+ case 0x08: // backspace
+ {
+ string_buffer[bytes++] = '\\';
+ string_buffer[bytes++] = 'b';
+ break;
+ }
+
+ case 0x09: // horizontal tab
+ {
+ string_buffer[bytes++] = '\\';
+ string_buffer[bytes++] = 't';
+ break;
+ }
+
+ case 0x0A: // newline
+ {
+ string_buffer[bytes++] = '\\';
+ string_buffer[bytes++] = 'n';
+ break;
+ }
+
+ case 0x0C: // formfeed
+ {
+ string_buffer[bytes++] = '\\';
+ string_buffer[bytes++] = 'f';
+ break;
+ }
+
+ case 0x0D: // carriage return
+ {
+ string_buffer[bytes++] = '\\';
+ string_buffer[bytes++] = 'r';
+ break;
+ }
+
+ case 0x22: // quotation mark
+ {
+ string_buffer[bytes++] = '\\';
+ string_buffer[bytes++] = '\"';
+ break;
+ }
+
+ case 0x5C: // reverse solidus
+ {
+ string_buffer[bytes++] = '\\';
+ string_buffer[bytes++] = '\\';
+ break;
+ }
+
+ default:
+ {
+ // escape control characters (0x00..0x1F) or, if
+ // ensure_ascii parameter is used, non-ASCII characters
+ if ((codepoint <= 0x1F) || (ensure_ascii && (codepoint >= 0x7F)))
+ {
+ if (codepoint <= 0xFFFF)
+ {
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
+ (std::snprintf)(string_buffer.data() + bytes, 7, "\\u%04x",
+ static_cast<std::uint16_t>(codepoint));
+ bytes += 6;
+ }
+ else
+ {
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
+ (std::snprintf)(string_buffer.data() + bytes, 13, "\\u%04x\\u%04x",
+ static_cast<std::uint16_t>(0xD7C0u + (codepoint >> 10u)),
+ static_cast<std::uint16_t>(0xDC00u + (codepoint & 0x3FFu)));
+ bytes += 12;
+ }
+ }
+ else
+ {
+ // copy byte to buffer (all previous bytes
+ // have been copied in the default case above)
+ string_buffer[bytes++] = s[i];
+ }
+ break;
+ }
+ }
+
+ // write buffer and reset index; there must be 13 bytes
+ // left, as this is the maximal number of bytes to be
+ // written ("\uxxxx\uxxxx\0") for one code point
+ if (string_buffer.size() - bytes < 13)
+ {
+ o->write_characters(string_buffer.data(), bytes);
+ bytes = 0;
+ }
+
+ // remember the byte position of this accept
+ bytes_after_last_accept = bytes;
+ undumped_chars = 0;
+ break;
+ }
+
+ case UTF8_REJECT: // decode found invalid UTF-8 byte
+ {
+ switch (error_handler)
+ {
+ case error_handler_t::strict:
+ {
+ std::string sn(9, '\0');
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
+ (std::snprintf)(&sn[0], sn.size(), "%.2X", byte);
+ JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn, BasicJsonType()));
+ }
+
+ case error_handler_t::ignore:
+ case error_handler_t::replace:
+ {
+ // in case we saw this character the first time, we
+ // would like to read it again, because the byte
+ // may be OK for itself, but just not OK for the
+ // previous sequence
+ if (undumped_chars > 0)
+ {
+ --i;
+ }
+
+ // reset length buffer to the last accepted index;
+ // thus removing/ignoring the invalid characters
+ bytes = bytes_after_last_accept;
+
+ if (error_handler == error_handler_t::replace)
+ {
+ // add a replacement character
+ if (ensure_ascii)
+ {
+ string_buffer[bytes++] = '\\';
+ string_buffer[bytes++] = 'u';
+ string_buffer[bytes++] = 'f';
+ string_buffer[bytes++] = 'f';
+ string_buffer[bytes++] = 'f';
+ string_buffer[bytes++] = 'd';
+ }
+ else
+ {
+ string_buffer[bytes++] = detail::binary_writer<BasicJsonType, char>::to_char_type('\xEF');
+ string_buffer[bytes++] = detail::binary_writer<BasicJsonType, char>::to_char_type('\xBF');
+ string_buffer[bytes++] = detail::binary_writer<BasicJsonType, char>::to_char_type('\xBD');
+ }
+
+ // write buffer and reset index; there must be 13 bytes
+ // left, as this is the maximal number of bytes to be
+ // written ("\uxxxx\uxxxx\0") for one code point
+ if (string_buffer.size() - bytes < 13)
+ {
+ o->write_characters(string_buffer.data(), bytes);
+ bytes = 0;
+ }
+
+ bytes_after_last_accept = bytes;
+ }
+
+ undumped_chars = 0;
+
+ // continue processing the string
+ state = UTF8_ACCEPT;
+ break;
+ }
+
+ default: // LCOV_EXCL_LINE
+ JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
+ }
+ break;
+ }
+
+ default: // decode found yet incomplete multi-byte code point
+ {
+ if (!ensure_ascii)
+ {
+ // code point will not be escaped - copy byte to buffer
+ string_buffer[bytes++] = s[i];
+ }
+ ++undumped_chars;
+ break;
+ }
+ }
+ }
+
+ // we finished processing the string
+ if (JSON_HEDLEY_LIKELY(state == UTF8_ACCEPT))
+ {
+ // write buffer
+ if (bytes > 0)
+ {
+ o->write_characters(string_buffer.data(), bytes);
+ }
+ }
+ else
+ {
+ // we finish reading, but do not accept: string was incomplete
+ switch (error_handler)
+ {
+ case error_handler_t::strict:
+ {
+ std::string sn(9, '\0');
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
+ (std::snprintf)(&sn[0], sn.size(), "%.2X", static_cast<std::uint8_t>(s.back()));
+ JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn, BasicJsonType()));
+ }
+
+ case error_handler_t::ignore:
+ {
+ // write all accepted bytes
+ o->write_characters(string_buffer.data(), bytes_after_last_accept);
+ break;
+ }
+
+ case error_handler_t::replace:
+ {
+ // write all accepted bytes
+ o->write_characters(string_buffer.data(), bytes_after_last_accept);
+ // add a replacement character
+ if (ensure_ascii)
+ {
+ o->write_characters("\\ufffd", 6);
+ }
+ else
+ {
+ o->write_characters("\xEF\xBF\xBD", 3);
+ }
+ break;
+ }
+
+ default: // LCOV_EXCL_LINE
+ JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
+ }
+ }
+ }
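+
+ // Example (illustrative): for s = "a\"b\xC3\xA4" (code points 'a', '"',
+ // 'b', U+00E4):
+ //   dump_escaped(s, false) writes a\"b plus the two raw UTF-8 bytes of ä
+ //   dump_escaped(s, true)  writes a\"b\u00e4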
+
+ private:
+ /*!
+ @brief count digits
+
+ Count the number of decimal (base 10) digits for an input unsigned integer.
+
+ @param[in] x unsigned integer number to count its digits
+ @return number of decimal digits
+ */
+ inline unsigned int count_digits(number_unsigned_t x) noexcept
+ {
+ unsigned int n_digits = 1;
+ for (;;)
+ {
+ if (x < 10)
+ {
+ return n_digits;
+ }
+ if (x < 100)
+ {
+ return n_digits + 1;
+ }
+ if (x < 1000)
+ {
+ return n_digits + 2;
+ }
+ if (x < 10000)
+ {
+ return n_digits + 3;
+ }
+ x = x / 10000u;
+ n_digits += 4;
+ }
+ }
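+
+ // The loop strips four digits per division, e.g. (illustrative):
+ //   count_digits(7)         == 1
+ //   count_digits(1234)      == 4  // answered by the x < 10000 branch
+ //   count_digits(123456789) == 9  // two divisions: 123456789 -> 12345 -> 1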
+
+ /*!
+ @brief dump an integer
+
+ Dump a given integer to output stream @a o. Works internally with
+ @a number_buffer.
+
+ @param[in] x integer number (signed or unsigned) to dump
+ @tparam NumberType either @a number_integer_t or @a number_unsigned_t
+ */
+ template < typename NumberType, detail::enable_if_t <
+ std::is_integral<NumberType>::value ||
+ std::is_same<NumberType, number_unsigned_t>::value ||
+ std::is_same<NumberType, number_integer_t>::value ||
+ std::is_same<NumberType, binary_char_t>::value,
+ int > = 0 >
+ void dump_integer(NumberType x)
+ {
+ static constexpr std::array<std::array<char, 2>, 100> digits_to_99
+ {
+ {
+ {{'0', '0'}}, {{'0', '1'}}, {{'0', '2'}}, {{'0', '3'}}, {{'0', '4'}}, {{'0', '5'}}, {{'0', '6'}}, {{'0', '7'}}, {{'0', '8'}}, {{'0', '9'}},
+ {{'1', '0'}}, {{'1', '1'}}, {{'1', '2'}}, {{'1', '3'}}, {{'1', '4'}}, {{'1', '5'}}, {{'1', '6'}}, {{'1', '7'}}, {{'1', '8'}}, {{'1', '9'}},
+ {{'2', '0'}}, {{'2', '1'}}, {{'2', '2'}}, {{'2', '3'}}, {{'2', '4'}}, {{'2', '5'}}, {{'2', '6'}}, {{'2', '7'}}, {{'2', '8'}}, {{'2', '9'}},
+ {{'3', '0'}}, {{'3', '1'}}, {{'3', '2'}}, {{'3', '3'}}, {{'3', '4'}}, {{'3', '5'}}, {{'3', '6'}}, {{'3', '7'}}, {{'3', '8'}}, {{'3', '9'}},
+ {{'4', '0'}}, {{'4', '1'}}, {{'4', '2'}}, {{'4', '3'}}, {{'4', '4'}}, {{'4', '5'}}, {{'4', '6'}}, {{'4', '7'}}, {{'4', '8'}}, {{'4', '9'}},
+ {{'5', '0'}}, {{'5', '1'}}, {{'5', '2'}}, {{'5', '3'}}, {{'5', '4'}}, {{'5', '5'}}, {{'5', '6'}}, {{'5', '7'}}, {{'5', '8'}}, {{'5', '9'}},
+ {{'6', '0'}}, {{'6', '1'}}, {{'6', '2'}}, {{'6', '3'}}, {{'6', '4'}}, {{'6', '5'}}, {{'6', '6'}}, {{'6', '7'}}, {{'6', '8'}}, {{'6', '9'}},
+ {{'7', '0'}}, {{'7', '1'}}, {{'7', '2'}}, {{'7', '3'}}, {{'7', '4'}}, {{'7', '5'}}, {{'7', '6'}}, {{'7', '7'}}, {{'7', '8'}}, {{'7', '9'}},
+ {{'8', '0'}}, {{'8', '1'}}, {{'8', '2'}}, {{'8', '3'}}, {{'8', '4'}}, {{'8', '5'}}, {{'8', '6'}}, {{'8', '7'}}, {{'8', '8'}}, {{'8', '9'}},
+ {{'9', '0'}}, {{'9', '1'}}, {{'9', '2'}}, {{'9', '3'}}, {{'9', '4'}}, {{'9', '5'}}, {{'9', '6'}}, {{'9', '7'}}, {{'9', '8'}}, {{'9', '9'}},
+ }
+ };
+
+ // special case for "0"
+ if (x == 0)
+ {
+ o->write_character('0');
+ return;
+ }
+
+ // use a pointer to fill the buffer
+ auto buffer_ptr = number_buffer.begin(); // NOLINT(llvm-qualified-auto,readability-qualified-auto,cppcoreguidelines-pro-type-vararg,hicpp-vararg)
+
+ const bool is_negative = std::is_signed<NumberType>::value && !(x >= 0); // see issue #755
+ number_unsigned_t abs_value;
+
+ unsigned int n_chars{};
+
+ if (is_negative)
+ {
+ *buffer_ptr = '-';
+ abs_value = remove_sign(static_cast<number_integer_t>(x));
+
+ // account one more byte for the minus sign
+ n_chars = 1 + count_digits(abs_value);
+ }
+ else
+ {
+ abs_value = static_cast<number_unsigned_t>(x);
+ n_chars = count_digits(abs_value);
+ }
+
+ // spare 1 byte for '\0'
+ JSON_ASSERT(n_chars < number_buffer.size() - 1);
+
+ // jump to the end to generate the string from backward
+ // so we later avoid reversing the result
+ buffer_ptr += n_chars;
+
+ // Fast int2ascii implementation inspired by "Fastware" talk by Andrei Alexandrescu
+ // See: https://www.youtube.com/watch?v=o4-CwDo2zpg
+ while (abs_value >= 100)
+ {
+ const auto digits_index = static_cast<unsigned>((abs_value % 100));
+ abs_value /= 100;
+ *(--buffer_ptr) = digits_to_99[digits_index][1];
+ *(--buffer_ptr) = digits_to_99[digits_index][0];
+ }
+
+ if (abs_value >= 10)
+ {
+ const auto digits_index = static_cast<unsigned>(abs_value);
+ *(--buffer_ptr) = digits_to_99[digits_index][1];
+ *(--buffer_ptr) = digits_to_99[digits_index][0];
+ }
+ else
+ {
+ *(--buffer_ptr) = static_cast<char>('0' + abs_value);
+ }
+
+ o->write_characters(number_buffer.data(), n_chars);
+ }
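+
+ // Walk-through (illustrative) for x = -4321:
+ //   n_chars = 1 + count_digits(4321) = 5; buffer_ptr starts 5 chars in
+ //   4321 >= 100: table entry 21 emits "21"; abs_value becomes 43
+ //   43 >= 10:    table entry 43 emits "43"
+ //   the buffer now holds "-4321", written with two table lookups instead
+ //   of one division and one modulo per digit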
+
+ /*!
+ @brief dump a floating-point number
+
+ Dump a given floating-point number to output stream @a o. Works internally
+ with @a number_buffer.
+
+ @param[in] x floating-point number to dump
+ */
+ void dump_float(number_float_t x)
+ {
+ // NaN / inf
+ if (!std::isfinite(x))
+ {
+ o->write_characters("null", 4);
+ return;
+ }
+
+ // If number_float_t is an IEEE-754 single or double precision number,
+ // use the Grisu2 algorithm to produce short numbers which are
+ // guaranteed to round-trip, using strtof and strtod, resp.
+ //
+ // NB: The test below works if <long double> == <double>.
+ static constexpr bool is_ieee_single_or_double
+ = (std::numeric_limits<number_float_t>::is_iec559 && std::numeric_limits<number_float_t>::digits == 24 && std::numeric_limits<number_float_t>::max_exponent == 128) ||
+ (std::numeric_limits<number_float_t>::is_iec559 && std::numeric_limits<number_float_t>::digits == 53 && std::numeric_limits<number_float_t>::max_exponent == 1024);
+
+ dump_float(x, std::integral_constant<bool, is_ieee_single_or_double>());
+ }
+
+ void dump_float(number_float_t x, std::true_type /*is_ieee_single_or_double*/)
+ {
+ auto* begin = number_buffer.data();
+ auto* end = ::nlohmann::detail::to_chars(begin, begin + number_buffer.size(), x);
+
+ o->write_characters(begin, static_cast<size_t>(end - begin));
+ }
+
+ void dump_float(number_float_t x, std::false_type /*is_ieee_single_or_double*/)
+ {
+ // get number of digits for a float -> text -> float round-trip
+ static constexpr auto d = std::numeric_limits<number_float_t>::max_digits10;
+
+ // the actual conversion
+ // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
+ std::ptrdiff_t len = (std::snprintf)(number_buffer.data(), number_buffer.size(), "%.*g", d, x);
+
+ // negative value indicates an error
+ JSON_ASSERT(len > 0);
+ // check if buffer was large enough
+ JSON_ASSERT(static_cast<std::size_t>(len) < number_buffer.size());
+
+ // erase thousands separator
+ if (thousands_sep != '\0')
+ {
+ auto* const end = std::remove(number_buffer.begin(),
+ number_buffer.begin() + len, thousands_sep);
+ std::fill(end, number_buffer.end(), '\0');
+ JSON_ASSERT((end - number_buffer.begin()) <= len);
+ len = (end - number_buffer.begin());
+ }
+
+ // convert decimal point to '.'
+ if (decimal_point != '\0' && decimal_point != '.')
+ {
+ auto* const dec_pos = std::find(number_buffer.begin(), number_buffer.end(), decimal_point);
+ if (dec_pos != number_buffer.end())
+ {
+ *dec_pos = '.';
+ }
+ }
+
+ o->write_characters(number_buffer.data(), static_cast<std::size_t>(len));
+
+ // determine whether we need to append ".0"
+ const bool value_is_int_like =
+ std::none_of(number_buffer.begin(), number_buffer.begin() + len + 1,
+ [](char c)
+ {
+ return c == '.' || c == 'e';
+ });
+
+ if (value_is_int_like)
+ {
+ o->write_characters(".0", 2);
+ }
+ }
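+
+ // Example (illustrative): with the process locale set to de_DE.UTF-8,
+ // snprintf("%.17g", ...) may render 0.5 as "0,5"; the fix-ups above
+ // rewrite it to "0.5", and no ".0" is appended because a '.' is present.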
+
+ /*!
+ @brief check whether a string is UTF-8 encoded
+
+ The function checks, byte by byte, whether a string is valid UTF-8. The
+ result of the check is stored in the @a state parameter. The function must
+ be called initially with state 0 (accept). State 1 means the string must
+ be rejected, because the current byte is not allowed. If the string is
+ completely processed, but the state is non-zero, the string ended
+ prematurely; that is, the last byte indicated more bytes should have
+ followed.
+
+ @param[in,out] state the state of the decoding
+ @param[in,out] codep codepoint (valid only if resulting state is UTF8_ACCEPT)
+ @param[in] byte next byte to decode
+ @return new state
+
+ @note The function has been edited: a std::array is used.
+
+ @copyright Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+ @sa http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ */
+ static std::uint8_t decode(std::uint8_t& state, std::uint32_t& codep, const std::uint8_t byte) noexcept
+ {
+ static const std::array<std::uint8_t, 400> utf8d =
+ {
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9F
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // A0..BF
+ 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..DF
+ 0xA, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // E0..EF
+ 0xB, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // F0..FF
+ 0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
+ 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
+ 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
+ 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // s7..s8
+ }
+ };
+
+ JSON_ASSERT(byte < utf8d.size());
+ const std::uint8_t type = utf8d[byte];
+
+ codep = (state != UTF8_ACCEPT)
+ ? (byte & 0x3fu) | (codep << 6u)
+ : (0xFFu >> type) & (byte);
+
+ std::size_t index = 256u + static_cast<size_t>(state) * 16u + static_cast<size_t>(type);
+ JSON_ASSERT(index < 400);
+ state = utf8d[index];
+ return state;
+ }
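+
+ // Minimal validation loop on top of the DFA (illustrative sketch):
+ //
+ //     std::uint8_t state = UTF8_ACCEPT;
+ //     std::uint32_t cp = 0;
+ //     for (const unsigned char c : bytes)
+ //     {
+ //         if (decode(state, cp, c) == UTF8_REJECT) { /* invalid byte */ }
+ //     }
+ //     // state == UTF8_ACCEPT here means the input ended on a complete
+ //     // code point; any other value means a truncated sequence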
+
+ /*
+ * Overload to make the compiler happy while it is instantiating
+ * dump_integer for number_unsigned_t.
+ * Must never be called.
+ */
+ number_unsigned_t remove_sign(number_unsigned_t x)
+ {
+ JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
+ return x; // LCOV_EXCL_LINE
+ }
+
+ /*
+ * Helper function for dump_integer
+ *
+ * This function takes a negative signed integer and returns its absolute
+ * value as unsigned integer. The plus/minus shuffling is necessary as we can
+ * not directly remove the sign of an arbitrary signed integer as the
+ * absolute values of INT_MIN and INT_MAX are usually not the same. See
+ * #1708 for details.
+ */
+ inline number_unsigned_t remove_sign(number_integer_t x) noexcept
+ {
+ JSON_ASSERT(x < 0 && x < (std::numeric_limits<number_integer_t>::max)()); // NOLINT(misc-redundant-expression)
+ return static_cast<number_unsigned_t>(-(x + 1)) + 1;
+ }
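+
+ // Worked example (illustrative) for number_integer_t == int64_t:
+ //   remove_sign(INT64_MIN) = static_cast<uint64_t>(-(INT64_MIN + 1)) + 1
+ //                          = 9223372036854775807u + 1
+ //                          = 9223372036854775808u
+ // whereas the naive -x would overflow, since -INT64_MIN > INT64_MAX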
+
+ private:
+ /// the output of the serializer
+ output_adapter_t<char> o = nullptr;
+
+ /// a (hopefully) large enough character buffer
+ std::array<char, 64> number_buffer{{}};
+
+ /// the locale
+ const std::lconv* loc = nullptr;
+ /// the locale's thousand separator character
+ const char thousands_sep = '\0';
+ /// the locale's decimal point character
+ const char decimal_point = '\0';
+
+ /// string buffer
+ std::array<char, 512> string_buffer{{}};
+
+ /// the indentation character
+ const char indent_char;
+ /// the indentation string
+ string_t indent_string;
+
+ /// error_handler how to react on decoding errors
+ const error_handler_t error_handler;
+};
+} // namespace detail
+} // namespace nlohmann
+
+// #include <nlohmann/detail/value_t.hpp>
+
+// #include <nlohmann/json_fwd.hpp>
+
+// #include <nlohmann/ordered_map.hpp>
+
+
+#include <functional> // less
+#include <initializer_list> // initializer_list
+#include <iterator> // input_iterator_tag, iterator_traits
+#include <memory> // allocator
+#include <stdexcept> // for out_of_range
+#include <type_traits> // enable_if, is_convertible
+#include <utility> // pair
+#include <vector> // vector
+
+// #include <nlohmann/detail/macro_scope.hpp>
+
+
+namespace nlohmann
+{
+
+/// ordered_map: a minimal map-like container that preserves insertion order
+/// for use within nlohmann::basic_json<ordered_map>
+template <class Key, class T, class IgnoredLess = std::less<Key>,
+ class Allocator = std::allocator<std::pair<const Key, T>>>
+ struct ordered_map : std::vector<std::pair<const Key, T>, Allocator>
+{
+ using key_type = Key;
+ using mapped_type = T;
+ using Container = std::vector<std::pair<const Key, T>, Allocator>;
+ using typename Container::iterator;
+ using typename Container::const_iterator;
+ using typename Container::size_type;
+ using typename Container::value_type;
+
+ // Explicit constructors instead of `using Container::Container`
+ // otherwise older compilers choke on it (GCC <= 5.5, xcode <= 9.4)
+ ordered_map(const Allocator& alloc = Allocator()) : Container{alloc} {}
+ template <class It>
+ ordered_map(It first, It last, const Allocator& alloc = Allocator())
+ : Container{first, last, alloc} {}
+ ordered_map(std::initializer_list<value_type> init, const Allocator& alloc = Allocator())
+ : Container{init, alloc} {}
+
+ std::pair<iterator, bool> emplace(const key_type& key, T&& t)
+ {
+ for (auto it = this->begin(); it != this->end(); ++it)
+ {
+ if (it->first == key)
+ {
+ return {it, false};
+ }
+ }
+ Container::emplace_back(key, t);
+ return {--this->end(), true};
+ }
+
+ T& operator[](const Key& key)
+ {
+ return emplace(key, T{}).first->second;
+ }
+
+ const T& operator[](const Key& key) const
+ {
+ return at(key);
+ }
+
+ T& at(const Key& key)
+ {
+ for (auto it = this->begin(); it != this->end(); ++it)
+ {
+ if (it->first == key)
+ {
+ return it->second;
+ }
+ }
+
+ JSON_THROW(std::out_of_range("key not found"));
+ }
+
+ const T& at(const Key& key) const
+ {
+ for (auto it = this->begin(); it != this->end(); ++it)
+ {
+ if (it->first == key)
+ {
+ return it->second;
+ }
+ }
+
+ JSON_THROW(std::out_of_range("key not found"));
+ }
+
+ size_type erase(const Key& key)
+ {
+ for (auto it = this->begin(); it != this->end(); ++it)
+ {
+ if (it->first == key)
+ {
+ // Since we cannot move const Keys, re-construct them in place
+ for (auto next = it; ++next != this->end(); ++it)
+ {
+ it->~value_type(); // Destroy but keep allocation
+ new (&*it) value_type{std::move(*next)};
+ }
+ Container::pop_back();
+ return 1;
+ }
+ }
+ return 0;
+ }
+
+ iterator erase(iterator pos)
+ {
+ auto it = pos;
+
+ // Since we cannot move const Keys, re-construct them in place
+ for (auto next = it; ++next != this->end(); ++it)
+ {
+ it->~value_type(); // Destroy but keep allocation
+ new (&*it) value_type{std::move(*next)};
+ }
+ Container::pop_back();
+ return pos;
+ }
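+
+ // Why destroy + placement-new rather than assignment (illustrative note):
+ // value_type is std::pair<const Key, T>, which is not move-assignable
+ // because of the const key. Each shift therefore ends the old pair's
+ // lifetime and constructs the moved value in the same storage; pop_back()
+ // then discards the moved-from tail element.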
+
+ size_type count(const Key& key) const
+ {
+ for (auto it = this->begin(); it != this->end(); ++it)
+ {
+ if (it->first == key)
+ {
+ return 1;
+ }
+ }
+ return 0;
+ }
+
+ iterator find(const Key& key)
+ {
+ for (auto it = this->begin(); it != this->end(); ++it)
+ {
+ if (it->first == key)
+ {
+ return it;
+ }
+ }
+ return Container::end();
+ }
+
+ const_iterator find(const Key& key) const
+ {
+ for (auto it = this->begin(); it != this->end(); ++it)
+ {
+ if (it->first == key)
+ {
+ return it;
+ }
+ }
+ return Container::end();
+ }
+
+ std::pair<iterator, bool> insert( value_type&& value )
+ {
+ return emplace(value.first, std::move(value.second));
+ }
+
+ std::pair<iterator, bool> insert( const value_type& value )
+ {
+ for (auto it = this->begin(); it != this->end(); ++it)
+ {
+ if (it->first == value.first)
+ {
+ return {it, false};
+ }
+ }
+ Container::push_back(value);
+ return {--this->end(), true};
+ }
+
+ template<typename InputIt>
+ using require_input_iter = typename std::enable_if<std::is_convertible<typename std::iterator_traits<InputIt>::iterator_category,
+ std::input_iterator_tag>::value>::type;
+
+ template<typename InputIt, typename = require_input_iter<InputIt>>
+ void insert(InputIt first, InputIt last)
+ {
+ for (auto it = first; it != last; ++it)
+ {
+ insert(*it);
+ }
+ }
+};
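+
+// Usage sketch (illustrative): nlohmann::ordered_json instantiates
+// basic_json with this container, so insertion order survives a dump:
+//
+//     nlohmann::ordered_json j;
+//     j["b"] = 1;
+//     j["a"] = 2;
+//     // j.dump() == R"({"b":1,"a":2})" -- std::map would reorder to "a","b"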
+
+} // namespace nlohmann
+
+
+#if defined(JSON_HAS_CPP_17)
+ #include <string_view>
+#endif
+
+/*!
+@brief namespace for Niels Lohmann
+@see https://github.com/nlohmann
+@since version 1.0.0
+*/
+namespace nlohmann
+{
+
+/*!
+@brief a class to store JSON values
+
+@tparam ObjectType type for JSON objects (`std::map` by default; will be used
+in @ref object_t)
+@tparam ArrayType type for JSON arrays (`std::vector` by default; will be used
+in @ref array_t)
+@tparam StringType type for JSON strings and object keys (`std::string` by
+default; will be used in @ref string_t)
+@tparam BooleanType type for JSON booleans (`bool` by default; will be used
+in @ref boolean_t)
+@tparam NumberIntegerType type for JSON integer numbers (`int64_t` by
+default; will be used in @ref number_integer_t)
+@tparam NumberUnsignedType type for JSON unsigned integer numbers (@c
+`uint64_t` by default; will be used in @ref number_unsigned_t)
+@tparam NumberFloatType type for JSON floating-point numbers (`double` by
+default; will be used in @ref number_float_t)
+@tparam BinaryType type for packed binary data for compatibility with binary
+serialization formats (`std::vector<std::uint8_t>` by default; will be used in
+@ref binary_t)
+@tparam AllocatorType type of the allocator to use (`std::allocator` by
+default)
+@tparam JSONSerializer the serializer to resolve internal calls to `to_json()`
+and `from_json()` (@ref adl_serializer by default)
+
+@requirement The class satisfies the following concept requirements:
+- Basic
+ - [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible):
+ JSON values can be default constructed. The result will be a JSON null
+ value.
+ - [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible):
+ A JSON value can be constructed from an rvalue argument.
+ - [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible):
+ A JSON value can be copy-constructed from an lvalue expression.
+ - [MoveAssignable](https://en.cppreference.com/w/cpp/named_req/MoveAssignable):
+ A JSON value can be assigned from an rvalue argument.
+ - [CopyAssignable](https://en.cppreference.com/w/cpp/named_req/CopyAssignable):
+ A JSON value can be copy-assigned from an lvalue expression.
+ - [Destructible](https://en.cppreference.com/w/cpp/named_req/Destructible):
+ JSON values can be destructed.
+- Layout
+ - [StandardLayoutType](https://en.cppreference.com/w/cpp/named_req/StandardLayoutType):
+ JSON values have
+ [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard_layout):
+ All non-static data members are private and standard layout types, the
+ class has no virtual functions or (virtual) base classes.
+- Library-wide
+ - [EqualityComparable](https://en.cppreference.com/w/cpp/named_req/EqualityComparable):
+ JSON values can be compared with `==`, see @ref
+ operator==(const_reference,const_reference).
+ - [LessThanComparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable):
+ JSON values can be compared with `<`, see @ref
+ operator<(const_reference,const_reference).
+ - [Swappable](https://en.cppreference.com/w/cpp/named_req/Swappable):
+ Any JSON lvalue or rvalue can be swapped with any lvalue or rvalue of
+ another compatible type, using the unqualified function call @ref swap().
+ - [NullablePointer](https://en.cppreference.com/w/cpp/named_req/NullablePointer):
+ JSON values can be compared against `std::nullptr_t` objects which are used
+ to model the `null` value.
+- Container
+ - [Container](https://en.cppreference.com/w/cpp/named_req/Container):
+ JSON values can be used like STL containers and provide iterator access.
+ - [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer):
+ JSON values can be used like STL containers and provide reverse iterator
+ access.
+
+@invariant The member variables @a m_value and @a m_type have the following
+relationship:
+- If `m_type == value_t::object`, then `m_value.object != nullptr`.
+- If `m_type == value_t::array`, then `m_value.array != nullptr`.
+- If `m_type == value_t::string`, then `m_value.string != nullptr`.
+The invariants are checked by member function assert_invariant().
+
+@internal
+@note ObjectType trick from https://stackoverflow.com/a/9860911
+@endinternal
+
+@see [RFC 8259: The JavaScript Object Notation (JSON) Data Interchange
+Format](https://tools.ietf.org/html/rfc8259)
+
+@since version 1.0.0
+
+@nosubgrouping
+*/
+NLOHMANN_BASIC_JSON_TPL_DECLARATION
+class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-special-member-functions)
+{
+ private:
+ template<detail::value_t> friend struct detail::external_constructor;
+ friend ::nlohmann::json_pointer<basic_json>;
+
+ template<typename BasicJsonType, typename InputType>
+ friend class ::nlohmann::detail::parser;
+ friend ::nlohmann::detail::serializer<basic_json>;
+ template<typename BasicJsonType>
+ friend class ::nlohmann::detail::iter_impl;
+ template<typename BasicJsonType, typename CharType>
+ friend class ::nlohmann::detail::binary_writer;
+ template<typename BasicJsonType, typename InputType, typename SAX>
+ friend class ::nlohmann::detail::binary_reader;
+ template<typename BasicJsonType>
+ friend class ::nlohmann::detail::json_sax_dom_parser;
+ template<typename BasicJsonType>
+ friend class ::nlohmann::detail::json_sax_dom_callback_parser;
+ friend class ::nlohmann::detail::exception;
+
+ /// workaround type for MSVC
+ using basic_json_t = NLOHMANN_BASIC_JSON_TPL;
+
+ JSON_PRIVATE_UNLESS_TESTED:
+ // convenience aliases for types residing in namespace detail;
+ using lexer = ::nlohmann::detail::lexer_base<basic_json>;
+
+ template<typename InputAdapterType>
+ static ::nlohmann::detail::parser<basic_json, InputAdapterType> parser(
+ InputAdapterType adapter,
+ detail::parser_callback_t<basic_json>cb = nullptr,
+ const bool allow_exceptions = true,
+ const bool ignore_comments = false
+ )
+ {
+ return ::nlohmann::detail::parser<basic_json, InputAdapterType>(std::move(adapter),
+ std::move(cb), allow_exceptions, ignore_comments);
+ }
+
+ private:
+ using primitive_iterator_t = ::nlohmann::detail::primitive_iterator_t;
+ template<typename BasicJsonType>
+ using internal_iterator = ::nlohmann::detail::internal_iterator<BasicJsonType>;
+ template<typename BasicJsonType>
+ using iter_impl = ::nlohmann::detail::iter_impl<BasicJsonType>;
+ template<typename Iterator>
+ using iteration_proxy = ::nlohmann::detail::iteration_proxy<Iterator>;
+ template<typename Base> using json_reverse_iterator = ::nlohmann::detail::json_reverse_iterator<Base>;
+
+ template<typename CharType>
+ using output_adapter_t = ::nlohmann::detail::output_adapter_t<CharType>;
+
+ template<typename InputType>
+ using binary_reader = ::nlohmann::detail::binary_reader<basic_json, InputType>;
+ template<typename CharType> using binary_writer = ::nlohmann::detail::binary_writer<basic_json, CharType>;
+
+ JSON_PRIVATE_UNLESS_TESTED:
+ using serializer = ::nlohmann::detail::serializer<basic_json>;
+
+ public:
+ using value_t = detail::value_t;
+ /// JSON Pointer, see @ref nlohmann::json_pointer
+ using json_pointer = ::nlohmann::json_pointer<basic_json>;
+ template<typename T, typename SFINAE>
+ using json_serializer = JSONSerializer<T, SFINAE>;
+ /// how to treat decoding errors
+ using error_handler_t = detail::error_handler_t;
+ /// how to treat CBOR tags
+ using cbor_tag_handler_t = detail::cbor_tag_handler_t;
+ /// helper type for initializer lists of basic_json values
+ using initializer_list_t = std::initializer_list<detail::json_ref<basic_json>>;
+
+ using input_format_t = detail::input_format_t;
+ /// SAX interface type, see @ref nlohmann::json_sax
+ using json_sax_t = json_sax<basic_json>;
+
+ ////////////////
+ // exceptions //
+ ////////////////
+
+ /// @name exceptions
+ /// Classes to implement user-defined exceptions.
+ /// @{
+
+ /// @copydoc detail::exception
+ using exception = detail::exception;
+ /// @copydoc detail::parse_error
+ using parse_error = detail::parse_error;
+ /// @copydoc detail::invalid_iterator
+ using invalid_iterator = detail::invalid_iterator;
+ /// @copydoc detail::type_error
+ using type_error = detail::type_error;
+ /// @copydoc detail::out_of_range
+ using out_of_range = detail::out_of_range;
+ /// @copydoc detail::other_error
+ using other_error = detail::other_error;
+
+ /// @}
+
+
+ /////////////////////
+ // container types //
+ /////////////////////
+
+ /// @name container types
+ /// The canonical container types to use @ref basic_json like any other STL
+ /// container.
+ /// @{
+
+ /// the type of elements in a basic_json container
+ using value_type = basic_json;
+
+ /// the type of an element reference
+ using reference = value_type&;
+ /// the type of an element const reference
+ using const_reference = const value_type&;
+
+ /// a type to represent differences between iterators
+ using difference_type = std::ptrdiff_t;
+ /// a type to represent container sizes
+ using size_type = std::size_t;
+
+ /// the allocator type
+ using allocator_type = AllocatorType<basic_json>;
+
+ /// the type of an element pointer
+ using pointer = typename std::allocator_traits<allocator_type>::pointer;
+ /// the type of an element const pointer
+ using const_pointer = typename std::allocator_traits<allocator_type>::const_pointer;
+
+ /// an iterator for a basic_json container
+ using iterator = iter_impl<basic_json>;
+ /// a const iterator for a basic_json container
+ using const_iterator = iter_impl<const basic_json>;
+ /// a reverse iterator for a basic_json container
+ using reverse_iterator = json_reverse_iterator<typename basic_json::iterator>;
+ /// a const reverse iterator for a basic_json container
+ using const_reverse_iterator = json_reverse_iterator<typename basic_json::const_iterator>;
+
+ /// @}
+
+
+ /*!
+ @brief returns the allocator associated with the container
+ */
+ static allocator_type get_allocator()
+ {
+ return allocator_type();
+ }
+
+ /*!
+ @brief returns version information on the library
+
+ This function returns a JSON object with information about the library,
+ including the version number and information on the platform and compiler.
+
+ @return JSON object holding version information
+ key | description
+ ----------- | ---------------
+ `compiler` | Information on the used compiler. It is an object with the following keys: `c++` (the used C++ standard), `family` (the compiler family; possible values are `clang`, `icc`, `gcc`, `hp`, `ilecpp`, `msvc`, `pgcpp`, `sunpro`, and `unknown`), and `version` (the compiler version).
+ `copyright` | The copyright line for the library as string.
+ `name` | The name of the library as string.
+ `platform` | The used platform as string. Possible values are `win32`, `linux`, `apple`, `unix`, and `unknown`.
+ `url` | The URL of the project as string.
+ `version` | The version of the library. It is an object with the following keys: `major`, `minor`, and `patch` as defined by [Semantic Versioning](http://semver.org), and `string` (the version string).
+
+ @liveexample{The following code shows an example output of the `meta()`
+ function.,meta}
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes to any JSON value.
+
+ @complexity Constant.
+
+ @since 2.1.0
+ */
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json meta()
+ {
+ basic_json result;
+
+ result["copyright"] = "(C) 2013-2021 Niels Lohmann";
+ result["name"] = "JSON for Modern C++";
+ result["url"] = "https://github.com/nlohmann/json";
+ result["version"]["string"] =
+ std::to_string(NLOHMANN_JSON_VERSION_MAJOR) + "." +
+ std::to_string(NLOHMANN_JSON_VERSION_MINOR) + "." +
+ std::to_string(NLOHMANN_JSON_VERSION_PATCH);
+ result["version"]["major"] = NLOHMANN_JSON_VERSION_MAJOR;
+ result["version"]["minor"] = NLOHMANN_JSON_VERSION_MINOR;
+ result["version"]["patch"] = NLOHMANN_JSON_VERSION_PATCH;
+
+#ifdef _WIN32
+ result["platform"] = "win32";
+#elif defined __linux__
+ result["platform"] = "linux";
+#elif defined __APPLE__
+ result["platform"] = "apple";
+#elif defined __unix__
+ result["platform"] = "unix";
+#else
+ result["platform"] = "unknown";
+#endif
+
+#if defined(__ICC) || defined(__INTEL_COMPILER)
+ result["compiler"] = {{"family", "icc"}, {"version", __INTEL_COMPILER}};
+#elif defined(__clang__)
+ result["compiler"] = {{"family", "clang"}, {"version", __clang_version__}};
+#elif defined(__GNUC__) || defined(__GNUG__)
+ result["compiler"] = {{"family", "gcc"}, {"version", std::to_string(__GNUC__) + "." + std::to_string(__GNUC_MINOR__) + "." + std::to_string(__GNUC_PATCHLEVEL__)}};
+#elif defined(__HP_cc) || defined(__HP_aCC)
+ result["compiler"] = "hp"
+#elif defined(__IBMCPP__)
+ result["compiler"] = {{"family", "ilecpp"}, {"version", __IBMCPP__}};
+#elif defined(_MSC_VER)
+ result["compiler"] = {{"family", "msvc"}, {"version", _MSC_VER}};
+#elif defined(__PGI)
+ result["compiler"] = {{"family", "pgcpp"}, {"version", __PGI}};
+#elif defined(__SUNPRO_CC)
+ result["compiler"] = {{"family", "sunpro"}, {"version", __SUNPRO_CC}};
+#else
+ result["compiler"] = {{"family", "unknown"}, {"version", "unknown"}};
+#endif
+
+#ifdef __cplusplus
+ result["compiler"]["c++"] = std::to_string(__cplusplus);
+#else
+ result["compiler"]["c++"] = "unknown";
+#endif
+ return result;
+ }
+
+
+ ///////////////////////////
+ // JSON value data types //
+ ///////////////////////////
+
+ /// @name JSON value data types
+ /// The data types to store a JSON value. These types are derived from
+ /// the template arguments passed to class @ref basic_json.
+ /// @{
+
+#if defined(JSON_HAS_CPP_14)
+ // Use transparent comparator if possible, combined with perfect forwarding
+ // on find() and count() calls prevents unnecessary string construction.
+ using object_comparator_t = std::less<>;
+#else
+ using object_comparator_t = std::less<StringType>;
+#endif
+
+ /*!
+ @brief a type for an object
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) describes JSON objects as follows:
+ > An object is an unordered collection of zero or more name/value pairs,
+ > where a name is a string and a value is a string, number, boolean, null,
+ > object, or array.
+
+ To store objects in C++, a type is defined by the template parameters
+ described below.
+
+ @tparam ObjectType the container to store objects (e.g., `std::map` or
+ `std::unordered_map`)
+ @tparam StringType the type of the keys or names (e.g., `std::string`).
+ The comparison function `std::less<StringType>` is used to order elements
+ inside the container.
+ @tparam AllocatorType the allocator to use for objects (e.g.,
+ `std::allocator`)
+
+ #### Default type
+
+ With the default values for @a ObjectType (`std::map`), @a StringType
+ (`std::string`), and @a AllocatorType (`std::allocator`), the default
+ value for @a object_t is:
+
+ @code {.cpp}
+ std::map<
+ std::string, // key_type
+ basic_json, // value_type
+ std::less<std::string>, // key_compare
+ std::allocator<std::pair<const std::string, basic_json>> // allocator_type
+ >
+ @endcode
+
+ #### Behavior
+
+ The choice of @a object_t influences the behavior of the JSON class. With
+ the default type, objects have the following behavior:
+
+ - When all names are unique, objects will be interoperable in the sense
+ that all software implementations receiving that object will agree on
+ the name-value mappings.
+ - When the names within an object are not unique, it is unspecified which
+ one of the values for a given key will be chosen. For instance,
+ `{"key": 2, "key": 1}` could be equal to either `{"key": 1}` or
+ `{"key": 2}`.
+ - Internally, name/value pairs are stored in lexicographical order of the
+ names. Objects will also be serialized (see @ref dump) in this order.
+ For instance, `{"b": 1, "a": 2}` and `{"a": 2, "b": 1}` will be stored
+ and serialized as `{"a": 2, "b": 1}`.
+ - When comparing objects, the order of the name/value pairs is irrelevant.
+ This makes objects interoperable in the sense that they will not be
+ affected by these differences. For instance, `{"b": 1, "a": 2}` and
+ `{"a": 2, "b": 1}` will be treated as equal.
+
+ #### Limits
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) specifies:
+ > An implementation may set limits on the maximum depth of nesting.
+
+ In this class, the object's depth of nesting is not explicitly constrained.
+ However, a maximum depth of nesting may be introduced by the compiler or
+ runtime environment. A theoretical limit can be queried by calling the
+ @ref max_size function of a JSON object.
+
+ #### Storage
+
+ Objects are stored as pointers in a @ref basic_json type. That is, for any
+ access to object values, a pointer of type `object_t*` must be
+ dereferenced.
+
+ @sa see @ref array_t -- type for an array value
+
+ @since version 1.0.0
+
+ @note The order in which name/value pairs are added to the object is *not*
+ preserved by the library. Therefore, iterating an object may return
+ name/value pairs in a different order than they were originally stored. In
+ fact, keys will be traversed in alphabetical order, as `std::map` with
+ `std::less` is used by default. Note that this behavior conforms to [RFC
+ 8259](https://tools.ietf.org/html/rfc8259), because any order satisfies the
+ specified "unordered" nature of JSON objects.
+ */
+ using object_t = ObjectType<StringType,
+ basic_json,
+ object_comparator_t,
+ AllocatorType<std::pair<const StringType,
+ basic_json>>>;
+
+ /*!
+ @brief a type for an array
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) describes JSON arrays as follows:
+ > An array is an ordered sequence of zero or more values.
+
+ To store arrays in C++, a type is defined by the template parameters
+ explained below.
+
+ @tparam ArrayType container type to store arrays (e.g., `std::vector` or
+ `std::list`)
+ @tparam AllocatorType allocator to use for arrays (e.g., `std::allocator`)
+
+ #### Default type
+
+ With the default values for @a ArrayType (`std::vector`) and @a
+ AllocatorType (`std::allocator`), the default value for @a array_t is:
+
+ @code {.cpp}
+ std::vector<
+ basic_json, // value_type
+ std::allocator<basic_json> // allocator_type
+ >
+ @endcode
+
+ #### Limits
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) specifies:
+ > An implementation may set limits on the maximum depth of nesting.
+
+ In this class, the array's depth of nesting is not explicitly constrained.
+ However, a maximum depth of nesting may be introduced by the compiler or
+ runtime environment. A theoretical limit can be queried by calling the
+ @ref max_size function of a JSON array.
+
+ #### Storage
+
+ Arrays are stored as pointers in a @ref basic_json type. That is, for any
+ access to array values, a pointer of type `array_t*` must be dereferenced.
+
+ @sa see @ref object_t -- type for an object value
+
+ @since version 1.0.0
+ */
+ using array_t = ArrayType<basic_json, AllocatorType<basic_json>>;
+
+ /*!
+ @brief a type for a string
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) describes JSON strings as follows:
+ > A string is a sequence of zero or more Unicode characters.
+
+ To store strings in C++, a type is defined by the template parameter
+ described below. Unicode values are split by the JSON class into
+ byte-sized characters during deserialization.
+
+ @tparam StringType the container to store strings (e.g., `std::string`).
+ Note this container is used for keys/names in objects, see @ref object_t.
+
+ #### Default type
+
+ With the default values for @a StringType (`std::string`), the default
+ value for @a string_t is:
+
+ @code {.cpp}
+ std::string
+ @endcode
+
+ #### Encoding
+
+ Strings are stored in UTF-8 encoding. Therefore, functions like
+ `std::string::size()` or `std::string::length()` return the number of
+ bytes in the string rather than the number of characters or glyphs.
+
+ #### String comparison
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) states:
+ > Software implementations are typically required to test names of object
+ > members for equality. Implementations that transform the textual
+ > representation into sequences of Unicode code units and then perform the
+ > comparison numerically, code unit by code unit, are interoperable in the
+ > sense that implementations will agree in all cases on equality or
+ > inequality of two strings. For example, implementations that compare
+ > strings with escaped characters unconverted may incorrectly find that
+ > `"a\\b"` and `"a\u005Cb"` are not equal.
+
+ This implementation is interoperable as it does compare strings code unit
+ by code unit.
+
+ #### Storage
+
+ String values are stored as pointers in a @ref basic_json type. That is,
+ for any access to string values, a pointer of type `string_t*` must be
+ dereferenced.
+
+ @since version 1.0.0
+ */
+ using string_t = StringType;
+
+ /*!
+ @brief a type for a boolean
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) implicitly describes a boolean as a
+ type which differentiates the two literals `true` and `false`.
+
+ To store booleans in C++, a type is defined by the template parameter @a
+ BooleanType which chooses the type to use.
+
+ #### Default type
+
+ With the default values for @a BooleanType (`bool`), the default value for
+ @a boolean_t is:
+
+ @code {.cpp}
+ bool
+ @endcode
+
+ #### Storage
+
+ Boolean values are stored directly inside a @ref basic_json type.
+
+ @since version 1.0.0
+ */
+ using boolean_t = BooleanType;
+
+ /*!
+ @brief a type for a number (integer)
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) describes numbers as follows:
+ > The representation of numbers is similar to that used in most
+ > programming languages. A number is represented in base 10 using decimal
+ > digits. It contains an integer component that may be prefixed with an
+ > optional minus sign, which may be followed by a fraction part and/or an
+ > exponent part. Leading zeros are not allowed. (...) Numeric values that
+ > cannot be represented in the grammar below (such as Infinity and NaN)
+ > are not permitted.
+
+ This description includes both integer and floating-point numbers.
+ However, C++ allows more precise storage if it is known whether the number
+ is a signed integer, an unsigned integer or a floating-point number.
+ Therefore, three different types, @ref number_integer_t, @ref
+ number_unsigned_t and @ref number_float_t are used.
+
+ To store integer numbers in C++, a type is defined by the template
+ parameter @a NumberIntegerType which chooses the type to use.
+
+ #### Default type
+
+ With the default values for @a NumberIntegerType (`int64_t`), the default
+ value for @a number_integer_t is:
+
+ @code {.cpp}
+ int64_t
+ @endcode
+
+ #### Default behavior
+
+ - The restriction about leading zeros is not enforced in C++. Instead,
+ leading zeros in integer literals lead to an interpretation as an octal
+ number. Internally, the value will be stored as a decimal number. For
+ instance, the C++ integer literal `010` will be serialized to `8` (see
+ the sketch after this list). During deserialization, leading zeros
+ yield an error.
+ - Not-a-number (NaN) values will be serialized to `null`.
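+
+ A small sketch of both behaviors (hypothetical, for illustration only):
+
+ @code {.cpp}
+ json j = 010;  // C++ octal literal: stored and serialized as 8
+ // json::parse("010") would throw a parse_error: leading zeros are
+ // rejected during deserialization
+ @endcode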
+
+ #### Limits
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) specifies:
+ > An implementation may set limits on the range and precision of numbers.
+
+ When the default type is used, the maximal integer number that can be
+ stored is `9223372036854775807` (INT64_MAX) and the minimal integer number
+ that can be stored is `-9223372036854775808` (INT64_MIN). Integer numbers
+ that are out of range will yield over/underflow when used in a
+ constructor. During deserialization, too large or too small integer
+ numbers will automatically be stored as @ref number_unsigned_t or
+ @ref number_float_t.
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) further states:
+ > Note that when such software is used, numbers that are integers and are
+ > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense
+ > that implementations will agree exactly on their numeric values.
+
+ As this range is a subrange of the exactly supported range [INT64_MIN,
+ INT64_MAX], this class's integer type is interoperable.
+
+ #### Storage
+
+ Integer number values are stored directly inside a @ref basic_json type.
+
+ @sa see @ref number_float_t -- type for number values (floating-point)
+
+ @sa see @ref number_unsigned_t -- type for number values (unsigned integer)
+
+ @since version 1.0.0
+ */
+ using number_integer_t = NumberIntegerType;
+
+ /*!
+ @brief a type for a number (unsigned)
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) describes numbers as follows:
+ > The representation of numbers is similar to that used in most
+ > programming languages. A number is represented in base 10 using decimal
+ > digits. It contains an integer component that may be prefixed with an
+ > optional minus sign, which may be followed by a fraction part and/or an
+ > exponent part. Leading zeros are not allowed. (...) Numeric values that
+ > cannot be represented in the grammar below (such as Infinity and NaN)
+ > are not permitted.
+
+ This description includes both integer and floating-point numbers.
+ However, C++ allows more precise storage if it is known whether the number
+ is a signed integer, an unsigned integer or a floating-point number.
+ Therefore, three different types, @ref number_integer_t, @ref
+ number_unsigned_t and @ref number_float_t are used.
+
+ To store unsigned integer numbers in C++, a type is defined by the
+ template parameter @a NumberUnsignedType which chooses the type to use.
+
+ #### Default type
+
+ With the default values for @a NumberUnsignedType (`uint64_t`), the
+ default value for @a number_unsigned_t is:
+
+ @code {.cpp}
+ uint64_t
+ @endcode
+
+ #### Default behavior
+
+ - The restriction about leading zeros is not enforced in C++. Instead,
+ leading zeros in integer literals lead to an interpretation as an octal
+ number. Internally, the value will be stored as a decimal number. For
+ instance, the C++ integer literal `010` will be serialized to `8`.
+ During deserialization, leading zeros yield an error.
+ - Not-a-number (NaN) values will be serialized to `null`.
+
+ #### Limits
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) specifies:
+ > An implementation may set limits on the range and precision of numbers.
+
+ When the default type is used, the maximal integer number that can be
+ stored is `18446744073709551615` (UINT64_MAX) and the minimal integer
+ number that can be stored is `0`. Integer numbers that are out of range
+ will yield over/underflow when used in a constructor. During
+ deserialization, too large or too small integer numbers will automatically
+ be stored as @ref number_integer_t or @ref number_float_t.
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) further states:
+ > Note that when such software is used, numbers that are integers and are
+ > in the range \f$[-2^{53}+1, 2^{53}-1]\f$ are interoperable in the sense
+ > that implementations will agree exactly on their numeric values.
+
+ As this range is a subrange (when considered in conjunction with the
+ number_integer_t type) of the exactly supported range [0, UINT64_MAX],
+ this class's integer type is interoperable.
+
+ #### Storage
+
+ Integer number values are stored directly inside a @ref basic_json type.
+
+ @sa see @ref number_float_t -- type for number values (floating-point)
+ @sa see @ref number_integer_t -- type for number values (integer)
+
+ @since version 2.0.0
+ */
+ using number_unsigned_t = NumberUnsignedType;
+
+ /*!
+ @brief a type for a number (floating-point)
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) describes numbers as follows:
+ > The representation of numbers is similar to that used in most
+ > programming languages. A number is represented in base 10 using decimal
+ > digits. It contains an integer component that may be prefixed with an
+ > optional minus sign, which may be followed by a fraction part and/or an
+ > exponent part. Leading zeros are not allowed. (...) Numeric values that
+ > cannot be represented in the grammar below (such as Infinity and NaN)
+ > are not permitted.
+
+ This description includes both integer and floating-point numbers.
+ However, C++ allows more precise storage if it is known whether the number
+ is a signed integer, an unsigned integer or a floating-point number.
+ Therefore, three different types, @ref number_integer_t, @ref
+ number_unsigned_t and @ref number_float_t are used.
+
+ To store floating-point numbers in C++, a type is defined by the template
+ parameter @a NumberFloatType which chooses the type to use.
+
+ #### Default type
+
+ With the default values for @a NumberFloatType (`double`), the default
+ value for @a number_float_t is:
+
+ @code {.cpp}
+ double
+ @endcode
+
+ #### Default behavior
+
+ - The restriction about leading zeros is not enforced in C++. Instead,
+ leading zeros in floating-point literals will be ignored. Internally,
+ the value will be stored as decimal number. For instance, the C++
+ floating-point literal `01.2` will be serialized to `1.2`. During
+ deserialization, leading zeros yield an error.
+ - Not-a-number (NaN) values will be serialized to `null` (see the sketch below).
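+
+ For instance (a minimal sketch; `<limits>` is assumed to be included):
+
+ @code {.cpp}
+ json j = std::numeric_limits<double>::quiet_NaN();
+ j.dump();  // yields "null"
+ @endcode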
+
+ #### Limits
+
+ [RFC 8259](https://tools.ietf.org/html/rfc8259) states:
+ > This specification allows implementations to set limits on the range and
+ > precision of numbers accepted. Since software that implements IEEE
+ > 754-2008 binary64 (double precision) numbers is generally available and
+ > widely used, good interoperability can be achieved by implementations
+ > that expect no more precision or range than these provide, in the sense
+ > that implementations will approximate JSON numbers within the expected
+ > precision.
+
+ This implementation exactly follows this approach, as it uses double
+ precision floating-point numbers. Note that values smaller than
+ `-1.79769313486232e+308` and values greater than `1.79769313486232e+308`
+ will be stored as NaN internally and be serialized to `null`.
+
+ #### Storage
+
+ Floating-point number values are stored directly inside a @ref basic_json
+ type.
+
+ @sa see @ref number_integer_t -- type for number values (integer)
+
+ @sa see @ref number_unsigned_t -- type for number values (unsigned integer)
+
+ @since version 1.0.0
+ */
+ using number_float_t = NumberFloatType;
+
+ /*!
+ @brief a type for a packed binary type
+
+ This type is a type designed to carry binary data that appears in various
+ serialized formats, such as CBOR's Major Type 2, MessagePack's bin, and
+ BSON's generic binary subtype. This type is NOT a part of standard JSON and
+ exists solely for compatibility with these binary types. As such, it is
+ simply defined as an ordered sequence of zero or more byte values.
+
+ Additionally, as an implementation detail, the subtype of the binary data is
+ carried around as a `std::uint8_t`, which is compatible with both of the
+ binary data formats that use binary subtyping (though the specific
+ numbering schemes are mutually incompatible, and it is up to the user to
+ translate between them).
+
+ [CBOR's RFC 7049](https://tools.ietf.org/html/rfc7049) describes this type
+ as:
+ > Major type 2: a byte string. The string's length in bytes is represented
+ > following the rules for positive integers (major type 0).
+
+ [MessagePack's documentation on the bin type
+ family](https://github.com/msgpack/msgpack/blob/master/spec.md#bin-format-family)
+ describes this type as:
+ > Bin format family stores an byte array in 2, 3, or 5 bytes of extra bytes
+ > in addition to the size of the byte array.
+
+ [BSON's specifications](http://bsonspec.org/spec.html) describe several
+ binary types; however, this type is intended to represent the generic binary
+ type which has the description:
+ > Generic binary subtype - This is the most commonly used binary subtype and
+ > should be the 'default' for drivers and tools.
+
+ None of these impose any limitations on the internal representation other
+ than that the basic unit of storage be some type of array whose parts are
+ decomposable into bytes.
+
+ The default representation of this binary format is a
+ `std::vector<std::uint8_t>`, which is a very common way to represent a byte
+ array in modern C++.
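+
+ A minimal usage sketch (assuming the default `json` specialization):
+
+ @code {.cpp}
+ json b = json::binary({0xCA, 0xFE}, 42);  // two bytes with subtype 42
+ auto msgpack = json::to_msgpack(b);       // two bytes + subtype: fixext2
+ @endcode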
+
+ #### Default type
+
+ The default value for @a BinaryType is `std::vector<std::uint8_t>`.
+
+ #### Storage
+
+ Binary arrays are stored as pointers in a @ref basic_json type. That is,
+ for any access to array values, a pointer of the type `binary_t*` must be
+ dereferenced.
+
+ #### Notes on subtypes
+
+ - CBOR
+ - Binary values are represented as byte strings. Subtypes are serialized
+ as tagged values.
+ - MessagePack
+ - If a subtype is given and the binary array contains exactly 1, 2, 4, 8,
+ or 16 elements, the fixext family (fixext1, fixext2, fixext4, fixext8,
+ fixext16) is used. For other sizes, the ext family (ext8, ext16, ext32)
+ is used. The subtype is then added as a signed 8-bit integer.
+ - If no subtype is given, the bin family (bin8, bin16, bin32) is used.
+ - BSON
+ - If a subtype is given, it is used and added as unsigned 8-bit integer.
+ - If no subtype is given, the generic binary subtype 0x00 is used.
+
+ @sa see @ref binary -- create a binary array
+
+ @since version 3.8.0
+ */
+ using binary_t = nlohmann::byte_container_with_subtype<BinaryType>;
+ /// @}
+
+ private:
+
+ /// helper for exception-safe object creation
+ template<typename T, typename... Args>
+ JSON_HEDLEY_RETURNS_NON_NULL
+ static T* create(Args&& ... args)
+ {
+ AllocatorType<T> alloc;
+ using AllocatorTraits = std::allocator_traits<AllocatorType<T>>;
+
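+ // deallocate-only deleter: if construct() below throws, the unique_ptr
+ // releases the raw allocation without running a destructor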
+ auto deleter = [&](T * obj)
+ {
+ AllocatorTraits::deallocate(alloc, obj, 1);
+ };
+ std::unique_ptr<T, decltype(deleter)> obj(AllocatorTraits::allocate(alloc, 1), deleter);
+ AllocatorTraits::construct(alloc, obj.get(), std::forward<Args>(args)...);
+ JSON_ASSERT(obj != nullptr);
+ return obj.release();
+ }
+
+ ////////////////////////
+ // JSON value storage //
+ ////////////////////////
+
+ JSON_PRIVATE_UNLESS_TESTED:
+ /*!
+ @brief a JSON value
+
+ The actual storage for a JSON value of the @ref basic_json class. This
+ union combines the different storage types for the JSON value types
+ defined in @ref value_t.
+
+ JSON type | value_t type | used type
+ --------- | --------------- | ------------------------
+ object | object | pointer to @ref object_t
+ array | array | pointer to @ref array_t
+ string | string | pointer to @ref string_t
+ boolean | boolean | @ref boolean_t
+ number | number_integer | @ref number_integer_t
+ number | number_unsigned | @ref number_unsigned_t
+ number | number_float | @ref number_float_t
+ binary | binary | pointer to @ref binary_t
+ null | null | *no value is stored*
+
+ @note Variable-length types (objects, arrays, and strings) are stored as
+ pointers. The size of the union should not exceed 64 bits if the default
+ value types are used.
+
+ @since version 1.0.0
+ */
+ union json_value
+ {
+ /// object (stored with pointer to save storage)
+ object_t* object;
+ /// array (stored with pointer to save storage)
+ array_t* array;
+ /// string (stored with pointer to save storage)
+ string_t* string;
+ /// binary (stored with pointer to save storage)
+ binary_t* binary;
+ /// boolean
+ boolean_t boolean;
+ /// number (integer)
+ number_integer_t number_integer;
+ /// number (unsigned integer)
+ number_unsigned_t number_unsigned;
+ /// number (floating-point)
+ number_float_t number_float;
+
+ /// default constructor (for null values)
+ json_value() = default;
+ /// constructor for booleans
+ json_value(boolean_t v) noexcept : boolean(v) {}
+ /// constructor for numbers (integer)
+ json_value(number_integer_t v) noexcept : number_integer(v) {}
+ /// constructor for numbers (unsigned)
+ json_value(number_unsigned_t v) noexcept : number_unsigned(v) {}
+ /// constructor for numbers (floating-point)
+ json_value(number_float_t v) noexcept : number_float(v) {}
+ /// constructor for empty values of a given type
+ json_value(value_t t)
+ {
+ switch (t)
+ {
+ case value_t::object:
+ {
+ object = create<object_t>();
+ break;
+ }
+
+ case value_t::array:
+ {
+ array = create<array_t>();
+ break;
+ }
+
+ case value_t::string:
+ {
+ string = create<string_t>("");
+ break;
+ }
+
+ case value_t::binary:
+ {
+ binary = create<binary_t>();
+ break;
+ }
+
+ case value_t::boolean:
+ {
+ boolean = boolean_t(false);
+ break;
+ }
+
+ case value_t::number_integer:
+ {
+ number_integer = number_integer_t(0);
+ break;
+ }
+
+ case value_t::number_unsigned:
+ {
+ number_unsigned = number_unsigned_t(0);
+ break;
+ }
+
+ case value_t::number_float:
+ {
+ number_float = number_float_t(0.0);
+ break;
+ }
+
+ case value_t::null:
+ {
+ object = nullptr; // silence warning, see #821
+ break;
+ }
+
+ case value_t::discarded:
+ default:
+ {
+ object = nullptr; // silence warning, see #821
+ if (JSON_HEDLEY_UNLIKELY(t == value_t::null))
+ {
+ JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.10.2", basic_json())); // LCOV_EXCL_LINE
+ }
+ break;
+ }
+ }
+ }
+
+ /// constructor for strings
+ json_value(const string_t& value)
+ {
+ string = create<string_t>(value);
+ }
+
+ /// constructor for rvalue strings
+ json_value(string_t&& value)
+ {
+ string = create<string_t>(std::move(value));
+ }
+
+ /// constructor for objects
+ json_value(const object_t& value)
+ {
+ object = create<object_t>(value);
+ }
+
+ /// constructor for rvalue objects
+ json_value(object_t&& value)
+ {
+ object = create<object_t>(std::move(value));
+ }
+
+ /// constructor for arrays
+ json_value(const array_t& value)
+ {
+ array = create<array_t>(value);
+ }
+
+ /// constructor for rvalue arrays
+ json_value(array_t&& value)
+ {
+ array = create<array_t>(std::move(value));
+ }
+
+ /// constructor for binary arrays
+ json_value(const typename binary_t::container_type& value)
+ {
+ binary = create<binary_t>(value);
+ }
+
+ /// constructor for rvalue binary arrays
+ json_value(typename binary_t::container_type&& value)
+ {
+ binary = create<binary_t>(std::move(value));
+ }
+
+ /// constructor for binary arrays (internal type)
+ json_value(const binary_t& value)
+ {
+ binary = create<binary_t>(value);
+ }
+
+ /// constructor for rvalue binary arrays (internal type)
+ json_value(binary_t&& value)
+ {
+ binary = create<binary_t>(std::move(value));
+ }
+
+ void destroy(value_t t)
+ {
+ if (t == value_t::array || t == value_t::object)
+ {
+ // to avoid stack overflow from recursive destruction of deeply
+ // nested values, flatten the current json_value to a heap-allocated stack
+ std::vector<basic_json> stack;
+
+ // move the top-level items to stack
+ if (t == value_t::array)
+ {
+ stack.reserve(array->size());
+ std::move(array->begin(), array->end(), std::back_inserter(stack));
+ }
+ else
+ {
+ stack.reserve(object->size());
+ for (auto&& it : *object)
+ {
+ stack.push_back(std::move(it.second));
+ }
+ }
+
+ while (!stack.empty())
+ {
+ // move the last item to local variable to be processed
+ basic_json current_item(std::move(stack.back()));
+ stack.pop_back();
+
+ // if current_item is array/object, move
+ // its children to the stack to be processed later
+ if (current_item.is_array())
+ {
+ std::move(current_item.m_value.array->begin(), current_item.m_value.array->end(), std::back_inserter(stack));
+
+ current_item.m_value.array->clear();
+ }
+ else if (current_item.is_object())
+ {
+ for (auto&& it : *current_item.m_value.object)
+ {
+ stack.push_back(std::move(it.second));
+ }
+
+ current_item.m_value.object->clear();
+ }
+
+ // it is now safe for current_item to be destroyed
+ // since it no longer has any children
+ }
+ }
+
+ switch (t)
+ {
+ case value_t::object:
+ {
+ AllocatorType<object_t> alloc;
+ std::allocator_traits<decltype(alloc)>::destroy(alloc, object);
+ std::allocator_traits<decltype(alloc)>::deallocate(alloc, object, 1);
+ break;
+ }
+
+ case value_t::array:
+ {
+ AllocatorType<array_t> alloc;
+ std::allocator_traits<decltype(alloc)>::destroy(alloc, array);
+ std::allocator_traits<decltype(alloc)>::deallocate(alloc, array, 1);
+ break;
+ }
+
+ case value_t::string:
+ {
+ AllocatorType<string_t> alloc;
+ std::allocator_traits<decltype(alloc)>::destroy(alloc, string);
+ std::allocator_traits<decltype(alloc)>::deallocate(alloc, string, 1);
+ break;
+ }
+
+ case value_t::binary:
+ {
+ AllocatorType<binary_t> alloc;
+ std::allocator_traits<decltype(alloc)>::destroy(alloc, binary);
+ std::allocator_traits<decltype(alloc)>::deallocate(alloc, binary, 1);
+ break;
+ }
+
+ case value_t::null:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::discarded:
+ default:
+ {
+ break;
+ }
+ }
+ }
+ };
+
+ private:
+ /*!
+ @brief checks the class invariants
+
+ This function asserts the class invariants. It needs to be called at the
+ end of every constructor to make sure that created objects respect the
+ invariant. Furthermore, it has to be called each time the type of a JSON
+ value is changed, because the invariant expresses a relationship between
+ @a m_type and @a m_value.
+
+ Furthermore, the parent relation is checked for arrays and objects: if
+ @a check_parents is true and the value is an array or object, then the
+ container's elements must have the current value as parent.
+
+ @param[in] check_parents whether the parent relation should be checked.
+ The value is true by default and should only be set to false
+ during destruction of objects when the invariant does not
+ need to hold.
+ */
+ void assert_invariant(bool check_parents = true) const noexcept
+ {
+ JSON_ASSERT(m_type != value_t::object || m_value.object != nullptr);
+ JSON_ASSERT(m_type != value_t::array || m_value.array != nullptr);
+ JSON_ASSERT(m_type != value_t::string || m_value.string != nullptr);
+ JSON_ASSERT(m_type != value_t::binary || m_value.binary != nullptr);
+
+#if JSON_DIAGNOSTICS
+ JSON_TRY
+ {
+ // cppcheck-suppress assertWithSideEffect
+ JSON_ASSERT(!check_parents || !is_structured() || std::all_of(begin(), end(), [this](const basic_json & j)
+ {
+ return j.m_parent == this;
+ }));
+ }
+ JSON_CATCH(...) {} // LCOV_EXCL_LINE
+#endif
+ static_cast<void>(check_parents);
+ }
+
+ void set_parents()
+ {
+#if JSON_DIAGNOSTICS
+ switch (m_type)
+ {
+ case value_t::array:
+ {
+ for (auto& element : *m_value.array)
+ {
+ element.m_parent = this;
+ }
+ break;
+ }
+
+ case value_t::object:
+ {
+ for (auto& element : *m_value.object)
+ {
+ element.second.m_parent = this;
+ }
+ break;
+ }
+
+ case value_t::null:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ break;
+ }
+#endif
+ }
+
+ iterator set_parents(iterator it, typename iterator::difference_type count)
+ {
+#if JSON_DIAGNOSTICS
+ for (typename iterator::difference_type i = 0; i < count; ++i)
+ {
+ (it + i)->m_parent = this;
+ }
+#else
+ static_cast<void>(count);
+#endif
+ return it;
+ }
+
+ reference set_parent(reference j, std::size_t old_capacity = std::size_t(-1))
+ {
+#if JSON_DIAGNOSTICS
+ if (old_capacity != std::size_t(-1))
+ {
+ // see https://github.com/nlohmann/json/issues/2838
+ JSON_ASSERT(type() == value_t::array);
+ if (JSON_HEDLEY_UNLIKELY(m_value.array->capacity() != old_capacity))
+ {
+ // capacity has changed: update all parents
+ set_parents();
+ return j;
+ }
+ }
+
+ // ordered_json uses a vector internally, so pointers could have
+ // been invalidated; see https://github.com/nlohmann/json/issues/2962
+#ifdef JSON_HEDLEY_MSVC_VERSION
+#pragma warning(push )
+#pragma warning(disable : 4127) // ignore warning to replace if with if constexpr
+#endif
+ if (detail::is_ordered_map<object_t>::value)
+ {
+ set_parents();
+ return j;
+ }
+#ifdef JSON_HEDLEY_MSVC_VERSION
+#pragma warning( pop )
+#endif
+
+ j.m_parent = this;
+#else
+ static_cast<void>(j);
+ static_cast<void>(old_capacity);
+#endif
+ return j;
+ }
+
+ public:
+ //////////////////////////
+ // JSON parser callback //
+ //////////////////////////
+
+ /*!
+ @brief parser event types
+
+ The parser callback distinguishes the following events:
+ - `object_start`: the parser read `{` and started to process a JSON object
+ - `key`: the parser read a key of a value in an object
+ - `object_end`: the parser read `}` and finished processing a JSON object
+ - `array_start`: the parser read `[` and started to process a JSON array
+ - `array_end`: the parser read `]` and finished processing a JSON array
+ - `value`: the parser finished reading a JSON value
+
+ @image html callback_events.png "Example when certain parse events are triggered"
+
+ @sa see @ref parser_callback_t for more information and examples
+ */
+ using parse_event_t = detail::parse_event_t;
+
+ /*!
+ @brief per-element parser callback type
+
+ With a parser callback function, the result of parsing a JSON text can be
+ influenced. When passed to @ref parse, it is called on certain events
+ (passed as @ref parse_event_t via parameter @a event) with a set recursion
+ depth @a depth and context JSON value @a parsed. The return value of the
+ callback function is a boolean indicating whether the element that emitted
+ the callback shall be kept or not.
+
+ We distinguish six scenarios (determined by the event type) in which the
+ callback function can be called. The following table describes the values
+ of the parameters @a depth, @a event, and @a parsed.
+
+ parameter @a event | description | parameter @a depth | parameter @a parsed
+ ------------------ | ----------- | ------------------ | -------------------
+ parse_event_t::object_start | the parser read `{` and started to process a JSON object | depth of the parent of the JSON object | a JSON value with type discarded
+ parse_event_t::key | the parser read a key of a value in an object | depth of the currently parsed JSON object | a JSON string containing the key
+ parse_event_t::object_end | the parser read `}` and finished processing a JSON object | depth of the parent of the JSON object | the parsed JSON object
+ parse_event_t::array_start | the parser read `[` and started to process a JSON array | depth of the parent of the JSON array | a JSON value with type discarded
+ parse_event_t::array_end | the parser read `]` and finished processing a JSON array | depth of the parent of the JSON array | the parsed JSON array
+ parse_event_t::value | the parser finished reading a JSON value | depth of the value | the parsed JSON value
+
+ @image html callback_events.png "Example when certain parse events are triggered"
+
+ Discarding a value (i.e., returning `false`) has different effects
+ depending on the context in which the function was called (a sketch
+ follows the list below):
+
+ - Discarded values in structured types are skipped. That is, the parser
+ will behave as if the discarded value was never read.
+ - In case a value outside a structured type is skipped, it is replaced
+ with `null`. This case happens if the top-level element is skipped.
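+
+ A minimal callback sketch (a hypothetical snippet, assuming the default
+ `json` type): it discards every object member whose key is "skip".
+
+ @code {.cpp}
+ json j = json::parse(R"({"keep": 1, "skip": 2})",
+     [](int /*depth*/, json::parse_event_t event, json& parsed)
+ {
+     return !(event == json::parse_event_t::key && parsed == json("skip"));
+ });
+ // j == {"keep": 1}
+ @endcode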
+
+ @param[in] depth the depth of the recursion during parsing
+
+ @param[in] event an event of type parse_event_t indicating the context
+ in which the callback function has been called
+
+ @param[in,out] parsed the current intermediate parse result; note that
+ writing to this value has no effect for parse_event_t::key events
+
+ @return Whether the JSON value which called the function during parsing
+ should be kept (`true`) or not (`false`). In the latter case, it is either
+ skipped completely or replaced by an empty discarded object.
+
+ @sa see @ref parse for examples
+
+ @since version 1.0.0
+ */
+ using parser_callback_t = detail::parser_callback_t<basic_json>;
+
+ //////////////////
+ // constructors //
+ //////////////////
+
+ /// @name constructors and destructors
+ /// Constructors of class @ref basic_json, copy/move constructor, copy
+ /// assignment, static functions creating objects, and the destructor.
+ /// @{
+
+ /*!
+ @brief create an empty value with a given type
+
+ Create an empty JSON value with a given type. The value will be default
+ initialized with an empty value which depends on the type:
+
+ Value type | initial value
+ ----------- | -------------
+ null | `null`
+ boolean | `false`
+ string | `""`
+ number | `0`
+ object | `{}`
+ array | `[]`
+ binary | empty array
+
+ @param[in] v the type of the value to create
+
+ @complexity Constant.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes to any JSON value.
+
+ @liveexample{The following code shows the constructor for different @ref
+ value_t values,basic_json__value_t}
+
+ @sa see @ref clear() -- restores the postcondition of this constructor
+
+ @since version 1.0.0
+ */
+ basic_json(const value_t v)
+ : m_type(v), m_value(v)
+ {
+ assert_invariant();
+ }
+
+ /*!
+ @brief create a null object
+
+ Create a `null` JSON value. It either takes a null pointer as parameter
+ (explicitly creating `null`) or no parameter (implicitly creating `null`).
+ The passed null pointer itself is not read -- it is only used to choose
+ the right constructor.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this constructor never throws
+ exceptions.
+
+ @liveexample{The following code shows the constructor with and without a
+ null pointer parameter.,basic_json__nullptr_t}
+
+ @since version 1.0.0
+ */
+ basic_json(std::nullptr_t = nullptr) noexcept
+ : basic_json(value_t::null)
+ {
+ assert_invariant();
+ }
+
+ /*!
+ @brief create a JSON value
+
+ This is a "catch all" constructor for all compatible JSON types; that is,
+ types for which a `to_json()` method exists. The constructor forwards the
+ parameter @a val to that method (to `json_serializer<U>::to_json` method
+ with `U = uncvref_t<CompatibleType>`, to be exact).
+
+ Template type @a CompatibleType includes, but is not limited to, the
+ following types:
+ - **arrays**: @ref array_t and all kinds of compatible containers such as
+ `std::vector`, `std::deque`, `std::list`, `std::forward_list`,
+ `std::array`, `std::valarray`, `std::set`, `std::unordered_set`,
+ `std::multiset`, and `std::unordered_multiset` with a `value_type` from
+ which a @ref basic_json value can be constructed.
+ - **objects**: @ref object_t and all kinds of compatible associative
+ containers such as `std::map`, `std::unordered_map`, `std::multimap`,
+ and `std::unordered_multimap` with a `key_type` compatible to
+ @ref string_t and a `value_type` from which a @ref basic_json value can
+ be constructed.
+ - **strings**: @ref string_t, string literals, and all compatible string
+ containers can be used.
+ - **numbers**: @ref number_integer_t, @ref number_unsigned_t,
+ @ref number_float_t, and all convertible number types such as `int`,
+ `size_t`, `int64_t`, `float` or `double` can be used.
+ - **boolean**: @ref boolean_t / `bool` can be used.
+ - **binary**: @ref binary_t / `std::vector<std::uint8_t>` may be used,
+ unfortunately because string literals cannot be distinguished from binary
+ character arrays by the C++ type system, all types compatible with `const
+ char*` will be directed to the string constructor instead. This is both
+ for backwards compatibility, and due to the fact that a binary type is not
+ a standard JSON type.
+
+ See the examples below.
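+
+ A few of these conversions in a minimal sketch (the standard headers
+ `<vector>`, `<map>`, and `<string>` are assumed to be included):
+
+ @code {.cpp}
+ json j1 = std::vector<int> {1, 2, 3};             // array
+ json j2 = std::map<std::string, int> {{"a", 1}};  // object
+ json j3 = 3.141;                                  // number
+ @endcode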
+
+ @tparam CompatibleType a type such that:
+ - @a CompatibleType is not derived from `std::istream`,
+ - @a CompatibleType is not @ref basic_json (to avoid hijacking copy/move
+ constructors),
+ - @a CompatibleType is not a different @ref basic_json type (i.e. with different template arguments)
+ - @a CompatibleType is not a @ref basic_json nested type (e.g.,
+ @ref json_pointer, @ref iterator, etc ...)
+ - `json_serializer<U>` has a `to_json(basic_json_t&, CompatibleType&&)` method
+
+ @tparam U = `uncvref_t<CompatibleType>`
+
+ @param[in] val the value to be forwarded to the respective constructor
+
+ @complexity Usually linear in the size of the passed @a val, also
+ depending on the implementation of the called `to_json()`
+ method.
+
+ @exceptionsafety Depends on the called constructor. For types directly
+ supported by the library (i.e., all types for which no `to_json()` function
+ was provided), strong guarantee holds: if an exception is thrown, there are
+ no changes to any JSON value.
+
+ @liveexample{The following code shows the constructor with several
+ compatible types.,basic_json__CompatibleType}
+
+ @since version 2.1.0
+ */
+ template < typename CompatibleType,
+ typename U = detail::uncvref_t<CompatibleType>,
+ detail::enable_if_t <
+ !detail::is_basic_json<U>::value && detail::is_compatible_type<basic_json_t, U>::value, int > = 0 >
+ basic_json(CompatibleType && val) noexcept(noexcept( // NOLINT(bugprone-forwarding-reference-overload,bugprone-exception-escape)
+ JSONSerializer<U>::to_json(std::declval<basic_json_t&>(),
+ std::forward<CompatibleType>(val))))
+ {
+ JSONSerializer<U>::to_json(*this, std::forward<CompatibleType>(val));
+ set_parents();
+ assert_invariant();
+ }
+
+ /*!
+ @brief create a JSON value from an existing one
+
+ This is a constructor for existing @ref basic_json types.
+ It does not hijack copy/move constructors, since the parameter has different
+ template arguments than the current ones.
+
+ The constructor tries to convert the internal @ref m_value of the parameter.
+
+ @tparam BasicJsonType a type such that:
+ - @a BasicJsonType is a @ref basic_json type.
+ - @a BasicJsonType has different template arguments than @ref basic_json_t.
+
+ @param[in] val the @ref basic_json value to be converted.
+
+ @complexity Usually linear in the size of the passed @a val, also
+ depending on the implementation of the called `to_json()`
+ method.
+
+ @exceptionsafety Depends on the called constructor. For types directly
+ supported by the library (i.e., all types for which no `to_json()` function
+ was provided), strong guarantee holds: if an exception is thrown, there are
+ no changes to any JSON value.
+
+ @since version 3.2.0
+ */
+ template < typename BasicJsonType,
+ detail::enable_if_t <
+ detail::is_basic_json<BasicJsonType>::value&& !std::is_same<basic_json, BasicJsonType>::value, int > = 0 >
+ basic_json(const BasicJsonType& val)
+ {
+ using other_boolean_t = typename BasicJsonType::boolean_t;
+ using other_number_float_t = typename BasicJsonType::number_float_t;
+ using other_number_integer_t = typename BasicJsonType::number_integer_t;
+ using other_number_unsigned_t = typename BasicJsonType::number_unsigned_t;
+ using other_string_t = typename BasicJsonType::string_t;
+ using other_object_t = typename BasicJsonType::object_t;
+ using other_array_t = typename BasicJsonType::array_t;
+ using other_binary_t = typename BasicJsonType::binary_t;
+
+ switch (val.type())
+ {
+ case value_t::boolean:
+ JSONSerializer<other_boolean_t>::to_json(*this, val.template get<other_boolean_t>());
+ break;
+ case value_t::number_float:
+ JSONSerializer<other_number_float_t>::to_json(*this, val.template get<other_number_float_t>());
+ break;
+ case value_t::number_integer:
+ JSONSerializer<other_number_integer_t>::to_json(*this, val.template get<other_number_integer_t>());
+ break;
+ case value_t::number_unsigned:
+ JSONSerializer<other_number_unsigned_t>::to_json(*this, val.template get<other_number_unsigned_t>());
+ break;
+ case value_t::string:
+ JSONSerializer<other_string_t>::to_json(*this, val.template get_ref<const other_string_t&>());
+ break;
+ case value_t::object:
+ JSONSerializer<other_object_t>::to_json(*this, val.template get_ref<const other_object_t&>());
+ break;
+ case value_t::array:
+ JSONSerializer<other_array_t>::to_json(*this, val.template get_ref<const other_array_t&>());
+ break;
+ case value_t::binary:
+ JSONSerializer<other_binary_t>::to_json(*this, val.template get_ref<const other_binary_t&>());
+ break;
+ case value_t::null:
+ *this = nullptr;
+ break;
+ case value_t::discarded:
+ m_type = value_t::discarded;
+ break;
+ default: // LCOV_EXCL_LINE
+ JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
+ }
+ set_parents();
+ assert_invariant();
+ }
+
+ /*!
+ @brief create a container (array or object) from an initializer list
+
+ Creates a JSON value of type array or object from the passed initializer
+ list @a init. In case @a type_deduction is `true` (default), the type of
+ the JSON value to be created is deduced from the initializer list @a init
+ according to the following rules (sketched further below):
+
+ 1. If the list is empty, an empty JSON object value `{}` is created.
+ 2. If the list consists of pairs whose first element is a string, a JSON
+ object value is created where the first elements of the pairs are
+ treated as keys and the second elements as values.
+ 3. In all other cases, an array is created.
+
+ The rules aim to create the best fit between a C++ initializer list and
+ JSON values. The rationale is as follows:
+
+ 1. The empty initializer list is written as `{}` which is exactly an empty
+ JSON object.
+ 2. C++ has no way of describing mapped types other than as a list of
+ pairs. As JSON requires that keys must be of type string, rule 2 is the
+ weakest constraint one can pose on initializer lists to interpret them
+ as an object.
+ 3. In all other cases, the initializer list could not be interpreted as
+ JSON object type, so interpreting it as JSON array type is safe.
+
+ With the rules described above, the following JSON values cannot be
+ expressed by an initializer list:
+
+ - the empty array (`[]`): use @ref array(initializer_list_t)
+ with an empty initializer list in this case
+ - arrays whose elements satisfy rule 2: use @ref
+ array(initializer_list_t) with the same initializer list
+ in this case
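+
+ The deduction rules in a minimal sketch (assuming the default `json`
+ specialization):
+
+ @code {.cpp}
+ json empty = json({});                  // rule 1: empty object {}
+ json obj   = {{"one", 1}, {"two", 2}};  // rule 2: object
+ json arr   = {"one", 1, "two", 2};      // rule 3: array
+ @endcode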
+
+ @note When used without parentheses around an empty initializer list, @ref
+ basic_json() is called instead of this function, yielding the JSON null
+ value.
+
+ @param[in] init initializer list with JSON values
+
+ @param[in] type_deduction internal parameter; when set to `true`, the type
+ of the JSON value is deduced from the initializer list @a init; when set
+ to `false`, the type provided via @a manual_type is forced. This mode is
+ used by the functions @ref array(initializer_list_t) and
+ @ref object(initializer_list_t).
+
+ @param[in] manual_type internal parameter; when @a type_deduction is set
+ to `false`, the created JSON value will use the provided type (only @ref
+ value_t::array and @ref value_t::object are valid); when @a type_deduction
+ is set to `true`, this parameter has no effect
+
+ @throw type_error.301 if @a type_deduction is `false`, @a manual_type is
+ `value_t::object`, but @a init contains an element which is not a pair
+ whose first element is a string. In this case, the constructor could not
+ create an object. If @a type_deduction had been `true`, an array
+ would have been created. See @ref object(initializer_list_t)
+ for an example.
+
+ @complexity Linear in the size of the initializer list @a init.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes to any JSON value.
+
+ @liveexample{The example below shows how JSON values are created from
+ initializer lists.,basic_json__list_init_t}
+
+ @sa see @ref array(initializer_list_t) -- create a JSON array
+ value from an initializer list
+ @sa see @ref object(initializer_list_t) -- create a JSON object
+ value from an initializer list
+
+ @since version 1.0.0
+ */
+ basic_json(initializer_list_t init,
+ bool type_deduction = true,
+ value_t manual_type = value_t::array)
+ {
+ // check if each element is an array with two elements whose first
+ // element is a string
+ bool is_an_object = std::all_of(init.begin(), init.end(),
+ [](const detail::json_ref<basic_json>& element_ref)
+ {
+ return element_ref->is_array() && element_ref->size() == 2 && (*element_ref)[0].is_string();
+ });
+
+ // adjust type if type deduction is not wanted
+ if (!type_deduction)
+ {
+ // if an array is wanted, do not create an object even though it would be possible
+ if (manual_type == value_t::array)
+ {
+ is_an_object = false;
+ }
+
+ // if object is wanted but impossible, throw an exception
+ if (JSON_HEDLEY_UNLIKELY(manual_type == value_t::object && !is_an_object))
+ {
+ JSON_THROW(type_error::create(301, "cannot create object from initializer list", basic_json()));
+ }
+ }
+
+ if (is_an_object)
+ {
+ // the initializer list is a list of pairs -> create object
+ m_type = value_t::object;
+ m_value = value_t::object;
+
+ for (auto& element_ref : init)
+ {
+ auto element = element_ref.moved_or_copied();
+ m_value.object->emplace(
+ std::move(*((*element.m_value.array)[0].m_value.string)),
+ std::move((*element.m_value.array)[1]));
+ }
+ }
+ else
+ {
+ // the initializer list describes an array -> create array
+ m_type = value_t::array;
+ m_value.array = create<array_t>(init.begin(), init.end());
+ }
+
+ set_parents();
+ assert_invariant();
+ }
+
+ /*!
+ @brief explicitly create a binary array (without subtype)
+
+ Creates a JSON binary array value from a given binary container. Binary
+ values are part of various binary formats, such as CBOR, MessagePack, and
+ BSON. This constructor is used to create a value for serialization to those
+ formats.
+
+ @note This function exists because of the difficulty in specifying the
+ correct template overload in the standard value ctor, as both JSON arrays
+ and JSON binary arrays are backed by some form of a `std::vector`. Because
+ JSON binary arrays are a non-standard extension, it was decided that it
+ would be best to prevent automatic initialization of a binary array type,
+ both for backwards compatibility and so it does not happen by accident.
+
+ @param[in] init container containing bytes to use as binary type
+
+ @return JSON binary array value
+
+ @complexity Linear in the size of @a init.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes to any JSON value.
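+
+ A minimal usage sketch:
+
+ @code {.cpp}
+ json b = json::binary({0xDE, 0xAD, 0xBE, 0xEF});
+ b.is_binary();                 // true
+ auto cbor = json::to_cbor(b);  // serialized as a CBOR byte string
+ @endcode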
+
+ @since version 3.8.0
+ */
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json binary(const typename binary_t::container_type& init)
+ {
+ auto res = basic_json();
+ res.m_type = value_t::binary;
+ res.m_value = init;
+ return res;
+ }
+
+ /*!
+ @brief explicitly create a binary array (with subtype)
+
+ Creates a JSON binary array value from a given binary container. Binary
+ values are part of various binary formats, such as CBOR, MessagePack, and
+ BSON. This constructor is used to create a value for serialization to those
+ formats.
+
+ @note This function exists because of the difficulty in specifying the
+ correct template overload in the standard value ctor, as both JSON arrays
+ and JSON binary arrays are backed by some form of a `std::vector`. Because
+ JSON binary arrays are a non-standard extension, it was decided that it
+ would be best to prevent automatic initialization of a binary array type,
+ both for backwards compatibility and so it does not happen by accident.
+
+ @param[in] init container containing bytes to use as binary type
+ @param[in] subtype subtype to use in MessagePack and BSON
+
+ @return JSON binary array value
+
+ @complexity Linear in the size of @a init.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes to any JSON value.
+
+ @since version 3.8.0
+ */
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json binary(const typename binary_t::container_type& init, typename binary_t::subtype_type subtype)
+ {
+ auto res = basic_json();
+ res.m_type = value_t::binary;
+ res.m_value = binary_t(init, subtype);
+ return res;
+ }
+
+ /// @copydoc binary(const typename binary_t::container_type&)
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json binary(typename binary_t::container_type&& init)
+ {
+ auto res = basic_json();
+ res.m_type = value_t::binary;
+ res.m_value = std::move(init);
+ return res;
+ }
+
+ /// @copydoc binary(const typename binary_t::container_type&, typename binary_t::subtype_type)
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json binary(typename binary_t::container_type&& init, typename binary_t::subtype_type subtype)
+ {
+ auto res = basic_json();
+ res.m_type = value_t::binary;
+ res.m_value = binary_t(std::move(init), subtype);
+ return res;
+ }
+
+ /*!
+ @brief explicitly create an array from an initializer list
+
+ Creates a JSON array value from a given initializer list. That is, given a
+ list of values `a, b, c`, creates the JSON value `[a, b, c]`. If the
+ initializer list is empty, the empty array `[]` is created.
+
+ @note This function is only needed to express two edge cases that cannot
+ be realized with the initializer list constructor (@ref
+ basic_json(initializer_list_t, bool, value_t)). These cases
+ are (see the sketch after this list):
+ 1. creating an array whose elements are all pairs whose first element is a
+ string -- in this case, the initializer list constructor would create an
+ object, taking the first elements as keys
+ 2. creating an empty array -- passing the empty initializer list to the
+ initializer list constructor yields an empty object
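+
+ Both edge cases in a minimal sketch:
+
+ @code {.cpp}
+ json a1 = json::array();              // yields [], not null
+ json a2 = json::array({{"one", 1}});  // yields [["one", 1]], not an object
+ @endcode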
+
+ @param[in] init initializer list with JSON values to create an array from
+ (optional)
+
+ @return JSON array value
+
+ @complexity Linear in the size of @a init.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes to any JSON value.
+
+ @liveexample{The following code shows an example for the `array`
+ function.,array}
+
+ @sa see @ref basic_json(initializer_list_t, bool, value_t) --
+ create a JSON value from an initializer list
+ @sa see @ref object(initializer_list_t) -- create a JSON object
+ value from an initializer list
+
+ @since version 1.0.0
+ */
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json array(initializer_list_t init = {})
+ {
+ return basic_json(init, false, value_t::array);
+ }
+
+ /*!
+ @brief explicitly create an object from an initializer list
+
+ Creates a JSON object value from a given initializer list. The initializer
+ list's elements must be pairs, and their first elements must be strings.
+ If the initializer list is empty, the empty object `{}` is created.
+
+ @note This function is only added for symmetry reasons. In contrast to the
+ related function @ref array(initializer_list_t), there are
+ no cases which can only be expressed by this function. That is, any
+ initializer list @a init can also be passed to the initializer list
+ constructor @ref basic_json(initializer_list_t, bool, value_t).
+
+ @param[in] init initializer list to create an object from (optional)
+
+ @return JSON object value
+
+ @throw type_error.301 if @a init is not a list of pairs whose first
+ elements are strings. In this case, no object can be created. Had such a
+ value been passed to @ref basic_json(initializer_list_t, bool, value_t),
+ an array would have been created from the passed initializer list @a
+ init. See example below.
+
+ @complexity Linear in the size of @a init.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes to any JSON value.
+
+ @liveexample{The following code shows an example for the `object`
+ function.,object}
+
+ @sa see @ref basic_json(initializer_list_t, bool, value_t) --
+ create a JSON value from an initializer list
+ @sa see @ref array(initializer_list_t) -- create a JSON array
+ value from an initializer list
+
+ @since version 1.0.0
+ */
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json object(initializer_list_t init = {})
+ {
+ return basic_json(init, false, value_t::object);
+ }
+
+ /*!
+ @brief construct an array with count copies of given value
+
+ Constructs a JSON array value by creating @a cnt copies of a passed value.
+ In case @a cnt is `0`, an empty array is created.
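+
+ For instance (a minimal sketch):
+
+ @code {.cpp}
+ json j(3, json("x"));  // ["x", "x", "x"]
+ @endcode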
+
+ @param[in] cnt the number of JSON copies of @a val to create
+ @param[in] val the JSON value to copy
+
+ @post `std::distance(begin(),end()) == cnt` holds.
+
+ @complexity Linear in @a cnt.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes to any JSON value.
+
+ @liveexample{The following code shows examples for the @ref
+ basic_json(size_type\, const basic_json&)
+ constructor.,basic_json__size_type_basic_json}
+
+ @since version 1.0.0
+ */
+ basic_json(size_type cnt, const basic_json& val)
+ : m_type(value_t::array)
+ {
+ m_value.array = create<array_t>(cnt, val);
+ set_parents();
+ assert_invariant();
+ }
+
+ /*!
+ @brief construct a JSON container given an iterator range
+
+ Constructs the JSON value with the contents of the range `[first, last)`.
+ The semantics depend on the different types a JSON value can have (a
+ sketch follows the list below):
+ - In case of a null type, invalid_iterator.206 is thrown.
+ - In case of other primitive types (number, boolean, or string), @a first
+ must be `begin()` and @a last must be `end()`. In this case, the value is
+ copied. Otherwise, invalid_iterator.204 is thrown.
+ - In case of structured types (array, object), the constructor behaves
+ like the corresponding range constructors of `std::vector` or `std::map`;
+ that is, a JSON array or object is constructed from the values in the
+ range.
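+
+ A sketch of the structured case (assuming the default `json` type):
+
+ @code {.cpp}
+ json j = {1, 2, 3, 4};
+ json sub(j.begin() + 1, j.end());  // [2, 3, 4]
+ @endcode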
+
+ @tparam InputIT an input iterator type (@ref iterator or @ref
+ const_iterator)
+
+ @param[in] first begin of the range to copy from (included)
+ @param[in] last end of the range to copy from (excluded)
+
+ @pre Iterators @a first and @a last must be initialized. **This
+ precondition is enforced with an assertion (see warning).** If
+ assertions are switched off, a violation of this precondition yields
+ undefined behavior.
+
+ @pre Range `[first, last)` is valid. Usually, this precondition cannot be
+ checked efficiently. Only certain edge cases are detected; see the
+ description of the exceptions below. A violation of this precondition
+ yields undefined behavior.
+
+ @warning A precondition is enforced with a runtime assertion that will
+ result in calling `std::abort` if this precondition is not met.
+ Assertions can be disabled by defining `NDEBUG` at compile time.
+ See https://en.cppreference.com/w/cpp/error/assert for more
+ information.
+
+ @throw invalid_iterator.201 if iterators @a first and @a last are not
+ compatible (i.e., do not belong to the same JSON value). In this case,
+ the range `[first, last)` is undefined.
+ @throw invalid_iterator.204 if iterators @a first and @a last belong to a
+ primitive type (number, boolean, or string), but @a first does not point
+ to the first element any more. In this case, the range `[first, last)` is
+ undefined. See example code below.
+ @throw invalid_iterator.206 if iterators @a first and @a last belong to a
+ null value. In this case, the range `[first, last)` is undefined.
+
+ @complexity Linear in distance between @a first and @a last.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes to any JSON value.
+
+ @liveexample{The example below shows several ways to create JSON values by
+ specifying a subrange with iterators.,basic_json__InputIt_InputIt}
+
+ @since version 1.0.0
+ */
+ template < class InputIT, typename std::enable_if <
+ std::is_same<InputIT, typename basic_json_t::iterator>::value ||
+ std::is_same<InputIT, typename basic_json_t::const_iterator>::value, int >::type = 0 >
+ basic_json(InputIT first, InputIT last)
+ {
+ JSON_ASSERT(first.m_object != nullptr);
+ JSON_ASSERT(last.m_object != nullptr);
+
+ // make sure iterator fits the current value
+ if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
+ {
+ JSON_THROW(invalid_iterator::create(201, "iterators are not compatible", basic_json()));
+ }
+
+ // copy type from first iterator
+ m_type = first.m_object->m_type;
+
+ // check if iterator range is complete for primitive values
+ switch (m_type)
+ {
+ case value_t::boolean:
+ case value_t::number_float:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::string:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!first.m_it.primitive_iterator.is_begin()
+ || !last.m_it.primitive_iterator.is_end()))
+ {
+ JSON_THROW(invalid_iterator::create(204, "iterators out of range", *first.m_object));
+ }
+ break;
+ }
+
+ case value_t::null:
+ case value_t::object:
+ case value_t::array:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ break;
+ }
+
+ switch (m_type)
+ {
+ case value_t::number_integer:
+ {
+ m_value.number_integer = first.m_object->m_value.number_integer;
+ break;
+ }
+
+ case value_t::number_unsigned:
+ {
+ m_value.number_unsigned = first.m_object->m_value.number_unsigned;
+ break;
+ }
+
+ case value_t::number_float:
+ {
+ m_value.number_float = first.m_object->m_value.number_float;
+ break;
+ }
+
+ case value_t::boolean:
+ {
+ m_value.boolean = first.m_object->m_value.boolean;
+ break;
+ }
+
+ case value_t::string:
+ {
+ m_value = *first.m_object->m_value.string;
+ break;
+ }
+
+ case value_t::object:
+ {
+ m_value.object = create<object_t>(first.m_it.object_iterator,
+ last.m_it.object_iterator);
+ break;
+ }
+
+ case value_t::array:
+ {
+ m_value.array = create<array_t>(first.m_it.array_iterator,
+ last.m_it.array_iterator);
+ break;
+ }
+
+ case value_t::binary:
+ {
+ m_value = *first.m_object->m_value.binary;
+ break;
+ }
+
+ case value_t::null:
+ case value_t::discarded:
+ default:
+ JSON_THROW(invalid_iterator::create(206, "cannot construct with iterators from " + std::string(first.m_object->type_name()), *first.m_object));
+ }
+
+ set_parents();
+ assert_invariant();
+ }
+
+
+ ///////////////////////////////////////
+ // other constructors and destructor //
+ ///////////////////////////////////////
+
+ template<typename JsonRef,
+ detail::enable_if_t<detail::conjunction<detail::is_json_ref<JsonRef>,
+ std::is_same<typename JsonRef::value_type, basic_json>>::value, int> = 0 >
+ basic_json(const JsonRef& ref) : basic_json(ref.moved_or_copied()) {}
+
+ /*!
+ @brief copy constructor
+
+ Creates a copy of a given JSON value.
+
+ @param[in] other the JSON value to copy
+
+ @post `*this == other`
+
+ @complexity Linear in the size of @a other.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes to any JSON value.
+
+ @requirement This function helps `basic_json` satisfying the
+ [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+ requirements:
+ - The complexity is linear.
+ - As postcondition, it holds: `other == basic_json(other)`.
+
+ @liveexample{The following code shows an example for the copy
+ constructor.,basic_json__basic_json}
+
+ @since version 1.0.0
+ */
+ basic_json(const basic_json& other)
+ : m_type(other.m_type)
+ {
+ // check that passed value is valid
+ other.assert_invariant();
+
+ switch (m_type)
+ {
+ case value_t::object:
+ {
+ m_value = *other.m_value.object;
+ break;
+ }
+
+ case value_t::array:
+ {
+ m_value = *other.m_value.array;
+ break;
+ }
+
+ case value_t::string:
+ {
+ m_value = *other.m_value.string;
+ break;
+ }
+
+ case value_t::boolean:
+ {
+ m_value = other.m_value.boolean;
+ break;
+ }
+
+ case value_t::number_integer:
+ {
+ m_value = other.m_value.number_integer;
+ break;
+ }
+
+ case value_t::number_unsigned:
+ {
+ m_value = other.m_value.number_unsigned;
+ break;
+ }
+
+ case value_t::number_float:
+ {
+ m_value = other.m_value.number_float;
+ break;
+ }
+
+ case value_t::binary:
+ {
+ m_value = *other.m_value.binary;
+ break;
+ }
+
+ case value_t::null:
+ case value_t::discarded:
+ default:
+ break;
+ }
+
+ set_parents();
+ assert_invariant();
+ }
+
+ /*!
+ @brief move constructor
+
+ Move constructor. Constructs a JSON value with the contents of the given
+ value @a other using move semantics. It "steals" the resources from @a
+ other and leaves it as a JSON null value.
+
+ @param[in,out] other value to move to this object
+
+ @post `*this` has the same value as @a other before the call.
+ @post @a other is a JSON null value.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this constructor never throws
+ exceptions.
+
+ @requirement This function helps `basic_json` satisfying the
+ [MoveConstructible](https://en.cppreference.com/w/cpp/named_req/MoveConstructible)
+ requirements.
+
+ @liveexample{The code below shows the move constructor explicitly called
+ via std::move.,basic_json__moveconstructor}
+
+ @since version 1.0.0
+ */
+ basic_json(basic_json&& other) noexcept
+ : m_type(std::move(other.m_type)),
+ m_value(std::move(other.m_value))
+ {
+ // check that passed value is valid
+ other.assert_invariant(false);
+
+ // invalidate payload
+ other.m_type = value_t::null;
+ other.m_value = {};
+
+ set_parents();
+ assert_invariant();
+ }
+
+ /*!
+ @brief copy assignment
+
+ Copy assignment operator. Copies a JSON value via the "copy and swap"
+ strategy: It is expressed in terms of the copy constructor, destructor,
+ and the `swap()` member function.
+
+ @param[in] other value to copy from
+
+ @complexity Linear.
+
+ @requirement This function helps `basic_json` satisfying the
+ [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+ requirements:
+ - The complexity is linear.
+
+ @liveexample{The code below shows an example for the copy assignment. It
+ creates a copy of value `a` which is then swapped with `b`. Finally\, the
+ copy of `a` (which is the null value after the swap) is
+ destroyed.,basic_json__copyassignment}
+
+ @since version 1.0.0
+ */
+ basic_json& operator=(basic_json other) noexcept (
+ std::is_nothrow_move_constructible<value_t>::value&&
+ std::is_nothrow_move_assignable<value_t>::value&&
+ std::is_nothrow_move_constructible<json_value>::value&&
+ std::is_nothrow_move_assignable<json_value>::value
+ )
+ {
+ // check that passed value is valid
+ other.assert_invariant();
+
+ using std::swap;
+ swap(m_type, other.m_type);
+ swap(m_value, other.m_value);
+
+ set_parents();
+ assert_invariant();
+ return *this;
+ }
+
+ /*!
+ @brief destructor
+
+ Destroys the JSON value and frees all allocated memory.
+
+ @complexity Linear.
+
+ @requirement This function helps `basic_json` satisfy the
+ [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+ requirements:
+ - The complexity is linear.
+ - All stored elements are destroyed and all memory is freed.
+
+ @since version 1.0.0
+ */
+ ~basic_json() noexcept
+ {
+ assert_invariant(false);
+ m_value.destroy(m_type);
+ }
+
+ /// @}
+
+ public:
+ ///////////////////////
+ // object inspection //
+ ///////////////////////
+
+ /// @name object inspection
+ /// Functions to inspect the type of a JSON value.
+ /// @{
+
+ /*!
+ @brief serialization
+
+ Serialization function for JSON values. The function tries to mimic
+ Python's `json.dumps()` function, and currently supports its @a indent
+ and @a ensure_ascii parameters.
+
+ @param[in] indent If indent is nonnegative, then array elements and object
+ members will be pretty-printed with that indent level. An indent level of
+ `0` will only insert newlines. `-1` (the default) selects the most compact
+ representation.
+ @param[in] indent_char The character to use for indentation if @a indent is
+ greater than `0`. The default is ` ` (space).
+ @param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters
+ in the output are escaped with `\uXXXX` sequences, and the result consists
+ of ASCII characters only.
+ @param[in] error_handler how to react to decoding errors; there are three
+ possible values: `strict` (throws an exception in case a decoding error
+ occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD),
+ and `ignore` (ignore invalid UTF-8 sequences during serialization; all
+ bytes are copied to the output unchanged).
+
+ @return string containing the serialization of the JSON value
+
+ @throw type_error.316 if a string stored inside the JSON value is not
+ UTF-8 encoded and @a error_handler is set to strict
+
+ @note Binary values are serialized as an object containing two keys:
+ - "bytes": an array of bytes as integers
+ - "subtype": the subtype as an integer, or "null" if the binary has no subtype
+
+ @complexity Linear.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes in the JSON value.
+
+ @liveexample{The following example shows the effect of different @a indent\,
+ @a indent_char\, and @a ensure_ascii parameters on the result of the
+ serialization.,dump}
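+
+ A minimal usage sketch (assuming the default `json` specialization):
+ @code {.cpp}
+ json j = {{"pi", 3.141}, {"happy", true}};
+ j.dump();     // compact: {"happy":true,"pi":3.141}
+ j.dump(4);    // pretty-printed with four spaces per indent level
+ @endcode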
+
+ @see https://docs.python.org/2/library/json.html#json.dump
+
+ @since version 1.0.0; indentation character @a indent_char, option
+ @a ensure_ascii and exceptions added in version 3.0.0; error
+ handlers added in version 3.4.0; serialization of binary values added
+ in version 3.8.0.
+ */
+ string_t dump(const int indent = -1,
+ const char indent_char = ' ',
+ const bool ensure_ascii = false,
+ const error_handler_t error_handler = error_handler_t::strict) const
+ {
+ string_t result;
+ serializer s(detail::output_adapter<char, string_t>(result), indent_char, error_handler);
+
+ if (indent >= 0)
+ {
+ s.dump(*this, true, ensure_ascii, static_cast<unsigned int>(indent));
+ }
+ else
+ {
+ s.dump(*this, false, ensure_ascii, 0);
+ }
+
+ return result;
+ }
+
+ /*!
+ @brief return the type of the JSON value (explicit)
+
+ Return the type of the JSON value as a value from the @ref value_t
+ enumeration.
+
+ @return the type of the JSON value
+ Value type | return value
+ ------------------------- | -------------------------
+ null | value_t::null
+ boolean | value_t::boolean
+ string | value_t::string
+ number (integer) | value_t::number_integer
+ number (unsigned integer) | value_t::number_unsigned
+ number (floating-point) | value_t::number_float
+ object | value_t::object
+ array | value_t::array
+ binary | value_t::binary
+ discarded | value_t::discarded
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `type()` for all JSON
+ types.,type}
+
+ @sa see @ref operator value_t() -- return the type of the JSON value (implicit)
+ @sa see @ref type_name() -- return the type as string
+
+ @since version 1.0.0
+ */
+ constexpr value_t type() const noexcept
+ {
+ return m_type;
+ }
+
+ /*!
+ @brief return whether type is primitive
+
+ This function returns true if and only if the JSON type is primitive
+ (string, number, boolean, binary, or null).
+
+ @return `true` if type is primitive (string, number, boolean, binary, or
+ null), `false` otherwise.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `is_primitive()` for all JSON
+ types.,is_primitive}
+
+ @sa see @ref is_structured() -- returns whether JSON value is structured
+ @sa see @ref is_null() -- returns whether JSON value is `null`
+ @sa see @ref is_string() -- returns whether JSON value is a string
+ @sa see @ref is_boolean() -- returns whether JSON value is a boolean
+ @sa see @ref is_number() -- returns whether JSON value is a number
+ @sa see @ref is_binary() -- returns whether JSON value is a binary array
+
+ @since version 1.0.0
+ */
+ constexpr bool is_primitive() const noexcept
+ {
+ return is_null() || is_string() || is_boolean() || is_number() || is_binary();
+ }
+
+ /*!
+ @brief return whether type is structured
+
+ This function returns true if and only if the JSON type is structured
+ (array or object).
+
+ @return `true` if type is structured (array or object), `false` otherwise.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `is_structured()` for all JSON
+ types.,is_structured}
+
+ @sa see @ref is_primitive() -- returns whether value is primitive
+ @sa see @ref is_array() -- returns whether value is an array
+ @sa see @ref is_object() -- returns whether value is an object
+
+ @since version 1.0.0
+ */
+ constexpr bool is_structured() const noexcept
+ {
+ return is_array() || is_object();
+ }
+
+ /*!
+ @brief return whether value is null
+
+ This function returns true if and only if the JSON value is null.
+
+ @return `true` if type is null, `false` otherwise.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `is_null()` for all JSON
+ types.,is_null}
+
+ @since version 1.0.0
+ */
+ constexpr bool is_null() const noexcept
+ {
+ return m_type == value_t::null;
+ }
+
+ /*!
+ @brief return whether value is a boolean
+
+ This function returns true if and only if the JSON value is a boolean.
+
+ @return `true` if type is boolean, `false` otherwise.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `is_boolean()` for all JSON
+ types.,is_boolean}
+
+ @since version 1.0.0
+ */
+ constexpr bool is_boolean() const noexcept
+ {
+ return m_type == value_t::boolean;
+ }
+
+ /*!
+ @brief return whether value is a number
+
+ This function returns true if and only if the JSON value is a number. This
+ includes both integer (signed and unsigned) and floating-point values.
+
+ @return `true` if type is number (regardless of whether integer, unsigned
+ integer, or floating-point), `false` otherwise.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `is_number()` for all JSON
+ types.,is_number}
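+
+ A minimal sketch (assuming the default `json` specialization):
+ @code {.cpp}
+ json j = 17;
+ j.is_number();           // true
+ j.is_number_integer();   // true
+ j.is_number_float();     // false
+ @endcode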
+
+ @sa see @ref is_number_integer() -- check if value is an integer or unsigned
+ integer number
+ @sa see @ref is_number_unsigned() -- check if value is an unsigned integer
+ number
+ @sa see @ref is_number_float() -- check if value is a floating-point number
+
+ @since version 1.0.0
+ */
+ constexpr bool is_number() const noexcept
+ {
+ return is_number_integer() || is_number_float();
+ }
+
+ /*!
+ @brief return whether value is an integer number
+
+ This function returns true if and only if the JSON value is a signed or
+ unsigned integer number. This excludes floating-point values.
+
+ @return `true` if type is an integer or unsigned integer number, `false`
+ otherwise.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `is_number_integer()` for all
+ JSON types.,is_number_integer}
+
+ @sa see @ref is_number() -- check if value is a number
+ @sa see @ref is_number_unsigned() -- check if value is an unsigned integer
+ number
+ @sa see @ref is_number_float() -- check if value is a floating-point number
+
+ @since version 1.0.0
+ */
+ constexpr bool is_number_integer() const noexcept
+ {
+ return m_type == value_t::number_integer || m_type == value_t::number_unsigned;
+ }
+
+ /*!
+ @brief return whether value is an unsigned integer number
+
+ This function returns true if and only if the JSON value is an unsigned
+ integer number. This excludes floating-point and signed integer values.
+
+ @return `true` if type is an unsigned integer number, `false` otherwise.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `is_number_unsigned()` for all
+ JSON types.,is_number_unsigned}
+
+ @sa see @ref is_number() -- check if value is a number
+ @sa see @ref is_number_integer() -- check if value is an integer or unsigned
+ integer number
+ @sa see @ref is_number_float() -- check if value is a floating-point number
+
+ @since version 2.0.0
+ */
+ constexpr bool is_number_unsigned() const noexcept
+ {
+ return m_type == value_t::number_unsigned;
+ }
+
+ /*!
+ @brief return whether value is a floating-point number
+
+ This function returns true if and only if the JSON value is a
+ floating-point number. This excludes signed and unsigned integer values.
+
+ @return `true` if type is a floating-point number, `false` otherwise.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `is_number_float()` for all
+ JSON types.,is_number_float}
+
+ @sa see @ref is_number() -- check if value is number
+ @sa see @ref is_number_integer() -- check if value is an integer number
+ @sa see @ref is_number_unsigned() -- check if value is an unsigned integer
+ number
+
+ @since version 1.0.0
+ */
+ constexpr bool is_number_float() const noexcept
+ {
+ return m_type == value_t::number_float;
+ }
+
+ /*!
+ @brief return whether value is an object
+
+ This function returns true if and only if the JSON value is an object.
+
+ @return `true` if type is object, `false` otherwise.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `is_object()` for all JSON
+ types.,is_object}
+
+ @since version 1.0.0
+ */
+ constexpr bool is_object() const noexcept
+ {
+ return m_type == value_t::object;
+ }
+
+ /*!
+ @brief return whether value is an array
+
+ This function returns true if and only if the JSON value is an array.
+
+ @return `true` if type is array, `false` otherwise.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `is_array()` for all JSON
+ types.,is_array}
+
+ @since version 1.0.0
+ */
+ constexpr bool is_array() const noexcept
+ {
+ return m_type == value_t::array;
+ }
+
+ /*!
+ @brief return whether value is a string
+
+ This function returns true if and only if the JSON value is a string.
+
+ @return `true` if type is string, `false` otherwise.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `is_string()` for all JSON
+ types.,is_string}
+
+ @since version 1.0.0
+ */
+ constexpr bool is_string() const noexcept
+ {
+ return m_type == value_t::string;
+ }
+
+ /*!
+ @brief return whether value is a binary array
+
+ This function returns true if and only if the JSON value is a binary array.
+
+ @return `true` if type is binary array, `false` otherwise.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `is_binary()` for all JSON
+ types.,is_binary}
+
+ @since version 3.8.0
+ */
+ constexpr bool is_binary() const noexcept
+ {
+ return m_type == value_t::binary;
+ }
+
+ /*!
+ @brief return whether value is discarded
+
+ This function returns true if and only if the JSON value was discarded
+ during parsing with a callback function (see @ref parser_callback_t).
+
+ @note This function will always return `false` for JSON values after parsing.
+ That is, discarded values can only occur during parsing, but will be
+ removed when inside a structured value or replaced by null in other cases.
+
+ @return `true` if type is discarded, `false` otherwise.
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies `is_discarded()` for all JSON
+ types.,is_discarded}
+
+ @since version 1.0.0
+ */
+ constexpr bool is_discarded() const noexcept
+ {
+ return m_type == value_t::discarded;
+ }
+
+ /*!
+ @brief return the type of the JSON value (implicit)
+
+ Implicitly return the type of the JSON value as a value from the @ref
+ value_t enumeration.
+
+ @return the type of the JSON value
+
+ @complexity Constant.
+
+ @exceptionsafety No-throw guarantee: this member function never throws
+ exceptions.
+
+ @liveexample{The following code exemplifies the @ref value_t operator for
+ all JSON types.,operator__value_t}
+
+ @sa see @ref type() -- return the type of the JSON value (explicit)
+ @sa see @ref type_name() -- return the type as string
+
+ @since version 1.0.0
+ */
+ constexpr operator value_t() const noexcept
+ {
+ return m_type;
+ }
+
+ /// @}
+
+ private:
+ //////////////////
+ // value access //
+ //////////////////
+
+ /// get a boolean (explicit)
+ boolean_t get_impl(boolean_t* /*unused*/) const
+ {
+ if (JSON_HEDLEY_LIKELY(is_boolean()))
+ {
+ return m_value.boolean;
+ }
+
+ JSON_THROW(type_error::create(302, "type must be boolean, but is " + std::string(type_name()), *this));
+ }
+
+ /// get a pointer to the value (object)
+ object_t* get_impl_ptr(object_t* /*unused*/) noexcept
+ {
+ return is_object() ? m_value.object : nullptr;
+ }
+
+ /// get a pointer to the value (object)
+ constexpr const object_t* get_impl_ptr(const object_t* /*unused*/) const noexcept
+ {
+ return is_object() ? m_value.object : nullptr;
+ }
+
+ /// get a pointer to the value (array)
+ array_t* get_impl_ptr(array_t* /*unused*/) noexcept
+ {
+ return is_array() ? m_value.array : nullptr;
+ }
+
+ /// get a pointer to the value (array)
+ constexpr const array_t* get_impl_ptr(const array_t* /*unused*/) const noexcept
+ {
+ return is_array() ? m_value.array : nullptr;
+ }
+
+ /// get a pointer to the value (string)
+ string_t* get_impl_ptr(string_t* /*unused*/) noexcept
+ {
+ return is_string() ? m_value.string : nullptr;
+ }
+
+ /// get a pointer to the value (string)
+ constexpr const string_t* get_impl_ptr(const string_t* /*unused*/) const noexcept
+ {
+ return is_string() ? m_value.string : nullptr;
+ }
+
+ /// get a pointer to the value (boolean)
+ boolean_t* get_impl_ptr(boolean_t* /*unused*/) noexcept
+ {
+ return is_boolean() ? &m_value.boolean : nullptr;
+ }
+
+ /// get a pointer to the value (boolean)
+ constexpr const boolean_t* get_impl_ptr(const boolean_t* /*unused*/) const noexcept
+ {
+ return is_boolean() ? &m_value.boolean : nullptr;
+ }
+
+ /// get a pointer to the value (integer number)
+ number_integer_t* get_impl_ptr(number_integer_t* /*unused*/) noexcept
+ {
+ return is_number_integer() ? &m_value.number_integer : nullptr;
+ }
+
+ /// get a pointer to the value (integer number)
+ constexpr const number_integer_t* get_impl_ptr(const number_integer_t* /*unused*/) const noexcept
+ {
+ return is_number_integer() ? &m_value.number_integer : nullptr;
+ }
+
+ /// get a pointer to the value (unsigned number)
+ number_unsigned_t* get_impl_ptr(number_unsigned_t* /*unused*/) noexcept
+ {
+ return is_number_unsigned() ? &m_value.number_unsigned : nullptr;
+ }
+
+ /// get a pointer to the value (unsigned number)
+ constexpr const number_unsigned_t* get_impl_ptr(const number_unsigned_t* /*unused*/) const noexcept
+ {
+ return is_number_unsigned() ? &m_value.number_unsigned : nullptr;
+ }
+
+ /// get a pointer to the value (floating-point number)
+ number_float_t* get_impl_ptr(number_float_t* /*unused*/) noexcept
+ {
+ return is_number_float() ? &m_value.number_float : nullptr;
+ }
+
+ /// get a pointer to the value (floating-point number)
+ constexpr const number_float_t* get_impl_ptr(const number_float_t* /*unused*/) const noexcept
+ {
+ return is_number_float() ? &m_value.number_float : nullptr;
+ }
+
+ /// get a pointer to the value (binary)
+ binary_t* get_impl_ptr(binary_t* /*unused*/) noexcept
+ {
+ return is_binary() ? m_value.binary : nullptr;
+ }
+
+ /// get a pointer to the value (binary)
+ constexpr const binary_t* get_impl_ptr(const binary_t* /*unused*/) const noexcept
+ {
+ return is_binary() ? m_value.binary : nullptr;
+ }
+
+ /*!
+ @brief helper function to implement get_ref()
+
+ This function helps to implement get_ref() without code duplication for
+ const and non-const overloads
+
+ @tparam ThisType will be deduced as `basic_json` or `const basic_json`
+
+ @throw type_error.303 if ReferenceType does not match underlying value
+ type of the current JSON
+ */
+ template<typename ReferenceType, typename ThisType>
+ static ReferenceType get_ref_impl(ThisType& obj)
+ {
+ // delegate the call to get_ptr<>()
+ auto* ptr = obj.template get_ptr<typename std::add_pointer<ReferenceType>::type>();
+
+ if (JSON_HEDLEY_LIKELY(ptr != nullptr))
+ {
+ return *ptr;
+ }
+
+ JSON_THROW(type_error::create(303, "incompatible ReferenceType for get_ref, actual type is " + std::string(obj.type_name()), obj));
+ }
+
+ public:
+ /// @name value access
+ /// Direct access to the stored value of a JSON value.
+ /// @{
+
+ /*!
+ @brief get a pointer value (implicit)
+
+ Implicit pointer access to the internally stored JSON value. No copies are
+ made.
+
+ @warning Writing data to the pointee of the result yields an undefined
+ state.
+
+ @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
+ object_t, @ref string_t, @ref boolean_t, @ref number_integer_t,
+ @ref number_unsigned_t, or @ref number_float_t. Enforced by a static
+ assertion.
+
+ @return pointer to the internally stored JSON value if the requested
+ pointer type @a PointerType fits to the JSON value; `nullptr` otherwise
+
+ @complexity Constant.
+
+ @liveexample{The example below shows how pointers to internal values of a
+ JSON value can be requested. Note that no type conversions are made and a
+ `nullptr` is returned if the value and the requested pointer type do not
+ match.,get_ptr}
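+
+ A minimal sketch (assuming the default `json` specialization):
+ @code {.cpp}
+ json j = "hello";
+ auto* s = j.get_ptr<json::string_t*>();    // valid pointer
+ auto* b = j.get_ptr<json::boolean_t*>();   // nullptr: type mismatch
+ @endcode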
+
+ @since version 1.0.0
+ */
+ template<typename PointerType, typename std::enable_if<
+ std::is_pointer<PointerType>::value, int>::type = 0>
+ auto get_ptr() noexcept -> decltype(std::declval<basic_json_t&>().get_impl_ptr(std::declval<PointerType>()))
+ {
+ // delegate the call to get_impl_ptr<>()
+ return get_impl_ptr(static_cast<PointerType>(nullptr));
+ }
+
+ /*!
+ @brief get a pointer value (implicit)
+ @copydoc get_ptr()
+ */
+ template < typename PointerType, typename std::enable_if <
+ std::is_pointer<PointerType>::value&&
+ std::is_const<typename std::remove_pointer<PointerType>::type>::value, int >::type = 0 >
+ constexpr auto get_ptr() const noexcept -> decltype(std::declval<const basic_json_t&>().get_impl_ptr(std::declval<PointerType>()))
+ {
+ // delegate the call to get_impl_ptr<>() const
+ return get_impl_ptr(static_cast<PointerType>(nullptr));
+ }
+
+ private:
+ /*!
+ @brief get a value (explicit)
+
+ Explicit type conversion between the JSON value and a compatible value
+ which is [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible)
+ and [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible).
+ The value is converted by calling the @ref json_serializer<ValueType>
+ `from_json()` method.
+
+ The function is equivalent to executing
+ @code {.cpp}
+ ValueType ret;
+ JSONSerializer<ValueType>::from_json(*this, ret);
+ return ret;
+ @endcode
+
+ This overload is chosen if:
+ - @a ValueType is not @ref basic_json,
+ - @ref json_serializer<ValueType> has a `from_json()` method of the form
+ `void from_json(const basic_json&, ValueType&)`, and
+ - @ref json_serializer<ValueType> does not have a `from_json()` method of
+ the form `ValueType from_json(const basic_json&)`
+
+ @tparam ValueType the returned value type
+
+ @return copy of the JSON value, converted to @a ValueType
+
+ @throw what @ref json_serializer<ValueType> `from_json()` method throws
+
+ @liveexample{The example below shows several conversions from JSON values
+ to other types. There are a few things to note: (1) Floating-point numbers can
+ be converted to integers\, (2) A JSON array can be converted to a standard
+ `std::vector<short>`\, (3) A JSON object can be converted to C++
+ associative containers such as `std::unordered_map<std::string\,
+ json>`.,get__ValueType_const}
+
+ @since version 2.1.0
+ */
+ template < typename ValueType,
+ detail::enable_if_t <
+ detail::is_default_constructible<ValueType>::value&&
+ detail::has_from_json<basic_json_t, ValueType>::value,
+ int > = 0 >
+ ValueType get_impl(detail::priority_tag<0> /*unused*/) const noexcept(noexcept(
+ JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>(), std::declval<ValueType&>())))
+ {
+ ValueType ret{};
+ JSONSerializer<ValueType>::from_json(*this, ret);
+ return ret;
+ }
+
+ /*!
+ @brief get a value (explicit); special case
+
+ Explicit type conversion between the JSON value and a compatible value
+ which is **not** [CopyConstructible](https://en.cppreference.com/w/cpp/named_req/CopyConstructible)
+ and **not** [DefaultConstructible](https://en.cppreference.com/w/cpp/named_req/DefaultConstructible).
+ The value is converted by calling the @ref json_serializer<ValueType>
+ `from_json()` method.
+
+ The function is equivalent to executing
+ @code {.cpp}
+ return JSONSerializer<ValueType>::from_json(*this);
+ @endcode
+
+ This overload is chosen if:
+ - @a ValueType is not @ref basic_json and
+ - @ref json_serializer<ValueType> has a `from_json()` method of the form
+ `ValueType from_json(const basic_json&)`
+
+ @note If @ref json_serializer<ValueType> has both overloads of
+ `from_json()`, this one is chosen.
+
+ @tparam ValueType the returned value type
+
+ @return copy of the JSON value, converted to @a ValueType
+
+ @throw what @ref json_serializer<ValueType> `from_json()` method throws
+
+ @since version 2.1.0
+ */
+ template < typename ValueType,
+ detail::enable_if_t <
+ detail::has_non_default_from_json<basic_json_t, ValueType>::value,
+ int > = 0 >
+ ValueType get_impl(detail::priority_tag<1> /*unused*/) const noexcept(noexcept(
+ JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>())))
+ {
+ return JSONSerializer<ValueType>::from_json(*this);
+ }
+
+ /*!
+ @brief get special-case overload
+
+ This overload converts the current @ref basic_json into a different
+ @ref basic_json type
+
+ @tparam BasicJsonType == @ref basic_json
+
+ @return a copy of *this, converted into @a BasicJsonType
+
+ @complexity Depending on the implementation of the called `from_json()`
+ method.
+
+ @since version 3.2.0
+ */
+ template < typename BasicJsonType,
+ detail::enable_if_t <
+ detail::is_basic_json<BasicJsonType>::value,
+ int > = 0 >
+ BasicJsonType get_impl(detail::priority_tag<2> /*unused*/) const
+ {
+ return *this;
+ }
+
+ /*!
+ @brief get special-case overload
+
+ This overload avoids a lot of template boilerplate; it can be seen as the
+ identity method
+
+ @tparam BasicJsonType == @ref basic_json
+
+ @return a copy of *this
+
+ @complexity Constant.
+
+ @since version 2.1.0
+ */
+ template<typename BasicJsonType,
+ detail::enable_if_t<
+ std::is_same<BasicJsonType, basic_json_t>::value,
+ int> = 0>
+ basic_json get_impl(detail::priority_tag<3> /*unused*/) const
+ {
+ return *this;
+ }
+
+ /*!
+ @brief get a pointer value (explicit)
+ @copydoc get()
+ */
+ template<typename PointerType,
+ detail::enable_if_t<
+ std::is_pointer<PointerType>::value,
+ int> = 0>
+ constexpr auto get_impl(detail::priority_tag<4> /*unused*/) const noexcept
+ -> decltype(std::declval<const basic_json_t&>().template get_ptr<PointerType>())
+ {
+ // delegate the call to get_ptr
+ return get_ptr<PointerType>();
+ }
+
+ public:
+ /*!
+ @brief get a (pointer) value (explicit)
+
+ Performs explicit type conversion between the JSON value and a compatible value if required.
+
+ - If the requested type is a pointer to the internally stored JSON value, that pointer is returned.
+ No copies are made.
+
+ - If the requested type is the current @ref basic_json, or a different @ref basic_json convertible
+ from the current @ref basic_json, a copy of `*this` is returned.
+
+ - Otherwise the value is converted by calling the @ref json_serializer<ValueType> `from_json()`
+ method.
+
+ @tparam ValueTypeCV the provided value type
+ @tparam ValueType the returned value type
+
+ @return copy of the JSON value, converted to @a ValueType if necessary
+
+ @throw what @ref json_serializer<ValueType> `from_json()` method throws if conversion is required
+
+ @since version 2.1.0
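+
+ A minimal usage sketch (assuming the default `json` specialization):
+ @code {.cpp}
+ json j = "value";
+ auto s = j.get<std::string>();   // conversion via from_json()
+ auto c = j.get<json>();          // identity overload: copy of j
+ @endcode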
+ */
+ template < typename ValueTypeCV, typename ValueType = detail::uncvref_t<ValueTypeCV>>
+#if defined(JSON_HAS_CPP_14)
+ constexpr
+#endif
+ auto get() const noexcept(
+ noexcept(std::declval<const basic_json_t&>().template get_impl<ValueType>(detail::priority_tag<4> {})))
+ -> decltype(std::declval<const basic_json_t&>().template get_impl<ValueType>(detail::priority_tag<4> {}))
+ {
+ // we cannot static_assert on ValueTypeCV being non-const, because
+ // there is support for get<const basic_json_t>(), which is why we
+ // still need the uncvref
+ static_assert(!std::is_reference<ValueTypeCV>::value,
+ "get() cannot be used with reference types, you might want to use get_ref()");
+ return get_impl<ValueType>(detail::priority_tag<4> {});
+ }
+
+ /*!
+ @brief get a pointer value (explicit)
+
+ Explicit pointer access to the internally stored JSON value. No copies are
+ made.
+
+ @warning The pointer becomes invalid if the underlying JSON object
+ changes.
+
+ @tparam PointerType pointer type; must be a pointer to @ref array_t, @ref
+ object_t, @ref string_t, @ref boolean_t, @ref number_integer_t,
+ @ref number_unsigned_t, or @ref number_float_t.
+
+ @return pointer to the internally stored JSON value if the requested
+ pointer type @a PointerType fits to the JSON value; `nullptr` otherwise
+
+ @complexity Constant.
+
+ @liveexample{The example below shows how pointers to internal values of a
+ JSON value can be requested. Note that no type conversions are made and a
+ `nullptr` is returned if the value and the requested pointer type do not
+ match.,get__PointerType}
+
+ @sa see @ref get_ptr() for explicit pointer-member access
+
+ @since version 1.0.0
+ */
+ template<typename PointerType, typename std::enable_if<
+ std::is_pointer<PointerType>::value, int>::type = 0>
+ auto get() noexcept -> decltype(std::declval<basic_json_t&>().template get_ptr<PointerType>())
+ {
+ // delegate the call to get_ptr
+ return get_ptr<PointerType>();
+ }
+
+ /*!
+ @brief get a value (explicit)
+
+ Explicit type conversion between the JSON value and a compatible value.
+ The value is filled into the input parameter by calling the @ref json_serializer<ValueType>
+ `from_json()` method.
+
+ The function is equivalent to executing
+ @code {.cpp}
+ ValueType v;
+ JSONSerializer<ValueType>::from_json(*this, v);
+ @endcode
+
+ This overload is chosen if:
+ - @a ValueType is not @ref basic_json, and
+ - @ref json_serializer<ValueType> has a `from_json()` method of the form
+ `void from_json(const basic_json&, ValueType&)`.
+
+ @tparam ValueType the input parameter type.
+
+ @return the input parameter, allowing calls to be chained.
+
+ @throw what @ref json_serializer<ValueType> `from_json()` method throws
+
+ @liveexample{The example below shows several conversions from JSON values
+ to other types. There are a few things to note: (1) Floating-point numbers can
+ be converted to integers\, (2) A JSON array can be converted to a standard
+ `std::vector<short>`\, (3) A JSON object can be converted to C++
+ associative containers such as `std::unordered_map<std::string\,
+ json>`.,get_to}
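+
+ A minimal usage sketch (assuming the default `json` specialization):
+ @code {.cpp}
+ json j = {1, 2, 3};
+ std::vector<int> v;
+ j.get_to(v);    // fills v with {1, 2, 3} and returns a reference to it
+ @endcode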
+
+ @since version 3.3.0
+ */
+ template < typename ValueType,
+ detail::enable_if_t <
+ !detail::is_basic_json<ValueType>::value&&
+ detail::has_from_json<basic_json_t, ValueType>::value,
+ int > = 0 >
+ ValueType & get_to(ValueType& v) const noexcept(noexcept(
+ JSONSerializer<ValueType>::from_json(std::declval<const basic_json_t&>(), v)))
+ {
+ JSONSerializer<ValueType>::from_json(*this, v);
+ return v;
+ }
+
+ // specialization to allow calling get_to with a basic_json value
+ // see https://github.com/nlohmann/json/issues/2175
+ template<typename ValueType,
+ detail::enable_if_t <
+ detail::is_basic_json<ValueType>::value,
+ int> = 0>
+ ValueType & get_to(ValueType& v) const
+ {
+ v = *this;
+ return v;
+ }
+
+ template <
+ typename T, std::size_t N,
+ typename Array = T (&)[N], // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
+ detail::enable_if_t <
+ detail::has_from_json<basic_json_t, Array>::value, int > = 0 >
+ Array get_to(T (&v)[N]) const // NOLINT(cppcoreguidelines-avoid-c-arrays,hicpp-avoid-c-arrays,modernize-avoid-c-arrays)
+ noexcept(noexcept(JSONSerializer<Array>::from_json(
+ std::declval<const basic_json_t&>(), v)))
+ {
+ JSONSerializer<Array>::from_json(*this, v);
+ return v;
+ }
+
+ /*!
+ @brief get a reference value (implicit)
+
+ Implicit reference access to the internally stored JSON value. No copies
+ are made.
+
+ @warning Writing data to the referee of the result yields an undefined
+ state.
+
+ @tparam ReferenceType reference type; must be a reference to @ref array_t,
+ @ref object_t, @ref string_t, @ref boolean_t, @ref number_integer_t, or
+ @ref number_float_t. Enforced by static assertion.
+
+ @return reference to the internally stored JSON value if the requested
+ reference type @a ReferenceType fits to the JSON value; throws
+ type_error.303 otherwise
+
+ @throw type_error.303 in case passed type @a ReferenceType is incompatible
+ with the stored JSON value; see example below
+
+ @complexity Constant.
+
+ @liveexample{The example shows several calls to `get_ref()`.,get_ref}
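+
+ A minimal sketch (assuming the default `json` specialization):
+ @code {.cpp}
+ json j = 17;
+ auto& n = j.get_ref<json::number_integer_t&>();   // OK
+ // j.get_ref<json::string_t&>();                  // would throw type_error.303
+ @endcode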
+
+ @since version 1.1.0
+ */
+ template<typename ReferenceType, typename std::enable_if<
+ std::is_reference<ReferenceType>::value, int>::type = 0>
+ ReferenceType get_ref()
+ {
+ // delegate call to get_ref_impl
+ return get_ref_impl<ReferenceType>(*this);
+ }
+
+ /*!
+ @brief get a reference value (implicit)
+ @copydoc get_ref()
+ */
+ template < typename ReferenceType, typename std::enable_if <
+ std::is_reference<ReferenceType>::value&&
+ std::is_const<typename std::remove_reference<ReferenceType>::type>::value, int >::type = 0 >
+ ReferenceType get_ref() const
+ {
+ // delegate call to get_ref_impl
+ return get_ref_impl<ReferenceType>(*this);
+ }
+
+ /*!
+ @brief get a value (implicit)
+
+ Implicit type conversion between the JSON value and a compatible value.
+ The conversion is realized by calling @ref get() const.
+
+ @tparam ValueType non-pointer type compatible to the JSON value, for
+ instance `int` for JSON integer numbers, `bool` for JSON booleans, or
+ `std::vector` types for JSON arrays. The character type of @ref string_t
+ as well as an initializer list of this type is excluded to avoid
+ ambiguities as these types implicitly convert to `std::string`.
+
+ @return copy of the JSON value, converted to type @a ValueType
+
+ @throw type_error.302 in case passed type @a ValueType is incompatible
+ to the JSON value type (e.g., the JSON value is of type boolean, but a
+ string is requested); see example below
+
+ @complexity Linear in the size of the JSON value.
+
+ @liveexample{The example below shows several conversions from JSON values
+ to other types. There are a few things to note: (1) Floating-point numbers can
+ be converted to integers\, (2) A JSON array can be converted to a standard
+ `std::vector<short>`\, (3) A JSON object can be converted to C++
+ associative containers such as `std::unordered_map<std::string\,
+ json>`.,operator__ValueType}
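+
+ A minimal sketch (assuming the default `json` specialization):
+ @code {.cpp}
+ json j = 42;
+ int i = j;    // implicit conversion, equivalent to j.get<int>()
+ @endcode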
+
+ @since version 1.0.0
+ */
+ template < typename ValueType, typename std::enable_if <
+ detail::conjunction <
+ detail::negation<std::is_pointer<ValueType>>,
+ detail::negation<std::is_same<ValueType, detail::json_ref<basic_json>>>,
+ detail::negation<std::is_same<ValueType, typename string_t::value_type>>,
+ detail::negation<detail::is_basic_json<ValueType>>,
+ detail::negation<std::is_same<ValueType, std::initializer_list<typename string_t::value_type>>>,
+
+#if defined(JSON_HAS_CPP_17) && (defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1910 && _MSC_VER <= 1914))
+ detail::negation<std::is_same<ValueType, std::string_view>>,
+#endif
+ detail::is_detected_lazy<detail::get_template_function, const basic_json_t&, ValueType>
+ >::value, int >::type = 0 >
+ JSON_EXPLICIT operator ValueType() const
+ {
+ // delegate the call to get<>() const
+ return get<ValueType>();
+ }
+
+ /*!
+ @return reference to the binary value
+
+ @throw type_error.302 if the value is not binary
+
+ @sa see @ref is_binary() to check if the value is binary
+
+ @since version 3.8.0
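+
+ A minimal usage sketch (assuming the default `json` specialization):
+ @code {.cpp}
+ json j = json::binary({0xCA, 0xFE});
+ auto& bin = j.get_binary();   // reference to the stored byte container
+ // json(42).get_binary();     // would throw type_error.302
+ @endcode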
+ */
+ binary_t& get_binary()
+ {
+ if (!is_binary())
+ {
+ JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(type_name()), *this));
+ }
+
+ return *get_ptr<binary_t*>();
+ }
+
+ /// @copydoc get_binary()
+ const binary_t& get_binary() const
+ {
+ if (!is_binary())
+ {
+ JSON_THROW(type_error::create(302, "type must be binary, but is " + std::string(type_name()), *this));
+ }
+
+ return *get_ptr<const binary_t*>();
+ }
+
+ /// @}
+
+
+ ////////////////////
+ // element access //
+ ////////////////////
+
+ /// @name element access
+ /// Access to the JSON value.
+ /// @{
+
+ /*!
+ @brief access specified array element with bounds checking
+
+ Returns a reference to the element at specified location @a idx, with
+ bounds checking.
+
+ @param[in] idx index of the element to access
+
+ @return reference to the element at index @a idx
+
+ @throw type_error.304 if the JSON value is not an array; in this case,
+ calling `at` with an index makes no sense. See example below.
+ @throw out_of_range.401 if the index @a idx is out of range of the array;
+ that is, `idx >= size()`. See example below.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes in the JSON value.
+
+ @complexity Constant.
+
+ @since version 1.0.0
+
+ @liveexample{The example below shows how array elements can be read and
+ written using `at()`. It also demonstrates the different exceptions that
+ can be thrown.,at__size_type}
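+
+ A minimal usage sketch (assuming the default `json` specialization):
+ @code {.cpp}
+ json j = {1, 2, 3};
+ j.at(1) = 20;    // bounds-checked write access
+ // j.at(9);      // would throw out_of_range.401
+ @endcode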
+ */
+ reference at(size_type idx)
+ {
+ // at only works for arrays
+ if (JSON_HEDLEY_LIKELY(is_array()))
+ {
+ JSON_TRY
+ {
+ return set_parent(m_value.array->at(idx));
+ }
+ JSON_CATCH (std::out_of_range&)
+ {
+ // create better exception explanation
+ JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range", *this));
+ }
+ }
+ else
+ {
+ JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()), *this));
+ }
+ }
+
+ /*!
+ @brief access specified array element with bounds checking
+
+ Returns a const reference to the element at specified location @a idx,
+ with bounds checking.
+
+ @param[in] idx index of the element to access
+
+ @return const reference to the element at index @a idx
+
+ @throw type_error.304 if the JSON value is not an array; in this case,
+ calling `at` with an index makes no sense. See example below.
+ @throw out_of_range.401 if the index @a idx is out of range of the array;
+ that is, `idx >= size()`. See example below.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes in the JSON value.
+
+ @complexity Constant.
+
+ @since version 1.0.0
+
+ @liveexample{The example below shows how array elements can be read using
+ `at()`. It also demonstrates the different exceptions that can be thrown.,
+ at__size_type_const}
+ */
+ const_reference at(size_type idx) const
+ {
+ // at only works for arrays
+ if (JSON_HEDLEY_LIKELY(is_array()))
+ {
+ JSON_TRY
+ {
+ return m_value.array->at(idx);
+ }
+ JSON_CATCH (std::out_of_range&)
+ {
+ // create better exception explanation
+ JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range", *this));
+ }
+ }
+ else
+ {
+ JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()), *this));
+ }
+ }
+
+ /*!
+ @brief access specified object element with bounds checking
+
+ Returns a reference to the element with the specified key @a key, with
+ bounds checking.
+
+ @param[in] key key of the element to access
+
+ @return reference to the element at key @a key
+
+ @throw type_error.304 if the JSON value is not an object; in this case,
+ calling `at` with a key makes no sense. See example below.
+ @throw out_of_range.403 if the key @a key is not stored in the object;
+ that is, `find(key) == end()`. See example below.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes in the JSON value.
+
+ @complexity Logarithmic in the size of the container.
+
+ @sa see @ref operator[](const typename object_t::key_type&) for unchecked
+ access by reference
+ @sa see @ref value() for access by value with a default value
+
+ @since version 1.0.0
+
+ @liveexample{The example below shows how object elements can be read and
+ written using `at()`. It also demonstrates the different exceptions that
+ can be thrown.,at__object_t_key_type}
+ */
+ reference at(const typename object_t::key_type& key)
+ {
+ // at only works for objects
+ if (JSON_HEDLEY_LIKELY(is_object()))
+ {
+ JSON_TRY
+ {
+ return set_parent(m_value.object->at(key));
+ }
+ JSON_CATCH (std::out_of_range&)
+ {
+ // create better exception explanation
+ JSON_THROW(out_of_range::create(403, "key '" + key + "' not found", *this));
+ }
+ }
+ else
+ {
+ JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()), *this));
+ }
+ }
+
+ /*!
+ @brief access specified object element with bounds checking
+
+ Returns a const reference to the element with the specified key @a key,
+ with bounds checking.
+
+ @param[in] key key of the element to access
+
+ @return const reference to the element at key @a key
+
+ @throw type_error.304 if the JSON value is not an object; in this case,
+ calling `at` with a key makes no sense. See example below.
+ @throw out_of_range.403 if the key @a key is not stored in the object;
+ that is, `find(key) == end()`. See example below.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes in the JSON value.
+
+ @complexity Logarithmic in the size of the container.
+
+ @sa see @ref operator[](const typename object_t::key_type&) for unchecked
+ access by reference
+ @sa see @ref value() for access by value with a default value
+
+ @since version 1.0.0
+
+ @liveexample{The example below shows how object elements can be read using
+ `at()`. It also demonstrates the different exceptions that can be thrown.,
+ at__object_t_key_type_const}
+ */
+ const_reference at(const typename object_t::key_type& key) const
+ {
+ // at only works for objects
+ if (JSON_HEDLEY_LIKELY(is_object()))
+ {
+ JSON_TRY
+ {
+ return m_value.object->at(key);
+ }
+ JSON_CATCH (std::out_of_range&)
+ {
+ // create better exception explanation
+ JSON_THROW(out_of_range::create(403, "key '" + key + "' not found", *this));
+ }
+ }
+ else
+ {
+ JSON_THROW(type_error::create(304, "cannot use at() with " + std::string(type_name()), *this));
+ }
+ }
+
+ /*!
+ @brief access specified array element
+
+ Returns a reference to the element at specified location @a idx.
+
+ @note If @a idx is beyond the range of the array (i.e., `idx >= size()`),
+ then the array is silently filled up with `null` values to make `idx` a
+ valid reference to the last stored element.
+
+ @param[in] idx index of the element to access
+
+ @return reference to the element at index @a idx
+
+ @throw type_error.305 if the JSON value is not an array or null; in those
+ cases, using the [] operator with an index makes no sense.
+
+ @complexity Constant if @a idx is in the range of the array. Otherwise
+ linear in `idx - size()`.
+
+ @liveexample{The example below shows how array elements can be read and
+ written using `[]` operator. Note the addition of `null`
+ values.,operatorarray__size_type}
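+
+ A minimal sketch (assuming the default `json` specialization):
+ @code {.cpp}
+ json j;      // null
+ j[3] = 7;    // j becomes [null, null, null, 7]
+ @endcode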
+
+ @since version 1.0.0
+ */
+ reference operator[](size_type idx)
+ {
+ // implicitly convert null value to an empty array
+ if (is_null())
+ {
+ m_type = value_t::array;
+ m_value.array = create<array_t>();
+ assert_invariant();
+ }
+
+ // operator[] only works for arrays
+ if (JSON_HEDLEY_LIKELY(is_array()))
+ {
+ // fill up array with null values if given idx is outside range
+ if (idx >= m_value.array->size())
+ {
+#if JSON_DIAGNOSTICS
+ // remember array size before resizing
+ const auto previous_size = m_value.array->size();
+#endif
+ m_value.array->resize(idx + 1);
+
+#if JSON_DIAGNOSTICS
+ // set parent for values added above
+ set_parents(begin() + static_cast<typename iterator::difference_type>(previous_size), static_cast<typename iterator::difference_type>(idx + 1 - previous_size));
+#endif
+ }
+
+ return m_value.array->operator[](idx);
+ }
+
+ JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name()), *this));
+ }
+
+ /*!
+ @brief access specified array element
+
+ Returns a const reference to the element at specified location @a idx.
+
+ @param[in] idx index of the element to access
+
+ @return const reference to the element at index @a idx
+
+ @throw type_error.305 if the JSON value is not an array; in that case,
+ using the [] operator with an index makes no sense.
+
+ @complexity Constant.
+
+ @liveexample{The example below shows how array elements can be read using
+ the `[]` operator.,operatorarray__size_type_const}
+
+ @since version 1.0.0
+ */
+ const_reference operator[](size_type idx) const
+ {
+ // const operator[] only works for arrays
+ if (JSON_HEDLEY_LIKELY(is_array()))
+ {
+ return m_value.array->operator[](idx);
+ }
+
+ JSON_THROW(type_error::create(305, "cannot use operator[] with a numeric argument with " + std::string(type_name()), *this));
+ }
+
+ /*!
+ @brief access specified object element
+
+ Returns a reference to the element with the specified key @a key.
+
+ @note If @a key is not found in the object, then it is silently added to
+ the object and filled with a `null` value to make `key` a valid reference.
+ In case the value was `null` before, it is converted to an object.
+
+ @param[in] key key of the element to access
+
+ @return reference to the element at key @a key
+
+ @throw type_error.305 if the JSON value is not an object or null; in those
+ cases, using the [] operator with a key makes no sense.
+
+ @complexity Logarithmic in the size of the container.
+
+ @liveexample{The example below shows how object elements can be read and
+ written using the `[]` operator.,operatorarray__key_type}
+
+ @sa see @ref at(const typename object_t::key_type&) for access by reference
+ with range checking
+ @sa see @ref value() for access by value with a default value
+
+ @since version 1.0.0
+ */
+ reference operator[](const typename object_t::key_type& key)
+ {
+ // implicitly convert null value to an empty object
+ if (is_null())
+ {
+ m_type = value_t::object;
+ m_value.object = create<object_t>();
+ assert_invariant();
+ }
+
+ // operator[] only works for objects
+ if (JSON_HEDLEY_LIKELY(is_object()))
+ {
+ return set_parent(m_value.object->operator[](key));
+ }
+
+ JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()), *this));
+ }
+
+ /*!
+ @brief read-only access specified object element
+
+ Returns a const reference to the element with the specified key @a key. No
+ bounds checking is performed.
+
+ @warning If the element with key @a key does not exist, the behavior is
+ undefined.
+
+ @param[in] key key of the element to access
+
+ @return const reference to the element at key @a key
+
+ @pre The element with key @a key must exist. **This precondition is
+ enforced with an assertion.**
+
+ @throw type_error.305 if the JSON value is not an object; in that case,
+ using the [] operator with a key makes no sense.
+
+ @complexity Logarithmic in the size of the container.
+
+ @liveexample{The example below shows how object elements can be read using
+ the `[]` operator.,operatorarray__key_type_const}
+
+ @sa see @ref at(const typename object_t::key_type&) for access by reference
+ with range checking
+ @sa see @ref value() for access by value with a default value
+
+ @since version 1.0.0
+ */
+ const_reference operator[](const typename object_t::key_type& key) const
+ {
+ // const operator[] only works for objects
+ if (JSON_HEDLEY_LIKELY(is_object()))
+ {
+ JSON_ASSERT(m_value.object->find(key) != m_value.object->end());
+ return m_value.object->find(key)->second;
+ }
+
+ JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()), *this));
+ }
+
+ /*!
+ @brief access specified object element
+
+ Returns a reference to the element with the specified key @a key.
+
+ @note If @a key is not found in the object, then it is silently added to
+ the object and filled with a `null` value to make `key` a valid reference.
+ In case the value was `null` before, it is converted to an object.
+
+ @param[in] key key of the element to access
+
+ @return reference to the element at key @a key
+
+ @throw type_error.305 if the JSON value is not an object or null; in those
+ cases, using the [] operator with a key makes no sense.
+
+ @complexity Logarithmic in the size of the container.
+
+ @liveexample{The example below shows how object elements can be read and
+ written using the `[]` operator.,operatorarray__key_type}
+
+ @sa see @ref at(const typename object_t::key_type&) for access by reference
+ with range checking
+ @sa see @ref value() for access by value with a default value
+
+ @since version 1.1.0
+ */
+ template<typename T>
+ JSON_HEDLEY_NON_NULL(2)
+ reference operator[](T* key)
+ {
+ // implicitly convert null to object
+ if (is_null())
+ {
+ m_type = value_t::object;
+ m_value = value_t::object;
+ assert_invariant();
+ }
+
+ // operator[] only works for objects
+ if (JSON_HEDLEY_LIKELY(is_object()))
+ {
+ return set_parent(m_value.object->operator[](key));
+ }
+
+ JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()), *this));
+ }
+
+ /*!
+ @brief read-only access specified object element
+
+ Returns a const reference to the element with the specified key @a key. No
+ bounds checking is performed.
+
+ @warning If the element with key @a key does not exist, the behavior is
+ undefined.
+
+ @param[in] key key of the element to access
+
+ @return const reference to the element at key @a key
+
+ @pre The element with key @a key must exist. **This precondition is
+ enforced with an assertion.**
+
+ @throw type_error.305 if the JSON value is not an object; in that case,
+ using the [] operator with a key makes no sense.
+
+ @complexity Logarithmic in the size of the container.
+
+ @liveexample{The example below shows how object elements can be read using
+ the `[]` operator.,operatorarray__key_type_const}
+
+ @sa see @ref at(const typename object_t::key_type&) for access by reference
+ with range checking
+ @sa see @ref value() for access by value with a default value
+
+ @since version 1.1.0
+ */
+ template<typename T>
+ JSON_HEDLEY_NON_NULL(2)
+ const_reference operator[](T* key) const
+ {
+ // operator[] only works for objects
+ if (JSON_HEDLEY_LIKELY(is_object()))
+ {
+ JSON_ASSERT(m_value.object->find(key) != m_value.object->end());
+ return m_value.object->find(key)->second;
+ }
+
+ JSON_THROW(type_error::create(305, "cannot use operator[] with a string argument with " + std::string(type_name()), *this));
+ }
+
+ /*!
+ @brief access specified object element with default value
+
+ Returns either a copy of an object's element at the specified key @a key
+ or a given default value if no element with key @a key exists.
+
+ The function is basically equivalent to executing
+ @code {.cpp}
+ try {
+ return at(key);
+ } catch(out_of_range) {
+ return default_value;
+ }
+ @endcode
+
+ @note Unlike @ref at(const typename object_t::key_type&), this function
+ does not throw if the given key @a key was not found.
+
+ @note Unlike @ref operator[](const typename object_t::key_type& key), this
+ function does not implicitly add an element to the position defined by @a
+ key. This function is furthermore also applicable to const objects.
+
+ @param[in] key key of the element to access
+ @param[in] default_value the value to return if @a key is not found
+
+ @tparam ValueType type compatible to JSON values, for instance `int` for
+ JSON integer numbers, `bool` for JSON booleans, or `std::vector` types for
+ JSON arrays. Note the type of the expected value at @a key and the default
+ value @a default_value must be compatible.
+
+ @return copy of the element at key @a key or @a default_value if @a key
+ is not found
+
+ @throw type_error.302 if @a default_value does not match the type of the
+ value at @a key
+ @throw type_error.306 if the JSON value is not an object; in that case,
+ using `value()` with a key makes no sense.
+
+ @complexity Logarithmic in the size of the container.
+
+ @liveexample{The example below shows how object elements can be queried
+ with a default value.,basic_json__value}
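+
+ A minimal usage sketch (assuming the default `json` specialization):
+ @code {.cpp}
+ json j = {{"a", 1}};
+ int a = j.value("a", 0);    // 1: key found
+ int b = j.value("b", 42);   // 42: default returned
+ @endcode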
+
+ @sa see @ref at(const typename object_t::key_type&) for access by reference
+ with range checking
+ @sa see @ref operator[](const typename object_t::key_type&) for unchecked
+ access by reference
+
+ @since version 1.0.0
+ */
+ // using std::is_convertible in a std::enable_if will fail when using explicit conversions
+ template < class ValueType, typename std::enable_if <
+ detail::is_getable<basic_json_t, ValueType>::value
+ && !std::is_same<value_t, ValueType>::value, int >::type = 0 >
+ ValueType value(const typename object_t::key_type& key, const ValueType& default_value) const
+ {
+ // value() only works for objects
+ if (JSON_HEDLEY_LIKELY(is_object()))
+ {
+ // if key is found, return its value; otherwise return the given default value
+ const auto it = find(key);
+ if (it != end())
+ {
+ return it->template get<ValueType>();
+ }
+
+ return default_value;
+ }
+
+ JSON_THROW(type_error::create(306, "cannot use value() with " + std::string(type_name()), *this));
+ }
+
+ /*!
+ @brief overload for a default value of type const char*
+ @copydoc basic_json::value(const typename object_t::key_type&, const ValueType&) const
+ */
+ string_t value(const typename object_t::key_type& key, const char* default_value) const
+ {
+ return value(key, string_t(default_value));
+ }
+
+ /*!
+ @brief access specified object element via JSON Pointer with default value
+
+ Returns either a copy of the value at the specified JSON pointer @a ptr
+ or a given default value if no value exists at @a ptr.
+
+ The function is basically equivalent to executing
+ @code {.cpp}
+ try {
+ return at(ptr);
+ } catch(out_of_range) {
+ return default_value;
+ }
+ @endcode
+
+ @note Unlike @ref at(const json_pointer&), this function does not throw
+ if no value exists at the given pointer @a ptr.
+
+ @param[in] ptr a JSON pointer to the element to access
+ @param[in] default_value the value to return if @a ptr found no value
+
+ @tparam ValueType type compatible to JSON values, for instance `int` for
+ JSON integer numbers, `bool` for JSON booleans, or `std::vector` types for
+ JSON arrays. Note the type of the expected value at @a ptr and the default
+ value @a default_value must be compatible.
+
+ @return copy of the element at @a ptr or @a default_value if no value
+ exists at @a ptr
+
+ @throw type_error.302 if @a default_value does not match the type of the
+ value at @a ptr
+ @throw type_error.306 if the JSON value is not an object; in that case,
+ using `value()` with a key makes no sense.
+
+ @complexity Logarithmic in the size of the container.
+
+ @liveexample{The example below shows how object elements can be queried
+ with a default value.,basic_json__value_ptr}
+
+ @sa see @ref operator[](const json_pointer&) for unchecked access by reference
+
+ @since version 2.0.2
+ */
+ template<class ValueType, typename std::enable_if<
+ detail::is_getable<basic_json_t, ValueType>::value, int>::type = 0>
+ ValueType value(const json_pointer& ptr, const ValueType& default_value) const
+ {
+ // value() only works for objects
+ if (JSON_HEDLEY_LIKELY(is_object()))
+ {
+ // if the pointer resolves to a value, return it; otherwise return the default value
+ JSON_TRY
+ {
+ return ptr.get_checked(this).template get<ValueType>();
+ }
+ JSON_INTERNAL_CATCH (out_of_range&)
+ {
+ return default_value;
+ }
+ }
+
+ JSON_THROW(type_error::create(306, "cannot use value() with " + std::string(type_name()), *this));
+ }
+
+ /*!
+ @brief overload for a default value of type const char*
+ @copydoc basic_json::value(const json_pointer&, ValueType) const
+ */
+ JSON_HEDLEY_NON_NULL(3)
+ string_t value(const json_pointer& ptr, const char* default_value) const
+ {
+ return value(ptr, string_t(default_value));
+ }
+
+ /*!
+ @brief access the first element
+
+ Returns a reference to the first element in the container. For a JSON
+ container `c`, the expression `c.front()` is equivalent to `*c.begin()`.
+
+ @return In case of a structured type (array or object), a reference to the
+ first element is returned. In case of number, string, boolean, or binary
+ values, a reference to the value is returned.
+
+ @complexity Constant.
+
+ @pre The JSON value must not be `null` (would throw `std::out_of_range`)
+ or an empty array or object (undefined behavior, **guarded by
+ assertions**).
+ @post The JSON value remains unchanged.
+
+ @throw invalid_iterator.214 when called on `null` value
+
+ @liveexample{The following code shows an example for `front()`.,front}
+
+ @sa see @ref back() -- access the last element
+
+ @since version 1.0.0
+ */
+ reference front()
+ {
+ return *begin();
+ }
+
+ /*!
+ @copydoc basic_json::front()
+ */
+ const_reference front() const
+ {
+ return *cbegin();
+ }
+
+ /*!
+ @brief access the last element
+
+ Returns a reference to the last element in the container. For a JSON
+ container `c`, the expression `c.back()` is equivalent to
+ @code {.cpp}
+ auto tmp = c.end();
+ --tmp;
+ return *tmp;
+ @endcode
+
+ @return In case of a structured type (array or object), a reference to the
+ last element is returned. In case of number, string, boolean, or binary
+ values, a reference to the value is returned.
+
+ @complexity Constant.
+
+ @pre The JSON value must not be `null` (would throw `std::out_of_range`)
+ or an empty array or object (undefined behavior, **guarded by
+ assertions**).
+ @post The JSON value remains unchanged.
+
+ @throw invalid_iterator.214 when called on a `null` value. See example
+ below.
+
+ @liveexample{The following code shows an example for `back()`.,back}
+
+ @sa see @ref front() -- access the first element
+
+ @since version 1.0.0
+ */
+ reference back()
+ {
+ auto tmp = end();
+ --tmp;
+ return *tmp;
+ }
+
+ /*!
+ @copydoc basic_json::back()
+ */
+ const_reference back() const
+ {
+ auto tmp = cend();
+ --tmp;
+ return *tmp;
+ }
+
+ /*!
+ @brief remove element given an iterator
+
+ Removes the element specified by iterator @a pos. The iterator @a pos must
+ be valid and dereferenceable. Thus the `end()` iterator (which is valid,
+ but is not dereferenceable) cannot be used as a value for @a pos.
+
+ If called on a primitive type other than `null`, the resulting JSON value
+ will be `null`.
+
+ @param[in] pos iterator to the element to remove
+ @return Iterator following the last removed element. If the iterator @a
+ pos refers to the last element, the `end()` iterator is returned.
+
+ @tparam IteratorType an @ref iterator or @ref const_iterator
+
+ @post Invalidates iterators and references at or after the point of the
+ erase, including the `end()` iterator.
+
+ @throw type_error.307 if called on a `null` value; example: `"cannot use
+ erase() with null"`
+ @throw invalid_iterator.202 if called on an iterator which does not belong
+ to the current JSON value; example: `"iterator does not fit current
+ value"`
+ @throw invalid_iterator.205 if called on a primitive type with invalid
+ iterator (i.e., any iterator which is not `begin()`); example: `"iterator
+ out of range"`
+
+ @complexity The complexity depends on the type:
+ - objects: amortized constant
+ - arrays: linear in distance between @a pos and the end of the container
+ - strings and binary: linear in the length of the member
+ - other types: constant
+
+ @liveexample{The example shows the result of `erase()` for different JSON
+ types.,erase__IteratorType}
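+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j = {"a", "b", "c"};
+    auto it = j.erase(j.begin() + 1); // j == ["a", "c"], it points to "c"
+    @endcode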
+
+ @sa see @ref erase(IteratorType, IteratorType) -- removes the elements in
+ the given range
+ @sa see @ref erase(const typename object_t::key_type&) -- removes the element
+ from an object at the given key
+ @sa see @ref erase(const size_type) -- removes the element from an array at
+ the given index
+
+ @since version 1.0.0
+ */
+ template < class IteratorType, typename std::enable_if <
+ std::is_same<IteratorType, typename basic_json_t::iterator>::value ||
+ std::is_same<IteratorType, typename basic_json_t::const_iterator>::value, int >::type
+ = 0 >
+ IteratorType erase(IteratorType pos)
+ {
+ // make sure iterator fits the current value
+ if (JSON_HEDLEY_UNLIKELY(this != pos.m_object))
+ {
+ JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this));
+ }
+
+ IteratorType result = end();
+
+ switch (m_type)
+ {
+ case value_t::boolean:
+ case value_t::number_float:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::string:
+ case value_t::binary:
+ {
+ if (JSON_HEDLEY_UNLIKELY(!pos.m_it.primitive_iterator.is_begin()))
+ {
+ JSON_THROW(invalid_iterator::create(205, "iterator out of range", *this));
+ }
+
+ if (is_string())
+ {
+ AllocatorType<string_t> alloc;
+ std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.string);
+ std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.string, 1);
+ m_value.string = nullptr;
+ }
+ else if (is_binary())
+ {
+ AllocatorType<binary_t> alloc;
+ std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.binary);
+ std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.binary, 1);
+ m_value.binary = nullptr;
+ }
+
+ m_type = value_t::null;
+ assert_invariant();
+ break;
+ }
+
+ case value_t::object:
+ {
+ result.m_it.object_iterator = m_value.object->erase(pos.m_it.object_iterator);
+ break;
+ }
+
+ case value_t::array:
+ {
+ result.m_it.array_iterator = m_value.array->erase(pos.m_it.array_iterator);
+ break;
+ }
+
+ case value_t::null:
+ case value_t::discarded:
+ default:
+ JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()), *this));
+ }
+
+ return result;
+ }
+
+ /*!
+ @brief remove elements given an iterator range
+
+    Removes the elements specified by the range `[first, last)`. The iterator
+ @a first does not need to be dereferenceable if `first == last`: erasing
+ an empty range is a no-op.
+
+ If called on a primitive type other than `null`, the resulting JSON value
+ will be `null`.
+
+ @param[in] first iterator to the beginning of the range to remove
+ @param[in] last iterator past the end of the range to remove
+ @return Iterator following the last removed element. If the iterator @a
+    last refers to the last element, the `end()` iterator is returned.
+
+ @tparam IteratorType an @ref iterator or @ref const_iterator
+
+ @post Invalidates iterators and references at or after the point of the
+ erase, including the `end()` iterator.
+
+ @throw type_error.307 if called on a `null` value; example: `"cannot use
+ erase() with null"`
+    @throw invalid_iterator.203 if called on iterators which do not belong
+ to the current JSON value; example: `"iterators do not fit current value"`
+ @throw invalid_iterator.204 if called on a primitive type with invalid
+    iterators (i.e., if `first != begin()` or `last != end()`); example:
+ `"iterators out of range"`
+
+ @complexity The complexity depends on the type:
+ - objects: `log(size()) + std::distance(first, last)`
+ - arrays: linear in the distance between @a first and @a last, plus linear
+ in the distance between @a last and end of the container
+ - strings and binary: linear in the length of the member
+ - other types: constant
+
+ @liveexample{The example shows the result of `erase()` for different JSON
+ types.,erase__IteratorType_IteratorType}
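+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j = {1, 2, 3, 4};
+    j.erase(j.begin() + 1, j.begin() + 3); // j == [1, 4]
+    @endcode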
+
+ @sa see @ref erase(IteratorType) -- removes the element at a given position
+ @sa see @ref erase(const typename object_t::key_type&) -- removes the element
+ from an object at the given key
+ @sa see @ref erase(const size_type) -- removes the element from an array at
+ the given index
+
+ @since version 1.0.0
+ */
+ template < class IteratorType, typename std::enable_if <
+ std::is_same<IteratorType, typename basic_json_t::iterator>::value ||
+ std::is_same<IteratorType, typename basic_json_t::const_iterator>::value, int >::type
+ = 0 >
+ IteratorType erase(IteratorType first, IteratorType last)
+ {
+ // make sure iterator fits the current value
+ if (JSON_HEDLEY_UNLIKELY(this != first.m_object || this != last.m_object))
+ {
+ JSON_THROW(invalid_iterator::create(203, "iterators do not fit current value", *this));
+ }
+
+ IteratorType result = end();
+
+ switch (m_type)
+ {
+ case value_t::boolean:
+ case value_t::number_float:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::string:
+ case value_t::binary:
+ {
+                if (JSON_HEDLEY_UNLIKELY(!first.m_it.primitive_iterator.is_begin()
+ || !last.m_it.primitive_iterator.is_end()))
+ {
+ JSON_THROW(invalid_iterator::create(204, "iterators out of range", *this));
+ }
+
+ if (is_string())
+ {
+ AllocatorType<string_t> alloc;
+ std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.string);
+ std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.string, 1);
+ m_value.string = nullptr;
+ }
+ else if (is_binary())
+ {
+ AllocatorType<binary_t> alloc;
+ std::allocator_traits<decltype(alloc)>::destroy(alloc, m_value.binary);
+ std::allocator_traits<decltype(alloc)>::deallocate(alloc, m_value.binary, 1);
+ m_value.binary = nullptr;
+ }
+
+ m_type = value_t::null;
+ assert_invariant();
+ break;
+ }
+
+ case value_t::object:
+ {
+ result.m_it.object_iterator = m_value.object->erase(first.m_it.object_iterator,
+ last.m_it.object_iterator);
+ break;
+ }
+
+ case value_t::array:
+ {
+ result.m_it.array_iterator = m_value.array->erase(first.m_it.array_iterator,
+ last.m_it.array_iterator);
+ break;
+ }
+
+ case value_t::null:
+ case value_t::discarded:
+ default:
+ JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()), *this));
+ }
+
+ return result;
+ }
+
+ /*!
+ @brief remove element from a JSON object given a key
+
+ Removes elements from a JSON object with the key value @a key.
+
+ @param[in] key value of the elements to remove
+
+ @return Number of elements removed. If @a ObjectType is the default
+ `std::map` type, the return value will always be `0` (@a key was not
+ found) or `1` (@a key was found).
+
+ @post References and iterators to the erased elements are invalidated.
+ Other references and iterators are not affected.
+
+ @throw type_error.307 when called on a type other than JSON object;
+ example: `"cannot use erase() with null"`
+
+ @complexity `log(size()) + count(key)`
+
+ @liveexample{The example shows the effect of `erase()`.,erase__key_type}
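+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j = {{"one", 1}, {"two", 2}};
+    json::size_type n = j.erase("two"); // n == 1, j == {"one": 1}
+    @endcode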
+
+ @sa see @ref erase(IteratorType) -- removes the element at a given position
+ @sa see @ref erase(IteratorType, IteratorType) -- removes the elements in
+ the given range
+ @sa see @ref erase(const size_type) -- removes the element from an array at
+ the given index
+
+ @since version 1.0.0
+ */
+ size_type erase(const typename object_t::key_type& key)
+ {
+ // this erase only works for objects
+ if (JSON_HEDLEY_LIKELY(is_object()))
+ {
+ return m_value.object->erase(key);
+ }
+
+ JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()), *this));
+ }
+
+ /*!
+ @brief remove element from a JSON array given an index
+
+ Removes element from a JSON array at the index @a idx.
+
+ @param[in] idx index of the element to remove
+
+    @throw type_error.307 when called on a type other than JSON array;
+ example: `"cannot use erase() with null"`
+ @throw out_of_range.401 when `idx >= size()`; example: `"array index 17
+ is out of range"`
+
+ @complexity Linear in distance between @a idx and the end of the container.
+
+ @liveexample{The example shows the effect of `erase()`.,erase__size_type}
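+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j = {"a", "b", "c"};
+    j.erase(1); // j == ["a", "c"]
+    @endcode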
+
+ @sa see @ref erase(IteratorType) -- removes the element at a given position
+ @sa see @ref erase(IteratorType, IteratorType) -- removes the elements in
+ the given range
+ @sa see @ref erase(const typename object_t::key_type&) -- removes the element
+ from an object at the given key
+
+ @since version 1.0.0
+ */
+ void erase(const size_type idx)
+ {
+ // this erase only works for arrays
+ if (JSON_HEDLEY_LIKELY(is_array()))
+ {
+ if (JSON_HEDLEY_UNLIKELY(idx >= size()))
+ {
+ JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range", *this));
+ }
+
+ m_value.array->erase(m_value.array->begin() + static_cast<difference_type>(idx));
+ }
+ else
+ {
+ JSON_THROW(type_error::create(307, "cannot use erase() with " + std::string(type_name()), *this));
+ }
+ }
+
+ /// @}
+
+
+ ////////////
+ // lookup //
+ ////////////
+
+ /// @name lookup
+ /// @{
+
+ /*!
+ @brief find an element in a JSON object
+
+ Finds an element in a JSON object with key equivalent to @a key. If the
+ element is not found or the JSON value is not an object, end() is
+ returned.
+
+ @note This method always returns @ref end() when executed on a JSON type
+ that is not an object.
+
+ @param[in] key key value of the element to search for.
+
+ @return Iterator to an element with key equivalent to @a key. If no such
+ element is found or the JSON value is not an object, past-the-end (see
+ @ref end()) iterator is returned.
+
+ @complexity Logarithmic in the size of the JSON object.
+
+ @liveexample{The example shows how `find()` is used.,find__key_type}
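+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j = {{"one", 1}};
+    auto it = j.find("one");
+    if (it != j.end())
+    {
+        std::cout << it.value() << '\n'; // 1
+    }
+    @endcode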
+
+ @sa see @ref contains(KeyT&&) const -- checks whether a key exists
+
+ @since version 1.0.0
+ */
+ template<typename KeyT>
+ iterator find(KeyT&& key)
+ {
+ auto result = end();
+
+ if (is_object())
+ {
+ result.m_it.object_iterator = m_value.object->find(std::forward<KeyT>(key));
+ }
+
+ return result;
+ }
+
+ /*!
+ @brief find an element in a JSON object
+ @copydoc find(KeyT&&)
+ */
+ template<typename KeyT>
+ const_iterator find(KeyT&& key) const
+ {
+ auto result = cend();
+
+ if (is_object())
+ {
+ result.m_it.object_iterator = m_value.object->find(std::forward<KeyT>(key));
+ }
+
+ return result;
+ }
+
+ /*!
+ @brief returns the number of occurrences of a key in a JSON object
+
+ Returns the number of elements with key @a key. If ObjectType is the
+ default `std::map` type, the return value will always be `0` (@a key was
+ not found) or `1` (@a key was found).
+
+ @note This method always returns `0` when executed on a JSON type that is
+ not an object.
+
+ @param[in] key key value of the element to count
+
+ @return Number of elements with key @a key. If the JSON value is not an
+ object, the return value will be `0`.
+
+ @complexity Logarithmic in the size of the JSON object.
+
+ @liveexample{The example shows how `count()` is used.,count}
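+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j = {{"one", 1}};
+    j.count("one"); // 1
+    j.count("two"); // 0
+    @endcode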
+
+ @since version 1.0.0
+ */
+ template<typename KeyT>
+ size_type count(KeyT&& key) const
+ {
+        // return 0 for all non-object types
+ return is_object() ? m_value.object->count(std::forward<KeyT>(key)) : 0;
+ }
+
+ /*!
+ @brief check the existence of an element in a JSON object
+
+ Check whether an element exists in a JSON object with key equivalent to
+ @a key. If the element is not found or the JSON value is not an object,
+ false is returned.
+
+ @note This method always returns false when executed on a JSON type
+ that is not an object.
+
+    @param[in] key key value whose existence is checked.
+
+    @return true if an element with the specified @a key exists. If no such
+    element is found or the JSON value is not an object, false is
+    returned.
+
+ @complexity Logarithmic in the size of the JSON object.
+
+ @liveexample{The following code shows an example for `contains()`.,contains}
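+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j = {{"key", "value"}};
+    bool a = j.contains("key");   // true
+    bool b = j.contains("other"); // false
+    @endcode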
+
+ @sa see @ref find(KeyT&&) -- returns an iterator to an object element
+ @sa see @ref contains(const json_pointer&) const -- checks the existence for a JSON pointer
+
+ @since version 3.6.0
+ */
+ template < typename KeyT, typename std::enable_if <
+ !std::is_same<typename std::decay<KeyT>::type, json_pointer>::value, int >::type = 0 >
+ bool contains(KeyT && key) const
+ {
+ return is_object() && m_value.object->find(std::forward<KeyT>(key)) != m_value.object->end();
+ }
+
+ /*!
+ @brief check the existence of an element in a JSON object given a JSON pointer
+
+ Check whether the given JSON pointer @a ptr can be resolved in the current
+ JSON value.
+
+ @note This method can be executed on any JSON value type.
+
+    @param[in] ptr JSON pointer whose existence is checked.
+
+ @return true if the JSON pointer can be resolved to a stored value, false
+ otherwise.
+
+ @post If `j.contains(ptr)` returns true, it is safe to call `j[ptr]`.
+
+ @throw parse_error.106 if an array index begins with '0'
+ @throw parse_error.109 if an array index was not a number
+
+ @complexity Logarithmic in the size of the JSON object.
+
+ @liveexample{The following code shows an example for `contains()`.,contains_json_pointer}
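+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j = {{"a", {{"b", 42}}}};
+    j.contains(json::json_pointer("/a/b")); // true
+    j.contains(json::json_pointer("/a/x")); // false
+    @endcode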
+
+ @sa see @ref contains(KeyT &&) const -- checks the existence of a key
+
+ @since version 3.7.0
+ */
+ bool contains(const json_pointer& ptr) const
+ {
+ return ptr.contains(this);
+ }
+
+ /// @}
+
+
+ ///////////////
+ // iterators //
+ ///////////////
+
+ /// @name iterators
+ /// @{
+
+ /*!
+ @brief returns an iterator to the first element
+
+ Returns an iterator to the first element.
+
+ @image html range-begin-end.svg "Illustration from cppreference.com"
+
+ @return iterator to the first element
+
+ @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfy the
+ [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+ requirements:
+ - The complexity is constant.
+
+ @liveexample{The following code shows an example for `begin()`.,begin}
+
+ @sa see @ref cbegin() -- returns a const iterator to the beginning
+ @sa see @ref end() -- returns an iterator to the end
+ @sa see @ref cend() -- returns a const iterator to the end
+
+ @since version 1.0.0
+ */
+ iterator begin() noexcept
+ {
+ iterator result(this);
+ result.set_begin();
+ return result;
+ }
+
+ /*!
+ @copydoc basic_json::cbegin()
+ */
+ const_iterator begin() const noexcept
+ {
+ return cbegin();
+ }
+
+ /*!
+ @brief returns a const iterator to the first element
+
+ Returns a const iterator to the first element.
+
+ @image html range-begin-end.svg "Illustration from cppreference.com"
+
+ @return const iterator to the first element
+
+ @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfy the
+ [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+ requirements:
+ - The complexity is constant.
+ - Has the semantics of `const_cast<const basic_json&>(*this).begin()`.
+
+ @liveexample{The following code shows an example for `cbegin()`.,cbegin}
+
+ @sa see @ref begin() -- returns an iterator to the beginning
+ @sa see @ref end() -- returns an iterator to the end
+ @sa see @ref cend() -- returns a const iterator to the end
+
+ @since version 1.0.0
+ */
+ const_iterator cbegin() const noexcept
+ {
+ const_iterator result(this);
+ result.set_begin();
+ return result;
+ }
+
+ /*!
+ @brief returns an iterator to one past the last element
+
+ Returns an iterator to one past the last element.
+
+ @image html range-begin-end.svg "Illustration from cppreference.com"
+
+ @return iterator one past the last element
+
+ @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfy the
+ [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+ requirements:
+ - The complexity is constant.
+
+ @liveexample{The following code shows an example for `end()`.,end}
+
+ @sa see @ref cend() -- returns a const iterator to the end
+ @sa see @ref begin() -- returns an iterator to the beginning
+ @sa see @ref cbegin() -- returns a const iterator to the beginning
+
+ @since version 1.0.0
+ */
+ iterator end() noexcept
+ {
+ iterator result(this);
+ result.set_end();
+ return result;
+ }
+
+ /*!
+ @copydoc basic_json::cend()
+ */
+ const_iterator end() const noexcept
+ {
+ return cend();
+ }
+
+ /*!
+ @brief returns a const iterator to one past the last element
+
+ Returns a const iterator to one past the last element.
+
+ @image html range-begin-end.svg "Illustration from cppreference.com"
+
+ @return const iterator one past the last element
+
+ @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfy the
+ [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+ requirements:
+ - The complexity is constant.
+ - Has the semantics of `const_cast<const basic_json&>(*this).end()`.
+
+ @liveexample{The following code shows an example for `cend()`.,cend}
+
+ @sa see @ref end() -- returns an iterator to the end
+ @sa see @ref begin() -- returns an iterator to the beginning
+ @sa see @ref cbegin() -- returns a const iterator to the beginning
+
+ @since version 1.0.0
+ */
+ const_iterator cend() const noexcept
+ {
+ const_iterator result(this);
+ result.set_end();
+ return result;
+ }
+
+ /*!
+ @brief returns an iterator to the reverse-beginning
+
+ Returns an iterator to the reverse-beginning; that is, the last element.
+
+ @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+ @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfy the
+ [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
+ requirements:
+ - The complexity is constant.
+ - Has the semantics of `reverse_iterator(end())`.
+
+ @liveexample{The following code shows an example for `rbegin()`.,rbegin}
+
+ @sa see @ref crbegin() -- returns a const reverse iterator to the beginning
+ @sa see @ref rend() -- returns a reverse iterator to the end
+ @sa see @ref crend() -- returns a const reverse iterator to the end
+
+ @since version 1.0.0
+ */
+ reverse_iterator rbegin() noexcept
+ {
+ return reverse_iterator(end());
+ }
+
+ /*!
+ @copydoc basic_json::crbegin()
+ */
+ const_reverse_iterator rbegin() const noexcept
+ {
+ return crbegin();
+ }
+
+ /*!
+ @brief returns an iterator to the reverse-end
+
+ Returns an iterator to the reverse-end; that is, one before the first
+ element.
+
+ @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+ @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfy the
+ [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
+ requirements:
+ - The complexity is constant.
+ - Has the semantics of `reverse_iterator(begin())`.
+
+ @liveexample{The following code shows an example for `rend()`.,rend}
+
+ @sa see @ref crend() -- returns a const reverse iterator to the end
+ @sa see @ref rbegin() -- returns a reverse iterator to the beginning
+ @sa see @ref crbegin() -- returns a const reverse iterator to the beginning
+
+ @since version 1.0.0
+ */
+ reverse_iterator rend() noexcept
+ {
+ return reverse_iterator(begin());
+ }
+
+ /*!
+ @copydoc basic_json::crend()
+ */
+ const_reverse_iterator rend() const noexcept
+ {
+ return crend();
+ }
+
+ /*!
+ @brief returns a const reverse iterator to the last element
+
+ Returns a const iterator to the reverse-beginning; that is, the last
+ element.
+
+ @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+ @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfy the
+ [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
+ requirements:
+ - The complexity is constant.
+ - Has the semantics of `const_cast<const basic_json&>(*this).rbegin()`.
+
+ @liveexample{The following code shows an example for `crbegin()`.,crbegin}
+
+ @sa see @ref rbegin() -- returns a reverse iterator to the beginning
+ @sa see @ref rend() -- returns a reverse iterator to the end
+ @sa see @ref crend() -- returns a const reverse iterator to the end
+
+ @since version 1.0.0
+ */
+ const_reverse_iterator crbegin() const noexcept
+ {
+ return const_reverse_iterator(cend());
+ }
+
+ /*!
+ @brief returns a const reverse iterator to one before the first
+
+ Returns a const reverse iterator to the reverse-end; that is, one before
+ the first element.
+
+ @image html range-rbegin-rend.svg "Illustration from cppreference.com"
+
+ @complexity Constant.
+
+    @requirement This function helps `basic_json` satisfy the
+ [ReversibleContainer](https://en.cppreference.com/w/cpp/named_req/ReversibleContainer)
+ requirements:
+ - The complexity is constant.
+ - Has the semantics of `const_cast<const basic_json&>(*this).rend()`.
+
+ @liveexample{The following code shows an example for `crend()`.,crend}
+
+ @sa see @ref rend() -- returns a reverse iterator to the end
+ @sa see @ref rbegin() -- returns a reverse iterator to the beginning
+ @sa see @ref crbegin() -- returns a const reverse iterator to the beginning
+
+ @since version 1.0.0
+ */
+ const_reverse_iterator crend() const noexcept
+ {
+ return const_reverse_iterator(cbegin());
+ }
+
+ public:
+ /*!
+ @brief wrapper to access iterator member functions in range-based for
+
+    This function allows access to @ref iterator::key() and @ref
+ iterator::value() during range-based for loops. In these loops, a
+ reference to the JSON values is returned, so there is no access to the
+ underlying iterator.
+
+ For loop without iterator_wrapper:
+
+ @code{cpp}
+ for (auto it = j_object.begin(); it != j_object.end(); ++it)
+ {
+ std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
+ }
+ @endcode
+
+ Range-based for loop without iterator proxy:
+
+ @code{cpp}
+ for (auto it : j_object)
+ {
+ // "it" is of type json::reference and has no key() member
+ std::cout << "value: " << it << '\n';
+ }
+ @endcode
+
+ Range-based for loop with iterator proxy:
+
+ @code{cpp}
+ for (auto it : json::iterator_wrapper(j_object))
+ {
+ std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
+ }
+ @endcode
+
+ @note When iterating over an array, `key()` will return the index of the
+    element as a string (see example).
+
+ @param[in] ref reference to a JSON value
+ @return iteration proxy object wrapping @a ref with an interface to use in
+ range-based for loops
+
+ @liveexample{The following code shows how the wrapper is used,iterator_wrapper}
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes in the JSON value.
+
+ @complexity Constant.
+
+ @note The name of this function is not yet final and may change in the
+ future.
+
+    @deprecated This function is deprecated and will be removed in version
+                4.0.0 of the library. Please use @ref items() instead;
+                that is, replace `json::iterator_wrapper(j)` with `j.items()`.
+ */
+ JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items())
+ static iteration_proxy<iterator> iterator_wrapper(reference ref) noexcept
+ {
+ return ref.items();
+ }
+
+ /*!
+ @copydoc iterator_wrapper(reference)
+ */
+ JSON_HEDLEY_DEPRECATED_FOR(3.1.0, items())
+ static iteration_proxy<const_iterator> iterator_wrapper(const_reference ref) noexcept
+ {
+ return ref.items();
+ }
+
+ /*!
+ @brief helper to access iterator member functions in range-based for
+
+    This function allows access to @ref iterator::key() and @ref
+ iterator::value() during range-based for loops. In these loops, a
+ reference to the JSON values is returned, so there is no access to the
+ underlying iterator.
+
+ For loop without `items()` function:
+
+ @code{cpp}
+ for (auto it = j_object.begin(); it != j_object.end(); ++it)
+ {
+ std::cout << "key: " << it.key() << ", value:" << it.value() << '\n';
+ }
+ @endcode
+
+ Range-based for loop without `items()` function:
+
+ @code{cpp}
+ for (auto it : j_object)
+ {
+ // "it" is of type json::reference and has no key() member
+ std::cout << "value: " << it << '\n';
+ }
+ @endcode
+
+ Range-based for loop with `items()` function:
+
+ @code{cpp}
+ for (auto& el : j_object.items())
+ {
+ std::cout << "key: " << el.key() << ", value:" << el.value() << '\n';
+ }
+ @endcode
+
+    The `items()` function also allows the use of
+ [structured bindings](https://en.cppreference.com/w/cpp/language/structured_binding)
+ (C++17):
+
+ @code{cpp}
+ for (auto& [key, val] : j_object.items())
+ {
+ std::cout << "key: " << key << ", value:" << val << '\n';
+ }
+ @endcode
+
+    @note When iterating over an array, `key()` will return the index of the
+          element as a string (see example). For primitive types (e.g., numbers),
+          `key()` returns an empty string.
+
+    @warning Using `items()` on temporary objects is dangerous. Make sure the
+             object's lifetime exceeds the iteration. See
+             <https://github.com/nlohmann/json/issues/2040> for more
+             information.
+
+ @return iteration proxy object wrapping @a ref with an interface to use in
+ range-based for loops
+
+ @liveexample{The following code shows how the function is used.,items}
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes in the JSON value.
+
+ @complexity Constant.
+
+ @since version 3.1.0, structured bindings support since 3.5.0.
+ */
+ iteration_proxy<iterator> items() noexcept
+ {
+ return iteration_proxy<iterator>(*this);
+ }
+
+ /*!
+ @copydoc items()
+ */
+ iteration_proxy<const_iterator> items() const noexcept
+ {
+ return iteration_proxy<const_iterator>(*this);
+ }
+
+ /// @}
+
+
+ //////////////
+ // capacity //
+ //////////////
+
+ /// @name capacity
+ /// @{
+
+ /*!
+ @brief checks whether the container is empty.
+
+ Checks if a JSON value has no elements (i.e. whether its @ref size is `0`).
+
+ @return The return value depends on the different types and is
+ defined as follows:
+ Value type | return value
+ ----------- | -------------
+ null | `true`
+ boolean | `false`
+ string | `false`
+ number | `false`
+ binary | `false`
+ object | result of function `object_t::empty()`
+ array | result of function `array_t::empty()`
+
+ @liveexample{The following code uses `empty()` to check if a JSON
+ object contains any elements.,empty}
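+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j_null;                 // null
+    json j_arr = json::array();
+    json j_str = "hello";
+    j_null.empty(); // true
+    j_arr.empty();  // true
+    j_str.empty();  // false
+    @endcode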
+
+ @complexity Constant, as long as @ref array_t and @ref object_t satisfy
+ the Container concept; that is, their `empty()` functions have constant
+ complexity.
+
+ @iterators No changes.
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @note This function does not return whether a string stored as a JSON
+    value is empty; it returns whether the JSON container itself is empty,
+    which is false in the case of a string.
+
+    @requirement This function helps `basic_json` satisfy the
+ [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+ requirements:
+ - The complexity is constant.
+ - Has the semantics of `begin() == end()`.
+
+ @sa see @ref size() -- returns the number of elements
+
+ @since version 1.0.0
+ */
+ bool empty() const noexcept
+ {
+ switch (m_type)
+ {
+ case value_t::null:
+ {
+ // null values are empty
+ return true;
+ }
+
+ case value_t::array:
+ {
+ // delegate call to array_t::empty()
+ return m_value.array->empty();
+ }
+
+ case value_t::object:
+ {
+ // delegate call to object_t::empty()
+ return m_value.object->empty();
+ }
+
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ // all other types are nonempty
+ return false;
+ }
+ }
+ }
+
+ /*!
+ @brief returns the number of elements
+
+ Returns the number of elements in a JSON value.
+
+ @return The return value depends on the different types and is
+ defined as follows:
+ Value type | return value
+ ----------- | -------------
+ null | `0`
+ boolean | `1`
+ string | `1`
+ number | `1`
+ binary | `1`
+ object | result of function object_t::size()
+ array | result of function array_t::size()
+
+ @liveexample{The following code calls `size()` on the different value
+ types.,size}
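+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j_null;            // null
+    json j_str = "hello";
+    json j_arr = {1, 2, 3};
+    j_null.size(); // 0
+    j_str.size();  // 1
+    j_arr.size();  // 3
+    @endcode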
+
+ @complexity Constant, as long as @ref array_t and @ref object_t satisfy
+ the Container concept; that is, their size() functions have constant
+ complexity.
+
+ @iterators No changes.
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @note This function does not return the length of a string stored as a
+    JSON value; it returns the number of elements in the JSON value, which
+    is 1 in the case of a string.
+
+    @requirement This function helps `basic_json` satisfy the
+ [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+ requirements:
+ - The complexity is constant.
+ - Has the semantics of `std::distance(begin(), end())`.
+
+ @sa see @ref empty() -- checks whether the container is empty
+ @sa see @ref max_size() -- returns the maximal number of elements
+
+ @since version 1.0.0
+ */
+ size_type size() const noexcept
+ {
+ switch (m_type)
+ {
+ case value_t::null:
+ {
+ // null values are empty
+ return 0;
+ }
+
+ case value_t::array:
+ {
+ // delegate call to array_t::size()
+ return m_value.array->size();
+ }
+
+ case value_t::object:
+ {
+ // delegate call to object_t::size()
+ return m_value.object->size();
+ }
+
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ // all other types have size 1
+ return 1;
+ }
+ }
+ }
+
+ /*!
+ @brief returns the maximum possible number of elements
+
+ Returns the maximum number of elements a JSON value is able to hold due to
+    system or library implementation limitations, i.e. `std::distance(begin(),
+    end())` for the largest possible JSON value.
+
+ @return The return value depends on the different types and is
+ defined as follows:
+ Value type | return value
+ ----------- | -------------
+ null | `0` (same as `size()`)
+ boolean | `1` (same as `size()`)
+ string | `1` (same as `size()`)
+ number | `1` (same as `size()`)
+ binary | `1` (same as `size()`)
+ object | result of function `object_t::max_size()`
+ array | result of function `array_t::max_size()`
+
+ @liveexample{The following code calls `max_size()` on the different value
+ types. Note the output is implementation specific.,max_size}
+
+ @complexity Constant, as long as @ref array_t and @ref object_t satisfy
+ the Container concept; that is, their `max_size()` functions have constant
+ complexity.
+
+ @iterators No changes.
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+    @requirement This function helps `basic_json` satisfy the
+ [Container](https://en.cppreference.com/w/cpp/named_req/Container)
+ requirements:
+ - The complexity is constant.
+ - Has the semantics of returning `b.size()` where `b` is the largest
+ possible JSON value.
+
+ @sa see @ref size() -- returns the number of elements
+
+ @since version 1.0.0
+ */
+ size_type max_size() const noexcept
+ {
+ switch (m_type)
+ {
+ case value_t::array:
+ {
+ // delegate call to array_t::max_size()
+ return m_value.array->max_size();
+ }
+
+ case value_t::object:
+ {
+ // delegate call to object_t::max_size()
+ return m_value.object->max_size();
+ }
+
+ case value_t::null:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ // all other types have max_size() == size()
+ return size();
+ }
+ }
+ }
+
+ /// @}
+
+
+ ///////////////
+ // modifiers //
+ ///////////////
+
+ /// @name modifiers
+ /// @{
+
+ /*!
+ @brief clears the contents
+
+ Clears the content of a JSON value and resets it to the default value as
+ if @ref basic_json(value_t) would have been called with the current value
+ type from @ref type():
+
+ Value type | initial value
+ ----------- | -------------
+ null | `null`
+ boolean | `false`
+ string | `""`
+ number | `0`
+ binary | An empty byte vector
+ object | `{}`
+ array | `[]`
+
+ @post Has the same effect as calling
+ @code {.cpp}
+ *this = basic_json(type());
+ @endcode
+
+ @liveexample{The example below shows the effect of `clear()` to different
+ JSON types.,clear}
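+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j = {1, 2, 3};
+    j.clear(); // j == []
+    json s = "text";
+    s.clear(); // s == ""
+    @endcode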
+
+ @complexity Linear in the size of the JSON value.
+
+ @iterators All iterators, pointers and references related to this container
+ are invalidated.
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+ @sa see @ref basic_json(value_t) -- constructor that creates an object with the
+    same value as calling `clear()`
+
+ @since version 1.0.0
+ */
+ void clear() noexcept
+ {
+ switch (m_type)
+ {
+ case value_t::number_integer:
+ {
+ m_value.number_integer = 0;
+ break;
+ }
+
+ case value_t::number_unsigned:
+ {
+ m_value.number_unsigned = 0;
+ break;
+ }
+
+ case value_t::number_float:
+ {
+ m_value.number_float = 0.0;
+ break;
+ }
+
+ case value_t::boolean:
+ {
+ m_value.boolean = false;
+ break;
+ }
+
+ case value_t::string:
+ {
+ m_value.string->clear();
+ break;
+ }
+
+ case value_t::binary:
+ {
+ m_value.binary->clear();
+ break;
+ }
+
+ case value_t::array:
+ {
+ m_value.array->clear();
+ break;
+ }
+
+ case value_t::object:
+ {
+ m_value.object->clear();
+ break;
+ }
+
+ case value_t::null:
+ case value_t::discarded:
+ default:
+ break;
+ }
+ }
+
+ /*!
+ @brief add an object to an array
+
+ Appends the given element @a val to the end of the JSON value. If the
+ function is called on a JSON null value, an empty array is created before
+ appending @a val.
+
+ @param[in] val the value to add to the JSON array
+
+ @throw type_error.308 when called on a type other than JSON array or
+ null; example: `"cannot use push_back() with number"`
+
+ @complexity Amortized constant.
+
+ @liveexample{The example shows how `push_back()` and `+=` can be used to
+ add elements to a JSON array. Note how the `null` value was silently
+ converted to a JSON array.,push_back}
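+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j;         // null
+    j.push_back(1); // j == [1]
+    j += 2;         // j == [1, 2]
+    @endcode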
+
+ @since version 1.0.0
+ */
+ void push_back(basic_json&& val)
+ {
+        // push_back only works for null values or arrays
+ if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array())))
+ {
+ JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name()), *this));
+ }
+
+        // transform null value into an array
+ if (is_null())
+ {
+ m_type = value_t::array;
+ m_value = value_t::array;
+ assert_invariant();
+ }
+
+ // add element to array (move semantics)
+ const auto old_capacity = m_value.array->capacity();
+ m_value.array->push_back(std::move(val));
+ set_parent(m_value.array->back(), old_capacity);
+ // if val is moved from, basic_json move constructor marks it null so we do not call the destructor
+ }
+
+ /*!
+ @brief add an object to an array
+ @copydoc push_back(basic_json&&)
+ */
+ reference operator+=(basic_json&& val)
+ {
+ push_back(std::move(val));
+ return *this;
+ }
+
+ /*!
+ @brief add an object to an array
+ @copydoc push_back(basic_json&&)
+ */
+ void push_back(const basic_json& val)
+ {
+        // push_back only works for null values or arrays
+ if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array())))
+ {
+ JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name()), *this));
+ }
+
+        // transform null value into an array
+ if (is_null())
+ {
+ m_type = value_t::array;
+ m_value = value_t::array;
+ assert_invariant();
+ }
+
+ // add element to array
+ const auto old_capacity = m_value.array->capacity();
+ m_value.array->push_back(val);
+ set_parent(m_value.array->back(), old_capacity);
+ }
+
+ /*!
+ @brief add an object to an array
+ @copydoc push_back(basic_json&&)
+ */
+ reference operator+=(const basic_json& val)
+ {
+ push_back(val);
+ return *this;
+ }
+
+ /*!
+ @brief add an object to an object
+
+ Inserts the given element @a val to the JSON object. If the function is
+ called on a JSON null value, an empty object is created before inserting
+ @a val.
+
+ @param[in] val the value to add to the JSON object
+
+ @throw type_error.308 when called on a type other than JSON object or
+ null; example: `"cannot use push_back() with number"`
+
+ @complexity Logarithmic in the size of the container, O(log(`size()`)).
+
+ @liveexample{The example shows how `push_back()` and `+=` can be used to
+ add elements to a JSON object. Note how the `null` value was silently
+ converted to a JSON object.,push_back__object_t__value}
+
+ @since version 1.0.0
+ */
+ void push_back(const typename object_t::value_type& val)
+ {
+        // push_back only works for null values or objects
+ if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_object())))
+ {
+ JSON_THROW(type_error::create(308, "cannot use push_back() with " + std::string(type_name()), *this));
+ }
+
+        // transform null value into an object
+ if (is_null())
+ {
+ m_type = value_t::object;
+ m_value = value_t::object;
+ assert_invariant();
+ }
+
+ // add element to object
+ auto res = m_value.object->insert(val);
+ set_parent(res.first->second);
+ }
+
+ /*!
+ @brief add an object to an object
+ @copydoc push_back(const typename object_t::value_type&)
+ */
+ reference operator+=(const typename object_t::value_type& val)
+ {
+ push_back(val);
+ return *this;
+ }
+
+ /*!
+ @brief add an object to an object
+
+    This function allows the use of `push_back` with an initializer list. In case
+
+ 1. the current value is an object,
+ 2. the initializer list @a init contains only two elements, and
+ 3. the first element of @a init is a string,
+
+ @a init is converted into an object element and added using
+ @ref push_back(const typename object_t::value_type&). Otherwise, @a init
+ is converted to a JSON value and added using @ref push_back(basic_json&&).
+
+ @param[in] init an initializer list
+
+ @complexity Linear in the size of the initializer list @a init.
+
+ @note This function is required to resolve an ambiguous overload error,
+ because pairs like `{"key", "value"}` can be both interpreted as
+ `object_t::value_type` or `std::initializer_list<basic_json>`, see
+ https://github.com/nlohmann/json/issues/235 for more information.
+
+ @liveexample{The example shows how initializer lists are treated as
+ objects when possible.,push_back__initializer_list}
+ */
+ void push_back(initializer_list_t init)
+ {
+ if (is_object() && init.size() == 2 && (*init.begin())->is_string())
+ {
+ basic_json&& key = init.begin()->moved_or_copied();
+ push_back(typename object_t::value_type(
+ std::move(key.get_ref<string_t&>()), (init.begin() + 1)->moved_or_copied()));
+ }
+ else
+ {
+ push_back(basic_json(init));
+ }
+ }
+
+ /*!
+ @brief add an object to an object
+ @copydoc push_back(initializer_list_t)
+ */
+ reference operator+=(initializer_list_t init)
+ {
+ push_back(init);
+ return *this;
+ }
+
+ /*!
+ @brief add an object to an array
+
+    Creates a JSON value from the passed parameters @a args and appends it
+    to the end of the JSON value. If the function is called on a JSON null
+    value, an empty array is created before appending the value created
+    from @a args.
+
+ @param[in] args arguments to forward to a constructor of @ref basic_json
+ @tparam Args compatible types to create a @ref basic_json object
+
+ @return reference to the inserted element
+
+ @throw type_error.311 when called on a type other than JSON array or
+ null; example: `"cannot use emplace_back() with number"`
+
+ @complexity Amortized constant.
+
+    @liveexample{The example shows how `emplace_back()` can be used to add
+    elements to a JSON array. Note how the `null` value was silently converted
+    to a JSON array.,emplace_back}
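+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j = {1, 2};
+    auto& ref = j.emplace_back(3); // j == [1, 2, 3]; ref refers to the new element
+    @endcode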
+
+ @since version 2.0.8, returns reference since 3.7.0
+ */
+ template<class... Args>
+ reference emplace_back(Args&& ... args)
+ {
+        // emplace_back only works for null values or arrays
+ if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_array())))
+ {
+ JSON_THROW(type_error::create(311, "cannot use emplace_back() with " + std::string(type_name()), *this));
+ }
+
+        // transform null value into an array
+ if (is_null())
+ {
+ m_type = value_t::array;
+ m_value = value_t::array;
+ assert_invariant();
+ }
+
+ // add element to array (perfect forwarding)
+ const auto old_capacity = m_value.array->capacity();
+ m_value.array->emplace_back(std::forward<Args>(args)...);
+ return set_parent(m_value.array->back(), old_capacity);
+ }
+
+ /*!
+ @brief add an object to an object if key does not exist
+
+ Inserts a new element into a JSON object constructed in-place with the
+ given @a args if there is no element with the key in the container. If the
+ function is called on a JSON null value, an empty object is created before
+ appending the value created from @a args.
+
+ @param[in] args arguments to forward to a constructor of @ref basic_json
+ @tparam Args compatible types to create a @ref basic_json object
+
+ @return a pair consisting of an iterator to the inserted element, or the
+ already-existing element if no insertion happened, and a bool
+ denoting whether the insertion took place.
+
+ @throw type_error.311 when called on a type other than JSON object or
+ null; example: `"cannot use emplace() with number"`
+
+ @complexity Logarithmic in the size of the container, O(log(`size()`)).
+
+ @liveexample{The example shows how `emplace()` can be used to add elements
+ to a JSON object. Note how the `null` value was silently converted to a
+ JSON object. Further note how no value is added if there was already one
+ value stored with the same key.,emplace}
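+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j = json::object();
+    auto p1 = j.emplace("one", 1); // p1.second == true
+    auto p2 = j.emplace("one", 2); // p2.second == false, value unchanged
+    @endcode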
+
+ @since version 2.0.8
+ */
+ template<class... Args>
+ std::pair<iterator, bool> emplace(Args&& ... args)
+ {
+        // emplace only works for null values or objects
+ if (JSON_HEDLEY_UNLIKELY(!(is_null() || is_object())))
+ {
+ JSON_THROW(type_error::create(311, "cannot use emplace() with " + std::string(type_name()), *this));
+ }
+
+        // transform null value into an object
+ if (is_null())
+ {
+ m_type = value_t::object;
+ m_value = value_t::object;
+ assert_invariant();
+ }
+
+        // add element to object (perfect forwarding)
+ auto res = m_value.object->emplace(std::forward<Args>(args)...);
+ set_parent(res.first->second);
+
+ // create result iterator and set iterator to the result of emplace
+ auto it = begin();
+ it.m_it.object_iterator = res.first;
+
+ // return pair of iterator and boolean
+ return {it, res.second};
+ }
+
+ /// Helper for insertion of an iterator
+ /// @note: This uses std::distance to support GCC 4.8,
+ /// see https://github.com/nlohmann/json/pull/1257
+ template<typename... Args>
+ iterator insert_iterator(const_iterator pos, Args&& ... args)
+ {
+ iterator result(this);
+ JSON_ASSERT(m_value.array != nullptr);
+
+ auto insert_pos = std::distance(m_value.array->begin(), pos.m_it.array_iterator);
+ m_value.array->insert(pos.m_it.array_iterator, std::forward<Args>(args)...);
+ result.m_it.array_iterator = m_value.array->begin() + insert_pos;
+
+        // This could have been written as:
+        // result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, std::forward<Args>(args)...);
+        // but the return value of insert is missing in GCC 4.8, so it is written this way instead.
+
+ set_parents();
+ return result;
+ }
+
+ /*!
+ @brief inserts element
+
+ Inserts element @a val before iterator @a pos.
+
+ @param[in] pos iterator before which the content will be inserted; may be
+ the end() iterator
+ @param[in] val element to insert
+ @return iterator pointing to the inserted @a val.
+
+ @throw type_error.309 if called on JSON values other than arrays;
+ example: `"cannot use insert() with string"`
+ @throw invalid_iterator.202 if @a pos is not an iterator of *this;
+ example: `"iterator does not fit current value"`
+
+ @complexity Constant plus linear in the distance between @a pos and end of
+ the container.
+
+ @liveexample{The example shows how `insert()` is used.,insert}
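+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json j = {1, 3};
+    j.insert(j.begin() + 1, 2); // j == [1, 2, 3]
+    @endcode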
+
+ @since version 1.0.0
+ */
+ iterator insert(const_iterator pos, const basic_json& val)
+ {
+ // insert only works for arrays
+ if (JSON_HEDLEY_LIKELY(is_array()))
+ {
+ // check if iterator pos fits to this JSON value
+ if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
+ {
+ JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this));
+ }
+
+ // insert to array and return iterator
+ return insert_iterator(pos, val);
+ }
+
+ JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this));
+ }
+
+ /*!
+ @brief inserts element
+ @copydoc insert(const_iterator, const basic_json&)
+ */
+ iterator insert(const_iterator pos, basic_json&& val)
+ {
+ return insert(pos, val);
+ }
+
+ /*!
+ @brief inserts elements
+
+ Inserts @a cnt copies of @a val before iterator @a pos.
+
+ @param[in] pos iterator before which the content will be inserted; may be
+ the end() iterator
+ @param[in] cnt number of copies of @a val to insert
+ @param[in] val element to insert
+ @return iterator pointing to the first element inserted, or @a pos if
+ `cnt==0`
+
+ @throw type_error.309 if called on JSON values other than arrays; example:
+ `"cannot use insert() with string"`
+ @throw invalid_iterator.202 if @a pos is not an iterator of *this;
+ example: `"iterator does not fit current value"`
+
+ @complexity Linear in @a cnt plus linear in the distance between @a pos
+ and end of the container.
+
+ @liveexample{The example shows how `insert()` is used.,insert__count}
+
+ @since version 1.0.0
+ */
+ iterator insert(const_iterator pos, size_type cnt, const basic_json& val)
+ {
+ // insert only works for arrays
+ if (JSON_HEDLEY_LIKELY(is_array()))
+ {
+ // check if iterator pos fits to this JSON value
+ if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
+ {
+ JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this));
+ }
+
+ // insert to array and return iterator
+ return insert_iterator(pos, cnt, val);
+ }
+
+ JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this));
+ }
+
+ /*!
+ @brief inserts elements
+
+ Inserts elements from range `[first, last)` before iterator @a pos.
+
+ @param[in] pos iterator before which the content will be inserted; may be
+ the end() iterator
+ @param[in] first begin of the range of elements to insert
+ @param[in] last end of the range of elements to insert
+
+ @throw type_error.309 if called on JSON values other than arrays; example:
+ `"cannot use insert() with string"`
+ @throw invalid_iterator.202 if @a pos is not an iterator of *this;
+ example: `"iterator does not fit current value"`
+ @throw invalid_iterator.210 if @a first and @a last do not belong to the
+ same JSON value; example: `"iterators do not fit"`
+ @throw invalid_iterator.211 if @a first or @a last are iterators into
+ container for which insert is called; example: `"passed iterators may not
+ belong to container"`
+
+ @return iterator pointing to the first element inserted, or @a pos if
+ `first==last`
+
+ @complexity Linear in `std::distance(first, last)` plus linear in the
+ distance between @a pos and end of the container.
+
+ @liveexample{The example shows how `insert()` is used.,insert__range}
+
+ @since version 1.0.0
+ */
+ iterator insert(const_iterator pos, const_iterator first, const_iterator last)
+ {
+ // insert only works for arrays
+ if (JSON_HEDLEY_UNLIKELY(!is_array()))
+ {
+ JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this));
+ }
+
+ // check if iterator pos fits to this JSON value
+ if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
+ {
+ JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this));
+ }
+
+ // check if range iterators belong to the same JSON object
+ if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
+ {
+ JSON_THROW(invalid_iterator::create(210, "iterators do not fit", *this));
+ }
+
+ if (JSON_HEDLEY_UNLIKELY(first.m_object == this))
+ {
+ JSON_THROW(invalid_iterator::create(211, "passed iterators may not belong to container", *this));
+ }
+
+ // insert to array and return iterator
+ return insert_iterator(pos, first.m_it.array_iterator, last.m_it.array_iterator);
+ }
+
+ /*!
+ @brief inserts elements
+
+ Inserts elements from initializer list @a ilist before iterator @a pos.
+
+ @param[in] pos iterator before which the content will be inserted; may be
+ the end() iterator
+ @param[in] ilist initializer list to insert the values from
+
+ @throw type_error.309 if called on JSON values other than arrays; example:
+ `"cannot use insert() with string"`
+ @throw invalid_iterator.202 if @a pos is not an iterator of *this;
+ example: `"iterator does not fit current value"`
+
+ @return iterator pointing to the first element inserted, or @a pos if
+ `ilist` is empty
+
+ @complexity Linear in `ilist.size()` plus linear in the distance between
+ @a pos and end of the container.
+
+ @liveexample{The example shows how `insert()` is used.,insert__ilist}
+
+ @since version 1.0.0
+ */
+ iterator insert(const_iterator pos, initializer_list_t ilist)
+ {
+ // insert only works for arrays
+ if (JSON_HEDLEY_UNLIKELY(!is_array()))
+ {
+ JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this));
+ }
+
+ // check if iterator pos fits to this JSON value
+ if (JSON_HEDLEY_UNLIKELY(pos.m_object != this))
+ {
+ JSON_THROW(invalid_iterator::create(202, "iterator does not fit current value", *this));
+ }
+
+ // insert to array and return iterator
+ return insert_iterator(pos, ilist.begin(), ilist.end());
+ }
+
+ /*!
+ @brief inserts elements
+
+ Inserts elements from range `[first, last)`.
+
+ @param[in] first begin of the range of elements to insert
+ @param[in] last end of the range of elements to insert
+
+ @throw type_error.309 if called on JSON values other than objects; example:
+ `"cannot use insert() with string"`
+    @throw invalid_iterator.202 if iterator @a first or @a last does not
+ point to an object; example: `"iterators first and last must point to
+ objects"`
+ @throw invalid_iterator.210 if @a first and @a last do not belong to the
+ same JSON value; example: `"iterators do not fit"`
+
+    @complexity `O(N*log(size() + N))`, where `N` is the number
+ of elements to insert.
+
+ @liveexample{The example shows how `insert()` is used.,insert__range_object}
+
+ @since version 3.0.0
+ */
+ void insert(const_iterator first, const_iterator last)
+ {
+ // insert only works for objects
+ if (JSON_HEDLEY_UNLIKELY(!is_object()))
+ {
+ JSON_THROW(type_error::create(309, "cannot use insert() with " + std::string(type_name()), *this));
+ }
+
+ // check if range iterators belong to the same JSON object
+ if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
+ {
+ JSON_THROW(invalid_iterator::create(210, "iterators do not fit", *this));
+ }
+
+ // passed iterators must belong to objects
+ if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object()))
+ {
+ JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects", *this));
+ }
+
+ m_value.object->insert(first.m_it.object_iterator, last.m_it.object_iterator);
+ }
+
+ /*!
+ @brief updates a JSON object from another object, overwriting existing keys
+
+ Inserts all values from JSON object @a j and overwrites existing keys.
+
+ @param[in] j JSON object to read values from
+
+ @throw type_error.312 if called on JSON values other than objects; example:
+ `"cannot use update() with string"`
+
+ @complexity O(N*log(size() + N)), where N is the number of elements to
+ insert.
+
+ @liveexample{The example shows how `update()` is used.,update}
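+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json a = {{"one", 1}, {"two", 2}};
+    json b = {{"two", 22}, {"three", 3}};
+    a.update(b); // a == {"one": 1, "two": 22, "three": 3}
+    @endcode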
+
+ @sa https://docs.python.org/3.6/library/stdtypes.html#dict.update
+
+ @since version 3.0.0
+ */
+ void update(const_reference j)
+ {
+ // implicitly convert null value to an empty object
+ if (is_null())
+ {
+ m_type = value_t::object;
+ m_value.object = create<object_t>();
+ assert_invariant();
+ }
+
+ if (JSON_HEDLEY_UNLIKELY(!is_object()))
+ {
+ JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(type_name()), *this));
+ }
+ if (JSON_HEDLEY_UNLIKELY(!j.is_object()))
+ {
+ JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(j.type_name()), *this));
+ }
+
+ for (auto it = j.cbegin(); it != j.cend(); ++it)
+ {
+ m_value.object->operator[](it.key()) = it.value();
+ }
+ }
+
+ /*!
+ @brief updates a JSON object from another object, overwriting existing keys
+
+    Inserts all values from the range `[first, last)` and overwrites existing
+ keys.
+
+ @param[in] first begin of the range of elements to insert
+ @param[in] last end of the range of elements to insert
+
+ @throw type_error.312 if called on JSON values other than objects; example:
+ `"cannot use update() with string"`
+    @throw invalid_iterator.202 if iterator @a first or @a last does not
+ point to an object; example: `"iterators first and last must point to
+ objects"`
+ @throw invalid_iterator.210 if @a first and @a last do not belong to the
+ same JSON value; example: `"iterators do not fit"`
+
+ @complexity O(N*log(size() + N)), where N is the number of elements to
+ insert.
+
+    @liveexample{The example shows how `update()` is used.,update__range}
+
+ @sa https://docs.python.org/3.6/library/stdtypes.html#dict.update
+
+ @since version 3.0.0
+ */
+ void update(const_iterator first, const_iterator last)
+ {
+ // implicitly convert null value to an empty object
+ if (is_null())
+ {
+ m_type = value_t::object;
+ m_value.object = create<object_t>();
+ assert_invariant();
+ }
+
+ if (JSON_HEDLEY_UNLIKELY(!is_object()))
+ {
+ JSON_THROW(type_error::create(312, "cannot use update() with " + std::string(type_name()), *this));
+ }
+
+ // check if range iterators belong to the same JSON object
+ if (JSON_HEDLEY_UNLIKELY(first.m_object != last.m_object))
+ {
+ JSON_THROW(invalid_iterator::create(210, "iterators do not fit", *this));
+ }
+
+ // passed iterators must belong to objects
+ if (JSON_HEDLEY_UNLIKELY(!first.m_object->is_object()
+ || !last.m_object->is_object()))
+ {
+ JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects", *this));
+ }
+
+ for (auto it = first; it != last; ++it)
+ {
+ m_value.object->operator[](it.key()) = it.value();
+ }
+ }
+
+ /*!
+ @brief exchanges the values
+
+ Exchanges the contents of the JSON value with those of @a other. Does not
+ invoke any move, copy, or swap operations on individual elements. All
+ iterators and references remain valid. The past-the-end iterator is
+ invalidated.
+
+ @param[in,out] other JSON value to exchange the contents with
+
+ @complexity Constant.
+
+ @liveexample{The example below shows how JSON values can be swapped with
+ `swap()`.,swap__reference}
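+
+    A minimal usage sketch (assuming the `json` alias):
+
+    @code{.cpp}
+    json a = 17;
+    json b = {{"key", "value"}};
+    a.swap(b); // a is now the object, b == 17
+    @endcode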
+
+ @since version 1.0.0
+ */
+ void swap(reference other) noexcept (
+ std::is_nothrow_move_constructible<value_t>::value&&
+ std::is_nothrow_move_assignable<value_t>::value&&
+ std::is_nothrow_move_constructible<json_value>::value&&
+ std::is_nothrow_move_assignable<json_value>::value
+ )
+ {
+ std::swap(m_type, other.m_type);
+ std::swap(m_value, other.m_value);
+
+ set_parents();
+ other.set_parents();
+ assert_invariant();
+ }
+
+ /*!
+ @brief exchanges the values
+
+ Exchanges the contents of the JSON value from @a left with those of @a right. Does not
+ invoke any move, copy, or swap operations on individual elements. All
+ iterators and references remain valid. The past-the-end iterator is
+    invalidated. Implemented as a friend function callable via ADL.
+
+ @param[in,out] left JSON value to exchange the contents with
+ @param[in,out] right JSON value to exchange the contents with
+
+ @complexity Constant.
+
+ @liveexample{The example below shows how JSON values can be swapped with
+ `swap()`.,swap__reference}
+
+ @since version 1.0.0
+ */
+ friend void swap(reference left, reference right) noexcept (
+ std::is_nothrow_move_constructible<value_t>::value&&
+ std::is_nothrow_move_assignable<value_t>::value&&
+ std::is_nothrow_move_constructible<json_value>::value&&
+ std::is_nothrow_move_assignable<json_value>::value
+ )
+ {
+ left.swap(right);
+ }
+
+ /*!
+ @brief exchanges the values
+
+ Exchanges the contents of a JSON array with those of @a other. Does not
+ invoke any move, copy, or swap operations on individual elements. All
+ iterators and references remain valid. The past-the-end iterator is
+ invalidated.
+
+ @param[in,out] other array to exchange the contents with
+
+ @throw type_error.310 when JSON value is not an array; example: `"cannot
+ use swap() with string"`
+
+ @complexity Constant.
+
+ @liveexample{The example below shows how arrays can be swapped with
+ `swap()`.,swap__array_t}
+
+ @since version 1.0.0
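+
+ A minimal sketch (assuming the usual `json` alias, whose default `array_t`
+ is `std::vector<basic_json>`):
+ @code {.cpp}
+ json j = {1, 2, 3};
+ json::array_t a = {"x", "y"};
+ j.swap(a);
+ // j == ["x", "y"], a == [1, 2, 3]
+ @endcode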
+ */
+ void swap(array_t& other) // NOLINT(bugprone-exception-escape)
+ {
+ // swap only works for arrays
+ if (JSON_HEDLEY_LIKELY(is_array()))
+ {
+ std::swap(*(m_value.array), other);
+ }
+ else
+ {
+ JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this));
+ }
+ }
+
+ /*!
+ @brief exchanges the values
+
+ Exchanges the contents of a JSON object with those of @a other. Does not
+ invoke any move, copy, or swap operations on individual elements. All
+ iterators and references remain valid. The past-the-end iterator is
+ invalidated.
+
+ @param[in,out] other object to exchange the contents with
+
+ @throw type_error.310 when JSON value is not an object; example:
+ `"cannot use swap() with string"`
+
+ @complexity Constant.
+
+ @liveexample{The example below shows how objects can be swapped with
+ `swap()`.,swap__object_t}
+
+ @since version 1.0.0
+ */
+ void swap(object_t& other) // NOLINT(bugprone-exception-escape)
+ {
+ // swap only works for objects
+ if (JSON_HEDLEY_LIKELY(is_object()))
+ {
+ std::swap(*(m_value.object), other);
+ }
+ else
+ {
+ JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this));
+ }
+ }
+
+ /*!
+ @brief exchanges the values
+
+ Exchanges the contents of a JSON string with those of @a other. Does not
+ invoke any move, copy, or swap operations on individual elements. All
+ iterators and references remain valid. The past-the-end iterator is
+ invalidated.
+
+ @param[in,out] other string to exchange the contents with
+
+ @throw type_error.310 when JSON value is not a string; example: `"cannot
+ use swap() with boolean"`
+
+ @complexity Constant.
+
+ @liveexample{The example below shows how strings can be swapped with
+ `swap()`.,swap__string_t}
+
+ @since version 1.0.0
+ */
+ void swap(string_t& other) // NOLINT(bugprone-exception-escape)
+ {
+ // swap only works for strings
+ if (JSON_HEDLEY_LIKELY(is_string()))
+ {
+ std::swap(*(m_value.string), other);
+ }
+ else
+ {
+ JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this));
+ }
+ }
+
+ /*!
+ @brief exchanges the values
+
+ Exchanges the contents of a JSON binary value with those of @a other. Does not
+ invoke any move, copy, or swap operations on individual elements. All
+ iterators and references remain valid. The past-the-end iterator is
+ invalidated.
+
+ @param[in,out] other binary to exchange the contents with
+
+ @throw type_error.310 when JSON value is not a binary; example: `"cannot
+ use swap() with boolean"`
+
+ @complexity Constant.
+
+ @liveexample{The example below shows how binary values can be swapped with
+ `swap()`.,swap__binary_t}
+
+ @since version 3.8.0
+ */
+ void swap(binary_t& other) // NOLINT(bugprone-exception-escape)
+ {
+ // swap only works for binary
+ if (JSON_HEDLEY_LIKELY(is_binary()))
+ {
+ std::swap(*(m_value.binary), other);
+ }
+ else
+ {
+ JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this));
+ }
+ }
+
+ /// @copydoc swap(binary_t&)
+ void swap(typename binary_t::container_type& other) // NOLINT(bugprone-exception-escape)
+ {
+ // swap only works for binary
+ if (JSON_HEDLEY_LIKELY(is_binary()))
+ {
+ std::swap(*(m_value.binary), other);
+ }
+ else
+ {
+ JSON_THROW(type_error::create(310, "cannot use swap() with " + std::string(type_name()), *this));
+ }
+ }
+
+ /// @}
+
+ public:
+ //////////////////////////////////////////
+ // lexicographical comparison operators //
+ //////////////////////////////////////////
+
+ /// @name lexicographical comparison operators
+ /// @{
+
+ /*!
+ @brief comparison: equal
+
+ Compares two JSON values for equality according to the following rules:
+ - Two JSON values are equal if (1) they are from the same type and (2)
+ their stored values are the same according to their respective
+ `operator==`.
+ - Integer and floating-point numbers are automatically converted before
+ comparison. Note that two NaN values are always treated as unequal.
+ - Two JSON null values are equal.
+
+ @note Floating-point numbers inside JSON values are compared with
+ `json::number_float_t::operator==` which is `double::operator==` by
+ default. To compare floating-point numbers while respecting an epsilon, an alternative
+ [comparison function](https://github.com/mariokonrad/marnav/blob/master/include/marnav/math/floatingpoint.hpp#L34-#L39)
+ could be used, for instance
+ @code {.cpp}
+ template<typename T, typename = typename std::enable_if<std::is_floating_point<T>::value, T>::type>
+ inline bool is_same(T a, T b, T epsilon = std::numeric_limits<T>::epsilon()) noexcept
+ {
+ return std::abs(a - b) <= epsilon;
+ }
+ @endcode
+ Alternatively, you can define your own equality comparison function like this:
+ @code {.cpp}
+ bool my_equal(const_reference lhs, const_reference rhs) {
+ const auto lhs_type = lhs.type();
+ const auto rhs_type = rhs.type();
+ if (lhs_type == rhs_type) {
+ switch(lhs_type) {
+ // self-defined case
+ case value_t::number_float:
+ return std::abs(lhs - rhs) <= std::numeric_limits<float>::epsilon();
+ // other cases remain the same as the original
+ ...
+ }
+ }
+ ...
+ }
+ @endcode
+
+ @note NaN values never compare equal to themselves or to other NaN values.
+
+ @param[in] lhs first JSON value to consider
+ @param[in] rhs second JSON value to consider
+ @return whether the values @a lhs and @a rhs are equal
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+ @complexity Linear.
+
+ @liveexample{The example demonstrates comparing several JSON
+ types.,operator__equal}
+
+ @since version 1.0.0
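+
+ A minimal sketch of the numeric conversion rule (assuming the usual `json`
+ alias and `<cmath>` for `NAN`):
+ @code {.cpp}
+ json(1) == json(1.0);   // true: the integer is converted before comparison
+ json(1) == json(2);     // false
+ json(NAN) == json(NAN); // false: NaN never compares equal
+ @endcode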
+ */
+ friend bool operator==(const_reference lhs, const_reference rhs) noexcept
+ {
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+ const auto lhs_type = lhs.type();
+ const auto rhs_type = rhs.type();
+
+ if (lhs_type == rhs_type)
+ {
+ switch (lhs_type)
+ {
+ case value_t::array:
+ return *lhs.m_value.array == *rhs.m_value.array;
+
+ case value_t::object:
+ return *lhs.m_value.object == *rhs.m_value.object;
+
+ case value_t::null:
+ return true;
+
+ case value_t::string:
+ return *lhs.m_value.string == *rhs.m_value.string;
+
+ case value_t::boolean:
+ return lhs.m_value.boolean == rhs.m_value.boolean;
+
+ case value_t::number_integer:
+ return lhs.m_value.number_integer == rhs.m_value.number_integer;
+
+ case value_t::number_unsigned:
+ return lhs.m_value.number_unsigned == rhs.m_value.number_unsigned;
+
+ case value_t::number_float:
+ return lhs.m_value.number_float == rhs.m_value.number_float;
+
+ case value_t::binary:
+ return *lhs.m_value.binary == *rhs.m_value.binary;
+
+ case value_t::discarded:
+ default:
+ return false;
+ }
+ }
+ else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float)
+ {
+ return static_cast<number_float_t>(lhs.m_value.number_integer) == rhs.m_value.number_float;
+ }
+ else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer)
+ {
+ return lhs.m_value.number_float == static_cast<number_float_t>(rhs.m_value.number_integer);
+ }
+ else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float)
+ {
+ return static_cast<number_float_t>(lhs.m_value.number_unsigned) == rhs.m_value.number_float;
+ }
+ else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned)
+ {
+ return lhs.m_value.number_float == static_cast<number_float_t>(rhs.m_value.number_unsigned);
+ }
+ else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer)
+ {
+ return static_cast<number_integer_t>(lhs.m_value.number_unsigned) == rhs.m_value.number_integer;
+ }
+ else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned)
+ {
+ return lhs.m_value.number_integer == static_cast<number_integer_t>(rhs.m_value.number_unsigned);
+ }
+
+ return false;
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+ }
+
+ /*!
+ @brief comparison: equal
+ @copydoc operator==(const_reference, const_reference)
+ */
+ template<typename ScalarType, typename std::enable_if<
+ std::is_scalar<ScalarType>::value, int>::type = 0>
+ friend bool operator==(const_reference lhs, ScalarType rhs) noexcept
+ {
+ return lhs == basic_json(rhs);
+ }
+
+ /*!
+ @brief comparison: equal
+ @copydoc operator==(const_reference, const_reference)
+ */
+ template<typename ScalarType, typename std::enable_if<
+ std::is_scalar<ScalarType>::value, int>::type = 0>
+ friend bool operator==(ScalarType lhs, const_reference rhs) noexcept
+ {
+ return basic_json(lhs) == rhs;
+ }
+
+ /*!
+ @brief comparison: not equal
+
+ Compares two JSON values for inequality by calculating `not (lhs == rhs)`.
+
+ @param[in] lhs first JSON value to consider
+ @param[in] rhs second JSON value to consider
+ @return whether the values @a lhs and @a rhs are not equal
+
+ @complexity Linear.
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+ @liveexample{The example demonstrates comparing several JSON
+ types.,operator__notequal}
+
+ @since version 1.0.0
+ */
+ friend bool operator!=(const_reference lhs, const_reference rhs) noexcept
+ {
+ return !(lhs == rhs);
+ }
+
+ /*!
+ @brief comparison: not equal
+ @copydoc operator!=(const_reference, const_reference)
+ */
+ template<typename ScalarType, typename std::enable_if<
+ std::is_scalar<ScalarType>::value, int>::type = 0>
+ friend bool operator!=(const_reference lhs, ScalarType rhs) noexcept
+ {
+ return lhs != basic_json(rhs);
+ }
+
+ /*!
+ @brief comparison: not equal
+ @copydoc operator!=(const_reference, const_reference)
+ */
+ template<typename ScalarType, typename std::enable_if<
+ std::is_scalar<ScalarType>::value, int>::type = 0>
+ friend bool operator!=(ScalarType lhs, const_reference rhs) noexcept
+ {
+ return basic_json(lhs) != rhs;
+ }
+
+ /*!
+ @brief comparison: less than
+
+ Compares whether one JSON value @a lhs is less than another JSON value @a
+ rhs according to the following rules:
+ - If @a lhs and @a rhs have the same type, the values are compared using
+ the default `<` operator.
+ - Integer and floating-point numbers are automatically converted before
+ comparison
+ - In case @a lhs and @a rhs have different types, the values are ignored
+ and the order of the types is considered, see
+ @ref operator<(const value_t, const value_t).
+
+ @param[in] lhs first JSON value to consider
+ @param[in] rhs second JSON value to consider
+ @return whether @a lhs is less than @a rhs
+
+ @complexity Linear.
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+ @liveexample{The example demonstrates comparing several JSON
+ types.,operator__less}
+
+ @since version 1.0.0
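+
+ A minimal sketch of the cross-type ordering (assuming the usual `json`
+ alias):
+ @code {.cpp}
+ json(1) < json(1.5);     // true: numbers are converted and compared
+ json(42) < json("abc");  // true: number types order before string types
+ json(nullptr) < json(0); // true: null orders before any other type
+ @endcode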
+ */
+ friend bool operator<(const_reference lhs, const_reference rhs) noexcept
+ {
+ const auto lhs_type = lhs.type();
+ const auto rhs_type = rhs.type();
+
+ if (lhs_type == rhs_type)
+ {
+ switch (lhs_type)
+ {
+ case value_t::array:
+ // note parentheses are necessary, see
+ // https://github.com/nlohmann/json/issues/1530
+ return (*lhs.m_value.array) < (*rhs.m_value.array);
+
+ case value_t::object:
+ return (*lhs.m_value.object) < (*rhs.m_value.object);
+
+ case value_t::null:
+ return false;
+
+ case value_t::string:
+ return (*lhs.m_value.string) < (*rhs.m_value.string);
+
+ case value_t::boolean:
+ return (lhs.m_value.boolean) < (rhs.m_value.boolean);
+
+ case value_t::number_integer:
+ return (lhs.m_value.number_integer) < (rhs.m_value.number_integer);
+
+ case value_t::number_unsigned:
+ return (lhs.m_value.number_unsigned) < (rhs.m_value.number_unsigned);
+
+ case value_t::number_float:
+ return (lhs.m_value.number_float) < (rhs.m_value.number_float);
+
+ case value_t::binary:
+ return (*lhs.m_value.binary) < (*rhs.m_value.binary);
+
+ case value_t::discarded:
+ default:
+ return false;
+ }
+ }
+ else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float)
+ {
+ return static_cast<number_float_t>(lhs.m_value.number_integer) < rhs.m_value.number_float;
+ }
+ else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer)
+ {
+ return lhs.m_value.number_float < static_cast<number_float_t>(rhs.m_value.number_integer);
+ }
+ else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float)
+ {
+ return static_cast<number_float_t>(lhs.m_value.number_unsigned) < rhs.m_value.number_float;
+ }
+ else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned)
+ {
+ return lhs.m_value.number_float < static_cast<number_float_t>(rhs.m_value.number_unsigned);
+ }
+ else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned)
+ {
+ return lhs.m_value.number_integer < static_cast<number_integer_t>(rhs.m_value.number_unsigned);
+ }
+ else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer)
+ {
+ return static_cast<number_integer_t>(lhs.m_value.number_unsigned) < rhs.m_value.number_integer;
+ }
+
+ // We only reach this line if we cannot compare values. In that case,
+ // we compare types. Note we have to call the operator explicitly,
+ // because MSVC has problems otherwise.
+ return operator<(lhs_type, rhs_type);
+ }
+
+ /*!
+ @brief comparison: less than
+ @copydoc operator<(const_reference, const_reference)
+ */
+ template<typename ScalarType, typename std::enable_if<
+ std::is_scalar<ScalarType>::value, int>::type = 0>
+ friend bool operator<(const_reference lhs, ScalarType rhs) noexcept
+ {
+ return lhs < basic_json(rhs);
+ }
+
+ /*!
+ @brief comparison: less than
+ @copydoc operator<(const_reference, const_reference)
+ */
+ template<typename ScalarType, typename std::enable_if<
+ std::is_scalar<ScalarType>::value, int>::type = 0>
+ friend bool operator<(ScalarType lhs, const_reference rhs) noexcept
+ {
+ return basic_json(lhs) < rhs;
+ }
+
+ /*!
+ @brief comparison: less than or equal
+
+ Compares whether one JSON value @a lhs is less than or equal to another
+ JSON value by calculating `not (rhs < lhs)`.
+
+ @param[in] lhs first JSON value to consider
+ @param[in] rhs second JSON value to consider
+ @return whether @a lhs is less than or equal to @a rhs
+
+ @complexity Linear.
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+ @liveexample{The example demonstrates comparing several JSON
+ types.,operator__lessequal}
+
+ @since version 1.0.0
+ */
+ friend bool operator<=(const_reference lhs, const_reference rhs) noexcept
+ {
+ return !(rhs < lhs);
+ }
+
+ /*!
+ @brief comparison: less than or equal
+ @copydoc operator<=(const_reference, const_reference)
+ */
+ template<typename ScalarType, typename std::enable_if<
+ std::is_scalar<ScalarType>::value, int>::type = 0>
+ friend bool operator<=(const_reference lhs, ScalarType rhs) noexcept
+ {
+ return lhs <= basic_json(rhs);
+ }
+
+ /*!
+ @brief comparison: less than or equal
+ @copydoc operator<=(const_reference, const_reference)
+ */
+ template<typename ScalarType, typename std::enable_if<
+ std::is_scalar<ScalarType>::value, int>::type = 0>
+ friend bool operator<=(ScalarType lhs, const_reference rhs) noexcept
+ {
+ return basic_json(lhs) <= rhs;
+ }
+
+ /*!
+ @brief comparison: greater than
+
+ Compares whether one JSON value @a lhs is greater than another
+ JSON value by calculating `not (lhs <= rhs)`.
+
+ @param[in] lhs first JSON value to consider
+ @param[in] rhs second JSON value to consider
+ @return whether @a lhs is greater than @a rhs
+
+ @complexity Linear.
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+ @liveexample{The example demonstrates comparing several JSON
+ types.,operator__greater}
+
+ @since version 1.0.0
+ */
+ friend bool operator>(const_reference lhs, const_reference rhs) noexcept
+ {
+ return !(lhs <= rhs);
+ }
+
+ /*!
+ @brief comparison: greater than
+ @copydoc operator>(const_reference, const_reference)
+ */
+ template<typename ScalarType, typename std::enable_if<
+ std::is_scalar<ScalarType>::value, int>::type = 0>
+ friend bool operator>(const_reference lhs, ScalarType rhs) noexcept
+ {
+ return lhs > basic_json(rhs);
+ }
+
+ /*!
+ @brief comparison: greater than
+ @copydoc operator>(const_reference, const_reference)
+ */
+ template<typename ScalarType, typename std::enable_if<
+ std::is_scalar<ScalarType>::value, int>::type = 0>
+ friend bool operator>(ScalarType lhs, const_reference rhs) noexcept
+ {
+ return basic_json(lhs) > rhs;
+ }
+
+ /*!
+ @brief comparison: greater than or equal
+
+ Compares whether one JSON value @a lhs is greater than or equal to another
+ JSON value by calculating `not (lhs < rhs)`.
+
+ @param[in] lhs first JSON value to consider
+ @param[in] rhs second JSON value to consider
+ @return whether @a lhs is greater than or equal to @a rhs
+
+ @complexity Linear.
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+ @liveexample{The example demonstrates comparing several JSON
+ types.,operator__greaterequal}
+
+ @since version 1.0.0
+ */
+ friend bool operator>=(const_reference lhs, const_reference rhs) noexcept
+ {
+ return !(lhs < rhs);
+ }
+
+ /*!
+ @brief comparison: greater than or equal
+ @copydoc operator>=(const_reference, const_reference)
+ */
+ template<typename ScalarType, typename std::enable_if<
+ std::is_scalar<ScalarType>::value, int>::type = 0>
+ friend bool operator>=(const_reference lhs, ScalarType rhs) noexcept
+ {
+ return lhs >= basic_json(rhs);
+ }
+
+ /*!
+ @brief comparison: greater than or equal
+ @copydoc operator>=(const_reference, const_reference)
+ */
+ template<typename ScalarType, typename std::enable_if<
+ std::is_scalar<ScalarType>::value, int>::type = 0>
+ friend bool operator>=(ScalarType lhs, const_reference rhs) noexcept
+ {
+ return basic_json(lhs) >= rhs;
+ }
+
+ /// @}
+
+ ///////////////////
+ // serialization //
+ ///////////////////
+
+ /// @name serialization
+ /// @{
+#ifndef JSON_NO_IO
+ /*!
+ @brief serialize to stream
+
+ Serialize the given JSON value @a j to the output stream @a o. The JSON
+ value will be serialized using the @ref dump member function.
+
+ - The indentation of the output can be controlled with the member variable
+ `width` of the output stream @a o. For instance, using the manipulator
+ `std::setw(4)` on @a o sets the indentation level to `4` and the
+ serialization result is the same as calling `dump(4)`.
+
+ - The indentation character can be controlled with the member variable
+ `fill` of the output stream @a o. For instance, the manipulator
+ `std::setfill('\\t')` sets indentation to use a tab character rather than
+ the default space character.
+
+ @param[in,out] o stream to serialize to
+ @param[in] j JSON value to serialize
+
+ @return the stream @a o
+
+ @throw type_error.316 if a string stored inside the JSON value is not
+ UTF-8 encoded
+
+ @complexity Linear.
+
+ @liveexample{The example below shows the serialization with different
+ parameters to `width` to adjust the indentation level.,operator_serialize}
+
+ @since version 1.0.0; indentation character added in version 3.0.0
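+
+ A minimal sketch (assuming `<iostream>` and `<iomanip>` are included):
+ @code {.cpp}
+ json j = {{"pi", 3.141}, {"happy", true}};
+ std::cout << j << '\n';                  // compact single-line output
+ std::cout << std::setw(4) << j << '\n';  // pretty-printed with indent 4
+ @endcode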
+ */
+ friend std::ostream& operator<<(std::ostream& o, const basic_json& j)
+ {
+ // read width member and use it as indentation parameter if nonzero
+ const bool pretty_print = o.width() > 0;
+ const auto indentation = pretty_print ? o.width() : 0;
+
+ // reset width to 0 for subsequent calls to this stream
+ o.width(0);
+
+ // do the actual serialization
+ serializer s(detail::output_adapter<char>(o), o.fill());
+ s.dump(j, pretty_print, false, static_cast<unsigned int>(indentation));
+ return o;
+ }
+
+ /*!
+ @brief serialize to stream
+ @deprecated This stream operator is deprecated and will be removed in
+ version 4.0.0 of the library. Please use
+ @ref operator<<(std::ostream&, const basic_json&)
+ instead; that is, replace calls like `j >> o;` with `o << j;`.
+ @since version 1.0.0; deprecated since version 3.0.0
+ */
+ JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator<<(std::ostream&, const basic_json&))
+ friend std::ostream& operator>>(const basic_json& j, std::ostream& o)
+ {
+ return o << j;
+ }
+#endif // JSON_NO_IO
+ /// @}
+
+
+ /////////////////////
+ // deserialization //
+ /////////////////////
+
+ /// @name deserialization
+ /// @{
+
+ /*!
+ @brief deserialize from a compatible input
+
+ @tparam InputType A compatible input, for instance
+ - an std::istream object
+ - a FILE pointer
+ - a C-style array of characters
+ - a pointer to a null-terminated string of single byte characters
+ - an object obj for which begin(obj) and end(obj) produce a valid pair of
+ iterators.
+
+ @param[in] i input to read from
+ @param[in] cb a parser callback function of type @ref parser_callback_t
+ which is used to control the deserialization by filtering unwanted values
+ (optional)
+ @param[in] allow_exceptions whether to throw exceptions in case of a
+ parse error (optional, true by default)
+ @param[in] ignore_comments whether comments should be ignored and treated
+ like whitespace (true) or yield a parse error (false); (optional, false by
+ default)
+
+ @return deserialized JSON value; in case of a parse error and
+ @a allow_exceptions set to `false`, the return value will be
+ value_t::discarded.
+
+ @throw parse_error.101 if a parse error occurs; example: `"unexpected end
+ of input; expected string literal"`
+ @throw parse_error.102 if to_unicode fails or a surrogate error occurs
+ @throw parse_error.103 if to_unicode fails
+
+ @complexity Linear in the length of the input. The parser is a predictive
+ LL(1) parser. The complexity can be higher if the parser callback function
+ @a cb or reading from the input @a i has a super-linear complexity.
+
+ @note A UTF-8 byte order mark is silently ignored.
+
+ @liveexample{The example below demonstrates the `parse()` function reading
+ from an array.,parse__array__parser_callback_t}
+
+ @liveexample{The example below demonstrates the `parse()` function with
+ and without callback function.,parse__string__parser_callback_t}
+
+ @liveexample{The example below demonstrates the `parse()` function with
+ and without callback function.,parse__istream__parser_callback_t}
+
+ @liveexample{The example below demonstrates the `parse()` function reading
+ from a contiguous container.,parse__contiguouscontainer__parser_callback_t}
+
+ @since version 2.0.3 (contiguous containers); ignoring comments was added
+ in version 3.9.0.
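+
+ A minimal sketch (assuming the usual `json` alias):
+ @code {.cpp}
+ // from a string literal
+ json j = json::parse(R"({"happy": true, "pi": 3.141})");
+
+ // tolerant variant: no exception, the result is discarded on error
+ json maybe = json::parse("[1, 2,", nullptr, false);
+ // maybe.is_discarded() == true
+ @endcode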
+ */
+ template<typename InputType>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json parse(InputType&& i,
+ const parser_callback_t cb = nullptr,
+ const bool allow_exceptions = true,
+ const bool ignore_comments = false)
+ {
+ basic_json result;
+ parser(detail::input_adapter(std::forward<InputType>(i)), cb, allow_exceptions, ignore_comments).parse(true, result);
+ return result;
+ }
+
+ /*!
+ @brief deserialize from a pair of character iterators
+
+ The value_type of the iterator must be an integral type with a size of
+ 1, 2, or 4 bytes, which will be interpreted as UTF-8, UTF-16, and UTF-32,
+ respectively.
+
+ @param[in] first iterator to start of character range
+ @param[in] last iterator to end of character range
+ @param[in] cb a parser callback function of type @ref parser_callback_t
+ which is used to control the deserialization by filtering unwanted values
+ (optional)
+ @param[in] allow_exceptions whether to throw exceptions in case of a
+ parse error (optional, true by default)
+ @param[in] ignore_comments whether comments should be ignored and treated
+ like whitespace (true) or yield a parse error (false); (optional, false by
+ default)
+
+ @return deserialized JSON value; in case of a parse error and
+ @a allow_exceptions set to `false`, the return value will be
+ value_t::discarded.
+
+ @throw parse_error.101 if a parse error occurs; example: `"unexpected end
+ of input; expected string literal"`
+ @throw parse_error.102 if to_unicode fails or a surrogate error occurs
+ @throw parse_error.103 if to_unicode fails
+ */
+ template<typename IteratorType>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json parse(IteratorType first,
+ IteratorType last,
+ const parser_callback_t cb = nullptr,
+ const bool allow_exceptions = true,
+ const bool ignore_comments = false)
+ {
+ basic_json result;
+ parser(detail::input_adapter(std::move(first), std::move(last)), cb, allow_exceptions, ignore_comments).parse(true, result);
+ return result;
+ }
+
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ JSON_HEDLEY_DEPRECATED_FOR(3.8.0, parse(ptr, ptr + len))
+ static basic_json parse(detail::span_input_adapter&& i,
+ const parser_callback_t cb = nullptr,
+ const bool allow_exceptions = true,
+ const bool ignore_comments = false)
+ {
+ basic_json result;
+ parser(i.get(), cb, allow_exceptions, ignore_comments).parse(true, result);
+ return result;
+ }
+
+ /*!
+ @brief check if the input is valid JSON
+
+ Unlike the @ref parse(InputType&&, const parser_callback_t, const bool)
+ function, this function neither throws an exception in case of invalid JSON
+ input (i.e., a parse error) nor creates diagnostic information.
+
+ @tparam InputType A compatible input, for instance
+ - an std::istream object
+ - a FILE pointer
+ - a C-style array of characters
+ - a pointer to a null-terminated string of single byte characters
+ - an object obj for which begin(obj) and end(obj) produce a valid pair of
+ iterators.
+
+ @param[in] i input to read from
+ @param[in] ignore_comments whether comments should be ignored and treated
+ like whitespace (true) or yield a parse error (false); (optional, false by
+ default)
+
+ @return Whether the input read from @a i is valid JSON.
+
+ @complexity Linear in the length of the input. The parser is a predictive
+ LL(1) parser.
+
+ @note A UTF-8 byte order mark is silently ignored.
+
+ @liveexample{The example below demonstrates the `accept()` function reading
+ from a string.,accept__string}
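+
+ A minimal sketch (assuming the usual `json` alias):
+ @code {.cpp}
+ json::accept(R"({"valid": true})"); // true
+ json::accept("[1, 2,");             // false; no exception is thrown
+ @endcode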
+ */
+ template<typename InputType>
+ static bool accept(InputType&& i,
+ const bool ignore_comments = false)
+ {
+ return parser(detail::input_adapter(std::forward<InputType>(i)), nullptr, false, ignore_comments).accept(true);
+ }
+
+ template<typename IteratorType>
+ static bool accept(IteratorType first, IteratorType last,
+ const bool ignore_comments = false)
+ {
+ return parser(detail::input_adapter(std::move(first), std::move(last)), nullptr, false, ignore_comments).accept(true);
+ }
+
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ JSON_HEDLEY_DEPRECATED_FOR(3.8.0, accept(ptr, ptr + len))
+ static bool accept(detail::span_input_adapter&& i,
+ const bool ignore_comments = false)
+ {
+ return parser(i.get(), nullptr, false, ignore_comments).accept(true);
+ }
+
+ /*!
+ @brief generate SAX events
+
+ The SAX event listener must follow the interface of @ref json_sax.
+
+ This function reads from a compatible input. Examples are:
+ - an std::istream object
+ - a FILE pointer
+ - a C-style array of characters
+ - a pointer to a null-terminated string of single byte characters
+ - an object obj for which begin(obj) and end(obj) produce a valid pair of
+ iterators.
+
+ @param[in] i input to read from
+ @param[in,out] sax SAX event listener
+ @param[in] format the format to parse (JSON, CBOR, MessagePack, or UBJSON)
+ @param[in] strict whether the input has to be consumed completely
+ @param[in] ignore_comments whether comments should be ignored and treated
+ like whitespace (true) or yield a parse error (false); (optional, false by
+ default); only applies to the JSON file format.
+
+ @return return value of the last processed SAX event
+
+ @throw parse_error.101 if a parse error occurs; example: `"unexpected end
+ of input; expected string literal"`
+ @throw parse_error.102 if to_unicode fails or a surrogate error occurs
+ @throw parse_error.103 if to_unicode fails
+
+ @complexity Linear in the length of the input. The parser is a predictive
+ LL(1) parser. The complexity can be higher if the SAX consumer @a sax has
+ a super-linear complexity.
+
+ @note A UTF-8 byte order mark is silently ignored.
+
+ @liveexample{The example below demonstrates the `sax_parse()` function
+ reading from a string and processing the events with a user-defined SAX
+ event consumer.,sax_parse}
+
+ @since version 3.2.0
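+
+ A minimal event-counting sketch (assuming the usual `json` alias; the
+ `event_counter` type is hypothetical and only illustrates the interface):
+ @code {.cpp}
+ struct event_counter : public json::json_sax_t
+ {
+ std::size_t events = 0;
+ bool null() override { ++events; return true; }
+ bool boolean(bool) override { ++events; return true; }
+ bool number_integer(number_integer_t) override { ++events; return true; }
+ bool number_unsigned(number_unsigned_t) override { ++events; return true; }
+ bool number_float(number_float_t, const string_t&) override { ++events; return true; }
+ bool string(string_t&) override { ++events; return true; }
+ bool binary(binary_t&) override { ++events; return true; }
+ bool start_object(std::size_t) override { ++events; return true; }
+ bool key(string_t&) override { ++events; return true; }
+ bool end_object() override { ++events; return true; }
+ bool start_array(std::size_t) override { ++events; return true; }
+ bool end_array() override { ++events; return true; }
+ bool parse_error(std::size_t, const std::string&, const json::exception&) override { return false; }
+ };
+
+ event_counter counter;
+ json::sax_parse(R"([1, "two", null])", &counter);
+ // counter.events == 5 (start_array, three values, end_array)
+ @endcode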
+ */
+ template <typename InputType, typename SAX>
+ JSON_HEDLEY_NON_NULL(2)
+ static bool sax_parse(InputType&& i, SAX* sax,
+ input_format_t format = input_format_t::json,
+ const bool strict = true,
+ const bool ignore_comments = false)
+ {
+ auto ia = detail::input_adapter(std::forward<InputType>(i));
+ return format == input_format_t::json
+ ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict)
+ : detail::binary_reader<basic_json, decltype(ia), SAX>(std::move(ia)).sax_parse(format, sax, strict);
+ }
+
+ template<class IteratorType, class SAX>
+ JSON_HEDLEY_NON_NULL(3)
+ static bool sax_parse(IteratorType first, IteratorType last, SAX* sax,
+ input_format_t format = input_format_t::json,
+ const bool strict = true,
+ const bool ignore_comments = false)
+ {
+ auto ia = detail::input_adapter(std::move(first), std::move(last));
+ return format == input_format_t::json
+ ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict)
+ : detail::binary_reader<basic_json, decltype(ia), SAX>(std::move(ia)).sax_parse(format, sax, strict);
+ }
+
+ template <typename SAX>
+ JSON_HEDLEY_DEPRECATED_FOR(3.8.0, sax_parse(ptr, ptr + len, ...))
+ JSON_HEDLEY_NON_NULL(2)
+ static bool sax_parse(detail::span_input_adapter&& i, SAX* sax,
+ input_format_t format = input_format_t::json,
+ const bool strict = true,
+ const bool ignore_comments = false)
+ {
+ auto ia = i.get();
+ return format == input_format_t::json
+ // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
+ ? parser(std::move(ia), nullptr, true, ignore_comments).sax_parse(sax, strict)
+ // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
+ : detail::binary_reader<basic_json, decltype(ia), SAX>(std::move(ia)).sax_parse(format, sax, strict);
+ }
+#ifndef JSON_NO_IO
+ /*!
+ @brief deserialize from stream
+ @deprecated This stream operator is deprecated and will be removed in
+ version 4.0.0 of the library. Please use
+ @ref operator>>(std::istream&, basic_json&)
+ instead; that is, replace calls like `j << i;` with `i >> j;`.
+ @since version 1.0.0; deprecated since version 3.0.0
+ */
+ JSON_HEDLEY_DEPRECATED_FOR(3.0.0, operator>>(std::istream&, basic_json&))
+ friend std::istream& operator<<(basic_json& j, std::istream& i)
+ {
+ return operator>>(i, j);
+ }
+
+ /*!
+ @brief deserialize from stream
+
+ Deserializes an input stream to a JSON value.
+
+ @param[in,out] i input stream to read a serialized JSON value from
+ @param[in,out] j JSON value to write the deserialized input to
+
+ @throw parse_error.101 in case of an unexpected token
+ @throw parse_error.102 if to_unicode fails or a surrogate error occurs
+ @throw parse_error.103 if to_unicode fails
+
+ @complexity Linear in the length of the input. The parser is a predictive
+ LL(1) parser.
+
+ @note A UTF-8 byte order mark is silently ignored.
+
+ @liveexample{The example below shows how a JSON value is constructed by
+ reading a serialization from a stream.,operator_deserialize}
+
+ @sa parse(std::istream&, const parser_callback_t) for a variant with a
+ parser callback function to filter values while parsing
+
+ @since version 1.0.0
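+
+ A minimal sketch (assuming `<sstream>` is included):
+ @code {.cpp}
+ std::istringstream ss(R"({"pi": 3.141})");
+ json j;
+ ss >> j;
+ @endcode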
+ */
+ friend std::istream& operator>>(std::istream& i, basic_json& j)
+ {
+ parser(detail::input_adapter(i)).parse(false, j);
+ return i;
+ }
+#endif // JSON_NO_IO
+ /// @}
+
+ ///////////////////////////
+ // convenience functions //
+ ///////////////////////////
+
+ /*!
+ @brief return the type as string
+
+ Returns the type name as a string to be used in error messages - usually to
+ indicate that a function was called on a wrong JSON type.
+
+ @return a string representation of the @a m_type member:
+ Value type | return value
+ ----------- | -------------
+ null | `"null"`
+ boolean | `"boolean"`
+ string | `"string"`
+ number | `"number"` (for all number types)
+ object | `"object"`
+ array | `"array"`
+ binary | `"binary"`
+ discarded | `"discarded"`
+
+ @exceptionsafety No-throw guarantee: this function never throws exceptions.
+
+ @complexity Constant.
+
+ @liveexample{The following code exemplifies `type_name()` for all JSON
+ types.,type_name}
+
+ @sa see @ref type() -- return the type of the JSON value
+ @sa see @ref operator value_t() -- return the type of the JSON value (implicit)
+
+ @since version 1.0.0, public since 2.1.0, `const char*` and `noexcept`
+ since 3.0.0
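+
+ A minimal sketch (assuming the usual `json` alias):
+ @code {.cpp}
+ json j = {1, 2, 3};
+ std::string msg = std::string("value is of type ") + j.type_name();
+ // msg == "value is of type array"
+ @endcode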
+ */
+ JSON_HEDLEY_RETURNS_NON_NULL
+ const char* type_name() const noexcept
+ {
+ switch (m_type)
+ {
+ case value_t::null:
+ return "null";
+ case value_t::object:
+ return "object";
+ case value_t::array:
+ return "array";
+ case value_t::string:
+ return "string";
+ case value_t::boolean:
+ return "boolean";
+ case value_t::binary:
+ return "binary";
+ case value_t::discarded:
+ return "discarded";
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ default:
+ return "number";
+ }
+ }
+
+
+ JSON_PRIVATE_UNLESS_TESTED:
+ //////////////////////
+ // member variables //
+ //////////////////////
+
+ /// the type of the current element
+ value_t m_type = value_t::null;
+
+ /// the value of the current element
+ json_value m_value = {};
+
+#if JSON_DIAGNOSTICS
+ /// a pointer to a parent value (for debugging purposes)
+ basic_json* m_parent = nullptr;
+#endif
+
+ //////////////////////////////////////////
+ // binary serialization/deserialization //
+ //////////////////////////////////////////
+
+ /// @name binary serialization/deserialization support
+ /// @{
+
+ public:
+ /*!
+ @brief create a CBOR serialization of a given JSON value
+
+ Serializes a given JSON value @a j to a byte vector using the CBOR (Concise
+ Binary Object Representation) serialization format. CBOR is a binary
+ serialization format which aims to be more compact than JSON itself, yet
+ more efficient to parse.
+
+ The library uses the following mapping from JSON value types to
+ CBOR types according to the CBOR specification (RFC 7049):
+
+ JSON value type | value/range | CBOR type | first byte
+ --------------- | ------------------------------------------ | ---------------------------------- | ---------------
+ null | `null` | Null | 0xF6
+ boolean | `true` | True | 0xF5
+ boolean | `false` | False | 0xF4
+ number_integer | -9223372036854775808..-2147483649 | Negative integer (8 bytes follow) | 0x3B
+ number_integer | -2147483648..-32769 | Negative integer (4 bytes follow) | 0x3A
+ number_integer | -32768..-129 | Negative integer (2 bytes follow) | 0x39
+ number_integer | -128..-25 | Negative integer (1 byte follows) | 0x38
+ number_integer | -24..-1 | Negative integer | 0x20..0x37
+ number_integer | 0..23 | Integer | 0x00..0x17
+ number_integer | 24..255 | Unsigned integer (1 byte follows) | 0x18
+ number_integer | 256..65535 | Unsigned integer (2 bytes follow) | 0x19
+ number_integer | 65536..4294967295 | Unsigned integer (4 bytes follow) | 0x1A
+ number_integer | 4294967296..18446744073709551615 | Unsigned integer (8 bytes follow) | 0x1B
+ number_unsigned | 0..23 | Integer | 0x00..0x17
+ number_unsigned | 24..255 | Unsigned integer (1 byte follows) | 0x18
+ number_unsigned | 256..65535 | Unsigned integer (2 bytes follow) | 0x19
+ number_unsigned | 65536..4294967295 | Unsigned integer (4 bytes follow) | 0x1A
+ number_unsigned | 4294967296..18446744073709551615 | Unsigned integer (8 bytes follow) | 0x1B
+ number_float | *any value representable by a float* | Single-Precision Float | 0xFA
+ number_float | *any value NOT representable by a float* | Double-Precision Float | 0xFB
+ string | *length*: 0..23 | UTF-8 string | 0x60..0x77
+ string | *length*: 24..255 | UTF-8 string (1 byte follows) | 0x78
+ string | *length*: 256..65535 | UTF-8 string (2 bytes follow) | 0x79
+ string | *length*: 65536..4294967295 | UTF-8 string (4 bytes follow) | 0x7A
+ string | *length*: 4294967296..18446744073709551615 | UTF-8 string (8 bytes follow) | 0x7B
+ array | *size*: 0..23 | array | 0x80..0x97
+ array | *size*: 24..255 | array (1 byte follows) | 0x98
+ array | *size*: 256..65535 | array (2 bytes follow) | 0x99
+ array | *size*: 65536..4294967295 | array (4 bytes follow) | 0x9A
+ array | *size*: 4294967296..18446744073709551615 | array (8 bytes follow) | 0x9B
+ object | *size*: 0..23 | map | 0xA0..0xB7
+ object | *size*: 24..255 | map (1 byte follows) | 0xB8
+ object | *size*: 256..65535 | map (2 bytes follow) | 0xB9
+ object | *size*: 65536..4294967295 | map (4 bytes follow) | 0xBA
+ object | *size*: 4294967296..18446744073709551615 | map (8 bytes follow) | 0xBB
+ binary | *size*: 0..23 | byte string | 0x40..0x57
+ binary | *size*: 24..255 | byte string (1 byte follows) | 0x58
+ binary | *size*: 256..65535 | byte string (2 bytes follow) | 0x59
+ binary | *size*: 65536..4294967295 | byte string (4 bytes follow) | 0x5A
+ binary | *size*: 4294967296..18446744073709551615 | byte string (8 bytes follow) | 0x5B
+
+ Binary values with a subtype are mapped to tagged values (0xD8..0xDB)
+ depending on the subtype, followed by a byte string; see the "binary" rows
+ in the table above.
+
+ @note The mapping is **complete** in the sense that any JSON value type
+ can be converted to a CBOR value.
+
+ @note If NaN or Infinity are stored inside a JSON number, they are
+ serialized properly. This behavior differs from the @ref dump()
+ function which serializes NaN or Infinity to `null`.
+
+ @note The following CBOR types are not used in the conversion:
+ - UTF-8 strings terminated by "break" (0x7F)
+ - arrays terminated by "break" (0x9F)
+ - maps terminated by "break" (0xBF)
+ - byte strings terminated by "break" (0x5F)
+ - date/time (0xC0..0xC1)
+ - bignum (0xC2..0xC3)
+ - decimal fraction (0xC4)
+ - bigfloat (0xC5)
+ - expected conversions (0xD5..0xD7)
+ - simple values (0xE0..0xF3, 0xF8)
+ - undefined (0xF7)
+ - half-precision floats (0xF9)
+ - break (0xFF)
+
+ @param[in] j JSON value to serialize
+ @return CBOR serialization as byte vector
+
+ @complexity Linear in the size of the JSON value @a j.
+
+ @liveexample{The example shows the serialization of a JSON value to a byte
+ vector in CBOR format.,to_cbor}
+
+ @sa http://cbor.io
+ @sa see @ref from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t) for the
+ analogous deserialization
+ @sa see @ref to_msgpack(const basic_json&) for the related MessagePack format
+ @sa see @ref to_ubjson(const basic_json&, const bool, const bool) for the
+ related UBJSON format
+
+ @since version 2.0.9; compact representation of floating-point numbers
+ since version 3.8.0
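+
+ A minimal round-trip sketch (assuming the usual `json` alias and the
+ `_json` literal):
+ @code {.cpp}
+ json j = R"({"compact": true, "schema": 0})"_json;
+ std::vector<std::uint8_t> v = json::to_cbor(j);
+ // json::from_cbor(v) == j
+ @endcode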
+ */
+ static std::vector<std::uint8_t> to_cbor(const basic_json& j)
+ {
+ std::vector<std::uint8_t> result;
+ to_cbor(j, result);
+ return result;
+ }
+
+ static void to_cbor(const basic_json& j, detail::output_adapter<std::uint8_t> o)
+ {
+ binary_writer<std::uint8_t>(o).write_cbor(j);
+ }
+
+ static void to_cbor(const basic_json& j, detail::output_adapter<char> o)
+ {
+ binary_writer<char>(o).write_cbor(j);
+ }
+
+ /*!
+ @brief create a MessagePack serialization of a given JSON value
+
+ Serializes a given JSON value @a j to a byte vector using the MessagePack
+ serialization format. MessagePack is a binary serialization format which
+ aims to be more compact than JSON itself, yet more efficient to parse.
+
+ The library uses the following mapping from JSON value types to
+ MessagePack types according to the MessagePack specification:
+
+ JSON value type | value/range | MessagePack type | first byte
+ --------------- | --------------------------------- | ---------------- | ----------
+ null | `null` | nil | 0xC0
+ boolean | `true` | true | 0xC3
+ boolean | `false` | false | 0xC2
+ number_integer | -9223372036854775808..-2147483649 | int64 | 0xD3
+ number_integer | -2147483648..-32769 | int32 | 0xD2
+ number_integer | -32768..-129 | int16 | 0xD1
+ number_integer | -128..-33 | int8 | 0xD0
+ number_integer | -32..-1 | negative fixint | 0xE0..0xFF
+ number_integer | 0..127 | positive fixint | 0x00..0x7F
+ number_integer | 128..255 | uint 8 | 0xCC
+ number_integer | 256..65535 | uint 16 | 0xCD
+ number_integer | 65536..4294967295 | uint 32 | 0xCE
+ number_integer | 4294967296..18446744073709551615 | uint 64 | 0xCF
+ number_unsigned | 0..127 | positive fixint | 0x00..0x7F
+ number_unsigned | 128..255 | uint 8 | 0xCC
+ number_unsigned | 256..65535 | uint 16 | 0xCD
+ number_unsigned | 65536..4294967295 | uint 32 | 0xCE
+ number_unsigned | 4294967296..18446744073709551615 | uint 64 | 0xCF
+ number_float | *any value representable by a float* | float 32 | 0xCA
+ number_float | *any value NOT representable by a float* | float 64 | 0xCB
+ string | *length*: 0..31 | fixstr | 0xA0..0xBF
+ string | *length*: 32..255 | str 8 | 0xD9
+ string | *length*: 256..65535 | str 16 | 0xDA
+ string | *length*: 65536..4294967295 | str 32 | 0xDB
+ array | *size*: 0..15 | fixarray | 0x90..0x9F
+ array | *size*: 16..65535 | array 16 | 0xDC
+ array | *size*: 65536..4294967295 | array 32 | 0xDD
+ object | *size*: 0..15 | fix map | 0x80..0x8F
+ object | *size*: 16..65535 | map 16 | 0xDE
+ object | *size*: 65536..4294967295 | map 32 | 0xDF
+ binary | *size*: 0..255 | bin 8 | 0xC4
+ binary | *size*: 256..65535 | bin 16 | 0xC5
+ binary | *size*: 65536..4294967295 | bin 32 | 0xC6
+
+ @note The mapping is **complete** in the sense that any JSON value type
+ can be converted to a MessagePack value.
+
+ @note The following values can **not** be converted to a MessagePack value:
+ - strings with more than 4294967295 bytes
+ - byte strings with more than 4294967295 bytes
+ - arrays with more than 4294967295 elements
+ - objects with more than 4294967295 elements
+
+ @note Any MessagePack output created via @ref to_msgpack can be successfully
+ parsed by @ref from_msgpack.
+
+ @note If NaN or Infinity are stored inside a JSON number, they are
+ serialized properly. This behavior differs from the @ref dump()
+ function which serializes NaN or Infinity to `null`.
+
+ @param[in] j JSON value to serialize
+ @return MessagePack serialization as byte vector
+
+ @complexity Linear in the size of the JSON value @a j.
+
+ @liveexample{The example shows the serialization of a JSON value to a byte
+ vector in MessagePack format.,to_msgpack}
+
+ @sa http://msgpack.org
+ @sa see @ref from_msgpack for the analogous deserialization
+ @sa see @ref to_cbor(const basic_json&) for the related CBOR format
+ @sa see @ref to_ubjson(const basic_json&, const bool, const bool) for the
+ related UBJSON format
+
+ @since version 2.0.9
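+
+ A minimal sketch (assuming the usual `json` alias):
+ @code {.cpp}
+ json j = {{"compact", true}, {"schema", 0}};
+ std::vector<std::uint8_t> v = json::to_msgpack(j);
+ // json::from_msgpack(v) == j
+ @endcode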
+ */
+ static std::vector<std::uint8_t> to_msgpack(const basic_json& j)
+ {
+ std::vector<std::uint8_t> result;
+ to_msgpack(j, result);
+ return result;
+ }
+
+ static void to_msgpack(const basic_json& j, detail::output_adapter<std::uint8_t> o)
+ {
+ binary_writer<std::uint8_t>(o).write_msgpack(j);
+ }
+
+ static void to_msgpack(const basic_json& j, detail::output_adapter<char> o)
+ {
+ binary_writer<char>(o).write_msgpack(j);
+ }
+
+ /*!
+ @brief create a UBJSON serialization of a given JSON value
+
+ Serializes a given JSON value @a j to a byte vector using the UBJSON
+ (Universal Binary JSON) serialization format. UBJSON aims to be more compact
+ than JSON itself, yet more efficient to parse.
+
+ The library uses the following mapping from JSON value types to
+ UBJSON types according to the UBJSON specification:
+
+ JSON value type | value/range | UBJSON type | marker
+ --------------- | --------------------------------- | ----------- | ------
+ null | `null` | null | `Z`
+ boolean | `true` | true | `T`
+ boolean | `false` | false | `F`
+ number_integer | -9223372036854775808..-2147483649 | int64 | `L`
+ number_integer | -2147483648..-32769 | int32 | `l`
+ number_integer | -32768..-129 | int16 | `I`
+ number_integer | -128..127 | int8 | `i`
+ number_integer | 128..255 | uint8 | `U`
+ number_integer | 256..32767 | int16 | `I`
+ number_integer | 32768..2147483647 | int32 | `l`
+ number_integer | 2147483648..9223372036854775807 | int64 | `L`
+ number_unsigned | 0..127 | int8 | `i`
+ number_unsigned | 128..255 | uint8 | `U`
+ number_unsigned | 256..32767 | int16 | `I`
+ number_unsigned | 32768..2147483647 | int32 | `l`
+ number_unsigned | 2147483648..9223372036854775807 | int64 | `L`
+ number_unsigned | 9223372036854775808..18446744073709551615 | high-precision | `H`
+ number_float | *any value* | float64 | `D`
+ string | *with shortest length indicator* | string | `S`
+ array | *see notes on optimized format* | array | `[`
+ object | *see notes on optimized format* | map | `{`
+
+ @note The mapping is **complete** in the sense that any JSON value type
+ can be converted to a UBJSON value.
+
+ @note The following values can **not** be converted to a UBJSON value:
+ - strings with more than 9223372036854775807 bytes (theoretical)
+
+ @note The following markers are not used in the conversion:
+ - `Z`: no-op values are not created.
+ - `C`: single-byte strings are serialized with `S` markers.
+
+ @note Any UBJSON output created via @ref to_ubjson can be successfully parsed
+ by @ref from_ubjson.
+
+ @note If NaN or Infinity are stored inside a JSON number, they are
+ serialized properly. This behavior differs from the @ref dump()
+ function which serializes NaN or Infinity to `null`.
+
+ @note The optimized formats for containers are supported: Parameter
+ @a use_size adds size information to the beginning of a container and
+ removes the closing marker. Parameter @a use_type further checks
+ whether all elements of a container have the same type and adds the
+ type marker to the beginning of the container. The @a use_type
+ parameter must only be used together with @a use_size = true. Note
+ that @a use_size = true alone may result in larger representations -
+ the benefit of this parameter is that the receiving side is
+ immediately informed on the number of elements of the container.
+
+ @note If the JSON data contains the binary type, the value stored is a list
+ of integers, as suggested by the UBJSON documentation. In particular,
+ this means that serializing a JSON value containing binary values to
+ UBJSON and deserializing it back will result in a different JSON
+ object.
+
+ @param[in] j JSON value to serialize
+ @param[in] use_size whether to add size annotations to container types
+ @param[in] use_type whether to add type annotations to container types
+ (must be combined with @a use_size = true)
+ @return UBJSON serialization as byte vector
+
+ @complexity Linear in the size of the JSON value @a j.
+
+ @liveexample{The example shows the serialization of a JSON value to a byte
+ vector in UBJSON format.,to_ubjson}
+
+ @sa http://ubjson.org
+ @sa see @ref from_ubjson(InputType&&, const bool, const bool) for the
+ analogous deserialization
+ @sa see @ref to_cbor(const basic_json&) for the related CBOR format
+ @sa see @ref to_msgpack(const basic_json&) for the related MessagePack format
+
+ @since version 3.1.0
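+
+ A minimal sketch of the size/type annotations (assuming the usual `json`
+ alias):
+ @code {.cpp}
+ json j = {1, 2, 3};
+ auto plain = json::to_ubjson(j);             // a marker per element
+ auto sized = json::to_ubjson(j, true);       // size prefix, no closing marker
+ auto typed = json::to_ubjson(j, true, true); // size prefix and shared type marker
+ @endcode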
+ */
+ static std::vector<std::uint8_t> to_ubjson(const basic_json& j,
+ const bool use_size = false,
+ const bool use_type = false)
+ {
+ std::vector<std::uint8_t> result;
+ to_ubjson(j, result, use_size, use_type);
+ return result;
+ }
+
+ static void to_ubjson(const basic_json& j, detail::output_adapter<std::uint8_t> o,
+ const bool use_size = false, const bool use_type = false)
+ {
+ binary_writer<std::uint8_t>(o).write_ubjson(j, use_size, use_type);
+ }
+
+ static void to_ubjson(const basic_json& j, detail::output_adapter<char> o,
+ const bool use_size = false, const bool use_type = false)
+ {
+ binary_writer<char>(o).write_ubjson(j, use_size, use_type);
+ }
+
+
+ /*!
+ @brief Serializes the given JSON object `j` to BSON and returns a vector
+ containing the corresponding BSON-representation.
+
+ BSON (Binary JSON) is a binary format in which zero or more ordered key/value pairs are
+ stored as a single entity (a so-called document).
+
+ The library uses the following mapping from JSON value types to BSON types:
+
+ JSON value type | value/range | BSON type | marker
+ --------------- | --------------------------------- | ----------- | ------
+ null | `null` | null | 0x0A
+ boolean | `true`, `false` | boolean | 0x08
+ number_integer | -9223372036854775808..-2147483649 | int64 | 0x12
+ number_integer | -2147483648..2147483647 | int32 | 0x10
+ number_integer | 2147483648..9223372036854775807 | int64 | 0x12
+ number_unsigned | 0..2147483647 | int32 | 0x10
+ number_unsigned | 2147483648..9223372036854775807 | int64 | 0x12
+ number_unsigned | 9223372036854775808..18446744073709551615| -- | --
+ number_float | *any value* | double | 0x01
+ string | *any value* | string | 0x02
+ array | *any value* | document | 0x04
+ object | *any value* | document | 0x03
+ binary | *any value* | binary | 0x05
+
+ @warning The mapping is **incomplete**, since only JSON objects (and things
+ contained therein) can be serialized to BSON.
+ Also, integers larger than 9223372036854775807 cannot be serialized to BSON,
+ and keys may not contain U+0000, since they are serialized as
+ zero-terminated C strings.
+
+ @throw out_of_range.407 if `j.is_number_unsigned() && j.get<std::uint64_t>() > 9223372036854775807`
+ @throw out_of_range.409 if a key in `j` contains a NULL (U+0000)
+ @throw type_error.317 if `!j.is_object()`
+
+ @pre The input `j` is required to be an object: `j.is_object() == true`.
+
+ @note Any BSON output created via @ref to_bson can be successfully parsed
+ by @ref from_bson.
+
+ @param[in] j JSON value to serialize
+ @return BSON serialization as byte vector
+
+ @complexity Linear in the size of the JSON value @a j.
+
+ @liveexample{The example shows the serialization of a JSON value to a byte
+ vector in BSON format.,to_bson}
+
+ @sa http://bsonspec.org/spec.html
+ @sa see @ref from_bson(detail::input_adapter&&, const bool strict) for the
+ analogous deserialization
+ @sa see @ref to_ubjson(const basic_json&, const bool, const bool) for the
+ related UBJSON format
+ @sa see @ref to_cbor(const basic_json&) for the related CBOR format
+ @sa see @ref to_msgpack(const basic_json&) for the related MessagePack format
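+
+ A minimal sketch (assuming the usual `json` alias; note that the top-level
+ value must be an object):
+ @code {.cpp}
+ json j = {{"hello", "world"}};
+ std::vector<std::uint8_t> v = json::to_bson(j);
+ // json::from_bson(v) == j
+ @endcode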
+ */
+ static std::vector<std::uint8_t> to_bson(const basic_json& j)
+ {
+ std::vector<std::uint8_t> result;
+ to_bson(j, result);
+ return result;
+ }
+
+ /*!
+ @brief Serializes the given JSON object `j` to BSON and forwards the
+ corresponding BSON-representation to the given output_adapter `o`.
+ @param j The JSON object to convert to BSON.
+ @param o The output adapter that receives the binary BSON representation.
+ @pre The input `j` shall be an object: `j.is_object() == true`
+ @sa see @ref to_bson(const basic_json&)
+ */
+ static void to_bson(const basic_json& j, detail::output_adapter<std::uint8_t> o)
+ {
+ binary_writer<std::uint8_t>(o).write_bson(j);
+ }
+
+ /*!
+ @copydoc to_bson(const basic_json&, detail::output_adapter<std::uint8_t>)
+ */
+ static void to_bson(const basic_json& j, detail::output_adapter<char> o)
+ {
+ binary_writer<char>(o).write_bson(j);
+ }
+
+
+ /*!
+ @brief create a JSON value from an input in CBOR format
+
+ Deserializes a given input @a i to a JSON value using the CBOR (Concise
+ Binary Object Representation) serialization format.
+
+ The library maps CBOR types to JSON value types as follows:
+
+ CBOR type | JSON value type | first byte
+ ---------------------- | --------------- | ----------
+ Integer | number_unsigned | 0x00..0x17
+ Unsigned integer | number_unsigned | 0x18
+ Unsigned integer | number_unsigned | 0x19
+ Unsigned integer | number_unsigned | 0x1A
+ Unsigned integer | number_unsigned | 0x1B
+ Negative integer | number_integer | 0x20..0x37
+ Negative integer | number_integer | 0x38
+ Negative integer | number_integer | 0x39
+ Negative integer | number_integer | 0x3A
+ Negative integer | number_integer | 0x3B
+ Byte string | binary | 0x40..0x57
+ Byte string | binary | 0x58
+ Byte string | binary | 0x59
+ Byte string | binary | 0x5A
+ Byte string | binary | 0x5B
+ UTF-8 string | string | 0x60..0x77
+ UTF-8 string | string | 0x78
+ UTF-8 string | string | 0x79
+ UTF-8 string | string | 0x7A
+ UTF-8 string | string | 0x7B
+ UTF-8 string | string | 0x7F
+ array | array | 0x80..0x97
+ array | array | 0x98
+ array | array | 0x99
+ array | array | 0x9A
+ array | array | 0x9B
+ array | array | 0x9F
+ map | object | 0xA0..0xB7
+ map | object | 0xB8
+ map | object | 0xB9
+ map | object | 0xBA
+ map | object | 0xBB
+ map | object | 0xBF
+ False | `false` | 0xF4
+ True | `true` | 0xF5
+ Null | `null` | 0xF6
+ Half-Precision Float | number_float | 0xF9
+ Single-Precision Float | number_float | 0xFA
+ Double-Precision Float | number_float | 0xFB
+
+ @warning The mapping is **incomplete** in the sense that not all CBOR
+ types can be converted to a JSON value. The following CBOR types
+ are not supported and will yield parse errors (parse_error.112):
+ - date/time (0xC0..0xC1)
+ - bignum (0xC2..0xC3)
+ - decimal fraction (0xC4)
+ - bigfloat (0xC5)
+ - expected conversions (0xD5..0xD7)
+ - simple values (0xE0..0xF3, 0xF8)
+ - undefined (0xF7)
+
+ @warning CBOR allows map keys of any type, whereas JSON only allows
+ strings as keys in object values. Therefore, CBOR maps with keys
+ other than UTF-8 strings are rejected (parse_error.113).
+
+ @note Any CBOR output created via @ref to_cbor can be successfully parsed by
+ @ref from_cbor.
+
+ @param[in] i an input in CBOR format convertible to an input adapter
+ @param[in] strict whether to expect the input to be consumed until EOF
+ (true by default)
+ @param[in] allow_exceptions whether to throw exceptions in case of a
+ parse error (optional, true by default)
+ @param[in] tag_handler how to treat CBOR tags (optional, error by default)
+
+ @return deserialized JSON value; in case of a parse error and
+ @a allow_exceptions set to `false`, the return value will be
+ value_t::discarded.
+
+ @throw parse_error.110 if the given input ends prematurely or the end of
+ file was not reached when @a strict was set to true
+ @throw parse_error.112 if unsupported features from CBOR were
+ used in the given input @a i or if the input is not valid CBOR
+ @throw parse_error.113 if a string was expected as map key, but not found
+
+ @complexity Linear in the size of the input @a i.
+
+ @liveexample{The example shows the deserialization of a byte vector in CBOR
+ format to a JSON value.,from_cbor}
+
+ @sa http://cbor.io
+ @sa see @ref to_cbor(const basic_json&) for the analogous serialization
+ @sa see @ref from_msgpack(InputType&&, const bool, const bool) for the
+ related MessagePack format
+ @sa see @ref from_ubjson(InputType&&, const bool, const bool) for the
+ related UBJSON format
+
+ @since version 2.0.9; parameter @a start_index since 2.1.1; changed to
+ consume input adapters, removed start_index parameter, and added
+ @a strict parameter since 3.0.0; added @a allow_exceptions parameter
+ since 3.2.0; added @a tag_handler parameter since 3.9.0.
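+
+ A minimal sketch (assuming the usual `json` alias and the `_json` literal):
+ @code {.cpp}
+ std::vector<std::uint8_t> v = json::to_cbor(R"({"a": 1})"_json);
+ json j = json::from_cbor(v);
+
+ // tolerant variant: no exception on malformed input
+ json maybe = json::from_cbor(v, true, false);
+ @endcode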
+ */
+ template<typename InputType>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json from_cbor(InputType&& i,
+ const bool strict = true,
+ const bool allow_exceptions = true,
+ const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+ {
+ basic_json result;
+ detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+ auto ia = detail::input_adapter(std::forward<InputType>(i));
+ const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
+ return res ? result : basic_json(value_t::discarded);
+ }
+
+ /*!
+ @copydoc from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t)
+ */
+ template<typename IteratorType>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json from_cbor(IteratorType first, IteratorType last,
+ const bool strict = true,
+ const bool allow_exceptions = true,
+ const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+ {
+ basic_json result;
+ detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+ auto ia = detail::input_adapter(std::move(first), std::move(last));
+ const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
+ return res ? result : basic_json(value_t::discarded);
+ }
+
+ template<typename T>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len))
+ static basic_json from_cbor(const T* ptr, std::size_t len,
+ const bool strict = true,
+ const bool allow_exceptions = true,
+ const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+ {
+ return from_cbor(ptr, ptr + len, strict, allow_exceptions, tag_handler);
+ }
+
+
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len))
+ static basic_json from_cbor(detail::span_input_adapter&& i,
+ const bool strict = true,
+ const bool allow_exceptions = true,
+ const cbor_tag_handler_t tag_handler = cbor_tag_handler_t::error)
+ {
+ basic_json result;
+ detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+ auto ia = i.get();
+ // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
+ const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::cbor, &sdp, strict, tag_handler);
+ return res ? result : basic_json(value_t::discarded);
+ }
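+
+    // Usage sketch (illustrative; not part of the upstream header): a CBOR
+    // round trip. With allow_exceptions == false, a malformed input yields a
+    // discarded value instead of throwing.
+    //
+    //   nlohmann::json j = {{"pi", 3.141}, {"happy", true}};
+    //   std::vector<std::uint8_t> v = nlohmann::json::to_cbor(j);
+    //   assert(nlohmann::json::from_cbor(v) == j);
+    //   nlohmann::json lenient = nlohmann::json::from_cbor(v, true, false);
+    //   assert(!lenient.is_discarded()); // discarded only on a parse error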
+
+ /*!
+ @brief create a JSON value from an input in MessagePack format
+
+ Deserializes a given input @a i to a JSON value using the MessagePack
+ serialization format.
+
+ The library maps MessagePack types to JSON value types as follows:
+
+ MessagePack type | JSON value type | first byte
+ ---------------- | --------------- | ----------
+ positive fixint | number_unsigned | 0x00..0x7F
+ fixmap | object | 0x80..0x8F
+ fixarray | array | 0x90..0x9F
+ fixstr | string | 0xA0..0xBF
+ nil | `null` | 0xC0
+ false | `false` | 0xC2
+ true | `true` | 0xC3
+ float 32 | number_float | 0xCA
+ float 64 | number_float | 0xCB
+ uint 8 | number_unsigned | 0xCC
+ uint 16 | number_unsigned | 0xCD
+ uint 32 | number_unsigned | 0xCE
+ uint 64 | number_unsigned | 0xCF
+ int 8 | number_integer | 0xD0
+ int 16 | number_integer | 0xD1
+ int 32 | number_integer | 0xD2
+ int 64 | number_integer | 0xD3
+ str 8 | string | 0xD9
+ str 16 | string | 0xDA
+ str 32 | string | 0xDB
+ array 16 | array | 0xDC
+ array 32 | array | 0xDD
+ map 16 | object | 0xDE
+ map 32 | object | 0xDF
+ bin 8 | binary | 0xC4
+ bin 16 | binary | 0xC5
+ bin 32 | binary | 0xC6
+ ext 8 | binary | 0xC7
+ ext 16 | binary | 0xC8
+ ext 32 | binary | 0xC9
+ fixext 1 | binary | 0xD4
+ fixext 2 | binary | 0xD5
+ fixext 4 | binary | 0xD6
+ fixext 8 | binary | 0xD7
+ fixext 16 | binary | 0xD8
+    negative fixint  | number_integer  | 0xE0..0xFF
+
+    @note Any MessagePack output created by @ref to_msgpack can be
+    successfully parsed by @ref from_msgpack.
+
+ @param[in] i an input in MessagePack format convertible to an input
+ adapter
+ @param[in] strict whether to expect the input to be consumed until EOF
+ (true by default)
+ @param[in] allow_exceptions whether to throw exceptions in case of a
+ parse error (optional, true by default)
+
+ @return deserialized JSON value; in case of a parse error and
+ @a allow_exceptions set to `false`, the return value will be
+ value_t::discarded.
+
+ @throw parse_error.110 if the given input ends prematurely or the end of
+ file was not reached when @a strict was set to true
+ @throw parse_error.112 if unsupported features from MessagePack were
+ used in the given input @a i or if the input is not valid MessagePack
+ @throw parse_error.113 if a string was expected as map key, but not found
+
+ @complexity Linear in the size of the input @a i.
+
+ @liveexample{The example shows the deserialization of a byte vector in
+ MessagePack format to a JSON value.,from_msgpack}
+
+ @sa http://msgpack.org
+ @sa see @ref to_msgpack(const basic_json&) for the analogous serialization
+ @sa see @ref from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t) for the
+ related CBOR format
+ @sa see @ref from_ubjson(InputType&&, const bool, const bool) for
+ the related UBJSON format
+ @sa see @ref from_bson(InputType&&, const bool, const bool) for
+ the related BSON format
+
+ @since version 2.0.9; parameter @a start_index since 2.1.1; changed to
+ consume input adapters, removed start_index parameter, and added
+ @a strict parameter since 3.0.0; added @a allow_exceptions parameter
+ since 3.2.0
+ */
+ template<typename InputType>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json from_msgpack(InputType&& i,
+ const bool strict = true,
+ const bool allow_exceptions = true)
+ {
+ basic_json result;
+ detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+ auto ia = detail::input_adapter(std::forward<InputType>(i));
+ const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict);
+ return res ? result : basic_json(value_t::discarded);
+ }
+
+ /*!
+ @copydoc from_msgpack(InputType&&, const bool, const bool)
+ */
+ template<typename IteratorType>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json from_msgpack(IteratorType first, IteratorType last,
+ const bool strict = true,
+ const bool allow_exceptions = true)
+ {
+ basic_json result;
+ detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+ auto ia = detail::input_adapter(std::move(first), std::move(last));
+ const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict);
+ return res ? result : basic_json(value_t::discarded);
+ }
+
+
+ template<typename T>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len))
+ static basic_json from_msgpack(const T* ptr, std::size_t len,
+ const bool strict = true,
+ const bool allow_exceptions = true)
+ {
+ return from_msgpack(ptr, ptr + len, strict, allow_exceptions);
+ }
+
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_msgpack(ptr, ptr + len))
+ static basic_json from_msgpack(detail::span_input_adapter&& i,
+ const bool strict = true,
+ const bool allow_exceptions = true)
+ {
+ basic_json result;
+ detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+ auto ia = i.get();
+ // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
+ const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::msgpack, &sdp, strict);
+ return res ? result : basic_json(value_t::discarded);
+ }
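+
+    // Usage sketch (illustrative; not part of the upstream header): a
+    // MessagePack round trip; to_msgpack/from_msgpack are lossless for the
+    // types listed above.
+    //
+    //   nlohmann::json j = {{"compact", true}, {"schema", 0}};
+    //   std::vector<std::uint8_t> v = nlohmann::json::to_msgpack(j);
+    //   assert(nlohmann::json::from_msgpack(v) == j);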
+
+
+ /*!
+ @brief create a JSON value from an input in UBJSON format
+
+ Deserializes a given input @a i to a JSON value using the UBJSON (Universal
+ Binary JSON) serialization format.
+
+ The library maps UBJSON types to JSON value types as follows:
+
+ UBJSON type | JSON value type | marker
+ ----------- | --------------------------------------- | ------
+ no-op | *no value, next value is read* | `N`
+ null | `null` | `Z`
+ false | `false` | `F`
+ true | `true` | `T`
+ float32 | number_float | `d`
+ float64 | number_float | `D`
+ uint8 | number_unsigned | `U`
+ int8 | number_integer | `i`
+ int16 | number_integer | `I`
+ int32 | number_integer | `l`
+ int64 | number_integer | `L`
+    high-precision number | number_integer, number_unsigned, or number_float - depends on number string | `H`
+ string | string | `S`
+ char | string | `C`
+ array | array (optimized values are supported) | `[`
+ object | object (optimized values are supported) | `{`
+
+ @note The mapping is **complete** in the sense that any UBJSON value can
+ be converted to a JSON value.
+
+ @param[in] i an input in UBJSON format convertible to an input adapter
+ @param[in] strict whether to expect the input to be consumed until EOF
+ (true by default)
+ @param[in] allow_exceptions whether to throw exceptions in case of a
+ parse error (optional, true by default)
+
+ @return deserialized JSON value; in case of a parse error and
+ @a allow_exceptions set to `false`, the return value will be
+ value_t::discarded.
+
+ @throw parse_error.110 if the given input ends prematurely or the end of
+ file was not reached when @a strict was set to true
+ @throw parse_error.112 if a parse error occurs
+ @throw parse_error.113 if a string could not be parsed successfully
+
+ @complexity Linear in the size of the input @a i.
+
+ @liveexample{The example shows the deserialization of a byte vector in
+ UBJSON format to a JSON value.,from_ubjson}
+
+ @sa http://ubjson.org
+ @sa see @ref to_ubjson(const basic_json&, const bool, const bool) for the
+ analogous serialization
+ @sa see @ref from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t) for the
+ related CBOR format
+ @sa see @ref from_msgpack(InputType&&, const bool, const bool) for
+ the related MessagePack format
+ @sa see @ref from_bson(InputType&&, const bool, const bool) for
+ the related BSON format
+
+ @since version 3.1.0; added @a allow_exceptions parameter since 3.2.0
+ */
+ template<typename InputType>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json from_ubjson(InputType&& i,
+ const bool strict = true,
+ const bool allow_exceptions = true)
+ {
+ basic_json result;
+ detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+ auto ia = detail::input_adapter(std::forward<InputType>(i));
+ const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict);
+ return res ? result : basic_json(value_t::discarded);
+ }
+
+ /*!
+ @copydoc from_ubjson(InputType&&, const bool, const bool)
+ */
+ template<typename IteratorType>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json from_ubjson(IteratorType first, IteratorType last,
+ const bool strict = true,
+ const bool allow_exceptions = true)
+ {
+ basic_json result;
+ detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+ auto ia = detail::input_adapter(std::move(first), std::move(last));
+ const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict);
+ return res ? result : basic_json(value_t::discarded);
+ }
+
+ template<typename T>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len))
+ static basic_json from_ubjson(const T* ptr, std::size_t len,
+ const bool strict = true,
+ const bool allow_exceptions = true)
+ {
+ return from_ubjson(ptr, ptr + len, strict, allow_exceptions);
+ }
+
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_ubjson(ptr, ptr + len))
+ static basic_json from_ubjson(detail::span_input_adapter&& i,
+ const bool strict = true,
+ const bool allow_exceptions = true)
+ {
+ basic_json result;
+ detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+ auto ia = i.get();
+ // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
+ const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::ubjson, &sdp, strict);
+ return res ? result : basic_json(value_t::discarded);
+ }
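+
+    // Usage sketch (illustrative; not part of the upstream header): a UBJSON
+    // round trip. With strict == true (the default), bytes trailing the value
+    // cause parse_error.110.
+    //
+    //   nlohmann::json j = nlohmann::json::array({1, 2, 3});
+    //   std::vector<std::uint8_t> v = nlohmann::json::to_ubjson(j);
+    //   assert(nlohmann::json::from_ubjson(v) == j);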
+
+
+ /*!
+ @brief Create a JSON value from an input in BSON format
+
+ Deserializes a given input @a i to a JSON value using the BSON (Binary JSON)
+ serialization format.
+
+ The library maps BSON record types to JSON value types as follows:
+
+ BSON type | BSON marker byte | JSON value type
+ --------------- | ---------------- | ---------------------------
+ double | 0x01 | number_float
+ string | 0x02 | string
+ document | 0x03 | object
+ array | 0x04 | array
+ binary | 0x05 | binary
+ undefined | 0x06 | still unsupported
+ ObjectId | 0x07 | still unsupported
+ boolean | 0x08 | boolean
+ UTC Date-Time | 0x09 | still unsupported
+ null | 0x0A | null
+ Regular Expr. | 0x0B | still unsupported
+ DB Pointer | 0x0C | still unsupported
+ JavaScript Code | 0x0D | still unsupported
+ Symbol | 0x0E | still unsupported
+    JavaScript Code w/ Scope | 0x0F    | still unsupported
+ int32 | 0x10 | number_integer
+ Timestamp | 0x11 | still unsupported
+ 128-bit decimal float | 0x13 | still unsupported
+ Max Key | 0x7F | still unsupported
+ Min Key | 0xFF | still unsupported
+
+ @warning The mapping is **incomplete**. The unsupported mappings
+ are indicated in the table above.
+
+ @param[in] i an input in BSON format convertible to an input adapter
+ @param[in] strict whether to expect the input to be consumed until EOF
+ (true by default)
+ @param[in] allow_exceptions whether to throw exceptions in case of a
+ parse error (optional, true by default)
+
+ @return deserialized JSON value; in case of a parse error and
+ @a allow_exceptions set to `false`, the return value will be
+ value_t::discarded.
+
+ @throw parse_error.114 if an unsupported BSON record type is encountered
+
+ @complexity Linear in the size of the input @a i.
+
+ @liveexample{The example shows the deserialization of a byte vector in
+ BSON format to a JSON value.,from_bson}
+
+ @sa http://bsonspec.org/spec.html
+ @sa see @ref to_bson(const basic_json&) for the analogous serialization
+ @sa see @ref from_cbor(InputType&&, const bool, const bool, const cbor_tag_handler_t) for the
+ related CBOR format
+ @sa see @ref from_msgpack(InputType&&, const bool, const bool) for
+ the related MessagePack format
+ @sa see @ref from_ubjson(InputType&&, const bool, const bool) for the
+ related UBJSON format
+ */
+ template<typename InputType>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json from_bson(InputType&& i,
+ const bool strict = true,
+ const bool allow_exceptions = true)
+ {
+ basic_json result;
+ detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+ auto ia = detail::input_adapter(std::forward<InputType>(i));
+ const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict);
+ return res ? result : basic_json(value_t::discarded);
+ }
+
+ /*!
+ @copydoc from_bson(InputType&&, const bool, const bool)
+ */
+ template<typename IteratorType>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json from_bson(IteratorType first, IteratorType last,
+ const bool strict = true,
+ const bool allow_exceptions = true)
+ {
+ basic_json result;
+ detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+ auto ia = detail::input_adapter(std::move(first), std::move(last));
+ const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict);
+ return res ? result : basic_json(value_t::discarded);
+ }
+
+ template<typename T>
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len))
+ static basic_json from_bson(const T* ptr, std::size_t len,
+ const bool strict = true,
+ const bool allow_exceptions = true)
+ {
+ return from_bson(ptr, ptr + len, strict, allow_exceptions);
+ }
+
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_bson(ptr, ptr + len))
+ static basic_json from_bson(detail::span_input_adapter&& i,
+ const bool strict = true,
+ const bool allow_exceptions = true)
+ {
+ basic_json result;
+ detail::json_sax_dom_parser<basic_json> sdp(result, allow_exceptions);
+ auto ia = i.get();
+ // NOLINTNEXTLINE(hicpp-move-const-arg,performance-move-const-arg)
+ const bool res = binary_reader<decltype(ia)>(std::move(ia)).sax_parse(input_format_t::bson, &sdp, strict);
+ return res ? result : basic_json(value_t::discarded);
+ }
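+
+    // Usage sketch (illustrative; not part of the upstream header): a BSON
+    // round trip. BSON can only represent documents, so the top-level value
+    // must be an object.
+    //
+    //   nlohmann::json j = {{"entry", 42}};
+    //   std::vector<std::uint8_t> v = nlohmann::json::to_bson(j);
+    //   assert(nlohmann::json::from_bson(v) == j);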
+ /// @}
+
+ //////////////////////////
+ // JSON Pointer support //
+ //////////////////////////
+
+ /// @name JSON Pointer functions
+ /// @{
+
+ /*!
+ @brief access specified element via JSON Pointer
+
+ Uses a JSON pointer to retrieve a reference to the respective JSON value.
+    No bounds checking is performed. Similar to @ref operator[](const typename
+ object_t::key_type&), `null` values are created in arrays and objects if
+ necessary.
+
+ In particular:
+    - If the JSON pointer points to an object key that does not exist, it
+      is created and filled with a `null` value before a reference to it
+      is returned.
+    - If the JSON pointer points to an array index that does not exist, it
+      is created and filled with a `null` value before a reference to it
+      is returned. All indices between the current maximum and the given
+      index are also filled with `null`.
+ - The special value `-` is treated as a synonym for the index past the
+ end.
+
+ @param[in] ptr a JSON pointer
+
+ @return reference to the element pointed to by @a ptr
+
+ @complexity Constant.
+
+ @throw parse_error.106 if an array index begins with '0'
+ @throw parse_error.109 if an array index was not a number
+ @throw out_of_range.404 if the JSON pointer can not be resolved
+
+ @liveexample{The behavior is shown in the example.,operatorjson_pointer}
+
+ @since version 2.0.0
+ */
+ reference operator[](const json_pointer& ptr)
+ {
+ return ptr.get_unchecked(this);
+ }
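+
+    // Usage sketch (illustrative; not part of the upstream header): unchecked
+    // access creates missing elements along the path, so it can be used to
+    // build nested values.
+    //
+    //   nlohmann::json j;
+    //   j[nlohmann::json::json_pointer("/a/b/0")] = 1; // {"a":{"b":[1]}}
+    //   j[nlohmann::json::json_pointer("/a/b/-")] = 2; // {"a":{"b":[1,2]}}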
+
+ /*!
+ @brief access specified element via JSON Pointer
+
+ Uses a JSON pointer to retrieve a reference to the respective JSON value.
+    No bounds checking is performed. The function does not change the JSON
+ value; no `null` values are created. In particular, the special value
+ `-` yields an exception.
+
+ @param[in] ptr JSON pointer to the desired element
+
+ @return const reference to the element pointed to by @a ptr
+
+ @complexity Constant.
+
+ @throw parse_error.106 if an array index begins with '0'
+ @throw parse_error.109 if an array index was not a number
+ @throw out_of_range.402 if the array index '-' is used
+ @throw out_of_range.404 if the JSON pointer can not be resolved
+
+ @liveexample{The behavior is shown in the example.,operatorjson_pointer_const}
+
+ @since version 2.0.0
+ */
+ const_reference operator[](const json_pointer& ptr) const
+ {
+ return ptr.get_unchecked(this);
+ }
+
+ /*!
+ @brief access specified element via JSON Pointer
+
+    Returns a reference to the element at the specified JSON pointer @a ptr,
+ with bounds checking.
+
+ @param[in] ptr JSON pointer to the desired element
+
+ @return reference to the element pointed to by @a ptr
+
+ @throw parse_error.106 if an array index in the passed JSON pointer @a ptr
+ begins with '0'. See example below.
+
+ @throw parse_error.109 if an array index in the passed JSON pointer @a ptr
+ is not a number. See example below.
+
+ @throw out_of_range.401 if an array index in the passed JSON pointer @a ptr
+ is out of range. See example below.
+
+ @throw out_of_range.402 if the array index '-' is used in the passed JSON
+ pointer @a ptr. As `at` provides checked access (and no elements are
+ implicitly inserted), the index '-' is always invalid. See example below.
+
+ @throw out_of_range.403 if the JSON pointer describes a key of an object
+ which cannot be found. See example below.
+
+ @throw out_of_range.404 if the JSON pointer @a ptr can not be resolved.
+ See example below.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes in the JSON value.
+
+ @complexity Constant.
+
+ @since version 2.0.0
+
+ @liveexample{The behavior is shown in the example.,at_json_pointer}
+ */
+ reference at(const json_pointer& ptr)
+ {
+ return ptr.get_checked(this);
+ }
+
+ /*!
+ @brief access specified element via JSON Pointer
+
+    Returns a const reference to the element at the specified JSON pointer
+    @a ptr, with bounds checking.
+
+ @param[in] ptr JSON pointer to the desired element
+
+    @return const reference to the element pointed to by @a ptr
+
+ @throw parse_error.106 if an array index in the passed JSON pointer @a ptr
+ begins with '0'. See example below.
+
+ @throw parse_error.109 if an array index in the passed JSON pointer @a ptr
+ is not a number. See example below.
+
+ @throw out_of_range.401 if an array index in the passed JSON pointer @a ptr
+ is out of range. See example below.
+
+ @throw out_of_range.402 if the array index '-' is used in the passed JSON
+ pointer @a ptr. As `at` provides checked access (and no elements are
+ implicitly inserted), the index '-' is always invalid. See example below.
+
+ @throw out_of_range.403 if the JSON pointer describes a key of an object
+ which cannot be found. See example below.
+
+ @throw out_of_range.404 if the JSON pointer @a ptr can not be resolved.
+ See example below.
+
+ @exceptionsafety Strong guarantee: if an exception is thrown, there are no
+ changes in the JSON value.
+
+ @complexity Constant.
+
+ @since version 2.0.0
+
+ @liveexample{The behavior is shown in the example.,at_json_pointer_const}
+ */
+ const_reference at(const json_pointer& ptr) const
+ {
+ return ptr.get_checked(this);
+ }
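+
+    // Usage sketch (illustrative; not part of the upstream header): checked
+    // access never inserts, so unresolved pointers surface as out_of_range
+    // exceptions.
+    //
+    //   nlohmann::json j = {{"numbers", {1, 2, 3}}};
+    //   int n = j.at(nlohmann::json::json_pointer("/numbers/1")); // n == 2
+    //   // j.at(nlohmann::json::json_pointer("/numbers/9")) throws
+    //   // out_of_range.401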
+
+ /*!
+ @brief return flattened JSON value
+
+ The function creates a JSON object whose keys are JSON pointers (see [RFC
+ 6901](https://tools.ietf.org/html/rfc6901)) and whose values are all
+ primitive. The original JSON value can be restored using the @ref
+ unflatten() function.
+
+ @return an object that maps JSON pointers to primitive values
+
+ @note Empty objects and arrays are flattened to `null` and will not be
+ reconstructed correctly by the @ref unflatten() function.
+
+    @complexity Linear in the size of the JSON value.
+
+ @liveexample{The following code shows how a JSON object is flattened to an
+ object whose keys consist of JSON pointers.,flatten}
+
+ @sa see @ref unflatten() for the reverse function
+
+ @since version 2.0.0
+ */
+ basic_json flatten() const
+ {
+ basic_json result(value_t::object);
+ json_pointer::flatten("", *this, result);
+ return result;
+ }
+
+ /*!
+ @brief unflatten a previously flattened JSON value
+
+ The function restores the arbitrary nesting of a JSON value that has been
+ flattened before using the @ref flatten() function. The JSON value must
+ meet certain constraints:
+ 1. The value must be an object.
+ 2. The keys must be JSON pointers (see
+ [RFC 6901](https://tools.ietf.org/html/rfc6901))
+ 3. The mapped values must be primitive JSON types.
+
+ @return the original JSON from a flattened version
+
+    @note Empty objects and arrays are flattened by @ref flatten() to `null`
+          values and cannot be unflattened to their original type. Apart from
+          this limitation, for a JSON value `j`, the following is always true:
+          `j == j.flatten().unflatten()`.
+
+    @complexity Linear in the size of the JSON value.
+
+ @throw type_error.314 if value is not an object
+ @throw type_error.315 if object values are not primitive
+
+ @liveexample{The following code shows how a flattened JSON object is
+ unflattened into the original nested JSON object.,unflatten}
+
+ @sa see @ref flatten() for the reverse function
+
+ @since version 2.0.0
+ */
+ basic_json unflatten() const
+ {
+ return json_pointer::unflatten(*this);
+ }
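+
+    // Usage sketch (illustrative; not part of the upstream header): flatten()
+    // and unflatten() are inverse operations, except that empty objects and
+    // arrays flatten to null.
+    //
+    //   nlohmann::json j = {{"a", {{"b", 1}}}, {"c", {2, 3}}};
+    //   nlohmann::json flat = j.flatten();
+    //   // flat == {"/a/b": 1, "/c/0": 2, "/c/1": 3}
+    //   assert(flat.unflatten() == j);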
+
+ /// @}
+
+ //////////////////////////
+ // JSON Patch functions //
+ //////////////////////////
+
+ /// @name JSON Patch functions
+ /// @{
+
+ /*!
+ @brief applies a JSON patch
+
+ [JSON Patch](http://jsonpatch.com) defines a JSON document structure for
+    expressing a sequence of operations to apply to a JSON document. With
+ this function, a JSON Patch is applied to the current JSON value by
+ executing all operations from the patch.
+
+ @param[in] json_patch JSON patch document
+ @return patched document
+
+ @note The application of a patch is atomic: Either all operations succeed
+ and the patched document is returned or an exception is thrown. In
+ any case, the original value is not changed: the patch is applied
+ to a copy of the value.
+
+ @throw parse_error.104 if the JSON patch does not consist of an array of
+ objects
+
+ @throw parse_error.105 if the JSON patch is malformed (e.g., mandatory
+ attributes are missing); example: `"operation add must have member path"`
+
+ @throw out_of_range.401 if an array index is out of range.
+
+ @throw out_of_range.403 if a JSON pointer inside the patch could not be
+ resolved successfully in the current JSON value; example: `"key baz not
+ found"`
+
+ @throw out_of_range.405 if JSON pointer has no parent ("add", "remove",
+ "move")
+
+ @throw other_error.501 if "test" operation was unsuccessful
+
+    @complexity Linear in the size of the JSON value and the length of the
+    JSON patch. As typically only a fraction of the JSON value is affected by
+    the patch, the complexity can usually be neglected.
+
+ @liveexample{The following code shows how a JSON patch is applied to a
+ value.,patch}
+
+ @sa see @ref diff -- create a JSON patch by comparing two JSON values
+
+ @sa [RFC 6902 (JSON Patch)](https://tools.ietf.org/html/rfc6902)
+ @sa [RFC 6901 (JSON Pointer)](https://tools.ietf.org/html/rfc6901)
+
+ @since version 2.0.0
+ */
+ basic_json patch(const basic_json& json_patch) const
+ {
+ // make a working copy to apply the patch to
+ basic_json result = *this;
+
+ // the valid JSON Patch operations
+ enum class patch_operations {add, remove, replace, move, copy, test, invalid};
+
+ const auto get_op = [](const std::string & op)
+ {
+ if (op == "add")
+ {
+ return patch_operations::add;
+ }
+ if (op == "remove")
+ {
+ return patch_operations::remove;
+ }
+ if (op == "replace")
+ {
+ return patch_operations::replace;
+ }
+ if (op == "move")
+ {
+ return patch_operations::move;
+ }
+ if (op == "copy")
+ {
+ return patch_operations::copy;
+ }
+ if (op == "test")
+ {
+ return patch_operations::test;
+ }
+
+ return patch_operations::invalid;
+ };
+
+ // wrapper for "add" operation; add value at ptr
+ const auto operation_add = [&result](json_pointer & ptr, basic_json val)
+ {
+ // adding to the root of the target document means replacing it
+ if (ptr.empty())
+ {
+ result = val;
+ return;
+ }
+
+ // make sure the top element of the pointer exists
+ json_pointer top_pointer = ptr.top();
+ if (top_pointer != ptr)
+ {
+ result.at(top_pointer);
+ }
+
+ // get reference to parent of JSON pointer ptr
+ const auto last_path = ptr.back();
+ ptr.pop_back();
+ basic_json& parent = result[ptr];
+
+ switch (parent.m_type)
+ {
+ case value_t::null:
+ case value_t::object:
+ {
+ // use operator[] to add value
+ parent[last_path] = val;
+ break;
+ }
+
+ case value_t::array:
+ {
+ if (last_path == "-")
+ {
+ // special case: append to back
+ parent.push_back(val);
+ }
+ else
+ {
+ const auto idx = json_pointer::array_index(last_path);
+ if (JSON_HEDLEY_UNLIKELY(idx > parent.size()))
+ {
+ // avoid undefined behavior
+ JSON_THROW(out_of_range::create(401, "array index " + std::to_string(idx) + " is out of range", parent));
+ }
+
+                        // default case: insert value at offset idx
+ parent.insert(parent.begin() + static_cast<difference_type>(idx), val);
+ }
+ break;
+ }
+
+ // if there exists a parent it cannot be primitive
+ case value_t::string: // LCOV_EXCL_LINE
+ case value_t::boolean: // LCOV_EXCL_LINE
+ case value_t::number_integer: // LCOV_EXCL_LINE
+ case value_t::number_unsigned: // LCOV_EXCL_LINE
+ case value_t::number_float: // LCOV_EXCL_LINE
+ case value_t::binary: // LCOV_EXCL_LINE
+ case value_t::discarded: // LCOV_EXCL_LINE
+ default: // LCOV_EXCL_LINE
+ JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
+ }
+ };
+
+ // wrapper for "remove" operation; remove value at ptr
+ const auto operation_remove = [this, &result](json_pointer & ptr)
+ {
+ // get reference to parent of JSON pointer ptr
+ const auto last_path = ptr.back();
+ ptr.pop_back();
+ basic_json& parent = result.at(ptr);
+
+ // remove child
+ if (parent.is_object())
+ {
+ // perform range check
+ auto it = parent.find(last_path);
+ if (JSON_HEDLEY_LIKELY(it != parent.end()))
+ {
+ parent.erase(it);
+ }
+ else
+ {
+ JSON_THROW(out_of_range::create(403, "key '" + last_path + "' not found", *this));
+ }
+ }
+ else if (parent.is_array())
+ {
+ // note erase performs range check
+ parent.erase(json_pointer::array_index(last_path));
+ }
+ };
+
+ // type check: top level value must be an array
+ if (JSON_HEDLEY_UNLIKELY(!json_patch.is_array()))
+ {
+ JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects", json_patch));
+ }
+
+ // iterate and apply the operations
+ for (const auto& val : json_patch)
+ {
+ // wrapper to get a value for an operation
+ const auto get_value = [&val](const std::string & op,
+ const std::string & member,
+ bool string_type) -> basic_json &
+ {
+ // find value
+ auto it = val.m_value.object->find(member);
+
+ // context-sensitive error message
+ const auto error_msg = (op == "op") ? "operation" : "operation '" + op + "'";
+
+ // check if desired value is present
+ if (JSON_HEDLEY_UNLIKELY(it == val.m_value.object->end()))
+ {
+ // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
+ JSON_THROW(parse_error::create(105, 0, error_msg + " must have member '" + member + "'", val));
+ }
+
+ // check if result is of type string
+ if (JSON_HEDLEY_UNLIKELY(string_type && !it->second.is_string()))
+ {
+ // NOLINTNEXTLINE(performance-inefficient-string-concatenation)
+ JSON_THROW(parse_error::create(105, 0, error_msg + " must have string member '" + member + "'", val));
+ }
+
+ // no error: return value
+ return it->second;
+ };
+
+ // type check: every element of the array must be an object
+ if (JSON_HEDLEY_UNLIKELY(!val.is_object()))
+ {
+ JSON_THROW(parse_error::create(104, 0, "JSON patch must be an array of objects", val));
+ }
+
+ // collect mandatory members
+ const auto op = get_value("op", "op", true).template get<std::string>();
+ const auto path = get_value(op, "path", true).template get<std::string>();
+ json_pointer ptr(path);
+
+ switch (get_op(op))
+ {
+ case patch_operations::add:
+ {
+ operation_add(ptr, get_value("add", "value", false));
+ break;
+ }
+
+ case patch_operations::remove:
+ {
+ operation_remove(ptr);
+ break;
+ }
+
+ case patch_operations::replace:
+ {
+ // the "path" location must exist - use at()
+ result.at(ptr) = get_value("replace", "value", false);
+ break;
+ }
+
+ case patch_operations::move:
+ {
+ const auto from_path = get_value("move", "from", true).template get<std::string>();
+ json_pointer from_ptr(from_path);
+
+ // the "from" location must exist - use at()
+ basic_json v = result.at(from_ptr);
+
+ // The move operation is functionally identical to a
+ // "remove" operation on the "from" location, followed
+ // immediately by an "add" operation at the target
+ // location with the value that was just removed.
+ operation_remove(from_ptr);
+ operation_add(ptr, v);
+ break;
+ }
+
+ case patch_operations::copy:
+ {
+ const auto from_path = get_value("copy", "from", true).template get<std::string>();
+ const json_pointer from_ptr(from_path);
+
+ // the "from" location must exist - use at()
+ basic_json v = result.at(from_ptr);
+
+ // The copy is functionally identical to an "add"
+ // operation at the target location using the value
+ // specified in the "from" member.
+ operation_add(ptr, v);
+ break;
+ }
+
+ case patch_operations::test:
+ {
+ bool success = false;
+ JSON_TRY
+ {
+ // check if "value" matches the one at "path"
+ // the "path" location must exist - use at()
+ success = (result.at(ptr) == get_value("test", "value", false));
+ }
+ JSON_INTERNAL_CATCH (out_of_range&)
+ {
+ // ignore out of range errors: success remains false
+ }
+
+ // throw an exception if test fails
+ if (JSON_HEDLEY_UNLIKELY(!success))
+ {
+ JSON_THROW(other_error::create(501, "unsuccessful: " + val.dump(), val));
+ }
+
+ break;
+ }
+
+ case patch_operations::invalid:
+ default:
+ {
+ // op must be "add", "remove", "replace", "move", "copy", or
+ // "test"
+ JSON_THROW(parse_error::create(105, 0, "operation value '" + op + "' is invalid", val));
+ }
+ }
+ }
+
+ return result;
+ }
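+
+    // Usage sketch (illustrative; not part of the upstream header): applying
+    // an RFC 6902 patch; the original document is left untouched.
+    //
+    //   nlohmann::json doc = {{"name", "old"}, {"tmp", 1}};
+    //   nlohmann::json p = nlohmann::json::parse(R"([
+    //     {"op": "replace", "path": "/name", "value": "new"},
+    //     {"op": "remove",  "path": "/tmp"}
+    //   ])");
+    //   nlohmann::json patched = doc.patch(p); // {"name": "new"}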
+
+ /*!
+ @brief creates a diff as a JSON patch
+
+ Creates a [JSON Patch](http://jsonpatch.com) so that value @a source can
+ be changed into the value @a target by calling @ref patch function.
+
+    @invariant For two JSON values @a source and @a target, the following code
+    always yields `true`:
+ @code {.cpp}
+ source.patch(diff(source, target)) == target;
+ @endcode
+
+ @note Currently, only `remove`, `add`, and `replace` operations are
+ generated.
+
+ @param[in] source JSON value to compare from
+ @param[in] target JSON value to compare against
+ @param[in] path helper value to create JSON pointers
+
+ @return a JSON patch to convert the @a source to @a target
+
+ @complexity Linear in the lengths of @a source and @a target.
+
+ @liveexample{The following code shows how a JSON patch is created as a
+ diff for two JSON values.,diff}
+
+ @sa see @ref patch -- apply a JSON patch
+ @sa see @ref merge_patch -- apply a JSON Merge Patch
+
+ @sa [RFC 6902 (JSON Patch)](https://tools.ietf.org/html/rfc6902)
+
+ @since version 2.0.0
+ */
+ JSON_HEDLEY_WARN_UNUSED_RESULT
+ static basic_json diff(const basic_json& source, const basic_json& target,
+ const std::string& path = "")
+ {
+ // the patch
+ basic_json result(value_t::array);
+
+ // if the values are the same, return empty patch
+ if (source == target)
+ {
+ return result;
+ }
+
+ if (source.type() != target.type())
+ {
+ // different types: replace value
+ result.push_back(
+ {
+ {"op", "replace"}, {"path", path}, {"value", target}
+ });
+ return result;
+ }
+
+ switch (source.type())
+ {
+ case value_t::array:
+ {
+ // first pass: traverse common elements
+ std::size_t i = 0;
+ while (i < source.size() && i < target.size())
+ {
+ // recursive call to compare array values at index i
+ auto temp_diff = diff(source[i], target[i], path + "/" + std::to_string(i));
+ result.insert(result.end(), temp_diff.begin(), temp_diff.end());
+ ++i;
+ }
+
+ // i now reached the end of at least one array
+ // in a second pass, traverse the remaining elements
+
+                // remove the remaining elements from source
+ const auto end_index = static_cast<difference_type>(result.size());
+ while (i < source.size())
+ {
+ // add operations in reverse order to avoid invalid
+ // indices
+ result.insert(result.begin() + end_index, object(
+ {
+ {"op", "remove"},
+ {"path", path + "/" + std::to_string(i)}
+ }));
+ ++i;
+ }
+
+                // add the remaining elements from target
+ while (i < target.size())
+ {
+ result.push_back(
+ {
+ {"op", "add"},
+ {"path", path + "/-"},
+ {"value", target[i]}
+ });
+ ++i;
+ }
+
+ break;
+ }
+
+ case value_t::object:
+ {
+                // first pass: traverse the source object's elements
+ for (auto it = source.cbegin(); it != source.cend(); ++it)
+ {
+ // escape the key name to be used in a JSON patch
+ const auto path_key = path + "/" + detail::escape(it.key());
+
+ if (target.find(it.key()) != target.end())
+ {
+ // recursive call to compare object values at key it
+ auto temp_diff = diff(it.value(), target[it.key()], path_key);
+ result.insert(result.end(), temp_diff.begin(), temp_diff.end());
+ }
+ else
+ {
+                        // found a key that is not in target -> remove it
+ result.push_back(object(
+ {
+ {"op", "remove"}, {"path", path_key}
+ }));
+ }
+ }
+
+                // second pass: traverse the target object's elements
+ for (auto it = target.cbegin(); it != target.cend(); ++it)
+ {
+ if (source.find(it.key()) == source.end())
+ {
+                        // found a key that is not in source -> add it
+ const auto path_key = path + "/" + detail::escape(it.key());
+ result.push_back(
+ {
+ {"op", "add"}, {"path", path_key},
+ {"value", it.value()}
+ });
+ }
+ }
+
+ break;
+ }
+
+ case value_t::null:
+ case value_t::string:
+ case value_t::boolean:
+ case value_t::number_integer:
+ case value_t::number_unsigned:
+ case value_t::number_float:
+ case value_t::binary:
+ case value_t::discarded:
+ default:
+ {
+ // both primitive type: replace value
+ result.push_back(
+ {
+ {"op", "replace"}, {"path", path}, {"value", target}
+ });
+ break;
+ }
+ }
+
+ return result;
+ }
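+
+    // Usage sketch (illustrative; not part of the upstream header): diff()
+    // produces a patch that transforms source into target, so the round trip
+    // below holds by construction.
+    //
+    //   nlohmann::json source = {{"a", 1}, {"b", 2}};
+    //   nlohmann::json target = {{"a", 1}, {"b", 3}, {"c", 4}};
+    //   nlohmann::json p = nlohmann::json::diff(source, target);
+    //   assert(source.patch(p) == target);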
+
+ /// @}
+
+ ////////////////////////////////
+ // JSON Merge Patch functions //
+ ////////////////////////////////
+
+ /// @name JSON Merge Patch functions
+ /// @{
+
+ /*!
+ @brief applies a JSON Merge Patch
+
+ The merge patch format is primarily intended for use with the HTTP PATCH
+ method as a means of describing a set of modifications to a target
+ resource's content. This function applies a merge patch to the current
+ JSON value.
+
+ The function implements the following algorithm from Section 2 of
+ [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396):
+
+ ```
+ define MergePatch(Target, Patch):
+ if Patch is an Object:
+ if Target is not an Object:
+ Target = {} // Ignore the contents and set it to an empty Object
+ for each Name/Value pair in Patch:
+ if Value is null:
+ if Name exists in Target:
+ remove the Name/Value pair from Target
+ else:
+ Target[Name] = MergePatch(Target[Name], Value)
+ return Target
+ else:
+ return Patch
+ ```
+
+    Here, `Target` is the current value; that is, the patch is applied to
+    the current value.
+
+ @param[in] apply_patch the patch to apply
+
+    @complexity Linear in the size of @a apply_patch.
+
+ @liveexample{The following code shows how a JSON Merge Patch is applied to
+ a JSON document.,merge_patch}
+
+ @sa see @ref patch -- apply a JSON patch
+ @sa [RFC 7396 (JSON Merge Patch)](https://tools.ietf.org/html/rfc7396)
+
+ @since version 3.0.0
+ */
+ void merge_patch(const basic_json& apply_patch)
+ {
+ if (apply_patch.is_object())
+ {
+ if (!is_object())
+ {
+ *this = object();
+ }
+ for (auto it = apply_patch.begin(); it != apply_patch.end(); ++it)
+ {
+ if (it.value().is_null())
+ {
+ erase(it.key());
+ }
+ else
+ {
+ operator[](it.key()).merge_patch(it.value());
+ }
+ }
+ }
+ else
+ {
+ *this = apply_patch;
+ }
+ }
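+
+    // Usage sketch (illustrative; not part of the upstream header): an
+    // RFC 7396 merge patch; null values delete keys, all other values are
+    // merged recursively.
+    //
+    //   nlohmann::json doc = {{"title", "Hello"}, {"author", {{"name", "X"}}}};
+    //   nlohmann::json mp = {{"title", "Bye"}, {"author", nullptr}};
+    //   doc.merge_patch(mp); // doc == {"title": "Bye"}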
+
+ /// @}
+};
+
+/*!
+@brief user-defined to_string function for JSON values
+
+This function implements a user-defined to_string for JSON objects.
+
+@param[in] j a JSON object
+@return a std::string object
+*/
+
+NLOHMANN_BASIC_JSON_TPL_DECLARATION
+std::string to_string(const NLOHMANN_BASIC_JSON_TPL& j)
+{
+ return j.dump();
+}
+} // namespace nlohmann
+
+///////////////////////
+// nonmember support //
+///////////////////////
+
+// specialization of std::swap, and std::hash
+namespace std
+{
+
+/// hash value for JSON objects
+template<>
+struct hash<nlohmann::json>
+{
+ /*!
+ @brief return a hash value for a JSON object
+
+ @since version 1.0.0
+ */
+ std::size_t operator()(const nlohmann::json& j) const
+ {
+ return nlohmann::detail::hash(j);
+ }
+};
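+
+// Usage sketch (illustrative; not part of the upstream header): together
+// with operator==, this specialization makes JSON values usable as keys in
+// unordered containers.
+//
+//   std::unordered_set<nlohmann::json> seen;
+//   seen.insert(nlohmann::json{{"id", 1}});
+//   assert(seen.count(nlohmann::json{{"id", 1}}) == 1);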
+
+/// specialization for std::less<value_t>
+/// @note: do not remove the space after '<',
+/// see https://github.com/nlohmann/json/pull/679
+template<>
+struct less<::nlohmann::detail::value_t>
+{
+ /*!
+ @brief compare two value_t enum values
+ @since version 3.0.0
+ */
+ bool operator()(nlohmann::detail::value_t lhs,
+ nlohmann::detail::value_t rhs) const noexcept
+ {
+ return nlohmann::detail::operator<(lhs, rhs);
+ }
+};
+
+// C++20 prohibits function specializations in the std namespace.
+#ifndef JSON_HAS_CPP_20
+
+/*!
+@brief exchanges the values of two JSON objects
+
+@since version 1.0.0
+*/
+template<>
+inline void swap<nlohmann::json>(nlohmann::json& j1, nlohmann::json& j2) noexcept( // NOLINT(readability-inconsistent-declaration-parameter-name)
+ is_nothrow_move_constructible<nlohmann::json>::value&& // NOLINT(misc-redundant-expression)
+ is_nothrow_move_assignable<nlohmann::json>::value
+ )
+{
+ j1.swap(j2);
+}
+
+#endif
+
+} // namespace std
+
+/*!
+@brief user-defined string literal for JSON values
+
+This operator implements a user-defined string literal for JSON objects. It
+can be used by adding `"_json"` to a string literal and returns a JSON object
+if no parse error occurred.
+
+@param[in] s a string representation of a JSON object
+@param[in] n the length of string @a s
+@return a JSON object
+
+@since version 1.0.0
+*/
+JSON_HEDLEY_NON_NULL(1)
+inline nlohmann::json operator "" _json(const char* s, std::size_t n)
+{
+ return nlohmann::json::parse(s, s + n);
+}
+
+/*!
+@brief user-defined string literal for JSON pointer
+
+This operator implements a user-defined string literal for JSON Pointers. It
+can be used by adding `"_json_pointer"` to a string literal and returns a JSON pointer
+object if no parse error occurred.
+
+@param[in] s a string representation of a JSON Pointer
+@param[in] n the length of string @a s
+@return a JSON pointer object
+
+@since version 2.0.0
+*/
+JSON_HEDLEY_NON_NULL(1)
+inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n)
+{
+ return nlohmann::json::json_pointer(std::string(s, n));
+}
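+
+// Usage sketch (illustrative; not part of the upstream header): both
+// literals combined.
+//
+//   auto j = R"({"numbers": [1, 2, 3]})"_json;
+//   auto p = "/numbers/0"_json_pointer;
+//   assert(j[p] == 1);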
+
+// #include <nlohmann/detail/macro_unscope.hpp>
+
+
+// restore clang diagnostic settings
+#if defined(__clang__)
+ #pragma clang diagnostic pop
+#endif
+
+// clean up
+#undef JSON_ASSERT
+#undef JSON_INTERNAL_CATCH
+#undef JSON_CATCH
+#undef JSON_THROW
+#undef JSON_TRY
+#undef JSON_PRIVATE_UNLESS_TESTED
+#undef JSON_HAS_CPP_11
+#undef JSON_HAS_CPP_14
+#undef JSON_HAS_CPP_17
+#undef JSON_HAS_CPP_20
+#undef NLOHMANN_BASIC_JSON_TPL_DECLARATION
+#undef NLOHMANN_BASIC_JSON_TPL
+#undef JSON_EXPLICIT
+
+// #include <nlohmann/thirdparty/hedley/hedley_undef.hpp>
+
+
+#undef JSON_HEDLEY_ALWAYS_INLINE
+#undef JSON_HEDLEY_ARM_VERSION
+#undef JSON_HEDLEY_ARM_VERSION_CHECK
+#undef JSON_HEDLEY_ARRAY_PARAM
+#undef JSON_HEDLEY_ASSUME
+#undef JSON_HEDLEY_BEGIN_C_DECLS
+#undef JSON_HEDLEY_CLANG_HAS_ATTRIBUTE
+#undef JSON_HEDLEY_CLANG_HAS_BUILTIN
+#undef JSON_HEDLEY_CLANG_HAS_CPP_ATTRIBUTE
+#undef JSON_HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE
+#undef JSON_HEDLEY_CLANG_HAS_EXTENSION
+#undef JSON_HEDLEY_CLANG_HAS_FEATURE
+#undef JSON_HEDLEY_CLANG_HAS_WARNING
+#undef JSON_HEDLEY_COMPCERT_VERSION
+#undef JSON_HEDLEY_COMPCERT_VERSION_CHECK
+#undef JSON_HEDLEY_CONCAT
+#undef JSON_HEDLEY_CONCAT3
+#undef JSON_HEDLEY_CONCAT3_EX
+#undef JSON_HEDLEY_CONCAT_EX
+#undef JSON_HEDLEY_CONST
+#undef JSON_HEDLEY_CONSTEXPR
+#undef JSON_HEDLEY_CONST_CAST
+#undef JSON_HEDLEY_CPP_CAST
+#undef JSON_HEDLEY_CRAY_VERSION
+#undef JSON_HEDLEY_CRAY_VERSION_CHECK
+#undef JSON_HEDLEY_C_DECL
+#undef JSON_HEDLEY_DEPRECATED
+#undef JSON_HEDLEY_DEPRECATED_FOR
+#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL
+#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_
+#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
+#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES
+#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS
+#undef JSON_HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION
+#undef JSON_HEDLEY_DIAGNOSTIC_POP
+#undef JSON_HEDLEY_DIAGNOSTIC_PUSH
+#undef JSON_HEDLEY_DMC_VERSION
+#undef JSON_HEDLEY_DMC_VERSION_CHECK
+#undef JSON_HEDLEY_EMPTY_BASES
+#undef JSON_HEDLEY_EMSCRIPTEN_VERSION
+#undef JSON_HEDLEY_EMSCRIPTEN_VERSION_CHECK
+#undef JSON_HEDLEY_END_C_DECLS
+#undef JSON_HEDLEY_FLAGS
+#undef JSON_HEDLEY_FLAGS_CAST
+#undef JSON_HEDLEY_GCC_HAS_ATTRIBUTE
+#undef JSON_HEDLEY_GCC_HAS_BUILTIN
+#undef JSON_HEDLEY_GCC_HAS_CPP_ATTRIBUTE
+#undef JSON_HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE
+#undef JSON_HEDLEY_GCC_HAS_EXTENSION
+#undef JSON_HEDLEY_GCC_HAS_FEATURE
+#undef JSON_HEDLEY_GCC_HAS_WARNING
+#undef JSON_HEDLEY_GCC_NOT_CLANG_VERSION_CHECK
+#undef JSON_HEDLEY_GCC_VERSION
+#undef JSON_HEDLEY_GCC_VERSION_CHECK
+#undef JSON_HEDLEY_GNUC_HAS_ATTRIBUTE
+#undef JSON_HEDLEY_GNUC_HAS_BUILTIN
+#undef JSON_HEDLEY_GNUC_HAS_CPP_ATTRIBUTE
+#undef JSON_HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE
+#undef JSON_HEDLEY_GNUC_HAS_EXTENSION
+#undef JSON_HEDLEY_GNUC_HAS_FEATURE
+#undef JSON_HEDLEY_GNUC_HAS_WARNING
+#undef JSON_HEDLEY_GNUC_VERSION
+#undef JSON_HEDLEY_GNUC_VERSION_CHECK
+#undef JSON_HEDLEY_HAS_ATTRIBUTE
+#undef JSON_HEDLEY_HAS_BUILTIN
+#undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE
+#undef JSON_HEDLEY_HAS_CPP_ATTRIBUTE_NS
+#undef JSON_HEDLEY_HAS_DECLSPEC_ATTRIBUTE
+#undef JSON_HEDLEY_HAS_EXTENSION
+#undef JSON_HEDLEY_HAS_FEATURE
+#undef JSON_HEDLEY_HAS_WARNING
+#undef JSON_HEDLEY_IAR_VERSION
+#undef JSON_HEDLEY_IAR_VERSION_CHECK
+#undef JSON_HEDLEY_IBM_VERSION
+#undef JSON_HEDLEY_IBM_VERSION_CHECK
+#undef JSON_HEDLEY_IMPORT
+#undef JSON_HEDLEY_INLINE
+#undef JSON_HEDLEY_INTEL_CL_VERSION
+#undef JSON_HEDLEY_INTEL_CL_VERSION_CHECK
+#undef JSON_HEDLEY_INTEL_VERSION
+#undef JSON_HEDLEY_INTEL_VERSION_CHECK
+#undef JSON_HEDLEY_IS_CONSTANT
+#undef JSON_HEDLEY_IS_CONSTEXPR_
+#undef JSON_HEDLEY_LIKELY
+#undef JSON_HEDLEY_MALLOC
+#undef JSON_HEDLEY_MCST_LCC_VERSION
+#undef JSON_HEDLEY_MCST_LCC_VERSION_CHECK
+#undef JSON_HEDLEY_MESSAGE
+#undef JSON_HEDLEY_MSVC_VERSION
+#undef JSON_HEDLEY_MSVC_VERSION_CHECK
+#undef JSON_HEDLEY_NEVER_INLINE
+#undef JSON_HEDLEY_NON_NULL
+#undef JSON_HEDLEY_NO_ESCAPE
+#undef JSON_HEDLEY_NO_RETURN
+#undef JSON_HEDLEY_NO_THROW
+#undef JSON_HEDLEY_NULL
+#undef JSON_HEDLEY_PELLES_VERSION
+#undef JSON_HEDLEY_PELLES_VERSION_CHECK
+#undef JSON_HEDLEY_PGI_VERSION
+#undef JSON_HEDLEY_PGI_VERSION_CHECK
+#undef JSON_HEDLEY_PREDICT
+#undef JSON_HEDLEY_PRINTF_FORMAT
+#undef JSON_HEDLEY_PRIVATE
+#undef JSON_HEDLEY_PUBLIC
+#undef JSON_HEDLEY_PURE
+#undef JSON_HEDLEY_REINTERPRET_CAST
+#undef JSON_HEDLEY_REQUIRE
+#undef JSON_HEDLEY_REQUIRE_CONSTEXPR
+#undef JSON_HEDLEY_REQUIRE_MSG
+#undef JSON_HEDLEY_RESTRICT
+#undef JSON_HEDLEY_RETURNS_NON_NULL
+#undef JSON_HEDLEY_SENTINEL
+#undef JSON_HEDLEY_STATIC_ASSERT
+#undef JSON_HEDLEY_STATIC_CAST
+#undef JSON_HEDLEY_STRINGIFY
+#undef JSON_HEDLEY_STRINGIFY_EX
+#undef JSON_HEDLEY_SUNPRO_VERSION
+#undef JSON_HEDLEY_SUNPRO_VERSION_CHECK
+#undef JSON_HEDLEY_TINYC_VERSION
+#undef JSON_HEDLEY_TINYC_VERSION_CHECK
+#undef JSON_HEDLEY_TI_ARMCL_VERSION
+#undef JSON_HEDLEY_TI_ARMCL_VERSION_CHECK
+#undef JSON_HEDLEY_TI_CL2000_VERSION
+#undef JSON_HEDLEY_TI_CL2000_VERSION_CHECK
+#undef JSON_HEDLEY_TI_CL430_VERSION
+#undef JSON_HEDLEY_TI_CL430_VERSION_CHECK
+#undef JSON_HEDLEY_TI_CL6X_VERSION
+#undef JSON_HEDLEY_TI_CL6X_VERSION_CHECK
+#undef JSON_HEDLEY_TI_CL7X_VERSION
+#undef JSON_HEDLEY_TI_CL7X_VERSION_CHECK
+#undef JSON_HEDLEY_TI_CLPRU_VERSION
+#undef JSON_HEDLEY_TI_CLPRU_VERSION_CHECK
+#undef JSON_HEDLEY_TI_VERSION
+#undef JSON_HEDLEY_TI_VERSION_CHECK
+#undef JSON_HEDLEY_UNAVAILABLE
+#undef JSON_HEDLEY_UNLIKELY
+#undef JSON_HEDLEY_UNPREDICTABLE
+#undef JSON_HEDLEY_UNREACHABLE
+#undef JSON_HEDLEY_UNREACHABLE_RETURN
+#undef JSON_HEDLEY_VERSION
+#undef JSON_HEDLEY_VERSION_DECODE_MAJOR
+#undef JSON_HEDLEY_VERSION_DECODE_MINOR
+#undef JSON_HEDLEY_VERSION_DECODE_REVISION
+#undef JSON_HEDLEY_VERSION_ENCODE
+#undef JSON_HEDLEY_WARNING
+#undef JSON_HEDLEY_WARN_UNUSED_RESULT
+#undef JSON_HEDLEY_WARN_UNUSED_RESULT_MSG
+#undef JSON_HEDLEY_FALL_THROUGH
+
+
+
+#endif // INCLUDE_NLOHMANN_JSON_HPP_
diff --git a/extern/tinygltf/README.blender b/extern/tinygltf/README.blender
new file mode 100644
index 00000000000..fe23d320b77
--- /dev/null
+++ b/extern/tinygltf/README.blender
@@ -0,0 +1,5 @@
+Project: TinyGLTF
+URL: https://github.com/syoyo/tinygltf
+License: MIT
+Upstream version: 2.5.0, 19a41d20ec0
+Local modifications: None
diff --git a/extern/tinygltf/tiny_gltf.h b/extern/tinygltf/tiny_gltf.h
new file mode 100644
index 00000000000..185bb0daa98
--- /dev/null
+++ b/extern/tinygltf/tiny_gltf.h
@@ -0,0 +1,7760 @@
+//
+// Header-only tiny glTF 2.0 loader and serializer.
+//
+//
+// The MIT License (MIT)
+//
+// Copyright (c) 2015 - Present Syoyo Fujita, Aurélien Chatelain and many
+// contributors.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+// Version:
+// - v2.5.0 Add SetPreserveImageChannels() option to load image data as is.
+//  - v2.4.3 Fix null object output when material has all default
+//  parameters.
+// - v2.4.2 Decode percent-encoded URI.
+//  - v2.4.1 Fix: some glTF object classes did not have the `extensions`
+//  and/or `extras` property.
+//  - v2.4.0 Experimental RapidJSON and C++14 support (Thanks to @jrkoone).
+// - v2.3.1 Set default value of minFilter and magFilter in Sampler to -1.
+// - v2.3.0 Modified Material representation according to glTF 2.0 schema
+// (and introduced TextureInfo class)
+//           Change the behavior of `Value::IsNumber`. It returns true if the
+//           value is either int or real.
+//  - v2.2.0 Add loading 16bit PNG support. Add Sparse accessor support
+//  (Thanks to @Ybalrid)
+// - v2.1.0 Add draco compression.
+//  - v2.0.1 Add comparison feature (Thanks to @Selmar).
+//  - v2.0.0 glTF 2.0!
+//
+// Tiny glTF loader uses the following third-party libraries:
+//
+// - jsonhpp: C++ JSON library.
+// - base64: base64 decode/encode library.
+// - stb_image: Image loading library.
+//
+#ifndef TINY_GLTF_H_
+#define TINY_GLTF_H_
+
+#include <array>
+#include <cassert>
+#include <cmath> // std::fabs
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <map>
+#include <string>
+#include <vector>
+
+#ifndef TINYGLTF_USE_CPP14
+#include <functional>
+#endif
+
+#ifdef __ANDROID__
+#ifdef TINYGLTF_ANDROID_LOAD_FROM_ASSETS
+#include <android/asset_manager.h>
+#endif
+#endif
+
+#ifdef __GNUC__
+#if (__GNUC__ < 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ <= 8))
+#define TINYGLTF_NOEXCEPT
+#else
+#define TINYGLTF_NOEXCEPT noexcept
+#endif
+#else
+#define TINYGLTF_NOEXCEPT noexcept
+#endif
+
+#define DEFAULT_METHODS(x) \
+ ~x() = default; \
+ x(const x &) = default; \
+ x(x &&) TINYGLTF_NOEXCEPT = default; \
+ x &operator=(const x &) = default; \
+ x &operator=(x &&) TINYGLTF_NOEXCEPT = default;
+
+namespace tinygltf {
+
+#define TINYGLTF_MODE_POINTS (0)
+#define TINYGLTF_MODE_LINE (1)
+#define TINYGLTF_MODE_LINE_LOOP (2)
+#define TINYGLTF_MODE_LINE_STRIP (3)
+#define TINYGLTF_MODE_TRIANGLES (4)
+#define TINYGLTF_MODE_TRIANGLE_STRIP (5)
+#define TINYGLTF_MODE_TRIANGLE_FAN (6)
+
+#define TINYGLTF_COMPONENT_TYPE_BYTE (5120)
+#define TINYGLTF_COMPONENT_TYPE_UNSIGNED_BYTE (5121)
+#define TINYGLTF_COMPONENT_TYPE_SHORT (5122)
+#define TINYGLTF_COMPONENT_TYPE_UNSIGNED_SHORT (5123)
+#define TINYGLTF_COMPONENT_TYPE_INT (5124)
+#define TINYGLTF_COMPONENT_TYPE_UNSIGNED_INT (5125)
+#define TINYGLTF_COMPONENT_TYPE_FLOAT (5126)
+#define TINYGLTF_COMPONENT_TYPE_DOUBLE (5130)
+
+#define TINYGLTF_TEXTURE_FILTER_NEAREST (9728)
+#define TINYGLTF_TEXTURE_FILTER_LINEAR (9729)
+#define TINYGLTF_TEXTURE_FILTER_NEAREST_MIPMAP_NEAREST (9984)
+#define TINYGLTF_TEXTURE_FILTER_LINEAR_MIPMAP_NEAREST (9985)
+#define TINYGLTF_TEXTURE_FILTER_NEAREST_MIPMAP_LINEAR (9986)
+#define TINYGLTF_TEXTURE_FILTER_LINEAR_MIPMAP_LINEAR (9987)
+
+#define TINYGLTF_TEXTURE_WRAP_REPEAT (10497)
+#define TINYGLTF_TEXTURE_WRAP_CLAMP_TO_EDGE (33071)
+#define TINYGLTF_TEXTURE_WRAP_MIRRORED_REPEAT (33648)
+
+// Redeclarations of the above for technique.parameters.
+#define TINYGLTF_PARAMETER_TYPE_BYTE (5120)
+#define TINYGLTF_PARAMETER_TYPE_UNSIGNED_BYTE (5121)
+#define TINYGLTF_PARAMETER_TYPE_SHORT (5122)
+#define TINYGLTF_PARAMETER_TYPE_UNSIGNED_SHORT (5123)
+#define TINYGLTF_PARAMETER_TYPE_INT (5124)
+#define TINYGLTF_PARAMETER_TYPE_UNSIGNED_INT (5125)
+#define TINYGLTF_PARAMETER_TYPE_FLOAT (5126)
+
+#define TINYGLTF_PARAMETER_TYPE_FLOAT_VEC2 (35664)
+#define TINYGLTF_PARAMETER_TYPE_FLOAT_VEC3 (35665)
+#define TINYGLTF_PARAMETER_TYPE_FLOAT_VEC4 (35666)
+
+#define TINYGLTF_PARAMETER_TYPE_INT_VEC2 (35667)
+#define TINYGLTF_PARAMETER_TYPE_INT_VEC3 (35668)
+#define TINYGLTF_PARAMETER_TYPE_INT_VEC4 (35669)
+
+#define TINYGLTF_PARAMETER_TYPE_BOOL (35670)
+#define TINYGLTF_PARAMETER_TYPE_BOOL_VEC2 (35671)
+#define TINYGLTF_PARAMETER_TYPE_BOOL_VEC3 (35672)
+#define TINYGLTF_PARAMETER_TYPE_BOOL_VEC4 (35673)
+
+#define TINYGLTF_PARAMETER_TYPE_FLOAT_MAT2 (35674)
+#define TINYGLTF_PARAMETER_TYPE_FLOAT_MAT3 (35675)
+#define TINYGLTF_PARAMETER_TYPE_FLOAT_MAT4 (35676)
+
+#define TINYGLTF_PARAMETER_TYPE_SAMPLER_2D (35678)
+
+// End parameter types
+
+#define TINYGLTF_TYPE_VEC2 (2)
+#define TINYGLTF_TYPE_VEC3 (3)
+#define TINYGLTF_TYPE_VEC4 (4)
+#define TINYGLTF_TYPE_MAT2 (32 + 2)
+#define TINYGLTF_TYPE_MAT3 (32 + 3)
+#define TINYGLTF_TYPE_MAT4 (32 + 4)
+#define TINYGLTF_TYPE_SCALAR (64 + 1)
+#define TINYGLTF_TYPE_VECTOR (64 + 4)
+#define TINYGLTF_TYPE_MATRIX (64 + 16)
+
+#define TINYGLTF_IMAGE_FORMAT_JPEG (0)
+#define TINYGLTF_IMAGE_FORMAT_PNG (1)
+#define TINYGLTF_IMAGE_FORMAT_BMP (2)
+#define TINYGLTF_IMAGE_FORMAT_GIF (3)
+
+#define TINYGLTF_TEXTURE_FORMAT_ALPHA (6406)
+#define TINYGLTF_TEXTURE_FORMAT_RGB (6407)
+#define TINYGLTF_TEXTURE_FORMAT_RGBA (6408)
+#define TINYGLTF_TEXTURE_FORMAT_LUMINANCE (6409)
+#define TINYGLTF_TEXTURE_FORMAT_LUMINANCE_ALPHA (6410)
+
+#define TINYGLTF_TEXTURE_TARGET_TEXTURE2D (3553)
+#define TINYGLTF_TEXTURE_TYPE_UNSIGNED_BYTE (5121)
+
+#define TINYGLTF_TARGET_ARRAY_BUFFER (34962)
+#define TINYGLTF_TARGET_ELEMENT_ARRAY_BUFFER (34963)
+
+#define TINYGLTF_SHADER_TYPE_VERTEX_SHADER (35633)
+#define TINYGLTF_SHADER_TYPE_FRAGMENT_SHADER (35632)
+
+#define TINYGLTF_DOUBLE_EPS (1.e-12)
+#define TINYGLTF_DOUBLE_EQUAL(a, b) (std::fabs((b) - (a)) < TINYGLTF_DOUBLE_EPS)
+
+#ifdef __ANDROID__
+#ifdef TINYGLTF_ANDROID_LOAD_FROM_ASSETS
+AAssetManager *asset_manager = nullptr;
+#endif
+#endif
+
+typedef enum {
+ NULL_TYPE,
+ REAL_TYPE,
+ INT_TYPE,
+ BOOL_TYPE,
+ STRING_TYPE,
+ ARRAY_TYPE,
+ BINARY_TYPE,
+ OBJECT_TYPE
+} Type;
+
+static inline int32_t GetComponentSizeInBytes(uint32_t componentType) {
+ if (componentType == TINYGLTF_COMPONENT_TYPE_BYTE) {
+ return 1;
+ } else if (componentType == TINYGLTF_COMPONENT_TYPE_UNSIGNED_BYTE) {
+ return 1;
+ } else if (componentType == TINYGLTF_COMPONENT_TYPE_SHORT) {
+ return 2;
+ } else if (componentType == TINYGLTF_COMPONENT_TYPE_UNSIGNED_SHORT) {
+ return 2;
+ } else if (componentType == TINYGLTF_COMPONENT_TYPE_INT) {
+ return 4;
+ } else if (componentType == TINYGLTF_COMPONENT_TYPE_UNSIGNED_INT) {
+ return 4;
+ } else if (componentType == TINYGLTF_COMPONENT_TYPE_FLOAT) {
+ return 4;
+ } else if (componentType == TINYGLTF_COMPONENT_TYPE_DOUBLE) {
+ return 8;
+ } else {
+    // Unknown component type
+ return -1;
+ }
+}
+
+static inline int32_t GetNumComponentsInType(uint32_t ty) {
+ if (ty == TINYGLTF_TYPE_SCALAR) {
+ return 1;
+ } else if (ty == TINYGLTF_TYPE_VEC2) {
+ return 2;
+ } else if (ty == TINYGLTF_TYPE_VEC3) {
+ return 3;
+ } else if (ty == TINYGLTF_TYPE_VEC4) {
+ return 4;
+ } else if (ty == TINYGLTF_TYPE_MAT2) {
+ return 4;
+ } else if (ty == TINYGLTF_TYPE_MAT3) {
+ return 9;
+ } else if (ty == TINYGLTF_TYPE_MAT4) {
+ return 16;
+ } else {
+    // Unknown type
+ return -1;
+ }
+}
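+
+// Usage sketch (illustrative; not part of the upstream header): the two
+// helpers above combine to give the byte size of one accessor element;
+// for example, a VEC3 of floats occupies 4 * 3 = 12 bytes.
+//
+//   int bytes = GetComponentSizeInBytes(TINYGLTF_COMPONENT_TYPE_FLOAT) *
+//               GetNumComponentsInType(TINYGLTF_TYPE_VEC3); // 12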
+
+// TODO(syoyo): Move these functions to TinyGLTF class
+bool IsDataURI(const std::string &in);
+bool DecodeDataURI(std::vector<unsigned char> *out, std::string &mime_type,
+ const std::string &in, size_t reqBytes, bool checkSize);
+
+#ifdef __clang__
+#pragma clang diagnostic push
+// Suppress warning for: static Value null_value
+// https://stackoverflow.com/questions/15708411/how-to-deal-with-global-constructor-warning-in-clang
+#pragma clang diagnostic ignored "-Wexit-time-destructors"
+#pragma clang diagnostic ignored "-Wpadded"
+#endif
+
+// Simple class to represent JSON object
+class Value {
+ public:
+ typedef std::vector<Value> Array;
+ typedef std::map<std::string, Value> Object;
+
+ Value()
+ : type_(NULL_TYPE),
+ int_value_(0),
+ real_value_(0.0),
+ boolean_value_(false) {}
+
+ explicit Value(bool b) : type_(BOOL_TYPE) { boolean_value_ = b; }
+ explicit Value(int i) : type_(INT_TYPE) {
+ int_value_ = i;
+ real_value_ = i;
+ }
+ explicit Value(double n) : type_(REAL_TYPE) { real_value_ = n; }
+ explicit Value(const std::string &s) : type_(STRING_TYPE) {
+ string_value_ = s;
+ }
+ explicit Value(std::string &&s)
+ : type_(STRING_TYPE), string_value_(std::move(s)) {}
+ explicit Value(const unsigned char *p, size_t n) : type_(BINARY_TYPE) {
+ binary_value_.resize(n);
+ memcpy(binary_value_.data(), p, n);
+ }
+ explicit Value(std::vector<unsigned char> &&v) noexcept
+ : type_(BINARY_TYPE),
+ binary_value_(std::move(v)) {}
+ explicit Value(const Array &a) : type_(ARRAY_TYPE) { array_value_ = a; }
+ explicit Value(Array &&a) noexcept : type_(ARRAY_TYPE),
+ array_value_(std::move(a)) {}
+
+ explicit Value(const Object &o) : type_(OBJECT_TYPE) { object_value_ = o; }
+ explicit Value(Object &&o) noexcept : type_(OBJECT_TYPE),
+ object_value_(std::move(o)) {}
+
+ DEFAULT_METHODS(Value)
+
+ char Type() const { return static_cast<char>(type_); }
+
+ bool IsBool() const { return (type_ == BOOL_TYPE); }
+
+ bool IsInt() const { return (type_ == INT_TYPE); }
+
+ bool IsNumber() const { return (type_ == REAL_TYPE) || (type_ == INT_TYPE); }
+
+ bool IsReal() const { return (type_ == REAL_TYPE); }
+
+ bool IsString() const { return (type_ == STRING_TYPE); }
+
+ bool IsBinary() const { return (type_ == BINARY_TYPE); }
+
+ bool IsArray() const { return (type_ == ARRAY_TYPE); }
+
+ bool IsObject() const { return (type_ == OBJECT_TYPE); }
+
+  // Use this function if you want the number value as a double.
+ double GetNumberAsDouble() const {
+ if (type_ == INT_TYPE) {
+ return double(int_value_);
+ } else {
+ return real_value_;
+ }
+ }
+
+  // Use this function if you want the number value as an int.
+ // TODO(syoyo): Support int value larger than 32 bits
+ int GetNumberAsInt() const {
+ if (type_ == REAL_TYPE) {
+ return int(real_value_);
+ } else {
+ return int_value_;
+ }
+ }
+
+ // Accessor
+ template <typename T>
+ const T &Get() const;
+ template <typename T>
+ T &Get();
+
+ // Lookup value from an array
+ const Value &Get(int idx) const {
+ static Value null_value;
+ assert(IsArray());
+ assert(idx >= 0);
+ return (static_cast<size_t>(idx) < array_value_.size())
+ ? array_value_[static_cast<size_t>(idx)]
+ : null_value;
+ }
+
+ // Lookup value from a key-value pair
+ const Value &Get(const std::string &key) const {
+ static Value null_value;
+ assert(IsObject());
+ Object::const_iterator it = object_value_.find(key);
+ return (it != object_value_.end()) ? it->second : null_value;
+ }
+
+ size_t ArrayLen() const {
+ if (!IsArray()) return 0;
+ return array_value_.size();
+ }
+
+ // Valid only for object type.
+ bool Has(const std::string &key) const {
+ if (!IsObject()) return false;
+ Object::const_iterator it = object_value_.find(key);
+ return (it != object_value_.end()) ? true : false;
+ }
+
+ // List keys
+ std::vector<std::string> Keys() const {
+ std::vector<std::string> keys;
+ if (!IsObject()) return keys; // empty
+
+ for (Object::const_iterator it = object_value_.begin();
+ it != object_value_.end(); ++it) {
+ keys.push_back(it->first);
+ }
+
+ return keys;
+ }
+
+ size_t Size() const { return (IsArray() ? ArrayLen() : Keys().size()); }
+
+ bool operator==(const tinygltf::Value &other) const;
+
+ protected:
+ int type_ = NULL_TYPE;
+
+ int int_value_ = 0;
+ double real_value_ = 0.0;
+ std::string string_value_;
+ std::vector<unsigned char> binary_value_;
+ Array array_value_;
+ Object object_value_;
+ bool boolean_value_ = false;
+};
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+#define TINYGLTF_VALUE_GET(ctype, var) \
+ template <> \
+ inline const ctype &Value::Get<ctype>() const { \
+ return var; \
+ } \
+ template <> \
+ inline ctype &Value::Get<ctype>() { \
+ return var; \
+ }
+TINYGLTF_VALUE_GET(bool, boolean_value_)
+TINYGLTF_VALUE_GET(double, real_value_)
+TINYGLTF_VALUE_GET(int, int_value_)
+TINYGLTF_VALUE_GET(std::string, string_value_)
+TINYGLTF_VALUE_GET(std::vector<unsigned char>, binary_value_)
+TINYGLTF_VALUE_GET(Value::Array, array_value_)
+TINYGLTF_VALUE_GET(Value::Object, object_value_)
+#undef TINYGLTF_VALUE_GET
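+
+// Illustrative usage sketch for Value (values hypothetical): build a small
+// JSON-like object and query it through the accessors declared above.
+//
+//   tinygltf::Value::Object obj;
+//   obj["scale"] = tinygltf::Value(2.0);
+//   obj["name"] = tinygltf::Value(std::string("node0"));
+//   tinygltf::Value v(obj);
+//   if (v.IsObject() && v.Has("scale")) {
+//     double s = v.Get("scale").GetNumberAsDouble();  // 2.0
+//   }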
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#pragma clang diagnostic ignored "-Wpadded"
+#endif
+
+/// Aggregate object for representing a color
+using ColorValue = std::array<double, 4>;
+
+// === legacy interface ====
+// TODO(syoyo): Deprecate `Parameter` class.
+struct Parameter {
+ bool bool_value = false;
+ bool has_number_value = false;
+ std::string string_value;
+ std::vector<double> number_array;
+ std::map<std::string, double> json_double_value;
+ double number_value = 0.0;
+
+ // Context-sensitive methods. Depending on the type of the Parameter you are
+ // accessing, these are either valid or not.
+ // If this parameter represents a texture map in a material, TextureIndex()
+ // will return the texture index.
+
+ /// Return the index of a texture if this Parameter is a texture map.
+ /// Returned value is only valid if the parameter represents a texture from
+ /// a material.
+ int TextureIndex() const {
+ const auto it = json_double_value.find("index");
+ if (it != std::end(json_double_value)) {
+ return int(it->second);
+ }
+ return -1;
+ }
+
+ /// Return the index of a texture coordinate set if this Parameter is a
+ /// texture map. Returned value is only valid if the parameter represents a
+ /// texture from a material.
+ int TextureTexCoord() const {
+ const auto it = json_double_value.find("texCoord");
+ if (it != std::end(json_double_value)) {
+ return int(it->second);
+ }
+ // As per the spec, if texCoord is omitted, this parameter defaults to 0
+ return 0;
+ }
+
+ /// Return the scale of a texture if this Parameter is a normal texture map.
+ /// Returned value is only valid if the parameter represents a normal
+ /// texture from a material.
+ double TextureScale() const {
+ const auto it = json_double_value.find("scale");
+ if (it != std::end(json_double_value)) {
+ return it->second;
+ }
+ // As per the spec, if scale is omitted, this parameter defaults to 1
+ return 1;
+ }
+
+ /// Return the strength of a texture if this Parameter is an occlusion map.
+ /// Returned value is only valid if the parameter represents an occlusion
+ /// map from a material.
+ double TextureStrength() const {
+ const auto it = json_double_value.find("strength");
+ if (it != std::end(json_double_value)) {
+ return it->second;
+ }
+ // As per the spec, if strength is omitted, this parameter defaults to 1
+ return 1;
+ }
+
+ /// Material factor, like the roughness or metalness of a material
+ /// Returned value is only valid if the parameter represents a factor of a
+ /// material.
+ double Factor() const { return number_value; }
+
+ /// Return the color of a material
+ /// Returned value is only valid if the parameter represents a color factor
+ /// of a material.
+ ColorValue ColorFactor() const {
+ return {
+ {// This aggregate initializes the std::array object, and uses C++11 RVO.
+ number_array[0], number_array[1], number_array[2],
+ (number_array.size() > 3 ? number_array[3] : 1.0)}};
+ }
+
+ Parameter() = default;
+ DEFAULT_METHODS(Parameter)
+ bool operator==(const Parameter &) const;
+};
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
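+
+// Illustrative sketch for the legacy Parameter interface (values
+// hypothetical): a parameter describing a texture map stores its indices in
+// `json_double_value`, which the helpers above read back.
+//
+//   tinygltf::Parameter p;
+//   p.json_double_value["index"] = 2;     // texture index
+//   p.json_double_value["texCoord"] = 0;  // TEXCOORD_0
+//   int tex = p.TextureIndex();     // 2
+//   int uv = p.TextureTexCoord();   // 0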
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+#endif
+
+typedef std::map<std::string, Parameter> ParameterMap;
+typedef std::map<std::string, Value> ExtensionMap;
+
+struct AnimationChannel {
+ int sampler; // required
+ int target_node; // required (index of the node to target)
+ std::string target_path; // required in ["translation", "rotation", "scale",
+ // "weights"]
+ Value extras;
+ ExtensionMap extensions;
+ ExtensionMap target_extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+ std::string target_extensions_json_string;
+
+ AnimationChannel() : sampler(-1), target_node(-1) {}
+ DEFAULT_METHODS(AnimationChannel)
+ bool operator==(const AnimationChannel &) const;
+};
+
+struct AnimationSampler {
+ int input; // required
+ int output; // required
+ std::string interpolation; // "LINEAR", "STEP","CUBICSPLINE" or user defined
+ // string. default "LINEAR"
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ AnimationSampler() : input(-1), output(-1), interpolation("LINEAR") {}
+ DEFAULT_METHODS(AnimationSampler)
+ bool operator==(const AnimationSampler &) const;
+};
+
+struct Animation {
+ std::string name;
+ std::vector<AnimationChannel> channels;
+ std::vector<AnimationSampler> samplers;
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ Animation() = default;
+ DEFAULT_METHODS(Animation)
+ bool operator==(const Animation &) const;
+};
+
+struct Skin {
+ std::string name;
+ int inverseBindMatrices; // required here but not in the spec
+ int skeleton; // The index of the node used as a skeleton root
+ std::vector<int> joints; // Indices of skeleton nodes
+
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ Skin() {
+ inverseBindMatrices = -1;
+ skeleton = -1;
+ }
+ DEFAULT_METHODS(Skin)
+ bool operator==(const Skin &) const;
+};
+
+struct Sampler {
+ std::string name;
+ // The glTF 2.0 spec does not define default values for `minFilter` and
+ // `magFilter`. They are set to -1 in TinyGLTF (issue #186).
+ int minFilter =
+ -1; // optional. -1 = no filter defined. ["NEAREST", "LINEAR",
+ // "NEAREST_MIPMAP_NEAREST", "LINEAR_MIPMAP_NEAREST",
+ // "NEAREST_MIPMAP_LINEAR", "LINEAR_MIPMAP_LINEAR"]
+ int magFilter =
+ -1; // optional. -1 = no filter defined. ["NEAREST", "LINEAR"]
+ int wrapS =
+ TINYGLTF_TEXTURE_WRAP_REPEAT; // ["CLAMP_TO_EDGE", "MIRRORED_REPEAT",
+ // "REPEAT"], default "REPEAT"
+ int wrapT =
+ TINYGLTF_TEXTURE_WRAP_REPEAT; // ["CLAMP_TO_EDGE", "MIRRORED_REPEAT",
+ // "REPEAT"], default "REPEAT"
+ //int wrapR = TINYGLTF_TEXTURE_WRAP_REPEAT; // TinyGLTF extension. currently not used.
+
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ Sampler()
+ : minFilter(-1),
+ magFilter(-1),
+ wrapS(TINYGLTF_TEXTURE_WRAP_REPEAT),
+ wrapT(TINYGLTF_TEXTURE_WRAP_REPEAT) {}
+ DEFAULT_METHODS(Sampler)
+ bool operator==(const Sampler &) const;
+};
+
+struct Image {
+ std::string name;
+ int width;
+ int height;
+ int component;
+ int bits; // bit depth per channel. 8(byte), 16 or 32.
+ int pixel_type; // pixel type(TINYGLTF_COMPONENT_TYPE_***). usually
+ // UBYTE(bits = 8) or USHORT(bits = 16)
+ std::vector<unsigned char> image;
+ int bufferView; // (required if no uri)
+ std::string mimeType; // (required if no uri) ["image/jpeg", "image/png",
+ // "image/bmp", "image/gif"]
+ std::string uri; // (required if no mimeType) uri is not decoded(e.g.
+ // whitespace may be represented as %20)
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ // When this flag is true, data is stored to `image` in as-is format (e.g.
+ // jpeg-compressed for "image/jpeg" mime). This feature is useful if you use
+ // a custom image loader function (e.g. delayed decoding of images for faster
+ // glTF parsing). The default parser for Image does not provide as-is loading
+ // at the moment. (You can control this by providing your own LoadImageData
+ // function.)
+ bool as_is;
+
+ Image() : as_is(false) {
+ bufferView = -1;
+ width = -1;
+ height = -1;
+ component = -1;
+ bits = -1;
+ pixel_type = -1;
+ }
+ DEFAULT_METHODS(Image)
+
+ bool operator==(const Image &) const;
+};
+
+struct Texture {
+ std::string name;
+
+ int sampler;
+ int source;
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ Texture() : sampler(-1), source(-1) {}
+ DEFAULT_METHODS(Texture)
+
+ bool operator==(const Texture &) const;
+};
+
+struct TextureInfo {
+ int index = -1; // required.
+ int texCoord; // The set index of texture's TEXCOORD attribute used for
+ // texture coordinate mapping.
+
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ TextureInfo() : index(-1), texCoord(0) {}
+ DEFAULT_METHODS(TextureInfo)
+ bool operator==(const TextureInfo &) const;
+};
+
+struct NormalTextureInfo {
+ int index = -1; // required
+ int texCoord; // The set index of texture's TEXCOORD attribute used for
+ // texture coordinate mapping.
+ double scale; // scaledNormal = normalize((<sampled normal texture value>
+ // * 2.0 - 1.0) * vec3(<normal scale>, <normal scale>, 1.0))
+
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ NormalTextureInfo() : index(-1), texCoord(0), scale(1.0) {}
+ DEFAULT_METHODS(NormalTextureInfo)
+ bool operator==(const NormalTextureInfo &) const;
+};
+
+struct OcclusionTextureInfo {
+ int index = -1; // required
+ int texCoord; // The set index of texture's TEXCOORD attribute used for
+ // texture coordinate mapping.
+ double strength; // occludedColor = lerp(color, color * <sampled occlusion
+ // texture value>, <occlusion strength>)
+
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ OcclusionTextureInfo() : index(-1), texCoord(0), strength(1.0) {}
+ DEFAULT_METHODS(OcclusionTextureInfo)
+ bool operator==(const OcclusionTextureInfo &) const;
+};
+
+// pbrMetallicRoughness class defined in glTF 2.0 spec.
+struct PbrMetallicRoughness {
+ std::vector<double> baseColorFactor; // len = 4. default [1,1,1,1]
+ TextureInfo baseColorTexture;
+ double metallicFactor; // default 1
+ double roughnessFactor; // default 1
+ TextureInfo metallicRoughnessTexture;
+
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ PbrMetallicRoughness()
+ : baseColorFactor(std::vector<double>{1.0, 1.0, 1.0, 1.0}),
+ metallicFactor(1.0),
+ roughnessFactor(1.0) {}
+ DEFAULT_METHODS(PbrMetallicRoughness)
+ bool operator==(const PbrMetallicRoughness &) const;
+};
+
+// Each extension should be stored in a ParameterMap.
+// Members not in `values` could be included in the ParameterMap
+// to keep a single material model.
+struct Material {
+ std::string name;
+
+ std::vector<double> emissiveFactor; // length 3. default [0, 0, 0]
+ std::string alphaMode; // default "OPAQUE"
+ double alphaCutoff; // default 0.5
+ bool doubleSided; // default false;
+
+ PbrMetallicRoughness pbrMetallicRoughness;
+
+ NormalTextureInfo normalTexture;
+ OcclusionTextureInfo occlusionTexture;
+ TextureInfo emissiveTexture;
+
+ // For backward compatibility
+ // TODO(syoyo): Remove `values` and `additionalValues` in the next release.
+ ParameterMap values;
+ ParameterMap additionalValues;
+
+ ExtensionMap extensions;
+ Value extras;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ Material() : alphaMode("OPAQUE"), alphaCutoff(0.5), doubleSided(false) {}
+ DEFAULT_METHODS(Material)
+
+ bool operator==(const Material &) const;
+};
+
+struct BufferView {
+ std::string name;
+ int buffer{-1}; // Required
+ size_t byteOffset{0}; // minimum 0, default 0
+ size_t byteLength{0}; // required, minimum 1. 0 = invalid
+ size_t byteStride{0}; // minimum 4, maximum 252 (multiple of 4), default 0 =
+ // understood to be tightly packed
+ int target{0}; // ["ARRAY_BUFFER", "ELEMENT_ARRAY_BUFFER"] for vertex indices
+ // or attributes. Could be 0 for other data.
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ bool dracoDecoded{false}; // Flag indicating this has been draco decoded
+
+ BufferView()
+ : buffer(-1),
+ byteOffset(0),
+ byteLength(0),
+ byteStride(0),
+ target(0),
+ dracoDecoded(false) {}
+ DEFAULT_METHODS(BufferView)
+ bool operator==(const BufferView &) const;
+};
+
+struct Accessor {
+ int bufferView; // optional in spec but required here since sparse accessors
+ // are not supported
+ std::string name;
+ size_t byteOffset;
+ bool normalized; // optional.
+ int componentType; // (required) One of TINYGLTF_COMPONENT_TYPE_***
+ size_t count; // required
+ int type; // (required) One of TINYGLTF_TYPE_*** ..
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ std::vector<double>
+ minValues; // optional. integer value is promoted to double
+ std::vector<double>
+ maxValues; // optional. integer value is promoted to double
+
+ struct {
+ int count;
+ bool isSparse;
+ struct {
+ int byteOffset;
+ int bufferView;
+ int componentType; // a TINYGLTF_COMPONENT_TYPE_ value
+ } indices;
+ struct {
+ int bufferView;
+ int byteOffset;
+ } values;
+ } sparse;
+
+ ///
+ /// Utility function to compute byteStride for a given bufferView object.
+ /// Returns -1 upon invalid glTF value or parameter configuration.
+ ///
+ int ByteStride(const BufferView &bufferViewObject) const {
+ if (bufferViewObject.byteStride == 0) {
+ // Assume data is tightly packed.
+ int componentSizeInBytes =
+ GetComponentSizeInBytes(static_cast<uint32_t>(componentType));
+ if (componentSizeInBytes <= 0) {
+ return -1;
+ }
+
+ int numComponents = GetNumComponentsInType(static_cast<uint32_t>(type));
+ if (numComponents <= 0) {
+ return -1;
+ }
+
+ return componentSizeInBytes * numComponents;
+ } else {
+ // Check if byteStride is a multiple of the size of the accessor's component
+ // type.
+ int componentSizeInBytes =
+ GetComponentSizeInBytes(static_cast<uint32_t>(componentType));
+ if (componentSizeInBytes <= 0) {
+ return -1;
+ }
+
+ if ((bufferViewObject.byteStride % uint32_t(componentSizeInBytes)) != 0) {
+ return -1;
+ }
+ return static_cast<int>(bufferViewObject.byteStride);
+ }
+
+ // unreachable return 0;
+ }
+
+ Accessor()
+ : bufferView(-1),
+ byteOffset(0),
+ normalized(false),
+ componentType(-1),
+ count(0),
+ type(-1) {
+ sparse.isSparse = false;
+ }
+ DEFAULT_METHODS(Accessor)
+ bool operator==(const tinygltf::Accessor &) const;
+};
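+
+// Illustrative sketch (values hypothetical): ByteStride() resolves the
+// effective stride of an accessor, falling back to tight packing when the
+// bufferView does not specify one.
+//
+//   tinygltf::Accessor acc;
+//   acc.componentType = TINYGLTF_COMPONENT_TYPE_FLOAT;
+//   acc.type = TINYGLTF_TYPE_VEC3;
+//   tinygltf::BufferView bv;          // byteStride == 0 -> tightly packed
+//   int stride = acc.ByteStride(bv);  // 4 * 3 = 12 bytes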
+
+struct PerspectiveCamera {
+ double aspectRatio; // min > 0
+ double yfov; // required. min > 0
+ double zfar; // min > 0
+ double znear; // required. min > 0
+
+ PerspectiveCamera()
+ : aspectRatio(0.0),
+ yfov(0.0),
+ zfar(0.0), // 0 = use infinite projection matrix
+ znear(0.0) {}
+ DEFAULT_METHODS(PerspectiveCamera)
+ bool operator==(const PerspectiveCamera &) const;
+
+ ExtensionMap extensions;
+ Value extras;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+};
+
+struct OrthographicCamera {
+ double xmag; // required. must not be zero.
+ double ymag; // required. must not be zero.
+ double zfar; // required. `zfar` must be greater than `znear`.
+ double znear; // required
+
+ OrthographicCamera() : xmag(0.0), ymag(0.0), zfar(0.0), znear(0.0) {}
+ DEFAULT_METHODS(OrthographicCamera)
+ bool operator==(const OrthographicCamera &) const;
+
+ ExtensionMap extensions;
+ Value extras;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+};
+
+struct Camera {
+ std::string type; // required. "perspective" or "orthographic"
+ std::string name;
+
+ PerspectiveCamera perspective;
+ OrthographicCamera orthographic;
+
+ Camera() {}
+ DEFAULT_METHODS(Camera)
+ bool operator==(const Camera &) const;
+
+ ExtensionMap extensions;
+ Value extras;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+};
+
+struct Primitive {
+ std::map<std::string, int> attributes; // (required) A dictionary object,
+ // mapping attribute names to
+ // the indices of the accessors
+ // containing their data.
+ int material; // The index of the material to apply to this primitive
+ // when rendering.
+ int indices; // The index of the accessor that contains the indices.
+ int mode; // one of TINYGLTF_MODE_***
+ std::vector<std::map<std::string, int> > targets; // array of morph targets,
+ // where each target is a dict with attributes in ["POSITION", "NORMAL",
+ // "TANGENT"] pointing
+ // to their corresponding accessors
+ ExtensionMap extensions;
+ Value extras;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ Primitive() {
+ material = -1;
+ indices = -1;
+ mode = -1;
+ }
+ DEFAULT_METHODS(Primitive)
+ bool operator==(const Primitive &) const;
+};
+
+struct Mesh {
+ std::string name;
+ std::vector<Primitive> primitives;
+ std::vector<double> weights; // weights to be applied to the Morph Targets
+ ExtensionMap extensions;
+ Value extras;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ Mesh() = default;
+ DEFAULT_METHODS(Mesh)
+ bool operator==(const Mesh &) const;
+};
+
+class Node {
+ public:
+ Node() : camera(-1), skin(-1), mesh(-1) {}
+
+ DEFAULT_METHODS(Node)
+
+ bool operator==(const Node &) const;
+
+ int camera; // the index of the camera referenced by this node
+
+ std::string name;
+ int skin;
+ int mesh;
+ std::vector<int> children;
+ std::vector<double> rotation; // length must be 0 or 4
+ std::vector<double> scale; // length must be 0 or 3
+ std::vector<double> translation; // length must be 0 or 3
+ std::vector<double> matrix; // length must be 0 or 16
+ std::vector<double> weights; // The weights of the instantiated Morph Target
+
+ ExtensionMap extensions;
+ Value extras;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+};
+
+struct Buffer {
+ std::string name;
+ std::vector<unsigned char> data;
+ std::string
+ uri; // considered as required here but not in the spec (need to clarify)
+ // uri is not decoded(e.g. whitespace may be represented as %20)
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ Buffer() = default;
+ DEFAULT_METHODS(Buffer)
+ bool operator==(const Buffer &) const;
+};
+
+struct Asset {
+ std::string version = "2.0"; // required
+ std::string generator;
+ std::string minVersion;
+ std::string copyright;
+ ExtensionMap extensions;
+ Value extras;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ Asset() = default;
+ DEFAULT_METHODS(Asset)
+ bool operator==(const Asset &) const;
+};
+
+struct Scene {
+ std::string name;
+ std::vector<int> nodes;
+
+ ExtensionMap extensions;
+ Value extras;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+
+ Scene() = default;
+ DEFAULT_METHODS(Scene)
+ bool operator==(const Scene &) const;
+};
+
+struct SpotLight {
+ double innerConeAngle;
+ double outerConeAngle;
+
+ SpotLight() : innerConeAngle(0.0), outerConeAngle(0.7853981634) {}
+ DEFAULT_METHODS(SpotLight)
+ bool operator==(const SpotLight &) const;
+
+ ExtensionMap extensions;
+ Value extras;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+};
+
+struct Light {
+ std::string name;
+ std::vector<double> color;
+ double intensity{1.0};
+ std::string type;
+ double range{0.0}; // 0.0 = infinite
+ SpotLight spot;
+
+ Light() : intensity(1.0), range(0.0) {}
+ DEFAULT_METHODS(Light)
+
+ bool operator==(const Light &) const;
+
+ ExtensionMap extensions;
+ Value extras;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+};
+
+class Model {
+ public:
+ Model() = default;
+ DEFAULT_METHODS(Model)
+
+ bool operator==(const Model &) const;
+
+ std::vector<Accessor> accessors;
+ std::vector<Animation> animations;
+ std::vector<Buffer> buffers;
+ std::vector<BufferView> bufferViews;
+ std::vector<Material> materials;
+ std::vector<Mesh> meshes;
+ std::vector<Node> nodes;
+ std::vector<Texture> textures;
+ std::vector<Image> images;
+ std::vector<Skin> skins;
+ std::vector<Sampler> samplers;
+ std::vector<Camera> cameras;
+ std::vector<Scene> scenes;
+ std::vector<Light> lights;
+
+ int defaultScene = -1;
+ std::vector<std::string> extensionsUsed;
+ std::vector<std::string> extensionsRequired;
+
+ Asset asset;
+
+ Value extras;
+ ExtensionMap extensions;
+
+ // Filled when SetStoreOriginalJSONForExtrasAndExtensions is enabled.
+ std::string extras_json_string;
+ std::string extensions_json_string;
+};
+
+enum SectionCheck {
+ NO_REQUIRE = 0x00,
+ REQUIRE_VERSION = 0x01,
+ REQUIRE_SCENE = 0x02,
+ REQUIRE_SCENES = 0x04,
+ REQUIRE_NODES = 0x08,
+ REQUIRE_ACCESSORS = 0x10,
+ REQUIRE_BUFFERS = 0x20,
+ REQUIRE_BUFFER_VIEWS = 0x40,
+ REQUIRE_ALL = 0x7f
+};
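+
+// Illustrative sketch: SectionCheck values are bit flags and may be OR-ed
+// together to require several sections at once, e.g.
+//
+//   unsigned int checks = REQUIRE_VERSION | REQUIRE_SCENES | REQUIRE_NODES;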
+
+///
+/// LoadImageDataFunction type. Signature for custom image loading callbacks.
+///
+typedef bool (*LoadImageDataFunction)(Image *, const int, std::string *,
+ std::string *, int, int,
+ const unsigned char *, int,
+ void *user_pointer);
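+
+// Illustrative sketch of a custom image loading callback matching the
+// signature above (function name hypothetical). Returning true without
+// decoding anything effectively skips image loading, e.g. for delayed
+// decoding:
+//
+//   bool SkipImageLoad(tinygltf::Image * /*image*/, const int /*image_idx*/,
+//                      std::string * /*err*/, std::string * /*warn*/,
+//                      int /*req_width*/, int /*req_height*/,
+//                      const unsigned char * /*bytes*/, int /*size*/,
+//                      void * /*user_pointer*/) {
+//     return true;  // report success; the image stays undecoded
+//   }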
+
+///
+/// WriteImageDataFunction type. Signature for custom image writing callbacks.
+///
+typedef bool (*WriteImageDataFunction)(const std::string *, const std::string *,
+ Image *, bool, void *);
+
+#ifndef TINYGLTF_NO_STB_IMAGE
+// Declaration of default image loader callback
+bool LoadImageData(Image *image, const int image_idx, std::string *err,
+ std::string *warn, int req_width, int req_height,
+ const unsigned char *bytes, int size, void *);
+#endif
+
+#ifndef TINYGLTF_NO_STB_IMAGE_WRITE
+// Declaration of default image writer callback
+bool WriteImageData(const std::string *basepath, const std::string *filename,
+ Image *image, bool embedImages, void *);
+#endif
+
+///
+/// FileExistsFunction type. Signature for custom filesystem callbacks.
+///
+typedef bool (*FileExistsFunction)(const std::string &abs_filename, void *);
+
+///
+/// ExpandFilePathFunction type. Signature for custom filesystem callbacks.
+///
+typedef std::string (*ExpandFilePathFunction)(const std::string &, void *);
+
+///
+/// ReadWholeFileFunction type. Signature for custom filesystem callbacks.
+///
+typedef bool (*ReadWholeFileFunction)(std::vector<unsigned char> *,
+ std::string *, const std::string &,
+ void *);
+
+///
+/// WriteWholeFileFunction type. Signature for custom filesystem callbacks.
+///
+typedef bool (*WriteWholeFileFunction)(std::string *, const std::string &,
+ const std::vector<unsigned char> &,
+ void *);
+
+///
+/// A structure containing all required filesystem callbacks and a pointer to
+/// their user data.
+///
+struct FsCallbacks {
+ FileExistsFunction FileExists;
+ ExpandFilePathFunction ExpandFilePath;
+ ReadWholeFileFunction ReadWholeFile;
+ WriteWholeFileFunction WriteWholeFile;
+
+ void *user_data; // An argument that is passed to all fs callbacks
+};
+
+#ifndef TINYGLTF_NO_FS
+// Declaration of default filesystem callbacks
+
+bool FileExists(const std::string &abs_filename, void *);
+
+///
+/// Expand a file path (e.g. `~` to the home directory on POSIX, `%APPDATA%`
+/// to `C:\\Users\\tinygltf\\AppData` on Windows).
+///
+/// @param[in] filepath File path string. Assume UTF-8
+/// @param[in] userdata User data. Set to `nullptr` if you don't need it.
+///
+std::string ExpandFilePath(const std::string &filepath, void *userdata);
+
+bool ReadWholeFile(std::vector<unsigned char> *out, std::string *err,
+ const std::string &filepath, void *);
+
+bool WriteWholeFile(std::string *err, const std::string &filepath,
+ const std::vector<unsigned char> &contents, void *);
+#endif
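+
+// Illustrative sketch (assumes TINYGLTF_NO_FS is not defined): FsCallbacks
+// can be populated with the built-in filesystem functions declared above and
+// an optional user data pointer.
+//
+//   tinygltf::FsCallbacks cb;
+//   cb.FileExists = &tinygltf::FileExists;
+//   cb.ExpandFilePath = &tinygltf::ExpandFilePath;
+//   cb.ReadWholeFile = &tinygltf::ReadWholeFile;
+//   cb.WriteWholeFile = &tinygltf::WriteWholeFile;
+//   cb.user_data = nullptr;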
+
+///
+/// glTF Parser/Serializer context.
+///
+class TinyGLTF {
+ public:
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#endif
+
+ TinyGLTF() : bin_data_(nullptr), bin_size_(0), is_binary_(false) {}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+ ~TinyGLTF() {}
+
+ ///
+ /// Loads glTF ASCII asset from a file.
+ /// Sets a warning message to `warn` if, for example, it fails to load assets.
+ /// Returns false and sets an error string to `err` if there's an error.
+ ///
+ bool LoadASCIIFromFile(Model *model, std::string *err, std::string *warn,
+ const std::string &filename,
+ unsigned int check_sections = REQUIRE_VERSION);
+
+ ///
+ /// Loads glTF ASCII asset from a string (in memory).
+ /// `length` = strlen(str);
+ /// Sets a warning message to `warn` if, for example, it fails to load assets.
+ /// Returns false and sets an error string to `err` if there's an error.
+ ///
+ bool LoadASCIIFromString(Model *model, std::string *err, std::string *warn,
+ const char *str, const unsigned int length,
+ const std::string &base_dir,
+ unsigned int check_sections = REQUIRE_VERSION);
+
+ ///
+ /// Loads glTF binary asset from a file.
+ /// Sets a warning message to `warn` if, for example, it fails to load assets.
+ /// Returns false and sets an error string to `err` if there's an error.
+ ///
+ bool LoadBinaryFromFile(Model *model, std::string *err, std::string *warn,
+ const std::string &filename,
+ unsigned int check_sections = REQUIRE_VERSION);
+
+ ///
+ /// Loads glTF binary asset from memory.
+ /// `length` = the size of `bytes`, in bytes.
+ /// Sets a warning message to `warn` if, for example, it fails to load assets.
+ /// Returns false and sets an error string to `err` if there's an error.
+ ///
+ bool LoadBinaryFromMemory(Model *model, std::string *err, std::string *warn,
+ const unsigned char *bytes,
+ const unsigned int length,
+ const std::string &base_dir = "",
+ unsigned int check_sections = REQUIRE_VERSION);
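+
+ // Illustrative usage sketch (file name hypothetical):
+ //
+ //   tinygltf::TinyGLTF loader;
+ //   tinygltf::Model model;
+ //   std::string err, warn;
+ //   bool ok = loader.LoadASCIIFromFile(&model, &err, &warn, "scene.gltf");
+ //   if (!warn.empty()) { /* report warning */ }
+ //   if (!ok) { /* report err and bail out */ }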
+
+ ///
+ /// Write glTF to stream, buffers and images will be embedded
+ ///
+ bool WriteGltfSceneToStream(Model *model, std::ostream &stream,
+ bool prettyPrint, bool writeBinary);
+
+ ///
+ /// Write glTF to file.
+ ///
+ bool WriteGltfSceneToFile(Model *model, const std::string &filename,
+ bool embedImages, bool embedBuffers,
+ bool prettyPrint, bool writeBinary);
+
+ ///
+ /// Set callback to use for loading image data
+ ///
+ void SetImageLoader(LoadImageDataFunction LoadImageData, void *user_data);
+
+ ///
+ /// Unset(remove) callback of loading image data
+ ///
+ void RemoveImageLoader();
+
+ ///
+ /// Set callback to use for writing image data
+ ///
+ void SetImageWriter(WriteImageDataFunction WriteImageData, void *user_data);
+
+ ///
+ /// Set callbacks to use for filesystem (fs) access and their user data
+ ///
+ void SetFsCallbacks(FsCallbacks callbacks);
+
+ ///
+ /// Set serializing of default values (default = false).
+ /// When true, default values are forcibly serialized to .glTF.
+ /// This may be helpful if you want to serialize a full description of glTF
+ /// data.
+ ///
+ /// TODO(LTE): Supply parsing option as function arguments to
+ /// `LoadASCIIFromFile()` and others, not by a class method
+ ///
+ void SetSerializeDefaultValues(const bool enabled) {
+ serialize_default_values_ = enabled;
+ }
+
+ bool GetSerializeDefaultValues() const { return serialize_default_values_; }
+
+ ///
+ /// Store original JSON string for `extras` and `extensions`.
+ /// This feature will be useful when the user wants to reconstruct a custom
+ /// data structure from the JSON string.
+ ///
+ void SetStoreOriginalJSONForExtrasAndExtensions(const bool enabled) {
+ store_original_json_for_extras_and_extensions_ = enabled;
+ }
+
+ bool GetStoreOriginalJSONForExtrasAndExtensions() const {
+ return store_original_json_for_extras_and_extensions_;
+ }
+
+ ///
+ /// Specify whether to preserve image channels when loading images.
+ /// (Not effective when the user supplies their own LoadImageData callbacks)
+ ///
+ void SetPreserveImageChannels(bool onoff) {
+ preserve_image_channels_ = onoff;
+ }
+
+ bool GetPreserveImageChannels() const { return preserve_image_channels_; }
+
+ private:
+ ///
+ /// Loads glTF asset from a string (in memory).
+ /// `length` = strlen(str);
+ /// Sets a warning message to `warn` if, for example, it fails to load assets.
+ /// Returns false and sets an error string to `err` if there's an error.
+ ///
+ bool LoadFromString(Model *model, std::string *err, std::string *warn,
+ const char *str, const unsigned int length,
+ const std::string &base_dir, unsigned int check_sections);
+
+ const unsigned char *bin_data_ = nullptr;
+ size_t bin_size_ = 0;
+ bool is_binary_ = false;
+
+ bool serialize_default_values_ = false; ///< Serialize default values?
+
+ bool store_original_json_for_extras_and_extensions_ = false;
+
+ bool preserve_image_channels_ = false; ///< Default false (expand channels
+ ///< to RGBA) for backward compatibility.
+
+ FsCallbacks fs = {
+#ifndef TINYGLTF_NO_FS
+ &tinygltf::FileExists, &tinygltf::ExpandFilePath,
+ &tinygltf::ReadWholeFile, &tinygltf::WriteWholeFile,
+
+ nullptr // Fs callback user data
+#else
+ nullptr, nullptr, nullptr, nullptr,
+
+ nullptr // Fs callback user data
+#endif
+ };
+
+ LoadImageDataFunction LoadImageData =
+#ifndef TINYGLTF_NO_STB_IMAGE
+ &tinygltf::LoadImageData;
+#else
+ nullptr;
+#endif
+ void *load_image_user_data_{nullptr};
+ bool user_image_loader_{false};
+
+ WriteImageDataFunction WriteImageData =
+#ifndef TINYGLTF_NO_STB_IMAGE_WRITE
+ &tinygltf::WriteImageData;
+#else
+ nullptr;
+#endif
+ void *write_image_user_data_{nullptr};
+};
+
+#ifdef __clang__
+#pragma clang diagnostic pop // -Wpadded
+#endif
+
+} // namespace tinygltf
+
+#endif // TINY_GLTF_H_
+
+#if defined(TINYGLTF_IMPLEMENTATION) || defined(__INTELLISENSE__)
+#include <algorithm>
+//#include <cassert>
+#ifndef TINYGLTF_NO_FS
+#include <cstdio>
+#include <fstream>
+#endif
+#include <sstream>
+
+#ifdef __clang__
+// Disable some warnings for external files.
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wfloat-equal"
+#pragma clang diagnostic ignored "-Wexit-time-destructors"
+#pragma clang diagnostic ignored "-Wconversion"
+#pragma clang diagnostic ignored "-Wold-style-cast"
+#pragma clang diagnostic ignored "-Wglobal-constructors"
+#if __has_warning("-Wreserved-id-macro")
+#pragma clang diagnostic ignored "-Wreserved-id-macro"
+#endif
+#pragma clang diagnostic ignored "-Wdisabled-macro-expansion"
+#pragma clang diagnostic ignored "-Wpadded"
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
+#pragma clang diagnostic ignored "-Wdocumentation-unknown-command"
+#pragma clang diagnostic ignored "-Wswitch-enum"
+#pragma clang diagnostic ignored "-Wimplicit-fallthrough"
+#pragma clang diagnostic ignored "-Wweak-vtables"
+#pragma clang diagnostic ignored "-Wcovered-switch-default"
+#if __has_warning("-Wdouble-promotion")
+#pragma clang diagnostic ignored "-Wdouble-promotion"
+#endif
+#if __has_warning("-Wcomma")
+#pragma clang diagnostic ignored "-Wcomma"
+#endif
+#if __has_warning("-Wzero-as-null-pointer-constant")
+#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant"
+#endif
+#if __has_warning("-Wcast-qual")
+#pragma clang diagnostic ignored "-Wcast-qual"
+#endif
+#if __has_warning("-Wmissing-variable-declarations")
+#pragma clang diagnostic ignored "-Wmissing-variable-declarations"
+#endif
+#if __has_warning("-Wmissing-prototypes")
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+#endif
+#if __has_warning("-Wcast-align")
+#pragma clang diagnostic ignored "-Wcast-align"
+#endif
+#if __has_warning("-Wnewline-eof")
+#pragma clang diagnostic ignored "-Wnewline-eof"
+#endif
+#if __has_warning("-Wunused-parameter")
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#endif
+#if __has_warning("-Wmismatched-tags")
+#pragma clang diagnostic ignored "-Wmismatched-tags"
+#endif
+#if __has_warning("-Wextra-semi-stmt")
+#pragma clang diagnostic ignored "-Wextra-semi-stmt"
+#endif
+#endif
+
+// Disable GCC warnings
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wtype-limits"
+#endif // __GNUC__
+
+#ifndef TINYGLTF_NO_INCLUDE_JSON
+#ifndef TINYGLTF_USE_RAPIDJSON
+#include "json.hpp"
+#else
+#ifndef TINYGLTF_NO_INCLUDE_RAPIDJSON
+#include "document.h"
+#include "prettywriter.h"
+#include "rapidjson.h"
+#include "stringbuffer.h"
+#include "writer.h"
+#endif
+#endif
+#endif
+
+#ifdef TINYGLTF_ENABLE_DRACO
+#include "draco/compression/decode.h"
+#include "draco/core/decoder_buffer.h"
+#endif
+
+#ifndef TINYGLTF_NO_STB_IMAGE
+#ifndef TINYGLTF_NO_INCLUDE_STB_IMAGE
+#include "stb_image.h"
+#endif
+#endif
+
+#ifndef TINYGLTF_NO_STB_IMAGE_WRITE
+#ifndef TINYGLTF_NO_INCLUDE_STB_IMAGE_WRITE
+#include "stb_image_write.h"
+#endif
+#endif
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+#ifdef _WIN32
+
+// issue 143.
+// Define NOMINMAX to avoid min/max defines,
+// but undef it after windows.h has been included.
+#ifndef NOMINMAX
+#define TINYGLTF_INTERNAL_NOMINMAX
+#define NOMINMAX
+#endif
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#define TINYGLTF_INTERNAL_WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h> // include API for expanding a file path
+
+#ifdef TINYGLTF_INTERNAL_WIN32_LEAN_AND_MEAN
+#undef WIN32_LEAN_AND_MEAN
+#endif
+
+#if defined(TINYGLTF_INTERNAL_NOMINMAX)
+#undef NOMINMAX
+#endif
+
+#if defined(__GLIBCXX__) // mingw
+
+#include <fcntl.h> // _O_RDONLY
+
+#include <ext/stdio_filebuf.h> // fstream (all sorts of IO stuff) + stdio_filebuf (=streambuf)
+
+#endif
+
+#elif !defined(__ANDROID__) && !defined(__OpenBSD__)
+#include <wordexp.h>
+#endif
+
+#if defined(__sparcv9)
+// Big endian
+#else
+#if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || MINIZ_X86_OR_X64_CPU
+#define TINYGLTF_LITTLE_ENDIAN 1
+#endif
+#endif
+
+namespace {
+#ifdef TINYGLTF_USE_RAPIDJSON
+
+#ifdef TINYGLTF_USE_RAPIDJSON_CRTALLOCATOR
+// This uses the RapidJSON CRTAllocator. It is thread safe and multiple
+// documents may be active at once.
+using json =
+ rapidjson::GenericValue<rapidjson::UTF8<>, rapidjson::CrtAllocator>;
+using json_const_iterator = json::ConstMemberIterator;
+using json_const_array_iterator = json const *;
+using JsonDocument =
+ rapidjson::GenericDocument<rapidjson::UTF8<>, rapidjson::CrtAllocator>;
+rapidjson::CrtAllocator s_CrtAllocator; // stateless and thread safe
+rapidjson::CrtAllocator &GetAllocator() { return s_CrtAllocator; }
+#else
+// This uses the default RapidJSON MemoryPoolAllocator. It is very fast, but
+// not thread safe. Only a single JsonDocument may be active at any one time,
+// meaning only a single glTF load/save can be active at any one time.
+using json = rapidjson::Value;
+using json_const_iterator = json::ConstMemberIterator;
+using json_const_array_iterator = json const *;
+rapidjson::Document *s_pActiveDocument = nullptr;
+rapidjson::Document::AllocatorType &GetAllocator() {
+ assert(s_pActiveDocument); // Root json node must be JsonDocument type
+ return s_pActiveDocument->GetAllocator();
+}
+
+#ifdef __clang__
+#pragma clang diagnostic push
+// Suppress the unused-member-function warning for
+// JsonDocument(JsonDocument &&rhs) noexcept
+#pragma clang diagnostic ignored "-Wunused-member-function"
+#endif
+
+struct JsonDocument : public rapidjson::Document {
+ JsonDocument() {
+ assert(s_pActiveDocument ==
+ nullptr); // When using default allocator, only one document can be
+ // active at a time, if you need multiple active at once,
+ // define TINYGLTF_USE_RAPIDJSON_CRTALLOCATOR
+ s_pActiveDocument = this;
+ }
+ JsonDocument(const JsonDocument &) = delete;
+ JsonDocument(JsonDocument &&rhs) noexcept
+ : rapidjson::Document(std::move(rhs)) {
+ s_pActiveDocument = this;
+ rhs.isNil = true;
+ }
+ ~JsonDocument() {
+ if (!isNil) {
+ s_pActiveDocument = nullptr;
+ }
+ }
+
+ private:
+ bool isNil = false;
+};
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+#endif // TINYGLTF_USE_RAPIDJSON_CRTALLOCATOR
+
+#else
+using nlohmann::json;
+using json_const_iterator = json::const_iterator;
+using json_const_array_iterator = json_const_iterator;
+using JsonDocument = json;
+#endif
+
+void JsonParse(JsonDocument &doc, const char *str, size_t length,
+ bool throwExc = false) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ (void)throwExc;
+ doc.Parse(str, length);
+#else
+ doc = json::parse(str, str + length, nullptr, throwExc);
+#endif
+}
+} // namespace
+
+#ifdef __APPLE__
+#include "TargetConditionals.h"
+#endif
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#endif
+
+namespace tinygltf {
+
+///
+/// Internal LoadImageDataOption struct.
+/// This struct is passed through `user_pointer` in LoadImageData.
+/// The struct is not passed when the user supplies their own LoadImageData
+/// callbacks.
+///
+struct LoadImageDataOption {
+ // true: preserve image channels (e.g. load as an RGB image if the image has
+ // RGB channels). Default `false` (channels are expanded to RGBA for
+ // backward compatibility).
+ bool preserve_channels{false};
+};
+
+// Equals function for Value, used recursively
+static bool Equals(const tinygltf::Value &one, const tinygltf::Value &other) {
+ if (one.Type() != other.Type()) return false;
+
+ switch (one.Type()) {
+ case NULL_TYPE:
+ return true;
+ case BOOL_TYPE:
+ return one.Get<bool>() == other.Get<bool>();
+ case REAL_TYPE:
+ return TINYGLTF_DOUBLE_EQUAL(one.Get<double>(), other.Get<double>());
+ case INT_TYPE:
+ return one.Get<int>() == other.Get<int>();
+ case OBJECT_TYPE: {
+ auto oneObj = one.Get<tinygltf::Value::Object>();
+ auto otherObj = other.Get<tinygltf::Value::Object>();
+ if (oneObj.size() != otherObj.size()) return false;
+ for (auto &it : oneObj) {
+ auto otherIt = otherObj.find(it.first);
+ if (otherIt == otherObj.end()) return false;
+
+ if (!Equals(it.second, otherIt->second)) return false;
+ }
+ return true;
+ }
+ case ARRAY_TYPE: {
+ if (one.Size() != other.Size()) return false;
+ for (int i = 0; i < int(one.Size()); ++i)
+ if (!Equals(one.Get(i), other.Get(i))) return false;
+ return true;
+ }
+ case STRING_TYPE:
+ return one.Get<std::string>() == other.Get<std::string>();
+ case BINARY_TYPE:
+ return one.Get<std::vector<unsigned char> >() ==
+ other.Get<std::vector<unsigned char> >();
+ default: {
+ // unhandled type
+ return false;
+ }
+ }
+}
+
+// Equals function for std::vector<double> using TINYGLTF_DOUBLE_EPSILON
+static bool Equals(const std::vector<double> &one,
+ const std::vector<double> &other) {
+ if (one.size() != other.size()) return false;
+ for (int i = 0; i < int(one.size()); ++i) {
+ if (!TINYGLTF_DOUBLE_EQUAL(one[size_t(i)], other[size_t(i)])) return false;
+ }
+ return true;
+}
+
+bool Accessor::operator==(const Accessor &other) const {
+ return this->bufferView == other.bufferView &&
+ this->byteOffset == other.byteOffset &&
+ this->componentType == other.componentType &&
+ this->count == other.count && this->extensions == other.extensions &&
+ this->extras == other.extras &&
+ Equals(this->maxValues, other.maxValues) &&
+ Equals(this->minValues, other.minValues) && this->name == other.name &&
+ this->normalized == other.normalized && this->type == other.type;
+}
+bool Animation::operator==(const Animation &other) const {
+ return this->channels == other.channels &&
+ this->extensions == other.extensions && this->extras == other.extras &&
+ this->name == other.name && this->samplers == other.samplers;
+}
+bool AnimationChannel::operator==(const AnimationChannel &other) const {
+ return this->extensions == other.extensions && this->extras == other.extras &&
+ this->target_node == other.target_node &&
+ this->target_path == other.target_path &&
+ this->sampler == other.sampler;
+}
+bool AnimationSampler::operator==(const AnimationSampler &other) const {
+ return this->extras == other.extras && this->extensions == other.extensions &&
+ this->input == other.input &&
+ this->interpolation == other.interpolation &&
+ this->output == other.output;
+}
+bool Asset::operator==(const Asset &other) const {
+ return this->copyright == other.copyright &&
+ this->extensions == other.extensions && this->extras == other.extras &&
+ this->generator == other.generator &&
+ this->minVersion == other.minVersion && this->version == other.version;
+}
+bool Buffer::operator==(const Buffer &other) const {
+ return this->data == other.data && this->extensions == other.extensions &&
+ this->extras == other.extras && this->name == other.name &&
+ this->uri == other.uri;
+}
+bool BufferView::operator==(const BufferView &other) const {
+ return this->buffer == other.buffer && this->byteLength == other.byteLength &&
+ this->byteOffset == other.byteOffset &&
+ this->byteStride == other.byteStride && this->name == other.name &&
+ this->target == other.target && this->extensions == other.extensions &&
+ this->extras == other.extras &&
+ this->dracoDecoded == other.dracoDecoded;
+}
+bool Camera::operator==(const Camera &other) const {
+ return this->name == other.name && this->extensions == other.extensions &&
+ this->extras == other.extras &&
+ this->orthographic == other.orthographic &&
+ this->perspective == other.perspective && this->type == other.type;
+}
+bool Image::operator==(const Image &other) const {
+ return this->bufferView == other.bufferView &&
+ this->component == other.component &&
+ this->extensions == other.extensions && this->extras == other.extras &&
+ this->height == other.height && this->image == other.image &&
+ this->mimeType == other.mimeType && this->name == other.name &&
+ this->uri == other.uri && this->width == other.width;
+}
+bool Light::operator==(const Light &other) const {
+ return Equals(this->color, other.color) && this->name == other.name &&
+ this->type == other.type;
+}
+bool Material::operator==(const Material &other) const {
+ return (this->pbrMetallicRoughness == other.pbrMetallicRoughness) &&
+ (this->normalTexture == other.normalTexture) &&
+ (this->occlusionTexture == other.occlusionTexture) &&
+ (this->emissiveTexture == other.emissiveTexture) &&
+ Equals(this->emissiveFactor, other.emissiveFactor) &&
+ (this->alphaMode == other.alphaMode) &&
+ TINYGLTF_DOUBLE_EQUAL(this->alphaCutoff, other.alphaCutoff) &&
+ (this->doubleSided == other.doubleSided) &&
+ (this->extensions == other.extensions) &&
+ (this->extras == other.extras) && (this->values == other.values) &&
+ (this->additionalValues == other.additionalValues) &&
+ (this->name == other.name);
+}
+bool Mesh::operator==(const Mesh &other) const {
+ return this->extensions == other.extensions && this->extras == other.extras &&
+ this->name == other.name && Equals(this->weights, other.weights) &&
+ this->primitives == other.primitives;
+}
+bool Model::operator==(const Model &other) const {
+ return this->accessors == other.accessors &&
+ this->animations == other.animations && this->asset == other.asset &&
+ this->buffers == other.buffers &&
+ this->bufferViews == other.bufferViews &&
+ this->cameras == other.cameras &&
+ this->defaultScene == other.defaultScene &&
+ this->extensions == other.extensions &&
+ this->extensionsRequired == other.extensionsRequired &&
+ this->extensionsUsed == other.extensionsUsed &&
+ this->extras == other.extras && this->images == other.images &&
+ this->lights == other.lights && this->materials == other.materials &&
+ this->meshes == other.meshes && this->nodes == other.nodes &&
+ this->samplers == other.samplers && this->scenes == other.scenes &&
+ this->skins == other.skins && this->textures == other.textures;
+}
+bool Node::operator==(const Node &other) const {
+ return this->camera == other.camera && this->children == other.children &&
+ this->extensions == other.extensions && this->extras == other.extras &&
+ Equals(this->matrix, other.matrix) && this->mesh == other.mesh &&
+ this->name == other.name && Equals(this->rotation, other.rotation) &&
+ Equals(this->scale, other.scale) && this->skin == other.skin &&
+ Equals(this->translation, other.translation) &&
+ Equals(this->weights, other.weights);
+}
+bool SpotLight::operator==(const SpotLight &other) const {
+ return this->extensions == other.extensions && this->extras == other.extras &&
+ TINYGLTF_DOUBLE_EQUAL(this->innerConeAngle, other.innerConeAngle) &&
+ TINYGLTF_DOUBLE_EQUAL(this->outerConeAngle, other.outerConeAngle);
+}
+bool OrthographicCamera::operator==(const OrthographicCamera &other) const {
+ return this->extensions == other.extensions && this->extras == other.extras &&
+ TINYGLTF_DOUBLE_EQUAL(this->xmag, other.xmag) &&
+ TINYGLTF_DOUBLE_EQUAL(this->ymag, other.ymag) &&
+ TINYGLTF_DOUBLE_EQUAL(this->zfar, other.zfar) &&
+ TINYGLTF_DOUBLE_EQUAL(this->znear, other.znear);
+}
+bool Parameter::operator==(const Parameter &other) const {
+ if (this->bool_value != other.bool_value ||
+ this->has_number_value != other.has_number_value)
+ return false;
+
+ if (!TINYGLTF_DOUBLE_EQUAL(this->number_value, other.number_value))
+ return false;
+
+ if (this->json_double_value.size() != other.json_double_value.size())
+ return false;
+ for (auto &it : this->json_double_value) {
+ auto otherIt = other.json_double_value.find(it.first);
+ if (otherIt == other.json_double_value.end()) return false;
+
+ if (!TINYGLTF_DOUBLE_EQUAL(it.second, otherIt->second)) return false;
+ }
+
+ if (!Equals(this->number_array, other.number_array)) return false;
+
+ if (this->string_value != other.string_value) return false;
+
+ return true;
+}
+bool PerspectiveCamera::operator==(const PerspectiveCamera &other) const {
+ return TINYGLTF_DOUBLE_EQUAL(this->aspectRatio, other.aspectRatio) &&
+ this->extensions == other.extensions && this->extras == other.extras &&
+ TINYGLTF_DOUBLE_EQUAL(this->yfov, other.yfov) &&
+ TINYGLTF_DOUBLE_EQUAL(this->zfar, other.zfar) &&
+ TINYGLTF_DOUBLE_EQUAL(this->znear, other.znear);
+}
+bool Primitive::operator==(const Primitive &other) const {
+ return this->attributes == other.attributes && this->extras == other.extras &&
+ this->indices == other.indices && this->material == other.material &&
+ this->mode == other.mode && this->targets == other.targets;
+}
+bool Sampler::operator==(const Sampler &other) const {
+ return this->extensions == other.extensions && this->extras == other.extras &&
+ this->magFilter == other.magFilter &&
+ this->minFilter == other.minFilter && this->name == other.name &&
+ this->wrapT == other.wrapT;
+
+ //this->wrapR == other.wrapR && this->wrapS == other.wrapS &&
+}
+bool Scene::operator==(const Scene &other) const {
+ return this->extensions == other.extensions && this->extras == other.extras &&
+ this->name == other.name && this->nodes == other.nodes;
+}
+bool Skin::operator==(const Skin &other) const {
+ return this->extensions == other.extensions && this->extras == other.extras &&
+ this->inverseBindMatrices == other.inverseBindMatrices &&
+ this->joints == other.joints && this->name == other.name &&
+ this->skeleton == other.skeleton;
+}
+bool Texture::operator==(const Texture &other) const {
+ return this->extensions == other.extensions && this->extras == other.extras &&
+ this->name == other.name && this->sampler == other.sampler &&
+ this->source == other.source;
+}
+bool TextureInfo::operator==(const TextureInfo &other) const {
+ return this->extensions == other.extensions && this->extras == other.extras &&
+ this->index == other.index && this->texCoord == other.texCoord;
+}
+bool NormalTextureInfo::operator==(const NormalTextureInfo &other) const {
+ return this->extensions == other.extensions && this->extras == other.extras &&
+ this->index == other.index && this->texCoord == other.texCoord &&
+ TINYGLTF_DOUBLE_EQUAL(this->scale, other.scale);
+}
+bool OcclusionTextureInfo::operator==(const OcclusionTextureInfo &other) const {
+ return this->extensions == other.extensions && this->extras == other.extras &&
+ this->index == other.index && this->texCoord == other.texCoord &&
+ TINYGLTF_DOUBLE_EQUAL(this->strength, other.strength);
+}
+bool PbrMetallicRoughness::operator==(const PbrMetallicRoughness &other) const {
+ return this->extensions == other.extensions && this->extras == other.extras &&
+ (this->baseColorTexture == other.baseColorTexture) &&
+ (this->metallicRoughnessTexture == other.metallicRoughnessTexture) &&
+ Equals(this->baseColorFactor, other.baseColorFactor) &&
+ TINYGLTF_DOUBLE_EQUAL(this->metallicFactor, other.metallicFactor) &&
+ TINYGLTF_DOUBLE_EQUAL(this->roughnessFactor, other.roughnessFactor);
+}
+bool Value::operator==(const Value &other) const {
+ return Equals(*this, other);
+}
+
+static void swap4(unsigned int *val) {
+#ifdef TINYGLTF_LITTLE_ENDIAN
+ (void)val;
+#else
+ unsigned int tmp = *val;
+ unsigned char *dst = reinterpret_cast<unsigned char *>(val);
+ unsigned char *src = reinterpret_cast<unsigned char *>(&tmp);
+
+ dst[0] = src[3];
+ dst[1] = src[2];
+ dst[2] = src[1];
+ dst[3] = src[0];
+#endif
+}
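+
+// Illustrative sketch: swap4() byte-swaps a 32-bit value on big-endian
+// targets and is a no-op on little-endian ones.
+//
+//   unsigned int v = 0x01020304u;
+//   swap4(&v);  // big-endian: v == 0x04030201; little-endian: unchanged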
+
+static std::string JoinPath(const std::string &path0,
+ const std::string &path1) {
+ if (path0.empty()) {
+ return path1;
+ } else {
+ // check '/'
+ char lastChar = *path0.rbegin();
+ if (lastChar != '/') {
+ return path0 + std::string("/") + path1;
+ } else {
+ return path0 + path1;
+ }
+ }
+}
+
+static std::string FindFile(const std::vector<std::string> &paths,
+ const std::string &filepath, FsCallbacks *fs) {
+ if (fs == nullptr || fs->ExpandFilePath == nullptr ||
+ fs->FileExists == nullptr) {
+ // Error, fs callback[s] missing
+ return std::string();
+ }
+
+ for (size_t i = 0; i < paths.size(); i++) {
+ std::string absPath =
+ fs->ExpandFilePath(JoinPath(paths[i], filepath), fs->user_data);
+ if (fs->FileExists(absPath, fs->user_data)) {
+ return absPath;
+ }
+ }
+
+ return std::string();
+}
+
+static std::string GetFilePathExtension(const std::string &FileName) {
+ if (FileName.find_last_of(".") != std::string::npos)
+ return FileName.substr(FileName.find_last_of(".") + 1);
+ return "";
+}
+
+static std::string GetBaseDir(const std::string &filepath) {
+ if (filepath.find_last_of("/\\") != std::string::npos)
+ return filepath.substr(0, filepath.find_last_of("/\\"));
+ return "";
+}
+
+// https://stackoverflow.com/questions/8520560/get-a-file-name-from-a-path
+static std::string GetBaseFilename(const std::string &filepath) {
+ return filepath.substr(filepath.find_last_of("/\\") + 1);
+}
+
+std::string base64_encode(unsigned char const *, unsigned int len);
+std::string base64_decode(std::string const &s);
+
+/*
+ base64.cpp and base64.h
+
+ Copyright (C) 2004-2008 René Nyffenegger
+
+ This source code is provided 'as-is', without any express or implied
+ warranty. In no event will the author be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this source code must not be misrepresented; you must not
+ claim that you wrote the original source code. If you use this source code
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original source code.
+
+ 3. This notice may not be removed or altered from any source distribution.
+
+ René Nyffenegger rene.nyffenegger@adp-gmbh.ch
+
+*/
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#pragma clang diagnostic ignored "-Wconversion"
+#endif
+
+static inline bool is_base64(unsigned char c) {
+ return (isalnum(c) || (c == '+') || (c == '/'));
+}
+
+std::string base64_encode(unsigned char const *bytes_to_encode,
+ unsigned int in_len) {
+ std::string ret;
+ int i = 0;
+ int j = 0;
+ unsigned char char_array_3[3];
+ unsigned char char_array_4[4];
+
+ const char *base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
+
+ while (in_len--) {
+ char_array_3[i++] = *(bytes_to_encode++);
+ if (i == 3) {
+ char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
+ char_array_4[1] =
+ ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
+ char_array_4[2] =
+ ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
+ char_array_4[3] = char_array_3[2] & 0x3f;
+
+ for (i = 0; (i < 4); i++) ret += base64_chars[char_array_4[i]];
+ i = 0;
+ }
+ }
+
+ if (i) {
+ for (j = i; j < 3; j++) char_array_3[j] = '\0';
+
+ char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
+ char_array_4[1] =
+ ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
+ char_array_4[2] =
+ ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
+
+ for (j = 0; (j < i + 1); j++) ret += base64_chars[char_array_4[j]];
+
+ while ((i++ < 3)) ret += '=';
+ }
+
+ return ret;
+}
+
+std::string base64_decode(std::string const &encoded_string) {
+ int in_len = static_cast<int>(encoded_string.size());
+ int i = 0;
+ int j = 0;
+ int in_ = 0;
+ unsigned char char_array_4[4], char_array_3[3];
+ std::string ret;
+
+ const std::string base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
+
+ while (in_len-- && (encoded_string[in_] != '=') &&
+ is_base64(encoded_string[in_])) {
+ char_array_4[i++] = encoded_string[in_];
+ in_++;
+ if (i == 4) {
+ for (i = 0; i < 4; i++)
+ char_array_4[i] =
+ static_cast<unsigned char>(base64_chars.find(char_array_4[i]));
+
+ char_array_3[0] =
+ (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+ char_array_3[1] =
+ ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+ char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+ for (i = 0; (i < 3); i++) ret += char_array_3[i];
+ i = 0;
+ }
+ }
+
+ if (i) {
+ for (j = i; j < 4; j++) char_array_4[j] = 0;
+
+ for (j = 0; j < 4; j++)
+ char_array_4[j] =
+ static_cast<unsigned char>(base64_chars.find(char_array_4[j]));
+
+ char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+ char_array_3[1] =
+ ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+ char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+ for (j = 0; (j < i - 1); j++) ret += char_array_3[j];
+ }
+
+ return ret;
+}
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
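+// A minimal round-trip sketch for the two helpers above (illustrative only,
+// kept out of the build with `#if 0` like the urlencode reference code
+// further below; the byte values are arbitrary):
+#if 0
+static void Base64RoundTripExample() {
+ const unsigned char raw[4] = {'g', 'l', 'T', 'F'};
+ std::string encoded = base64_encode(raw, 4u); // yields "Z2xURg=="
+ std::string decoded = base64_decode(encoded); // yields "glTF"
+ (void)decoded;
+}
+#endif
+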
+// https://github.com/syoyo/tinygltf/issues/228
+// TODO(syoyo): Use uriparser https://uriparser.github.io/ for stricter Uri
+// decoding?
+//
+// https://stackoverflow.com/questions/18307429/encode-decode-url-in-c
+// http://dlib.net/dlib/server/server_http.cpp.html
+
+// --- dlib begin ------------------------------------------------------------
+// Copyright (C) 2003 Davis E. King (davis@dlib.net)
+// License: Boost Software License See LICENSE.txt for the full license.
+
+namespace dlib {
+
+#if 0
+ inline unsigned char to_hex( unsigned char x )
+ {
+ return x + (x > 9 ? ('A'-10) : '0');
+ }
+
+ const std::string urlencode( const std::string& s )
+ {
+ std::ostringstream os;
+
+ for ( std::string::const_iterator ci = s.begin(); ci != s.end(); ++ci )
+ {
+ if ( (*ci >= 'a' && *ci <= 'z') ||
+ (*ci >= 'A' && *ci <= 'Z') ||
+ (*ci >= '0' && *ci <= '9') )
+ { // allowed
+ os << *ci;
+ }
+ else if ( *ci == ' ')
+ {
+ os << '+';
+ }
+ else
+ {
+ os << '%' << to_hex(static_cast<unsigned char>(*ci >> 4)) << to_hex(static_cast<unsigned char>(*ci % 16));
+ }
+ }
+
+ return os.str();
+ }
+#endif
+
+inline unsigned char from_hex(unsigned char ch) {
+ if (ch <= '9' && ch >= '0')
+ ch -= '0';
+ else if (ch <= 'f' && ch >= 'a')
+ ch -= 'a' - 10;
+ else if (ch <= 'F' && ch >= 'A')
+ ch -= 'A' - 10;
+ else
+ ch = 0;
+ return ch;
+}
+
+static const std::string urldecode(const std::string &str) {
+ using namespace std;
+ string result;
+ string::size_type i;
+ for (i = 0; i < str.size(); ++i) {
+ if (str[i] == '+') {
+ result += ' ';
+ } else if (str[i] == '%' && str.size() > i + 2) {
+ const unsigned char ch1 =
+ from_hex(static_cast<unsigned char>(str[i + 1]));
+ const unsigned char ch2 =
+ from_hex(static_cast<unsigned char>(str[i + 2]));
+ const unsigned char ch = static_cast<unsigned char>((ch1 << 4) | ch2);
+ result += static_cast<char>(ch);
+ i += 2;
+ } else {
+ result += str[i];
+ }
+ }
+ return result;
+}
+
+} // namespace dlib
+// --- dlib end --------------------------------------------------------------
+
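+// A short sketch of what dlib::urldecode does with glTF URIs (illustrative
+// only; percent-escapes and '+' are the interesting cases):
+#if 0
+static void UrlDecodeExample() {
+ // "%20" is a percent-escaped space.
+ std::string a = dlib::urldecode("my%20texture.png"); // "my texture.png"
+ // '+' is decoded to a space as well, per form-encoding rules.
+ std::string b = dlib::urldecode("albedo+map.png"); // "albedo map.png"
+ (void)a;
+ (void)b;
+}
+#endif
+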
+static bool LoadExternalFile(std::vector<unsigned char> *out, std::string *err,
+ std::string *warn, const std::string &filename,
+ const std::string &basedir, bool required,
+ size_t reqBytes, bool checkSize, FsCallbacks *fs) {
+ if (fs == nullptr || fs->FileExists == nullptr ||
+ fs->ExpandFilePath == nullptr || fs->ReadWholeFile == nullptr) {
+ // This is a developer error; should this be an assert()?
+ if (err) {
+ (*err) += "FS callback[s] not set\n";
+ }
+ return false;
+ }
+
+ std::string *failMsgOut = required ? err : warn;
+
+ out->clear();
+
+ std::vector<std::string> paths;
+ paths.push_back(basedir);
+ paths.push_back(".");
+
+ std::string filepath = FindFile(paths, filename, fs);
+ if (filepath.empty() || filename.empty()) {
+ if (failMsgOut) {
+ (*failMsgOut) += "File not found : " + filename + "\n";
+ }
+ return false;
+ }
+
+ std::vector<unsigned char> buf;
+ std::string fileReadErr;
+ bool fileRead =
+ fs->ReadWholeFile(&buf, &fileReadErr, filepath, fs->user_data);
+ if (!fileRead) {
+ if (failMsgOut) {
+ (*failMsgOut) +=
+ "File read error : " + filepath + " : " + fileReadErr + "\n";
+ }
+ return false;
+ }
+
+ size_t sz = buf.size();
+ if (sz == 0) {
+ if (failMsgOut) {
+ (*failMsgOut) += "File is empty : " + filepath + "\n";
+ }
+ return false;
+ }
+
+ if (checkSize) {
+ if (reqBytes == sz) {
+ out->swap(buf);
+ return true;
+ } else {
+ std::stringstream ss;
+ ss << "File size mismatch : " << filepath << ", requestedBytes "
+ << reqBytes << ", but got " << sz << std::endl;
+ if (failMsgOut) {
+ (*failMsgOut) += ss.str();
+ }
+ return false;
+ }
+ }
+
+ out->swap(buf);
+ return true;
+}
+
+void TinyGLTF::SetImageLoader(LoadImageDataFunction func, void *user_data) {
+ LoadImageData = func;
+ load_image_user_data_ = user_data;
+ user_image_loader_ = true;
+}
+
+void TinyGLTF::RemoveImageLoader() {
+ LoadImageData =
+#ifndef TINYGLTF_NO_STB_IMAGE
+ &tinygltf::LoadImageData;
+#else
+ nullptr;
+#endif
+
+ load_image_user_data_ = nullptr;
+ user_image_loader_ = false;
+}
+
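+// A sketch of installing a custom decode callback (illustrative only;
+// `MyDecoder` is a hypothetical function matching LoadImageDataFunction):
+#if 0
+static bool MyDecoder(Image *image, const int image_idx, std::string *err,
+ std::string *warn, int req_width, int req_height,
+ const unsigned char *bytes, int size, void *user_data) {
+ // Decode `bytes`/`size` with any codec and fill image->image,
+ // image->width, image->height, image->component, image->bits and
+ // image->pixel_type; return false (appending to *err) on failure.
+ (void)image; (void)image_idx; (void)err; (void)warn; (void)req_width;
+ (void)req_height; (void)bytes; (void)size; (void)user_data;
+ return true;
+}
+
+static void InstallDecoderExample() {
+ TinyGLTF loader;
+ loader.SetImageLoader(MyDecoder, /* user_data */ nullptr);
+ // ... load a model here; MyDecoder is now called for each image ...
+ loader.RemoveImageLoader(); // restore the default (stb_image, or none)
+}
+#endif
+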
+#ifndef TINYGLTF_NO_STB_IMAGE
+bool LoadImageData(Image *image, const int image_idx, std::string *err,
+ std::string *warn, int req_width, int req_height,
+ const unsigned char *bytes, int size, void *user_data) {
+ (void)warn;
+
+ LoadImageDataOption option;
+ if (user_data) {
+ option = *reinterpret_cast<LoadImageDataOption *>(user_data);
+ }
+
+ int w = 0, h = 0, comp = 0, req_comp = 0;
+
+ unsigned char *data = nullptr;
+
+ // preserve_channels == true: use the channel count stored in the image
+ // file. false: force 4-channel (32-bit) textures for broad Vulkan
+ // compatibility, since some GPU drivers do not support 24-bit images.
+ req_comp = option.preserve_channels ? 0 : 4;
+ int bits = 8;
+ int pixel_type = TINYGLTF_COMPONENT_TYPE_UNSIGNED_BYTE;
+
+ // The image we want to load may use 16 bits per channel. Attempt to load
+ // it as 16-bit first and, if that works, set the image data accordingly.
+ // The returned pointer is cast to unsigned char because we represent raw
+ // "bytes", but the Image metadata is updated to signal that this image
+ // uses 2 bytes (16 bits) per channel:
+ if (stbi_is_16_bit_from_memory(bytes, size)) {
+ data = reinterpret_cast<unsigned char *>(
+ stbi_load_16_from_memory(bytes, size, &w, &h, &comp, req_comp));
+ if (data) {
+ bits = 16;
+ pixel_type = TINYGLTF_COMPONENT_TYPE_UNSIGNED_SHORT;
+ }
+ }
+
+ // At this point, if data is still NULL, the image was not 16 bits per
+ // channel, so load it as a normal 8-bit-per-channel image as before.
+ // If the image cannot be decoded, skip parsing and keep it by its path;
+ // do not treat that as a fatal error.
+ // FIXME: we should only enter this function if the image is embedded. If
+ // image->uri references an image file, it should be left as-is. Image
+ // loading should not be mandatory (to support other formats).
+ if (!data) data = stbi_load_from_memory(bytes, size, &w, &h, &comp, req_comp);
+ if (!data) {
+ // NOTE: you can use `warn` instead of `err`
+ if (err) {
+ (*err) +=
+ "Unknown image format. STB cannot decode image data for image[" +
+ std::to_string(image_idx) + "] name = \"" + image->name + "\".\n";
+ }
+ return false;
+ }
+
+ if ((w < 1) || (h < 1)) {
+ stbi_image_free(data);
+ if (err) {
+ (*err) += "Invalid image data for image[" + std::to_string(image_idx) +
+ "] name = \"" + image->name + "\"\n";
+ }
+ return false;
+ }
+
+ if (req_width > 0) {
+ if (req_width != w) {
+ stbi_image_free(data);
+ if (err) {
+ (*err) += "Image width mismatch for image[" +
+ std::to_string(image_idx) + "] name = \"" + image->name +
+ "\"\n";
+ }
+ return false;
+ }
+ }
+
+ if (req_height > 0) {
+ if (req_height != h) {
+ stbi_image_free(data);
+ if (err) {
+ (*err) += "Image height mismatch. for image[" +
+ std::to_string(image_idx) + "] name = \"" + image->name +
+ "\"\n";
+ }
+ return false;
+ }
+ }
+
+ if (req_comp != 0) {
+ // Loaded data has `req_comp` channels (components).
+ comp = req_comp;
+ }
+
+ image->width = w;
+ image->height = h;
+ image->component = comp;
+ image->bits = bits;
+ image->pixel_type = pixel_type;
+ image->image.resize(static_cast<size_t>(w * h * comp) * size_t(bits / 8));
+ std::copy(data, data + w * h * comp * (bits / 8), image->image.begin());
+ stbi_image_free(data);
+
+ return true;
+}
+#endif
+
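+// A sketch of routing LoadImageDataOption through `user_data` so the default
+// loader keeps the channel count stored in the file (illustrative only,
+// assuming the stb_image-backed LoadImageData above is compiled in):
+#if 0
+static void PreserveChannelsExample() {
+ static LoadImageDataOption opt; // static: must outlive the loader calls
+ opt.preserve_channels = true; // keep 1/2/3/4 channels as stored on disk
+ TinyGLTF loader;
+ loader.SetImageLoader(&tinygltf::LoadImageData, &opt);
+}
+#endif
+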
+void TinyGLTF::SetImageWriter(WriteImageDataFunction func, void *user_data) {
+ WriteImageData = func;
+ write_image_user_data_ = user_data;
+}
+
+#ifndef TINYGLTF_NO_STB_IMAGE_WRITE
+static void WriteToMemory_stbi(void *context, void *data, int size) {
+ std::vector<unsigned char> *buffer =
+ reinterpret_cast<std::vector<unsigned char> *>(context);
+
+ unsigned char *pData = reinterpret_cast<unsigned char *>(data);
+
+ buffer->insert(buffer->end(), pData, pData + size);
+}
+
+bool WriteImageData(const std::string *basepath, const std::string *filename,
+ Image *image, bool embedImages, void *fsPtr) {
+ const std::string ext = GetFilePathExtension(*filename);
+
+ // Write image to temporary buffer
+ std::string header;
+ std::vector<unsigned char> data;
+
+ if (ext == "png") {
+ if ((image->bits != 8) ||
+ (image->pixel_type != TINYGLTF_COMPONENT_TYPE_UNSIGNED_BYTE)) {
+ // Unsupported pixel format
+ return false;
+ }
+
+ if (!stbi_write_png_to_func(WriteToMemory_stbi, &data, image->width,
+ image->height, image->component,
+ &image->image[0], 0)) {
+ return false;
+ }
+ header = "data:image/png;base64,";
+ } else if (ext == "jpg") {
+ if (!stbi_write_jpg_to_func(WriteToMemory_stbi, &data, image->width,
+ image->height, image->component,
+ &image->image[0], 100)) {
+ return false;
+ }
+ header = "data:image/jpeg;base64,";
+ } else if (ext == "bmp") {
+ if (!stbi_write_bmp_to_func(WriteToMemory_stbi, &data, image->width,
+ image->height, image->component,
+ &image->image[0])) {
+ return false;
+ }
+ header = "data:image/bmp;base64,";
+ } else if (!embedImages) {
+ // Error: can't output requested format to file
+ return false;
+ }
+
+ if (embedImages) {
+ // Embed base64-encoded image into URI
+ if (data.size()) {
+ image->uri =
+ header +
+ base64_encode(&data[0], static_cast<unsigned int>(data.size()));
+ } else {
+ // Throw error?
+ }
+ } else {
+ // Write image to disk
+ FsCallbacks *fs = reinterpret_cast<FsCallbacks *>(fsPtr);
+ if ((fs != nullptr) && (fs->WriteWholeFile != nullptr)) {
+ const std::string imagefilepath = JoinPath(*basepath, *filename);
+ std::string writeError;
+ if (!fs->WriteWholeFile(&writeError, imagefilepath, data,
+ fs->user_data)) {
+ // Could not write the image file to disk; throw an error?
+ return false;
+ }
+ } else {
+ // Throw error?
+ }
+ image->uri = *filename;
+ }
+
+ return true;
+}
+#endif
+
+void TinyGLTF::SetFsCallbacks(FsCallbacks callbacks) { fs = callbacks; }
+
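+// A sketch of overriding the filesystem hooks via SetFsCallbacks
+// (illustrative only; assumes TINYGLTF_NO_FS is not defined so the default
+// implementations below are available to reuse):
+#if 0
+static bool MyFileExists(const std::string &abs_filename, void *) {
+ // e.g. consult an in-memory asset table instead of the real filesystem.
+ return !abs_filename.empty();
+}
+
+static void InstallFsExample() {
+ FsCallbacks cb;
+ cb.FileExists = &MyFileExists;
+ cb.ExpandFilePath = &tinygltf::ExpandFilePath; // keep the defaults
+ cb.ReadWholeFile = &tinygltf::ReadWholeFile;
+ cb.WriteWholeFile = &tinygltf::WriteWholeFile;
+ cb.user_data = nullptr;
+ TinyGLTF loader;
+ loader.SetFsCallbacks(cb);
+}
+#endif
+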
+#ifdef _WIN32
+static inline std::wstring UTF8ToWchar(const std::string &str) {
+ int wstr_size =
+ MultiByteToWideChar(CP_UTF8, 0, str.data(), (int)str.size(), nullptr, 0);
+ std::wstring wstr(wstr_size, 0);
+ MultiByteToWideChar(CP_UTF8, 0, str.data(), (int)str.size(), &wstr[0],
+ (int)wstr.size());
+ return wstr;
+}
+
+static inline std::string WcharToUTF8(const std::wstring &wstr) {
+ int str_size = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), (int)wstr.size(),
+ nullptr, 0, NULL, NULL);
+ std::string str(str_size, 0);
+ WideCharToMultiByte(CP_UTF8, 0, wstr.data(), (int)wstr.size(), &str[0],
+ (int)str.size(), NULL, NULL);
+ return str;
+}
+#endif
+
+#ifndef TINYGLTF_NO_FS
+// Default implementations of filesystem functions
+
+bool FileExists(const std::string &abs_filename, void *) {
+ bool ret;
+#ifdef TINYGLTF_ANDROID_LOAD_FROM_ASSETS
+ if (asset_manager) {
+ AAsset *asset = AAssetManager_open(asset_manager, abs_filename.c_str(),
+ AASSET_MODE_STREAMING);
+ if (!asset) {
+ return false;
+ }
+ AAsset_close(asset);
+ ret = true;
+ } else {
+ return false;
+ }
+#else
+#ifdef _WIN32
+#if defined(_MSC_VER) || defined(__GLIBCXX__)
+ FILE *fp = nullptr;
+ errno_t err = _wfopen_s(&fp, UTF8ToWchar(abs_filename).c_str(), L"rb");
+ if (err != 0) {
+ return false;
+ }
+#else
+ FILE *fp = nullptr;
+ errno_t err = fopen_s(&fp, abs_filename.c_str(), "rb");
+ if (err != 0) {
+ return false;
+ }
+#endif
+
+#else
+ FILE *fp = fopen(abs_filename.c_str(), "rb");
+#endif
+ if (fp) {
+ ret = true;
+ fclose(fp);
+ } else {
+ ret = false;
+ }
+#endif
+
+ return ret;
+}
+
+std::string ExpandFilePath(const std::string &filepath, void *) {
+#ifdef _WIN32
+ // Assume input `filepath` is encoded in UTF-8
+ std::wstring wfilepath = UTF8ToWchar(filepath);
+ DWORD wlen = ExpandEnvironmentStringsW(wfilepath.c_str(), nullptr, 0);
+ wchar_t *wstr = new wchar_t[wlen];
+ ExpandEnvironmentStringsW(wfilepath.c_str(), wstr, wlen);
+
+ std::wstring ws(wstr);
+ delete[] wstr;
+ return WcharToUTF8(ws);
+
+#else
+
+#if defined(TARGET_OS_IPHONE) || defined(TARGET_IPHONE_SIMULATOR) || \
+ defined(__ANDROID__) || defined(__EMSCRIPTEN__) || defined(__OpenBSD__)
+ // no expansion
+ std::string s = filepath;
+#else
+ std::string s;
+ wordexp_t p;
+
+ if (filepath.empty()) {
+ return "";
+ }
+
+ // Quote the string to keep any spaces in filepath intact.
+ std::string quoted_path = "\"" + filepath + "\"";
+ // char** w;
+ int ret = wordexp(quoted_path.c_str(), &p, 0);
+ if (ret) {
+ // err
+ s = filepath;
+ return s;
+ }
+
+ // Use first element only.
+ if (p.we_wordv) {
+ s = std::string(p.we_wordv[0]);
+ wordfree(&p);
+ } else {
+ s = filepath;
+ }
+
+#endif
+
+ return s;
+#endif
+}
+
+bool ReadWholeFile(std::vector<unsigned char> *out, std::string *err,
+ const std::string &filepath, void *) {
+#ifdef TINYGLTF_ANDROID_LOAD_FROM_ASSETS
+ if (asset_manager) {
+ AAsset *asset = AAssetManager_open(asset_manager, filepath.c_str(),
+ AASSET_MODE_STREAMING);
+ if (!asset) {
+ if (err) {
+ (*err) += "File open error : " + filepath + "\n";
+ }
+ return false;
+ }
+ size_t size = AAsset_getLength(asset);
+ if (size == 0) {
+ if (err) {
+ (*err) += "Invalid file size : " + filepath +
+ " (does the path point to a directory?)";
+ }
+ return false;
+ }
+ out->resize(size);
+ AAsset_read(asset, reinterpret_cast<char *>(&out->at(0)), size);
+ AAsset_close(asset);
+ return true;
+ } else {
+ if (err) {
+ (*err) += "No asset manager specified : " + filepath + "\n";
+ }
+ return false;
+ }
+#else
+#ifdef _WIN32
+#if defined(__GLIBCXX__) // mingw
+ int file_descriptor =
+ _wopen(UTF8ToWchar(filepath).c_str(), _O_RDONLY | _O_BINARY);
+ __gnu_cxx::stdio_filebuf<char> wfile_buf(file_descriptor, std::ios_base::in);
+ std::istream f(&wfile_buf);
+#elif defined(_MSC_VER) || defined(_LIBCPP_VERSION)
+ // For libcxx, assume _LIBCPP_HAS_OPEN_WITH_WCHAR is defined to accept
+ // `wchar_t *`
+ std::ifstream f(UTF8ToWchar(filepath).c_str(), std::ifstream::binary);
+#else
+ // Unknown compiler/runtime
+ std::ifstream f(filepath.c_str(), std::ifstream::binary);
+#endif
+#else
+ std::ifstream f(filepath.c_str(), std::ifstream::binary);
+#endif
+ if (!f) {
+ if (err) {
+ (*err) += "File open error : " + filepath + "\n";
+ }
+ return false;
+ }
+
+ f.seekg(0, f.end);
+ size_t sz = static_cast<size_t>(f.tellg());
+ f.seekg(0, f.beg);
+
+ if (int64_t(sz) < 0) {
+ if (err) {
+ (*err) += "Invalid file size : " + filepath +
+ " (does the path point to a directory?)";
+ }
+ return false;
+ } else if (sz == 0) {
+ if (err) {
+ (*err) += "File is empty : " + filepath + "\n";
+ }
+ return false;
+ }
+
+ out->resize(sz);
+ f.read(reinterpret_cast<char *>(&out->at(0)),
+ static_cast<std::streamsize>(sz));
+
+ return true;
+#endif
+}
+
+bool WriteWholeFile(std::string *err, const std::string &filepath,
+ const std::vector<unsigned char> &contents, void *) {
+#ifdef _WIN32
+#if defined(__GLIBCXX__) // mingw
+ int file_descriptor = _wopen(UTF8ToWchar(filepath).c_str(),
+ _O_CREAT | _O_WRONLY | _O_TRUNC | _O_BINARY);
+ __gnu_cxx::stdio_filebuf<char> wfile_buf(
+ file_descriptor, std::ios_base::out | std::ios_base::binary);
+ std::ostream f(&wfile_buf);
+#elif defined(_MSC_VER)
+ std::ofstream f(UTF8ToWchar(filepath).c_str(), std::ofstream::binary);
+#else // clang?
+ std::ofstream f(filepath.c_str(), std::ofstream::binary);
+#endif
+#else
+ std::ofstream f(filepath.c_str(), std::ofstream::binary);
+#endif
+ if (!f) {
+ if (err) {
+ (*err) += "File open error for writing : " + filepath + "\n";
+ }
+ return false;
+ }
+
+ f.write(reinterpret_cast<const char *>(&contents.at(0)),
+ static_cast<std::streamsize>(contents.size()));
+ if (!f) {
+ if (err) {
+ (*err) += "File write error: " + filepath + "\n";
+ }
+ return false;
+ }
+
+ return true;
+}
+
+#endif // TINYGLTF_NO_FS
+
+static std::string MimeToExt(const std::string &mimeType) {
+ if (mimeType == "image/jpeg") {
+ return "jpg";
+ } else if (mimeType == "image/png") {
+ return "png";
+ } else if (mimeType == "image/bmp") {
+ return "bmp";
+ } else if (mimeType == "image/gif") {
+ return "gif";
+ }
+
+ return "";
+}
+
+static void UpdateImageObject(Image &image, std::string &baseDir, int index,
+ bool embedImages,
+ WriteImageDataFunction *WriteImageData = nullptr,
+ void *user_data = nullptr) {
+ std::string filename;
+ std::string ext;
+ // If the image has a URI, use it as the filename
+ if (image.uri.size()) {
+ filename = GetBaseFilename(image.uri);
+ ext = GetFilePathExtension(filename);
+ } else if (image.bufferView != -1) {
+ // If there's no URI and the data exists in a buffer,
+ // don't change properties or write images
+ } else if (image.name.size()) {
+ // Otherwise use the image name as the filename.
+ ext = MimeToExt(image.mimeType);
+ filename = image.name + "." + ext;
+ } else {
+ // Fall back to the image index as the filename.
+ ext = MimeToExt(image.mimeType);
+ filename = std::to_string(index) + "." + ext;
+ }
+
+ // If a write callback is set, let it modify the image data object. Guard
+ // the pointer itself before dereferencing it: the parameter defaults to
+ // nullptr.
+ if (WriteImageData != nullptr && *WriteImageData != nullptr &&
+ !filename.empty()) {
+ (*WriteImageData)(&baseDir, &filename, &image, embedImages, user_data);
+ }
+}
+
+bool IsDataURI(const std::string &in) {
+ std::string header = "data:application/octet-stream;base64,";
+ if (in.find(header) == 0) {
+ return true;
+ }
+
+ header = "data:image/jpeg;base64,";
+ if (in.find(header) == 0) {
+ return true;
+ }
+
+ header = "data:image/png;base64,";
+ if (in.find(header) == 0) {
+ return true;
+ }
+
+ header = "data:image/bmp;base64,";
+ if (in.find(header) == 0) {
+ return true;
+ }
+
+ header = "data:image/gif;base64,";
+ if (in.find(header) == 0) {
+ return true;
+ }
+
+ header = "data:text/plain;base64,";
+ if (in.find(header) == 0) {
+ return true;
+ }
+
+ header = "data:application/gltf-buffer;base64,";
+ if (in.find(header) == 0) {
+ return true;
+ }
+
+ return false;
+}
+
+bool DecodeDataURI(std::vector<unsigned char> *out, std::string &mime_type,
+ const std::string &in, size_t reqBytes, bool checkSize) {
+ std::string header = "data:application/octet-stream;base64,";
+ std::string data;
+ if (in.find(header) == 0) {
+ data = base64_decode(in.substr(header.size())); // cut mime string.
+ }
+
+ if (data.empty()) {
+ header = "data:image/jpeg;base64,";
+ if (in.find(header) == 0) {
+ mime_type = "image/jpeg";
+ data = base64_decode(in.substr(header.size())); // cut mime string.
+ }
+ }
+
+ if (data.empty()) {
+ header = "data:image/png;base64,";
+ if (in.find(header) == 0) {
+ mime_type = "image/png";
+ data = base64_decode(in.substr(header.size())); // cut mime string.
+ }
+ }
+
+ if (data.empty()) {
+ header = "data:image/bmp;base64,";
+ if (in.find(header) == 0) {
+ mime_type = "image/bmp";
+ data = base64_decode(in.substr(header.size())); // cut mime string.
+ }
+ }
+
+ if (data.empty()) {
+ header = "data:image/gif;base64,";
+ if (in.find(header) == 0) {
+ mime_type = "image/gif";
+ data = base64_decode(in.substr(header.size())); // cut mime string.
+ }
+ }
+
+ if (data.empty()) {
+ header = "data:text/plain;base64,";
+ if (in.find(header) == 0) {
+ mime_type = "text/plain";
+ data = base64_decode(in.substr(header.size()));
+ }
+ }
+
+ if (data.empty()) {
+ header = "data:application/gltf-buffer;base64,";
+ if (in.find(header) == 0) {
+ data = base64_decode(in.substr(header.size()));
+ }
+ }
+
+ // TODO(syoyo): Allow empty buffer? #229
+ if (data.empty()) {
+ return false;
+ }
+
+ if (checkSize) {
+ if (data.size() != reqBytes) {
+ return false;
+ }
+ out->resize(reqBytes);
+ } else {
+ out->resize(data.size());
+ }
+ std::copy(data.begin(), data.end(), out->begin());
+ return true;
+}
+
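+// A sketch of DecodeDataURI on an embedded buffer (illustrative only; the
+// payload is the base64 encoding of the four bytes 'g','l','T','F'):
+#if 0
+static void DataUriExample() {
+ const std::string uri = "data:application/octet-stream;base64,Z2xURg==";
+ std::vector<unsigned char> bytes;
+ std::string mime;
+ // checkSize=true requires the decoded payload to be exactly reqBytes long.
+ bool ok = DecodeDataURI(&bytes, mime, uri, /* reqBytes */ 4,
+ /* checkSize */ true);
+ // ok == true, bytes == {'g','l','T','F'}; `mime` stays empty for
+ // application/octet-stream in this implementation.
+ (void)ok;
+}
+#endif
+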
+namespace {
+bool GetInt(const json &o, int &val) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ if (!o.IsDouble()) {
+ if (o.IsInt()) {
+ val = o.GetInt();
+ return true;
+ } else if (o.IsUint()) {
+ val = static_cast<int>(o.GetUint());
+ return true;
+ } else if (o.IsInt64()) {
+ val = static_cast<int>(o.GetInt64());
+ return true;
+ } else if (o.IsUint64()) {
+ val = static_cast<int>(o.GetUint64());
+ return true;
+ }
+ }
+
+ return false;
+#else
+ auto type = o.type();
+
+ if ((type == json::value_t::number_integer) ||
+ (type == json::value_t::number_unsigned)) {
+ val = static_cast<int>(o.get<int64_t>());
+ return true;
+ }
+
+ return false;
+#endif
+}
+
+#ifdef TINYGLTF_USE_RAPIDJSON
+bool GetDouble(const json &o, double &val) {
+ if (o.IsDouble()) {
+ val = o.GetDouble();
+ return true;
+ }
+
+ return false;
+}
+#endif
+
+bool GetNumber(const json &o, double &val) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ if (o.IsNumber()) {
+ val = o.GetDouble();
+ return true;
+ }
+
+ return false;
+#else
+ if (o.is_number()) {
+ val = o.get<double>();
+ return true;
+ }
+
+ return false;
+#endif
+}
+
+bool GetString(const json &o, std::string &val) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ if (o.IsString()) {
+ val = o.GetString();
+ return true;
+ }
+
+ return false;
+#else
+ if (o.type() == json::value_t::string) {
+ val = o.get<std::string>();
+ return true;
+ }
+
+ return false;
+#endif
+}
+
+bool IsArray(const json &o) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ return o.IsArray();
+#else
+ return o.is_array();
+#endif
+}
+
+json_const_array_iterator ArrayBegin(const json &o) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ return o.Begin();
+#else
+ return o.begin();
+#endif
+}
+
+json_const_array_iterator ArrayEnd(const json &o) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ return o.End();
+#else
+ return o.end();
+#endif
+}
+
+bool IsObject(const json &o) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ return o.IsObject();
+#else
+ return o.is_object();
+#endif
+}
+
+json_const_iterator ObjectBegin(const json &o) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ return o.MemberBegin();
+#else
+ return o.begin();
+#endif
+}
+
+json_const_iterator ObjectEnd(const json &o) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ return o.MemberEnd();
+#else
+ return o.end();
+#endif
+}
+
+// Making this a const char* results in a pointer to a temporary when
+// TINYGLTF_USE_RAPIDJSON is off.
+std::string GetKey(json_const_iterator &it) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ return it->name.GetString();
+#else
+ return it.key().c_str();
+#endif
+}
+
+bool FindMember(const json &o, const char *member, json_const_iterator &it) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ if (!o.IsObject()) {
+ return false;
+ }
+ it = o.FindMember(member);
+ return it != o.MemberEnd();
+#else
+ it = o.find(member);
+ return it != o.end();
+#endif
+}
+
+const json &GetValue(json_const_iterator &it) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ return it->value;
+#else
+ return it.value();
+#endif
+}
+
+std::string JsonToString(const json &o, int spacing = -1) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ using namespace rapidjson;
+ StringBuffer buffer;
+ if (spacing == -1) {
+ Writer<StringBuffer> writer(buffer);
+ o.Accept(writer);
+ } else {
+ PrettyWriter<StringBuffer> writer(buffer);
+ writer.SetIndent(' ', uint32_t(spacing));
+ o.Accept(writer);
+ }
+ return buffer.GetString();
+#else
+ return o.dump(spacing);
+#endif
+}
+
+} // namespace
+
+static bool ParseJsonAsValue(Value *ret, const json &o) {
+ Value val{};
+#ifdef TINYGLTF_USE_RAPIDJSON
+ using rapidjson::Type;
+ switch (o.GetType()) {
+ case Type::kObjectType: {
+ Value::Object value_object;
+ for (auto it = o.MemberBegin(); it != o.MemberEnd(); ++it) {
+ Value entry;
+ ParseJsonAsValue(&entry, it->value);
+ if (entry.Type() != NULL_TYPE)
+ value_object.emplace(GetKey(it), std::move(entry));
+ }
+ if (value_object.size() > 0) val = Value(std::move(value_object));
+ } break;
+ case Type::kArrayType: {
+ Value::Array value_array;
+ value_array.reserve(o.Size());
+ for (auto it = o.Begin(); it != o.End(); ++it) {
+ Value entry;
+ ParseJsonAsValue(&entry, *it);
+ if (entry.Type() != NULL_TYPE)
+ value_array.emplace_back(std::move(entry));
+ }
+ if (value_array.size() > 0) val = Value(std::move(value_array));
+ } break;
+ case Type::kStringType:
+ val = Value(std::string(o.GetString()));
+ break;
+ case Type::kFalseType:
+ case Type::kTrueType:
+ val = Value(o.GetBool());
+ break;
+ case Type::kNumberType:
+ if (!o.IsDouble()) {
+ int i = 0;
+ GetInt(o, i);
+ val = Value(i);
+ } else {
+ double d = 0.0;
+ GetDouble(o, d);
+ val = Value(d);
+ }
+ break;
+ case Type::kNullType:
+ break;
+ // all types are covered, so no `case default`
+ }
+#else
+ switch (o.type()) {
+ case json::value_t::object: {
+ Value::Object value_object;
+ for (auto it = o.begin(); it != o.end(); it++) {
+ Value entry;
+ ParseJsonAsValue(&entry, it.value());
+ if (entry.Type() != NULL_TYPE)
+ value_object.emplace(it.key(), std::move(entry));
+ }
+ if (value_object.size() > 0) val = Value(std::move(value_object));
+ } break;
+ case json::value_t::array: {
+ Value::Array value_array;
+ value_array.reserve(o.size());
+ for (auto it = o.begin(); it != o.end(); it++) {
+ Value entry;
+ ParseJsonAsValue(&entry, it.value());
+ if (entry.Type() != NULL_TYPE)
+ value_array.emplace_back(std::move(entry));
+ }
+ if (value_array.size() > 0) val = Value(std::move(value_array));
+ } break;
+ case json::value_t::string:
+ val = Value(o.get<std::string>());
+ break;
+ case json::value_t::boolean:
+ val = Value(o.get<bool>());
+ break;
+ case json::value_t::number_integer:
+ case json::value_t::number_unsigned:
+ val = Value(static_cast<int>(o.get<int64_t>()));
+ break;
+ case json::value_t::number_float:
+ val = Value(o.get<double>());
+ break;
+ case json::value_t::null:
+ case json::value_t::discarded:
+ // default:
+ break;
+ }
+#endif
+ if (ret) *ret = std::move(val);
+
+ return val.Type() != NULL_TYPE;
+}
+
+static bool ParseExtrasProperty(Value *ret, const json &o) {
+ json_const_iterator it;
+ if (!FindMember(o, "extras", it)) {
+ return false;
+ }
+
+ return ParseJsonAsValue(ret, GetValue(it));
+}
+
+static bool ParseBooleanProperty(bool *ret, std::string *err, const json &o,
+ const std::string &property,
+ const bool required,
+ const std::string &parent_node = "") {
+ json_const_iterator it;
+ if (!FindMember(o, property.c_str(), it)) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is missing";
+ if (!parent_node.empty()) {
+ (*err) += " in " + parent_node;
+ }
+ (*err) += ".\n";
+ }
+ }
+ return false;
+ }
+
+ auto &value = GetValue(it);
+
+ bool isBoolean;
+ bool boolValue = false;
+#ifdef TINYGLTF_USE_RAPIDJSON
+ isBoolean = value.IsBool();
+ if (isBoolean) {
+ boolValue = value.GetBool();
+ }
+#else
+ isBoolean = value.is_boolean();
+ if (isBoolean) {
+ boolValue = value.get<bool>();
+ }
+#endif
+ if (!isBoolean) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is not a bool type.\n";
+ }
+ }
+ return false;
+ }
+
+ if (ret) {
+ (*ret) = boolValue;
+ }
+
+ return true;
+}
+
+static bool ParseIntegerProperty(int *ret, std::string *err, const json &o,
+ const std::string &property,
+ const bool required,
+ const std::string &parent_node = "") {
+ json_const_iterator it;
+ if (!FindMember(o, property.c_str(), it)) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is missing";
+ if (!parent_node.empty()) {
+ (*err) += " in " + parent_node;
+ }
+ (*err) += ".\n";
+ }
+ }
+ return false;
+ }
+
+ int intValue;
+ bool isInt = GetInt(GetValue(it), intValue);
+ if (!isInt) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is not an integer type.\n";
+ }
+ }
+ return false;
+ }
+
+ if (ret) {
+ (*ret) = intValue;
+ }
+
+ return true;
+}
+
+static bool ParseUnsignedProperty(size_t *ret, std::string *err, const json &o,
+ const std::string &property,
+ const bool required,
+ const std::string &parent_node = "") {
+ json_const_iterator it;
+ if (!FindMember(o, property.c_str(), it)) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is missing";
+ if (!parent_node.empty()) {
+ (*err) += " in " + parent_node;
+ }
+ (*err) += ".\n";
+ }
+ }
+ return false;
+ }
+
+ auto &value = GetValue(it);
+
+ size_t uValue = 0;
+ bool isUValue;
+#ifdef TINYGLTF_USE_RAPIDJSON
+ isUValue = false;
+ if (value.IsUint()) {
+ uValue = value.GetUint();
+ isUValue = true;
+ } else if (value.IsUint64()) {
+ uValue = value.GetUint64();
+ isUValue = true;
+ }
+#else
+ isUValue = value.is_number_unsigned();
+ if (isUValue) {
+ uValue = value.get<size_t>();
+ }
+#endif
+ if (!isUValue) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is not a positive integer.\n";
+ }
+ }
+ return false;
+ }
+
+ if (ret) {
+ (*ret) = uValue;
+ }
+
+ return true;
+}
+
+static bool ParseNumberProperty(double *ret, std::string *err, const json &o,
+ const std::string &property,
+ const bool required,
+ const std::string &parent_node = "") {
+ json_const_iterator it;
+
+ if (!FindMember(o, property.c_str(), it)) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is missing";
+ if (!parent_node.empty()) {
+ (*err) += " in " + parent_node;
+ }
+ (*err) += ".\n";
+ }
+ }
+ return false;
+ }
+
+ double numberValue;
+ bool isNumber = GetNumber(GetValue(it), numberValue);
+
+ if (!isNumber) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is not a number type.\n";
+ }
+ }
+ return false;
+ }
+
+ if (ret) {
+ (*ret) = numberValue;
+ }
+
+ return true;
+}
+
+static bool ParseNumberArrayProperty(std::vector<double> *ret, std::string *err,
+ const json &o, const std::string &property,
+ bool required,
+ const std::string &parent_node = "") {
+ json_const_iterator it;
+ if (!FindMember(o, property.c_str(), it)) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is missing";
+ if (!parent_node.empty()) {
+ (*err) += " in " + parent_node;
+ }
+ (*err) += ".\n";
+ }
+ }
+ return false;
+ }
+
+ if (!IsArray(GetValue(it))) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is not an array";
+ if (!parent_node.empty()) {
+ (*err) += " in " + parent_node;
+ }
+ (*err) += ".\n";
+ }
+ }
+ return false;
+ }
+
+ ret->clear();
+ auto end = ArrayEnd(GetValue(it));
+ for (auto i = ArrayBegin(GetValue(it)); i != end; ++i) {
+ double numberValue;
+ const bool isNumber = GetNumber(*i, numberValue);
+ if (!isNumber) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is not a number.\n";
+ if (!parent_node.empty()) {
+ (*err) += " in " + parent_node;
+ }
+ (*err) += ".\n";
+ }
+ }
+ return false;
+ }
+ ret->push_back(numberValue);
+ }
+
+ return true;
+}
+
+static bool ParseIntegerArrayProperty(std::vector<int> *ret, std::string *err,
+ const json &o,
+ const std::string &property,
+ bool required,
+ const std::string &parent_node = "") {
+ json_const_iterator it;
+ if (!FindMember(o, property.c_str(), it)) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is missing";
+ if (!parent_node.empty()) {
+ (*err) += " in " + parent_node;
+ }
+ (*err) += ".\n";
+ }
+ }
+ return false;
+ }
+
+ if (!IsArray(GetValue(it))) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is not an array";
+ if (!parent_node.empty()) {
+ (*err) += " in " + parent_node;
+ }
+ (*err) += ".\n";
+ }
+ }
+ return false;
+ }
+
+ ret->clear();
+ auto end = ArrayEnd(GetValue(it));
+ for (auto i = ArrayBegin(GetValue(it)); i != end; ++i) {
+ int numberValue;
+ bool isNumber = GetInt(*i, numberValue);
+ if (!isNumber) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is not an integer type.\n";
+ if (!parent_node.empty()) {
+ (*err) += " in " + parent_node;
+ }
+ (*err) += ".\n";
+ }
+ }
+ return false;
+ }
+ ret->push_back(numberValue);
+ }
+
+ return true;
+}
+
+static bool ParseStringProperty(
+ std::string *ret, std::string *err, const json &o,
+ const std::string &property, bool required,
+ const std::string &parent_node = std::string()) {
+ json_const_iterator it;
+ if (!FindMember(o, property.c_str(), it)) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is missing";
+ if (parent_node.empty()) {
+ (*err) += ".\n";
+ } else {
+ (*err) += " in `" + parent_node + "'.\n";
+ }
+ }
+ }
+ return false;
+ }
+
+ std::string strValue;
+ if (!GetString(GetValue(it), strValue)) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is not a string type.\n";
+ }
+ }
+ return false;
+ }
+
+ if (ret) {
+ (*ret) = std::move(strValue);
+ }
+
+ return true;
+}
+
+static bool ParseStringIntegerProperty(std::map<std::string, int> *ret,
+ std::string *err, const json &o,
+ const std::string &property,
+ bool required,
+ const std::string &parent = "") {
+ json_const_iterator it;
+ if (!FindMember(o, property.c_str(), it)) {
+ if (required) {
+ if (err) {
+ if (!parent.empty()) {
+ (*err) +=
+ "'" + property + "' property is missing in " + parent + ".\n";
+ } else {
+ (*err) += "'" + property + "' property is missing.\n";
+ }
+ }
+ }
+ return false;
+ }
+
+ const json &dict = GetValue(it);
+
+ // Make sure we are dealing with an object / dictionary.
+ if (!IsObject(dict)) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is not an object.\n";
+ }
+ }
+ return false;
+ }
+
+ ret->clear();
+
+ json_const_iterator dictIt(ObjectBegin(dict));
+ json_const_iterator dictItEnd(ObjectEnd(dict));
+
+ for (; dictIt != dictItEnd; ++dictIt) {
+ int intVal;
+ if (!GetInt(GetValue(dictIt), intVal)) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' value is not an integer type.\n";
+ }
+ }
+ return false;
+ }
+
+ // Insert into the list.
+ (*ret)[GetKey(dictIt)] = intVal;
+ }
+ return true;
+}
+
+static bool ParseJSONProperty(std::map<std::string, double> *ret,
+ std::string *err, const json &o,
+ const std::string &property, bool required) {
+ json_const_iterator it;
+ if (!FindMember(o, property.c_str(), it)) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is missing. \n'";
+ }
+ }
+ return false;
+ }
+
+ const json &obj = GetValue(it);
+
+ if (!IsObject(obj)) {
+ if (required) {
+ if (err) {
+ (*err) += "'" + property + "' property is not a JSON object.\n";
+ }
+ }
+ return false;
+ }
+
+ ret->clear();
+
+ json_const_iterator it2(ObjectBegin(obj));
+ json_const_iterator itEnd(ObjectEnd(obj));
+ for (; it2 != itEnd; ++it2) {
+ double numVal;
+ if (GetNumber(GetValue(it2), numVal))
+ ret->emplace(std::string(GetKey(it2)), numVal);
+ }
+
+ return true;
+}
+
+static bool ParseParameterProperty(Parameter *param, std::string *err,
+ const json &o, const std::string &prop,
+ bool required) {
+ // A parameter value can be a string, a number array, a single number, a
+ // JSON object of numbers, or a boolean; each parser is tried in turn.
+ // Granted, this complicates the Parameter structure and breaks it
+ // semantically in the sense that the client probably works off the
+ // assumption that if the string is empty the vector is used, etc. Would a
+ // tagged union work?
+ if (ParseStringProperty(&param->string_value, err, o, prop, false)) {
+ // Found string property.
+ return true;
+ } else if (ParseNumberArrayProperty(&param->number_array, err, o, prop,
+ false)) {
+ // Found a number array.
+ return true;
+ } else if (ParseNumberProperty(&param->number_value, err, o, prop, false)) {
+ return param->has_number_value = true;
+ } else if (ParseJSONProperty(&param->json_double_value, err, o, prop,
+ false)) {
+ return true;
+ } else if (ParseBooleanProperty(&param->bool_value, err, o, prop, false)) {
+ return true;
+ } else {
+ if (required) {
+ if (err) {
+ (*err) += "parameter must be a string or number / number array.\n";
+ }
+ }
+ return false;
+ }
+}
+
+static bool ParseExtensionsProperty(ExtensionMap *ret, std::string *err,
+ const json &o) {
+ (void)err;
+
+ json_const_iterator it;
+ if (!FindMember(o, "extensions", it)) {
+ return false;
+ }
+
+ auto &obj = GetValue(it);
+ if (!IsObject(obj)) {
+ return false;
+ }
+ ExtensionMap extensions;
+ json_const_iterator extIt = ObjectBegin(obj); // it.value().begin();
+ json_const_iterator extEnd = ObjectEnd(obj);
+ for (; extIt != extEnd; ++extIt) {
+ auto &itObj = GetValue(extIt);
+ if (!IsObject(itObj)) continue;
+ std::string key(GetKey(extIt));
+ if (!ParseJsonAsValue(&extensions[key], itObj)) {
+ if (!key.empty()) {
+ // create empty object so that an extension object is still of type
+ // object
+ extensions[key] = Value{Value::Object{}};
+ }
+ }
+ }
+ if (ret) {
+ (*ret) = std::move(extensions);
+ }
+ return true;
+}
+
+static bool ParseAsset(Asset *asset, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ ParseStringProperty(&asset->version, err, o, "version", true, "Asset");
+ ParseStringProperty(&asset->generator, err, o, "generator", false, "Asset");
+ ParseStringProperty(&asset->minVersion, err, o, "minVersion", false, "Asset");
+ ParseStringProperty(&asset->copyright, err, o, "copyright", false, "Asset");
+
+ ParseExtensionsProperty(&asset->extensions, err, o);
+
+ // Unity exporter version is added as extra here
+ ParseExtrasProperty(&(asset->extras), o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ asset->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ asset->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
+
+static bool ParseImage(Image *image, const int image_idx, std::string *err,
+ std::string *warn, const json &o,
+ bool store_original_json_for_extras_and_extensions,
+ const std::string &basedir, FsCallbacks *fs,
+ LoadImageDataFunction *LoadImageData = nullptr,
+ void *load_image_user_data = nullptr) {
+ // A glTF image must either reference a bufferView or an image uri
+
+ // schema says oneOf [`bufferView`, `uri`]
+ // TODO(syoyo): Check the type of each parameter.
+ json_const_iterator it;
+ bool hasBufferView = FindMember(o, "bufferView", it);
+ bool hasURI = FindMember(o, "uri", it);
+
+ ParseStringProperty(&image->name, err, o, "name", false);
+
+ if (hasBufferView && hasURI) {
+ // Both must not be defined at the same time.
+ if (err) {
+ (*err) +=
+ "Only one of `bufferView` or `uri` should be defined, but both are "
+ "defined for image[" +
+ std::to_string(image_idx) + "] name = \"" + image->name + "\"\n";
+ }
+ return false;
+ }
+
+ if (!hasBufferView && !hasURI) {
+ if (err) {
+ (*err) += "Neither required `bufferView` nor `uri` defined for image[" +
+ std::to_string(image_idx) + "] name = \"" + image->name +
+ "\"\n";
+ }
+ return false;
+ }
+
+ ParseExtensionsProperty(&image->extensions, err, o);
+ ParseExtrasProperty(&image->extras, o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator eit;
+ if (FindMember(o, "extensions", eit)) {
+ image->extensions_json_string = JsonToString(GetValue(eit));
+ }
+ }
+ {
+ json_const_iterator eit;
+ if (FindMember(o, "extras", eit)) {
+ image->extras_json_string = JsonToString(GetValue(eit));
+ }
+ }
+ }
+
+ if (hasBufferView) {
+ int bufferView = -1;
+ if (!ParseIntegerProperty(&bufferView, err, o, "bufferView", true)) {
+ if (err) {
+ (*err) += "Failed to parse `bufferView` for image[" +
+ std::to_string(image_idx) + "] name = \"" + image->name +
+ "\"\n";
+ }
+ return false;
+ }
+
+ std::string mime_type;
+ ParseStringProperty(&mime_type, err, o, "mimeType", false);
+
+ int width = 0;
+ ParseIntegerProperty(&width, err, o, "width", false);
+
+ int height = 0;
+ ParseIntegerProperty(&height, err, o, "height", false);
+
+ // Only save some information here; loading the actual image data from the
+ // bufferView is done after this `ParseImage` function.
+ image->bufferView = bufferView;
+ image->mimeType = mime_type;
+ image->width = width;
+ image->height = height;
+
+ return true;
+ }
+
+ // Parse URI & Load image data.
+
+ std::string uri;
+ std::string tmp_err;
+ if (!ParseStringProperty(&uri, &tmp_err, o, "uri", true)) {
+ if (err) {
+ (*err) += "Failed to parse `uri` for image[" + std::to_string(image_idx) +
+ "] name = \"" + image->name + "\".\n";
+ }
+ return false;
+ }
+
+ std::vector<unsigned char> img;
+
+ if (IsDataURI(uri)) {
+ if (!DecodeDataURI(&img, image->mimeType, uri, 0, false)) {
+ if (err) {
+ (*err) += "Failed to decode 'uri' for image[" +
+ std::to_string(image_idx) + "] name = [" + image->name +
+ "]\n";
+ }
+ return false;
+ }
+ } else {
+ // Assume external file
+ // Keep texture path (for textures that cannot be decoded)
+ image->uri = uri;
+#ifdef TINYGLTF_NO_EXTERNAL_IMAGE
+ return true;
+#endif
+ std::string decoded_uri = dlib::urldecode(uri);
+ if (!LoadExternalFile(&img, err, warn, decoded_uri, basedir,
+ /* required */ false, /* required bytes */ 0,
+ /* checksize */ false, fs)) {
+ if (warn) {
+ (*warn) += "Failed to load external 'uri' for image[" +
+ std::to_string(image_idx) + "] name = [" + image->name +
+ "]\n";
+ }
+ // If the image cannot be loaded, keep uri as image->uri.
+ return true;
+ }
+
+ if (img.empty()) {
+ if (warn) {
+ (*warn) += "Image data is empty for image[" +
+ std::to_string(image_idx) + "] name = [" + image->name +
+ "] \n";
+ }
+ return false;
+ }
+ }
+
+ // Guard the pointer itself before dereferencing it: the parameter defaults
+ // to nullptr.
+ if (LoadImageData == nullptr || *LoadImageData == nullptr) {
+ if (err) {
+ (*err) += "No LoadImageData callback specified.\n";
+ }
+ return false;
+ }
+ return (*LoadImageData)(image, image_idx, err, warn, 0, 0, &img.at(0),
+ static_cast<int>(img.size()), load_image_user_data);
+}
+
+static bool ParseTexture(Texture *texture, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions,
+ const std::string &basedir) {
+ (void)basedir;
+ int sampler = -1;
+ int source = -1;
+ ParseIntegerProperty(&sampler, err, o, "sampler", false);
+
+ ParseIntegerProperty(&source, err, o, "source", false);
+
+ texture->sampler = sampler;
+ texture->source = source;
+
+ ParseExtensionsProperty(&texture->extensions, err, o);
+ ParseExtrasProperty(&texture->extras, o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ texture->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ texture->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ ParseStringProperty(&texture->name, err, o, "name", false);
+
+ return true;
+}
+
+static bool ParseTextureInfo(
+ TextureInfo *texinfo, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ if (texinfo == nullptr) {
+ return false;
+ }
+
+ if (!ParseIntegerProperty(&texinfo->index, err, o, "index",
+ /* required */ true, "TextureInfo")) {
+ return false;
+ }
+
+ ParseIntegerProperty(&texinfo->texCoord, err, o, "texCoord", false);
+
+ ParseExtensionsProperty(&texinfo->extensions, err, o);
+ ParseExtrasProperty(&texinfo->extras, o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ texinfo->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ texinfo->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
+
+static bool ParseNormalTextureInfo(
+ NormalTextureInfo *texinfo, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ if (texinfo == nullptr) {
+ return false;
+ }
+
+ if (!ParseIntegerProperty(&texinfo->index, err, o, "index",
+ /* required */ true, "NormalTextureInfo")) {
+ return false;
+ }
+
+ ParseIntegerProperty(&texinfo->texCoord, err, o, "texCoord", false);
+ ParseNumberProperty(&texinfo->scale, err, o, "scale", false);
+
+ ParseExtensionsProperty(&texinfo->extensions, err, o);
+ ParseExtrasProperty(&texinfo->extras, o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ texinfo->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ texinfo->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
+
+static bool ParseOcclusionTextureInfo(
+ OcclusionTextureInfo *texinfo, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ if (texinfo == nullptr) {
+ return false;
+ }
+
+ if (!ParseIntegerProperty(&texinfo->index, err, o, "index",
+ /* required */ true, "OcclusionTextureInfo")) {
+ return false;
+ }
+
+ ParseIntegerProperty(&texinfo->texCoord, err, o, "texCoord", false);
+ ParseNumberProperty(&texinfo->strength, err, o, "strength", false);
+
+ ParseExtensionsProperty(&texinfo->extensions, err, o);
+ ParseExtrasProperty(&texinfo->extras, o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ texinfo->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ texinfo->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
+
+static bool ParseBuffer(Buffer *buffer, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions,
+ FsCallbacks *fs, const std::string &basedir,
+ bool is_binary = false,
+ const unsigned char *bin_data = nullptr,
+ size_t bin_size = 0) {
+ size_t byteLength;
+ if (!ParseUnsignedProperty(&byteLength, err, o, "byteLength", true,
+ "Buffer")) {
+ return false;
+ }
+
+ // In glTF 2.0, uri is not mandatory anymore
+ buffer->uri.clear();
+ ParseStringProperty(&buffer->uri, err, o, "uri", false, "Buffer");
+
+ // An empty URI for a non-embedded (non-binary) buffer is not valid.
+ if (!is_binary && buffer->uri.empty()) {
+ if (err) {
+ (*err) += "'uri' is missing from non binary glTF file buffer.\n";
+ }
+ }
+
+ json_const_iterator type;
+ if (FindMember(o, "type", type)) {
+ std::string typeStr;
+ if (GetString(GetValue(type), typeStr)) {
+ if (typeStr.compare("arraybuffer") == 0) {
+ // buffer.type = "arraybuffer";
+ }
+ }
+ }
+
+ if (is_binary) {
+ // Binary glTF still accepts an external data URI.
+ if (!buffer->uri.empty()) {
+ // First try embedded data URI.
+ if (IsDataURI(buffer->uri)) {
+ std::string mime_type;
+ if (!DecodeDataURI(&buffer->data, mime_type, buffer->uri, byteLength,
+ true)) {
+ if (err) {
+ (*err) +=
+ "Failed to decode 'uri' : " + buffer->uri + " in Buffer\n";
+ }
+ return false;
+ }
+ } else {
+ // External .bin file.
+ std::string decoded_uri = dlib::urldecode(buffer->uri);
+ if (!LoadExternalFile(&buffer->data, err, /* warn */ nullptr,
+ decoded_uri, basedir, /* required */ true,
+ byteLength, /* checkSize */ true, fs)) {
+ return false;
+ }
+ }
+ } else {
+ // load data from (embedded) binary data
+
+ if ((bin_size == 0) || (bin_data == nullptr)) {
+ if (err) {
+ (*err) += "Invalid binary data in `Buffer'.\n";
+ }
+ return false;
+ }
+
+ if (byteLength > bin_size) {
+ if (err) {
+ std::stringstream ss;
+ ss << "Invalid `byteLength'. Must be equal or less than binary size: "
+ "`byteLength' = "
+ << byteLength << ", binary size = " << bin_size << std::endl;
+ (*err) += ss.str();
+ }
+ return false;
+ }
+
+ // Read buffer data
+ buffer->data.resize(static_cast<size_t>(byteLength));
+ memcpy(&(buffer->data.at(0)), bin_data, static_cast<size_t>(byteLength));
+ }
+
+ } else {
+ if (IsDataURI(buffer->uri)) {
+ std::string mime_type;
+ if (!DecodeDataURI(&buffer->data, mime_type, buffer->uri, byteLength,
+ true)) {
+ if (err) {
+ (*err) += "Failed to decode 'uri' : " + buffer->uri + " in Buffer\n";
+ }
+ return false;
+ }
+ } else {
+ // Assume external .bin file.
+ std::string decoded_uri = dlib::urldecode(buffer->uri);
+ if (!LoadExternalFile(&buffer->data, err, /* warn */ nullptr, decoded_uri,
+ basedir, /* required */ true, byteLength,
+ /* checkSize */ true, fs)) {
+ return false;
+ }
+ }
+ }
+
+ ParseStringProperty(&buffer->name, err, o, "name", false);
+
+ ParseExtensionsProperty(&buffer->extensions, err, o);
+ ParseExtrasProperty(&buffer->extras, o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ buffer->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ buffer->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
+
+static bool ParseBufferView(
+ BufferView *bufferView, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ int buffer = -1;
+ if (!ParseIntegerProperty(&buffer, err, o, "buffer", true, "BufferView")) {
+ return false;
+ }
+
+ size_t byteOffset = 0;
+ ParseUnsignedProperty(&byteOffset, err, o, "byteOffset", false);
+
+ size_t byteLength = 1;
+ if (!ParseUnsignedProperty(&byteLength, err, o, "byteLength", true,
+ "BufferView")) {
+ return false;
+ }
+
+ size_t byteStride = 0;
+ if (!ParseUnsignedProperty(&byteStride, err, o, "byteStride", false)) {
+ // Spec says: When byteStride of referenced bufferView is not defined, it
+ // means that accessor elements are tightly packed, i.e., effective stride
+ // equals the size of the element.
+ // We cannot determine the actual byteStride until Accessors are parsed, so
+ // set 0 (= tightly packed) here (as done for OpenGL's
+ // glVertexAttribPointer).
+ byteStride = 0;
+ }
+
+ if ((byteStride > 252) || ((byteStride % 4) != 0)) {
+ if (err) {
+ std::stringstream ss;
+ ss << "Invalid `byteStride' value. `byteStride' must be the multiple of "
+ "4 : "
+ << byteStride << std::endl;
+
+ (*err) += ss.str();
+ }
+ return false;
+ }
+
+ int target = 0;
+ ParseIntegerProperty(&target, err, o, "target", false);
+ if ((target == TINYGLTF_TARGET_ARRAY_BUFFER) ||
+ (target == TINYGLTF_TARGET_ELEMENT_ARRAY_BUFFER)) {
+ // OK
+ } else {
+ target = 0;
+ }
+ bufferView->target = target;
+
+ ParseStringProperty(&bufferView->name, err, o, "name", false);
+
+ ParseExtensionsProperty(&bufferView->extensions, err, o);
+ ParseExtrasProperty(&bufferView->extras, o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ bufferView->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ bufferView->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ bufferView->buffer = buffer;
+ bufferView->byteOffset = byteOffset;
+ bufferView->byteLength = byteLength;
+ bufferView->byteStride = byteStride;
+ return true;
+}
+
+static bool ParseSparseAccessor(Accessor *accessor, std::string *err,
+ const json &o) {
+ accessor->sparse.isSparse = true;
+
+ int count = 0;
+ ParseIntegerProperty(&count, err, o, "count", true);
+
+ json_const_iterator indices_iterator;
+ json_const_iterator values_iterator;
+ if (!FindMember(o, "indices", indices_iterator)) {
+ (*err) = "the sparse object of this accessor doesn't have indices";
+ return false;
+ }
+
+ if (!FindMember(o, "values", values_iterator)) {
+ (*err) = "the sparse object ob ths accessor doesn't have values";
+ return false;
+ }
+
+ const json &indices_obj = GetValue(indices_iterator);
+ const json &values_obj = GetValue(values_iterator);
+
+ int indices_buffer_view = 0, indices_byte_offset = 0, component_type = 0;
+ ParseIntegerProperty(&indices_buffer_view, err, indices_obj, "bufferView",
+ true);
+ ParseIntegerProperty(&indices_byte_offset, err, indices_obj, "byteOffset",
+ true);
+ ParseIntegerProperty(&component_type, err, indices_obj, "componentType",
+ true);
+
+ int values_buffer_view = 0, values_byte_offset = 0;
+ ParseIntegerProperty(&values_buffer_view, err, values_obj, "bufferView",
+ true);
+ ParseIntegerProperty(&values_byte_offset, err, values_obj, "byteOffset",
+ true);
+
+ accessor->sparse.count = count;
+ accessor->sparse.indices.bufferView = indices_buffer_view;
+ accessor->sparse.indices.byteOffset = indices_byte_offset;
+ accessor->sparse.indices.componentType = component_type;
+ accessor->sparse.values.bufferView = values_buffer_view;
+ accessor->sparse.values.byteOffset = values_byte_offset;
+
+ // TODO: validate these values
+
+ return true;
+}
+
+static bool ParseAccessor(Accessor *accessor, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ int bufferView = -1;
+ ParseIntegerProperty(&bufferView, err, o, "bufferView", false, "Accessor");
+
+ size_t byteOffset = 0;
+ ParseUnsignedProperty(&byteOffset, err, o, "byteOffset", false, "Accessor");
+
+ bool normalized = false;
+ ParseBooleanProperty(&normalized, err, o, "normalized", false, "Accessor");
+
+ size_t componentType = 0;
+ if (!ParseUnsignedProperty(&componentType, err, o, "componentType", true,
+ "Accessor")) {
+ return false;
+ }
+
+ size_t count = 0;
+ if (!ParseUnsignedProperty(&count, err, o, "count", true, "Accessor")) {
+ return false;
+ }
+
+ std::string type;
+ if (!ParseStringProperty(&type, err, o, "type", true, "Accessor")) {
+ return false;
+ }
+
+ if (type.compare("SCALAR") == 0) {
+ accessor->type = TINYGLTF_TYPE_SCALAR;
+ } else if (type.compare("VEC2") == 0) {
+ accessor->type = TINYGLTF_TYPE_VEC2;
+ } else if (type.compare("VEC3") == 0) {
+ accessor->type = TINYGLTF_TYPE_VEC3;
+ } else if (type.compare("VEC4") == 0) {
+ accessor->type = TINYGLTF_TYPE_VEC4;
+ } else if (type.compare("MAT2") == 0) {
+ accessor->type = TINYGLTF_TYPE_MAT2;
+ } else if (type.compare("MAT3") == 0) {
+ accessor->type = TINYGLTF_TYPE_MAT3;
+ } else if (type.compare("MAT4") == 0) {
+ accessor->type = TINYGLTF_TYPE_MAT4;
+ } else {
+ std::stringstream ss;
+ ss << "Unsupported `type` for accessor object. Got \"" << type << "\"\n";
+ if (err) {
+ (*err) += ss.str();
+ }
+ return false;
+ }
+
+ ParseStringProperty(&accessor->name, err, o, "name", false);
+
+ accessor->minValues.clear();
+ accessor->maxValues.clear();
+ ParseNumberArrayProperty(&accessor->minValues, err, o, "min", false,
+ "Accessor");
+
+ ParseNumberArrayProperty(&accessor->maxValues, err, o, "max", false,
+ "Accessor");
+
+ accessor->count = count;
+ accessor->bufferView = bufferView;
+ accessor->byteOffset = byteOffset;
+ accessor->normalized = normalized;
+ {
+ if (componentType >= TINYGLTF_COMPONENT_TYPE_BYTE &&
+ componentType <= TINYGLTF_COMPONENT_TYPE_DOUBLE) {
+ // OK
+ accessor->componentType = int(componentType);
+ } else {
+ std::stringstream ss;
+ ss << "Invalid `componentType` in accessor. Got " << componentType
+ << "\n";
+ if (err) {
+ (*err) += ss.str();
+ }
+ return false;
+ }
+ }
+
+ ParseExtensionsProperty(&(accessor->extensions), err, o);
+ ParseExtrasProperty(&(accessor->extras), o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ accessor->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ accessor->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ // check if accessor has a "sparse" object:
+ json_const_iterator iterator;
+ if (FindMember(o, "sparse", iterator)) {
+ // here this accessor has a "sparse" subobject
+ return ParseSparseAccessor(accessor, err, GetValue(iterator));
+ }
+
+ return true;
+}
+
+#ifdef TINYGLTF_ENABLE_DRACO
+
+static void DecodeIndexBuffer(draco::Mesh *mesh, size_t componentSize,
+ std::vector<uint8_t> &outBuffer) {
+ if (componentSize == 4) {
+ assert(sizeof(mesh->face(draco::FaceIndex(0))[0]) == componentSize);
+ memcpy(outBuffer.data(), &mesh->face(draco::FaceIndex(0))[0],
+ outBuffer.size());
+ } else {
+ size_t faceStride = componentSize * 3;
+ for (draco::FaceIndex f(0); f < mesh->num_faces(); ++f) {
+ const draco::Mesh::Face &face = mesh->face(f);
+ if (componentSize == 2) {
+ uint16_t indices[3] = {(uint16_t)face[0].value(),
+ (uint16_t)face[1].value(),
+ (uint16_t)face[2].value()};
+ memcpy(outBuffer.data() + f.value() * faceStride, &indices[0],
+ faceStride);
+ } else {
+ uint8_t indices[3] = {(uint8_t)face[0].value(),
+ (uint8_t)face[1].value(),
+ (uint8_t)face[2].value()};
+ memcpy(outBuffer.data() + f.value() * faceStride, &indices[0],
+ faceStride);
+ }
+ }
+ }
+}
+
+template <typename T>
+static bool GetAttributeForAllPoints(draco::Mesh *mesh,
+ const draco::PointAttribute *pAttribute,
+ std::vector<uint8_t> &outBuffer) {
+ size_t byteOffset = 0;
+ T values[4] = {0, 0, 0, 0};
+ for (draco::PointIndex i(0); i < mesh->num_points(); ++i) {
+ const draco::AttributeValueIndex val_index = pAttribute->mapped_index(i);
+ if (!pAttribute->ConvertValue<T>(val_index, pAttribute->num_components(),
+ values))
+ return false;
+
+ memcpy(outBuffer.data() + byteOffset, &values[0],
+ sizeof(T) * pAttribute->num_components());
+ byteOffset += sizeof(T) * pAttribute->num_components();
+ }
+
+ return true;
+}
+
+static bool GetAttributeForAllPoints(uint32_t componentType, draco::Mesh *mesh,
+ const draco::PointAttribute *pAttribute,
+ std::vector<uint8_t> &outBuffer) {
+ bool decodeResult = false;
+ switch (componentType) {
+ case TINYGLTF_COMPONENT_TYPE_UNSIGNED_BYTE:
+ decodeResult =
+ GetAttributeForAllPoints<uint8_t>(mesh, pAttribute, outBuffer);
+ break;
+ case TINYGLTF_COMPONENT_TYPE_BYTE:
+ decodeResult =
+ GetAttributeForAllPoints<int8_t>(mesh, pAttribute, outBuffer);
+ break;
+ case TINYGLTF_COMPONENT_TYPE_UNSIGNED_SHORT:
+ decodeResult =
+ GetAttributeForAllPoints<uint16_t>(mesh, pAttribute, outBuffer);
+ break;
+ case TINYGLTF_COMPONENT_TYPE_SHORT:
+ decodeResult =
+ GetAttributeForAllPoints<int16_t>(mesh, pAttribute, outBuffer);
+ break;
+ case TINYGLTF_COMPONENT_TYPE_INT:
+ decodeResult =
+ GetAttributeForAllPoints<int32_t>(mesh, pAttribute, outBuffer);
+ break;
+ case TINYGLTF_COMPONENT_TYPE_UNSIGNED_INT:
+ decodeResult =
+ GetAttributeForAllPoints<uint32_t>(mesh, pAttribute, outBuffer);
+ break;
+ case TINYGLTF_COMPONENT_TYPE_FLOAT:
+ decodeResult =
+ GetAttributeForAllPoints<float>(mesh, pAttribute, outBuffer);
+ break;
+ case TINYGLTF_COMPONENT_TYPE_DOUBLE:
+ decodeResult =
+ GetAttributeForAllPoints<double>(mesh, pAttribute, outBuffer);
+ break;
+ default:
+ return false;
+ }
+
+ return decodeResult;
+}
+
+static bool ParseDracoExtension(Primitive *primitive, Model *model,
+ std::string *err,
+ const Value &dracoExtensionValue) {
+ (void)err;
+ auto bufferViewValue = dracoExtensionValue.Get("bufferView");
+ if (!bufferViewValue.IsInt()) return false;
+ auto attributesValue = dracoExtensionValue.Get("attributes");
+ if (!attributesValue.IsObject()) return false;
+
+ auto attributesObject = attributesValue.Get<Value::Object>();
+ int bufferView = bufferViewValue.Get<int>();
+
+ BufferView &view = model->bufferViews[bufferView];
+ Buffer &buffer = model->buffers[view.buffer];
+ // BufferView has already been decoded
+ if (view.dracoDecoded) return true;
+ view.dracoDecoded = true;
+
+ const char *bufferViewData =
+ reinterpret_cast<const char *>(buffer.data.data() + view.byteOffset);
+ size_t bufferViewSize = view.byteLength;
+
+ // decode draco
+ draco::DecoderBuffer decoderBuffer;
+ decoderBuffer.Init(bufferViewData, bufferViewSize);
+ draco::Decoder decoder;
+ auto decodeResult = decoder.DecodeMeshFromBuffer(&decoderBuffer);
+ if (!decodeResult.ok()) {
+ return false;
+ }
+ const std::unique_ptr<draco::Mesh> &mesh = decodeResult.value();
+
+ // create new bufferView for indices
+ if (primitive->indices >= 0) {
+ int32_t componentSize = GetComponentSizeInBytes(
+ model->accessors[primitive->indices].componentType);
+ Buffer decodedIndexBuffer;
+ decodedIndexBuffer.data.resize(mesh->num_faces() * 3 * componentSize);
+
+ DecodeIndexBuffer(mesh.get(), componentSize, decodedIndexBuffer.data);
+
+ model->buffers.emplace_back(std::move(decodedIndexBuffer));
+
+ BufferView decodedIndexBufferView;
+ decodedIndexBufferView.buffer = int(model->buffers.size() - 1);
+ decodedIndexBufferView.byteLength =
+ int(mesh->num_faces() * 3 * componentSize);
+ decodedIndexBufferView.byteOffset = 0;
+ decodedIndexBufferView.byteStride = 0;
+ decodedIndexBufferView.target = TINYGLTF_TARGET_ARRAY_BUFFER;
+ model->bufferViews.emplace_back(std::move(decodedIndexBufferView));
+
+ model->accessors[primitive->indices].bufferView =
+ int(model->bufferViews.size() - 1);
+ model->accessors[primitive->indices].count = int(mesh->num_faces() * 3);
+ }
+
+ for (const auto &attribute : attributesObject) {
+ if (!attribute.second.IsInt()) return false;
+ auto primitiveAttribute = primitive->attributes.find(attribute.first);
+ if (primitiveAttribute == primitive->attributes.end()) return false;
+
+ int dracoAttributeIndex = attribute.second.Get<int>();
+ const auto pAttribute = mesh->GetAttributeByUniqueId(dracoAttributeIndex);
+ const auto componentType =
+ model->accessors[primitiveAttribute->second].componentType;
+
+ // Create a new buffer for this decoded buffer
+ Buffer decodedBuffer;
+ size_t bufferSize = mesh->num_points() * pAttribute->num_components() *
+ GetComponentSizeInBytes(componentType);
+ decodedBuffer.data.resize(bufferSize);
+
+ if (!GetAttributeForAllPoints(componentType, mesh.get(), pAttribute,
+ decodedBuffer.data))
+ return false;
+
+ model->buffers.emplace_back(std::move(decodedBuffer));
+
+ BufferView decodedBufferView;
+ decodedBufferView.buffer = int(model->buffers.size() - 1);
+ decodedBufferView.byteLength = bufferSize;
+ decodedBufferView.byteOffset = pAttribute->byte_offset();
+ decodedBufferView.byteStride = pAttribute->byte_stride();
+ decodedBufferView.target = primitive->indices >= 0
+ ? TINYGLTF_TARGET_ELEMENT_ARRAY_BUFFER
+ : TINYGLTF_TARGET_ARRAY_BUFFER;
+ model->bufferViews.emplace_back(std::move(decodedBufferView));
+
+ model->accessors[primitiveAttribute->second].bufferView =
+ int(model->bufferViews.size() - 1);
+ model->accessors[primitiveAttribute->second].count =
+ int(mesh->num_points());
+ }
+
+ return true;
+}
+#endif
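+
+// Note: the Draco decode path above compiles only when TINYGLTF_ENABLE_DRACO
+// is defined. A hedged build sketch (flags and library name depend on how
+// draco is installed on your system):
+//
+//   c++ -DTINYGLTF_ENABLE_DRACO -I<draco-include-dir> app.cc -ldraco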
+
+static bool ParsePrimitive(Primitive *primitive, Model *model, std::string *err,
+ const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ int material = -1;
+ ParseIntegerProperty(&material, err, o, "material", false);
+ primitive->material = material;
+
+ int mode = TINYGLTF_MODE_TRIANGLES;
+ ParseIntegerProperty(&mode, err, o, "mode", false);
+ primitive->mode = mode; // Why were only triangles supported before?
+
+ int indices = -1;
+ ParseIntegerProperty(&indices, err, o, "indices", false);
+ primitive->indices = indices;
+ if (!ParseStringIntegerProperty(&primitive->attributes, err, o, "attributes",
+ true, "Primitive")) {
+ return false;
+ }
+
+ // Look for morph targets
+ json_const_iterator targetsObject;
+ if (FindMember(o, "targets", targetsObject) &&
+ IsArray(GetValue(targetsObject))) {
+ auto targetsObjectEnd = ArrayEnd(GetValue(targetsObject));
+ for (json_const_array_iterator i = ArrayBegin(GetValue(targetsObject));
+ i != targetsObjectEnd; ++i) {
+ std::map<std::string, int> targetAttributes;
+
+ const json &dict = *i;
+ if (IsObject(dict)) {
+ json_const_iterator dictIt(ObjectBegin(dict));
+ json_const_iterator dictItEnd(ObjectEnd(dict));
+
+ for (; dictIt != dictItEnd; ++dictIt) {
+ int iVal;
+ if (GetInt(GetValue(dictIt), iVal))
+ targetAttributes[GetKey(dictIt)] = iVal;
+ }
+ primitive->targets.emplace_back(std::move(targetAttributes));
+ }
+ }
+ }
+
+ ParseExtrasProperty(&(primitive->extras), o);
+ ParseExtensionsProperty(&primitive->extensions, err, o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ primitive->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ primitive->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+#ifdef TINYGLTF_ENABLE_DRACO
+ auto dracoExtension =
+ primitive->extensions.find("KHR_draco_mesh_compression");
+ if (dracoExtension != primitive->extensions.end()) {
+ ParseDracoExtension(primitive, model, err, dracoExtension->second);
+ }
+#else
+ (void)model;
+#endif
+
+ return true;
+}
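+
+// Illustrative sketch (indices hypothetical): a primitive object as consumed
+// by ParsePrimitive(). "mode" 4 is TRIANGLES, the default used above:
+//
+//   {
+//     "attributes": { "POSITION": 0, "NORMAL": 1 },
+//     "indices": 2,
+//     "material": 0,
+//     "mode": 4
+//   }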
+
+static bool ParseMesh(Mesh *mesh, Model *model, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ ParseStringProperty(&mesh->name, err, o, "name", false);
+
+ mesh->primitives.clear();
+ json_const_iterator primObject;
+ if (FindMember(o, "primitives", primObject) &&
+ IsArray(GetValue(primObject))) {
+ json_const_array_iterator primEnd = ArrayEnd(GetValue(primObject));
+ for (json_const_array_iterator i = ArrayBegin(GetValue(primObject));
+ i != primEnd; ++i) {
+ Primitive primitive;
+ if (ParsePrimitive(&primitive, model, err, *i,
+ store_original_json_for_extras_and_extensions)) {
+ // Only add the primitive if the parsing succeeds.
+ mesh->primitives.emplace_back(std::move(primitive));
+ }
+ }
+ }
+
+ // Should probably check if has targets and if dimensions fit
+ ParseNumberArrayProperty(&mesh->weights, err, o, "weights", false);
+
+ ParseExtensionsProperty(&mesh->extensions, err, o);
+ ParseExtrasProperty(&(mesh->extras), o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ mesh->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ mesh->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
+
+static bool ParseNode(Node *node, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ ParseStringProperty(&node->name, err, o, "name", false);
+
+ int skin = -1;
+ ParseIntegerProperty(&skin, err, o, "skin", false);
+ node->skin = skin;
+
+ // Matrix and T/R/S are exclusive
+ if (!ParseNumberArrayProperty(&node->matrix, err, o, "matrix", false)) {
+ ParseNumberArrayProperty(&node->rotation, err, o, "rotation", false);
+ ParseNumberArrayProperty(&node->scale, err, o, "scale", false);
+ ParseNumberArrayProperty(&node->translation, err, o, "translation", false);
+ }
+
+ int camera = -1;
+ ParseIntegerProperty(&camera, err, o, "camera", false);
+ node->camera = camera;
+
+ int mesh = -1;
+ ParseIntegerProperty(&mesh, err, o, "mesh", false);
+ node->mesh = mesh;
+
+ node->children.clear();
+ ParseIntegerArrayProperty(&node->children, err, o, "children", false);
+
+ ParseNumberArrayProperty(&node->weights, err, o, "weights", false);
+
+ ParseExtensionsProperty(&node->extensions, err, o);
+ ParseExtrasProperty(&(node->extras), o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ node->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ node->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
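+
+// Illustrative sketch: since "matrix" and translation/rotation/scale are
+// exclusive (see ParseNode() above), a node carries one form or the other.
+// Values are hypothetical:
+//
+//   { "name": "a", "matrix": [1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1] }
+//   { "name": "b", "translation": [0,0,1], "rotation": [0,0,0,1] }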
+
+static bool ParsePbrMetallicRoughness(
+ PbrMetallicRoughness *pbr, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ if (pbr == nullptr) {
+ return false;
+ }
+
+ std::vector<double> baseColorFactor;
+ if (ParseNumberArrayProperty(&baseColorFactor, err, o, "baseColorFactor",
+ /* required */ false)) {
+ if (baseColorFactor.size() != 4) {
+ if (err) {
+ (*err) +=
+ "Array length of `baseColorFactor` parameter in "
+ "pbrMetallicRoughness must be 4, but got " +
+ std::to_string(baseColorFactor.size()) + "\n";
+ }
+ return false;
+ }
+ pbr->baseColorFactor = baseColorFactor;
+ }
+
+ {
+ json_const_iterator it;
+ if (FindMember(o, "baseColorTexture", it)) {
+ ParseTextureInfo(&pbr->baseColorTexture, err, GetValue(it),
+ store_original_json_for_extras_and_extensions);
+ }
+ }
+
+ {
+ json_const_iterator it;
+ if (FindMember(o, "metallicRoughnessTexture", it)) {
+ ParseTextureInfo(&pbr->metallicRoughnessTexture, err, GetValue(it),
+ store_original_json_for_extras_and_extensions);
+ }
+ }
+
+ ParseNumberProperty(&pbr->metallicFactor, err, o, "metallicFactor", false);
+ ParseNumberProperty(&pbr->roughnessFactor, err, o, "roughnessFactor", false);
+
+ ParseExtensionsProperty(&pbr->extensions, err, o);
+ ParseExtrasProperty(&pbr->extras, o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ pbr->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ pbr->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
+
+static bool ParseMaterial(Material *material, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ ParseStringProperty(&material->name, err, o, "name", /* required */ false);
+
+ if (ParseNumberArrayProperty(&material->emissiveFactor, err, o,
+ "emissiveFactor",
+ /* required */ false)) {
+ if (material->emissiveFactor.size() != 3) {
+ if (err) {
+ (*err) +=
+ "Array length of `emissiveFactor` parameter in "
+ "material must be 3, but got " +
+ std::to_string(material->emissiveFactor.size()) + "\n";
+ }
+ return false;
+ }
+ } else {
+ // fill with default values
+ material->emissiveFactor = {0.0, 0.0, 0.0};
+ }
+
+ ParseStringProperty(&material->alphaMode, err, o, "alphaMode",
+ /* required */ false);
+ ParseNumberProperty(&material->alphaCutoff, err, o, "alphaCutoff",
+ /* required */ false);
+ ParseBooleanProperty(&material->doubleSided, err, o, "doubleSided",
+ /* required */ false);
+
+ {
+ json_const_iterator it;
+ if (FindMember(o, "pbrMetallicRoughness", it)) {
+ ParsePbrMetallicRoughness(&material->pbrMetallicRoughness, err,
+ GetValue(it),
+ store_original_json_for_extras_and_extensions);
+ }
+ }
+
+ {
+ json_const_iterator it;
+ if (FindMember(o, "normalTexture", it)) {
+ ParseNormalTextureInfo(&material->normalTexture, err, GetValue(it),
+ store_original_json_for_extras_and_extensions);
+ }
+ }
+
+ {
+ json_const_iterator it;
+ if (FindMember(o, "occlusionTexture", it)) {
+ ParseOcclusionTextureInfo(&material->occlusionTexture, err, GetValue(it),
+ store_original_json_for_extras_and_extensions);
+ }
+ }
+
+ {
+ json_const_iterator it;
+ if (FindMember(o, "emissiveTexture", it)) {
+ ParseTextureInfo(&material->emissiveTexture, err, GetValue(it),
+ store_original_json_for_extras_and_extensions);
+ }
+ }
+
+ // Old code path. For backward compatibility, we still store material values
+ // as Parameter. This duplicates some information (for example,
+ // pbrMetallicRoughness), but the overhead should be negligible in terms of
+ // memory consumption.
+ // TODO(syoyo): Remove in the next major release.
+ material->values.clear();
+ material->additionalValues.clear();
+
+ json_const_iterator it(ObjectBegin(o));
+ json_const_iterator itEnd(ObjectEnd(o));
+
+ for (; it != itEnd; ++it) {
+ std::string key(GetKey(it));
+ if (key == "pbrMetallicRoughness") {
+ if (IsObject(GetValue(it))) {
+ const json &values_object = GetValue(it);
+
+ json_const_iterator itVal(ObjectBegin(values_object));
+ json_const_iterator itValEnd(ObjectEnd(values_object));
+
+ for (; itVal != itValEnd; ++itVal) {
+ Parameter param;
+ if (ParseParameterProperty(&param, err, values_object, GetKey(itVal),
+ false)) {
+ material->values.emplace(GetKey(itVal), std::move(param));
+ }
+ }
+ }
+ } else if (key == "extensions" || key == "extras") {
+ // handled later; skip them here, otherwise poorly parsed contents would be
+ // saved in the parameter map and serialized again later
+ } else {
+ Parameter param;
+ if (ParseParameterProperty(&param, err, o, key, false)) {
+ // material names have already been parsed. Putting the name in this map
+ // wouldn't correctly reflect the glTF specification
+ if (key != "name")
+ material->additionalValues.emplace(std::move(key), std::move(param));
+ }
+ }
+ }
+
+ material->extensions.clear();
+ ParseExtensionsProperty(&material->extensions, err, o);
+ ParseExtrasProperty(&(material->extras), o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator eit;
+ if (FindMember(o, "extensions", eit)) {
+ material->extensions_json_string = JsonToString(GetValue(eit));
+ }
+ }
+ {
+ json_const_iterator eit;
+ if (FindMember(o, "extras", eit)) {
+ material->extras_json_string = JsonToString(GetValue(eit));
+ }
+ }
+ }
+
+ return true;
+}
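+
+// Illustrative sketch (texture index hypothetical): a material object as
+// consumed by ParseMaterial():
+//
+//   {
+//     "name": "mat",
+//     "pbrMetallicRoughness": {
+//       "baseColorFactor": [1.0, 1.0, 1.0, 1.0],
+//       "baseColorTexture": { "index": 0 },
+//       "metallicFactor": 0.0,
+//       "roughnessFactor": 1.0
+//     },
+//     "emissiveFactor": [0.0, 0.0, 0.0],
+//     "alphaMode": "OPAQUE",
+//     "doubleSided": false
+//   }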
+
+static bool ParseAnimationChannel(
+ AnimationChannel *channel, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ int samplerIndex = -1;
+ int targetIndex = -1;
+ if (!ParseIntegerProperty(&samplerIndex, err, o, "sampler", true,
+ "AnimationChannel")) {
+ if (err) {
+ (*err) += "`sampler` field is missing in animation channels\n";
+ }
+ return false;
+ }
+
+ json_const_iterator targetIt;
+ if (FindMember(o, "target", targetIt) && IsObject(GetValue(targetIt))) {
+ const json &target_object = GetValue(targetIt);
+
+ if (!ParseIntegerProperty(&targetIndex, err, target_object, "node", true)) {
+ if (err) {
+ (*err) += "`node` field is missing in animation.channels.target\n";
+ }
+ return false;
+ }
+
+ if (!ParseStringProperty(&channel->target_path, err, target_object, "path",
+ true)) {
+ if (err) {
+ (*err) += "`path` field is missing in animation.channels.target\n";
+ }
+ return false;
+ }
+ ParseExtensionsProperty(&channel->target_extensions, err, target_object);
+ if (store_original_json_for_extras_and_extensions) {
+ json_const_iterator it;
+ if (FindMember(target_object, "extensions", it)) {
+ channel->target_extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ channel->sampler = samplerIndex;
+ channel->target_node = targetIndex;
+
+ ParseExtensionsProperty(&channel->extensions, err, o);
+ ParseExtrasProperty(&(channel->extras), o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ channel->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ channel->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
+
+static bool ParseAnimation(Animation *animation, std::string *err,
+ const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator channelsIt;
+ if (FindMember(o, "channels", channelsIt) &&
+ IsArray(GetValue(channelsIt))) {
+ json_const_array_iterator channelEnd = ArrayEnd(GetValue(channelsIt));
+ for (json_const_array_iterator i = ArrayBegin(GetValue(channelsIt));
+ i != channelEnd; ++i) {
+ AnimationChannel channel;
+ if (ParseAnimationChannel(
+ &channel, err, *i,
+ store_original_json_for_extras_and_extensions)) {
+ // Only add the channel if the parsing succeeds.
+ animation->channels.emplace_back(std::move(channel));
+ }
+ }
+ }
+ }
+
+ {
+ json_const_iterator samplerIt;
+ if (FindMember(o, "samplers", samplerIt) && IsArray(GetValue(samplerIt))) {
+ const json &sampler_array = GetValue(samplerIt);
+
+ json_const_array_iterator it = ArrayBegin(sampler_array);
+ json_const_array_iterator itEnd = ArrayEnd(sampler_array);
+
+ for (; it != itEnd; ++it) {
+ const json &s = *it;
+
+ AnimationSampler sampler;
+ int inputIndex = -1;
+ int outputIndex = -1;
+ if (!ParseIntegerProperty(&inputIndex, err, s, "input", true)) {
+ if (err) {
+ (*err) += "`input` field is missing in animation.sampler\n";
+ }
+ return false;
+ }
+ ParseStringProperty(&sampler.interpolation, err, s, "interpolation",
+ false);
+ if (!ParseIntegerProperty(&outputIndex, err, s, "output", true)) {
+ if (err) {
+ (*err) += "`output` field is missing in animation.sampler\n";
+ }
+ return false;
+ }
+ sampler.input = inputIndex;
+ sampler.output = outputIndex;
+ ParseExtensionsProperty(&(sampler.extensions), err, s);
+ ParseExtrasProperty(&(sampler.extras), s);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator eit;
+ if (FindMember(o, "extensions", eit)) {
+ sampler.extensions_json_string = JsonToString(GetValue(eit));
+ }
+ }
+ {
+ json_const_iterator eit;
+ if (FindMember(o, "extras", eit)) {
+ sampler.extras_json_string = JsonToString(GetValue(eit));
+ }
+ }
+ }
+
+ animation->samplers.emplace_back(std::move(sampler));
+ }
+ }
+ }
+
+ ParseStringProperty(&animation->name, err, o, "name", false);
+
+ ParseExtensionsProperty(&animation->extensions, err, o);
+ ParseExtrasProperty(&(animation->extras), o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ animation->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ animation->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
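+
+// Illustrative sketch (indices hypothetical): the layout ParseAnimation()
+// expects. "path" must be "translation", "rotation", "scale" or "weights":
+//
+//   {
+//     "channels": [
+//       { "sampler": 0, "target": { "node": 2, "path": "rotation" } }
+//     ],
+//     "samplers": [
+//       { "input": 3, "interpolation": "LINEAR", "output": 4 }
+//     ]
+//   }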
+
+static bool ParseSampler(Sampler *sampler, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ ParseStringProperty(&sampler->name, err, o, "name", false);
+
+ int minFilter = -1;
+ int magFilter = -1;
+ int wrapS = TINYGLTF_TEXTURE_WRAP_REPEAT;
+ int wrapT = TINYGLTF_TEXTURE_WRAP_REPEAT;
+ //int wrapR = TINYGLTF_TEXTURE_WRAP_REPEAT;
+ ParseIntegerProperty(&minFilter, err, o, "minFilter", false);
+ ParseIntegerProperty(&magFilter, err, o, "magFilter", false);
+ ParseIntegerProperty(&wrapS, err, o, "wrapS", false);
+ ParseIntegerProperty(&wrapT, err, o, "wrapT", false);
+ //ParseIntegerProperty(&wrapR, err, o, "wrapR", false); // tinygltf extension
+
+ // TODO(syoyo): Check that the value is an allowed one.
+ // (e.g. we allow 9728 (NEAREST), but don't allow 9727)
+
+ sampler->minFilter = minFilter;
+ sampler->magFilter = magFilter;
+ sampler->wrapS = wrapS;
+ sampler->wrapT = wrapT;
+ //sampler->wrapR = wrapR;
+
+ ParseExtensionsProperty(&(sampler->extensions), err, o);
+ ParseExtrasProperty(&(sampler->extras), o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ sampler->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ sampler->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
+
+static bool ParseSkin(Skin *skin, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ ParseStringProperty(&skin->name, err, o, "name", false, "Skin");
+
+ std::vector<int> joints;
+ if (!ParseIntegerArrayProperty(&joints, err, o, "joints", false, "Skin")) {
+ return false;
+ }
+ skin->joints = std::move(joints);
+
+ int skeleton = -1;
+ ParseIntegerProperty(&skeleton, err, o, "skeleton", false, "Skin");
+ skin->skeleton = skeleton;
+
+ int invBind = -1;
+ ParseIntegerProperty(&invBind, err, o, "inverseBindMatrices", true, "Skin");
+ skin->inverseBindMatrices = invBind;
+
+ ParseExtensionsProperty(&(skin->extensions), err, o);
+ ParseExtrasProperty(&(skin->extras), o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ skin->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ skin->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
+
+static bool ParsePerspectiveCamera(
+ PerspectiveCamera *camera, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ double yfov = 0.0;
+ if (!ParseNumberProperty(&yfov, err, o, "yfov", true, "PerspectiveCamera")) {
+ return false;
+ }
+
+ double znear = 0.0;
+ if (!ParseNumberProperty(&znear, err, o, "znear", true,
+ "PerspectiveCamera")) {
+ return false;
+ }
+
+ double aspectRatio = 0.0; // = invalid
+ ParseNumberProperty(&aspectRatio, err, o, "aspectRatio", false,
+ "PerspectiveCamera");
+
+ double zfar = 0.0; // = invalid
+ ParseNumberProperty(&zfar, err, o, "zfar", false, "PerspectiveCamera");
+
+ camera->aspectRatio = aspectRatio;
+ camera->zfar = zfar;
+ camera->yfov = yfov;
+ camera->znear = znear;
+
+ ParseExtensionsProperty(&camera->extensions, err, o);
+ ParseExtrasProperty(&(camera->extras), o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ camera->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ camera->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ // TODO(syoyo): Validate parameter values.
+
+ return true;
+}
+
+static bool ParseSpotLight(SpotLight *light, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ ParseNumberProperty(&light->innerConeAngle, err, o, "innerConeAngle", false);
+ ParseNumberProperty(&light->outerConeAngle, err, o, "outerConeAngle", false);
+
+ ParseExtensionsProperty(&light->extensions, err, o);
+ ParseExtrasProperty(&light->extras, o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ light->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ light->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ // TODO(syoyo): Validate parameter values.
+
+ return true;
+}
+
+static bool ParseOrthographicCamera(
+ OrthographicCamera *camera, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ double xmag = 0.0;
+ if (!ParseNumberProperty(&xmag, err, o, "xmag", true, "OrthographicCamera")) {
+ return false;
+ }
+
+ double ymag = 0.0;
+ if (!ParseNumberProperty(&ymag, err, o, "ymag", true, "OrthographicCamera")) {
+ return false;
+ }
+
+ double zfar = 0.0;
+ if (!ParseNumberProperty(&zfar, err, o, "zfar", true, "OrthographicCamera")) {
+ return false;
+ }
+
+ double znear = 0.0;
+ if (!ParseNumberProperty(&znear, err, o, "znear", true,
+ "OrthographicCamera")) {
+ return false;
+ }
+
+ ParseExtensionsProperty(&camera->extensions, err, o);
+ ParseExtrasProperty(&(camera->extras), o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ camera->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ camera->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ camera->xmag = xmag;
+ camera->ymag = ymag;
+ camera->zfar = zfar;
+ camera->znear = znear;
+
+ // TODO(syoyo): Validate parameter values.
+
+ return true;
+}
+
+static bool ParseCamera(Camera *camera, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ if (!ParseStringProperty(&camera->type, err, o, "type", true, "Camera")) {
+ return false;
+ }
+
+ if (camera->type.compare("orthographic") == 0) {
+ json_const_iterator orthoIt;
+ if (!FindMember(o, "orthographic", orthoIt)) {
+ if (err) {
+ std::stringstream ss;
+ ss << "Orhographic camera description not found." << std::endl;
+ (*err) += ss.str();
+ }
+ return false;
+ }
+
+ const json &v = GetValue(orthoIt);
+ if (!IsObject(v)) {
+ if (err) {
+ std::stringstream ss;
+ ss << "\"orthographic\" is not a JSON object." << std::endl;
+ (*err) += ss.str();
+ }
+ return false;
+ }
+
+ if (!ParseOrthographicCamera(
+ &camera->orthographic, err, v,
+ store_original_json_for_extras_and_extensions)) {
+ return false;
+ }
+ } else if (camera->type.compare("perspective") == 0) {
+ json_const_iterator perspIt;
+ if (!FindMember(o, "perspective", perspIt)) {
+ if (err) {
+ std::stringstream ss;
+ ss << "Perspective camera description not found." << std::endl;
+ (*err) += ss.str();
+ }
+ return false;
+ }
+
+ const json &v = GetValue(perspIt);
+ if (!IsObject(v)) {
+ if (err) {
+ std::stringstream ss;
+ ss << "\"perspective\" is not a JSON object." << std::endl;
+ (*err) += ss.str();
+ }
+ return false;
+ }
+
+ if (!ParsePerspectiveCamera(
+ &camera->perspective, err, v,
+ store_original_json_for_extras_and_extensions)) {
+ return false;
+ }
+ } else {
+ if (err) {
+ std::stringstream ss;
+ ss << "Invalid camera type: \"" << camera->type
+ << "\". Must be \"perspective\" or \"orthographic\"" << std::endl;
+ (*err) += ss.str();
+ }
+ return false;
+ }
+
+ ParseStringProperty(&camera->name, err, o, "name", false);
+
+ ParseExtensionsProperty(&camera->extensions, err, o);
+ ParseExtrasProperty(&(camera->extras), o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ camera->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ camera->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
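+
+// Illustrative sketch (values hypothetical): ParseCamera() dispatches on
+// "type"; angles are in radians per the glTF 2.0 spec:
+//
+//   { "type": "perspective",
+//     "perspective": { "yfov": 0.7, "znear": 0.01 } }
+//
+//   { "type": "orthographic",
+//     "orthographic": { "xmag": 1.0, "ymag": 1.0, "zfar": 100.0,
+//                       "znear": 0.01 } }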
+
+static bool ParseLight(Light *light, std::string *err, const json &o,
+ bool store_original_json_for_extras_and_extensions) {
+ if (!ParseStringProperty(&light->type, err, o, "type", true)) {
+ return false;
+ }
+
+ if (light->type == "spot") {
+ json_const_iterator spotIt;
+ if (!FindMember(o, "spot", spotIt)) {
+ if (err) {
+ std::stringstream ss;
+ ss << "Spot light description not found." << std::endl;
+ (*err) += ss.str();
+ }
+ return false;
+ }
+
+ const json &v = GetValue(spotIt);
+ if (!IsObject(v)) {
+ if (err) {
+ std::stringstream ss;
+ ss << "\"spot\" is not a JSON object." << std::endl;
+ (*err) += ss.str();
+ }
+ return false;
+ }
+
+ if (!ParseSpotLight(&light->spot, err, v,
+ store_original_json_for_extras_and_extensions)) {
+ return false;
+ }
+ }
+
+ ParseStringProperty(&light->name, err, o, "name", false);
+ ParseNumberArrayProperty(&light->color, err, o, "color", false);
+ ParseNumberProperty(&light->range, err, o, "range", false);
+ ParseNumberProperty(&light->intensity, err, o, "intensity", false);
+ ParseExtensionsProperty(&light->extensions, err, o);
+ ParseExtrasProperty(&(light->extras), o);
+
+ if (store_original_json_for_extras_and_extensions) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ light->extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ light->extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ return true;
+}
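+
+// Illustrative sketch (values hypothetical): a KHR_lights_punctual light as
+// consumed by ParseLight(); cone angles are in radians:
+//
+//   { "type": "spot", "color": [1.0, 1.0, 1.0], "intensity": 10.0,
+//     "spot": { "innerConeAngle": 0.2, "outerConeAngle": 0.4 } }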
+
+bool TinyGLTF::LoadFromString(Model *model, std::string *err, std::string *warn,
+ const char *json_str,
+ unsigned int json_str_length,
+ const std::string &base_dir,
+ unsigned int check_sections) {
+ if (json_str_length < 4) {
+ if (err) {
+ (*err) = "JSON string too short.\n";
+ }
+ return false;
+ }
+
+ JsonDocument v;
+
+#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || \
+ defined(_CPPUNWIND)) && \
+ !defined(TINYGLTF_NOEXCEPTION)
+ try {
+ JsonParse(v, json_str, json_str_length, true);
+
+ } catch (const std::exception &e) {
+ if (err) {
+ (*err) = e.what();
+ }
+ return false;
+ }
+#else
+ {
+ JsonParse(v, json_str, json_str_length);
+
+ if (!IsObject(v)) {
+ // Assume parsing failed.
+ if (err) {
+ (*err) = "Failed to parse JSON object\n";
+ }
+ return false;
+ }
+ }
+#endif
+
+ if (!IsObject(v)) {
+ // root is not an object.
+ if (err) {
+ (*err) = "Root element is not a JSON object\n";
+ }
+ return false;
+ }
+
+ {
+ bool version_found = false;
+ json_const_iterator it;
+ if (FindMember(v, "asset", it) && IsObject(GetValue(it))) {
+ auto &itObj = GetValue(it);
+ json_const_iterator version_it;
+ std::string versionStr;
+ if (FindMember(itObj, "version", version_it) &&
+ GetString(GetValue(version_it), versionStr)) {
+ version_found = true;
+ }
+ }
+ if (version_found) {
+ // OK
+ } else if (check_sections & REQUIRE_VERSION) {
+ if (err) {
+ (*err) += "\"asset\" object not found in .gltf or not an object type\n";
+ }
+ return false;
+ }
+ }
+
+ // scene is not mandatory.
+ // FIXME: there may be a better way to handle this than removing the code.
+
+ auto IsArrayMemberPresent = [](const json &_v, const char *name) -> bool {
+ json_const_iterator it;
+ return FindMember(_v, name, it) && IsArray(GetValue(it));
+ };
+
+ {
+ if ((check_sections & REQUIRE_SCENES) &&
+ !IsArrayMemberPresent(v, "scenes")) {
+ if (err) {
+ (*err) += "\"scenes\" object not found in .gltf or not an array type\n";
+ }
+ return false;
+ }
+ }
+
+ {
+ if ((check_sections & REQUIRE_NODES) && !IsArrayMemberPresent(v, "nodes")) {
+ if (err) {
+ (*err) += "\"nodes\" object not found in .gltf\n";
+ }
+ return false;
+ }
+ }
+
+ {
+ if ((check_sections & REQUIRE_ACCESSORS) &&
+ !IsArrayMemberPresent(v, "accessors")) {
+ if (err) {
+ (*err) += "\"accessors\" object not found in .gltf\n";
+ }
+ return false;
+ }
+ }
+
+ {
+ if ((check_sections & REQUIRE_BUFFERS) &&
+ !IsArrayMemberPresent(v, "buffers")) {
+ if (err) {
+ (*err) += "\"buffers\" object not found in .gltf\n";
+ }
+ return false;
+ }
+ }
+
+ {
+ if ((check_sections & REQUIRE_BUFFER_VIEWS) &&
+ !IsArrayMemberPresent(v, "bufferViews")) {
+ if (err) {
+ (*err) += "\"bufferViews\" object not found in .gltf\n";
+ }
+ return false;
+ }
+ }
+
+ model->buffers.clear();
+ model->bufferViews.clear();
+ model->accessors.clear();
+ model->meshes.clear();
+ model->cameras.clear();
+ model->nodes.clear();
+ model->extensionsUsed.clear();
+ model->extensionsRequired.clear();
+ model->extensions.clear();
+ model->defaultScene = -1;
+
+ // 1. Parse Asset
+ {
+ json_const_iterator it;
+ if (FindMember(v, "asset", it) && IsObject(GetValue(it))) {
+ const json &root = GetValue(it);
+
+ ParseAsset(&model->asset, err, root,
+ store_original_json_for_extras_and_extensions_);
+ }
+ }
+
+#ifdef TINYGLTF_USE_CPP14
+ auto ForEachInArray = [](const json &_v, const char *member,
+ const auto &cb) -> bool
+#else
+ // The std::function<> implementation can be less efficient because it will
+ // allocate heap when the size of the captured lambda is above 16 bytes with
+ // clang and gcc, but it does not require C++14.
+ auto ForEachInArray = [](const json &_v, const char *member,
+ const std::function<bool(const json &)> &cb) -> bool
+#endif
+ {
+ json_const_iterator itm;
+ if (FindMember(_v, member, itm) && IsArray(GetValue(itm))) {
+ const json &root = GetValue(itm);
+ auto it = ArrayBegin(root);
+ auto end = ArrayEnd(root);
+ for (; it != end; ++it) {
+ if (!cb(*it)) return false;
+ }
+ }
+ return true;
+ };
+
+ // 2. Parse extensionsUsed
+ {
+ ForEachInArray(v, "extensionsUsed", [&](const json &o) {
+ std::string str;
+ GetString(o, str);
+ model->extensionsUsed.emplace_back(std::move(str));
+ return true;
+ });
+ }
+
+ {
+ ForEachInArray(v, "extensionsRequired", [&](const json &o) {
+ std::string str;
+ GetString(o, str);
+ model->extensionsRequired.emplace_back(std::move(str));
+ return true;
+ });
+ }
+
+ // 3. Parse Buffer
+ {
+ bool success = ForEachInArray(v, "buffers", [&](const json &o) {
+ if (!IsObject(o)) {
+ if (err) {
+ (*err) += "`buffers' does not contain an JSON object.";
+ }
+ return false;
+ }
+ Buffer buffer;
+ if (!ParseBuffer(&buffer, err, o,
+ store_original_json_for_extras_and_extensions_, &fs,
+ base_dir, is_binary_, bin_data_, bin_size_)) {
+ return false;
+ }
+
+ model->buffers.emplace_back(std::move(buffer));
+ return true;
+ });
+
+ if (!success) {
+ return false;
+ }
+ }
+ // 4. Parse BufferView
+ {
+ bool success = ForEachInArray(v, "bufferViews", [&](const json &o) {
+ if (!IsObject(o)) {
+ if (err) {
+ (*err) += "`bufferViews' does not contain an JSON object.";
+ }
+ return false;
+ }
+ BufferView bufferView;
+ if (!ParseBufferView(&bufferView, err, o,
+ store_original_json_for_extras_and_extensions_)) {
+ return false;
+ }
+
+ model->bufferViews.emplace_back(std::move(bufferView));
+ return true;
+ });
+
+ if (!success) {
+ return false;
+ }
+ }
+
+ // 5. Parse Accessor
+ {
+ bool success = ForEachInArray(v, "accessors", [&](const json &o) {
+ if (!IsObject(o)) {
+ if (err) {
+ (*err) += "`accessors' does not contain an JSON object.";
+ }
+ return false;
+ }
+ Accessor accessor;
+ if (!ParseAccessor(&accessor, err, o,
+ store_original_json_for_extras_and_extensions_)) {
+ return false;
+ }
+
+ model->accessors.emplace_back(std::move(accessor));
+ return true;
+ });
+
+ if (!success) {
+ return false;
+ }
+ }
+
+ // 6. Parse Mesh
+ {
+ bool success = ForEachInArray(v, "meshes", [&](const json &o) {
+ if (!IsObject(o)) {
+ if (err) {
+ (*err) += "`meshes' does not contain an JSON object.";
+ }
+ return false;
+ }
+ Mesh mesh;
+ if (!ParseMesh(&mesh, model, err, o,
+ store_original_json_for_extras_and_extensions_)) {
+ return false;
+ }
+
+ model->meshes.emplace_back(std::move(mesh));
+ return true;
+ });
+
+ if (!success) {
+ return false;
+ }
+ }
+
+ // Assign missing bufferView target types
+ // - Look for missing Mesh indices
+ // - Look for missing Mesh attributes
+ for (auto &mesh : model->meshes) {
+ for (auto &primitive : mesh.primitives) {
+ if (primitive.indices >
+ -1) // has indices from parsing step, must be Element Array Buffer
+ {
+ if (size_t(primitive.indices) >= model->accessors.size()) {
+ if (err) {
+ (*err) += "primitive indices accessor out of bounds";
+ }
+ return false;
+ }
+
+ auto bufferView =
+ model->accessors[size_t(primitive.indices)].bufferView;
+ if (bufferView < 0 || size_t(bufferView) >= model->bufferViews.size()) {
+ if (err) {
+ (*err) += "accessor[" + std::to_string(primitive.indices) +
+ "] invalid bufferView";
+ }
+ return false;
+ }
+
+ model->bufferViews[size_t(bufferView)].target =
+ TINYGLTF_TARGET_ELEMENT_ARRAY_BUFFER;
+ // we could optionally check if the accessor's bufferView type is Scalar, as
+ // it should be
+ }
+
+ for (auto &attribute : primitive.attributes) {
+ model
+ ->bufferViews[size_t(
+ model->accessors[size_t(attribute.second)].bufferView)]
+ .target = TINYGLTF_TARGET_ARRAY_BUFFER;
+ }
+
+ for (auto &target : primitive.targets) {
+ for (auto &attribute : target) {
+ auto bufferView =
+ model->accessors[size_t(attribute.second)].bufferView;
+ // bufferView could be null (-1) for a sparse morph target
+ if (bufferView >= 0) {
+ model->bufferViews[size_t(bufferView)].target =
+ TINYGLTF_TARGET_ARRAY_BUFFER;
+ }
+ }
+ }
+ }
+ }
+
+ // 7. Parse Node
+ {
+ bool success = ForEachInArray(v, "nodes", [&](const json &o) {
+ if (!IsObject(o)) {
+ if (err) {
+ (*err) += "`nodes' does not contain an JSON object.";
+ }
+ return false;
+ }
+ Node node;
+ if (!ParseNode(&node, err, o,
+ store_original_json_for_extras_and_extensions_)) {
+ return false;
+ }
+
+ model->nodes.emplace_back(std::move(node));
+ return true;
+ });
+
+ if (!success) {
+ return false;
+ }
+ }
+
+ // 8. Parse scenes.
+ {
+ bool success = ForEachInArray(v, "scenes", [&](const json &o) {
+ if (!IsObject(o)) {
+ if (err) {
+ (*err) += "`scenes' does not contain an JSON object.";
+ }
+ return false;
+ }
+ std::vector<int> nodes;
+ ParseIntegerArrayProperty(&nodes, err, o, "nodes", false);
+
+ Scene scene;
+ scene.nodes = std::move(nodes);
+
+ ParseStringProperty(&scene.name, err, o, "name", false);
+
+ ParseExtensionsProperty(&scene.extensions, err, o);
+ ParseExtrasProperty(&scene.extras, o);
+
+ if (store_original_json_for_extras_and_extensions_) {
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ scene.extensions_json_string = JsonToString(GetValue(it));
+ }
+ }
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extras", it)) {
+ scene.extras_json_string = JsonToString(GetValue(it));
+ }
+ }
+ }
+
+ model->scenes.emplace_back(std::move(scene));
+ return true;
+ });
+
+ if (!success) {
+ return false;
+ }
+ }
+
+ // 9. Parse default scenes.
+ {
+ json_const_iterator rootIt;
+ int iVal;
+ if (FindMember(v, "scene", rootIt) && GetInt(GetValue(rootIt), iVal)) {
+ model->defaultScene = iVal;
+ }
+ }
+
+ // 10. Parse Material
+ {
+ bool success = ForEachInArray(v, "materials", [&](const json &o) {
+ if (!IsObject(o)) {
+ if (err) {
+ (*err) += "`materials' does not contain an JSON object.";
+ }
+ return false;
+ }
+ Material material;
+ ParseStringProperty(&material.name, err, o, "name", false);
+
+ if (!ParseMaterial(&material, err, o,
+ store_original_json_for_extras_and_extensions_)) {
+ return false;
+ }
+
+ model->materials.emplace_back(std::move(material));
+ return true;
+ });
+
+ if (!success) {
+ return false;
+ }
+ }
+
+ // 11. Parse Image
+ void *load_image_user_data{nullptr};
+
+ LoadImageDataOption load_image_option;
+
+ if (user_image_loader_) {
+ // Use user supplied pointer
+ load_image_user_data = load_image_user_data_;
+ } else {
+ load_image_option.preserve_channels = preserve_image_channels_;
+ load_image_user_data = reinterpret_cast<void *>(&load_image_option);
+ }
+
+ {
+ int idx = 0;
+ bool success = ForEachInArray(v, "images", [&](const json &o) {
+ if (!IsObject(o)) {
+ if (err) {
+ (*err) += "image[" + std::to_string(idx) + "] is not a JSON object.";
+ }
+ return false;
+ }
+ Image image;
+ if (!ParseImage(&image, idx, err, warn, o,
+ store_original_json_for_extras_and_extensions_, base_dir,
+ &fs, &this->LoadImageData, load_image_user_data)) {
+ return false;
+ }
+
+ if (image.bufferView != -1) {
+ // Load image from the buffer view.
+ if (size_t(image.bufferView) >= model->bufferViews.size()) {
+ if (err) {
+ std::stringstream ss;
+ ss << "image[" << idx << "] bufferView \"" << image.bufferView
+ << "\" not found in the scene." << std::endl;
+ (*err) += ss.str();
+ }
+ return false;
+ }
+
+ const BufferView &bufferView =
+ model->bufferViews[size_t(image.bufferView)];
+ if (size_t(bufferView.buffer) >= model->buffers.size()) {
+ if (err) {
+ std::stringstream ss;
+ ss << "image[" << idx << "] buffer \"" << bufferView.buffer
+ << "\" not found in the scene." << std::endl;
+ (*err) += ss.str();
+ }
+ return false;
+ }
+ const Buffer &buffer = model->buffers[size_t(bufferView.buffer)];
+
+ if (LoadImageData == nullptr) {
+ if (err) {
+ (*err) += "No LoadImageData callback specified.\n";
+ }
+ return false;
+ }
+ bool ret = LoadImageData(
+ &image, idx, err, warn, image.width, image.height,
+ &buffer.data[bufferView.byteOffset],
+ static_cast<int>(bufferView.byteLength), load_image_user_data);
+ if (!ret) {
+ return false;
+ }
+ }
+
+ model->images.emplace_back(std::move(image));
+ ++idx;
+ return true;
+ });
+
+ if (!success) {
+ return false;
+ }
+ }
+
+ // 12. Parse Texture
+ {
+ bool success = ForEachInArray(v, "textures", [&](const json &o) {
+ if (!IsObject(o)) {
+ if (err) {
+ (*err) += "`textures' does not contain an JSON object.";
+ }
+ return false;
+ }
+ Texture texture;
+ if (!ParseTexture(&texture, err, o,
+ store_original_json_for_extras_and_extensions_,
+ base_dir)) {
+ return false;
+ }
+
+ model->textures.emplace_back(std::move(texture));
+ return true;
+ });
+
+ if (!success) {
+ return false;
+ }
+ }
+
+ // 13. Parse Animation
+ {
+ bool success = ForEachInArray(v, "animations", [&](const json &o) {
+ if (!IsObject(o)) {
+ if (err) {
+ (*err) += "`animations' does not contain an JSON object.";
+ }
+ return false;
+ }
+ Animation animation;
+ if (!ParseAnimation(&animation, err, o,
+ store_original_json_for_extras_and_extensions_)) {
+ return false;
+ }
+
+ model->animations.emplace_back(std::move(animation));
+ return true;
+ });
+
+ if (!success) {
+ return false;
+ }
+ }
+
+ // 14. Parse Skin
+ {
+ bool success = ForEachInArray(v, "skins", [&](const json &o) {
+ if (!IsObject(o)) {
+ if (err) {
+ (*err) += "`skins' does not contain an JSON object.";
+ }
+ return false;
+ }
+ Skin skin;
+ if (!ParseSkin(&skin, err, o,
+ store_original_json_for_extras_and_extensions_)) {
+ return false;
+ }
+
+ model->skins.emplace_back(std::move(skin));
+ return true;
+ });
+
+ if (!success) {
+ return false;
+ }
+ }
+
+ // 15. Parse Sampler
+ {
+ bool success = ForEachInArray(v, "samplers", [&](const json &o) {
+ if (!IsObject(o)) {
+ if (err) {
+ (*err) += "`samplers' does not contain an JSON object.";
+ }
+ return false;
+ }
+ Sampler sampler;
+ if (!ParseSampler(&sampler, err, o,
+ store_original_json_for_extras_and_extensions_)) {
+ return false;
+ }
+
+ model->samplers.emplace_back(std::move(sampler));
+ return true;
+ });
+
+ if (!success) {
+ return false;
+ }
+ }
+
+ // 16. Parse Camera
+ {
+ bool success = ForEachInArray(v, "cameras", [&](const json &o) {
+ if (!IsObject(o)) {
+ if (err) {
+ (*err) += "`cameras' does not contain an JSON object.";
+ }
+ return false;
+ }
+ Camera camera;
+ if (!ParseCamera(&camera, err, o,
+ store_original_json_for_extras_and_extensions_)) {
+ return false;
+ }
+
+ model->cameras.emplace_back(std::move(camera));
+ return true;
+ });
+
+ if (!success) {
+ return false;
+ }
+ }
+
+ // 17. Parse Extensions
+ ParseExtensionsProperty(&model->extensions, err, v);
+
+ // 18. Specific extension implementations
+ {
+ json_const_iterator rootIt;
+ if (FindMember(v, "extensions", rootIt) && IsObject(GetValue(rootIt))) {
+ const json &root = GetValue(rootIt);
+
+ json_const_iterator it(ObjectBegin(root));
+ json_const_iterator itEnd(ObjectEnd(root));
+ for (; it != itEnd; ++it) {
+ // parse KHR_lights_punctual extension
+ std::string key(GetKey(it));
+ if ((key == "KHR_lights_punctual") && IsObject(GetValue(it))) {
+ const json &object = GetValue(it);
+ json_const_iterator itLight;
+ if (FindMember(object, "lights", itLight)) {
+ const json &lights = GetValue(itLight);
+ if (!IsArray(lights)) {
+ continue;
+ }
+
+ auto arrayIt(ArrayBegin(lights));
+ auto arrayItEnd(ArrayEnd(lights));
+ for (; arrayIt != arrayItEnd; ++arrayIt) {
+ Light light;
+ if (!ParseLight(&light, err, *arrayIt,
+ store_original_json_for_extras_and_extensions_)) {
+ return false;
+ }
+ model->lights.emplace_back(std::move(light));
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // 19. Parse Extras
+ ParseExtrasProperty(&model->extras, v);
+
+ if (store_original_json_for_extras_and_extensions_) {
+ model->extras_json_string = JsonToString(v["extras"]);
+ model->extensions_json_string = JsonToString(v["extensions"]);
+ }
+
+ return true;
+}
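+
+// Hedged usage note: `check_sections` is a bitmask of the REQUIRE_* flags
+// tested above (REQUIRE_VERSION, REQUIRE_SCENES, REQUIRE_NODES,
+// REQUIRE_ACCESSORS, REQUIRE_BUFFERS, REQUIRE_BUFFER_VIEWS), so a caller can
+// relax validation, e.g.:
+//
+//   loader.LoadASCIIFromString(&model, &err, &warn, str, len, dir,
+//                              tinygltf::REQUIRE_VERSION |
+//                              tinygltf::REQUIRE_BUFFERS);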
+
+bool TinyGLTF::LoadASCIIFromString(Model *model, std::string *err,
+ std::string *warn, const char *str,
+ unsigned int length,
+ const std::string &base_dir,
+ unsigned int check_sections) {
+ is_binary_ = false;
+ bin_data_ = nullptr;
+ bin_size_ = 0;
+
+ return LoadFromString(model, err, warn, str, length, base_dir,
+ check_sections);
+}
+
+bool TinyGLTF::LoadASCIIFromFile(Model *model, std::string *err,
+ std::string *warn, const std::string &filename,
+ unsigned int check_sections) {
+ std::stringstream ss;
+
+ if (fs.ReadWholeFile == nullptr) {
+ // Programmer error, assert() ?
+ ss << "Failed to read file: " << filename
+ << ": one or more FS callback not set" << std::endl;
+ if (err) {
+ (*err) = ss.str();
+ }
+ return false;
+ }
+
+ std::vector<unsigned char> data;
+ std::string fileerr;
+ bool fileread = fs.ReadWholeFile(&data, &fileerr, filename, fs.user_data);
+ if (!fileread) {
+ ss << "Failed to read file: " << filename << ": " << fileerr << std::endl;
+ if (err) {
+ (*err) = ss.str();
+ }
+ return false;
+ }
+
+ size_t sz = data.size();
+ if (sz == 0) {
+ if (err) {
+ (*err) = "Empty file.";
+ }
+ return false;
+ }
+
+ std::string basedir = GetBaseDir(filename);
+
+ bool ret = LoadASCIIFromString(
+ model, err, warn, reinterpret_cast<const char *>(&data.at(0)),
+ static_cast<unsigned int>(data.size()), basedir, check_sections);
+
+ return ret;
+}
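+
+// A minimal usage sketch, assuming a "scene.gltf" (hypothetical filename)
+// that references its buffers/images relative to its own directory:
+//
+//   tinygltf::Model model;
+//   tinygltf::TinyGLTF loader;
+//   std::string err, warn;
+//   bool ok = loader.LoadASCIIFromFile(&model, &err, &warn, "scene.gltf");
+//   if (!warn.empty()) { /* report warning */ }
+//   if (!ok) { /* report err and bail out */ }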
+
+bool TinyGLTF::LoadBinaryFromMemory(Model *model, std::string *err,
+ std::string *warn,
+ const unsigned char *bytes,
+ unsigned int size,
+ const std::string &base_dir,
+ unsigned int check_sections) {
+ if (size < 20) {
+ if (err) {
+ (*err) = "Too short data size for glTF Binary.";
+ }
+ return false;
+ }
+
+ if (bytes[0] == 'g' && bytes[1] == 'l' && bytes[2] == 'T' &&
+ bytes[3] == 'F') {
+ // ok
+ } else {
+ if (err) {
+ (*err) = "Invalid magic.";
+ }
+ return false;
+ }
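+
+ // glTF 2.0 binary (.glb) layout assumed by the reads below: a 12-byte file
+ // header followed by the first (JSON) chunk header.
+ // bytes 0..3 magic "glTF"
+ // bytes 4..7 version
+ // bytes 8..11 total length of the file
+ // bytes 12..15 length of the JSON chunk ("model_length")
+ // bytes 16..19 chunk type, 0x4E4F534A ("JSON")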
+
+ unsigned int version; // 4 bytes
+ unsigned int length; // 4 bytes
+ unsigned int model_length; // 4 bytes
+ unsigned int model_format; // 4 bytes
+
+ // @todo { Endian swap for big endian machine. }
+ memcpy(&version, bytes + 4, 4);
+ swap4(&version);
+ memcpy(&length, bytes + 8, 4);
+ swap4(&length);
+ memcpy(&model_length, bytes + 12, 4);
+ swap4(&model_length);
+ memcpy(&model_format, bytes + 16, 4);
+ swap4(&model_format);
+
+ // When the BIN buffer is not present, the size is exactly 20 + the size of
+ // the JSON contents, so use the "greater than" operator.
+ if ((20 + model_length > size) || (model_length < 1) || (length > size) ||
+ (20 + model_length > length) ||
+ (model_format != 0x4E4F534A)) { // 0x4E4F534A = JSON format.
+ if (err) {
+ (*err) = "Invalid glTF binary.";
+ }
+ return false;
+ }
+
+ // Extract JSON string.
+ std::string jsonString(reinterpret_cast<const char *>(&bytes[20]),
+ model_length);
+
+ is_binary_ = true;
+ bin_data_ = bytes + 20 + model_length +
+ 8; // skip the BIN chunk header: 4 bytes (buffer_length) + 4 bytes (buffer_format)
+ bin_size_ =
+ length - (20 + model_length); // remaining bytes after the header + JSON scene data.
+
+ bool ret = LoadFromString(model, err, warn,
+ reinterpret_cast<const char *>(&bytes[20]),
+ model_length, base_dir, check_sections);
+ if (!ret) {
+ return ret;
+ }
+
+ return true;
+}
+
+bool TinyGLTF::LoadBinaryFromFile(Model *model, std::string *err,
+ std::string *warn,
+ const std::string &filename,
+ unsigned int check_sections) {
+ std::stringstream ss;
+
+ if (fs.ReadWholeFile == nullptr) {
+ // Programmer error, assert() ?
+ ss << "Failed to read file: " << filename
+ << ": one or more FS callback not set" << std::endl;
+ if (err) {
+ (*err) = ss.str();
+ }
+ return false;
+ }
+
+ std::vector<unsigned char> data;
+ std::string fileerr;
+ bool fileread = fs.ReadWholeFile(&data, &fileerr, filename, fs.user_data);
+ if (!fileread) {
+ ss << "Failed to read file: " << filename << ": " << fileerr << std::endl;
+ if (err) {
+ (*err) = ss.str();
+ }
+ return false;
+ }
+
+ std::string basedir = GetBaseDir(filename);
+
+ bool ret = LoadBinaryFromMemory(model, err, warn, &data.at(0),
+ static_cast<unsigned int>(data.size()),
+ basedir, check_sections);
+
+ return ret;
+}
+
+///////////////////////
+// GLTF Serialization
+///////////////////////
+namespace {
+json JsonFromString(const char *s) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ return json(s, GetAllocator());
+#else
+ return json(s);
+#endif
+}
+
+void JsonAssign(json &dest, const json &src) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ dest.CopyFrom(src, GetAllocator());
+#else
+ dest = src;
+#endif
+}
+
+void JsonAddMember(json &o, const char *key, json &&value) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ if (!o.IsObject()) {
+ o.SetObject();
+ }
+ o.AddMember(json(key, GetAllocator()), std::move(value), GetAllocator());
+#else
+ o[key] = std::move(value);
+#endif
+}
+
+void JsonPushBack(json &o, json &&value) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ o.PushBack(std::move(value), GetAllocator());
+#else
+ o.push_back(std::move(value));
+#endif
+}
+
+bool JsonIsNull(const json &o) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ return o.IsNull();
+#else
+ return o.is_null();
+#endif
+}
+
+void JsonSetObject(json &o) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ o.SetObject();
+#else
+ o = json::object();
+#endif
+}
+
+void JsonReserveArray(json &o, size_t s) {
+#ifdef TINYGLTF_USE_RAPIDJSON
+ o.SetArray();
+ o.Reserve(static_cast<rapidjson::SizeType>(s), GetAllocator());
+#endif
+ (void)(o);
+ (void)(s);
+}
+} // namespace
+
+// typedef std::pair<std::string, json> json_object_pair;
+
+template <typename T>
+static void SerializeNumberProperty(const std::string &key, T number,
+ json &obj) {
+ // obj.insert(
+ // json_object_pair(key, json(static_cast<double>(number))));
+ // obj[key] = static_cast<double>(number);
+ JsonAddMember(obj, key.c_str(), json(number));
+}
+
+#ifdef TINYGLTF_USE_RAPIDJSON
+template <>
+void SerializeNumberProperty(const std::string &key, size_t number, json &obj) {
+ JsonAddMember(obj, key.c_str(), json(static_cast<uint64_t>(number)));
+}
+#endif
+
+template <typename T>
+static void SerializeNumberArrayProperty(const std::string &key,
+ const std::vector<T> &value,
+ json &obj) {
+ if (value.empty()) return;
+
+ json ary;
+ JsonReserveArray(ary, value.size());
+ for (const auto &s : value) {
+ JsonPushBack(ary, json(s));
+ }
+ JsonAddMember(obj, key.c_str(), std::move(ary));
+}
+
+static void SerializeStringProperty(const std::string &key,
+ const std::string &value, json &obj) {
+ JsonAddMember(obj, key.c_str(), JsonFromString(value.c_str()));
+}
+
+static void SerializeStringArrayProperty(const std::string &key,
+ const std::vector<std::string> &value,
+ json &obj) {
+ json ary;
+ JsonReserveArray(ary, value.size());
+ for (auto &s : value) {
+ JsonPushBack(ary, JsonFromString(s.c_str()));
+ }
+ JsonAddMember(obj, key.c_str(), std::move(ary));
+}
+
+static bool ValueToJson(const Value &value, json *ret) {
+ json obj;
+#ifdef TINYGLTF_USE_RAPIDJSON
+ switch (value.Type()) {
+ case REAL_TYPE:
+ obj.SetDouble(value.Get<double>());
+ break;
+ case INT_TYPE:
+ obj.SetInt(value.Get<int>());
+ break;
+ case BOOL_TYPE:
+ obj.SetBool(value.Get<bool>());
+ break;
+ case STRING_TYPE:
+ obj.SetString(value.Get<std::string>().c_str(), GetAllocator());
+ break;
+ case ARRAY_TYPE: {
+ obj.SetArray();
+ obj.Reserve(static_cast<rapidjson::SizeType>(value.ArrayLen()),
+ GetAllocator());
+ for (unsigned int i = 0; i < value.ArrayLen(); ++i) {
+ Value elementValue = value.Get(int(i));
+ json elementJson;
+ if (ValueToJson(elementValue, &elementJson))
+ obj.PushBack(std::move(elementJson), GetAllocator());
+ }
+ break;
+ }
+ case BINARY_TYPE:
+ // TODO
+ // obj = json(value.Get<std::vector<unsigned char>>());
+ return false;
+ break;
+ case OBJECT_TYPE: {
+ obj.SetObject();
+ Value::Object objMap = value.Get<Value::Object>();
+ for (auto &it : objMap) {
+ json elementJson;
+ if (ValueToJson(it.second, &elementJson)) {
+ obj.AddMember(json(it.first.c_str(), GetAllocator()),
+ std::move(elementJson), GetAllocator());
+ }
+ }
+ break;
+ }
+ case NULL_TYPE:
+ default:
+ return false;
+ }
+#else
+ switch (value.Type()) {
+ case REAL_TYPE:
+ obj = json(value.Get<double>());
+ break;
+ case INT_TYPE:
+ obj = json(value.Get<int>());
+ break;
+ case BOOL_TYPE:
+ obj = json(value.Get<bool>());
+ break;
+ case STRING_TYPE:
+ obj = json(value.Get<std::string>());
+ break;
+ case ARRAY_TYPE: {
+ for (unsigned int i = 0; i < value.ArrayLen(); ++i) {
+ Value elementValue = value.Get(int(i));
+ json elementJson;
+ if (ValueToJson(elementValue, &elementJson))
+ obj.push_back(elementJson);
+ }
+ break;
+ }
+ case BINARY_TYPE:
+ // TODO
+ // obj = json(value.Get<std::vector<unsigned char>>());
+ return false;
+ break;
+ case OBJECT_TYPE: {
+ Value::Object objMap = value.Get<Value::Object>();
+ for (auto &it : objMap) {
+ json elementJson;
+ if (ValueToJson(it.second, &elementJson)) obj[it.first] = elementJson;
+ }
+ break;
+ }
+ case NULL_TYPE:
+ default:
+ return false;
+ }
+#endif
+ if (ret) *ret = std::move(obj);
+ return true;
+}
+
+static void SerializeValue(const std::string &key, const Value &value,
+ json &obj) {
+ json ret;
+ if (ValueToJson(value, &ret)) {
+ JsonAddMember(obj, key.c_str(), std::move(ret));
+ }
+}
+
+static void SerializeGltfBufferData(const std::vector<unsigned char> &data,
+ json &o) {
+ std::string header = "data:application/octet-stream;base64,";
+ if (data.size() > 0) {
+ std::string encodedData =
+ base64_encode(&data[0], static_cast<unsigned int>(data.size()));
+ SerializeStringProperty("uri", header + encodedData, o);
+ } else {
+ // Issue #229
+ // size 0 is allowed. Just emit the MIME header.
+ SerializeStringProperty("uri", header, o);
+ }
+}
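+
+// Illustrative sketch (payload hypothetical): the embedded form written above
+// is a base64 data URI, e.g.
+//
+//   "uri": "data:application/octet-stream;base64,AAAAAA=="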
+
+static bool SerializeGltfBufferData(const std::vector<unsigned char> &data,
+ const std::string &binFilename) {
+#ifdef _WIN32
+#if defined(__GLIBCXX__) // mingw
+ int file_descriptor = _wopen(UTF8ToWchar(binFilename).c_str(),
+ _O_CREAT | _O_WRONLY | _O_TRUNC | _O_BINARY);
+ __gnu_cxx::stdio_filebuf<char> wfile_buf(
+ file_descriptor, std::ios_base::out | std::ios_base::binary);
+ std::ostream output(&wfile_buf);
+ if (!wfile_buf.is_open()) return false;
+#elif defined(_MSC_VER)
+ std::ofstream output(UTF8ToWchar(binFilename).c_str(), std::ofstream::binary);
+ if (!output.is_open()) return false;
+#else
+ std::ofstream output(binFilename.c_str(), std::ofstream::binary);
+ if (!output.is_open()) return false;
+#endif
+#else
+ std::ofstream output(binFilename.c_str(), std::ofstream::binary);
+ if (!output.is_open()) return false;
+#endif
+ if (data.size() > 0) {
+ output.write(reinterpret_cast<const char *>(&data[0]),
+ std::streamsize(data.size()));
+ } else {
+ // Issue #229
+ // size 0 is still valid buffer data.
+ // write an empty file.
+ }
+ return true;
+}
+
+#if 0 // FIXME(syoyo): not used. will be removed in the future release.
+static void SerializeParameterMap(ParameterMap &param, json &o) {
+ for (ParameterMap::iterator paramIt = param.begin(); paramIt != param.end();
+ ++paramIt) {
+ if (paramIt->second.number_array.size()) {
+ SerializeNumberArrayProperty<double>(paramIt->first,
+ paramIt->second.number_array, o);
+ } else if (paramIt->second.json_double_value.size()) {
+ json json_double_value;
+ for (std::map<std::string, double>::iterator it =
+ paramIt->second.json_double_value.begin();
+ it != paramIt->second.json_double_value.end(); ++it) {
+ if (it->first == "index") {
+ json_double_value[it->first] = paramIt->second.TextureIndex();
+ } else {
+ json_double_value[it->first] = it->second;
+ }
+ }
+
+ o[paramIt->first] = json_double_value;
+ } else if (!paramIt->second.string_value.empty()) {
+ SerializeStringProperty(paramIt->first, paramIt->second.string_value, o);
+ } else if (paramIt->second.has_number_value) {
+ o[paramIt->first] = paramIt->second.number_value;
+ } else {
+ o[paramIt->first] = paramIt->second.bool_value;
+ }
+ }
+}
+#endif
+
+static void SerializeExtensionMap(const ExtensionMap &extensions, json &o) {
+ if (!extensions.size()) return;
+
+ json extMap;
+ for (ExtensionMap::const_iterator extIt = extensions.begin();
+ extIt != extensions.end(); ++extIt) {
+    // Allow an empty object for an extension (#97).
+ json ret;
+ bool isNull = true;
+ if (ValueToJson(extIt->second, &ret)) {
+ isNull = JsonIsNull(ret);
+ JsonAddMember(extMap, extIt->first.c_str(), std::move(ret));
+ }
+ if (isNull) {
+      if (!(extIt->first.empty())) {
+        // The name should never be empty, but check to be safe: create an
+        // empty object so that the extension name is still included in the
+        // JSON.
+ json empty;
+ JsonSetObject(empty);
+ JsonAddMember(extMap, extIt->first.c_str(), std::move(empty));
+ }
+ }
+ }
+ JsonAddMember(o, "extensions", std::move(extMap));
+}
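+
+// For reference, the loop above produces JSON of the shape below (the
+// extension name is illustrative; a null-valued extension is emitted as an
+// empty object, see #97):
+//   "extensions": { "KHR_materials_unlit": {} }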
+
+static void SerializeGltfAccessor(Accessor &accessor, json &o) {
+ if (accessor.bufferView >= 0)
+ SerializeNumberProperty<int>("bufferView", accessor.bufferView, o);
+
+ if (accessor.byteOffset != 0)
+ SerializeNumberProperty<int>("byteOffset", int(accessor.byteOffset), o);
+
+ SerializeNumberProperty<int>("componentType", accessor.componentType, o);
+ SerializeNumberProperty<size_t>("count", accessor.count, o);
+
+ if ((accessor.componentType == TINYGLTF_COMPONENT_TYPE_FLOAT) ||
+ (accessor.componentType == TINYGLTF_COMPONENT_TYPE_DOUBLE)) {
+ SerializeNumberArrayProperty<double>("min", accessor.minValues, o);
+ SerializeNumberArrayProperty<double>("max", accessor.maxValues, o);
+ } else {
+    // Issue #301. Serialize as integer.
+    // Assume the value fits in a 32-bit int, i.e. within [-2**31, 2**31-1].
+ {
+ std::vector<int> values;
+ std::transform(accessor.minValues.begin(), accessor.minValues.end(),
+ std::back_inserter(values),
+ [](double v) { return static_cast<int>(v); });
+
+ SerializeNumberArrayProperty<int>("min", values, o);
+ }
+
+ {
+ std::vector<int> values;
+ std::transform(accessor.maxValues.begin(), accessor.maxValues.end(),
+ std::back_inserter(values),
+ [](double v) { return static_cast<int>(v); });
+
+ SerializeNumberArrayProperty<int>("max", values, o);
+ }
+ }
+
+ if (accessor.normalized)
+ SerializeValue("normalized", Value(accessor.normalized), o);
+ std::string type;
+ switch (accessor.type) {
+ case TINYGLTF_TYPE_SCALAR:
+ type = "SCALAR";
+ break;
+ case TINYGLTF_TYPE_VEC2:
+ type = "VEC2";
+ break;
+ case TINYGLTF_TYPE_VEC3:
+ type = "VEC3";
+ break;
+ case TINYGLTF_TYPE_VEC4:
+ type = "VEC4";
+ break;
+ case TINYGLTF_TYPE_MAT2:
+ type = "MAT2";
+ break;
+ case TINYGLTF_TYPE_MAT3:
+ type = "MAT3";
+ break;
+ case TINYGLTF_TYPE_MAT4:
+ type = "MAT4";
+ break;
+ }
+
+ SerializeStringProperty("type", type, o);
+ if (!accessor.name.empty()) SerializeStringProperty("name", accessor.name, o);
+
+ if (accessor.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", accessor.extras, o);
+ }
+}
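+
+// For reference, a float POSITION accessor serialized by the function above
+// looks roughly like this (values illustrative; 5126 == FLOAT):
+//   { "bufferView": 0, "componentType": 5126, "count": 24, "type": "VEC3",
+//     "min": [-1.0, -1.0, -1.0], "max": [1.0, 1.0, 1.0] }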
+
+static void SerializeGltfAnimationChannel(AnimationChannel &channel, json &o) {
+ SerializeNumberProperty("sampler", channel.sampler, o);
+ {
+ json target;
+ SerializeNumberProperty("node", channel.target_node, target);
+ SerializeStringProperty("path", channel.target_path, target);
+
+ SerializeExtensionMap(channel.target_extensions, target);
+
+ JsonAddMember(o, "target", std::move(target));
+ }
+
+ if (channel.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", channel.extras, o);
+ }
+
+ SerializeExtensionMap(channel.extensions, o);
+}
+
+static void SerializeGltfAnimationSampler(AnimationSampler &sampler, json &o) {
+ SerializeNumberProperty("input", sampler.input, o);
+ SerializeNumberProperty("output", sampler.output, o);
+ SerializeStringProperty("interpolation", sampler.interpolation, o);
+
+ if (sampler.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", sampler.extras, o);
+ }
+}
+
+static void SerializeGltfAnimation(Animation &animation, json &o) {
+ if (!animation.name.empty())
+ SerializeStringProperty("name", animation.name, o);
+
+ {
+ json channels;
+ JsonReserveArray(channels, animation.channels.size());
+ for (unsigned int i = 0; i < animation.channels.size(); ++i) {
+ json channel;
+ AnimationChannel gltfChannel = animation.channels[i];
+ SerializeGltfAnimationChannel(gltfChannel, channel);
+ JsonPushBack(channels, std::move(channel));
+ }
+
+ JsonAddMember(o, "channels", std::move(channels));
+ }
+
+ {
+ json samplers;
+ JsonReserveArray(samplers, animation.samplers.size());
+ for (unsigned int i = 0; i < animation.samplers.size(); ++i) {
+ json sampler;
+ AnimationSampler gltfSampler = animation.samplers[i];
+ SerializeGltfAnimationSampler(gltfSampler, sampler);
+ JsonPushBack(samplers, std::move(sampler));
+ }
+ JsonAddMember(o, "samplers", std::move(samplers));
+ }
+
+ if (animation.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", animation.extras, o);
+ }
+
+ SerializeExtensionMap(animation.extensions, o);
+}
+
+static void SerializeGltfAsset(Asset &asset, json &o) {
+ if (!asset.generator.empty()) {
+ SerializeStringProperty("generator", asset.generator, o);
+ }
+
+ if (!asset.copyright.empty()) {
+ SerializeStringProperty("copyright", asset.copyright, o);
+ }
+
+ if (asset.version.empty()) {
+    // `version` is required, so fall back to "2.0" just in case.
+ asset.version = "2.0";
+ }
+
+ // TODO(syoyo): Do we need to check if `version` is greater or equal to 2.0?
+ SerializeStringProperty("version", asset.version, o);
+
+ if (asset.extras.Keys().size()) {
+ SerializeValue("extras", asset.extras, o);
+ }
+
+ SerializeExtensionMap(asset.extensions, o);
+}
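+
+// Minimal possible output of the function above: { "version": "2.0" }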
+
+static void SerializeGltfBufferBin(Buffer &buffer, json &o,
+ std::vector<unsigned char> &binBuffer) {
+ SerializeNumberProperty("byteLength", buffer.data.size(), o);
+ binBuffer = buffer.data;
+
+ if (buffer.name.size()) SerializeStringProperty("name", buffer.name, o);
+
+ if (buffer.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", buffer.extras, o);
+ }
+}
+
+static void SerializeGltfBuffer(Buffer &buffer, json &o) {
+ SerializeNumberProperty("byteLength", buffer.data.size(), o);
+ SerializeGltfBufferData(buffer.data, o);
+
+ if (buffer.name.size()) SerializeStringProperty("name", buffer.name, o);
+
+ if (buffer.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", buffer.extras, o);
+ }
+}
+
+static bool SerializeGltfBuffer(Buffer &buffer, json &o,
+ const std::string &binFilename,
+ const std::string &binBaseFilename) {
+ if (!SerializeGltfBufferData(buffer.data, binFilename)) return false;
+ SerializeNumberProperty("byteLength", buffer.data.size(), o);
+ SerializeStringProperty("uri", binBaseFilename, o);
+
+ if (buffer.name.size()) SerializeStringProperty("name", buffer.name, o);
+
+ if (buffer.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", buffer.extras, o);
+ }
+ return true;
+}
+
+static void SerializeGltfBufferView(BufferView &bufferView, json &o) {
+ SerializeNumberProperty("buffer", bufferView.buffer, o);
+ SerializeNumberProperty<size_t>("byteLength", bufferView.byteLength, o);
+
+  // byteStride is optional; the minimum allowed value is 4
+ if (bufferView.byteStride >= 4) {
+ SerializeNumberProperty<size_t>("byteStride", bufferView.byteStride, o);
+ }
+ // byteOffset is optional, default is 0
+ if (bufferView.byteOffset > 0) {
+ SerializeNumberProperty<size_t>("byteOffset", bufferView.byteOffset, o);
+ }
+ // Target is optional, check if it contains a valid value
+ if (bufferView.target == TINYGLTF_TARGET_ARRAY_BUFFER ||
+ bufferView.target == TINYGLTF_TARGET_ELEMENT_ARRAY_BUFFER) {
+ SerializeNumberProperty("target", bufferView.target, o);
+ }
+ if (bufferView.name.size()) {
+ SerializeStringProperty("name", bufferView.name, o);
+ }
+
+ if (bufferView.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", bufferView.extras, o);
+ }
+}
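+
+// Example output of the function above (values illustrative;
+// 34962 == TINYGLTF_TARGET_ARRAY_BUFFER):
+//   { "buffer": 0, "byteLength": 288, "byteStride": 12, "target": 34962 }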
+
+static void SerializeGltfImage(Image &image, json &o) {
+  // If the URI is empty, mimeType and bufferView must be set.
+ if (image.uri.empty()) {
+ SerializeStringProperty("mimeType", image.mimeType, o);
+ SerializeNumberProperty<int>("bufferView", image.bufferView, o);
+ } else {
+    // TODO(syoyo): dlib::urlencode?
+ SerializeStringProperty("uri", image.uri, o);
+ }
+
+ if (image.name.size()) {
+ SerializeStringProperty("name", image.name, o);
+ }
+
+ if (image.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", image.extras, o);
+ }
+
+ SerializeExtensionMap(image.extensions, o);
+}
+
+static void SerializeGltfTextureInfo(TextureInfo &texinfo, json &o) {
+ SerializeNumberProperty("index", texinfo.index, o);
+
+ if (texinfo.texCoord != 0) {
+ SerializeNumberProperty("texCoord", texinfo.texCoord, o);
+ }
+
+ if (texinfo.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", texinfo.extras, o);
+ }
+
+ SerializeExtensionMap(texinfo.extensions, o);
+}
+
+static void SerializeGltfNormalTextureInfo(NormalTextureInfo &texinfo,
+ json &o) {
+ SerializeNumberProperty("index", texinfo.index, o);
+
+ if (texinfo.texCoord != 0) {
+ SerializeNumberProperty("texCoord", texinfo.texCoord, o);
+ }
+
+ if (!TINYGLTF_DOUBLE_EQUAL(texinfo.scale, 1.0)) {
+ SerializeNumberProperty("scale", texinfo.scale, o);
+ }
+
+ if (texinfo.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", texinfo.extras, o);
+ }
+
+ SerializeExtensionMap(texinfo.extensions, o);
+}
+
+static void SerializeGltfOcclusionTextureInfo(OcclusionTextureInfo &texinfo,
+ json &o) {
+ SerializeNumberProperty("index", texinfo.index, o);
+
+ if (texinfo.texCoord != 0) {
+ SerializeNumberProperty("texCoord", texinfo.texCoord, o);
+ }
+
+ if (!TINYGLTF_DOUBLE_EQUAL(texinfo.strength, 1.0)) {
+ SerializeNumberProperty("strength", texinfo.strength, o);
+ }
+
+ if (texinfo.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", texinfo.extras, o);
+ }
+
+ SerializeExtensionMap(texinfo.extensions, o);
+}
+
+static void SerializeGltfPbrMetallicRoughness(PbrMetallicRoughness &pbr,
+ json &o) {
+ std::vector<double> default_baseColorFactor = {1.0, 1.0, 1.0, 1.0};
+ if (!Equals(pbr.baseColorFactor, default_baseColorFactor)) {
+ SerializeNumberArrayProperty<double>("baseColorFactor", pbr.baseColorFactor,
+ o);
+ }
+
+ if (!TINYGLTF_DOUBLE_EQUAL(pbr.metallicFactor, 1.0)) {
+ SerializeNumberProperty("metallicFactor", pbr.metallicFactor, o);
+ }
+
+ if (!TINYGLTF_DOUBLE_EQUAL(pbr.roughnessFactor, 1.0)) {
+ SerializeNumberProperty("roughnessFactor", pbr.roughnessFactor, o);
+ }
+
+ if (pbr.baseColorTexture.index > -1) {
+ json texinfo;
+ SerializeGltfTextureInfo(pbr.baseColorTexture, texinfo);
+ JsonAddMember(o, "baseColorTexture", std::move(texinfo));
+ }
+
+ if (pbr.metallicRoughnessTexture.index > -1) {
+ json texinfo;
+ SerializeGltfTextureInfo(pbr.metallicRoughnessTexture, texinfo);
+ JsonAddMember(o, "metallicRoughnessTexture", std::move(texinfo));
+ }
+
+ SerializeExtensionMap(pbr.extensions, o);
+
+ if (pbr.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", pbr.extras, o);
+ }
+}
+
+static void SerializeGltfMaterial(Material &material, json &o) {
+ if (material.name.size()) {
+ SerializeStringProperty("name", material.name, o);
+ }
+
+  // QUESTION(syoyo): Write material parameters regardless of their default values?
+
+ if (!TINYGLTF_DOUBLE_EQUAL(material.alphaCutoff, 0.5)) {
+ SerializeNumberProperty("alphaCutoff", material.alphaCutoff, o);
+ }
+
+ if (material.alphaMode.compare("OPAQUE") != 0) {
+ SerializeStringProperty("alphaMode", material.alphaMode, o);
+ }
+
+ if (material.doubleSided != false)
+ JsonAddMember(o, "doubleSided", json(material.doubleSided));
+
+ if (material.normalTexture.index > -1) {
+ json texinfo;
+ SerializeGltfNormalTextureInfo(material.normalTexture, texinfo);
+ JsonAddMember(o, "normalTexture", std::move(texinfo));
+ }
+
+ if (material.occlusionTexture.index > -1) {
+ json texinfo;
+ SerializeGltfOcclusionTextureInfo(material.occlusionTexture, texinfo);
+ JsonAddMember(o, "occlusionTexture", std::move(texinfo));
+ }
+
+ if (material.emissiveTexture.index > -1) {
+ json texinfo;
+ SerializeGltfTextureInfo(material.emissiveTexture, texinfo);
+ JsonAddMember(o, "emissiveTexture", std::move(texinfo));
+ }
+
+ std::vector<double> default_emissiveFactor = {0.0, 0.0, 0.0};
+ if (!Equals(material.emissiveFactor, default_emissiveFactor)) {
+ SerializeNumberArrayProperty<double>("emissiveFactor",
+ material.emissiveFactor, o);
+ }
+
+ {
+ json pbrMetallicRoughness;
+ SerializeGltfPbrMetallicRoughness(material.pbrMetallicRoughness,
+ pbrMetallicRoughness);
+    // Issue 204
+    // Do not serialize `pbrMetallicRoughness` when all of its parameters
+    // have default values (the json is then null). Otherwise it would
+    // serialize to `pbrMetallicRoughness : null`, which cannot be read by
+    // other glTF importers (and validators).
+    //
+ if (!JsonIsNull(pbrMetallicRoughness)) {
+ JsonAddMember(o, "pbrMetallicRoughness", std::move(pbrMetallicRoughness));
+ }
+ }
+
+#if 0 // legacy way. just for the record.
+ if (material.values.size()) {
+ json pbrMetallicRoughness;
+ SerializeParameterMap(material.values, pbrMetallicRoughness);
+ JsonAddMember(o, "pbrMetallicRoughness", std::move(pbrMetallicRoughness));
+ }
+
+ SerializeParameterMap(material.additionalValues, o);
+#else
+
+#endif
+
+ SerializeExtensionMap(material.extensions, o);
+
+ if (material.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", material.extras, o);
+ }
+}
+
+static void SerializeGltfMesh(Mesh &mesh, json &o) {
+ json primitives;
+ JsonReserveArray(primitives, mesh.primitives.size());
+ for (unsigned int i = 0; i < mesh.primitives.size(); ++i) {
+ json primitive;
+ const Primitive &gltfPrimitive = mesh.primitives[i]; // don't make a copy
+ {
+ json attributes;
+ for (auto attrIt = gltfPrimitive.attributes.begin();
+ attrIt != gltfPrimitive.attributes.end(); ++attrIt) {
+ SerializeNumberProperty<int>(attrIt->first, attrIt->second, attributes);
+ }
+
+ JsonAddMember(primitive, "attributes", std::move(attributes));
+ }
+
+    // Indices are optional
+ if (gltfPrimitive.indices > -1) {
+ SerializeNumberProperty<int>("indices", gltfPrimitive.indices, primitive);
+ }
+ // Material is optional
+ if (gltfPrimitive.material > -1) {
+ SerializeNumberProperty<int>("material", gltfPrimitive.material,
+ primitive);
+ }
+ SerializeNumberProperty<int>("mode", gltfPrimitive.mode, primitive);
+
+ // Morph targets
+ if (gltfPrimitive.targets.size()) {
+ json targets;
+ JsonReserveArray(targets, gltfPrimitive.targets.size());
+ for (unsigned int k = 0; k < gltfPrimitive.targets.size(); ++k) {
+ json targetAttributes;
+ std::map<std::string, int> targetData = gltfPrimitive.targets[k];
+ for (std::map<std::string, int>::iterator attrIt = targetData.begin();
+ attrIt != targetData.end(); ++attrIt) {
+ SerializeNumberProperty<int>(attrIt->first, attrIt->second,
+ targetAttributes);
+ }
+ JsonPushBack(targets, std::move(targetAttributes));
+ }
+ JsonAddMember(primitive, "targets", std::move(targets));
+ }
+
+ SerializeExtensionMap(gltfPrimitive.extensions, primitive);
+
+ if (gltfPrimitive.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", gltfPrimitive.extras, primitive);
+ }
+
+ JsonPushBack(primitives, std::move(primitive));
+ }
+
+ JsonAddMember(o, "primitives", std::move(primitives));
+
+ if (mesh.weights.size()) {
+ SerializeNumberArrayProperty<double>("weights", mesh.weights, o);
+ }
+
+ if (mesh.name.size()) {
+ SerializeStringProperty("name", mesh.name, o);
+ }
+
+ SerializeExtensionMap(mesh.extensions, o);
+ if (mesh.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", mesh.extras, o);
+ }
+}
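+
+// For reference, a single triangle-list primitive serialized above looks
+// like this (accessor indices illustrative; mode 4 == TRIANGLES):
+//   { "attributes": { "POSITION": 0, "NORMAL": 1 },
+//     "indices": 2, "material": 0, "mode": 4 }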
+
+static void SerializeSpotLight(SpotLight &spot, json &o) {
+ SerializeNumberProperty("innerConeAngle", spot.innerConeAngle, o);
+ SerializeNumberProperty("outerConeAngle", spot.outerConeAngle, o);
+ SerializeExtensionMap(spot.extensions, o);
+ if (spot.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", spot.extras, o);
+ }
+}
+
+static void SerializeGltfLight(Light &light, json &o) {
+ if (!light.name.empty()) SerializeStringProperty("name", light.name, o);
+ SerializeNumberProperty("intensity", light.intensity, o);
+ if (light.range > 0.0) {
+ SerializeNumberProperty("range", light.range, o);
+ }
+ SerializeNumberArrayProperty("color", light.color, o);
+ SerializeStringProperty("type", light.type, o);
+ if (light.type == "spot") {
+ json spot;
+ SerializeSpotLight(light.spot, spot);
+ JsonAddMember(o, "spot", std::move(spot));
+ }
+ SerializeExtensionMap(light.extensions, o);
+ if (light.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", light.extras, o);
+ }
+}
+
+static void SerializeGltfNode(Node &node, json &o) {
+ if (node.translation.size() > 0) {
+ SerializeNumberArrayProperty<double>("translation", node.translation, o);
+ }
+ if (node.rotation.size() > 0) {
+ SerializeNumberArrayProperty<double>("rotation", node.rotation, o);
+ }
+ if (node.scale.size() > 0) {
+ SerializeNumberArrayProperty<double>("scale", node.scale, o);
+ }
+ if (node.matrix.size() > 0) {
+ SerializeNumberArrayProperty<double>("matrix", node.matrix, o);
+ }
+ if (node.mesh != -1) {
+ SerializeNumberProperty<int>("mesh", node.mesh, o);
+ }
+
+ if (node.skin != -1) {
+ SerializeNumberProperty<int>("skin", node.skin, o);
+ }
+
+ if (node.camera != -1) {
+ SerializeNumberProperty<int>("camera", node.camera, o);
+ }
+
+ if (node.weights.size() > 0) {
+ SerializeNumberArrayProperty<double>("weights", node.weights, o);
+ }
+
+ if (node.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", node.extras, o);
+ }
+
+ SerializeExtensionMap(node.extensions, o);
+ if (!node.name.empty()) SerializeStringProperty("name", node.name, o);
+ SerializeNumberArrayProperty<int>("children", node.children, o);
+}
+
+static void SerializeGltfSampler(Sampler &sampler, json &o) {
+ if (sampler.magFilter != -1) {
+ SerializeNumberProperty("magFilter", sampler.magFilter, o);
+ }
+ if (sampler.minFilter != -1) {
+ SerializeNumberProperty("minFilter", sampler.minFilter, o);
+ }
+ //SerializeNumberProperty("wrapR", sampler.wrapR, o);
+ SerializeNumberProperty("wrapS", sampler.wrapS, o);
+ SerializeNumberProperty("wrapT", sampler.wrapT, o);
+
+ if (sampler.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", sampler.extras, o);
+ }
+}
+
+static void SerializeGltfOrthographicCamera(const OrthographicCamera &camera,
+ json &o) {
+ SerializeNumberProperty("zfar", camera.zfar, o);
+ SerializeNumberProperty("znear", camera.znear, o);
+ SerializeNumberProperty("xmag", camera.xmag, o);
+ SerializeNumberProperty("ymag", camera.ymag, o);
+
+ if (camera.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", camera.extras, o);
+ }
+}
+
+static void SerializeGltfPerspectiveCamera(const PerspectiveCamera &camera,
+ json &o) {
+ SerializeNumberProperty("zfar", camera.zfar, o);
+ SerializeNumberProperty("znear", camera.znear, o);
+ if (camera.aspectRatio > 0) {
+ SerializeNumberProperty("aspectRatio", camera.aspectRatio, o);
+ }
+
+ if (camera.yfov > 0) {
+ SerializeNumberProperty("yfov", camera.yfov, o);
+ }
+
+ if (camera.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", camera.extras, o);
+ }
+}
+
+static void SerializeGltfCamera(const Camera &camera, json &o) {
+ SerializeStringProperty("type", camera.type, o);
+ if (!camera.name.empty()) {
+ SerializeStringProperty("name", camera.name, o);
+ }
+
+ if (camera.type.compare("orthographic") == 0) {
+ json orthographic;
+ SerializeGltfOrthographicCamera(camera.orthographic, orthographic);
+ JsonAddMember(o, "orthographic", std::move(orthographic));
+ } else if (camera.type.compare("perspective") == 0) {
+ json perspective;
+ SerializeGltfPerspectiveCamera(camera.perspective, perspective);
+ JsonAddMember(o, "perspective", std::move(perspective));
+ } else {
+    // Unknown camera type; nothing more to serialize.
+ }
+
+ if (camera.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", camera.extras, o);
+ }
+ SerializeExtensionMap(camera.extensions, o);
+}
+
+static void SerializeGltfScene(Scene &scene, json &o) {
+ SerializeNumberArrayProperty<int>("nodes", scene.nodes, o);
+
+ if (scene.name.size()) {
+ SerializeStringProperty("name", scene.name, o);
+ }
+ if (scene.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", scene.extras, o);
+ }
+ SerializeExtensionMap(scene.extensions, o);
+}
+
+static void SerializeGltfSkin(Skin &skin, json &o) {
+ // required
+ SerializeNumberArrayProperty<int>("joints", skin.joints, o);
+
+ if (skin.inverseBindMatrices >= 0) {
+ SerializeNumberProperty("inverseBindMatrices", skin.inverseBindMatrices, o);
+ }
+
+ if (skin.skeleton >= 0) {
+ SerializeNumberProperty("skeleton", skin.skeleton, o);
+ }
+
+ if (skin.name.size()) {
+ SerializeStringProperty("name", skin.name, o);
+ }
+}
+
+static void SerializeGltfTexture(Texture &texture, json &o) {
+ if (texture.sampler > -1) {
+ SerializeNumberProperty("sampler", texture.sampler, o);
+ }
+ if (texture.source > -1) {
+ SerializeNumberProperty("source", texture.source, o);
+ }
+ if (texture.name.size()) {
+ SerializeStringProperty("name", texture.name, o);
+ }
+ if (texture.extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", texture.extras, o);
+ }
+ SerializeExtensionMap(texture.extensions, o);
+}
+
+///
+/// Serialize all properties except buffers and images.
+///
+static void SerializeGltfModel(Model *model, json &o) {
+ // ACCESSORS
+ if (model->accessors.size()) {
+ json accessors;
+ JsonReserveArray(accessors, model->accessors.size());
+ for (unsigned int i = 0; i < model->accessors.size(); ++i) {
+ json accessor;
+ SerializeGltfAccessor(model->accessors[i], accessor);
+ JsonPushBack(accessors, std::move(accessor));
+ }
+ JsonAddMember(o, "accessors", std::move(accessors));
+ }
+
+ // ANIMATIONS
+ if (model->animations.size()) {
+ json animations;
+ JsonReserveArray(animations, model->animations.size());
+ for (unsigned int i = 0; i < model->animations.size(); ++i) {
+ if (model->animations[i].channels.size()) {
+ json animation;
+ SerializeGltfAnimation(model->animations[i], animation);
+ JsonPushBack(animations, std::move(animation));
+ }
+ }
+
+ JsonAddMember(o, "animations", std::move(animations));
+ }
+
+ // ASSET
+ json asset;
+ SerializeGltfAsset(model->asset, asset);
+ JsonAddMember(o, "asset", std::move(asset));
+
+ // BUFFERVIEWS
+ if (model->bufferViews.size()) {
+ json bufferViews;
+ JsonReserveArray(bufferViews, model->bufferViews.size());
+ for (unsigned int i = 0; i < model->bufferViews.size(); ++i) {
+ json bufferView;
+ SerializeGltfBufferView(model->bufferViews[i], bufferView);
+ JsonPushBack(bufferViews, std::move(bufferView));
+ }
+ JsonAddMember(o, "bufferViews", std::move(bufferViews));
+ }
+
+ // Extensions required
+ if (model->extensionsRequired.size()) {
+ SerializeStringArrayProperty("extensionsRequired",
+ model->extensionsRequired, o);
+ }
+
+ // MATERIALS
+ if (model->materials.size()) {
+ json materials;
+ JsonReserveArray(materials, model->materials.size());
+ for (unsigned int i = 0; i < model->materials.size(); ++i) {
+ json material;
+ SerializeGltfMaterial(model->materials[i], material);
+
+ if (JsonIsNull(material)) {
+        // Issue 294.
+        // `material` does not have any required parameters, so the result
+        // may be null (unmodified) when all material parameters have their
+        // default values.
+        //
+        // null is not allowed, thus we create an empty JSON object.
+ JsonSetObject(material);
+ }
+ JsonPushBack(materials, std::move(material));
+ }
+ JsonAddMember(o, "materials", std::move(materials));
+ }
+
+ // MESHES
+ if (model->meshes.size()) {
+ json meshes;
+ JsonReserveArray(meshes, model->meshes.size());
+ for (unsigned int i = 0; i < model->meshes.size(); ++i) {
+ json mesh;
+ SerializeGltfMesh(model->meshes[i], mesh);
+ JsonPushBack(meshes, std::move(mesh));
+ }
+ JsonAddMember(o, "meshes", std::move(meshes));
+ }
+
+ // NODES
+ if (model->nodes.size()) {
+ json nodes;
+ JsonReserveArray(nodes, model->nodes.size());
+ for (unsigned int i = 0; i < model->nodes.size(); ++i) {
+ json node;
+ SerializeGltfNode(model->nodes[i], node);
+ JsonPushBack(nodes, std::move(node));
+ }
+ JsonAddMember(o, "nodes", std::move(nodes));
+ }
+
+ // SCENE
+ if (model->defaultScene > -1) {
+ SerializeNumberProperty<int>("scene", model->defaultScene, o);
+ }
+
+ // SCENES
+ if (model->scenes.size()) {
+ json scenes;
+ JsonReserveArray(scenes, model->scenes.size());
+ for (unsigned int i = 0; i < model->scenes.size(); ++i) {
+ json currentScene;
+ SerializeGltfScene(model->scenes[i], currentScene);
+ JsonPushBack(scenes, std::move(currentScene));
+ }
+ JsonAddMember(o, "scenes", std::move(scenes));
+ }
+
+ // SKINS
+ if (model->skins.size()) {
+ json skins;
+ JsonReserveArray(skins, model->skins.size());
+ for (unsigned int i = 0; i < model->skins.size(); ++i) {
+ json skin;
+ SerializeGltfSkin(model->skins[i], skin);
+ JsonPushBack(skins, std::move(skin));
+ }
+ JsonAddMember(o, "skins", std::move(skins));
+ }
+
+ // TEXTURES
+ if (model->textures.size()) {
+ json textures;
+ JsonReserveArray(textures, model->textures.size());
+ for (unsigned int i = 0; i < model->textures.size(); ++i) {
+ json texture;
+ SerializeGltfTexture(model->textures[i], texture);
+ JsonPushBack(textures, std::move(texture));
+ }
+ JsonAddMember(o, "textures", std::move(textures));
+ }
+
+ // SAMPLERS
+ if (model->samplers.size()) {
+ json samplers;
+ JsonReserveArray(samplers, model->samplers.size());
+ for (unsigned int i = 0; i < model->samplers.size(); ++i) {
+ json sampler;
+ SerializeGltfSampler(model->samplers[i], sampler);
+ JsonPushBack(samplers, std::move(sampler));
+ }
+ JsonAddMember(o, "samplers", std::move(samplers));
+ }
+
+ // CAMERAS
+ if (model->cameras.size()) {
+ json cameras;
+ JsonReserveArray(cameras, model->cameras.size());
+ for (unsigned int i = 0; i < model->cameras.size(); ++i) {
+ json camera;
+ SerializeGltfCamera(model->cameras[i], camera);
+ JsonPushBack(cameras, std::move(camera));
+ }
+ JsonAddMember(o, "cameras", std::move(cameras));
+ }
+
+ // EXTENSIONS
+ SerializeExtensionMap(model->extensions, o);
+
+ auto extensionsUsed = model->extensionsUsed;
+
+ // LIGHTS as KHR_lights_punctual
+ if (model->lights.size()) {
+ json lights;
+ JsonReserveArray(lights, model->lights.size());
+ for (unsigned int i = 0; i < model->lights.size(); ++i) {
+ json light;
+ SerializeGltfLight(model->lights[i], light);
+ JsonPushBack(lights, std::move(light));
+ }
+ json khr_lights_cmn;
+ JsonAddMember(khr_lights_cmn, "lights", std::move(lights));
+ json ext_j;
+
+ {
+ json_const_iterator it;
+ if (FindMember(o, "extensions", it)) {
+ JsonAssign(ext_j, GetValue(it));
+ }
+ }
+
+ JsonAddMember(ext_j, "KHR_lights_punctual", std::move(khr_lights_cmn));
+
+ JsonAddMember(o, "extensions", std::move(ext_j));
+
+ // Also add "KHR_lights_punctual" to `extensionsUsed`
+ {
+ auto has_khr_lights_punctual =
+ std::find_if(extensionsUsed.begin(), extensionsUsed.end(),
+ [](const std::string &s) {
+ return (s.compare("KHR_lights_punctual") == 0);
+ });
+
+ if (has_khr_lights_punctual == extensionsUsed.end()) {
+ extensionsUsed.push_back("KHR_lights_punctual");
+ }
+ }
+ }
+
+ // Extensions used
+ if (extensionsUsed.size()) {
+ SerializeStringArrayProperty("extensionsUsed", extensionsUsed, o);
+ }
+
+ // EXTRAS
+ if (model->extras.Type() != NULL_TYPE) {
+ SerializeValue("extras", model->extras, o);
+ }
+}
+
+static bool WriteGltfStream(std::ostream &stream, const std::string &content) {
+ stream << content << std::endl;
+ return true;
+}
+
+static bool WriteGltfFile(const std::string &output,
+ const std::string &content) {
+#ifdef _WIN32
+#if defined(_MSC_VER)
+ std::ofstream gltfFile(UTF8ToWchar(output).c_str());
+#elif defined(__GLIBCXX__)
+ int file_descriptor = _wopen(UTF8ToWchar(output).c_str(),
+ _O_CREAT | _O_WRONLY | _O_TRUNC | _O_BINARY);
+ __gnu_cxx::stdio_filebuf<char> wfile_buf(
+ file_descriptor, std::ios_base::out | std::ios_base::binary);
+ std::ostream gltfFile(&wfile_buf);
+ if (!wfile_buf.is_open()) return false;
+#else
+ std::ofstream gltfFile(output.c_str());
+ if (!gltfFile.is_open()) return false;
+#endif
+#else
+ std::ofstream gltfFile(output.c_str());
+ if (!gltfFile.is_open()) return false;
+#endif
+ return WriteGltfStream(gltfFile, content);
+}
+
+static void WriteBinaryGltfStream(std::ostream &stream,
+ const std::string &content,
+ const std::vector<unsigned char> &binBuffer) {
+ const std::string header = "glTF";
+ const int version = 2;
+
+ // https://stackoverflow.com/questions/3407012/c-rounding-up-to-the-nearest-multiple-of-a-number
+ auto roundUp = [](uint32_t numToRound, uint32_t multiple) {
+ if (multiple == 0) return numToRound;
+
+ uint32_t remainder = numToRound % multiple;
+ if (remainder == 0) return numToRound;
+
+ return numToRound + multiple - remainder;
+ };
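+  // e.g. roundUp(25, 4) == 28, so a 25-byte JSON chunk gets 3 bytes of
+  // padding below.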
+
+ const uint32_t padding_size =
+ roundUp(uint32_t(content.size()), 4) - uint32_t(content.size());
+
+  // Total length: 12-byte header + 8-byte JSON chunk header + padded JSON
+  // content (+ 8-byte BIN chunk header + padded BIN data, if present).
+  // Chunk data must be located at a 4-byte boundary.
+ const uint32_t length =
+ 12 + 8 + roundUp(uint32_t(content.size()), 4) +
+ (binBuffer.size() ? (8 + roundUp(uint32_t(binBuffer.size()), 4)) : 0);
+
+ stream.write(header.c_str(), std::streamsize(header.size()));
+ stream.write(reinterpret_cast<const char *>(&version), sizeof(version));
+ stream.write(reinterpret_cast<const char *>(&length), sizeof(length));
+
+ // JSON chunk info, then JSON data
+ const uint32_t model_length = uint32_t(content.size()) + padding_size;
+ const uint32_t model_format = 0x4E4F534A;
+ stream.write(reinterpret_cast<const char *>(&model_length),
+ sizeof(model_length));
+ stream.write(reinterpret_cast<const char *>(&model_format),
+ sizeof(model_format));
+ stream.write(content.c_str(), std::streamsize(content.size()));
+
+  // Chunk size must be a multiple of 4, so pad the JSON with spaces
+ if (padding_size > 0) {
+ const std::string padding = std::string(size_t(padding_size), ' ');
+ stream.write(padding.c_str(), std::streamsize(padding.size()));
+ }
+ if (binBuffer.size() > 0) {
+ const uint32_t bin_padding_size =
+ roundUp(uint32_t(binBuffer.size()), 4) - uint32_t(binBuffer.size());
+ // BIN chunk info, then BIN data
+ const uint32_t bin_length = uint32_t(binBuffer.size()) + bin_padding_size;
+ const uint32_t bin_format = 0x004e4942;
+ stream.write(reinterpret_cast<const char *>(&bin_length),
+ sizeof(bin_length));
+ stream.write(reinterpret_cast<const char *>(&bin_format),
+ sizeof(bin_format));
+ stream.write(reinterpret_cast<const char *>(binBuffer.data()),
+ std::streamsize(binBuffer.size()));
+    // Chunk size must be a multiple of 4, so pad the BIN data with zeroes
+ if (bin_padding_size > 0) {
+ const std::vector<unsigned char> padding =
+ std::vector<unsigned char>(size_t(bin_padding_size), 0);
+ stream.write(reinterpret_cast<const char *>(padding.data()),
+ std::streamsize(padding.size()));
+ }
+ }
+}
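+
+// The resulting GLB layout (uint32 fields are written as raw host bytes,
+// i.e. little-endian on typical platforms, which is what the GLB spec
+// requires):
+//   bytes 0-11 : "glTF" magic, version (2), total length
+//   JSON chunk : length, type 0x4E4F534A ("JSON"), payload padded with ' '
+//   BIN chunk  : length, type 0x004E4942 ("BIN"), payload padded with '\0'
+//                (only present when binBuffer is non-empty)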
+
+static void WriteBinaryGltfFile(const std::string &output,
+ const std::string &content,
+ const std::vector<unsigned char> &binBuffer) {
+#ifdef _WIN32
+#if defined(_MSC_VER)
+ std::ofstream gltfFile(UTF8ToWchar(output).c_str(), std::ios::binary);
+#elif defined(__GLIBCXX__)
+ int file_descriptor = _wopen(UTF8ToWchar(output).c_str(),
+ _O_CREAT | _O_WRONLY | _O_TRUNC | _O_BINARY);
+ __gnu_cxx::stdio_filebuf<char> wfile_buf(
+ file_descriptor, std::ios_base::out | std::ios_base::binary);
+ std::ostream gltfFile(&wfile_buf);
+#else
+ std::ofstream gltfFile(output.c_str(), std::ios::binary);
+#endif
+#else
+ std::ofstream gltfFile(output.c_str(), std::ios::binary);
+#endif
+ WriteBinaryGltfStream(gltfFile, content, binBuffer);
+}
+
+bool TinyGLTF::WriteGltfSceneToStream(Model *model, std::ostream &stream,
+ bool prettyPrint = true,
+ bool writeBinary = false) {
+ JsonDocument output;
+
+ /// Serialize all properties except buffers and images.
+ SerializeGltfModel(model, output);
+
+ // BUFFERS
+ std::vector<unsigned char> binBuffer;
+ if (model->buffers.size()) {
+ json buffers;
+ JsonReserveArray(buffers, model->buffers.size());
+ for (unsigned int i = 0; i < model->buffers.size(); ++i) {
+ json buffer;
+ if (writeBinary && i == 0 && model->buffers[i].uri.empty()) {
+ SerializeGltfBufferBin(model->buffers[i], buffer, binBuffer);
+ } else {
+ SerializeGltfBuffer(model->buffers[i], buffer);
+ }
+ JsonPushBack(buffers, std::move(buffer));
+ }
+ JsonAddMember(output, "buffers", std::move(buffers));
+ }
+
+ // IMAGES
+ if (model->images.size()) {
+ json images;
+ JsonReserveArray(images, model->images.size());
+ for (unsigned int i = 0; i < model->images.size(); ++i) {
+ json image;
+
+ std::string dummystring = "";
+      // UpdateImageObject needs a baseDir, but only uses it when image
+      // embedding is enabled. Since separate image files are never written
+      // when writing to a stream, an empty dummy directory suffices here.
+ UpdateImageObject(model->images[i], dummystring, int(i), false,
+ &this->WriteImageData, this->write_image_user_data_);
+ SerializeGltfImage(model->images[i], image);
+ JsonPushBack(images, std::move(image));
+ }
+ JsonAddMember(output, "images", std::move(images));
+ }
+
+ if (writeBinary) {
+ WriteBinaryGltfStream(stream, JsonToString(output), binBuffer);
+ } else {
+ WriteGltfStream(stream, JsonToString(output, prettyPrint ? 2 : -1));
+ }
+
+ return true;
+}
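+
+// A minimal usage sketch (model construction omitted):
+//   tinygltf::TinyGLTF ctx;
+//   std::ofstream ofs("out.glb", std::ios::binary);
+//   ctx.WriteGltfSceneToStream(&model, ofs, /*prettyPrint=*/false,
+//                              /*writeBinary=*/true);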
+
+bool TinyGLTF::WriteGltfSceneToFile(Model *model, const std::string &filename,
+ bool embedImages = false,
+ bool embedBuffers = false,
+ bool prettyPrint = true,
+ bool writeBinary = false) {
+ JsonDocument output;
+ std::string defaultBinFilename = GetBaseFilename(filename);
+ std::string defaultBinFileExt = ".bin";
+ std::string::size_type pos =
+ defaultBinFilename.rfind('.', defaultBinFilename.length());
+
+ if (pos != std::string::npos) {
+ defaultBinFilename = defaultBinFilename.substr(0, pos);
+ }
+ std::string baseDir = GetBaseDir(filename);
+ if (baseDir.empty()) {
+ baseDir = "./";
+ }
+ /// Serialize all properties except buffers and images.
+ SerializeGltfModel(model, output);
+
+ // BUFFERS
+ std::vector<std::string> usedUris;
+ std::vector<unsigned char> binBuffer;
+ if (model->buffers.size()) {
+ json buffers;
+ JsonReserveArray(buffers, model->buffers.size());
+ for (unsigned int i = 0; i < model->buffers.size(); ++i) {
+ json buffer;
+ if (writeBinary && i == 0 && model->buffers[i].uri.empty()) {
+ SerializeGltfBufferBin(model->buffers[i], buffer, binBuffer);
+ } else if (embedBuffers) {
+ SerializeGltfBuffer(model->buffers[i], buffer);
+ } else {
+ std::string binSavePath;
+ std::string binUri;
+ if (!model->buffers[i].uri.empty() &&
+ !IsDataURI(model->buffers[i].uri)) {
+ binUri = model->buffers[i].uri;
+ } else {
+ binUri = defaultBinFilename + defaultBinFileExt;
+ bool inUse = true;
+ int numUsed = 0;
+ while (inUse) {
+ inUse = false;
+ for (const std::string &usedName : usedUris) {
+ if (binUri.compare(usedName) != 0) continue;
+ inUse = true;
+ binUri = defaultBinFilename + std::to_string(numUsed++) +
+ defaultBinFileExt;
+ break;
+ }
+ }
+ }
+ usedUris.push_back(binUri);
+ binSavePath = JoinPath(baseDir, binUri);
+ if (!SerializeGltfBuffer(model->buffers[i], buffer, binSavePath,
+ binUri)) {
+ return false;
+ }
+ }
+ JsonPushBack(buffers, std::move(buffer));
+ }
+ JsonAddMember(output, "buffers", std::move(buffers));
+ }
+
+ // IMAGES
+ if (model->images.size()) {
+ json images;
+ JsonReserveArray(images, model->images.size());
+ for (unsigned int i = 0; i < model->images.size(); ++i) {
+ json image;
+
+ UpdateImageObject(model->images[i], baseDir, int(i), embedImages,
+ &this->WriteImageData, this->write_image_user_data_);
+ SerializeGltfImage(model->images[i], image);
+ JsonPushBack(images, std::move(image));
+ }
+ JsonAddMember(output, "images", std::move(images));
+ }
+
+ if (writeBinary) {
+ WriteBinaryGltfFile(filename, JsonToString(output), binBuffer);
+ } else {
+ WriteGltfFile(filename, JsonToString(output, (prettyPrint ? 2 : -1)));
+ }
+
+ return true;
+}
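+
+// Usage sketch (paths illustrative):
+//   tinygltf::TinyGLTF ctx;
+//   ctx.WriteGltfSceneToFile(&model, "out.gltf",
+//                            /*embedImages=*/false, /*embedBuffers=*/false,
+//                            /*prettyPrint=*/true, /*writeBinary=*/false);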
+
+} // namespace tinygltf
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+#endif // TINYGLTF_IMPLEMENTATION
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 381248e9bf1..17096d441f0 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -247,7 +247,7 @@ if(WITH_CYCLES_OSL)
endif()
if(WITH_CYCLES_DEVICE_OPTIX)
- find_package(OptiX)
+ find_package(OptiX 7.3.0)
if(OPTIX_FOUND)
add_definitions(-DWITH_OPTIX)
@@ -286,11 +286,17 @@ if(WITH_OPENSUBDIV)
)
endif()
+if(WITH_OPENIMAGEDENOISE)
+ add_definitions(-DWITH_OPENIMAGEDENOISE)
+ add_definitions(-DOIDN_STATIC_LIB)
+ include_directories(
+ SYSTEM
+ ${OPENIMAGEDENOISE_INCLUDE_DIRS}
+ )
+endif()
+
if(WITH_CYCLES_STANDALONE)
- set(WITH_CYCLES_DEVICE_OPENCL TRUE)
set(WITH_CYCLES_DEVICE_CUDA TRUE)
- # Experimental and unfinished.
- set(WITH_CYCLES_NETWORK FALSE)
endif()
# TODO(sergey): Consider removing it, only causes confusion in interface.
set(WITH_CYCLES_DEVICE_MULTI TRUE)
@@ -386,18 +392,12 @@ if(WITH_CYCLES_BLENDER)
add_subdirectory(blender)
endif()
-if(WITH_CYCLES_NETWORK)
- add_definitions(-DWITH_NETWORK)
-endif()
-
-if(WITH_CYCLES_STANDALONE OR WITH_CYCLES_NETWORK OR WITH_CYCLES_CUBIN_COMPILER)
- add_subdirectory(app)
-endif()
-
+add_subdirectory(app)
add_subdirectory(bvh)
add_subdirectory(device)
add_subdirectory(doc)
add_subdirectory(graph)
+add_subdirectory(integrator)
add_subdirectory(kernel)
add_subdirectory(render)
add_subdirectory(subd)
diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt
index 7a1e5d62dd2..f9dc5f00802 100644
--- a/intern/cycles/app/CMakeLists.txt
+++ b/intern/cycles/app/CMakeLists.txt
@@ -91,24 +91,6 @@ if(WITH_CYCLES_STANDALONE)
endif()
#####################################################################
-# Cycles network server executable
-#####################################################################
-
-if(WITH_CYCLES_NETWORK)
- set(SRC
- cycles_server.cpp
- )
- add_executable(cycles_server ${SRC})
- target_link_libraries(cycles_server ${LIBRARIES})
- cycles_target_link_libraries(cycles_server)
-
- if(UNIX AND NOT APPLE)
- set_target_properties(cycles_server PROPERTIES INSTALL_RPATH $ORIGIN/lib)
- endif()
- unset(SRC)
-endif()
-
-#####################################################################
# Cycles cubin compiler executable
#####################################################################
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp
index 6b3513b065a..270096d70b0 100644
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -126,7 +126,7 @@ static BufferParams &session_buffer_params()
static void scene_init()
{
- options.scene = new Scene(options.scene_params, options.session->device);
+ options.scene = options.session->scene;
/* Read XML */
xml_read_file(options.scene, options.filepath.c_str());
@@ -148,7 +148,7 @@ static void scene_init()
static void session_init()
{
options.session_params.write_render_cb = write_render;
- options.session = new Session(options.session_params);
+ options.session = new Session(options.session_params, options.scene_params);
if (options.session_params.background && !options.quiet)
options.session->progress.set_update_callback(function_bind(&session_print_status));
@@ -159,7 +159,6 @@ static void session_init()
/* load scene */
scene_init();
- options.session->scene = options.scene;
options.session->reset(session_buffer_params(), options.session_params.samples);
options.session->start();
@@ -527,9 +526,6 @@ static void options_parse(int argc, const char **argv)
fprintf(stderr, "No file path specified\n");
exit(EXIT_FAILURE);
}
-
- /* For smoother Viewport */
- options.session_params.start_resolution = 64;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp
index 276d850f1b3..54f97fddbd9 100644
--- a/intern/cycles/app/cycles_xml.cpp
+++ b/intern/cycles/app/cycles_xml.cpp
@@ -703,7 +703,7 @@ void xml_read_file(Scene *scene, const char *filepath)
xml_read_include(state, path_filename(filepath));
- scene->params.bvh_type = SceneParams::BVH_STATIC;
+ scene->params.bvh_type = BVH_TYPE_STATIC;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt
index ee5c6157338..5bdcfd56a4d 100644
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -33,6 +33,7 @@ set(SRC
blender_device.cpp
blender_image.cpp
blender_geometry.cpp
+ blender_gpu_display.cpp
blender_light.cpp
blender_mesh.cpp
blender_object.cpp
@@ -50,6 +51,7 @@ set(SRC
CCL_api.h
blender_device.h
+ blender_gpu_display.h
blender_id_map.h
blender_image.h
blender_object_cull.h
@@ -93,14 +95,6 @@ set(ADDON_FILES
add_definitions(${GL_DEFINITIONS})
-if(WITH_CYCLES_DEVICE_OPENCL)
- add_definitions(-DWITH_OPENCL)
-endif()
-
-if(WITH_CYCLES_NETWORK)
- add_definitions(-DWITH_NETWORK)
-endif()
-
if(WITH_MOD_FLUID)
add_definitions(-DWITH_FLUID)
endif()
diff --git a/intern/cycles/blender/addon/__init__.py b/intern/cycles/blender/addon/__init__.py
index f728050a3cf..1ce25a253f9 100644
--- a/intern/cycles/blender/addon/__init__.py
+++ b/intern/cycles/blender/addon/__init__.py
@@ -58,7 +58,6 @@ class CyclesRender(bpy.types.RenderEngine):
bl_use_eevee_viewport = True
bl_use_preview = True
bl_use_exclude_layers = True
- bl_use_save_buffers = True
bl_use_spherical_stereo = True
bl_use_custom_freestyle = True
bl_use_alembic_procedural = True
@@ -85,6 +84,12 @@ class CyclesRender(bpy.types.RenderEngine):
def render(self, depsgraph):
engine.render(self, depsgraph)
+ def render_frame_finish(self):
+ engine.render_frame_finish(self)
+
+ def draw(self, context, depsgraph):
+ engine.draw(self, depsgraph, context.space_data)
+
def bake(self, depsgraph, obj, pass_type, pass_filter, width, height):
engine.bake(self, depsgraph, obj, pass_type, pass_filter, width, height)
@@ -98,7 +103,7 @@ class CyclesRender(bpy.types.RenderEngine):
engine.sync(self, depsgraph, context.blend_data)
def view_draw(self, context, depsgraph):
- engine.draw(self, depsgraph, context.region, context.space_data, context.region_data)
+ engine.view_draw(self, depsgraph, context.region, context.space_data, context.region_data)
def update_script_node(self, node):
if engine.with_osl():
diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 489a883f098..e0e8ca10bef 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -18,62 +18,17 @@
from __future__ import annotations
-def _is_using_buggy_driver():
- import gpu
- # We need to be conservative here because in multi-GPU systems display card
- # might be quite old, but others one might be just good.
- #
- # So We shouldn't disable possible good dedicated cards just because display
- # card seems weak. And instead we only blacklist configurations which are
- # proven to cause problems.
- if gpu.platform.vendor_get() == "ATI Technologies Inc.":
- import re
- version = gpu.platform.version_get()
- if version.endswith("Compatibility Profile Context"):
- # Old HD 4xxx and 5xxx series drivers did not have driver version
- # in the version string, but those cards do not quite work and
- # causing crashes.
- return True
- regex = re.compile(".*Compatibility Profile Context ([0-9]+(\\.[0-9]+)+)$")
- if not regex.match(version):
- # Skip cards like FireGL
- return False
- version = regex.sub("\\1", version).split('.')
- return int(version[0]) == 8
- return False
-
-
-def _workaround_buggy_drivers():
- if _is_using_buggy_driver():
- import _cycles
- if hasattr(_cycles, "opencl_disable"):
- print("Cycles: OpenGL driver known to be buggy, disabling OpenCL platform.")
- _cycles.opencl_disable()
-
-
def _configure_argument_parser():
import argparse
# No help because it conflicts with general Python scripts argument parsing
parser = argparse.ArgumentParser(description="Cycles Addon argument parser",
add_help=False)
- parser.add_argument("--cycles-resumable-num-chunks",
- help="Number of chunks to split sample range into",
- default=None)
- parser.add_argument("--cycles-resumable-current-chunk",
- help="Current chunk of samples range to render",
- default=None)
- parser.add_argument("--cycles-resumable-start-chunk",
- help="Start chunk to render",
- default=None)
- parser.add_argument("--cycles-resumable-end-chunk",
- help="End chunk to render",
- default=None)
parser.add_argument("--cycles-print-stats",
help="Print rendering statistics to stderr",
action='store_true')
parser.add_argument("--cycles-device",
help="Set the device to use for Cycles, overriding user preferences and the scene setting."
- "Valid options are 'CPU', 'CUDA', 'OPTIX' or 'OPENCL'."
+ "Valid options are 'CPU', 'CUDA' or 'OPTIX'."
"Additionally, you can append '+CPU' to any GPU type for hybrid rendering.",
default=None)
return parser
@@ -89,21 +44,6 @@ def _parse_command_line():
parser = _configure_argument_parser()
args, _ = parser.parse_known_args(argv[argv.index("--") + 1:])
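# Arguments after "--" on the Blender command line are forwarded to
# add-ons, e.g. (device value illustrative):
#   blender -b scene.blend -- --cycles-device OPTIX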
- if args.cycles_resumable_num_chunks is not None:
- if args.cycles_resumable_current_chunk is not None:
- import _cycles
- _cycles.set_resumable_chunk(
- int(args.cycles_resumable_num_chunks),
- int(args.cycles_resumable_current_chunk),
- )
- elif args.cycles_resumable_start_chunk is not None and \
- args.cycles_resumable_end_chunk:
- import _cycles
- _cycles.set_resumable_chunk_range(
- int(args.cycles_resumable_num_chunks),
- int(args.cycles_resumable_start_chunk),
- int(args.cycles_resumable_end_chunk),
- )
if args.cycles_print_stats:
import _cycles
_cycles.enable_print_stats()
@@ -118,23 +58,11 @@ def init():
import _cycles
import os.path
- # Workaround possibly buggy legacy drivers which crashes on the OpenCL
- # device enumeration.
- #
- # This checks are not really correct because they might still fail
- # in the case of multiple GPUs. However, currently buggy drivers
- # are really old and likely to be used in single GPU systems only
- # anyway.
- #
- # Can't do it in the background mode, so we hope OpenCL is no enabled
- # in the user preferences.
- if not bpy.app.background:
- _workaround_buggy_drivers()
-
path = os.path.dirname(__file__)
user_path = os.path.dirname(os.path.abspath(bpy.utils.user_resource('CONFIG', path='')))
+ temp_path = bpy.app.tempdir
- _cycles.init(path, user_path, bpy.app.background)
+ _cycles.init(path, user_path, temp_path, bpy.app.background)
_parse_command_line()
@@ -177,6 +105,25 @@ def render(engine, depsgraph):
_cycles.render(engine.session, depsgraph.as_pointer())
+def render_frame_finish(engine):
+ if not engine.session:
+ return
+
+ import _cycles
+ _cycles.render_frame_finish(engine.session)
+
+def draw(engine, depsgraph, space_image):
+ if not engine.session:
+ return
+
+ depsgraph_ptr = depsgraph.as_pointer()
+ space_image_ptr = space_image.as_pointer()
+ screen_ptr = space_image.id_data.as_pointer()
+
+ import _cycles
+ _cycles.draw(engine.session, depsgraph_ptr, screen_ptr, space_image_ptr)
+
+
def bake(engine, depsgraph, obj, pass_type, pass_filter, width, height):
import _cycles
session = getattr(engine, "session", None)
@@ -204,14 +151,14 @@ def sync(engine, depsgraph, data):
_cycles.sync(engine.session, depsgraph.as_pointer())
-def draw(engine, depsgraph, region, v3d, rv3d):
+def view_draw(engine, depsgraph, region, v3d, rv3d):
import _cycles
depsgraph = depsgraph.as_pointer()
v3d = v3d.as_pointer()
rv3d = rv3d.as_pointer()
# draw render image
- _cycles.draw(engine.session, depsgraph, v3d, rv3d)
+ _cycles.view_draw(engine.session, depsgraph, v3d, rv3d)
def available_devices():
@@ -224,11 +171,6 @@ def with_osl():
return _cycles.with_osl
-def with_network():
- import _cycles
- return _cycles.with_network
-
-
def system_info():
import _cycles
return _cycles.system_info()
@@ -243,6 +185,7 @@ def list_render_passes(scene, srl):
# Data passes.
if srl.use_pass_z: yield ("Depth", "Z", 'VALUE')
if srl.use_pass_mist: yield ("Mist", "Z", 'VALUE')
+ if srl.use_pass_position: yield ("Position", "XYZ", 'VECTOR')
if srl.use_pass_normal: yield ("Normal", "XYZ", 'VECTOR')
if srl.use_pass_vector: yield ("Vector", "XYZW", 'VECTOR')
if srl.use_pass_uv: yield ("UV", "UVA", 'VECTOR')
@@ -265,6 +208,7 @@ def list_render_passes(scene, srl):
if srl.use_pass_environment: yield ("Env", "RGB", 'COLOR')
if srl.use_pass_shadow: yield ("Shadow", "RGB", 'COLOR')
if srl.use_pass_ambient_occlusion: yield ("AO", "RGB", 'COLOR')
+ if crl.use_pass_shadow_catcher: yield ("Shadow Catcher", "RGB", 'COLOR')
# Debug passes.
if crl.pass_debug_render_time: yield ("Debug Render Time", "X", 'VALUE')
@@ -283,30 +227,20 @@ def list_render_passes(scene, srl):
yield ("CryptoAsset" + '{:02d}'.format(i), "RGBA", 'COLOR')
# Denoising passes.
- if (scene.cycles.use_denoising and crl.use_denoising) or crl.denoising_store_passes:
+ if scene.cycles.use_denoising and crl.use_denoising:
yield ("Noisy Image", "RGBA", 'COLOR')
- if crl.denoising_store_passes:
- yield ("Denoising Normal", "XYZ", 'VECTOR')
- yield ("Denoising Albedo", "RGB", 'COLOR')
- yield ("Denoising Depth", "Z", 'VALUE')
-
- if scene.cycles.denoiser == 'NLM':
- yield ("Denoising Shadowing", "X", 'VALUE')
- yield ("Denoising Variance", "RGB", 'COLOR')
- yield ("Denoising Intensity", "X", 'VALUE')
-
- clean_options = ("denoising_diffuse_direct", "denoising_diffuse_indirect",
- "denoising_glossy_direct", "denoising_glossy_indirect",
- "denoising_transmission_direct", "denoising_transmission_indirect")
- if any(getattr(crl, option) for option in clean_options):
- yield ("Denoising Clean", "RGB", 'COLOR')
+ if crl.use_pass_shadow_catcher:
+ yield ("Noisy Shadow Catcher", "RGBA", 'COLOR')
+ if crl.denoising_store_passes:
+ yield ("Denoising Normal", "XYZ", 'VECTOR')
+ yield ("Denoising Albedo", "RGB", 'COLOR')
# Custom AOV passes.
for aov in srl.aovs:
if aov.type == 'VALUE':
yield (aov.name, "X", 'VALUE')
else:
- yield (aov.name, "RGBA", 'COLOR')
+ yield (aov.name, "RGB", 'COLOR')
def register_passes(engine, scene, view_layer):
diff --git a/intern/cycles/blender/addon/presets.py b/intern/cycles/blender/addon/presets.py
index bf33e5dc010..37c39904e30 100644
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -60,32 +60,48 @@ class AddPresetSampling(AddPresetBase, Operator):
]
preset_values = [
+ "cycles.use_adaptive_sampling",
"cycles.samples",
- "cycles.preview_samples",
- "cycles.aa_samples",
- "cycles.preview_aa_samples",
- "cycles.diffuse_samples",
- "cycles.glossy_samples",
- "cycles.transmission_samples",
- "cycles.ao_samples",
- "cycles.mesh_light_samples",
- "cycles.subsurface_samples",
- "cycles.volume_samples",
- "cycles.use_square_samples",
- "cycles.progressive",
- "cycles.seed",
- "cycles.sample_clamp_direct",
- "cycles.sample_clamp_indirect",
- "cycles.sample_all_lights_direct",
- "cycles.sample_all_lights_indirect",
+ "cycles.adaptive_threshold",
+ "cycles.adaptive_min_samples",
+ "cycles.time_limit",
+ "cycles.use_denoising",
+ "cycles.denoiser",
+ "cycles.denoising_input_passes",
+ "cycles.denoising_prefilter",
]
preset_subdir = "cycles/sampling"
+class AddPresetViewportSampling(AddPresetBase, Operator):
+ '''Add a Viewport Sampling Preset'''
+ bl_idname = "render.cycles_viewport_sampling_preset_add"
+ bl_label = "Add Viewport Sampling Preset"
+ preset_menu = "CYCLES_PT_viewport_sampling_presets"
+
+ preset_defines = [
+ "cycles = bpy.context.scene.cycles"
+ ]
+
+ preset_values = [
+ "cycles.use_preview_adaptive_sampling",
+ "cycles.preview_samples",
+ "cycles.preview_adaptive_threshold",
+ "cycles.preview_adaptive_min_samples",
+ "cycles.use_preview_denoising",
+ "cycles.preview_denoiser",
+ "cycles.preview_denoising_input_passes",
+ "cycles.preview_denoising_prefilter",
+ "cycles.preview_denoising_start_sample",
+ ]
+
+ preset_subdir = "cycles/viewport_sampling"
+
classes = (
AddPresetIntegrator,
AddPresetSampling,
+ AddPresetViewportSampling,
)
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index 0c3af3fabeb..c2570e71efd 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -39,11 +39,6 @@ enum_devices = (
('GPU', "GPU Compute", "Use GPU compute device for rendering, configured in the system tab in the user preferences"),
)
-from _cycles import with_network
-if with_network:
- enum_devices += (('NETWORK', "Networked Device", "Use networked device for rendering"),)
-del with_network
-
enum_feature_set = (
('SUPPORTED', "Supported", "Only use finished and supported features"),
('EXPERIMENTAL', "Experimental", "Use experimental and incomplete features that might be broken or change in the future", 'ERROR', 1),
@@ -84,15 +79,6 @@ enum_curve_shape = (
('THICK', "3D Curves", "Render hair as 3D curve, for accurate results when viewing hair close up"),
)
-enum_tile_order = (
- ('CENTER', "Center", "Render from center to the edges"),
- ('RIGHT_TO_LEFT', "Right to Left", "Render from right to left"),
- ('LEFT_TO_RIGHT', "Left to Right", "Render from left to right"),
- ('TOP_TO_BOTTOM', "Top to Bottom", "Render from top to bottom"),
- ('BOTTOM_TO_TOP', "Bottom to Top", "Render from bottom to top"),
- ('HILBERT_SPIRAL', "Hilbert Spiral", "Render in a Hilbert Spiral"),
-)
-
enum_use_layer_samples = (
('USE', "Use", "Per render layer number of samples override scene samples"),
('BOUNDED', "Bounded", "Bound per render layer number of samples by global samples"),
@@ -101,15 +87,9 @@ enum_use_layer_samples = (
enum_sampling_pattern = (
('SOBOL', "Sobol", "Use Sobol random sampling pattern"),
- ('CORRELATED_MUTI_JITTER', "Correlated Multi-Jitter", "Use Correlated Multi-Jitter random sampling pattern"),
('PROGRESSIVE_MUTI_JITTER', "Progressive Multi-Jitter", "Use Progressive Multi-Jitter random sampling pattern"),
)
-enum_integrator = (
- ('BRANCHED_PATH', "Branched Path Tracing", "Path tracing integrator that branches on the first bounce, giving more control over the number of light and material samples"),
- ('PATH', "Path Tracing", "Pure path tracing integrator"),
-)
-
enum_volume_sampling = (
('DISTANCE', "Distance", "Use distance sampling, best for dense volumes with lights far away"),
('EQUIANGULAR', "Equiangular", "Use equiangular sampling, best for volumes with low density with light inside or near the volume"),
@@ -131,7 +111,6 @@ enum_device_type = (
('CPU', "CPU", "CPU", 0),
('CUDA', "CUDA", "CUDA", 1),
('OPTIX', "OptiX", "OptiX", 3),
- ('OPENCL', "OpenCL", "OpenCL", 2)
)
enum_texture_limit = (
@@ -144,39 +123,46 @@ enum_texture_limit = (
('4096', "4096", "Limit texture size to 4096 pixels", 6),
('8192', "8192", "Limit texture size to 8192 pixels", 7),
)
-
+
+# NOTE: Identifiers are expected to be an upper case version of identifiers from `Pass::get_type_enum()`
enum_view3d_shading_render_pass = (
('', "General", ""),
- ('COMBINED', "Combined", "Show the Combined Render pass", 1),
- ('EMISSION', "Emission", "Show the Emission render pass", 33),
- ('BACKGROUND', "Background", "Show the Background render pass", 34),
- ('AO', "Ambient Occlusion", "Show the Ambient Occlusion render pass", 35),
+ ('COMBINED', "Combined", "Show the Combined Render pass"),
+ ('EMISSION', "Emission", "Show the Emission render pass"),
+ ('BACKGROUND', "Background", "Show the Background render pass"),
+ ('AO', "Ambient Occlusion", "Show the Ambient Occlusion render pass"),
+ ('SHADOW', "Shadow", "Show the Shadow render pass"),
+ ('SHADOW_CATCHER', "Shadow Catcher", "Show the Shadow Catcher render pass"),
('', "Light", ""),
- ('DIFFUSE_DIRECT', "Diffuse Direct", "Show the Diffuse Direct render pass", 38),
- ('DIFFUSE_INDIRECT', "Diffuse Indirect", "Show the Diffuse Indirect render pass", 39),
- ('DIFFUSE_COLOR', "Diffuse Color", "Show the Diffuse Color render pass", 40),
+ ('DIFFUSE_DIRECT', "Diffuse Direct", "Show the Diffuse Direct render pass"),
+ ('DIFFUSE_INDIRECT', "Diffuse Indirect", "Show the Diffuse Indirect render pass"),
+ ('DIFFUSE_COLOR', "Diffuse Color", "Show the Diffuse Color render pass"),
- ('GLOSSY_DIRECT', "Glossy Direct", "Show the Glossy Direct render pass", 41),
- ('GLOSSY_INDIRECT', "Glossy Indirect", "Show the Glossy Indirect render pass", 42),
- ('GLOSSY_COLOR', "Glossy Color", "Show the Glossy Color render pass", 43),
+ ('GLOSSY_DIRECT', "Glossy Direct", "Show the Glossy Direct render pass"),
+ ('GLOSSY_INDIRECT', "Glossy Indirect", "Show the Glossy Indirect render pass"),
+ ('GLOSSY_COLOR', "Glossy Color", "Show the Glossy Color render pass"),
('', "", ""),
- ('TRANSMISSION_DIRECT', "Transmission Direct", "Show the Transmission Direct render pass", 44),
- ('TRANSMISSION_INDIRECT', "Transmission Indirect", "Show the Transmission Indirect render pass", 45),
- ('TRANSMISSION_COLOR', "Transmission Color", "Show the Transmission Color render pass", 46),
+ ('TRANSMISSION_DIRECT', "Transmission Direct", "Show the Transmission Direct render pass"),
+ ('TRANSMISSION_INDIRECT', "Transmission Indirect", "Show the Transmission Indirect render pass"),
+ ('TRANSMISSION_COLOR', "Transmission Color", "Show the Transmission Color render pass"),
- ('VOLUME_DIRECT', "Volume Direct", "Show the Volume Direct render pass", 50),
- ('VOLUME_INDIRECT', "Volume Indirect", "Show the Volume Indirect render pass", 51),
+ ('VOLUME_DIRECT', "Volume Direct", "Show the Volume Direct render pass"),
+ ('VOLUME_INDIRECT', "Volume Indirect", "Show the Volume Indirect render pass"),
('', "Data", ""),
- ('NORMAL', "Normal", "Show the Normal render pass", 3),
- ('UV', "UV", "Show the UV render pass", 4),
- ('MIST', "Mist", "Show the Mist render pass", 32),
+ ('POSITION', "Position", "Show the Position render pass"),
+ ('NORMAL', "Normal", "Show the Normal render pass"),
+ ('UV', "UV", "Show the UV render pass"),
+ ('MIST', "Mist", "Show the Mist render pass"),
+ ('DENOISING_ALBEDO', "Denoising Albedo", "Albedo pass used by denoiser"),
+ ('DENOISING_NORMAL', "Denoising Normal", "Normal pass used by denoiser"),
+ ('SAMPLE_COUNT', "Sample Count", "Per-pixel number of samples"),
)
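The NOTE above pins these identifiers to `Pass::get_type_enum()`; the convention is a plain upper-casing of the Cycles-side names, sketched here with hypothetical inputs:

    pass_names = ["combined", "shadow_catcher", "denoising_albedo"]
    assert [n.upper() for n in pass_names] == [
        "COMBINED", "SHADOW_CATCHER", "DENOISING_ALBEDO"]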
@@ -208,18 +194,23 @@ def enum_preview_denoiser(self, context):
def enum_denoiser(self, context):
- items = [('NLM', "NLM", "Cycles native non-local means denoiser, running on any compute device", 1)]
+ items = []
items += enum_optix_denoiser(self, context)
items += enum_openimagedenoise_denoiser(self, context)
return items
enum_denoising_input_passes = (
- ('RGB', "Color", "Use only color as input", 1),
- ('RGB_ALBEDO', "Color + Albedo", "Use color and albedo data as input", 2),
- ('RGB_ALBEDO_NORMAL', "Color + Albedo + Normal", "Use color, albedo and normal data as input", 3),
+ ('RGB', "None", "Don't use utility passes for denoising", 1),
+ ('RGB_ALBEDO', "Albedo", "Use albedo pass for denoising", 2),
+ ('RGB_ALBEDO_NORMAL', "Albedo and Normal", "Use albedo and normal passes for denoising", 3),
)
+enum_denoising_prefilter = (
+ ('NONE', "None", "No prefiltering, use when guiding passes are noise-free", 1),
+    ('FAST', "Fast", "Denoise color and guiding passes together. Improves quality when guiding passes are noisy, using the least amount of extra processing time", 2),
+    ('ACCURATE', "Accurate", "Prefilter noisy guiding passes before denoising color. Improves quality when guiding passes are noisy, at the cost of extra processing time", 3),
+)
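A minimal scripting sketch for the new denoising controls above, assuming Blender 3.0+ with the Cycles addon enabled (so `scene.cycles` is registered):

    import bpy

    scene = bpy.context.scene
    scene.cycles.use_denoising = True
    scene.cycles.denoising_input_passes = 'RGB_ALBEDO_NORMAL'
    # Prefilter only affects OpenImageDenoise; the UI hides it for OptiX.
    scene.cycles.denoising_prefilter = 'ACCURATE'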
def update_render_passes(self, context):
scene = context.scene
@@ -252,13 +243,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
description="Use Open Shading Language (CPU rendering only)",
)
- progressive: EnumProperty(
- name="Integrator",
- description="Method to sample lights and materials",
- items=enum_integrator,
- default='PATH',
- )
-
preview_pause: BoolProperty(
name="Pause Preview",
description="Pause all viewport preview renders",
@@ -268,110 +252,88 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
use_denoising: BoolProperty(
name="Use Denoising",
description="Denoise the rendered image",
- default=False,
+ default=True,
update=update_render_passes,
)
- use_preview_denoising: BoolProperty(
- name="Use Viewport Denoising",
- description="Denoise the image in the 3D viewport",
- default=False,
- )
-
denoiser: EnumProperty(
name="Denoiser",
description="Denoise the image with the selected denoiser. "
- "For denoising the image after rendering, denoising data render passes "
- "also adapt to the selected denoiser",
+ "For denoising the image after rendering",
items=enum_denoiser,
- default=1,
+ default=4, # Use integer to avoid error in builds without OpenImageDenoise.
update=update_render_passes,
)
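Note on `default=4` above: with a callback-built enum, a string default would fail to resolve in builds where that identifier is never generated, while an integer refers to the item's stable numeric ID. A minimal sketch of the same pattern with hypothetical items:

    import bpy

    def denoiser_items(self, context):
        items = [('OPTIX', "OptiX", "OptiX AI denoiser", 2)]
        # Hypothetical availability check; 'OPENIMAGEDENOISE' (ID 4) may be
        # absent in builds compiled without OpenImageDenoise.
        items += [('OPENIMAGEDENOISE', "OpenImageDenoise", "Intel OIDN", 4)]
        return items

    class DemoSettings(bpy.types.PropertyGroup):
        denoiser: bpy.props.EnumProperty(items=denoiser_items, default=4)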
+ denoising_prefilter: EnumProperty(
+ name="Denoising Prefilter",
+ description="Prefilter noisy guiding (albedo and normal) passes to improve denoising quality when using OpenImageDenoiser",
+ items=enum_denoising_prefilter,
+ default='ACCURATE',
+ )
+ denoising_input_passes: EnumProperty(
+ name="Denoising Input Passes",
+ description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
+ items=enum_denoising_input_passes,
+ default='RGB_ALBEDO_NORMAL',
+ )
+
+ use_preview_denoising: BoolProperty(
+ name="Use Viewport Denoising",
+ description="Denoise the image in the 3D viewport",
+ default=False,
+ )
preview_denoiser: EnumProperty(
name="Viewport Denoiser",
description="Denoise the image after each preview update with the selected denoiser",
items=enum_preview_denoiser,
default=0,
)
-
- use_square_samples: BoolProperty(
- name="Square Samples",
- description="Square sampling values for easier artist control",
- default=False,
+ preview_denoising_prefilter: EnumProperty(
+ name="Viewport Denoising Prefilter",
+ description="Prefilter noisy guiding (albedo and normal) passes to improve denoising quality when using OpenImageDenoiser",
+ items=enum_denoising_prefilter,
+ default='FAST',
+ )
+ preview_denoising_input_passes: EnumProperty(
+ name="Viewport Denoising Input Passes",
+ description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
+ items=enum_denoising_input_passes,
+ default='RGB_ALBEDO',
+ )
+ preview_denoising_start_sample: IntProperty(
+ name="Start Denoising",
+ description="Sample to start denoising the preview at",
+ min=0, max=(1 << 24),
+ default=1,
)
samples: IntProperty(
name="Samples",
description="Number of samples to render for each pixel",
min=1, max=(1 << 24),
- default=128,
+ default=4096,
)
preview_samples: IntProperty(
name="Viewport Samples",
description="Number of samples to render in the viewport, unlimited if 0",
min=0, max=(1 << 24),
- default=32,
- )
- aa_samples: IntProperty(
- name="AA Samples",
- description="Number of antialiasing samples to render for each pixel",
- min=1, max=2097151,
- default=128,
- )
- preview_aa_samples: IntProperty(
- name="AA Samples",
- description="Number of antialiasing samples to render in the viewport, unlimited if 0",
- min=0, max=2097151,
- default=32,
+ default=1024,
)
- diffuse_samples: IntProperty(
- name="Diffuse Samples",
- description="Number of diffuse bounce samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- glossy_samples: IntProperty(
- name="Glossy Samples",
- description="Number of glossy bounce samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- transmission_samples: IntProperty(
- name="Transmission Samples",
- description="Number of transmission bounce samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- ao_samples: IntProperty(
- name="Ambient Occlusion Samples",
- description="Number of ambient occlusion samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- mesh_light_samples: IntProperty(
- name="Mesh Light Samples",
- description="Number of mesh emission light samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- subsurface_samples: IntProperty(
- name="Subsurface Samples",
- description="Number of subsurface scattering samples to render for each AA sample",
- min=1, max=1024,
- default=1,
- )
- volume_samples: IntProperty(
- name="Volume Samples",
- description="Number of volume scattering samples to render for each AA sample",
- min=1, max=1024,
- default=1,
+ time_limit: FloatProperty(
+ name="Time Limit",
+ description="Limit the render time (excluding synchronization time)."
+ "Zero disables the limit",
+ min=0.0,
+ default=0.0,
+ step=100.0,
+ unit='TIME_ABSOLUTE',
)
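`time_limit` is expressed in seconds (unit='TIME_ABSOLUTE') and caps sampling alongside the sample count. A short usage sketch:

    import bpy

    scene = bpy.context.scene
    scene.cycles.samples = 4096      # upper bound on samples per pixel
    scene.cycles.time_limit = 300.0  # stop after 5 minutes; 0.0 = no limit
    bpy.ops.render.render(write_still=False)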
sampling_pattern: EnumProperty(
name="Sampling Pattern",
description="Random sampling pattern used by the integrator",
items=enum_sampling_pattern,
- default='SOBOL',
+ default='PROGRESSIVE_MUTI_JITTER',
)
use_layer_samples: EnumProperty(
@@ -381,17 +343,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default='USE',
)
- sample_all_lights_direct: BoolProperty(
- name="Sample All Direct Lights",
- description="Sample all lights (for direct samples), rather than randomly picking one",
- default=True,
- )
-
- sample_all_lights_indirect: BoolProperty(
- name="Sample All Indirect Lights",
- description="Sample all lights (for indirect samples), rather than randomly picking one",
- default=True,
- )
light_sampling_threshold: FloatProperty(
name="Light Sampling Threshold",
description="Probabilistically terminate light samples when the light contribution is below this threshold (more noise but faster rendering). "
@@ -403,19 +354,39 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
use_adaptive_sampling: BoolProperty(
name="Use Adaptive Sampling",
description="Automatically reduce the number of samples per pixel based on estimated noise level",
- default=False,
+ default=True,
)
-
adaptive_threshold: FloatProperty(
name="Adaptive Sampling Threshold",
description="Noise level step to stop sampling at, lower values reduce noise at the cost of render time. Zero for automatic setting based on number of AA samples",
min=0.0, max=1.0,
- default=0.0,
+ soft_min=0.001,
+ default=0.01,
precision=4,
)
adaptive_min_samples: IntProperty(
name="Adaptive Min Samples",
- description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on number of AA samples",
+ description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on noise threshold",
+ min=0, max=4096,
+ default=0,
+ )
+
+ use_preview_adaptive_sampling: BoolProperty(
+ name="Use Adaptive Sampling",
+ description="Automatically reduce the number of samples per pixel based on estimated noise level, for viewport renders",
+ default=True,
+ )
+ preview_adaptive_threshold: FloatProperty(
+ name="Adaptive Sampling Threshold",
+ description="Noise level step to stop sampling at, lower values reduce noise at the cost of render time. Zero for automatic setting based on number of AA samples, for viewport renders",
+ min=0.0, max=1.0,
+ soft_min=0.001,
+ default=0.1,
+ precision=4,
+ )
+ preview_adaptive_min_samples: IntProperty(
+ name="Adaptive Min Samples",
+ description="Minimum AA samples for adaptive sampling, to discover noisy features before stopping sampling. Zero for automatic setting based on noise threshold, for viewport renders",
min=0, max=4096,
default=0,
)
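A configuration sketch for the adaptive sampling pair above (final render and viewport use separate properties, with looser viewport defaults):

    import bpy

    scene = bpy.context.scene
    # Final render: refine each pixel until estimated noise falls below
    # the threshold, bounded by min samples and the overall sample cap.
    scene.cycles.use_adaptive_sampling = True
    scene.cycles.adaptive_threshold = 0.01
    scene.cycles.adaptive_min_samples = 0  # 0 = derived from the threshold
    # Viewport equivalent, tuned for interactivity.
    scene.cycles.use_preview_adaptive_sampling = True
    scene.cycles.preview_adaptive_threshold = 0.1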
@@ -632,53 +603,6 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=10.0,
)
- debug_tile_size: IntProperty(
- name="Tile Size",
- description="",
- min=1, max=4096,
- default=1024,
- )
-
- preview_start_resolution: IntProperty(
- name="Start Resolution",
- description="Resolution to start rendering preview at, "
- "progressively increasing it to the full viewport size",
- min=8, max=16384,
- default=64,
- subtype='PIXEL'
- )
- preview_denoising_start_sample: IntProperty(
- name="Start Denoising",
- description="Sample to start denoising the preview at",
- min=0, max=(1 << 24),
- default=1,
- )
- preview_denoising_input_passes: EnumProperty(
- name="Viewport Input Passes",
- description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
- items=enum_denoising_input_passes,
- default='RGB_ALBEDO',
- )
-
- debug_reset_timeout: FloatProperty(
- name="Reset timeout",
- description="",
- min=0.01, max=10.0,
- default=0.1,
- )
- debug_cancel_timeout: FloatProperty(
- name="Cancel timeout",
- description="",
- min=0.01, max=10.0,
- default=0.1,
- )
- debug_text_timeout: FloatProperty(
- name="Text timeout",
- description="",
- min=0.01, max=10.0,
- default=1.0,
- )
-
debug_bvh_type: EnumProperty(
name="Viewport BVH Type",
description="Choose between faster updates, or faster render",
@@ -701,38 +625,24 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
default=0,
min=0, max=16,
)
- tile_order: EnumProperty(
- name="Tile Order",
- description="Tile order for rendering",
- items=enum_tile_order,
- default='HILBERT_SPIRAL',
- options=set(), # Not animatable!
- )
- use_progressive_refine: BoolProperty(
- name="Progressive Refine",
- description="Instead of rendering each tile until it is finished, "
- "refine the whole image progressively "
- "(this renders somewhat slower, "
- "but time can be saved by manually stopping the render when the noise is low enough)",
- default=False,
- )
bake_type: EnumProperty(
name="Bake Type",
default='COMBINED',
description="Type of pass to bake",
items=(
- ('COMBINED', "Combined", ""),
- ('AO', "Ambient Occlusion", ""),
- ('SHADOW', "Shadow", ""),
- ('NORMAL', "Normal", ""),
- ('UV', "UV", ""),
- ('ROUGHNESS', "Roughness", ""),
- ('EMIT', "Emit", ""),
- ('ENVIRONMENT', "Environment", ""),
- ('DIFFUSE', "Diffuse", ""),
- ('GLOSSY', "Glossy", ""),
- ('TRANSMISSION', "Transmission", ""),
+ ('COMBINED', "Combined", "", 0),
+ ('AO', "Ambient Occlusion", "", 1),
+ ('SHADOW', "Shadow", "", 2),
+ ('POSITION', "Position", "", 11),
+ ('NORMAL', "Normal", "", 3),
+ ('UV', "UV", "", 4),
+ ('ROUGHNESS', "Roughness", "", 5),
+ ('EMIT', "Emit", "", 6),
+ ('ENVIRONMENT', "Environment", "", 7),
+ ('DIFFUSE', "Diffuse", "", 8),
+ ('GLOSSY', "Glossy", "", 9),
+ ('TRANSMISSION', "Transmission", "", 10),
),
)
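The explicit numeric IDs added to `bake_type` matter for file compatibility: .blend files store the ID, so the new 'POSITION' entry can sit mid-list with a fresh ID (11) instead of renumbering its neighbours. A trimmed illustration:

    # Inserting 'POSITION' between SHADOW and NORMAL is safe because each
    # existing item keeps its original ID; only the new entry gets a new one.
    bake_items = (
        ('SHADOW', "Shadow", "", 2),
        ('POSITION', "Position", "", 11),  # new in this commit
        ('NORMAL', "Normal", "", 3),
    )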
@@ -827,6 +737,18 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
min=0, max=1024,
)
+ use_auto_tile: BoolProperty(
+ name="Auto Tiles",
+ description="Automatically split image into tiles",
+ default=True,
+ )
+ tile_size: IntProperty(
+ name="Tile Size",
+ default=2048,
+ description="",
+ min=0, max=16384,
+ )
+
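Usage sketch for the new tiling properties (auto tiles bound memory use on large renders by splitting the image; disabling them renders the image in one piece):

    import bpy

    scene = bpy.context.scene
    scene.cycles.use_auto_tile = True
    scene.cycles.tile_size = 2048  # pixels per tile side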
# Various fine-tuning debug flags
def _devices_update_callback(self, context):
@@ -844,45 +766,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
items=enum_bvh_layouts,
default='EMBREE',
)
- debug_use_cpu_split_kernel: BoolProperty(name="Split Kernel", default=False)
debug_use_cuda_adaptive_compile: BoolProperty(name="Adaptive Compile", default=False)
- debug_use_cuda_split_kernel: BoolProperty(name="Split Kernel", default=False)
-
- debug_optix_cuda_streams: IntProperty(name="CUDA Streams", default=1, min=1)
- debug_optix_curves_api: BoolProperty(name="Native OptiX Curve Primitive", default=False)
-
- debug_opencl_kernel_type: EnumProperty(
- name="OpenCL Kernel Type",
- default='DEFAULT',
- items=(
- ('DEFAULT', "Default", ""),
- ('MEGA', "Mega", ""),
- ('SPLIT', "Split", ""),
- ),
- update=CyclesRenderSettings._devices_update_callback
- )
- debug_opencl_device_type: EnumProperty(
- name="OpenCL Device Type",
- default='ALL',
- items=(
- ('NONE', "None", ""),
- ('ALL', "All", ""),
- ('DEFAULT', "Default", ""),
- ('CPU', "CPU", ""),
- ('GPU', "GPU", ""),
- ('ACCELERATOR', "Accelerator", ""),
- ),
- update=CyclesRenderSettings._devices_update_callback
- )
-
- debug_use_opencl_debug: BoolProperty(name="Debug OpenCL", default=False)
-
- debug_opencl_mem_limit: IntProperty(
- name="Memory limit",
- default=0,
- description="Artificial limit on OpenCL memory usage in MB (0 to disable limit)"
+ debug_use_optix_debug: BoolProperty(
+ name="OptiX Module Debug",
+ description="Load OptiX module in debug mode: lower logging verbosity level, enable validations, and lower optimization level",
+ default=False
)
@classmethod
@@ -1031,12 +921,6 @@ class CyclesLightSettings(bpy.types.PropertyGroup):
description="Light casts shadows",
default=True,
)
- samples: IntProperty(
- name="Samples",
- description="Number of light samples to render for each AA sample",
- min=1, max=10000,
- default=1,
- )
max_bounces: IntProperty(
name="Max Bounces",
description="Maximum number of bounces the light will contribute to the render",
@@ -1084,12 +968,6 @@ class CyclesWorldSettings(bpy.types.PropertyGroup):
min=4, max=8192,
default=1024,
)
- samples: IntProperty(
- name="Samples",
- description="Number of light samples to render for each AA sample",
- min=1, max=10000,
- default=1,
- )
max_bounces: IntProperty(
name="Max Bounces",
description="Maximum number of bounces the background light will contribute to the render",
@@ -1343,91 +1221,25 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
update=update_render_passes,
)
+ use_pass_shadow_catcher: BoolProperty(
+ name="Shadow Catcher",
+ description="Pass containing shadows and light which is to be multiplied into backdrop",
+ default=False,
+ update=update_render_passes,
+ )
+
use_denoising: BoolProperty(
name="Use Denoising",
description="Denoise the rendered image",
default=True,
update=update_render_passes,
)
- denoising_diffuse_direct: BoolProperty(
- name="Diffuse Direct",
- description="Denoise the direct diffuse lighting",
- default=True,
- )
- denoising_diffuse_indirect: BoolProperty(
- name="Diffuse Indirect",
- description="Denoise the indirect diffuse lighting",
- default=True,
- )
- denoising_glossy_direct: BoolProperty(
- name="Glossy Direct",
- description="Denoise the direct glossy lighting",
- default=True,
- )
- denoising_glossy_indirect: BoolProperty(
- name="Glossy Indirect",
- description="Denoise the indirect glossy lighting",
- default=True,
- )
- denoising_transmission_direct: BoolProperty(
- name="Transmission Direct",
- description="Denoise the direct transmission lighting",
- default=True,
- )
- denoising_transmission_indirect: BoolProperty(
- name="Transmission Indirect",
- description="Denoise the indirect transmission lighting",
- default=True,
- )
- denoising_strength: FloatProperty(
- name="Denoising Strength",
- description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)",
- min=0.0, max=1.0,
- default=0.5,
- )
- denoising_feature_strength: FloatProperty(
- name="Denoising Feature Strength",
- description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)",
- min=0.0, max=1.0,
- default=0.5,
- )
- denoising_radius: IntProperty(
- name="Denoising Radius",
- description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)",
- min=1, max=25,
- default=8,
- subtype="PIXEL",
- )
- denoising_relative_pca: BoolProperty(
- name="Relative Filter",
- description="When removing pixels that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)",
- default=False,
- )
denoising_store_passes: BoolProperty(
name="Store Denoising Passes",
description="Store the denoising feature passes and the noisy image. The passes adapt to the denoiser selected for rendering",
default=False,
update=update_render_passes,
)
- denoising_neighbor_frames: IntProperty(
- name="Neighbor Frames",
- description="Number of neighboring frames to use for denoising animations (more frames produce smoother results at the cost of performance)",
- min=0, max=7,
- default=0,
- )
-
- denoising_optix_input_passes: EnumProperty(
- name="Input Passes",
- description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
- items=enum_denoising_input_passes,
- default='RGB_ALBEDO',
- )
- denoising_openimagedenoise_input_passes: EnumProperty(
- name="Input Passes",
- description="Passes used by the denoiser to distinguish noise from shader and geometry detail",
- items=enum_denoising_input_passes,
- default='RGB_ALBEDO_NORMAL',
- )
@classmethod
def register(cls):
@@ -1454,14 +1266,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def get_device_types(self, context):
import _cycles
- has_cuda, has_optix, has_opencl = _cycles.get_device_types()
+ has_cuda, has_optix = _cycles.get_device_types()
list = [('NONE', "None", "Don't use compute device", 0)]
if has_cuda:
list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
if has_optix:
list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3))
- if has_opencl:
- list.append(('OPENCL', "OpenCL", "Use OpenCL for GPU acceleration", 2))
return list
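`_cycles.get_device_types()` now returns a pair, so any script that unpacked three values needs the OpenCL flag dropped:

    import _cycles  # Cycles' internal module, available inside Blender

    # Before: has_cuda, has_optix, has_opencl = _cycles.get_device_types()
    has_cuda, has_optix = _cycles.get_device_types()
    if has_optix:
        print("OptiX GPU acceleration is available")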
compute_device_type: EnumProperty(
@@ -1486,7 +1296,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def update_device_entries(self, device_list):
for device in device_list:
- if not device[1] in {'CUDA', 'OPTIX', 'OPENCL', 'CPU'}:
+ if not device[1] in {'CUDA', 'OPTIX', 'CPU'}:
continue
# Try to find existing Device entry
entry = self.find_existing_device_entry(device)
@@ -1520,22 +1330,23 @@ class CyclesPreferences(bpy.types.AddonPreferences):
elif entry.type == 'CPU':
cpu_devices.append(entry)
# Extend all GPU devices with CPU.
- if compute_device_type in {'CUDA', 'OPTIX', 'OPENCL'}:
+ if compute_device_type != 'CPU':
devices.extend(cpu_devices)
return devices
- # For backwards compatibility, only returns CUDA and OpenCL but still
- # refreshes all devices.
- def get_devices(self, compute_device_type=''):
+ # Refresh device list. This does not happen automatically on Blender
+ # startup due to unstable OpenCL implementations that can cause crashes.
+ def refresh_devices(self):
import _cycles
# Ensure `self.devices` is not re-allocated when the second call to
# get_devices_for_type is made, freeing items from the first list.
for device_type in ('CUDA', 'OPTIX', 'OPENCL'):
self.update_device_entries(_cycles.available_devices(device_type))
- cuda_devices = self.get_devices_for_type('CUDA')
- opencl_devices = self.get_devices_for_type('OPENCL')
- return cuda_devices, opencl_devices
+ # Deprecated: use refresh_devices instead.
+ def get_devices(self, compute_device_type=''):
+ self.refresh_devices()
+ return None
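A usage sketch for the new API split, assuming the addon is registered under the name "cycles": scripts refresh once, then query per backend instead of relying on the old tuple-returning `get_devices()`:

    import bpy

    prefs = bpy.context.preferences.addons["cycles"].preferences
    prefs.refresh_devices()
    for device in prefs.get_devices_for_type('OPTIX'):
        device.use = True  # enable every detected OptiX device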
def get_num_gpu_devices(self):
import _cycles
@@ -1601,6 +1412,10 @@ class CyclesView3DShadingSettings(bpy.types.PropertyGroup):
items=enum_view3d_shading_render_pass,
default='COMBINED',
)
+ show_active_pixels: BoolProperty(
+ name="Show Active Pixels",
+ description="When using adaptive sampling highlight pixels which are being sampled",
+ )
def register():
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py
index 47f7b4c6d73..d02627b9936 100644
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -34,6 +34,12 @@ class CYCLES_PT_sampling_presets(PresetPanel, Panel):
preset_add_operator = "render.cycles_sampling_preset_add"
COMPAT_ENGINES = {'CYCLES'}
+class CYCLES_PT_viewport_sampling_presets(PresetPanel, Panel):
+ bl_label = "Viewport Sampling Presets"
+ preset_subdir = "cycles/viewport_sampling"
+ preset_operator = "script.execute_preset"
+ preset_add_operator = "render.cycles_viewport_sampling_preset_add"
+ COMPAT_ENGINES = {'CYCLES'}
class CYCLES_PT_integrator_presets(PresetPanel, Panel):
bl_label = "Integrator Presets"
@@ -54,6 +60,15 @@ class CyclesButtonsPanel:
return context.engine in cls.COMPAT_ENGINES
+class CyclesDebugButtonsPanel(CyclesButtonsPanel):
+ @classmethod
+ def poll(cls, context):
+ prefs = bpy.context.preferences
+ return (CyclesButtonsPanel.poll(context)
+ and prefs.experimental.use_cycles_debug
+ and prefs.view.show_developer_ui)
+
+
# Adapt properties editor panel to display in node editor. We have to
# copy the class rather than inherit due to the way bpy registration works.
def node_panel(cls):
@@ -78,12 +93,6 @@ def use_cpu(context):
return (get_device_type(context) == 'NONE' or cscene.device == 'CPU')
-def use_opencl(context):
- cscene = context.scene.cycles
-
- return (get_device_type(context) == 'OPENCL' and cscene.device == 'GPU')
-
-
def use_cuda(context):
cscene = context.scene.cycles
@@ -96,12 +105,6 @@ def use_optix(context):
return (get_device_type(context) == 'OPTIX' and cscene.device == 'GPU')
-def use_branched_path(context):
- cscene = context.scene.cycles
-
- return (cscene.progressive == 'BRANCHED_PATH' and not use_optix(context))
-
-
def use_sample_all_lights(context):
cscene = context.scene.cycles
@@ -115,57 +118,33 @@ def show_device_active(context):
return context.preferences.addons[__package__].preferences.has_active_device()
-def draw_samples_info(layout, context):
- cscene = context.scene.cycles
- integrator = cscene.progressive
+def get_effective_preview_denoiser(context):
+ scene = context.scene
+ cscene = scene.cycles
+
+ if cscene.preview_denoiser != "AUTO":
+ return cscene.preview_denoiser
+
+ if context.preferences.addons[__package__].preferences.get_devices_for_type('OPTIX'):
+ return 'OPTIX'
+
+ return 'OIDN'
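The resolution rule restated as a self-contained sketch: 'AUTO' prefers OptiX when a matching device exists and falls back to OpenImageDenoise otherwise.

    def resolve(preview_denoiser, has_optix_device):
        if preview_denoiser != 'AUTO':
            return preview_denoiser
        return 'OPTIX' if has_optix_device else 'OIDN'

    assert resolve('AUTO', True) == 'OPTIX'
    assert resolve('AUTO', False) == 'OIDN'
    assert resolve('OPENIMAGEDENOISE', False) == 'OPENIMAGEDENOISE'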
- # Calculate sample values
- if integrator == 'PATH':
- aa = cscene.samples
- if cscene.use_square_samples:
- aa = aa * aa
- else:
- aa = cscene.aa_samples
- d = cscene.diffuse_samples
- g = cscene.glossy_samples
- t = cscene.transmission_samples
- ao = cscene.ao_samples
- ml = cscene.mesh_light_samples
- sss = cscene.subsurface_samples
- vol = cscene.volume_samples
-
- if cscene.use_square_samples:
- aa = aa * aa
- d = d * d
- g = g * g
- t = t * t
- ao = ao * ao
- ml = ml * ml
- sss = sss * sss
- vol = vol * vol
-
- # Draw interface
- # Do not draw for progressive, when Square Samples are disabled
- if use_branched_path(context) or (cscene.use_square_samples and integrator == 'PATH'):
- col = layout.column(align=True)
- col.scale_y = 0.6
- col.label(text="Total Samples:")
- col.separator()
- if integrator == 'PATH':
- col.label(text="%s AA" % aa)
- else:
- col.label(text="%s AA, %s Diffuse, %s Glossy, %s Transmission" %
- (aa, d * aa, g * aa, t * aa))
- col.separator()
- col.label(text="%s AO, %s Mesh Light, %s Subsurface, %s Volume" %
- (ao * aa, ml * aa, sss * aa, vol * aa))
class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
bl_label = "Sampling"
+ def draw(self, context):
+ pass
+
+
+class CYCLES_RENDER_PT_sampling_viewport(CyclesButtonsPanel, Panel):
+ bl_label = "Viewport"
+ bl_parent_id = "CYCLES_RENDER_PT_sampling"
+
def draw_header_preset(self, context):
- CYCLES_PT_sampling_presets.draw_panel_header(self.layout)
+ CYCLES_PT_viewport_sampling_presets.draw_panel_header(self.layout)
def draw(self, context):
layout = self.layout
@@ -176,29 +155,31 @@ class CYCLES_RENDER_PT_sampling(CyclesButtonsPanel, Panel):
layout.use_property_split = True
layout.use_property_decorate = False
- if not use_optix(context):
- layout.prop(cscene, "progressive")
+ heading = layout.column(align=True, heading="Noise Threshold")
+ row = heading.row(align=True)
+ row.prop(cscene, "use_preview_adaptive_sampling", text="")
+ sub = row.row()
+ sub.active = cscene.use_preview_adaptive_sampling
+ sub.prop(cscene, "preview_adaptive_threshold", text="")
- if not use_branched_path(context):
+ if cscene.use_preview_adaptive_sampling:
col = layout.column(align=True)
- col.prop(cscene, "samples", text="Render")
- col.prop(cscene, "preview_samples", text="Viewport")
+ col.prop(cscene, "preview_samples", text=" Max Samples")
+ col.prop(cscene, "preview_adaptive_min_samples", text="Min Samples")
else:
- col = layout.column(align=True)
- col.prop(cscene, "aa_samples", text="Render")
- col.prop(cscene, "preview_aa_samples", text="Viewport")
+ layout.prop(cscene, "preview_samples", text="Samples")
- if not use_branched_path(context):
- draw_samples_info(layout, context)
+class CYCLES_RENDER_PT_sampling_viewport_denoise(CyclesButtonsPanel, Panel):
+ bl_label = "Denoise"
+ bl_parent_id = 'CYCLES_RENDER_PT_sampling_viewport'
+ bl_options = {'DEFAULT_CLOSED'}
-class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel):
- bl_label = "Sub Samples"
- bl_parent_id = "CYCLES_RENDER_PT_sampling"
+ def draw_header(self, context):
+ scene = context.scene
+ cscene = scene.cycles
- @classmethod
- def poll(cls, context):
- return use_branched_path(context)
+ self.layout.prop(context.scene.cycles, "use_preview_denoising", text="")
def draw(self, context):
layout = self.layout
@@ -208,53 +189,61 @@ class CYCLES_RENDER_PT_sampling_sub_samples(CyclesButtonsPanel, Panel):
scene = context.scene
cscene = scene.cycles
- col = layout.column(align=True)
- col.prop(cscene, "diffuse_samples", text="Diffuse")
- col.prop(cscene, "glossy_samples", text="Glossy")
- col.prop(cscene, "transmission_samples", text="Transmission")
- col.prop(cscene, "ao_samples", text="AO")
+ col = layout.column()
+ col.active = cscene.use_preview_denoising
+ col.prop(cscene, "preview_denoiser", text="Denoiser")
+ col.prop(cscene, "preview_denoising_input_passes", text="Passes")
- sub = col.row(align=True)
- sub.active = use_sample_all_lights(context)
- sub.prop(cscene, "mesh_light_samples", text="Mesh Light")
- col.prop(cscene, "subsurface_samples", text="Subsurface")
- col.prop(cscene, "volume_samples", text="Volume")
+ effective_preview_denoiser = get_effective_preview_denoiser(context)
+ if effective_preview_denoiser == 'OPENIMAGEDENOISE':
+ col.prop(cscene, "preview_denoising_prefilter", text="Prefilter")
- draw_samples_info(layout, context)
+ col.prop(cscene, "preview_denoising_start_sample", text="Start Sample")
-class CYCLES_RENDER_PT_sampling_adaptive(CyclesButtonsPanel, Panel):
- bl_label = "Adaptive Sampling"
+class CYCLES_RENDER_PT_sampling_render(CyclesButtonsPanel, Panel):
+ bl_label = "Render"
bl_parent_id = "CYCLES_RENDER_PT_sampling"
- bl_options = {'DEFAULT_CLOSED'}
- def draw_header(self, context):
- layout = self.layout
- scene = context.scene
- cscene = scene.cycles
-
- layout.prop(cscene, "use_adaptive_sampling", text="")
+ def draw_header_preset(self, context):
+ CYCLES_PT_sampling_presets.draw_panel_header(self.layout)
def draw(self, context):
layout = self.layout
- layout.use_property_split = True
- layout.use_property_decorate = False
scene = context.scene
cscene = scene.cycles
- layout.active = cscene.use_adaptive_sampling
+ layout.use_property_split = True
+ layout.use_property_decorate = False
+
+ heading = layout.column(align=True, heading="Noise Threshold")
+ row = heading.row(align=True)
+ row.prop(cscene, "use_adaptive_sampling", text="")
+ sub = row.row()
+ sub.active = cscene.use_adaptive_sampling
+ sub.prop(cscene, "adaptive_threshold", text="")
col = layout.column(align=True)
- col.prop(cscene, "adaptive_threshold", text="Noise Threshold")
- col.prop(cscene, "adaptive_min_samples", text="Min Samples")
+ if cscene.use_adaptive_sampling:
+ col.prop(cscene, "samples", text=" Max Samples")
+ col.prop(cscene, "adaptive_min_samples", text="Min Samples")
+ else:
+ col.prop(cscene, "samples", text="Samples")
+ col.prop(cscene, "time_limit")
-class CYCLES_RENDER_PT_sampling_denoising(CyclesButtonsPanel, Panel):
- bl_label = "Denoising"
- bl_parent_id = "CYCLES_RENDER_PT_sampling"
+class CYCLES_RENDER_PT_sampling_render_denoise(CyclesButtonsPanel, Panel):
+ bl_label = "Denoise"
+ bl_parent_id = 'CYCLES_RENDER_PT_sampling_render'
bl_options = {'DEFAULT_CLOSED'}
+ def draw_header(self, context):
+ scene = context.scene
+ cscene = scene.cycles
+
+ self.layout.prop(context.scene.cycles, "use_denoising", text="")
+
def draw(self, context):
layout = self.layout
layout.use_property_split = True
@@ -263,33 +252,12 @@ class CYCLES_RENDER_PT_sampling_denoising(CyclesButtonsPanel, Panel):
scene = context.scene
cscene = scene.cycles
- heading = layout.column(align=True, heading="Render")
- row = heading.row(align=True)
- row.prop(cscene, "use_denoising", text="")
- sub = row.row()
-
- sub.active = cscene.use_denoising
- for view_layer in scene.view_layers:
- if view_layer.cycles.denoising_store_passes:
- sub.active = True
-
- sub.prop(cscene, "denoiser", text="")
-
- layout.separator()
-
- heading = layout.column(align=False, heading="Viewport")
- row = heading.row(align=True)
- row.prop(cscene, "use_preview_denoising", text="")
- sub = row.row()
- sub.active = cscene.use_preview_denoising
- sub.prop(cscene, "preview_denoiser", text="")
-
- sub = heading.row(align=True)
- sub.active = cscene.use_preview_denoising
- sub.prop(cscene, "preview_denoising_start_sample", text="Start Sample")
- sub = heading.row(align=True)
- sub.active = cscene.use_preview_denoising
- sub.prop(cscene, "preview_denoising_input_passes", text="Input Passes")
+ col = layout.column()
+ col.active = cscene.use_denoising
+ col.prop(cscene, "denoiser", text="Denoiser")
+ col.prop(cscene, "denoising_input_passes", text="Passes")
+ if cscene.denoiser == 'OPENIMAGEDENOISE':
+ col.prop(cscene, "denoising_prefilter", text="Prefilter")
class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
@@ -313,8 +281,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
col.active = not(cscene.use_adaptive_sampling)
col.prop(cscene, "sampling_pattern", text="Pattern")
- layout.prop(cscene, "use_square_samples")
-
layout.separator()
col = layout.column(align=True)
@@ -322,11 +288,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
col.prop(cscene, "min_transparent_bounces")
col.prop(cscene, "light_sampling_threshold", text="Light Threshold")
- if cscene.progressive != 'PATH' and use_branched_path(context):
- col = layout.column(align=True)
- col.prop(cscene, "sample_all_lights_direct")
- col.prop(cscene, "sample_all_lights_indirect")
-
for view_layer in scene.view_layers:
if view_layer.samples > 0:
layout.separator()
@@ -334,62 +295,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
break
-class CYCLES_RENDER_PT_sampling_total(CyclesButtonsPanel, Panel):
- bl_label = "Total Samples"
- bl_parent_id = "CYCLES_RENDER_PT_sampling"
-
- @classmethod
- def poll(cls, context):
- scene = context.scene
- cscene = scene.cycles
-
- if cscene.use_square_samples:
- return True
-
- return cscene.progressive != 'PATH' and use_branched_path(context)
-
- def draw(self, context):
- layout = self.layout
- cscene = context.scene.cycles
- integrator = cscene.progressive
-
- # Calculate sample values
- if integrator == 'PATH':
- aa = cscene.samples
- if cscene.use_square_samples:
- aa = aa * aa
- else:
- aa = cscene.aa_samples
- d = cscene.diffuse_samples
- g = cscene.glossy_samples
- t = cscene.transmission_samples
- ao = cscene.ao_samples
- ml = cscene.mesh_light_samples
- sss = cscene.subsurface_samples
- vol = cscene.volume_samples
-
- if cscene.use_square_samples:
- aa = aa * aa
- d = d * d
- g = g * g
- t = t * t
- ao = ao * ao
- ml = ml * ml
- sss = sss * sss
- vol = vol * vol
-
- col = layout.column(align=True)
- col.scale_y = 0.6
- if integrator == 'PATH':
- col.label(text="%s AA" % aa)
- else:
- col.label(text="%s AA, %s Diffuse, %s Glossy, %s Transmission" %
- (aa, d * aa, g * aa, t * aa))
- col.separator()
- col.label(text="%s AO, %s Mesh Light, %s Subsurface, %s Volume" %
- (ao * aa, ml * aa, sss * aa, vol * aa))
-
-
class CYCLES_RENDER_PT_subdivision(CyclesButtonsPanel, Panel):
bl_label = "Subdivision"
bl_options = {'DEFAULT_CLOSED'}
@@ -548,6 +453,8 @@ class CYCLES_RENDER_PT_light_paths_fast_gi(CyclesButtonsPanel, Panel):
layout.use_property_split = True
layout.use_property_decorate = False
+ layout.active = cscene.use_fast_gi
+
col = layout.column(align=True)
col.prop(cscene, "ao_bounces", text="Viewport Bounces")
col.prop(cscene, "ao_bounces_render", text="Render Bounces")
@@ -716,19 +623,13 @@ class CYCLES_RENDER_PT_performance_tiles(CyclesButtonsPanel, Panel):
layout.use_property_decorate = False
scene = context.scene
- rd = scene.render
cscene = scene.cycles
col = layout.column()
-
- sub = col.column(align=True)
- sub.prop(rd, "tile_x", text="Tiles X")
- sub.prop(rd, "tile_y", text="Y")
- col.prop(cscene, "tile_order", text="Order")
-
+ col.prop(cscene, "use_auto_tile")
sub = col.column()
- sub.active = not rd.use_save_buffers and not cscene.use_adaptive_sampling
- sub.prop(cscene, "use_progressive_refine")
+ sub.active = cscene.use_auto_tile
+ sub.prop(cscene, "tile_size")
class CYCLES_RENDER_PT_performance_acceleration_structure(CyclesButtonsPanel, Panel):
@@ -778,7 +679,6 @@ class CYCLES_RENDER_PT_performance_final_render(CyclesButtonsPanel, Panel):
col = layout.column()
- col.prop(rd, "use_save_buffers")
col.prop(rd, "use_persistent_data", text="Persistent Data")
@@ -797,7 +697,6 @@ class CYCLES_RENDER_PT_performance_viewport(CyclesButtonsPanel, Panel):
col = layout.column()
col.prop(rd, "preview_pixel_size", text="Pixel Size")
- col.prop(cscene, "preview_start_resolution", text="Start Pixels")
class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
@@ -818,7 +717,6 @@ class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
col = layout.column(heading="Include")
col.prop(view_layer, "use_sky", text="Environment")
- col.prop(view_layer, "use_ao", text="Ambient Occlusion")
col.prop(view_layer, "use_solid", text="Surfaces")
col.prop(view_layer, "use_strand", text="Hair")
col.prop(view_layer, "use_volumes", text="Volumes")
@@ -827,6 +725,9 @@ class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
sub = col.row()
sub.prop(view_layer, "use_motion_blur", text="Motion Blur")
sub.active = rd.use_motion_blur
+ sub = col.row()
+ sub.prop(view_layer.cycles, 'use_denoising', text='Denoising')
+ sub.active = scene.cycles.use_denoising
class CYCLES_RENDER_PT_override(CyclesButtonsPanel, Panel):
@@ -872,6 +773,7 @@ class CYCLES_RENDER_PT_passes_data(CyclesButtonsPanel, Panel):
col.prop(view_layer, "use_pass_combined")
col.prop(view_layer, "use_pass_z")
col.prop(view_layer, "use_pass_mist")
+ col.prop(view_layer, "use_pass_position")
col.prop(view_layer, "use_pass_normal")
sub = col.column()
sub.active = not rd.use_motion_blur
@@ -928,6 +830,7 @@ class CYCLES_RENDER_PT_passes_light(CyclesButtonsPanel, Panel):
col.prop(view_layer, "use_pass_environment")
col.prop(view_layer, "use_pass_shadow")
col.prop(view_layer, "use_pass_ambient_occlusion", text="Ambient Occlusion")
+ col.prop(cycles_view_layer, "use_pass_shadow_catcher")
class CYCLES_RENDER_PT_passes_crypto(CyclesButtonsPanel, ViewLayerCryptomattePanel, Panel):
@@ -942,70 +845,6 @@ class CYCLES_RENDER_PT_passes_aov(CyclesButtonsPanel, ViewLayerAOVPanel):
bl_parent_id = "CYCLES_RENDER_PT_passes"
-class CYCLES_RENDER_PT_denoising(CyclesButtonsPanel, Panel):
- bl_label = "Denoising"
- bl_context = "view_layer"
- bl_options = {'DEFAULT_CLOSED'}
-
- @classmethod
- def poll(cls, context):
- cscene = context.scene.cycles
- return CyclesButtonsPanel.poll(context) and cscene.use_denoising
-
- def draw_header(self, context):
- scene = context.scene
- view_layer = context.view_layer
- cycles_view_layer = view_layer.cycles
-
- layout = self.layout
- layout.prop(cycles_view_layer, "use_denoising", text="")
-
- def draw(self, context):
- layout = self.layout
- layout.use_property_split = True
- layout.use_property_decorate = False
-
- scene = context.scene
- view_layer = context.view_layer
- cycles_view_layer = view_layer.cycles
- denoiser = scene.cycles.denoiser
-
- layout.active = denoiser != 'NONE' and cycles_view_layer.use_denoising
-
- col = layout.column()
-
- if denoiser == 'OPTIX':
- col.prop(cycles_view_layer, "denoising_optix_input_passes")
- return
- elif denoiser == 'OPENIMAGEDENOISE':
- col.prop(cycles_view_layer, "denoising_openimagedenoise_input_passes")
- return
-
- col.prop(cycles_view_layer, "denoising_radius", text="Radius")
-
- col = layout.column()
- col.prop(cycles_view_layer, "denoising_strength", slider=True, text="Strength")
- col.prop(cycles_view_layer, "denoising_feature_strength", slider=True, text="Feature Strength")
- col.prop(cycles_view_layer, "denoising_relative_pca")
-
- layout.separator()
-
- col = layout.column()
- col.active = cycles_view_layer.use_denoising or cycles_view_layer.denoising_store_passes
-
- row = col.row(heading="Diffuse", align=True)
- row.prop(cycles_view_layer, "denoising_diffuse_direct", text="Direct", toggle=True)
- row.prop(cycles_view_layer, "denoising_diffuse_indirect", text="Indirect", toggle=True)
-
- row = col.row(heading="Glossy", align=True)
- row.prop(cycles_view_layer, "denoising_glossy_direct", text="Direct", toggle=True)
- row.prop(cycles_view_layer, "denoising_glossy_indirect", text="Indirect", toggle=True)
-
- row = col.row(heading="Transmission", align=True)
- row.prop(cycles_view_layer, "denoising_transmission_direct", text="Direct", toggle=True)
- row.prop(cycles_view_layer, "denoising_transmission_indirect", text="Indirect", toggle=True)
-
-
class CYCLES_PT_post_processing(CyclesButtonsPanel, Panel):
bl_label = "Post Processing"
bl_options = {'DEFAULT_CLOSED'}
@@ -1417,10 +1256,6 @@ class CYCLES_LIGHT_PT_light(CyclesButtonsPanel, Panel):
if not (light.type == 'AREA' and clamp.is_portal):
sub = col.column()
- if use_branched_path(context):
- subsub = sub.row(align=True)
- subsub.active = use_sample_all_lights(context)
- subsub.prop(clamp, "samples")
sub.prop(clamp, "max_bounces")
sub = col.column(align=True)
@@ -1526,34 +1361,6 @@ class CYCLES_WORLD_PT_volume(CyclesButtonsPanel, Panel):
panel_node_draw(layout, world, 'OUTPUT_WORLD', 'Volume')
-class CYCLES_WORLD_PT_ambient_occlusion(CyclesButtonsPanel, Panel):
- bl_label = "Ambient Occlusion"
- bl_context = "world"
- bl_options = {'DEFAULT_CLOSED'}
-
- @classmethod
- def poll(cls, context):
- return context.world and CyclesButtonsPanel.poll(context)
-
- def draw_header(self, context):
- light = context.world.light_settings
- self.layout.prop(light, "use_ambient_occlusion", text="")
-
- def draw(self, context):
- layout = self.layout
- layout.use_property_split = True
- layout.use_property_decorate = False
-
- light = context.world.light_settings
- scene = context.scene
-
- col = layout.column()
- sub = col.column()
- sub.active = light.use_ambient_occlusion or scene.render.use_simplify
- sub.prop(light, "ao_factor", text="Factor")
- col.prop(light, "distance", text="Distance")
-
-
class CYCLES_WORLD_PT_mist(CyclesButtonsPanel, Panel):
bl_label = "Mist Pass"
bl_context = "world"
@@ -1650,10 +1457,6 @@ class CYCLES_WORLD_PT_settings_surface(CyclesButtonsPanel, Panel):
subsub = sub.row(align=True)
subsub.active = cworld.sampling_method == 'MANUAL'
subsub.prop(cworld, "sample_map_resolution")
- if use_branched_path(context):
- subsub = sub.column(align=True)
- subsub.active = use_sample_all_lights(context)
- subsub.prop(cworld, "samples")
sub.prop(cworld, "max_bounces")
@@ -1677,8 +1480,7 @@ class CYCLES_WORLD_PT_settings_volume(CyclesButtonsPanel, Panel):
col = layout.column()
sub = col.column()
- sub.active = use_cpu(context)
- sub.prop(cworld, "volume_sampling", text="Sampling")
+ col.prop(cworld, "volume_sampling", text="Sampling")
col.prop(cworld, "volume_interpolation", text="Interpolation")
col.prop(cworld, "homogeneous_volume", text="Homogeneous")
sub = col.column()
@@ -1817,8 +1619,7 @@ class CYCLES_MATERIAL_PT_settings_volume(CyclesButtonsPanel, Panel):
col = layout.column()
sub = col.column()
- sub.active = use_cpu(context)
- sub.prop(cmat, "volume_sampling", text="Sampling")
+ col.prop(cmat, "volume_sampling", text="Sampling")
col.prop(cmat, "volume_interpolation", text="Interpolation")
col.prop(cmat, "homogeneous_volume", text="Homogeneous")
sub = col.column()
@@ -1845,9 +1646,6 @@ class CYCLES_RENDER_PT_bake(CyclesButtonsPanel, Panel):
cbk = scene.render.bake
rd = scene.render
- if use_optix(context):
- layout.label(text="Baking is performed using CUDA instead of OptiX", icon='INFO')
-
if rd.use_bake_multires:
layout.operator("object.bake_image", icon='RENDER_STILL')
layout.prop(rd, "use_bake_multires")
@@ -1905,7 +1703,6 @@ class CYCLES_RENDER_PT_bake_influence(CyclesButtonsPanel, Panel):
col.prop(cbk, "use_pass_diffuse")
col.prop(cbk, "use_pass_glossy")
col.prop(cbk, "use_pass_transmission")
- col.prop(cbk, "use_pass_ambient_occlusion")
col.prop(cbk, "use_pass_emit")
elif cscene.bake_type in {'DIFFUSE', 'GLOSSY', 'TRANSMISSION'}:
@@ -1989,19 +1786,12 @@ class CYCLES_RENDER_PT_bake_output(CyclesButtonsPanel, Panel):
layout.prop(cbk, "use_clear", text="Clear Image")
-class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
+class CYCLES_RENDER_PT_debug(CyclesDebugButtonsPanel, Panel):
bl_label = "Debug"
bl_context = "render"
bl_options = {'DEFAULT_CLOSED'}
COMPAT_ENGINES = {'CYCLES'}
- @classmethod
- def poll(cls, context):
- prefs = bpy.context.preferences
- return (CyclesButtonsPanel.poll(context)
- and prefs.experimental.use_cycles_debug
- and prefs.view.show_developer_ui)
-
def draw(self, context):
layout = self.layout
@@ -2018,29 +1808,18 @@ class CYCLES_RENDER_PT_debug(CyclesButtonsPanel, Panel):
row.prop(cscene, "debug_use_cpu_avx", toggle=True)
row.prop(cscene, "debug_use_cpu_avx2", toggle=True)
col.prop(cscene, "debug_bvh_layout")
- col.prop(cscene, "debug_use_cpu_split_kernel")
col.separator()
col = layout.column()
col.label(text="CUDA Flags:")
col.prop(cscene, "debug_use_cuda_adaptive_compile")
- col.prop(cscene, "debug_use_cuda_split_kernel")
col.separator()
col = layout.column()
col.label(text="OptiX Flags:")
- col.prop(cscene, "debug_optix_cuda_streams")
- col.prop(cscene, "debug_optix_curves_api")
-
- col.separator()
-
- col = layout.column()
- col.label(text="OpenCL Flags:")
- col.prop(cscene, "debug_opencl_device_type", text="Device")
- col.prop(cscene, "debug_use_opencl_debug", text="Debug")
- col.prop(cscene, "debug_opencl_mem_limit")
+ col.prop(cscene, "debug_use_optix_debug")
col.separator()
@@ -2141,20 +1920,22 @@ class CYCLES_RENDER_PT_simplify_culling(CyclesButtonsPanel, Panel):
sub.prop(cscene, "distance_cull_margin", text="")
-class CYCLES_VIEW3D_PT_shading_render_pass(Panel):
+class CyclesShadingButtonsPanel(CyclesButtonsPanel):
bl_space_type = 'VIEW_3D'
bl_region_type = 'HEADER'
- bl_label = "Render Pass"
bl_parent_id = 'VIEW3D_PT_shading'
- COMPAT_ENGINES = {'CYCLES'}
@classmethod
def poll(cls, context):
return (
- context.engine in cls.COMPAT_ENGINES and
+ CyclesButtonsPanel.poll(context) and
context.space_data.shading.type == 'RENDERED'
)
+
+class CYCLES_VIEW3D_PT_shading_render_pass(CyclesShadingButtonsPanel, Panel):
+ bl_label = "Render Pass"
+
def draw(self, context):
shading = context.space_data.shading
@@ -2162,6 +1943,26 @@ class CYCLES_VIEW3D_PT_shading_render_pass(Panel):
layout.prop(shading.cycles, "render_pass", text="")
+class CYCLES_VIEW3D_PT_shading_debug(CyclesDebugButtonsPanel,
+ CyclesShadingButtonsPanel,
+ Panel):
+ bl_label = "Debug"
+
+ @classmethod
+ def poll(cls, context):
+ return (
+ CyclesDebugButtonsPanel.poll(context) and
+ CyclesShadingButtonsPanel.poll(context)
+ )
+
+ def draw(self, context):
+ shading = context.space_data.shading
+
+ layout = self.layout
+ layout.active = context.scene.cycles.use_preview_adaptive_sampling
+ layout.prop(shading.cycles, "show_active_pixels")
+
+
class CYCLES_VIEW3D_PT_shading_lighting(Panel):
bl_space_type = 'VIEW_3D'
bl_region_type = 'HEADER'
@@ -2275,11 +2076,13 @@ def get_panels():
classes = (
CYCLES_PT_sampling_presets,
+ CYCLES_PT_viewport_sampling_presets,
CYCLES_PT_integrator_presets,
CYCLES_RENDER_PT_sampling,
- CYCLES_RENDER_PT_sampling_sub_samples,
- CYCLES_RENDER_PT_sampling_adaptive,
- CYCLES_RENDER_PT_sampling_denoising,
+ CYCLES_RENDER_PT_sampling_viewport,
+ CYCLES_RENDER_PT_sampling_viewport_denoise,
+ CYCLES_RENDER_PT_sampling_render,
+ CYCLES_RENDER_PT_sampling_render_denoise,
CYCLES_RENDER_PT_sampling_advanced,
CYCLES_RENDER_PT_light_paths,
CYCLES_RENDER_PT_light_paths_max_bounces,
@@ -2296,6 +2099,7 @@ classes = (
CYCLES_VIEW3D_PT_simplify_greasepencil,
CYCLES_VIEW3D_PT_shading_lighting,
CYCLES_VIEW3D_PT_shading_render_pass,
+ CYCLES_VIEW3D_PT_shading_debug,
CYCLES_RENDER_PT_motion_blur,
CYCLES_RENDER_PT_motion_blur_curve,
CYCLES_RENDER_PT_film,
@@ -2314,7 +2118,6 @@ classes = (
CYCLES_RENDER_PT_passes_aov,
CYCLES_RENDER_PT_filter,
CYCLES_RENDER_PT_override,
- CYCLES_RENDER_PT_denoising,
CYCLES_PT_post_processing,
CYCLES_CAMERA_PT_dof,
CYCLES_CAMERA_PT_dof_aperture,
@@ -2333,7 +2136,6 @@ classes = (
CYCLES_WORLD_PT_preview,
CYCLES_WORLD_PT_surface,
CYCLES_WORLD_PT_volume,
- CYCLES_WORLD_PT_ambient_occlusion,
CYCLES_WORLD_PT_mist,
CYCLES_WORLD_PT_ray_visibility,
CYCLES_WORLD_PT_settings,
diff --git a/intern/cycles/blender/addon/version_update.py b/intern/cycles/blender/addon/version_update.py
index 827f84b9873..57da7d7995c 100644
--- a/intern/cycles/blender/addon/version_update.py
+++ b/intern/cycles/blender/addon/version_update.py
@@ -109,7 +109,7 @@ def do_versions(self):
library_versions.setdefault(library.version, []).append(library)
# Do versioning per library, since they might have different versions.
- max_need_versioning = (2, 93, 7)
+ max_need_versioning = (3, 0, 25)
for version, libraries in library_versions.items():
if version > max_need_versioning:
continue
@@ -166,10 +166,6 @@ def do_versions(self):
if not cscene.is_property_set("filter_type"):
cscene.pixel_filter_type = 'GAUSSIAN'
- # Tile Order
- if not cscene.is_property_set("tile_order"):
- cscene.tile_order = 'CENTER'
-
if version <= (2, 76, 10):
cscene = scene.cycles
if cscene.is_property_set("filter_type"):
@@ -186,10 +182,6 @@ def do_versions(self):
if version <= (2, 79, 0):
cscene = scene.cycles
# Default changes
- if not cscene.is_property_set("aa_samples"):
- cscene.aa_samples = 4
- if not cscene.is_property_set("preview_aa_samples"):
- cscene.preview_aa_samples = 4
if not cscene.is_property_set("blur_glossy"):
cscene.blur_glossy = 0.0
if not cscene.is_property_set("sample_clamp_indirect"):
@@ -203,7 +195,6 @@ def do_versions(self):
view_layer.use_pass_cryptomatte_material = cview_layer.get("use_pass_crypto_material", False)
view_layer.use_pass_cryptomatte_asset = cview_layer.get("use_pass_crypto_asset", False)
view_layer.pass_cryptomatte_depth = cview_layer.get("pass_crypto_depth", 6)
- view_layer.use_pass_cryptomatte_accurate = cview_layer.get("pass_crypto_accurate", True)
if version <= (2, 93, 7):
if scene.render.engine == 'CYCLES':
@@ -229,6 +220,35 @@ def do_versions(self):
cscene.ao_bounces = 1
cscene.ao_bounces_render = 1
+ if version <= (3, 0, 25):
+ cscene = scene.cycles
+
+ # Default changes.
+ if not cscene.is_property_set("samples"):
+ cscene.samples = 128
+ if not cscene.is_property_set("preview_samples"):
+ cscene.preview_samples = 32
+ if not cscene.is_property_set("use_adaptive_sampling"):
+ cscene.use_adaptive_sampling = False
+ cscene.use_preview_adaptive_sampling = False
+ if not cscene.is_property_set("use_denoising"):
+ cscene.use_denoising = False
+ if not cscene.is_property_set("use_preview_denoising"):
+ cscene.use_preview_denoising = False
+ if not cscene.is_property_set("sampling_pattern"):
+ cscene.sampling_pattern = 'PROGRESSIVE_MUTI_JITTER'
+
+ # Removal of square samples.
+ cscene = scene.cycles
+ use_square_samples = cscene.get("use_square_samples", False)
+
+ if use_square_samples:
+ cscene.samples *= cscene.samples
+ cscene.preview_samples *= cscene.preview_samples
+ for layer in scene.view_layers:
+ layer.samples *= layer.samples
+ cscene["use_square_samples"] = False
+
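A worked example of the square-samples migration above: a scene saved with use_square_samples enabled and samples=16 effectively rendered 16*16 samples, so the updater stores the squared value once and clears the flag.

    samples = 16
    use_square_samples = True
    if use_square_samples:
        samples *= samples       # 16 -> 256, the old effective count
        use_square_samples = False
    assert samples == 256 and use_square_samples is False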
# Lamps
for light in bpy.data.lights:
if light.library not in libraries:
@@ -249,10 +269,6 @@ def do_versions(self):
if version <= (2, 76, 9):
cworld = world.cycles
- # World MIS Samples
- if not cworld.is_property_set("samples"):
- cworld.samples = 4
-
# World MIS Resolution
if not cworld.is_property_set("sample_map_resolution"):
cworld.sample_map_resolution = 256
diff --git a/intern/cycles/blender/blender_camera.cpp b/intern/cycles/blender/blender_camera.cpp
index 6954c5c2f26..4e8df5a99a6 100644
--- a/intern/cycles/blender/blender_camera.cpp
+++ b/intern/cycles/blender/blender_camera.cpp
@@ -894,12 +894,8 @@ void BlenderSync::sync_view(BL::SpaceView3D &b_v3d,
}
}
-BufferParams BlenderSync::get_buffer_params(BL::SpaceView3D &b_v3d,
- BL::RegionView3D &b_rv3d,
- Camera *cam,
- int width,
- int height,
- const bool use_denoiser)
+BufferParams BlenderSync::get_buffer_params(
+ BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, int height)
{
BufferParams params;
bool use_border = false;
@@ -931,11 +927,6 @@ BufferParams BlenderSync::get_buffer_params(BL::SpaceView3D &b_v3d,
params.height = height;
}
- PassType display_pass = update_viewport_display_passes(b_v3d, params.passes);
-
- /* Can only denoise the combined image pass */
- params.denoising_data_pass = display_pass == PASS_COMBINED && use_denoiser;
-
return params;
}
diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp
index d51b31de638..ce1770f18a3 100644
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -25,7 +25,6 @@ CCL_NAMESPACE_BEGIN
enum ComputeDevice {
COMPUTE_DEVICE_CPU = 0,
COMPUTE_DEVICE_CUDA = 1,
- COMPUTE_DEVICE_OPENCL = 2,
COMPUTE_DEVICE_OPTIX = 3,
COMPUTE_DEVICE_NUM
@@ -68,13 +67,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
device = Device::get_multi_device(devices, threads, background);
}
}
- else if (get_enum(cscene, "device") == 2) {
- /* Find network device. */
- vector<DeviceInfo> devices = Device::available_devices(DEVICE_MASK_NETWORK);
- if (!devices.empty()) {
- device = devices.front();
- }
- }
else if (get_enum(cscene, "device") == 1) {
/* Test if we are using GPU devices. */
ComputeDevice compute_device = (ComputeDevice)get_enum(
@@ -89,9 +81,6 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
else if (compute_device == COMPUTE_DEVICE_OPTIX) {
mask |= DEVICE_MASK_OPTIX;
}
- else if (compute_device == COMPUTE_DEVICE_OPENCL) {
- mask |= DEVICE_MASK_OPENCL;
- }
vector<DeviceInfo> devices = Device::available_devices(mask);
/* Match device preferences and available devices. */
diff --git a/intern/cycles/blender/blender_geometry.cpp b/intern/cycles/blender/blender_geometry.cpp
index b1de37dac10..fca8cb9eda3 100644
--- a/intern/cycles/blender/blender_geometry.cpp
+++ b/intern/cycles/blender/blender_geometry.cpp
@@ -80,7 +80,9 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph,
{
/* Test if we can instance or if the object is modified. */
Geometry::Type geom_type = determine_geom_type(b_ob_info, use_particle_hair);
- GeometryKey key(b_ob_info.object_data, geom_type);
+ BL::ID b_key_id = (BKE_object_is_modified(b_ob_info.real_object)) ? b_ob_info.real_object :
+ b_ob_info.object_data;
+ GeometryKey key(b_key_id.ptr.data, geom_type);
/* Find shader indices. */
array<Node *> used_shaders = find_used_shaders(b_ob_info.iter_object);
@@ -110,7 +112,7 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph,
}
else {
/* Test if we need to update existing geometry. */
- sync = geometry_map.update(geom, b_ob_info.object_data);
+ sync = geometry_map.update(geom, b_key_id);
}
if (!sync) {
diff --git a/intern/cycles/blender/blender_gpu_display.cpp b/intern/cycles/blender/blender_gpu_display.cpp
new file mode 100644
index 00000000000..c5c3a2bd155
--- /dev/null
+++ b/intern/cycles/blender/blender_gpu_display.cpp
@@ -0,0 +1,787 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "blender/blender_gpu_display.h"
+
+#include "device/device.h"
+#include "util/util_logging.h"
+#include "util/util_opengl.h"
+
+extern "C" {
+struct RenderEngine;
+
+bool RE_engine_has_render_context(struct RenderEngine *engine);
+void RE_engine_render_context_enable(struct RenderEngine *engine);
+void RE_engine_render_context_disable(struct RenderEngine *engine);
+
+bool DRW_opengl_context_release();
+void DRW_opengl_context_activate(bool drw_state);
+
+void *WM_opengl_context_create();
+void WM_opengl_context_activate(void *gl_context);
+void WM_opengl_context_dispose(void *gl_context);
+void WM_opengl_context_release(void *context);
+}
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * BlenderDisplayShader.
+ */
+
+unique_ptr<BlenderDisplayShader> BlenderDisplayShader::create(BL::RenderEngine &b_engine,
+ BL::Scene &b_scene)
+{
+ if (b_engine.support_display_space_shader(b_scene)) {
+ return make_unique<BlenderDisplaySpaceShader>(b_engine, b_scene);
+ }
+
+ return make_unique<BlenderFallbackDisplayShader>();
+}
+
+int BlenderDisplayShader::get_position_attrib_location()
+{
+ if (position_attribute_location_ == -1) {
+ const uint shader_program = get_shader_program();
+ position_attribute_location_ = glGetAttribLocation(shader_program, position_attribute_name);
+ }
+ return position_attribute_location_;
+}
+
+int BlenderDisplayShader::get_tex_coord_attrib_location()
+{
+ if (tex_coord_attribute_location_ == -1) {
+ const uint shader_program = get_shader_program();
+ tex_coord_attribute_location_ = glGetAttribLocation(shader_program, tex_coord_attribute_name);
+ }
+ return tex_coord_attribute_location_;
+}
+
+/* --------------------------------------------------------------------
+ * BlenderFallbackDisplayShader.
+ */
+
+/* TODO move shaders to standalone .glsl file. */
+static const char *FALLBACK_VERTEX_SHADER =
+ "#version 330\n"
+ "uniform vec2 fullscreen;\n"
+ "in vec2 texCoord;\n"
+ "in vec2 pos;\n"
+ "out vec2 texCoord_interp;\n"
+ "\n"
+ "vec2 normalize_coordinates()\n"
+ "{\n"
+ " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n"
+ "}\n"
+ "\n"
+ "void main()\n"
+ "{\n"
+ " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n"
+ " texCoord_interp = texCoord;\n"
+ "}\n\0";
+
+static const char *FALLBACK_FRAGMENT_SHADER =
+ "#version 330\n"
+ "uniform sampler2D image_texture;\n"
+ "in vec2 texCoord_interp;\n"
+ "out vec4 fragColor;\n"
+ "\n"
+ "void main()\n"
+ "{\n"
+ " fragColor = texture(image_texture, texCoord_interp);\n"
+ "}\n\0";
+
+static void shader_print_errors(const char *task, const char *log, const char *code)
+{
+ LOG(ERROR) << "Shader: " << task << " error:";
+ LOG(ERROR) << "===== shader string ====";
+
+ stringstream stream(code);
+ string partial;
+
+ int line = 1;
+ while (getline(stream, partial, '\n')) {
+ if (line < 10) {
+ LOG(ERROR) << " " << line << " " << partial;
+ }
+ else {
+ LOG(ERROR) << line << " " << partial;
+ }
+ line++;
+ }
+ LOG(ERROR) << log;
+}
+
+static int compile_fallback_shader(void)
+{
+ const struct Shader {
+ const char *source;
+ const GLenum type;
+ } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER},
+ {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}};
+
+ const GLuint program = glCreateProgram();
+
+ for (int i = 0; i < 2; i++) {
+ const GLuint shader = glCreateShader(shaders[i].type);
+
+ string source_str = shaders[i].source;
+ const char *c_str = source_str.c_str();
+
+ glShaderSource(shader, 1, &c_str, NULL);
+ glCompileShader(shader);
+
+ GLint compile_status;
+ glGetShaderiv(shader, GL_COMPILE_STATUS, &compile_status);
+
+ if (!compile_status) {
+ GLchar log[5000];
+ GLsizei length = 0;
+ glGetShaderInfoLog(shader, sizeof(log), &length, log);
+ shader_print_errors("compile", log, c_str);
+ return 0;
+ }
+
+ glAttachShader(program, shader);
+ }
+
+ /* Link output. */
+ glBindFragDataLocation(program, 0, "fragColor");
+
+ /* Link and error check. */
+ glLinkProgram(program);
+
+ /* TODO(sergey): Find a way to nicely de-duplicate the error checking. */
+ GLint link_status;
+ glGetProgramiv(program, GL_LINK_STATUS, &link_status);
+ if (!link_status) {
+ GLchar log[5000];
+ GLsizei length = 0;
+    /* NOTE: A program object's info log is queried via glGetProgramInfoLog();
+     * glGetShaderInfoLog() is only valid for shader objects. */
+    glGetProgramInfoLog(program, sizeof(log), &length, log);
+ shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER);
+ shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER);
+ return 0;
+ }
+
+ return program;
+}
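
The TODO above asks for de-duplicating the error checking; one possible way to factor it out is sketched below. This is a hypothetical helper, not part of the patch, reusing shader_print_errors() from above and assuming an active OpenGL 3.3 context:

    /* Hypothetical helper: query compile or link status and print the info log on failure. */
    static bool check_gl_status(GLuint object, bool is_program, const char *task, const char *code)
    {
      GLint status = 0;
      if (is_program) {
        glGetProgramiv(object, GL_LINK_STATUS, &status);
      }
      else {
        glGetShaderiv(object, GL_COMPILE_STATUS, &status);
      }
      if (status) {
        return true;
      }

      GLchar log[5000];
      GLsizei length = 0;
      /* Programs and shaders have separate info-log query functions. */
      if (is_program) {
        glGetProgramInfoLog(object, sizeof(log), &length, log);
      }
      else {
        glGetShaderInfoLog(object, sizeof(log), &length, log);
      }
      shader_print_errors(task, log, code);
      return false;
    }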
+
+void BlenderFallbackDisplayShader::bind(int width, int height)
+{
+ create_shader_if_needed();
+
+ if (!shader_program_) {
+ return;
+ }
+
+ glUseProgram(shader_program_);
+ glUniform1i(image_texture_location_, 0);
+ glUniform2f(fullscreen_location_, width, height);
+}
+
+void BlenderFallbackDisplayShader::unbind()
+{
+}
+
+uint BlenderFallbackDisplayShader::get_shader_program()
+{
+ return shader_program_;
+}
+
+void BlenderFallbackDisplayShader::create_shader_if_needed()
+{
+ if (shader_program_ || shader_compile_attempted_) {
+ return;
+ }
+
+ shader_compile_attempted_ = true;
+
+ shader_program_ = compile_fallback_shader();
+ if (!shader_program_) {
+ return;
+ }
+
+ glUseProgram(shader_program_);
+
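+  /* NOTE: glGetUniformLocation() returns -1 when the uniform does not exist or was
+   * optimized out by the GLSL compiler, hence the checks below. */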
+ image_texture_location_ = glGetUniformLocation(shader_program_, "image_texture");
+ if (image_texture_location_ < 0) {
+ LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform.";
+ destroy_shader();
+ return;
+ }
+
+ fullscreen_location_ = glGetUniformLocation(shader_program_, "fullscreen");
+ if (fullscreen_location_ < 0) {
+ LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform.";
+ destroy_shader();
+ return;
+ }
+}
+
+void BlenderFallbackDisplayShader::destroy_shader()
+{
+ glDeleteProgram(shader_program_);
+ shader_program_ = 0;
+}
+
+/* --------------------------------------------------------------------
+ * BlenderDisplaySpaceShader.
+ */
+
+BlenderDisplaySpaceShader::BlenderDisplaySpaceShader(BL::RenderEngine &b_engine,
+ BL::Scene &b_scene)
+ : b_engine_(b_engine), b_scene_(b_scene)
+{
+ DCHECK(b_engine_.support_display_space_shader(b_scene_));
+}
+
+void BlenderDisplaySpaceShader::bind(int /*width*/, int /*height*/)
+{
+ b_engine_.bind_display_space_shader(b_scene_);
+}
+
+void BlenderDisplaySpaceShader::unbind()
+{
+ b_engine_.unbind_display_space_shader();
+}
+
+uint BlenderDisplaySpaceShader::get_shader_program()
+{
+ if (!shader_program_) {
+ glGetIntegerv(GL_CURRENT_PROGRAM, reinterpret_cast<int *>(&shader_program_));
+ }
+
+ if (!shader_program_) {
+ LOG(ERROR) << "Error retrieving shader program for display space shader.";
+ }
+
+ return shader_program_;
+}
+
+/* --------------------------------------------------------------------
+ * BlenderGPUDisplay.
+ */
+
+BlenderGPUDisplay::BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene)
+ : b_engine_(b_engine), display_shader_(BlenderDisplayShader::create(b_engine, b_scene))
+{
+ /* Create context while on the main thread. */
+ gl_context_create();
+}
+
+BlenderGPUDisplay::~BlenderGPUDisplay()
+{
+ gl_resources_destroy();
+}
+
+/* --------------------------------------------------------------------
+ * Update procedure.
+ */
+
+bool BlenderGPUDisplay::do_update_begin(const GPUDisplayParams &params,
+ int texture_width,
+ int texture_height)
+{
+ /* Note that it's the responsibility of BlenderGPUDisplay to ensure updating and drawing
+ * the texture does not happen at the same time. This is achieved indirectly.
+ *
+ * When enabling the OpenGL context, it uses an internal mutex lock DST.gl_context_lock.
+ * This same lock is also held when do_draw() is called, which together ensure mutual
+ * exclusion.
+ *
+ * This locking is not performed at the GPU display level, because that would cause lock
+ * inversion. */
+ if (!gl_context_enable()) {
+ return false;
+ }
+
+ if (gl_render_sync_) {
+ glWaitSync((GLsync)gl_render_sync_, 0, GL_TIMEOUT_IGNORED);
+ }
+
+ if (!gl_texture_resources_ensure()) {
+ gl_context_disable();
+ return false;
+ }
+
+ /* Update texture dimensions if needed. */
+ if (texture_.width != texture_width || texture_.height != texture_height) {
+ glActiveTexture(GL_TEXTURE0);
+ glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
+ glTexImage2D(
+ GL_TEXTURE_2D, 0, GL_RGBA16F, texture_width, texture_height, 0, GL_RGBA, GL_HALF_FLOAT, 0);
+ texture_.width = texture_width;
+ texture_.height = texture_height;
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+    /* The texture did change, and no pixel storage was provided. Tag it for an explicit
+     * zeroing out to avoid undefined content. */
+ texture_.need_clear = true;
+ }
+
+  /* Update PBO dimensions if needed.
+   *
+   * NOTE: Allocate the PBO at the size which will fit the final render resolution (as in,
+   * at a resolution divider of 1). This way we don't need to recreate graphics
+   * interoperability objects, which are costly and tied to the specific underlying buffer
+   * size.
+   * The downside of this approach is that when graphics interoperability is not used we are
+   * sending too much data to the GPU when the resolution divider is not 1. */
+ /* TODO(sergey): Investigate whether keeping the PBO exact size of the texture makes non-interop
+ * mode faster. */
+ const int buffer_width = params.full_size.x;
+ const int buffer_height = params.full_size.y;
+ if (texture_.buffer_width != buffer_width || texture_.buffer_height != buffer_height) {
+ const size_t size_in_bytes = sizeof(half4) * buffer_width * buffer_height;
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
+ glBufferData(GL_PIXEL_UNPACK_BUFFER, size_in_bytes, 0, GL_DYNAMIC_DRAW);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+ texture_.buffer_width = buffer_width;
+ texture_.buffer_height = buffer_height;
+ }
+
+ /* New content will be provided to the texture in one way or another, so mark this in a
+ * centralized place. */
+ texture_.need_update = true;
+
+ return true;
+}
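
To make the over-allocation trade-off concrete: half4 packs four 16-bit floats, so each pixel takes 8 bytes and the PBO for a hypothetical 1920x1080 final resolution occupies about 15.8 MiB, regardless of the current resolution divider. A self-contained check of the arithmetic:

    #include <cstddef>
    #include <cstdio>

    int main()
    {
      const size_t bytes_per_pixel = 4 * sizeof(short); /* half4: four 16-bit floats */
      const size_t width = 1920, height = 1080;         /* hypothetical final resolution */
      const size_t pbo_bytes = bytes_per_pixel * width * height;
      std::printf("PBO size: %zu bytes (%.1f MiB)\n", pbo_bytes, pbo_bytes / (1024.0 * 1024.0));
      return 0;
    }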
+
+void BlenderGPUDisplay::do_update_end()
+{
+ gl_upload_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ glFlush();
+
+ gl_context_disable();
+}
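
The gl_upload_sync_ fence created here pairs with the glWaitSync() in do_draw(), and gl_render_sync_ works the other way around, so commands issued on one context are ordered against the other without stalling the CPU. A minimal sketch of the pattern, assuming two OpenGL contexts that share objects:

    /* Producer context, after uploading data: */
    GLsync upload_sync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
    glFlush(); /* ensure the fence command reaches the GPU */

    /* Consumer context, before using the data: */
    glWaitSync(upload_sync, 0, GL_TIMEOUT_IGNORED); /* GPU-side wait, does not block the CPU */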
+
+/* --------------------------------------------------------------------
+ * Texture update from CPU buffer.
+ */
+
+void BlenderGPUDisplay::do_copy_pixels_to_texture(
+ const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height)
+{
+  /* This call copies pixels to a Pixel Buffer Object (PBO), which is much cheaper in CPU time
+   * than copying data directly to the OpenGL texture.
+   *
+   * The possible downside of this approach is that it might require a higher peak memory when
+   * doing partial updates of the texture (although, in practice, even partial updates might
+   * peak with a full-frame buffer stored on the CPU if the GPU is currently occupied). */
+
+ half4 *mapped_rgba_pixels = map_texture_buffer();
+ if (!mapped_rgba_pixels) {
+ return;
+ }
+
+ if (texture_x == 0 && texture_y == 0 && pixels_width == texture_.width &&
+ pixels_height == texture_.height) {
+ const size_t size_in_bytes = sizeof(half4) * texture_.width * texture_.height;
+ memcpy(mapped_rgba_pixels, rgba_pixels, size_in_bytes);
+ }
+ else {
+ const half4 *rgba_row = rgba_pixels;
+ half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_.width + texture_x;
+ for (int y = 0; y < pixels_height;
+ ++y, rgba_row += pixels_width, mapped_rgba_row += texture_.width) {
+ memcpy(mapped_rgba_row, rgba_row, sizeof(half4) * pixels_width);
+ }
+ }
+
+ unmap_texture_buffer();
+}
+
+/* --------------------------------------------------------------------
+ * Texture buffer mapping.
+ */
+
+half4 *BlenderGPUDisplay::do_map_texture_buffer()
+{
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
+
+ half4 *mapped_rgba_pixels = reinterpret_cast<half4 *>(
+ glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY));
+  if (!mapped_rgba_pixels) {
+    LOG(ERROR) << "Error mapping BlenderGPUDisplay pixel buffer object.";
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+    return nullptr;
+  }
+
+ if (texture_.need_clear) {
+ const int64_t texture_width = texture_.width;
+ const int64_t texture_height = texture_.height;
+ memset(reinterpret_cast<void *>(mapped_rgba_pixels),
+ 0,
+ texture_width * texture_height * sizeof(half4));
+ texture_.need_clear = false;
+ }
+
+ return mapped_rgba_pixels;
+}
+
+void BlenderGPUDisplay::do_unmap_texture_buffer()
+{
+ glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+}
+
+/* --------------------------------------------------------------------
+ * Graphics interoperability.
+ */
+
+DeviceGraphicsInteropDestination BlenderGPUDisplay::do_graphics_interop_get()
+{
+ DeviceGraphicsInteropDestination interop_dst;
+
+ interop_dst.buffer_width = texture_.buffer_width;
+ interop_dst.buffer_height = texture_.buffer_height;
+ interop_dst.opengl_pbo_id = texture_.gl_pbo_id;
+
+ interop_dst.need_clear = texture_.need_clear;
+ texture_.need_clear = false;
+
+ return interop_dst;
+}
+
+void BlenderGPUDisplay::graphics_interop_activate()
+{
+ gl_context_enable();
+}
+
+void BlenderGPUDisplay::graphics_interop_deactivate()
+{
+ gl_context_disable();
+}
+
+/* --------------------------------------------------------------------
+ * Drawing.
+ */
+
+void BlenderGPUDisplay::clear()
+{
+ texture_.need_clear = true;
+}
+
+void BlenderGPUDisplay::set_zoom(float zoom_x, float zoom_y)
+{
+ zoom_ = make_float2(zoom_x, zoom_y);
+}
+
+void BlenderGPUDisplay::do_draw(const GPUDisplayParams &params)
+{
+ /* See do_update_begin() for why no locking is required here. */
+ const bool transparent = true; // TODO(sergey): Derive this from Film.
+
+ if (texture_.need_clear) {
+    /* The texture was requested to be cleared and was not yet cleared.
+     * Do an early return, which should be equivalent to drawing an all-zero texture. */
+ return;
+ }
+
+ if (!gl_draw_resources_ensure()) {
+ return;
+ }
+
+ if (use_gl_context_) {
+ gl_context_mutex_.lock();
+ }
+
+ if (gl_upload_sync_) {
+ glWaitSync((GLsync)gl_upload_sync_, 0, GL_TIMEOUT_IGNORED);
+ }
+
+ if (transparent) {
+ glEnable(GL_BLEND);
+ glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
+ }
+
+ display_shader_->bind(params.full_size.x, params.full_size.y);
+
+ glActiveTexture(GL_TEXTURE0);
+ glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
+
+  /* Trick to keep sharp rendering without jagged edges on all GPUs.
+   *
+   * The idea here is to force the driver to use linear interpolation when the image is not
+   * zoomed in.
+   * For a render result with a resolution divider in effect we always use nearest
+   * interpolation.
+   *
+   * Use an explicit MIN filter assignment to make sure the driver does not exhibit undefined
+   * behavior at zoom level 1. The MAG filter is always NEAREST. */
+ const float zoomed_width = params.size.x * zoom_.x;
+ const float zoomed_height = params.size.y * zoom_.y;
+ if (texture_.width != params.size.x || texture_.height != params.size.y) {
+    /* Resolution divider is different from 1, force nearest interpolation. */
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ }
+ else if (zoomed_width - params.size.x > 0.5f || zoomed_height - params.size.y > 0.5f) {
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ }
+ else {
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ }
+
+ glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer_);
+
+ texture_update_if_needed();
+ vertex_buffer_update(params);
+
+ /* TODO(sergey): Does it make sense/possible to cache/reuse the VAO? */
+ GLuint vertex_array_object;
+ glGenVertexArrays(1, &vertex_array_object);
+ glBindVertexArray(vertex_array_object);
+
+ const int texcoord_attribute = display_shader_->get_tex_coord_attrib_location();
+ const int position_attribute = display_shader_->get_position_attrib_location();
+
+ glEnableVertexAttribArray(texcoord_attribute);
+ glEnableVertexAttribArray(position_attribute);
+
+ glVertexAttribPointer(
+ texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
+ glVertexAttribPointer(position_attribute,
+ 2,
+ GL_FLOAT,
+ GL_FALSE,
+ 4 * sizeof(float),
+ (const GLvoid *)(sizeof(float) * 2));
+
+ glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+ glBindBuffer(GL_ARRAY_BUFFER, 0);
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ glDeleteVertexArrays(1, &vertex_array_object);
+
+ display_shader_->unbind();
+
+ if (transparent) {
+ glDisable(GL_BLEND);
+ }
+
+ gl_render_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+ glFlush();
+
+ if (use_gl_context_) {
+ gl_context_mutex_.unlock();
+ }
+}
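
The filter selection above can be read as a small pure function; a hypothetical distillation (names are illustrative only):

    /* Nearest filtering when a resolution divider is in effect or when zoomed in past ~1:1,
     * linear otherwise; mirrors the branches in do_draw(). */
    static GLint choose_min_filter(int tex_w, int tex_h, int view_w, int view_h, float zoom_x, float zoom_y)
    {
      if (tex_w != view_w || tex_h != view_h) {
        return GL_NEAREST; /* Resolution divider is different from 1. */
      }
      if (float(view_w) * zoom_x - float(view_w) > 0.5f || float(view_h) * zoom_y - float(view_h) > 0.5f) {
        return GL_NEAREST; /* Zoomed in: keep pixels sharp. */
      }
      return GL_LINEAR;
    }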
+
+void BlenderGPUDisplay::gl_context_create()
+{
+  /* When rendering in a viewport there is no render context available via the engine.
+   * Check whether our own context is to be created here.
+   *
+   * NOTE: If the `b_engine_`'s context is not available, we are expected to be on the main
+   * thread here. */
+ use_gl_context_ = !RE_engine_has_render_context(
+ reinterpret_cast<RenderEngine *>(b_engine_.ptr.data));
+
+ if (use_gl_context_) {
+ const bool drw_state = DRW_opengl_context_release();
+ gl_context_ = WM_opengl_context_create();
+ if (gl_context_) {
+      /* On Windows an old context is restored after creation, and a subsequent release of the
+       * context generates a Win32 error. This is harmless for users, but it is annoying to
+       * have possibly misleading error prints in the console. */
+#ifndef _WIN32
+ WM_opengl_context_release(gl_context_);
+#endif
+ }
+ else {
+ LOG(ERROR) << "Error creating OpenGL context.";
+ }
+
+ DRW_opengl_context_activate(drw_state);
+ }
+}
+
+bool BlenderGPUDisplay::gl_context_enable()
+{
+ if (use_gl_context_) {
+ if (!gl_context_) {
+ return false;
+ }
+ gl_context_mutex_.lock();
+ WM_opengl_context_activate(gl_context_);
+ return true;
+ }
+
+ RE_engine_render_context_enable(reinterpret_cast<RenderEngine *>(b_engine_.ptr.data));
+ return true;
+}
+
+void BlenderGPUDisplay::gl_context_disable()
+{
+ if (use_gl_context_) {
+ if (gl_context_) {
+ WM_opengl_context_release(gl_context_);
+ gl_context_mutex_.unlock();
+ }
+ return;
+ }
+
+ RE_engine_render_context_disable(reinterpret_cast<RenderEngine *>(b_engine_.ptr.data));
+}
+
+void BlenderGPUDisplay::gl_context_dispose()
+{
+ if (gl_context_) {
+ const bool drw_state = DRW_opengl_context_release();
+
+ WM_opengl_context_activate(gl_context_);
+ WM_opengl_context_dispose(gl_context_);
+
+ DRW_opengl_context_activate(drw_state);
+ }
+}
+
+bool BlenderGPUDisplay::gl_draw_resources_ensure()
+{
+ if (!texture_.gl_id) {
+    /* If there is no texture allocated, there is nothing to draw. Inform the draw call that
+     * it can not continue. Note that this is not an unrecoverable error, so once the texture
+     * is known we will come back here and create all the GPU resources needed for drawing. */
+ return false;
+ }
+
+ if (gl_draw_resource_creation_attempted_) {
+ return gl_draw_resources_created_;
+ }
+ gl_draw_resource_creation_attempted_ = true;
+
+ if (!vertex_buffer_) {
+ glGenBuffers(1, &vertex_buffer_);
+ if (!vertex_buffer_) {
+ LOG(ERROR) << "Error creating vertex buffer.";
+ return false;
+ }
+ }
+
+ gl_draw_resources_created_ = true;
+
+ return true;
+}
+
+void BlenderGPUDisplay::gl_resources_destroy()
+{
+ gl_context_enable();
+
+ if (vertex_buffer_ != 0) {
+ glDeleteBuffers(1, &vertex_buffer_);
+ }
+
+ if (texture_.gl_pbo_id) {
+ glDeleteBuffers(1, &texture_.gl_pbo_id);
+ texture_.gl_pbo_id = 0;
+ }
+
+ if (texture_.gl_id) {
+ glDeleteTextures(1, &texture_.gl_id);
+ texture_.gl_id = 0;
+ }
+
+ gl_context_disable();
+
+ gl_context_dispose();
+}
+
+bool BlenderGPUDisplay::gl_texture_resources_ensure()
+{
+ if (texture_.creation_attempted) {
+ return texture_.is_created;
+ }
+ texture_.creation_attempted = true;
+
+ DCHECK(!texture_.gl_id);
+ DCHECK(!texture_.gl_pbo_id);
+
+ /* Create texture. */
+ glGenTextures(1, &texture_.gl_id);
+ if (!texture_.gl_id) {
+ LOG(ERROR) << "Error creating texture.";
+ return false;
+ }
+
+ /* Configure the texture. */
+ glActiveTexture(GL_TEXTURE0);
+ glBindTexture(GL_TEXTURE_2D, texture_.gl_id);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ /* Create PBO for the texture. */
+ glGenBuffers(1, &texture_.gl_pbo_id);
+ if (!texture_.gl_pbo_id) {
+ LOG(ERROR) << "Error creating texture pixel buffer object.";
+ return false;
+ }
+
+  /* Creation finished successfully. */
+ texture_.is_created = true;
+
+ return true;
+}
+
+void BlenderGPUDisplay::texture_update_if_needed()
+{
+ if (!texture_.need_update) {
+ return;
+ }
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id);
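+  /* With a buffer bound to GL_PIXEL_UNPACK_BUFFER, the data argument of glTexSubImage2D()
+   * below is interpreted as a byte offset into the PBO, not a client memory pointer. */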
+ glTexSubImage2D(
+ GL_TEXTURE_2D, 0, 0, 0, texture_.width, texture_.height, GL_RGBA, GL_HALF_FLOAT, 0);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+ texture_.need_update = false;
+}
+
+void BlenderGPUDisplay::vertex_buffer_update(const GPUDisplayParams &params)
+{
+ /* Invalidate old contents - avoids stalling if the buffer is still waiting in queue to be
+ * rendered. */
+ glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
+
+ float *vpointer = reinterpret_cast<float *>(glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY));
+ if (!vpointer) {
+ return;
+ }
+
+ vpointer[0] = 0.0f;
+ vpointer[1] = 0.0f;
+ vpointer[2] = params.offset.x;
+ vpointer[3] = params.offset.y;
+
+ vpointer[4] = 1.0f;
+ vpointer[5] = 0.0f;
+ vpointer[6] = (float)params.size.x + params.offset.x;
+ vpointer[7] = params.offset.y;
+
+ vpointer[8] = 1.0f;
+ vpointer[9] = 1.0f;
+ vpointer[10] = (float)params.size.x + params.offset.x;
+ vpointer[11] = (float)params.size.y + params.offset.y;
+
+ vpointer[12] = 0.0f;
+ vpointer[13] = 1.0f;
+ vpointer[14] = params.offset.x;
+ vpointer[15] = (float)params.size.y + params.offset.y;
+
+ glUnmapBuffer(GL_ARRAY_BUFFER);
+}
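
The 16 floats written above form four interleaved vertices of a full quad; a hypothetical struct view of the same layout, matching the stride and offsets passed to glVertexAttribPointer() in do_draw():

    struct DisplayVertex {
      float texcoord[2]; /* byte offset 0 */
      float position[2]; /* byte offset 2 * sizeof(float) */
    };
    static_assert(sizeof(DisplayVertex) == 4 * sizeof(float), "stride must be 16 bytes");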
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_gpu_display.h b/intern/cycles/blender/blender_gpu_display.h
new file mode 100644
index 00000000000..89420567037
--- /dev/null
+++ b/intern/cycles/blender/blender_gpu_display.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <atomic>
+
+#include "MEM_guardedalloc.h"
+
+#include "RNA_blender_cpp.h"
+
+#include "render/gpu_display.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Base class of shader used for GPU display rendering. */
+class BlenderDisplayShader {
+ public:
+ static constexpr const char *position_attribute_name = "pos";
+ static constexpr const char *tex_coord_attribute_name = "texCoord";
+
+ /* Create shader implementation suitable for the given render engine and scene configuration. */
+ static unique_ptr<BlenderDisplayShader> create(BL::RenderEngine &b_engine, BL::Scene &b_scene);
+
+ BlenderDisplayShader() = default;
+ virtual ~BlenderDisplayShader() = default;
+
+ virtual void bind(int width, int height) = 0;
+ virtual void unbind() = 0;
+
+ /* Get attribute location for position and texture coordinate respectively.
+ * NOTE: The shader needs to be bound to have access to those. */
+ virtual int get_position_attrib_location();
+ virtual int get_tex_coord_attrib_location();
+
+ protected:
+ /* Get program of this display shader.
+ * NOTE: The shader needs to be bound to have access to this. */
+ virtual uint get_shader_program() = 0;
+
+ /* Cached values of various OpenGL resources. */
+ int position_attribute_location_ = -1;
+ int tex_coord_attribute_location_ = -1;
+};
+
+/* Implementation of the display rendering shader used when the render engine does not support
+ * the display space shader. */
+class BlenderFallbackDisplayShader : public BlenderDisplayShader {
+ public:
+ virtual void bind(int width, int height) override;
+ virtual void unbind() override;
+
+ protected:
+ virtual uint get_shader_program() override;
+
+ void create_shader_if_needed();
+ void destroy_shader();
+
+ uint shader_program_ = 0;
+ int image_texture_location_ = -1;
+ int fullscreen_location_ = -1;
+
+  /* Whether shader compilation was attempted. If true and the shader program is 0, then
+   * compilation or linking has failed. Do not attempt to re-compile the shader in that case. */
+};
+
+class BlenderDisplaySpaceShader : public BlenderDisplayShader {
+ public:
+ BlenderDisplaySpaceShader(BL::RenderEngine &b_engine, BL::Scene &b_scene);
+
+ virtual void bind(int width, int height) override;
+ virtual void unbind() override;
+
+ protected:
+ virtual uint get_shader_program() override;
+
+ BL::RenderEngine b_engine_;
+ BL::Scene &b_scene_;
+
+ /* Cached values of various OpenGL resources. */
+ uint shader_program_ = 0;
+};
+
+/* GPU display implementation which is specific for Blender viewport integration. */
+class BlenderGPUDisplay : public GPUDisplay {
+ public:
+ BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene);
+ ~BlenderGPUDisplay();
+
+ virtual void graphics_interop_activate() override;
+ virtual void graphics_interop_deactivate() override;
+
+ virtual void clear() override;
+
+ void set_zoom(float zoom_x, float zoom_y);
+
+ protected:
+ virtual bool do_update_begin(const GPUDisplayParams &params,
+ int texture_width,
+ int texture_height) override;
+ virtual void do_update_end() override;
+
+ virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels,
+ int texture_x,
+ int texture_y,
+ int pixels_width,
+ int pixels_height) override;
+ virtual void do_draw(const GPUDisplayParams &params) override;
+
+ virtual half4 *do_map_texture_buffer() override;
+ virtual void do_unmap_texture_buffer() override;
+
+ virtual DeviceGraphicsInteropDestination do_graphics_interop_get() override;
+
+  /* Helper function which allocates a new GPU context. */
+ void gl_context_create();
+ bool gl_context_enable();
+ void gl_context_disable();
+ void gl_context_dispose();
+
+ /* Make sure texture is allocated and its initial configuration is performed. */
+ bool gl_texture_resources_ensure();
+
+ /* Ensure all runtime GPU resources needed for drawing are allocated.
+ * Returns true if all resources needed for drawing are available. */
+ bool gl_draw_resources_ensure();
+
+ /* Destroy all GPU resources which are being used by this object. */
+ void gl_resources_destroy();
+
+ /* Update GPU texture dimensions and content if needed (new pixel data was provided).
+ *
+ * NOTE: The texture needs to be bound. */
+ void texture_update_if_needed();
+
+  /* Update the vertex buffer with new coordinates of vertex positions and texture coordinates.
+   * This buffer is used to render the texture in the viewport.
+   *
+   * NOTE: The buffer needs to be bound. */
+ void vertex_buffer_update(const GPUDisplayParams &params);
+
+ BL::RenderEngine b_engine_;
+
+  /* OpenGL context which is used when the render engine doesn't have its own. */
+ void *gl_context_ = nullptr;
+  /* True when the Blender RenderEngine side context is not available and the GPUDisplay is to
+   * create its own context. */
+ bool use_gl_context_ = false;
+ /* Mutex used to guard the `gl_context_`. */
+ thread_mutex gl_context_mutex_;
+
+ /* Texture which contains pixels of the render result. */
+ struct {
+    /* Indicates whether texture creation was attempted and whether it succeeded.
+     * Used to avoid repeated texture creation attempts in case of GPU issues or GPU context
+     * misconfiguration. */
+ bool creation_attempted = false;
+ bool is_created = false;
+
+ /* OpenGL resource IDs of the texture itself and Pixel Buffer Object (PBO) used to write
+ * pixels to it.
+ *
+ * NOTE: Allocated on the engine's context. */
+ uint gl_id = 0;
+ uint gl_pbo_id = 0;
+
+    /* True when new data was written to the PBO, meaning the texture might need to be resized
+     * and new data uploaded to the GPU. */
+ bool need_update = false;
+
+ /* Content of the texture is to be filled with zeroes. */
+ std::atomic<bool> need_clear = true;
+
+ /* Dimensions of the texture in pixels. */
+ int width = 0;
+ int height = 0;
+
+ /* Dimensions of the underlying PBO. */
+ int buffer_width = 0;
+ int buffer_height = 0;
+ } texture_;
+
+ unique_ptr<BlenderDisplayShader> display_shader_;
+
+  /* Tracks whether creation of GPU draw resources was attempted, to avoid re-creation
+   * attempts on every redraw after a failure. */
+ bool gl_draw_resource_creation_attempted_ = false;
+ bool gl_draw_resources_created_ = false;
+
+  /* Vertex buffer which holds the vertices of a triangle fan which is textured with the
+   * texture holding the render result. */
+ uint vertex_buffer_ = 0;
+
+ void *gl_render_sync_ = nullptr;
+ void *gl_upload_sync_ = nullptr;
+
+ float2 zoom_ = make_float2(1.0f, 1.0f);
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_light.cpp b/intern/cycles/blender/blender_light.cpp
index 542028f4b2f..4df1e720dde 100644
--- a/intern/cycles/blender/blender_light.cpp
+++ b/intern/cycles/blender/blender_light.cpp
@@ -125,17 +125,10 @@ void BlenderSync::sync_light(BL::Object &b_parent,
light->set_shader(static_cast<Shader *>(used_shaders[0]));
/* shadow */
- PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
PointerRNA clight = RNA_pointer_get(&b_light.ptr, "cycles");
light->set_cast_shadow(get_boolean(clight, "cast_shadow"));
light->set_use_mis(get_boolean(clight, "use_multiple_importance_sampling"));
- int samples = get_int(clight, "samples");
- if (get_boolean(cscene, "use_square_samples"))
- light->set_samples(samples * samples);
- else
- light->set_samples(samples);
-
light->set_max_bounces(get_int(clight, "max_bounces"));
if (b_ob_info.real_object != b_ob_info.iter_object) {
@@ -155,10 +148,12 @@ void BlenderSync::sync_light(BL::Object &b_parent,
/* visibility */
uint visibility = object_ray_visibility(b_ob_info.real_object);
+ light->set_use_camera((visibility & PATH_RAY_CAMERA) != 0);
light->set_use_diffuse((visibility & PATH_RAY_DIFFUSE) != 0);
light->set_use_glossy((visibility & PATH_RAY_GLOSSY) != 0);
light->set_use_transmission((visibility & PATH_RAY_TRANSMIT) != 0);
light->set_use_scatter((visibility & PATH_RAY_VOLUME_SCATTER) != 0);
+ light->set_is_shadow_catcher(b_ob_info.real_object.is_shadow_catcher());
/* tag */
light->tag_update(scene);
@@ -169,7 +164,6 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal)
BL::World b_world = b_scene.world();
if (b_world) {
- PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
PointerRNA cworld = RNA_pointer_get(&b_world.ptr, "cycles");
enum SamplingMethod { SAMPLING_NONE = 0, SAMPLING_AUTOMATIC, SAMPLING_MANUAL, SAMPLING_NUM };
@@ -197,12 +191,6 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal)
/* force enable light again when world is resynced */
light->set_is_enabled(true);
- int samples = get_int(cworld, "samples");
- if (get_boolean(cscene, "use_square_samples"))
- light->set_samples(samples * samples);
- else
- light->set_samples(samples);
-
light->tag_update(scene);
light_map.set_recalc(b_world);
}
@@ -211,7 +199,7 @@ void BlenderSync::sync_background_light(BL::SpaceView3D &b_v3d, bool use_portal)
world_map = b_world.ptr.data;
world_recalc = false;
- viewport_parameters = BlenderViewportParameters(b_v3d);
+ viewport_parameters = BlenderViewportParameters(b_v3d, use_developer_ui);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_object.cpp b/intern/cycles/blender/blender_object.cpp
index 22d6edeb099..95da4a2df84 100644
--- a/intern/cycles/blender/blender_object.cpp
+++ b/intern/cycles/blender/blender_object.cpp
@@ -568,7 +568,7 @@ void BlenderSync::sync_objects(BL::Depsgraph &b_depsgraph,
/* object loop */
bool cancel = false;
bool use_portal = false;
- const bool show_lights = BlenderViewportParameters(b_v3d).use_scene_lights;
+ const bool show_lights = BlenderViewportParameters(b_v3d, use_developer_ui).use_scene_lights;
BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
BL::Depsgraph::object_instances_iterator b_instance_iter;
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 6e06b6a468f..694d8454422 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -45,10 +45,6 @@
# include <OSL/oslquery.h>
#endif
-#ifdef WITH_OPENCL
-# include "device/device_intern.h"
-#endif
-
CCL_NAMESPACE_BEGIN
namespace {
@@ -72,12 +68,10 @@ PyObject *pyunicode_from_string(const char *str)
/* Synchronize debug flags from a given Blender scene.
* Return truth when device list needs invalidation.
*/
-bool debug_flags_sync_from_scene(BL::Scene b_scene)
+static void debug_flags_sync_from_scene(BL::Scene b_scene)
{
DebugFlagsRef flags = DebugFlags();
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
- /* Backup some settings for comparison. */
- DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type;
/* Synchronize shared flags. */
flags.viewport_static_bvh = get_enum(cscene, "debug_bvh_type");
/* Synchronize CPU flags. */
@@ -87,50 +81,19 @@ bool debug_flags_sync_from_scene(BL::Scene b_scene)
flags.cpu.sse3 = get_boolean(cscene, "debug_use_cpu_sse3");
flags.cpu.sse2 = get_boolean(cscene, "debug_use_cpu_sse2");
flags.cpu.bvh_layout = (BVHLayout)get_enum(cscene, "debug_bvh_layout");
- flags.cpu.split_kernel = get_boolean(cscene, "debug_use_cpu_split_kernel");
/* Synchronize CUDA flags. */
flags.cuda.adaptive_compile = get_boolean(cscene, "debug_use_cuda_adaptive_compile");
- flags.cuda.split_kernel = get_boolean(cscene, "debug_use_cuda_split_kernel");
/* Synchronize OptiX flags. */
- flags.optix.cuda_streams = get_int(cscene, "debug_optix_cuda_streams");
- flags.optix.curves_api = get_boolean(cscene, "debug_optix_curves_api");
- /* Synchronize OpenCL device type. */
- switch (get_enum(cscene, "debug_opencl_device_type")) {
- case 0:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_NONE;
- break;
- case 1:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_ALL;
- break;
- case 2:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_DEFAULT;
- break;
- case 3:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_CPU;
- break;
- case 4:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_GPU;
- break;
- case 5:
- flags.opencl.device_type = DebugFlags::OpenCL::DEVICE_ACCELERATOR;
- break;
- }
- /* Synchronize other OpenCL flags. */
- flags.opencl.debug = get_boolean(cscene, "debug_use_opencl_debug");
- flags.opencl.mem_limit = ((size_t)get_int(cscene, "debug_opencl_mem_limit")) * 1024 * 1024;
- return flags.opencl.device_type != opencl_device_type;
+ flags.optix.use_debug = get_boolean(cscene, "debug_use_optix_debug");
}
/* Reset debug flags to default values.
* Return truth when device list needs invalidation.
*/
-bool debug_flags_reset()
+static void debug_flags_reset()
{
DebugFlagsRef flags = DebugFlags();
- /* Backup some settings for comparison. */
- DebugFlags::OpenCL::DeviceType opencl_device_type = flags.opencl.device_type;
flags.reset();
- return flags.opencl.device_type != opencl_device_type;
}
} /* namespace */
@@ -175,18 +138,20 @@ static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)
static PyObject *init_func(PyObject * /*self*/, PyObject *args)
{
- PyObject *path, *user_path;
+ PyObject *path, *user_path, *temp_path;
int headless;
- if (!PyArg_ParseTuple(args, "OOi", &path, &user_path, &headless)) {
- return NULL;
+ if (!PyArg_ParseTuple(args, "OOOi", &path, &user_path, &temp_path, &headless)) {
+ return nullptr;
}
- PyObject *path_coerce = NULL, *user_path_coerce = NULL;
+ PyObject *path_coerce = nullptr, *user_path_coerce = nullptr, *temp_path_coerce = nullptr;
path_init(PyC_UnicodeAsByte(path, &path_coerce),
- PyC_UnicodeAsByte(user_path, &user_path_coerce));
+ PyC_UnicodeAsByte(user_path, &user_path_coerce),
+ PyC_UnicodeAsByte(temp_path, &temp_path_coerce));
Py_XDECREF(path_coerce);
Py_XDECREF(user_path_coerce);
+ Py_XDECREF(temp_path_coerce);
BlenderSession::headless = headless;
@@ -299,6 +264,50 @@ static PyObject *render_func(PyObject * /*self*/, PyObject *args)
Py_RETURN_NONE;
}
+static PyObject *render_frame_finish_func(PyObject * /*self*/, PyObject *args)
+{
+ PyObject *pysession;
+
+ if (!PyArg_ParseTuple(args, "O", &pysession)) {
+ return nullptr;
+ }
+
+ BlenderSession *session = (BlenderSession *)PyLong_AsVoidPtr(pysession);
+
+ /* Allow Blender to execute other Python scripts. */
+ python_thread_state_save(&session->python_thread_state);
+
+ session->render_frame_finish();
+
+ python_thread_state_restore(&session->python_thread_state);
+
+ Py_RETURN_NONE;
+}
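
The save/restore pair above follows CPython's standard pattern for releasing the GIL around long-running native work; a minimal sketch, assuming python_thread_state_save()/python_thread_state_restore() wrap PyEval_SaveThread()/PyEval_RestoreThread():

    #include <Python.h>

    static void long_running_native_work() { /* e.g. session->render_frame_finish() */ }

    static PyObject *example_func(PyObject * /*self*/, PyObject * /*args*/)
    {
      PyThreadState *state = PyEval_SaveThread(); /* release the GIL */
      long_running_native_work();                 /* other Python threads may run meanwhile */
      PyEval_RestoreThread(state);                /* re-acquire the GIL before returning */
      Py_RETURN_NONE;
    }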
+
+static PyObject *draw_func(PyObject * /*self*/, PyObject *args)
+{
+ PyObject *py_session, *py_graph, *py_screen, *py_space_image;
+
+ if (!PyArg_ParseTuple(args, "OOOO", &py_session, &py_graph, &py_screen, &py_space_image)) {
+ return nullptr;
+ }
+
+ BlenderSession *session = (BlenderSession *)PyLong_AsVoidPtr(py_session);
+
+ ID *b_screen = (ID *)PyLong_AsVoidPtr(py_screen);
+
+ PointerRNA b_space_image_ptr;
+ RNA_pointer_create(b_screen,
+ &RNA_SpaceImageEditor,
+ pylong_as_voidptr_typesafe(py_space_image),
+ &b_space_image_ptr);
+ BL::SpaceImageEditor b_space_image(b_space_image_ptr);
+
+ session->draw(b_space_image);
+
+ Py_RETURN_NONE;
+}
+
/* pixel_array and result passed as pointers */
static PyObject *bake_func(PyObject * /*self*/, PyObject *args)
{
@@ -336,7 +345,7 @@ static PyObject *bake_func(PyObject * /*self*/, PyObject *args)
Py_RETURN_NONE;
}
-static PyObject *draw_func(PyObject * /*self*/, PyObject *args)
+static PyObject *view_draw_func(PyObject * /*self*/, PyObject *args)
{
PyObject *pysession, *pygraph, *pyv3d, *pyrv3d;
@@ -350,7 +359,7 @@ static PyObject *draw_func(PyObject * /*self*/, PyObject *args)
int viewport[4];
glGetIntegerv(GL_VIEWPORT, viewport);
- session->draw(viewport[2], viewport[3]);
+ session->view_draw(viewport[2], viewport[3]);
}
Py_RETURN_NONE;
@@ -697,40 +706,6 @@ static PyObject *system_info_func(PyObject * /*self*/, PyObject * /*value*/)
return pyunicode_from_string(system_info.c_str());
}
-#ifdef WITH_OPENCL
-static PyObject *opencl_disable_func(PyObject * /*self*/, PyObject * /*value*/)
-{
- VLOG(2) << "Disabling OpenCL platform.";
- DebugFlags().opencl.device_type = DebugFlags::OpenCL::DEVICE_NONE;
- Py_RETURN_NONE;
-}
-
-static PyObject *opencl_compile_func(PyObject * /*self*/, PyObject *args)
-{
- PyObject *sequence = PySequence_Fast(args, "Arguments must be a sequence");
- if (sequence == NULL) {
- Py_RETURN_FALSE;
- }
-
- vector<string> parameters;
- for (Py_ssize_t i = 0; i < PySequence_Fast_GET_SIZE(sequence); i++) {
- PyObject *item = PySequence_Fast_GET_ITEM(sequence, i);
- PyObject *item_as_string = PyObject_Str(item);
- const char *parameter_string = PyUnicode_AsUTF8(item_as_string);
- parameters.push_back(parameter_string);
- Py_DECREF(item_as_string);
- }
- Py_DECREF(sequence);
-
- if (device_opencl_compile_kernel(parameters)) {
- Py_RETURN_TRUE;
- }
- else {
- Py_RETURN_FALSE;
- }
-}
-#endif
-
static bool image_parse_filepaths(PyObject *pyfilepaths, vector<string> &filepaths)
{
if (PyUnicode_Check(pyfilepaths)) {
@@ -762,6 +737,10 @@ static bool image_parse_filepaths(PyObject *pyfilepaths, vector<string> &filepat
static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *keywords)
{
+#if 1
+ (void)args;
+ (void)keywords;
+#else
static const char *keyword_list[] = {
"preferences", "scene", "view_layer", "input", "output", "tile_size", "samples", NULL};
PyObject *pypreferences, *pyscene, *pyviewlayer;
@@ -835,7 +814,7 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
}
/* Create denoiser. */
- Denoiser denoiser(device);
+ DenoiserPipeline denoiser(device);
denoiser.params = params;
denoiser.input = input;
denoiser.output = output;
@@ -852,6 +831,7 @@ static PyObject *denoise_func(PyObject * /*self*/, PyObject *args, PyObject *key
PyErr_SetString(PyExc_ValueError, denoiser.error.c_str());
return NULL;
}
+#endif
Py_RETURN_NONE;
}
@@ -903,10 +883,7 @@ static PyObject *debug_flags_update_func(PyObject * /*self*/, PyObject *args)
RNA_id_pointer_create((ID *)PyLong_AsVoidPtr(pyscene), &sceneptr);
BL::Scene b_scene(sceneptr);
- if (debug_flags_sync_from_scene(b_scene)) {
- VLOG(2) << "Tagging device list for update.";
- Device::tag_update();
- }
+ debug_flags_sync_from_scene(b_scene);
VLOG(2) << "Debug flags set to:\n" << DebugFlags();
@@ -917,10 +894,7 @@ static PyObject *debug_flags_update_func(PyObject * /*self*/, PyObject *args)
static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/)
{
- if (debug_flags_reset()) {
- VLOG(2) << "Tagging device list for update.";
- Device::tag_update();
- }
+ debug_flags_reset();
if (debug_flags_set) {
VLOG(2) << "Debug flags reset to:\n" << DebugFlags();
debug_flags_set = false;
@@ -928,84 +902,6 @@ static PyObject *debug_flags_reset_func(PyObject * /*self*/, PyObject * /*args*/
Py_RETURN_NONE;
}
-static PyObject *set_resumable_chunk_func(PyObject * /*self*/, PyObject *args)
-{
- int num_resumable_chunks, current_resumable_chunk;
- if (!PyArg_ParseTuple(args, "ii", &num_resumable_chunks, &current_resumable_chunk)) {
- Py_RETURN_NONE;
- }
-
- if (num_resumable_chunks <= 0) {
- fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n");
- abort();
- Py_RETURN_NONE;
- }
- if (current_resumable_chunk < 1 || current_resumable_chunk > num_resumable_chunks) {
- fprintf(stderr, "Cycles: Bad value for current resumable chunk number.\n");
- abort();
- Py_RETURN_NONE;
- }
-
- VLOG(1) << "Initialized resumable render: "
- << "num_resumable_chunks=" << num_resumable_chunks << ", "
- << "current_resumable_chunk=" << current_resumable_chunk;
- BlenderSession::num_resumable_chunks = num_resumable_chunks;
- BlenderSession::current_resumable_chunk = current_resumable_chunk;
-
- printf("Cycles: Will render chunk %d of %d\n", current_resumable_chunk, num_resumable_chunks);
-
- Py_RETURN_NONE;
-}
-
-static PyObject *set_resumable_chunk_range_func(PyObject * /*self*/, PyObject *args)
-{
- int num_chunks, start_chunk, end_chunk;
- if (!PyArg_ParseTuple(args, "iii", &num_chunks, &start_chunk, &end_chunk)) {
- Py_RETURN_NONE;
- }
-
- if (num_chunks <= 0) {
- fprintf(stderr, "Cycles: Bad value for number of resumable chunks.\n");
- abort();
- Py_RETURN_NONE;
- }
- if (start_chunk < 1 || start_chunk > num_chunks) {
- fprintf(stderr, "Cycles: Bad value for start chunk number.\n");
- abort();
- Py_RETURN_NONE;
- }
- if (end_chunk < 1 || end_chunk > num_chunks) {
- fprintf(stderr, "Cycles: Bad value for start chunk number.\n");
- abort();
- Py_RETURN_NONE;
- }
- if (start_chunk > end_chunk) {
- fprintf(stderr, "Cycles: End chunk should be higher than start one.\n");
- abort();
- Py_RETURN_NONE;
- }
-
- VLOG(1) << "Initialized resumable render: "
- << "num_resumable_chunks=" << num_chunks << ", "
- << "start_resumable_chunk=" << start_chunk << "end_resumable_chunk=" << end_chunk;
- BlenderSession::num_resumable_chunks = num_chunks;
- BlenderSession::start_resumable_chunk = start_chunk;
- BlenderSession::end_resumable_chunk = end_chunk;
-
- printf("Cycles: Will render chunks %d to %d of %d\n", start_chunk, end_chunk, num_chunks);
-
- Py_RETURN_NONE;
-}
-
-static PyObject *clear_resumable_chunk_func(PyObject * /*self*/, PyObject * /*value*/)
-{
- VLOG(1) << "Clear resumable render";
- BlenderSession::num_resumable_chunks = 0;
- BlenderSession::current_resumable_chunk = 0;
-
- Py_RETURN_NONE;
-}
-
static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*/)
{
BlenderSession::print_render_stats = true;
@@ -1015,16 +911,14 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
{
vector<DeviceType> device_types = Device::available_types();
- bool has_cuda = false, has_optix = false, has_opencl = false;
+ bool has_cuda = false, has_optix = false;
foreach (DeviceType device_type, device_types) {
has_cuda |= (device_type == DEVICE_CUDA);
has_optix |= (device_type == DEVICE_OPTIX);
- has_opencl |= (device_type == DEVICE_OPENCL);
}
- PyObject *list = PyTuple_New(3);
+ PyObject *list = PyTuple_New(2);
PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
- PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_opencl));
return list;
}
@@ -1044,9 +938,6 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg)
if (override == "CPU") {
BlenderSession::device_override = DEVICE_MASK_CPU;
}
- else if (override == "OPENCL") {
- BlenderSession::device_override = DEVICE_MASK_OPENCL;
- }
else if (override == "CUDA") {
BlenderSession::device_override = DEVICE_MASK_CUDA;
}
@@ -1072,8 +963,10 @@ static PyMethodDef methods[] = {
{"create", create_func, METH_VARARGS, ""},
{"free", free_func, METH_O, ""},
{"render", render_func, METH_VARARGS, ""},
- {"bake", bake_func, METH_VARARGS, ""},
+ {"render_frame_finish", render_frame_finish_func, METH_VARARGS, ""},
{"draw", draw_func, METH_VARARGS, ""},
+ {"bake", bake_func, METH_VARARGS, ""},
+ {"view_draw", view_draw_func, METH_VARARGS, ""},
{"sync", sync_func, METH_VARARGS, ""},
{"reset", reset_func, METH_VARARGS, ""},
#ifdef WITH_OSL
@@ -1082,10 +975,6 @@ static PyMethodDef methods[] = {
#endif
{"available_devices", available_devices_func, METH_VARARGS, ""},
{"system_info", system_info_func, METH_NOARGS, ""},
-#ifdef WITH_OPENCL
- {"opencl_disable", opencl_disable_func, METH_NOARGS, ""},
- {"opencl_compile", opencl_compile_func, METH_VARARGS, ""},
-#endif
/* Standalone denoising */
{"denoise", (PyCFunction)denoise_func, METH_VARARGS | METH_KEYWORDS, ""},
@@ -1098,11 +987,6 @@ static PyMethodDef methods[] = {
/* Statistics. */
{"enable_print_stats", enable_print_stats_func, METH_NOARGS, ""},
- /* Resumable render */
- {"set_resumable_chunk", set_resumable_chunk_func, METH_VARARGS, ""},
- {"set_resumable_chunk_range", set_resumable_chunk_range_func, METH_VARARGS, ""},
- {"clear_resumable_chunk", clear_resumable_chunk_func, METH_NOARGS, ""},
-
/* Compute Device selection */
{"get_device_types", get_device_types_func, METH_VARARGS, ""},
{"set_device_override", set_device_override_func, METH_O, ""},
@@ -1153,14 +1037,6 @@ void *CCL_python_module_init()
PyModule_AddStringConstant(mod, "osl_version_string", "unknown");
#endif
-#ifdef WITH_NETWORK
- PyModule_AddObject(mod, "with_network", Py_True);
- Py_INCREF(Py_True);
-#else /* WITH_NETWORK */
- PyModule_AddObject(mod, "with_network", Py_False);
- Py_INCREF(Py_False);
-#endif /* WITH_NETWORK */
-
#ifdef WITH_EMBREE
PyModule_AddObject(mod, "with_embree", Py_True);
Py_INCREF(Py_True);
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 29de886e4ff..d65d89a7ddd 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -38,9 +38,11 @@
#include "util/util_hash.h"
#include "util/util_logging.h"
#include "util/util_murmurhash.h"
+#include "util/util_path.h"
#include "util/util_progress.h"
#include "util/util_time.h"
+#include "blender/blender_gpu_display.h"
#include "blender/blender_session.h"
#include "blender/blender_sync.h"
#include "blender/blender_util.h"
@@ -49,10 +51,6 @@ CCL_NAMESPACE_BEGIN
DeviceTypeMask BlenderSession::device_override = DEVICE_MASK_ALL;
bool BlenderSession::headless = false;
-int BlenderSession::num_resumable_chunks = 0;
-int BlenderSession::current_resumable_chunk = 0;
-int BlenderSession::start_resumable_chunk = 0;
-int BlenderSession::end_resumable_chunk = 0;
bool BlenderSession::print_render_stats = false;
BlenderSession::BlenderSession(BL::RenderEngine &b_engine,
@@ -103,7 +101,9 @@ BlenderSession::BlenderSession(BL::RenderEngine &b_engine,
width(width),
height(height),
preview_osl(false),
- python_thread_state(NULL)
+ python_thread_state(NULL),
+ use_developer_ui(b_userpref.experimental().use_cycles_debug() &&
+ b_userpref.view().show_developer_ui())
{
/* 3d view render */
background = false;
@@ -119,10 +119,10 @@ BlenderSession::~BlenderSession()
void BlenderSession::create_session()
{
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
- bool session_pause = BlenderSync::get_session_pause(b_scene, background);
+ const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
+ const bool session_pause = BlenderSync::get_session_pause(b_scene, background);
/* reset status/progress */
last_status = "";
@@ -131,20 +131,18 @@ void BlenderSession::create_session()
start_resize_time = 0.0;
/* create session */
- session = new Session(session_params);
- session->scene = scene;
+ session = new Session(session_params, scene_params);
session->progress.set_update_callback(function_bind(&BlenderSession::tag_redraw, this));
session->progress.set_cancel_callback(function_bind(&BlenderSession::test_cancel, this));
session->set_pause(session_pause);
/* create scene */
- scene = new Scene(scene_params, session->device);
+ scene = session->scene;
scene->name = b_scene.name();
- session->scene = scene;
-
/* create sync */
- sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress);
+ sync = new BlenderSync(
+ b_engine, b_data, b_scene, scene, !background, use_developer_ui, session->progress);
BL::Object b_camera_override(b_engine.camera_override());
if (b_v3d) {
sync->sync_view(b_v3d, b_rv3d, width, height);
@@ -154,13 +152,25 @@ void BlenderSession::create_session()
}
/* set buffer parameters */
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
- session->reset(buffer_params, session_params.samples);
-
- b_engine.use_highlight_tiles(session_params.progressive_refine == false);
+ const BufferParams buffer_params = BlenderSync::get_buffer_params(
+ b_v3d, b_rv3d, scene->camera, width, height);
+ session->reset(session_params, buffer_params);
+
+ /* Create GPU display. */
+ if (!b_engine.is_preview() && !headless) {
+ unique_ptr<BlenderGPUDisplay> gpu_display = make_unique<BlenderGPUDisplay>(b_engine, b_scene);
+ gpu_display_ = gpu_display.get();
+ session->set_gpu_display(move(gpu_display));
+ }
- update_resumable_tile_manager(session_params.samples);
+  /* Viewport and preview (as in, material preview) rendering does not use tiles, so we can
+   * inform the engine that no tracking of the tile state is needed.
+   * Offline rendering will make this decision when a tile is being written. The penalty of
+   * asking the engine to keep track of the tile state is minimal, so there is nothing to
+   * worry about for a possible single-tile final render. */
+ if (!b_engine.is_preview() && !b_v3d) {
+ b_engine.use_highlight_tiles(true);
+ }
}
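
The GPU display hookup above uses a common ownership pattern: create a unique_ptr, keep a non-owning raw pointer for later access, then move ownership into the session. A minimal self-contained sketch with stand-in types (not the real interfaces):

    #include <memory>
    #include <utility>

    struct GPUDisplay {};

    struct Session {
      std::unique_ptr<GPUDisplay> display;
      void set_gpu_display(std::unique_ptr<GPUDisplay> d) { display = std::move(d); }
    };

    int main()
    {
      Session session;
      auto display = std::make_unique<GPUDisplay>();
      GPUDisplay *weak_display = display.get(); /* non-owning handle kept by the caller */
      session.set_gpu_display(std::move(display));
      (void)weak_display; /* valid for as long as the session owns the display */
      return 0;
    }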
void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsgraph)
@@ -202,9 +212,9 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
return;
}
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
+ const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
if (scene->params.modified(scene_params) || session->params.modified(session_params) ||
!this->b_render.use_persistent_data()) {
@@ -220,8 +230,6 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
session->progress.reset();
- session->tile_manager.set_tile_order(session_params.tile_order);
-
/* peak memory usage should show current render peak, not peak for all renders
* made by this render session
*/
@@ -230,7 +238,8 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
if (is_new_session) {
/* Sync object should be re-created for new scene. */
delete sync;
- sync = new BlenderSync(b_engine, b_data, b_scene, scene, !background, session->progress);
+ sync = new BlenderSync(
+ b_engine, b_data, b_scene, scene, !background, use_developer_ui, session->progress);
}
else {
/* Sync recalculations to do just the required updates. */
@@ -242,103 +251,85 @@ void BlenderSession::reset_session(BL::BlendData &b_data, BL::Depsgraph &b_depsg
BL::SpaceView3D b_null_space_view3d(PointerRNA_NULL);
BL::RegionView3D b_null_region_view3d(PointerRNA_NULL);
- BufferParams buffer_params = BlenderSync::get_buffer_params(b_null_space_view3d,
- b_null_region_view3d,
- scene->camera,
- width,
- height,
- session_params.denoising.use);
- session->reset(buffer_params, session_params.samples);
-
- b_engine.use_highlight_tiles(session_params.progressive_refine == false);
+ const BufferParams buffer_params = BlenderSync::get_buffer_params(
+ b_null_space_view3d, b_null_region_view3d, scene->camera, width, height);
+ session->reset(session_params, buffer_params);
/* reset time */
start_resize_time = 0.0;
+
+ {
+ thread_scoped_lock lock(draw_state_.mutex);
+ draw_state_.last_pass_index = -1;
+ }
}
void BlenderSession::free_session()
{
- session->cancel();
+ if (session) {
+ session->cancel(true);
+ }
delete sync;
+ sync = nullptr;
+
delete session;
+ session = nullptr;
}
-static ShaderEvalType get_shader_type(const string &pass_type)
+void BlenderSession::read_render_tile()
{
- const char *shader_type = pass_type.c_str();
+ const int2 tile_offset = session->get_render_tile_offset();
+ const int2 tile_size = session->get_render_tile_size();
- /* data passes */
- if (strcmp(shader_type, "NORMAL") == 0)
- return SHADER_EVAL_NORMAL;
- else if (strcmp(shader_type, "UV") == 0)
- return SHADER_EVAL_UV;
- else if (strcmp(shader_type, "ROUGHNESS") == 0)
- return SHADER_EVAL_ROUGHNESS;
- else if (strcmp(shader_type, "DIFFUSE_COLOR") == 0)
- return SHADER_EVAL_DIFFUSE_COLOR;
- else if (strcmp(shader_type, "GLOSSY_COLOR") == 0)
- return SHADER_EVAL_GLOSSY_COLOR;
- else if (strcmp(shader_type, "TRANSMISSION_COLOR") == 0)
- return SHADER_EVAL_TRANSMISSION_COLOR;
- else if (strcmp(shader_type, "EMIT") == 0)
- return SHADER_EVAL_EMISSION;
+ /* get render result */
+ BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x,
+ tile_offset.y,
+ tile_size.x,
+ tile_size.y,
+ b_rlay_name.c_str(),
+ b_rview_name.c_str());
- /* light passes */
- else if (strcmp(shader_type, "AO") == 0)
- return SHADER_EVAL_AO;
- else if (strcmp(shader_type, "COMBINED") == 0)
- return SHADER_EVAL_COMBINED;
- else if (strcmp(shader_type, "SHADOW") == 0)
- return SHADER_EVAL_SHADOW;
- else if (strcmp(shader_type, "DIFFUSE") == 0)
- return SHADER_EVAL_DIFFUSE;
- else if (strcmp(shader_type, "GLOSSY") == 0)
- return SHADER_EVAL_GLOSSY;
- else if (strcmp(shader_type, "TRANSMISSION") == 0)
- return SHADER_EVAL_TRANSMISSION;
+ /* can happen if the intersected rectangle gives 0 width or height */
+ if (b_rr.ptr.data == NULL) {
+ return;
+ }
- /* extra */
- else if (strcmp(shader_type, "ENVIRONMENT") == 0)
- return SHADER_EVAL_ENVIRONMENT;
+ BL::RenderResult::layers_iterator b_single_rlay;
+ b_rr.layers.begin(b_single_rlay);
- else
- return SHADER_EVAL_BAKE;
-}
+ /* layer will be missing if it was disabled in the UI */
+ if (b_single_rlay == b_rr.layers.end())
+ return;
-static BL::RenderResult begin_render_result(BL::RenderEngine &b_engine,
- int x,
- int y,
- int w,
- int h,
- const char *layername,
- const char *viewname)
-{
- return b_engine.begin_result(x, y, w, h, layername, viewname);
-}
+ BL::RenderLayer b_rlay = *b_single_rlay;
-static void end_render_result(BL::RenderEngine &b_engine,
- BL::RenderResult &b_rr,
- bool cancel,
- bool highlight,
- bool do_merge_results)
-{
- b_engine.end_result(b_rr, (int)cancel, (int)highlight, (int)do_merge_results);
+ vector<float> pixels(tile_size.x * tile_size.y * 4);
+
+  /* Copy each pass.
+   * TODO: copy only the required ones for better performance? */
+ for (BL::RenderPass &b_pass : b_rlay.passes) {
+ session->set_render_tile_pixels(b_pass.name(), b_pass.channels(), (float *)b_pass.rect());
+ }
}
-void BlenderSession::do_write_update_render_tile(RenderTile &rtile,
- bool do_update_only,
- bool do_read_only,
- bool highlight)
+void BlenderSession::write_render_tile()
{
- int x = rtile.x - session->tile_manager.params.full_x;
- int y = rtile.y - session->tile_manager.params.full_y;
- int w = rtile.w;
- int h = rtile.h;
+ const int2 tile_offset = session->get_render_tile_offset();
+ const int2 tile_size = session->get_render_tile_size();
+
+ const string_view render_layer_name = session->get_render_tile_layer();
+ const string_view render_view_name = session->get_render_tile_view();
+
+ b_engine.tile_highlight_clear_all();
/* get render result */
- BL::RenderResult b_rr = begin_render_result(
- b_engine, x, y, w, h, b_rlay_name.c_str(), b_rview_name.c_str());
+ BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x,
+ tile_offset.y,
+ tile_size.x,
+ tile_size.y,
+ render_layer_name.c_str(),
+ render_view_name.c_str());
/* can happen if the intersected rectangle gives 0 width or height */
if (b_rr.ptr.data == NULL) {
@@ -349,64 +340,34 @@ void BlenderSession::do_write_update_render_tile(RenderTile &rtile,
b_rr.layers.begin(b_single_rlay);
/* layer will be missing if it was disabled in the UI */
- if (b_single_rlay == b_rr.layers.end())
+ if (b_single_rlay == b_rr.layers.end()) {
return;
+ }
BL::RenderLayer b_rlay = *b_single_rlay;
- if (do_read_only) {
- /* copy each pass */
- for (BL::RenderPass &b_pass : b_rlay.passes) {
- /* find matching pass type */
- PassType pass_type = BlenderSync::get_pass_type(b_pass);
- int components = b_pass.channels();
-
- rtile.buffers->set_pass_rect(
- pass_type, components, (float *)b_pass.rect(), rtile.num_samples);
- }
-
- end_render_result(b_engine, b_rr, false, false, false);
- }
- else if (do_update_only) {
- /* Sample would be zero at initial tile update, which is only needed
- * to tag tile form blender side as IN PROGRESS for proper highlight
- * no buffers should be sent to blender yet. For denoise we also
- * keep showing the noisy buffers until denoise is done. */
- bool merge = (rtile.sample != 0) && (rtile.task != RenderTile::DENOISE);
-
- if (merge) {
- update_render_result(b_rlay, rtile);
- }
+ write_render_result(b_rlay);
- end_render_result(b_engine, b_rr, true, highlight, merge);
- }
- else {
- /* Write final render result. */
- write_render_result(b_rlay, rtile);
- end_render_result(b_engine, b_rr, false, false, true);
- }
+ b_engine.end_result(b_rr, true, false, true);
}
-void BlenderSession::read_render_tile(RenderTile &rtile)
+void BlenderSession::update_render_tile()
{
- do_write_update_render_tile(rtile, false, true, false);
-}
+ if (!session->has_multiple_render_tiles()) {
+ /* Don't highlight full-frame tile. */
+ return;
+ }
-void BlenderSession::write_render_tile(RenderTile &rtile)
-{
- do_write_update_render_tile(rtile, false, false, false);
+ const int2 tile_offset = session->get_render_tile_offset();
+ const int2 tile_size = session->get_render_tile_size();
+
+ b_engine.tile_highlight_clear_all();
+ b_engine.tile_highlight_set(tile_offset.x, tile_offset.y, tile_size.x, tile_size.y, true);
}
-void BlenderSession::update_render_tile(RenderTile &rtile, bool highlight)
+void BlenderSession::full_buffer_written(string_view filename)
{
- /* use final write for preview renders, otherwise render result wouldn't be
- * be updated in blender side
- * would need to be investigated a bit further, but for now shall be fine
- */
- if (!b_engine.is_preview())
- do_write_update_render_tile(rtile, true, false, highlight);
- else
- do_write_update_render_tile(rtile, false, false, false);
+ full_buffer_files_.emplace_back(filename);
}
static void add_cryptomatte_layer(BL::RenderResult &b_rr, string name, string manifest)
@@ -430,12 +391,15 @@ void BlenderSession::stamp_view_layer_metadata(Scene *scene, const string &view_
to_string(session->params.samples).c_str());
/* Store ranged samples information. */
+ /* TODO(sergey): Need to bring this information back. */
+#if 0
if (session->tile_manager.range_num_samples != -1) {
b_rr.stamp_data_add_field((prefix + "range_start_sample").c_str(),
to_string(session->tile_manager.range_start_sample).c_str());
b_rr.stamp_data_add_field((prefix + "range_num_samples").c_str(),
to_string(session->tile_manager.range_num_samples).c_str());
}
+#endif
/* Write cryptomatte metadata. */
if (scene->film->get_cryptomatte_passes() & CRYPT_OBJECT) {
@@ -475,38 +439,44 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
}
/* set callback to write out render results */
- session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
- session->update_render_tile_cb = function_bind(
- &BlenderSession::update_render_tile, this, _1, _2);
+ session->write_render_tile_cb = [&]() { write_render_tile(); };
+
+  /* Use final write for preview renders, otherwise the render result wouldn't be updated on the
+   * Blender side. */
+ /* TODO(sergey): Investigate whether GPUDisplay can be used for the preview as well. */
+ if (b_engine.is_preview()) {
+ session->update_render_tile_cb = [&]() { write_render_tile(); };
+ }
+ else {
+ session->update_render_tile_cb = [&]() { update_render_tile(); };
+ }
+
+ session->full_buffer_written_cb = [&](string_view filename) { full_buffer_written(filename); };
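/* Illustrative sketch, not part of this patch: the function_bind() calls above are replaced
 * with lambdas. A minimal standalone example of the same pattern, with hypothetical names
 * (the `[&]` capture binds `this`, so the installed callbacks must not outlive the object
 * that installs them — which appears to hold here, since BlenderSession owns the Session): */
#include <functional>

struct Session {
  std::function<void()> write_render_tile_cb;
};

struct Owner {
  Session *session = nullptr;

  void write_render_tile() { /* copy tile pixels to the host application */ }

  void install_callbacks()
  {
    session->write_render_tile_cb = [this]() { write_render_tile(); };
  }
};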
BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
/* get buffer parameters */
- SessionParams session_params = BlenderSync::get_session_params(
- b_engine, b_userpref, b_scene, background, b_view_layer);
+ const SessionParams session_params = BlenderSync::get_session_params(
+ b_engine, b_userpref, b_scene, background);
BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
+ b_v3d, b_rv3d, scene->camera, width, height);
/* temporary render result to find needed passes and views */
- BL::RenderResult b_rr = begin_render_result(
- b_engine, 0, 0, 1, 1, b_view_layer.name().c_str(), NULL);
+ BL::RenderResult b_rr = b_engine.begin_result(0, 0, 1, 1, b_view_layer.name().c_str(), NULL);
BL::RenderResult::layers_iterator b_single_rlay;
b_rr.layers.begin(b_single_rlay);
BL::RenderLayer b_rlay = *b_single_rlay;
- b_rlay_name = b_view_layer.name();
- /* Update denoising parameters. */
- session->set_denoising(session_params.denoising);
+ {
+ thread_scoped_lock lock(draw_state_.mutex);
+ b_rlay_name = b_view_layer.name();
- /* Compute render passes and film settings. */
- vector<Pass> passes = sync->sync_render_passes(
- b_scene, b_rlay, b_view_layer, session_params.adaptive_sampling, session_params.denoising);
+ /* Signal that the display pass is to be updated. */
+ draw_state_.last_pass_index = -1;
+ }
- /* Set buffer params, using film settings from sync_render_passes. */
- buffer_params.passes = passes;
- buffer_params.denoising_data_pass = scene->film->get_denoising_data_pass();
- buffer_params.denoising_clean_pass = scene->film->get_denoising_clean_pass();
- buffer_params.denoising_prefiltered_pass = scene->film->get_denoising_prefiltered_pass();
+ /* Compute render passes and film settings. */
+ sync->sync_render_passes(b_rlay, b_view_layer);
BL::RenderResult::views_iterator b_view_iter;
@@ -520,6 +490,9 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
++b_view_iter, ++view_index) {
b_rview_name = b_view_iter->name();
+ buffer_params.layer = b_view_layer.name();
+ buffer_params.view = b_rview_name;
+
/* set the current view */
b_engine.active_view_set(b_rview_name.c_str());
@@ -549,20 +522,16 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
}
/* Update number of samples per layer. */
- int samples = sync->get_layer_samples();
- bool bound_samples = sync->get_layer_bound_samples();
- int effective_layer_samples;
+ const int samples = sync->get_layer_samples();
+ const bool bound_samples = sync->get_layer_bound_samples();
- if (samples != 0 && (!bound_samples || (samples < session_params.samples)))
- effective_layer_samples = samples;
- else
- effective_layer_samples = session_params.samples;
-
- /* Update tile manager if we're doing resumable render. */
- update_resumable_tile_manager(effective_layer_samples);
+ SessionParams effective_session_params = session_params;
+ if (samples != 0 && (!bound_samples || (samples < session_params.samples))) {
+ effective_session_params.samples = samples;
+ }
/* Update session itself. */
- session->reset(buffer_params, effective_layer_samples);
+ session->reset(effective_session_params, buffer_params);
/* render */
if (!b_engine.is_preview() && background && print_render_stats) {
@@ -586,65 +555,146 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
stamp_view_layer_metadata(scene, b_rlay_name);
/* free result without merging */
- end_render_result(b_engine, b_rr, true, true, false);
+ b_engine.end_result(b_rr, true, false, false);
double total_time, render_time;
session->progress.get_time(total_time, render_time);
VLOG(1) << "Total render time: " << total_time;
VLOG(1) << "Render time (without synchronization): " << render_time;
+}
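/* Illustrative sketch, not part of this patch: the per-layer sample override in render()
 * above, extracted into a standalone helper; the example values below are hypothetical. */
static int effective_samples(const int layer_samples,
                             const bool bound_samples,
                             const int scene_samples)
{
  /* Same condition as in BlenderSession::render(): a non-zero layer override wins
   * unless it is bound, in which case it may only lower the count. */
  if (layer_samples != 0 && (!bound_samples || layer_samples < scene_samples)) {
    return layer_samples;
  }
  return scene_samples;
}

/* effective_samples(32, true, 128)   == 32  (bound override may lower)
 * effective_samples(256, true, 128)  == 128 (bound override cannot raise)
 * effective_samples(256, false, 128) == 256 (unbound override always wins)
 * effective_samples(0, false, 128)   == 128 (no override set) */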
+
+void BlenderSession::render_frame_finish()
+{
+ /* Processing of all layers and views is done. Clear the strings so that we can communicate
+ * progress about reading files and denoising them. */
+ b_rlay_name = "";
+ b_rview_name = "";
+
+ if (!b_render.use_persistent_data()) {
+ /* Free the sync object so that it can properly dereference nodes from the scene graph before
+ * the graph is freed. */
+ delete sync;
+ sync = nullptr;
+
+ session->device_free();
+ }
+
+ for (string_view filename : full_buffer_files_) {
+ session->process_full_buffer_from_disk(filename);
+ path_remove(filename);
+ }
/* clear callback */
session->write_render_tile_cb = function_null;
session->update_render_tile_cb = function_null;
+ session->full_buffer_written_cb = function_null;
}
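/* Illustrative sketch, not part of this patch: the full-buffer lifecycle used above, with
 * hypothetical types. render() only queues file names through full_buffer_written();
 * render_frame_finish() drains the queue once every layer and view has been rendered: */
#include <string>
#include <vector>

struct FullBufferQueue {
  std::vector<std::string> files;

  /* Invoked from the session callback while rendering. */
  void written(const std::string &filename)
  {
    files.emplace_back(filename);
  }

  /* Invoked once per frame, after all layers and views are done. */
  template<typename ProcessFn> void finish(ProcessFn process)
  {
    for (const std::string &filename : files) {
      process(filename); /* e.g. denoise, write the result, delete the file */
    }
    files.clear();
  }
};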
-static int bake_pass_filter_get(const int pass_filter)
+static PassType bake_type_to_pass(const string &bake_type_str, const int bake_filter)
{
- int flag = BAKE_FILTER_NONE;
-
- if ((pass_filter & BL::BakeSettings::pass_filter_DIRECT) != 0)
- flag |= BAKE_FILTER_DIRECT;
- if ((pass_filter & BL::BakeSettings::pass_filter_INDIRECT) != 0)
- flag |= BAKE_FILTER_INDIRECT;
- if ((pass_filter & BL::BakeSettings::pass_filter_COLOR) != 0)
- flag |= BAKE_FILTER_COLOR;
-
- if ((pass_filter & BL::BakeSettings::pass_filter_DIFFUSE) != 0)
- flag |= BAKE_FILTER_DIFFUSE;
- if ((pass_filter & BL::BakeSettings::pass_filter_GLOSSY) != 0)
- flag |= BAKE_FILTER_GLOSSY;
- if ((pass_filter & BL::BakeSettings::pass_filter_TRANSMISSION) != 0)
- flag |= BAKE_FILTER_TRANSMISSION;
-
- if ((pass_filter & BL::BakeSettings::pass_filter_EMIT) != 0)
- flag |= BAKE_FILTER_EMISSION;
- if ((pass_filter & BL::BakeSettings::pass_filter_AO) != 0)
- flag |= BAKE_FILTER_AO;
-
- return flag;
+ const char *bake_type = bake_type_str.c_str();
+
+ /* data passes */
+ if (strcmp(bake_type, "POSITION") == 0) {
+ return PASS_POSITION;
+ }
+ else if (strcmp(bake_type, "NORMAL") == 0) {
+ return PASS_NORMAL;
+ }
+ else if (strcmp(bake_type, "UV") == 0) {
+ return PASS_UV;
+ }
+ else if (strcmp(bake_type, "ROUGHNESS") == 0) {
+ return PASS_ROUGHNESS;
+ }
+ else if (strcmp(bake_type, "EMIT") == 0) {
+ return PASS_EMISSION;
+ }
+ /* light passes */
+ else if (strcmp(bake_type, "AO") == 0) {
+ return PASS_AO;
+ }
+ else if (strcmp(bake_type, "COMBINED") == 0) {
+ return PASS_COMBINED;
+ }
+ else if (strcmp(bake_type, "SHADOW") == 0) {
+ return PASS_SHADOW;
+ }
+ else if (strcmp(bake_type, "DIFFUSE") == 0) {
+ if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) &&
+ bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_DIFFUSE;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) {
+ return PASS_DIFFUSE_DIRECT;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_DIFFUSE_INDIRECT;
+ }
+ else {
+ return PASS_DIFFUSE_COLOR;
+ }
+ }
+ else if (strcmp(bake_type, "GLOSSY") == 0) {
+ if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) &&
+ bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_GLOSSY;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) {
+ return PASS_GLOSSY_DIRECT;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_GLOSSY_INDIRECT;
+ }
+ else {
+ return PASS_GLOSSY_COLOR;
+ }
+ }
+ else if (strcmp(bake_type, "TRANSMISSION") == 0) {
+ if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) &&
+ bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_TRANSMISSION;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_DIRECT) {
+ return PASS_TRANSMISSION_DIRECT;
+ }
+ else if (bake_filter & BL::BakeSettings::pass_filter_INDIRECT) {
+ return PASS_TRANSMISSION_INDIRECT;
+ }
+ else {
+ return PASS_TRANSMISSION_COLOR;
+ }
+ }
+ /* extra */
+ else if (strcmp(bake_type, "ENVIRONMENT") == 0) {
+ return PASS_BACKGROUND;
+ }
+
+ return PASS_COMBINED;
}
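/* Usage sketch of the mapping above (the result follows from the code): */
const PassType bake_pass = bake_type_to_pass("DIFFUSE",
                                             BL::BakeSettings::pass_filter_DIRECT);
/* bake_pass == PASS_DIFFUSE_DIRECT. With both the DIRECT and INDIRECT bits set it would be
 * PASS_DIFFUSE, with neither PASS_DIFFUSE_COLOR, and any unknown bake type string falls
 * back to PASS_COMBINED. */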
void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
BL::Object &b_object,
- const string &pass_type,
- const int pass_filter,
+ const string &bake_type,
+ const int bake_filter,
const int bake_width,
const int bake_height)
{
b_depsgraph = b_depsgraph_;
- ShaderEvalType shader_type = get_shader_type(pass_type);
- int bake_pass_filter = bake_pass_filter_get(pass_filter);
-
/* Initialize bake manager, before we load the baking kernels. */
- scene->bake_manager->set(scene, b_object.name(), shader_type, bake_pass_filter);
+ scene->bake_manager->set(scene, b_object.name());
- /* Passes are identified by name, so in order to return the combined pass we need to set the
- * name. */
- Pass::add(PASS_COMBINED, scene->passes, "Combined");
+  /* Add the render pass that we want to bake, and name it Combined so that it is
+   * treated as the combined pass on the Blender side. */
+ Pass *pass = scene->create_node<Pass>();
+ pass->set_name(ustring("Combined"));
+ pass->set_type(bake_type_to_pass(bake_type, bake_filter));
+ pass->set_include_albedo((bake_filter & BL::BakeSettings::pass_filter_COLOR));
- session->read_bake_tile_cb = function_bind(&BlenderSession::read_render_tile, this, _1);
- session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
+ session->read_render_tile_cb = [&]() { read_render_tile(); };
+ session->write_render_tile_cb = [&]() { write_render_tile(); };
+ session->set_gpu_display(nullptr);
if (!session->progress.get_cancel()) {
/* Sync scene. */
@@ -667,18 +717,15 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
if (object_found && !session->progress.get_cancel()) {
/* Get session and buffer parameters. */
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- session_params.progressive_refine = false;
BufferParams buffer_params;
buffer_params.width = bake_width;
buffer_params.height = bake_height;
- buffer_params.passes = scene->passes;
/* Update session. */
- session->tile_manager.set_samples(session_params.samples);
- session->reset(buffer_params, session_params.samples);
+ session->reset(session_params, buffer_params);
session->progress.set_update_callback(
function_bind(&BlenderSession::update_bake_progress, this));
@@ -690,71 +737,43 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_,
session->wait();
}
- session->read_bake_tile_cb = function_null;
+ session->read_render_tile_cb = function_null;
session->write_render_tile_cb = function_null;
}
-void BlenderSession::do_write_update_render_result(BL::RenderLayer &b_rlay,
- RenderTile &rtile,
- bool do_update_only)
+void BlenderSession::write_render_result(BL::RenderLayer &b_rlay)
{
- RenderBuffers *buffers = rtile.buffers;
-
- /* copy data from device */
- if (!buffers->copy_from_device())
+ if (!session->copy_render_tile_from_device()) {
return;
-
- float exposure = scene->film->get_exposure();
-
- vector<float> pixels(rtile.w * rtile.h * 4);
-
- /* Adjust absolute sample number to the range. */
- int sample = rtile.sample;
- const int range_start_sample = session->tile_manager.range_start_sample;
- if (range_start_sample != -1) {
- sample -= range_start_sample;
}
- if (!do_update_only) {
- /* copy each pass */
- for (BL::RenderPass &b_pass : b_rlay.passes) {
- int components = b_pass.channels();
-
- /* Copy pixels from regular render passes. */
- bool read = buffers->get_pass_rect(b_pass.name(), exposure, sample, components, &pixels[0]);
+ const int2 tile_size = session->get_render_tile_size();
+ vector<float> pixels(tile_size.x * tile_size.y * 4);
- /* If denoising pass, */
- if (!read) {
- int denoising_offset = BlenderSync::get_denoising_pass(b_pass);
- if (denoising_offset >= 0) {
- read = buffers->get_denoising_pass_rect(
- denoising_offset, exposure, sample, components, &pixels[0]);
- }
- }
-
- if (!read) {
- memset(&pixels[0], 0, pixels.size() * sizeof(float));
- }
-
- b_pass.rect(&pixels[0]);
+ /* Copy each pass. */
+ for (BL::RenderPass &b_pass : b_rlay.passes) {
+ if (!session->get_render_tile_pixels(b_pass.name(), b_pass.channels(), &pixels[0])) {
+ memset(&pixels[0], 0, pixels.size() * sizeof(float));
}
- }
- else {
- /* copy combined pass */
- BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str()));
- if (buffers->get_pass_rect("Combined", exposure, sample, 4, &pixels[0]))
- b_combined_pass.rect(&pixels[0]);
+
+ b_pass.rect(&pixels[0]);
}
}
-void BlenderSession::write_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile)
+void BlenderSession::update_render_result(BL::RenderLayer &b_rlay)
{
- do_write_update_render_result(b_rlay, rtile, false);
-}
+ if (!session->copy_render_tile_from_device()) {
+ return;
+ }
-void BlenderSession::update_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile)
-{
- do_write_update_render_result(b_rlay, rtile, true);
+ const int2 tile_size = session->get_render_tile_size();
+ vector<float> pixels(tile_size.x * tile_size.y * 4);
+
+ /* Copy combined pass. */
+ BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str()));
+ if (session->get_render_tile_pixels("Combined", b_combined_pass.channels(), &pixels[0])) {
+ b_combined_pass.rect(&pixels[0]);
+ }
}
void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
@@ -764,19 +783,19 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
return;
/* on session/scene parameter changes, we recreate session entirely */
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
- bool session_pause = BlenderSync::get_session_pause(b_scene, background);
+ const SceneParams scene_params = BlenderSync::get_scene_params(b_scene, background);
+ const bool session_pause = BlenderSync::get_session_pause(b_scene, background);
if (session->params.modified(session_params) || scene->params.modified(scene_params)) {
free_session();
create_session();
}
- /* increase samples, but never decrease */
+ /* increase samples and render time, but never decrease */
session->set_samples(session_params.samples);
- session->set_denoising_start_sample(session_params.denoising.start_sample);
+ session->set_time_limit(session_params.time_limit);
session->set_pause(session_pause);
/* copy recalc flags, outside of mutex so we can decide to do the real
@@ -808,21 +827,12 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
sync->sync_camera(b_render, b_camera_override, width, height, "");
/* get buffer parameters */
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
-
- if (!buffer_params.denoising_data_pass) {
- session_params.denoising.use = false;
- }
-
- session->set_denoising(session_params.denoising);
-
- /* Update film if denoising data was enabled or disabled. */
- scene->film->set_denoising_data_pass(buffer_params.denoising_data_pass);
+ const BufferParams buffer_params = BlenderSync::get_buffer_params(
+ b_v3d, b_rv3d, scene->camera, width, height);
/* reset if needed */
if (scene->need_reset()) {
- session->reset(buffer_params, session_params.samples);
+ session->reset(session_params, buffer_params);
/* After session reset, so device is not accessing image data anymore. */
builtin_images_load();
@@ -839,7 +849,44 @@ void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_)
session->start();
}
-bool BlenderSession::draw(int w, int h)
+void BlenderSession::draw(BL::SpaceImageEditor &space_image)
+{
+ if (!session || !session->scene) {
+ /* Offline render drawing does not force the render engine update, which means it's possible
+ * that the Session is not created yet. */
+ return;
+ }
+
+ thread_scoped_lock lock(draw_state_.mutex);
+
+ const int pass_index = space_image.image_user().multilayer_pass();
+ if (pass_index != draw_state_.last_pass_index) {
+ BL::RenderPass b_display_pass(b_engine.pass_by_index_get(b_rlay_name.c_str(), pass_index));
+ if (!b_display_pass) {
+ return;
+ }
+
+ Scene *scene = session->scene;
+
+ thread_scoped_lock lock(scene->mutex);
+
+ const Pass *pass = Pass::find(scene->passes, b_display_pass.name());
+ if (!pass) {
+ return;
+ }
+
+ scene->film->set_display_pass(pass->get_type());
+
+ draw_state_.last_pass_index = pass_index;
+ }
+
+ BL::Array<float, 2> zoom = space_image.zoom();
+ gpu_display_->set_zoom(zoom[0], zoom[1]);
+
+ session->draw();
+}
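/* Illustrative sketch, not part of this patch: the caching pattern in draw() above, with
 * hypothetical names. The pass lookup is comparatively expensive, so it is redone only when
 * the image editor selects a different pass index: */
#include <mutex>

struct DrawState {
  std::mutex mutex;
  int last_pass_index = -1;
};

static void draw_with_cache(DrawState &state, const int pass_index)
{
  std::lock_guard<std::mutex> lock(state.mutex);
  if (pass_index != state.last_pass_index) {
    /* ... resolve the pass and update the film's display settings ... */
    state.last_pass_index = pass_index;
  }
  /* ... draw using the previously resolved pass ... */
}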
+
+void BlenderSession::view_draw(int w, int h)
{
/* pause in redraw in case update is not being called due to final render */
session->set_pause(BlenderSync::get_session_pause(b_scene, background));
@@ -885,14 +932,14 @@ bool BlenderSession::draw(int w, int h)
/* reset if requested */
if (reset) {
- SessionParams session_params = BlenderSync::get_session_params(
+ const SessionParams session_params = BlenderSync::get_session_params(
b_engine, b_userpref, b_scene, background);
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session_params.denoising.use);
- bool session_pause = BlenderSync::get_session_pause(b_scene, background);
+ const BufferParams buffer_params = BlenderSync::get_buffer_params(
+ b_v3d, b_rv3d, scene->camera, width, height);
+ const bool session_pause = BlenderSync::get_session_pause(b_scene, background);
if (session_pause == false) {
- session->reset(buffer_params, session_params.samples);
+ session->reset(session_params, buffer_params);
start_resize_time = 0.0;
}
}
@@ -905,18 +952,7 @@ bool BlenderSession::draw(int w, int h)
update_status_progress();
/* draw */
- BufferParams buffer_params = BlenderSync::get_buffer_params(
- b_v3d, b_rv3d, scene->camera, width, height, session->params.denoising.use);
- DeviceDrawParams draw_params;
-
- if (session->params.display_buffer_linear) {
- draw_params.bind_display_space_shader_cb = function_bind(
- &BL::RenderEngine::bind_display_space_shader, &b_engine, b_scene);
- draw_params.unbind_display_space_shader_cb = function_bind(
- &BL::RenderEngine::unbind_display_space_shader, &b_engine);
- }
-
- return !session->draw(buffer_params, draw_params);
+ session->draw();
}
void BlenderSession::get_status(string &status, string &substatus)
@@ -924,11 +960,6 @@ void BlenderSession::get_status(string &status, string &substatus)
session->progress.get_status(status, substatus);
}
-void BlenderSession::get_kernel_status(string &kernel_status)
-{
- session->progress.get_kernel_status(kernel_status);
-}
-
void BlenderSession::get_progress(float &progress, double &total_time, double &render_time)
{
session->progress.get_time(total_time, render_time);
@@ -947,7 +978,7 @@ void BlenderSession::update_bake_progress()
void BlenderSession::update_status_progress()
{
- string timestatus, status, substatus, kernel_status;
+ string timestatus, status, substatus;
string scene_status = "";
float progress;
double total_time, remaining_time = 0, render_time;
@@ -955,7 +986,6 @@ void BlenderSession::update_status_progress()
float mem_peak = (float)session->stats.mem_peak / 1024.0f / 1024.0f;
get_status(status, substatus);
- get_kernel_status(kernel_status);
get_progress(progress, total_time, render_time);
if (progress > 0)
@@ -980,14 +1010,12 @@ void BlenderSession::update_status_progress()
status = " | " + status;
if (substatus.size() > 0)
status += " | " + substatus;
- if (kernel_status.size() > 0)
- status += " | " + kernel_status;
}
double current_time = time_dt();
- /* When rendering in a window, redraw the status at least once per second to keep the elapsed and
- * remaining time up-to-date. For headless rendering, only report when something significant
- * changes to keep the console output readable. */
+ /* When rendering in a window, redraw the status at least once per second to keep the elapsed
+ * and remaining time up-to-date. For headless rendering, only report when something
+ * significant changes to keep the console output readable. */
if (status != last_status || (!headless && (current_time - last_status_time) > 1.0)) {
b_engine.update_stats("", (timestatus + scene_status + status).c_str());
b_engine.update_memory_stats(mem_used, mem_peak);
@@ -1048,56 +1076,6 @@ void BlenderSession::test_cancel()
session->progress.set_cancel("Cancelled");
}
-void BlenderSession::update_resumable_tile_manager(int num_samples)
-{
- const int num_resumable_chunks = BlenderSession::num_resumable_chunks,
- current_resumable_chunk = BlenderSession::current_resumable_chunk;
- if (num_resumable_chunks == 0) {
- return;
- }
-
- if (num_resumable_chunks > num_samples) {
- fprintf(stderr,
- "Cycles warning: more sample chunks (%d) than samples (%d), "
- "this will cause some samples to be included in multiple chunks.\n",
- num_resumable_chunks,
- num_samples);
- }
-
- const float num_samples_per_chunk = (float)num_samples / num_resumable_chunks;
-
- float range_start_sample, range_num_samples;
- if (current_resumable_chunk != 0) {
- /* Single chunk rendering. */
- range_start_sample = num_samples_per_chunk * (current_resumable_chunk - 1);
- range_num_samples = num_samples_per_chunk;
- }
- else {
- /* Ranged-chunks. */
- const int num_chunks = end_resumable_chunk - start_resumable_chunk + 1;
- range_start_sample = num_samples_per_chunk * (start_resumable_chunk - 1);
- range_num_samples = num_chunks * num_samples_per_chunk;
- }
-
- /* Round after doing the multiplications with num_chunks and num_samples_per_chunk
- * to allow for many small chunks. */
- int rounded_range_start_sample = (int)floorf(range_start_sample + 0.5f);
- int rounded_range_num_samples = max((int)floorf(range_num_samples + 0.5f), 1);
-
- /* Make sure we don't overshoot. */
- if (rounded_range_start_sample + rounded_range_num_samples > num_samples) {
- rounded_range_num_samples = num_samples - rounded_range_num_samples;
- }
-
- VLOG(1) << "Samples range start is " << range_start_sample << ", "
- << "number of samples to render is " << range_num_samples;
-
- scene->integrator->set_start_sample(rounded_range_start_sample);
-
- session->tile_manager.range_start_sample = rounded_range_start_sample;
- session->tile_manager.range_num_samples = rounded_range_num_samples;
-}
-
void BlenderSession::free_blender_memory_if_possible()
{
if (!background) {
diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h
index d967b81c854..11e2657a325 100644
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -29,12 +29,11 @@
CCL_NAMESPACE_BEGIN
+class BlenderGPUDisplay;
class BlenderSync;
class ImageMetaData;
class Scene;
class Session;
-class RenderBuffers;
-class RenderTile;
class BlenderSession {
public:
@@ -62,6 +61,8 @@ class BlenderSession {
/* offline render */
void render(BL::Depsgraph &b_depsgraph);
+ void render_frame_finish();
+
void bake(BL::Depsgraph &b_depsgrah,
BL::Object &b_object,
const string &pass_type,
@@ -69,24 +70,29 @@ class BlenderSession {
const int bake_width,
const int bake_height);
- void write_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile);
- void write_render_tile(RenderTile &rtile);
- void read_render_tile(RenderTile &rtile);
+ void write_render_result(BL::RenderLayer &b_rlay);
+ void write_render_tile();
+
+ void update_render_tile();
+
+ void full_buffer_written(string_view filename);
/* update functions are used to update display buffer only after sample was rendered
* only needed for better visual feedback */
- void update_render_result(BL::RenderLayer &b_rlay, RenderTile &rtile);
- void update_render_tile(RenderTile &rtile, bool highlight);
+ void update_render_result(BL::RenderLayer &b_rlay);
+
+ /* read functions for baking input */
+ void read_render_tile();
/* interactive updates */
void synchronize(BL::Depsgraph &b_depsgraph);
/* drawing */
- bool draw(int w, int h);
+ void draw(BL::SpaceImageEditor &space_image);
+ void view_draw(int w, int h);
void tag_redraw();
void tag_update();
void get_status(string &status, string &substatus);
- void get_kernel_status(string &kernel_status);
void get_progress(float &progress, double &total_time, double &render_time);
void test_cancel();
void update_status_progress();
@@ -123,6 +129,8 @@ class BlenderSession {
void *python_thread_state;
+ bool use_developer_ui;
+
/* Global state which is common for all render sessions created from Blender.
* Usually denotes command line arguments.
*/
@@ -134,41 +142,28 @@ class BlenderSession {
*/
static bool headless;
- /* ** Resumable render ** */
-
- /* Overall number of chunks in which the sample range is to be divided. */
- static int num_resumable_chunks;
-
- /* Current resumable chunk index to render. */
- static int current_resumable_chunk;
-
- /* Alternative to single-chunk rendering to render a range of chunks. */
- static int start_resumable_chunk;
- static int end_resumable_chunk;
-
static bool print_render_stats;
protected:
void stamp_view_layer_metadata(Scene *scene, const string &view_layer_name);
- void do_write_update_render_result(BL::RenderLayer &b_rlay,
- RenderTile &rtile,
- bool do_update_only);
- void do_write_update_render_tile(RenderTile &rtile,
- bool do_update_only,
- bool do_read_only,
- bool highlight);
-
void builtin_images_load();
- /* Update tile manager to reflect resumable render settings. */
- void update_resumable_tile_manager(int num_samples);
-
  /* Used after each render layer synchronization is done, with the goal
   * of freeing render engine data which is held on the Blender side (for
   * example, the dependency graph).
*/
void free_blender_memory_if_possible();
+
+ struct {
+ thread_mutex mutex;
+ int last_pass_index = -1;
+ } draw_state_;
+
+ /* NOTE: The BlenderSession references the GPU display. */
+ BlenderGPUDisplay *gpu_display_ = nullptr;
+
+ vector<string> full_buffer_files_;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp
index de7b2761d00..8c4f789ffd0 100644
--- a/intern/cycles/blender/blender_shader.cpp
+++ b/intern/cycles/blender/blender_shader.cpp
@@ -17,6 +17,7 @@
#include "render/background.h"
#include "render/colorspace.h"
#include "render/graph.h"
+#include "render/integrator.h"
#include "render/light.h"
#include "render/nodes.h"
#include "render/osl.h"
@@ -475,17 +476,11 @@ static ShaderNode *add_node(Scene *scene,
SubsurfaceScatteringNode *subsurface = graph->create_node<SubsurfaceScatteringNode>();
switch (b_subsurface_node.falloff()) {
- case BL::ShaderNodeSubsurfaceScattering::falloff_CUBIC:
- subsurface->set_falloff(CLOSURE_BSSRDF_CUBIC_ID);
- break;
- case BL::ShaderNodeSubsurfaceScattering::falloff_GAUSSIAN:
- subsurface->set_falloff(CLOSURE_BSSRDF_GAUSSIAN_ID);
- break;
- case BL::ShaderNodeSubsurfaceScattering::falloff_BURLEY:
- subsurface->set_falloff(CLOSURE_BSSRDF_BURLEY_ID);
+ case BL::ShaderNodeSubsurfaceScattering::falloff_RANDOM_WALK_FIXED_RADIUS:
+ subsurface->set_method(CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
break;
case BL::ShaderNodeSubsurfaceScattering::falloff_RANDOM_WALK:
- subsurface->set_falloff(CLOSURE_BSSRDF_RANDOM_WALK_ID);
+ subsurface->set_method(CLOSURE_BSSRDF_RANDOM_WALK_ID);
break;
}
@@ -597,11 +592,11 @@ static ShaderNode *add_node(Scene *scene,
break;
}
switch (b_principled_node.subsurface_method()) {
- case BL::ShaderNodeBsdfPrincipled::subsurface_method_BURLEY:
- principled->set_subsurface_method(CLOSURE_BSSRDF_PRINCIPLED_ID);
+ case BL::ShaderNodeBsdfPrincipled::subsurface_method_RANDOM_WALK_FIXED_RADIUS:
+ principled->set_subsurface_method(CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
break;
case BL::ShaderNodeBsdfPrincipled::subsurface_method_RANDOM_WALK:
- principled->set_subsurface_method(CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID);
+ principled->set_subsurface_method(CLOSURE_BSSRDF_RANDOM_WALK_ID);
break;
}
node = principled;
@@ -1360,10 +1355,11 @@ void BlenderSync::sync_materials(BL::Depsgraph &b_depsgraph, bool update_all)
void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d, bool update_all)
{
Background *background = scene->background;
+ Integrator *integrator = scene->integrator;
BL::World b_world = b_scene.world();
- BlenderViewportParameters new_viewport_parameters(b_v3d);
+ BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui);
if (world_recalc || update_all || b_world.ptr.data != world_map ||
viewport_parameters.shader_modified(new_viewport_parameters)) {
@@ -1455,9 +1451,8 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d,
/* AO */
BL::WorldLighting b_light = b_world.light_settings();
- background->set_use_ao(b_light.use_ambient_occlusion());
- background->set_ao_factor(b_light.ao_factor());
- background->set_ao_distance(b_light.distance());
+ integrator->set_ao_factor(b_light.ao_factor());
+ integrator->set_ao_distance(b_light.distance());
/* visibility */
PointerRNA cvisibility = RNA_pointer_get(&b_world.ptr, "cycles_visibility");
@@ -1472,9 +1467,8 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d,
background->set_visibility(visibility);
}
else {
- background->set_use_ao(false);
- background->set_ao_factor(0.0f);
- background->set_ao_distance(FLT_MAX);
+ integrator->set_ao_factor(1.0f);
+ integrator->set_ao_distance(10.0f);
}
shader->set_graph(graph);
@@ -1496,7 +1490,6 @@ void BlenderSync::sync_world(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d,
background->set_use_shader(view_layer.use_background_shader ||
viewport_parameters.use_custom_shader());
- background->set_use_ao(background->get_use_ao() && view_layer.use_background_ao);
background->tag_update(scene);
}
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 26d64b7bf85..d6fc7ee1723 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -53,6 +53,7 @@ BlenderSync::BlenderSync(BL::RenderEngine &b_engine,
BL::Scene &b_scene,
Scene *scene,
bool preview,
+ bool use_developer_ui,
Progress &progress)
: b_engine(b_engine),
b_data(b_data),
@@ -68,6 +69,7 @@ BlenderSync::BlenderSync(BL::RenderEngine &b_engine,
scene(scene),
preview(preview),
experimental(false),
+ use_developer_ui(use_developer_ui),
dicing_rate(1.0f),
max_subdivisions(12),
progress(progress),
@@ -224,7 +226,7 @@ void BlenderSync::sync_recalc(BL::Depsgraph &b_depsgraph, BL::SpaceView3D &b_v3d
}
if (b_v3d) {
- BlenderViewportParameters new_viewport_parameters(b_v3d);
+ BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui);
if (viewport_parameters.shader_modified(new_viewport_parameters)) {
world_recalc = true;
@@ -251,9 +253,13 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render,
BL::ViewLayer b_view_layer = b_depsgraph.view_layer_eval();
+  /* TODO(sergey): It feels weak to pass the view layer to the integrator, and even weaker to
+   * have an implicit check on whether it is a background render or not. What is the nicer thing
+   * here? */
+ const bool background = !b_v3d;
+
sync_view_layer(b_view_layer);
- sync_integrator();
- sync_film(b_v3d);
+ sync_integrator(b_view_layer, background);
+ sync_film(b_view_layer, b_v3d);
sync_shaders(b_depsgraph, b_v3d);
sync_images();
@@ -280,7 +286,7 @@ void BlenderSync::sync_data(BL::RenderSettings &b_render,
/* Integrator */
-void BlenderSync::sync_integrator()
+void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background)
{
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
@@ -328,59 +334,24 @@ void BlenderSync::sync_integrator()
integrator->set_motion_blur(view_layer.use_motion_blur);
}
- integrator->set_method((Integrator::Method)get_enum(
- cscene, "progressive", Integrator::NUM_METHODS, Integrator::PATH));
-
- integrator->set_sample_all_lights_direct(get_boolean(cscene, "sample_all_lights_direct"));
- integrator->set_sample_all_lights_indirect(get_boolean(cscene, "sample_all_lights_indirect"));
integrator->set_light_sampling_threshold(get_float(cscene, "light_sampling_threshold"));
SamplingPattern sampling_pattern = (SamplingPattern)get_enum(
cscene, "sampling_pattern", SAMPLING_NUM_PATTERNS, SAMPLING_PATTERN_SOBOL);
-
- int adaptive_min_samples = INT_MAX;
-
- if (RNA_boolean_get(&cscene, "use_adaptive_sampling")) {
- sampling_pattern = SAMPLING_PATTERN_PMJ;
- adaptive_min_samples = get_int(cscene, "adaptive_min_samples");
- integrator->set_adaptive_threshold(get_float(cscene, "adaptive_threshold"));
- }
- else {
- integrator->set_adaptive_threshold(0.0f);
- }
-
integrator->set_sampling_pattern(sampling_pattern);
- int diffuse_samples = get_int(cscene, "diffuse_samples");
- int glossy_samples = get_int(cscene, "glossy_samples");
- int transmission_samples = get_int(cscene, "transmission_samples");
- int ao_samples = get_int(cscene, "ao_samples");
- int mesh_light_samples = get_int(cscene, "mesh_light_samples");
- int subsurface_samples = get_int(cscene, "subsurface_samples");
- int volume_samples = get_int(cscene, "volume_samples");
-
- if (get_boolean(cscene, "use_square_samples")) {
- integrator->set_diffuse_samples(diffuse_samples * diffuse_samples);
- integrator->set_glossy_samples(glossy_samples * glossy_samples);
- integrator->set_transmission_samples(transmission_samples * transmission_samples);
- integrator->set_ao_samples(ao_samples * ao_samples);
- integrator->set_mesh_light_samples(mesh_light_samples * mesh_light_samples);
- integrator->set_subsurface_samples(subsurface_samples * subsurface_samples);
- integrator->set_volume_samples(volume_samples * volume_samples);
- adaptive_min_samples = min(adaptive_min_samples * adaptive_min_samples, INT_MAX);
+ if (preview) {
+ integrator->set_use_adaptive_sampling(
+ RNA_boolean_get(&cscene, "use_preview_adaptive_sampling"));
+ integrator->set_adaptive_threshold(get_float(cscene, "preview_adaptive_threshold"));
+ integrator->set_adaptive_min_samples(get_int(cscene, "preview_adaptive_min_samples"));
}
else {
- integrator->set_diffuse_samples(diffuse_samples);
- integrator->set_glossy_samples(glossy_samples);
- integrator->set_transmission_samples(transmission_samples);
- integrator->set_ao_samples(ao_samples);
- integrator->set_mesh_light_samples(mesh_light_samples);
- integrator->set_subsurface_samples(subsurface_samples);
- integrator->set_volume_samples(volume_samples);
+ integrator->set_use_adaptive_sampling(RNA_boolean_get(&cscene, "use_adaptive_sampling"));
+ integrator->set_adaptive_threshold(get_float(cscene, "adaptive_threshold"));
+ integrator->set_adaptive_min_samples(get_int(cscene, "adaptive_min_samples"));
}
- integrator->set_adaptive_min_samples(adaptive_min_samples);
-
if (get_boolean(cscene, "use_fast_gi")) {
if (preview) {
integrator->set_ao_bounces(get_int(cscene, "ao_bounces"));
@@ -393,20 +364,38 @@ void BlenderSync::sync_integrator()
integrator->set_ao_bounces(0);
}
- /* UPDATE_NONE as we don't want to tag the integrator as modified, just tag dependent things */
+ const DenoiseParams denoise_params = get_denoise_params(b_scene, b_view_layer, background);
+ integrator->set_use_denoise(denoise_params.use);
+
+  /* Only update denoiser parameters if the denoiser is actually used. This allows tweaking
+   * denoiser parameters before enabling it, without resetting the render on every change. The
+   * downside is that the interface and the integrator are technically out of sync. */
+ if (denoise_params.use) {
+ integrator->set_denoiser_type(denoise_params.type);
+ integrator->set_denoise_start_sample(denoise_params.start_sample);
+ integrator->set_use_denoise_pass_albedo(denoise_params.use_pass_albedo);
+ integrator->set_use_denoise_pass_normal(denoise_params.use_pass_normal);
+ integrator->set_denoiser_prefilter(denoise_params.prefilter);
+ }
+
+ /* UPDATE_NONE as we don't want to tag the integrator as modified (this was done by the
+ * set calls above), but we need to make sure that the dependent things are tagged. */
integrator->tag_update(scene, Integrator::UPDATE_NONE);
}
/* Film */
-void BlenderSync::sync_film(BL::SpaceView3D &b_v3d)
+void BlenderSync::sync_film(BL::ViewLayer &b_view_layer, BL::SpaceView3D &b_v3d)
{
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+ PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
Film *film = scene->film;
if (b_v3d) {
- film->set_display_pass(update_viewport_display_passes(b_v3d, scene->passes));
+ const BlenderViewportParameters new_viewport_parameters(b_v3d, use_developer_ui);
+ film->set_display_pass(new_viewport_parameters.display_pass);
+ film->set_show_active_pixels(new_viewport_parameters.show_active_pixels);
}
film->set_exposure(get_float(cscene, "film_exposure"));
@@ -434,6 +423,15 @@ void BlenderSync::sync_film(BL::SpaceView3D &b_v3d)
break;
}
}
+
+ /* Blender viewport does not support proper shadow catcher compositing, so force an approximate
+ * mode to improve visual feedback. */
+ if (b_v3d) {
+ film->set_use_approximate_shadow_catcher(true);
+ }
+ else {
+ film->set_use_approximate_shadow_catcher(!get_boolean(crl, "use_pass_shadow_catcher"));
+ }
}
/* Render Layer */
@@ -444,7 +442,6 @@ void BlenderSync::sync_view_layer(BL::ViewLayer &b_view_layer)
/* Filter. */
view_layer.use_background_shader = b_view_layer.use_sky();
- view_layer.use_background_ao = b_view_layer.use_ao();
/* Always enable surfaces for baking, otherwise there is nothing to bake to. */
view_layer.use_surfaces = b_view_layer.use_solid() || scene->bake_manager->get_baking();
view_layer.use_hair = b_view_layer.use_strand();
@@ -464,10 +461,7 @@ void BlenderSync::sync_view_layer(BL::ViewLayer &b_view_layer)
if (use_layer_samples != 2) {
int samples = b_view_layer.samples();
- if (get_boolean(cscene, "use_square_samples"))
- view_layer.samples = samples * samples;
- else
- view_layer.samples = samples;
+ view_layer.samples = samples;
}
}
@@ -499,7 +493,8 @@ void BlenderSync::sync_images()
}
/* Passes */
-PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
+
+static PassType get_blender_pass_type(BL::RenderPass &b_pass)
{
string name = b_pass.name();
#define MAP_PASS(passname, passtype) \
@@ -507,10 +502,15 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
return passtype; \
} \
((void)0)
+
/* NOTE: Keep in sync with defined names from DNA_scene_types.h */
+
MAP_PASS("Combined", PASS_COMBINED);
+ MAP_PASS("Noisy Image", PASS_COMBINED);
+
MAP_PASS("Depth", PASS_DEPTH);
MAP_PASS("Mist", PASS_MIST);
+ MAP_PASS("Position", PASS_POSITION);
MAP_PASS("Normal", PASS_NORMAL);
MAP_PASS("IndexOB", PASS_OBJECT_ID);
MAP_PASS("UV", PASS_UV);
@@ -539,118 +539,92 @@ PassType BlenderSync::get_pass_type(BL::RenderPass &b_pass)
MAP_PASS("BakePrimitive", PASS_BAKE_PRIMITIVE);
MAP_PASS("BakeDifferential", PASS_BAKE_DIFFERENTIAL);
+ MAP_PASS("Denoising Normal", PASS_DENOISING_NORMAL);
+ MAP_PASS("Denoising Albedo", PASS_DENOISING_ALBEDO);
+
+ MAP_PASS("Shadow Catcher", PASS_SHADOW_CATCHER);
+ MAP_PASS("Noisy Shadow Catcher", PASS_SHADOW_CATCHER);
+
MAP_PASS("Debug Render Time", PASS_RENDER_TIME);
+
MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER);
MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT);
+
if (string_startswith(name, cryptomatte_prefix)) {
return PASS_CRYPTOMATTE;
}
+
#undef MAP_PASS
return PASS_NONE;
}
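/* For reference: a single MAP_PASS("Depth", PASS_DEPTH); call above expands to the statement
 * below; the trailing ((void)0) exists so the macro consumes the caller's semicolon and
 * behaves as one statement. */
if (name == "Depth") {
  return PASS_DEPTH;
}
((void)0);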
-int BlenderSync::get_denoising_pass(BL::RenderPass &b_pass)
+static Pass *pass_add(Scene *scene,
+ PassType type,
+ const char *name,
+ PassMode mode = PassMode::DENOISED)
{
- string name = b_pass.name();
+ Pass *pass = scene->create_node<Pass>();
- if (name == "Noisy Image")
- return DENOISING_PASS_PREFILTERED_COLOR;
+ pass->set_type(type);
+ pass->set_name(ustring(name));
+ pass->set_mode(mode);
- if (name.substr(0, 10) != "Denoising ") {
- return -1;
- }
- name = name.substr(10);
-
-#define MAP_PASS(passname, offset) \
- if (name == passname) { \
- return offset; \
- } \
- ((void)0)
- MAP_PASS("Normal", DENOISING_PASS_PREFILTERED_NORMAL);
- MAP_PASS("Albedo", DENOISING_PASS_PREFILTERED_ALBEDO);
- MAP_PASS("Depth", DENOISING_PASS_PREFILTERED_DEPTH);
- MAP_PASS("Shadowing", DENOISING_PASS_PREFILTERED_SHADOWING);
- MAP_PASS("Variance", DENOISING_PASS_PREFILTERED_VARIANCE);
- MAP_PASS("Intensity", DENOISING_PASS_PREFILTERED_INTENSITY);
- MAP_PASS("Clean", DENOISING_PASS_CLEAN);
-#undef MAP_PASS
-
- return -1;
+ return pass;
}
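/* Usage sketch of the helper above, mirroring calls made later in this file: passes default
 * to the denoised result, and a noisy variant is requested explicitly. */
pass_add(scene, PASS_COMBINED, "Combined");
pass_add(scene, PASS_COMBINED, "Noisy Image", PassMode::NOISY);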
-vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
- BL::RenderLayer &b_rlay,
- BL::ViewLayer &b_view_layer,
- bool adaptive_sampling,
- const DenoiseParams &denoising)
+void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_view_layer)
{
- vector<Pass> passes;
+ PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+
+ /* Delete all existing passes. */
+ set<Pass *> clear_passes(scene->passes.begin(), scene->passes.end());
+ scene->delete_nodes(clear_passes);
- /* loop over passes */
+ /* Always add combined pass. */
+ pass_add(scene, PASS_COMBINED, "Combined");
+
+ /* Blender built-in data and light passes. */
for (BL::RenderPass &b_pass : b_rlay.passes) {
- PassType pass_type = get_pass_type(b_pass);
+ const PassType pass_type = get_blender_pass_type(b_pass);
+
+ if (pass_type == PASS_NONE) {
+ LOG(ERROR) << "Unknown pass " << b_pass.name();
+ continue;
+ }
if (pass_type == PASS_MOTION &&
(b_view_layer.use_motion_blur() && b_scene.render().use_motion_blur())) {
continue;
}
- if (pass_type != PASS_NONE)
- Pass::add(pass_type, passes, b_pass.name().c_str());
- }
-
- PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
- int denoising_flags = 0;
- if (denoising.use || denoising.store_passes) {
- if (denoising.type == DENOISER_NLM) {
-#define MAP_OPTION(name, flag) \
- if (!get_boolean(crl, name)) { \
- denoising_flags |= flag; \
- } \
- ((void)0)
- MAP_OPTION("denoising_diffuse_direct", DENOISING_CLEAN_DIFFUSE_DIR);
- MAP_OPTION("denoising_diffuse_indirect", DENOISING_CLEAN_DIFFUSE_IND);
- MAP_OPTION("denoising_glossy_direct", DENOISING_CLEAN_GLOSSY_DIR);
- MAP_OPTION("denoising_glossy_indirect", DENOISING_CLEAN_GLOSSY_IND);
- MAP_OPTION("denoising_transmission_direct", DENOISING_CLEAN_TRANSMISSION_DIR);
- MAP_OPTION("denoising_transmission_indirect", DENOISING_CLEAN_TRANSMISSION_IND);
-#undef MAP_OPTION
- }
- b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str());
+ pass_add(scene, pass_type, b_pass.name().c_str());
}
- scene->film->set_denoising_flags(denoising_flags);
-
- if (denoising.store_passes) {
- b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Depth", 1, "Z", b_view_layer.name().c_str());
- if (denoising.type == DENOISER_NLM) {
- b_engine.add_pass("Denoising Shadowing", 1, "X", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Variance", 3, "RGB", b_view_layer.name().c_str());
- b_engine.add_pass("Denoising Intensity", 1, "X", b_view_layer.name().c_str());
- }
- if (scene->film->get_denoising_flags() & DENOISING_CLEAN_ALL_PASSES) {
- b_engine.add_pass("Denoising Clean", 3, "RGB", b_view_layer.name().c_str());
- }
- }
+ PointerRNA crl = RNA_pointer_get(&b_view_layer.ptr, "cycles");
+ /* Debug passes. */
if (get_boolean(crl, "pass_debug_render_time")) {
b_engine.add_pass("Debug Render Time", 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_RENDER_TIME, passes, "Debug Render Time");
+ pass_add(scene, PASS_RENDER_TIME, "Debug Render Time");
}
if (get_boolean(crl, "pass_debug_sample_count")) {
b_engine.add_pass("Debug Sample Count", 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_SAMPLE_COUNT, passes, "Debug Sample Count");
+ pass_add(scene, PASS_SAMPLE_COUNT, "Debug Sample Count");
}
+
+ /* Cycles specific passes. */
if (get_boolean(crl, "use_pass_volume_direct")) {
b_engine.add_pass("VolumeDir", 3, "RGB", b_view_layer.name().c_str());
- Pass::add(PASS_VOLUME_DIRECT, passes, "VolumeDir");
+ pass_add(scene, PASS_VOLUME_DIRECT, "VolumeDir");
}
if (get_boolean(crl, "use_pass_volume_indirect")) {
b_engine.add_pass("VolumeInd", 3, "RGB", b_view_layer.name().c_str());
- Pass::add(PASS_VOLUME_INDIRECT, passes, "VolumeInd");
+ pass_add(scene, PASS_VOLUME_INDIRECT, "VolumeInd");
+ }
+ if (get_boolean(crl, "use_pass_shadow_catcher")) {
+ b_engine.add_pass("Shadow Catcher", 3, "RGB", b_view_layer.name().c_str());
+ pass_add(scene, PASS_SHADOW_CATCHER, "Shadow Catcher");
}
/* Cryptomatte stores two ID/weight pairs per RGBA layer.
@@ -662,7 +636,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
for (int i = 0; i < crypto_depth; i++) {
string passname = cryptomatte_prefix + string_printf("Object%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
- Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
+ pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
}
cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_OBJECT);
}
@@ -670,7 +644,7 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
for (int i = 0; i < crypto_depth; i++) {
string passname = cryptomatte_prefix + string_printf("Material%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
- Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
+ pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
}
cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_MATERIAL);
}
@@ -678,22 +652,33 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
for (int i = 0; i < crypto_depth; i++) {
string passname = cryptomatte_prefix + string_printf("Asset%02d", i);
b_engine.add_pass(passname.c_str(), 4, "RGBA", b_view_layer.name().c_str());
- Pass::add(PASS_CRYPTOMATTE, passes, passname.c_str());
+ pass_add(scene, PASS_CRYPTOMATTE, passname.c_str());
}
cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_ASSET);
}
- if (b_view_layer.use_pass_cryptomatte_accurate() && cryptomatte_passes != CRYPT_NONE) {
- cryptomatte_passes = (CryptomatteType)(cryptomatte_passes | CRYPT_ACCURATE);
- }
scene->film->set_cryptomatte_passes(cryptomatte_passes);
- if (adaptive_sampling) {
- Pass::add(PASS_ADAPTIVE_AUX_BUFFER, passes);
- if (!get_boolean(crl, "pass_debug_sample_count")) {
- Pass::add(PASS_SAMPLE_COUNT, passes);
+ /* Denoising passes. */
+ const bool use_denoising = get_boolean(cscene, "use_denoising") &&
+ get_boolean(crl, "use_denoising");
+ const bool store_denoising_passes = get_boolean(crl, "denoising_store_passes");
+ if (use_denoising) {
+ b_engine.add_pass("Noisy Image", 4, "RGBA", b_view_layer.name().c_str());
+ pass_add(scene, PASS_COMBINED, "Noisy Image", PassMode::NOISY);
+ if (get_boolean(crl, "use_pass_shadow_catcher")) {
+ b_engine.add_pass("Noisy Shadow Catcher", 3, "RGB", b_view_layer.name().c_str());
+ pass_add(scene, PASS_SHADOW_CATCHER, "Noisy Shadow Catcher", PassMode::NOISY);
}
}
+ if (store_denoising_passes) {
+ b_engine.add_pass("Denoising Normal", 3, "XYZ", b_view_layer.name().c_str());
+ pass_add(scene, PASS_DENOISING_NORMAL, "Denoising Normal", PassMode::NOISY);
+
+ b_engine.add_pass("Denoising Albedo", 3, "RGB", b_view_layer.name().c_str());
+ pass_add(scene, PASS_DENOISING_ALBEDO, "Denoising Albedo", PassMode::NOISY);
+ }
+ /* Custom AOV passes. */
BL::ViewLayer::aovs_iterator b_aov_iter;
for (b_view_layer.aovs.begin(b_aov_iter); b_aov_iter != b_view_layer.aovs.end(); ++b_aov_iter) {
BL::AOV b_aov(*b_aov_iter);
@@ -706,28 +691,15 @@ vector<Pass> BlenderSync::sync_render_passes(BL::Scene &b_scene,
if (is_color) {
b_engine.add_pass(name.c_str(), 4, "RGBA", b_view_layer.name().c_str());
- Pass::add(PASS_AOV_COLOR, passes, name.c_str());
+ pass_add(scene, PASS_AOV_COLOR, name.c_str());
}
else {
b_engine.add_pass(name.c_str(), 1, "X", b_view_layer.name().c_str());
- Pass::add(PASS_AOV_VALUE, passes, name.c_str());
+ pass_add(scene, PASS_AOV_VALUE, name.c_str());
}
}
- scene->film->set_denoising_data_pass(denoising.use || denoising.store_passes);
- scene->film->set_denoising_clean_pass(scene->film->get_denoising_flags() &
- DENOISING_CLEAN_ALL_PASSES);
- scene->film->set_denoising_prefiltered_pass(denoising.store_passes &&
- denoising.type == DENOISER_NLM);
scene->film->set_pass_alpha_threshold(b_view_layer.pass_alpha_threshold());
-
- if (!Pass::equals(passes, scene->passes)) {
- scene->film->tag_passes_update(scene, passes);
- scene->film->tag_modified();
- scene->integrator->tag_update(scene, Integrator::UPDATE_ALL);
- }
-
- return passes;
}
void BlenderSync::free_data_after_sync(BL::Depsgraph &b_depsgraph)
@@ -773,9 +745,9 @@ SceneParams BlenderSync::get_scene_params(BL::Scene &b_scene, bool background)
params.shadingsystem = SHADINGSYSTEM_OSL;
if (background || DebugFlags().viewport_static_bvh)
- params.bvh_type = SceneParams::BVH_STATIC;
+ params.bvh_type = BVH_TYPE_STATIC;
else
- params.bvh_type = SceneParams::BVH_DYNAMIC;
+ params.bvh_type = BVH_TYPE_DYNAMIC;
params.use_bvh_spatial_split = RNA_boolean_get(&cscene, "debug_use_spatial_splits");
params.use_bvh_unaligned_nodes = RNA_boolean_get(&cscene, "debug_use_hair_bvh");
@@ -818,8 +790,7 @@ bool BlenderSync::get_session_pause(BL::Scene &b_scene, bool background)
SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
BL::Preferences &b_preferences,
BL::Scene &b_scene,
- bool background,
- BL::ViewLayer b_view_layer)
+ bool background)
{
SessionParams params;
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
@@ -827,7 +798,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
/* feature set */
params.experimental = (get_enum(cscene, "feature_set") != 0);
- /* Background */
+ /* Headless and background rendering. */
+ params.headless = BlenderSession::headless;
params.background = background;
/* Device */
@@ -836,111 +808,26 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
/* samples */
int samples = get_int(cscene, "samples");
- int aa_samples = get_int(cscene, "aa_samples");
int preview_samples = get_int(cscene, "preview_samples");
- int preview_aa_samples = get_int(cscene, "preview_aa_samples");
- if (get_boolean(cscene, "use_square_samples")) {
- aa_samples = aa_samples * aa_samples;
- preview_aa_samples = preview_aa_samples * preview_aa_samples;
-
- samples = samples * samples;
- preview_samples = preview_samples * preview_samples;
- }
-
- if (get_enum(cscene, "progressive") == 0 && params.device.has_branched_path) {
- if (background) {
- params.samples = aa_samples;
- }
- else {
- params.samples = preview_aa_samples;
- if (params.samples == 0)
- params.samples = INT_MAX;
- }
+ if (background) {
+ params.samples = samples;
}
else {
- if (background) {
- params.samples = samples;
- }
- else {
- params.samples = preview_samples;
- if (params.samples == 0)
- params.samples = INT_MAX;
- }
+ params.samples = preview_samples;
+ if (params.samples == 0)
+ params.samples = INT_MAX;
}
/* Clamp samples. */
params.samples = min(params.samples, Integrator::MAX_SAMPLES);
- /* Adaptive sampling. */
- params.adaptive_sampling = RNA_boolean_get(&cscene, "use_adaptive_sampling");
-
- /* tiles */
- const bool is_cpu = (params.device.type == DEVICE_CPU);
- if (!is_cpu && !background) {
- /* currently GPU could be much slower than CPU when using tiles,
- * still need to be investigated, but meanwhile make it possible
- * to work in viewport smoothly
- */
- int debug_tile_size = get_int(cscene, "debug_tile_size");
-
- params.tile_size = make_int2(debug_tile_size, debug_tile_size);
- }
- else {
- int tile_x = b_engine.tile_x();
- int tile_y = b_engine.tile_y();
-
- params.tile_size = make_int2(tile_x, tile_y);
- }
-
- if ((BlenderSession::headless == false) && background) {
- params.tile_order = (TileOrder)get_enum(cscene, "tile_order");
- }
- else {
- params.tile_order = TILE_BOTTOM_TO_TOP;
- }
-
- /* Denoising */
- params.denoising = get_denoise_params(b_scene, b_view_layer, background);
-
- if (params.denoising.use) {
- /* Add additional denoising devices if we are rendering and denoising
- * with different devices. */
- params.device.add_denoising_devices(params.denoising.type);
-
- /* Check if denoiser is supported by device. */
- if (!(params.device.denoisers & params.denoising.type)) {
- params.denoising.use = false;
- }
- }
-
/* Viewport Performance */
- params.start_resolution = get_int(cscene, "preview_start_resolution");
params.pixel_size = b_engine.get_preview_pixel_size(b_scene);
- /* other parameters */
- params.cancel_timeout = (double)get_float(cscene, "debug_cancel_timeout");
- params.reset_timeout = (double)get_float(cscene, "debug_reset_timeout");
- params.text_timeout = (double)get_float(cscene, "debug_text_timeout");
-
- /* progressive refine */
- BL::RenderSettings b_r = b_scene.render();
- params.progressive_refine = b_engine.is_preview() ||
- get_boolean(cscene, "use_progressive_refine");
- if (b_r.use_save_buffers() || params.adaptive_sampling)
- params.progressive_refine = false;
-
if (background) {
- if (params.progressive_refine)
- params.progressive = true;
- else
- params.progressive = false;
-
- params.start_resolution = INT_MAX;
params.pixel_size = 1;
}
- else
- params.progressive = true;
/* shading system - scene level needs full refresh */
const bool shadingsystem = RNA_boolean_get(&cscene, "shading_system");
@@ -950,19 +837,30 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
else if (shadingsystem == 1)
params.shadingsystem = SHADINGSYSTEM_OSL;
- /* Color management. */
- params.display_buffer_linear = b_engine.support_display_space_shader(b_scene);
-
- if (b_engine.is_preview()) {
- /* For preview rendering we're using same timeout as
- * blender's job update.
- */
- params.progressive_update_timeout = 0.1;
+ /* Time limit. */
+ if (background) {
+ params.time_limit = get_float(cscene, "time_limit");
+ }
+ else {
+    /* For the viewport it makes more sense to think in terms of the noise floor, which is
+     * usually higher than the acceptable level for the final frame. */
+    /* TODO: It might be useful to support a time limit in the viewport as well, but that needs
+     * some extra thought and input. */
+ params.time_limit = 0.0;
}
+ /* Profiling. */
params.use_profiling = params.device.has_profiling && !b_engine.is_preview() && background &&
BlenderSession::print_render_stats;
+ if (background) {
+ params.use_auto_tile = RNA_boolean_get(&cscene, "use_auto_tile");
+ params.tile_size = get_int(cscene, "tile_size");
+ }
+ else {
+ params.use_auto_tile = false;
+ }
+
return params;
}
@@ -970,33 +868,34 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene,
BL::ViewLayer &b_view_layer,
bool background)
{
+ enum DenoiserInput {
+ DENOISER_INPUT_RGB = 1,
+ DENOISER_INPUT_RGB_ALBEDO = 2,
+ DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3,
+
+ DENOISER_INPUT_NUM,
+ };
+
DenoiseParams denoising;
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
+ int input_passes = -1;
+
if (background) {
/* Final Render Denoising */
denoising.use = get_boolean(cscene, "use_denoising");
denoising.type = (DenoiserType)get_enum(cscene, "denoiser", DENOISER_NUM, DENOISER_NONE);
+ denoising.prefilter = (DenoiserPrefilter)get_enum(
+ cscene, "denoising_prefilter", DENOISER_PREFILTER_NUM, DENOISER_PREFILTER_NONE);
+
+ input_passes = (DenoiserInput)get_enum(
+ cscene, "denoising_input_passes", DENOISER_INPUT_NUM, DENOISER_INPUT_RGB_ALBEDO_NORMAL);
if (b_view_layer) {
PointerRNA clayer = RNA_pointer_get(&b_view_layer.ptr, "cycles");
if (!get_boolean(clayer, "use_denoising")) {
denoising.use = false;
}
-
- denoising.radius = get_int(clayer, "denoising_radius");
- denoising.strength = get_float(clayer, "denoising_strength");
- denoising.feature_strength = get_float(clayer, "denoising_feature_strength");
- denoising.relative_pca = get_boolean(clayer, "denoising_relative_pca");
-
- denoising.input_passes = (DenoiserInput)get_enum(
- clayer,
- (denoising.type == DENOISER_OPTIX) ? "denoising_optix_input_passes" :
- "denoising_openimagedenoise_input_passes",
- DENOISER_INPUT_NUM,
- DENOISER_INPUT_RGB_ALBEDO_NORMAL);
-
- denoising.store_passes = get_boolean(clayer, "denoising_store_passes");
}
}
else {
@@ -1004,10 +903,12 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene,
denoising.use = get_boolean(cscene, "use_preview_denoising");
denoising.type = (DenoiserType)get_enum(
cscene, "preview_denoiser", DENOISER_NUM, DENOISER_NONE);
+ denoising.prefilter = (DenoiserPrefilter)get_enum(
+ cscene, "preview_denoising_prefilter", DENOISER_PREFILTER_NUM, DENOISER_PREFILTER_FAST);
denoising.start_sample = get_int(cscene, "preview_denoising_start_sample");
- denoising.input_passes = (DenoiserInput)get_enum(
- cscene, "preview_denoising_input_passes", DENOISER_INPUT_NUM, (int)denoising.input_passes);
+ input_passes = (DenoiserInput)get_enum(
+ cscene, "preview_denoising_input_passes", DENOISER_INPUT_NUM, DENOISER_INPUT_RGB_ALBEDO);
/* Auto select fastest denoiser. */
if (denoising.type == DENOISER_NONE) {
@@ -1023,6 +924,27 @@ DenoiseParams BlenderSync::get_denoise_params(BL::Scene &b_scene,
}
}
+ switch (input_passes) {
+ case DENOISER_INPUT_RGB:
+ denoising.use_pass_albedo = false;
+ denoising.use_pass_normal = false;
+ break;
+
+ case DENOISER_INPUT_RGB_ALBEDO:
+ denoising.use_pass_albedo = true;
+ denoising.use_pass_normal = false;
+ break;
+
+ case DENOISER_INPUT_RGB_ALBEDO_NORMAL:
+ denoising.use_pass_albedo = true;
+ denoising.use_pass_normal = true;
+ break;
+
+ default:
+ LOG(ERROR) << "Unhandled input passes enum " << input_passes;
+ break;
+ }
+
return denoising;
}
diff --git a/intern/cycles/blender/blender_sync.h b/intern/cycles/blender/blender_sync.h
index d25c0ce1bc3..786479ac0f8 100644
--- a/intern/cycles/blender/blender_sync.h
+++ b/intern/cycles/blender/blender_sync.h
@@ -60,6 +60,7 @@ class BlenderSync {
BL::Scene &b_scene,
Scene *scene,
bool preview,
+ bool use_developer_ui,
Progress &progress);
~BlenderSync();
@@ -75,12 +76,8 @@ class BlenderSync {
int height,
void **python_thread_state);
void sync_view_layer(BL::ViewLayer &b_view_layer);
- vector<Pass> sync_render_passes(BL::Scene &b_scene,
- BL::RenderLayer &b_render_layer,
- BL::ViewLayer &b_view_layer,
- bool adaptive_sampling,
- const DenoiseParams &denoising);
- void sync_integrator();
+ void sync_render_passes(BL::RenderLayer &b_render_layer, BL::ViewLayer &b_view_layer);
+ void sync_integrator(BL::ViewLayer &b_view_layer, bool background);
void sync_camera(BL::RenderSettings &b_render,
BL::Object &b_override,
int width,
@@ -98,22 +95,13 @@ class BlenderSync {
/* get parameters */
static SceneParams get_scene_params(BL::Scene &b_scene, bool background);
- static SessionParams get_session_params(
- BL::RenderEngine &b_engine,
- BL::Preferences &b_userpref,
- BL::Scene &b_scene,
- bool background,
- BL::ViewLayer b_view_layer = BL::ViewLayer(PointerRNA_NULL));
+ static SessionParams get_session_params(BL::RenderEngine &b_engine,
+ BL::Preferences &b_userpref,
+ BL::Scene &b_scene,
+ bool background);
static bool get_session_pause(BL::Scene &b_scene, bool background);
- static BufferParams get_buffer_params(BL::SpaceView3D &b_v3d,
- BL::RegionView3D &b_rv3d,
- Camera *cam,
- int width,
- int height,
- const bool use_denoiser);
-
- static PassType get_pass_type(BL::RenderPass &b_pass);
- static int get_denoising_pass(BL::RenderPass &b_pass);
+ static BufferParams get_buffer_params(
+ BL::SpaceView3D &b_v3d, BL::RegionView3D &b_rv3d, Camera *cam, int width, int height);
private:
static DenoiseParams get_denoise_params(BL::Scene &b_scene,
@@ -131,7 +119,7 @@ class BlenderSync {
int width,
int height,
void **python_thread_state);
- void sync_film(BL::SpaceView3D &b_v3d);
+ void sync_film(BL::ViewLayer &b_view_layer, BL::SpaceView3D &b_v3d);
void sync_view();
/* Shader */
@@ -245,6 +233,7 @@ class BlenderSync {
Scene *scene;
bool preview;
bool experimental;
+ bool use_developer_ui;
float dicing_rate;
int max_subdivisions;
@@ -253,7 +242,6 @@ class BlenderSync {
RenderLayerInfo()
: material_override(PointerRNA_NULL),
use_background_shader(true),
- use_background_ao(true),
use_surfaces(true),
use_hair(true),
use_volumes(true),
@@ -266,7 +254,6 @@ class BlenderSync {
string name;
BL::Material material_override;
bool use_background_shader;
- bool use_background_ao;
bool use_surfaces;
bool use_hair;
bool use_volumes;
diff --git a/intern/cycles/blender/blender_viewport.cpp b/intern/cycles/blender/blender_viewport.cpp
index 18bdfc74de0..62e32240bba 100644
--- a/intern/cycles/blender/blender_viewport.cpp
+++ b/intern/cycles/blender/blender_viewport.cpp
@@ -17,6 +17,8 @@
#include "blender_viewport.h"
#include "blender_util.h"
+#include "render/pass.h"
+#include "util/util_logging.h"
CCL_NAMESPACE_BEGIN
@@ -26,11 +28,12 @@ BlenderViewportParameters::BlenderViewportParameters()
studiolight_rotate_z(0.0f),
studiolight_intensity(1.0f),
studiolight_background_alpha(1.0f),
- display_pass(PASS_COMBINED)
+ display_pass(PASS_COMBINED),
+ show_active_pixels(false)
{
}
-BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d)
+BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d, bool use_developer_ui)
: BlenderViewportParameters()
{
if (!b_v3d) {
@@ -55,7 +58,25 @@ BlenderViewportParameters::BlenderViewportParameters(BL::SpaceView3D &b_v3d)
}
/* Film. */
- display_pass = (PassType)get_enum(cshading, "render_pass", -1, -1);
+
+ /* Look up the display pass based on the enum identifier. This is needed because the integer
+ * values of the Python enum are not aligned with the pass definitions in the kernel. */
+
+ display_pass = PASS_COMBINED;
+
+ const string display_pass_identifier = get_enum_identifier(cshading, "render_pass");
+ if (!display_pass_identifier.empty()) {
+ const ustring pass_type_identifier(string_to_lower(display_pass_identifier));
+ const NodeEnum *pass_type_enum = Pass::get_type_enum();
+ if (pass_type_enum->exists(pass_type_identifier)) {
+ display_pass = static_cast<PassType>((*pass_type_enum)[pass_type_identifier]);
+ }
+ }
+
+ if (use_developer_ui) {
+ show_active_pixels = get_boolean(cshading, "show_active_pixels");
+ }
}
bool BlenderViewportParameters::shader_modified(const BlenderViewportParameters &other) const
@@ -69,7 +90,7 @@ bool BlenderViewportParameters::shader_modified(const BlenderViewportParameters
bool BlenderViewportParameters::film_modified(const BlenderViewportParameters &other) const
{
- return display_pass != other.display_pass;
+ return display_pass != other.display_pass || show_active_pixels != other.show_active_pixels;
}
bool BlenderViewportParameters::modified(const BlenderViewportParameters &other) const
@@ -82,18 +103,4 @@ bool BlenderViewportParameters::use_custom_shader() const
return !(use_scene_world && use_scene_lights);
}
-PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes)
-{
- if (b_v3d) {
- const BlenderViewportParameters viewport_parameters(b_v3d);
- const PassType display_pass = viewport_parameters.display_pass;
-
- passes.clear();
- Pass::add(display_pass, passes);
-
- return display_pass;
- }
- return PASS_NONE;
-}
-
CCL_NAMESPACE_END
diff --git a/intern/cycles/blender/blender_viewport.h b/intern/cycles/blender/blender_viewport.h
index d6518597053..b5adafc30c9 100644
--- a/intern/cycles/blender/blender_viewport.h
+++ b/intern/cycles/blender/blender_viewport.h
@@ -39,9 +39,10 @@ class BlenderViewportParameters {
/* Film. */
PassType display_pass;
+ bool show_active_pixels;
BlenderViewportParameters();
- explicit BlenderViewportParameters(BL::SpaceView3D &b_v3d);
+ BlenderViewportParameters(BL::SpaceView3D &b_v3d, bool use_developer_ui);
/* Check whether any of shading related settings are different from the given parameters. */
bool shader_modified(const BlenderViewportParameters &other) const;
@@ -57,8 +58,6 @@ class BlenderViewportParameters {
bool use_custom_shader() const;
};
-PassType update_viewport_display_passes(BL::SpaceView3D &b_v3d, vector<Pass> &passes);
-
CCL_NAMESPACE_END
#endif
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 048c2b95e40..d3497f3a8d8 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -832,18 +832,18 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
typedef StackAllocator<256, float2> LeafTimeStackAllocator;
typedef StackAllocator<256, BVHReference> LeafReferenceStackAllocator;
- vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM_TOTAL];
- vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM_TOTAL];
- vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM_TOTAL];
- vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM_TOTAL];
- vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM_TOTAL];
+ vector<int, LeafStackAllocator> p_type[PRIMITIVE_NUM];
+ vector<int, LeafStackAllocator> p_index[PRIMITIVE_NUM];
+ vector<int, LeafStackAllocator> p_object[PRIMITIVE_NUM];
+ vector<float2, LeafTimeStackAllocator> p_time[PRIMITIVE_NUM];
+ vector<BVHReference, LeafReferenceStackAllocator> p_ref[PRIMITIVE_NUM];
/* TODO(sergey): In theory we should be able to store references. */
vector<BVHReference, LeafReferenceStackAllocator> object_references;
- uint visibility[PRIMITIVE_NUM_TOTAL] = {0};
+ uint visibility[PRIMITIVE_NUM] = {0};
/* NOTE: Keep initialization in sync with actual number of primitives. */
- BoundBox bounds[PRIMITIVE_NUM_TOTAL] = {
+ BoundBox bounds[PRIMITIVE_NUM] = {
BoundBox::empty, BoundBox::empty, BoundBox::empty, BoundBox::empty};
int ob_num = 0;
int num_new_prims = 0;
@@ -877,7 +877,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
* TODO(sergey): With some pointer trickery we can write directly to the
* destination buffers for the non-spatial split BVH.
*/
- BVHNode *leaves[PRIMITIVE_NUM_TOTAL + 1] = {NULL};
+ BVHNode *leaves[PRIMITIVE_NUM + 1] = {NULL};
int num_leaves = 0;
size_t start_index = 0;
vector<int, LeafStackAllocator> local_prim_type, local_prim_index, local_prim_object;
@@ -888,7 +888,7 @@ BVHNode *BVHBuild::create_leaf_node(const BVHRange &range, const vector<BVHRefer
if (need_prim_time) {
local_prim_time.resize(num_new_prims);
}
- for (int i = 0; i < PRIMITIVE_NUM_TOTAL; ++i) {
+ for (int i = 0; i < PRIMITIVE_NUM; ++i) {
int num = (int)p_type[i].size();
if (num != 0) {
assert(p_type[i].size() == p_index[i].size());
diff --git a/intern/cycles/bvh/bvh_embree.cpp b/intern/cycles/bvh/bvh_embree.cpp
index 62f543941a9..96852510b63 100644
--- a/intern/cycles/bvh/bvh_embree.cpp
+++ b/intern/cycles/bvh/bvh_embree.cpp
@@ -37,10 +37,10 @@
/* Kernel includes are necessary so that the filter function for Embree can access the packed BVH.
*/
# include "kernel/bvh/bvh_embree.h"
-# include "kernel/kernel_compat_cpu.h"
-# include "kernel/kernel_globals.h"
+# include "kernel/bvh/bvh_util.h"
+# include "kernel/device/cpu/compat.h"
+# include "kernel/device/cpu/globals.h"
# include "kernel/kernel_random.h"
-# include "kernel/split/kernel_split_data_types.h"
# include "render/hair.h"
# include "render/mesh.h"
@@ -73,46 +73,69 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
const RTCRay *ray = (RTCRay *)args->ray;
RTCHit *hit = (RTCHit *)args->hit;
CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
- KernelGlobals *kg = ctx->kg;
+ const KernelGlobals *kg = ctx->kg;
switch (ctx->type) {
case CCLIntersectContext::RAY_SHADOW_ALL: {
- /* Append the intersection to the end of the array. */
- if (ctx->num_hits < ctx->max_hits) {
- Intersection current_isect;
- kernel_embree_convert_hit(kg, ray, hit, &current_isect);
- for (size_t i = 0; i < ctx->max_hits; ++i) {
+ Intersection current_isect;
+ kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+
+ /* If no transparent shadows, all light is blocked. */
+ const int flags = intersection_get_shader_flags(kg, &current_isect);
+ if (!(flags & (SD_HAS_TRANSPARENT_SHADOW)) || ctx->max_hits == 0) {
+ ctx->opaque_hit = true;
+ return;
+ }
+
+ /* Test if we need to record this transparent intersection. */
+ if (ctx->num_hits < ctx->max_hits || ray->tfar < ctx->max_t) {
+ /* Skip already recorded intersections. */
+ int num_recorded_hits = min(ctx->num_hits, ctx->max_hits);
+
+ for (int i = 0; i < num_recorded_hits; ++i) {
if (current_isect.object == ctx->isect_s[i].object &&
current_isect.prim == ctx->isect_s[i].prim && current_isect.t == ctx->isect_s[i].t) {
/* This intersection was already recorded, skip it. */
*args->valid = 0;
- break;
+ return;
}
}
- Intersection *isect = &ctx->isect_s[ctx->num_hits];
- ++ctx->num_hits;
- *isect = current_isect;
- int prim = kernel_tex_fetch(__prim_index, isect->prim);
- int shader = 0;
- if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE) {
- shader = kernel_tex_fetch(__tri_shader, prim);
- }
- else {
- float4 str = kernel_tex_fetch(__curves, prim);
- shader = __float_as_int(str.z);
- }
- int flag = kernel_tex_fetch(__shaders, shader & SHADER_MASK).flags;
- /* If no transparent shadows, all light is blocked. */
- if (flag & (SD_HAS_TRANSPARENT_SHADOW)) {
- /* This tells Embree to continue tracing. */
- *args->valid = 0;
+
+ /* If the maximum number of hits was reached, replace the recorded intersection with the
+ * highest distance, since we want to keep the N closest intersections. */
+ int isect_index = num_recorded_hits;
+ if (num_recorded_hits + 1 >= ctx->max_hits) {
+ float max_t = ctx->isect_s[0].t;
+ int max_recorded_hit = 0;
+
+ for (int i = 1; i < num_recorded_hits; ++i) {
+ if (ctx->isect_s[i].t > max_t) {
+ max_recorded_hit = i;
+ max_t = ctx->isect_s[i].t;
+ }
+ }
+
+ if (num_recorded_hits >= ctx->max_hits) {
+ isect_index = max_recorded_hit;
+ }
+
+ /* Limit the ray distance and stop counting hits beyond this.
+ * TODO: is there some way we can tell Embree to stop intersecting beyond
+ * this distance when the max number of hits is reached? Or maybe it will
+ * become irrelevant if we make max_hits a very high number on the CPU. */
+ ctx->max_t = max(current_isect.t, max_t);
}
+
+ ctx->isect_s[isect_index] = current_isect;
}
- else {
- /* Increase the number of hits beyond ray.max_hits
- * so that the caller can detect this as opaque. */
- ++ctx->num_hits;
- }
+
+ /* Always increase the number of hits, even beyond ray.max_hits so that
+ * the caller can detect this and consider it opaque, or trace another
+ * ray. */
+ ++ctx->num_hits;
+
+ /* This tells Embree to continue tracing. */
+ *args->valid = 0;
break;
}
case CCLIntersectContext::RAY_LOCAL:
@@ -329,7 +352,7 @@ void BVHEmbree::build(Progress &progress, Stats *stats, RTCDevice rtc_device_)
scene = NULL;
}
- const bool dynamic = params.bvh_type == SceneParams::BVH_DYNAMIC;
+ const bool dynamic = params.bvh_type == BVH_TYPE_DYNAMIC;
scene = rtcNewScene(rtc_device);
const RTCSceneFlags scene_flags = (dynamic ? RTC_SCENE_FLAG_DYNAMIC : RTC_SCENE_FLAG_NONE) |
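
The reworked occlusion filter above records the N closest transparent hits: once the hit array is full, the recorded hit with the largest distance is evicted, and the ray distance is clamped so farther hits stop counting. A standalone sketch of that eviction policy, with illustrative names rather than code from this patch:

/* Sketch: keep the N closest hit distances seen so far (hypothetical helper). */
static void record_closest_hit(const float t, float *recorded, int &num_recorded, const int max_hits)
{
  if (num_recorded < max_hits) {
    /* The array is not full yet, simply append. */
    recorded[num_recorded++] = t;
    return;
  }
  /* Find the recorded hit with the largest distance. */
  int max_index = 0;
  for (int i = 1; i < max_hits; ++i) {
    if (recorded[i] > recorded[max_index]) {
      max_index = i;
    }
  }
  /* Replace it only if the new hit is closer. */
  if (t < recorded[max_index]) {
    recorded[max_index] = t;
  }
}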
diff --git a/intern/cycles/bvh/bvh_params.h b/intern/cycles/bvh/bvh_params.h
index 2dc10f30363..31b3971c110 100644
--- a/intern/cycles/bvh/bvh_params.h
+++ b/intern/cycles/bvh/bvh_params.h
@@ -31,6 +31,27 @@ CCL_NAMESPACE_BEGIN
*/
typedef KernelBVHLayout BVHLayout;
+/* Type of BVH, in terms of whether it supports dynamic updates of meshes
+ * or whether modifying geometry requires a full BVH rebuild.
+ */
+enum BVHType {
+ /* BVH supports dynamic updates of geometry.
+ *
+ * Faster for updating the BVH tree when doing modifications in the viewport,
+ * but slower for rendering.
+ */
+ BVH_TYPE_DYNAMIC = 0,
+ /* BVH tree is calculated for a specific scene; updates to geometry
+ * require a full tree rebuild.
+ *
+ * Slower to update the BVH tree when modifying objects in the viewport, and also
+ * slower to build the final BVH tree, but gives the best possible render speed.
+ */
+ BVH_TYPE_STATIC = 1,
+
+ BVH_NUM_TYPES,
+};
+
/* Names bitflag type to denote which BVH layouts are supported by
* particular area.
*
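
The new BVHType enum trades update speed against trace speed. A hedged sketch of how a caller might choose between the two, with a hypothetical helper name:

/* Sketch: pick a BVH type from the usage context (helper name illustrative). */
static BVHType choose_bvh_type(const bool interactive_viewport)
{
  /* Dynamic BVHs refit quickly after geometry edits; static BVHs trace faster. */
  return interactive_viewport ? BVH_TYPE_DYNAMIC : BVH_TYPE_STATIC;
}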
diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake
index 04ff598621a..da259171844 100644
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -287,9 +287,6 @@ if(CYCLES_STANDALONE_REPOSITORY)
endif()
set(__boost_packages filesystem regex system thread date_time)
- if(WITH_CYCLES_NETWORK)
- list(APPEND __boost_packages serialization)
- endif()
if(WITH_CYCLES_OSL)
list(APPEND __boost_packages wave)
endif()
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index 928249931a3..d18f4360aef 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -36,49 +36,70 @@ endif()
set(SRC
device.cpp
- device_cpu.cpp
- device_cuda.cpp
- device_denoising.cpp
- device_dummy.cpp
+ device_denoise.cpp
+ device_graphics_interop.cpp
+ device_kernel.cpp
device_memory.cpp
- device_multi.cpp
- device_opencl.cpp
- device_optix.cpp
- device_split_kernel.cpp
- device_task.cpp
+ device_queue.cpp
+)
+
+set(SRC_CPU
+ cpu/device.cpp
+ cpu/device.h
+ cpu/device_impl.cpp
+ cpu/device_impl.h
+ cpu/kernel.cpp
+ cpu/kernel.h
+ cpu/kernel_function.h
+ cpu/kernel_thread_globals.cpp
+ cpu/kernel_thread_globals.h
)
set(SRC_CUDA
- cuda/device_cuda.h
- cuda/device_cuda_impl.cpp
+ cuda/device.cpp
+ cuda/device.h
+ cuda/device_impl.cpp
+ cuda/device_impl.h
+ cuda/graphics_interop.cpp
+ cuda/graphics_interop.h
+ cuda/kernel.cpp
+ cuda/kernel.h
+ cuda/queue.cpp
+ cuda/queue.h
+ cuda/util.cpp
+ cuda/util.h
)
-set(SRC_OPENCL
- opencl/device_opencl.h
- opencl/device_opencl_impl.cpp
- opencl/memory_manager.h
- opencl/memory_manager.cpp
- opencl/opencl_util.cpp
+set(SRC_DUMMY
+ dummy/device.cpp
+ dummy/device.h
)
-if(WITH_CYCLES_NETWORK)
- list(APPEND SRC
- device_network.cpp
- )
-endif()
+set(SRC_MULTI
+ multi/device.cpp
+ multi/device.h
+)
+
+set(SRC_OPTIX
+ optix/device.cpp
+ optix/device.h
+ optix/device_impl.cpp
+ optix/device_impl.h
+ optix/queue.cpp
+ optix/queue.h
+ optix/util.h
+)
set(SRC_HEADERS
device.h
- device_denoising.h
+ device_denoise.h
+ device_graphics_interop.h
device_memory.h
- device_intern.h
- device_network.h
- device_split_kernel.h
- device_task.h
+ device_kernel.h
+ device_queue.h
)
set(LIB
- cycles_render
cycles_kernel
cycles_util
${CYCLES_GL_LIBRARIES}
@@ -95,15 +116,7 @@ else()
endif()
add_definitions(${GL_DEFINITIONS})
-if(WITH_CYCLES_NETWORK)
- add_definitions(-DWITH_NETWORK)
-endif()
-if(WITH_CYCLES_DEVICE_OPENCL)
- list(APPEND LIB
- extern_clew
- )
- add_definitions(-DWITH_OPENCL)
-endif()
+
if(WITH_CYCLES_DEVICE_CUDA)
add_definitions(-DWITH_CUDA)
endif()
@@ -115,18 +128,27 @@ if(WITH_CYCLES_DEVICE_MULTI)
endif()
if(WITH_OPENIMAGEDENOISE)
- add_definitions(-DWITH_OPENIMAGEDENOISE)
- add_definitions(-DOIDN_STATIC_LIB)
- list(APPEND INC_SYS
- ${OPENIMAGEDENOISE_INCLUDE_DIRS}
- )
list(APPEND LIB
${OPENIMAGEDENOISE_LIBRARIES}
- ${TBB_LIBRARIES}
)
endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
-cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CUDA} ${SRC_OPENCL} ${SRC_HEADERS})
+cycles_add_library(cycles_device "${LIB}"
+ ${SRC}
+ ${SRC_CPU}
+ ${SRC_CUDA}
+ ${SRC_DUMMY}
+ ${SRC_MULTI}
+ ${SRC_OPTIX}
+ ${SRC_HEADERS}
+)
+
+source_group("cpu" FILES ${SRC_CPU})
+source_group("cuda" FILES ${SRC_CUDA})
+source_group("dummy" FILES ${SRC_DUMMY})
+source_group("multi" FILES ${SRC_MULTI})
+source_group("optix" FILES ${SRC_OPTIX})
+source_group("common" FILES ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/device/cpu/device.cpp b/intern/cycles/device/cpu/device.cpp
new file mode 100644
index 00000000000..68ca8e8bb22
--- /dev/null
+++ b/intern/cycles/device/cpu/device.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/device.h"
+#include "device/cpu/device_impl.h"
+
+/* Used for `info.denoisers`. */
+/* TODO(sergey): The denoisers should probably be moved completely out of the device into their
+ * own class. But until then, keep the API consistent with how it worked before. */
+#include "util/util_openimagedenoise.h"
+
+CCL_NAMESPACE_BEGIN
+
+Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+ return new CPUDevice(info, stats, profiler);
+}
+
+void device_cpu_info(vector<DeviceInfo> &devices)
+{
+ DeviceInfo info;
+
+ info.type = DEVICE_CPU;
+ info.description = system_cpu_brand_string();
+ info.id = "CPU";
+ info.num = 0;
+ info.has_osl = true;
+ info.has_half_images = true;
+ info.has_nanovdb = true;
+ info.has_profiling = true;
+ if (openimagedenoise_supported()) {
+ info.denoisers |= DENOISER_OPENIMAGEDENOISE;
+ }
+
+ devices.insert(devices.begin(), info);
+}
+
+string device_cpu_capabilities()
+{
+ string capabilities = "";
+ capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
+ capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
+ capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
+ capabilities += system_cpu_support_avx() ? "AVX " : "";
+ capabilities += system_cpu_support_avx2() ? "AVX2" : "";
+ /* Guard against indexing an empty string when no SIMD features are supported. */
+ if (!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
+ capabilities.resize(capabilities.size() - 1);
+ return capabilities;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl b/intern/cycles/device/cpu/device.h
index dcea2630aef..9cb2e80068d 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_buffer_update.cl
+++ b/intern/cycles/device/cpu/device.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,13 +14,22 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_buffer_update.h"
+#pragma once
-#define KERNEL_NAME buffer_update
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
+#include "util/util_string.h"
+#include "util/util_vector.h"
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_cpu_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_cpu_info(vector<DeviceInfo> &devices);
+
+string device_cpu_capabilities();
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
new file mode 100644
index 00000000000..3b0db6bdd0e
--- /dev/null
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -0,0 +1,481 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/device_impl.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+/* So ImathMath is included before our kernel_cpu_compat. */
+#ifdef WITH_OSL
+/* So no context pollution happens from indirectly included windows.h */
+# include "util/util_windows.h"
+# include <OSL/oslexec.h>
+#endif
+
+#ifdef WITH_EMBREE
+# include <embree3/rtcore.h>
+#endif
+
+#include "device/cpu/kernel.h"
+#include "device/cpu/kernel_thread_globals.h"
+
+#include "device/device.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/device/cpu/kernel.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+#include "bvh/bvh_embree.h"
+
+#include "render/buffers.h"
+
+#include "util/util_debug.h"
+#include "util/util_foreach.h"
+#include "util/util_function.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_opengl.h"
+#include "util/util_openimagedenoise.h"
+#include "util/util_optimization.h"
+#include "util/util_progress.h"
+#include "util/util_system.h"
+#include "util/util_task.h"
+#include "util/util_thread.h"
+
+CCL_NAMESPACE_BEGIN
+
+CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : Device(info_, stats_, profiler_), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+ /* Pick any kernel, all of them are supposed to have the same level of
+ * microarchitecture optimization. */
+ VLOG(1) << "Will be using " << kernels.integrator_init_from_camera.get_uarch_name()
+ << " kernels.";
+
+ if (info.cpu_threads == 0) {
+ info.cpu_threads = TaskScheduler::num_threads();
+ }
+
+#ifdef WITH_OSL
+ kernel_globals.osl = &osl_globals;
+#endif
+#ifdef WITH_EMBREE
+ embree_device = rtcNewDevice("verbose=0");
+#endif
+ need_texture_info = false;
+}
+
+CPUDevice::~CPUDevice()
+{
+#ifdef WITH_EMBREE
+ rtcReleaseDevice(embree_device);
+#endif
+
+ texture_info.free();
+}
+
+bool CPUDevice::show_samples() const
+{
+ return (info.cpu_threads == 1);
+}
+
+BVHLayoutMask CPUDevice::get_bvh_layout_mask() const
+{
+ BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
+#ifdef WITH_EMBREE
+ bvh_layout_mask |= BVH_LAYOUT_EMBREE;
+#endif /* WITH_EMBREE */
+ return bvh_layout_mask;
+}
+
+bool CPUDevice::load_texture_info()
+{
+ if (!need_texture_info) {
+ return false;
+ }
+
+ texture_info.copy_to_device();
+ need_texture_info = false;
+
+ return true;
+}
+
+void CPUDevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
+ else {
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
+ if (mem.type == MEM_DEVICE_ONLY) {
+ assert(!mem.host_pointer);
+ size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
+ void *data = util_aligned_malloc(mem.memory_size(), alignment);
+ mem.device_pointer = (device_ptr)data;
+ }
+ else {
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ }
+
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+ }
+}
+
+void CPUDevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ /* copy is no-op */
+ }
+}
+
+void CPUDevice::mem_copy_from(
+ device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/)
+{
+ /* no-op */
+}
+
+void CPUDevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ if (mem.device_pointer) {
+ memset((void *)mem.device_pointer, 0, mem.memory_size());
+ }
+}
+
+void CPUDevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else if (mem.device_pointer) {
+ if (mem.type == MEM_DEVICE_ONLY) {
+ util_aligned_free((void *)mem.device_pointer);
+ }
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+}
+
+device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+{
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+}
+
+void CPUDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+#ifdef WITH_EMBREE
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+ // Update scene handle (since it is different for each device in a multi-device setup).
+ KernelData *const data = (KernelData *)host;
+ data->bvh.scene = embree_scene;
+ }
+#endif
+ kernel_const_copy(&kernel_globals, name, host, size);
+}
+
+void CPUDevice::global_alloc(device_memory &mem)
+{
+ VLOG(1) << "Global memory allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
+
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+}
+
+void CPUDevice::global_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+}
+
+void CPUDevice::tex_alloc(device_texture &mem)
+{
+ VLOG(1) << "Texture allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+ /* Allocate some slots in advance, to reduce the number of re-allocations. */
+ texture_info.resize(slot + 128);
+ }
+
+ texture_info[slot] = mem.info;
+ texture_info[slot].data = (uint64_t)mem.host_pointer;
+ need_texture_info = true;
+}
+
+void CPUDevice::tex_free(device_texture &mem)
+{
+ if (mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ need_texture_info = true;
+ }
+}
+
+void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+#ifdef WITH_EMBREE
+ if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
+ bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
+ BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
+ if (refit) {
+ bvh_embree->refit(progress);
+ }
+ else {
+ bvh_embree->build(progress, &stats, embree_device);
+ }
+
+ if (bvh->params.top_level) {
+ embree_scene = bvh_embree->scene;
+ }
+ }
+ else
+#endif
+ Device::build_bvh(bvh, progress, refit);
+}
+
+#if 0
+void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
+{
+ const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
+
+ scoped_timer timer(&tile.buffers->render_time);
+
+ Coverage coverage(kg, tile);
+ if (use_coverage) {
+ coverage.init_path_trace();
+ }
+
+ float *render_buffer = (float *)tile.buffer;
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
+
+ /* Needed for Embree. */
+ SIMD_SET_FLUSH_TO_ZERO;
+
+ for (int sample = start_sample; sample < end_sample; sample++) {
+ if (task.get_cancel() || TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+
+ if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
+ tile.stealing_state = RenderTile::WAS_STOLEN;
+ break;
+ }
+
+ if (tile.task == RenderTile::PATH_TRACE) {
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ if (use_coverage) {
+ coverage.init_pixel(x, y);
+ }
+ kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
+ }
+ }
+ }
+ else {
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
+ }
+ }
+ }
+ tile.sample = sample + 1;
+
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
+ const bool stop = adaptive_sampling_filter(kg, tile, sample);
+ if (stop) {
+ const int num_progress_samples = end_sample - sample;
+ tile.sample = end_sample;
+ task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
+ break;
+ }
+ }
+
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+ if (use_coverage) {
+ coverage.finalize();
+ }
+
+ if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
+ adaptive_sampling_post(tile, kg);
+ }
+}
+
+void CPUDevice::thread_render(DeviceTask &task)
+{
+ if (TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ return;
+ }
+
+ /* allocate buffer for kernel globals */
+ CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory());
+
+ profiler.add_state(&kg.profiler);
+
+ /* NLM denoiser. */
+ DenoisingTask *denoising = NULL;
+
+ /* OpenImageDenoise: we can only denoise with one thread at a time, so to
+ * avoid waiting with mutex locks in the denoiser, we let only a single
+ * thread acquire denoising tiles. */
+ uint tile_types = task.tile_types;
+ bool hold_denoise_lock = false;
+ if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+ if (!oidn_task_lock.try_lock()) {
+ tile_types &= ~RenderTile::DENOISE;
+ hold_denoise_lock = true;
+ }
+ }
+
+ RenderTile tile;
+ while (task.acquire_tile(this, tile, tile_types)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ render(task, tile, &kg);
+ }
+ else if (tile.task == RenderTile::BAKE) {
+ render(task, tile, &kg);
+ }
+ else if (tile.task == RenderTile::DENOISE) {
+ denoise_openimagedenoise(task, tile);
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+
+ task.release_tile(tile);
+
+ if (TaskPool::canceled()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ if (hold_denoise_lock) {
+ oidn_task_lock.unlock();
+ }
+
+ profiler.remove_state(&kg.profiler);
+
+ delete denoising;
+}
+
+void CPUDevice::thread_denoise(DeviceTask &task)
+{
+ RenderTile tile;
+ tile.x = task.x;
+ tile.y = task.y;
+ tile.w = task.w;
+ tile.h = task.h;
+ tile.buffer = task.buffer;
+ tile.sample = task.sample + task.num_samples;
+ tile.num_samples = task.num_samples;
+ tile.start_sample = task.sample;
+ tile.offset = task.offset;
+ tile.stride = task.stride;
+ tile.buffers = task.buffers;
+
+ denoise_openimagedenoise(task, tile);
+
+ task.update_progress(&tile, tile.w * tile.h);
+}
+#endif
+
+const CPUKernels *CPUDevice::get_cpu_kernels() const
+{
+ return &kernels;
+}
+
+void CPUDevice::get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+{
+ /* Ensure latest texture info is loaded into kernel globals before returning. */
+ load_texture_info();
+
+ kernel_thread_globals.clear();
+ void *osl_memory = get_cpu_osl_memory();
+ for (int i = 0; i < info.cpu_threads; i++) {
+ kernel_thread_globals.emplace_back(kernel_globals, osl_memory, profiler);
+ }
+}
+
+void *CPUDevice::get_cpu_osl_memory()
+{
+#ifdef WITH_OSL
+ return &osl_globals;
+#else
+ return NULL;
+#endif
+}
+
+bool CPUDevice::load_kernels(const uint /*kernel_features*/)
+{
+ return true;
+}
+
+CCL_NAMESPACE_END
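
Note how mem_alloc above only heap-allocates for MEM_DEVICE_ONLY buffers; for everything else the device pointer simply aliases host memory, since the CPU "device" shares the host address space. A compressed sketch of that decision, with an illustrative helper name:

/* Sketch: on the CPU device, memory is usually just the host pointer. */
static void *resolve_device_pointer(device_memory &mem)
{
  if (mem.type == MEM_DEVICE_ONLY) {
    /* No host copy exists, so a dedicated aligned allocation is made. */
    return util_aligned_malloc(mem.memory_size(), MIN_ALIGNMENT_CPU_DATA_TYPES);
  }
  /* Host and "device" share the same address space. */
  return mem.host_pointer;
}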
diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h
new file mode 100644
index 00000000000..7d222808652
--- /dev/null
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* So ImathMath is included before our kernel_cpu_compat. */
+#ifdef WITH_OSL
+/* So no context pollution happens from indirectly included windows.h */
+# include "util/util_windows.h"
+# include <OSL/oslexec.h>
+#endif
+
+#ifdef WITH_EMBREE
+# include <embree3/rtcore.h>
+#endif
+
+#include "device/cpu/kernel.h"
+#include "device/device.h"
+#include "device/device_memory.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/kernel.h"
+#include "kernel/device/cpu/globals.h"
+
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+class CPUDevice : public Device {
+ public:
+ KernelGlobals kernel_globals;
+
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+#ifdef WITH_OSL
+ OSLGlobals osl_globals;
+#endif
+#ifdef WITH_EMBREE
+ RTCScene embree_scene = NULL;
+ RTCDevice embree_device;
+#endif
+
+ CPUKernels kernels;
+
+ CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
+ ~CPUDevice();
+
+ virtual bool show_samples() const override;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ /* Returns true if the texture info was copied to the device (meaning some
+ * re-initialization might be needed). */
+ bool load_texture_info();
+
+ virtual void mem_alloc(device_memory &mem) override;
+ virtual void mem_copy_to(device_memory &mem) override;
+ virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+ virtual void mem_zero(device_memory &mem) override;
+ virtual void mem_free(device_memory &mem) override;
+ virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+ void tex_free(device_texture &mem);
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+
+ virtual const CPUKernels *get_cpu_kernels() const override;
+ virtual void get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
+ virtual void *get_cpu_osl_memory() override;
+
+ protected:
+ virtual bool load_kernels(uint /*kernel_features*/) override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp
new file mode 100644
index 00000000000..91282390e27
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/kernel.h"
+
+#include "kernel/device/cpu/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+#define KERNEL_FUNCTIONS(name) \
+ KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
+ KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
+ KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
+
+#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
+
+CPUKernels::CPUKernels()
+ : /* Integrator. */
+ REGISTER_KERNEL(integrator_init_from_camera),
+ REGISTER_KERNEL(integrator_init_from_bake),
+ REGISTER_KERNEL(integrator_intersect_closest),
+ REGISTER_KERNEL(integrator_intersect_shadow),
+ REGISTER_KERNEL(integrator_intersect_subsurface),
+ REGISTER_KERNEL(integrator_intersect_volume_stack),
+ REGISTER_KERNEL(integrator_shade_background),
+ REGISTER_KERNEL(integrator_shade_light),
+ REGISTER_KERNEL(integrator_shade_shadow),
+ REGISTER_KERNEL(integrator_shade_surface),
+ REGISTER_KERNEL(integrator_shade_volume),
+ REGISTER_KERNEL(integrator_megakernel),
+ /* Shader evaluation. */
+ REGISTER_KERNEL(shader_eval_displace),
+ REGISTER_KERNEL(shader_eval_background),
+ /* Adaptive sampling. */
+ REGISTER_KERNEL(adaptive_sampling_convergence_check),
+ REGISTER_KERNEL(adaptive_sampling_filter_x),
+ REGISTER_KERNEL(adaptive_sampling_filter_y),
+ /* Cryptomatte. */
+ REGISTER_KERNEL(cryptomatte_postprocess),
+ /* Bake. */
+ REGISTER_KERNEL(bake)
+{
+}
+
+#undef REGISTER_KERNEL
+#undef KERNEL_FUNCTIONS
+
+CCL_NAMESPACE_END
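
Assuming KERNEL_NAME_EVAL(arch, name) concatenates its arguments into a kernel symbol of the form kernel_<arch>_<name>, a single entry such as REGISTER_KERNEL(bake) expands roughly to the initializer below; this is an orientation sketch, not compiler output:

/* Approximate expansion of REGISTER_KERNEL(bake): */
bake(kernel_cpu_bake,
     kernel_cpu_sse2_bake,
     kernel_cpu_sse3_bake,
     kernel_cpu_sse41_bake,
     kernel_cpu_avx_bake,
     kernel_cpu_avx2_bake)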
diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h
new file mode 100644
index 00000000000..54b18308544
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/cpu/kernel_function.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelGlobals;
+struct IntegratorStateCPU;
+struct TileInfo;
+
+class CPUKernels {
+ public:
+ /* Integrator. */
+
+ using IntegratorFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg, IntegratorStateCPU *state)>;
+ using IntegratorShadeFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer)>;
+ using IntegratorInitFunction = CPUKernelFunction<bool (*)(const KernelGlobals *kg,
+ IntegratorStateCPU *state,
+ KernelWorkTile *tile,
+ ccl_global float *render_buffer)>;
+
+ IntegratorInitFunction integrator_init_from_camera;
+ IntegratorInitFunction integrator_init_from_bake;
+ IntegratorFunction integrator_intersect_closest;
+ IntegratorFunction integrator_intersect_shadow;
+ IntegratorFunction integrator_intersect_subsurface;
+ IntegratorFunction integrator_intersect_volume_stack;
+ IntegratorShadeFunction integrator_shade_background;
+ IntegratorShadeFunction integrator_shade_light;
+ IntegratorShadeFunction integrator_shade_shadow;
+ IntegratorShadeFunction integrator_shade_surface;
+ IntegratorShadeFunction integrator_shade_volume;
+ IntegratorShadeFunction integrator_megakernel;
+
+ /* Shader evaluation. */
+
+ using ShaderEvalFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, const KernelShaderEvalInput *, float4 *, const int)>;
+
+ ShaderEvalFunction shader_eval_displace;
+ ShaderEvalFunction shader_eval_background;
+
+ /* Adaptive stopping. */
+
+ using AdaptiveSamplingConvergenceCheckFunction =
+ CPUKernelFunction<bool (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)>;
+
+ using AdaptiveSamplingFilterXFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)>;
+
+ using AdaptiveSamplingFilterYFunction =
+ CPUKernelFunction<void (*)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)>;
+
+ AdaptiveSamplingConvergenceCheckFunction adaptive_sampling_convergence_check;
+
+ AdaptiveSamplingFilterXFunction adaptive_sampling_filter_x;
+ AdaptiveSamplingFilterYFunction adaptive_sampling_filter_y;
+
+ /* Cryptomatte. */
+
+ using CryptomattePostprocessFunction = CPUKernelFunction<void (*)(
+ const KernelGlobals *kg, ccl_global float *render_buffer, int pixel_index)>;
+
+ CryptomattePostprocessFunction cryptomatte_postprocess;
+
+ /* Bake. */
+
+ CPUKernelFunction<void (*)(const KernelGlobals *, float *, int, int, int, int, int)> bake;
+
+ CPUKernels();
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel_function.h b/intern/cycles/device/cpu/kernel_function.h
new file mode 100644
index 00000000000..aa18720cc24
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_function.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_debug.h"
+#include "util/util_system.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* A wrapper around the per-microarchitecture variants of a kernel function.
+ *
+ * Provides a function-call-like API which gets routed to the most suitable implementation.
+ *
+ * For example, on a computer which only supports up to SSE4.1, the kernel_sse41 variant will
+ * be used. */
+template<typename FunctionType> class CPUKernelFunction {
+ public:
+ CPUKernelFunction(FunctionType kernel_default,
+ FunctionType kernel_sse2,
+ FunctionType kernel_sse3,
+ FunctionType kernel_sse41,
+ FunctionType kernel_avx,
+ FunctionType kernel_avx2)
+ {
+ kernel_info_ = get_best_kernel_info(
+ kernel_default, kernel_sse2, kernel_sse3, kernel_sse41, kernel_avx, kernel_avx2);
+ }
+
+ template<typename... Args> inline auto operator()(Args... args) const
+ {
+ assert(kernel_info_.kernel);
+
+ return kernel_info_.kernel(args...);
+ }
+
+ const char *get_uarch_name() const
+ {
+ return kernel_info_.uarch_name;
+ }
+
+ protected:
+ /* Helper class which allows passing a human-readable microarchitecture name together with the
+ * function pointer. */
+ class KernelInfo {
+ public:
+ KernelInfo() : KernelInfo("", nullptr)
+ {
+ }
+
+ /* TODO(sergey): Use string view, to have higher-level functionality (i.e. comparison) without
+ * memory allocation. */
+ KernelInfo(const char *uarch_name, FunctionType kernel)
+ : uarch_name(uarch_name), kernel(kernel)
+ {
+ }
+
+ const char *uarch_name;
+ FunctionType kernel;
+ };
+
+ KernelInfo get_best_kernel_info(FunctionType kernel_default,
+ FunctionType kernel_sse2,
+ FunctionType kernel_sse3,
+ FunctionType kernel_sse41,
+ FunctionType kernel_avx,
+ FunctionType kernel_avx2)
+ {
+ /* Silence warnings about unused variables when compiling without some architectures. */
+ (void)kernel_sse2;
+ (void)kernel_sse3;
+ (void)kernel_sse41;
+ (void)kernel_avx;
+ (void)kernel_avx2;
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
+ if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
+ return KernelInfo("AVX2", kernel_avx2);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
+ if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
+ return KernelInfo("AVX", kernel_avx);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
+ if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
+ return KernelInfo("SSE4.1", kernel_sse41);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
+ if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
+ return KernelInfo("SSE3", kernel_sse3);
+ }
+#endif
+
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
+ return KernelInfo("SSE2", kernel_sse2);
+ }
+#endif
+
+ return KernelInfo("default", kernel_default);
+ }
+
+ KernelInfo kernel_info_;
+};
+
+CCL_NAMESPACE_END
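
To make the dispatch concrete, here is a hedged usage sketch of CPUKernelFunction with trivial stand-in variants (all names hypothetical; real kernels are generated per architecture). The best variant is resolved once in the constructor, so every later call is a direct function-pointer invocation:

/* Hypothetical stand-in variants for illustration only. */
static int square_default(int x) { return x * x; }
static int square_sse2(int x) { return x * x; }
static int square_sse3(int x) { return x * x; }
static int square_sse41(int x) { return x * x; }
static int square_avx(int x) { return x * x; }
static int square_avx2(int x) { return x * x; }

using SquareKernel = CPUKernelFunction<int (*)(int)>;

static int test_dispatch()
{
  SquareKernel square(square_default, square_sse2, square_sse3,
                      square_sse41, square_avx, square_avx2);

  /* Routed to the most capable supported variant, e.g. AVX2 on modern CPUs. */
  return square(3);
}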
diff --git a/intern/cycles/device/cpu/kernel_thread_globals.cpp b/intern/cycles/device/cpu/kernel_thread_globals.cpp
new file mode 100644
index 00000000000..988b00cd1f0
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_thread_globals.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/cpu/kernel_thread_globals.h"
+
+// clang-format off
+#include "kernel/osl/osl_shader.h"
+#include "kernel/osl/osl_globals.h"
+// clang-format on
+
+#include "util/util_profiling.h"
+
+CCL_NAMESPACE_BEGIN
+
+CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobals &kernel_globals,
+ void *osl_globals_memory,
+ Profiler &cpu_profiler)
+ : KernelGlobals(kernel_globals), cpu_profiler_(cpu_profiler)
+{
+ reset_runtime_memory();
+
+#ifdef WITH_OSL
+ OSLShader::thread_init(this, reinterpret_cast<OSLGlobals *>(osl_globals_memory));
+#else
+ (void)osl_globals_memory;
+#endif
+}
+
+CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept
+ : KernelGlobals(std::move(other)), cpu_profiler_(other.cpu_profiler_)
+{
+ other.reset_runtime_memory();
+}
+
+CPUKernelThreadGlobals::~CPUKernelThreadGlobals()
+{
+#ifdef WITH_OSL
+ OSLShader::thread_free(this);
+#endif
+}
+
+CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals &&other)
+{
+ if (this == &other) {
+ return *this;
+ }
+
+ *static_cast<KernelGlobals *>(this) = *static_cast<KernelGlobals *>(&other);
+
+ other.reset_runtime_memory();
+
+ return *this;
+}
+
+void CPUKernelThreadGlobals::reset_runtime_memory()
+{
+#ifdef WITH_OSL
+ osl = nullptr;
+#endif
+}
+
+void CPUKernelThreadGlobals::start_profiling()
+{
+ cpu_profiler_.add_state(&profiler);
+}
+
+void CPUKernelThreadGlobals::stop_profiling()
+{
+ cpu_profiler_.remove_state(&profiler);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel_thread_globals.h b/intern/cycles/device/cpu/kernel_thread_globals.h
new file mode 100644
index 00000000000..d005c3bb56c
--- /dev/null
+++ b/intern/cycles/device/cpu/kernel_thread_globals.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Profiler;
+
+/* A special class which extends memory ownership of `KernelGlobals`, decoupling any resource
+ * which is not thread-safe to access. Every worker thread which needs to operate on
+ * `KernelGlobals` needs to initialize its own copy of this object.
+ *
+ * NOTE: Only a minimal subset of objects is copied: `KernelData` is never copied. This means
+ * that no unnecessary data duplication happens when using this object. */
+class CPUKernelThreadGlobals : public KernelGlobals {
+ public:
+ /* TODO(sergey): Would be nice to have a properly typed OSLGlobals even when building
+ * without OSL support. This would avoid the need for those unnamed pointers and casts. */
+ CPUKernelThreadGlobals(const KernelGlobals &kernel_globals,
+ void *osl_globals_memory,
+ Profiler &cpu_profiler);
+
+ ~CPUKernelThreadGlobals();
+
+ CPUKernelThreadGlobals(const CPUKernelThreadGlobals &other) = delete;
+ CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) noexcept;
+
+ CPUKernelThreadGlobals &operator=(const CPUKernelThreadGlobals &other) = delete;
+ CPUKernelThreadGlobals &operator=(CPUKernelThreadGlobals &&other);
+
+ void start_profiling();
+ void stop_profiling();
+
+ protected:
+ void reset_runtime_memory();
+
+ Profiler &cpu_profiler_;
+};
+
+CCL_NAMESPACE_END
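
The per-thread pattern this class enables can be seen in CPUDevice::get_cpu_kernel_thread_globals above; a condensed sketch of a worker loop using it, with illustrative scheduling details:

/* Sketch: each worker thread operates on its own globals copy. */
vector<CPUKernelThreadGlobals> thread_globals;
device->get_cpu_kernel_thread_globals(thread_globals);

parallel_for(0, num_threads, [&](const int thread_index) {
  CPUKernelThreadGlobals *kg = &thread_globals[thread_index];
  kg->start_profiling();
  /* ... run CPU kernels with kg as the KernelGlobals argument ... */
  kg->stop_profiling();
});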
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/cuda/device.cpp
index 2e225ecfaf8..84becd6d081 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/cuda/device.cpp
@@ -14,21 +14,25 @@
* limitations under the License.
*/
-#ifdef WITH_CUDA
+#include "device/cuda/device.h"
+
+#include "util/util_logging.h"
-# include "device/cuda/device_cuda.h"
+#ifdef WITH_CUDA
+# include "device/cuda/device_impl.h"
# include "device/device.h"
-# include "device/device_intern.h"
-# include "util/util_logging.h"
# include "util/util_string.h"
# include "util/util_windows.h"
+#endif /* WITH_CUDA */
CCL_NAMESPACE_BEGIN
bool device_cuda_init()
{
-# ifdef WITH_CUDA_DYNLOAD
+#if !defined(WITH_CUDA)
+ return false;
+#elif defined(WITH_CUDA_DYNLOAD)
static bool initialized = false;
static bool result = false;
@@ -59,16 +63,27 @@ bool device_cuda_init()
}
return result;
-# else /* WITH_CUDA_DYNLOAD */
+#else /* WITH_CUDA_DYNLOAD */
return true;
-# endif /* WITH_CUDA_DYNLOAD */
+#endif /* WITH_CUDA_DYNLOAD */
}
-Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
- return new CUDADevice(info, stats, profiler, background);
+#ifdef WITH_CUDA
+ return new CUDADevice(info, stats, profiler);
+#else
+ (void)info;
+ (void)stats;
+ (void)profiler;
+
+ LOG(FATAL) << "Request to create CUDA device without compiled-in support. Should never happen.";
+
+ return nullptr;
+#endif
}
+#ifdef WITH_CUDA
static CUresult device_cuda_safe_init()
{
# ifdef _WIN32
@@ -86,9 +101,11 @@ static CUresult device_cuda_safe_init()
return cuInit(0);
# endif
}
+#endif /* WITH_CUDA */
void device_cuda_info(vector<DeviceInfo> &devices)
{
+#ifdef WITH_CUDA
CUresult result = device_cuda_safe_init();
if (result != CUDA_SUCCESS) {
if (result != CUDA_ERROR_NO_DEVICE)
@@ -129,9 +146,9 @@ void device_cuda_info(vector<DeviceInfo> &devices)
info.has_half_images = (major >= 3);
info.has_nanovdb = true;
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.denoisers = DENOISER_NLM;
+ info.denoisers = 0;
+
+ info.has_gpu_queue = true;
/* Check if the device has P2P access to any other device in the system. */
for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
@@ -182,10 +199,14 @@ void device_cuda_info(vector<DeviceInfo> &devices)
if (!display_devices.empty())
devices.insert(devices.end(), display_devices.begin(), display_devices.end());
+#else /* WITH_CUDA */
+ (void)devices;
+#endif /* WITH_CUDA */
}
string device_cuda_capabilities()
{
+#ifdef WITH_CUDA
CUresult result = device_cuda_safe_init();
if (result != CUDA_SUCCESS) {
if (result != CUDA_ERROR_NO_DEVICE) {
@@ -310,8 +331,10 @@ string device_cuda_capabilities()
}
return capabilities;
+
+#else /* WITH_CUDA */
+ return "";
+#endif /* WITH_CUDA */
}
CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl b/intern/cycles/device/cuda/device.h
index e68d4104a91..b0484904d1a 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_enqueue_inactive.cl
+++ b/intern/cycles/device/cuda/device.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,13 +14,24 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_enqueue_inactive.h"
+#pragma once
-#define KERNEL_NAME enqueue_inactive
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
+#include "util/util_string.h"
+#include "util/util_vector.h"
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_cuda_init();
+
+Device *device_cuda_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_cuda_info(vector<DeviceInfo> &devices);
+
+string device_cuda_capabilities();
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
deleted file mode 100644
index c3271c3cfcf..00000000000
--- a/intern/cycles/device/cuda/device_cuda.h
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_CUDA
-
-# include "device/device.h"
-# include "device/device_denoising.h"
-# include "device/device_split_kernel.h"
-
-# include "util/util_map.h"
-# include "util/util_task.h"
-
-# ifdef WITH_CUDA_DYNLOAD
-# include "cuew.h"
-# else
-# include "util/util_opengl.h"
-# include <cuda.h>
-# include <cudaGL.h>
-# endif
-
-CCL_NAMESPACE_BEGIN
-
-class CUDASplitKernel;
-
-class CUDADevice : public Device {
-
- friend class CUDASplitKernelFunction;
- friend class CUDASplitKernel;
- friend class CUDAContextScope;
-
- public:
- DedicatedTaskPool task_pool;
- CUdevice cuDevice;
- CUcontext cuContext;
- CUmodule cuModule, cuFilterModule;
- size_t device_texture_headroom;
- size_t device_working_headroom;
- bool move_texture_to_host;
- size_t map_host_used;
- size_t map_host_limit;
- int can_map_host;
- int pitch_alignment;
- int cuDevId;
- int cuDevArchitecture;
- bool first_error;
- CUDASplitKernel *split_kernel;
-
- struct CUDAMem {
- CUDAMem() : texobject(0), array(0), use_mapped_host(false)
- {
- }
-
- CUtexObject texobject;
- CUarray array;
-
- /* If true, a mapped host memory in shared_pointer is being used. */
- bool use_mapped_host;
- };
- typedef map<device_memory *, CUDAMem> CUDAMemMap;
- CUDAMemMap cuda_mem_map;
- thread_mutex cuda_mem_map_mutex;
-
- struct PixelMem {
- GLuint cuPBO;
- CUgraphicsResource cuPBOresource;
- GLuint cuTexId;
- int w, h;
- };
- map<device_ptr, PixelMem> pixel_mem_map;
-
- /* Bindless Textures */
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
-
- /* Kernels */
- struct {
- bool loaded;
-
- CUfunction adaptive_stopping;
- CUfunction adaptive_filter_x;
- CUfunction adaptive_filter_y;
- CUfunction adaptive_scale_samples;
- int adaptive_num_threads_per_block;
- } functions;
-
- static bool have_precompiled_kernels();
-
- virtual bool show_samples() const override;
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override;
-
- void set_error(const string &error) override;
-
- CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_);
-
- virtual ~CUDADevice();
-
- bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
-
- bool check_peer_access(Device *peer_device) override;
-
- bool use_adaptive_compilation();
-
- bool use_split_kernel();
-
- virtual string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures &requested_features, bool filter = false, bool split = false);
-
- string compile_kernel(const DeviceRequestedFeatures &requested_features,
- const char *name,
- const char *base = "cuda",
- bool force_ptx = false);
-
- virtual bool load_kernels(const DeviceRequestedFeatures &requested_features) override;
-
- void load_functions();
-
- void reserve_local_memory(const DeviceRequestedFeatures &requested_features);
-
- void init_host_memory();
-
- void load_texture_info();
-
- void move_textures_to_host(size_t size, bool for_texture);
-
- CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
-
- void generic_copy_to(device_memory &mem);
-
- void generic_free(device_memory &mem);
-
- void mem_alloc(device_memory &mem) override;
-
- void mem_copy_to(device_memory &mem) override;
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
-
- void mem_zero(device_memory &mem) override;
-
- void mem_free(device_memory &mem) override;
-
- device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
-
- virtual void const_copy_to(const char *name, void *host, size_t size) override;
-
- void global_alloc(device_memory &mem);
-
- void global_free(device_memory &mem);
-
- void tex_alloc(device_texture &mem);
-
- void tex_free(device_texture &mem);
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task);
-
- bool denoising_construct_transform(DenoisingTask *task);
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task);
-
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
-
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task);
-
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task);
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task);
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task);
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
-
- void denoise(RenderTile &rtile, DenoisingTask &denoising);
-
- void adaptive_sampling_filter(uint filter_sample,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream = 0);
- void adaptive_sampling_post(RenderTile &rtile,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream = 0);
-
- void render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles);
-
- void film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half);
-
- void shader(DeviceTask &task);
-
- CUdeviceptr map_pixels(device_ptr mem);
-
- void unmap_pixels(device_ptr mem);
-
- void pixels_alloc(device_memory &mem);
-
- void pixels_copy_from(device_memory &mem, int y, int w, int h);
-
- void pixels_free(device_memory &mem);
-
- void draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params) override;
-
- void thread_run(DeviceTask &task);
-
- virtual void task_add(DeviceTask &task) override;
-
- virtual void task_wait() override;
-
- virtual void task_cancel() override;
-};
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
deleted file mode 100644
index 2d2fcb38705..00000000000
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ /dev/null
@@ -1,2714 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_CUDA
-
-# include <climits>
-# include <limits.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <string.h>
-
-# include "device/cuda/device_cuda.h"
-# include "device/device_intern.h"
-# include "device/device_split_kernel.h"
-
-# include "render/buffers.h"
-
-# include "kernel/filter/filter_defines.h"
-
-# include "util/util_debug.h"
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_map.h"
-# include "util/util_md5.h"
-# include "util/util_opengl.h"
-# include "util/util_path.h"
-# include "util/util_string.h"
-# include "util/util_system.h"
-# include "util/util_time.h"
-# include "util/util_types.h"
-# include "util/util_windows.h"
-
-# include "kernel/split/kernel_split_data_types.h"
-
-CCL_NAMESPACE_BEGIN
-
-# ifndef WITH_CUDA_DYNLOAD
-
-/* Transparently implement some functions, so the majority of the file does not
- * need to worry about the difference between dynamically loaded and linked CUDA
- * at all.
- */
-
-namespace {
-
-const char *cuewErrorString(CUresult result)
-{
- /* We can only give the error code here without major code duplication. That
- * should be enough, since dynamic loading is only disabled by folks who
- * know what they're doing anyway.
- *
- * NOTE: Avoid calling from several threads.
- */
- static string error;
- error = string_printf("%d", result);
- return error.c_str();
-}
-
-const char *cuewCompilerPath()
-{
- return CYCLES_CUDA_NVCC_EXECUTABLE;
-}
-
-int cuewCompilerVersion()
-{
- return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
-}
-
-} /* namespace */
-# endif /* WITH_CUDA_DYNLOAD */
-
-class CUDADevice;
-
-class CUDASplitKernel : public DeviceSplitKernel {
- CUDADevice *device;
-
- public:
- explicit CUDASplitKernel(CUDADevice *device);
-
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs);
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
-};
-
-/* Utility to push/pop CUDA context. */
-class CUDAContextScope {
- public:
- CUDAContextScope(CUDADevice *device);
- ~CUDAContextScope();
-
- private:
- CUDADevice *device;
-};
-
-bool CUDADevice::have_precompiled_kernels()
-{
- string cubins_path = path_get("lib");
- return path_exists(cubins_path);
-}
-
-bool CUDADevice::show_samples() const
-{
- /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
- return true;
-}
-
-BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
-{
- return BVH_LAYOUT_BVH2;
-}
-
-void CUDADevice::set_error(const string &error)
-{
- Device::set_error(error);
-
- if (first_error) {
- fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
- fprintf(stderr,
- "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
- first_error = false;
- }
-}
-
-# define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
- if (result != CUDA_SUCCESS) { \
- const char *name = cuewErrorString(result); \
- set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
- } \
- } \
- (void)0
-
-CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL)
-{
- first_error = true;
- background = background_;
-
- cuDevId = info.num;
- cuDevice = 0;
- cuContext = 0;
-
- cuModule = 0;
- cuFilterModule = 0;
-
- split_kernel = NULL;
-
- need_texture_info = false;
-
- device_texture_headroom = 0;
- device_working_headroom = 0;
- move_texture_to_host = false;
- map_host_limit = 0;
- map_host_used = 0;
- can_map_host = 0;
- pitch_alignment = 0;
-
- functions.loaded = false;
-
- /* Initialize CUDA. */
- CUresult result = cuInit(0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
- return;
- }
-
- /* Setup device and context. */
- result = cuDeviceGet(&cuDevice, cuDevId);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
- cuewErrorString(result)));
- return;
- }
-
- /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
- * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
- * so we can predict which memory to map to host. */
- cuda_assert(
- cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
-
- cuda_assert(cuDeviceGetAttribute(
- &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
-
- unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
- if (can_map_host) {
- ctx_flags |= CU_CTX_MAP_HOST;
- init_host_memory();
- }
-
- /* Create context. */
- if (background) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- }
- else {
- result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
-
- if (result != CUDA_SUCCESS) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- background = true;
- }
- }
-
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
- return;
- }
-
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
- cuDevArchitecture = major * 100 + minor * 10;
-
- /* Pop context set by cuCtxCreate. */
- cuCtxPopCurrent(NULL);
-}
-
-CUDADevice::~CUDADevice()
-{
- task_pool.cancel();
-
- delete split_kernel;
-
- texture_info.free();
-
- cuda_assert(cuCtxDestroy(cuContext));
-}
-
-bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/)
-{
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* We only support sm_30 and above */
- if (major < 3) {
- set_error(string_printf(
- "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
- return false;
- }
-
- return true;
-}
-
-bool CUDADevice::check_peer_access(Device *peer_device)
-{
- if (peer_device == this) {
- return false;
- }
- if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
- return false;
- }
-
- CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
-
- int can_access = 0;
- cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
- if (can_access == 0) {
- return false;
- }
-
- // Ensure array access over the link is possible as well (for 3D textures)
- cuda_assert(cuDeviceGetP2PAttribute(&can_access,
- CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
- cuDevice,
- peer_device_cuda->cuDevice));
- if (can_access == 0) {
- return false;
- }
-
- // Enable peer access in both directions
- {
- const CUDAContextScope scope(this);
- CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
- cuewErrorString(result)));
- return false;
- }
- }
- {
- const CUDAContextScope scope(peer_device_cuda);
- CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
- if (result != CUDA_SUCCESS) {
- set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
- cuewErrorString(result)));
- return false;
- }
- }
-
- return true;
-}
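
Peer access has to be enabled from inside each context, which is why check_peer_access() above takes two CUDAContextScope blocks. A reduced driver-API sketch of the same bidirectional handshake (a hypothetical standalone program; it assumes the CUDA toolkit and two CUDA devices, with error checking and cleanup elided for brevity):

#include <cuda.h>
#include <cstdio>

int main()
{
  cuInit(0);

  CUdevice dev_a, dev_b;
  cuDeviceGet(&dev_a, 0);
  cuDeviceGet(&dev_b, 1);

  int can_access = 0;
  cuDeviceCanAccessPeer(&can_access, dev_a, dev_b);
  if (!can_access) {
    std::puts("no peer access between devices 0 and 1");
    return 1;
  }

  CUcontext ctx_a, ctx_b;
  cuCtxCreate(&ctx_a, 0, dev_a);
  cuCtxCreate(&ctx_b, 0, dev_b); /* ctx_b is now the current context */

  cuCtxEnablePeerAccess(ctx_a, 0); /* allow ctx_b to access ctx_a's memory */
  cuCtxPopCurrent(NULL);           /* ctx_a becomes current */
  cuCtxEnablePeerAccess(ctx_b, 0); /* allow ctx_a to access ctx_b's memory */

  std::puts("peer access enabled in both directions");
  return 0;
}
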
-
-bool CUDADevice::use_adaptive_compilation()
-{
- return DebugFlags().cuda.adaptive_compile;
-}
-
-bool CUDADevice::use_split_kernel()
-{
- return DebugFlags().cuda.split_kernel;
-}
-
-/* Common NVCC flags which stay the same regardless of the shading model and
- * kernel sources MD5, and depend only on the compiler or compilation settings.
- */
-string CUDADevice::compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures &requested_features, bool filter, bool split)
-{
- const int machine = system_cpu_bits();
- const string source_path = path_get("source");
- const string include_path = source_path;
- string cflags = string_printf(
- "-m%d "
- "--ptxas-options=\"-v\" "
- "--use_fast_math "
- "-DNVCC "
- "-I\"%s\"",
- machine,
- include_path.c_str());
- if (!filter && use_adaptive_compilation()) {
- cflags += " " + requested_features.get_build_options();
- }
- const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
- if (extra_cflags) {
- cflags += string(" ") + string(extra_cflags);
- }
-
- if (split) {
- cflags += " -D__SPLIT__";
- }
-
-# ifdef WITH_NANOVDB
- cflags += " -DWITH_NANOVDB";
-# endif
-
- return cflags;
-}
-
-string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features,
- const char *name,
- const char *base,
- bool force_ptx)
-{
- /* Compute kernel name. */
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* Attempt to use kernel provided with Blender. */
- if (!use_adaptive_compilation()) {
- if (!force_ptx) {
- const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
- VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using precompiled kernel.";
- return cubin;
- }
- }
-
- /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
- int ptx_major = major, ptx_minor = minor;
- while (ptx_major >= 3) {
- const string ptx = path_get(
- string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
- VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
- if (path_exists(ptx)) {
- VLOG(1) << "Using precompiled kernel.";
- return ptx;
- }
-
- if (ptx_minor > 0) {
- ptx_minor--;
- }
- else {
- ptx_major--;
- ptx_minor = 9;
- }
- }
- }
-
- /* Try to use locally compiled kernel. */
- string source_path = path_get("source");
- const string source_md5 = path_files_md5_hash(source_path);
-
- /* We include the cflags in the MD5 hash, so that changing the CUDA toolkit
- * or other compiler command line arguments makes sure the cubin gets re-built.
- */
- string common_cflags = compile_kernel_get_common_cflags(
- requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL);
- const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
-
- const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
- const char *const kernel_arch = force_ptx ? "compute" : "sm";
- const string cubin_file = string_printf(
- "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
- const string cubin = path_cache_get(path_join("kernels", cubin_file));
- VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
- if (path_exists(cubin)) {
- VLOG(1) << "Using locally compiled kernel.";
- return cubin;
- }
-
-# ifdef _WIN32
- if (!use_adaptive_compilation() && have_precompiled_kernels()) {
- if (major < 3) {
- set_error(
- string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
- "Your GPU is not supported.",
- major,
- minor));
- }
- else {
- set_error(
- string_printf("CUDA binary kernel for this graphics card compute "
- "capability (%d.%d) not found.",
- major,
- minor));
- }
- return string();
- }
-# endif
-
- /* Compile. */
- const char *const nvcc = cuewCompilerPath();
- if (nvcc == NULL) {
- set_error(
- "CUDA nvcc compiler not found. "
- "Install CUDA toolkit in default location.");
- return string();
- }
-
- const int nvcc_cuda_version = cuewCompilerVersion();
- VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
- if (nvcc_cuda_version < 101) {
- printf(
- "Unsupported CUDA version %d.%d detected, "
- "you need CUDA 10.1 or newer.\n",
- nvcc_cuda_version / 10,
- nvcc_cuda_version % 10);
- return string();
- }
- else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 ||
- nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) {
- printf(
- "CUDA version %d.%d detected, build may succeed but only "
- "CUDA 10.1 to 11.4 are officially supported.\n",
- nvcc_cuda_version / 10,
- nvcc_cuda_version % 10);
- }
-
- double starttime = time_dt();
-
- path_create_directories(cubin);
-
- source_path = path_join(path_join(source_path, "kernel"),
- path_join("kernels", path_join(base, string_printf("%s.cu", name))));
-
- string command = string_printf(
- "\"%s\" "
- "-arch=%s_%d%d "
- "--%s \"%s\" "
- "-o \"%s\" "
- "%s",
- nvcc,
- kernel_arch,
- major,
- minor,
- kernel_ext,
- source_path.c_str(),
- cubin.c_str(),
- common_cflags.c_str());
-
- printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
-
-# ifdef _WIN32
- command = "call " + command;
-# endif
- if (system(command.c_str()) != 0) {
- set_error(
- "Failed to execute compilation command, "
- "see console for details.");
- return string();
- }
-
- /* Verify if compilation succeeded */
- if (!path_exists(cubin)) {
- set_error(
- "CUDA kernel compilation failed, "
- "see console for details.");
- return string();
- }
-
- printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
-
- return cubin;
-}
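
The PTX fallback in compile_kernel() above searches compute capabilities downward (8.6, 8.5, ..., 8.0, 7.9, ...) until it finds a shipped .ptx file, relying on the driver to JIT-compile older PTX for newer GPUs. A standalone sketch of that search order, with path_exists() stubbed out as a hypothetical placeholder:

#include <cstdio>
#include <string>

/* Stand-in for Cycles' path_exists(); always fails here so the full search
 * order gets printed. */
static bool path_exists(const std::string &)
{
  return false;
}

int main()
{
  /* E.g. an sm_86 device looking for the closest shipped PTX. */
  int ptx_major = 8, ptx_minor = 6;
  while (ptx_major >= 3) {
    char name[64];
    std::snprintf(name, sizeof(name), "lib/kernel_compute_%d%d.ptx", ptx_major, ptx_minor);
    std::puts(name);
    if (path_exists(name)) {
      return 0; /* found; the driver JIT-compiles it for the newer GPU */
    }
    if (ptx_minor > 0) {
      ptx_minor--;
    }
    else {
      ptx_major--;
      ptx_minor = 9;
    }
  }
  return 1;
}
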
-
-bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
- /* TODO(sergey): Support kernel re-loading for CUDA devices.
- *
- * Currently re-loading a kernel will invalidate memory pointers,
- * causing problems in cuCtxSynchronize.
- */
- if (cuFilterModule && cuModule) {
- VLOG(1) << "Skipping kernel reload, not currently supported.";
- return true;
- }
-
- /* Check if CUDA initialization succeeded. */
- if (cuContext == 0)
- return false;
-
- /* check if GPU is supported */
- if (!support_device(requested_features))
- return false;
-
- /* get kernel */
- const char *kernel_name = use_split_kernel() ? "kernel_split" : "kernel";
- string cubin = compile_kernel(requested_features, kernel_name);
- if (cubin.empty())
- return false;
-
- const char *filter_name = "filter";
- string filter_cubin = compile_kernel(requested_features, filter_name);
- if (filter_cubin.empty())
- return false;
-
- /* open module */
- CUDAContextScope scope(this);
-
- string cubin_data;
- CUresult result;
-
- if (path_read_text(cubin, cubin_data))
- result = cuModuleLoadData(&cuModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (result != CUDA_SUCCESS)
- set_error(string_printf(
- "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
-
- if (path_read_text(filter_cubin, cubin_data))
- result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if (result != CUDA_SUCCESS)
- set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)",
- filter_cubin.c_str(),
- cuewErrorString(result)));
-
- if (result == CUDA_SUCCESS) {
- reserve_local_memory(requested_features);
- }
-
- load_functions();
-
- return (result == CUDA_SUCCESS);
-}
-
-void CUDADevice::load_functions()
-{
- /* TODO: load all functions here. */
- if (functions.loaded) {
- return;
- }
- functions.loaded = true;
-
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y"));
- cuda_assert(cuModuleGetFunction(
- &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples"));
-
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1));
-
- int unused_min_blocks;
- cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks,
- &functions.adaptive_num_threads_per_block,
- functions.adaptive_scale_samples,
- NULL,
- 0,
- 0));
-}
-
-void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features)
-{
- if (use_split_kernel()) {
- /* The split kernel mostly uses global memory and adaptive compilation,
- * making it difficult to predict how much is currently needed. */
- return;
- }
-
- /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
- * needed for kernel launches, so that we can reliably figure out when
- * to allocate scene data in mapped host memory. */
- CUDAContextScope scope(this);
-
- size_t total = 0, free_before = 0, free_after = 0;
- cuMemGetInfo(&free_before, &total);
-
- /* Get kernel function. */
- CUfunction cuRender;
-
- if (requested_features.use_baking) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
- }
- else if (requested_features.use_integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
- int min_blocks, num_threads_per_block;
- cuda_assert(
- cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
-
- /* Launch the kernel; using just 1 block appears sufficient to reserve
- * memory for all multiprocessors. It would still be good to do this in
- * parallel for the multi-GPU case, to make it faster. */
- CUdeviceptr d_work_tiles = 0;
- uint total_work_size = 0;
-
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- cuda_assert(cuCtxSynchronize());
-
- cuMemGetInfo(&free_after, &total);
- VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
- << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
-
-# if 0
- /* For testing mapped host memory, fill up device memory. */
- const size_t keep_mb = 1024;
-
- while (free_after > keep_mb * 1024 * 1024LL) {
- CUdeviceptr tmp;
- cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
- cuMemGetInfo(&free_after, &total);
- }
-# endif
-}
-
-void CUDADevice::init_host_memory()
-{
- /* Limit the amount of host-mapped memory, because allocating too much can
- * cause system instability. Leave at least half of the system memory, or
- * 4 GB, free, whichever is smaller. */
- size_t default_limit = 4 * 1024 * 1024 * 1024LL;
- size_t system_ram = system_physical_ram();
-
- if (system_ram > 0) {
- if (system_ram / 2 > default_limit) {
- map_host_limit = system_ram - default_limit;
- }
- else {
- map_host_limit = system_ram / 2;
- }
- }
- else {
- VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
- map_host_limit = 0;
- }
-
- /* Amount of device memory to keep free after texture memory
- * and working memory allocations, respectively. We set the working
- * memory headroom lower, so that some space is left after all
- * texture memory allocations. */
- device_working_headroom = 32 * 1024 * 1024LL; // 32MB
- device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
-
- VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
- << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-}
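
The branch above boils down to keeping min(system_ram / 2, 4 GB) of RAM free. A small self-contained check of the resulting limits for a few hypothetical RAM sizes:

#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main()
{
  const uint64_t GiB = 1024ull * 1024 * 1024;
  const uint64_t default_limit = 4 * GiB;

  for (uint64_t system_ram : {6 * GiB, 16 * GiB, 64 * GiB}) {
    /* Same branch as init_host_memory(): keep min(RAM / 2, 4 GiB) free. */
    const uint64_t map_host_limit = (system_ram / 2 > default_limit) ?
                                        system_ram - default_limit :
                                        system_ram / 2;
    std::printf("%2llu GiB RAM -> %2llu GiB mapped-host limit\n",
                (unsigned long long)(system_ram / GiB),
                (unsigned long long)(map_host_limit / GiB));
  }
  return 0;
}
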
-
-void CUDADevice::load_texture_info()
-{
- if (need_texture_info) {
- /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
- * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
- need_texture_info = false;
- texture_info.copy_to_device();
- }
-}
-
-void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
-{
- /* Break out of recursive call, which can happen when moving memory on a multi device. */
- static bool any_device_moving_textures_to_host = false;
- if (any_device_moving_textures_to_host) {
- return;
- }
-
- /* Signal to reallocate textures in host memory only. */
- move_texture_to_host = true;
-
- while (size > 0) {
- /* Find suitable memory allocation to move. */
- device_memory *max_mem = NULL;
- size_t max_size = 0;
- bool max_is_image = false;
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
- device_memory &mem = *pair.first;
- CUDAMem *cmem = &pair.second;
-
- /* Can only move textures allocated on this device (and not those from peer
- * devices), and need to ignore memory that is already on the host. */
- if (!mem.is_resident(this) || cmem->use_mapped_host) {
- continue;
- }
-
- bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
- (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- /* Can't move this type of memory. */
- if (!is_texture || cmem->array) {
- continue;
- }
-
- /* For other textures, only move image textures. */
- if (for_texture && !is_image) {
- continue;
- }
-
- /* Try to move largest allocation, prefer moving images. */
- if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
- max_is_image = is_image;
- max_size = mem.device_size;
- max_mem = &mem;
- }
- }
- lock.unlock();
-
- /* Move to host memory. This part is mutex protected since
- * multiple CUDA devices could be moving the memory. The
- * first one will do it, and the rest will adopt the pointer. */
- if (max_mem) {
- VLOG(1) << "Move memory from device to host: " << max_mem->name;
-
- static thread_mutex move_mutex;
- thread_scoped_lock lock(move_mutex);
-
- any_device_moving_textures_to_host = true;
-
- /* Potentially need to call back into the multi device, so the pointer mapping
- * and peer devices are updated. This is also necessary since the device
- * pointer may just be a key here, so it cannot be accessed and freed directly.
- * Unfortunately it does mean that memory is reallocated on all other
- * devices as well, which is potentially dangerous when still in use (since
- * a thread rendering on another device would only be caught in this mutex
- * if it happens to do an allocation at the same time as well). */
- max_mem->device_copy_to();
- size = (max_size >= size) ? 0 : size - max_size;
-
- any_device_moving_textures_to_host = false;
- }
- else {
- break;
- }
- }
-
- /* Unset flag before texture info is reloaded, since it should stay in device memory. */
- move_texture_to_host = false;
-
- /* Update texture info array with new pointers. */
- load_texture_info();
-}
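
The inner loop above implements a simple priority rule: image textures beat 1D textures, with ties broken by allocation size. A reduced sketch of just that rule over hypothetical Alloc records:

#include <cstddef>
#include <vector>

struct Alloc {
  bool is_image;
  std::size_t size;
};

/* Same ordering as the selection loop in move_textures_to_host():
 * prefer images over 1D textures, and among equals pick the largest. */
static const Alloc *pick_next_to_move(const std::vector<Alloc> &allocs)
{
  const Alloc *max_mem = nullptr;
  bool max_is_image = false;
  std::size_t max_size = 0;
  for (const Alloc &a : allocs) {
    if (a.is_image > max_is_image || (a.is_image == max_is_image && a.size > max_size)) {
      max_is_image = a.is_image;
      max_size = a.size;
      max_mem = &a;
    }
  }
  return max_mem;
}

int main()
{
  std::vector<Alloc> allocs = {{false, 512}, {true, 64}, {true, 256}};
  const Alloc *next = pick_next_to_move(allocs);
  /* The 256-byte image wins over the larger 512-byte non-image. */
  return (next && next->is_image && next->size == 256) ? 0 : 1;
}
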
-
-CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
-{
- CUDAContextScope scope(this);
-
- CUdeviceptr device_pointer = 0;
- size_t size = mem.memory_size() + pitch_padding;
-
- CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
- const char *status = "";
-
- /* First try allocating in device memory, respecting headroom. We make
- * an exception for texture info. It is small and frequently accessed,
- * so treat it as working memory.
- *
- * If there is not enough room for working memory, we will try to move
- * textures to host memory, assuming the performance impact would have
- * been worse for working memory. */
- bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
- size_t total = 0, free = 0;
- cuMemGetInfo(&free, &total);
-
- /* Move textures to host memory if needed. */
- if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
- move_textures_to_host(size + headroom - free, is_texture);
- cuMemGetInfo(&free, &total);
- }
-
- /* Allocate in device memory. */
- if (!move_texture_to_host && (size + headroom) < free) {
- mem_alloc_result = cuMemAlloc(&device_pointer, size);
- if (mem_alloc_result == CUDA_SUCCESS) {
- status = " in device memory";
- }
- }
-
- /* Fall back to mapped host memory if needed and possible. */
-
- void *shared_pointer = 0;
-
- if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
- if (mem.shared_pointer) {
- /* Another device already allocated host memory. */
- mem_alloc_result = CUDA_SUCCESS;
- shared_pointer = mem.shared_pointer;
- }
- else if (map_host_used + size < map_host_limit) {
- /* Allocate host memory ourselves. */
- mem_alloc_result = cuMemHostAlloc(
- &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
- assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
- (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
- }
-
- if (mem_alloc_result == CUDA_SUCCESS) {
- cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
- map_host_used += size;
- status = " in host memory";
- }
- }
-
- if (mem_alloc_result != CUDA_SUCCESS) {
- if (mem.type == MEM_DEVICE_ONLY) {
- status = " failed, out of device memory";
- set_error("System is out of GPU memory");
- }
- else {
- status = " failed, out of device and host memory";
- set_error("System is out of GPU and shared host memory");
- }
- }
-
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")" << status;
- }
-
- mem.device_pointer = (device_ptr)device_pointer;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- if (!mem.device_pointer) {
- return NULL;
- }
-
- /* Insert into map of allocations. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- CUDAMem *cmem = &cuda_mem_map[&mem];
- if (shared_pointer != 0) {
- /* Replace host pointer with our host allocation. Only works if
- * CUDA memory layout is the same and has no pitch padding. Also
- * does not work if we move textures to host during a render,
- * since other devices might be using the memory. */
-
- if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
- mem.host_pointer != shared_pointer) {
- memcpy(shared_pointer, mem.host_pointer, size);
-
- /* A call to device_memory::host_free() should be preceded by
- * a call to device_memory::device_free(), for host memory
- * allocated by a device to be handled properly. Two exceptions
- * are here and a call in OptiXDevice::generic_alloc(), where
- * the current host memory can be assumed to be allocated by
- * device_memory::host_alloc(), not by a device. */
-
- mem.host_free();
- mem.host_pointer = shared_pointer;
- }
- mem.shared_pointer = shared_pointer;
- mem.shared_counter++;
- cmem->use_mapped_host = true;
- }
- else {
- cmem->use_mapped_host = false;
- }
-
- return cmem;
-}
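
Stripped of the texture-eviction and pointer-sharing details, generic_alloc() above is a three-step placement cascade. A condensed, self-contained sketch of just that ordering (all names hypothetical; the real function also retries after moving textures to the host):

#include <cstddef>

enum class Placement { Device, MappedHost, Failed };

/* Condensed decision order of generic_alloc(): device memory while the
 * allocation fits under the headroom, then pinned host memory while under
 * the mapped-host limit, otherwise fail. */
static Placement choose_placement(std::size_t size,
                                  std::size_t free_device,
                                  std::size_t headroom,
                                  bool can_map_host,
                                  std::size_t map_host_used,
                                  std::size_t map_host_limit)
{
  if (size + headroom < free_device) {
    return Placement::Device;
  }
  if (can_map_host && map_host_used + size < map_host_limit) {
    return Placement::MappedHost;
  }
  return Placement::Failed;
}

int main()
{
  const std::size_t MiB = 1024 * 1024;
  /* A 600 MiB request with 512 MiB free and 32 MiB headroom spills to host. */
  const Placement p = choose_placement(600 * MiB, 512 * MiB, 32 * MiB, true, 0, 2048 * MiB);
  return p == Placement::MappedHost ? 0 : 1;
}
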
-
-void CUDADevice::generic_copy_to(device_memory &mem)
-{
- if (!mem.host_pointer || !mem.device_pointer) {
- return;
- }
-
- /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
- * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
- * mem.host_pointer. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(
- cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
- }
-}
-
-void CUDADevice::generic_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- thread_scoped_lock lock(cuda_mem_map_mutex);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- /* If cmem.use_mapped_host is true, reference counting is used
- * to safely free the mapped host memory. */
-
- if (cmem.use_mapped_host) {
- assert(mem.shared_pointer);
- if (mem.shared_pointer) {
- assert(mem.shared_counter > 0);
- if (--mem.shared_counter == 0) {
- if (mem.host_pointer == mem.shared_pointer) {
- mem.host_pointer = 0;
- }
- cuMemFreeHost(mem.shared_pointer);
- mem.shared_pointer = 0;
- }
- }
- map_host_used -= mem.device_size;
- }
- else {
- /* Free device memory. */
- cuda_assert(cuMemFree(mem.device_pointer));
- }
-
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
-}
-
-void CUDADevice::mem_alloc(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else if (mem.type == MEM_GLOBAL) {
- assert(!"mem_alloc not supported for global memory.");
- }
- else {
- generic_alloc(mem);
- }
-}
-
-void CUDADevice::mem_copy_to(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else {
- if (!mem.device_pointer) {
- generic_alloc(mem);
- }
- generic_copy_to(mem);
- }
-}
-
-void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_copy_from(mem, y, w, h);
- }
- else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
- assert(!"mem_copy_from not supported for textures.");
- }
- else if (mem.host_pointer) {
- const size_t size = elem * w * h;
- const size_t offset = elem * y * w;
-
- if (mem.device_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(cuMemcpyDtoH(
- (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
- }
- else {
- memset((char *)mem.host_pointer + offset, 0, size);
- }
- }
-}
-
-void CUDADevice::mem_zero(device_memory &mem)
-{
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
- if (!mem.device_pointer) {
- return;
- }
-
- /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
- * regardless of mem.host_pointer and mem.shared_pointer. */
- thread_scoped_lock lock(cuda_mem_map_mutex);
- if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
- const CUDAContextScope scope(this);
- cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
- }
- else if (mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
-}
-
-void CUDADevice::mem_free(device_memory &mem)
-{
- if (mem.type == MEM_PIXELS && !background) {
- pixels_free(mem);
- }
- else if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else {
- generic_free(mem);
- }
-}
-
-device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
-{
- return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
-}
-
-void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
-{
- CUDAContextScope scope(this);
- CUdeviceptr mem;
- size_t bytes;
-
- cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
- // assert(bytes == size);
- cuda_assert(cuMemcpyHtoD(mem, host, size));
-}
-
-void CUDADevice::global_alloc(device_memory &mem)
-{
- if (mem.is_resident(this)) {
- generic_alloc(mem);
- generic_copy_to(mem);
- }
-
- const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
-}
-
-void CUDADevice::global_free(device_memory &mem)
-{
- if (mem.is_resident(this) && mem.device_pointer) {
- generic_free(mem);
- }
-}
-
-void CUDADevice::tex_alloc(device_texture &mem)
-{
- CUDAContextScope scope(this);
-
- /* General variables for both architectures */
- string bind_name = mem.name;
- size_t dsize = datatype_size(mem.data_type);
- size_t size = mem.memory_size();
-
- CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
- switch (mem.info.extension) {
- case EXTENSION_REPEAT:
- address_mode = CU_TR_ADDRESS_MODE_WRAP;
- break;
- case EXTENSION_EXTEND:
- address_mode = CU_TR_ADDRESS_MODE_CLAMP;
- break;
- case EXTENSION_CLIP:
- address_mode = CU_TR_ADDRESS_MODE_BORDER;
- break;
- default:
- assert(0);
- break;
- }
-
- CUfilter_mode filter_mode;
- if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
- filter_mode = CU_TR_FILTER_MODE_POINT;
- }
- else {
- filter_mode = CU_TR_FILTER_MODE_LINEAR;
- }
-
- /* Image Texture Storage */
- CUarray_format_enum format;
- switch (mem.data_type) {
- case TYPE_UCHAR:
- format = CU_AD_FORMAT_UNSIGNED_INT8;
- break;
- case TYPE_UINT16:
- format = CU_AD_FORMAT_UNSIGNED_INT16;
- break;
- case TYPE_UINT:
- format = CU_AD_FORMAT_UNSIGNED_INT32;
- break;
- case TYPE_INT:
- format = CU_AD_FORMAT_SIGNED_INT32;
- break;
- case TYPE_FLOAT:
- format = CU_AD_FORMAT_FLOAT;
- break;
- case TYPE_HALF:
- format = CU_AD_FORMAT_HALF;
- break;
- default:
- assert(0);
- return;
- }
-
- CUDAMem *cmem = NULL;
- CUarray array_3d = NULL;
- size_t src_pitch = mem.data_width * dsize * mem.data_elements;
- size_t dst_pitch = src_pitch;
-
- if (!mem.is_resident(this)) {
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
-
- if (mem.data_depth > 1) {
- array_3d = (CUarray)mem.device_pointer;
- cmem->array = array_3d;
- }
- else if (mem.data_height > 0) {
- dst_pitch = align_up(src_pitch, pitch_alignment);
- }
- }
- else if (mem.data_depth > 1) {
- /* 3D texture using an array; there is no API for linear memory. */
- CUDA_ARRAY3D_DESCRIPTOR desc;
-
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Depth = mem.data_depth;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
- desc.Flags = 0;
-
- VLOG(1) << "Array 3D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- cuda_assert(cuArray3DCreate(&array_3d, &desc));
-
- if (!array_3d) {
- return;
- }
-
- CUDA_MEMCPY3D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = array_3d;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
- param.Depth = mem.data_depth;
-
- cuda_assert(cuMemcpy3D(&param));
-
- mem.device_pointer = (device_ptr)array_3d;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
- cmem->array = array_3d;
- }
- else if (mem.data_height > 0) {
- /* 2D texture, using pitch aligned linear memory. */
- dst_pitch = align_up(src_pitch, pitch_alignment);
- size_t dst_size = dst_pitch * mem.data_height;
-
- cmem = generic_alloc(mem, dst_size - mem.memory_size());
- if (!cmem) {
- return;
- }
-
- CUDA_MEMCPY2D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
- param.dstDevice = mem.device_pointer;
- param.dstPitch = dst_pitch;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
-
- cuda_assert(cuMemcpy2DUnaligned(&param));
- }
- else {
- /* 1D texture, using linear memory. */
- cmem = generic_alloc(mem);
- if (!cmem) {
- return;
- }
-
- cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
- }
-
- /* Resize once */
- const uint slot = mem.slot;
- if (slot >= texture_info.size()) {
- /* Allocate some slots in advance, to reduce the number
- * of re-allocations. */
- texture_info.resize(slot + 128);
- }
-
- /* Set the mapping and tag that we need to (re-)upload to the device. */
- texture_info[slot] = mem.info;
- need_texture_info = true;
-
- if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
- mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
- /* Kepler+, bindless textures. */
- CUDA_RESOURCE_DESC resDesc;
- memset(&resDesc, 0, sizeof(resDesc));
-
- if (array_3d) {
- resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- resDesc.res.array.hArray = array_3d;
- resDesc.flags = 0;
- }
- else if (mem.data_height > 0) {
- resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
- resDesc.res.pitch2D.devPtr = mem.device_pointer;
- resDesc.res.pitch2D.format = format;
- resDesc.res.pitch2D.numChannels = mem.data_elements;
- resDesc.res.pitch2D.height = mem.data_height;
- resDesc.res.pitch2D.width = mem.data_width;
- resDesc.res.pitch2D.pitchInBytes = dst_pitch;
- }
- else {
- resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
- resDesc.res.linear.devPtr = mem.device_pointer;
- resDesc.res.linear.format = format;
- resDesc.res.linear.numChannels = mem.data_elements;
- resDesc.res.linear.sizeInBytes = mem.device_size;
- }
-
- CUDA_TEXTURE_DESC texDesc;
- memset(&texDesc, 0, sizeof(texDesc));
- texDesc.addressMode[0] = address_mode;
- texDesc.addressMode[1] = address_mode;
- texDesc.addressMode[2] = address_mode;
- texDesc.filterMode = filter_mode;
- texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
- thread_scoped_lock lock(cuda_mem_map_mutex);
- cmem = &cuda_mem_map[&mem];
-
- cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
-
- texture_info[slot].data = (uint64_t)cmem->texobject;
- }
- else {
- texture_info[slot].data = (uint64_t)mem.device_pointer;
- }
-}
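
The 2D path above pads each texture row to the device's pitch alignment before the cuMemcpy2DUnaligned() upload. A worked sketch of that padding for a hypothetical 1001-pixel-wide, 4-channel float texture, with align_up() reimplemented locally:

#include <cstddef>
#include <cstdio>

/* Hypothetical reimplementation of the align_up() used above; the alignment
 * must be a power of two. */
static std::size_t align_up(std::size_t n, std::size_t alignment)
{
  return (n + alignment - 1) & ~(alignment - 1);
}

int main()
{
  const std::size_t width = 1001, dsize = sizeof(float), channels = 4;
  const std::size_t pitch_alignment = 32; /* CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT */

  const std::size_t src_pitch = width * dsize * channels;             /* 16016 bytes */
  const std::size_t dst_pitch = align_up(src_pitch, pitch_alignment); /* 16032 bytes */
  std::printf("src_pitch %zu -> dst_pitch %zu (%zu bytes of row padding)\n",
              src_pitch, dst_pitch, dst_pitch - src_pitch);
  return 0;
}
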
-
-void CUDADevice::tex_free(device_texture &mem)
-{
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
- thread_scoped_lock lock(cuda_mem_map_mutex);
- const CUDAMem &cmem = cuda_mem_map[&mem];
-
- if (cmem.texobject) {
- /* Free bindless texture. */
- cuTexObjectDestroy(cmem.texobject);
- }
-
- if (!mem.is_resident(this)) {
- /* Do not free memory here, since it was allocated on a different device. */
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else if (cmem.array) {
- /* Free array. */
- cuArrayDestroy(cmem.array);
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else {
- lock.unlock();
- generic_free(mem);
- }
- }
-}
-
-# define CUDA_GET_BLOCKSIZE(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int threads = (int)sqrt((float)threads_per_block); \
- int xblocks = ((w) + threads - 1) / threads; \
- int yblocks = ((h) + threads - 1) / threads;
-
-# define CUDA_LAUNCH_KERNEL(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
-
-/* Similar to the above, but for 1-dimensional blocks. */
-# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
- int threads_per_block; \
- cuda_assert( \
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
- int yblocks = h;
-
-# define CUDA_LAUNCH_KERNEL_1D(func, args) \
- cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
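
For a kernel whose CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK is 1024, the 2D macro above yields 32x32-thread blocks and the 1D macro one flat block per 1024 elements. A quick standalone check of both computations for a hypothetical 1920x1080 buffer:

#include <cmath>
#include <cstdio>

int main()
{
  /* A typical value of CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK. */
  const int threads_per_block = 1024;
  const int w = 1920, h = 1080;

  /* 2D variant: square blocks, sqrt(max) threads per side. */
  const int threads = (int)std::sqrt((float)threads_per_block); /* 32 */
  const int xblocks = (w + threads - 1) / threads;              /* 60 */
  const int yblocks = (h + threads - 1) / threads;              /* 34 */
  std::printf("2D: %d x %d blocks of %d x %d threads\n", xblocks, yblocks, threads, threads);

  /* 1D variant: flat indexing, one thread per element. */
  const int xblocks_1d = (w * h + threads_per_block - 1) / threads_per_block; /* 2025 */
  std::printf("1D: %d blocks of %d threads\n", xblocks_1d, threads_per_block);
  return 0;
}
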
-
-bool CUDADevice::denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
- int frame_offset = 0;
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
- CUdeviceptr scale_ptr = 0;
-
- cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
- cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
-
- {
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
-
- void *calc_difference_args[] = {&guide_ptr,
- &variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &channel_offset,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {
- &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *update_output_args[] = {&blurDifference,
- &image_ptr,
- &out_ptr,
- &weightAccum,
- &w,
- &h,
- &stride,
- &pass_stride,
- &channel_offset,
- &r,
- &f};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
- }
-
- {
- CUfunction cuNLMNormalize;
- cuda_assert(
- cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
- cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
- void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
- CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
- CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
- cuda_assert(cuCtxSynchronize());
- }
-
- return !have_error();
-}
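
The NLM kernels above share one temporary allocation, split into difference, blurDifference and weightAccum regions by offset arithmetic rather than separate buffers. A tiny standalone check of those offsets, using hypothetical sizes:

#include <cstddef>
#include <cstdio>

int main()
{
  /* Hypothetical inputs: a 10-pixel search radius and a 4-float pass stride. */
  const std::size_t r = 10, pass_stride = 4;
  const std::size_t num_shifts = (2 * r + 1) * (2 * r + 1); /* 441 shifted copies */

  /* Byte offsets into the shared temporary_mem, as computed above. */
  const std::size_t difference = 0;
  const std::size_t blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
  const std::size_t weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;

  std::printf("difference @ %zu, blurDifference @ %zu, weightAccum @ %zu\n",
              difference, blurDifference, weightAccum);
  return 0;
}
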
-
-bool CUDADevice::denoising_construct_transform(DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterConstructTransform;
- cuda_assert(cuModuleGetFunction(
- &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
- CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
-
- void *args[] = {&task->buffer.mem.device_pointer,
- &task->tile_info_mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->filter_area,
- &task->rect,
- &task->radius,
- &task->pca_threshold,
- &task->buffer.pass_stride,
- &task->buffer.frame_stride,
- &task->buffer.use_time};
- CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int r = task->radius;
- int f = 4;
- float a = 1.0f;
- float k_2 = task->nlm_k_2;
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
-
- if (have_error())
- return false;
-
- CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
- CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
-
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
- cuda_assert(cuModuleGetFunction(
- &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(
- cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(
- &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
- task->reconstruction_state.source_w * task->reconstruction_state.source_h,
- num_shifts);
-
- void *calc_difference_args[] = {&color_ptr,
- &color_variance_ptr,
- &scale_ptr,
- &difference,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &pass_stride,
- &frame_offset,
- &a,
- &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&t,
- &blurDifference,
- &task->buffer.mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->reconstruction_state.filter_window,
- &w,
- &h,
- &stride,
- &pass_stride,
- &r,
- &f,
- &frame_offset,
- &task->buffer.use_time};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
-{
- CUfunction cuFinalize;
- cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
- cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
- void *finalize_args[] = {&output_ptr,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->filter_area,
- &task->reconstruction_state.buffer_params.x,
- &task->render_buffer.samples};
- CUDA_GET_BLOCKSIZE(
- cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
- CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterCombineHalves;
- cuda_assert(cuModuleGetFunction(
- &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
- CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDivideShadow;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &a_ptr,
- &b_ptr,
- &sample_variance_ptr,
- &sv_variance_ptr,
- &buffer_variance_ptr,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterGetFeature;
- cuda_assert(
- cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &mean_offset,
- &variance_offset,
- &mean_ptr,
- &variance_ptr,
- &scale,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterWriteFeature;
- cuda_assert(cuModuleGetFunction(
- &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
- void *args[] = {&task->render_buffer.samples,
- &task->reconstruction_state.buffer_params,
- &task->filter_area,
- &from_ptr,
- &buffer_ptr,
- &out_offset,
- &task->rect};
- CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
-{
- if (have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDetectOutliers;
- cuda_assert(cuModuleGetFunction(
- &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(
- cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- void *args[] = {
- &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride};
-
- CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
-}
-
-void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
-{
- denoising.functions.construct_transform = function_bind(
- &CUDADevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(rtile);
-}
-
-void CUDADevice::adaptive_sampling_filter(uint filter_sample,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream)
-{
- const int num_threads_per_block = functions.adaptive_num_threads_per_block;
-
-  /* These are a series of tiny kernels because there is no grid synchronization
-   * from within a kernel, so we use multiple kernel launches instead. */
- uint total_work_size = wtile->h * wtile->w;
- void *args2[] = {&d_wtile, &filter_sample, &total_work_size};
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_stopping,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
- total_work_size = wtile->h;
- num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_filter_x,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
- total_work_size = wtile->w;
- num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_filter_y,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args2,
- 0));
-}
-
-void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
- WorkTile *wtile,
- CUdeviceptr d_wtile,
- CUstream stream)
-{
- const int num_threads_per_block = functions.adaptive_num_threads_per_block;
- uint total_work_size = wtile->h * wtile->w;
-
- void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size};
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
- cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples,
- num_blocks,
- 1,
- 1,
- num_threads_per_block,
- 1,
- 1,
- 0,
- stream,
- args,
- 0));
-}
-
-void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
-{
- scoped_timer timer(&rtile.buffers->render_time);
-
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
- CUfunction cuRender;
-
- /* Get kernel function. */
- if (rtile.task == RenderTile::BAKE) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
- }
- else if (task.integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
- }
-
- if (have_error()) {
- return;
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
- /* Allocate work tile. */
- work_tiles.alloc(1);
-
- WorkTile *wtile = work_tiles.data();
- wtile->x = rtile.x;
- wtile->y = rtile.y;
- wtile->w = rtile.w;
- wtile->h = rtile.h;
- wtile->offset = rtile.offset;
- wtile->stride = rtile.stride;
- wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
-
- /* Prepare work size. More step samples render faster, but for now we
- * remain conservative for GPUs connected to a display to avoid driver
- * timeouts and display freezing. */
- int min_blocks, num_threads_per_block;
- cuda_assert(
- cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
- if (!info.display_device) {
- min_blocks *= 8;
- }
-
- uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
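-  /* Worked example with illustrative numbers: min_blocks = 960 (after the 8x
-   * multiplier for non-display devices) and num_threads_per_block = 256 on a
-   * 256x256 tile give step_samples = divide_up(960 * 256, 65536) = 4. */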
-
- /* Render all samples. */
- int start_sample = rtile.start_sample;
- int end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample;) {
- /* Setup and copy work tile to device. */
- wtile->start_sample = sample;
- wtile->num_samples = step_samples;
- if (task.adaptive_sampling.use) {
- wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
- }
- wtile->num_samples = min(wtile->num_samples, end_sample - sample);
- work_tiles.copy_to_device();
-
- CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
- uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
- /* Launch kernel. */
- void *args[] = {&d_work_tiles, &total_work_size};
-
- cuda_assert(
- cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
- /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
- uint filter_sample = sample + wtile->num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
- }
-
- cuda_assert(cuCtxSynchronize());
-
- /* Update progress. */
- sample += wtile->num_samples;
- rtile.sample = sample;
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- /* Finalize adaptive sampling. */
- if (task.adaptive_sampling.use) {
- CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
- adaptive_sampling_post(rtile, wtile, d_work_tiles);
- cuda_assert(cuCtxSynchronize());
- task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
- }
-}
-
-void CUDADevice::film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half)
-{
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilmConvert;
- CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
- CUdeviceptr d_buffer = (CUdeviceptr)buffer;
-
- /* get kernel function */
- if (rgba_half) {
- cuda_assert(
- cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
- }
-
- float sample_scale = 1.0f / (task.sample + 1);
-
- /* pass in parameters */
- void *args[] = {&d_rgba,
- &d_buffer,
- &sample_scale,
- &task.x,
- &task.y,
- &task.w,
- &task.h,
- &task.offset,
- &task.stride};
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
-
- int xthreads = (int)sqrt(threads_per_block);
- int ythreads = (int)sqrt(threads_per_block);
- int xblocks = (task.w + xthreads - 1) / xthreads;
- int yblocks = (task.h + ythreads - 1) / ythreads;
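-  /* Worked example (illustrative values): threads_per_block = 1024 yields
-   * 32x32 threads, so a 1920x1080 task launches xblocks = 60, yblocks = 34. */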
-
- cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(cuFilmConvert,
- xblocks,
- yblocks,
- 1, /* blocks */
- xthreads,
- ythreads,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
-
- cuda_assert(cuCtxSynchronize());
-}
-
-void CUDADevice::shader(DeviceTask &task)
-{
- if (have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuShader;
- CUdeviceptr d_input = (CUdeviceptr)task.shader_input;
- CUdeviceptr d_output = (CUdeviceptr)task.shader_output;
-
- /* get kernel function */
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
- }
-
-  /* do tasks in smaller chunks, so we can cancel them */
- const int shader_chunk_size = 65536;
- const int start = task.shader_x;
- const int end = task.shader_x + task.shader_w;
- int offset = task.offset;
-
- bool canceled = false;
- for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
- for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
- int shader_w = min(shader_chunk_size, end - shader_x);
-
- /* pass in parameters */
- void *args[8];
- int arg = 0;
- args[arg++] = &d_input;
- args[arg++] = &d_output;
- args[arg++] = &task.shader_eval_type;
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- args[arg++] = &task.shader_filter;
- }
- args[arg++] = &shader_x;
- args[arg++] = &shader_w;
- args[arg++] = &offset;
- args[arg++] = &sample;
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(
- &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
-
- int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuLaunchKernel(cuShader,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- cuda_assert(cuCtxSynchronize());
-
- if (task.get_cancel()) {
- canceled = true;
- break;
- }
- }
-
- task.update_progress(NULL);
- }
-}
-
-CUdeviceptr CUDADevice::map_pixels(device_ptr mem)
-{
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
- CUdeviceptr buffer;
-
- size_t bytes;
- cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
- cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
-
- return buffer;
- }
-
- return (CUdeviceptr)mem;
-}
-
-void CUDADevice::unmap_pixels(device_ptr mem)
-{
- if (!background) {
- PixelMem pmem = pixel_mem_map[mem];
-
- cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
- }
-}
-
-void CUDADevice::pixels_alloc(device_memory &mem)
-{
- PixelMem pmem;
-
- pmem.w = mem.data_width;
- pmem.h = mem.data_height;
-
- CUDAContextScope scope(this);
-
- glGenBuffers(1, &pmem.cuPBO);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- if (mem.data_type == TYPE_HALF)
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
- else
- glBufferData(
- GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &pmem.cuTexId);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF)
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
- else
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
- glBindTexture(GL_TEXTURE_2D, 0);
-
- CUresult result = cuGraphicsGLRegisterBuffer(
- &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
-
- if (result == CUDA_SUCCESS) {
- mem.device_pointer = pmem.cuTexId;
- pixel_mem_map[mem.device_pointer] = pmem;
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
-
- return;
- }
- else {
-    /* failed to register buffer, fall back to no interop */
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- background = true;
- }
-}
-
-void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h)
-{
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
- size_t offset = sizeof(uchar) * 4 * y * w;
- memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
- glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-}
-
-void CUDADevice::pixels_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
- mem.device_pointer = 0;
-
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
-}
-
-void CUDADevice::draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params)
-{
- assert(mem.type == MEM_PIXELS);
-
- if (!background) {
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
- float *vpointer;
-
- CUDAContextScope scope(this);
-
-    /* for multi devices, this uses the inefficient approach of allocating
-     * all pixels on the device even though we only render to a subset */
- size_t offset = 4 * y * w;
-
- if (mem.data_type == TYPE_HALF)
- offset *= sizeof(GLhalf);
- else
- offset *= sizeof(uint8_t);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- glActiveTexture(GL_TEXTURE0);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if (mem.data_type == TYPE_HALF) {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
- }
- else {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
- }
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- if (transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if (use_fallback_shader) {
- if (!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if (!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents -
- * avoids stalling if buffer is still waiting in queue to be rendered */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if (vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = (float)w / (float)pmem.w;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = (float)w / (float)pmem.w;
- vpointer[9] = (float)h / (float)pmem.h;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = (float)h / (float)pmem.h;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(
- texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute,
- 2,
- GL_FLOAT,
- GL_FALSE,
- 4 * sizeof(float),
- (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if (use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- if (transparent) {
- glDisable(GL_BLEND);
- }
-
- glBindTexture(GL_TEXTURE_2D, 0);
-
- return;
- }
-
- Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
-}
-
-void CUDADevice::thread_run(DeviceTask &task)
-{
- CUDAContextScope scope(this);
-
- if (task.type == DeviceTask::RENDER) {
- DeviceRequestedFeatures requested_features;
- if (use_split_kernel()) {
- if (split_kernel == NULL) {
- split_kernel = new CUDASplitKernel(this);
- split_kernel->load_kernels(requested_features);
- }
- }
-
- device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
- /* keep rendering tiles until done */
- RenderTile tile;
- DenoisingTask denoising(this, task);
-
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- if (use_split_kernel()) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(task, tile, void_buffer, void_buffer);
- }
- else {
- render(task, tile, work_tiles);
- }
- }
- else if (tile.task == RenderTile::BAKE) {
- render(task, tile, work_tiles);
- }
- else if (tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoise(tile, denoising);
-
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
-
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- work_tiles.free();
- }
- else if (task.type == DeviceTask::SHADER) {
- shader(task);
-
- cuda_assert(cuCtxSynchronize());
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- DenoisingTask denoising(this, task);
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-}
-
-void CUDADevice::task_add(DeviceTask &task)
-{
- CUDAContextScope scope(this);
-
- /* Load texture info. */
- load_texture_info();
-
- /* Synchronize all memory copies before executing task. */
- cuda_assert(cuCtxSynchronize());
-
- if (task.type == DeviceTask::FILM_CONVERT) {
- /* must be done in main thread due to opengl access */
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- }
- else {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
-}
-
-void CUDADevice::task_wait()
-{
- task_pool.wait();
-}
-
-void CUDADevice::task_cancel()
-{
- task_pool.cancel();
-}
-
-/* Redefine the cuda_assert macro so it can be used outside of the CUDADevice
- * class, now that the definition of that class is complete. */
-# undef cuda_assert
-# define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
- if (result != CUDA_SUCCESS) { \
- const char *name = cuewErrorString(result); \
- device->set_error( \
- string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
- } \
- } \
- (void)0
-
-/* CUDA context scope. */
-
-CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
-{
- cuda_assert(cuCtxPushCurrent(device->cuContext));
-}
-
-CUDAContextScope::~CUDAContextScope()
-{
- cuda_assert(cuCtxPopCurrent(NULL));
-}
-
-/* split kernel */
-
-class CUDASplitKernelFunction : public SplitKernelFunction {
- CUDADevice *device;
- CUfunction func;
-
- public:
- CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
- {
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
- {
- return enqueue(dim, NULL);
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, void *args[])
- {
- if (device->have_error())
- return false;
-
- CUDAContextScope scope(device);
-
- /* we ignore dim.local_size for now, as this is faster */
- int threads_per_block;
- cuda_assert(
- cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
-
- int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
- threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(func,
- xblocks,
- 1,
- 1, /* blocks */
- threads_per_block,
- 1,
- 1, /* threads */
- 0,
- 0,
- args,
- 0));
-
- return !device->have_error();
- }
-};
-
-CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
- device_memory & /*data*/,
- size_t num_threads)
-{
- CUDAContextScope scope(device);
-
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
-
- uint threads = num_threads;
- CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer;
-
- struct args_t {
- uint *num_threads;
- CUdeviceptr *size;
- };
-
- args_t args = {&threads, &d_size};
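-  /* cuLaunchKernel expects an array of pointers to the kernel arguments; a
-   * tightly packed struct of pointer members has the same memory layout, so
-   * passing (void **)&args below behaves like a void *args[2] array. */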
-
- CUfunction state_buffer_size;
- cuda_assert(
- cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
-
- cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
-
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
-
- return size;
-}
-
-bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory & /*kernel_globals*/,
- device_memory & /*kernel_data*/,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs)
-{
- CUDAContextScope scope(device);
-
- CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer;
- CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer;
- CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer;
- CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer;
- CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer;
-
- CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer;
-
- int end_sample = rtile.start_sample + rtile.num_samples;
- int queue_size = dim.global_size[0] * dim.global_size[1];
-
- struct args_t {
- CUdeviceptr *split_data_buffer;
- int *num_elements;
- CUdeviceptr *ray_state;
- int *start_sample;
- int *end_sample;
- int *sx;
- int *sy;
- int *sw;
- int *sh;
- int *offset;
- int *stride;
- CUdeviceptr *queue_index;
- int *queuesize;
- CUdeviceptr *use_queues_flag;
- CUdeviceptr *work_pool_wgs;
- int *num_samples;
- CUdeviceptr *buffer;
- };
-
- args_t args = {&d_split_data,
- &num_global_elements,
- &d_ray_state,
- &rtile.start_sample,
- &end_sample,
- &rtile.x,
- &rtile.y,
- &rtile.w,
- &rtile.h,
- &rtile.offset,
- &rtile.stride,
- &d_queue_index,
- &queue_size,
- &d_use_queues_flag,
- &d_work_pool_wgs,
- &rtile.num_samples,
- &d_buffer};
-
- CUfunction data_init;
- cuda_assert(
- cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
- if (device->have_error()) {
- return false;
- }
-
- CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
-
- return !device->have_error();
-}
-
-SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &)
-{
- const CUDAContextScope scope(device);
-
- CUfunction func;
- const CUresult result = cuModuleGetFunction(
- &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data());
- if (result != CUDA_SUCCESS) {
- device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)",
- kernel_name.data(),
- cuewErrorString(result)));
- return NULL;
- }
-
- return new CUDASplitKernelFunction(device, func);
-}
-
-int2 CUDASplitKernel::split_kernel_local_size()
-{
- return make_int2(32, 1);
-}
-
-int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask & /*task*/)
-{
- CUDAContextScope scope(device);
- size_t free;
- size_t total;
-
- cuda_assert(cuMemGetInfo(&free, &total));
-
- VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
- << " bytes. (" << string_human_readable_size(free) << ").";
-
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
- size_t side = round_down((int)sqrt(num_elements), 32);
- int2 global_size = make_int2(side, round_down(num_elements / side, 16));
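-  /* Illustrative example: num_elements = 2097152 gives sqrt = 1448, so
-   * side = round_down(1448, 32) = 1440 and the second dimension is
-   * round_down(2097152 / 1440, 16) = 1456. */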
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp
new file mode 100644
index 00000000000..37fab8f8293
--- /dev/null
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -0,0 +1,1370 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include <climits>
+# include <limits.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+# include "device/cuda/device_impl.h"
+
+# include "render/buffers.h"
+
+# include "util/util_debug.h"
+# include "util/util_foreach.h"
+# include "util/util_logging.h"
+# include "util/util_map.h"
+# include "util/util_md5.h"
+# include "util/util_opengl.h"
+# include "util/util_path.h"
+# include "util/util_string.h"
+# include "util/util_system.h"
+# include "util/util_time.h"
+# include "util/util_types.h"
+# include "util/util_windows.h"
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+bool CUDADevice::have_precompiled_kernels()
+{
+ string cubins_path = path_get("lib");
+ return path_exists(cubins_path);
+}
+
+bool CUDADevice::show_samples() const
+{
+ /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
+ return true;
+}
+
+BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
+{
+ return BVH_LAYOUT_BVH2;
+}
+
+void CUDADevice::set_error(const string &error)
+{
+ Device::set_error(error);
+
+ if (first_error) {
+ fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
+ fprintf(stderr,
+ "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
+ first_error = false;
+ }
+}
+
+CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL)
+{
+ first_error = true;
+
+ cuDevId = info.num;
+ cuDevice = 0;
+ cuContext = 0;
+
+ cuModule = 0;
+
+ need_texture_info = false;
+
+ device_texture_headroom = 0;
+ device_working_headroom = 0;
+ move_texture_to_host = false;
+ map_host_limit = 0;
+ map_host_used = 0;
+ can_map_host = 0;
+ pitch_alignment = 0;
+
+ /* Initialize CUDA. */
+ CUresult result = cuInit(0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
+ return;
+ }
+
+ /* Setup device and context. */
+ result = cuDeviceGet(&cuDevice, cuDevId);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
+ cuewErrorString(result)));
+ return;
+ }
+
+ /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+ * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+ * so we can predict which memory to map to host. */
+ cuda_assert(
+ cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+ cuda_assert(cuDeviceGetAttribute(
+ &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+
+ unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+ if (can_map_host) {
+ ctx_flags |= CU_CTX_MAP_HOST;
+ init_host_memory();
+ }
+
+ /* Create context. */
+ result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
+ return;
+ }
+
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+ cuDevArchitecture = major * 100 + minor * 10;
+
+ /* Pop context set by cuCtxCreate. */
+ cuCtxPopCurrent(NULL);
+}
+
+CUDADevice::~CUDADevice()
+{
+ texture_info.free();
+
+ cuda_assert(cuCtxDestroy(cuContext));
+}
+
+bool CUDADevice::support_device(const uint /*kernel_features*/)
+{
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* We only support sm_30 and above */
+ if (major < 3) {
+ set_error(string_printf(
+ "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
+ return false;
+ }
+
+ return true;
+}
+
+bool CUDADevice::check_peer_access(Device *peer_device)
+{
+ if (peer_device == this) {
+ return false;
+ }
+ if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
+ return false;
+ }
+
+ CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
+
+ int can_access = 0;
+ cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ // Ensure array access over the link is possible as well (for 3D textures)
+ cuda_assert(cuDeviceGetP2PAttribute(&can_access,
+ CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
+ cuDevice,
+ peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ // Enable peer access in both directions
+ {
+ const CUDAContextScope scope(this);
+ CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+ cuewErrorString(result)));
+ return false;
+ }
+ }
+ {
+ const CUDAContextScope scope(peer_device_cuda);
+ CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
+ if (result != CUDA_SUCCESS) {
+ set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
+ cuewErrorString(result)));
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool CUDADevice::use_adaptive_compilation()
+{
+ return DebugFlags().cuda.adaptive_compile;
+}
+
+/* Common NVCC flags, which stay the same regardless of shading model or
+ * kernel sources md5, and depend only on the compiler and compilation settings.
+ */
+string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
+{
+ const int machine = system_cpu_bits();
+ const string source_path = path_get("source");
+ const string include_path = source_path;
+ string cflags = string_printf(
+ "-m%d "
+ "--ptxas-options=\"-v\" "
+ "--use_fast_math "
+ "-DNVCC "
+ "-I\"%s\"",
+ machine,
+ include_path.c_str());
+ if (use_adaptive_compilation()) {
+ cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
+ }
+ const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
+ if (extra_cflags) {
+ cflags += string(" ") + string(extra_cflags);
+ }
+
+# ifdef WITH_NANOVDB
+ cflags += " -DWITH_NANOVDB";
+# endif
+
+ return cflags;
+}
+
+string CUDADevice::compile_kernel(const uint kernel_features,
+ const char *name,
+ const char *base,
+ bool force_ptx)
+{
+ /* Compute kernel name. */
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* Attempt to use kernel provided with Blender. */
+ if (!use_adaptive_compilation()) {
+ if (!force_ptx) {
+ const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return cubin;
+ }
+ }
+
+ /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
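+    /* For example (hypothetical device): for an sm_86 GPU the loop below
+     * probes compute_86, compute_85, ... compute_80, compute_79, ... down to
+     * compute_30, using the first PTX file found. */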
+ int ptx_major = major, ptx_minor = minor;
+ while (ptx_major >= 3) {
+ const string ptx = path_get(
+ string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
+ if (path_exists(ptx)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return ptx;
+ }
+
+ if (ptx_minor > 0) {
+ ptx_minor--;
+ }
+ else {
+ ptx_major--;
+ ptx_minor = 9;
+ }
+ }
+ }
+
+ /* Try to use locally compiled kernel. */
+ string source_path = path_get("source");
+ const string source_md5 = path_files_md5_hash(source_path);
+
+  /* We include the cflags in the md5 hash, so that changing the CUDA toolkit
+   * or other compiler command line arguments forces the cubin to be rebuilt.
+   */
+ string common_cflags = compile_kernel_get_common_cflags(kernel_features);
+ const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
+
+ const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
+ const char *const kernel_arch = force_ptx ? "compute" : "sm";
+ const string cubin_file = string_printf(
+ "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
+ const string cubin = path_cache_get(path_join("kernels", cubin_file));
+ VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using locally compiled kernel.";
+ return cubin;
+ }
+
+# ifdef _WIN32
+ if (!use_adaptive_compilation() && have_precompiled_kernels()) {
+ if (major < 3) {
+ set_error(
+ string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
+ "Your GPU is not supported.",
+ major,
+ minor));
+ }
+ else {
+ set_error(
+ string_printf("CUDA binary kernel for this graphics card compute "
+ "capability (%d.%d) not found.",
+ major,
+ minor));
+ }
+ return string();
+ }
+# endif
+
+ /* Compile. */
+ const char *const nvcc = cuewCompilerPath();
+ if (nvcc == NULL) {
+ set_error(
+ "CUDA nvcc compiler not found. "
+ "Install CUDA toolkit in default location.");
+ return string();
+ }
+
+ const int nvcc_cuda_version = cuewCompilerVersion();
+ VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
+ if (nvcc_cuda_version < 101) {
+ printf(
+ "Unsupported CUDA version %d.%d detected, "
+ "you need CUDA 10.1 or newer.\n",
+ nvcc_cuda_version / 10,
+ nvcc_cuda_version % 10);
+ return string();
+ }
+ else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 ||
+ nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) {
+ printf(
+ "CUDA version %d.%d detected, build may succeed but only "
+ "CUDA 10.1 to 11.4 are officially supported.\n",
+ nvcc_cuda_version / 10,
+ nvcc_cuda_version % 10);
+ }
+
+ double starttime = time_dt();
+
+ path_create_directories(cubin);
+
+ source_path = path_join(path_join(source_path, "kernel"),
+ path_join("device", path_join(base, string_printf("%s.cu", name))));
+
+ string command = string_printf(
+ "\"%s\" "
+ "-arch=%s_%d%d "
+ "--%s \"%s\" "
+ "-o \"%s\" "
+ "%s",
+ nvcc,
+ kernel_arch,
+ major,
+ minor,
+ kernel_ext,
+ source_path.c_str(),
+ cubin.c_str(),
+ common_cflags.c_str());
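+  /* With illustrative values, the command expands to roughly:
+   *   "/usr/.../nvcc" -arch=sm_75 --cubin "<source>/kernel/device/cuda/kernel.cu"
+   *       -o "<cache>/kernels/cycles_kernel_sm_75_<md5>.cubin" -m64 --ptxas-options="-v" ...
+   * where paths, architecture and flags depend on the local setup. */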
+
+ printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
+
+# ifdef _WIN32
+ command = "call " + command;
+# endif
+ if (system(command.c_str()) != 0) {
+ set_error(
+ "Failed to execute compilation command, "
+ "see console for details.");
+ return string();
+ }
+
+ /* Verify if compilation succeeded */
+ if (!path_exists(cubin)) {
+ set_error(
+ "CUDA kernel compilation failed, "
+ "see console for details.");
+ return string();
+ }
+
+ printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+ return cubin;
+}
+
+bool CUDADevice::load_kernels(const uint kernel_features)
+{
+  /* TODO(sergey): Support kernel re-loading for CUDA devices.
+   *
+   * Currently re-loading the kernel will invalidate memory pointers,
+   * causing problems in cuCtxSynchronize.
+   */
+ if (cuModule) {
+ VLOG(1) << "Skipping kernel reload, not currently supported.";
+ return true;
+ }
+
+ /* check if cuda init succeeded */
+ if (cuContext == 0)
+ return false;
+
+ /* check if GPU is supported */
+ if (!support_device(kernel_features))
+ return false;
+
+ /* get kernel */
+ const char *kernel_name = "kernel";
+ string cubin = compile_kernel(kernel_features, kernel_name);
+ if (cubin.empty())
+ return false;
+
+ /* open module */
+ CUDAContextScope scope(this);
+
+ string cubin_data;
+ CUresult result;
+
+ if (path_read_text(cubin, cubin_data))
+ result = cuModuleLoadData(&cuModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if (result != CUDA_SUCCESS)
+ set_error(string_printf(
+ "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
+
+ if (result == CUDA_SUCCESS) {
+ kernels.load(this);
+ reserve_local_memory(kernel_features);
+ }
+
+ return (result == CUDA_SUCCESS);
+}
+
+void CUDADevice::reserve_local_memory(const uint /* kernel_features */)
+{
+ /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
+ * needed for kernel launches, so that we can reliably figure out when
+ * to allocate scene data in mapped host memory. */
+ size_t total = 0, free_before = 0, free_after = 0;
+
+ {
+ CUDAContextScope scope(this);
+ cuMemGetInfo(&free_before, &total);
+ }
+
+ {
+ /* Use the biggest kernel for estimation. */
+ const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE;
+
+    /* Launch the kernel; using just 1 block appears sufficient to reserve memory
+     * for all multiprocessors. It would still be good to do this in parallel for
+     * the multi-GPU case to make it faster. */
+ CUDADeviceQueue queue(this);
+
+ void *d_path_index = nullptr;
+ void *d_render_buffer = nullptr;
+ int d_work_size = 0;
+ void *args[] = {&d_path_index, &d_render_buffer, &d_work_size};
+
+ queue.init_execution();
+ queue.enqueue(test_kernel, 1, args);
+ queue.synchronize();
+ }
+
+ {
+ CUDAContextScope scope(this);
+ cuMemGetInfo(&free_after, &total);
+ }
+
+ VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+ << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
+
+# if 0
+ /* For testing mapped host memory, fill up device memory. */
+ const size_t keep_mb = 1024;
+
+ while (free_after > keep_mb * 1024 * 1024LL) {
+ CUdeviceptr tmp;
+ cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+ cuMemGetInfo(&free_after, &total);
+ }
+# endif
+}
+
+void CUDADevice::init_host_memory()
+{
+ /* Limit amount of host mapped memory, because allocating too much can
+ * cause system instability. Leave at least half or 4 GB of system
+ * memory free, whichever is smaller. */
+ size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+ size_t system_ram = system_physical_ram();
+
+ if (system_ram > 0) {
+ if (system_ram / 2 > default_limit) {
+ map_host_limit = system_ram - default_limit;
+ }
+ else {
+ map_host_limit = system_ram / 2;
+ }
+ }
+ else {
+ VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+ map_host_limit = 0;
+ }
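+
+  /* Example with illustrative numbers: with 32 GB of system RAM, half (16 GB)
+   * exceeds the 4 GB default limit, so map_host_limit = 32 - 4 = 28 GB; with
+   * 6 GB of RAM, half is 3 GB, below 4 GB, so map_host_limit = 3 GB. */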
+
+  /* Amount of device memory to keep free after texture memory and working
+   * memory allocations, respectively. We set the working memory headroom
+   * lower so that some space is left after all texture memory allocations. */
+ device_working_headroom = 32 * 1024 * 1024LL; // 32MB
+ device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
+
+ VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+ << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
+
+void CUDADevice::load_texture_info()
+{
+ if (need_texture_info) {
+ /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
+ * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
+ need_texture_info = false;
+ texture_info.copy_to_device();
+ }
+}
+
+void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
+{
+ /* Break out of recursive call, which can happen when moving memory on a multi device. */
+ static bool any_device_moving_textures_to_host = false;
+ if (any_device_moving_textures_to_host) {
+ return;
+ }
+
+ /* Signal to reallocate textures in host memory only. */
+ move_texture_to_host = true;
+
+ while (size > 0) {
+ /* Find suitable memory allocation to move. */
+ device_memory *max_mem = NULL;
+ size_t max_size = 0;
+ bool max_is_image = false;
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
+ device_memory &mem = *pair.first;
+ CUDAMem *cmem = &pair.second;
+
+      /* We can only move textures allocated on this device (and not those from
+       * peer devices), and need to ignore memory that is already on the host. */
+ if (!mem.is_resident(this) || cmem->use_mapped_host) {
+ continue;
+ }
+
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
+ (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ /* Can't move this type of memory. */
+ if (!is_texture || cmem->array) {
+ continue;
+ }
+
+ /* For other textures, only move image textures. */
+ if (for_texture && !is_image) {
+ continue;
+ }
+
+ /* Try to move largest allocation, prefer moving images. */
+ if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+ max_is_image = is_image;
+ max_size = mem.device_size;
+ max_mem = &mem;
+ }
+ }
+ lock.unlock();
+
+ /* Move to host memory. This part is mutex protected since
+ * multiple CUDA devices could be moving the memory. The
+ * first one will do it, and the rest will adopt the pointer. */
+ if (max_mem) {
+ VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+ static thread_mutex move_mutex;
+ thread_scoped_lock lock(move_mutex);
+
+ any_device_moving_textures_to_host = true;
+
+      /* Potentially need to call back into the multi device, so the pointer
+       * mapping and peer devices are updated. This is also necessary since the
+       * device pointer may just be a key here, so it cannot be accessed and
+       * freed directly. Unfortunately it does mean that memory is reallocated
+       * on all other devices as well, which is potentially dangerous when still
+       * in use (since a thread rendering on another device would only be caught
+       * by this mutex if it happens to do an allocation at the same time). */
+ max_mem->device_copy_to();
+ size = (max_size >= size) ? 0 : size - max_size;
+
+ any_device_moving_textures_to_host = false;
+ }
+ else {
+ break;
+ }
+ }
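+
+  /* Example (illustrative): to reclaim 300 MB, the loop might first move a
+   * 200 MB image (largest, images preferred), leaving 100 MB, then a 150 MB
+   * texture, after which size reaches 0 and the loop exits. */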
+
+ /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+ move_texture_to_host = false;
+
+ /* Update texture info array with new pointers. */
+ load_texture_info();
+}
+
+CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+ CUDAContextScope scope(this);
+
+ CUdeviceptr device_pointer = 0;
+ size_t size = mem.memory_size() + pitch_padding;
+
+ CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+ const char *status = "";
+
+ /* First try allocating in device memory, respecting headroom. We make
+ * an exception for texture info. It is small and frequently accessed,
+ * so treat it as working memory.
+ *
+ * If there is not enough room for working memory, we will try to move
+ * textures to host memory, assuming the performance impact would have
+ * been worse for working memory. */
+ bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+ size_t total = 0, free = 0;
+ cuMemGetInfo(&free, &total);
+
+ /* Move textures to host memory if needed. */
+ if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+ move_textures_to_host(size + headroom - free, is_texture);
+ cuMemGetInfo(&free, &total);
+ }
+
+ /* Allocate in device memory. */
+ if (!move_texture_to_host && (size + headroom) < free) {
+ mem_alloc_result = cuMemAlloc(&device_pointer, size);
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ status = " in device memory";
+ }
+ }
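+
+  /* Example (illustrative): with free = 1 GB, a 128 MB working-memory request
+   * satisfies 128 MB + 32 MB headroom < 1 GB and lands in device memory; a
+   * 960 MB texture request would instead trigger move_textures_to_host()
+   * above first, when mapped host memory is available. */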
+
+ /* Fall back to mapped host memory if needed and possible. */
+
+ void *shared_pointer = 0;
+
+ if (mem_alloc_result != CUDA_SUCCESS && can_map_host) {
+ if (mem.shared_pointer) {
+ /* Another device already allocated host memory. */
+ mem_alloc_result = CUDA_SUCCESS;
+ shared_pointer = mem.shared_pointer;
+ }
+ else if (map_host_used + size < map_host_limit) {
+ /* Allocate host memory ourselves. */
+ mem_alloc_result = cuMemHostAlloc(
+ &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+
+ assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
+ (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
+ }
+
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
+ map_host_used += size;
+ status = " in host memory";
+ }
+ }
+
+ if (mem_alloc_result != CUDA_SUCCESS) {
+ status = " failed, out of device and host memory";
+ set_error("System is out of GPU and shared host memory");
+ }
+
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")" << status;
+ }
+
+ mem.device_pointer = (device_ptr)device_pointer;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ if (!mem.device_pointer) {
+ return NULL;
+ }
+
+ /* Insert into map of allocations. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ CUDAMem *cmem = &cuda_mem_map[&mem];
+ if (shared_pointer != 0) {
+ /* Replace host pointer with our host allocation. Only works if
+ * CUDA memory layout is the same and has no pitch padding. Also
+ * does not work if we move textures to host during a render,
+ * since other devices might be using the memory. */
+
+ if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+ mem.host_pointer != shared_pointer) {
+ memcpy(shared_pointer, mem.host_pointer, size);
+
+      /* A call to device_memory::host_free() should be preceded by a call to
+       * device_memory::device_free() for host memory allocated by a device to
+       * be handled properly. Two exceptions are here and a call in
+       * OptiXDevice::generic_alloc(), where the current host memory can be
+       * assumed to have been allocated by device_memory::host_alloc(), not by
+       * a device. */
+
+ mem.host_free();
+ mem.host_pointer = shared_pointer;
+ }
+ mem.shared_pointer = shared_pointer;
+ mem.shared_counter++;
+ cmem->use_mapped_host = true;
+ }
+ else {
+ cmem->use_mapped_host = false;
+ }
+
+ return cmem;
+}
+
+void CUDADevice::generic_copy_to(device_memory &mem)
+{
+ if (!mem.host_pointer || !mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
+ * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
+ * mem.host_pointer. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(
+ cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
+ }
+}
+
+void CUDADevice::generic_free(device_memory &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ /* If cmem.use_mapped_host is true, reference counting is used
+ * to safely free a mapped host memory. */
+
+ if (cmem.use_mapped_host) {
+ assert(mem.shared_pointer);
+ if (mem.shared_pointer) {
+ assert(mem.shared_counter > 0);
+ if (--mem.shared_counter == 0) {
+ if (mem.host_pointer == mem.shared_pointer) {
+ mem.host_pointer = 0;
+ }
+ cuMemFreeHost(mem.shared_pointer);
+ mem.shared_pointer = 0;
+ }
+ }
+ map_host_used -= mem.device_size;
+ }
+ else {
+ /* Free device memory. */
+ cuda_assert(cuMemFree(mem.device_pointer));
+ }
+
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+}
+
+void CUDADevice::mem_alloc(device_memory &mem)
+{
+ if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else if (mem.type == MEM_GLOBAL) {
+ assert(!"mem_alloc not supported for global memory.");
+ }
+ else {
+ generic_alloc(mem);
+ }
+}
+
+void CUDADevice::mem_copy_to(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ global_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ tex_alloc((device_texture &)mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ generic_alloc(mem);
+ }
+ generic_copy_to(mem);
+ }
+}
+
+void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+{
+ if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
+ assert(!"mem_copy_from not supported for textures.");
+ }
+ else if (mem.host_pointer) {
+ const size_t size = elem * w * h;
+ const size_t offset = elem * y * w;
+
+ if (mem.device_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemcpyDtoH(
+ (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
+ }
+ else {
+ memset((char *)mem.host_pointer + offset, 0, size);
+ }
+ }
+}
+
+void CUDADevice::mem_zero(device_memory &mem)
+{
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+ if (!mem.device_pointer) {
+ return;
+ }
+
+ /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
+ * regardless of mem.host_pointer and mem.shared_pointer. */
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+ const CUDAContextScope scope(this);
+ cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
+ }
+ else if (mem.host_pointer) {
+ memset(mem.host_pointer, 0, mem.memory_size());
+ }
+}
+
+void CUDADevice::mem_free(device_memory &mem)
+{
+ if (mem.type == MEM_GLOBAL) {
+ global_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free((device_texture &)mem);
+ }
+ else {
+ generic_free(mem);
+ }
+}
+
+device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+{
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+}
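+
+/* Usage sketch (illustrative): for a float4 buffer, offset = 10 advances the
+ * device pointer by 10 elements, i.e. 10 * 16 bytes, assuming
+ * memory_elements_size() returns offset * element size. */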
+
+void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ CUDAContextScope scope(this);
+ CUdeviceptr mem;
+ size_t bytes;
+
+ cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
+ // assert(bytes == size);
+ cuda_assert(cuMemcpyHtoD(mem, host, size));
+}
+
+void CUDADevice::global_alloc(device_memory &mem)
+{
+ if (mem.is_resident(this)) {
+ generic_alloc(mem);
+ generic_copy_to(mem);
+ }
+
+ const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
+}
+
+void CUDADevice::global_free(device_memory &mem)
+{
+ if (mem.is_resident(this) && mem.device_pointer) {
+ generic_free(mem);
+ }
+}
+
+void CUDADevice::tex_alloc(device_texture &mem)
+{
+ CUDAContextScope scope(this);
+
+ /* General variables for both architectures */
+ string bind_name = mem.name;
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ switch (mem.info.extension) {
+ case EXTENSION_REPEAT:
+ address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ break;
+ case EXTENSION_EXTEND:
+ address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+ break;
+ case EXTENSION_CLIP:
+ address_mode = CU_TR_ADDRESS_MODE_BORDER;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ CUfilter_mode filter_mode;
+ if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
+ filter_mode = CU_TR_FILTER_MODE_POINT;
+ }
+ else {
+ filter_mode = CU_TR_FILTER_MODE_LINEAR;
+ }
+
+ /* Image Texture Storage */
+ CUarray_format_enum format;
+ switch (mem.data_type) {
+ case TYPE_UCHAR:
+ format = CU_AD_FORMAT_UNSIGNED_INT8;
+ break;
+ case TYPE_UINT16:
+ format = CU_AD_FORMAT_UNSIGNED_INT16;
+ break;
+ case TYPE_UINT:
+ format = CU_AD_FORMAT_UNSIGNED_INT32;
+ break;
+ case TYPE_INT:
+ format = CU_AD_FORMAT_SIGNED_INT32;
+ break;
+ case TYPE_FLOAT:
+ format = CU_AD_FORMAT_FLOAT;
+ break;
+ case TYPE_HALF:
+ format = CU_AD_FORMAT_HALF;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ CUDAMem *cmem = NULL;
+ CUarray array_3d = NULL;
+ size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+ size_t dst_pitch = src_pitch;
+
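+ /* The memory is resident on another device (peer access): only record the
+ * existing array and pitch here, do not allocate. */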
+ if (!mem.is_resident(this)) {
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+
+ if (mem.data_depth > 1) {
+ array_3d = (CUarray)mem.device_pointer;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ }
+ }
+ else if (mem.data_depth > 1) {
+ /* 3D texture using an array; there is no API for 3D linear memory. */
+ CUDA_ARRAY3D_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Depth = mem.data_depth;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+ desc.Flags = 0;
+
+ VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ cuda_assert(cuArray3DCreate(&array_3d, &desc));
+
+ if (!array_3d) {
+ return;
+ }
+
+ CUDA_MEMCPY3D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = array_3d;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+ param.Depth = mem.data_depth;
+
+ cuda_assert(cuMemcpy3D(&param));
+
+ mem.device_pointer = (device_ptr)array_3d;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ /* 2D texture, using pitch aligned linear memory. */
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ size_t dst_size = dst_pitch * mem.data_height;
+
+ cmem = generic_alloc(mem, dst_size - mem.memory_size());
+ if (!cmem) {
+ return;
+ }
+
+ CUDA_MEMCPY2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ param.dstDevice = mem.device_pointer;
+ param.dstPitch = dst_pitch;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ cuda_assert(cuMemcpy2DUnaligned(&param));
+ }
+ else {
+ /* 1D texture, using linear memory. */
+ cmem = generic_alloc(mem);
+ if (!cmem) {
+ return;
+ }
+
+ cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+ }
+
+ /* Resize once */
+ const uint slot = mem.slot;
+ if (slot >= texture_info.size()) {
+ /* Allocate some slots in advance, to reduce the number
+ * of re-allocations. */
+ texture_info.resize(slot + 128);
+ }
+
+ /* Set mapping and tag that we need to (re-)upload to the device. */
+ texture_info[slot] = mem.info;
+ need_texture_info = true;
+
+ if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
+ mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+ /* Kepler+, bindless textures. */
+ CUDA_RESOURCE_DESC resDesc;
+ memset(&resDesc, 0, sizeof(resDesc));
+
+ if (array_3d) {
+ resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ resDesc.res.array.hArray = array_3d;
+ resDesc.flags = 0;
+ }
+ else if (mem.data_height > 0) {
+ resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+ resDesc.res.pitch2D.devPtr = mem.device_pointer;
+ resDesc.res.pitch2D.format = format;
+ resDesc.res.pitch2D.numChannels = mem.data_elements;
+ resDesc.res.pitch2D.height = mem.data_height;
+ resDesc.res.pitch2D.width = mem.data_width;
+ resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+ }
+ else {
+ resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+ resDesc.res.linear.devPtr = mem.device_pointer;
+ resDesc.res.linear.format = format;
+ resDesc.res.linear.numChannels = mem.data_elements;
+ resDesc.res.linear.sizeInBytes = mem.device_size;
+ }
+
+ CUDA_TEXTURE_DESC texDesc;
+ memset(&texDesc, 0, sizeof(texDesc));
+ texDesc.addressMode[0] = address_mode;
+ texDesc.addressMode[1] = address_mode;
+ texDesc.addressMode[2] = address_mode;
+ texDesc.filterMode = filter_mode;
+ texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ cmem = &cuda_mem_map[&mem];
+
+ cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+ texture_info[slot].data = (uint64_t)cmem->texobject;
+ }
+ else {
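+ /* NanoVDB grids are read through the raw device pointer; no texture object
+ * is created for them. */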
+ texture_info[slot].data = (uint64_t)mem.device_pointer;
+ }
+}
+
+void CUDADevice::tex_free(device_texture &mem)
+{
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ thread_scoped_lock lock(cuda_mem_map_mutex);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ if (cmem.texobject) {
+ /* Free bindless texture. */
+ cuTexObjectDestroy(cmem.texobject);
+ }
+
+ if (!mem.is_resident(this)) {
+ /* Do not free memory here, since it was allocated on a different device. */
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else if (cmem.array) {
+ /* Free array. */
+ cuArrayDestroy(cmem.array);
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else {
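+ /* Linear memory: release the map lock first, since generic_free() takes
+ * it itself. */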
+ lock.unlock();
+ generic_free(mem);
+ }
+ }
+}
+
+# if 0
+void CUDADevice::render(DeviceTask &task,
+ RenderTile &rtile,
+ device_vector<KernelWorkTile> &work_tiles)
+{
+ scoped_timer timer(&rtile.buffers->render_time);
+
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+ CUfunction cuRender;
+
+ /* Get kernel function. */
+ if (rtile.task == RenderTile::BAKE) {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
+ }
+
+ if (have_error()) {
+ return;
+ }
+
+ cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
+
+ /* Allocate work tile. */
+ work_tiles.alloc(1);
+
+ KernelWorkTile *wtile = work_tiles.data();
+ wtile->x = rtile.x;
+ wtile->y = rtile.y;
+ wtile->w = rtile.w;
+ wtile->h = rtile.h;
+ wtile->offset = rtile.offset;
+ wtile->stride = rtile.stride;
+ wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
+
+ /* Prepare work size. More step samples render faster, but for now we
+ * remain conservative for GPUs connected to a display to avoid driver
+ * timeouts and display freezing. */
+ int min_blocks, num_threads_per_block;
+ cuda_assert(
+ cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
+ if (!info.display_device) {
+ min_blocks *= 8;
+ }
+
+ uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+
+ /* Render all samples. */
+ uint start_sample = rtile.start_sample;
+ uint end_sample = rtile.start_sample + rtile.num_samples;
+
+ for (int sample = start_sample; sample < end_sample;) {
+ /* Setup and copy work tile to device. */
+ wtile->start_sample = sample;
+ wtile->num_samples = step_samples;
+ if (task.adaptive_sampling.use) {
+ wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
+ }
+ wtile->num_samples = min(wtile->num_samples, end_sample - sample);
+ work_tiles.copy_to_device();
+
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles, &total_work_size};
+
+ cuda_assert(
+ cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+ /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
+ uint filter_sample = sample + wtile->num_samples - 1;
+ if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
+ adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
+ }
+
+ cuda_assert(cuCtxSynchronize());
+
+ /* Update progress. */
+ sample += wtile->num_samples;
+ rtile.sample = sample;
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ /* Finalize adaptive sampling. */
+ if (task.adaptive_sampling.use) {
+ CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
+ adaptive_sampling_post(rtile, wtile, d_work_tiles);
+ cuda_assert(cuCtxSynchronize());
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+ }
+}
+
+void CUDADevice::thread_run(DeviceTask &task)
+{
+ CUDAContextScope scope(this);
+
+ if (task.type == DeviceTask::RENDER) {
+ device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
+ /* keep rendering tiles until done */
+ RenderTile tile;
+ DenoisingTask denoising(this, task);
+
+ while (task.acquire_tile(this, tile, task.tile_types)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ render(task, tile, work_tiles);
+ }
+ else if (tile.task == RenderTile::BAKE) {
+ render(task, tile, work_tiles);
+ }
+
+ task.release_tile(tile);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ work_tiles.free();
+ }
+}
+# endif
+
+unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
+{
+ return make_unique<CUDADeviceQueue>(this);
+}
+
+bool CUDADevice::should_use_graphics_interop()
+{
+ /* Check whether this device is part of OpenGL context.
+ *
+ * Using a CUDA device for graphics interoperability when it is not part of the OpenGL context
+ * is possible, but empirical measurements show it can be considerably slower than a naive
+ * pixel copy. */
+
+ CUDAContextScope scope(this);
+
+ int num_all_devices = 0;
+ cuda_assert(cuDeviceGetCount(&num_all_devices));
+
+ if (num_all_devices == 0) {
+ return false;
+ }
+
+ vector<CUdevice> gl_devices(num_all_devices);
+ uint num_gl_devices;
+ cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);
+
+ for (CUdevice gl_device : gl_devices) {
+ if (gl_device == cuDevice) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+int CUDADevice::get_num_multiprocessors()
+{
+ return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
+}
+
+int CUDADevice::get_max_num_threads_per_multiprocessor()
+{
+ return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
+}
+
+bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
+{
+ CUDAContextScope scope(this);
+
+ return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
+}
+
+int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value)
+{
+ int value = 0;
+ if (!get_device_attribute(attribute, &value)) {
+ return default_value;
+ }
+ return value;
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h
new file mode 100644
index 00000000000..6b27db54ab4
--- /dev/null
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/kernel.h"
+# include "device/cuda/queue.h"
+# include "device/cuda/util.h"
+# include "device/device.h"
+
+# include "util/util_map.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include "util/util_opengl.h"
+# include <cuda.h>
+# include <cudaGL.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+class CUDADevice : public Device {
+
+ friend class CUDAContextScope;
+
+ public:
+ CUdevice cuDevice;
+ CUcontext cuContext;
+ CUmodule cuModule;
+ size_t device_texture_headroom;
+ size_t device_working_headroom;
+ bool move_texture_to_host;
+ size_t map_host_used;
+ size_t map_host_limit;
+ int can_map_host;
+ int pitch_alignment;
+ int cuDevId;
+ int cuDevArchitecture;
+ bool first_error;
+
+ struct CUDAMem {
+ CUDAMem() : texobject(0), array(0), use_mapped_host(false)
+ {
+ }
+
+ CUtexObject texobject;
+ CUarray array;
+
+ /* If true, the mapped host memory in shared_pointer is being used. */
+ bool use_mapped_host;
+ };
+ typedef map<device_memory *, CUDAMem> CUDAMemMap;
+ CUDAMemMap cuda_mem_map;
+ thread_mutex cuda_mem_map_mutex;
+
+ /* Bindless Textures */
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+ CUDADeviceKernels kernels;
+
+ static bool have_precompiled_kernels();
+
+ virtual bool show_samples() const override;
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+ void set_error(const string &error) override;
+
+ CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+ virtual ~CUDADevice();
+
+ bool support_device(const uint /*kernel_features*/);
+
+ bool check_peer_access(Device *peer_device) override;
+
+ bool use_adaptive_compilation();
+
+ virtual string compile_kernel_get_common_cflags(const uint kernel_features);
+
+ string compile_kernel(const uint kernel_features,
+ const char *name,
+ const char *base = "cuda",
+ bool force_ptx = false);
+
+ virtual bool load_kernels(const uint kernel_features) override;
+
+ void reserve_local_memory(const uint kernel_features);
+
+ void init_host_memory();
+
+ void load_texture_info();
+
+ void move_textures_to_host(size_t size, bool for_texture);
+
+ CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+
+ void generic_copy_to(device_memory &mem);
+
+ void generic_free(device_memory &mem);
+
+ void mem_alloc(device_memory &mem) override;
+
+ void mem_copy_to(device_memory &mem) override;
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+
+ void mem_zero(device_memory &mem) override;
+
+ void mem_free(device_memory &mem) override;
+
+ device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+
+ virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void global_alloc(device_memory &mem);
+
+ void global_free(device_memory &mem);
+
+ void tex_alloc(device_texture &mem);
+
+ void tex_free(device_texture &mem);
+
+ virtual bool should_use_graphics_interop() override;
+
+ virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+ int get_num_multiprocessors();
+ int get_max_num_threads_per_multiprocessor();
+
+ protected:
+ bool get_device_attribute(CUdevice_attribute attribute, int *value);
+ int get_device_default_attribute(CUdevice_attribute attribute, int default_value);
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/graphics_interop.cpp b/intern/cycles/device/cuda/graphics_interop.cpp
new file mode 100644
index 00000000000..e8ca8b90eae
--- /dev/null
+++ b/intern/cycles/device/cuda/graphics_interop.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/graphics_interop.h"
+
+# include "device/cuda/device_impl.h"
+# include "device/cuda/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+CUDADeviceGraphicsInterop::CUDADeviceGraphicsInterop(CUDADeviceQueue *queue)
+ : queue_(queue), device_(static_cast<CUDADevice *>(queue->device))
+{
+}
+
+CUDADeviceGraphicsInterop::~CUDADeviceGraphicsInterop()
+{
+ CUDAContextScope scope(device_);
+
+ if (cu_graphics_resource_) {
+ cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_));
+ }
+}
+
+void CUDADeviceGraphicsInterop::set_destination(
+ const DeviceGraphicsInteropDestination &destination)
+{
+ const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height;
+
+ need_clear_ = destination.need_clear;
+
+ if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) {
+ return;
+ }
+
+ CUDAContextScope scope(device_);
+
+ if (cu_graphics_resource_) {
+ cuda_device_assert(device_, cuGraphicsUnregisterResource(cu_graphics_resource_));
+ }
+
+ const CUresult result = cuGraphicsGLRegisterBuffer(
+ &cu_graphics_resource_, destination.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+ if (result != CUDA_SUCCESS) {
+ LOG(ERROR) << "Error registering OpenGL buffer: " << cuewErrorString(result);
+ }
+
+ opengl_pbo_id_ = destination.opengl_pbo_id;
+ buffer_area_ = new_buffer_area;
+}
+
+device_ptr CUDADeviceGraphicsInterop::map()
+{
+ if (!cu_graphics_resource_) {
+ return 0;
+ }
+
+ CUDAContextScope scope(device_);
+
+ CUdeviceptr cu_buffer;
+ size_t bytes;
+
+ cuda_device_assert(device_, cuGraphicsMapResources(1, &cu_graphics_resource_, queue_->stream()));
+ cuda_device_assert(
+ device_, cuGraphicsResourceGetMappedPointer(&cu_buffer, &bytes, cu_graphics_resource_));
+
+ if (need_clear_) {
+ cuda_device_assert(
+ device_, cuMemsetD8Async(static_cast<CUdeviceptr>(cu_buffer), 0, bytes, queue_->stream()));
+
+ need_clear_ = false;
+ }
+
+ return static_cast<device_ptr>(cu_buffer);
+}
+
+void CUDADeviceGraphicsInterop::unmap()
+{
+ CUDAContextScope scope(device_);
+
+ cuda_device_assert(device_,
+ cuGraphicsUnmapResources(1, &cu_graphics_resource_, queue_->stream()));
+}
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/graphics_interop.h b/intern/cycles/device/cuda/graphics_interop.h
new file mode 100644
index 00000000000..8a70c8aa71d
--- /dev/null
+++ b/intern/cycles/device/cuda/graphics_interop.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/device_graphics_interop.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+class CUDADeviceQueue;
+
+class CUDADeviceGraphicsInterop : public DeviceGraphicsInterop {
+ public:
+ explicit CUDADeviceGraphicsInterop(CUDADeviceQueue *queue);
+
+ CUDADeviceGraphicsInterop(const CUDADeviceGraphicsInterop &other) = delete;
+ CUDADeviceGraphicsInterop(CUDADeviceGraphicsInterop &&other) noexcept = delete;
+
+ ~CUDADeviceGraphicsInterop();
+
+ CUDADeviceGraphicsInterop &operator=(const CUDADeviceGraphicsInterop &other) = delete;
+ CUDADeviceGraphicsInterop &operator=(CUDADeviceGraphicsInterop &&other) = delete;
+
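+ /* Expected call order: set_destination() whenever the PBO or its size
+ * changes, then map() to obtain a device pointer for the duration of a
+ * render update, and unmap() before OpenGL consumes the buffer. */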
+ virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override;
+
+ virtual device_ptr map() override;
+ virtual void unmap() override;
+
+ protected:
+ CUDADeviceQueue *queue_ = nullptr;
+ CUDADevice *device_ = nullptr;
+
+ /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */
+ uint opengl_pbo_id_ = 0;
+ /* Buffer area in pixels of the corresponding PBO. */
+ int64_t buffer_area_ = 0;
+
+ /* The destination was requested to be cleared. */
+ bool need_clear_ = false;
+
+ CUgraphicsResource cu_graphics_resource_ = nullptr;
+};
+
+CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/device/cuda/kernel.cpp b/intern/cycles/device/cuda/kernel.cpp
new file mode 100644
index 00000000000..a4a7bfabce0
--- /dev/null
+++ b/intern/cycles/device/cuda/kernel.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/kernel.h"
+# include "device/cuda/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+void CUDADeviceKernels::load(CUDADevice *device)
+{
+ CUmodule cuModule = device->cuModule;
+
+ for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
+ CUDADeviceKernel &kernel = kernels_[i];
+
+ /* No mega-kernel used for GPU. */
+ if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ continue;
+ }
+
+ const std::string function_name = std::string("kernel_gpu_") +
+ device_kernel_as_string((DeviceKernel)i);
+ cuda_device_assert(device,
+ cuModuleGetFunction(&kernel.function, cuModule, function_name.c_str()));
+
+ if (kernel.function) {
+ cuda_device_assert(device, cuFuncSetCacheConfig(kernel.function, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_device_assert(
+ device,
+ cuOccupancyMaxPotentialBlockSize(
+ &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, NULL, 0, 0));
+ }
+ else {
+ LOG(ERROR) << "Unable to load kernel " << function_name;
+ }
+ }
+
+ loaded = true;
+}
+
+const CUDADeviceKernel &CUDADeviceKernels::get(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel];
+}
+
+bool CUDADeviceKernels::available(DeviceKernel kernel) const
+{
+ return kernels_[(int)kernel].function != nullptr;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/kernel.h b/intern/cycles/device/cuda/kernel.h
new file mode 100644
index 00000000000..b489547a350
--- /dev/null
+++ b/intern/cycles/device/cuda/kernel.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# include "device/device_kernel.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+/* CUDA kernel and associated occupancy information. */
+class CUDADeviceKernel {
+ public:
+ CUfunction function = nullptr;
+
+ int num_threads_per_block = 0;
+ int min_blocks = 0;
+};
+
+/* Cache of CUDA kernels for each DeviceKernel. */
+class CUDADeviceKernels {
+ public:
+ void load(CUDADevice *device);
+ const CUDADeviceKernel &get(DeviceKernel kernel) const;
+ bool available(DeviceKernel kernel) const;
+
+ protected:
+ CUDADeviceKernel kernels_[DEVICE_KERNEL_NUM];
+ bool loaded = false;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
new file mode 100644
index 00000000000..b7f86c10553
--- /dev/null
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/queue.h"
+
+# include "device/cuda/device_impl.h"
+# include "device/cuda/graphics_interop.h"
+# include "device/cuda/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* CUDADeviceQueue */
+
+CUDADeviceQueue::CUDADeviceQueue(CUDADevice *device)
+ : DeviceQueue(device), cuda_device_(device), cuda_stream_(nullptr)
+{
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(cuda_device_, cuStreamCreate(&cuda_stream_, CU_STREAM_NON_BLOCKING));
+}
+
+CUDADeviceQueue::~CUDADeviceQueue()
+{
+ const CUDAContextScope scope(cuda_device_);
+ cuStreamDestroy(cuda_stream_);
+}
+
+int CUDADeviceQueue::num_concurrent_states(const size_t state_size) const
+{
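+ /* Heuristic: allow 16 path states per hardware thread, but no fewer than 2^20. */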
+ int num_states = max(cuda_device_->get_num_multiprocessors() *
+ cuda_device_->get_max_num_threads_per_multiprocessor() * 16,
+ 1048576);
+
+ const char *factor_str = getenv("CYCLES_CONCURRENT_STATES_FACTOR");
+ if (factor_str) {
+ num_states = max((int)(num_states * atof(factor_str)), 1024);
+ }
+
+ VLOG(3) << "GPU queue concurrent states: " << num_states << ", using up to "
+ << string_human_readable_size(num_states * state_size);
+
+ return num_states;
+}
+
+int CUDADeviceQueue::num_concurrent_busy_states() const
+{
+ const int max_num_threads = cuda_device_->get_num_multiprocessors() *
+ cuda_device_->get_max_num_threads_per_multiprocessor();
+
+ if (max_num_threads == 0) {
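+ /* The device attribute query failed; fall back to a conservative default. */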
+ return 65536;
+ }
+
+ return 4 * max_num_threads;
+}
+
+void CUDADeviceQueue::init_execution()
+{
+ /* Synchronize all textures and memory copies before executing the task. */
+ CUDAContextScope scope(cuda_device_);
+ cuda_device_->load_texture_info();
+ cuda_device_assert(cuda_device_, cuCtxSynchronize());
+
+ debug_init_execution();
+}
+
+bool CUDADeviceQueue::kernel_available(DeviceKernel kernel) const
+{
+ return cuda_device_->kernels.available(kernel);
+}
+
+bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ debug_enqueue(kernel, work_size);
+
+ const CUDAContextScope scope(cuda_device_);
+ const CUDADeviceKernel &cuda_kernel = cuda_device_->kernels.get(kernel);
+
+ /* Compute kernel launch parameters. */
+ const int num_threads_per_block = cuda_kernel.num_threads_per_block;
+ const int num_blocks = divide_up(work_size, num_threads_per_block);
+
+ int shared_mem_bytes = 0;
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+ /* See parallel_active_index.h for why this amount of shared memory is needed. */
+ shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
+ break;
+
+ default:
+ break;
+ }
+
+ /* Launch kernel. */
+ cuda_device_assert(cuda_device_,
+ cuLaunchKernel(cuda_kernel.function,
+ num_blocks,
+ 1,
+ 1,
+ num_threads_per_block,
+ 1,
+ 1,
+ shared_mem_bytes,
+ cuda_stream_,
+ args,
+ 0));
+
+ return !(cuda_device_->have_error());
+}
+
+bool CUDADeviceQueue::synchronize()
+{
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+ debug_synchronize();
+
+ return !(cuda_device_->have_error());
+}
+
+void CUDADeviceQueue::zero_to_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ cuda_device_->mem_alloc(mem);
+ }
+
+ /* Zero memory on device. */
+ assert(mem.device_pointer != 0);
+
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_to_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ /* Allocate on demand. */
+ if (mem.device_pointer == 0) {
+ cuda_device_->mem_alloc(mem);
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ /* Copy memory to device. */
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(
+ (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
+}
+
+void CUDADeviceQueue::copy_from_device(device_memory &mem)
+{
+ assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+ if (mem.memory_size() == 0) {
+ return;
+ }
+
+ assert(mem.device_pointer != 0);
+ assert(mem.host_pointer != nullptr);
+
+ /* Copy memory from device. */
+ const CUDAContextScope scope(cuda_device_);
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyDtoHAsync(
+ mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
+}
+
+unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
+{
+ return make_unique<CUDADeviceGraphicsInterop>(this);
+}
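+
+/* A minimal usage sketch (illustrative; the kernel choice and argument list are
+ * hypothetical, and `device` is assumed to be a CUDADevice*):
+ *
+ *   CUDADeviceQueue queue(device);
+ *   queue.init_execution();
+ *   int work_size = 65536;
+ *   void *args[] = {&d_path_states, &work_size};
+ *   queue.enqueue(DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA, work_size, args);
+ *   queue.synchronize();
+ */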
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
new file mode 100644
index 00000000000..62e3aa3d6c2
--- /dev/null
+++ b/intern/cycles/device/cuda/queue.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# include "device/device_kernel.h"
+# include "device/device_memory.h"
+# include "device/device_queue.h"
+
+# include "device/cuda/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+class device_memory;
+
+/* CUDA queue, executing kernels on a dedicated CUDA stream. */
+class CUDADeviceQueue : public DeviceQueue {
+ public:
+ CUDADeviceQueue(CUDADevice *device);
+ ~CUDADeviceQueue();
+
+ virtual int num_concurrent_states(const size_t state_size) const override;
+ virtual int num_concurrent_busy_states() const override;
+
+ virtual void init_execution() override;
+
+ virtual bool kernel_available(DeviceKernel kernel) const override;
+
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+
+ virtual bool synchronize() override;
+
+ virtual void zero_to_device(device_memory &mem) override;
+ virtual void copy_to_device(device_memory &mem) override;
+ virtual void copy_from_device(device_memory &mem) override;
+
+ virtual CUstream stream()
+ {
+ return cuda_stream_;
+ }
+
+ virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
+
+ protected:
+ CUDADevice *cuda_device_;
+ CUstream cuda_stream_;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/util.cpp b/intern/cycles/device/cuda/util.cpp
new file mode 100644
index 00000000000..8f657cc10fe
--- /dev/null
+++ b/intern/cycles/device/cuda/util.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_CUDA
+
+# include "device/cuda/util.h"
+# include "device/cuda/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
+{
+ cuda_device_assert(device, cuCtxPushCurrent(device->cuContext));
+}
+
+CUDAContextScope::~CUDAContextScope()
+{
+ cuda_device_assert(device, cuCtxPopCurrent(NULL));
+}
+
+# ifndef WITH_CUDA_DYNLOAD
+const char *cuewErrorString(CUresult result)
+{
+ /* We can only give the error code here without major code duplication; that
+ * should be enough, since dynamic loading is only disabled by folks who
+ * know what they're doing anyway.
+ *
+ * NOTE: Avoid calling this from several threads.
+ */
+ static string error;
+ error = string_printf("%d", result);
+ return error.c_str();
+}
+
+const char *cuewCompilerPath()
+{
+ return CYCLES_CUDA_NVCC_EXECUTABLE;
+}
+
+int cuewCompilerVersion()
+{
+ return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
+}
+# endif
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/cuda/util.h b/intern/cycles/device/cuda/util.h
new file mode 100644
index 00000000000..a0898094c08
--- /dev/null
+++ b/intern/cycles/device/cuda/util.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_CUDA
+
+# ifdef WITH_CUDA_DYNLOAD
+# include "cuew.h"
+# else
+# include <cuda.h>
+# endif
+
+CCL_NAMESPACE_BEGIN
+
+class CUDADevice;
+
+/* Utility to push/pop CUDA context. */
+class CUDAContextScope {
+ public:
+ CUDAContextScope(CUDADevice *device);
+ ~CUDAContextScope();
+
+ private:
+ CUDADevice *device;
+};
+
+/* Utility for checking return values of CUDA function calls. */
+# define cuda_device_assert(cuda_device, stmt) \
+ { \
+ CUresult result = stmt; \
+ if (result != CUDA_SUCCESS) { \
+ const char *name = cuewErrorString(result); \
+ cuda_device->set_error( \
+ string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
+ } \
+ } \
+ (void)0
+
+# define cuda_assert(stmt) cuda_device_assert(this, stmt)
+
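+/* A minimal usage sketch from inside a CUDADevice method (illustrative; `ptr`
+ * and `size` are hypothetical locals):
+ *
+ *   CUdeviceptr ptr;
+ *   cuda_assert(cuMemAlloc(&ptr, size));
+ *
+ * On failure this records a formatted error on the device through set_error()
+ * instead of aborting. */
+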
+# ifndef WITH_CUDA_DYNLOAD
+/* Transparently implement some functions, so the majority of the file does not
+ * need to worry about the difference between dynamically loaded and statically
+ * linked CUDA at all. */
+const char *cuewErrorString(CUresult result);
+const char *cuewCompilerPath();
+int cuewCompilerVersion();
+# endif /* WITH_CUDA_DYNLOAD */
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_CUDA */
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index ed53fbb54ae..6ccedcf54ef 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -20,7 +20,13 @@
#include "bvh/bvh2.h"
#include "device/device.h"
-#include "device/device_intern.h"
+#include "device/device_queue.h"
+
+#include "device/cpu/device.h"
+#include "device/cuda/device.h"
+#include "device/dummy/device.h"
+#include "device/multi/device.h"
+#include "device/optix/device.h"
#include "util/util_foreach.h"
#include "util/util_half.h"
@@ -38,332 +44,15 @@ CCL_NAMESPACE_BEGIN
bool Device::need_types_update = true;
bool Device::need_devices_update = true;
thread_mutex Device::device_mutex;
-vector<DeviceInfo> Device::opencl_devices;
vector<DeviceInfo> Device::cuda_devices;
vector<DeviceInfo> Device::optix_devices;
vector<DeviceInfo> Device::cpu_devices;
-vector<DeviceInfo> Device::network_devices;
uint Device::devices_initialized_mask = 0;
-/* Device Requested Features */
-
-std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features)
-{
- os << "Experimental features: " << (requested_features.experimental ? "On" : "Off") << std::endl;
- os << "Max nodes group: " << requested_features.max_nodes_group << std::endl;
- /* TODO(sergey): Decode bitflag into list of names. */
- os << "Nodes features: " << requested_features.nodes_features << std::endl;
- os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl;
- os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion)
- << std::endl;
- os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion)
- << std::endl;
- os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl;
- os << "Use Subsurface: " << string_from_bool(requested_features.use_subsurface) << std::endl;
- os << "Use Volume: " << string_from_bool(requested_features.use_volume) << std::endl;
- os << "Use Branched Integrator: " << string_from_bool(requested_features.use_integrator_branched)
- << std::endl;
- os << "Use Patch Evaluation: " << string_from_bool(requested_features.use_patch_evaluation)
- << std::endl;
- os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent)
- << std::endl;
- os << "Use Principled BSDF: " << string_from_bool(requested_features.use_principled)
- << std::endl;
- os << "Use Denoising: " << string_from_bool(requested_features.use_denoising) << std::endl;
- os << "Use Displacement: " << string_from_bool(requested_features.use_true_displacement)
- << std::endl;
- os << "Use Background Light: " << string_from_bool(requested_features.use_background_light)
- << std::endl;
- return os;
-}
-
/* Device */
Device::~Device() noexcept(false)
{
- if (!background) {
- if (vertex_buffer != 0) {
- glDeleteBuffers(1, &vertex_buffer);
- }
- if (fallback_shader_program != 0) {
- glDeleteProgram(fallback_shader_program);
- }
- }
-}
-
-/* TODO move shaders to standalone .glsl file. */
-const char *FALLBACK_VERTEX_SHADER =
- "#version 330\n"
- "uniform vec2 fullscreen;\n"
- "in vec2 texCoord;\n"
- "in vec2 pos;\n"
- "out vec2 texCoord_interp;\n"
- "\n"
- "vec2 normalize_coordinates()\n"
- "{\n"
- " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n"
- "}\n"
- "\n"
- "void main()\n"
- "{\n"
- " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n"
- " texCoord_interp = texCoord;\n"
- "}\n\0";
-
-const char *FALLBACK_FRAGMENT_SHADER =
- "#version 330\n"
- "uniform sampler2D image_texture;\n"
- "in vec2 texCoord_interp;\n"
- "out vec4 fragColor;\n"
- "\n"
- "void main()\n"
- "{\n"
- " fragColor = texture(image_texture, texCoord_interp);\n"
- "}\n\0";
-
-static void shader_print_errors(const char *task, const char *log, const char *code)
-{
- LOG(ERROR) << "Shader: " << task << " error:";
- LOG(ERROR) << "===== shader string ====";
-
- stringstream stream(code);
- string partial;
-
- int line = 1;
- while (getline(stream, partial, '\n')) {
- if (line < 10) {
- LOG(ERROR) << " " << line << " " << partial;
- }
- else {
- LOG(ERROR) << line << " " << partial;
- }
- line++;
- }
- LOG(ERROR) << log;
-}
-
-static int bind_fallback_shader(void)
-{
- GLint status;
- GLchar log[5000];
- GLsizei length = 0;
- GLuint program = 0;
-
- struct Shader {
- const char *source;
- GLenum type;
- } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER},
- {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}};
-
- program = glCreateProgram();
-
- for (int i = 0; i < 2; i++) {
- GLuint shader = glCreateShader(shaders[i].type);
-
- string source_str = shaders[i].source;
- const char *c_str = source_str.c_str();
-
- glShaderSource(shader, 1, &c_str, NULL);
- glCompileShader(shader);
-
- glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
-
- if (!status) {
- glGetShaderInfoLog(shader, sizeof(log), &length, log);
- shader_print_errors("compile", log, c_str);
- return 0;
- }
-
- glAttachShader(program, shader);
- }
-
- /* Link output. */
- glBindFragDataLocation(program, 0, "fragColor");
-
- /* Link and error check. */
- glLinkProgram(program);
-
- glGetProgramiv(program, GL_LINK_STATUS, &status);
- if (!status) {
- glGetShaderInfoLog(program, sizeof(log), &length, log);
- shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER);
- shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER);
- return 0;
- }
-
- return program;
-}
-
-bool Device::bind_fallback_display_space_shader(const float width, const float height)
-{
- if (fallback_status == FALLBACK_SHADER_STATUS_ERROR) {
- return false;
- }
-
- if (fallback_status == FALLBACK_SHADER_STATUS_NONE) {
- fallback_shader_program = bind_fallback_shader();
- fallback_status = FALLBACK_SHADER_STATUS_ERROR;
-
- if (fallback_shader_program == 0) {
- return false;
- }
-
- glUseProgram(fallback_shader_program);
- image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture");
- if (image_texture_location < 0) {
- LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform.";
- return false;
- }
-
- fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen");
- if (fullscreen_location < 0) {
- LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform.";
- return false;
- }
-
- fallback_status = FALLBACK_SHADER_STATUS_SUCCESS;
- }
-
- /* Run this every time. */
- glUseProgram(fallback_shader_program);
- glUniform1i(image_texture_location, 0);
- glUniform2f(fullscreen_location, width, height);
- return true;
-}
-
-void Device::draw_pixels(device_memory &rgba,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params)
-{
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
-
- assert(rgba.type == MEM_PIXELS);
- mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1));
-
- GLuint texid;
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &texid);
- glBindTexture(GL_TEXTURE_2D, texid);
-
- if (rgba.data_type == TYPE_HALF) {
- GLhalf *data_pointer = (GLhalf *)rgba.host_pointer;
- data_pointer += 4 * y * w;
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer);
- }
- else {
- uint8_t *data_pointer = (uint8_t *)rgba.host_pointer;
- data_pointer += 4 * y * w;
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer);
- }
-
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-
- if (transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if (use_fallback_shader) {
- if (!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if (!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered
- */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if (vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = 1.0f;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = 1.0f;
- vpointer[9] = 1.0f;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = 1.0f;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- if (vertex_buffer) {
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(
- texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute,
- 2,
- GL_FLOAT,
- GL_FALSE,
- 4 * sizeof(float),
- (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if (vertex_buffer) {
- glBindBuffer(GL_ARRAY_BUFFER, 0);
- }
-
- if (use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- glDeleteVertexArrays(1, &vertex_array_object);
- glBindTexture(GL_TEXTURE_2D, 0);
- glDeleteTextures(1, &texid);
-
- if (transparent) {
- glDisable(GL_BLEND);
- }
}
void Device::build_bvh(BVH *bvh, Progress &progress, bool refit)
@@ -379,14 +68,14 @@ void Device::build_bvh(BVH *bvh, Progress &progress, bool refit)
}
}
-Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
#ifdef WITH_MULTI
if (!info.multi_devices.empty()) {
/* Always create a multi device when info contains multiple devices.
* This is done so that the type can still be e.g. DEVICE_CPU to indicate
* that it is a homogeneous collection of devices, which simplifies checks. */
- return device_multi_create(info, stats, profiler, background);
+ return device_multi_create(info, stats, profiler);
}
#endif
@@ -394,29 +83,18 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
switch (info.type) {
case DEVICE_CPU:
- device = device_cpu_create(info, stats, profiler, background);
+ device = device_cpu_create(info, stats, profiler);
break;
#ifdef WITH_CUDA
case DEVICE_CUDA:
if (device_cuda_init())
- device = device_cuda_create(info, stats, profiler, background);
+ device = device_cuda_create(info, stats, profiler);
break;
#endif
#ifdef WITH_OPTIX
case DEVICE_OPTIX:
if (device_optix_init())
- device = device_optix_create(info, stats, profiler, background);
- break;
-#endif
-#ifdef WITH_NETWORK
- case DEVICE_NETWORK:
- device = device_network_create(info, stats, profiler, "127.0.0.1");
- break;
-#endif
-#ifdef WITH_OPENCL
- case DEVICE_OPENCL:
- if (device_opencl_init())
- device = device_opencl_create(info, stats, profiler, background);
+ device = device_optix_create(info, stats, profiler);
break;
#endif
default:
@@ -424,7 +102,7 @@ Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
}
if (device == NULL) {
- device = device_dummy_create(info, stats, profiler, background);
+ device = device_dummy_create(info, stats, profiler);
}
return device;
@@ -438,10 +116,6 @@ DeviceType Device::type_from_string(const char *name)
return DEVICE_CUDA;
else if (strcmp(name, "OPTIX") == 0)
return DEVICE_OPTIX;
- else if (strcmp(name, "OPENCL") == 0)
- return DEVICE_OPENCL;
- else if (strcmp(name, "NETWORK") == 0)
- return DEVICE_NETWORK;
else if (strcmp(name, "MULTI") == 0)
return DEVICE_MULTI;
@@ -456,10 +130,6 @@ string Device::string_from_type(DeviceType type)
return "CUDA";
else if (type == DEVICE_OPTIX)
return "OPTIX";
- else if (type == DEVICE_OPENCL)
- return "OPENCL";
- else if (type == DEVICE_NETWORK)
- return "NETWORK";
else if (type == DEVICE_MULTI)
return "MULTI";
@@ -476,12 +146,6 @@ vector<DeviceType> Device::available_types()
#ifdef WITH_OPTIX
types.push_back(DEVICE_OPTIX);
#endif
-#ifdef WITH_OPENCL
- types.push_back(DEVICE_OPENCL);
-#endif
-#ifdef WITH_NETWORK
- types.push_back(DEVICE_NETWORK);
-#endif
return types;
}
@@ -493,20 +157,6 @@ vector<DeviceInfo> Device::available_devices(uint mask)
thread_scoped_lock lock(device_mutex);
vector<DeviceInfo> devices;
-#ifdef WITH_OPENCL
- if (mask & DEVICE_MASK_OPENCL) {
- if (!(devices_initialized_mask & DEVICE_MASK_OPENCL)) {
- if (device_opencl_init()) {
- device_opencl_info(opencl_devices);
- }
- devices_initialized_mask |= DEVICE_MASK_OPENCL;
- }
- foreach (DeviceInfo &info, opencl_devices) {
- devices.push_back(info);
- }
- }
-#endif
-
#if defined(WITH_CUDA) || defined(WITH_OPTIX)
if (mask & (DEVICE_MASK_CUDA | DEVICE_MASK_OPTIX)) {
if (!(devices_initialized_mask & DEVICE_MASK_CUDA)) {
@@ -547,18 +197,6 @@ vector<DeviceInfo> Device::available_devices(uint mask)
}
}
-#ifdef WITH_NETWORK
- if (mask & DEVICE_MASK_NETWORK) {
- if (!(devices_initialized_mask & DEVICE_MASK_NETWORK)) {
- device_network_info(network_devices);
- devices_initialized_mask |= DEVICE_MASK_NETWORK;
- }
- foreach (DeviceInfo &info, network_devices) {
- devices.push_back(info);
- }
- }
-#endif
-
return devices;
}
@@ -580,15 +218,6 @@ string Device::device_capabilities(uint mask)
capabilities += device_cpu_capabilities() + "\n";
}
-#ifdef WITH_OPENCL
- if (mask & DEVICE_MASK_OPENCL) {
- if (device_opencl_init()) {
- capabilities += "\nOpenCL device capabilities:\n";
- capabilities += device_opencl_capabilities();
- }
- }
-#endif
-
#ifdef WITH_CUDA
if (mask & DEVICE_MASK_CUDA) {
if (device_cuda_init()) {
@@ -613,16 +242,13 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
}
DeviceInfo info;
- info.type = subdevices.front().type;
+ info.type = DEVICE_NONE;
info.id = "MULTI";
info.description = "Multi Device";
info.num = 0;
info.has_half_images = true;
info.has_nanovdb = true;
- info.has_volume_decoupled = true;
- info.has_branched_path = true;
- info.has_adaptive_stop_per_sample = true;
info.has_osl = true;
info.has_profiling = true;
info.has_peer_memory = false;
@@ -660,16 +286,16 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.id += device.id;
/* Set device type to MULTI if subdevices are not of a common type. */
- if (device.type != info.type) {
+ if (info.type == DEVICE_NONE) {
+ info.type = device.type;
+ }
+ else if (device.type != info.type) {
info.type = DEVICE_MULTI;
}
/* Accumulate device info. */
info.has_half_images &= device.has_half_images;
info.has_nanovdb &= device.has_nanovdb;
- info.has_volume_decoupled &= device.has_volume_decoupled;
- info.has_branched_path &= device.has_branched_path;
- info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample;
info.has_osl &= device.has_osl;
info.has_profiling &= device.has_profiling;
info.has_peer_memory |= device.has_peer_memory;
@@ -689,60 +315,32 @@ void Device::free_memory()
devices_initialized_mask = 0;
cuda_devices.free_memory();
optix_devices.free_memory();
- opencl_devices.free_memory();
cpu_devices.free_memory();
- network_devices.free_memory();
}
-/* DeviceInfo */
-
-void DeviceInfo::add_denoising_devices(DenoiserType denoiser_type)
+unique_ptr<DeviceQueue> Device::gpu_queue_create()
{
- assert(denoising_devices.empty());
-
- if (denoiser_type == DENOISER_OPTIX && type != DEVICE_OPTIX) {
- vector<DeviceInfo> optix_devices = Device::available_devices(DEVICE_MASK_OPTIX);
- if (!optix_devices.empty()) {
- /* Convert to a special multi device with separate denoising devices. */
- if (multi_devices.empty()) {
- multi_devices.push_back(*this);
- }
-
- /* Try to use the same physical devices for denoising. */
- for (const DeviceInfo &cuda_device : multi_devices) {
- if (cuda_device.type == DEVICE_CUDA) {
- for (const DeviceInfo &optix_device : optix_devices) {
- if (cuda_device.num == optix_device.num) {
- id += optix_device.id;
- denoising_devices.push_back(optix_device);
- break;
- }
- }
- }
- }
-
- if (denoising_devices.empty()) {
- /* Simply use the first available OptiX device. */
- const DeviceInfo optix_device = optix_devices.front();
- id += optix_device.id; /* Uniquely identify this special multi device. */
- denoising_devices.push_back(optix_device);
- }
+ LOG(FATAL) << "Device does not support queues.";
+ return nullptr;
+}
- denoisers = denoiser_type;
- }
- }
- else if (denoiser_type == DENOISER_OPENIMAGEDENOISE && type != DEVICE_CPU) {
- /* Convert to a special multi device with separate denoising devices. */
- if (multi_devices.empty()) {
- multi_devices.push_back(*this);
- }
+const CPUKernels *Device::get_cpu_kernels() const
+{
+ LOG(FATAL) << "Device does not support CPU kernels.";
+ return nullptr;
+}
- /* Add CPU denoising devices. */
- DeviceInfo cpu_device = Device::available_devices(DEVICE_MASK_CPU).front();
- denoising_devices.push_back(cpu_device);
+void Device::get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/)
+{
+ LOG(FATAL) << "Device does not support CPU kernels.";
+}
- denoisers = denoiser_type;
- }
+void *Device::get_cpu_osl_memory()
+{
+ return nullptr;
}
+/* DeviceInfo */
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index ecf79bcdfa6..399d5eb91df 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -21,31 +21,34 @@
#include "bvh/bvh_params.h"
+#include "device/device_denoise.h"
#include "device/device_memory.h"
-#include "device/device_task.h"
+#include "util/util_function.h"
#include "util/util_list.h"
+#include "util/util_logging.h"
#include "util/util_stats.h"
#include "util/util_string.h"
#include "util/util_texture.h"
#include "util/util_thread.h"
#include "util/util_types.h"
+#include "util/util_unique_ptr.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
class BVH;
+class DeviceQueue;
class Progress;
-class RenderTile;
+class CPUKernels;
+class CPUKernelThreadGlobals;
/* Device Types */
enum DeviceType {
DEVICE_NONE = 0,
DEVICE_CPU,
- DEVICE_OPENCL,
DEVICE_CUDA,
- DEVICE_NETWORK,
DEVICE_MULTI,
DEVICE_OPTIX,
DEVICE_DUMMY,
@@ -53,20 +56,11 @@ enum DeviceType {
enum DeviceTypeMask {
DEVICE_MASK_CPU = (1 << DEVICE_CPU),
- DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL),
DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
- DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK),
DEVICE_MASK_ALL = ~0
};
-enum DeviceKernelStatus {
- DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE,
- DEVICE_KERNEL_USING_FEATURE_KERNEL,
- DEVICE_KERNEL_FEATURE_KERNEL_INVALID,
- DEVICE_KERNEL_UNKNOWN,
-};
-
#define DEVICE_MASK(type) (DeviceTypeMask)(1 << type)
class DeviceInfo {
@@ -75,20 +69,16 @@ class DeviceInfo {
string description;
string id; /* used for user preferences, should stay fixed with changing hardware config */
int num;
- bool display_device; /* GPU is used as a display device. */
- bool has_half_images; /* Support half-float textures. */
- bool has_nanovdb; /* Support NanoVDB volumes. */
- bool has_volume_decoupled; /* Decoupled volume shading. */
- bool has_branched_path; /* Supports branched path tracing. */
- bool has_adaptive_stop_per_sample; /* Per-sample adaptive sampling stopping. */
- bool has_osl; /* Support Open Shading Language. */
- bool use_split_kernel; /* Use split or mega kernel. */
- bool has_profiling; /* Supports runtime collection of profiling info. */
- bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
- DenoiserTypeMask denoisers; /* Supported denoiser types. */
+ bool display_device; /* GPU is used as a display device. */
+ bool has_nanovdb; /* Support NanoVDB volumes. */
+ bool has_half_images; /* Support half-float textures. */
+ bool has_osl; /* Support Open Shading Language. */
+ bool has_profiling; /* Supports runtime collection of profiling info. */
+ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
+ bool has_gpu_queue; /* Device supports GPU queue. */
+ DenoiserTypeMask denoisers; /* Supported denoiser types. */
int cpu_threads;
vector<DeviceInfo> multi_devices;
- vector<DeviceInfo> denoising_devices;
string error_msg;
DeviceInfo()
@@ -100,227 +90,35 @@ class DeviceInfo {
display_device = false;
has_half_images = false;
has_nanovdb = false;
- has_volume_decoupled = false;
- has_branched_path = true;
- has_adaptive_stop_per_sample = false;
has_osl = false;
- use_split_kernel = false;
has_profiling = false;
has_peer_memory = false;
+ has_gpu_queue = false;
denoisers = DENOISER_NONE;
}
- bool operator==(const DeviceInfo &info)
+ bool operator==(const DeviceInfo &info) const
{
/* Multiple Devices with the same ID would be very bad. */
assert(id != info.id ||
(type == info.type && num == info.num && description == info.description));
return id == info.id;
}
-
- /* Add additional devices needed for the specified denoiser. */
- void add_denoising_devices(DenoiserType denoiser_type);
-};
-
-class DeviceRequestedFeatures {
- public:
- /* Use experimental feature set. */
- bool experimental;
-
- /* Selective nodes compilation. */
-
- /* Identifier of a node group up to which all the nodes needs to be
- * compiled in. Nodes from higher group indices will be ignores.
- */
- int max_nodes_group;
-
- /* Features bitfield indicating which features from the requested group
- * will be compiled in. Nodes which corresponds to features which are not
- * in this bitfield will be ignored even if they're in the requested group.
- */
- int nodes_features;
-
- /* BVH/sampling kernel features. */
- bool use_hair;
- bool use_hair_thick;
- bool use_object_motion;
- bool use_camera_motion;
-
- /* Denotes whether baking functionality is needed. */
- bool use_baking;
-
- /* Use subsurface scattering materials. */
- bool use_subsurface;
-
- /* Use volume materials. */
- bool use_volume;
-
- /* Use branched integrator. */
- bool use_integrator_branched;
-
- /* Use OpenSubdiv patch evaluation */
- bool use_patch_evaluation;
-
- /* Use Transparent shadows */
- bool use_transparent;
-
- /* Use various shadow tricks, such as shadow catcher. */
- bool use_shadow_tricks;
-
- /* Per-uber shader usage flags. */
- bool use_principled;
-
- /* Denoising features. */
- bool use_denoising;
-
- /* Use raytracing in shaders. */
- bool use_shader_raytrace;
-
- /* Use true displacement */
- bool use_true_displacement;
-
- /* Use background lights */
- bool use_background_light;
-
- DeviceRequestedFeatures()
- {
- /* TODO(sergey): Find more meaningful defaults. */
- max_nodes_group = 0;
- nodes_features = 0;
- use_hair = false;
- use_hair_thick = false;
- use_object_motion = false;
- use_camera_motion = false;
- use_baking = false;
- use_subsurface = false;
- use_volume = false;
- use_integrator_branched = false;
- use_patch_evaluation = false;
- use_transparent = false;
- use_shadow_tricks = false;
- use_principled = false;
- use_denoising = false;
- use_shader_raytrace = false;
- use_true_displacement = false;
- use_background_light = false;
- }
-
- bool modified(const DeviceRequestedFeatures &requested_features)
- {
- return !(max_nodes_group == requested_features.max_nodes_group &&
- nodes_features == requested_features.nodes_features &&
- use_hair == requested_features.use_hair &&
- use_hair_thick == requested_features.use_hair_thick &&
- use_object_motion == requested_features.use_object_motion &&
- use_camera_motion == requested_features.use_camera_motion &&
- use_baking == requested_features.use_baking &&
- use_subsurface == requested_features.use_subsurface &&
- use_volume == requested_features.use_volume &&
- use_integrator_branched == requested_features.use_integrator_branched &&
- use_patch_evaluation == requested_features.use_patch_evaluation &&
- use_transparent == requested_features.use_transparent &&
- use_shadow_tricks == requested_features.use_shadow_tricks &&
- use_principled == requested_features.use_principled &&
- use_denoising == requested_features.use_denoising &&
- use_shader_raytrace == requested_features.use_shader_raytrace &&
- use_true_displacement == requested_features.use_true_displacement &&
- use_background_light == requested_features.use_background_light);
- }
-
- /* Convert the requested features structure to a build options,
- * which could then be passed to compilers.
- */
- string get_build_options() const
- {
- string build_options = "";
- if (experimental) {
- build_options += "-D__KERNEL_EXPERIMENTAL__ ";
- }
- build_options += "-D__NODES_MAX_GROUP__=" + string_printf("%d", max_nodes_group);
- build_options += " -D__NODES_FEATURES__=" + string_printf("%d", nodes_features);
- if (!use_hair) {
- build_options += " -D__NO_HAIR__";
- }
- if (!use_object_motion) {
- build_options += " -D__NO_OBJECT_MOTION__";
- }
- if (!use_camera_motion) {
- build_options += " -D__NO_CAMERA_MOTION__";
- }
- if (!use_baking) {
- build_options += " -D__NO_BAKING__";
- }
- if (!use_volume) {
- build_options += " -D__NO_VOLUME__";
- }
- if (!use_subsurface) {
- build_options += " -D__NO_SUBSURFACE__";
- }
- if (!use_integrator_branched) {
- build_options += " -D__NO_BRANCHED_PATH__";
- }
- if (!use_patch_evaluation) {
- build_options += " -D__NO_PATCH_EVAL__";
- }
- if (!use_transparent && !use_volume) {
- build_options += " -D__NO_TRANSPARENT__";
- }
- if (!use_shadow_tricks) {
- build_options += " -D__NO_SHADOW_TRICKS__";
- }
- if (!use_principled) {
- build_options += " -D__NO_PRINCIPLED__";
- }
- if (!use_denoising) {
- build_options += " -D__NO_DENOISING__";
- }
- if (!use_shader_raytrace) {
- build_options += " -D__NO_SHADER_RAYTRACE__";
- }
- return build_options;
- }
};
-std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features);
-
/* Device */
-struct DeviceDrawParams {
- function<void()> bind_display_space_shader_cb;
- function<void()> unbind_display_space_shader_cb;
-};
-
class Device {
friend class device_sub_ptr;
protected:
- enum {
- FALLBACK_SHADER_STATUS_NONE = 0,
- FALLBACK_SHADER_STATUS_ERROR,
- FALLBACK_SHADER_STATUS_SUCCESS,
- };
-
- Device(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background)
- : background(background),
- vertex_buffer(0),
- fallback_status(FALLBACK_SHADER_STATUS_NONE),
- fallback_shader_program(0),
- info(info_),
- stats(stats_),
- profiler(profiler_)
+ Device(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : info(info_), stats(stats_), profiler(profiler_)
{
}
- bool background;
string error_msg;
- /* used for real time display */
- unsigned int vertex_buffer;
- int fallback_status, fallback_shader_program;
- int image_texture_location, fullscreen_location;
-
- bool bind_fallback_display_space_shader(const float width, const float height);
-
virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/)
{
/* Only required for devices that implement denoising. */
@@ -361,67 +159,31 @@ class Device {
Stats &stats;
Profiler &profiler;
- /* memory alignment */
- virtual int mem_sub_ptr_alignment()
- {
- return MIN_ALIGNMENT_CPU_DATA_TYPES;
- }
-
/* constant memory */
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
- /* open shading language, only for CPU device */
- virtual void *osl_memory()
- {
- return NULL;
- }
-
/* load/compile kernels, must be called before adding tasks */
- virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/)
+ virtual bool load_kernels(uint /*kernel_features*/)
{
return true;
}
- /* Wait for device to become available to upload data and receive tasks
- * This method is used by the OpenCL device to load the
- * optimized kernels or when not (yet) available load the
- * generic kernels (only during foreground rendering) */
- virtual bool wait_for_availability(const DeviceRequestedFeatures & /*requested_features*/)
- {
- return true;
- }
- /* Check if there are 'better' kernels available to be used
- * We can switch over to these kernels
- * This method is used to determine if we can switch the preview kernels
- * to regular kernels */
- virtual DeviceKernelStatus get_active_kernel_switch_state()
- {
- return DEVICE_KERNEL_USING_FEATURE_KERNEL;
- }
+ /* GPU device only functions.
+ * These may not be used on CPU or multi-devices. */
- /* tasks */
- virtual int get_split_task_count(DeviceTask &)
- {
- return 1;
- }
+ /* Create new queue for executing kernels in. */
+ virtual unique_ptr<DeviceQueue> gpu_queue_create();
+
+ /* CPU device only functions.
+ * These may not be used on GPU or multi-devices. */
- virtual void task_add(DeviceTask &task) = 0;
- virtual void task_wait() = 0;
- virtual void task_cancel() = 0;
-
- /* opengl drawing */
- virtual void draw_pixels(device_memory &mem,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params);
+ /* Get CPU kernel functions for native instruction set. */
+ virtual const CPUKernels *get_cpu_kernels() const;
+ /* Get kernel globals to pass to kernels. */
+ virtual void get_cpu_kernel_thread_globals(
+ vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
+ /* Get OpenShadingLanguage memory buffer. */
+ virtual void *get_cpu_osl_memory();
/* acceleration structure building */
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit);
@@ -429,25 +191,11 @@ class Device {
/* OptiX specific destructor. */
virtual void release_optix_bvh(BVH * /*bvh*/){};
-#ifdef WITH_NETWORK
- /* networking */
- void server_run();
-#endif
-
/* multi device */
- virtual void map_tile(Device * /*sub_device*/, RenderTile & /*tile*/)
- {
- }
virtual int device_number(Device * /*sub_device*/)
{
return 0;
}
- virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
- {
- }
- virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/)
- {
- }
virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
{
@@ -460,11 +208,47 @@ class Device {
return false;
}
+ /* Graphics resources interoperability.
+ *
+ * Interoperability here means that the device is capable of computing the result
+ * directly into an OpenGL (or other graphics library) buffer. */
+
+ /* Check whether the display is to be updated using graphics interoperability.
+ * Interoperability cannot be used if it is not supported by the device. The device
+ * might also force-disable it when it detects that it would be slower than
+ * copying pixels from the render buffer. */
+ virtual bool should_use_graphics_interop()
+ {
+ return false;
+ }
+
+ /* Buffer denoising. */
+
+ /* Returns true if task is fully handled. */
+ virtual bool denoise_buffer(const DeviceDenoiseTask & /*task*/)
+ {
+ LOG(ERROR) << "Request buffer denoising from a device which does not support it.";
+ return false;
+ }
+
+ virtual DeviceQueue *get_denoise_queue()
+ {
+ LOG(ERROR) << "Request denoising queue from a device which does not support it.";
+ return nullptr;
+ }
+
+ /* Sub-devices */
+
+ /* Run the given callback for every individual device that will be handling rendering.
+ * For a single device the callback is called for the device itself. For a multi-device
+ * the callback is called only for the sub-devices. */
+ virtual void foreach_device(const function<void(Device *)> &callback)
+ {
+ callback(this);
+ }
+
/* static */
- static Device *create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background = true);
+ static Device *create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
static DeviceType type_from_string(const char *name);
static string string_from_type(DeviceType type);
@@ -499,9 +283,7 @@ class Device {
static thread_mutex device_mutex;
static vector<DeviceInfo> cuda_devices;
static vector<DeviceInfo> optix_devices;
- static vector<DeviceInfo> opencl_devices;
static vector<DeviceInfo> cpu_devices;
- static vector<DeviceInfo> network_devices;
static uint devices_initialized_mask;
};
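Taken together, the reworked Device API above is driven roughly as follows. This is a minimal sketch, not code from this patch: it assumes Device::available_devices() from the unchanged portion of device.h, ignores error handling, and stands in for the real session setup.

/* Create a device, load kernels, then visit every rendering (sub-)device. */
#include "device/device.h"

static void example_device_setup(Stats &stats, Profiler &profiler)
{
  vector<DeviceInfo> devices = Device::available_devices(DEVICE_MASK_CPU | DEVICE_MASK_CUDA);
  if (devices.empty()) {
    return;
  }

  Device *device = Device::create(devices[0], stats, profiler);
  if (!device->load_kernels(0)) { /* 0 = no optional kernel features requested. */
    delete device;
    return;
  }

  device->foreach_device([](Device *sub_device) {
    /* For a multi-device this visits only the sub-devices. */
    if (sub_device->info.has_gpu_queue) {
      unique_ptr<DeviceQueue> queue = sub_device->gpu_queue_create();
      /* ... enqueue kernel work on the queue ... */
    }
  });

  delete device;
}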
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
deleted file mode 100644
index 4a6e77d6eaa..00000000000
--- a/intern/cycles/device/device_cpu.cpp
+++ /dev/null
@@ -1,1680 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-/* So ImathMath is included before our kernel_cpu_compat. */
-#ifdef WITH_OSL
-/* So no context pollution happens from indirectly included windows.h */
-# include "util/util_windows.h"
-# include <OSL/oslexec.h>
-#endif
-
-#ifdef WITH_EMBREE
-# include <embree3/rtcore.h>
-#endif
-
-#include "device/device.h"
-#include "device/device_denoising.h"
-#include "device/device_intern.h"
-#include "device/device_split_kernel.h"
-
-// clang-format off
-#include "kernel/kernel.h"
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_adaptive_sampling.h"
-
-#include "kernel/filter/filter.h"
-
-#include "kernel/osl/osl_shader.h"
-#include "kernel/osl/osl_globals.h"
-// clang-format on
-
-#include "bvh/bvh_embree.h"
-
-#include "render/buffers.h"
-#include "render/coverage.h"
-
-#include "util/util_debug.h"
-#include "util/util_foreach.h"
-#include "util/util_function.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
-#include "util/util_opengl.h"
-#include "util/util_openimagedenoise.h"
-#include "util/util_optimization.h"
-#include "util/util_progress.h"
-#include "util/util_system.h"
-#include "util/util_task.h"
-#include "util/util_thread.h"
-
-CCL_NAMESPACE_BEGIN
-
-class CPUDevice;
-
-/* Has to be outside of the class to be shared across template instantiations. */
-static const char *logged_architecture = "";
-
-template<typename F> class KernelFunctions {
- public:
- KernelFunctions()
- {
- kernel = (F)NULL;
- }
-
- KernelFunctions(
- F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2)
- {
- const char *architecture_name = "default";
- kernel = kernel_default;
-
- /* Silence potential warnings about unused variables
- * when compiling without some architectures. */
- (void)kernel_sse2;
- (void)kernel_sse3;
- (void)kernel_sse41;
- (void)kernel_avx;
- (void)kernel_avx2;
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
- architecture_name = "AVX2";
- kernel = kernel_avx2;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
- architecture_name = "AVX";
- kernel = kernel_avx;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
- architecture_name = "SSE4.1";
- kernel = kernel_sse41;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
- architecture_name = "SSE3";
- kernel = kernel_sse3;
- }
- else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
- architecture_name = "SSE2";
- kernel = kernel_sse2;
- }
-#else
- {
- /* Dummy to prevent the architecture if below become
- * conditional when WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- * is not defined. */
- }
-#endif
-
- if (strcmp(architecture_name, logged_architecture) != 0) {
- VLOG(1) << "Will be using " << architecture_name << " kernels.";
- logged_architecture = architecture_name;
- }
- }
-
- inline F operator()() const
- {
- assert(kernel);
- return kernel;
- }
-
- protected:
- F kernel;
-};
-
-class CPUSplitKernel : public DeviceSplitKernel {
- CPUDevice *device;
-
- public:
- explicit CPUSplitKernel(CPUDevice *device);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs);
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
-};
-
-class CPUDevice : public Device {
- public:
- TaskPool task_pool;
- KernelGlobals kernel_globals;
-
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
-
-#ifdef WITH_OSL
- OSLGlobals osl_globals;
-#endif
-#ifdef WITH_OPENIMAGEDENOISE
- oidn::DeviceRef oidn_device;
- oidn::FilterRef oidn_filter;
-#endif
- thread_spin_lock oidn_task_lock;
-#ifdef WITH_EMBREE
- RTCScene embree_scene = NULL;
- RTCDevice embree_device;
-#endif
-
- bool use_split_kernel;
-
- DeviceRequestedFeatures requested_features;
-
- KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
- convert_to_half_float_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
- convert_to_byte_kernel;
- KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>
- shader_kernel;
- KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> bake_kernel;
-
- KernelFunctions<void (*)(
- int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)>
- filter_divide_shadow_kernel;
- KernelFunctions<void (*)(
- int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)>
- filter_get_feature_kernel;
- KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)>
- filter_write_feature_kernel;
- KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
- filter_detect_outliers_kernel;
- KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
- filter_combine_halves_kernel;
-
- KernelFunctions<void (*)(
- int, int, float *, float *, float *, float *, int *, int, int, int, float, float)>
- filter_nlm_calc_difference_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel;
- KernelFunctions<void (*)(
- int, int, float *, float *, float *, float *, float *, int *, int, int, int)>
- filter_nlm_update_output_kernel;
- KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel;
-
- KernelFunctions<void (*)(
- float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)>
- filter_construct_transform_kernel;
- KernelFunctions<void (*)(int,
- int,
- int,
- float *,
- float *,
- float *,
- int *,
- float *,
- float3 *,
- int *,
- int *,
- int,
- int,
- int,
- int,
- bool)>
- filter_nlm_construct_gramian_kernel;
- KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)>
- filter_finalize_kernel;
-
- KernelFunctions<void (*)(KernelGlobals *,
- ccl_constant KernelData *,
- ccl_global void *,
- int,
- ccl_global char *,
- int,
- int,
- int,
- int,
- int,
- int,
- int,
- int,
- ccl_global int *,
- int,
- ccl_global char *,
- ccl_global unsigned int *,
- unsigned int,
- ccl_global float *)>
- data_init_kernel;
- unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels;
-
-#define KERNEL_FUNCTIONS(name) \
- KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
- KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
- KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
-
- CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : Device(info_, stats_, profiler_, background_),
- texture_info(this, "__texture_info", MEM_GLOBAL),
-#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name))
- REGISTER_KERNEL(path_trace),
- REGISTER_KERNEL(convert_to_half_float),
- REGISTER_KERNEL(convert_to_byte),
- REGISTER_KERNEL(shader),
- REGISTER_KERNEL(bake),
- REGISTER_KERNEL(filter_divide_shadow),
- REGISTER_KERNEL(filter_get_feature),
- REGISTER_KERNEL(filter_write_feature),
- REGISTER_KERNEL(filter_detect_outliers),
- REGISTER_KERNEL(filter_combine_halves),
- REGISTER_KERNEL(filter_nlm_calc_difference),
- REGISTER_KERNEL(filter_nlm_blur),
- REGISTER_KERNEL(filter_nlm_calc_weight),
- REGISTER_KERNEL(filter_nlm_update_output),
- REGISTER_KERNEL(filter_nlm_normalize),
- REGISTER_KERNEL(filter_construct_transform),
- REGISTER_KERNEL(filter_nlm_construct_gramian),
- REGISTER_KERNEL(filter_finalize),
- REGISTER_KERNEL(data_init)
-#undef REGISTER_KERNEL
- {
- if (info.cpu_threads == 0) {
- info.cpu_threads = TaskScheduler::num_threads();
- }
-
-#ifdef WITH_OSL
- kernel_globals.osl = &osl_globals;
-#endif
-#ifdef WITH_EMBREE
- embree_device = rtcNewDevice("verbose=0");
-#endif
- use_split_kernel = DebugFlags().cpu.split_kernel;
- if (use_split_kernel) {
- VLOG(1) << "Will be using split kernel.";
- }
- need_texture_info = false;
-
-#define REGISTER_SPLIT_KERNEL(name) \
- split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \
- KERNEL_FUNCTIONS(name))
- REGISTER_SPLIT_KERNEL(path_init);
- REGISTER_SPLIT_KERNEL(scene_intersect);
- REGISTER_SPLIT_KERNEL(lamp_emission);
- REGISTER_SPLIT_KERNEL(do_volume);
- REGISTER_SPLIT_KERNEL(queue_enqueue);
- REGISTER_SPLIT_KERNEL(indirect_background);
- REGISTER_SPLIT_KERNEL(shader_setup);
- REGISTER_SPLIT_KERNEL(shader_sort);
- REGISTER_SPLIT_KERNEL(shader_eval);
- REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
- REGISTER_SPLIT_KERNEL(subsurface_scatter);
- REGISTER_SPLIT_KERNEL(direct_lighting);
- REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
- REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
- REGISTER_SPLIT_KERNEL(enqueue_inactive);
- REGISTER_SPLIT_KERNEL(next_iteration_setup);
- REGISTER_SPLIT_KERNEL(indirect_subsurface);
- REGISTER_SPLIT_KERNEL(buffer_update);
- REGISTER_SPLIT_KERNEL(adaptive_stopping);
- REGISTER_SPLIT_KERNEL(adaptive_filter_x);
- REGISTER_SPLIT_KERNEL(adaptive_filter_y);
- REGISTER_SPLIT_KERNEL(adaptive_adjust_samples);
-#undef REGISTER_SPLIT_KERNEL
-#undef KERNEL_FUNCTIONS
- }
-
- ~CPUDevice()
- {
-#ifdef WITH_EMBREE
- rtcReleaseDevice(embree_device);
-#endif
- task_pool.cancel();
- texture_info.free();
- }
-
- virtual bool show_samples() const override
- {
- return (info.cpu_threads == 1);
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override
- {
- BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
-#ifdef WITH_EMBREE
- bvh_layout_mask |= BVH_LAYOUT_EMBREE;
-#endif /* WITH_EMBREE */
- return bvh_layout_mask;
- }
-
- void load_texture_info()
- {
- if (need_texture_info) {
- texture_info.copy_to_device();
- need_texture_info = false;
- }
- }
-
- virtual void mem_alloc(device_memory &mem) override
- {
- if (mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else if (mem.type == MEM_GLOBAL) {
- assert(!"mem_alloc not supported for global memory.");
- }
- else {
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
- size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
- void *data = util_aligned_malloc(mem.memory_size(), alignment);
- mem.device_pointer = (device_ptr)data;
- }
- else {
- mem.device_pointer = (device_ptr)mem.host_pointer;
- }
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
- }
- }
-
- virtual void mem_copy_to(device_memory &mem) override
- {
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else if (mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- /* copy is no-op */
- }
- }
-
- virtual void mem_copy_from(
- device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) override
- {
- /* no-op */
- }
-
- virtual void mem_zero(device_memory &mem) override
- {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if (mem.device_pointer) {
- memset((void *)mem.device_pointer, 0, mem.memory_size());
- }
- }
-
- virtual void mem_free(device_memory &mem) override
- {
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else if (mem.device_pointer) {
- if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
- util_aligned_free((void *)mem.device_pointer);
- }
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override
- {
- return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
- }
-
- virtual void const_copy_to(const char *name, void *host, size_t size) override
- {
-#if WITH_EMBREE
- if (strcmp(name, "__data") == 0) {
- assert(size <= sizeof(KernelData));
-
- // Update scene handle (since it is different for each device on multi devices)
- KernelData *const data = (KernelData *)host;
- data->bvh.scene = embree_scene;
- }
-#endif
- kernel_const_copy(&kernel_globals, name, host, size);
- }
-
- void global_alloc(device_memory &mem)
- {
- VLOG(1) << "Global memory allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- kernel_global_memory_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
-
- mem.device_pointer = (device_ptr)mem.host_pointer;
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
- }
-
- void global_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- void tex_alloc(device_texture &mem)
- {
- VLOG(1) << "Texture allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- mem.device_pointer = (device_ptr)mem.host_pointer;
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
-
- const uint slot = mem.slot;
- if (slot >= texture_info.size()) {
- /* Allocate some slots in advance, to reduce amount of re-allocations. */
- texture_info.resize(slot + 128);
- }
-
- texture_info[slot] = mem.info;
- texture_info[slot].data = (uint64_t)mem.host_pointer;
- need_texture_info = true;
- }
-
- void tex_free(device_texture &mem)
- {
- if (mem.device_pointer) {
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- need_texture_info = true;
- }
- }
-
- virtual void *osl_memory() override
- {
-#ifdef WITH_OSL
- return &osl_globals;
-#else
- return NULL;
-#endif
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
-#ifdef WITH_EMBREE
- if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
- bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE) {
- BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
- if (refit) {
- bvh_embree->refit(progress);
- }
- else {
- bvh_embree->build(progress, &stats, embree_device);
- }
-
- if (bvh->params.top_level) {
- embree_scene = bvh_embree->scene;
- }
- }
- else
-#endif
- Device::build_bvh(bvh, progress, refit);
- }
-
- void thread_run(DeviceTask &task)
- {
- if (task.type == DeviceTask::RENDER)
- thread_render(task);
- else if (task.type == DeviceTask::SHADER)
- thread_shader(task);
- else if (task.type == DeviceTask::FILM_CONVERT)
- thread_film_convert(task);
- else if (task.type == DeviceTask::DENOISE_BUFFER)
- thread_denoise(task);
- }
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS);
-
- int4 rect = task->rect;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int w = align_up(rect.z - rect.x, 4);
- int h = rect.w - rect.y;
- int stride = task->buffer.stride;
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
-
- float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
- float *blurDifference = temporary_mem;
- float *difference = temporary_mem + task->buffer.pass_stride;
- float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride;
-
- memset(weightAccum, 0, sizeof(float) * w * h);
- memset((float *)out_ptr, 0, sizeof(float) * w * h);
-
- for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
- int dy = i / (2 * r + 1) - r;
- int dx = i % (2 * r + 1) - r;
-
- int local_rect[4] = {
- max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)};
- filter_nlm_calc_difference_kernel()(dx,
- dy,
- (float *)guide_ptr,
- (float *)variance_ptr,
- NULL,
- difference,
- local_rect,
- w,
- channel_offset,
- 0,
- a,
- k_2);
-
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
- filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
-
- filter_nlm_update_output_kernel()(dx,
- dy,
- blurDifference,
- (float *)image_ptr,
- difference,
- (float *)out_ptr,
- weightAccum,
- local_rect,
- channel_offset,
- stride,
- f);
- }
-
- int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y};
- filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w);
-
- return true;
- }
-
- bool denoising_construct_transform(DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM);
-
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer,
- task->tile_info,
- x + task->filter_area.x,
- y + task->filter_area.y,
- y * task->filter_area.z + x,
- (float *)task->storage.transform.device_pointer,
- (int *)task->storage.rank.device_pointer,
- &task->rect.x,
- task->buffer.pass_stride,
- task->buffer.frame_stride,
- task->buffer.use_time,
- task->radius,
- task->pca_threshold);
- }
- }
- return true;
- }
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
-
- float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
- float *difference = temporary_mem;
- float *blurDifference = temporary_mem + task->buffer.pass_stride;
-
- int r = task->radius;
- int frame_offset = frame * task->buffer.frame_stride;
- for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
- int dy = i / (2 * r + 1) - r;
- int dx = i % (2 * r + 1) - r;
-
- int local_rect[4] = {max(0, -dx),
- max(0, -dy),
- task->reconstruction_state.source_w - max(0, dx),
- task->reconstruction_state.source_h - max(0, dy)};
- filter_nlm_calc_difference_kernel()(dx,
- dy,
- (float *)color_ptr,
- (float *)color_variance_ptr,
- (float *)scale_ptr,
- difference,
- local_rect,
- task->buffer.stride,
- task->buffer.pass_stride,
- frame_offset,
- 1.0f,
- task->nlm_k_2);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
- filter_nlm_calc_weight_kernel()(
- blurDifference, difference, local_rect, task->buffer.stride, 4);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
- filter_nlm_construct_gramian_kernel()(dx,
- dy,
- task->tile_info->frames[frame],
- blurDifference,
- (float *)task->buffer.mem.device_pointer,
- (float *)task->storage.transform.device_pointer,
- (int *)task->storage.rank.device_pointer,
- (float *)task->storage.XtWX.device_pointer,
- (float3 *)task->storage.XtWY.device_pointer,
- local_rect,
- &task->reconstruction_state.filter_window.x,
- task->buffer.stride,
- 4,
- task->buffer.pass_stride,
- frame_offset,
- task->buffer.use_time);
- }
-
- return true;
- }
-
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task)
- {
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_finalize_kernel()(x,
- y,
- y * task->filter_area.z + x,
- (float *)output_ptr,
- (int *)task->storage.rank.device_pointer,
- (float *)task->storage.XtWX.device_pointer,
- (float3 *)task->storage.XtWY.device_pointer,
- &task->reconstruction_state.buffer_params.x,
- task->render_buffer.samples);
- }
- }
- return true;
- }
-
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES);
-
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = rect.x; x < rect.z; x++) {
- filter_combine_halves_kernel()(x,
- y,
- (float *)mean_ptr,
- (float *)variance_ptr,
- (float *)a_ptr,
- (float *)b_ptr,
- &rect.x,
- r);
- }
- }
- return true;
- }
-
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_divide_shadow_kernel()(task->render_buffer.samples,
- task->tile_info,
- x,
- y,
- (float *)a_ptr,
- (float *)b_ptr,
- (float *)sample_variance_ptr,
- (float *)sv_variance_ptr,
- (float *)buffer_variance_ptr,
- &task->rect.x,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- }
- }
- return true;
- }
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_get_feature_kernel()(task->render_buffer.samples,
- task->tile_info,
- mean_offset,
- variance_offset,
- x,
- y,
- (float *)mean_ptr,
- (float *)variance_ptr,
- scale,
- &task->rect.x,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- }
- }
- return true;
- }
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
- {
- for (int y = 0; y < task->filter_area.w; y++) {
- for (int x = 0; x < task->filter_area.z; x++) {
- filter_write_feature_kernel()(task->render_buffer.samples,
- x + task->filter_area.x,
- y + task->filter_area.y,
- &task->reconstruction_state.buffer_params.x,
- (float *)from_ptr,
- (float *)buffer_ptr,
- out_offset,
- &task->rect.x);
- }
- }
- return true;
- }
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS);
-
- for (int y = task->rect.y; y < task->rect.w; y++) {
- for (int x = task->rect.x; x < task->rect.z; x++) {
- filter_detect_outliers_kernel()(x,
- y,
- (float *)image_ptr,
- (float *)variance_ptr,
- (float *)depth_ptr,
- (float *)output_ptr,
- &task->rect.x,
- task->buffer.pass_stride);
- }
- }
- return true;
- }
-
- bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample)
- {
- WorkTile wtile;
- wtile.x = tile.x;
- wtile.y = tile.y;
- wtile.w = tile.w;
- wtile.h = tile.h;
- wtile.offset = tile.offset;
- wtile.stride = tile.stride;
- wtile.buffer = (float *)tile.buffer;
-
- /* For CPU we do adaptive stopping per sample so we can stop earlier, but
- * for combined CPU + GPU rendering we match the GPU and do it per tile
- * after a given number of sample steps. */
- if (!kernel_data.integrator.adaptive_stop_per_sample) {
- for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
- for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
- const int index = wtile.offset + x + y * wtile.stride;
- float *buffer = wtile.buffer + index * kernel_data.film.pass_stride;
- kernel_do_adaptive_stopping(kg, buffer, sample);
- }
- }
- }
-
- bool any = false;
- for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
- any |= kernel_do_adaptive_filter_x(kg, y, &wtile);
- }
- for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
- any |= kernel_do_adaptive_filter_y(kg, x, &wtile);
- }
- return (!any);
- }
-
- void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg)
- {
- float *render_buffer = (float *)tile.buffer;
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- int index = tile.offset + x + y * tile.stride;
- ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
- if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
- buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
- float sample_multiplier = tile.sample / buffer[kernel_data.film.pass_sample_count];
- if (sample_multiplier != 1.0f) {
- kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
- }
- }
- else {
- kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f));
- }
- }
- }
- }
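The per-pixel rescale in adaptive_sampling_post() above reduces to a single scale choice. A standalone restatement, with illustrative names that are not part of the patch:

/* A negative sample-count pass value marks a pixel that adaptive sampling
 * stopped early: flip it positive and scale the accumulated passes up to the
 * tile's nominal sample count. Fully sampled pixels get the fixed scale from
 * the else-branch above. */
static float adaptive_post_scale(float &pass_sample_count, int tile_sample)
{
  if (pass_sample_count < 0.0f) {
    pass_sample_count = -pass_sample_count;
    return tile_sample / pass_sample_count;
  }
  return tile_sample / (tile_sample - 1.0f);
}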
-
- void render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
- {
- const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
-
- scoped_timer timer(&tile.buffers->render_time);
-
- Coverage coverage(kg, tile);
- if (use_coverage) {
- coverage.init_path_trace();
- }
-
- float *render_buffer = (float *)tile.buffer;
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- /* Needed for Embree. */
- SIMD_SET_FLUSH_TO_ZERO;
-
- for (int sample = start_sample; sample < end_sample; sample++) {
- if (task.get_cancel() || TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- break;
- }
-
- if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
- tile.stealing_state = RenderTile::WAS_STOLEN;
- break;
- }
-
- if (tile.task == RenderTile::PATH_TRACE) {
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- if (use_coverage) {
- coverage.init_pixel(x, y);
- }
- path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
- }
- }
- }
- else {
- for (int y = tile.y; y < tile.y + tile.h; y++) {
- for (int x = tile.x; x < tile.x + tile.w; x++) {
- bake_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
- }
- }
- }
- tile.sample = sample + 1;
-
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
- const bool stop = adaptive_sampling_filter(kg, tile, sample);
- if (stop) {
- const int num_progress_samples = end_sample - sample;
- tile.sample = end_sample;
- task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
- break;
- }
- }
-
- task.update_progress(&tile, tile.w * tile.h);
- }
- if (use_coverage) {
- coverage.finalize();
- }
-
- if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
- adaptive_sampling_post(tile, kg);
- }
- }
-
- void denoise_openimagedenoise_buffer(DeviceTask &task,
- float *buffer,
- const size_t offset,
- const size_t stride,
- const size_t x,
- const size_t y,
- const size_t w,
- const size_t h,
- const float scale)
- {
-#ifdef WITH_OPENIMAGEDENOISE
- assert(openimagedenoise_supported());
-
- /* Only one at a time, since OpenImageDenoise itself is multithreaded for full
- * buffers, and for tiled rendering because creating multiple devices and filters
- * is slow and memory hungry as well.
- *
- * TODO: optimize tiled rendering case, by batching together denoising of many
- * tiles somehow? */
- static thread_mutex mutex;
- thread_scoped_lock lock(mutex);
-
- /* Create device and filter, cached for reuse. */
- if (!oidn_device) {
- oidn_device = oidn::newDevice();
- oidn_device.commit();
- }
- if (!oidn_filter) {
- oidn_filter = oidn_device.newFilter("RT");
- oidn_filter.set("hdr", true);
- oidn_filter.set("srgb", false);
- }
-
- /* Set images with appropriate stride for our interleaved pass storage. */
- struct {
- const char *name;
- const int offset;
- const bool scale;
- const bool use;
- array<float> scaled_buffer;
- } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true},
- {"albedo",
- task.pass_denoising_data + DENOISING_PASS_ALBEDO,
- true,
- task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO},
- {"normal",
- task.pass_denoising_data + DENOISING_PASS_NORMAL,
- true,
- task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL},
- {"output", 0, false, true},
- { NULL,
- 0 }};
-
- for (int i = 0; passes[i].name; i++) {
- if (!passes[i].use) {
- continue;
- }
-
- const int64_t pixel_offset = offset + x + y * stride;
- const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset);
- const int64_t pixel_stride = task.pass_stride;
- const int64_t row_stride = stride * pixel_stride;
-
- if (passes[i].scale && scale != 1.0f) {
- /* Normalize albedo and normal passes as they are scaled by the number of samples.
- * For the color passes OIDN will perform auto-exposure making it unnecessary. */
- array<float> &scaled_buffer = passes[i].scaled_buffer;
- scaled_buffer.resize(w * h * 3);
-
- for (int y = 0; y < h; y++) {
- const float *pass_row = buffer + buffer_offset + y * row_stride;
- float *scaled_row = scaled_buffer.data() + y * w * 3;
-
- for (int x = 0; x < w; x++) {
- scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale;
- scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale;
- scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale;
- }
- }
-
- oidn_filter.setImage(
- passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0);
- }
- else {
- oidn_filter.setImage(passes[i].name,
- buffer + buffer_offset,
- oidn::Format::Float3,
- w,
- h,
- 0,
- pixel_stride * sizeof(float),
- row_stride * sizeof(float));
- }
- }
-
- /* Execute filter. */
- oidn_filter.commit();
- oidn_filter.execute();
-#else
- (void)task;
- (void)buffer;
- (void)offset;
- (void)stride;
- (void)x;
- (void)y;
- (void)w;
- (void)h;
- (void)scale;
-#endif
- }
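Stripped of Cycles' interleaved pass layout, denoise_openimagedenoise_buffer() above follows the standard OpenImageDenoise pattern. A self-contained sketch for a tightly packed RGB buffer, keeping the cached device/filter idiom:

#include <OpenImageDenoise/oidn.hpp>

/* Denoise a packed w*h RGB float buffer in place. */
void denoise_rgb(float *pixels, int w, int h)
{
  static oidn::DeviceRef oidn_device; /* Cached across calls, as above. */
  static oidn::FilterRef oidn_filter;

  if (!oidn_device) {
    oidn_device = oidn::newDevice();
    oidn_device.commit();
  }
  if (!oidn_filter) {
    oidn_filter = oidn_device.newFilter("RT"); /* Generic ray-tracing filter. */
    oidn_filter.set("hdr", true);
  }

  /* Packed data, so the default byte offset and strides apply. */
  oidn_filter.setImage("color", pixels, oidn::Format::Float3, w, h);
  oidn_filter.setImage("output", pixels, oidn::Format::Float3, w, h);
  oidn_filter.commit();
  oidn_filter.execute();
}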
-
- void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
- {
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- /* Copy pixels from compute device to CPU (no-op for CPU device). */
- rtile.buffers->buffer.copy_from_device();
-
- denoise_openimagedenoise_buffer(task,
- (float *)rtile.buffer,
- rtile.offset,
- rtile.stride,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- 1.0f / rtile.sample);
-
- /* todo: it may be possible to avoid this copy, but we have to ensure that
- * when other code copies data from the device it doesn't overwrite the
- * denoiser buffers. */
- rtile.buffers->buffer.copy_to_device();
- }
- else {
- /* Per-tile denoising. */
- rtile.sample = rtile.start_sample + rtile.num_samples;
- const float scale = 1.0f / rtile.sample;
- const float invscale = rtile.sample;
- const size_t pass_stride = task.pass_stride;
-
- /* Map neighboring tiles into one buffer for denoising. */
- RenderTileNeighbors neighbors(rtile);
- task.map_neighbor_tiles(neighbors, this);
- RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
- rtile = center_tile;
-
- /* Calculate size of the tile to denoise (including overlap). The overlap
- * size was chosen empirically. OpenImageDenoise specifies an overlap size
- * of 128 but this is significantly bigger than typical tile size. */
- const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds());
- const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
-
- /* Adjacent tiles are in separate memory regions, copy into single buffer. */
- array<float> merged(rect_size.x * rect_size.y * task.pass_stride);
-
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &ntile = neighbors.tiles[i];
- if (!ntile.buffer) {
- continue;
- }
-
- const int xmin = max(ntile.x, rect.x);
- const int ymin = max(ntile.y, rect.y);
- const int xmax = min(ntile.x + ntile.w, rect.z);
- const int ymax = min(ntile.y + ntile.h, rect.w);
-
- const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
- const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
-
- const size_t merged_stride = rect_size.x;
- const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
- float *merged_buffer = merged.data() + merged_offset * pass_stride;
-
- for (int y = ymin; y < ymax; y++) {
- for (int x = 0; x < pass_stride * (xmax - xmin); x++) {
- merged_buffer[x] = tile_buffer[x] * scale;
- }
- tile_buffer += ntile.stride * pass_stride;
- merged_buffer += merged_stride * pass_stride;
- }
- }
-
- /* Denoise */
- denoise_openimagedenoise_buffer(
- task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f);
-
- /* Copy back result from merged buffer. */
- RenderTile &ntile = neighbors.target;
- if (ntile.buffer) {
- const int xmin = max(ntile.x, rect.x);
- const int ymin = max(ntile.y, rect.y);
- const int xmax = min(ntile.x + ntile.w, rect.z);
- const int ymax = min(ntile.y + ntile.h, rect.w);
-
- const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
- float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
-
- const size_t merged_stride = rect_size.x;
- const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
- const float *merged_buffer = merged.data() + merged_offset * pass_stride;
-
- for (int y = ymin; y < ymax; y++) {
- for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) {
- tile_buffer[x + 0] = merged_buffer[x + 0] * invscale;
- tile_buffer[x + 1] = merged_buffer[x + 1] * invscale;
- tile_buffer[x + 2] = merged_buffer[x + 2] * invscale;
- }
- tile_buffer += ntile.stride * pass_stride;
- merged_buffer += merged_stride * pass_stride;
- }
- }
-
- task.unmap_neighbor_tiles(neighbors, this);
- }
- }
-
- void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
- {
- ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
-
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoising.functions.construct_transform = function_bind(
- &CPUDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
- denoising.render_buffer.samples = tile.sample;
- denoising.buffer.gpu_temporary_mem = false;
-
- denoising.run_denoising(tile);
- }
-
- void thread_render(DeviceTask &task)
- {
- if (TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- return;
- }
-
- /* allocate buffer for kernel globals */
- device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
- kgbuffer.alloc_to_device(1);
-
- KernelGlobals *kg = new ((void *)kgbuffer.device_pointer)
- KernelGlobals(thread_kernel_globals_init());
-
- profiler.add_state(&kg->profiler);
-
- CPUSplitKernel *split_kernel = NULL;
- if (use_split_kernel) {
- split_kernel = new CPUSplitKernel(this);
- if (!split_kernel->load_kernels(requested_features)) {
- thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
- kgbuffer.free();
- delete split_kernel;
- return;
- }
- }
-
- /* NLM denoiser. */
- DenoisingTask *denoising = NULL;
-
- /* OpenImageDenoise: we can only denoise with one thread at a time, so to
- * avoid waiting with mutex locks in the denoiser, we let only a single
- * thread acquire denoising tiles. */
- uint tile_types = task.tile_types;
- bool hold_denoise_lock = false;
- if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- if (!oidn_task_lock.try_lock()) {
- tile_types &= ~RenderTile::DENOISE;
- hold_denoise_lock = true;
- }
- }
-
- RenderTile tile;
- while (task.acquire_tile(this, tile, tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE) {
- if (use_split_kernel) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(task, tile, kgbuffer, void_buffer);
- }
- else {
- render(task, tile, kg);
- }
- }
- else if (tile.task == RenderTile::BAKE) {
- render(task, tile, kg);
- }
- else if (tile.task == RenderTile::DENOISE) {
- if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- denoise_openimagedenoise(task, tile);
- }
- else if (task.denoising.type == DENOISER_NLM) {
- if (denoising == NULL) {
- denoising = new DenoisingTask(this, task);
- denoising->profiler = &kg->profiler;
- }
- denoise_nlm(*denoising, tile);
- }
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
-
- if (TaskPool::canceled()) {
- if (task.need_finish_queue == false)
- break;
- }
- }
-
- if (hold_denoise_lock) {
- oidn_task_lock.unlock();
- }
-
- profiler.remove_state(&kg->profiler);
-
- thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
- kg->~KernelGlobals();
- kgbuffer.free();
- delete split_kernel;
- delete denoising;
- }
-
- void thread_denoise(DeviceTask &task)
- {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- denoise_openimagedenoise(task, tile);
- }
- else {
- DenoisingTask denoising(this, task);
-
- ProfilingState denoising_profiler_state;
- profiler.add_state(&denoising_profiler_state);
- denoising.profiler = &denoising_profiler_state;
-
- denoise_nlm(denoising, tile);
-
- profiler.remove_state(&denoising_profiler_state);
- }
-
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- void thread_film_convert(DeviceTask &task)
- {
- float sample_scale = 1.0f / (task.sample + 1);
-
- if (task.rgba_half) {
- for (int y = task.y; y < task.y + task.h; y++)
- for (int x = task.x; x < task.x + task.w; x++)
- convert_to_half_float_kernel()(&kernel_globals,
- (uchar4 *)task.rgba_half,
- (float *)task.buffer,
- sample_scale,
- x,
- y,
- task.offset,
- task.stride);
- }
- else {
- for (int y = task.y; y < task.y + task.h; y++)
- for (int x = task.x; x < task.x + task.w; x++)
- convert_to_byte_kernel()(&kernel_globals,
- (uchar4 *)task.rgba_byte,
- (float *)task.buffer,
- sample_scale,
- x,
- y,
- task.offset,
- task.stride);
- }
- }
-
- void thread_shader(DeviceTask &task)
- {
- KernelGlobals *kg = new KernelGlobals(thread_kernel_globals_init());
-
- for (int sample = 0; sample < task.num_samples; sample++) {
- for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
- shader_kernel()(kg,
- (uint4 *)task.shader_input,
- (float4 *)task.shader_output,
- task.shader_eval_type,
- task.shader_filter,
- x,
- task.offset,
- sample);
-
- if (task.get_cancel() || TaskPool::canceled())
- break;
-
- task.update_progress(NULL);
- }
-
- thread_kernel_globals_free(kg);
- delete kg;
- }
-
- virtual int get_split_task_count(DeviceTask &task) override
- {
- if (task.type == DeviceTask::SHADER)
- return task.get_subtask_count(info.cpu_threads, 256);
- else
- return task.get_subtask_count(info.cpu_threads);
- }
-
- virtual void task_add(DeviceTask &task) override
- {
- /* Load texture info. */
- load_texture_info();
-
- /* split task into smaller ones */
- list<DeviceTask> tasks;
-
- if (task.type == DeviceTask::DENOISE_BUFFER &&
- task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
- /* Denoise entire buffer at once with OIDN, it has own threading. */
- tasks.push_back(task);
- }
- else if (task.type == DeviceTask::SHADER) {
- task.split(tasks, info.cpu_threads, 256);
- }
- else {
- task.split(tasks, info.cpu_threads);
- }
-
- foreach (DeviceTask &task, tasks) {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
- }
-
- virtual void task_wait() override
- {
- task_pool.wait_work();
- }
-
- virtual void task_cancel() override
- {
- task_pool.cancel();
- }
-
- protected:
- inline KernelGlobals thread_kernel_globals_init()
- {
- KernelGlobals kg = kernel_globals;
- kg.transparent_shadow_intersections = NULL;
- const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
- sizeof(*kg.decoupled_volume_steps);
- for (int i = 0; i < decoupled_count; ++i) {
- kg.decoupled_volume_steps[i] = NULL;
- }
- kg.decoupled_volume_steps_index = 0;
- kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL;
-#ifdef WITH_OSL
- OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
-#endif
- return kg;
- }
-
- inline void thread_kernel_globals_free(KernelGlobals *kg)
- {
- if (kg == NULL) {
- return;
- }
-
- if (kg->transparent_shadow_intersections != NULL) {
- free(kg->transparent_shadow_intersections);
- }
- const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
- sizeof(*kg->decoupled_volume_steps);
- for (int i = 0; i < decoupled_count; ++i) {
- if (kg->decoupled_volume_steps[i] != NULL) {
- free(kg->decoupled_volume_steps[i]);
- }
- }
-#ifdef WITH_OSL
- OSLShader::thread_free(kg);
-#endif
- }
-
- virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_) override
- {
- requested_features = requested_features_;
-
- return true;
- }
-};
-
-/* split kernel */
-
-class CPUSplitKernelFunction : public SplitKernelFunction {
- public:
- CPUDevice *device;
- void (*func)(KernelGlobals *kg, KernelData *data);
-
- CPUSplitKernelFunction(CPUDevice *device) : device(device), func(NULL)
- {
- }
- ~CPUSplitKernelFunction()
- {
- }
-
- virtual bool enqueue(const KernelDimensions &dim,
- device_memory &kernel_globals,
- device_memory &data)
- {
- if (!func) {
- return false;
- }
-
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
- kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
- for (int y = 0; y < dim.global_size[1]; y++) {
- for (int x = 0; x < dim.global_size[0]; x++) {
- kg->global_id = make_int2(x, y);
-
- func(kg, (KernelData *)data.device_pointer);
- }
- }
-
- return true;
- }
-};
-
-CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
-{
-}
-
-bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &data,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flags,
- device_memory &work_pool_wgs)
-{
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
- kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
- for (int y = 0; y < dim.global_size[1]; y++) {
- for (int x = 0; x < dim.global_size[0]; x++) {
- kg->global_id = make_int2(x, y);
-
- device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer,
- (KernelData *)data.device_pointer,
- (void *)split_data.device_pointer,
- num_global_elements,
- (char *)ray_state.device_pointer,
- rtile.start_sample,
- rtile.start_sample + rtile.num_samples,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- (int *)queue_index.device_pointer,
- dim.global_size[0] * dim.global_size[1],
- (char *)use_queues_flags.device_pointer,
- (uint *)work_pool_wgs.device_pointer,
- rtile.num_samples,
- (float *)rtile.buffer);
- }
- }
-
- return true;
-}
-
-SplitKernelFunction *CPUSplitKernel::get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &)
-{
- CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
-
- kernel->func = device->split_kernels[kernel_name]();
- if (!kernel->func) {
- delete kernel;
- return NULL;
- }
-
- return kernel;
-}
-
-int2 CPUSplitKernel::split_kernel_local_size()
-{
- return make_int2(1, 1);
-}
-
-int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/,
- device_memory & /*data*/,
- DeviceTask & /*task*/)
-{
- return make_int2(1, 1);
-}
-
-uint64_t CPUSplitKernel::state_buffer_size(device_memory &kernel_globals,
- device_memory & /*data*/,
- size_t num_threads)
-{
- KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
-
- return split_data_buffer_size(kg, num_threads);
-}
-
-Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new CPUDevice(info, stats, profiler, background);
-}
-
-void device_cpu_info(vector<DeviceInfo> &devices)
-{
- DeviceInfo info;
-
- info.type = DEVICE_CPU;
- info.description = system_cpu_brand_string();
- info.id = "CPU";
- info.num = 0;
- info.has_volume_decoupled = true;
- info.has_adaptive_stop_per_sample = true;
- info.has_osl = true;
- info.has_half_images = true;
- info.has_nanovdb = true;
- info.has_profiling = true;
- info.denoisers = DENOISER_NLM;
- if (openimagedenoise_supported()) {
- info.denoisers |= DENOISER_OPENIMAGEDENOISE;
- }
-
- devices.insert(devices.begin(), info);
-}
-
-string device_cpu_capabilities()
-{
- string capabilities = "";
- capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
- capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
- capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
- capabilities += system_cpu_support_avx() ? "AVX " : "";
- capabilities += system_cpu_support_avx2() ? "AVX2" : "";
- if (capabilities[capabilities.size() - 1] == ' ')
- capabilities.resize(capabilities.size() - 1);
- return capabilities;
-}
-
-CCL_NAMESPACE_END
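Note on the removed device_cpu_capabilities() above: when no SIMD extension is reported, the string is empty and capabilities[capabilities.size() - 1] underflows, and "AVX2" is appended without a trailing space, so the trailing-space trim only applies when AVX2 is absent. A minimal defensive sketch (hypothetical helper, not part of this commit):

  string device_cpu_capabilities_sketch()
  {
    string capabilities;
    if (system_cpu_support_sse2())
      capabilities += "SSE2 ";
    if (system_cpu_support_sse3())
      capabilities += "SSE3 ";
    if (system_cpu_support_sse41())
      capabilities += "SSE41 ";
    if (system_cpu_support_avx())
      capabilities += "AVX ";
    if (system_cpu_support_avx2())
      capabilities += "AVX2 ";
    /* Trim the trailing space, guarding against an empty result. */
    if (!capabilities.empty() && capabilities.back() == ' ')
      capabilities.pop_back();
    return capabilities;
  }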
diff --git a/intern/cycles/device/device_denoise.cpp b/intern/cycles/device/device_denoise.cpp
new file mode 100644
index 00000000000..aea7868f65d
--- /dev/null
+++ b/intern/cycles/device/device_denoise.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_denoise.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *denoiserTypeToHumanReadable(DenoiserType type)
+{
+ switch (type) {
+ case DENOISER_OPTIX:
+ return "OptiX";
+ case DENOISER_OPENIMAGEDENOISE:
+ return "OpenImageDenoise";
+
+ case DENOISER_NUM:
+ case DENOISER_NONE:
+ case DENOISER_ALL:
+ return "UNKNOWN";
+ }
+
+ return "UNKNOWN";
+}
+
+const NodeEnum *DenoiseParams::get_type_enum()
+{
+ static NodeEnum type_enum;
+
+ if (type_enum.empty()) {
+ type_enum.insert("optix", DENOISER_OPTIX);
+ type_enum.insert("openimageio", DENOISER_OPENIMAGEDENOISE);
+ }
+
+ return &type_enum;
+}
+
+const NodeEnum *DenoiseParams::get_prefilter_enum()
+{
+ static NodeEnum prefilter_enum;
+
+ if (prefilter_enum.empty()) {
+ prefilter_enum.insert("none", DENOISER_PREFILTER_NONE);
+ prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST);
+ prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE);
+ }
+
+ return &prefilter_enum;
+}
+
+NODE_DEFINE(DenoiseParams)
+{
+ NodeType *type = NodeType::add("denoise_params", create);
+
+ const NodeEnum *type_enum = get_type_enum();
+ const NodeEnum *prefilter_enum = get_prefilter_enum();
+
+ SOCKET_BOOLEAN(use, "Use", false);
+
+ SOCKET_ENUM(type, "Type", *type_enum, DENOISER_OPENIMAGEDENOISE);
+
+ SOCKET_INT(start_sample, "Start Sample", 0);
+
+ SOCKET_BOOLEAN(use_pass_albedo, "Use Pass Albedo", true);
+ SOCKET_BOOLEAN(use_pass_normal, "Use Pass Normal", false);
+
+ SOCKET_ENUM(prefilter, "Prefilter", *prefilter_enum, DENOISER_PREFILTER_FAST);
+
+ return type;
+}
+
+DenoiseParams::DenoiseParams() : Node(get_node_type())
+{
+}
+
+CCL_NAMESPACE_END
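Usage sketch (illustrative only, not part of this commit): since DenoiseParams exposes plain members plus modified(), two parameter sets can be compared to decide whether a denoiser needs re-configuring. Member names are taken from the header added below.

  DenoiseParams current, previous;
  current.use = previous.use = true;
  current.type = previous.type = DENOISER_OPTIX;

  /* Differs from the default DENOISER_PREFILTER_FAST still held by previous. */
  current.prefilter = DENOISER_PREFILTER_ACCURATE;

  if (current.modified(previous)) {
    /* The denoiser would be re-created or re-configured here. */
  }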
diff --git a/intern/cycles/device/device_denoise.h b/intern/cycles/device/device_denoise.h
new file mode 100644
index 00000000000..dfdc7cc87b3
--- /dev/null
+++ b/intern/cycles/device/device_denoise.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_memory.h"
+#include "graph/node.h"
+#include "render/buffers.h"
+
+CCL_NAMESPACE_BEGIN
+
+enum DenoiserType {
+ DENOISER_OPTIX = 2,
+ DENOISER_OPENIMAGEDENOISE = 4,
+ DENOISER_NUM,
+
+ DENOISER_NONE = 0,
+ DENOISER_ALL = ~0,
+};
+
+/* Construct a human-readable string which denotes the denoiser type. */
+const char *denoiserTypeToHumanReadable(DenoiserType type);
+
+typedef int DenoiserTypeMask;
+
+enum DenoiserPrefilter {
+ /* Best quality without extra processing time, but requires the guiding passes to be
+ * noise-free. */
+ DENOISER_PREFILTER_NONE = 1,
+
+ /* Denoise color and guiding passes together.
+ * Improves quality when the guiding passes are noisy, with minimal extra processing time. */
+ DENOISER_PREFILTER_FAST = 2,
+
+ /* Prefilter noisy guiding passes before denoising color.
+ * Improves quality when the guiding passes are noisy, at the cost of extra processing time. */
+ DENOISER_PREFILTER_ACCURATE = 3,
+
+ DENOISER_PREFILTER_NUM,
+};
+
+/* NOTE: This is not a real scene node; it uses the Node API for ease of (de)serialization.
+ * The default values here do not really matter, as they are always initialized from the
+ * Integrator node. */
+class DenoiseParams : public Node {
+ public:
+ NODE_DECLARE
+
+ /* Apply denoiser to image. */
+ bool use = false;
+
+ /* Denoiser type. */
+ DenoiserType type = DENOISER_OPENIMAGEDENOISE;
+
+ /* Viewport start sample. */
+ int start_sample = 0;
+
+ /* Auxiliary passes. */
+ bool use_pass_albedo = true;
+ bool use_pass_normal = true;
+
+ DenoiserPrefilter prefilter = DENOISER_PREFILTER_FAST;
+
+ static const NodeEnum *get_type_enum();
+ static const NodeEnum *get_prefilter_enum();
+
+ DenoiseParams();
+
+ bool modified(const DenoiseParams &other) const
+ {
+ return !(use == other.use && type == other.type && start_sample == other.start_sample &&
+ use_pass_albedo == other.use_pass_albedo &&
+ use_pass_normal == other.use_pass_normal && prefilter == other.prefilter);
+ }
+};
+
+/* All the parameters needed to perform buffer denoising on a device.
+ * This is not really a task in the canonical sense (as in, it is not an asynchronously running
+ * task); it is more of a wrapper around all the arguments and parameters needed to perform
+ * denoising, kept in a single place so that device methods do not all need to be modified
+ * whenever these parameters change. */
+class DeviceDenoiseTask {
+ public:
+ DenoiseParams params;
+
+ int num_samples;
+
+ RenderBuffers *render_buffers;
+ BufferParams buffer_params;
+
+ /* Allow in-place modification of the input passes (e.g. scaling them down). This lowers the
+ * memory footprint of the denoiser, but makes the input passes invalid from the path
+ * tracer's point of view. */
+ bool allow_inplace_modification;
+};
+
+CCL_NAMESPACE_END
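Because the DenoiserType constants are distinct bits (2 and 4), a DenoiserTypeMask can advertise several denoisers at once, matching how the old CPU device combined denoiser flags earlier in this diff. A small sketch with hypothetical support flags:

  DenoiserTypeMask supported = DENOISER_OPENIMAGEDENOISE;
  supported |= DENOISER_OPTIX;

  if (supported & DENOISER_OPTIX) {
    /* An OptiX denoiser could be offered to the user here. */
  }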
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
deleted file mode 100644
index 38c42d15cab..00000000000
--- a/intern/cycles/device/device_denoising.cpp
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_denoising.h"
-
-#include "kernel/filter/filter_defines.h"
-
-CCL_NAMESPACE_BEGIN
-
-DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
- : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE),
- profiler(NULL),
- storage(device),
- buffer(device),
- device(device)
-{
- radius = task.denoising.radius;
- nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength));
- if (task.denoising.relative_pca) {
- pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength));
- }
- else {
- pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength));
- }
-
- render_buffer.frame_stride = task.frame_stride;
- render_buffer.pass_stride = task.pass_stride;
- render_buffer.offset = task.pass_denoising_data;
-
- target_buffer.pass_stride = task.target_pass_stride;
- target_buffer.denoising_clean_offset = task.pass_denoising_clean;
- target_buffer.offset = 0;
-
- functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device);
- functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device);
-
- tile_info = (TileInfo *)tile_info_mem.alloc(sizeof(TileInfo) / sizeof(int));
- tile_info->from_render = task.denoising_from_render ? 1 : 0;
-
- tile_info->frames[0] = 0;
- tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES);
- for (int i = 1; i < tile_info->num_frames; i++) {
- tile_info->frames[i] = task.denoising_frames[i - 1];
- }
-
- do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM;
- do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM;
-}
-
-DenoisingTask::~DenoisingTask()
-{
- storage.XtWX.free();
- storage.XtWY.free();
- storage.transform.free();
- storage.rank.free();
- buffer.mem.free();
- buffer.temporary_mem.free();
- tile_info_mem.free();
-}
-
-void DenoisingTask::set_render_buffer(RenderTileNeighbors &neighbors)
-{
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &rtile = neighbors.tiles[i];
- tile_info->offsets[i] = rtile.offset;
- tile_info->strides[i] = rtile.stride;
- tile_info->buffers[i] = rtile.buffer;
- }
- tile_info->x[0] = neighbors.tiles[3].x;
- tile_info->x[1] = neighbors.tiles[4].x;
- tile_info->x[2] = neighbors.tiles[5].x;
- tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
- tile_info->y[0] = neighbors.tiles[1].y;
- tile_info->y[1] = neighbors.tiles[4].y;
- tile_info->y[2] = neighbors.tiles[7].y;
- tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
-
- target_buffer.offset = neighbors.target.offset;
- target_buffer.stride = neighbors.target.stride;
- target_buffer.ptr = neighbors.target.buffer;
-
- if (do_prefilter && neighbors.target.buffers) {
- target_buffer.denoising_output_offset =
- neighbors.target.buffers->params.get_denoising_prefiltered_offset();
- }
- else {
- target_buffer.denoising_output_offset = 0;
- }
-
- tile_info_mem.copy_to_device();
-}
-
-void DenoisingTask::setup_denoising_buffer()
-{
- /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring
- * tiles */
- rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
- rect = rect_expand(rect, radius);
- rect = rect_clip(rect,
- make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
-
- buffer.use_intensity = do_prefilter || (tile_info->num_frames > 1);
- buffer.passes = buffer.use_intensity ? 15 : 14;
- buffer.width = rect.z - rect.x;
- buffer.stride = align_up(buffer.width, 4);
- buffer.h = rect.w - rect.y;
- int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
- buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
- buffer.frame_stride = buffer.pass_stride * buffer.passes;
- /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
- int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats);
- buffer.mem.alloc_to_device(mem_size, false);
- buffer.use_time = (tile_info->num_frames > 1);
-
- /* CPUs process shifts sequentially while GPUs process them in parallel. */
- int num_layers;
- if (buffer.gpu_temporary_mem) {
- /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
- int max_radius = max(radius, 6);
- int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1);
- num_layers = 2 * num_shifts + 1;
- }
- else {
- num_layers = 3;
- }
- /* Allocate two layers per shift as well as one for the weight accumulation. */
- buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
-}
-
-void DenoisingTask::prefilter_shadowing()
-{
- device_ptr null_ptr = (device_ptr)0;
-
- device_sub_ptr unfiltered_a(buffer.mem, 0, buffer.pass_stride);
- device_sub_ptr unfiltered_b(buffer.mem, 1 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr sample_var(buffer.mem, 2 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr sample_var_var(buffer.mem, 3 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr buffer_var(buffer.mem, 5 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr filtered_var(buffer.mem, 6 * buffer.pass_stride, buffer.pass_stride);
-
- /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the
- * sample variance and the buffer variance. */
- functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
-
- /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the
- * sample variance. */
- nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
- functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
-
- /* Reuse memory, the previous data isn't needed anymore. */
- device_ptr filtered_a = *buffer_var, filtered_b = *sample_var;
- /* Use the smoothed variance to filter the two shadow half images using each other for weight
- * calculation. */
- nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false);
- functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
- functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
-
- device_ptr residual_var = *sample_var_var;
- /* Estimate the residual variance between the two filtered halves. */
- functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
-
- device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b;
- /* Use the residual variance for a second filter pass. */
- nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false);
- functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
- functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
-
- /* Combine the two double-filtered halves to a final shadow feature. */
- device_sub_ptr shadow_pass(buffer.mem, 4 * buffer.pass_stride, buffer.pass_stride);
- functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
-}
-
-void DenoisingTask::prefilter_features()
-{
- device_sub_ptr unfiltered(buffer.mem, 8 * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr variance(buffer.mem, 9 * buffer.pass_stride, buffer.pass_stride);
-
- int mean_from[] = {0, 1, 2, 12, 6, 7, 8};
- int variance_from[] = {3, 4, 5, 13, 9, 10, 11};
- int pass_to[] = {1, 2, 3, 0, 5, 6, 7};
- for (int pass = 0; pass < 7; pass++) {
- device_sub_ptr feature_pass(
- buffer.mem, pass_to[pass] * buffer.pass_stride, buffer.pass_stride);
- /* Get the unfiltered pass and its variance from the RenderBuffers. */
- functions.get_feature(mean_from[pass],
- variance_from[pass],
- *unfiltered,
- *variance,
- 1.0f / render_buffer.samples);
- /* Smooth the pass and store the result in the denoising buffers. */
- nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false);
- functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
- }
-}
-
-void DenoisingTask::prefilter_color()
-{
- int mean_from[] = {20, 21, 22};
- int variance_from[] = {23, 24, 25};
- int mean_to[] = {8, 9, 10};
- int variance_to[] = {11, 12, 13};
- int num_color_passes = 3;
-
- device_only_memory<float> temporary_color(device, "denoising temporary color");
- temporary_color.alloc_to_device(6 * buffer.pass_stride, false);
-
- for (int pass = 0; pass < num_color_passes; pass++) {
- device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr color_var_pass(
- temporary_color, (pass + 3) * buffer.pass_stride, buffer.pass_stride);
- functions.get_feature(mean_from[pass],
- variance_from[pass],
- *color_pass,
- *color_var_pass,
- 1.0f / render_buffer.samples);
- }
-
- device_sub_ptr depth_pass(buffer.mem, 0, buffer.pass_stride);
- device_sub_ptr color_var_pass(
- buffer.mem, variance_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
- device_sub_ptr output_pass(buffer.mem, mean_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
- functions.detect_outliers(
- temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
-
- if (buffer.use_intensity) {
- device_sub_ptr intensity_pass(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride);
- nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2 * 4.0f, true);
- functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass);
- }
-}
-
-void DenoisingTask::load_buffer()
-{
- device_ptr null_ptr = (device_ptr)0;
-
- int original_offset = render_buffer.offset;
-
- int num_passes = buffer.use_intensity ? 15 : 14;
- for (int i = 0; i < tile_info->num_frames; i++) {
- for (int pass = 0; pass < num_passes; pass++) {
- device_sub_ptr to_pass(
- buffer.mem, i * buffer.frame_stride + pass * buffer.pass_stride, buffer.pass_stride);
- bool is_variance = (pass >= 11) && (pass <= 13);
- functions.get_feature(
- pass, -1, *to_pass, null_ptr, is_variance ? (1.0f / render_buffer.samples) : 1.0f);
- }
- render_buffer.offset += render_buffer.frame_stride;
- }
-
- render_buffer.offset = original_offset;
-}
-
-void DenoisingTask::write_buffer()
-{
- reconstruction_state.buffer_params = make_int4(target_buffer.offset,
- target_buffer.stride,
- target_buffer.pass_stride,
- target_buffer.denoising_clean_offset);
- int num_passes = buffer.use_intensity ? 15 : 14;
- for (int pass = 0; pass < num_passes; pass++) {
- device_sub_ptr from_pass(buffer.mem, pass * buffer.pass_stride, buffer.pass_stride);
- int out_offset = pass + target_buffer.denoising_output_offset;
- functions.write_feature(out_offset, *from_pass, target_buffer.ptr);
- }
-}
-
-void DenoisingTask::construct_transform()
-{
- storage.w = filter_area.z;
- storage.h = filter_area.w;
-
- storage.transform.alloc_to_device(storage.w * storage.h * TRANSFORM_SIZE, false);
- storage.rank.alloc_to_device(storage.w * storage.h, false);
-
- functions.construct_transform();
-}
-
-void DenoisingTask::reconstruct()
-{
- storage.XtWX.alloc_to_device(storage.w * storage.h * XTWX_SIZE, false);
- storage.XtWY.alloc_to_device(storage.w * storage.h * XTWY_SIZE, false);
- storage.XtWX.zero_to_device();
- storage.XtWY.zero_to_device();
-
- reconstruction_state.filter_window = rect_from_shape(
- filter_area.x - rect.x, filter_area.y - rect.y, storage.w, storage.h);
- int tile_coordinate_offset = filter_area.y * target_buffer.stride + filter_area.x;
- reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset,
- target_buffer.stride,
- target_buffer.pass_stride,
- target_buffer.denoising_clean_offset);
- reconstruction_state.source_w = rect.z - rect.x;
- reconstruction_state.source_h = rect.w - rect.y;
-
- device_sub_ptr color_ptr(buffer.mem, 8 * buffer.pass_stride, 3 * buffer.pass_stride);
- device_sub_ptr color_var_ptr(buffer.mem, 11 * buffer.pass_stride, 3 * buffer.pass_stride);
- for (int f = 0; f < tile_info->num_frames; f++) {
- device_ptr scale_ptr = 0;
- device_sub_ptr *scale_sub_ptr = NULL;
- if (tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) {
- scale_sub_ptr = new device_sub_ptr(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride);
- scale_ptr = **scale_sub_ptr;
- }
-
- functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f);
- delete scale_sub_ptr;
- }
- functions.solve(target_buffer.ptr);
-}
-
-void DenoisingTask::run_denoising(RenderTile &tile)
-{
- RenderTileNeighbors neighbors(tile);
- functions.map_neighbor_tiles(neighbors);
- set_render_buffer(neighbors);
-
- setup_denoising_buffer();
-
- if (tile_info->from_render) {
- prefilter_shadowing();
- prefilter_features();
- prefilter_color();
- }
- else {
- load_buffer();
- }
-
- if (do_filter) {
- construct_transform();
- reconstruct();
- }
-
- if (do_prefilter) {
- write_buffer();
- }
-
- functions.unmap_neighbor_tiles(neighbors);
-}
-
-CCL_NAMESPACE_END
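To make the layout computed by the removed setup_denoising_buffer() concrete, here is the sizing arithmetic spelled out for a hypothetical tile (align_up is the existing Cycles round-up utility; all numbers are illustrative):

  const int radius = 15;
  const int width = 256 + 2 * radius;  /* 286: tile expanded by the filter radius */
  const int height = 256 + 2 * radius; /* 286 */
  const int stride = align_up(width, 4); /* 288: rows padded for SIMD */
  const int alignment_floats = 16;       /* e.g. 64-byte sub-pointer alignment */
  const int pass_stride = align_up(stride * height, alignment_floats); /* 82368 floats */
  const int passes = 15;                 /* 14, plus the intensity pass */
  const int frame_stride = pass_stride * passes;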
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
deleted file mode 100644
index bb8bdfdd225..00000000000
--- a/intern/cycles/device/device_denoising.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_DENOISING_H__
-#define __DEVICE_DENOISING_H__
-
-#include "device/device.h"
-
-#include "render/buffers.h"
-
-#include "kernel/filter/filter_defines.h"
-
-#include "util/util_profiling.h"
-
-CCL_NAMESPACE_BEGIN
-
-class DenoisingTask {
- public:
- /* Parameters of the denoising algorithm. */
- int radius;
- float nlm_k_2;
- float pca_threshold;
-
- /* Parameters of the RenderBuffers. */
- struct RenderBuffers {
- int offset;
- int pass_stride;
- int frame_stride;
- int samples;
- } render_buffer;
-
- /* Pointer and parameters of the target buffer. */
- struct TargetBuffer {
- int offset;
- int stride;
- int pass_stride;
- int denoising_clean_offset;
- int denoising_output_offset;
- device_ptr ptr;
- } target_buffer;
-
- TileInfo *tile_info;
- device_vector<int> tile_info_mem;
-
- ProfilingState *profiler;
-
- int4 rect;
- int4 filter_area;
-
- bool do_prefilter;
- bool do_filter;
-
- struct DeviceFunctions {
- function<bool(
- device_ptr image_ptr, /* Contains the values that are smoothed. */
- device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
- device_ptr variance_ptr, /* Contains the variance of the guide image. */
- device_ptr out_ptr /* The filtered output is written into this image. */
- )>
- non_local_means;
- function<bool(
- device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)>
- accumulate;
- function<bool(device_ptr output_ptr)> solve;
- function<bool()> construct_transform;
-
- function<bool(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect)>
- combine_halves;
- function<bool(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr)>
- divide_shadow;
- function<bool(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale)>
- get_feature;
- function<bool(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr)>
- detect_outliers;
- function<bool(int out_offset, device_ptr from_ptr, device_ptr buffer_ptr)> write_feature;
- function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles;
- function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles;
- } functions;
-
- /* Stores state of the current Reconstruction operation,
- * which is accessed by the device in order to perform the operation. */
- struct ReconstructionState {
- int4 filter_window;
- int4 buffer_params;
-
- int source_w;
- int source_h;
- } reconstruction_state;
-
- /* Stores state of the current NLM operation,
- * which is accessed by the device in order to perform the operation. */
- struct NLMState {
- int r; /* Search radius of the filter. */
- int f; /* Patch size of the filter. */
- float a; /* Variance compensation factor in the MSE estimation. */
- float k_2; /* Squared value of the k parameter of the filter. */
- bool is_color;
-
- void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_)
- {
- r = r_;
- f = f_;
- a = a_; k_2 = k_2_;
- is_color = is_color_;
- }
- } nlm_state;
-
- struct Storage {
- device_only_memory<float> transform;
- device_only_memory<int> rank;
- device_only_memory<float> XtWX;
- device_only_memory<float3> XtWY;
- int w;
- int h;
-
- Storage(Device *device)
- : transform(device, "denoising transform"),
- rank(device, "denoising rank"),
- XtWX(device, "denoising XtWX"),
- XtWY(device, "denoising XtWY")
- {
- }
- } storage;
-
- DenoisingTask(Device *device, const DeviceTask &task);
- ~DenoisingTask();
-
- void run_denoising(RenderTile &tile);
-
- struct DenoiseBuffers {
- int pass_stride;
- int passes;
- int stride;
- int h;
- int width;
- int frame_stride;
- device_only_memory<float> mem;
- device_only_memory<float> temporary_mem;
- bool use_time;
- bool use_intensity;
-
- bool gpu_temporary_mem;
-
- DenoiseBuffers(Device *device)
- : mem(device, "denoising pixel buffer"),
- temporary_mem(device, "denoising temporary mem", true)
- {
- }
- } buffer;
-
- protected:
- Device *device;
-
- void set_render_buffer(RenderTileNeighbors &neighbors);
- void setup_denoising_buffer();
- void prefilter_shadowing();
- void prefilter_features();
- void prefilter_color();
- void construct_transform();
- void reconstruct();
-
- void load_buffer();
- void write_buffer();
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_DENOISING_H__ */
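For cross-reference between the two removed files: the positional arguments of NLMState::set_parameters() map onto the fields documented above. For example, the buffer-variance smoothing call in prefilter_shadowing():

  nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
  /* r = 6 (search radius), f = 3 (patch size),
   * a = 4.0 (variance compensation), k_2 = 1.0, is_color = false. */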
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl b/intern/cycles/device/device_graphics_interop.cpp
index fa210e747c0..a80a236759f 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_path_init.cl
+++ b/intern/cycles/device/device_graphics_interop.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,8 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_path_init.h"
+#include "device/device_graphics_interop.h"
-#define KERNEL_NAME path_init
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+CCL_NAMESPACE_BEGIN
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_graphics_interop.h b/intern/cycles/device/device_graphics_interop.h
new file mode 100644
index 00000000000..671b1c189d7
--- /dev/null
+++ b/intern/cycles/device/device_graphics_interop.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Information about the interoperability destination.
+ * Provided by the GPUDisplay. */
+class DeviceGraphicsInteropDestination {
+ public:
+ /* Dimensions of the buffer, in pixels. */
+ int buffer_width = 0;
+ int buffer_height = 0;
+
+ /* OpenGL pixel buffer object. */
+ int opengl_pbo_id = 0;
+
+ /* Clear the entire destination before doing a partial write to it. */
+ bool need_clear = false;
+};
+
+/* Device-side graphics interoperability support.
+ *
+ * Takes care of holding all the handles needed by the device to implement interoperability with
+ * the graphics library. */
+class DeviceGraphicsInterop {
+ public:
+ DeviceGraphicsInterop() = default;
+ virtual ~DeviceGraphicsInterop() = default;
+
+ /* Update this device-side graphics interoperability object with the given destination resource
+ * information. */
+ virtual void set_destination(const DeviceGraphicsInteropDestination &destination) = 0;
+
+ virtual device_ptr map() = 0;
+ virtual void unmap() = 0;
+};
+
+CCL_NAMESPACE_END
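A conforming-implementation sketch (hypothetical class; a real backend would typically register destination.opengl_pbo_id with its GPU API in set_destination() and return the mapped buffer address from map()):

  class NullGraphicsInterop : public DeviceGraphicsInterop {
   public:
    virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override
    {
      /* A real implementation would (re)register the PBO here. */
      destination_ = destination;
    }

    virtual device_ptr map() override
    {
      return 0; /* A real implementation returns the mapped device pointer. */
    }

    virtual void unmap() override
    {
    }

   protected:
    DeviceGraphicsInteropDestination destination_;
  };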
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
deleted file mode 100644
index ecc79c5d7ee..00000000000
--- a/intern/cycles/device/device_intern.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_INTERN_H__
-#define __DEVICE_INTERN_H__
-
-#include "util/util_string.h"
-#include "util/util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-class Device;
-class DeviceInfo;
-class Profiler;
-class Stats;
-
-Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_opencl_init();
-Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_opencl_compile_kernel(const vector<string> &parameters);
-bool device_cuda_init();
-Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-bool device_optix_init();
-Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-
-Device *device_network_create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- const char *address);
-Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
-
-void device_cpu_info(vector<DeviceInfo> &devices);
-void device_opencl_info(vector<DeviceInfo> &devices);
-void device_cuda_info(vector<DeviceInfo> &devices);
-void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices);
-void device_network_info(vector<DeviceInfo> &devices);
-
-string device_cpu_capabilities();
-string device_opencl_capabilities();
-string device_cuda_capabilities();
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_INTERN_H__ */
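The removed header followed one pattern per backend: an optional *_init() check, a *_create() factory, a *_info() enumerator and a *_capabilities() report. A simplified sketch of how these were consumed (the real dispatch lived behind Device::create() in device.cpp):

  vector<DeviceInfo> devices;
  device_cpu_info(devices); /* inserts the CPU entry at the front */

  Stats stats;
  Profiler profiler;
  Device *device = device_cpu_create(devices[0], stats, profiler, /*background=*/true);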
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
new file mode 100644
index 00000000000..ceaddee4756
--- /dev/null
+++ b/intern/cycles/device/device_kernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_kernel.h"
+
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *device_kernel_as_string(DeviceKernel kernel)
+{
+ switch (kernel) {
+ /* Integrator. */
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
+ return "integrator_init_from_camera";
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
+ return "integrator_init_from_bake";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ return "integrator_intersect_closest";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ return "integrator_intersect_shadow";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ return "integrator_intersect_subsurface";
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ return "integrator_intersect_volume_stack";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ return "integrator_shade_background";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ return "integrator_shade_light";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ return "integrator_shade_shadow";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ return "integrator_shade_surface";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ return "integrator_shade_surface_raytrace";
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+ return "integrator_shade_volume";
+ case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
+ return "integrator_megakernel";
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+ return "integrator_queued_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+ return "integrator_queued_shadow_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+ return "integrator_active_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+ return "integrator_terminated_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+ return "integrator_sorted_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+ return "integrator_compact_paths_array";
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
+ return "integrator_compact_states";
+ case DEVICE_KERNEL_INTEGRATOR_RESET:
+ return "integrator_reset";
+ case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
+ return "integrator_shadow_catcher_count_possible_splits";
+
+ /* Shader evaluation. */
+ case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
+ return "shader_eval_displace";
+ case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
+ return "shader_eval_background";
+
+ /* Film. */
+
+#define FILM_CONVERT_KERNEL_AS_STRING(variant, variant_lowercase) \
+ case DEVICE_KERNEL_FILM_CONVERT_##variant: \
+ return "film_convert_" #variant_lowercase; \
+ case DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA: \
+ return "film_convert_" #variant_lowercase "_half_rgba";
+
+ FILM_CONVERT_KERNEL_AS_STRING(DEPTH, depth)
+ FILM_CONVERT_KERNEL_AS_STRING(MIST, mist)
+ FILM_CONVERT_KERNEL_AS_STRING(SAMPLE_COUNT, sample_count)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT, float)
+ FILM_CONVERT_KERNEL_AS_STRING(LIGHT_PATH, light_path)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT3, float3)
+ FILM_CONVERT_KERNEL_AS_STRING(MOTION, motion)
+ FILM_CONVERT_KERNEL_AS_STRING(CRYPTOMATTE, cryptomatte)
+ FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER, shadow_catcher)
+ FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER_MATTE_WITH_SHADOW,
+ shadow_catcher_matte_with_shadow)
+ FILM_CONVERT_KERNEL_AS_STRING(COMBINED, combined)
+ FILM_CONVERT_KERNEL_AS_STRING(FLOAT4, float4)
+
+#undef FILM_CONVERT_KERNEL_AS_STRING
+
+ /* Adaptive sampling. */
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK:
+ return "adaptive_sampling_convergence_check";
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X:
+ return "adaptive_sampling_filter_x";
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y:
+ return "adaptive_sampling_filter_y";
+
+ /* Denoising. */
+ case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS:
+ return "filter_guiding_preprocess";
+ case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO:
+ return "filter_guiding_set_fake_albedo";
+ case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS:
+ return "filter_color_preprocess";
+ case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS:
+ return "filter_color_postprocess";
+
+ /* Cryptomatte. */
+ case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
+ return "cryptomatte_postprocess";
+
+ /* Generic */
+ case DEVICE_KERNEL_PREFIX_SUM:
+ return "prefix_sum";
+
+ case DEVICE_KERNEL_NUM:
+ break;
+ }
+ LOG(FATAL) << "Unhandled kernel " << static_cast<int>(kernel) << ", should never happen.";
+ return "UNKNOWN";
+}
+
+std::ostream &operator<<(std::ostream &os, DeviceKernel kernel)
+{
+ os << device_kernel_as_string(kernel);
+ return os;
+}
+
+string device_kernel_mask_as_string(DeviceKernelMask mask)
+{
+ string str;
+
+ for (uint64_t i = 0; i < sizeof(DeviceKernelMask) * 8; i++) {
+ if (mask & (uint64_t(1) << i)) {
+ if (!str.empty()) {
+ str += " ";
+ }
+ str += device_kernel_as_string((DeviceKernel)i);
+ }
+ }
+
+ return str;
+}
+
+CCL_NAMESPACE_END
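A quick sketch of the mask helper above: each DeviceKernel value indexes one bit, so a mask built from two kernels stringifies to their names in increasing bit order (enum values assumed from kernel/kernel_types.h):

  DeviceKernelMask mask = 0;
  mask |= uint64_t(1) << DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA;
  mask |= uint64_t(1) << DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;

  /* Yields "integrator_init_from_camera integrator_shade_surface". */
  string names = device_kernel_mask_as_string(mask);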
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/device/device_kernel.h
index 9e1e57beba6..83d959ca87b 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
+++ b/intern/cycles/device/device_kernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,13 +14,20 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#pragma once
-#define KERNEL_NAME holdout_emission_blurring_pathtermination_ao
-#define LOCALS_TYPE BackgroundAOLocals
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
+#include "kernel/kernel_types.h"
+#include "util/util_string.h"
+
+#include <ostream> // NOLINT
+
+CCL_NAMESPACE_BEGIN
+
+const char *device_kernel_as_string(DeviceKernel kernel);
+std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);
+
+typedef uint64_t DeviceKernelMask;
+string device_kernel_mask_as_string(DeviceKernelMask mask);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index 80a05fc32fe..c4d45829b83 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN
device_memory::device_memory(Device *device, const char *name, MemoryType type)
: data_type(device_type_traits<uchar>::data_type),
- data_elements(device_type_traits<uchar>::num_elements),
+ data_elements(device_type_traits<uchar>::num_elements_cpu),
data_size(0),
device_size(0),
data_width(0),
@@ -149,6 +149,11 @@ void device_memory::device_zero()
}
}
+bool device_memory::device_is_cpu()
+{
+ return (device->info.type == DEVICE_CPU);
+}
+
void device_memory::swap_device(Device *new_device,
size_t new_device_size,
device_ptr new_device_ptr)
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 80f4d7b0468..c51594b8580 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -38,7 +38,6 @@ enum MemoryType {
MEM_DEVICE_ONLY,
MEM_GLOBAL,
MEM_TEXTURE,
- MEM_PIXELS
};
/* Supported Data Types */
@@ -54,7 +53,7 @@ enum DataType {
TYPE_UINT64,
};
-static inline size_t datatype_size(DataType datatype)
+static constexpr size_t datatype_size(DataType datatype)
{
switch (datatype) {
case TYPE_UNKNOWN:
@@ -82,112 +81,155 @@ static inline size_t datatype_size(DataType datatype)
template<typename T> struct device_type_traits {
static const DataType data_type = TYPE_UNKNOWN;
- static const int num_elements = sizeof(T);
+ static const int num_elements_cpu = sizeof(T);
+ static const int num_elements_gpu = sizeof(T);
};
template<> struct device_type_traits<uchar> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar2> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar3> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 3;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uchar4> {
static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint2> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint3> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 3;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint4> {
static const DataType data_type = TYPE_UINT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int2> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int3> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 3;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<int4> {
static const DataType data_type = TYPE_INT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float2> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 2;
+ static const int num_elements_cpu = 2;
+ static const int num_elements_gpu = 2;
+ static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float3> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 3;
+ static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<float4> {
static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<half> {
static const DataType data_type = TYPE_HALF;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<ushort4> {
static const DataType data_type = TYPE_UINT16;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint16_t> {
static const DataType data_type = TYPE_UINT16;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<half4> {
static const DataType data_type = TYPE_HALF;
- static const int num_elements = 4;
+ static const int num_elements_cpu = 4;
+ static const int num_elements_gpu = 4;
+ static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type));
};
template<> struct device_type_traits<uint64_t> {
static const DataType data_type = TYPE_UINT64;
- static const int num_elements = 1;
+ static const int num_elements_cpu = 1;
+ static const int num_elements_gpu = 1;
+ static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type));
};
/* Device Memory
@@ -257,6 +299,8 @@ class device_memory {
void device_copy_from(int y, int w, int h, int elem);
void device_zero();
+ bool device_is_cpu();
+
device_ptr original_device_ptr;
size_t original_device_size;
Device *original_device;
@@ -275,7 +319,9 @@ template<typename T> class device_only_memory : public device_memory {
: device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY)
{
data_type = device_type_traits<T>::data_type;
- data_elements = max(device_type_traits<T>::num_elements, 1);
+ data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu :
+ device_type_traits<T>::num_elements_gpu,
+ 1);
}
device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other))
@@ -331,11 +377,15 @@ template<typename T> class device_only_memory : public device_memory {
template<typename T> class device_vector : public device_memory {
public:
+ /* Can only use this for types that have the same size on CPU and GPU. */
+ static_assert(device_type_traits<T>::num_elements_cpu ==
+ device_type_traits<T>::num_elements_gpu);
+
device_vector(Device *device, const char *name, MemoryType type)
: device_memory(device, name, type)
{
data_type = device_type_traits<T>::data_type;
- data_elements = device_type_traits<T>::num_elements;
+ data_elements = device_type_traits<T>::num_elements_cpu;
modified = true;
need_realloc_ = true;
@@ -477,6 +527,11 @@ template<typename T> class device_vector : public device_memory {
return (T *)host_pointer;
}
+ const T *data() const
+ {
+ return (T *)host_pointer;
+ }
+
T &operator[](size_t i)
{
assert(i < data_size);
@@ -507,7 +562,7 @@ template<typename T> class device_vector : public device_memory {
void copy_from_device()
{
- device_copy_from(0, data_width, data_height, sizeof(T));
+ device_copy_from(0, data_width, (data_height == 0) ? 1 : data_height, sizeof(T));
}
void copy_from_device(int y, int w, int h)
@@ -535,33 +590,6 @@ template<typename T> class device_vector : public device_memory {
}
};
-/* Pixel Memory
- *
- * Device memory to efficiently draw as pixels to the screen in interactive
- * rendering. Only copying pixels from the device is supported, not copying to. */
-
-template<typename T> class device_pixels : public device_vector<T> {
- public:
- device_pixels(Device *device, const char *name) : device_vector<T>(device, name, MEM_PIXELS)
- {
- }
-
- void alloc_to_device(size_t width, size_t height, size_t depth = 0)
- {
- device_vector<T>::alloc(width, height, depth);
-
- if (!device_memory::device_pointer) {
- device_memory::device_alloc();
- }
- }
-
- T *copy_from_device(int y, int w, int h)
- {
- device_memory::device_copy_from(y, w, h, sizeof(T));
- return device_vector<T>::data();
- }
-};
-
/* Device Sub Memory
*
* Pointer into existing memory. It is not allocated separately, but created
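One consequence of the num_elements_cpu / num_elements_gpu split above worth spelling out: float3 (and int3) stay padded to four elements on the CPU for SSE alignment but pack into three elements on the GPU, so the same device_only_memory<float3> occupies different byte counts per backend. An illustrative helper (hypothetical, not in the tree):

  size_t float3_buffer_bytes(size_t n, bool is_cpu)
  {
    const int elems = is_cpu ? device_type_traits<float3>::num_elements_cpu  /* 4 */
                             : device_type_traits<float3>::num_elements_gpu; /* 3 */
    return n * elems * datatype_size(device_type_traits<float3>::data_type);
  }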
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
deleted file mode 100644
index 85ffa5fcd52..00000000000
--- a/intern/cycles/device/device_multi.cpp
+++ /dev/null
@@ -1,826 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <sstream>
-#include <stdlib.h>
-
-#include "bvh/bvh_multi.h"
-
-#include "device/device.h"
-#include "device/device_intern.h"
-#include "device/device_network.h"
-
-#include "render/buffers.h"
-#include "render/geometry.h"
-
-#include "util/util_foreach.h"
-#include "util/util_list.h"
-#include "util/util_logging.h"
-#include "util/util_map.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-class MultiDevice : public Device {
- public:
- struct SubDevice {
- Stats stats;
- Device *device;
- map<device_ptr, device_ptr> ptr_map;
- int peer_island_index = -1;
- };
-
- list<SubDevice> devices, denoising_devices;
- device_ptr unique_key;
- vector<vector<SubDevice *>> peer_islands;
- bool use_denoising;
- bool matching_rendering_and_denoising_devices;
-
- MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_),
- unique_key(1),
- use_denoising(!info.denoising_devices.empty())
- {
- foreach (DeviceInfo &subinfo, info.multi_devices) {
- /* Always add CPU devices at the back since GPU devices can change
- * host memory pointers, which the CPU uses as device pointers. */
- SubDevice *sub;
- if (subinfo.type == DEVICE_CPU) {
- devices.emplace_back();
- sub = &devices.back();
- }
- else {
- devices.emplace_front();
- sub = &devices.front();
- }
-
- /* The pointer to 'sub->stats' will stay valid even after new devices
- * are added, since 'devices' is a linked list. */
- sub->device = Device::create(subinfo, sub->stats, profiler, background);
- }
-
- foreach (DeviceInfo &subinfo, info.denoising_devices) {
- denoising_devices.emplace_front();
- SubDevice *sub = &denoising_devices.front();
-
- sub->device = Device::create(subinfo, sub->stats, profiler, background);
- }
-
- /* Build a list of peer islands for the available render devices */
- foreach (SubDevice &sub, devices) {
- /* First, ensure that every device is in at least one peer island. */
- if (sub.peer_island_index < 0) {
- peer_islands.emplace_back();
- sub.peer_island_index = (int)peer_islands.size() - 1;
- peer_islands[sub.peer_island_index].push_back(&sub);
- }
-
- if (!info.has_peer_memory) {
- continue;
- }
-
- /* Second, check peer access between devices and fill up the islands accordingly. */
- foreach (SubDevice &peer_sub, devices) {
- if (peer_sub.peer_island_index < 0 &&
- peer_sub.device->info.type == sub.device->info.type &&
- peer_sub.device->check_peer_access(sub.device)) {
- peer_sub.peer_island_index = sub.peer_island_index;
- peer_islands[sub.peer_island_index].push_back(&peer_sub);
- }
- }
- }
-
- /* Try to re-use memory when denoising and render devices use the same physical devices
- * (e.g. OptiX denoising and CUDA rendering device pointing to the same GPU).
- * Ordering has to match as well, so that 'DeviceTask::split' behaves consistently. */
- matching_rendering_and_denoising_devices = denoising_devices.empty() ||
- (devices.size() == denoising_devices.size());
- if (matching_rendering_and_denoising_devices) {
- for (list<SubDevice>::iterator device_it = devices.begin(),
- denoising_device_it = denoising_devices.begin();
- device_it != devices.end() && denoising_device_it != denoising_devices.end();
- ++device_it, ++denoising_device_it) {
- const DeviceInfo &info = device_it->device->info;
- const DeviceInfo &denoising_info = denoising_device_it->device->info;
- if ((info.type != DEVICE_CUDA && info.type != DEVICE_OPTIX) ||
- (denoising_info.type != DEVICE_CUDA && denoising_info.type != DEVICE_OPTIX) ||
- info.num != denoising_info.num) {
- matching_rendering_and_denoising_devices = false;
- break;
- }
- }
- }
-
-#ifdef WITH_NETWORK
- /* try to add network devices */
- ServerDiscovery discovery(true);
- time_sleep(1.0);
-
- vector<string> servers = discovery.get_server_list();
-
- foreach (string &server, servers) {
- Device *device = device_network_create(info, stats, profiler, server.c_str());
- if (device)
- devices.push_back(SubDevice(device));
- }
-#endif
- }
-
- ~MultiDevice()
- {
- foreach (SubDevice &sub, devices)
- delete sub.device;
- foreach (SubDevice &sub, denoising_devices)
- delete sub.device;
- }
-
- const string &error_message() override
- {
- error_msg.clear();
-
- foreach (SubDevice &sub, devices)
- error_msg += sub.device->error_message();
- foreach (SubDevice &sub, denoising_devices)
- error_msg += sub.device->error_message();
-
- return error_msg;
- }
-
- virtual bool show_samples() const override
- {
- if (devices.size() > 1) {
- return false;
- }
- return devices.front().device->show_samples();
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const override
- {
- BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
- BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
- foreach (const SubDevice &sub_device, devices) {
- BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
- bvh_layout_mask &= device_bvh_layout_mask;
- bvh_layout_mask_all |= device_bvh_layout_mask;
- }
-
- /* With multiple OptiX devices, every device needs its own acceleration structure */
- if (bvh_layout_mask == BVH_LAYOUT_OPTIX) {
- return BVH_LAYOUT_MULTI_OPTIX;
- }
-
- /* When devices do not share a common BVH layout, fall back to creating one for each */
- const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
- if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
- return BVH_LAYOUT_MULTI_OPTIX_EMBREE;
- }
-
- return bvh_layout_mask;
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features) override
- {
- foreach (SubDevice &sub, devices)
- if (!sub.device->load_kernels(requested_features))
- return false;
-
- use_denoising = requested_features.use_denoising;
- if (requested_features.use_denoising) {
- /* Only need denoising feature, everything else is unused. */
- DeviceRequestedFeatures denoising_features;
- denoising_features.use_denoising = true;
- foreach (SubDevice &sub, denoising_devices)
- if (!sub.device->load_kernels(denoising_features))
- return false;
- }
-
- return true;
- }
-
- bool wait_for_availability(const DeviceRequestedFeatures &requested_features) override
- {
- foreach (SubDevice &sub, devices)
- if (!sub.device->wait_for_availability(requested_features))
- return false;
-
- if (requested_features.use_denoising) {
- foreach (SubDevice &sub, denoising_devices)
- if (!sub.device->wait_for_availability(requested_features))
- return false;
- }
-
- return true;
- }
-
- DeviceKernelStatus get_active_kernel_switch_state() override
- {
- DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL;
-
- foreach (SubDevice &sub, devices) {
- DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state();
- switch (subresult) {
- case DEVICE_KERNEL_FEATURE_KERNEL_INVALID:
- case DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE:
- return subresult;
-
- case DEVICE_KERNEL_USING_FEATURE_KERNEL:
- case DEVICE_KERNEL_UNKNOWN:
- break;
- }
- }
-
- return result;
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
- /* Try to build and share a single acceleration structure, if possible */
- if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
- devices.back().device->build_bvh(bvh, progress, refit);
- return;
- }
-
- assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX ||
- bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE);
-
- BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
- bvh_multi->sub_bvhs.resize(devices.size());
-
- vector<BVHMulti *> geom_bvhs;
- geom_bvhs.reserve(bvh->geometry.size());
- foreach (Geometry *geom, bvh->geometry) {
- geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh));
- }
-
- /* Broadcast acceleration structure build to all render devices */
- size_t i = 0;
- foreach (SubDevice &sub, devices) {
- /* Change geometry BVH pointers to the sub BVH */
- for (size_t k = 0; k < bvh->geometry.size(); ++k) {
- bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i];
- }
-
- if (!bvh_multi->sub_bvhs[i]) {
- BVHParams params = bvh->params;
- if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
- params.bvh_layout = BVH_LAYOUT_OPTIX;
- else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
- params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
- BVH_LAYOUT_EMBREE;
-
- /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
- * (since they are put into the top level directly, see bvh_embree.cpp) */
- if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
- !bvh->geometry[0]->is_instanced()) {
- i++;
- continue;
- }
-
- bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device);
- }
-
- sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit);
- i++;
- }
-
- /* Change geometry BVH pointers back to the multi BVH. */
- for (size_t k = 0; k < bvh->geometry.size(); ++k) {
- bvh->geometry[k]->bvh = geom_bvhs[k];
- }
- }
-
- virtual void *osl_memory() override
- {
- if (devices.size() > 1) {
- return NULL;
- }
- return devices.front().device->osl_memory();
- }
-
- bool is_resident(device_ptr key, Device *sub_device) override
- {
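- /* Memory counts as resident only when this sub-device owns the allocation
- * within its peer island. */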
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device) {
- return find_matching_mem_device(key, sub)->device == sub_device;
- }
- }
- return false;
- }
-
- SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
- {
- assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end()));
-
- /* Get the memory owner of this key (first try current device, then peer devices) */
- SubDevice *owner_sub = &sub;
- if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
- foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
- if (island_sub != owner_sub &&
- island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
- owner_sub = island_sub;
- }
- }
- }
- return owner_sub;
- }
-
- SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
- {
- assert(!island.empty());
-
- /* Get the memory owner of this key or the device with the lowest memory usage when new */
- SubDevice *owner_sub = island.front();
- foreach (SubDevice *island_sub, island) {
- if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
- (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
- owner_sub = island_sub;
- }
- }
- return owner_sub;
- }
-
- inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
- {
- return find_matching_mem_device(key, sub)->ptr_map[key];
- }
-
- void mem_alloc(device_memory &mem) override
- {
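- /* The multi-device hands out a virtual key as the device pointer; each
- * sub-device's real pointer is stored in its ptr_map under that key. */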
- device_ptr key = unique_key++;
-
- if (mem.type == MEM_PIXELS) {
- /* Always allocate pixels memory on all devices.
- * This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses. */
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- sub.device->mem_alloc(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- else {
- assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE ||
- mem.type == MEM_DEVICE_ONLY);
- /* The remaining memory types can be distributed across devices */
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- owner_sub->device->mem_alloc(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size);
- }
-
- void mem_copy_to(device_memory &mem) override
- {
- device_ptr existing_key = mem.device_pointer;
- device_ptr key = (existing_key) ? existing_key : unique_key++;
- size_t existing_size = mem.device_size;
-
- /* The tile buffers are allocated on each device (see below), so copy to all of them */
- if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) {
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_copy_to(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- owner_sub->device->mem_copy_to(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
-
- if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
- /* Need to create texture objects and update pointer in kernel globals on all devices */
- foreach (SubDevice *island_sub, island) {
- if (island_sub != owner_sub) {
- island_sub->device->mem_copy_to(mem);
- }
- }
- }
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size - existing_size);
- }
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
- {
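- /* Split the copy vertically across the render devices; the last device
- * also reads back any remainder rows. */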
- device_ptr key = mem.device_pointer;
- int i = 0, sub_h = h / devices.size();
-
- foreach (SubDevice &sub, devices) {
- int sy = y + i * sub_h;
- int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
-
- SubDevice *owner_sub = find_matching_mem_device(key, sub);
- mem.device = owner_sub->device;
- mem.device_pointer = owner_sub->ptr_map[key];
-
- owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
- i++;
- }
-
- mem.device = this;
- mem.device_pointer = key;
- }
-
- void mem_zero(device_memory &mem) override
- {
- device_ptr existing_key = mem.device_pointer;
- device_ptr key = (existing_key) ? existing_key : unique_key++;
- size_t existing_size = mem.device_size;
-
- /* This is a hack to also allocate the tile buffers on the denoising devices.
- * The tile buffers likewise need separate allocations on all devices, so that
- * overlaps rendered for denoising do not interfere with each other. */
- if (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising) {
- vector<device_ptr> device_pointers;
- device_pointers.reserve(devices.size());
-
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_zero(mem);
- sub.ptr_map[key] = mem.device_pointer;
-
- device_pointers.push_back(mem.device_pointer);
- }
- foreach (SubDevice &sub, denoising_devices) {
- if (matching_rendering_and_denoising_devices) {
- sub.ptr_map[key] = device_pointers.front();
- device_pointers.erase(device_pointers.begin());
- }
- else {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_zero(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
- mem.device = owner_sub->device;
- mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- owner_sub->device->mem_zero(mem);
- owner_sub->ptr_map[key] = mem.device_pointer;
- }
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size - existing_size);
- }
-
- void mem_free(device_memory &mem) override
- {
- device_ptr key = mem.device_pointer;
- size_t existing_size = mem.device_size;
-
- /* Free memory that was allocated for all devices (see above) on each device */
- if (mem.type == MEM_PIXELS || (strcmp(mem.name, "RenderBuffers") == 0 && use_denoising)) {
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
- mem.device_size = existing_size;
-
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(key));
- }
- foreach (SubDevice &sub, denoising_devices) {
- if (matching_rendering_and_denoising_devices) {
- sub.ptr_map.erase(key);
- }
- else {
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
- mem.device_size = existing_size;
-
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(key));
- }
- }
- }
- else {
- foreach (const vector<SubDevice *> &island, peer_islands) {
- SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
- mem.device = owner_sub->device;
- mem.device_pointer = owner_sub->ptr_map[key];
- mem.device_size = existing_size;
-
- owner_sub->device->mem_free(mem);
- owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
-
- if (mem.type == MEM_TEXTURE) {
- /* Free texture objects on all devices */
- foreach (SubDevice *island_sub, island) {
- if (island_sub != owner_sub) {
- island_sub->device->mem_free(mem);
- }
- }
- }
- }
- }
-
- mem.device = this;
- mem.device_pointer = 0;
- mem.device_size = 0;
- stats.mem_free(existing_size);
- }
-
- void const_copy_to(const char *name, void *host, size_t size) override
- {
- foreach (SubDevice &sub, devices)
- sub.device->const_copy_to(name, host, size);
- }
-
- void draw_pixels(device_memory &rgba,
- int y,
- int w,
- int h,
- int width,
- int height,
- int dx,
- int dy,
- int dw,
- int dh,
- bool transparent,
- const DeviceDrawParams &draw_params) override
- {
- assert(rgba.type == MEM_PIXELS);
-
- device_ptr key = rgba.device_pointer;
- int i = 0, sub_h = h / devices.size();
- int sub_height = height / devices.size();
-
- foreach (SubDevice &sub, devices) {
- int sy = y + i * sub_h;
- int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
- int sheight = (i == (int)devices.size() - 1) ? height - sub_height * i : sub_height;
- int sdy = dy + i * sub_height;
- /* TODO: adjust the math for w/width; only the height is currently split across devices. */
-
- rgba.device_pointer = sub.ptr_map[key];
- sub.device->draw_pixels(
- rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params);
- i++;
- }
-
- rgba.device_pointer = key;
- }
-
- void map_tile(Device *sub_device, RenderTile &tile) override
- {
- if (!tile.buffer) {
- return;
- }
-
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device) {
- tile.buffer = find_matching_mem(tile.buffer, sub);
- return;
- }
- }
-
- foreach (SubDevice &sub, denoising_devices) {
- if (sub.device == sub_device) {
- tile.buffer = sub.ptr_map[tile.buffer];
- return;
- }
- }
- }
-
- int device_number(Device *sub_device) override
- {
- int i = 0;
-
- foreach (SubDevice &sub, devices) {
- if (sub.device == sub_device)
- return i;
- i++;
- }
-
- foreach (SubDevice &sub, denoising_devices) {
- if (sub.device == sub_device)
- return i;
- i++;
- }
-
- return -1;
- }
-
- void map_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override
- {
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &tile = neighbors.tiles[i];
-
- if (!tile.buffers) {
- continue;
- }
-
- device_vector<float> &mem = tile.buffers->buffer;
- tile.buffer = mem.device_pointer;
-
- if (mem.device == this && matching_rendering_and_denoising_devices) {
- /* Skip unnecessary copies in viewport mode (buffer covers the
- * whole image), but still need to fix up the tile device pointer. */
- map_tile(sub_device, tile);
- continue;
- }
-
- /* If the tile was rendered on another device, copy its memory
- * to the current device now, for the duration of the denoising task.
- * Note that this temporarily modifies the RenderBuffers and calls
- * the device, so this function is not thread safe. */
- if (mem.device != sub_device) {
- /* Only copy from device to host once. This is faster, but
- * also required for the case where a CPU thread is denoising
- * a tile rendered on the GPU. In that case we have to avoid
- * overwriting the buffer being de-noised by the CPU thread. */
- if (!tile.buffers->map_neighbor_copied) {
- tile.buffers->map_neighbor_copied = true;
- mem.copy_from_device();
- }
-
- if (mem.device == this) {
- /* Can re-use memory if tile is already allocated on the sub device. */
- map_tile(sub_device, tile);
- mem.swap_device(sub_device, mem.device_size, tile.buffer);
- }
- else {
- mem.swap_device(sub_device, 0, 0);
- }
-
- mem.copy_to_device();
-
- tile.buffer = mem.device_pointer;
- tile.device_size = mem.device_size;
-
- mem.restore_device();
- }
- }
- }
-
- void unmap_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) override
- {
- RenderTile &target_tile = neighbors.target;
- device_vector<float> &mem = target_tile.buffers->buffer;
-
- if (mem.device == this && matching_rendering_and_denoising_devices) {
- return;
- }
-
- /* Copy denoised result back to the host. */
- mem.swap_device(sub_device, target_tile.device_size, target_tile.buffer);
- mem.copy_from_device();
- mem.restore_device();
-
- /* Copy denoised result to the original device. */
- mem.copy_to_device();
-
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- RenderTile &tile = neighbors.tiles[i];
- if (!tile.buffers) {
- continue;
- }
-
- device_vector<float> &mem = tile.buffers->buffer;
-
- if (mem.device != sub_device && mem.device != this) {
- /* Free up memory again if it was allocated for the copy above. */
- mem.swap_device(sub_device, tile.device_size, tile.buffer);
- sub_device->mem_free(mem);
- mem.restore_device();
- }
- }
- }
-
- int get_split_task_count(DeviceTask &task) override
- {
- int total_tasks = 0;
- list<DeviceTask> tasks;
- task.split(tasks, devices.size());
- foreach (SubDevice &sub, devices) {
- if (!tasks.empty()) {
- DeviceTask subtask = tasks.front();
- tasks.pop_front();
-
- total_tasks += sub.device->get_split_task_count(subtask);
- }
- }
- return total_tasks;
- }
-
- void task_add(DeviceTask &task) override
- {
- list<SubDevice> task_devices = devices;
- if (!denoising_devices.empty()) {
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- /* Denoising tasks should be redirected to the denoising devices entirely. */
- task_devices = denoising_devices;
- }
- else if (task.type == DeviceTask::RENDER && (task.tile_types & RenderTile::DENOISE)) {
- const uint tile_types = task.tile_types;
- /* For normal rendering tasks, only redirect the denoising part to the denoising devices.
- * There is no need to split the task here, since all tiles run through 'acquire_tile'. */
- task.tile_types = RenderTile::DENOISE;
- foreach (SubDevice &sub, denoising_devices) {
- sub.device->task_add(task);
- }
- /* Rendering itself should still be executed on the rendering devices. */
- task.tile_types = tile_types ^ RenderTile::DENOISE;
- }
- }
-
- list<DeviceTask> tasks;
- task.split(tasks, task_devices.size());
-
- foreach (SubDevice &sub, task_devices) {
- if (!tasks.empty()) {
- DeviceTask subtask = tasks.front();
- tasks.pop_front();
-
- if (task.buffer)
- subtask.buffer = find_matching_mem(task.buffer, sub);
- if (task.rgba_byte)
- subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
- if (task.rgba_half)
- subtask.rgba_half = sub.ptr_map[task.rgba_half];
- if (task.shader_input)
- subtask.shader_input = find_matching_mem(task.shader_input, sub);
- if (task.shader_output)
- subtask.shader_output = find_matching_mem(task.shader_output, sub);
-
- sub.device->task_add(subtask);
-
- if (task.buffers && task.buffers->buffer.device == this) {
- /* Synchronize access to RenderBuffers, since 'map_neighbor_tiles' is not thread-safe. */
- sub.device->task_wait();
- }
- }
- }
- }
-
- void task_wait() override
- {
- foreach (SubDevice &sub, devices)
- sub.device->task_wait();
- foreach (SubDevice &sub, denoising_devices)
- sub.device->task_wait();
- }
-
- void task_cancel() override
- {
- foreach (SubDevice &sub, devices)
- sub.device->task_cancel();
- foreach (SubDevice &sub, denoising_devices)
- sub.device->task_cancel();
- }
-};
-
-Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new MultiDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
deleted file mode 100644
index 8904b517e92..00000000000
--- a/intern/cycles/device/device_network.cpp
+++ /dev/null
@@ -1,812 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_network.h"
-#include "device/device.h"
-#include "device/device_intern.h"
-
-#include "util/util_foreach.h"
-#include "util/util_logging.h"
-
-#if defined(WITH_NETWORK)
-
-CCL_NAMESPACE_BEGIN
-
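-/* The server side keeps a host data buffer per client pointer, plus mappings
- * between client pointers and real device pointers. */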
-typedef map<device_ptr, device_ptr> PtrMap;
-typedef vector<uint8_t> DataVector;
-typedef map<device_ptr, DataVector> DataMap;
-
-/* tile list */
-typedef vector<RenderTile> TileList;
-
-/* search a list of tiles and find the one that matches the passed render tile */
-static TileList::iterator tile_list_find(TileList &tile_list, RenderTile &tile)
-{
- for (TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it)
- if (tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample)
- return it;
- return tile_list.end();
-}
-
-class NetworkDevice : public Device {
- public:
- boost::asio::io_service io_service;
- tcp::socket socket;
- device_ptr mem_counter;
- DeviceTask the_task; /* todo: handle multiple tasks */
-
- thread_mutex rpc_lock;
-
- virtual bool show_samples() const
- {
- return false;
- }
-
- NetworkDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, const char *address)
- : Device(info, stats, profiler, true), socket(io_service)
- {
- error_func = NetworkError();
- stringstream portstr;
- portstr << SERVER_PORT;
-
- tcp::resolver resolver(io_service);
- tcp::resolver::query query(address, portstr.str());
- tcp::resolver::iterator endpoint_iterator = resolver.resolve(query);
- tcp::resolver::iterator end;
-
- boost::system::error_code error = boost::asio::error::host_not_found;
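- /* Try each resolved endpoint in turn until a connection succeeds. */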
- while (error && endpoint_iterator != end) {
- socket.close();
- socket.connect(*endpoint_iterator++, error);
- }
-
- if (error)
- error_func.network_error(error.message());
-
- mem_counter = 0;
- }
-
- ~NetworkDevice()
- {
- RPCSend snd(socket, &error_func, "stop");
- snd.write();
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const
- {
- return BVH_LAYOUT_BVH2;
- }
-
- void mem_alloc(device_memory &mem)
- {
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- thread_scoped_lock lock(rpc_lock);
-
- mem.device_pointer = ++mem_counter;
-
- RPCSend snd(socket, &error_func, "mem_alloc");
- snd.add(mem);
- snd.write();
- }
-
- void mem_copy_to(device_memory &mem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_copy_to");
-
- snd.add(mem);
- snd.write();
- snd.write_buffer(mem.host_pointer, mem.memory_size());
- }
-
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- size_t data_size = mem.memory_size();
-
- RPCSend snd(socket, &error_func, "mem_copy_from");
-
- snd.add(mem);
- snd.add(y);
- snd.add(w);
- snd.add(h);
- snd.add(elem);
- snd.write();
-
- RPCReceive rcv(socket, &error_func);
- rcv.read_buffer(mem.host_pointer, data_size);
- }
-
- void mem_zero(device_memory &mem)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_zero");
-
- snd.add(mem);
- snd.write();
- }
-
- void mem_free(device_memory &mem)
- {
- if (mem.device_pointer) {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_free");
-
- snd.add(mem);
- snd.write();
-
- mem.device_pointer = 0;
- }
- }
-
- void const_copy_to(const char *name, void *host, size_t size)
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "const_copy_to");
-
- string name_string(name);
-
- snd.add(name_string);
- snd.add(size);
- snd.write();
- snd.write_buffer(host, size);
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features)
- {
- if (error_func.have_error())
- return false;
-
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "load_kernels");
- snd.add(requested_features.experimental);
- snd.add(requested_features.max_closure);
- snd.add(requested_features.max_nodes_group);
- snd.add(requested_features.nodes_features);
- snd.write();
-
- bool result;
- RPCReceive rcv(socket, &error_func);
- rcv.read(result);
-
- return result;
- }
-
- void task_add(DeviceTask &task)
- {
- thread_scoped_lock lock(rpc_lock);
-
- the_task = task;
-
- RPCSend snd(socket, &error_func, "task_add");
- snd.add(task);
- snd.write();
- }
-
- void task_wait()
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "task_wait");
- snd.write();
-
- lock.unlock();
-
- TileList the_tiles;
-
- /* todo: run this threaded for connecting to multiple clients */
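- /* Answer acquire/release tile requests coming back from the render server
- * until it signals 'task_wait_done'. */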
- for (;;) {
- if (error_func.have_error())
- break;
-
- RenderTile tile;
-
- lock.lock();
- RPCReceive rcv(socket, &error_func);
-
- if (rcv.name == "acquire_tile") {
- lock.unlock();
-
- /* todo: watch out for recursive calls! */
- if (the_task.acquire_tile(this, tile)) { /* write return as bool */
- the_tiles.push_back(tile);
-
- lock.lock();
- RPCSend snd(socket, &error_func, "acquire_tile");
- snd.add(tile);
- snd.write();
- lock.unlock();
- }
- else {
- lock.lock();
- RPCSend snd(socket, &error_func, "acquire_tile_none");
- snd.write();
- lock.unlock();
- }
- }
- else if (rcv.name == "release_tile") {
- rcv.read(tile);
- lock.unlock();
-
- TileList::iterator it = tile_list_find(the_tiles, tile);
- if (it != the_tiles.end()) {
- tile.buffers = it->buffers;
- the_tiles.erase(it);
- }
-
- assert(tile.buffers != NULL);
-
- the_task.release_tile(tile);
-
- lock.lock();
- RPCSend snd(socket, &error_func, "release_tile");
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_wait_done") {
- lock.unlock();
- break;
- }
- else
- lock.unlock();
- }
- }
-
- void task_cancel()
- {
- thread_scoped_lock lock(rpc_lock);
- RPCSend snd(socket, &error_func, "task_cancel");
- snd.write();
- }
-
- int get_split_task_count(DeviceTask &)
- {
- return 1;
- }
-
- private:
- NetworkError error_func;
-};
-
-Device *device_network_create(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- const char *address)
-{
- return new NetworkDevice(info, stats, profiler, address);
-}
-
-void device_network_info(vector<DeviceInfo> &devices)
-{
- DeviceInfo info;
-
- info.type = DEVICE_NETWORK;
- info.description = "Network Device";
- info.id = "NETWORK";
- info.num = 0;
-
- /* todo: get this info from device */
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.has_osl = false;
- info.denoisers = DENOISER_NONE;
-
- devices.push_back(info);
-}
-
-class DeviceServer {
- public:
- thread_mutex rpc_lock;
-
- void network_error(const string &message)
- {
- error_func.network_error(message);
- }
-
- bool have_error()
- {
- return error_func.have_error();
- }
-
- DeviceServer(Device *device_, tcp::socket &socket_)
- : device(device_), socket(socket_), stop(false), blocked_waiting(false)
- {
- error_func = NetworkError();
- }
-
- void listen()
- {
- /* receive remote function calls */
- for (;;) {
- listen_step();
-
- if (stop)
- break;
- }
- }
-
- protected:
- void listen_step()
- {
- thread_scoped_lock lock(rpc_lock);
- RPCReceive rcv(socket, &error_func);
-
- if (rcv.name == "stop")
- stop = true;
- else
- process(rcv, lock);
- }
-
- /* create a memory buffer for a device buffer and insert it into mem_data */
- DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size)
- {
- /* create a new DataVector and insert it into mem_data */
- pair<DataMap::iterator, bool> data_ins = mem_data.insert(
- DataMap::value_type(client_pointer, DataVector()));
-
- /* make sure it was a unique insertion */
- assert(data_ins.second);
-
- /* get a reference to the inserted vector */
- DataVector &data_v = data_ins.first->second;
-
- /* size the vector */
- data_v.resize(data_size);
-
- return data_v;
- }
-
- DataVector &data_vector_find(device_ptr client_pointer)
- {
- DataMap::iterator i = mem_data.find(client_pointer);
- assert(i != mem_data.end());
- return i->second;
- }
-
- /* setup mapping and reverse mapping of client_pointer<->real_pointer */
- void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer)
- {
- pair<PtrMap::iterator, bool> mapins;
-
- /* insert mapping from client pointer to our real device pointer */
- mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer));
- assert(mapins.second);
-
- /* insert reverse mapping from real our device pointer to client pointer */
- mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer));
- assert(mapins.second);
- }
-
- device_ptr device_ptr_from_client_pointer(device_ptr client_pointer)
- {
- PtrMap::iterator i = ptr_map.find(client_pointer);
- assert(i != ptr_map.end());
- return i->second;
- }
-
- device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer)
- {
- PtrMap::iterator i = ptr_map.find(client_pointer);
- assert(i != ptr_map.end());
-
- device_ptr result = i->second;
-
- /* erase the mapping */
- ptr_map.erase(i);
-
- /* erase the reverse mapping */
- PtrMap::iterator irev = ptr_imap.find(result);
- assert(irev != ptr_imap.end());
- ptr_imap.erase(irev);
-
- /* erase the data vector */
- DataMap::iterator idata = mem_data.find(client_pointer);
- assert(idata != mem_data.end());
- mem_data.erase(idata);
-
- return result;
- }
-
- /* Note that the lock must already be held upon entry.
- * This is necessary because the caller often peeks at
- * the header and delegates control to here when it doesn't
- * specifically handle the current RPC.
- * The lock must be unlocked before returning. */
- void process(RPCReceive &rcv, thread_scoped_lock &lock)
- {
- if (rcv.name == "mem_alloc") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- /* Allocate host side data buffer. */
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
-
- /* Perform the allocation on the actual device. */
- device->mem_alloc(mem);
-
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- else if (rcv.name == "mem_copy_to") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- if (client_pointer) {
- /* Lookup existing host side data buffer. */
- DataVector &data_v = data_vector_find(client_pointer);
- mem.host_pointer = (void *)&data_v[0];
-
- /* Translate the client pointer to a real device pointer. */
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
- }
- else {
- /* Allocate host side data buffer. */
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
- }
-
- /* Copy data from network into memory buffer. */
- rcv.read_buffer((uint8_t *)mem.host_pointer, data_size);
-
- /* Copy the data from the memory buffer to the device buffer. */
- device->mem_copy_to(mem);
-
- if (!client_pointer) {
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- }
- else if (rcv.name == "mem_copy_from") {
- string name;
- network_device_memory mem(device);
- int y, w, h, elem;
-
- rcv.read(mem, name);
- rcv.read(y);
- rcv.read(w);
- rcv.read(h);
- rcv.read(elem);
-
- device_ptr client_pointer = mem.device_pointer;
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
-
- DataVector &data_v = data_vector_find(client_pointer);
-
- mem.host_pointer = (void *)&(data_v[0]);
-
- device->mem_copy_from(mem, y, w, h, elem);
-
- size_t data_size = mem.memory_size();
-
- RPCSend snd(socket, &error_func, "mem_copy_from");
- snd.write();
- snd.write_buffer((uint8_t *)mem.host_pointer, data_size);
- lock.unlock();
- }
- else if (rcv.name == "mem_zero") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- if (client_pointer) {
- /* Lookup existing host side data buffer. */
- DataVector &data_v = data_vector_find(client_pointer);
- mem.host_pointer = (void *)&data_v[0];
-
- /* Translate the client pointer to a real device pointer. */
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
- }
- else {
- /* Allocate host side data buffer. */
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
- }
-
- /* Zero memory. */
- device->mem_zero(mem);
-
- if (!client_pointer) {
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- }
- else if (rcv.name == "mem_free") {
- string name;
- network_device_memory mem(device);
-
- rcv.read(mem, name);
- lock.unlock();
-
- device_ptr client_pointer = mem.device_pointer;
-
- mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer);
-
- device->mem_free(mem);
- }
- else if (rcv.name == "const_copy_to") {
- string name_string;
- size_t size;
-
- rcv.read(name_string);
- rcv.read(size);
-
- vector<char> host_vector(size);
- rcv.read_buffer(&host_vector[0], size);
- lock.unlock();
-
- device->const_copy_to(name_string.c_str(), &host_vector[0], size);
- }
- else if (rcv.name == "load_kernels") {
- DeviceRequestedFeatures requested_features;
- rcv.read(requested_features.experimental);
- rcv.read(requested_features.max_closure);
- rcv.read(requested_features.max_nodes_group);
- rcv.read(requested_features.nodes_features);
-
- bool result;
- result = device->load_kernels(requested_features);
- RPCSend snd(socket, &error_func, "load_kernels");
- snd.add(result);
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_add") {
- DeviceTask task;
-
- rcv.read(task);
- lock.unlock();
-
- if (task.buffer)
- task.buffer = device_ptr_from_client_pointer(task.buffer);
-
- if (task.rgba_half)
- task.rgba_half = device_ptr_from_client_pointer(task.rgba_half);
-
- if (task.rgba_byte)
- task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte);
-
- if (task.shader_input)
- task.shader_input = device_ptr_from_client_pointer(task.shader_input);
-
- if (task.shader_output)
- task.shader_output = device_ptr_from_client_pointer(task.shader_output);
-
- task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
- task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
- task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample,
- this);
- task.update_tile_sample = function_bind(&DeviceServer::task_update_tile_sample, this, _1);
- task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this);
-
- device->task_add(task);
- }
- else if (rcv.name == "task_wait") {
- lock.unlock();
-
- blocked_waiting = true;
- device->task_wait();
- blocked_waiting = false;
-
- lock.lock();
- RPCSend snd(socket, &error_func, "task_wait_done");
- snd.write();
- lock.unlock();
- }
- else if (rcv.name == "task_cancel") {
- lock.unlock();
- device->task_cancel();
- }
- else if (rcv.name == "acquire_tile") {
- AcquireEntry entry;
- entry.name = rcv.name;
- rcv.read(entry.tile);
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else if (rcv.name == "acquire_tile_none") {
- AcquireEntry entry;
- entry.name = rcv.name;
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else if (rcv.name == "release_tile") {
- AcquireEntry entry;
- entry.name = rcv.name;
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else {
- cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n";
- lock.unlock();
- }
- }
-
- bool task_acquire_tile(Device *, RenderTile &tile)
- {
- thread_scoped_lock acquire_lock(acquire_mutex);
-
- bool result = false;
-
- RPCSend snd(socket, &error_func, "acquire_tile");
- snd.write();
-
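- /* Wait for the matching reply; drive listen_step() ourselves while the main
- * thread is blocked inside device->task_wait(). */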
- do {
- if (blocked_waiting)
- listen_step();
-
- /* todo: avoid busy wait loop */
- thread_scoped_lock lock(rpc_lock);
-
- if (!acquire_queue.empty()) {
- AcquireEntry entry = acquire_queue.front();
- acquire_queue.pop_front();
-
- if (entry.name == "acquire_tile") {
- tile = entry.tile;
-
- if (tile.buffer)
- tile.buffer = ptr_map[tile.buffer];
-
- result = true;
- break;
- }
- else if (entry.name == "acquire_tile_none") {
- break;
- }
- else {
- cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n";
- }
- }
- } while (acquire_queue.empty() && !stop && !have_error());
-
- return result;
- }
-
- void task_update_progress_sample()
- {
- ; /* skip */
- }
-
- void task_update_tile_sample(RenderTile &)
- {
- ; /* skip */
- }
-
- void task_release_tile(RenderTile &tile)
- {
- thread_scoped_lock acquire_lock(acquire_mutex);
-
- if (tile.buffer)
- tile.buffer = ptr_imap[tile.buffer];
-
- {
- thread_scoped_lock lock(rpc_lock);
- RPCSend snd(socket, &error_func, "release_tile");
- snd.add(tile);
- snd.write();
- lock.unlock();
- }
-
- do {
- if (blocked_waiting)
- listen_step();
-
- /* todo: avoid busy wait loop */
- thread_scoped_lock lock(rpc_lock);
-
- if (!acquire_queue.empty()) {
- AcquireEntry entry = acquire_queue.front();
- acquire_queue.pop_front();
-
- if (entry.name == "release_tile") {
- lock.unlock();
- break;
- }
- else {
- cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n";
- }
- }
- } while (acquire_queue.empty() && !stop);
- }
-
- bool task_get_cancel()
- {
- return false;
- }
-
- /* properties */
- Device *device;
- tcp::socket &socket;
-
- /* mapping of remote to local pointer */
- PtrMap ptr_map;
- PtrMap ptr_imap;
- DataMap mem_data;
-
- struct AcquireEntry {
- string name;
- RenderTile tile;
- };
-
- thread_mutex acquire_mutex;
- list<AcquireEntry> acquire_queue;
-
- bool stop;
- bool blocked_waiting;
-
- private:
- NetworkError error_func;
-
- /* todo: free memory and device (osl) on network error */
-};
-
-void Device::server_run()
-{
- try {
- /* starts thread that responds to discovery requests */
- ServerDiscovery discovery;
-
- for (;;) {
- /* accept connection */
- boost::asio::io_service io_service;
- tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT));
-
- tcp::socket socket(io_service);
- acceptor.accept(socket);
-
- string remote_address = socket.remote_endpoint().address().to_string();
- printf("Connected to remote client at: %s\n", remote_address.c_str());
-
- DeviceServer server(this, socket);
- server.listen();
-
- printf("Disconnected.\n");
- }
- }
- catch (exception &e) {
- fprintf(stderr, "Network server exception: %s\n", e.what());
- }
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
deleted file mode 100644
index b3a0f6daa57..00000000000
--- a/intern/cycles/device/device_network.h
+++ /dev/null
@@ -1,490 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_NETWORK_H__
-#define __DEVICE_NETWORK_H__
-
-#ifdef WITH_NETWORK
-
-# include <boost/archive/binary_iarchive.hpp>
-# include <boost/archive/binary_oarchive.hpp>
-# include <boost/archive/text_iarchive.hpp>
-# include <boost/archive/text_oarchive.hpp>
-# include <boost/array.hpp>
-# include <boost/asio.hpp>
-# include <boost/bind.hpp>
-# include <boost/serialization/vector.hpp>
-# include <boost/thread.hpp>
-
-# include <deque>
-# include <iostream>
-# include <sstream>
-
-# include "render/buffers.h"
-
-# include "util/util_foreach.h"
-# include "util/util_list.h"
-# include "util/util_map.h"
-# include "util/util_param.h"
-# include "util/util_string.h"
-
-CCL_NAMESPACE_BEGIN
-
-using std::cerr;
-using std::cout;
-using std::exception;
-using std::hex;
-using std::setw;
-
-using boost::asio::ip::tcp;
-
-static const int SERVER_PORT = 5120;
-static const int DISCOVER_PORT = 5121;
-static const string DISCOVER_REQUEST_MSG = "REQUEST_RENDER_SERVER_IP";
-static const string DISCOVER_REPLY_MSG = "REPLY_RENDER_SERVER_IP";
-
-# if 0
-typedef boost::archive::text_oarchive o_archive;
-typedef boost::archive::text_iarchive i_archive;
-# else
-typedef boost::archive::binary_oarchive o_archive;
-typedef boost::archive::binary_iarchive i_archive;
-# endif
-
-/* Serialization of device memory */
-
-class network_device_memory : public device_memory {
- public:
- network_device_memory(Device *device) : device_memory(device, "", MEM_READ_ONLY)
- {
- }
-
- ~network_device_memory()
- {
- device_pointer = 0;
- };
-
- vector<char> local_data;
-};
-
-/* Common network error function / object for both DeviceNetwork and DeviceServer. */
-class NetworkError {
- public:
- NetworkError()
- {
- error = "";
- error_count = 0;
- }
-
- ~NetworkError()
- {
- }
-
- void network_error(const string &message)
- {
- error = message;
- error_count += 1;
- }
-
- bool have_error()
- {
- return error_count > 0;
- }
-
- private:
- string error;
- int error_count;
-};
-
-/* Remote procedure call Send */
-
-class RPCSend {
- public:
- RPCSend(tcp::socket &socket_, NetworkError *e, const string &name_ = "")
- : name(name_), socket(socket_), archive(archive_stream), sent(false)
- {
- archive &name_;
- error_func = e;
- fprintf(stderr, "rpc send %s\n", name.c_str());
- }
-
- ~RPCSend()
- {
- }
-
- void add(const device_memory &mem)
- {
- archive &mem.data_type &mem.data_elements &mem.data_size;
- archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer;
- archive &mem.type &string(mem.name);
- archive &mem.interpolation &mem.extension;
- archive &mem.device_pointer;
- }
-
- template<typename T> void add(const T &data)
- {
- archive &data;
- }
-
- void add(const DeviceTask &task)
- {
- int type = (int)task.type;
- archive &type &task.x &task.y &task.w &task.h;
- archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples;
- archive &task.offset &task.stride;
- archive &task.shader_input &task.shader_output &task.shader_eval_type;
- archive &task.shader_x &task.shader_w;
- archive &task.need_finish_queue;
- }
-
- void add(const RenderTile &tile)
- {
- archive &tile.x &tile.y &tile.w &tile.h;
- archive &tile.start_sample &tile.num_samples &tile.sample;
- archive &tile.resolution &tile.offset &tile.stride;
- archive &tile.buffer;
- }
-
- void write()
- {
- boost::system::error_code error;
-
- /* get string from stream */
- string archive_str = archive_stream.str();
-
- /* first send fixed size header with size of following data */
- ostringstream header_stream;
- header_stream << setw(8) << hex << archive_str.size();
- string header_str = header_stream.str();
-
- boost::asio::write(
- socket, boost::asio::buffer(header_str), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- /* then send actual data */
- boost::asio::write(
- socket, boost::asio::buffer(archive_str), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- sent = true;
- }
-
- void write_buffer(void *buffer, size_t size)
- {
- boost::system::error_code error;
-
- boost::asio::write(
- socket, boost::asio::buffer(buffer, size), boost::asio::transfer_all(), error);
-
- if (error.value())
- error_func->network_error(error.message());
- }
-
- protected:
- string name;
- tcp::socket &socket;
- ostringstream archive_stream;
- o_archive archive;
- bool sent;
- NetworkError *error_func;
-};
-
-/* Remote procedure call Receive */
-
-class RPCReceive {
- public:
- RPCReceive(tcp::socket &socket_, NetworkError *e)
- : socket(socket_), archive_stream(NULL), archive(NULL)
- {
- error_func = e;
- /* read head with fixed size */
- vector<char> header(8);
- boost::system::error_code error;
- size_t len = boost::asio::read(socket, boost::asio::buffer(header), error);
-
- if (error.value()) {
- error_func->network_error(error.message());
- }
-
- /* verify if we got something */
- if (len == header.size()) {
- /* decode header */
- string header_str(&header[0], header.size());
- istringstream header_stream(header_str);
-
- size_t data_size;
-
- if ((header_stream >> hex >> data_size)) {
-
- vector<char> data(data_size);
- size_t len = boost::asio::read(socket, boost::asio::buffer(data), error);
-
- if (error.value())
- error_func->network_error(error.message());
-
- if (len == data_size) {
- archive_str = (data.size()) ? string(&data[0], data.size()) : string("");
-
- archive_stream = new istringstream(archive_str);
- archive = new i_archive(*archive_stream);
-
- *archive &name;
- fprintf(stderr, "rpc receive %s\n", name.c_str());
- }
- else {
- error_func->network_error("Network receive error: data size doesn't match header");
- }
- }
- else {
- error_func->network_error("Network receive error: can't decode data size from header");
- }
- }
- else {
- error_func->network_error("Network receive error: invalid header size");
- }
- }
-
- ~RPCReceive()
- {
- delete archive;
- delete archive_stream;
- }
-
- void read(network_device_memory &mem, string &name)
- {
- *archive &mem.data_type &mem.data_elements &mem.data_size;
- *archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer;
- *archive &mem.type &name;
- *archive &mem.interpolation &mem.extension;
- *archive &mem.device_pointer;
-
- mem.name = name.c_str();
- mem.host_pointer = 0;
-
- /* Can't transfer OpenGL texture over network. */
- if (mem.type == MEM_PIXELS) {
- mem.type = MEM_READ_WRITE;
- }
- }
-
- template<typename T> void read(T &data)
- {
- *archive &data;
- }
-
- void read_buffer(void *buffer, size_t size)
- {
- boost::system::error_code error;
- size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error);
-
- if (error.value()) {
- error_func->network_error(error.message());
- }
-
- if (len != size)
- cout << "Network receive error: buffer size doesn't match expected size\n";
- }
-
- void read(DeviceTask &task)
- {
- int type;
-
- *archive &type &task.x &task.y &task.w &task.h;
- *archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples;
- *archive &task.offset &task.stride;
- *archive &task.shader_input &task.shader_output &task.shader_eval_type;
- *archive &task.shader_x &task.shader_w;
- *archive &task.need_finish_queue;
-
- task.type = (DeviceTask::Type)type;
- }
-
- void read(RenderTile &tile)
- {
- *archive &tile.x &tile.y &tile.w &tile.h;
- *archive &tile.start_sample &tile.num_samples &tile.sample;
- *archive &tile.resolution &tile.offset &tile.stride;
- *archive &tile.buffer;
-
- tile.buffers = NULL;
- }
-
- string name;
-
- protected:
- tcp::socket &socket;
- string archive_str;
- istringstream *archive_stream;
- i_archive *archive;
- NetworkError *error_func;
-};
-
-/* Server auto discovery */
-
-class ServerDiscovery {
- public:
- explicit ServerDiscovery(bool discover = false)
- : listen_socket(io_service), collect_servers(false)
- {
- /* setup listen socket */
- listen_endpoint.address(boost::asio::ip::address_v4::any());
- listen_endpoint.port(DISCOVER_PORT);
-
- listen_socket.open(listen_endpoint.protocol());
-
- boost::asio::socket_base::reuse_address option(true);
- listen_socket.set_option(option);
-
- listen_socket.bind(listen_endpoint);
-
- /* setup receive callback */
- async_receive();
-
- /* start server discovery */
- if (discover) {
- collect_servers = true;
- servers.clear();
-
- broadcast_message(DISCOVER_REQUEST_MSG);
- }
-
- /* start thread */
- work = new boost::asio::io_service::work(io_service);
- thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service));
- }
-
- ~ServerDiscovery()
- {
- io_service.stop();
- thread->join();
- delete thread;
- delete work;
- }
-
- vector<string> get_server_list()
- {
- vector<string> result;
-
- mutex.lock();
- result = vector<string>(servers.begin(), servers.end());
- mutex.unlock();
-
- return result;
- }
-
- private:
- void handle_receive_from(const boost::system::error_code &error, size_t size)
- {
- if (error) {
- cout << "Server discovery receive error: " << error.message() << "\n";
- return;
- }
-
- if (size > 0) {
- string msg = string(receive_buffer, size);
-
- /* handle incoming message */
- if (collect_servers) {
- if (msg == DISCOVER_REPLY_MSG) {
- string address = receive_endpoint.address().to_string();
-
- mutex.lock();
-
- /* add address if it's not already in the list */
- bool found = std::find(servers.begin(), servers.end(), address) != servers.end();
-
- if (!found)
- servers.push_back(address);
-
- mutex.unlock();
- }
- }
- else {
- /* reply to request */
- if (msg == DISCOVER_REQUEST_MSG)
- broadcast_message(DISCOVER_REPLY_MSG);
- }
- }
-
- async_receive();
- }
-
- void async_receive()
- {
- listen_socket.async_receive_from(boost::asio::buffer(receive_buffer),
- receive_endpoint,
- boost::bind(&ServerDiscovery::handle_receive_from,
- this,
- boost::asio::placeholders::error,
- boost::asio::placeholders::bytes_transferred));
- }
-
- void broadcast_message(const string &msg)
- {
- /* setup broadcast socket */
- boost::asio::ip::udp::socket socket(io_service);
-
- socket.open(boost::asio::ip::udp::v4());
-
- boost::asio::socket_base::broadcast option(true);
- socket.set_option(option);
-
- boost::asio::ip::udp::endpoint broadcast_endpoint(
- boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT);
-
- /* broadcast message */
- socket.send_to(boost::asio::buffer(msg), broadcast_endpoint);
- }
-
- /* network service and socket */
- boost::asio::io_service io_service;
- boost::asio::ip::udp::endpoint listen_endpoint;
- boost::asio::ip::udp::socket listen_socket;
-
- /* threading */
- boost::thread *thread;
- boost::asio::io_service::work *work;
- boost::mutex mutex;
-
- /* buffer and endpoint for receiving messages */
- char receive_buffer[256];
- boost::asio::ip::udp::endpoint receive_endpoint;
-
- /* Fields: OS, version, device count, status, host name, group name, IP address. */
- struct ServerInfo {
- string cycles_version;
- string os;
- int device_count;
- string status;
- string host_name;
- string group_name;
- string host_addr;
- };
-
- /* collection of server addresses in list */
- bool collect_servers;
- vector<string> servers;
-};
-
-CCL_NAMESPACE_END
-
-#endif
-
-#endif /* __DEVICE_NETWORK_H__ */
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
deleted file mode 100644
index 9abb7cfb7fe..00000000000
--- a/intern/cycles/device/device_opencl.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/opencl/device_opencl.h"
-# include "device/device.h"
-# include "device/device_intern.h"
-
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_set.h"
-# include "util/util_string.h"
-
-CCL_NAMESPACE_BEGIN
-
-Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return opencl_create_split_device(info, stats, profiler, background);
-}
-
-bool device_opencl_init()
-{
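- /* Initialize CLEW only once and cache the result for subsequent calls. */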
- static bool initialized = false;
- static bool result = false;
-
- if (initialized)
- return result;
-
- initialized = true;
-
- if (OpenCLInfo::device_type() != 0) {
- int clew_result = clewInit();
- if (clew_result == CLEW_SUCCESS) {
- VLOG(1) << "CLEW initialization succeeded.";
- result = true;
- }
- else {
- VLOG(1) << "CLEW initialization failed: "
- << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
- "Error opening the library");
- }
- }
- else {
- VLOG(1) << "Skip initializing CLEW, platform is force disabled.";
- result = false;
- }
-
- return result;
-}
-
-static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms)
-{
-# ifdef _WIN32
- __try {
- return clGetPlatformIDs(0, NULL, num_platforms);
- }
- __except (EXCEPTION_EXECUTE_HANDLER) {
- /* Ignore crashes inside the OpenCL driver and hope we can
- * survive even with corrupted OpenCL installs. */
- fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n");
- }
-
- *num_platforms = 0;
- return CL_DEVICE_NOT_FOUND;
-# else
- return clGetPlatformIDs(0, NULL, num_platforms);
-# endif
-}
-
-void device_opencl_info(vector<DeviceInfo> &devices)
-{
- cl_uint num_platforms = 0;
- device_opencl_get_num_platforms_safe(&num_platforms);
- if (num_platforms == 0) {
- return;
- }
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- /* Devices are numbered consecutively across platforms. */
- int num_devices = 0;
- set<string> unique_ids;
- foreach (OpenCLPlatformDevice &platform_device, usable_devices) {
- /* Compute unique ID for persistent user preferences. */
- const string &platform_name = platform_device.platform_name;
- const string &device_name = platform_device.device_name;
- string hardware_id = platform_device.hardware_id;
- if (hardware_id == "") {
- hardware_id = string_printf("ID_%d", num_devices);
- }
- string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
-
- /* Hardware ID might not be unique, add device number in that case. */
- if (unique_ids.find(id) != unique_ids.end()) {
- id += string_printf("_ID_%d", num_devices);
- }
- unique_ids.insert(id);
-
- /* Create DeviceInfo. */
- DeviceInfo info;
- info.type = DEVICE_OPENCL;
- info.description = string_remove_trademark(string(device_name));
- info.num = num_devices;
- /* We don't know if it's used for display, but assume it is. */
- info.display_device = true;
- info.use_split_kernel = true;
- info.has_volume_decoupled = false;
- info.has_adaptive_stop_per_sample = false;
- info.denoisers = DENOISER_NLM;
- info.id = id;
-
- /* Check OpenCL extensions */
- info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos;
-
- /* Disabled for now due to apparent AMD driver bug. */
- info.has_nanovdb = platform_name != "AMD Accelerated Parallel Processing";
-
- devices.push_back(info);
- num_devices++;
- }
-}
-
-string device_opencl_capabilities()
-{
- if (OpenCLInfo::device_type() == 0) {
- return "All OpenCL devices are forced to be OFF";
- }
- string result = "";
- string error_msg = ""; /* Only used by opencl_assert(), but in the future
- * it could also be nicely reported to the console.
- */
- cl_uint num_platforms = 0;
- opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms));
- if (num_platforms == 0) {
- return "No OpenCL platforms found\n";
- }
- result += string_printf("Number of platforms: %u\n", num_platforms);
-
- vector<cl_platform_id> platform_ids;
- platform_ids.resize(num_platforms);
- opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL));
-
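- /* Helper macros that query a single OpenCL info value and append it to the
- * capabilities report string. */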
-# define APPEND_INFO(func, id, name, what, type) \
- do { \
- type data; \
- memset(&data, 0, sizeof(data)); \
- opencl_assert(func(id, what, sizeof(data), &data, NULL)); \
- result += string_printf("%s: %s\n", name, to_string(data).c_str()); \
- } while (false)
-# define APPEND_STRING_INFO_IMPL(func, id, name, what, is_optional) \
- do { \
- string value; \
- size_t length = 0; \
- if (func(id, what, 0, NULL, &length) == CL_SUCCESS) { \
- vector<char> buffer(length + 1); \
- if (func(id, what, buffer.size(), buffer.data(), NULL) == CL_SUCCESS) { \
- value = string(buffer.data()); \
- } \
- } \
- if (is_optional && !(length != 0 && value[0] != '\0')) { \
- break; \
- } \
- result += string_printf("%s: %s\n", name, value.c_str()); \
- } while (false)
-# define APPEND_PLATFORM_STRING_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, false)
-# define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetPlatformInfo, id, "\tPlatform " name, what, true)
-# define APPEND_PLATFORM_INFO(id, name, what, type) \
- APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type)
-# define APPEND_DEVICE_INFO(id, name, what, type) \
- APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type)
-# define APPEND_DEVICE_STRING_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, false)
-# define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \
- APPEND_STRING_INFO_IMPL(clGetDeviceInfo, id, "\t\t\tDevice " name, what, true)
-
- vector<cl_device_id> device_ids;
- for (cl_uint platform = 0; platform < num_platforms; ++platform) {
- cl_platform_id platform_id = platform_ids[platform];
-
- result += string_printf("Platform #%u\n", platform);
-
- APPEND_PLATFORM_STRING_INFO(platform_id, "Name", CL_PLATFORM_NAME);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Version", CL_PLATFORM_VERSION);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE);
- APPEND_PLATFORM_STRING_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS);
-
- cl_uint num_devices = 0;
- opencl_assert(
- clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices));
- result += string_printf("\tNumber of devices: %u\n", num_devices);
-
- device_ids.resize(num_devices);
- opencl_assert(clGetDeviceIDs(
- platform_ids[platform], CL_DEVICE_TYPE_ALL, num_devices, &device_ids[0], NULL));
- for (cl_uint device = 0; device < num_devices; ++device) {
- cl_device_id device_id = device_ids[device];
-
- result += string_printf("\t\tDevice: #%u\n", device);
-
- APPEND_DEVICE_STRING_INFO(device_id, "Name", CL_DEVICE_NAME);
- APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD);
- APPEND_DEVICE_STRING_INFO(device_id, "Vendor", CL_DEVICE_VENDOR);
- APPEND_DEVICE_STRING_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION);
- APPEND_DEVICE_STRING_INFO(device_id, "Profile", CL_DEVICE_PROFILE);
- APPEND_DEVICE_STRING_INFO(device_id, "Version", CL_DEVICE_VERSION);
- APPEND_DEVICE_STRING_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS);
- APPEND_DEVICE_INFO(
- device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint);
- APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint);
- APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t);
- }
- }
-
-# undef APPEND_INFO
-# undef APPEND_STRING_INFO_IMPL
-# undef APPEND_PLATFORM_STRING_INFO
-# undef APPEND_STRING_EXTENSION_INFO
-# undef APPEND_PLATFORM_INFO
-# undef APPEND_DEVICE_INFO
-# undef APPEND_DEVICE_STRING_INFO
-# undef APPEND_DEVICE_STRING_EXTENSION_INFO
-
- return result;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* WITH_OPENCL */
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
deleted file mode 100644
index 6f9a7943722..00000000000
--- a/intern/cycles/device/device_optix.cpp
+++ /dev/null
@@ -1,1936 +0,0 @@
-/*
- * Copyright 2019, NVIDIA Corporation.
- * Copyright 2019, Blender Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPTIX
-
-# include "bvh/bvh.h"
-# include "bvh/bvh_optix.h"
-# include "device/cuda/device_cuda.h"
-# include "device/device_denoising.h"
-# include "device/device_intern.h"
-# include "render/buffers.h"
-# include "render/hair.h"
-# include "render/mesh.h"
-# include "render/object.h"
-# include "render/scene.h"
-# include "util/util_debug.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_progress.h"
-# include "util/util_time.h"
-
-# ifdef WITH_CUDA_DYNLOAD
-# include <cuew.h>
-// Do not use CUDA SDK headers when using CUEW
-# define OPTIX_DONT_INCLUDE_CUDA
-# endif
-# include <optix_function_table_definition.h>
-# include <optix_stubs.h>
-
-// TODO(pmours): Disable this once drivers have native support
-# define OPTIX_DENOISER_NO_PIXEL_STRIDE 1
-
-CCL_NAMESPACE_BEGIN
-
-/* Make sure this stays in sync with kernel_globals.h */
-struct ShaderParams {
- uint4 *input;
- float4 *output;
- int type;
- int filter;
- int sx;
- int offset;
- int sample;
-};
-struct KernelParams {
- WorkTile tile;
- KernelData data;
- ShaderParams shader;
-# define KERNEL_TEX(type, name) const type *name;
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
-};
-
-# define check_result_cuda(stmt) \
- { \
- CUresult res = stmt; \
- if (res != CUDA_SUCCESS) { \
- const char *name; \
- cuGetErrorName(res, &name); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return; \
- } \
- } \
- (void)0
-# define check_result_cuda_ret(stmt) \
- { \
- CUresult res = stmt; \
- if (res != CUDA_SUCCESS) { \
- const char *name; \
- cuGetErrorName(res, &name); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return false; \
- } \
- } \
- (void)0
-
-# define check_result_optix(stmt) \
- { \
- enum OptixResult res = stmt; \
- if (res != OPTIX_SUCCESS) { \
- const char *name = optixGetErrorName(res); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return; \
- } \
- } \
- (void)0
-# define check_result_optix_ret(stmt) \
- { \
- enum OptixResult res = stmt; \
- if (res != OPTIX_SUCCESS) { \
- const char *name = optixGetErrorName(res); \
- set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
- return false; \
- } \
- } \
- (void)0
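-
-// Note on the macros above: the `_ret` variants are for code paths that return a bool,
-// the plain ones for void functions. The trailing (void)0 in each definition forces a
-// semicolon at the call site, so the macros behave like ordinary statements.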
-
-# define launch_filter_kernel(func_name, w, h, args) \
- { \
- CUfunction func; \
- check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \
- check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \
- int threads; \
- check_result_cuda_ret( \
- cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- threads = (int)sqrt((float)threads); \
- int xblocks = ((w) + threads - 1) / threads; \
- int yblocks = ((h) + threads - 1) / threads; \
- check_result_cuda_ret( \
- cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \
- } \
- (void)0
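-
-// For illustration: if CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK reports 1024, then
-// threads = sqrt(1024) = 32 and a 1920x1080 filter launch uses 32x32 blocks with
-// xblocks = (1920 + 31) / 32 = 60 and yblocks = (1080 + 31) / 32 = 34, i.e. ceiling
-// division so the grid fully covers the image.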
-
-class OptiXDevice : public CUDADevice {
-
- // List of OptiX program groups
- enum {
- PG_RGEN,
- PG_MISS,
- PG_HITD, // Default hit group
- PG_HITS, // __SHADOW_RECORD_ALL__ hit group
- PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles)
-# if OPTIX_ABI_VERSION >= 36
- PG_HITD_MOTION,
- PG_HITS_MOTION,
-# endif
- PG_BAKE, // kernel_bake_evaluate
- PG_DISP, // kernel_displace_evaluate
- PG_BACK, // kernel_background_evaluate
- PG_CALL,
- NUM_PROGRAM_GROUPS = PG_CALL + 3
- };
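-
-  // Note: NUM_PROGRAM_GROUPS = PG_CALL + 3 because PG_CALL is the base index of the
-  // three direct-callable programs used for shader raytracing (svm_eval_nodes,
-  // kernel_volume_shadow and subsurface_scatter_multi_setup, see load_kernels below).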
-
- // List of OptiX pipelines
- enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };
-
- // A single shader binding table entry
- struct SbtRecord {
- char header[OPTIX_SBT_RECORD_HEADER_SIZE];
- };
-
- // Information stored about CUDA memory allocations
- struct CUDAMem {
- bool free_map_host = false;
- CUarray array = NULL;
- CUtexObject texobject = 0;
- bool use_mapped_host = false;
- };
-
- // Helper class to manage current CUDA context
- struct CUDAContextScope {
- CUDAContextScope(CUcontext ctx)
- {
- cuCtxPushCurrent(ctx);
- }
- ~CUDAContextScope()
- {
- cuCtxPopCurrent(NULL);
- }
- };
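-
-  // This is RAII-style: a statement like `const CUDAContextScope scope(cuContext);`
-  // makes the CUDA context current for the rest of the enclosing block and pops it
-  // again automatically when the scope object is destroyed.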
-
- // Use a pool with multiple threads to support launches with multiple CUDA streams
- TaskPool task_pool;
-
- vector<CUstream> cuda_stream;
- OptixDeviceContext context = NULL;
-
- OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module
- OptixModule builtin_modules[2] = {};
- OptixPipeline pipelines[NUM_PIPELINES] = {};
-
- bool motion_blur = false;
- device_vector<SbtRecord> sbt_data;
- device_only_memory<KernelParams> launch_params;
- OptixTraversableHandle tlas_handle = 0;
-
- OptixDenoiser denoiser = NULL;
- device_only_memory<unsigned char> denoiser_state;
- int denoiser_input_passes = 0;
-
- vector<device_only_memory<char>> delayed_free_bvh_memory;
- thread_mutex delayed_free_bvh_mutex;
-
- public:
- OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : CUDADevice(info_, stats_, profiler_, background_),
- sbt_data(this, "__sbt", MEM_READ_ONLY),
- launch_params(this, "__params", false),
- denoiser_state(this, "__denoiser_state", true)
- {
- // Store number of CUDA streams in device info
- info.cpu_threads = DebugFlags().optix.cuda_streams;
-
- // Make the CUDA context current
- if (!cuContext) {
- return; // Do not initialize if CUDA context creation failed already
- }
- const CUDAContextScope scope(cuContext);
-
- // Create OptiX context for this device
- OptixDeviceContextOptions options = {};
-# ifdef WITH_CYCLES_LOGGING
- options.logCallbackLevel = 4; // Fatal = 1, Error = 2, Warning = 3, Print = 4
- options.logCallbackFunction =
- [](unsigned int level, const char *, const char *message, void *) {
- switch (level) {
- case 1:
- LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
- break;
- case 2:
- LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
- break;
- case 3:
- LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
- break;
- case 4:
- LOG_IF(INFO, VLOG_IS_ON(1)) << message;
- break;
- }
- };
-# endif
- check_result_optix(optixDeviceContextCreate(cuContext, &options, &context));
-# ifdef WITH_CYCLES_LOGGING
- check_result_optix(optixDeviceContextSetLogCallback(
- context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
-# endif
-
- // Create launch streams
- cuda_stream.resize(info.cpu_threads);
- for (int i = 0; i < info.cpu_threads; ++i)
- check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));
-
-    // Work around a compiler bug that would otherwise assign the wrong element size
- launch_params.data_elements = sizeof(KernelParams);
- // Allocate launch parameter buffer memory on device
- launch_params.alloc_to_device(info.cpu_threads);
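-    // The buffer holds one KernelParams entry per stream; launch_render() and
-    // launch_shader_eval() offset into it by thread_index * data_elements, so
-    // concurrent streams never share launch parameters.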
- }
- ~OptiXDevice()
- {
- // Stop processing any more tasks
- task_pool.cancel();
-
- // Make CUDA context current
- const CUDAContextScope scope(cuContext);
-
- free_bvh_memory_delayed();
-
- sbt_data.free();
- texture_info.free();
- launch_params.free();
- denoiser_state.free();
-
- // Unload modules
- if (optix_module != NULL)
- optixModuleDestroy(optix_module);
- for (unsigned int i = 0; i < 2; ++i)
- if (builtin_modules[i] != NULL)
- optixModuleDestroy(builtin_modules[i]);
- for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
- if (pipelines[i] != NULL)
- optixPipelineDestroy(pipelines[i]);
-
- // Destroy launch streams
- for (CUstream stream : cuda_stream)
- cuStreamDestroy(stream);
-
- if (denoiser != NULL)
- optixDenoiserDestroy(denoiser);
-
- optixDeviceContextDestroy(context);
- }
-
- private:
- bool show_samples() const override
- {
- // Only show samples if not rendering multiple tiles in parallel
- return info.cpu_threads == 1;
- }
-
- BVHLayoutMask get_bvh_layout_mask() const override
- {
-    // CUDA kernels are used when doing baking, so we need to build a BVH those kernels can understand too!
- if (optix_module == NULL)
- return CUDADevice::get_bvh_layout_mask();
-
- // OptiX has its own internal acceleration structure format
- return BVH_LAYOUT_OPTIX;
- }
-
- string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
- bool filter,
- bool /*split*/) override
- {
- // Split kernel is not supported in OptiX
- string common_cflags = CUDADevice::compile_kernel_get_common_cflags(
- requested_features, filter, false);
-
- // Add OptiX SDK include directory to include paths
- const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
- if (optix_sdk_path) {
- common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
- }
-
- // Specialization for shader raytracing
- if (requested_features.use_shader_raytrace) {
- common_cflags += " --keep-device-functions";
- }
- else {
- common_cflags += " -D __NO_SHADER_RAYTRACE__";
- }
-
- return common_cflags;
- }
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features) override
- {
- if (have_error()) {
- // Abort early if context creation failed already
- return false;
- }
-
- // Load CUDA modules because we need some of the utility kernels
- if (!CUDADevice::load_kernels(requested_features)) {
- return false;
- }
-
- // Baking is currently performed using CUDA, so no need to load OptiX kernels
- if (requested_features.use_baking) {
- return true;
- }
-
- const CUDAContextScope scope(cuContext);
-
- // Unload existing OptiX module and pipelines first
- if (optix_module != NULL) {
- optixModuleDestroy(optix_module);
- optix_module = NULL;
- }
- for (unsigned int i = 0; i < 2; ++i) {
- if (builtin_modules[i] != NULL) {
- optixModuleDestroy(builtin_modules[i]);
- builtin_modules[i] = NULL;
- }
- }
- for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
- if (pipelines[i] != NULL) {
- optixPipelineDestroy(pipelines[i]);
- pipelines[i] = NULL;
- }
- }
-
- OptixModuleCompileOptions module_options = {};
- module_options.maxRegisterCount = 0; // Do not set an explicit register limit
- module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
- module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
-
-# if OPTIX_ABI_VERSION >= 41
- module_options.boundValues = nullptr;
- module_options.numBoundValues = 0;
-# endif
-
- OptixPipelineCompileOptions pipeline_options = {};
- // Default to no motion blur and two-level graph, since it is the fastest option
- pipeline_options.usesMotionBlur = false;
- pipeline_options.traversableGraphFlags =
- OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
- pipeline_options.numPayloadValues = 6;
- pipeline_options.numAttributeValues = 2; // u, v
- pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
- pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h
-
-# if OPTIX_ABI_VERSION >= 36
- pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
- if (requested_features.use_hair) {
- if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
- pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
- }
- else {
- pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
- }
- }
-# endif
-
-    // Keep track of whether motion blur is enabled, so as to enable/disable motion in BVH builds.
-    // This is necessary since objects may be reported to have motion if the Vector pass is
-    // active, but may still need to be rendered without motion blur if that pass isn't active as well.
- motion_blur = requested_features.use_object_motion;
-
- if (motion_blur) {
- pipeline_options.usesMotionBlur = true;
- // Motion blur can insert motion transforms into the traversal graph
- // It is no longer a two-level graph then, so need to set flags to allow any configuration
- pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
- }
-
- { // Load and compile PTX module with OptiX kernels
- string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ?
- "lib/kernel_optix_shader_raytrace.ptx" :
- "lib/kernel_optix.ptx");
- if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
- if (!getenv("OPTIX_ROOT_DIR")) {
- set_error(
- "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
- "the Optix SDK to be able to compile Optix kernels on demand).");
- return false;
- }
- ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true);
- }
- if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
- set_error("Failed to load OptiX kernel from '" + ptx_filename + "'");
- return false;
- }
-
- check_result_optix_ret(optixModuleCreateFromPTX(context,
- &module_options,
- &pipeline_options,
- ptx_data.data(),
- ptx_data.size(),
- nullptr,
- 0,
- &optix_module));
- }
-
- // Create program groups
- OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
- OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
- OptixProgramGroupOptions group_options = {}; // There are no options currently
- group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_RGEN].raygen.module = optix_module;
- // Ignore branched integrator for now (see "requested_features.use_integrator_branched")
- group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
- group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
- group_descs[PG_MISS].miss.module = optix_module;
- group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
- group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
- group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
- group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
- group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
-
- if (requested_features.use_hair) {
- group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
- group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
-
- // Add curve intersection programs
- if (requested_features.use_hair_thick) {
-        // Use the slower all-curve intersection programs when thick hair is requested,
-        // even though that also slows down ribbons. Ideally this should not be needed.
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
- }
- else {
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
- }
-
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
- OptixBuiltinISOptions builtin_options = {};
- builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
- builtin_options.usesMotionBlur = false;
-
- check_result_optix_ret(optixBuiltinISModuleGet(
- context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
-
- group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
- group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
- group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
- group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
-
- if (motion_blur) {
- builtin_options.usesMotionBlur = true;
-
- check_result_optix_ret(optixBuiltinISModuleGet(
- context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
-
- group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
- group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
- group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
- group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
- }
- }
-# endif
- }
-
- if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
- // Add hit group for local intersections
- group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
- group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
- group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
- }
-
- if (requested_features.use_baking) {
- group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_BAKE].raygen.module = optix_module;
- group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
- }
-
- if (requested_features.use_true_displacement) {
- group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_DISP].raygen.module = optix_module;
- group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
- }
-
- if (requested_features.use_background_light) {
- group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_BACK].raygen.module = optix_module;
- group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
- }
-
- // Shader raytracing replaces some functions with direct callables
- if (requested_features.use_shader_raytrace) {
- group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 0].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes";
- group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 1].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 1].callables.entryFunctionNameDC =
- "__direct_callable__kernel_volume_shadow";
- group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
- group_descs[PG_CALL + 2].callables.moduleDC = optix_module;
- group_descs[PG_CALL + 2].callables.entryFunctionNameDC =
- "__direct_callable__subsurface_scatter_multi_setup";
- }
-
- check_result_optix_ret(optixProgramGroupCreate(
- context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
-
- // Get program stack sizes
- OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
- // Set up SBT, which in this case is used only to select between different programs
- sbt_data.alloc(NUM_PROGRAM_GROUPS);
- memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
- for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
- check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
- check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
- }
- sbt_data.copy_to_device(); // Upload SBT to device
-
- // Calculate maximum trace continuation stack size
- unsigned int trace_css = stack_size[PG_HITD].cssCH;
- // This is based on the maximum of closest-hit and any-hit/intersection programs
- trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
- trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
- trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
-# if OPTIX_ABI_VERSION >= 36
- trace_css = std::max(trace_css,
- stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
- trace_css = std::max(trace_css,
- stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
-# endif
-
- OptixPipelineLinkOptions link_options = {};
- link_options.maxTraceDepth = 1;
- link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
-# if OPTIX_ABI_VERSION < 24
- link_options.overrideUsesMotionBlur = motion_blur;
-# endif
-
- { // Create path tracing pipeline
- vector<OptixProgramGroup> pipeline_groups;
- pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_RGEN]);
- pipeline_groups.push_back(groups[PG_MISS]);
- pipeline_groups.push_back(groups[PG_HITD]);
- pipeline_groups.push_back(groups[PG_HITS]);
- pipeline_groups.push_back(groups[PG_HITL]);
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur) {
- pipeline_groups.push_back(groups[PG_HITD_MOTION]);
- pipeline_groups.push_back(groups[PG_HITS_MOTION]);
- }
-# endif
- if (requested_features.use_shader_raytrace) {
- pipeline_groups.push_back(groups[PG_CALL + 0]);
- pipeline_groups.push_back(groups[PG_CALL + 1]);
- pipeline_groups.push_back(groups[PG_CALL + 2]);
- }
-
- check_result_optix_ret(optixPipelineCreate(context,
- &pipeline_options,
- &link_options,
- pipeline_groups.data(),
- pipeline_groups.size(),
- nullptr,
- 0,
- &pipelines[PIP_PATH_TRACE]));
-
- // Combine ray generation and trace continuation stack size
- const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
- // Max direct callable depth is one of the following, so combine accordingly
- // - __raygen__ -> svm_eval_nodes
- // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
- // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes
- const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
- std::max(stack_size[PG_CALL + 1].dssDC,
- stack_size[PG_CALL + 2].dssDC);
-
- // Set stack size depending on pipeline options
- check_result_optix_ret(
- optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE],
- 0,
- requested_features.use_shader_raytrace ? dss : 0,
- css,
- motion_blur ? 3 : 2));
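-
-      // For illustration: with maxTraceDepth = 1 the continuation stack is simply
-      // cssRG + trace_css (e.g. 6 KB + 2 KB = 8 KB). The last argument is the maximum
-      // traversable graph depth: 2 for the instance->geometry hierarchy, or 3 when
-      // motion blur inserts motion transforms between the two levels.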
- }
-
- // Only need to create shader evaluation pipeline if one of these features is used:
- const bool use_shader_eval_pipeline = requested_features.use_baking ||
- requested_features.use_background_light ||
- requested_features.use_true_displacement;
-
- if (use_shader_eval_pipeline) { // Create shader evaluation pipeline
- vector<OptixProgramGroup> pipeline_groups;
- pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_BAKE]);
- pipeline_groups.push_back(groups[PG_DISP]);
- pipeline_groups.push_back(groups[PG_BACK]);
- pipeline_groups.push_back(groups[PG_MISS]);
- pipeline_groups.push_back(groups[PG_HITD]);
- pipeline_groups.push_back(groups[PG_HITS]);
- pipeline_groups.push_back(groups[PG_HITL]);
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur) {
- pipeline_groups.push_back(groups[PG_HITD_MOTION]);
- pipeline_groups.push_back(groups[PG_HITS_MOTION]);
- }
-# endif
- if (requested_features.use_shader_raytrace) {
- pipeline_groups.push_back(groups[PG_CALL + 0]);
- pipeline_groups.push_back(groups[PG_CALL + 1]);
- pipeline_groups.push_back(groups[PG_CALL + 2]);
- }
-
- check_result_optix_ret(optixPipelineCreate(context,
- &pipeline_options,
- &link_options,
- pipeline_groups.data(),
- pipeline_groups.size(),
- nullptr,
- 0,
- &pipelines[PIP_SHADER_EVAL]));
-
- // Calculate continuation stack size based on the maximum of all ray generation stack sizes
- const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
- std::max(stack_size[PG_DISP].cssRG,
- stack_size[PG_BACK].cssRG)) +
- link_options.maxTraceDepth * trace_css;
- const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
- std::max(stack_size[PG_CALL + 1].dssDC,
- stack_size[PG_CALL + 2].dssDC);
-
- check_result_optix_ret(
- optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL],
- 0,
- requested_features.use_shader_raytrace ? dss : 0,
- css,
- motion_blur ? 3 : 2));
- }
-
- // Clean up program group objects
- for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
- optixProgramGroupDestroy(groups[i]);
- }
-
- return true;
- }
-
- void thread_run(DeviceTask &task, int thread_index) // Main task entry point
- {
- if (have_error())
- return; // Abort early if there was an error previously
-
- if (task.type == DeviceTask::RENDER) {
- if (thread_index != 0) {
- // Only execute denoising in a single thread (see also 'task_add')
- task.tile_types &= ~RenderTile::DENOISE;
- }
-
- RenderTile tile;
- while (task.acquire_tile(this, tile, task.tile_types)) {
- if (tile.task == RenderTile::PATH_TRACE)
- launch_render(task, tile, thread_index);
- else if (tile.task == RenderTile::BAKE) {
- // Perform baking using CUDA, since it is not currently implemented in OptiX
- device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
- CUDADevice::render(task, tile, work_tiles);
- }
- else if (tile.task == RenderTile::DENOISE)
- launch_denoise(task, tile);
- task.release_tile(tile);
- if (task.get_cancel() && !task.need_finish_queue)
- break; // User requested cancellation
- else if (have_error())
- break; // Abort rendering when encountering an error
- }
- }
- else if (task.type == DeviceTask::SHADER) {
- // CUDA kernels are used when doing baking
- if (optix_module == NULL)
- CUDADevice::shader(task);
- else
- launch_shader_eval(task, thread_index);
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- // Set up a single tile that covers the whole task and denoise it
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- launch_denoise(task, tile);
- }
- }
-
- void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
- {
- assert(thread_index < launch_params.data_size);
-
- // Keep track of total render time of this tile
- const scoped_timer timer(&rtile.buffers->render_time);
-
- WorkTile wtile;
- wtile.x = rtile.x;
- wtile.y = rtile.y;
- wtile.w = rtile.w;
- wtile.h = rtile.h;
- wtile.offset = rtile.offset;
- wtile.stride = rtile.stride;
- wtile.buffer = (float *)rtile.buffer;
-
- const int end_sample = rtile.start_sample + rtile.num_samples;
-    // Keep this number reasonable to avoid running into TDRs (driver timeout detection and recovery)
- int step_samples = (info.display_device ? 8 : 32);
-
- // Offset into launch params buffer so that streams use separate data
- device_ptr launch_params_ptr = launch_params.device_pointer +
- thread_index * launch_params.data_elements;
-
- const CUDAContextScope scope(cuContext);
-
- for (int sample = rtile.start_sample; sample < end_sample;) {
- // Copy work tile information to device
- wtile.start_sample = sample;
- wtile.num_samples = step_samples;
- if (task.adaptive_sampling.use) {
- wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
- }
- wtile.num_samples = min(wtile.num_samples, end_sample - sample);
- device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
- check_result_cuda(
- cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));
-
- OptixShaderBindingTable sbt_params = {};
- sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
- sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
- sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
- sbt_params.missRecordCount = 1;
- sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
- sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-# if OPTIX_ABI_VERSION >= 36
- sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
-# else
- sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
-# endif
- sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
- sbt_params.callablesRecordCount = 3;
- sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
-
- // Launch the ray generation program
- check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
- cuda_stream[thread_index],
- launch_params_ptr,
- launch_params.data_elements,
- &sbt_params,
- // Launch with samples close to each other for better locality
- wtile.w * wtile.num_samples,
- wtile.h,
- 1));
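-
-      // For illustration: a 256x256 tile rendered 8 samples at a time launches a
-      // 2048x256 grid, so threads working on neighboring samples of the same pixel
-      // stay adjacent along x.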
-
- // Run the adaptive sampling kernels at selected samples aligned to step samples.
- uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
- }
-
- // Wait for launch to finish
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
-
- // Update current sample, so it is displayed correctly
- sample += wtile.num_samples;
- rtile.sample = sample;
- // Update task progress after the kernel completed rendering
- task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);
-
- if (task.get_cancel() && !task.need_finish_queue)
- return; // Cancel rendering
- }
-
- // Finalize adaptive sampling
- if (task.adaptive_sampling.use) {
- device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
- adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
- task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
- }
- }
-
- bool launch_denoise(DeviceTask &task, RenderTile &rtile)
- {
- // Update current sample (for display and NLM denoising task)
- rtile.sample = rtile.start_sample + rtile.num_samples;
-
- // Make CUDA context current now, since it is used for both denoising tasks
- const CUDAContextScope scope(cuContext);
-
- // Choose between OptiX and NLM denoising
- if (task.denoising.type == DENOISER_OPTIX) {
-      // Map neighboring tiles onto this device. Indices are laid out as follows, where
-      // index 4 is the center tile and index 9 is the target for the result:
- // 0 1 2
- // 3 4 5
- // 6 7 8 9
- RenderTileNeighbors neighbors(rtile);
- task.map_neighbor_tiles(neighbors, this);
- RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
- RenderTile &target_tile = neighbors.target;
- rtile = center_tile; // Tile may have been modified by mapping code
-
- // Calculate size of the tile to denoise (including overlap)
- int4 rect = center_tile.bounds();
- // Overlap between tiles has to be at least 64 pixels
- // TODO(pmours): Query this value from OptiX
- rect = rect_expand(rect, 64);
- int4 clip_rect = neighbors.bounds();
- rect = rect_clip(rect, clip_rect);
- int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
- int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);
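-
-      // For illustration: a center tile starting at (128, 128) expanded by the 64 pixel
-      // overlap gives rect.x = rect.y = 64 (before clipping), so overlap_offset = (64, 64)
-      // locates the tile inside the enlarged rectangle.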
-
- // Calculate byte offsets and strides
- int pixel_stride = task.pass_stride * (int)sizeof(float);
- int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride;
- const int pass_offset[3] = {
- (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float),
- (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
- (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};
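-
-      // For illustration: with a pass stride of 16 floats, pixel_stride is 64 bytes and
-      // each pass offset selects the first float of the corresponding 3-channel pass
-      // within a pixel's pass data.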
-
- // Start with the current tile pointer offset
- int input_stride = pixel_stride;
- device_ptr input_ptr = rtile.buffer + pixel_offset;
-
- // Copy tile data into a common buffer if necessary
- device_only_memory<float> input(this, "denoiser input", true);
- device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY);
-
- bool contiguous_memory = true;
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) {
- contiguous_memory = false;
- }
- }
-
- if (contiguous_memory) {
-        // Tiles are in contiguous memory, so we can just subtract the overlap offset
- input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
- // Stride covers the whole width of the image and not just a single tile
- input_stride *= rtile.stride;
- }
- else {
- // Adjacent tiles are in separate memory regions, so need to copy them into a single one
- input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride);
- // Start with the new input buffer
- input_ptr = input.device_pointer;
- // Stride covers the width of the new input buffer, which includes tile width and overlap
- input_stride *= rect_size.x;
-
- TileInfo *tile_info = tile_info_mem.alloc(1);
- for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
- tile_info->offsets[i] = neighbors.tiles[i].offset;
- tile_info->strides[i] = neighbors.tiles[i].stride;
- tile_info->buffers[i] = neighbors.tiles[i].buffer;
- }
- tile_info->x[0] = neighbors.tiles[3].x;
- tile_info->x[1] = neighbors.tiles[4].x;
- tile_info->x[2] = neighbors.tiles[5].x;
- tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
- tile_info->y[0] = neighbors.tiles[1].y;
- tile_info->y[1] = neighbors.tiles[4].y;
- tile_info->y[2] = neighbors.tiles[7].y;
- tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
- tile_info_mem.copy_to_device();
-
- void *args[] = {
- &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
- launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);
- }
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- device_only_memory<float> input_rgb(this, "denoiser input rgb", true);
- input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);
-
- void *input_args[] = {&input_rgb.device_pointer,
- &input_ptr,
- &rect_size.x,
- &rect_size.y,
- &input_stride,
- &task.pass_stride,
- const_cast<int *>(pass_offset),
- &task.denoising.input_passes,
- &rtile.sample};
- launch_filter_kernel(
- "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args);
-
- input_ptr = input_rgb.device_pointer;
- pixel_stride = 3 * sizeof(float);
- input_stride = rect_size.x * pixel_stride;
-# endif
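-
-    // After this conversion the color, albedo and normal passes live back to back in
-    // input_rgb as packed 3-float planes of rect_size.x * rect_size.y pixels each,
-    // which is what the per-plane offsets in the input_layers setup below rely on.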
-
- const bool recreate_denoiser = (denoiser == NULL) ||
- (task.denoising.input_passes != denoiser_input_passes);
- if (recreate_denoiser) {
-        // Destroy the existing handle before creating a new one
- if (denoiser != NULL) {
- optixDenoiserDestroy(denoiser);
- }
-
- // Create OptiX denoiser handle on demand when it is first used
- OptixDenoiserOptions denoiser_options = {};
- assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3);
-# if OPTIX_ABI_VERSION >= 47
- denoiser_options.guideAlbedo = task.denoising.input_passes >= 2;
- denoiser_options.guideNormal = task.denoising.input_passes >= 3;
- check_result_optix_ret(optixDenoiserCreate(
- context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser));
-# else
- denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
- OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1));
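-      // This relies on the OptixDenoiserInputKind values being consecutive: 1, 2 and 3
-      // input passes map to RGB, RGB_ALBEDO and RGB_ALBEDO_NORMAL respectively.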
-# if OPTIX_ABI_VERSION < 28
- denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
-# endif
- check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
- check_result_optix_ret(
- optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));
-# endif
-
- // OptiX denoiser handle was created with the requested number of input passes
- denoiser_input_passes = task.denoising.input_passes;
- }
-
- OptixDenoiserSizes sizes = {};
- check_result_optix_ret(
- optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));
-
-# if OPTIX_ABI_VERSION < 28
- const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
-# else
- const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
-# endif
- const size_t scratch_offset = sizes.stateSizeInBytes;
-
- // Allocate denoiser state if tile size has changed since last setup
- if (recreate_denoiser || (denoiser_state.data_width != rect_size.x ||
- denoiser_state.data_height != rect_size.y)) {
- denoiser_state.alloc_to_device(scratch_offset + scratch_size);
-
- // Initialize denoiser state for the current tile size
- check_result_optix_ret(optixDenoiserSetup(denoiser,
- 0,
- rect_size.x,
- rect_size.y,
- denoiser_state.device_pointer,
- scratch_offset,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-
- denoiser_state.data_width = rect_size.x;
- denoiser_state.data_height = rect_size.y;
- }
-
- // Set up input and output layer information
- OptixImage2D input_layers[3] = {};
- OptixImage2D output_layers[1] = {};
-
- for (int i = 0; i < 3; ++i) {
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i);
-# else
- input_layers[i].data = input_ptr + pass_offset[i];
-# endif
- input_layers[i].width = rect_size.x;
- input_layers[i].height = rect_size.y;
- input_layers[i].rowStrideInBytes = input_stride;
- input_layers[i].pixelStrideInBytes = pixel_stride;
- input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3;
- }
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- output_layers[0].data = input_ptr;
- output_layers[0].width = rect_size.x;
- output_layers[0].height = rect_size.y;
- output_layers[0].rowStrideInBytes = input_stride;
- output_layers[0].pixelStrideInBytes = pixel_stride;
- int2 output_offset = overlap_offset;
- overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually
-# else
- output_layers[0].data = target_tile.buffer + pixel_offset;
- output_layers[0].width = target_tile.w;
- output_layers[0].height = target_tile.h;
- output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride;
- output_layers[0].pixelStrideInBytes = pixel_stride;
-# endif
- output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;
-
-# if OPTIX_ABI_VERSION >= 47
- OptixDenoiserLayer image_layers = {};
- image_layers.input = input_layers[0];
- image_layers.output = output_layers[0];
-
- OptixDenoiserGuideLayer guide_layers = {};
- guide_layers.albedo = input_layers[1];
- guide_layers.normal = input_layers[2];
-# endif
-
-      // Finally run denoising
- OptixDenoiserParams params = {}; // All parameters are disabled/zero
-# if OPTIX_ABI_VERSION >= 47
- check_result_optix_ret(optixDenoiserInvoke(denoiser,
- NULL,
- &params,
- denoiser_state.device_pointer,
- scratch_offset,
- &guide_layers,
- &image_layers,
- 1,
- overlap_offset.x,
- overlap_offset.y,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-# else
- check_result_optix_ret(optixDenoiserInvoke(denoiser,
- NULL,
- &params,
- denoiser_state.device_pointer,
- scratch_offset,
- input_layers,
- task.denoising.input_passes,
- overlap_offset.x,
- overlap_offset.y,
- output_layers,
- denoiser_state.device_pointer + scratch_offset,
- scratch_size));
-# endif
-
-# if OPTIX_DENOISER_NO_PIXEL_STRIDE
- void *output_args[] = {&input_ptr,
- &target_tile.buffer,
- &output_offset.x,
- &output_offset.y,
- &rect_size.x,
- &rect_size.y,
- &target_tile.x,
- &target_tile.y,
- &target_tile.w,
- &target_tile.h,
- &target_tile.offset,
- &target_tile.stride,
- &task.pass_stride,
- &rtile.sample};
- launch_filter_kernel(
- "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args);
-# endif
-
- check_result_cuda_ret(cuStreamSynchronize(0));
-
- task.unmap_neighbor_tiles(neighbors, this);
- }
- else {
- // Run CUDA denoising kernels
- DenoisingTask denoising(this, task);
- CUDADevice::denoise(rtile, denoising);
- }
-
- // Update task progress after the denoiser completed processing
- task.update_progress(&rtile, rtile.w * rtile.h);
-
- return true;
- }
-
- void launch_shader_eval(DeviceTask &task, int thread_index)
- {
- unsigned int rgen_index = PG_BACK;
- if (task.shader_eval_type >= SHADER_EVAL_BAKE)
- rgen_index = PG_BAKE;
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
- rgen_index = PG_DISP;
-
- const CUDAContextScope scope(cuContext);
-
- device_ptr launch_params_ptr = launch_params.device_pointer +
- thread_index * launch_params.data_elements;
-
- for (int sample = 0; sample < task.num_samples; ++sample) {
- ShaderParams params;
- params.input = (uint4 *)task.shader_input;
- params.output = (float4 *)task.shader_output;
- params.type = task.shader_eval_type;
- params.filter = task.shader_filter;
- params.sx = task.shader_x;
- params.offset = task.offset;
- params.sample = sample;
-
- check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
- &params,
- sizeof(params),
- cuda_stream[thread_index]));
-
- OptixShaderBindingTable sbt_params = {};
- sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
- sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
- sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
- sbt_params.missRecordCount = 1;
- sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
- sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
-# if OPTIX_ABI_VERSION >= 36
- sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
-# else
- sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
-# endif
- sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
- sbt_params.callablesRecordCount = 3;
- sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
-
- check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
- cuda_stream[thread_index],
- launch_params_ptr,
- launch_params.data_elements,
- &sbt_params,
- task.shader_w,
- 1,
- 1));
-
- check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
-
- task.update_progress(NULL);
- }
- }
-
- bool build_optix_bvh(BVHOptiX *bvh,
- OptixBuildOperation operation,
- const OptixBuildInput &build_input,
- uint16_t num_motion_steps)
- {
- /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
- * from running out of memory (since both original and compacted acceleration structure memory
- * may be allocated at the same time for the duration of this function). The builds would
- * otherwise happen on the same CUDA stream anyway. */
- static thread_mutex mutex;
- thread_scoped_lock lock(mutex);
-
- const CUDAContextScope scope(cuContext);
-
- const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
-
- // Compute memory usage
- OptixAccelBufferSizes sizes = {};
- OptixAccelBuildOptions options = {};
- options.operation = operation;
- if (use_fast_trace_bvh) {
- VLOG(2) << "Using fast to trace OptiX BVH";
- options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
- }
- else {
- VLOG(2) << "Using fast to update OptiX BVH";
- options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
- }
-
- options.motionOptions.numKeys = num_motion_steps;
- options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
- options.motionOptions.timeBegin = 0.0f;
- options.motionOptions.timeEnd = 1.0f;
-
- check_result_optix_ret(
- optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
-
- // Allocate required output buffers
- device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
- temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
- if (!temp_mem.device_pointer)
- return false; // Make sure temporary memory allocation succeeded
-
- // Acceleration structure memory has to be allocated on the device (not allowed to be on host)
- device_only_memory<char> &out_data = bvh->as_data;
- if (operation == OPTIX_BUILD_OPERATION_BUILD) {
- assert(out_data.device == this);
- out_data.alloc_to_device(sizes.outputSizeInBytes);
- if (!out_data.device_pointer)
- return false;
- }
- else {
- assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
- }
-
- // Finally build the acceleration structure
- OptixAccelEmitDesc compacted_size_prop = {};
- compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
- // A tiny space was allocated for this property at the end of the temporary buffer above
- // Make sure this pointer is 8-byte aligned
- compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
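-    // (The buffer was allocated with align_up(tempSizeInBytes, 8) + 8 extra bytes
-    // above, so an 8-byte aligned uint64 slot is guaranteed to exist past the build
-    // scratch space.)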
-
- OptixTraversableHandle out_handle = 0;
- check_result_optix_ret(optixAccelBuild(context,
- NULL,
- &options,
- &build_input,
- 1,
- temp_mem.device_pointer,
- sizes.tempSizeInBytes,
- out_data.device_pointer,
- sizes.outputSizeInBytes,
- &out_handle,
- use_fast_trace_bvh ? &compacted_size_prop : NULL,
- use_fast_trace_bvh ? 1 : 0));
- bvh->traversable_handle = static_cast<uint64_t>(out_handle);
-
- // Wait for all operations to finish
- check_result_cuda_ret(cuStreamSynchronize(NULL));
-
- // Compact acceleration structure to save memory (only if using fast trace as the
- // OPTIX_BUILD_FLAG_ALLOW_COMPACTION flag is only set in this case).
- if (use_fast_trace_bvh) {
- uint64_t compacted_size = sizes.outputSizeInBytes;
- check_result_cuda_ret(
- cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
-
- // Temporary memory is no longer needed, so free it now to make space
- temp_mem.free();
-
-      // There is no point in compacting if the size does not get smaller
- if (compacted_size < sizes.outputSizeInBytes) {
- device_only_memory<char> compacted_data(this, "optix compacted as", false);
- compacted_data.alloc_to_device(compacted_size);
- if (!compacted_data.device_pointer)
- // Do not compact if memory allocation for compacted acceleration structure fails
- // Can just use the uncompacted one then, so succeed here regardless
- return true;
-
- check_result_optix_ret(optixAccelCompact(context,
- NULL,
- out_handle,
- compacted_data.device_pointer,
- compacted_size,
- &out_handle));
- bvh->traversable_handle = static_cast<uint64_t>(out_handle);
-
- // Wait for compaction to finish
- check_result_cuda_ret(cuStreamSynchronize(NULL));
-
- std::swap(out_data.device_size, compacted_data.device_size);
- std::swap(out_data.device_pointer, compacted_data.device_pointer);
- // Original acceleration structure memory is freed when 'compacted_data' goes out of scope
- }
- }
-
- return true;
- }
-
- void build_bvh(BVH *bvh, Progress &progress, bool refit) override
- {
- if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
-      /* CUDA is used for baking, so build the appropriate BVH for that. */
- Device::build_bvh(bvh, progress, refit);
- return;
- }
-
- const bool use_fast_trace_bvh = (bvh->params.bvh_type == SceneParams::BVH_STATIC);
-
- free_bvh_memory_delayed();
-
- BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
-
- progress.set_substatus("Building OptiX acceleration structure");
-
- if (!bvh->params.top_level) {
- assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
-
- OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
-      /* Refit is only possible when NOT using the fast-to-trace BVH (because the AS is
-       * built with OPTIX_BUILD_FLAG_ALLOW_UPDATE only in that case, see above). */
- if (refit && !use_fast_trace_bvh) {
- assert(bvh_optix->traversable_handle != 0);
- operation = OPTIX_BUILD_OPERATION_UPDATE;
- }
- else {
- bvh_optix->as_data.free();
- bvh_optix->traversable_handle = 0;
- }
-
- // Build bottom level acceleration structures (BLAS)
- Geometry *const geom = bvh->geometry[0];
- if (geom->geometry_type == Geometry::HAIR) {
- // Build BLAS for curve primitives
- Hair *const hair = static_cast<Hair *const>(geom);
- if (hair->num_curves() == 0) {
- return;
- }
-
- const size_t num_segments = hair->num_segments();
-
- size_t num_motion_steps = 1;
- Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
- num_motion_steps = hair->get_motion_steps();
- }
-
- device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
-# if OPTIX_ABI_VERSION >= 36
- device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
- device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
- // Four control points for each curve segment
- const size_t num_vertices = num_segments * 4;
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- index_data.alloc(num_segments);
- vertex_data.alloc(num_vertices * num_motion_steps);
- }
- else
-# endif
- aabb_data.alloc(num_segments * num_motion_steps);
-
- // Get AABBs for each motion step
- for (size_t step = 0; step < num_motion_steps; ++step) {
- // The center step for motion vertices is not stored in the attribute
- const float3 *keys = hair->get_curve_keys().data();
- size_t center_step = (num_motion_steps - 1) / 2;
- if (step != center_step) {
- size_t attr_offset = (step > center_step) ? step - 1 : step;
- // Technically this is a float4 array, but sizeof(float3) == sizeof(float4)
- keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
- }
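-
-          // For illustration: with 3 motion steps the attribute stores only steps 0
-          // and 2, so attr_offset maps step 2 to attribute slot 1, while the center
-          // step 1 comes from the regular curve keys above.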
-
- for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
- const Hair::Curve curve = hair->get_curve(j);
-# if OPTIX_ABI_VERSION >= 36
- const array<float> &curve_radius = hair->get_curve_radius();
-# endif
-
- for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- int k0 = curve.first_key + segment;
- int k1 = k0 + 1;
- int ka = max(k0 - 1, curve.first_key);
- int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
-
- const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
- const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
- const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
- const float4 pw = make_float4(
- curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
-
-                  // Convert Catmull-Rom data to B-spline control points
- static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
- static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
- static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
- static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
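-
-                  // These rows form the Catmull-Rom to cubic B-spline change-of-basis
-                  // matrix (each row sums to 1, preserving affine combinations), since
-                  // the OptiX built-in curve primitive expects B-spline control points.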
-
- index_data[i] = i * 4;
- float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
- v[0] = make_float4(
- dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
- v[1] = make_float4(
- dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
- v[2] = make_float4(
- dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
- v[3] = make_float4(
- dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
- }
- else
-# endif
- {
- BoundBox bounds = BoundBox::empty;
- curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
-
- const size_t index = step * num_segments + i;
- aabb_data[index].minX = bounds.min.x;
- aabb_data[index].minY = bounds.min.y;
- aabb_data[index].minZ = bounds.min.z;
- aabb_data[index].maxX = bounds.max.x;
- aabb_data[index].maxY = bounds.max.y;
- aabb_data[index].maxZ = bounds.max.z;
- }
- }
- }
- }
-
- // Upload AABB data to GPU
- aabb_data.copy_to_device();
-# if OPTIX_ABI_VERSION >= 36
- index_data.copy_to_device();
- vertex_data.copy_to_device();
-# endif
-
- vector<device_ptr> aabb_ptrs;
- aabb_ptrs.reserve(num_motion_steps);
-# if OPTIX_ABI_VERSION >= 36
- vector<device_ptr> width_ptrs;
- vector<device_ptr> vertex_ptrs;
- width_ptrs.reserve(num_motion_steps);
- vertex_ptrs.reserve(num_motion_steps);
-# endif
- for (size_t step = 0; step < num_motion_steps; ++step) {
- aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
-# if OPTIX_ABI_VERSION >= 36
- const device_ptr base_ptr = vertex_data.device_pointer +
- step * num_vertices * sizeof(float4);
- width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size
- vertex_ptrs.push_back(base_ptr);
-# endif
- }
-
- // Force a single any-hit call, so shadow record-all behavior works correctly
- unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
- OptixBuildInput build_input = {};
-# if OPTIX_ABI_VERSION >= 36
- if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
- build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
- build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
- build_input.curveArray.numPrimitives = num_segments;
- build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
- build_input.curveArray.numVertices = num_vertices;
- build_input.curveArray.vertexStrideInBytes = sizeof(float4);
- build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
- build_input.curveArray.widthStrideInBytes = sizeof(float4);
- build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
- build_input.curveArray.indexStrideInBytes = sizeof(int);
- build_input.curveArray.flag = build_flags;
- build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
- }
- else
-# endif
- {
- // Disable visibility test any-hit program, since it is already checked during
- // intersection. Those trace calls that require anyhit can force it with a ray flag.
- build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
-
- build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
-# if OPTIX_ABI_VERSION < 23
- build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
- build_input.aabbArray.numPrimitives = num_segments;
- build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
- build_input.aabbArray.flags = &build_flags;
- build_input.aabbArray.numSbtRecords = 1;
- build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
-# else
- build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
- build_input.customPrimitiveArray.numPrimitives = num_segments;
- build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
- build_input.customPrimitiveArray.flags = &build_flags;
- build_input.customPrimitiveArray.numSbtRecords = 1;
- build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
-# endif
- }
-
- if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- }
- else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
- // Build BLAS for triangle primitives
- Mesh *const mesh = static_cast<Mesh *const>(geom);
- if (mesh->num_triangles() == 0) {
- return;
- }
-
- const size_t num_verts = mesh->get_verts().size();
-
- size_t num_motion_steps = 1;
- Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
- num_motion_steps = mesh->get_motion_steps();
- }
-
- device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
- index_data.alloc(mesh->get_triangles().size());
- memcpy(index_data.data(),
- mesh->get_triangles().data(),
- mesh->get_triangles().size() * sizeof(int));
- device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
- vertex_data.alloc(num_verts * num_motion_steps);
-
- for (size_t step = 0; step < num_motion_steps; ++step) {
- const float3 *verts = mesh->get_verts().data();
-
- size_t center_step = (num_motion_steps - 1) / 2;
- // The center step for motion vertices is not stored in the attribute
- if (step != center_step) {
- verts = motion_keys->data_float3() +
- (step > center_step ? step - 1 : step) * num_verts;
- }
-
- memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
- }
-
- // Upload triangle data to GPU
- index_data.copy_to_device();
- vertex_data.copy_to_device();
-
- vector<device_ptr> vertex_ptrs;
- vertex_ptrs.reserve(num_motion_steps);
- for (size_t step = 0; step < num_motion_steps; ++step) {
- vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
- }
-
- // Force a single any-hit call, so shadow record-all behavior works correctly
- unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
- OptixBuildInput build_input = {};
- build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
- build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
- build_input.triangleArray.numVertices = num_verts;
- build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
- build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
- build_input.triangleArray.indexBuffer = index_data.device_pointer;
- build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
- build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
- build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
- build_input.triangleArray.flags = &build_flags;
- // The SBT does not store per primitive data since Cycles already allocates separate
- // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
- // one and rely on that having the same meaning in this case.
- build_input.triangleArray.numSbtRecords = 1;
- build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
-
- if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- }
- }
- else {
- unsigned int num_instances = 0;
- unsigned int max_num_instances = 0xFFFFFFFF;
-
- bvh_optix->as_data.free();
- bvh_optix->traversable_handle = 0;
- bvh_optix->motion_transform_data.free();
-
- optixDeviceContextGetProperty(context,
- OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
- &max_num_instances,
- sizeof(max_num_instances));
- // Do not count first bit, which is used to distinguish instanced and non-instanced objects
- max_num_instances >>= 1;
- if (bvh->objects.size() > max_num_instances) {
- progress.set_error(
- "Failed to build OptiX acceleration structure because there are too many instances");
- return;
- }
-
- // Fill instance descriptions
-# if OPTIX_ABI_VERSION < 41
- device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY);
- aabbs.alloc(bvh->objects.size());
-# endif
- device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
- instances.alloc(bvh->objects.size());
-
- // Calculate total motion transform size and allocate memory for them
- size_t motion_transform_offset = 0;
- if (motion_blur) {
- size_t total_motion_transform_size = 0;
- for (Object *const ob : bvh->objects) {
- if (ob->is_traceable() && ob->use_motion()) {
- total_motion_transform_size = align_up(total_motion_transform_size,
- OPTIX_TRANSFORM_BYTE_ALIGNMENT);
- const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
- total_motion_transform_size = total_motion_transform_size +
- sizeof(OptixSRTMotionTransform) +
- motion_keys * sizeof(OptixSRTData);
- }
- }
-
- assert(bvh_optix->motion_transform_data.device == this);
- bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
- }
-
- for (Object *ob : bvh->objects) {
- // Skip non-traceable objects
- if (!ob->is_traceable())
- continue;
-
- BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
- OptixTraversableHandle handle = blas->traversable_handle;
-
-# if OPTIX_ABI_VERSION < 41
- OptixAabb &aabb = aabbs[num_instances];
- aabb.minX = ob->bounds.min.x;
- aabb.minY = ob->bounds.min.y;
- aabb.minZ = ob->bounds.min.z;
- aabb.maxX = ob->bounds.max.x;
- aabb.maxY = ob->bounds.max.y;
- aabb.maxZ = ob->bounds.max.z;
-# endif
-
- OptixInstance &instance = instances[num_instances++];
- memset(&instance, 0, sizeof(instance));
-
- // Clear transform to identity matrix
- instance.transform[0] = 1.0f;
- instance.transform[5] = 1.0f;
- instance.transform[10] = 1.0f;
-
- // Set user instance ID to object index (but leave low bit blank)
- instance.instanceId = ob->get_device_index() << 1;
-
- // Have to have at least one bit in the mask, or else instance would always be culled
- instance.visibilityMask = 1;
-
- if (ob->get_geometry()->has_volume) {
- // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
- instance.visibilityMask |= 2;
- }
-
- if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
- // Same applies to curves (so they can be skipped in local trace calls)
- instance.visibilityMask |= 4;
-
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur && ob->get_geometry()->has_motion_blur() &&
- DebugFlags().optix.curves_api &&
- static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
- // Select between motion blur and non-motion blur built-in intersection module
- instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
- }
-# endif
- }
-
- // Insert motion traversable if object has motion
- if (motion_blur && ob->use_motion()) {
- size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
- size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
- motion_keys * sizeof(OptixSRTData);
-
- const CUDAContextScope scope(cuContext);
-
- motion_transform_offset = align_up(motion_transform_offset,
- OPTIX_TRANSFORM_BYTE_ALIGNMENT);
- CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
- motion_transform_offset;
- motion_transform_offset += motion_transform_size;
-
- // Allocate host side memory for motion transform and fill it with transform data
- OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
- new uint8_t[motion_transform_size]);
- motion_transform.child = handle;
- motion_transform.motionOptions.numKeys = ob->get_motion().size();
- motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
- motion_transform.motionOptions.timeBegin = 0.0f;
- motion_transform.motionOptions.timeEnd = 1.0f;
-
- OptixSRTData *const srt_data = motion_transform.srtData;
- array<DecomposedTransform> decomp(ob->get_motion().size());
- transform_motion_decompose(
- decomp.data(), ob->get_motion().data(), ob->get_motion().size());
-
- for (size_t i = 0; i < ob->get_motion().size(); ++i) {
- // Scale
- srt_data[i].sx = decomp[i].y.w; // scale.x.x
- srt_data[i].sy = decomp[i].z.w; // scale.y.y
- srt_data[i].sz = decomp[i].w.w; // scale.z.z
-
- // Shear
- srt_data[i].a = decomp[i].z.x; // scale.x.y
- srt_data[i].b = decomp[i].z.y; // scale.x.z
- srt_data[i].c = decomp[i].w.x; // scale.y.z
- assert(decomp[i].z.z == 0.0f); // scale.y.x
- assert(decomp[i].w.y == 0.0f); // scale.z.x
- assert(decomp[i].w.z == 0.0f); // scale.z.y
-
- // Pivot point
- srt_data[i].pvx = 0.0f;
- srt_data[i].pvy = 0.0f;
- srt_data[i].pvz = 0.0f;
-
- // Rotation
- srt_data[i].qx = decomp[i].x.x;
- srt_data[i].qy = decomp[i].x.y;
- srt_data[i].qz = decomp[i].x.z;
- srt_data[i].qw = decomp[i].x.w;
-
- // Translation
- srt_data[i].tx = decomp[i].y.x;
- srt_data[i].ty = decomp[i].y.y;
- srt_data[i].tz = decomp[i].y.z;
- }
-
- // Upload motion transform to GPU
- cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
- delete[] reinterpret_cast<uint8_t *>(&motion_transform);
-
- // Disable instance transform if object uses motion transform already
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
-
- // Get traversable handle to motion transform
- optixConvertPointerToTraversableHandle(context,
- motion_transform_gpu,
- OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
- &instance.traversableHandle);
- }
- else {
- instance.traversableHandle = handle;
-
- if (ob->get_geometry()->is_instanced()) {
- // Set transform matrix
- memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
- }
- else {
- // Disable instance transform if geometry already has it applied to vertex data
- instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
- // Non-instanced objects read ID from 'prim_object', so distinguish
- // them from instanced objects with the low bit set
- instance.instanceId |= 1;
- }
- }
- }
-
- // Upload instance descriptions
-# if OPTIX_ABI_VERSION < 41
- aabbs.resize(num_instances);
- aabbs.copy_to_device();
-# endif
- instances.resize(num_instances);
- instances.copy_to_device();
-
- // Build top-level acceleration structure (TLAS)
- OptixBuildInput build_input = {};
- build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
-# if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2
- build_input.instanceArray.aabbs = aabbs.device_pointer;
- build_input.instanceArray.numAabbs = num_instances;
-# endif
- build_input.instanceArray.instances = instances.device_pointer;
- build_input.instanceArray.numInstances = num_instances;
-
- if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
- progress.set_error("Failed to build OptiX acceleration structure");
- }
- tlas_handle = bvh_optix->traversable_handle;
- }
- }
-
- void release_optix_bvh(BVH *bvh) override
- {
- thread_scoped_lock lock(delayed_free_bvh_mutex);
- /* Do delayed free of BVH memory, since geometry holding BVH might be deleted
- * while GPU is still rendering. */
- BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
-
- delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
- delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
- bvh_optix->traversable_handle = 0;
- }
-
- void free_bvh_memory_delayed()
- {
- thread_scoped_lock lock(delayed_free_bvh_mutex);
- delayed_free_bvh_memory.free_memory();
- }
-
- void const_copy_to(const char *name, void *host, size_t size) override
- {
- // Set constant memory for CUDA module
- // TODO(pmours): This is only used for tonemapping (see 'film_convert').
- // Could be removed by moving those functions to filter CUDA module.
- CUDADevice::const_copy_to(name, host, size);
-
- if (strcmp(name, "__data") == 0) {
- assert(size <= sizeof(KernelData));
-
- // Update traversable handle (since it is different for each device on multi devices)
- KernelData *const data = (KernelData *)host;
- *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
-
- update_launch_params(offsetof(KernelParams, data), host, size);
- return;
- }
-
- // Update data storage pointers in launch parameters
-# define KERNEL_TEX(data_type, tex_name) \
- if (strcmp(name, #tex_name) == 0) { \
- update_launch_params(offsetof(KernelParams, tex_name), host, size); \
- return; \
- }
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
- }
-
- void update_launch_params(size_t offset, void *data, size_t data_size)
- {
- const CUDAContextScope scope(cuContext);
-
- for (int i = 0; i < info.cpu_threads; ++i)
- check_result_cuda(
- cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
- data,
- data_size));
- }
-
- void task_add(DeviceTask &task) override
- {
- // Upload texture information to device if it has changed since last launch
- load_texture_info();
-
- if (task.type == DeviceTask::FILM_CONVERT) {
- // Execute in main thread because of OpenGL access
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- return;
- }
-
- if (task.type == DeviceTask::DENOISE_BUFFER) {
- // Execute denoising in a single thread (e.g. to avoid race conditions during creation)
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy, 0);
- });
- return;
- }
-
- // Split task into smaller ones
- list<DeviceTask> tasks;
- task.split(tasks, info.cpu_threads);
-
- // Queue tasks in internal task pool
- int task_index = 0;
- for (DeviceTask &task : tasks) {
- task_pool.push([=] {
- // Using task index parameter instead of thread index, since number of CUDA streams may
- // differ from number of threads
- DeviceTask task_copy = task;
- thread_run(task_copy, task_index);
- });
- task_index++;
- }
- }
-
- void task_wait() override
- {
- // Wait for all queued tasks to finish
- task_pool.wait_work();
- }
-
- void task_cancel() override
- {
- // Cancel any remaining tasks in the internal pool
- task_pool.cancel();
- }
-};
-
-bool device_optix_init()
-{
- if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
- return true; // Already initialized function table
-
- // Need to initialize CUDA as well
- if (!device_cuda_init())
- return false;
-
- const OptixResult result = optixInit();
-
- if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
- VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
- "Please update to the latest driver first!";
- return false;
- }
- else if (result != OPTIX_SUCCESS) {
- VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
- return false;
- }
-
- // Loaded OptiX successfully!
- return true;
-}
-
-void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
-{
- devices.reserve(cuda_devices.size());
-
- // Simply add all supported CUDA devices as OptiX devices again
- for (DeviceInfo info : cuda_devices) {
- assert(info.type == DEVICE_CUDA);
-
- int major;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
- if (major < 5) {
- continue; // Only Maxwell and up are supported by OptiX
- }
-
- info.type = DEVICE_OPTIX;
- info.id += "_OptiX";
- info.denoisers |= DENOISER_OPTIX;
- info.has_branched_path = false;
-
- devices.push_back(info);
- }
-}
-
-Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
-{
- return new OptiXDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
-
-#endif
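
The instance ID packing used in the TLAS build above is worth restating on its own: the object index is shifted into the high bits, the low bit marks non-instanced objects (whose transform is already baked into the vertex data), which is also why the driver-reported instance ID limit is halved before the overflow check. A minimal standalone sketch of that encoding (helper names are illustrative, not from the OptiX API):

#include <cstdint>

// Pack an object index into an OptiX instance ID; the low bit flags
// non-instanced objects so hit programs can tell the two cases apart.
static uint32_t encode_instance_id(uint32_t object_index, bool is_instanced)
{
  return (object_index << 1) | (is_instanced ? 0u : 1u);
}

// Decode: recover the object index and whether the transform was baked in.
static void decode_instance_id(uint32_t id, uint32_t &object_index, bool &is_instanced)
{
  object_index = id >> 1;
  is_instanced = (id & 1u) == 0u;
}
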
diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp
new file mode 100644
index 00000000000..a89ba68d62c
--- /dev/null
+++ b/intern/cycles/device/device_queue.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device_queue.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_logging.h"
+#include "util/util_time.h"
+
+#include <iomanip>
+
+CCL_NAMESPACE_BEGIN
+
+DeviceQueue::DeviceQueue(Device *device)
+ : device(device), last_kernels_enqueued_(0), last_sync_time_(0.0)
+{
+ DCHECK_NE(device, nullptr);
+}
+
+DeviceQueue::~DeviceQueue()
+{
+ if (VLOG_IS_ON(3)) {
+ /* Print kernel execution times sorted by time. */
+ vector<pair<DeviceKernelMask, double>> stats_sorted;
+ for (const auto &stat : stats_kernel_time_) {
+ stats_sorted.push_back(stat);
+ }
+
+ sort(stats_sorted.begin(),
+ stats_sorted.end(),
+ [](const pair<DeviceKernelMask, double> &a, const pair<DeviceKernelMask, double> &b) {
+ return a.second > b.second;
+ });
+
+ VLOG(3) << "GPU queue stats:";
+ for (const auto &[mask, time] : stats_sorted) {
+ VLOG(3) << " " << std::setfill(' ') << std::setw(10) << std::fixed << std::setprecision(5)
+ << std::right << time << "s: " << device_kernel_mask_as_string(mask);
+ }
+ }
+}
+
+void DeviceQueue::debug_init_execution()
+{
+ if (VLOG_IS_ON(3)) {
+ last_sync_time_ = time_dt();
+ last_kernels_enqueued_ = 0;
+ }
+}
+
+void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
+{
+ if (VLOG_IS_ON(3)) {
+ VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
+ << work_size;
+ last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
+ }
+}
+
+void DeviceQueue::debug_synchronize()
+{
+ if (VLOG_IS_ON(3)) {
+ const double new_time = time_dt();
+ const double elapsed_time = new_time - last_sync_time_;
+ VLOG(4) << "GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time << "s";
+
+ stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
+
+ last_sync_time_ = new_time;
+ last_kernels_enqueued_ = 0;
+ }
+}
+
+CCL_NAMESPACE_END
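
The statistics printed by the destructor are keyed by the set of kernels enqueued between two synchronization points, encoded as a bitmask with one bit per DeviceKernel value. A small self-contained sketch of that accumulation scheme, assuming nothing beyond the standard library:

#include <algorithm>
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

int main()
{
  using KernelMask = uint64_t;             // stand-in for DeviceKernelMask
  std::map<KernelMask, double> stats;      // kernel combination -> seconds
  KernelMask enqueued = 0;

  enqueued |= KernelMask(1) << 3;          // debug_enqueue(kernel 3)
  enqueued |= KernelMask(1) << 7;          // debug_enqueue(kernel 7)
  stats[enqueued] += 0.042;                // debug_synchronize(): add elapsed time
  enqueued = 0;                            // mask resets after each synchronize

  // Sort combinations by accumulated time, most expensive first,
  // mirroring the logging loop in ~DeviceQueue().
  std::vector<std::pair<KernelMask, double>> sorted(stats.begin(), stats.end());
  std::sort(sorted.begin(), sorted.end(),
            [](const auto &a, const auto &b) { return a.second > b.second; });
  return 0;
}
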
diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h
new file mode 100644
index 00000000000..edda3e61d51
--- /dev/null
+++ b/intern/cycles/device/device_queue.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_kernel.h"
+
+#include "device/device_graphics_interop.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class device_memory;
+
+struct KernelWorkTile;
+
+/* Abstraction of a command queue for a device.
+ * Provides API to schedule kernel execution in a specific queue with minimal possible overhead
+ * from driver side.
+ *
+ * This class encapsulates all properties needed for commands execution. */
+class DeviceQueue {
+ public:
+ virtual ~DeviceQueue();
+
+ /* Number of concurrent states to process for integrator,
+ * based on number of cores and/or available memory. */
+ virtual int num_concurrent_states(const size_t state_size) const = 0;
+
+ /* Number of states which keeps the device occupied with work without losing performance.
+ * The renderer will add more work (when available) when the number of active paths falls below this
+ * value. */
+ virtual int num_concurrent_busy_states() const = 0;
+
+ /* Initialize execution of kernels on this queue.
+ *
+ * Will, for example, load all data required by the kernels from Device to global or path state.
+ *
+ * Use this method after device synchronization has finished before enqueueing any kernels. */
+ virtual void init_execution() = 0;
+
+ /* Test if an optional device kernel is available. */
+ virtual bool kernel_available(DeviceKernel kernel) const = 0;
+
+ /* Enqueue kernel execution.
+ *
+ * Execute the kernel work_size times on the device.
+ * Supported argument types:
+ * - int: pass pointer to the int
+ * - device memory: pass pointer to device_memory.device_pointer
+ * Return false if there was an error executing this or a previous kernel. */
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) = 0;
+
+ /* Wait until all enqueued kernels have finished execution.
+ * Return false if there was an error executing any of the enqueued kernels. */
+ virtual bool synchronize() = 0;
+
+ /* Copy memory to/from device as part of the command queue, to ensure
+ * operations are done in order without having to synchronize. */
+ virtual void zero_to_device(device_memory &mem) = 0;
+ virtual void copy_to_device(device_memory &mem) = 0;
+ virtual void copy_from_device(device_memory &mem) = 0;
+
+ /* Graphics resources interoperability.
+ *
+ * Interoperability here means that the device is capable of computing its result
+ * directly into an OpenGL (or other graphics library) buffer. */
+
+ /* Create a graphics interoperability context which will take care of mapping a graphics
+ * resource as a buffer writable by kernels of this device. */
+ virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create()
+ {
+ LOG(FATAL) << "Requested GPU interop on a device which does not support it.";
+ return nullptr;
+ }
+
+ /* Device this queue has been created for. */
+ Device *device;
+
+ protected:
+ /* Hide construction so that allocation via `Device` API is enforced. */
+ explicit DeviceQueue(Device *device);
+
+ /* Implementations call these from the corresponding methods to generate debugging logs. */
+ void debug_init_execution();
+ void debug_enqueue(DeviceKernel kernel, const int work_size);
+ void debug_synchronize();
+
+ /* Combination of kernels enqueued together since the last synchronize. */
+ DeviceKernelMask last_kernels_enqueued_;
+ /* Time of synchronize call. */
+ double last_sync_time_;
+ /* Accumulated execution time for combinations of kernels launched together. */
+ map<DeviceKernelMask, double> stats_kernel_time_;
+};
+
+CCL_NAMESPACE_END
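
For orientation, a hypothetical caller of this interface might look as follows; the driver function is made up for illustration, but the argument packing (pointers to ints and to device_memory::device_pointer) and the enqueue/synchronize error contract follow the comments in the header above:

// Hypothetical driver sketch for a concrete DeviceQueue subclass.
bool run_kernel_batch(DeviceQueue *queue,
                      DeviceKernel kernel,
                      int work_size,
                      device_memory &state)
{
  queue->init_execution();

  // Arguments are passed as an array of pointers, per the enqueue() contract.
  void *args[] = {&work_size, &state.device_pointer};
  if (!queue->enqueue(kernel, work_size, args)) {
    return false;  // this or an earlier kernel failed
  }

  // Wait for completion; false signals an error in any enqueued kernel.
  return queue->synchronize();
}
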
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
deleted file mode 100644
index 9889f688aaa..00000000000
--- a/intern/cycles/device/device_split_kernel.cpp
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "device/device_split_kernel.h"
-
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data_types.h"
-
-#include "util/util_logging.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-static const double alpha = 0.1; /* alpha for rolling average */
-
-DeviceSplitKernel::DeviceSplitKernel(Device *device)
- : device(device),
- split_data(device, "split_data"),
- ray_state(device, "ray_state", MEM_READ_WRITE),
- queue_index(device, "queue_index"),
- use_queues_flag(device, "use_queues_flag"),
- work_pool_wgs(device, "work_pool_wgs"),
- kernel_data_initialized(false)
-{
- avg_time_per_sample = 0.0;
-
- kernel_path_init = NULL;
- kernel_scene_intersect = NULL;
- kernel_lamp_emission = NULL;
- kernel_do_volume = NULL;
- kernel_queue_enqueue = NULL;
- kernel_indirect_background = NULL;
- kernel_shader_setup = NULL;
- kernel_shader_sort = NULL;
- kernel_shader_eval = NULL;
- kernel_holdout_emission_blurring_pathtermination_ao = NULL;
- kernel_subsurface_scatter = NULL;
- kernel_direct_lighting = NULL;
- kernel_shadow_blocked_ao = NULL;
- kernel_shadow_blocked_dl = NULL;
- kernel_enqueue_inactive = NULL;
- kernel_next_iteration_setup = NULL;
- kernel_indirect_subsurface = NULL;
- kernel_buffer_update = NULL;
- kernel_adaptive_stopping = NULL;
- kernel_adaptive_filter_x = NULL;
- kernel_adaptive_filter_y = NULL;
- kernel_adaptive_adjust_samples = NULL;
-}
-
-DeviceSplitKernel::~DeviceSplitKernel()
-{
- split_data.free();
- ray_state.free();
- use_queues_flag.free();
- queue_index.free();
- work_pool_wgs.free();
-
- delete kernel_path_init;
- delete kernel_scene_intersect;
- delete kernel_lamp_emission;
- delete kernel_do_volume;
- delete kernel_queue_enqueue;
- delete kernel_indirect_background;
- delete kernel_shader_setup;
- delete kernel_shader_sort;
- delete kernel_shader_eval;
- delete kernel_holdout_emission_blurring_pathtermination_ao;
- delete kernel_subsurface_scatter;
- delete kernel_direct_lighting;
- delete kernel_shadow_blocked_ao;
- delete kernel_shadow_blocked_dl;
- delete kernel_enqueue_inactive;
- delete kernel_next_iteration_setup;
- delete kernel_indirect_subsurface;
- delete kernel_buffer_update;
- delete kernel_adaptive_stopping;
- delete kernel_adaptive_filter_x;
- delete kernel_adaptive_filter_y;
- delete kernel_adaptive_adjust_samples;
-}
-
-bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
-#define LOAD_KERNEL(name) \
- kernel_##name = get_split_kernel_function(#name, requested_features); \
- if (!kernel_##name) { \
- device->set_error(string("Split kernel error: failed to load kernel_") + #name); \
- return false; \
- }
-
- LOAD_KERNEL(path_init);
- LOAD_KERNEL(scene_intersect);
- LOAD_KERNEL(lamp_emission);
- if (requested_features.use_volume) {
- LOAD_KERNEL(do_volume);
- }
- LOAD_KERNEL(queue_enqueue);
- LOAD_KERNEL(indirect_background);
- LOAD_KERNEL(shader_setup);
- LOAD_KERNEL(shader_sort);
- LOAD_KERNEL(shader_eval);
- LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
- LOAD_KERNEL(subsurface_scatter);
- LOAD_KERNEL(direct_lighting);
- LOAD_KERNEL(shadow_blocked_ao);
- LOAD_KERNEL(shadow_blocked_dl);
- LOAD_KERNEL(enqueue_inactive);
- LOAD_KERNEL(next_iteration_setup);
- LOAD_KERNEL(indirect_subsurface);
- LOAD_KERNEL(buffer_update);
- LOAD_KERNEL(adaptive_stopping);
- LOAD_KERNEL(adaptive_filter_x);
- LOAD_KERNEL(adaptive_filter_y);
- LOAD_KERNEL(adaptive_adjust_samples);
-
-#undef LOAD_KERNEL
-
- /* Re-initialize kernel-dependent data when kernels change. */
- kernel_data_initialized = false;
-
- return true;
-}
-
-size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg,
- device_memory &data,
- uint64_t max_buffer_size)
-{
- uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
- VLOG(1) << "Split state element size: " << string_human_readable_number(size_per_element)
- << " bytes. (" << string_human_readable_size(size_per_element) << ").";
- return max_buffer_size / size_per_element;
-}
-
-bool DeviceSplitKernel::path_trace(DeviceTask &task,
- RenderTile &tile,
- device_memory &kgbuffer,
- device_memory &kernel_data)
-{
- if (device->have_error()) {
- return false;
- }
-
- /* Allocate all required global memory once. */
- if (!kernel_data_initialized) {
- kernel_data_initialized = true;
-
- /* Set local size */
- int2 lsize = split_kernel_local_size();
- local_size[0] = lsize[0];
- local_size[1] = lsize[1];
-
- /* Set global size */
- int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
-
- /* Make sure that the global work size is a multiple of the local
- * work size dimensions.
- */
- global_size[0] = round_up(gsize[0], local_size[0]);
- global_size[1] = round_up(gsize[1], local_size[1]);
-
- int num_global_elements = global_size[0] * global_size[1];
- assert(num_global_elements % WORK_POOL_SIZE == 0);
-
- /* Calculate max groups */
-
- /* Denotes the maximum number of work groups possible w.r.t. the currently requested tile size. */
- unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU :
- WORK_POOL_SIZE_GPU;
- unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
-
- /* Allocate work_pool_wgs memory. */
- work_pool_wgs.alloc_to_device(max_work_groups);
- queue_index.alloc_to_device(NUM_QUEUES);
- use_queues_flag.alloc_to_device(1);
- split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
- ray_state.alloc(num_global_elements);
- }
-
- /* Number of elements in the global state buffer */
- int num_global_elements = global_size[0] * global_size[1];
-
-#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
- if (device->have_error()) { \
- return false; \
- } \
- if (!kernel_##name->enqueue( \
- KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
- return false; \
- }
-
- tile.sample = tile.start_sample;
-
- /* for exponential increase between tile updates */
- int time_multiplier = 1;
-
- while (tile.sample < tile.start_sample + tile.num_samples) {
- /* to keep track of how long it takes to run a number of samples */
- double start_time = time_dt();
-
- /* initial guess to start rolling average */
- const int initial_num_samples = 1;
- /* approx number of samples per second */
- const int samples_per_second = (avg_time_per_sample > 0.0) ?
- int(double(time_multiplier) / avg_time_per_sample) + 1 :
- initial_num_samples;
-
- RenderTile subtile = tile;
- subtile.start_sample = tile.sample;
- subtile.num_samples = samples_per_second;
-
- if (task.adaptive_sampling.use) {
- subtile.num_samples = task.adaptive_sampling.align_samples(subtile.start_sample,
- subtile.num_samples);
- }
-
- /* Don't go beyond requested number of samples. */
- subtile.num_samples = min(subtile.num_samples,
- tile.start_sample + tile.num_samples - tile.sample);
-
- if (device->have_error()) {
- return false;
- }
-
- /* Reset state memory here, as the global size for the data_init
- * kernel might not be large enough to do it in the kernel.
- */
- work_pool_wgs.zero_to_device();
- split_data.zero_to_device();
- ray_state.zero_to_device();
-
- if (!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
- subtile,
- num_global_elements,
- kgbuffer,
- kernel_data,
- split_data,
- ray_state,
- queue_index,
- use_queues_flag,
- work_pool_wgs)) {
- return false;
- }
-
- ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
-
- bool activeRaysAvailable = true;
- double cancel_time = DBL_MAX;
-
- while (activeRaysAvailable) {
- /* Do path-iteration in host [Enqueue path-iteration kernels]. */
- for (int PathIter = 0; PathIter < 16; PathIter++) {
- ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
- if (kernel_do_volume) {
- ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
- }
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(
- holdout_emission_blurring_pathtermination_ao, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
-
- if (task.get_cancel() && cancel_time == DBL_MAX) {
- /* Wait up to twice as many seconds for current samples to finish
- * to avoid artifacts in render result from ending too soon.
- */
- cancel_time = time_dt() + 2.0 * time_multiplier;
- }
-
- if (time_dt() > cancel_time) {
- return true;
- }
- }
-
- /* Decide if we should exit path-iteration in host. */
- ray_state.copy_from_device(0, global_size[0] * global_size[1], 1);
-
- activeRaysAvailable = false;
-
- for (int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
- if (!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) {
- if (IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) {
- /* Something went wrong, abort to avoid looping endlessly. */
- device->set_error("Split kernel error: invalid ray state");
- return false;
- }
-
- /* Not all rays are RAY_INACTIVE. */
- activeRaysAvailable = true;
- break;
- }
- }
-
- if (time_dt() > cancel_time) {
- return true;
- }
- }
-
- int filter_sample = tile.sample + subtile.num_samples - 1;
- if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
- size_t buffer_size[2];
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(tile.h, local_size[1]);
- kernel_adaptive_stopping->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- buffer_size[0] = round_up(tile.h, local_size[0]);
- buffer_size[1] = round_up(1, local_size[1]);
- kernel_adaptive_filter_x->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(1, local_size[1]);
- kernel_adaptive_filter_y->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- }
-
- double time_per_sample = ((time_dt() - start_time) / subtile.num_samples);
-
- if (avg_time_per_sample == 0.0) {
- /* start rolling average */
- avg_time_per_sample = time_per_sample;
- }
- else {
- avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample;
- }
-
-#undef ENQUEUE_SPLIT_KERNEL
-
- tile.sample += subtile.num_samples;
- task.update_progress(&tile, tile.w * tile.h * subtile.num_samples);
-
- time_multiplier = min(time_multiplier << 1, 10);
-
- if (task.get_cancel()) {
- return true;
- }
- }
-
- if (task.adaptive_sampling.use) {
- /* Reset the start samples. */
- RenderTile subtile = tile;
- subtile.start_sample = tile.start_sample;
- subtile.num_samples = tile.sample - tile.start_sample;
- enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
- subtile,
- num_global_elements,
- kgbuffer,
- kernel_data,
- split_data,
- ray_state,
- queue_index,
- use_queues_flag,
- work_pool_wgs);
- size_t buffer_size[2];
- buffer_size[0] = round_up(tile.w, local_size[0]);
- buffer_size[1] = round_up(tile.h, local_size[1]);
- kernel_adaptive_adjust_samples->enqueue(
- KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
- }
-
- return true;
-}
-
-CCL_NAMESPACE_END
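
The sample scheduling above relies on an exponential moving average with alpha = 0.1: each batch's measured time-per-sample nudges the running estimate, and the next batch size is chosen so that roughly time_multiplier seconds of work are enqueued. A worked numeric example of that update rule, with illustrative timings:

#include <cstdio>

int main()
{
  const double alpha = 0.1;  // same smoothing factor as the rolling average above
  double avg = 0.0;          // avg_time_per_sample, starts unset

  // Measured per-sample times for four successive batches (made-up values).
  const double measured[] = {0.20, 0.10, 0.12, 0.11};
  for (double t : measured) {
    avg = (avg == 0.0) ? t : alpha * t + (1.0 - alpha) * avg;
  }
  // Sequence: 0.2 -> 0.19 -> 0.183 -> 0.1757 (history dominates, spikes damped).

  const int time_multiplier = 2;  // target roughly two seconds per batch
  const int samples_per_second = int(double(time_multiplier) / avg) + 1;
  std::printf("avg %.4f s/sample, next batch %d samples\n", avg, samples_per_second);
  return 0;
}
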
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
deleted file mode 100644
index 07a21b10299..00000000000
--- a/intern/cycles/device/device_split_kernel.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_SPLIT_KERNEL_H__
-#define __DEVICE_SPLIT_KERNEL_H__
-
-#include "device/device.h"
-#include "render/buffers.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* When allocating global memory in chunks, we may not be able to
- * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks,
- * since some bytes may be needed for aligning chunks of memory.
- * This is the amount of memory that we dedicate for that purpose.
- */
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 // 5MB
-
-/* Types used for split kernel */
-
-class KernelDimensions {
- public:
- size_t global_size[2];
- size_t local_size[2];
-
- KernelDimensions(size_t global_size_[2], size_t local_size_[2])
- {
- memcpy(global_size, global_size_, sizeof(global_size));
- memcpy(local_size, local_size_, sizeof(local_size));
- }
-};
-
-class SplitKernelFunction {
- public:
- virtual ~SplitKernelFunction()
- {
- }
-
- /* enqueue the kernel, returns false if there is an error */
- virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) = 0;
-};
-
-class DeviceSplitKernel {
- private:
- Device *device;
-
- SplitKernelFunction *kernel_path_init;
- SplitKernelFunction *kernel_scene_intersect;
- SplitKernelFunction *kernel_lamp_emission;
- SplitKernelFunction *kernel_do_volume;
- SplitKernelFunction *kernel_queue_enqueue;
- SplitKernelFunction *kernel_indirect_background;
- SplitKernelFunction *kernel_shader_setup;
- SplitKernelFunction *kernel_shader_sort;
- SplitKernelFunction *kernel_shader_eval;
- SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
- SplitKernelFunction *kernel_subsurface_scatter;
- SplitKernelFunction *kernel_direct_lighting;
- SplitKernelFunction *kernel_shadow_blocked_ao;
- SplitKernelFunction *kernel_shadow_blocked_dl;
- SplitKernelFunction *kernel_enqueue_inactive;
- SplitKernelFunction *kernel_next_iteration_setup;
- SplitKernelFunction *kernel_indirect_subsurface;
- SplitKernelFunction *kernel_buffer_update;
- SplitKernelFunction *kernel_adaptive_stopping;
- SplitKernelFunction *kernel_adaptive_filter_x;
- SplitKernelFunction *kernel_adaptive_filter_y;
- SplitKernelFunction *kernel_adaptive_adjust_samples;
-
- /* Global memory variables [porting]; This memory is used for
- * co-operation between different kernels; data written by one
- * kernel will be available to another kernel via this global
- * memory.
- */
- device_only_memory<uchar> split_data;
- device_vector<uchar> ray_state;
- device_only_memory<int>
- queue_index; /* Array of size num_queues that tracks the size of each queue. */
-
- /* Flag to make the scene_intersect and lamp_emission kernels use queues. */
- device_only_memory<char> use_queues_flag;
-
- /* Approximate time it takes to complete one sample */
- double avg_time_per_sample;
-
- /* Work pool with respect to each work group. */
- device_only_memory<unsigned int> work_pool_wgs;
-
- /* Cached kernel-dependent data, initialized once. */
- bool kernel_data_initialized;
- size_t local_size[2];
- size_t global_size[2];
-
- public:
- explicit DeviceSplitKernel(Device *device);
- virtual ~DeviceSplitKernel();
-
- bool load_kernels(const DeviceRequestedFeatures &requested_features);
- bool path_trace(DeviceTask &task,
- RenderTile &rtile,
- device_memory &kgbuffer,
- device_memory &kernel_data);
-
- virtual uint64_t state_buffer_size(device_memory &kg,
- device_memory &data,
- size_t num_threads) = 0;
- size_t max_elements_for_max_buffer_size(device_memory &kg,
- device_memory &data,
- uint64_t max_buffer_size);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data_,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs) = 0;
-
- virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
- const DeviceRequestedFeatures &) = 0;
- virtual int2 split_kernel_local_size() = 0;
- virtual int2 split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask &task) = 0;
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_SPLIT_KERNEL_H__ */
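
max_elements_for_max_buffer_size() declared above derives a per-element cost by probing state_buffer_size() for 1024 threads and dividing, then fits as many elements as the budget allows. The arithmetic in isolation, with a made-up probe size for illustration:

#include <cstdint>
#include <cstdio>

int main()
{
  // Cost of 1024 state elements (hypothetical figure), as obtained by the
  // implementation via state_buffer_size(kg, data, 1024).
  const uint64_t probe_size = 96ull * 1024 * 1024;  // 96 MiB for 1024 elements
  const uint64_t size_per_element = probe_size / 1024;

  const uint64_t max_buffer_size = 2ull * 1024 * 1024 * 1024;  // 2 GiB budget
  std::printf("state element: %llu bytes, max elements: %llu\n",
              (unsigned long long)size_per_element,
              (unsigned long long)(max_buffer_size / size_per_element));
  return 0;
}
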
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
deleted file mode 100644
index 55fbaa31e42..00000000000
--- a/intern/cycles/device/device_task.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "device/device_task.h"
-
-#include "render/buffers.h"
-
-#include "util/util_algorithm.h"
-#include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Device Task */
-
-DeviceTask::DeviceTask(Type type_)
- : type(type_),
- x(0),
- y(0),
- w(0),
- h(0),
- rgba_byte(0),
- rgba_half(0),
- buffer(0),
- sample(0),
- num_samples(1),
- shader_input(0),
- shader_output(0),
- shader_eval_type(0),
- shader_filter(0),
- shader_x(0),
- shader_w(0),
- buffers(nullptr),
- tile_types(0),
- denoising_from_render(false),
- pass_stride(0),
- frame_stride(0),
- target_pass_stride(0),
- pass_denoising_data(0),
- pass_denoising_clean(0),
- need_finish_queue(false),
- integrator_branched(false)
-{
- last_update_time = time_dt();
-}
-
-int DeviceTask::get_subtask_count(int num, int max_size) const
-{
- if (max_size != 0) {
- int max_size_num;
-
- if (type == SHADER) {
- max_size_num = (shader_w + max_size - 1) / max_size;
- }
- else {
- max_size = max(1, max_size / w);
- max_size_num = (h + max_size - 1) / max_size;
- }
-
- num = max(max_size_num, num);
- }
-
- if (type == SHADER) {
- num = min(shader_w, num);
- }
- else if (type == RENDER) {
- }
- else {
- num = min(h, num);
- }
-
- return num;
-}
-
-void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) const
-{
- num = get_subtask_count(num, max_size);
-
- if (type == SHADER) {
- for (int i = 0; i < num; i++) {
- int tx = shader_x + (shader_w / num) * i;
- int tw = (i == num - 1) ? shader_w - i * (shader_w / num) : shader_w / num;
-
- DeviceTask task = *this;
-
- task.shader_x = tx;
- task.shader_w = tw;
-
- tasks.push_back(task);
- }
- }
- else if (type == RENDER) {
- for (int i = 0; i < num; i++)
- tasks.push_back(*this);
- }
- else {
- for (int i = 0; i < num; i++) {
- int ty = y + (h / num) * i;
- int th = (i == num - 1) ? h - i * (h / num) : h / num;
-
- DeviceTask task = *this;
-
- task.y = ty;
- task.h = th;
-
- tasks.push_back(task);
- }
- }
-}
-
-void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
-{
- if (type == FILM_CONVERT)
- return;
-
- if (update_progress_sample) {
- if (pixel_samples == -1) {
- pixel_samples = shader_w;
- }
- update_progress_sample(pixel_samples, rtile ? rtile->sample : 0);
- }
-
- if (update_tile_sample) {
- double current_time = time_dt();
-
- if (current_time - last_update_time >= 1.0) {
- update_tile_sample(*rtile);
-
- last_update_time = current_time;
- }
- }
-}
-
-/* Adaptive Sampling */
-
-AdaptiveSampling::AdaptiveSampling() : use(true), adaptive_step(0), min_samples(0)
-{
-}
-
-/* Render samples in steps that align with the adaptive filtering. */
-int AdaptiveSampling::align_samples(int sample, int num_samples) const
-{
- int end_sample = sample + num_samples;
-
- /* Round down end sample to the nearest sample that needs filtering. */
- end_sample &= ~(adaptive_step - 1);
-
- if (end_sample <= sample) {
- /* In order to reach the next sample that needs filtering, we'd need
- * to increase num_samples. We don't do that in this function, so
- * just keep it as is and don't filter this time around. */
- return num_samples;
- }
- return end_sample - sample;
-}
-
-bool AdaptiveSampling::need_filter(int sample) const
-{
- if (sample > min_samples) {
- return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
- }
- else {
- return false;
- }
-}
-
-CCL_NAMESPACE_END
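
AdaptiveSampling::align_samples() depends on adaptive_step being a power of two: masking with ~(adaptive_step - 1) rounds the end sample down to a step boundary so filtering always runs on a full step. A compact restatement (with the step passed as a parameter rather than read from a member) and two worked cases:

#include <cassert>

// Same rounding logic as align_samples(); adaptive_step must be a power of two.
static int align_samples(int sample, int num_samples, int adaptive_step)
{
  int end_sample = sample + num_samples;
  end_sample &= ~(adaptive_step - 1);  // round down to a multiple of the step
  // Boundary unreachable: render the requested count and skip filtering.
  return (end_sample <= sample) ? num_samples : end_sample - sample;
}

int main()
{
  // Step 16, starting at sample 37: a 20-sample batch is trimmed to 11 so the
  // batch ends at sample 47, exactly where need_filter() fires.
  assert(align_samples(37, 20, 16) == 11);
  // A 5-sample batch cannot reach the next boundary, so it is left unchanged.
  assert(align_samples(37, 5, 16) == 5);
  return 0;
}
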
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
deleted file mode 100644
index 3f7cf47b692..00000000000
--- a/intern/cycles/device/device_task.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DEVICE_TASK_H__
-#define __DEVICE_TASK_H__
-
-#include "device/device_memory.h"
-
-#include "util/util_function.h"
-#include "util/util_list.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Device Task */
-
-class Device;
-class RenderBuffers;
-class RenderTile;
-class RenderTileNeighbors;
-class Tile;
-
-enum DenoiserType {
- DENOISER_NLM = 1,
- DENOISER_OPTIX = 2,
- DENOISER_OPENIMAGEDENOISE = 4,
- DENOISER_NUM,
-
- DENOISER_NONE = 0,
- DENOISER_ALL = ~0,
-};
-
-enum DenoiserInput {
- DENOISER_INPUT_RGB = 1,
- DENOISER_INPUT_RGB_ALBEDO = 2,
- DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3,
-
- DENOISER_INPUT_NUM,
-};
-
-typedef int DenoiserTypeMask;
-
-class DenoiseParams {
- public:
- /* Apply denoiser to image. */
- bool use;
- /* Output denoising data passes (possibly without applying the denoiser). */
- bool store_passes;
-
- /* Denoiser type. */
- DenoiserType type;
-
- /* Viewport start sample. */
- int start_sample;
-
- /** Native Denoiser. */
-
- /* Pixel radius for neighboring pixels to take into account. */
- int radius;
- /* Controls neighbor pixel weighting for the denoising filter. */
- float strength;
- /* Preserve more or less detail based on feature passes. */
- float feature_strength;
- /* When removing pixels that don't carry information,
- * use a relative threshold instead of an absolute one. */
- bool relative_pca;
- /* How many frames before and after the current center frame are included. */
- int neighbor_frames;
- /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
- bool clamp_input;
-
- /** OIDN/OptiX Denoiser. */
-
- /* Passes handed over to the OIDN/OptiX denoiser (default to color + albedo). */
- DenoiserInput input_passes;
-
- DenoiseParams()
- {
- use = false;
- store_passes = false;
-
- type = DENOISER_NLM;
-
- radius = 8;
- strength = 0.5f;
- feature_strength = 0.5f;
- relative_pca = false;
- neighbor_frames = 2;
- clamp_input = true;
-
- /* Default to color + albedo only, since normal input does not always have the desired effect
- * when denoising with OptiX. */
- input_passes = DENOISER_INPUT_RGB_ALBEDO;
-
- start_sample = 0;
- }
-
- /* Test if a denoising task needs to run, also to prefilter passes for the native
- * denoiser when we are not applying denoising to the combined image. */
- bool need_denoising_task() const
- {
- return (use || (store_passes && type == DENOISER_NLM));
- }
-};
-
-class AdaptiveSampling {
- public:
- AdaptiveSampling();
-
- int align_samples(int sample, int num_samples) const;
- bool need_filter(int sample) const;
-
- bool use;
- int adaptive_step;
- int min_samples;
-};
-
-class DeviceTask {
- public:
- typedef enum { RENDER, FILM_CONVERT, SHADER, DENOISE_BUFFER } Type;
- Type type;
-
- int x, y, w, h;
- device_ptr rgba_byte;
- device_ptr rgba_half;
- device_ptr buffer;
- int sample;
- int num_samples;
- int offset, stride;
-
- device_ptr shader_input;
- device_ptr shader_output;
- int shader_eval_type;
- int shader_filter;
- int shader_x, shader_w;
-
- RenderBuffers *buffers;
-
- explicit DeviceTask(Type type = RENDER);
-
- int get_subtask_count(int num, int max_size = 0) const;
- void split(list<DeviceTask> &tasks, int num, int max_size = 0) const;
-
- void update_progress(RenderTile *rtile, int pixel_samples = -1);
-
- function<bool(Device *device, RenderTile &, uint)> acquire_tile;
- function<void(long, int)> update_progress_sample;
- function<void(RenderTile &)> update_tile_sample;
- function<void(RenderTile &)> release_tile;
- function<bool()> get_cancel;
- function<bool()> get_tile_stolen;
- function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles;
- function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles;
-
- uint tile_types;
- DenoiseParams denoising;
- bool denoising_from_render;
- vector<int> denoising_frames;
-
- int pass_stride;
- int frame_stride;
- int target_pass_stride;
- int pass_denoising_data;
- int pass_denoising_clean;
-
- bool need_finish_queue;
- bool integrator_branched;
- AdaptiveSampling adaptive_sampling;
-
- protected:
- double last_update_time;
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __DEVICE_TASK_H__ */
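
The split() declared above partitions render work into horizontal strips, with the final strip absorbing the remainder (see the removed implementation earlier in this patch). The partitioning math in isolation:

#include <cstdio>

int main()
{
  // Split a 100-row region into 8 strips the way DeviceTask::split() does.
  const int y = 0, h = 100, num = 8;
  for (int i = 0; i < num; i++) {
    const int ty = y + (h / num) * i;
    const int th = (i == num - 1) ? h - i * (h / num) : h / num;
    std::printf("strip %d: y=%d h=%d\n", i, ty, th);  // 7 strips of 12, last of 16
  }
  return 0;
}
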
diff --git a/intern/cycles/device/device_dummy.cpp b/intern/cycles/device/dummy/device.cpp
index 5112fc152e5..678276ed025 100644
--- a/intern/cycles/device/device_dummy.cpp
+++ b/intern/cycles/device/dummy/device.cpp
@@ -14,8 +14,10 @@
* limitations under the License.
*/
+#include "device/dummy/device.h"
+
#include "device/device.h"
-#include "device/device_intern.h"
+#include "device/device_queue.h"
CCL_NAMESPACE_BEGIN
@@ -23,8 +25,8 @@ CCL_NAMESPACE_BEGIN
class DummyDevice : public Device {
public:
- DummyDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
- : Device(info_, stats_, profiler_, background_)
+ DummyDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+ : Device(info_, stats_, profiler_)
{
error_msg = info.error_msg;
}
@@ -61,23 +63,11 @@ class DummyDevice : public Device {
virtual void const_copy_to(const char *, void *, size_t) override
{
}
-
- virtual void task_add(DeviceTask &) override
- {
- }
-
- virtual void task_wait() override
- {
- }
-
- virtual void task_cancel() override
- {
- }
};
-Device *device_dummy_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
- return new DummyDevice(info, stats, profiler, background);
+ return new DummyDevice(info, stats, profiler);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl b/intern/cycles/device/dummy/device.h
index 8afaa686e28..832a9568129 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_do_volume.cl
+++ b/intern/cycles/device/dummy/device.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,18 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_do_volume.h"
+#pragma once
-#define KERNEL_NAME do_volume
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+#include "util/util_string.h"
+#include "util/util_vector.h"
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp
new file mode 100644
index 00000000000..6dbcce2d9a5
--- /dev/null
+++ b/intern/cycles/device/multi/device.cpp
@@ -0,0 +1,423 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/multi/device.h"
+
+#include <sstream>
+#include <stdlib.h>
+
+#include "bvh/bvh_multi.h"
+
+#include "device/device.h"
+#include "device/device_queue.h"
+
+#include "render/buffers.h"
+#include "render/geometry.h"
+
+#include "util/util_foreach.h"
+#include "util/util_list.h"
+#include "util/util_logging.h"
+#include "util/util_map.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+class MultiDevice : public Device {
+ public:
+ struct SubDevice {
+ Stats stats;
+ Device *device;
+ map<device_ptr, device_ptr> ptr_map;
+ int peer_island_index = -1;
+ };
+
+ list<SubDevice> devices;
+ device_ptr unique_key;
+ vector<vector<SubDevice *>> peer_islands;
+
+ MultiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : Device(info, stats, profiler), unique_key(1)
+ {
+ foreach (const DeviceInfo &subinfo, info.multi_devices) {
+ /* Always add CPU devices at the back since GPU devices can change
+ * host memory pointers, which the CPU uses as device pointers. */
+ SubDevice *sub;
+ if (subinfo.type == DEVICE_CPU) {
+ devices.emplace_back();
+ sub = &devices.back();
+ }
+ else {
+ devices.emplace_front();
+ sub = &devices.front();
+ }
+
+ /* The pointer to 'sub->stats' will stay valid even after new devices
+ * are added, since 'devices' is a linked list. */
+ sub->device = Device::create(subinfo, sub->stats, profiler);
+ }
+
+ /* Build a list of peer islands for the available render devices */
+ foreach (SubDevice &sub, devices) {
+ /* First, ensure that every device is in at least one peer island */
+ if (sub.peer_island_index < 0) {
+ peer_islands.emplace_back();
+ sub.peer_island_index = (int)peer_islands.size() - 1;
+ peer_islands[sub.peer_island_index].push_back(&sub);
+ }
+
+ if (!info.has_peer_memory) {
+ continue;
+ }
+
+ /* Second, check peer access between devices and fill up the islands accordingly */
+ foreach (SubDevice &peer_sub, devices) {
+ if (peer_sub.peer_island_index < 0 &&
+ peer_sub.device->info.type == sub.device->info.type &&
+ peer_sub.device->check_peer_access(sub.device)) {
+ peer_sub.peer_island_index = sub.peer_island_index;
+ peer_islands[sub.peer_island_index].push_back(&peer_sub);
+ }
+ }
+ }
+ }
+
+ ~MultiDevice()
+ {
+ foreach (SubDevice &sub, devices)
+ delete sub.device;
+ }
+
+ const string &error_message() override
+ {
+ error_msg.clear();
+
+ foreach (SubDevice &sub, devices)
+ error_msg += sub.device->error_message();
+
+ return error_msg;
+ }
+
+ virtual bool show_samples() const override
+ {
+ if (devices.size() > 1) {
+ return false;
+ }
+ return devices.front().device->show_samples();
+ }
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const override
+ {
+ BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
+ BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
+ foreach (const SubDevice &sub_device, devices) {
+ BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
+ bvh_layout_mask &= device_bvh_layout_mask;
+ bvh_layout_mask_all |= device_bvh_layout_mask;
+ }
+
+ /* With multiple OptiX devices, every device needs its own acceleration structure */
+ if (bvh_layout_mask == BVH_LAYOUT_OPTIX) {
+ return BVH_LAYOUT_MULTI_OPTIX;
+ }
+
+ /* When devices do not share a common BVH layout, fall back to creating one for each */
+ const BVHLayoutMask BVH_LAYOUT_OPTIX_EMBREE = (BVH_LAYOUT_OPTIX | BVH_LAYOUT_EMBREE);
+ if ((bvh_layout_mask_all & BVH_LAYOUT_OPTIX_EMBREE) == BVH_LAYOUT_OPTIX_EMBREE) {
+ return BVH_LAYOUT_MULTI_OPTIX_EMBREE;
+ }
+
+ return bvh_layout_mask;
+ }
+
+ bool load_kernels(const uint kernel_features) override
+ {
+ foreach (SubDevice &sub, devices)
+ if (!sub.device->load_kernels(kernel_features))
+ return false;
+
+ return true;
+ }
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override
+ {
+ /* Try to build and share a single acceleration structure, if possible */
+ if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
+ devices.back().device->build_bvh(bvh, progress, refit);
+ return;
+ }
+
+ assert(bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX ||
+ bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE);
+
+ BVHMulti *const bvh_multi = static_cast<BVHMulti *>(bvh);
+ bvh_multi->sub_bvhs.resize(devices.size());
+
+ vector<BVHMulti *> geom_bvhs;
+ geom_bvhs.reserve(bvh->geometry.size());
+ foreach (Geometry *geom, bvh->geometry) {
+ geom_bvhs.push_back(static_cast<BVHMulti *>(geom->bvh));
+ }
+
+ /* Broadcast acceleration structure build to all render devices */
+ size_t i = 0;
+ foreach (SubDevice &sub, devices) {
+ /* Change geometry BVH pointers to the sub BVH */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k]->sub_bvhs[i];
+ }
+
+ if (!bvh_multi->sub_bvhs[i]) {
+ BVHParams params = bvh->params;
+ if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
+ params.bvh_layout = BVH_LAYOUT_OPTIX;
+ else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
+ params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
+ BVH_LAYOUT_EMBREE;
+
+ /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
+ * (since such geometry is put into the top level directly, see bvh_embree.cpp) */
+ if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
+ !bvh->geometry[0]->is_instanced()) {
+ i++;
+ continue;
+ }
+
+ bvh_multi->sub_bvhs[i] = BVH::create(params, bvh->geometry, bvh->objects, sub.device);
+ }
+
+ sub.device->build_bvh(bvh_multi->sub_bvhs[i], progress, refit);
+ i++;
+ }
+
+ /* Change geometry BVH pointers back to the multi BVH. */
+ for (size_t k = 0; k < bvh->geometry.size(); ++k) {
+ bvh->geometry[k]->bvh = geom_bvhs[k];
+ }
+ }
+
+ virtual void *get_cpu_osl_memory() override
+ {
+ if (devices.size() > 1) {
+ return NULL;
+ }
+ return devices.front().device->get_cpu_osl_memory();
+ }
+
+ bool is_resident(device_ptr key, Device *sub_device) override
+ {
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device) {
+ return find_matching_mem_device(key, sub)->device == sub_device;
+ }
+ }
+ return false;
+ }
+
+ SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
+ {
+ assert(key != 0 && (sub.peer_island_index >= 0 || sub.ptr_map.find(key) != sub.ptr_map.end()));
+
+ /* Get the memory owner of this key (first try current device, then peer devices) */
+ SubDevice *owner_sub = &sub;
+ if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
+ foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
+ if (island_sub != owner_sub &&
+ island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
+ owner_sub = island_sub;
+ }
+ }
+ }
+ return owner_sub;
+ }
+
+ SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
+ {
+ assert(!island.empty());
+
+ /* Get the memory owner of this key or the device with the lowest memory usage when new */
+ SubDevice *owner_sub = island.front();
+ foreach (SubDevice *island_sub, island) {
+ if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
+ (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
+ owner_sub = island_sub;
+ }
+ }
+ return owner_sub;
+ }
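/* A compact sketch of the two ownership policies above, assuming a trimmed
 * SubDevice with just a pointer map and a memory-usage counter: an existing
 * key stays with whichever island member already owns it, while a brand-new
 * allocation (key == 0) goes to the least-loaded member. */
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

struct Sub {
  std::map<uint64_t, uint64_t> ptr_map;
  std::size_t mem_used = 0;
};

Sub *pick_owner(uint64_t key, std::vector<Sub *> &island)
{
  Sub *owner = island.front();
  for (Sub *s : island) {
    if (key ? s->ptr_map.count(key) != 0 : s->mem_used < owner->mem_used)
      owner = s;
  }
  return owner;
}

int main()
{
  Sub a, b;
  a.mem_used = 0;
  b.mem_used = 100;
  b.ptr_map[42] = 0xC0FFEE;
  std::vector<Sub *> island = {&a, &b};
  assert(pick_owner(42, island) == &b); /* existing key stays with its owner */
  assert(pick_owner(0, island) == &a);  /* new allocation goes to least-used */
}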
+
+ inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
+ {
+ return find_matching_mem_device(key, sub)->ptr_map[key];
+ }
+
+ void mem_alloc(device_memory &mem) override
+ {
+ device_ptr key = unique_key++;
+
+ assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || mem.type == MEM_DEVICE_ONLY);
+ /* The remaining memory types can be distributed across devices */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ owner_sub->device->mem_alloc(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size);
+ }
+
+ void mem_copy_to(device_memory &mem) override
+ {
+ device_ptr existing_key = mem.device_pointer;
+ device_ptr key = (existing_key) ? existing_key : unique_key++;
+ size_t existing_size = mem.device_size;
+
+ /* The tile buffers are allocated on each device (see below), so copy to all of them */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_copy_to(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+
+ if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
+ /* Need to create texture objects and update pointer in kernel globals on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_copy_to(mem);
+ }
+ }
+ }
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size - existing_size);
+ }
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
+ {
+ device_ptr key = mem.device_pointer;
+ int i = 0, sub_h = h / devices.size();
+
+ foreach (SubDevice &sub, devices) {
+ int sy = y + i * sub_h;
+ int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
+
+ SubDevice *owner_sub = find_matching_mem_device(key, sub);
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
+
+ owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
+ i++;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ }
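/* The copy-back above stripes rows across sub-devices: each device handles
 * h / n rows, and the last one also takes the remainder. A quick standalone
 * check of that arithmetic with hypothetical sizes: */
#include <cstdio>

int main()
{
  int h = 10, n = 3, sub_h = h / n; /* 3 rows per device */
  for (int i = 0; i < n; i++) {
    int sy = i * sub_h;
    int sh = (i == n - 1) ? h - sub_h * i : sub_h;
    std::printf("device %d: rows [%d, %d)\n", i, sy, sy + sh);
  }
  /* prints [0, 3), [3, 6), [6, 10): the last device covers the remainder */
}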
+
+ void mem_zero(device_memory &mem) override
+ {
+ device_ptr existing_key = mem.device_pointer;
+ device_ptr key = (existing_key) ? existing_key : unique_key++;
+ size_t existing_size = mem.device_size;
+
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_zero(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size - existing_size);
+ }
+
+ void mem_free(device_memory &mem) override
+ {
+ device_ptr key = mem.device_pointer;
+ size_t existing_size = mem.device_size;
+
+ /* Free memory that was allocated for all devices (see above) on each device */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_free(mem);
+ owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
+
+ if (mem.type == MEM_TEXTURE) {
+ /* Free texture objects on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_free(mem);
+ }
+ }
+ }
+ }
+
+ mem.device = this;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+ stats.mem_free(existing_size);
+ }
+
+ void const_copy_to(const char *name, void *host, size_t size) override
+ {
+ foreach (SubDevice &sub, devices)
+ sub.device->const_copy_to(name, host, size);
+ }
+
+ int device_number(Device *sub_device) override
+ {
+ int i = 0;
+
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device)
+ return i;
+ i++;
+ }
+
+ return -1;
+ }
+
+ virtual void foreach_device(const function<void(Device *)> &callback) override
+ {
+ foreach (SubDevice &sub, devices) {
+ sub.device->foreach_device(callback);
+ }
+ }
+};
+
+Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+ return new MultiDevice(info, stats, profiler);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl b/intern/cycles/device/multi/device.h
index 192d01444ba..6e121014a1f 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_background.cl
+++ b/intern/cycles/device/multi/device.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,18 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_indirect_background.h"
+#pragma once
-#define KERNEL_NAME indirect_background
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+#include "util/util_string.h"
+#include "util/util_vector.h"
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+CCL_NAMESPACE_END
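/* The new header above is an instance of the opaque-factory idiom: it only
 * forward-declares the types and exposes device_multi_create(), so the
 * MultiDevice class can stay private to the .cpp file. A self-contained
 * sketch of the same idiom (all names here are hypothetical): */
#include <cstdio>

/* What would live in the header: a forward declaration plus a factory. */
class Base {
 public:
  virtual ~Base() = default;
  virtual void run() = 0;
};
Base *make_impl();

/* What would live in the .cpp, invisible to callers. */
namespace {
class Impl : public Base {
 public:
  void run() override
  {
    std::printf("running the hidden implementation\n");
  }
};
}  // namespace

Base *make_impl()
{
  return new Impl();
}

int main()
{
  Base *b = make_impl();
  b->run();
  delete b;
}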
diff --git a/intern/cycles/device/opencl/device_opencl.h b/intern/cycles/device/opencl/device_opencl.h
deleted file mode 100644
index a65e764b0d4..00000000000
--- a/intern/cycles/device/opencl/device_opencl.h
+++ /dev/null
@@ -1,658 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/device.h"
-# include "device/device_denoising.h"
-# include "device/device_split_kernel.h"
-
-# include "util/util_map.h"
-# include "util/util_param.h"
-# include "util/util_string.h"
-# include "util/util_task.h"
-
-# include "clew.h"
-
-# include "device/opencl/memory_manager.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Disable workarounds; things seem to be working fine on the latest drivers. */
-# define CYCLES_DISABLE_DRIVER_WORKAROUNDS
-
-/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing. */
-# ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
-/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
-# undef clEnqueueNDRangeKernel
-# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-
-# undef clEnqueueWriteBuffer
-# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-
-# undef clEnqueueReadBuffer
-# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-# endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
-
-# define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
-
-struct OpenCLPlatformDevice {
- OpenCLPlatformDevice(cl_platform_id platform_id,
- const string &platform_name,
- cl_device_id device_id,
- cl_device_type device_type,
- const string &device_name,
- const string &hardware_id,
- const string &device_extensions)
- : platform_id(platform_id),
- platform_name(platform_name),
- device_id(device_id),
- device_type(device_type),
- device_name(device_name),
- hardware_id(hardware_id),
- device_extensions(device_extensions)
- {
- }
- cl_platform_id platform_id;
- string platform_name;
- cl_device_id device_id;
- cl_device_type device_type;
- string device_name;
- string hardware_id;
- string device_extensions;
-};
-
-/* Contains all static OpenCL helper functions. */
-class OpenCLInfo {
- public:
- static cl_device_type device_type();
- static bool use_debug();
- static bool device_supported(const string &platform_name, const cl_device_id device_id);
- static bool platform_version_check(cl_platform_id platform, string *error = NULL);
- static bool device_version_check(cl_device_id device, string *error = NULL);
- static bool get_device_version(cl_device_id device,
- int *r_major,
- int *r_minor,
- string *error = NULL);
- static string get_hardware_id(const string &platform_name, cl_device_id device_id);
- static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices);
-
- /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
-
- /* Platform information. */
- static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
- static cl_uint get_num_platforms();
-
- static bool get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error = NULL);
- static vector<cl_platform_id> get_platforms();
-
- static bool get_platform_name(cl_platform_id platform_id, string *platform_name);
- static string get_platform_name(cl_platform_id platform_id);
-
- static bool get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- cl_uint *num_devices,
- cl_int *error = NULL);
- static cl_uint get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type);
-
- static bool get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- vector<cl_device_id> *device_ids,
- cl_int *error = NULL);
- static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type);
-
- /* Device information. */
- static bool get_device_name(cl_device_id device_id, string *device_name, cl_int *error = NULL);
-
- static string get_device_name(cl_device_id device_id);
-
- static bool get_device_extensions(cl_device_id device_id,
- string *device_extensions,
- cl_int *error = NULL);
-
- static string get_device_extensions(cl_device_id device_id);
-
- static bool get_device_type(cl_device_id device_id,
- cl_device_type *device_type,
- cl_int *error = NULL);
- static cl_device_type get_device_type(cl_device_id device_id);
-
- static bool get_driver_version(cl_device_id device_id,
- int *major,
- int *minor,
- cl_int *error = NULL);
-
- static int mem_sub_ptr_alignment(cl_device_id device_id);
-
-  /* Get a somewhat more readable device name.
-   * The main difference is AMD OpenCL, which only reports a code name
-   * as the regular device name. This returns a saner device name by
-   * querying some extensions.
- */
- static string get_readable_device_name(cl_device_id device_id);
-};
-
-/* Thread-safe cache for contexts and programs.
- */
-class OpenCLCache {
- struct Slot {
- struct ProgramEntry {
- ProgramEntry();
- ProgramEntry(const ProgramEntry &rhs);
- ~ProgramEntry();
- cl_program program;
- thread_mutex *mutex;
- };
-
- Slot();
- Slot(const Slot &rhs);
- ~Slot();
-
- thread_mutex *context_mutex;
- cl_context context;
- typedef map<ustring, ProgramEntry> EntryMap;
- EntryMap programs;
- };
-
- /* key is combination of platform ID and device ID */
- typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
-
- /* map of Slot objects */
- typedef map<PlatformDevicePair, Slot> CacheMap;
- CacheMap cache;
-
- /* MD5 hash of the kernel source. */
- string kernel_md5;
-
- thread_mutex cache_lock;
- thread_mutex kernel_md5_lock;
-
- /* lazy instantiate */
- static OpenCLCache &global_instance();
-
- public:
- enum ProgramName {
- OCL_DEV_BASE_PROGRAM,
- OCL_DEV_MEGAKERNEL_PROGRAM,
- };
-
-  /* Look up a context in the cache. If this returns NULL, slot_locker
-   * will be holding a lock for the cache. slot_locker should refer to a
-   * default-constructed thread_scoped_lock. */
- static cl_context get_context(cl_platform_id platform,
- cl_device_id device,
- thread_scoped_lock &slot_locker);
- /* Same as above. */
- static cl_program get_program(cl_platform_id platform,
- cl_device_id device,
- ustring key,
- thread_scoped_lock &slot_locker);
-
- /* Store context in the cache. You MUST have tried to get the item before storing to it. */
- static void store_context(cl_platform_id platform,
- cl_device_id device,
- cl_context context,
- thread_scoped_lock &slot_locker);
- /* Same as above. */
- static void store_program(cl_platform_id platform,
- cl_device_id device,
- cl_program program,
- ustring key,
- thread_scoped_lock &slot_locker);
-
- static string get_kernel_md5();
-};
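/* The comments above describe a get-or-create handshake: a failed lookup
 * returns with the slot lock still held, so the caller can create and store
 * the object without racing other threads. A self-contained sketch of that
 * protocol using std::mutex in place of thread_scoped_lock: */
#include <map>
#include <mutex>

std::mutex cache_mutex;
std::map<int, int> cache;

int cache_get(int key, std::unique_lock<std::mutex> &locker)
{
  locker = std::unique_lock<std::mutex>(cache_mutex);
  std::map<int, int>::iterator it = cache.find(key);
  if (it == cache.end())
    return 0; /* miss: locker keeps the mutex held for the caller */
  int value = it->second;
  locker.unlock();
  return value;
}

void cache_store(int key, int value, std::unique_lock<std::mutex> &locker)
{
  cache[key] = value; /* the caller still holds the lock from the failed get */
  locker.unlock();
}

int get_or_create(int key)
{
  std::unique_lock<std::mutex> locker;
  int value = cache_get(key, locker);
  if (!value) {
    value = key * 2; /* stand-in for expensive creation, e.g. clCreateContext */
    cache_store(key, value, locker);
  }
  return value;
}

int main()
{
  int first = get_or_create(21);  /* miss: creates and stores 42 */
  int second = get_or_create(21); /* hit: returns the cached 42 */
  return (first == 42 && second == 42) ? 0 : 1;
}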
-
-# define opencl_device_assert(device, stmt) \
- { \
- cl_int err = stmt; \
-\
- if (err != CL_SUCCESS) { \
- string message = string_printf( \
- "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
- if ((device)->error_message() == "") { \
- (device)->set_error(message); \
- } \
- fprintf(stderr, "%s\n", message.c_str()); \
- } \
- } \
- (void)0
-
-# define opencl_assert(stmt) \
- { \
- cl_int err = stmt; \
-\
- if (err != CL_SUCCESS) { \
- string message = string_printf( \
- "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
- if (error_msg == "") { \
- error_msg = message; \
- } \
- fprintf(stderr, "%s\n", message.c_str()); \
- } \
- } \
- (void)0
-
-class OpenCLDevice : public Device {
- public:
- DedicatedTaskPool task_pool;
-
- /* Task pool for required kernels (base, AO kernels during foreground rendering) */
- TaskPool load_required_kernel_task_pool;
- /* Task pool for optional kernels (feature kernels during foreground rendering) */
- TaskPool load_kernel_task_pool;
- std::atomic<int> load_kernel_num_compiling;
-
- cl_context cxContext;
- cl_command_queue cqCommandQueue;
- cl_platform_id cpPlatform;
- cl_device_id cdDevice;
- cl_int ciErr;
- int device_num;
-
- class OpenCLProgram {
- public:
- OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL)
- {
- }
- OpenCLProgram(OpenCLDevice *device,
- const string &program_name,
- const string &kernel_name,
- const string &kernel_build_options,
- bool use_stdout = true);
- ~OpenCLProgram();
-
- void add_kernel(ustring name);
-
- /* Try to load the program from device cache or disk */
- bool load();
-    /* Compile the kernel (first in a separate process, falling back to in-process). */
- void compile();
- /* Create the OpenCL kernels after loading or compiling */
- void create_kernels();
-
- bool is_loaded() const
- {
- return loaded;
- }
- const string &get_log() const
- {
- return log;
- }
- void report_error();
-
-    /* Wait until this kernel is available to be used.
-     * Returns true when the kernel is available, and false when it is not
-     * available or could not be loaded. */
- bool wait_for_availability();
-
- cl_kernel operator()();
- cl_kernel operator()(ustring name);
-
- void release();
-
- private:
- bool build_kernel(const string *debug_src);
-    /* Build the program by launching a separate instance of the current process.
-     * This is required for multithreaded OpenCL compilation, since most frameworks
-     * serialize build calls internally if they come from the same process.
-     * If that is not supported, this function just returns false.
- */
- bool compile_separate(const string &clbin);
- /* Build the program by calling OpenCL directly. */
- bool compile_kernel(const string *debug_src);
- /* Loading and saving the program from/to disk. */
- bool load_binary(const string &clbin, const string *debug_src = NULL);
- bool save_binary(const string &clbin);
-
- void add_log(const string &msg, bool is_debug);
- void add_error(const string &msg);
-
- bool loaded;
- bool needs_compiling;
-
- cl_program program;
- OpenCLDevice *device;
-
- /* Used for the OpenCLCache key. */
- string program_name;
-
- string kernel_file, kernel_build_options, device_md5;
-
- bool use_stdout;
- string log, error_msg;
- string compile_output;
-
- map<ustring, cl_kernel> kernels;
- };
-
- /* Container for all types of split programs. */
- class OpenCLSplitPrograms {
- public:
- OpenCLDevice *device;
- OpenCLProgram program_split;
- OpenCLProgram program_lamp_emission;
- OpenCLProgram program_do_volume;
- OpenCLProgram program_indirect_background;
- OpenCLProgram program_shader_eval;
- OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
- OpenCLProgram program_subsurface_scatter;
- OpenCLProgram program_direct_lighting;
- OpenCLProgram program_shadow_blocked_ao;
- OpenCLProgram program_shadow_blocked_dl;
-
- OpenCLSplitPrograms(OpenCLDevice *device);
- ~OpenCLSplitPrograms();
-
- /* Load the kernels and put the created kernels in the given
- * `programs` parameter. */
- void load_kernels(vector<OpenCLProgram *> &programs,
- const DeviceRequestedFeatures &requested_features);
- };
-
- DeviceSplitKernel *split_kernel;
-
- OpenCLProgram base_program;
- OpenCLProgram bake_program;
- OpenCLProgram displace_program;
- OpenCLProgram background_program;
- OpenCLProgram denoising_program;
-
- OpenCLSplitPrograms kernel_programs;
-
- typedef map<string, device_vector<uchar> *> ConstMemMap;
- typedef map<string, device_ptr> MemMap;
-
- ConstMemMap const_mem_map;
- MemMap mem_map;
-
- bool device_initialized;
- string platform_name;
- string device_name;
-
- bool opencl_error(cl_int err);
- void opencl_error(const string &message);
- void opencl_assert_err(cl_int err, const char *where);
-
- OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
- ~OpenCLDevice();
-
- static void CL_CALLBACK context_notify_callback(const char *err_info,
- const void * /*private_info*/,
- size_t /*cb*/,
- void *user_data);
-
- bool opencl_version_check();
- OpenCLSplitPrograms *get_split_programs();
-
- string device_md5_hash(string kernel_custom_build_options = "");
- bool load_kernels(const DeviceRequestedFeatures &requested_features);
- void load_required_kernels(const DeviceRequestedFeatures &requested_features);
-
- bool wait_for_availability(const DeviceRequestedFeatures &requested_features);
- DeviceKernelStatus get_active_kernel_switch_state();
-
-  /* Get the name of the OpenCL program for the given kernel */
- const string get_opencl_program_name(const string &kernel_name);
- /* Get the program file name to compile (*.cl) for the given kernel */
- const string get_opencl_program_filename(const string &kernel_name);
- string get_build_options(const DeviceRequestedFeatures &requested_features,
- const string &opencl_program_name);
- /* Enable the default features to reduce recompilation events */
- void enable_default_features(DeviceRequestedFeatures &features);
-
- void mem_alloc(device_memory &mem);
- void mem_copy_to(device_memory &mem);
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem);
- void mem_zero(device_memory &mem);
- void mem_free(device_memory &mem);
-
- int mem_sub_ptr_alignment();
-
- void const_copy_to(const char *name, void *host, size_t size);
- void global_alloc(device_memory &mem);
- void global_free(device_memory &mem);
- void tex_alloc(device_texture &mem);
- void tex_free(device_texture &mem);
-
- size_t global_size_round_up(int group_size, int global_size);
- void enqueue_kernel(cl_kernel kernel,
- size_t w,
- size_t h,
- bool x_workgroups = false,
- size_t max_workgroup_size = -1);
- void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
- void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
-
- void film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half);
- void shader(DeviceTask &task);
- void update_adaptive(DeviceTask &task, RenderTile &tile, int sample);
- void bake(DeviceTask &task, RenderTile &tile);
-
- void denoise(RenderTile &tile, DenoisingTask &denoising);
-
- int get_split_task_count(DeviceTask & /*task*/)
- {
- return 1;
- }
-
- void task_add(DeviceTask &task)
- {
- task_pool.push([=] {
- DeviceTask task_copy = task;
- thread_run(task_copy);
- });
- }
-
- void task_wait()
- {
- task_pool.wait();
- }
-
- void task_cancel()
- {
- task_pool.cancel();
- }
-
- void thread_run(DeviceTask &task);
-
- virtual BVHLayoutMask get_bvh_layout_mask() const
- {
- return BVH_LAYOUT_BVH2;
- }
-
- virtual bool show_samples() const
- {
- return true;
- }
-
- protected:
- string kernel_build_options(const string *debug_src = NULL);
-
- void mem_zero_kernel(device_ptr ptr, size_t size);
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task);
- bool denoising_construct_transform(DenoisingTask *task);
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task);
- bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task);
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task);
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task);
- bool denoising_write_feature(int to_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task);
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
-
- device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int size);
- void mem_free_sub_ptr(device_ptr ptr);
-
- class ArgumentWrapper {
- public:
- ArgumentWrapper() : size(0), pointer(NULL)
- {
- }
-
- ArgumentWrapper(device_memory &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
-
- template<typename T>
- ArgumentWrapper(device_vector<T> &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
-
- template<typename T>
- ArgumentWrapper(device_only_memory<T> &argument)
- : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
- {
- }
- template<typename T> ArgumentWrapper(T &argument) : size(sizeof(argument)), pointer(&argument)
- {
- }
-
- ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), pointer(&int_value)
- {
- }
-
- ArgumentWrapper(float argument)
- : size(sizeof(float)), float_value(argument), pointer(&float_value)
- {
- }
-
- size_t size;
- int int_value;
- float float_value;
- void *pointer;
- };
-
-  /* TODO(sergey): In the future we can use variadic templates, once
-   * C++0x is allowed. That should allow us to clean this up a bit.
- */
- int kernel_set_args(cl_kernel kernel,
- int start_argument_index,
- const ArgumentWrapper &arg1 = ArgumentWrapper(),
- const ArgumentWrapper &arg2 = ArgumentWrapper(),
- const ArgumentWrapper &arg3 = ArgumentWrapper(),
- const ArgumentWrapper &arg4 = ArgumentWrapper(),
- const ArgumentWrapper &arg5 = ArgumentWrapper(),
- const ArgumentWrapper &arg6 = ArgumentWrapper(),
- const ArgumentWrapper &arg7 = ArgumentWrapper(),
- const ArgumentWrapper &arg8 = ArgumentWrapper(),
- const ArgumentWrapper &arg9 = ArgumentWrapper(),
- const ArgumentWrapper &arg10 = ArgumentWrapper(),
- const ArgumentWrapper &arg11 = ArgumentWrapper(),
- const ArgumentWrapper &arg12 = ArgumentWrapper(),
- const ArgumentWrapper &arg13 = ArgumentWrapper(),
- const ArgumentWrapper &arg14 = ArgumentWrapper(),
- const ArgumentWrapper &arg15 = ArgumentWrapper(),
- const ArgumentWrapper &arg16 = ArgumentWrapper(),
- const ArgumentWrapper &arg17 = ArgumentWrapper(),
- const ArgumentWrapper &arg18 = ArgumentWrapper(),
- const ArgumentWrapper &arg19 = ArgumentWrapper(),
- const ArgumentWrapper &arg20 = ArgumentWrapper(),
- const ArgumentWrapper &arg21 = ArgumentWrapper(),
- const ArgumentWrapper &arg22 = ArgumentWrapper(),
- const ArgumentWrapper &arg23 = ArgumentWrapper(),
- const ArgumentWrapper &arg24 = ArgumentWrapper(),
- const ArgumentWrapper &arg25 = ArgumentWrapper(),
- const ArgumentWrapper &arg26 = ArgumentWrapper(),
- const ArgumentWrapper &arg27 = ArgumentWrapper(),
- const ArgumentWrapper &arg28 = ArgumentWrapper(),
- const ArgumentWrapper &arg29 = ArgumentWrapper(),
- const ArgumentWrapper &arg30 = ArgumentWrapper(),
- const ArgumentWrapper &arg31 = ArgumentWrapper(),
- const ArgumentWrapper &arg32 = ArgumentWrapper(),
- const ArgumentWrapper &arg33 = ArgumentWrapper());
-
- void release_kernel_safe(cl_kernel kernel);
- void release_mem_object_safe(cl_mem mem);
- void release_program_safe(cl_program program);
-
-  /* ** These are for working around some compiler-specific bugs ** */
-
- cl_program load_cached_kernel(ustring key, thread_scoped_lock &cache_locker);
-
- void store_cached_kernel(cl_program program, ustring key, thread_scoped_lock &cache_locker);
-
- private:
- MemoryManager memory_manager;
- friend class MemoryManager;
-
- static_assert_align(TextureInfo, 16);
- device_vector<TextureInfo> texture_info;
-
- typedef map<string, device_memory *> TexturesMap;
- TexturesMap textures;
-
- bool textures_need_update;
-
- protected:
- void flush_texture_buffers();
-
- friend class OpenCLSplitKernel;
- friend class OpenCLSplitKernelFunction;
-};
-
-Device *opencl_create_split_device(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background);
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp
deleted file mode 100644
index 31a2265700c..00000000000
--- a/intern/cycles/device/opencl/device_opencl_impl.cpp
+++ /dev/null
@@ -1,2113 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/opencl/device_opencl.h"
-
-# include "kernel/kernel_types.h"
-# include "kernel/split/kernel_split_data_types.h"
-
-# include "util/util_algorithm.h"
-# include "util/util_debug.h"
-# include "util/util_foreach.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_time.h"
-
-CCL_NAMESPACE_BEGIN
-
-struct texture_slot_t {
- texture_slot_t(const string &name, int slot) : name(name), slot(slot)
- {
- }
- string name;
- int slot;
-};
-
-static const string NON_SPLIT_KERNELS =
- "denoising "
- "base "
- "background "
- "displace ";
-
-static const string SPLIT_BUNDLE_KERNELS =
- "data_init "
- "path_init "
- "state_buffer_size "
- "scene_intersect "
- "queue_enqueue "
- "shader_setup "
- "shader_sort "
- "enqueue_inactive "
- "next_iteration_setup "
- "indirect_subsurface "
- "buffer_update "
- "adaptive_stopping "
- "adaptive_filter_x "
- "adaptive_filter_y "
- "adaptive_adjust_samples";
-
-const string OpenCLDevice::get_opencl_program_name(const string &kernel_name)
-{
- if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) {
- return kernel_name;
- }
- else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
- return "split_bundle";
- }
- else {
- return "split_" + kernel_name;
- }
-}
-
-const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name)
-{
- if (kernel_name == "denoising") {
- return "filter.cl";
- }
- else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
- return "kernel_split_bundle.cl";
- }
- else {
- return "kernel_" + kernel_name + ".cl";
- }
-}
-
-/* Enable features that we always want compiled in, to reduce recompilation events */
-void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features)
-{
- features.use_transparent = true;
- features.use_shadow_tricks = true;
- features.use_principled = true;
- features.use_denoising = true;
-
- if (!background) {
- features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
- features.nodes_features = NODE_FEATURE_ALL;
- features.use_hair = true;
- features.use_subsurface = true;
- features.use_camera_motion = false;
- features.use_object_motion = false;
- }
-}
-
-string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features,
- const string &opencl_program_name)
-{
- /* first check for non-split kernel programs */
- if (opencl_program_name == "base" || opencl_program_name == "denoising") {
- return "";
- }
- else if (opencl_program_name == "bake") {
- /* Note: get_build_options for bake is only requested when baking is enabled.
- * displace and background are always requested.
- * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_denoising = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_hair = true;
- features.use_subsurface = true;
- features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
- features.nodes_features = NODE_FEATURE_ALL;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
- else if (opencl_program_name == "displace") {
-    /* As displacement does not use any nodes from the Shading group (e.g. BSDF),
-     * we disable all features that are related to shading. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_denoising = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_baking = false;
- features.use_transparent = false;
- features.use_shadow_tricks = false;
- features.use_subsurface = false;
- features.use_volume = false;
- features.nodes_features &= ~NODE_FEATURE_VOLUME;
- features.use_denoising = false;
- features.use_principled = false;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
- else if (opencl_program_name == "background") {
-    /* The background uses Background shading, so it is safe to disable
-     * shadow features, subsurface and volumetrics. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_baking = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_transparent = false;
- features.use_shadow_tricks = false;
- features.use_denoising = false;
-    /* NOTE: it is currently possible to use surface nodes like the `Hair Info`
-     * and `Bump` nodes. Perhaps we should remove them from the UI, as they do
-     * not make any sense when rendering the background. */
- features.nodes_features &= ~NODE_FEATURE_VOLUME;
- features.use_subsurface = false;
- features.use_volume = false;
- features.use_shader_raytrace = false;
- features.use_patch_evaluation = false;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
-
- string build_options = "-D__SPLIT_KERNEL__ ";
- /* Set compute device build option. */
- cl_device_type device_type;
- OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr);
- assert(this->ciErr == CL_SUCCESS);
- if (device_type == CL_DEVICE_TYPE_GPU) {
- build_options += "-D__COMPUTE_DEVICE_GPU__ ";
- }
-
- DeviceRequestedFeatures nofeatures;
- enable_default_features(nofeatures);
-
- /* Add program specific optimized compile directives */
- if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) {
- build_options += nofeatures.get_build_options();
- }
- else {
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
-
-    /* Always turn off baking at this point. Baking is only useful when building the bake kernel.
-     * This also makes sure that the kernels that are built during baking can be reused
-     * when not doing any baking. */
- features.use_baking = false;
-
-    /* Do not vary on shader features when the program doesn't do any shading.
-     * Those kernels are bundled in a single program. */
- if (opencl_program_name == "split_bundle") {
- features.max_nodes_group = 0;
- features.nodes_features = 0;
- features.use_shader_raytrace = false;
- }
-
- /* No specific settings, just add the regular ones */
- build_options += features.get_build_options();
- }
-
- return build_options;
-}
-
-OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_)
-{
- device = device_;
-}
-
-OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms()
-{
- program_split.release();
- program_lamp_emission.release();
- program_do_volume.release();
- program_indirect_background.release();
- program_shader_eval.release();
- program_holdout_emission_blurring_pathtermination_ao.release();
- program_subsurface_scatter.release();
- program_direct_lighting.release();
- program_shadow_blocked_ao.release();
- program_shadow_blocked_dl.release();
-}
-
-void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
- vector<OpenCLProgram *> &programs, const DeviceRequestedFeatures &requested_features)
-{
- if (!requested_features.use_baking) {
-# define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \
- program_split.add_kernel(ustring("path_trace_" #kernel_name));
-# define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \
- const string program_name_##kernel_name = "split_" #kernel_name; \
- program_##kernel_name = OpenCLDevice::OpenCLProgram( \
- device, \
- program_name_##kernel_name, \
- "kernel_" #kernel_name ".cl", \
- device->get_build_options(requested_features, program_name_##kernel_name)); \
- program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \
- programs.push_back(&program_##kernel_name);
-
- /* Ordered with most complex kernels first, to reduce overall compile time. */
- ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter);
- ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
- ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
- if (requested_features.use_volume) {
- ADD_SPLIT_KERNEL_PROGRAM(do_volume);
- }
- ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
- ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
- ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
- ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl);
- ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao);
-
- /* Quick kernels bundled in a single program to reduce overhead of starting
- * Blender processes. */
- program_split = OpenCLDevice::OpenCLProgram(
- device,
- "split_bundle",
- "kernel_split_bundle.cl",
- device->get_build_options(requested_features, "split_bundle"));
-
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples);
- programs.push_back(&program_split);
-
-# undef ADD_SPLIT_KERNEL_PROGRAM
-# undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM
- }
-}
-
-namespace {
-
-/* A dummy copy of the OpenCL-related KernelGlobals from kernel_globals.h,
- * used only to fetch its size.
- */
-typedef struct KernelGlobalsDummy {
- ccl_constant KernelData *data;
- ccl_global char *buffers[8];
-
-# define KERNEL_TEX(type, name) TextureInfo name;
-# include "kernel/kernel_textures.h"
-# undef KERNEL_TEX
- SplitData split_data;
- SplitParams split_param_data;
-} KernelGlobalsDummy;
-
-} // namespace
-
-struct CachedSplitMemory {
- int id;
- device_memory *split_data;
- device_memory *ray_state;
- device_memory *queue_index;
- device_memory *use_queues_flag;
- device_memory *work_pools;
- device_ptr *buffer;
-};
-
-class OpenCLSplitKernelFunction : public SplitKernelFunction {
- public:
- OpenCLDevice *device;
- OpenCLDevice::OpenCLProgram program;
- CachedSplitMemory &cached_memory;
- int cached_id;
-
- OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory)
- : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1)
- {
- }
-
- ~OpenCLSplitKernelFunction()
- {
- program.release();
- }
-
- virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data)
- {
- if (cached_id != cached_memory.id) {
- cl_uint start_arg_index = device->kernel_set_args(
- program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state);
-
- device->set_kernel_arg_buffers(program(), &start_arg_index);
-
- start_arg_index += device->kernel_set_args(program(),
- start_arg_index,
- *cached_memory.queue_index,
- *cached_memory.use_queues_flag,
- *cached_memory.work_pools,
- *cached_memory.buffer);
-
- cached_id = cached_memory.id;
- }
-
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- program(),
- 2,
- NULL,
- dim.global_size,
- dim.local_size,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return false;
- }
-
- return true;
- }
-};
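/* enqueue() above skips re-binding kernel arguments while its cached_id
 * still matches cached_memory.id, which data_init bumps whenever the shared
 * buffers are replaced. A minimal sketch of that generation-counter idiom: */
#include <cstdio>

struct SharedState {
  int id = 0; /* bumped whenever the underlying buffers change */
};

struct KernelLauncher {
  SharedState &state;
  int cached_id = -1; /* starts out-of-date so the first launch binds args */

  void launch()
  {
    if (cached_id != state.id) {
      std::printf("re-binding kernel arguments\n"); /* the expensive path */
      cached_id = state.id;
    }
    /* enqueue the kernel here */
  }
};

int main()
{
  SharedState state;
  KernelLauncher launcher{state};
  launcher.launch(); /* binds */
  launcher.launch(); /* cached: no re-bind */
  state.id++;        /* buffers were re-initialized */
  launcher.launch(); /* binds again */
}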
-
-class OpenCLSplitKernel : public DeviceSplitKernel {
- OpenCLDevice *device;
- CachedSplitMemory cached_memory;
-
- public:
- explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device)
- {
- }
-
- virtual SplitKernelFunction *get_split_kernel_function(
- const string &kernel_name, const DeviceRequestedFeatures &requested_features)
- {
- OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory);
-
- const string program_name = device->get_opencl_program_name(kernel_name);
- kernel->program = OpenCLDevice::OpenCLProgram(
- device,
- program_name,
- device->get_opencl_program_filename(kernel_name),
- device->get_build_options(requested_features, program_name));
-
- kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
- kernel->program.load();
-
- if (!kernel->program.is_loaded()) {
- delete kernel;
- return NULL;
- }
-
- return kernel;
- }
-
- virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads)
- {
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
-
- uint threads = num_threads;
- OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
- cl_kernel kernel_state_buffer_size = programs->program_split(
- ustring("path_trace_state_buffer_size"));
- device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer);
-
- size_t global_size = 64;
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- kernel_state_buffer_size,
- 1,
- NULL,
- &global_size,
- NULL,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return 0;
- }
-
- return size;
- }
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
- RenderTile &rtile,
- int num_global_elements,
- device_memory &kernel_globals,
- device_memory &kernel_data,
- device_memory &split_data,
- device_memory &ray_state,
- device_memory &queue_index,
- device_memory &use_queues_flag,
- device_memory &work_pool_wgs)
- {
- cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
-
- /* Set the range of samples to be processed for every ray in
- * path-regeneration logic.
- */
- cl_int start_sample = rtile.start_sample;
- cl_int end_sample = rtile.start_sample + rtile.num_samples;
-
- OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
- cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init"));
-
- cl_uint start_arg_index = device->kernel_set_args(kernel_data_init,
- 0,
- kernel_globals,
- kernel_data,
- split_data,
- num_global_elements,
- ray_state);
-
- device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index);
-
- start_arg_index += device->kernel_set_args(kernel_data_init,
- start_arg_index,
- start_sample,
- end_sample,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- queue_index,
- dQueue_size,
- use_queues_flag,
- work_pool_wgs,
- rtile.num_samples,
- rtile.buffer);
-
- /* Enqueue ckPathTraceKernel_data_init kernel. */
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- kernel_data_init,
- 2,
- NULL,
- dim.global_size,
- dim.local_size,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- if (device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return false;
- }
-
- cached_memory.split_data = &split_data;
- cached_memory.ray_state = &ray_state;
- cached_memory.queue_index = &queue_index;
- cached_memory.use_queues_flag = &use_queues_flag;
- cached_memory.work_pools = &work_pool_wgs;
- cached_memory.buffer = &rtile.buffer;
- cached_memory.id++;
-
- return true;
- }
-
- virtual int2 split_kernel_local_size()
- {
- return make_int2(64, 1);
- }
-
- virtual int2 split_kernel_global_size(device_memory &kg,
- device_memory &data,
- DeviceTask & /*task*/)
- {
- cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
- /* Use small global size on CPU devices as it seems to be much faster. */
- if (type == CL_DEVICE_TYPE_CPU) {
- VLOG(1) << "Global size: (64, 64).";
- return make_int2(64, 64);
- }
-
- cl_ulong max_buffer_size;
- clGetDeviceInfo(
- device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
-
- if (DebugFlags().opencl.mem_limit) {
- max_buffer_size = min(max_buffer_size,
- cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
- }
-
- VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size)
- << " bytes. (" << string_human_readable_size(max_buffer_size) << ").";
-
-    /* Limit to 2 GB, as we shouldn't need more than that and some devices may support much more. */
- max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024);
-
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
- int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64),
- (int)sqrt(num_elements));
-
- if (device->info.description.find("Intel") != string::npos) {
- global_size = make_int2(min(512, global_size.x), min(512, global_size.y));
- }
-
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
- }
-};
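/* split_kernel_global_size() above derives the launch grid from the device
 * memory budget: the element count is laid out roughly square, with the x
 * side rounded down to a multiple of the 64-wide local size. A standalone
 * check of that arithmetic with a hypothetical element count: */
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

int round_down(int x, int multiple)
{
  return (x / multiple) * multiple;
}

int main()
{
  std::size_t num_elements = 1000000; /* stand-in for max_elements_for_max_buffer_size() */
  int side = (int)std::sqrt((double)num_elements); /* 1000 */
  int x = std::max(round_down(side, 64), 64);      /* 960 */
  std::printf("global size: %d x %d\n", x, side);
}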
-
-bool OpenCLDevice::opencl_error(cl_int err)
-{
- if (err != CL_SUCCESS) {
- string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
- return true;
- }
-
- return false;
-}
-
-void OpenCLDevice::opencl_error(const string &message)
-{
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
-}
-
-void OpenCLDevice::opencl_assert_err(cl_int err, const char *where)
-{
- if (err != CL_SUCCESS) {
- string message = string_printf(
- "OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
- if (error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
-# ifndef NDEBUG
- abort();
-# endif
- }
-}
-
-OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
- : Device(info, stats, profiler, background),
- load_kernel_num_compiling(0),
- kernel_programs(this),
- memory_manager(this),
- texture_info(this, "__texture_info", MEM_GLOBAL)
-{
- cpPlatform = NULL;
- cdDevice = NULL;
- cxContext = NULL;
- cqCommandQueue = NULL;
- device_initialized = false;
- textures_need_update = true;
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- if (usable_devices.size() == 0) {
- opencl_error("OpenCL: no devices found.");
- return;
- }
- assert(info.num < usable_devices.size());
- OpenCLPlatformDevice &platform_device = usable_devices[info.num];
- device_num = info.num;
- cpPlatform = platform_device.platform_id;
- cdDevice = platform_device.device_id;
- platform_name = platform_device.platform_name;
- device_name = platform_device.device_name;
- VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device "
- << device_name << ".";
-
- {
- /* try to use cached context */
- thread_scoped_lock cache_locker;
- cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
-
- if (cxContext == NULL) {
- /* create context properties array to specify platform */
- const cl_context_properties context_props[] = {
- CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0};
-
- /* create context */
- cxContext = clCreateContext(
- context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr);
-
- if (opencl_error(ciErr)) {
- opencl_error("OpenCL: clCreateContext failed");
- return;
- }
-
- /* cache it */
- OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
- }
- }
-
- cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
- if (opencl_error(ciErr)) {
- opencl_error("OpenCL: Error creating command queue");
- return;
- }
-
- /* Allocate this right away so that texture_info
- * is placed at offset 0 in the device memory buffers. */
- texture_info.resize(1);
- memory_manager.alloc("texture_info", texture_info);
-
- device_initialized = true;
-
- split_kernel = new OpenCLSplitKernel(this);
-}
-
-OpenCLDevice::~OpenCLDevice()
-{
- task_pool.cancel();
- load_required_kernel_task_pool.cancel();
- load_kernel_task_pool.cancel();
-
- memory_manager.free();
-
- ConstMemMap::iterator mt;
- for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
- delete mt->second;
- }
-
- base_program.release();
- bake_program.release();
- displace_program.release();
- background_program.release();
- denoising_program.release();
-
- if (cqCommandQueue)
- clReleaseCommandQueue(cqCommandQueue);
- if (cxContext)
- clReleaseContext(cxContext);
-
- delete split_kernel;
-}
-
-void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info,
- const void * /*private_info*/,
- size_t /*cb*/,
- void *user_data)
-{
- string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
- fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
-}
-
-bool OpenCLDevice::opencl_version_check()
-{
- string error;
- if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
- opencl_error(error);
- return false;
- }
- if (!OpenCLInfo::device_version_check(cdDevice, &error)) {
- opencl_error(error);
- return false;
- }
- return true;
-}
-
-string OpenCLDevice::device_md5_hash(string kernel_custom_build_options)
-{
- MD5Hash md5;
- char version[256], driver[256], name[256], vendor[256];
-
- clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
- clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
- clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
- clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
-
- md5.append((uint8_t *)vendor, strlen(vendor));
- md5.append((uint8_t *)version, strlen(version));
- md5.append((uint8_t *)name, strlen(name));
- md5.append((uint8_t *)driver, strlen(driver));
-
- string options = kernel_build_options();
- options += kernel_custom_build_options;
- md5.append((uint8_t *)options.c_str(), options.size());
-
- return md5.get_hex();
-}
-
-bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features)
-{
- VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << ".";
-  /* Verify the device was initialized. */
- if (!device_initialized) {
- fprintf(stderr, "OpenCL: failed to initialize device.\n");
- return false;
- }
-
-  /* Verify we have the right OpenCL version. */
- if (!opencl_version_check())
- return false;
-
- load_required_kernels(requested_features);
-
- vector<OpenCLProgram *> programs;
- kernel_programs.load_kernels(programs, requested_features);
-
- if (!requested_features.use_baking && requested_features.use_denoising) {
- denoising_program = OpenCLProgram(
- this, "denoising", "filter.cl", get_build_options(requested_features, "denoising"));
- denoising_program.add_kernel(ustring("filter_divide_shadow"));
- denoising_program.add_kernel(ustring("filter_get_feature"));
- denoising_program.add_kernel(ustring("filter_write_feature"));
- denoising_program.add_kernel(ustring("filter_detect_outliers"));
- denoising_program.add_kernel(ustring("filter_combine_halves"));
- denoising_program.add_kernel(ustring("filter_construct_transform"));
- denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
- denoising_program.add_kernel(ustring("filter_nlm_blur"));
- denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
- denoising_program.add_kernel(ustring("filter_nlm_update_output"));
- denoising_program.add_kernel(ustring("filter_nlm_normalize"));
- denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
- denoising_program.add_kernel(ustring("filter_finalize"));
- programs.push_back(&denoising_program);
- }
-
- load_required_kernel_task_pool.wait_work();
-
-  /* Parallel compilation of Cycles kernels: this launches multiple
-   * processes to work around OpenCL frameworks serializing the calls
-   * internally within a single process. */
- foreach (OpenCLProgram *program, programs) {
- if (!program->load()) {
- load_kernel_num_compiling++;
- load_kernel_task_pool.push([=] {
- program->compile();
- load_kernel_num_compiling--;
- });
- }
- }
- return true;
-}
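/* load_kernels() above queues one background compile per program that missed
 * the cache and tracks in-flight compiles with an atomic counter. A sketch of
 * that pattern with std::thread standing in for Cycles' task pool: */
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

std::atomic<int> num_compiling{0};

void compile_program(int program)
{
  std::printf("compiling program %d\n", program);
  num_compiling--;
}

int main()
{
  std::vector<std::thread> pool;
  for (int p = 0; p < 4; p++) {
    bool cached = (p % 2 == 0); /* stand-in for program->load() succeeding */
    if (!cached) {
      num_compiling++;
      pool.emplace_back(compile_program, p);
    }
  }
  for (std::thread &t : pool)
    t.join(); /* the real code instead waits on load_kernel_task_pool */
  std::printf("still compiling: %d\n", num_compiling.load()); /* 0 */
}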
-
-void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features)
-{
- vector<OpenCLProgram *> programs;
- base_program = OpenCLProgram(
- this, "base", "kernel_base.cl", get_build_options(requested_features, "base"));
- base_program.add_kernel(ustring("convert_to_byte"));
- base_program.add_kernel(ustring("convert_to_half_float"));
- base_program.add_kernel(ustring("zero_buffer"));
- programs.push_back(&base_program);
-
- if (requested_features.use_true_displacement) {
- displace_program = OpenCLProgram(
- this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace"));
- displace_program.add_kernel(ustring("displace"));
- programs.push_back(&displace_program);
- }
-
- if (requested_features.use_background_light) {
- background_program = OpenCLProgram(this,
- "background",
- "kernel_background.cl",
- get_build_options(requested_features, "background"));
- background_program.add_kernel(ustring("background"));
- programs.push_back(&background_program);
- }
-
- if (requested_features.use_baking) {
- bake_program = OpenCLProgram(
- this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake"));
- bake_program.add_kernel(ustring("bake"));
- programs.push_back(&bake_program);
- }
-
- foreach (OpenCLProgram *program, programs) {
- if (!program->load()) {
- load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
- }
- }
-}
-
-bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features)
-{
- if (requested_features.use_baking) {
- /* For baking, kernels have already been loaded in load_required_kernels(). */
- return true;
- }
-
- load_kernel_task_pool.wait_work();
- return split_kernel->load_kernels(requested_features);
-}
-
-OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs()
-{
- return &kernel_programs;
-}
-
-DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state()
-{
- return DEVICE_KERNEL_USING_FEATURE_KERNEL;
-}
-
-void OpenCLDevice::mem_alloc(device_memory &mem)
-{
- if (mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- size_t size = mem.memory_size();
-
- /* check there is enough memory available for the allocation */
- cl_ulong max_alloc_size = 0;
- clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
-
- if (DebugFlags().opencl.mem_limit) {
- max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
- }
-
- if (size > max_alloc_size) {
- string error = "Scene too complex to fit in available memory.";
- if (mem.name != NULL) {
- error += string_printf(" (allocating buffer %s failed.)", mem.name);
- }
- set_error(error);
-
- return;
- }
-
- cl_mem_flags mem_flag;
- void *mem_ptr = NULL;
-
- if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
- mem_flag = CL_MEM_READ_ONLY;
- else
- mem_flag = CL_MEM_READ_WRITE;
-
-  /* Zero-size allocation might be invoked by render, but is not really
-   * supported by OpenCL. Using NULL as the device pointer also doesn't really
-   * work for some reason, so for the time being we special-case this with
-   * the null_mem buffer.
- */
- if (size != 0) {
- mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
- opencl_assert_err(ciErr, "clCreateBuffer");
- }
- else {
- mem.device_pointer = 0;
- }
-
- stats.mem_alloc(size);
- mem.device_size = size;
-}
-
-void OpenCLDevice::mem_copy_to(device_memory &mem)
-{
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- global_alloc(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- tex_alloc((device_texture &)mem);
- }
- else {
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- /* this is blocking */
- size_t size = mem.memory_size();
- if (size != 0) {
- opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- 0,
- size,
- mem.host_pointer,
- 0,
- NULL,
- NULL));
- }
- }
-}
-
-void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
-{
- size_t offset = elem * y * w;
- size_t size = elem * w * h;
- assert(size != 0);
- opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- offset,
- size,
- (uchar *)mem.host_pointer + offset,
- 0,
- NULL,
- NULL));
-}
-
-void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size)
-{
- base_program.wait_for_availability();
- cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
-
- size_t global_size[] = {1024, 1024};
- size_t num_threads = global_size[0] * global_size[1];
-
- cl_mem d_buffer = CL_MEM_PTR(mem);
- cl_ulong d_offset = 0;
- cl_ulong d_size = 0;
-
- while (d_offset < size) {
- d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset);
-
- kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
-
- ciErr = clEnqueueNDRangeKernel(
- cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL);
- opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
-
- d_offset += d_size;
- }
-}
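-
-/* Illustrative arithmetic for the zero-fill loop above: with the fixed
- * 1024x1024 NDRange, num_threads = 1048576, so each pass clears at most
- * num_threads * sizeof(float4) = 16 MiB. Zeroing a 40 MiB buffer thus takes
- * three enqueues (16 + 16 + 8 MiB); d_size is clamped by size - d_offset on
- * the final pass. */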
-
-void OpenCLDevice::mem_zero(device_memory &mem)
-{
- if (!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if (mem.device_pointer) {
- if (base_program.is_loaded()) {
- mem_zero_kernel(mem.device_pointer, mem.memory_size());
- }
-
- if (mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
-
- if (!base_program.is_loaded()) {
- void *zero = mem.host_pointer;
-
- if (!mem.host_pointer) {
- zero = util_aligned_malloc(mem.memory_size(), 16);
- memset(zero, 0, mem.memory_size());
- }
-
- opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- 0,
- mem.memory_size(),
- zero,
- 0,
- NULL,
- NULL));
-
- if (!mem.host_pointer) {
- util_aligned_free(zero);
- }
- }
- }
-}
-
-void OpenCLDevice::mem_free(device_memory &mem)
-{
- if (mem.type == MEM_GLOBAL) {
- global_free(mem);
- }
- else if (mem.type == MEM_TEXTURE) {
- tex_free((device_texture &)mem);
- }
- else {
-    if (mem.device_pointer) {
-      /* The outer check already guarantees a non-zero pointer,
-       * so the mem object can be released directly. */
-      opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
-      mem.device_pointer = 0;
-
-      stats.mem_free(mem.device_size);
-      mem.device_size = 0;
-    }
- }
-}
-
-int OpenCLDevice::mem_sub_ptr_alignment()
-{
- return OpenCLInfo::mem_sub_ptr_alignment(cdDevice);
-}
-
-device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size)
-{
- cl_mem_flags mem_flag;
- if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
- mem_flag = CL_MEM_READ_ONLY;
- else
- mem_flag = CL_MEM_READ_WRITE;
-
- cl_buffer_region info;
- info.origin = mem.memory_elements_size(offset);
- info.size = mem.memory_elements_size(size);
-
- device_ptr sub_buf = (device_ptr)clCreateSubBuffer(
- CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr);
- opencl_assert_err(ciErr, "clCreateSubBuffer");
- return sub_buf;
-}
-
-void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer)
-{
- if (device_pointer != 0) {
- opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
- }
-}
-
-void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size)
-{
- ConstMemMap::iterator i = const_mem_map.find(name);
- device_vector<uchar> *data;
-
- if (i == const_mem_map.end()) {
- data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
- data->alloc(size);
- const_mem_map.insert(ConstMemMap::value_type(name, data));
- }
- else {
- data = i->second;
- }
-
- memcpy(data->data(), host, size);
- data->copy_to_device();
-}
-
-void OpenCLDevice::global_alloc(device_memory &mem)
-{
- VLOG(1) << "Global memory allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- memory_manager.alloc(mem.name, mem);
-  /* Set the pointer to non-null to keep code that inspects its value from thinking it's
-   * unallocated. */
- mem.device_pointer = 1;
- textures[mem.name] = &mem;
- textures_need_update = true;
-}
-
-void OpenCLDevice::global_free(device_memory &mem)
-{
- if (mem.device_pointer) {
- mem.device_pointer = 0;
-
- if (memory_manager.free(mem)) {
- textures_need_update = true;
- }
-
- foreach (TexturesMap::value_type &value, textures) {
- if (value.second == &mem) {
- textures.erase(value.first);
- break;
- }
- }
- }
-}
-
-void OpenCLDevice::tex_alloc(device_texture &mem)
-{
- VLOG(1) << "Texture allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- memory_manager.alloc(mem.name, mem);
-  /* Set the pointer to non-null to keep code that inspects its value from thinking it's
-   * unallocated. */
- mem.device_pointer = 1;
- textures[mem.name] = &mem;
- textures_need_update = true;
-}
-
-void OpenCLDevice::tex_free(device_texture &mem)
-{
- global_free(mem);
-}
-
-size_t OpenCLDevice::global_size_round_up(int group_size, int global_size)
-{
- int r = global_size % group_size;
- return global_size + ((r == 0) ? 0 : group_size - r);
-}
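-
-/* Worked example: global_size_round_up(64, 1000) computes r = 1000 % 64 = 40
- * and returns 1000 + (64 - 40) = 1024, the next multiple of the work-group
- * size. OpenCL 1.x requires the global size to be a multiple of the local
- * size, so kernels are expected to bounds-check their global ID. */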
-
-void OpenCLDevice::enqueue_kernel(
- cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
-{
- size_t workgroup_size, max_work_items[3];
-
- clGetKernelWorkGroupInfo(
- kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
- clGetDeviceInfo(
- cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL);
-
- if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
- workgroup_size = max_workgroup_size;
- }
-
- /* Try to divide evenly over 2 dimensions. */
- size_t local_size[2];
- if (x_workgroups) {
- local_size[0] = workgroup_size;
- local_size[1] = 1;
- }
- else {
- size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
- local_size[0] = local_size[1] = sqrt_workgroup_size;
- }
-
- /* Some implementations have max size 1 on 2nd dimension. */
- if (local_size[1] > max_work_items[1]) {
- local_size[0] = workgroup_size / max_work_items[1];
- local_size[1] = max_work_items[1];
- }
-
- size_t global_size[2] = {global_size_round_up(local_size[0], w),
- global_size_round_up(local_size[1], h)};
-
-  /* A vertical size of 1 comes from the bake/shade kernels, where we should
-   * not round anything up: otherwise we would either do too much work per
-   * pixel (if the global ID on the Y axis is not checked) or have to check
-   * that the global ID always has Y == 0.
-   */
-  if (h == 1) {
-    global_size[1] = 1;
-  }
-
- /* run kernel */
- opencl_assert(
- clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
- opencl_assert(clFlush(cqCommandQueue));
-}
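-
-/* Illustrative: for a kernel whose CL_KERNEL_WORK_GROUP_SIZE is 256, the
- * logic above picks local_size = {16, 16} (sqrt(256)) in the 2D case, or
- * {256, 1} when x_workgroups is set. A 1920x1080 launch with {16, 16} is
- * then rounded up to global_size = {1920, 1088}. */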
-
-void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
-{
- cl_mem ptr;
-
- MemMap::iterator i = mem_map.find(name);
- if (i != mem_map.end()) {
- ptr = CL_MEM_PTR(i->second);
- }
- else {
- ptr = 0;
- }
-
- opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr));
-}
-
-void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
-{
- flush_texture_buffers();
-
- memory_manager.set_kernel_arg_buffers(kernel, narg);
-}
-
-void OpenCLDevice::flush_texture_buffers()
-{
- if (!textures_need_update) {
- return;
- }
- textures_need_update = false;
-
- /* Setup slots for textures. */
- int num_slots = 0;
-
- vector<texture_slot_t> texture_slots;
-
-# define KERNEL_TEX(type, name) \
- if (textures.find(#name) != textures.end()) { \
- texture_slots.push_back(texture_slot_t(#name, num_slots)); \
- } \
- num_slots++;
-# include "kernel/kernel_textures.h"
-
- int num_data_slots = num_slots;
-
- foreach (TexturesMap::value_type &tex, textures) {
- string name = tex.first;
- device_memory *mem = tex.second;
-
- if (mem->type == MEM_TEXTURE) {
- const uint id = ((device_texture *)mem)->slot;
- texture_slots.push_back(texture_slot_t(name, num_data_slots + id));
- num_slots = max(num_slots, num_data_slots + id + 1);
- }
- }
-
- /* Realloc texture descriptors buffer. */
- memory_manager.free(texture_info);
- texture_info.resize(num_slots);
- memory_manager.alloc("texture_info", texture_info);
-
- /* Fill in descriptors */
- foreach (texture_slot_t &slot, texture_slots) {
- device_memory *mem = textures[slot.name];
- TextureInfo &info = texture_info[slot.slot];
-
- MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
-
- if (mem->type == MEM_TEXTURE) {
- info = ((device_texture *)mem)->info;
- }
- else {
- memset(&info, 0, sizeof(TextureInfo));
- }
-
- info.data = desc.offset;
- info.cl_buffer = desc.device_buffer;
- }
-
- /* Force write of descriptors. */
- memory_manager.free(texture_info);
- memory_manager.alloc("texture_info", texture_info);
-}
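-
-/* Slot layout built above (a sketch): the KERNEL_TEX entries occupy the
- * first num_data_slots slots in declaration order, and image textures
- * follow at num_data_slots + device_texture::slot. For example, with 30
- * KERNEL_TEX entries, an image texture with slot 2 is described by
- * texture_info[32]; its info.data holds the byte offset of the texture
- * inside the memory manager's device buffer. */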
-
-void OpenCLDevice::thread_run(DeviceTask &task)
-{
- flush_texture_buffers();
-
- if (task.type == DeviceTask::RENDER) {
- RenderTile tile;
- DenoisingTask denoising(this, task);
-
- /* Allocate buffer for kernel globals */
- device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
- kgbuffer.alloc_to_device(1);
-
- /* Keep rendering tiles until done. */
- while (task.acquire_tile(this, tile, task.tile_types)) {
-      if (tile.task == RenderTile::PATH_TRACE) {
- scoped_timer timer(&tile.buffers->render_time);
-
- split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]);
-
-        /* Complete kernel execution before releasing the tile. This helps
-         * in multi-device renders: release_tile() is a critical section, and
-         * a device that enters it stalls the other devices from entering it
-         * until it returns. If a slow device reached release_tile() with
-         * kernels still queued, it would keep a fast device from proceeding
-         * to render its next tile.
-         */
- clFinish(cqCommandQueue);
- }
- else if (tile.task == RenderTile::BAKE) {
- bake(task, tile);
- }
- else if (tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-
- task.release_tile(tile);
- }
-
- kgbuffer.free();
- }
- else if (task.type == DeviceTask::SHADER) {
- shader(task);
- }
- else if (task.type == DeviceTask::FILM_CONVERT) {
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- }
- else if (task.type == DeviceTask::DENOISE_BUFFER) {
- RenderTile tile;
- tile.x = task.x;
- tile.y = task.y;
- tile.w = task.w;
- tile.h = task.h;
- tile.buffer = task.buffer;
- tile.sample = task.sample + task.num_samples;
- tile.num_samples = task.num_samples;
- tile.start_sample = task.sample;
- tile.offset = task.offset;
- tile.stride = task.stride;
- tile.buffers = task.buffers;
-
- DenoisingTask denoising(this, task);
- denoise(tile, denoising);
- task.update_progress(&tile, tile.w * tile.h);
- }
-}
-
-void OpenCLDevice::film_convert(DeviceTask &task,
- device_ptr buffer,
- device_ptr rgba_byte,
- device_ptr rgba_half)
-{
- /* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half);
- cl_mem d_buffer = CL_MEM_PTR(buffer);
- cl_int d_x = task.x;
- cl_int d_y = task.y;
- cl_int d_w = task.w;
- cl_int d_h = task.h;
- cl_float d_sample_scale = 1.0f / (task.sample + 1);
- cl_int d_offset = task.offset;
- cl_int d_stride = task.stride;
-
- cl_kernel ckFilmConvertKernel = (rgba_byte) ? base_program(ustring("convert_to_byte")) :
- base_program(ustring("convert_to_half_float"));
-
- cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer);
-
- set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(ckFilmConvertKernel,
- start_arg_index,
- d_sample_scale,
- d_x,
- d_y,
- d_w,
- d_h,
- d_offset,
- d_stride);
-
- enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
-}
-
-bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task)
-{
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
- int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
-
- device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
- device_sub_ptr blurDifference(
- task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
- device_sub_ptr weightAccum(
- task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride);
- cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum);
- cl_mem difference_mem = CL_MEM_PTR(*difference);
- cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
-
- cl_mem image_mem = CL_MEM_PTR(image_ptr);
- cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
- cl_mem out_mem = CL_MEM_PTR(out_ptr);
- cl_mem scale_mem = NULL;
-
- mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride);
- mem_zero_kernel(out_ptr, sizeof(float) * pass_stride);
-
- cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
- cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
- cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
- cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
- cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
-
- kernel_set_args(ckNLMCalcDifference,
- 0,
- guide_mem,
- variance_mem,
- scale_mem,
- difference_mem,
- w,
- h,
- stride,
- pass_stride,
- r,
- channel_offset,
- 0,
- a,
- k_2);
- kernel_set_args(
- ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f);
- kernel_set_args(
- ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f);
- kernel_set_args(ckNLMUpdateOutput,
- 0,
- blurDifference_mem,
- image_mem,
- out_mem,
- weightAccum_mem,
- w,
- h,
- stride,
- pass_stride,
- channel_offset,
- r,
- f);
-
- enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true);
-
- kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride);
- enqueue_kernel(ckNLMNormalize, w, h);
-
- return true;
-}
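-
-/* Layout of task->buffer.temporary_mem as used above, in floats, with
- * p = pass_stride and n = num_shifts = (2*r + 1)^2:
- *
- *   [0, p*n)          difference
- *   [p*n, 2*p*n)      blurDifference
- *   [2*p*n, 2*p*n+p)  weightAccum
- *
- * e.g. r = 8 gives n = 289 shifted copies processed per kernel launch. */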
-
-bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task)
-{
- cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
- cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- char use_time = task->buffer.use_time ? 1 : 0;
-
- cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
-
- int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterConstructTransform,
- arg_ofs,
- transform_mem,
- rank_mem,
- task->filter_area,
- task->rect,
- task->buffer.pass_stride,
- task->buffer.frame_stride,
- use_time,
- task->radius,
- task->pca_threshold);
-
- enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
-{
- cl_mem color_mem = CL_MEM_PTR(color_ptr);
- cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
- cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
-
- cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
- cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
- cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
-
- cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
- cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
- cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
- cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
- char use_time = task->buffer.use_time ? 1 : 0;
-
- int r = task->radius;
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2 * r + 1) * (2 * r + 1);
-
- device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
- device_sub_ptr blurDifference(
- task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
- cl_mem difference_mem = CL_MEM_PTR(*difference);
- cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
-
- kernel_set_args(ckNLMCalcDifference,
- 0,
- color_mem,
- color_variance_mem,
- scale_mem,
- difference_mem,
- w,
- h,
- stride,
- pass_stride,
- r,
- pass_stride,
- frame_offset,
- 1.0f,
- task->nlm_k_2);
- kernel_set_args(
- ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4);
- kernel_set_args(
- ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4);
- kernel_set_args(ckNLMConstructGramian,
- 0,
- t,
- blurDifference_mem,
- buffer_mem,
- transform_mem,
- rank_mem,
- XtWX_mem,
- XtWY_mem,
- task->reconstruction_state.filter_window,
- w,
- h,
- stride,
- pass_stride,
- r,
- 4,
- frame_offset,
- use_time);
-
- enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
- enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
-{
- cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
-
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
- cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
-
- kernel_set_args(ckFinalize,
- 0,
- output_mem,
- rank_mem,
- XtWX_mem,
- XtWY_mem,
- task->filter_area,
- task->reconstruction_state.buffer_params,
- task->render_buffer.samples);
- enqueue_kernel(ckFinalize, w, h);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect,
- DenoisingTask *task)
-{
- cl_mem a_mem = CL_MEM_PTR(a_ptr);
- cl_mem b_mem = CL_MEM_PTR(b_ptr);
- cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
-
- cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
-
- kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r);
- enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task)
-{
- cl_mem a_mem = CL_MEM_PTR(a_ptr);
- cl_mem b_mem = CL_MEM_PTR(b_ptr);
- cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
- cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
- cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
-
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
-
- int arg_ofs = kernel_set_args(
- ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterDivideShadow,
- arg_ofs,
- a_mem,
- b_mem,
- sample_variance_mem,
- sv_variance_mem,
- buffer_variance_mem,
- task->rect,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
-{
- cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
-
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
-
- int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem);
- cl_mem buffers[9];
- for (int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]);
- }
- kernel_set_args(ckFilterGetFeature,
- arg_ofs,
- mean_offset,
- variance_offset,
- mean_mem,
- variance_mem,
- scale,
- task->rect,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
-{
- cl_mem from_mem = CL_MEM_PTR(from_ptr);
- cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
-
- cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
-
- kernel_set_args(ckFilterWriteFeature,
- 0,
- task->render_buffer.samples,
- task->reconstruction_state.buffer_params,
- task->filter_area,
- from_mem,
- buffer_mem,
- out_offset,
- task->rect);
- enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w);
-
- return true;
-}
-
-bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
-{
- cl_mem image_mem = CL_MEM_PTR(image_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
- cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
-
- cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
-
- kernel_set_args(ckFilterDetectOutliers,
- 0,
- image_mem,
- variance_mem,
- depth_mem,
- output_mem,
- task->rect,
- task->buffer.pass_stride);
- enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
-
- return true;
-}
-
-void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
-{
- denoising.functions.construct_transform = function_bind(
- &OpenCLDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(
- &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(
- &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(
- &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(
- &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(
- &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(
- &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(
- &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(rtile);
-}
-
-void OpenCLDevice::shader(DeviceTask &task)
-{
- /* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_input = CL_MEM_PTR(task.shader_input);
- cl_mem d_output = CL_MEM_PTR(task.shader_output);
- cl_int d_shader_eval_type = task.shader_eval_type;
- cl_int d_shader_filter = task.shader_filter;
- cl_int d_shader_x = task.shader_x;
- cl_int d_shader_w = task.shader_w;
- cl_int d_offset = task.offset;
-
- OpenCLDevice::OpenCLProgram *program = &background_program;
- if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- program = &displace_program;
- }
- program->wait_for_availability();
- cl_kernel kernel = (*program)();
-
- cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output);
-
- set_kernel_arg_buffers(kernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type);
- if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter);
- }
- start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset);
-
- for (int sample = 0; sample < task.num_samples; sample++) {
-
- if (task.get_cancel())
- break;
-
- kernel_set_args(kernel, start_arg_index, sample);
-
- enqueue_kernel(kernel, task.shader_w, 1);
-
- clFinish(cqCommandQueue);
-
- task.update_progress(NULL);
- }
-}
-
-void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile)
-{
- scoped_timer timer(&rtile.buffers->render_time);
-
- /* Cast arguments to cl types. */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
- cl_int d_x = rtile.x;
- cl_int d_y = rtile.y;
- cl_int d_w = rtile.w;
- cl_int d_h = rtile.h;
- cl_int d_offset = rtile.offset;
- cl_int d_stride = rtile.stride;
-
- bake_program.wait_for_availability();
- cl_kernel kernel = bake_program();
-
- cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer);
-
- set_kernel_arg_buffers(kernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(
- kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride);
-
- int start_sample = rtile.start_sample;
- int end_sample = rtile.start_sample + rtile.num_samples;
-
- for (int sample = start_sample; sample < end_sample; sample++) {
- if (task.get_cancel()) {
- if (task.need_finish_queue == false)
- break;
- }
-
- kernel_set_args(kernel, start_arg_index, sample);
-
- enqueue_kernel(kernel, d_w, d_h);
- clFinish(cqCommandQueue);
-
- rtile.sample = sample + 1;
-
- task.update_progress(&rtile, rtile.w * rtile.h);
- }
-}
-
-static bool kernel_build_opencl_2(cl_device_id cdDevice)
-{
-  /* Build with OpenCL 2.0 if available; this improves performance
-   * with AMD OpenCL drivers on Windows and Linux (legacy drivers).
-   * Note that OpenCL selects the highest 1.x version by default;
-   * only for 2.0 do we need the explicit compiler flag. */
- int version_major, version_minor;
- if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) {
- if (version_major >= 2) {
-      /* This appears to trigger a driver bug in Radeon RX cards with certain
-       * driver versions, so don't use OpenCL 2.0 for those. */
- string device_name = OpenCLInfo::get_readable_device_name(cdDevice);
- if (string_startswith(device_name, "Radeon RX 4") ||
- string_startswith(device_name, "Radeon (TM) RX 4") ||
- string_startswith(device_name, "Radeon RX 5") ||
- string_startswith(device_name, "Radeon (TM) RX 5")) {
- char version[256] = "";
- int driver_major, driver_minor;
- clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) {
- return !(driver_major == 3075 && driver_minor <= 12);
- }
- }
-
- return true;
- }
- }
-
- return false;
-}
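-
-/* Illustrative: an affected card reports CL_DEVICE_VERSION as something like
- * "OpenCL 2.0 AMD-APP (3075.10)", which sscanf parses to driver_major = 3075
- * and driver_minor = 10, so the function returns false and the kernel is
- * built with the default OpenCL 1.x standard instead. */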
-
-string OpenCLDevice::kernel_build_options(const string *debug_src)
-{
- string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
-
- if (kernel_build_opencl_2(cdDevice)) {
- build_options += "-cl-std=CL2.0 ";
- }
-
- if (platform_name == "NVIDIA CUDA") {
- build_options +=
- "-D__KERNEL_OPENCL_NVIDIA__ "
- "-cl-nv-maxrregcount=32 "
- "-cl-nv-verbose ";
-
- uint compute_capability_major, compute_capability_minor;
- clGetDeviceInfo(cdDevice,
- CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
- sizeof(cl_uint),
- &compute_capability_major,
- NULL);
- clGetDeviceInfo(cdDevice,
- CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
- sizeof(cl_uint),
- &compute_capability_minor,
- NULL);
-
- build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
- compute_capability_major * 100 + compute_capability_minor * 10);
- }
-
- else if (platform_name == "Apple")
- build_options += "-D__KERNEL_OPENCL_APPLE__ ";
-
- else if (platform_name == "AMD Accelerated Parallel Processing")
- build_options += "-D__KERNEL_OPENCL_AMD__ ";
-
- else if (platform_name == "Intel(R) OpenCL") {
- build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
-
-    /* Options for gdb source-level kernel debugging.
-     * This currently segfaults on Linux.
-     */
- if (OpenCLInfo::use_debug() && debug_src)
- build_options += "-g -s \"" + *debug_src + "\" ";
- }
-
- if (info.has_half_images) {
- build_options += "-D__KERNEL_CL_KHR_FP16__ ";
- }
-
- if (OpenCLInfo::use_debug()) {
- build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
- }
-
-# ifdef WITH_NANOVDB
- if (info.has_nanovdb) {
- build_options += "-DWITH_NANOVDB ";
- }
-# endif
-
- return build_options;
-}
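-
-/* For example, on an NVIDIA GPU with compute capability 6.1 the resulting
- * options are roughly (depending on the checks above):
- *
- *   -cl-no-signed-zeros -cl-mad-enable -D__KERNEL_OPENCL_NVIDIA__
- *   -cl-nv-maxrregcount=32 -cl-nv-verbose -D__COMPUTE_CAPABILITY__=610
- *
- * possibly followed by half-float, debug and NanoVDB defines. */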
-
-/* TODO(sergey): In the future we can use variadic templates, once
- * C++0x is allowed. That should allow us to clean this up a bit.
- */
-int OpenCLDevice::kernel_set_args(cl_kernel kernel,
- int start_argument_index,
- const ArgumentWrapper &arg1,
- const ArgumentWrapper &arg2,
- const ArgumentWrapper &arg3,
- const ArgumentWrapper &arg4,
- const ArgumentWrapper &arg5,
- const ArgumentWrapper &arg6,
- const ArgumentWrapper &arg7,
- const ArgumentWrapper &arg8,
- const ArgumentWrapper &arg9,
- const ArgumentWrapper &arg10,
- const ArgumentWrapper &arg11,
- const ArgumentWrapper &arg12,
- const ArgumentWrapper &arg13,
- const ArgumentWrapper &arg14,
- const ArgumentWrapper &arg15,
- const ArgumentWrapper &arg16,
- const ArgumentWrapper &arg17,
- const ArgumentWrapper &arg18,
- const ArgumentWrapper &arg19,
- const ArgumentWrapper &arg20,
- const ArgumentWrapper &arg21,
- const ArgumentWrapper &arg22,
- const ArgumentWrapper &arg23,
- const ArgumentWrapper &arg24,
- const ArgumentWrapper &arg25,
- const ArgumentWrapper &arg26,
- const ArgumentWrapper &arg27,
- const ArgumentWrapper &arg28,
- const ArgumentWrapper &arg29,
- const ArgumentWrapper &arg30,
- const ArgumentWrapper &arg31,
- const ArgumentWrapper &arg32,
- const ArgumentWrapper &arg33)
-{
- int current_arg_index = 0;
-# define FAKE_VARARG_HANDLE_ARG(arg) \
- do { \
- if (arg.pointer != NULL) { \
- opencl_assert(clSetKernelArg( \
- kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \
- ++current_arg_index; \
- } \
- else { \
- return current_arg_index; \
- } \
- } while (false)
- FAKE_VARARG_HANDLE_ARG(arg1);
- FAKE_VARARG_HANDLE_ARG(arg2);
- FAKE_VARARG_HANDLE_ARG(arg3);
- FAKE_VARARG_HANDLE_ARG(arg4);
- FAKE_VARARG_HANDLE_ARG(arg5);
- FAKE_VARARG_HANDLE_ARG(arg6);
- FAKE_VARARG_HANDLE_ARG(arg7);
- FAKE_VARARG_HANDLE_ARG(arg8);
- FAKE_VARARG_HANDLE_ARG(arg9);
- FAKE_VARARG_HANDLE_ARG(arg10);
- FAKE_VARARG_HANDLE_ARG(arg11);
- FAKE_VARARG_HANDLE_ARG(arg12);
- FAKE_VARARG_HANDLE_ARG(arg13);
- FAKE_VARARG_HANDLE_ARG(arg14);
- FAKE_VARARG_HANDLE_ARG(arg15);
- FAKE_VARARG_HANDLE_ARG(arg16);
- FAKE_VARARG_HANDLE_ARG(arg17);
- FAKE_VARARG_HANDLE_ARG(arg18);
- FAKE_VARARG_HANDLE_ARG(arg19);
- FAKE_VARARG_HANDLE_ARG(arg20);
- FAKE_VARARG_HANDLE_ARG(arg21);
- FAKE_VARARG_HANDLE_ARG(arg22);
- FAKE_VARARG_HANDLE_ARG(arg23);
- FAKE_VARARG_HANDLE_ARG(arg24);
- FAKE_VARARG_HANDLE_ARG(arg25);
- FAKE_VARARG_HANDLE_ARG(arg26);
- FAKE_VARARG_HANDLE_ARG(arg27);
- FAKE_VARARG_HANDLE_ARG(arg28);
- FAKE_VARARG_HANDLE_ARG(arg29);
- FAKE_VARARG_HANDLE_ARG(arg30);
- FAKE_VARARG_HANDLE_ARG(arg31);
- FAKE_VARARG_HANDLE_ARG(arg32);
- FAKE_VARARG_HANDLE_ARG(arg33);
-# undef FAKE_VARARG_HANDLE_ARG
- return current_arg_index;
-}
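-
-/* Usage sketch: the ArgumentWrapper defaults terminate the list, and the
- * return value is the number of arguments that were set, so calls can be
- * chained the way film_convert() and bake() do:
- *
- *   cl_uint narg = kernel_set_args(kernel, 0, d_data, d_buffer);  // narg == 2
- *   narg += kernel_set_args(kernel, narg, d_x, d_y, d_w, d_h);    // narg == 6
- */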
-
-void OpenCLDevice::release_kernel_safe(cl_kernel kernel)
-{
- if (kernel) {
- clReleaseKernel(kernel);
- }
-}
-
-void OpenCLDevice::release_mem_object_safe(cl_mem mem)
-{
- if (mem != NULL) {
- clReleaseMemObject(mem);
- }
-}
-
-void OpenCLDevice::release_program_safe(cl_program program)
-{
- if (program) {
- clReleaseProgram(program);
- }
-}
-
-/* ** These helpers work around some compiler-specific bugs. ** */
-
-cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker)
-{
- return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker);
-}
-
-void OpenCLDevice::store_cached_kernel(cl_program program,
- ustring key,
- thread_scoped_lock &cache_locker)
-{
- OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker);
-}
-
-Device *opencl_create_split_device(DeviceInfo &info,
- Stats &stats,
- Profiler &profiler,
- bool background)
-{
- return new OpenCLDevice(info, stats, profiler, background);
-}
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
deleted file mode 100644
index 4330e07cb37..00000000000
--- a/intern/cycles/device/opencl/memory_manager.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "util/util_foreach.h"
-
-# include "device/opencl/device_opencl.h"
-# include "device/opencl/memory_manager.h"
-
-CCL_NAMESPACE_BEGIN
-
-void MemoryManager::DeviceBuffer::add_allocation(Allocation &allocation)
-{
- allocations.push_back(&allocation);
-}
-
-void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device)
-{
- bool need_realloc = false;
-
-  /* Calculate total size and remove any freed allocations. */
- size_t total_size = 0;
-
- for (int i = allocations.size() - 1; i >= 0; i--) {
- Allocation *allocation = allocations[i];
-
- /* Remove allocations that have been freed. */
- if (!allocation->mem || allocation->mem->memory_size() == 0) {
- allocation->device_buffer = NULL;
- allocation->size = 0;
-
- allocations.erase(allocations.begin() + i);
-
- need_realloc = true;
-
- continue;
- }
-
- /* Get actual size for allocation. */
- size_t alloc_size = align_up(allocation->mem->memory_size(), 16);
-
- if (allocation->size != alloc_size) {
- /* Allocation is either new or resized. */
- allocation->size = alloc_size;
- allocation->needs_copy_to_device = true;
-
- need_realloc = true;
- }
-
- total_size += alloc_size;
- }
-
-  /* Always allocate a non-empty buffer; NULL pointers cause problems with some drivers. */
- total_size = std::max(total_size, (size_t)16);
-
- if (need_realloc) {
- cl_ulong max_buffer_size;
- clGetDeviceInfo(
- device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
-
- if (total_size > max_buffer_size) {
- device->set_error("Scene too complex to fit in available memory.");
- return;
- }
-
- device_only_memory<uchar> *new_buffer = new device_only_memory<uchar>(device,
- "memory manager buffer");
-
- new_buffer->alloc_to_device(total_size);
-
- size_t offset = 0;
-
- foreach (Allocation *allocation, allocations) {
- if (allocation->needs_copy_to_device) {
- /* Copy from host to device. */
- opencl_device_assert(device,
- clEnqueueWriteBuffer(device->cqCommandQueue,
- CL_MEM_PTR(new_buffer->device_pointer),
- CL_FALSE,
- offset,
- allocation->mem->memory_size(),
- allocation->mem->host_pointer,
- 0,
- NULL,
- NULL));
-
- allocation->needs_copy_to_device = false;
- }
- else {
- /* Fast copy from memory already on device. */
- opencl_device_assert(device,
- clEnqueueCopyBuffer(device->cqCommandQueue,
- CL_MEM_PTR(buffer->device_pointer),
- CL_MEM_PTR(new_buffer->device_pointer),
- allocation->desc.offset,
- offset,
- allocation->mem->memory_size(),
- 0,
- NULL,
- NULL));
- }
-
- allocation->desc.offset = offset;
- offset += allocation->size;
- }
-
- delete buffer;
-
- buffer = new_buffer;
- }
- else {
- assert(total_size == buffer->data_size);
-
- size_t offset = 0;
-
- foreach (Allocation *allocation, allocations) {
- if (allocation->needs_copy_to_device) {
- /* Copy from host to device. */
- opencl_device_assert(device,
- clEnqueueWriteBuffer(device->cqCommandQueue,
- CL_MEM_PTR(buffer->device_pointer),
- CL_FALSE,
- offset,
- allocation->mem->memory_size(),
- allocation->mem->host_pointer,
- 0,
- NULL,
- NULL));
-
- allocation->needs_copy_to_device = false;
- }
-
- offset += allocation->size;
- }
- }
-
- /* Not really necessary, but seems to improve responsiveness for some reason. */
- clFinish(device->cqCommandQueue);
-}
-
-void MemoryManager::DeviceBuffer::free(OpenCLDevice *)
-{
- buffer->free();
-}
-
-MemoryManager::DeviceBuffer *MemoryManager::smallest_device_buffer()
-{
- DeviceBuffer *smallest = device_buffers;
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- if (device_buffer.size < smallest->size) {
- smallest = &device_buffer;
- }
- }
-
- return smallest;
-}
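-
-/* Always filling the least-used of the NUM_DEVICE_BUFFERS buffers is a
- * greedy balancing heuristic: e.g. with per-buffer totals of {64 MB, 32 MB}
- * a new allocation lands in the 32 MB buffer, keeping the buffers (and thus
- * the largest single clCreateBuffer request) roughly even in size. */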
-
-MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false)
-{
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.buffer = new device_only_memory<uchar>(device, "memory manager buffer");
- }
-}
-
-void MemoryManager::free()
-{
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.free(device);
- }
-}
-
-void MemoryManager::alloc(const char *name, device_memory &mem)
-{
- Allocation &allocation = allocations[name];
-
- allocation.mem = &mem;
- allocation.needs_copy_to_device = true;
-
- if (!allocation.device_buffer) {
- DeviceBuffer *device_buffer = smallest_device_buffer();
- allocation.device_buffer = device_buffer;
-
- allocation.desc.device_buffer = device_buffer - device_buffers;
-
- device_buffer->add_allocation(allocation);
-
- device_buffer->size += mem.memory_size();
- }
-
- need_update = true;
-}
-
-bool MemoryManager::free(device_memory &mem)
-{
- foreach (AllocationsMap::value_type &value, allocations) {
- Allocation &allocation = value.second;
- if (allocation.mem == &mem) {
-
- allocation.device_buffer->size -= mem.memory_size();
-
- allocation.mem = NULL;
- allocation.needs_copy_to_device = false;
-
- need_update = true;
- return true;
- }
- }
-
- return false;
-}
-
-MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name)
-{
- update_device_memory();
-
- Allocation &allocation = allocations[name];
- return allocation.desc;
-}
-
-void MemoryManager::update_device_memory()
-{
- if (!need_update) {
- return;
- }
-
- need_update = false;
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- device_buffer.update_device_memory(device);
- }
-}
-
-void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
-{
- update_device_memory();
-
- foreach (DeviceBuffer &device_buffer, device_buffers) {
- if (device_buffer.buffer->device_pointer) {
- device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer);
- }
- else {
- device->kernel_set_args(kernel, (*narg)++);
- }
- }
-}
-
-CCL_NAMESPACE_END
-
-#endif /* WITH_OPENCL */
diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h
deleted file mode 100644
index 23624f837a6..00000000000
--- a/intern/cycles/device/opencl/memory_manager.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "device/device.h"
-
-#include "util/util_map.h"
-#include "util/util_string.h"
-#include "util/util_vector.h"
-
-#include "clew.h"
-
-CCL_NAMESPACE_BEGIN
-
-class OpenCLDevice;
-
-class MemoryManager {
- public:
- static const int NUM_DEVICE_BUFFERS = 8;
-
- struct BufferDescriptor {
- uint device_buffer;
- cl_ulong offset;
- };
-
- private:
- struct DeviceBuffer;
-
- struct Allocation {
- device_memory *mem;
-
- DeviceBuffer *device_buffer;
- size_t size; /* Size of actual allocation, may be larger than requested. */
-
- BufferDescriptor desc;
-
- bool needs_copy_to_device;
-
- Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false)
- {
- }
- };
-
- struct DeviceBuffer {
- device_only_memory<uchar> *buffer;
- vector<Allocation *> allocations;
- size_t size; /* Size of all allocations. */
-
- DeviceBuffer() : buffer(NULL), size(0)
- {
- }
-
- ~DeviceBuffer()
- {
- delete buffer;
- buffer = NULL;
- }
-
- void add_allocation(Allocation &allocation);
-
- void update_device_memory(OpenCLDevice *device);
-
- void free(OpenCLDevice *device);
- };
-
- OpenCLDevice *device;
-
- DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS];
-
- typedef unordered_map<string, Allocation> AllocationsMap;
- AllocationsMap allocations;
-
- bool need_update;
-
- DeviceBuffer *smallest_device_buffer();
-
- public:
- MemoryManager(OpenCLDevice *device);
-
- void free(); /* Free all memory. */
-
- void alloc(const char *name, device_memory &mem);
- bool free(device_memory &mem);
-
- BufferDescriptor get_descriptor(string name);
-
- void update_device_memory();
- void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
-};
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
deleted file mode 100644
index 3929cf77f15..00000000000
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ /dev/null
@@ -1,1326 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_OPENCL
-
-# include "device/device_intern.h"
-# include "device/opencl/device_opencl.h"
-
-# include "util/util_debug.h"
-# include "util/util_logging.h"
-# include "util/util_md5.h"
-# include "util/util_path.h"
-# include "util/util_semaphore.h"
-# include "util/util_system.h"
-# include "util/util_time.h"
-
-using std::cerr;
-using std::endl;
-
-CCL_NAMESPACE_BEGIN
-
-OpenCLCache::Slot::ProgramEntry::ProgramEntry() : program(NULL), mutex(NULL)
-{
-}
-
-OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry &rhs)
- : program(rhs.program), mutex(NULL)
-{
-}
-
-OpenCLCache::Slot::ProgramEntry::~ProgramEntry()
-{
- delete mutex;
-}
-
-OpenCLCache::Slot::Slot() : context_mutex(NULL), context(NULL)
-{
-}
-
-OpenCLCache::Slot::Slot(const Slot &rhs)
- : context_mutex(NULL), context(NULL), programs(rhs.programs)
-{
-}
-
-OpenCLCache::Slot::~Slot()
-{
- delete context_mutex;
-}
-
-OpenCLCache &OpenCLCache::global_instance()
-{
- static OpenCLCache instance;
- return instance;
-}
-
-cl_context OpenCLCache::get_context(cl_platform_id platform,
- cl_device_id device,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- pair<CacheMap::iterator, bool> ins = self.cache.insert(
- CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
-
- Slot &slot = ins.first->second;
-
- /* create slot lock only while holding cache lock */
- if (!slot.context_mutex)
- slot.context_mutex = new thread_mutex;
-
- /* need to unlock cache before locking slot, to allow store to complete */
- cache_lock.unlock();
-
- /* lock the slot */
- slot_locker = thread_scoped_lock(*slot.context_mutex);
-
-  /* If the context isn't cached yet. */
- if (slot.context == NULL) {
- /* return with the caller's lock holder holding the slot lock */
- return NULL;
- }
-
- /* the item was already cached, release the slot lock */
- slot_locker.unlock();
-
- cl_int ciErr = clRetainContext(slot.context);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-
- return slot.context;
-}
-
-cl_program OpenCLCache::get_program(cl_platform_id platform,
- cl_device_id device,
- ustring key,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- pair<CacheMap::iterator, bool> ins = self.cache.insert(
- CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
-
- Slot &slot = ins.first->second;
-
- pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert(
- Slot::EntryMap::value_type(key, Slot::ProgramEntry()));
-
- Slot::ProgramEntry &entry = ins2.first->second;
-
- /* create slot lock only while holding cache lock */
- if (!entry.mutex)
- entry.mutex = new thread_mutex;
-
- /* need to unlock cache before locking slot, to allow store to complete */
- cache_lock.unlock();
-
- /* lock the slot */
- slot_locker = thread_scoped_lock(*entry.mutex);
-
-  /* If the program isn't cached yet. */
- if (entry.program == NULL) {
- /* return with the caller's lock holder holding the slot lock */
- return NULL;
- }
-
- /* the item was already cached, release the slot lock */
- slot_locker.unlock();
-
- cl_int ciErr = clRetainProgram(entry.program);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-
- return entry.program;
-}
-
-void OpenCLCache::store_context(cl_platform_id platform,
- cl_device_id device,
- cl_context context,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
- assert(device != NULL);
- assert(context != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-  CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
-  cache_lock.unlock();
-
-  /* Sanity check: the slot must exist before the iterator is dereferenced. */
-  assert(i != self.cache.end());
-
-  Slot &slot = i->second;
-  assert(slot.context == NULL);
-
- slot.context = context;
-
- /* unlock the slot */
- slot_locker.unlock();
-
- /* increment reference count in OpenCL.
- * The caller is going to release the object when done with it. */
- cl_int ciErr = clRetainContext(context);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-}
-
-void OpenCLCache::store_program(cl_platform_id platform,
- cl_device_id device,
- cl_program program,
- ustring key,
- thread_scoped_lock &slot_locker)
-{
- assert(platform != NULL);
- assert(device != NULL);
- assert(program != NULL);
-
- OpenCLCache &self = global_instance();
-
- thread_scoped_lock cache_lock(self.cache_lock);
-
- CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
- assert(i != self.cache.end());
- Slot &slot = i->second;
-
- Slot::EntryMap::iterator i2 = slot.programs.find(key);
- assert(i2 != slot.programs.end());
- Slot::ProgramEntry &entry = i2->second;
-
- assert(entry.program == NULL);
-
- cache_lock.unlock();
-
- entry.program = program;
-
- /* unlock the slot */
- slot_locker.unlock();
-
- /* Increment reference count in OpenCL.
- * The caller is going to release the object when done with it.
- */
- cl_int ciErr = clRetainProgram(program);
- assert(ciErr == CL_SUCCESS);
- (void)ciErr;
-}
-
-string OpenCLCache::get_kernel_md5()
-{
- OpenCLCache &self = global_instance();
- thread_scoped_lock lock(self.kernel_md5_lock);
-
- if (self.kernel_md5.empty()) {
- self.kernel_md5 = path_files_md5_hash(path_get("source"));
- }
- return self.kernel_md5;
-}
-
-static string get_program_source(const string &kernel_file)
-{
- string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
-  /* We compile kernels consisting of many files. Unfortunately, OpenCL
-   * kernel caches do not seem to recognize changes in included files,
-   * so we force a recompile on changes by adding the md5 hash of all files.
-   */
- source = path_source_replace_includes(source, path_get("source"));
- source += "\n// " + util_md5_string(source) + "\n";
- return source;
-}
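-
-/* For example, get_program_source("kernel_bake.cl") yields roughly:
- *
- *   #include "kernel/kernels/opencl/kernel_bake.cl"   (includes inlined)
- *   // <md5 hash of the inlined source>
- *
- * Any change in an included file changes the md5 comment and therefore the
- * source text, defeating stale driver-side kernel caches. */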
-
-OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device,
- const string &program_name,
- const string &kernel_file,
- const string &kernel_build_options,
- bool use_stdout)
- : device(device),
- program_name(program_name),
- kernel_file(kernel_file),
- kernel_build_options(kernel_build_options),
- use_stdout(use_stdout)
-{
- loaded = false;
- needs_compiling = true;
- program = NULL;
-}
-
-OpenCLDevice::OpenCLProgram::~OpenCLProgram()
-{
- release();
-}
-
-void OpenCLDevice::OpenCLProgram::release()
-{
- for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end();
- ++kernel) {
- if (kernel->second) {
- clReleaseKernel(kernel->second);
- kernel->second = NULL;
- }
- }
- if (program) {
- clReleaseProgram(program);
- program = NULL;
- }
-}
-
-void OpenCLDevice::OpenCLProgram::add_log(const string &msg, bool debug)
-{
- if (!use_stdout) {
- log += msg + "\n";
- }
- else if (!debug) {
- printf("%s\n", msg.c_str());
- fflush(stdout);
- }
- else {
- VLOG(2) << msg;
- }
-}
-
-void OpenCLDevice::OpenCLProgram::add_error(const string &msg)
-{
- if (use_stdout) {
- fprintf(stderr, "%s\n", msg.c_str());
- }
-  /* Separate consecutive error messages with a newline. */
-  if (error_msg != "") {
-    error_msg += "\n";
-  }
-  error_msg += msg;
-}
-
-void OpenCLDevice::OpenCLProgram::add_kernel(ustring name)
-{
- if (!kernels.count(name)) {
- kernels[name] = NULL;
- }
-}
-
-bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src)
-{
- string build_options;
- build_options = device->kernel_build_options(debug_src) + kernel_build_options;
-
- VLOG(1) << "Build options passed to clBuildProgram: '" << build_options << "'.";
- cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
-
- /* show warnings even if build is successful */
- size_t ret_val_size = 0;
-
- clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
-
- if (ciErr != CL_SUCCESS) {
- add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) +
- ", errors in console.");
- }
-
- if (ret_val_size > 1) {
- vector<char> build_log(ret_val_size + 1);
- clGetProgramBuildInfo(
- program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
-
- build_log[ret_val_size] = '\0';
- /* Skip meaningless empty output from the NVidia compiler. */
- if (!(ret_val_size == 2 && build_log[0] == '\n')) {
- add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]),
- ciErr == CL_SUCCESS);
- }
- }
-
- return (ciErr == CL_SUCCESS);
-}
-
-bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src)
-{
- string source = get_program_source(kernel_file);
-
- if (debug_src) {
- path_write_text(*debug_src, source);
- }
-
- size_t source_len = source.size();
- const char *source_str = source.c_str();
- cl_int ciErr;
-
- program = clCreateProgramWithSource(device->cxContext, 1, &source_str, &source_len, &ciErr);
-
- if (ciErr != CL_SUCCESS) {
- add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
- return false;
- }
-
- double starttime = time_dt();
- add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
- add_log(string("Build flags: ") + kernel_build_options, true);
-
- if (!build_kernel(debug_src))
- return false;
-
- double elapsed = time_dt() - starttime;
- add_log(
- string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
- false);
-
- return true;
-}
-
-static void escape_python_string(string &str)
-{
-  /* Escape string to be passed as a Python raw string with ' quotes. */
-  string_replace(str, "'", "\\'");
-}
-
-static int opencl_compile_process_limit()
-{
-  /* Limit the number of concurrent compile processes, with a heuristic based
-   * on total physical RAM and an estimate of the memory needed when compiling
-   * with all Cycles features enabled.
-   *
-   * This is somewhat arbitrary, as we don't know the actual available RAM or
-   * how much memory the kernel compilation will need depending on the
-   * features, but it is better than not limiting at all. */
- static const int64_t GB = 1024LL * 1024LL * 1024LL;
- static const int64_t process_memory = 2 * GB;
- static const int64_t base_memory = 2 * GB;
- static const int64_t system_memory = system_physical_ram();
- static const int64_t process_limit = (system_memory - base_memory) / process_memory;
-
- return max((int)process_limit, 1);
-}
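-
-/* Worked example: a machine with 16 GB of physical RAM allows
- * (16 GB - 2 GB) / 2 GB = 7 concurrent compile processes, while anything
- * with 4 GB or less falls back to the minimum of 1. */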
-
-bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin)
-{
- /* Construct arguments. */
- vector<string> args;
- args.push_back("--background");
- args.push_back("--factory-startup");
- args.push_back("--python-expr");
-
- int device_platform_id = device->device_num;
- string device_name = device->device_name;
- string platform_name = device->platform_name;
- string build_options = device->kernel_build_options(NULL) + kernel_build_options;
- string kernel_file_escaped = kernel_file;
- string clbin_escaped = clbin;
-
- escape_python_string(device_name);
- escape_python_string(platform_name);
- escape_python_string(build_options);
- escape_python_string(kernel_file_escaped);
- escape_python_string(clbin_escaped);
-
- args.push_back(string_printf(
- "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')",
- device_platform_id,
- device_name.c_str(),
- platform_name.c_str(),
- build_options.c_str(),
- kernel_file_escaped.c_str(),
- clbin_escaped.c_str()));
-
- /* Limit number of concurrent processes compiling. */
- static thread_counting_semaphore semaphore(opencl_compile_process_limit());
- semaphore.acquire();
-
- /* Compile. */
- const double starttime = time_dt();
- add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
- add_log(string("Build flags: ") + kernel_build_options, true);
- const bool success = system_call_self(args);
- const double elapsed = time_dt() - starttime;
-
- semaphore.release();
-
- if (!success || !path_exists(clbin)) {
- return false;
- }
-
- add_log(
- string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
- false);
-
- return load_binary(clbin);
-}
-
-/* Compile an OpenCL kernel. This function is called from the _cycles Python
- * module to compile kernels. Parameters must match the function above. */
-bool device_opencl_compile_kernel(const vector<string> &parameters)
-{
- int device_platform_id = std::stoi(parameters[0]);
- const string &device_name = parameters[1];
- const string &platform_name = parameters[2];
- const string &build_options = parameters[3];
- const string &kernel_file = parameters[4];
- const string &binary_path = parameters[5];
-
- if (clewInit() != CLEW_SUCCESS) {
- return false;
- }
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- if (device_platform_id >= usable_devices.size()) {
- return false;
- }
-
- OpenCLPlatformDevice &platform_device = usable_devices[device_platform_id];
- if (platform_device.platform_name != platform_name ||
- platform_device.device_name != device_name) {
- return false;
- }
-
- cl_platform_id platform = platform_device.platform_id;
- cl_device_id device = platform_device.device_id;
- const cl_context_properties context_props[] = {
- CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0, 0};
-
- cl_int err;
- cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err);
- if (err != CL_SUCCESS) {
- return false;
- }
-
- string source = get_program_source(kernel_file);
- size_t source_len = source.size();
- const char *source_str = source.c_str();
- cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err);
- bool result = false;
-
- if (err == CL_SUCCESS) {
- err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
-
- if (err == CL_SUCCESS) {
- size_t size = 0;
- clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
- if (size > 0) {
- vector<uint8_t> binary(size);
- uint8_t *bytes = &binary[0];
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL);
- result = path_write_binary(binary_path, binary);
- }
- }
- clReleaseProgram(program);
- }
-
- clReleaseContext(context);
-
- return result;
-}
-
-bool OpenCLDevice::OpenCLProgram::load_binary(const string &clbin, const string *debug_src)
-{
- /* read binary into memory */
- vector<uint8_t> binary;
-
- if (!path_read_binary(clbin, binary)) {
- add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
- return false;
- }
-
- /* create program */
- cl_int status, ciErr;
- size_t size = binary.size();
- const uint8_t *bytes = &binary[0];
-
- program = clCreateProgramWithBinary(
- device->cxContext, 1, &device->cdDevice, &size, &bytes, &status, &ciErr);
-
- if (status != CL_SUCCESS || ciErr != CL_SUCCESS) {
- add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " +
- clewErrorString(status) + " " + clewErrorString(ciErr));
- return false;
- }
-
- if (!build_kernel(debug_src))
- return false;
-
- return true;
-}
-
-bool OpenCLDevice::OpenCLProgram::save_binary(const string &clbin)
-{
- size_t size = 0;
- clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
-
- if (!size)
- return false;
-
- vector<uint8_t> binary(size);
- uint8_t *bytes = &binary[0];
-
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL);
-
- return path_write_binary(clbin, binary);
-}
-
-bool OpenCLDevice::OpenCLProgram::load()
-{
- loaded = false;
- string device_md5 = device->device_md5_hash(kernel_build_options);
-
- /* Try to use cached kernel. */
- thread_scoped_lock cache_locker;
- ustring cache_key(program_name + device_md5);
- program = device->load_cached_kernel(cache_key, cache_locker);
- if (!program) {
- add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
-
- /* Need to create the source to compute its MD5. */
- string source = get_program_source(kernel_file);
-
- string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" +
- util_md5_string(source);
- basename = path_cache_get(path_join("kernels", basename));
- string clbin = basename + ".clbin";
-
- /* If the binary kernel already exists, try to use it. */
- if (path_exists(clbin) && load_binary(clbin)) {
- /* Kernel loaded from binary, nothing to do. */
- add_log(string("Loaded program from ") + clbin + ".", true);
-
- /* Cache the program. */
- device->store_cached_kernel(program, cache_key, cache_locker);
- }
- else {
- add_log(string("OpenCL program ") + program_name + " not found on disk.", true);
- cache_locker.unlock();
- }
- }
-
- if (program) {
- create_kernels();
- loaded = true;
- needs_compiling = false;
- }
-
- return loaded;
-}
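-
- /* To sketch the caching scheme used above: the in-memory cache is keyed on
-  * program_name + device_md5 (which covers the build options), while the
-  * on-disk binary is named
-  *   cycles_kernel_<program_name>_<device_md5>_<md5(source)>.clbin
-  * under the "kernels" cache directory, so changing either the build options
-  * or the kernel source selects a different file. */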
-
-void OpenCLDevice::OpenCLProgram::compile()
-{
- assert(device);
-
- string device_md5 = device->device_md5_hash(kernel_build_options);
-
- /* Try to use cached kernel. */
- thread_scoped_lock cache_locker;
- ustring cache_key(program_name + device_md5);
- program = device->load_cached_kernel(cache_key, cache_locker);
-
- if (!program) {
-
- add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
-
- /* Need to create the source to compute its MD5. */
- string source = get_program_source(kernel_file);
-
- string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" +
- util_md5_string(source);
- basename = path_cache_get(path_join("kernels", basename));
- string clbin = basename + ".clbin";
-
- /* Path to the preprocessed source, for debugging. */
- string clsrc, *debug_src = NULL;
-
- if (OpenCLInfo::use_debug()) {
- clsrc = basename + ".cl";
- debug_src = &clsrc;
- }
-
- if (DebugFlags().running_inside_blender && compile_separate(clbin)) {
- add_log(string("Built and loaded program from ") + clbin + ".", true);
- loaded = true;
- }
- else {
- if (DebugFlags().running_inside_blender) {
- add_log(string("Separate-process building of ") + clbin +
- " failed, will fall back to regular building.",
- true);
- }
-
- /* If the binary does not exist or loading it failed, compile the kernel. */
- if (!compile_kernel(debug_src)) {
- needs_compiling = false;
- return;
- }
-
- /* Save binary for reuse. */
- if (!save_binary(clbin)) {
- add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true);
- }
- }
-
- /* Cache the program. */
- device->store_cached_kernel(program, cache_key, cache_locker);
- }
-
- create_kernels();
- needs_compiling = false;
- loaded = true;
-}
-
-void OpenCLDevice::OpenCLProgram::create_kernels()
-{
- for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end();
- ++kernel) {
- assert(kernel->second == NULL);
- cl_int ciErr;
- string name = "kernel_ocl_" + kernel->first.string();
- kernel->second = clCreateKernel(program, name.c_str(), &ciErr);
- if (device->opencl_error(ciErr)) {
- add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " +
- clewErrorString(ciErr));
- return;
- }
- }
-}
-
-bool OpenCLDevice::OpenCLProgram::wait_for_availability()
-{
- add_log(string("Waiting for availability of ") + program_name + ".", true);
- while (needs_compiling) {
- time_sleep(0.1);
- }
- return loaded;
-}
-
-void OpenCLDevice::OpenCLProgram::report_error()
-{
- /* If loaded is true, there was no error. */
- if (loaded)
- return;
- /* If use_stdout is true, the error was already reported. */
- if (use_stdout)
- return;
-
- cerr << error_msg << endl;
- if (!compile_output.empty()) {
- cerr << "OpenCL kernel build output for " << program_name << ":" << endl;
- cerr << compile_output << endl;
- }
-}
-
-cl_kernel OpenCLDevice::OpenCLProgram::operator()()
-{
- assert(kernels.size() == 1);
- return kernels.begin()->second;
-}
-
-cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name)
-{
- assert(kernels.count(name));
- return kernels[name];
-}
-
-cl_device_type OpenCLInfo::device_type()
-{
- switch (DebugFlags().opencl.device_type) {
- case DebugFlags::OpenCL::DEVICE_NONE:
- return 0;
- case DebugFlags::OpenCL::DEVICE_ALL:
- return CL_DEVICE_TYPE_ALL;
- case DebugFlags::OpenCL::DEVICE_DEFAULT:
- return CL_DEVICE_TYPE_DEFAULT;
- case DebugFlags::OpenCL::DEVICE_CPU:
- return CL_DEVICE_TYPE_CPU;
- case DebugFlags::OpenCL::DEVICE_GPU:
- return CL_DEVICE_TYPE_GPU;
- case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
- return CL_DEVICE_TYPE_ACCELERATOR;
- default:
- return CL_DEVICE_TYPE_ALL;
- }
-}
-
-bool OpenCLInfo::use_debug()
-{
- return DebugFlags().opencl.debug;
-}
-
-bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_id device_id)
-{
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type)) {
- return false;
- }
- string device_name;
- if (!get_device_name(device_id, &device_name)) {
- return false;
- }
-
- int driver_major = 0;
- int driver_minor = 0;
- if (!get_driver_version(device_id, &driver_major, &driver_minor)) {
- return false;
- }
- VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor;
-
- if (getenv("CYCLES_OPENCL_TEST")) {
- return true;
- }
-
- /* Allow Intel GPUs on Intel OpenCL platform. */
- if (platform_name.find("Intel") != string::npos) {
- if (device_type != CL_DEVICE_TYPE_GPU) {
- /* OpenCL on Intel CPU is not an officially supported configuration.
- * Use hybrid CPU+GPU rendering to utilize both GPU and CPU. */
- return false;
- }
-
-# ifdef __APPLE__
- /* Apple uses its own framework, which can also put Iris GPUs onto the AMD
-  * code path. This is not a supported configuration. */
- return false;
-# else
- if (device_name.find("Iris") != string::npos || device_name.find("Xe") != string::npos) {
- return true;
- }
-# endif
- }
-
- if (platform_name == "AMD Accelerated Parallel Processing" &&
- device_type == CL_DEVICE_TYPE_GPU) {
- if (driver_major < 2236) {
- VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported.";
- return false;
- }
- const char *blacklist[] = {/* GCN 1 */
- "Tahiti",
- "Pitcairn",
- "Capeverde",
- "Oland",
- "Hainan",
- NULL};
- for (int i = 0; blacklist[i] != NULL; i++) {
- if (device_name == blacklist[i]) {
- VLOG(1) << "AMD device " << device_name << " not supported";
- return false;
- }
- }
- return true;
- }
- if (platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
- return false;
- }
- return false;
-}
-
-bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error)
-{
- const int req_major = 1, req_minor = 1;
- int major, minor;
- char version[256];
- clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
- if (error != NULL) {
- *error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
- }
- return false;
- }
- if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
- if (error != NULL) {
- *error = string_printf(
- "OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
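-
- /* Example: a CL_PLATFORM_VERSION string such as "OpenCL 1.2 CUDA 11.4"
-  * (everything after the mandated "OpenCL <major>.<minor>" prefix is
-  * implementation-defined) parses to major = 1, minor = 2 and satisfies the
-  * 1.1 requirement above. */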
-
-bool OpenCLInfo::get_device_version(cl_device_id device, int *r_major, int *r_minor, string *error)
-{
- char version[256];
- clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
- if (sscanf(version, "OpenCL C %d.%d", r_major, r_minor) < 2) {
- if (error != NULL) {
- *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
-
-bool OpenCLInfo::device_version_check(cl_device_id device, string *error)
-{
- const int req_major = 1, req_minor = 1;
- int major, minor;
- if (!get_device_version(device, &major, &minor, error)) {
- return false;
- }
-
- if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
- if (error != NULL) {
- *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
- }
- return false;
- }
- if (error != NULL) {
- *error = "";
- }
- return true;
-}
-
-string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id device_id)
-{
- if (platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") {
- /* Use cl_amd_device_topology extension. */
- cl_char topology[24];
- if (clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS &&
- topology[0] == 1) {
- return string_printf("%02x:%02x.%01x",
- (unsigned int)topology[21],
- (unsigned int)topology[22],
- (unsigned int)topology[23]);
- }
- }
- else if (platform_name == "NVIDIA CUDA") {
- /* Use two undocumented options of the cl_nv_device_attribute_query extension. */
- cl_int bus_id, slot_id;
- if (clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS &&
- clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) {
- return string_printf("%02x:%02x.%01x",
- (unsigned int)(bus_id),
- (unsigned int)(slot_id >> 3),
- (unsigned int)(slot_id & 0x7));
- }
- }
- /* No general way to get a hardware ID from OpenCL => give up. */
- return "";
-}
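-
- /* Both branches above produce a PCI "bus:device.function" style identifier,
-  * e.g. "01:00.0", which allows matching the same physical GPU across APIs.
-  * The raw 0x4037, 0x4008 and 0x4009 constants are the extension enums
-  * CL_DEVICE_TOPOLOGY_AMD, CL_DEVICE_PCI_BUS_ID_NV and
-  * CL_DEVICE_PCI_SLOT_ID_NV. */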
-
-void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
-{
- const cl_device_type device_type = OpenCLInfo::device_type();
- static bool first_time = true;
-# define FIRST_VLOG(severity) \
- if (first_time) \
- VLOG(severity)
-
- usable_devices->clear();
-
- if (device_type == 0) {
- FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
- first_time = false;
- return;
- }
-
- cl_int error;
- vector<cl_device_id> device_ids;
- vector<cl_platform_id> platform_ids;
-
- /* Get platforms. */
- if (!get_platforms(&platform_ids, &error)) {
- FIRST_VLOG(2) << "Error fetching platforms:" << string(clewErrorString(error));
- first_time = false;
- return;
- }
- if (platform_ids.size() == 0) {
- FIRST_VLOG(2) << "No OpenCL platforms were found.";
- first_time = false;
- return;
- }
- /* Devices are numbered consecutively across platforms. */
- for (int platform = 0; platform < platform_ids.size(); platform++) {
- cl_platform_id platform_id = platform_ids[platform];
- string platform_name;
- if (!get_platform_name(platform_id, &platform_name)) {
- FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
- continue;
- }
- FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << ".";
- if (!platform_version_check(platform_id)) {
- FIRST_VLOG(2) << "Ignoring platform " << platform_name
- << " due to too old compiler version.";
- continue;
- }
- if (!get_platform_devices(platform_id, device_type, &device_ids, &error)) {
- FIRST_VLOG(2) << "Ignoring platform " << platform_name
- << ", failed to fetch of devices: " << string(clewErrorString(error));
- continue;
- }
- if (device_ids.size() == 0) {
- FIRST_VLOG(2) << "Ignoring platform " << platform_name << ", it has no devices.";
- continue;
- }
- for (int num = 0; num < device_ids.size(); num++) {
- const cl_device_id device_id = device_ids[num];
- string device_name;
- if (!get_device_name(device_id, &device_name, &error)) {
- FIRST_VLOG(2) << "Failed to fetch device name: " << string(clewErrorString(error))
- << ", ignoring.";
- continue;
- }
- if (!device_version_check(device_id)) {
- FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version.";
- continue;
- }
- if (device_supported(platform_name, device_id)) {
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type, &error)) {
- FIRST_VLOG(2) << "Ignoring device " << device_name
- << ", failed to fetch device type:" << string(clewErrorString(error));
- continue;
- }
- string readable_device_name = get_readable_device_name(device_id);
- if (readable_device_name != device_name) {
- FIRST_VLOG(2) << "Using more readable device name: " << readable_device_name;
- }
- FIRST_VLOG(2) << "Adding new device " << readable_device_name << ".";
- string hardware_id = get_hardware_id(platform_name, device_id);
- string device_extensions = get_device_extensions(device_id);
- usable_devices->push_back(OpenCLPlatformDevice(platform_id,
- platform_name,
- device_id,
- device_type,
- readable_device_name,
- hardware_id,
- device_extensions));
- }
- else {
- FIRST_VLOG(2) << "Ignoring device " << device_name << ", not officially supported yet.";
- }
- }
- }
- first_time = false;
-}
-
-bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error)
-{
- /* Reset from possible previous state. */
- platform_ids->resize(0);
- cl_uint num_platforms;
- if (!get_num_platforms(&num_platforms, error)) {
- return false;
- }
- /* Get actual platforms (skip the query when there are none, so we never
-  * dereference an empty vector). */
- cl_int err;
- platform_ids->resize(num_platforms);
- if (num_platforms > 0 &&
- (err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-vector<cl_platform_id> OpenCLInfo::get_platforms()
-{
- vector<cl_platform_id> platform_ids;
- get_platforms(&platform_ids);
- return platform_ids;
-}
-
-bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error)
-{
- cl_int err;
- if ((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *num_platforms = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_uint OpenCLInfo::get_num_platforms()
-{
- cl_uint num_platforms;
- if (!get_num_platforms(&num_platforms)) {
- return 0;
- }
- return num_platforms;
-}
-
-bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string *platform_name)
-{
- char buffer[256];
- if (clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), &buffer, NULL) !=
- CL_SUCCESS) {
- *platform_name = "";
- return false;
- }
- *platform_name = buffer;
- return true;
-}
-
-string OpenCLInfo::get_platform_name(cl_platform_id platform_id)
-{
- string platform_name;
- if (!get_platform_name(platform_id, &platform_name)) {
- return "";
- }
- return platform_name;
-}
-
-bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- cl_uint *num_devices,
- cl_int *error)
-{
- cl_int err;
- if ((err = clGetDeviceIDs(platform_id, device_type, 0, NULL, num_devices)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *num_devices = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type)
-{
- cl_uint num_devices;
- if (!get_num_platform_devices(platform_id, device_type, &num_devices)) {
- return 0;
- }
- return num_devices;
-}
-
-bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- vector<cl_device_id> *device_ids,
- cl_int *error)
-{
- /* Reset from possible previous state. */
- device_ids->resize(0);
- /* Get number of devices to pre-allocate memory. */
- cl_uint num_devices;
- if (!get_num_platform_devices(platform_id, device_type, &num_devices, error)) {
- return false;
- }
- /* Get actual device list. */
- device_ids->resize(num_devices);
- cl_int err;
- if ((err = clGetDeviceIDs(platform_id, device_type, num_devices, &device_ids->at(0), NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type)
-{
- vector<cl_device_id> devices;
- get_platform_devices(platform_id, device_type, &devices);
- return devices;
-}
-
-bool OpenCLInfo::get_device_name(cl_device_id device_id, string *device_name, cl_int *error)
-{
- char buffer[1024];
- cl_int err;
- if ((err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), &buffer, NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_name = "";
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- *device_name = buffer;
- return true;
-}
-
-string OpenCLInfo::get_device_name(cl_device_id device_id)
-{
- string device_name;
- if (!get_device_name(device_id, &device_name)) {
- return "";
- }
- return device_name;
-}
-
-bool OpenCLInfo::get_device_extensions(cl_device_id device_id,
- string *device_extensions,
- cl_int *error)
-{
- size_t extension_length = 0;
- cl_int err;
- /* Determine the size of the extension string. */
- if ((err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, 0, 0, &extension_length)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_extensions = "";
- return false;
- }
- vector<char> buffer(extension_length);
- if ((err = clGetDeviceInfo(
- device_id, CL_DEVICE_EXTENSIONS, extension_length, buffer.data(), NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_extensions = "";
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- *device_extensions = string(buffer.data());
- return true;
-}
-
-string OpenCLInfo::get_device_extensions(cl_device_id device_id)
-{
- string device_extensions;
- if (!get_device_extensions(device_id, &device_extensions)) {
- return "";
- }
- return device_extensions;
-}
-
-bool OpenCLInfo::get_device_type(cl_device_id device_id,
- cl_device_type *device_type,
- cl_int *error)
-{
- cl_int err;
- if ((err = clGetDeviceInfo(
- device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), device_type, NULL)) != CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- *device_type = 0;
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
-}
-
-cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id)
-{
- cl_device_type device_type;
- if (!get_device_type(device_id, &device_type)) {
- return 0;
- }
- return device_type;
-}
-
-string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
-{
- string name = "";
- char board_name[1024];
- size_t length = 0;
- if (clGetDeviceInfo(
- device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, &length) ==
- CL_SUCCESS) {
- if (length != 0 && board_name[0] != '\0') {
- name = board_name;
- }
- }
-
- /* Fallback to standard device name API. */
- if (name.empty()) {
- name = get_device_name(device_id);
- }
-
- /* Special exception for AMD Vega: we need to be able to tell
-  * Vega 56 and Vega 64 apart. */
- if (name == "Radeon RX Vega") {
- cl_int max_compute_units = 0;
- if (clGetDeviceInfo(device_id,
- CL_DEVICE_MAX_COMPUTE_UNITS,
- sizeof(max_compute_units),
- &max_compute_units,
- NULL) == CL_SUCCESS) {
- name += " " + to_string(max_compute_units);
- }
- }
-
- /* Distinguish from our native CPU device. */
- if (get_device_type(device_id) & CL_DEVICE_TYPE_CPU) {
- name += " (OpenCL)";
- }
-
- return name;
-}
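-
- /* Illustrative examples (device names made up): an AMD card whose board
-  * name reports as "Radeon RX Vega" with 56 compute units becomes
-  * "Radeon RX Vega 56", and a CPU exposed through OpenCL becomes
-  * "Some CPU (OpenCL)" to distinguish it from the native CPU device. */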
-
-bool OpenCLInfo::get_driver_version(cl_device_id device_id, int *major, int *minor, cl_int *error)
-{
- char buffer[1024];
- cl_int err;
- if ((err = clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), &buffer, NULL)) !=
- CL_SUCCESS) {
- if (error != NULL) {
- *error = err;
- }
- return false;
- }
- if (error != NULL) {
- *error = CL_SUCCESS;
- }
- if (sscanf(buffer, "%d.%d", major, minor) < 2) {
- VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer);
- return false;
- }
- return true;
-}
-
-int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id)
-{
- int base_align_bits;
- if (clGetDeviceInfo(
- device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &base_align_bits, NULL) ==
- CL_SUCCESS) {
- return base_align_bits / 8;
- }
- return 1;
-}
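-
- /* CL_DEVICE_MEM_BASE_ADDR_ALIGN is reported in bits, hence the division by
-  * 8; a typical value of 1024 bits corresponds to a 128-byte alignment
-  * requirement for sub-buffers. */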
-
-CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/device/optix/device.cpp b/intern/cycles/device/optix/device.cpp
new file mode 100644
index 00000000000..13f23bd229a
--- /dev/null
+++ b/intern/cycles/device/optix/device.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/optix/device.h"
+
+#include "device/cuda/device.h"
+#include "device/optix/device_impl.h"
+#include "util/util_logging.h"
+
+#ifdef WITH_OPTIX
+# include <optix_function_table_definition.h>
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+bool device_optix_init()
+{
+#ifdef WITH_OPTIX
+ if (g_optixFunctionTable.optixDeviceContextCreate != NULL) {
+ /* Already initialized function table. */
+ return true;
+ }
+
+ /* Need to initialize CUDA as well. */
+ if (!device_cuda_init()) {
+ return false;
+ }
+
+ const OptixResult result = optixInit();
+
+ if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
+ VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
+ "Please update to the latest driver first!";
+ return false;
+ }
+ else if (result != OPTIX_SUCCESS) {
+ VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
+ return false;
+ }
+
+ /* Loaded OptiX successfully! */
+ return true;
+#else
+ return false;
+#endif
+}
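+
+/* NOTE: optixInit() loads the OptiX entry points from the NVIDIA display driver
+ * into g_optixFunctionTable, which is defined exactly once by including
+ * optix_function_table_definition.h above. That is why a non-NULL
+ * optixDeviceContextCreate entry is used to detect prior initialization. */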
+
+void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices)
+{
+#ifdef WITH_OPTIX
+ devices.reserve(cuda_devices.size());
+
+ /* Simply add all supported CUDA devices as OptiX devices again. */
+ for (DeviceInfo info : cuda_devices) {
+ assert(info.type == DEVICE_CUDA);
+
+ int major;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
+ if (major < 5) {
+ /* Only Maxwell and up are supported by OptiX. */
+ continue;
+ }
+
+ info.type = DEVICE_OPTIX;
+ info.id += "_OptiX";
+ info.denoisers |= DENOISER_OPTIX;
+
+ devices.push_back(info);
+ }
+#else
+ (void)cuda_devices;
+ (void)devices;
+#endif
+}
+
+Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+#ifdef WITH_OPTIX
+ return new OptiXDevice(info, stats, profiler);
+#else
+ (void)info;
+ (void)stats;
+ (void)profiler;
+
+ LOG(FATAL) << "Request to create OptiX device without compiled-in support. Should never happen.";
+
+ return nullptr;
+#endif
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/optix/device.h b/intern/cycles/device/optix/device.h
new file mode 100644
index 00000000000..29fa729c2e4
--- /dev/null
+++ b/intern/cycles/device/optix/device.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_optix_init();
+
+Device *device_optix_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo> &devices);
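+
+/* Expected call order (an illustrative sketch, not actual caller code):
+ *
+ *   vector<DeviceInfo> devices;
+ *   if (device_optix_init()) {
+ *     device_optix_info(cuda_devices, devices);  // Appends OptiX entries.
+ *   }
+ *   Device *device = device_optix_create(devices[i], stats, profiler);
+ */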
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
new file mode 100644
index 00000000000..b54d423a183
--- /dev/null
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -0,0 +1,1573 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "device/optix/device_impl.h"
+
+# include "bvh/bvh.h"
+# include "bvh/bvh_optix.h"
+# include "integrator/pass_accessor_gpu.h"
+# include "render/buffers.h"
+# include "render/hair.h"
+# include "render/mesh.h"
+# include "render/object.h"
+# include "render/pass.h"
+# include "render/scene.h"
+
+# include "util/util_debug.h"
+# include "util/util_logging.h"
+# include "util/util_md5.h"
+# include "util/util_path.h"
+# include "util/util_progress.h"
+# include "util/util_time.h"
+
+# undef __KERNEL_CPU__
+# define __KERNEL_OPTIX__
+# include "kernel/device/optix/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+OptiXDevice::Denoiser::Denoiser(OptiXDevice *device)
+ : device(device), queue(device), state(device, "__denoiser_state")
+{
+}
+
+OptiXDevice::Denoiser::~Denoiser()
+{
+ const CUDAContextScope scope(device);
+ if (optix_denoiser != nullptr) {
+ optixDenoiserDestroy(optix_denoiser);
+ }
+}
+
+OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+ : CUDADevice(info, stats, profiler),
+ sbt_data(this, "__sbt", MEM_READ_ONLY),
+ launch_params(this, "__params"),
+ denoiser_(this)
+{
+ /* Make the CUDA context current. */
+ if (!cuContext) {
+ /* Do not initialize if CUDA context creation failed already. */
+ return;
+ }
+ const CUDAContextScope scope(this);
+
+ /* Create OptiX context for this device. */
+ OptixDeviceContextOptions options = {};
+# ifdef WITH_CYCLES_LOGGING
+ options.logCallbackLevel = 4; /* Fatal = 1, Error = 2, Warning = 3, Print = 4. */
+ options.logCallbackFunction = [](unsigned int level, const char *, const char *message, void *) {
+ switch (level) {
+ case 1:
+ LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
+ break;
+ case 2:
+ LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
+ break;
+ case 3:
+ LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
+ break;
+ case 4:
+ LOG_IF(INFO, VLOG_IS_ON(1)) << message;
+ break;
+ }
+ };
+# endif
+ if (DebugFlags().optix.use_debug) {
+ options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
+ }
+ optix_assert(optixDeviceContextCreate(cuContext, &options, &context));
+# ifdef WITH_CYCLES_LOGGING
+ optix_assert(optixDeviceContextSetLogCallback(
+ context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
+# endif
+
+ /* Work around a compiler bug that otherwise assigns the wrong element size. */
+ launch_params.data_elements = sizeof(KernelParamsOptiX);
+
+ /* Allocate launch parameter buffer memory on device. */
+ launch_params.alloc_to_device(1);
+}
+
+OptiXDevice::~OptiXDevice()
+{
+ /* Make CUDA context current. */
+ const CUDAContextScope scope(this);
+
+ free_bvh_memory_delayed();
+
+ sbt_data.free();
+ texture_info.free();
+ launch_params.free();
+
+ /* Unload modules. */
+ if (optix_module != NULL) {
+ optixModuleDestroy(optix_module);
+ }
+ for (unsigned int i = 0; i < 2; ++i) {
+ if (builtin_modules[i] != NULL) {
+ optixModuleDestroy(builtin_modules[i]);
+ }
+ }
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ if (pipelines[i] != NULL) {
+ optixPipelineDestroy(pipelines[i]);
+ }
+ }
+
+ optixDeviceContextDestroy(context);
+}
+
+unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create()
+{
+ return make_unique<OptiXDeviceQueue>(this);
+}
+
+BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const
+{
+ /* OptiX has its own internal acceleration structure format. */
+ return BVH_LAYOUT_OPTIX;
+}
+
+string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features)
+{
+ string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features);
+
+ /* Add OptiX SDK include directory to include paths. */
+ const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
+ if (optix_sdk_path) {
+ common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
+ }
+
+ /* Specialization for shader raytracing. */
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ common_cflags += " --keep-device-functions";
+ }
+
+ return common_cflags;
+}
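+
+/* With a hypothetical SDK location, such as OPTIX_ROOT_DIR=/opt/optix, and
+ * shader raytracing enabled, this appends roughly
+ *   -I"/opt/optix/include" --keep-device-functions
+ * to the flags inherited from CUDADevice. */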
+
+bool OptiXDevice::load_kernels(const uint kernel_features)
+{
+ if (have_error()) {
+ /* Abort early if context creation failed already. */
+ return false;
+ }
+
+ /* Load CUDA modules because we need some of the utility kernels. */
+ if (!CUDADevice::load_kernels(kernel_features)) {
+ return false;
+ }
+
+ /* Skip creating OptiX module if only doing denoising. */
+ if (!(kernel_features & (KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_BAKING))) {
+ return true;
+ }
+
+ const CUDAContextScope scope(this);
+
+ /* Unload existing OptiX module and pipelines first. */
+ if (optix_module != NULL) {
+ optixModuleDestroy(optix_module);
+ optix_module = NULL;
+ }
+ for (unsigned int i = 0; i < 2; ++i) {
+ if (builtin_modules[i] != NULL) {
+ optixModuleDestroy(builtin_modules[i]);
+ builtin_modules[i] = NULL;
+ }
+ }
+ for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
+ if (pipelines[i] != NULL) {
+ optixPipelineDestroy(pipelines[i]);
+ pipelines[i] = NULL;
+ }
+ }
+
+ OptixModuleCompileOptions module_options = {};
+ module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */
+
+ if (DebugFlags().optix.use_debug) {
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+ }
+ else {
+ module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
+ module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+ }
+
+ module_options.boundValues = nullptr;
+ module_options.numBoundValues = 0;
+
+ OptixPipelineCompileOptions pipeline_options = {};
+ /* Default to no motion blur and two-level graph, since it is the fastest option. */
+ pipeline_options.usesMotionBlur = false;
+ pipeline_options.traversableGraphFlags =
+ OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
+ pipeline_options.numPayloadValues = 6;
+ pipeline_options.numAttributeValues = 2; /* u, v */
+ pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
+ pipeline_options.pipelineLaunchParamsVariableName = "__params"; /* See globals.h */
+
+ pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
+ if (kernel_features & KERNEL_FEATURE_HAIR) {
+ if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
+ }
+ else {
+ pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
+ }
+ }
+
+ /* Keep track of whether motion blur is enabled, so as to enable/disable motion in BVH builds.
+ * This is necessary since objects may be reported to have motion if the Vector pass is
+ * active, but may still need to be rendered without motion blur if that option is disabled. */
+ motion_blur = (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) != 0;
+
+ if (motion_blur) {
+ pipeline_options.usesMotionBlur = true;
+ /* Motion blur can insert motion transforms into the traversal graph.
+ * It is no longer a two-level graph then, so need to set flags to allow any configuration. */
+ pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
+ }
+
+ { /* Load and compile PTX module with OptiX kernels. */
+ string ptx_data, ptx_filename = path_get((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
+ "lib/kernel_optix_shader_raytrace.ptx" :
+ "lib/kernel_optix.ptx");
+ if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
+ if (!getenv("OPTIX_ROOT_DIR")) {
+ set_error(
+ "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
+ "the Optix SDK to be able to compile Optix kernels on demand).");
+ return false;
+ }
+ ptx_filename = compile_kernel(
+ kernel_features,
+ (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ? "kernel_shader_raytrace" : "kernel",
+ "optix",
+ true);
+ }
+ if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
+ set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
+ return false;
+ }
+
+ const OptixResult result = optixModuleCreateFromPTX(context,
+ &module_options,
+ &pipeline_options,
+ ptx_data.data(),
+ ptx_data.size(),
+ nullptr,
+ 0,
+ &optix_module);
+ if (result != OPTIX_SUCCESS) {
+ set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)",
+ ptx_filename.c_str(),
+ optixGetErrorName(result)));
+ return false;
+ }
+ }
+
+ /* Create program groups. */
+ OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
+ OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_closest";
+ group_descs[PG_RGEN_INTERSECT_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_shadow";
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_subsurface";
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.module = optix_module;
+ group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_intersect_volume_stack";
+ group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
+ group_descs[PG_MISS].miss.module = optix_module;
+ group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
+ group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
+ group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
+ group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
+
+ if (kernel_features & KERNEL_FEATURE_HAIR) {
+ if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
+ /* Built-in thick curve intersection. */
+ OptixBuiltinISOptions builtin_options = {};
+ builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ builtin_options.usesMotionBlur = false;
+
+ optix_assert(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
+
+ group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
+ group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
+
+ if (motion_blur) {
+ builtin_options.usesMotionBlur = true;
+
+ optix_assert(optixBuiltinISModuleGet(
+ context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
+
+ group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
+ group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
+ group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
+ }
+ }
+ else {
+ /* Custom ribbon intersection. */
+ group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
+ group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
+ }
+ }
+
+ if (kernel_features & (KERNEL_FEATURE_SUBSURFACE | KERNEL_FEATURE_NODE_RAYTRACE)) {
+ /* Add hit group for local intersections. */
+ group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+ group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
+ group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
+ }
+
+ /* Shader raytracing replaces some functions with direct callables. */
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
+ group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
+ "__raygen__kernel_optix_integrator_shade_surface_raytrace";
+ group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
+ group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
+ "__direct_callable__svm_node_bevel";
+ group_descs[PG_CALL_AO_PASS].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
+ group_descs[PG_CALL_AO_PASS].callables.moduleDC = optix_module;
+ group_descs[PG_CALL_AO_PASS].callables.entryFunctionNameDC = "__direct_callable__ao_pass";
+ }
+
+ optix_assert(optixProgramGroupCreate(
+ context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
+
+ /* Get program stack sizes. */
+ OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
+ /* Set up SBT, which in this case is used only to select between different programs. */
+ sbt_data.alloc(NUM_PROGRAM_GROUPS);
+ memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+ optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
+ }
+ sbt_data.copy_to_device(); /* Upload SBT to device. */
+
+ /* Calculate maximum trace continuation stack size. */
+ unsigned int trace_css = stack_size[PG_HITD].cssCH;
+ /* This is based on the maximum of closest-hit and any-hit/intersection programs. */
+ trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
+ trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
+ trace_css = std::max(trace_css,
+ stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
+
+ OptixPipelineLinkOptions link_options = {};
+ link_options.maxTraceDepth = 1;
+
+ if (DebugFlags().optix.use_debug) {
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
+ }
+ else {
+ link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
+ }
+
+ if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+ /* Create shader raytracing pipeline. */
+ vector<OptixProgramGroup> pipeline_groups;
+ pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+ pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
+ pipeline_groups.push_back(groups[PG_MISS]);
+ pipeline_groups.push_back(groups[PG_HITD]);
+ pipeline_groups.push_back(groups[PG_HITS]);
+ pipeline_groups.push_back(groups[PG_HITL]);
+ if (motion_blur) {
+ pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+ pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+ }
+ pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
+ pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
+
+ optix_assert(optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups.data(),
+ pipeline_groups.size(),
+ nullptr,
+ 0,
+ &pipelines[PIP_SHADE_RAYTRACE]));
+
+ /* Combine ray generation and trace continuation stack size. */
+ const unsigned int css = stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG +
+ link_options.maxTraceDepth * trace_css;
+ const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
+ stack_size[PG_CALL_SVM_BEVEL].dssDC);
+
+ /* Set stack size depending on pipeline options. */
+ optix_assert(optixPipelineSetStackSize(
+ pipelines[PIP_SHADE_RAYTRACE], 0, dss, css, motion_blur ? 3 : 2));
+ }
+
+ { /* Create intersection-only pipeline. */
+ vector<OptixProgramGroup> pipeline_groups;
+ pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
+ pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
+ pipeline_groups.push_back(groups[PG_MISS]);
+ pipeline_groups.push_back(groups[PG_HITD]);
+ pipeline_groups.push_back(groups[PG_HITS]);
+ pipeline_groups.push_back(groups[PG_HITL]);
+ if (motion_blur) {
+ pipeline_groups.push_back(groups[PG_HITD_MOTION]);
+ pipeline_groups.push_back(groups[PG_HITS_MOTION]);
+ }
+
+ optix_assert(optixPipelineCreate(context,
+ &pipeline_options,
+ &link_options,
+ pipeline_groups.data(),
+ pipeline_groups.size(),
+ nullptr,
+ 0,
+ &pipelines[PIP_INTERSECT]));
+
+ /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
+ const unsigned int css =
+ std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
+ std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
+ stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
+ link_options.maxTraceDepth * trace_css;
+
+ optix_assert(
+ optixPipelineSetStackSize(pipelines[PIP_INTERSECT], 0, 0, css, motion_blur ? 3 : 2));
+ }
+
+ /* Clean up program group objects. */
+ for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
+ optixProgramGroupDestroy(groups[i]);
+ }
+
+ return true;
+}
+
+/* --------------------------------------------------------------------
+ * Buffer denoising.
+ */
+
+class OptiXDevice::DenoiseContext {
+ public:
+ explicit DenoiseContext(OptiXDevice *device, const DeviceDenoiseTask &task)
+ : denoise_params(task.params),
+ render_buffers(task.render_buffers),
+ buffer_params(task.buffer_params),
+ guiding_buffer(device, "denoiser guiding passes buffer"),
+ num_samples(task.num_samples)
+ {
+ num_input_passes = 1;
+ if (denoise_params.use_pass_albedo) {
+ num_input_passes += 1;
+ use_pass_albedo = true;
+ pass_denoising_albedo = buffer_params.get_pass_offset(PASS_DENOISING_ALBEDO);
+ if (denoise_params.use_pass_normal) {
+ num_input_passes += 1;
+ use_pass_normal = true;
+ pass_denoising_normal = buffer_params.get_pass_offset(PASS_DENOISING_NORMAL);
+ }
+ }
+
+ const int num_guiding_passes = num_input_passes - 1;
+
+ if (num_guiding_passes) {
+ if (task.allow_inplace_modification) {
+ guiding_params.device_pointer = render_buffers->buffer.device_pointer;
+
+ guiding_params.pass_albedo = pass_denoising_albedo;
+ guiding_params.pass_normal = pass_denoising_normal;
+
+ guiding_params.stride = buffer_params.stride;
+ guiding_params.pass_stride = buffer_params.pass_stride;
+ }
+ else {
+ guiding_params.pass_stride = 0;
+ if (use_pass_albedo) {
+ guiding_params.pass_albedo = guiding_params.pass_stride;
+ guiding_params.pass_stride += 3;
+ }
+ if (use_pass_normal) {
+ guiding_params.pass_normal = guiding_params.pass_stride;
+ guiding_params.pass_stride += 3;
+ }
+
+ guiding_params.stride = buffer_params.width;
+
+ guiding_buffer.alloc_to_device(buffer_params.width * buffer_params.height *
+ guiding_params.pass_stride);
+ guiding_params.device_pointer = guiding_buffer.device_pointer;
+ }
+ }
+
+ pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+ }
+
+ const DenoiseParams &denoise_params;
+
+ RenderBuffers *render_buffers = nullptr;
+ const BufferParams &buffer_params;
+
+ /* Device-side storage of the guiding passes. */
+ device_only_memory<float> guiding_buffer;
+
+ struct {
+ device_ptr device_pointer = 0;
+
+ /* NOTE: Only initialized when the corresponding guiding pass is enabled. */
+ int pass_albedo = PASS_UNUSED;
+ int pass_normal = PASS_UNUSED;
+
+ int stride = -1;
+ int pass_stride = -1;
+ } guiding_params;
+
+ /* Number of input passes, including the color pass and extra auxiliary passes. */
+ int num_input_passes = 0;
+ bool use_pass_albedo = false;
+ bool use_pass_normal = false;
+
+ int num_samples = 0;
+
+ int pass_sample_count = PASS_UNUSED;
+
+ /* NOTE: Only initialized when the corresponding guiding pass is enabled. */
+ int pass_denoising_albedo = PASS_UNUSED;
+ int pass_denoising_normal = PASS_UNUSED;
+
+ /* For passes which don't need the albedo channel for denoising, we replace the actual albedo
+ * with a constant (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been
+ * replaced with the fake values, so passes which do need albedo can no longer be denoised. */
+ bool albedo_replaced_with_fake = false;
+};
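+
+/* A concrete example of the guiding layout above: with both the albedo and
+ * normal passes enabled and in-place modification not allowed, the guiding
+ * buffer is allocated with pass_albedo = 0, pass_normal = 3 and
+ * pass_stride = 6, i.e. width * height * 6 floats laid out as
+ * [albedo.xyz | normal.xyz] per pixel. */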
+
+class OptiXDevice::DenoisePass {
+ public:
+ DenoisePass(const PassType type, const BufferParams &buffer_params) : type(type)
+ {
+ noisy_offset = buffer_params.get_pass_offset(type, PassMode::NOISY);
+ denoised_offset = buffer_params.get_pass_offset(type, PassMode::DENOISED);
+
+ const PassInfo pass_info = Pass::get_info(type);
+ num_components = pass_info.num_components;
+ use_compositing = pass_info.use_compositing;
+ use_denoising_albedo = pass_info.use_denoising_albedo;
+ }
+
+ PassType type;
+
+ int noisy_offset;
+ int denoised_offset;
+
+ int num_components;
+ bool use_compositing;
+ bool use_denoising_albedo;
+};
+
+bool OptiXDevice::denoise_buffer(const DeviceDenoiseTask &task)
+{
+ const CUDAContextScope scope(this);
+
+ DenoiseContext context(this, task);
+
+ if (!denoise_ensure(context)) {
+ return false;
+ }
+
+ if (!denoise_filter_guiding_preprocess(context)) {
+ LOG(ERROR) << "Error preprocessing guiding passes.";
+ return false;
+ }
+
+ /* Passes which will use real albedo when it is available. */
+ denoise_pass(context, PASS_COMBINED);
+ denoise_pass(context, PASS_SHADOW_CATCHER_MATTE);
+
+ /* Passes which do not need albedo; if the real albedo is present, it must be replaced with
+ * the fake one before denoising them. */
+ denoise_pass(context, PASS_SHADOW_CATCHER);
+
+ return true;
+}
+
+DeviceQueue *OptiXDevice::get_denoise_queue()
+{
+ return &denoiser_.queue;
+}
+
+bool OptiXDevice::denoise_filter_guiding_preprocess(DenoiseContext &context)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
+ const_cast<int *>(&context.guiding_params.pass_stride),
+ const_cast<int *>(&context.guiding_params.pass_albedo),
+ const_cast<int *>(&context.guiding_params.pass_normal),
+ &context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&context.pass_sample_count),
+ const_cast<int *>(&context.pass_denoising_albedo),
+ const_cast<int *>(&context.pass_denoising_normal),
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&context.num_samples)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_filter_guiding_set_fake_albedo(DenoiseContext &context)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {const_cast<device_ptr *>(&context.guiding_params.device_pointer),
+ const_cast<int *>(&context.guiding_params.pass_stride),
+ const_cast<int *>(&context.guiding_params.pass_albedo),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args);
+}
+
+void OptiXDevice::denoise_pass(DenoiseContext &context, PassType pass_type)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const DenoisePass pass(pass_type, buffer_params);
+
+ if (pass.noisy_offset == PASS_UNUSED) {
+ return;
+ }
+ if (pass.denoised_offset == PASS_UNUSED) {
+ LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type);
+ return;
+ }
+
+ if (pass.use_denoising_albedo) {
+ if (context.albedo_replaced_with_fake) {
+ LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set.";
+ return;
+ }
+ }
+ else if (!context.albedo_replaced_with_fake) {
+ context.albedo_replaced_with_fake = true;
+ if (!denoise_filter_guiding_set_fake_albedo(context)) {
+ LOG(ERROR) << "Error replacing real albedo with the fake one.";
+ return;
+ }
+ }
+
+ /* Read and preprocess noisy color input pass. */
+ denoise_color_read(context, pass);
+ if (!denoise_filter_color_preprocess(context, pass)) {
+ LOG(ERROR) << "Error connverting denoising passes to RGB buffer.";
+ return;
+ }
+
+ if (!denoise_run(context, pass)) {
+ LOG(ERROR) << "Error running OptiX denoiser.";
+ return;
+ }
+
+ /* Store result in the combined pass of the render buffer.
+ *
+ * This will scale the denoiser result up to match the number of, possibly per-pixel, samples. */
+ if (!denoise_filter_color_postprocess(context, pass)) {
+ LOG(ERROR) << "Error copying denoiser result to the denoised pass.";
+ return;
+ }
+
+ denoiser_.queue.synchronize();
+}
+
+void OptiXDevice::denoise_color_read(DenoiseContext &context, const DenoisePass &pass)
+{
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = pass.type;
+ pass_access_info.mode = PassMode::NOISY;
+ pass_access_info.offset = pass.noisy_offset;
+
+ /* The denoiser operates on the passes which are used to calculate the approximation, and is
+ * never run on the approximation itself. The latter is not even possible because OptiX does
+ * not support denoising of semi-transparent pixels. */
+ pass_access_info.use_approximate_shadow_catcher = false;
+ pass_access_info.use_approximate_shadow_catcher_background = false;
+ pass_access_info.show_active_pixels = false;
+
+ /* TODO(sergey): Consider adding support for actual exposure, to avoid clamping in extreme
+ * cases. */
+ const PassAccessorGPU pass_accessor(
+ &denoiser_.queue, pass_access_info, 1.0f, context.num_samples);
+
+ PassAccessor::Destination destination(pass_access_info.type);
+ destination.d_pixels = context.render_buffers->buffer.device_pointer +
+ pass.denoised_offset * sizeof(float);
+ destination.num_components = 3;
+ destination.pixel_stride = context.buffer_params.pass_stride;
+
+ pass_accessor.get_render_tile_pixels(context.render_buffers, context.buffer_params, destination);
+}
+
+bool OptiXDevice::denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {&context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&pass.denoised_offset)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_filter_color_postprocess(DenoiseContext &context,
+ const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ void *args[] = {&context.render_buffers->buffer.device_pointer,
+ const_cast<int *>(&buffer_params.full_x),
+ const_cast<int *>(&buffer_params.full_y),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.height),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&buffer_params.pass_stride),
+ const_cast<int *>(&context.num_samples),
+ const_cast<int *>(&pass.noisy_offset),
+ const_cast<int *>(&pass.denoised_offset),
+ const_cast<int *>(&context.pass_sample_count),
+ const_cast<int *>(&pass.num_components),
+ const_cast<bool *>(&pass.use_compositing)};
+
+ return denoiser_.queue.enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args);
+}
+
+bool OptiXDevice::denoise_ensure(DenoiseContext &context)
+{
+ if (!denoise_create_if_needed(context)) {
+ LOG(ERROR) << "OptiX denoiser creation has failed.";
+ return false;
+ }
+
+ if (!denoise_configure_if_needed(context)) {
+ LOG(ERROR) << "OptiX denoiser configuration has failed.";
+ return false;
+ }
+
+ return true;
+}
+
+bool OptiXDevice::denoise_create_if_needed(DenoiseContext &context)
+{
+ const bool recreate_denoiser = (denoiser_.optix_denoiser == nullptr) ||
+ (denoiser_.use_pass_albedo != context.use_pass_albedo) ||
+ (denoiser_.use_pass_normal != context.use_pass_normal);
+ if (!recreate_denoiser) {
+ return true;
+ }
+
+ /* Destroy the existing handle before creating a new one. */
+ if (denoiser_.optix_denoiser) {
+ optixDenoiserDestroy(denoiser_.optix_denoiser);
+ }
+
+ /* Create OptiX denoiser handle on demand when it is first used. */
+ OptixDenoiserOptions denoiser_options = {};
+ denoiser_options.guideAlbedo = context.use_pass_albedo;
+ denoiser_options.guideNormal = context.use_pass_normal;
+ const OptixResult result = optixDenoiserCreate(
+ this->context, OPTIX_DENOISER_MODEL_KIND_HDR, &denoiser_options, &denoiser_.optix_denoiser);
+
+ if (result != OPTIX_SUCCESS) {
+ set_error("Failed to create OptiX denoiser");
+ return false;
+ }
+
+ /* OptiX denoiser handle was created with the requested number of input passes. */
+ denoiser_.use_pass_albedo = context.use_pass_albedo;
+ denoiser_.use_pass_normal = context.use_pass_normal;
+
+ /* OptiX denoiser has been created, but it needs configuration. */
+ denoiser_.is_configured = false;
+
+ return true;
+}
+
+bool OptiXDevice::denoise_configure_if_needed(DenoiseContext &context)
+{
+ if (denoiser_.is_configured && (denoiser_.configured_size.x == context.buffer_params.width &&
+ denoiser_.configured_size.y == context.buffer_params.height)) {
+ return true;
+ }
+
+ const BufferParams &buffer_params = context.buffer_params;
+
+ OptixDenoiserSizes sizes = {};
+ optix_assert(optixDenoiserComputeMemoryResources(
+ denoiser_.optix_denoiser, buffer_params.width, buffer_params.height, &sizes));
+
+ denoiser_.scratch_size = sizes.withOverlapScratchSizeInBytes;
+ denoiser_.scratch_offset = sizes.stateSizeInBytes;
+
+ /* Allocate denoiser state if tile size has changed since last setup. */
+ denoiser_.state.alloc_to_device(denoiser_.scratch_offset + denoiser_.scratch_size);
+
+ /* Initialize denoiser state for the current tile size. */
+ const OptixResult result = optixDenoiserSetup(denoiser_.optix_denoiser,
+ denoiser_.queue.stream(),
+ buffer_params.width,
+ buffer_params.height,
+ denoiser_.state.device_pointer,
+ denoiser_.scratch_offset,
+ denoiser_.state.device_pointer +
+ denoiser_.scratch_offset,
+ denoiser_.scratch_size);
+ if (result != OPTIX_SUCCESS) {
+ set_error("Failed to set up OptiX denoiser");
+ return false;
+ }
+
+ denoiser_.is_configured = true;
+ denoiser_.configured_size.x = buffer_params.width;
+ denoiser_.configured_size.y = buffer_params.height;
+
+ return true;
+}
+
+bool OptiXDevice::denoise_run(DenoiseContext &context, const DenoisePass &pass)
+{
+ const BufferParams &buffer_params = context.buffer_params;
+ const int width = buffer_params.width;
+ const int height = buffer_params.height;
+
+ /* Set up input and output layer information. */
+ OptixImage2D color_layer = {0};
+ OptixImage2D albedo_layer = {0};
+ OptixImage2D normal_layer = {0};
+
+ OptixImage2D output_layer = {0};
+
+ /* Color pass. */
+ {
+ const int pass_denoised = pass.denoised_offset;
+ const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float);
+
+ color_layer.data = context.render_buffers->buffer.device_pointer +
+ pass_denoised * sizeof(float);
+ color_layer.width = width;
+ color_layer.height = height;
+ color_layer.rowStrideInBytes = pass_stride_in_bytes * context.buffer_params.stride;
+ color_layer.pixelStrideInBytes = pass_stride_in_bytes;
+ color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+
+ device_vector<float> fake_albedo(this, "fake_albedo", MEM_READ_WRITE);
+
+ /* Optional albedo and normal passes. */
+ if (context.num_input_passes > 1) {
+ const device_ptr d_guiding_buffer = context.guiding_params.device_pointer;
+ const int64_t pixel_stride_in_bytes = context.guiding_params.pass_stride * sizeof(float);
+ const int64_t row_stride_in_bytes = context.guiding_params.stride * pixel_stride_in_bytes;
+
+ if (context.use_pass_albedo) {
+ albedo_layer.data = d_guiding_buffer + context.guiding_params.pass_albedo * sizeof(float);
+ albedo_layer.width = width;
+ albedo_layer.height = height;
+ albedo_layer.rowStrideInBytes = row_stride_in_bytes;
+ albedo_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+ albedo_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+
+ if (context.use_pass_normal) {
+ normal_layer.data = d_guiding_buffer + context.guiding_params.pass_normal * sizeof(float);
+ normal_layer.width = width;
+ normal_layer.height = height;
+ normal_layer.rowStrideInBytes = row_stride_in_bytes;
+ normal_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+ normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+ }
+ }
+
+ /* Denoise in place, overwriting the noisy input in the render buffers. */
+ output_layer = color_layer;
+
+ /* Finally run denoising. */
+ OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */
+ OptixDenoiserLayer image_layers = {};
+ image_layers.input = color_layer;
+ image_layers.output = output_layer;
+
+ OptixDenoiserGuideLayer guide_layers = {};
+ guide_layers.albedo = albedo_layer;
+ guide_layers.normal = normal_layer;
+
+ optix_assert(optixDenoiserInvoke(denoiser_.optix_denoiser,
+ denoiser_.queue.stream(),
+ &params,
+ denoiser_.state.device_pointer,
+ denoiser_.scratch_offset,
+ &guide_layers,
+ &image_layers,
+ 1,
+ 0,
+ 0,
+ denoiser_.state.device_pointer + denoiser_.scratch_offset,
+ denoiser_.scratch_size));
+
+ return true;
+}
+
+bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps)
+{
+ const CUDAContextScope scope(this);
+
+ const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
+
+ /* Compute memory usage. */
+ OptixAccelBufferSizes sizes = {};
+ OptixAccelBuildOptions options = {};
+ options.operation = operation;
+ if (use_fast_trace_bvh) {
+ VLOG(2) << "Using fast-to-trace OptiX BVH";
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
+ }
+ else {
+ VLOG(2) << "Using fast-to-update OptiX BVH";
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
+ }
+
+ options.motionOptions.numKeys = num_motion_steps;
+ options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
+ options.motionOptions.timeBegin = 0.0f;
+ options.motionOptions.timeEnd = 1.0f;
+
+ optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
+
+ /* Allocate required output buffers. */
+ device_only_memory<char> temp_mem(this, "optix temp as build mem");
+ temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
+ if (!temp_mem.device_pointer) {
+ /* Make sure temporary memory allocation succeeded. */
+ return false;
+ }
+
+ device_only_memory<char> &out_data = bvh->as_data;
+ if (operation == OPTIX_BUILD_OPERATION_BUILD) {
+ assert(out_data.device == this);
+ out_data.alloc_to_device(sizes.outputSizeInBytes);
+ if (!out_data.device_pointer) {
+ return false;
+ }
+ }
+ else {
+ assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
+ }
+
+ /* Finally build the acceleration structure. */
+ OptixAccelEmitDesc compacted_size_prop = {};
+ compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
+ /* A tiny space was allocated for this property at the end of the temporary buffer above.
+ * Make sure this pointer is 8-byte aligned. */
+ compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+
+ OptixTraversableHandle out_handle = 0;
+ optix_assert(optixAccelBuild(context,
+ NULL,
+ &options,
+ &build_input,
+ 1,
+ temp_mem.device_pointer,
+ sizes.tempSizeInBytes,
+ out_data.device_pointer,
+ sizes.outputSizeInBytes,
+ &out_handle,
+ use_fast_trace_bvh ? &compacted_size_prop : NULL,
+ use_fast_trace_bvh ? 1 : 0));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
+
+ /* Wait for all operations to finish. */
+ cuda_assert(cuStreamSynchronize(NULL));
+
+ /* Compact acceleration structure to save memory (do not do this in viewport for faster builds).
+ */
+ if (use_fast_trace_bvh) {
+ uint64_t compacted_size = sizes.outputSizeInBytes;
+ cuda_assert(cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
+
+ /* Temporary memory is no longer needed, so free it now to make space. */
+ temp_mem.free();
+
+ /* There is no point compacting if the size does not change. */
+ if (compacted_size < sizes.outputSizeInBytes) {
+ device_only_memory<char> compacted_data(this, "optix compacted as");
+ compacted_data.alloc_to_device(compacted_size);
+ if (!compacted_data.device_pointer)
+ /* Do not compact if memory allocation for compacted acceleration structure fails.
+ * Can just use the uncompacted one then, so succeed here regardless. */
+ return !have_error();
+
+ optix_assert(optixAccelCompact(
+ context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle));
+ bvh->traversable_handle = static_cast<uint64_t>(out_handle);
+
+ /* Wait for compaction to finish. */
+ cuda_assert(cuStreamSynchronize(NULL));
+
+ std::swap(out_data.device_size, compacted_data.device_size);
+ std::swap(out_data.device_pointer, compacted_data.device_pointer);
+ }
+ }
+
+ return !have_error();
+}
+
+void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+ const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
+
+ free_bvh_memory_delayed();
+
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
+
+ progress.set_substatus("Building OptiX acceleration structure");
+
+ if (!bvh->params.top_level) {
+ assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
+
+ /* Refit is only possible in viewport for now (because AS is built with
+ * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */
+ OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
+ if (refit && !use_fast_trace_bvh) {
+ assert(bvh_optix->traversable_handle != 0);
+ operation = OPTIX_BUILD_OPERATION_UPDATE;
+ }
+ else {
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ }
+
+ /* Build bottom level acceleration structures (BLAS). */
+ Geometry *const geom = bvh->geometry[0];
+ if (geom->geometry_type == Geometry::HAIR) {
+ /* Build BLAS for curve primitives. */
+ Hair *const hair = static_cast<Hair *const>(geom);
+ if (hair->num_curves() == 0) {
+ return;
+ }
+
+ const size_t num_segments = hair->num_segments();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = hair->get_motion_steps();
+ }
+
+ device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
+ /* Four control points for each curve segment. */
+ const size_t num_vertices = num_segments * 4;
+ if (hair->curve_shape == CURVE_THICK) {
+ index_data.alloc(num_segments);
+ vertex_data.alloc(num_vertices * num_motion_steps);
+ }
+ else
+ aabb_data.alloc(num_segments * num_motion_steps);
+
+ /* Get AABBs for each motion step. */
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ /* The center step for motion vertices is not stored in the attribute. */
+ const float3 *keys = hair->get_curve_keys().data();
+ size_t center_step = (num_motion_steps - 1) / 2;
+ if (step != center_step) {
+ size_t attr_offset = (step > center_step) ? step - 1 : step;
+ /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). */
+ keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
+ }
+
+ for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
+ const Hair::Curve curve = hair->get_curve(j);
+ const array<float> &curve_radius = hair->get_curve_radius();
+
+ for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
+ if (hair->curve_shape == CURVE_THICK) {
+ int k0 = curve.first_key + segment;
+ int k1 = k0 + 1;
+ int ka = max(k0 - 1, curve.first_key);
+ int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
+
+ const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
+ const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
+ const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
+ const float4 pw = make_float4(
+ curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
+
+ /* Convert Catmull-Rom data to B-spline control points. */
+ static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
+ static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
+ static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
+ static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
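+
+ /* Each `cr2bsp` row is one row of the 4x4 change-of-basis matrix which maps the four
+ * Catmull-Rom control points to the equivalent cubic B-spline control points; the `dot()`
+ * calls below apply that matrix per coordinate (x, y, z, and radius in w). */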
+
+ index_data[i] = i * 4;
+ float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
+ v[0] = make_float4(
+ dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
+ v[1] = make_float4(
+ dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
+ v[2] = make_float4(
+ dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
+ v[3] = make_float4(
+ dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
+ }
+ else {
+ BoundBox bounds = BoundBox::empty;
+ curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
+
+ const size_t index = step * num_segments + i;
+ aabb_data[index].minX = bounds.min.x;
+ aabb_data[index].minY = bounds.min.y;
+ aabb_data[index].minZ = bounds.min.z;
+ aabb_data[index].maxX = bounds.max.x;
+ aabb_data[index].maxY = bounds.max.y;
+ aabb_data[index].maxZ = bounds.max.z;
+ }
+ }
+ }
+ }
+
+ /* Upload AABB data to GPU. */
+ aabb_data.copy_to_device();
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> aabb_ptrs;
+ aabb_ptrs.reserve(num_motion_steps);
+ vector<device_ptr> width_ptrs;
+ vector<device_ptr> vertex_ptrs;
+ width_ptrs.reserve(num_motion_steps);
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
+ const device_ptr base_ptr = vertex_data.device_pointer +
+ step * num_vertices * sizeof(float4);
+ width_ptrs.push_back(base_ptr + 3 * sizeof(float)); /* Width is the 4th float4 component. */
+ vertex_ptrs.push_back(base_ptr);
+ }
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly. */
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+ if (hair->curve_shape == CURVE_THICK) {
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
+ build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
+ build_input.curveArray.numPrimitives = num_segments;
+ build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.curveArray.numVertices = num_vertices;
+ build_input.curveArray.vertexStrideInBytes = sizeof(float4);
+ build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
+ build_input.curveArray.widthStrideInBytes = sizeof(float4);
+ build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
+ build_input.curveArray.indexStrideInBytes = sizeof(int);
+ build_input.curveArray.flag = build_flags;
+ build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
+ }
+ else {
+ /* Disable visibility test any-hit program, since it is already checked during
+ * intersection. Those trace calls that require anyhit can force it with a ray flag. */
+ build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
+
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
+ build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
+ build_input.customPrimitiveArray.numPrimitives = num_segments;
+ build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
+ build_input.customPrimitiveArray.flags = &build_flags;
+ build_input.customPrimitiveArray.numSbtRecords = 1;
+ build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
+ }
+
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ }
+ else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
+ /* Build BLAS for triangle primitives. */
+ Mesh *const mesh = static_cast<Mesh *const>(geom);
+ if (mesh->num_triangles() == 0) {
+ return;
+ }
+
+ const size_t num_verts = mesh->get_verts().size();
+
+ size_t num_motion_steps = 1;
+ Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
+ if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
+ num_motion_steps = mesh->get_motion_steps();
+ }
+
+ device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
+ index_data.alloc(mesh->get_triangles().size());
+ memcpy(index_data.data(),
+ mesh->get_triangles().data(),
+ mesh->get_triangles().size() * sizeof(int));
+ device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
+ vertex_data.alloc(num_verts * num_motion_steps);
+
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ const float3 *verts = mesh->get_verts().data();
+
+ size_t center_step = (num_motion_steps - 1) / 2;
+ /* The center step for motion vertices is not stored in the attribute. */
+ if (step != center_step) {
+ verts = motion_keys->data_float3() + (step > center_step ? step - 1 : step) * num_verts;
+ }
+
+ memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
+ }
+
+ /* Upload triangle data to GPU. */
+ index_data.copy_to_device();
+ vertex_data.copy_to_device();
+
+ vector<device_ptr> vertex_ptrs;
+ vertex_ptrs.reserve(num_motion_steps);
+ for (size_t step = 0; step < num_motion_steps; ++step) {
+ vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
+ }
+
+ /* Force a single any-hit call, so shadow record-all behavior works correctly. */
+ unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
+ build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
+ build_input.triangleArray.numVertices = num_verts;
+ build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
+ build_input.triangleArray.vertexStrideInBytes = sizeof(float4);
+ build_input.triangleArray.indexBuffer = index_data.device_pointer;
+ build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
+ build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
+ build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
+ build_input.triangleArray.flags = &build_flags;
+ /* The SBT does not store per primitive data since Cycles already allocates separate
+ * buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
+ * one and rely on that having the same meaning in this case. */
+ build_input.triangleArray.numSbtRecords = 1;
+ build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
+
+ if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ }
+ }
+ else {
+ unsigned int num_instances = 0;
+ unsigned int max_num_instances = 0xFFFFFFFF;
+
+ bvh_optix->as_data.free();
+ bvh_optix->traversable_handle = 0;
+ bvh_optix->motion_transform_data.free();
+
+ optixDeviceContextGetProperty(context,
+ OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
+ &max_num_instances,
+ sizeof(max_num_instances));
+ /* Do not count the first bit, which is used to distinguish instanced and non-instanced objects. */
+ max_num_instances >>= 1;
+ if (bvh->objects.size() > max_num_instances) {
+ progress.set_error(
+ "Failed to build OptiX acceleration structure because there are too many instances");
+ return;
+ }
+
+ /* Fill instance descriptions. */
+ device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
+ instances.alloc(bvh->objects.size());
+
+ /* Calculate the total motion transform size and allocate memory for the transforms. */
+ size_t motion_transform_offset = 0;
+ if (motion_blur) {
+ size_t total_motion_transform_size = 0;
+ for (Object *const ob : bvh->objects) {
+ if (ob->is_traceable() && ob->use_motion()) {
+ total_motion_transform_size = align_up(total_motion_transform_size,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ total_motion_transform_size = total_motion_transform_size +
+ sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+ }
+ }
+
+ assert(bvh_optix->motion_transform_data.device == this);
+ bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
+ }
+
+ for (Object *ob : bvh->objects) {
+ /* Skip non-traceable objects. */
+ if (!ob->is_traceable()) {
+ continue;
+ }
+
+ BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
+ OptixTraversableHandle handle = blas->traversable_handle;
+
+ OptixInstance &instance = instances[num_instances++];
+ memset(&instance, 0, sizeof(instance));
+
+ /* Clear transform to identity matrix. */
+ instance.transform[0] = 1.0f;
+ instance.transform[5] = 1.0f;
+ instance.transform[10] = 1.0f;
+
+ /* Set user instance ID to object index (but leave low bit blank). */
+ instance.instanceId = ob->get_device_index() << 1;
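+ /* The low bit is set further below for non-instanced objects, so the final encoding is
+ * `(device_index << 1) | is_non_instanced`. */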
+
+ /* Have to have at least one bit in the mask, or else the instance would always be culled. */
+ instance.visibilityMask = 1;
+
+ if (ob->get_geometry()->has_volume) {
+ /* Volumes have a special bit set in the visibility mask so a trace can mask only volumes.
+ */
+ instance.visibilityMask |= 2;
+ }
+
+ if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
+ /* Same applies to curves (so they can be skipped in local trace calls). */
+ instance.visibilityMask |= 4;
+
+ if (motion_blur && ob->get_geometry()->has_motion_blur() &&
+ static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
+ /* Select between motion blur and non-motion blur built-in intersection module. */
+ instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
+ }
+ }
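+
+ /* Taken together: bit 0 of the visibility mask is set for all geometry, bit 1 for volumes
+ * and bit 2 for hair, which allows trace calls to filter by primitive kind. */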
+
+ /* Insert motion traversable if object has motion. */
+ if (motion_blur && ob->use_motion()) {
+ size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
+ size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+
+ const CUDAContextScope scope(this);
+
+ motion_transform_offset = align_up(motion_transform_offset,
+ OPTIX_TRANSFORM_BYTE_ALIGNMENT);
+ CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
+ motion_transform_offset;
+ motion_transform_offset += motion_transform_size;
+
+ /* Allocate host side memory for motion transform and fill it with transform data. */
+ OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
+ new uint8_t[motion_transform_size]);
+ motion_transform.child = handle;
+ motion_transform.motionOptions.numKeys = ob->get_motion().size();
+ motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
+ motion_transform.motionOptions.timeBegin = 0.0f;
+ motion_transform.motionOptions.timeEnd = 1.0f;
+
+ OptixSRTData *const srt_data = motion_transform.srtData;
+ array<DecomposedTransform> decomp(ob->get_motion().size());
+ transform_motion_decompose(
+ decomp.data(), ob->get_motion().data(), ob->get_motion().size());
+
+ for (size_t i = 0; i < ob->get_motion().size(); ++i) {
+ /* Scale. */
+ srt_data[i].sx = decomp[i].y.w; /* scale.x.x */
+ srt_data[i].sy = decomp[i].z.w; /* scale.y.y */
+ srt_data[i].sz = decomp[i].w.w; /* scale.z.z */
+
+ /* Shear. */
+ srt_data[i].a = decomp[i].z.x; /* scale.x.y */
+ srt_data[i].b = decomp[i].z.y; /* scale.x.z */
+ srt_data[i].c = decomp[i].w.x; /* scale.y.z */
+ assert(decomp[i].z.z == 0.0f); /* scale.y.x */
+ assert(decomp[i].w.y == 0.0f); /* scale.z.x */
+ assert(decomp[i].w.z == 0.0f); /* scale.z.y */
+
+ /* Pivot point. */
+ srt_data[i].pvx = 0.0f;
+ srt_data[i].pvy = 0.0f;
+ srt_data[i].pvz = 0.0f;
+
+ /* Rotation. */
+ srt_data[i].qx = decomp[i].x.x;
+ srt_data[i].qy = decomp[i].x.y;
+ srt_data[i].qz = decomp[i].x.z;
+ srt_data[i].qw = decomp[i].x.w;
+
+ /* Translation. */
+ srt_data[i].tx = decomp[i].y.x;
+ srt_data[i].ty = decomp[i].y.y;
+ srt_data[i].tz = decomp[i].y.z;
+ }
+
+ /* Upload motion transform to GPU. */
+ cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+ delete[] reinterpret_cast<uint8_t *>(&motion_transform);
+
+ /* Disable instance transform if object uses motion transform already. */
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+
+ /* Get traversable handle to motion transform. */
+ optixConvertPointerToTraversableHandle(context,
+ motion_transform_gpu,
+ OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
+ &instance.traversableHandle);
+ }
+ else {
+ instance.traversableHandle = handle;
+
+ if (ob->get_geometry()->is_instanced()) {
+ /* Set transform matrix. */
+ memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
+ }
+ else {
+ /* Disable instance transform if geometry already has it applied to vertex data. */
+ instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+ /* Non-instanced objects read ID from 'prim_object', so distinguish
+ * them from instanced objects with the low bit set. */
+ instance.instanceId |= 1;
+ }
+ }
+ }
+
+ /* Upload instance descriptions. */
+ instances.resize(num_instances);
+ instances.copy_to_device();
+
+ /* Build top-level acceleration structure (TLAS) */
+ OptixBuildInput build_input = {};
+ build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
+ build_input.instanceArray.instances = instances.device_pointer;
+ build_input.instanceArray.numInstances = num_instances;
+
+ if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
+ progress.set_error("Failed to build OptiX acceleration structure");
+ }
+ tlas_handle = bvh_optix->traversable_handle;
+ }
+}
+
+void OptiXDevice::release_optix_bvh(BVH *bvh)
+{
+ thread_scoped_lock lock(delayed_free_bvh_mutex);
+ /* Do a delayed free of the BVH memory, since the geometry holding the BVH might be deleted
+ * while the GPU is still rendering. */
+ BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
+
+ delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
+ delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
+ bvh_optix->traversable_handle = 0;
+}
+
+void OptiXDevice::free_bvh_memory_delayed()
+{
+ thread_scoped_lock lock(delayed_free_bvh_mutex);
+ delayed_free_bvh_memory.free_memory();
+}
+
+void OptiXDevice::const_copy_to(const char *name, void *host, size_t size)
+{
+ /* Set constant memory for CUDA module. */
+ CUDADevice::const_copy_to(name, host, size);
+
+ if (strcmp(name, "__data") == 0) {
+ assert(size <= sizeof(KernelData));
+
+ /* Update the traversable handle (it differs for each device in a multi-device setup). */
+ KernelData *const data = (KernelData *)host;
+ *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
+
+ update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
+ return;
+ }
+
+ /* Update data storage pointers in launch parameters. */
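+ /* For every entry of `kernel/kernel_textures.h` this expands to a check of the form
+ * `if (strcmp(name, "__tri_verts") == 0) { update_launch_params(...); return; }`,
+ * with `__tri_verts` standing in for the respective texture name. */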
+# define KERNEL_TEX(data_type, tex_name) \
+ if (strcmp(name, #tex_name) == 0) { \
+ update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \
+ return; \
+ }
+ KERNEL_TEX(IntegratorStateGPU, __integrator_state)
+# include "kernel/kernel_textures.h"
+# undef KERNEL_TEX
+}
+
+void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size)
+{
+ const CUDAContextScope scope(this);
+
+ cuda_assert(cuMemcpyHtoD(launch_params.device_pointer + offset, data, data_size));
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
new file mode 100644
index 00000000000..91ef52e0a5a
--- /dev/null
+++ b/intern/cycles/device/optix/device_impl.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2019, NVIDIA Corporation.
+ * Copyright 2019, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/device_impl.h"
+# include "device/optix/queue.h"
+# include "device/optix/util.h"
+# include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BVHOptiX;
+struct KernelParamsOptiX;
+
+/* List of OptiX program groups. */
+enum {
+ PG_RGEN_INTERSECT_CLOSEST,
+ PG_RGEN_INTERSECT_SHADOW,
+ PG_RGEN_INTERSECT_SUBSURFACE,
+ PG_RGEN_INTERSECT_VOLUME_STACK,
+ PG_RGEN_SHADE_SURFACE_RAYTRACE,
+ PG_MISS,
+ PG_HITD, /* Default hit group. */
+ PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
+ PG_HITL, /* __BVH_LOCAL__ hit group (only used for triangles). */
+ PG_HITD_MOTION,
+ PG_HITS_MOTION,
+ PG_CALL_SVM_AO,
+ PG_CALL_SVM_BEVEL,
+ PG_CALL_AO_PASS,
+ NUM_PROGRAM_GROUPS
+};
+
+static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
+static const int NUM_MIS_PROGRAM_GROUPS = 1;
+static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
+static const int NUM_HIT_PROGRAM_GROUPS = 5;
+static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
+static const int NUM_CALLABLE_PROGRAM_GROUPS = 3;
+
+/* List of OptiX pipelines. */
+enum { PIP_SHADE_RAYTRACE, PIP_INTERSECT, NUM_PIPELINES };
+
+/* A single shader binding table entry. */
+struct SbtRecord {
+ char header[OPTIX_SBT_RECORD_HEADER_SIZE];
+};
+
+class OptiXDevice : public CUDADevice {
+ public:
+ OptixDeviceContext context = NULL;
+
+ OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */
+ OptixModule builtin_modules[2] = {};
+ OptixPipeline pipelines[NUM_PIPELINES] = {};
+
+ bool motion_blur = false;
+ device_vector<SbtRecord> sbt_data;
+ device_only_memory<KernelParamsOptiX> launch_params;
+ OptixTraversableHandle tlas_handle = 0;
+
+ vector<device_only_memory<char>> delayed_free_bvh_memory;
+ thread_mutex delayed_free_bvh_mutex;
+
+ class Denoiser {
+ public:
+ explicit Denoiser(OptiXDevice *device);
+ ~Denoiser();
+
+ OptiXDevice *device;
+ OptiXDeviceQueue queue;
+
+ OptixDenoiser optix_denoiser = nullptr;
+
+ /* Configuration size, as provided to `optixDenoiserSetup`.
+ * If `optixDenoiserSetup()` was never called on the current `optix_denoiser`, then
+ * `is_configured` will be false. */
+ bool is_configured = false;
+ int2 configured_size = make_int2(0, 0);
+
+ /* OptiX denoiser state and scratch buffers, stored in a single memory buffer.
+ * The memory layout is as follows: [denoiser state][scratch buffer]. */
+ device_only_memory<unsigned char> state;
+ size_t scratch_offset = 0;
+ size_t scratch_size = 0;
+
+ bool use_pass_albedo = false;
+ bool use_pass_normal = false;
+ };
+ Denoiser denoiser_;
+
+ public:
+ OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+ ~OptiXDevice();
+
+ private:
+ BVHLayoutMask get_bvh_layout_mask() const override;
+
+ string compile_kernel_get_common_cflags(const uint kernel_features) override;
+
+ bool load_kernels(const uint kernel_features) override;
+
+ bool build_optix_bvh(BVHOptiX *bvh,
+ OptixBuildOperation operation,
+ const OptixBuildInput &build_input,
+ uint16_t num_motion_steps);
+
+ void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+
+ void release_optix_bvh(BVH *bvh) override;
+ void free_bvh_memory_delayed();
+
+ void const_copy_to(const char *name, void *host, size_t size) override;
+
+ void update_launch_params(size_t offset, void *data, size_t data_size);
+
+ virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+ /* --------------------------------------------------------------------
+ * Denoising.
+ */
+
+ class DenoiseContext;
+ class DenoisePass;
+
+ virtual bool denoise_buffer(const DeviceDenoiseTask &task) override;
+ virtual DeviceQueue *get_denoise_queue() override;
+
+ /* Read guiding passes from the render buffers, preprocess them in the way expected by
+ * OptiX, and store them in the guiding passes memory within the given context.
+ *
+ * Preprocessing of the guiding passes is to only happen once per context lifetime. Do not
+ * preprocess them for every pass which is being denoised. */
+ bool denoise_filter_guiding_preprocess(DenoiseContext &context);
+
+ /* Set fake albedo pixels in the albedo guiding pass storage.
+ * After this point only passes which do not need albedo for denoising can be processed. */
+ bool denoise_filter_guiding_set_fake_albedo(DenoiseContext &context);
+
+ void denoise_pass(DenoiseContext &context, PassType pass_type);
+
+ /* Read input color pass from the render buffer into the memory which corresponds to the noisy
+ * input within the given context. Pixels are scaled to the number of samples, but are not
+ * preprocessed yet. */
+ void denoise_color_read(DenoiseContext &context, const DenoisePass &pass);
+
+ /* Run corresponding filter kernels, preparing data for the denoiser or copying data from the
+ * denoiser result to the render buffer. */
+ bool denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass);
+ bool denoise_filter_color_postprocess(DenoiseContext &context, const DenoisePass &pass);
+
+ /* Make sure the OptiX denoiser is created and configured. */
+ bool denoise_ensure(DenoiseContext &context);
+
+ /* Create OptiX denoiser descriptor if needed.
+ * Will do nothing if the current OptiX descriptor is usable for the given parameters.
+ * If the OptiX denoiser descriptor was re-created here, it is left unconfigured. */
+ bool denoise_create_if_needed(DenoiseContext &context);
+
+ /* Configure the existing OptiX denoiser descriptor for use with the given task. */
+ bool denoise_configure_if_needed(DenoiseContext &context);
+
+ /* Run configured denoiser. */
+ bool denoise_run(DenoiseContext &context, const DenoisePass &pass);
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
new file mode 100644
index 00000000000..458ed70baa8
--- /dev/null
+++ b/intern/cycles/device/optix/queue.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_OPTIX
+
+# include "device/optix/queue.h"
+# include "device/optix/device_impl.h"
+
+# include "util/util_time.h"
+
+# undef __KERNEL_CPU__
+# define __KERNEL_OPTIX__
+# include "kernel/device/optix/globals.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* OptiXDeviceQueue */
+
+OptiXDeviceQueue::OptiXDeviceQueue(OptiXDevice *device) : CUDADeviceQueue(device)
+{
+}
+
+void OptiXDeviceQueue::init_execution()
+{
+ CUDADeviceQueue::init_execution();
+}
+
+static bool is_optix_specific_kernel(DeviceKernel kernel)
+{
+ return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+}
+
+bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+ if (!is_optix_specific_kernel(kernel)) {
+ return CUDADeviceQueue::enqueue(kernel, work_size, args);
+ }
+
+ if (cuda_device_->have_error()) {
+ return false;
+ }
+
+ debug_enqueue(kernel, work_size);
+
+ const CUDAContextScope scope(cuda_device_);
+
+ OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
+
+ const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer;
+ const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer;
+
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array),
+ args[0], // &d_path_index
+ sizeof(device_ptr),
+ cuda_stream_));
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
+ args[1], // &d_render_buffer
+ sizeof(device_ptr),
+ cuda_stream_));
+ }
+
+ cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+
+ OptixPipeline pipeline = nullptr;
+ OptixShaderBindingTable sbt_params = {};
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SHADOW * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_SUBSURFACE * sizeof(SbtRecord);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ pipeline = optix_device->pipelines[PIP_INTERSECT];
+ sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord);
+ break;
+
+ default:
+ LOG(ERROR) << "Attempted to enqueue invalid kernel "
+ << device_kernel_as_string(kernel) << ".";
+ return false;
+ }
+
+ sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
+ sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS;
+ sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord);
+ sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
+ sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS;
+ sbt_params.callablesRecordBase = sbt_data_ptr + CALLABLE_PROGRAM_GROUPS_BASE * sizeof(SbtRecord);
+ sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS;
+ sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
+
+ /* Launch the ray generation program. */
+ optix_device_assert(optix_device,
+ optixLaunch(pipeline,
+ cuda_stream_,
+ launch_params_ptr,
+ optix_device->launch_params.data_elements,
+ &sbt_params,
+ work_size,
+ 1,
+ 1));
+
+ return !(optix_device->have_error());
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/device/optix/queue.h b/intern/cycles/device/optix/queue.h
new file mode 100644
index 00000000000..0de422ccc71
--- /dev/null
+++ b/intern/cycles/device/optix/queue.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/queue.h"
+
+CCL_NAMESPACE_BEGIN
+
+class OptiXDevice;
+
+/* OptiX implementation of the device queue, built on top of the CUDA queue. */
+class OptiXDeviceQueue : public CUDADeviceQueue {
+ public:
+ OptiXDeviceQueue(OptiXDevice *device);
+
+ virtual void init_execution() override;
+
+ virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/device/optix/util.h b/intern/cycles/device/optix/util.h
new file mode 100644
index 00000000000..34ae5bb5609
--- /dev/null
+++ b/intern/cycles/device/optix/util.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_OPTIX
+
+# include "device/cuda/util.h"
+
+# ifdef WITH_CUDA_DYNLOAD
+# include <cuew.h>
+// Do not use CUDA SDK headers when using CUEW
+# define OPTIX_DONT_INCLUDE_CUDA
+# endif
+
+# include <optix_stubs.h>
+
+/* Utility for checking return values of OptiX function calls. */
+# define optix_device_assert(optix_device, stmt) \
+ { \
+ OptixResult result = stmt; \
+ if (result != OPTIX_SUCCESS) { \
+ const char *name = optixGetErrorName(result); \
+ optix_device->set_error( \
+ string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
+ } \
+ } \
+ (void)0
+
+# define optix_assert(stmt) optix_device_assert(this, stmt)
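+
+/* Usage sketch: wrapping an OptiX API call inside an OptiXDevice method, e.g.
+ * `optix_assert(optixDenoiserSetup(...));`
+ * records a formatted error on the device when the call does not return OPTIX_SUCCESS. */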
+
+#endif /* WITH_OPTIX */
diff --git a/intern/cycles/graph/node.cpp b/intern/cycles/graph/node.cpp
index 57f25283f85..8294e716ebe 100644
--- a/intern/cycles/graph/node.cpp
+++ b/intern/cycles/graph/node.cpp
@@ -814,7 +814,7 @@ bool Node::socket_is_modified(const SocketType &input) const
return (socket_modified & input.modified_flag_bit) != 0;
}
-bool Node::is_modified()
+bool Node::is_modified() const
{
return socket_modified != 0;
}
diff --git a/intern/cycles/graph/node.h b/intern/cycles/graph/node.h
index aa365baeccd..8f27a82d37b 100644
--- a/intern/cycles/graph/node.h
+++ b/intern/cycles/graph/node.h
@@ -16,6 +16,8 @@
#pragma once
+#include <type_traits>
+
#include "graph/node_type.h"
#include "util/util_array.h"
@@ -34,7 +36,10 @@ struct Transform;
#define NODE_SOCKET_API_BASE_METHODS(type_, name, string_name) \
const SocketType *get_##name##_socket() const \
{ \
- static const SocketType *socket = type->find_input(ustring(string_name)); \
+ /* Explicitly cast to base class to use `Node::type` even if the derived class defines \
+ * `type`. */ \
+ const Node *self_node = this; \
+ static const SocketType *socket = self_node->type->find_input(ustring(string_name)); \
return socket; \
} \
bool name##_is_modified() const \
@@ -111,6 +116,15 @@ struct Node {
void set(const SocketType &input, const Transform &value);
void set(const SocketType &input, Node *value);
+ /* Implicitly cast enums and enum classes to integer, which matches an internal way of how
+ * enumerator values are stored and accessed in a generic API. */
+ template<class ValueType, typename std::enable_if_t<std::is_enum_v<ValueType>> * = nullptr>
+ void set(const SocketType &input, const ValueType &value)
+ {
+ static_assert(sizeof(ValueType) <= sizeof(int), "Enumerator type should fit int");
+ set(input, static_cast<int>(value));
+ }
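+
+ /* Usage sketch (with a hypothetical enum):
+ * enum class MyMode { FAST = 0, ACCURATE = 1 };
+ * node->set(*mode_socket, MyMode::ACCURATE); // Stored internally as int(1).
+ */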
+
/* set array values. the memory from the input array will taken over
* by the node and the input array will be empty after return */
void set(const SocketType &input, array<bool> &value);
@@ -164,7 +178,7 @@ struct Node {
bool socket_is_modified(const SocketType &input) const;
- bool is_modified();
+ bool is_modified() const;
void tag_modified();
void clear_modified();
diff --git a/intern/cycles/integrator/CMakeLists.txt b/intern/cycles/integrator/CMakeLists.txt
new file mode 100644
index 00000000000..bfabd35d7c3
--- /dev/null
+++ b/intern/cycles/integrator/CMakeLists.txt
@@ -0,0 +1,76 @@
+# Copyright 2011-2021 Blender Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set(INC
+ ..
+)
+
+set(SRC
+ adaptive_sampling.cpp
+ denoiser.cpp
+ denoiser_device.cpp
+ denoiser_oidn.cpp
+ denoiser_optix.cpp
+ path_trace.cpp
+ tile.cpp
+ pass_accessor.cpp
+ pass_accessor_cpu.cpp
+ pass_accessor_gpu.cpp
+ path_trace_work.cpp
+ path_trace_work_cpu.cpp
+ path_trace_work_gpu.cpp
+ render_scheduler.cpp
+ shader_eval.cpp
+ work_balancer.cpp
+ work_tile_scheduler.cpp
+)
+
+set(SRC_HEADERS
+ adaptive_sampling.h
+ denoiser.h
+ denoiser_device.h
+ denoiser_oidn.h
+ denoiser_optix.h
+ path_trace.h
+ tile.h
+ pass_accessor.h
+ pass_accessor_cpu.h
+ pass_accessor_gpu.h
+ path_trace_work.h
+ path_trace_work_cpu.h
+ path_trace_work_gpu.h
+ render_scheduler.h
+ shader_eval.h
+ work_balancer.h
+ work_tile_scheduler.h
+)
+
+set(LIB
+ # NOTE: This is required for RenderBuffers access. Might consider moving files around a bit
+ # to avoid such a cyclic dependency.
+ cycles_render
+
+ cycles_util
+)
+
+if(WITH_OPENIMAGEDENOISE)
+ list(APPEND LIB
+ ${OPENIMAGEDENOISE_LIBRARIES}
+ )
+endif()
+
+include_directories(${INC})
+include_directories(SYSTEM ${INC_SYS})
+
+cycles_add_library(cycles_integrator "${LIB}" ${SRC} ${SRC_HEADERS})
diff --git a/intern/cycles/integrator/adaptive_sampling.cpp b/intern/cycles/integrator/adaptive_sampling.cpp
new file mode 100644
index 00000000000..23fbcfea5c2
--- /dev/null
+++ b/intern/cycles/integrator/adaptive_sampling.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/adaptive_sampling.h"
+
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+AdaptiveSampling::AdaptiveSampling()
+{
+}
+
+int AdaptiveSampling::align_samples(int start_sample, int num_samples) const
+{
+ if (!use) {
+ return num_samples;
+ }
+
+ /*
+ * A naive implementation would be the following:
+ *
+ * int count = 1;
+ * while (!need_filter(start_sample + count - 1) && count < num_samples) {
+ * ++count;
+ * }
+ * return count;
+ */
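+
+ /* Worked example (hypothetical values): with `min_samples = 0` and `adaptive_step = 4`,
+ * filtering happens after samples 3, 7, 11, ... so `align_samples(0, 16)` returns 4
+ * (render samples 0..3, then filter), and a subsequent `align_samples(4, 12)` returns 4
+ * again (render samples 4..7, then filter). */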
+
+ /* 0-based sample index at which the first filtering will happen. */
+ const int first_filter_sample = (min_samples + 1) | (adaptive_step - 1);
+
+ /* Allow as many samples as possible until the first filter sample. */
+ if (start_sample + num_samples <= first_filter_sample) {
+ return num_samples;
+ }
+
+ const int next_filter_sample = max(first_filter_sample, start_sample | (adaptive_step - 1));
+
+ const int num_samples_until_filter = next_filter_sample - start_sample + 1;
+
+ return min(num_samples_until_filter, num_samples);
+}
+
+bool AdaptiveSampling::need_filter(int sample) const
+{
+ if (!use) {
+ return false;
+ }
+
+ if (sample <= min_samples) {
+ return false;
+ }
+
+ return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/adaptive_sampling.h b/intern/cycles/integrator/adaptive_sampling.h
new file mode 100644
index 00000000000..d98edd9894c
--- /dev/null
+++ b/intern/cycles/integrator/adaptive_sampling.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+class AdaptiveSampling {
+ public:
+ AdaptiveSampling();
+
+ /* Align the number of samples so that they align with the adaptive filtering.
+ *
+ * Returns the new value for `num_samples` such that after rendering that many samples on top
+ * of `start_sample`, filtering is required.
+ *
+ * The alignment happens in a way which allows rendering as many samples as possible without
+ * missing any filtering point. This means that the result is "clamped" by the nearest sample
+ * at which filtering is needed. This is part of the mechanism which ensures that all devices
+ * perform exactly the same filtering and adaptive sampling, regardless of their performance.
+ *
+ * `start_sample` is the 0-based index of the sample.
+ *
+ * NOTE: The start sample is included in the number of samples to render. This means that
+ * if the number of samples is 1, then the path tracer will render samples [start_sample];
+ * if the number of samples is 2, then the path tracer will render samples [start_sample,
+ * start_sample + 1], and so on. */
+ int align_samples(int start_sample, int num_samples) const;
+
+ /* Check whether the adaptive sampling filter should run at this sample.
+ * Returns false if adaptive sampling is not used.
+ *
+ * `sample` is the 0-based index of the sample. */
+ bool need_filter(int sample) const;
+
+ bool use = false;
+ int adaptive_step = 0;
+ int min_samples = 0;
+ float threshold = 0.0f;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser.cpp b/intern/cycles/integrator/denoiser.cpp
new file mode 100644
index 00000000000..598bbd497a5
--- /dev/null
+++ b/intern/cycles/integrator/denoiser.cpp
@@ -0,0 +1,204 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser.h"
+
+#include "device/device.h"
+#include "integrator/denoiser_oidn.h"
+#include "integrator/denoiser_optix.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoiseParams &params)
+{
+ DCHECK(params.use);
+
+ switch (params.type) {
+ case DENOISER_OPTIX:
+ return make_unique<OptiXDenoiser>(path_trace_device, params);
+
+ case DENOISER_OPENIMAGEDENOISE:
+ return make_unique<OIDNDenoiser>(path_trace_device, params);
+
+ case DENOISER_NUM:
+ case DENOISER_NONE:
+ case DENOISER_ALL:
+ /* pass */
+ break;
+ }
+
+ LOG(FATAL) << "Unhandled denoiser type " << params.type << ", should never happen.";
+
+ return nullptr;
+}
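+
+/* A typical creation sketch (assuming a configured `DenoiseParams params` with
+ * `params.use == true`):
+ *
+ *   unique_ptr<Denoiser> denoiser = Denoiser::create(device, params);
+ *   denoiser->load_kernels(progress);
+ */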
+
+Denoiser::Denoiser(Device *path_trace_device, const DenoiseParams &params)
+ : path_trace_device_(path_trace_device), params_(params)
+{
+ DCHECK(params.use);
+}
+
+void Denoiser::set_params(const DenoiseParams &params)
+{
+ DCHECK_EQ(params.type, params_.type);
+
+ if (params.type == params_.type) {
+ params_ = params;
+ }
+ else {
+ LOG(ERROR) << "Attempt to change denoiser type.";
+ }
+}
+
+const DenoiseParams &Denoiser::get_params() const
+{
+ return params_;
+}
+
+bool Denoiser::load_kernels(Progress *progress)
+{
+ const Device *denoiser_device = ensure_denoiser_device(progress);
+
+ if (!denoiser_device) {
+ path_trace_device_->set_error("No device available to denoise on");
+ return false;
+ }
+
+ VLOG(3) << "Will denoise on " << denoiser_device->info.description << " ("
+ << denoiser_device->info.id << ")";
+
+ return true;
+}
+
+Device *Denoiser::get_denoiser_device() const
+{
+ return denoiser_device_;
+}
+
+/* Check whether the given device is a single device (not a MultiDevice) and supports the
+ * requested denoiser. */
+static bool is_single_supported_device(Device *device, DenoiserType type)
+{
+ if (device->info.type == DEVICE_MULTI) {
+ /* Assume a multi-device is never created with a single sub-device.
+ * If one requests such a configuration, it should be checked on the session level. */
+ return false;
+ }
+
+ if (!device->info.multi_devices.empty()) {
+ /* Some configurations will use multi_devices, but keep the type of an individual device.
+ * This simplifies checks for homogeneous setups, but here we really need a single device. */
+ return false;
+ }
+
+ /* Check the denoiser type is supported. */
+ return (device->info.denoisers & type);
+}
+
+/* Find the most suitable device to denoise on. Will iterate over possible sub-devices of a
+ * multi-device.
+ *
+ * If there is no available device which supports the given denoiser type, nullptr is returned. */
+static Device *find_best_device(Device *device, DenoiserType type)
+{
+ Device *best_device = nullptr;
+
+ device->foreach_device([&](Device *sub_device) {
+ if ((sub_device->info.denoisers & type) == 0) {
+ return;
+ }
+ if (!best_device) {
+ best_device = sub_device;
+ }
+ else {
+ /* TODO(sergey): Choose the fastest device from the available ones, taking into account
+ * device performance and data transfer cost. */
+ }
+ });
+
+ return best_device;
+}
+
+static unique_ptr<Device> create_denoiser_device(Device *path_trace_device,
+ const uint device_type_mask)
+{
+ const vector<DeviceInfo> device_infos = Device::available_devices(device_type_mask);
+ if (device_infos.empty()) {
+ return nullptr;
+ }
+
+ /* TODO(sergey): Use one of the already configured devices, so that OptiX denoising can happen on
+ * a physical CUDA device which is already used for rendering. */
+
+ /* TODO(sergey): Choose fastest device for denoising. */
+
+ const DeviceInfo denoiser_device_info = device_infos.front();
+
+ unique_ptr<Device> denoiser_device(
+ Device::create(denoiser_device_info, path_trace_device->stats, path_trace_device->profiler));
+
+ if (!denoiser_device) {
+ return nullptr;
+ }
+
+ if (denoiser_device->have_error()) {
+ return nullptr;
+ }
+
+ /* Only need denoising feature, everything else is unused. */
+ if (!denoiser_device->load_kernels(KERNEL_FEATURE_DENOISING)) {
+ return nullptr;
+ }
+
+ return denoiser_device;
+}
+
+Device *Denoiser::ensure_denoiser_device(Progress *progress)
+{
+ /* If the best device has already been found, avoid repeated lookups.
+ * Additionally, avoid re-creating the device if creation has failed once. */
+ if (denoiser_device_ || device_creation_attempted_) {
+ return denoiser_device_;
+ }
+
+ /* Simple case: rendering happens on a single device which also supports the denoiser. */
+ if (is_single_supported_device(path_trace_device_, params_.type)) {
+ denoiser_device_ = path_trace_device_;
+ return denoiser_device_;
+ }
+
+ /* Find the best device among the ones which are already used for rendering. */
+ denoiser_device_ = find_best_device(path_trace_device_, params_.type);
+ if (denoiser_device_) {
+ return denoiser_device_;
+ }
+
+ if (progress) {
+ progress->set_status("Loading denoising kernels (may take a few minutes the first time)");
+ }
+
+ device_creation_attempted_ = true;
+
+ const uint device_type_mask = get_device_type_mask();
+ local_denoiser_device_ = create_denoiser_device(path_trace_device_, device_type_mask);
+ denoiser_device_ = local_denoiser_device_.get();
+
+ return denoiser_device_;
+}
+
+CCL_NAMESPACE_END
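
For orientation, both helpers above rely on `DeviceInfo::denoisers` being a bitmask of supported denoiser types. A short sketch of the pattern (that the DenoiserType values are power-of-two flags is an assumption for illustration; only the `&` test itself is taken from the code above):

  /* A device supporting both denoisers would advertise both flags. */
  const uint supported = DENOISER_OPTIX | DENOISER_OPENIMAGEDENOISE;

  /* The check in is_single_supported_device() then reduces to a mask test. */
  const bool supports_optix = (supported & DENOISER_OPTIX) != 0;           /* true */
  const bool supports_oidn = (supported & DENOISER_OPENIMAGEDENOISE) != 0; /* true */
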
diff --git a/intern/cycles/integrator/denoiser.h b/intern/cycles/integrator/denoiser.h
new file mode 100644
index 00000000000..b02bcbeb046
--- /dev/null
+++ b/intern/cycles/integrator/denoiser.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+/* TODO(sergey): The integrator folder might not be the best place. It is easy to move files
+ * around once a better place is figured out. */
+
+#include "device/device.h"
+#include "device/device_denoise.h"
+#include "util/util_function.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+class Device;
+class RenderBuffers;
+class Progress;
+
+/* Implementation of a specific denoising algorithm.
+ *
+ * This class takes care of breaking the denoising algorithm down into a series of device calls,
+ * or into calls of an external API, to denoise the given input.
+ *
+ * TODO(sergey): Are we better off with a device or a queue here? */
+class Denoiser {
+ public:
+ /* Create denoiser for the given path trace device.
+ *
+ * Notes:
+ * - The denoiser must be configured. This means that `params.use` must be true.
+ * This is checked in debug builds.
+ * - The device might be MultiDevice. */
+ static unique_ptr<Denoiser> create(Device *path_trace_device, const DenoiseParams &params);
+
+ virtual ~Denoiser() = default;
+
+ void set_params(const DenoiseParams &params);
+ const DenoiseParams &get_params() const;
+
+ /* Create devices and load kernels needed for denoising.
+ * The progress is used to communicate state when kernels actually need to be loaded.
+ *
+ * NOTE: The `progress` is an optional argument and can be nullptr. */
+ virtual bool load_kernels(Progress *progress);
+
+ /* Denoise the entire buffer.
+ *
+ * The buffer parameters denote the effective parameters used during rendering. They could
+ * describe a lower-resolution render into a bigger allocated buffer, which is used in the
+ * viewport during navigation with a non-unit pixel size. Use them instead of
+ * render_buffers->params.
+ *
+ * The buffer might be coming from a device "foreign" to the one this denoiser was created
+ * for. This means that in the general case the denoiser will make sure the input data is
+ * available on the denoiser device, perform denoising, and put the data back onto the device
+ * the buffer came from.
+ *
+ * The `num_samples` corresponds to the number of samples in the render buffers. It is used
+ * to scale buffers down to the "final" value in algorithms which don't do automatic exposure,
+ * or which need the "final" value for data passes.
+ *
+ * The `allow_inplace_modification` flag means that the denoiser is allowed to do in-place
+ * modification of the input passes (e.g. scaling them down). This lowers the memory footprint
+ * of the denoiser but makes the input passes "invalid" from the path tracer's point of view.
+ *
+ * Returns true when all passes are denoised. Returns false if there is a denoiser error (for
+ * example, caused by a misconfigured denoiser) or when the user requested to cancel
+ * rendering. */
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) = 0;
+
+ /* Get the device which is used to perform the actual denoising.
+ *
+ * Notes:
+ *
+ * - The device is lazily initialized via `load_kernels()`, so it will be nullptr until then.
+ *
+ * - The device can be different from the path tracing device. This happens, for example, when
+ * using the OptiX denoiser while rendering on the CPU.
+ *
+ * - No thread safety is ensured by this call. This means it is up to the caller to ensure
+ * that there is no conflict between the denoising task lazily initializing the device and
+ * access to this device. */
+ Device *get_denoiser_device() const;
+
+ function<bool(void)> is_cancelled_cb;
+
+ bool is_cancelled() const
+ {
+ if (!is_cancelled_cb) {
+ return false;
+ }
+ return is_cancelled_cb();
+ }
+
+ protected:
+ Denoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ /* Make sure the denoising device is initialized. */
+ virtual Device *ensure_denoiser_device(Progress *progress);
+
+ /* Get the device type mask which is used to filter available devices when a new device needs
+ * to be created. */
+ virtual uint get_device_type_mask() const = 0;
+
+ Device *path_trace_device_;
+ DenoiseParams params_;
+
+ /* Cached pointer to the device on which denoising will happen.
+ * Used to avoid lookup of a device for every denoising request. */
+ Device *denoiser_device_ = nullptr;
+
+ /* Denoiser device which was created to perform denoising in the case that none of the
+ * rendering devices are capable of denoising. */
+ unique_ptr<Device> local_denoiser_device_;
+ bool device_creation_attempted_ = false;
+};
+
+CCL_NAMESPACE_END
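
A hedged sketch of how a session-level caller might drive the API declared above; the `device`, `buffers`, `progress` and `num_samples` names are caller-owned assumptions, not part of this patch:

  DenoiseParams params;
  params.use = true; /* Must be configured, as required by `create()`. */
  params.type = DENOISER_OPENIMAGEDENOISE;

  unique_ptr<Denoiser> denoiser = Denoiser::create(device, params);
  denoiser->is_cancelled_cb = [&]() { return progress.get_cancel(); };

  if (denoiser->load_kernels(&progress)) {
    denoiser->denoise_buffer(buffers->params, buffers, num_samples, false);
  }
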
diff --git a/intern/cycles/integrator/denoiser_device.cpp b/intern/cycles/integrator/denoiser_device.cpp
new file mode 100644
index 00000000000..e8361c50f2f
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_device.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser_device.h"
+
+#include "device/device.h"
+#include "device/device_denoise.h"
+#include "device/device_memory.h"
+#include "device/device_queue.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+
+CCL_NAMESPACE_BEGIN
+
+DeviceDenoiser::DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : Denoiser(path_trace_device, params)
+{
+}
+
+DeviceDenoiser::~DeviceDenoiser()
+{
+ /* Explicit implementation, to allow forward declaration of Device in the header. */
+}
+
+bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification)
+{
+ Device *denoiser_device = get_denoiser_device();
+ if (!denoiser_device) {
+ return false;
+ }
+
+ DeviceDenoiseTask task;
+ task.params = params_;
+ task.num_samples = num_samples;
+ task.buffer_params = buffer_params;
+ task.allow_inplace_modification = allow_inplace_modification;
+
+ RenderBuffers local_render_buffers(denoiser_device);
+ bool local_buffer_used = false;
+
+ if (denoiser_device == render_buffers->buffer.device) {
+ /* The device can access an existing buffer pointer. */
+ local_buffer_used = false;
+ task.render_buffers = render_buffers;
+ }
+ else {
+ VLOG(3) << "Creating temporary buffer on denoiser device.";
+
+ DeviceQueue *queue = denoiser_device->get_denoise_queue();
+
+ /* Create a buffer which is accessible by the device used by the denoiser. */
+
+ /* TODO(sergey): Optimize data transfers. For example, only copy denoising-related passes,
+ * ignoring other light and data passes. */
+
+ local_buffer_used = true;
+
+ render_buffers->copy_from_device();
+
+ local_render_buffers.reset(buffer_params);
+
+ /* NOTE: The local buffer is allocated for the exact size of the effective render, while
+ * the input render buffer is allocated for the lowest resolution divider possible. So it is
+ * important to only copy the actually needed part of the input buffer. */
+ memcpy(local_render_buffers.buffer.data(),
+ render_buffers->buffer.data(),
+ sizeof(float) * local_render_buffers.buffer.size());
+
+ queue->copy_to_device(local_render_buffers.buffer);
+
+ task.render_buffers = &local_render_buffers;
+ task.allow_inplace_modification = true;
+ }
+
+ const bool denoise_result = denoiser_device->denoise_buffer(task);
+
+ if (local_buffer_used) {
+ local_render_buffers.copy_from_device();
+
+ render_buffers_host_copy_denoised(
+ render_buffers, buffer_params, &local_render_buffers, local_render_buffers.params);
+
+ render_buffers->copy_to_device();
+ }
+
+ return denoise_result;
+}
+
+CCL_NAMESPACE_END
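
To make the cross-device path above concrete, a worked example with assumed numbers (a viewport render at resolution divider 2 inside a full-size allocation, 32 floats per pixel):

  const int64_t effective_width = 960, effective_height = 540;   /* what was rendered */
  const int64_t allocated_width = 1920, allocated_height = 1080; /* what was allocated */
  const int64_t pass_stride = 32;                                /* floats per pixel */

  /* The memcpy above copies exactly this many floats: the prefix of the host-side
   * input buffer which holds the effective render. */
  const size_t copy_floats = effective_width * effective_height * pass_stride;

  /* Copying the full allocation instead would overflow the smaller local buffer. */
  const size_t allocated_floats = allocated_width * allocated_height * pass_stride;
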
diff --git a/intern/cycles/integrator/denoiser_device.h b/intern/cycles/integrator/denoiser_device.h
new file mode 100644
index 00000000000..0fd934dba79
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_device.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Denoiser which uses a device-specific denoising implementation, such as the OptiX denoiser
+ * which is implemented as part of the driver of a specific device.
+ *
+ * This implementation makes sure the to-be-denoised buffer is available on the denoising device
+ * and invokes the denoising kernel via the device API. */
+class DeviceDenoiser : public Denoiser {
+ public:
+ DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params);
+ ~DeviceDenoiser();
+
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) override;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/denoiser_oidn.cpp b/intern/cycles/integrator/denoiser_oidn.cpp
new file mode 100644
index 00000000000..7fc2b2b1892
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_oidn.cpp
@@ -0,0 +1,628 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/denoiser_oidn.h"
+
+#include <array>
+
+#include "device/device.h"
+#include "device/device_queue.h"
+#include "integrator/pass_accessor_cpu.h"
+#include "render/buffers.h"
+#include "util/util_array.h"
+#include "util/util_logging.h"
+#include "util/util_openimagedenoise.h"
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+thread_mutex OIDNDenoiser::mutex_;
+
+OIDNDenoiser::OIDNDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : Denoiser(path_trace_device, params)
+{
+ DCHECK_EQ(params.type, DENOISER_OPENIMAGEDENOISE);
+
+ DCHECK(openimagedenoise_supported()) << "OpenImageDenoiser is not supported on this platform.";
+}
+
+#ifdef WITH_OPENIMAGEDENOISE
+static bool oidn_progress_monitor_function(void *user_ptr, double /*n*/)
+{
+ OIDNDenoiser *oidn_denoiser = reinterpret_cast<OIDNDenoiser *>(user_ptr);
+ return !oidn_denoiser->is_cancelled();
+}
+#endif
+
+#ifdef WITH_OPENIMAGEDENOISE
+
+class OIDNPass {
+ public:
+ OIDNPass() = default;
+
+ OIDNPass(const BufferParams &buffer_params,
+ const char *name,
+ PassType type,
+ PassMode mode = PassMode::NOISY)
+ : name(name), type(type), mode(mode)
+ {
+ offset = buffer_params.get_pass_offset(type, mode);
+ need_scale = (type == PASS_DENOISING_ALBEDO || type == PASS_DENOISING_NORMAL);
+
+ const PassInfo pass_info = Pass::get_info(type);
+ num_components = pass_info.num_components;
+ use_compositing = pass_info.use_compositing;
+ use_denoising_albedo = pass_info.use_denoising_albedo;
+ }
+
+ inline operator bool() const
+ {
+ return name[0] != '\0';
+ }
+
+ /* Name of an image which will be passed to the OIDN library.
+ * Should be one of the following: color, albedo, normal, output.
+ * The albedo and normal images are optional. */
+ const char *name = "";
+
+ PassType type = PASS_NONE;
+ PassMode mode = PassMode::NOISY;
+ int num_components = -1;
+ bool use_compositing = false;
+ bool use_denoising_albedo = true;
+
+ /* Offset of beginning of this pass in the render buffers. */
+ int offset = -1;
+
+ /* Denotes whether the data is to be scaled down by the number of samples.
+ * This is required for the albedo and normal passes. For the color pass OIDN will perform
+ * auto-exposure, so scaling is not needed there unless adaptive sampling is used.
+ *
+ * NOTE: Do not scale the output pass, as that is required to be a pointer into the original
+ * buffer. All the scaling on the output needed for integration with adaptive sampling will
+ * happen outside of generic pass handling. */
+ bool need_scale = false;
+
+ /* The content of the pass has been pre-filtered. */
+ bool is_filtered = false;
+
+ /* For the scaled passes, the data which holds values of scaled pixels. */
+ array<float> scaled_buffer;
+};
+
+class OIDNDenoiseContext {
+ public:
+ OIDNDenoiseContext(OIDNDenoiser *denoiser,
+ const DenoiseParams &denoise_params,
+ const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ const bool allow_inplace_modification)
+ : denoiser_(denoiser),
+ denoise_params_(denoise_params),
+ buffer_params_(buffer_params),
+ render_buffers_(render_buffers),
+ num_samples_(num_samples),
+ allow_inplace_modification_(allow_inplace_modification),
+ pass_sample_count_(buffer_params_.get_pass_offset(PASS_SAMPLE_COUNT))
+ {
+ if (denoise_params_.use_pass_albedo) {
+ oidn_albedo_pass_ = OIDNPass(buffer_params_, "albedo", PASS_DENOISING_ALBEDO);
+ }
+
+ if (denoise_params_.use_pass_normal) {
+ oidn_normal_pass_ = OIDNPass(buffer_params_, "normal", PASS_DENOISING_NORMAL);
+ }
+ }
+
+ bool need_denoising() const
+ {
+ if (buffer_params_.width == 0 && buffer_params_.height == 0) {
+ return false;
+ }
+
+ return true;
+ }
+
+ /* Make the guiding passes available for the sequential denoising of the various passes. */
+ void read_guiding_passes()
+ {
+ read_guiding_pass(oidn_albedo_pass_);
+ read_guiding_pass(oidn_normal_pass_);
+ }
+
+ void denoise_pass(const PassType pass_type)
+ {
+ OIDNPass oidn_color_pass(buffer_params_, "color", pass_type);
+ if (oidn_color_pass.offset == PASS_UNUSED) {
+ return;
+ }
+
+ if (oidn_color_pass.use_denoising_albedo) {
+ if (albedo_replaced_with_fake_) {
+ LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set.";
+ return;
+ }
+ }
+
+ OIDNPass oidn_output_pass(buffer_params_, "output", pass_type, PassMode::DENOISED);
+ if (oidn_output_pass.offset == PASS_UNUSED) {
+ LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type);
+ return;
+ }
+
+ OIDNPass oidn_color_access_pass = read_input_pass(oidn_color_pass, oidn_output_pass);
+
+ oidn::DeviceRef oidn_device = oidn::newDevice();
+ oidn_device.commit();
+
+ /* Create a filter for denoising a beauty (color) image using the prefiltered auxiliary
+ * images too. */
+ oidn::FilterRef oidn_filter = oidn_device.newFilter("RT");
+ set_input_pass(oidn_filter, oidn_color_access_pass);
+ set_guiding_passes(oidn_filter, oidn_color_pass);
+ set_output_pass(oidn_filter, oidn_output_pass);
+ oidn_filter.setProgressMonitorFunction(oidn_progress_monitor_function, denoiser_);
+ oidn_filter.set("hdr", true);
+ oidn_filter.set("srgb", false);
+ if (denoise_params_.prefilter == DENOISER_PREFILTER_NONE ||
+ denoise_params_.prefilter == DENOISER_PREFILTER_ACCURATE) {
+ oidn_filter.set("cleanAux", true);
+ }
+ oidn_filter.commit();
+
+ filter_guiding_pass_if_needed(oidn_device, oidn_albedo_pass_);
+ filter_guiding_pass_if_needed(oidn_device, oidn_normal_pass_);
+
+ /* Filter the beauty image. */
+ oidn_filter.execute();
+
+ /* Check for errors. */
+ const char *error_message;
+ const oidn::Error error = oidn_device.getError(error_message);
+ if (error != oidn::Error::None && error != oidn::Error::Cancelled) {
+ LOG(ERROR) << "OpenImageDenoise error: " << error_message;
+ }
+
+ postprocess_output(oidn_color_pass, oidn_output_pass);
+ }
+
+ protected:
+ void filter_guiding_pass_if_needed(oidn::DeviceRef &oidn_device, OIDNPass &oidn_pass)
+ {
+ if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE || !oidn_pass ||
+ oidn_pass.is_filtered) {
+ return;
+ }
+
+ oidn::FilterRef oidn_filter = oidn_device.newFilter("RT");
+ set_pass(oidn_filter, oidn_pass);
+ set_output_pass(oidn_filter, oidn_pass);
+ oidn_filter.commit();
+ oidn_filter.execute();
+
+ oidn_pass.is_filtered = true;
+ }
+
+ /* Make pixels of a guiding pass available to the denoiser. */
+ void read_guiding_pass(OIDNPass &oidn_pass)
+ {
+ if (!oidn_pass) {
+ return;
+ }
+
+ DCHECK(!oidn_pass.use_compositing);
+
+ if (denoise_params_.prefilter != DENOISER_PREFILTER_ACCURATE &&
+ !is_pass_scale_needed(oidn_pass)) {
+ /* Pass data is available as-is from the render buffers. */
+ return;
+ }
+
+ if (allow_inplace_modification_) {
+ scale_pass_in_render_buffers(oidn_pass);
+ return;
+ }
+
+ read_pass_pixels_into_buffer(oidn_pass);
+ }
+
+ /* Special reader of the input pass.
+ * To save memory it will read pixels into the output, and let the denoiser perform an
+ * in-place operation. */
+ OIDNPass read_input_pass(OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass)
+ {
+ const bool use_compositing = oidn_input_pass.use_compositing;
+
+ /* Simple case: no compositing is involved, no scaling is needed.
+ * The pass pixels will be referenced as-is, without extra processing. */
+ if (!use_compositing && !is_pass_scale_needed(oidn_input_pass)) {
+ return oidn_input_pass;
+ }
+
+ float *buffer_data = render_buffers_->buffer.data();
+ float *pass_data = buffer_data + oidn_output_pass.offset;
+
+ PassAccessor::Destination destination(pass_data, 3);
+ destination.pixel_stride = buffer_params_.pass_stride;
+
+ read_pass_pixels(oidn_input_pass, destination);
+
+ OIDNPass oidn_input_pass_at_output = oidn_input_pass;
+ oidn_input_pass_at_output.offset = oidn_output_pass.offset;
+
+ return oidn_input_pass_at_output;
+ }
+
+ /* Read pass pixels using PassAccessor into the given destination. */
+ void read_pass_pixels(const OIDNPass &oidn_pass, const PassAccessor::Destination &destination)
+ {
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = oidn_pass.type;
+ pass_access_info.mode = oidn_pass.mode;
+ pass_access_info.offset = oidn_pass.offset;
+
+ /* The denoiser operates on passes which are used to calculate the approximation, and is
+ * never used on the approximation itself. The latter is not even possible because OIDN does
+ * not support denoising of semi-transparent pixels. */
+ pass_access_info.use_approximate_shadow_catcher = false;
+ pass_access_info.use_approximate_shadow_catcher_background = false;
+ pass_access_info.show_active_pixels = false;
+
+ /* OIDN will perform auto-exposure, so it is not required to know the exact exposure
+ * configured by the user. What is important is to use the same exposure for read and write
+ * access of the pass pixels. */
+ const PassAccessorCPU pass_accessor(pass_access_info, 1.0f, num_samples_);
+
+ pass_accessor.get_render_tile_pixels(render_buffers_, buffer_params_, destination);
+ }
+
+ /* Read pass pixels using PassAccessor into a temporary buffer which is owned by the pass. */
+ void read_pass_pixels_into_buffer(OIDNPass &oidn_pass)
+ {
+ VLOG(3) << "Allocating temporary buffer for pass " << oidn_pass.name << " ("
+ << pass_type_as_string(oidn_pass.type) << ")";
+
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ array<float> &scaled_buffer = oidn_pass.scaled_buffer;
+ scaled_buffer.resize(width * height * 3);
+
+ const PassAccessor::Destination destination(scaled_buffer.data(), 3);
+
+ read_pass_pixels(oidn_pass, destination);
+ }
+
+ /* Set OIDN image to reference pixels from the given render buffer pass.
+ * No transform to the pixels is done, no additional memory is used. */
+ void set_pass_referenced(oidn::FilterRef &oidn_filter,
+ const char *name,
+ const OIDNPass &oidn_pass)
+ {
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+
+ const int64_t pixel_index = offset + x + y * stride;
+ const int64_t buffer_offset = pixel_index * pass_stride;
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ oidn_filter.setImage(name,
+ buffer_data + buffer_offset + oidn_pass.offset,
+ oidn::Format::Float3,
+ width,
+ height,
+ 0,
+ pass_stride * sizeof(float),
+ stride * pass_stride * sizeof(float));
+ }
+
+ void set_pass_from_buffer(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass)
+ {
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ oidn_filter.setImage(
+ name, oidn_pass.scaled_buffer.data(), oidn::Format::Float3, width, height, 0, 0, 0);
+ }
+
+ void set_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass(oidn_filter, oidn_pass.name, oidn_pass);
+ }
+ void set_pass(oidn::FilterRef &oidn_filter, const char *name, OIDNPass &oidn_pass)
+ {
+ if (oidn_pass.scaled_buffer.empty()) {
+ set_pass_referenced(oidn_filter, name, oidn_pass);
+ }
+ else {
+ set_pass_from_buffer(oidn_filter, name, oidn_pass);
+ }
+ }
+
+ void set_input_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass_referenced(oidn_filter, oidn_pass.name, oidn_pass);
+ }
+
+ void set_guiding_passes(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ if (oidn_albedo_pass_) {
+ if (oidn_pass.use_denoising_albedo) {
+ set_pass(oidn_filter, oidn_albedo_pass_);
+ }
+ else {
+ /* NOTE: The OpenImageDenoise library implicitly expects an albedo pass when a normal
+ * pass has been provided. */
+ set_fake_albedo_pass(oidn_filter);
+ }
+ }
+
+ if (oidn_normal_pass_) {
+ set_pass(oidn_filter, oidn_normal_pass_);
+ }
+ }
+
+ void set_fake_albedo_pass(oidn::FilterRef &oidn_filter)
+ {
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+
+ if (!albedo_replaced_with_fake_) {
+ const int64_t num_pixel_components = width * height * 3;
+ oidn_albedo_pass_.scaled_buffer.resize(num_pixel_components);
+
+ for (int i = 0; i < num_pixel_components; ++i) {
+ oidn_albedo_pass_.scaled_buffer[i] = 0.5f;
+ }
+
+ albedo_replaced_with_fake_ = true;
+ }
+
+ set_pass(oidn_filter, oidn_albedo_pass_);
+ }
+
+ void set_output_pass(oidn::FilterRef &oidn_filter, OIDNPass &oidn_pass)
+ {
+ set_pass(oidn_filter, "output", oidn_pass);
+ }
+
+ /* Scale the output pass to match the adaptive sampling per-pixel scale, and bring the alpha
+ * channel back. */
+ void postprocess_output(const OIDNPass &oidn_input_pass, const OIDNPass &oidn_output_pass)
+ {
+ kernel_assert(oidn_input_pass.num_components == oidn_output_pass.num_components);
+
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+ const int64_t row_stride = stride * pass_stride;
+
+ const int64_t pixel_offset = offset + x + y * stride;
+ const int64_t buffer_offset = (pixel_offset * pass_stride);
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED);
+ const bool need_scale = has_pass_sample_count || oidn_input_pass.use_compositing;
+
+ for (int y = 0; y < height; ++y) {
+ float *buffer_row = buffer_data + buffer_offset + y * row_stride;
+ for (int x = 0; x < width; ++x) {
+ float *buffer_pixel = buffer_row + x * pass_stride;
+ float *denoised_pixel = buffer_pixel + oidn_output_pass.offset;
+
+ if (need_scale) {
+ const float pixel_scale = has_pass_sample_count ?
+ __float_as_uint(buffer_pixel[pass_sample_count_]) :
+ num_samples_;
+
+ denoised_pixel[0] = denoised_pixel[0] * pixel_scale;
+ denoised_pixel[1] = denoised_pixel[1] * pixel_scale;
+ denoised_pixel[2] = denoised_pixel[2] * pixel_scale;
+ }
+
+ if (oidn_output_pass.num_components == 3) {
+ /* Pass without alpha channel. */
+ }
+ else if (!oidn_input_pass.use_compositing) {
+ /* Currently compositing passes are either 3-component (derived by dividing light passes)
+ * or do not have transparency (shadow catcher). Implicitly rely on this, as it
+ * simplifies the logic and avoids extra memory allocation. */
+ const float *noisy_pixel = buffer_pixel + oidn_input_pass.offset;
+ denoised_pixel[3] = noisy_pixel[3];
+ }
+ else {
+ /* Assign zero since this is the default alpha value for 3-component passes, and it
+ * is an opaque pixel for 4-component passes. */
+ denoised_pixel[3] = 0;
+ }
+ }
+ }
+ }
+
+ bool is_pass_scale_needed(OIDNPass &oidn_pass) const
+ {
+ if (pass_sample_count_ != PASS_UNUSED) {
+ /* With adaptive sampling pixels will have different number of samples in them, so need to
+ * always scale the pass to make pixels uniformly sampled. */
+ return true;
+ }
+
+ if (!oidn_pass.need_scale) {
+ return false;
+ }
+
+ if (num_samples_ == 1) {
+ /* If the avoid scaling if there is only one sample, to save up time (so we don't divide
+ * buffer by 1). */
+ return false;
+ }
+
+ return true;
+ }
+
+ void scale_pass_in_render_buffers(OIDNPass &oidn_pass)
+ {
+ const int64_t x = buffer_params_.full_x;
+ const int64_t y = buffer_params_.full_y;
+ const int64_t width = buffer_params_.width;
+ const int64_t height = buffer_params_.height;
+ const int64_t offset = buffer_params_.offset;
+ const int64_t stride = buffer_params_.stride;
+ const int64_t pass_stride = buffer_params_.pass_stride;
+ const int64_t row_stride = stride * pass_stride;
+
+ const int64_t pixel_offset = offset + x + y * stride;
+ const int64_t buffer_offset = (pixel_offset * pass_stride);
+
+ float *buffer_data = render_buffers_->buffer.data();
+
+ const bool has_pass_sample_count = (pass_sample_count_ != PASS_UNUSED);
+
+ for (int y = 0; y < height; ++y) {
+ float *buffer_row = buffer_data + buffer_offset + y * row_stride;
+ for (int x = 0; x < width; ++x) {
+ float *buffer_pixel = buffer_row + x * pass_stride;
+ float *pass_pixel = buffer_pixel + oidn_pass.offset;
+
+ const float pixel_scale = 1.0f / (has_pass_sample_count ?
+ __float_as_uint(buffer_pixel[pass_sample_count_]) :
+ num_samples_);
+
+ pass_pixel[0] = pass_pixel[0] * pixel_scale;
+ pass_pixel[1] = pass_pixel[1] * pixel_scale;
+ pass_pixel[2] = pass_pixel[2] * pixel_scale;
+ }
+ }
+ }
+
+ OIDNDenoiser *denoiser_ = nullptr;
+
+ const DenoiseParams &denoise_params_;
+ const BufferParams &buffer_params_;
+ RenderBuffers *render_buffers_ = nullptr;
+ int num_samples_ = 0;
+ bool allow_inplace_modification_ = false;
+ int pass_sample_count_ = PASS_UNUSED;
+
+ /* Optional albedo and normal passes, reused by denoising of different pass types. */
+ OIDNPass oidn_albedo_pass_;
+ OIDNPass oidn_normal_pass_;
+
+ /* For passes which don't need an albedo channel for denoising we replace the actual albedo
+ * with (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with
+ * the fake values, and denoising of passes which do need albedo can no longer happen. */
+ bool albedo_replaced_with_fake_ = false;
+};
+#endif
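
To make the scaling convention above concrete, a tiny worked example (values assumed): passes are stored accumulated over samples, the denoiser wants averaged values, and the output is scaled back afterwards.

  const float num_samples = 64.0f; /* per-pixel count under adaptive sampling */
  const float accumulated = 12.8f; /* value as stored in the render buffer */

  /* scale_pass_in_render_buffers(): bring the pass into per-sample range. */
  const float per_sample = accumulated * (1.0f / num_samples); /* = 0.2f */

  /* postprocess_output(): restore the accumulation convention after denoising. */
  const float restored = per_sample * num_samples; /* = 12.8f again */
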
+
+static unique_ptr<DeviceQueue> create_device_queue(const RenderBuffers *render_buffers)
+{
+ Device *device = render_buffers->buffer.device;
+ if (device->info.has_gpu_queue) {
+ return device->gpu_queue_create();
+ }
+ return nullptr;
+}
+
+static void copy_render_buffers_from_device(unique_ptr<DeviceQueue> &queue,
+ RenderBuffers *render_buffers)
+{
+ if (queue) {
+ queue->copy_from_device(render_buffers->buffer);
+ queue->synchronize();
+ }
+ else {
+ render_buffers->copy_from_device();
+ }
+}
+
+static void copy_render_buffers_to_device(unique_ptr<DeviceQueue> &queue,
+ RenderBuffers *render_buffers)
+{
+ if (queue) {
+ queue->copy_to_device(render_buffers->buffer);
+ queue->synchronize();
+ }
+ else {
+ render_buffers->copy_to_device();
+ }
+}
+
+bool OIDNDenoiser::denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification)
+{
+ thread_scoped_lock lock(mutex_);
+
+ /* Make sure the host-side data is available for denoising. */
+ unique_ptr<DeviceQueue> queue = create_device_queue(render_buffers);
+ copy_render_buffers_from_device(queue, render_buffers);
+
+#ifdef WITH_OPENIMAGEDENOISE
+ OIDNDenoiseContext context(
+ this, params_, buffer_params, render_buffers, num_samples, allow_inplace_modification);
+
+ if (context.need_denoising()) {
+ context.read_guiding_passes();
+
+ const std::array<PassType, 3> passes = {
+ {/* Passes which will use real albedo when it is available. */
+ PASS_COMBINED,
+ PASS_SHADOW_CATCHER_MATTE,
+
+ /* Passes which do not need albedo, hence if the real one is present it needs to become
+ * fake. */
+ PASS_SHADOW_CATCHER}};
+
+ for (const PassType pass_type : passes) {
+ context.denoise_pass(pass_type);
+ if (is_cancelled()) {
+ return false;
+ }
+ }
+
+ /* TODO: It may be possible to avoid this copy, but we have to ensure that when other code
+ * copies data from the device it doesn't overwrite the denoiser buffers. */
+ copy_render_buffers_to_device(queue, render_buffers);
+ }
+#endif
+
+ /* This code is not supposed to run when compiled without OIDN support, so we can assume
+ * that if we made it here all passes were properly denoised. */
+ return true;
+}
+
+uint OIDNDenoiser::get_device_type_mask() const
+{
+ return DEVICE_MASK_CPU;
+}
+
+CCL_NAMESPACE_END
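
For readers unfamiliar with the library, a minimal self-contained sketch of the OIDN call sequence this file builds on, using plain std::vector buffers instead of Cycles types (buffer contents and error handling are assumed/abbreviated):

  #include <OpenImageDenoise/oidn.hpp>

  #include <vector>

  void denoise_example(const int width, const int height)
  {
    std::vector<float> color(width * height * 3);  /* noisy beauty, filled elsewhere */
    std::vector<float> output(width * height * 3); /* denoised result */

    oidn::DeviceRef device = oidn::newDevice();
    device.commit();

    oidn::FilterRef filter = device.newFilter("RT"); /* generic ray tracing filter */
    filter.setImage("color", color.data(), oidn::Format::Float3, width, height);
    filter.setImage("output", output.data(), oidn::Format::Float3, width, height);
    filter.set("hdr", true); /* the beauty pass is HDR, matching the setup above */
    filter.commit();
    filter.execute();

    const char *error_message;
    if (device.getError(error_message) != oidn::Error::None) {
      /* Handle or log `error_message`. */
    }
  }
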
diff --git a/intern/cycles/integrator/denoiser_oidn.h b/intern/cycles/integrator/denoiser_oidn.h
new file mode 100644
index 00000000000..566e761ae79
--- /dev/null
+++ b/intern/cycles/integrator/denoiser_oidn.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Implementation of denoising API which uses OpenImageDenoise library. */
+class OIDNDenoiser : public Denoiser {
+ public:
+ /* Forward-declared state which might use compile-flag-specific fields, such as the
+ * OpenImageDenoise device and filter handles. */
+ class State;
+
+ OIDNDenoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ virtual bool denoise_buffer(const BufferParams &buffer_params,
+ RenderBuffers *render_buffers,
+ const int num_samples,
+ bool allow_inplace_modification) override;
+
+ protected:
+ virtual uint get_device_type_mask() const override;
+
+ /* We only perform one denoising operation at a time, since OpenImageDenoise itself is
+ * multi-threaded. Use this mutex whenever images are passed to OIDN and need to be denoised. */
+ static thread_mutex mutex_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl b/intern/cycles/integrator/denoiser_optix.cpp
index ed64ae01aae..5f9de23bfe6 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_direct_lighting.cl
+++ b/intern/cycles/integrator/denoiser_optix.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,13 +14,21 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_direct_lighting.h"
+#include "integrator/denoiser_optix.h"
-#define KERNEL_NAME direct_lighting
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
+#include "device/device.h"
+#include "device/device_denoise.h"
+CCL_NAMESPACE_BEGIN
+
+OptiXDenoiser::OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params)
+ : DeviceDenoiser(path_trace_device, params)
+{
+}
+
+uint OptiXDenoiser::get_device_type_mask() const
+{
+ return DEVICE_MASK_OPTIX;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl b/intern/cycles/integrator/denoiser_optix.h
index c314dc96c33..a8df770ecf7 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_lamp_emission.cl
+++ b/intern/cycles/integrator/denoiser_optix.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2015 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,18 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_lamp_emission.h"
+#pragma once
-#define KERNEL_NAME lamp_emission
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+#include "integrator/denoiser_device.h"
+CCL_NAMESPACE_BEGIN
+
+class OptiXDenoiser : public DeviceDenoiser {
+ public:
+ OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params);
+
+ protected:
+ virtual uint get_device_type_mask() const override;
+};
+
+CCL_NAMESPACE_END
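
The OptiX wrapper above is the complete pattern for a device-backed denoiser: subclass DeviceDenoiser and supply a device type mask; the shared buffer handling lives in the base class. A hypothetical future backend (all names invented for illustration) would mirror it:

  class FooDenoiser : public DeviceDenoiser {
   public:
    FooDenoiser(Device *path_trace_device, const DenoiseParams &params)
        : DeviceDenoiser(path_trace_device, params)
    {
    }

   protected:
    virtual uint get_device_type_mask() const override
    {
      return DEVICE_MASK_FOO; /* Hypothetical mask, by analogy with DEVICE_MASK_OPTIX. */
    }
  };
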
diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
new file mode 100644
index 00000000000..87c048b1fa5
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor.h"
+
+#include "render/buffers.h"
+#include "util/util_logging.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/kernel_types.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Pass input information.
+ */
+
+PassAccessor::PassAccessInfo::PassAccessInfo(const BufferPass &pass)
+ : type(pass.type), mode(pass.mode), include_albedo(pass.include_albedo), offset(pass.offset)
+{
+}
+
+/* --------------------------------------------------------------------
+ * Pass destination.
+ */
+
+PassAccessor::Destination::Destination(float *pixels, int num_components)
+ : pixels(pixels), num_components(num_components)
+{
+}
+
+PassAccessor::Destination::Destination(const PassType pass_type, half4 *pixels)
+ : Destination(pass_type)
+{
+ pixels_half_rgba = pixels;
+}
+
+PassAccessor::Destination::Destination(const PassType pass_type)
+{
+ const PassInfo pass_info = Pass::get_info(pass_type);
+ num_components = pass_info.num_components;
+}
+
+/* --------------------------------------------------------------------
+ * Pass source.
+ */
+
+PassAccessor::Source::Source(const float *pixels, int num_components)
+ : pixels(pixels), num_components(num_components)
+{
+}
+
+/* --------------------------------------------------------------------
+ * Pass accessor.
+ */
+
+PassAccessor::PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples)
+ : pass_access_info_(pass_access_info), exposure_(exposure), num_samples_(num_samples)
+{
+}
+
+bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const Destination &destination) const
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ return get_render_tile_pixels(render_buffers, render_buffers->params, destination);
+}
+
+static void pad_pixels(const BufferParams &buffer_params,
+ const PassAccessor::Destination &destination,
+ const int src_num_components)
+{
+ /* When requesting a single channel pass as RGBA, or RGB pass as RGBA,
+ * fill in the additional components for convenience. */
+ const int dest_num_components = destination.num_components;
+
+ if (src_num_components >= dest_num_components) {
+ return;
+ }
+
+ const size_t size = buffer_params.width * buffer_params.height;
+ if (destination.pixels) {
+ float *pixel = destination.pixels;
+
+ for (size_t i = 0; i < size; i++, pixel += dest_num_components) {
+ if (dest_num_components >= 3 && src_num_components == 1) {
+ pixel[1] = pixel[0];
+ pixel[2] = pixel[0];
+ }
+ if (dest_num_components >= 4) {
+ pixel[3] = 1.0f;
+ }
+ }
+ }
+
+ if (destination.pixels_half_rgba) {
+ const half one = float_to_half(1.0f);
+ half4 *pixel = destination.pixels_half_rgba;
+
+ for (size_t i = 0; i < size; i++, pixel++) {
+ if (dest_num_components >= 3 && src_num_components == 1) {
+ pixel[0].y = pixel[0].x;
+ pixel[0].z = pixel[0].x;
+ }
+ if (dest_num_components >= 4) {
+ pixel[0].w = one;
+ }
+ }
+ }
+}
+
+bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ if (pass_access_info_.offset == PASS_UNUSED) {
+ return false;
+ }
+
+ const PassType type = pass_access_info_.type;
+ const PassMode mode = pass_access_info_.mode;
+ const PassInfo pass_info = Pass::get_info(type, pass_access_info_.include_albedo);
+
+ if (pass_info.num_components == 1) {
+ /* Single channel passes. */
+ if (mode == PassMode::DENOISED) {
+ /* Denoised passes store their final pixels, no need for special calculation. */
+ get_pass_float(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_RENDER_TIME) {
+ /* TODO(sergey): Needs implementation. */
+ }
+ else if (type == PASS_DEPTH) {
+ get_pass_depth(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_MIST) {
+ get_pass_mist(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_SAMPLE_COUNT) {
+ get_pass_sample_count(render_buffers, buffer_params, destination);
+ }
+ else {
+ get_pass_float(render_buffers, buffer_params, destination);
+ }
+ }
+ else if (type == PASS_MOTION) {
+ /* Motion pass. */
+ DCHECK_EQ(destination.num_components, 4) << "Motion pass must have 4 components";
+ get_pass_motion(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_CRYPTOMATTE) {
+ /* Cryptomatte pass. */
+ DCHECK_EQ(destination.num_components, 4) << "Cryptomatte pass must have 4 components";
+ get_pass_cryptomatte(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* RGB, RGBA and vector passes. */
+ DCHECK(destination.num_components == 3 || destination.num_components == 4)
+ << pass_type_as_string(type) << " pass must have 3 or 4 components";
+
+ if (type == PASS_SHADOW_CATCHER_MATTE && pass_access_info_.use_approximate_shadow_catcher) {
+ /* Denoised matte with shadow needs to do a calculation (it will use the denoised shadow
+ * catcher pass to approximate the shadow with). */
+ get_pass_shadow_catcher_matte_with_shadow(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_SHADOW_CATCHER && mode != PassMode::DENOISED) {
+ /* Shadow catcher pass. */
+ get_pass_shadow_catcher(render_buffers, buffer_params, destination);
+ }
+ else if ((pass_info.divide_type != PASS_NONE || pass_info.direct_type != PASS_NONE ||
+ pass_info.indirect_type != PASS_NONE) &&
+ mode != PassMode::DENOISED) {
+ /* RGB lighting passes that need to divide out color and/or sum direct and indirect. */
+ get_pass_light_path(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* Passes that need no special computation, or denoised passes that already
+ * had the computation done. */
+ if (pass_info.num_components == 3) {
+ get_pass_float3(render_buffers, buffer_params, destination);
+ }
+ else if (pass_info.num_components == 4) {
+ if (destination.num_components == 3) {
+ /* Special case for denoiser access of RGBA passes ignoring alpha channel. */
+ get_pass_float3(render_buffers, buffer_params, destination);
+ }
+ else if (type == PASS_COMBINED || type == PASS_SHADOW_CATCHER ||
+ type == PASS_SHADOW_CATCHER_MATTE) {
+ /* Passes with transparency as 4th component. */
+ get_pass_combined(render_buffers, buffer_params, destination);
+ }
+ else {
+ /* Passes with alpha as 4th component. */
+ get_pass_float4(render_buffers, buffer_params, destination);
+ }
+ }
+ }
+ }
+
+ pad_pixels(buffer_params, destination, pass_info.num_components);
+
+ return true;
+}
+
+void PassAccessor::init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ const PassMode mode = pass_access_info_.mode;
+ const PassInfo &pass_info = Pass::get_info(pass_access_info_.type,
+ pass_access_info_.include_albedo);
+
+ kfilm_convert->pass_offset = pass_access_info_.offset;
+ kfilm_convert->pass_stride = buffer_params.pass_stride;
+
+ kfilm_convert->pass_use_exposure = pass_info.use_exposure;
+ kfilm_convert->pass_use_filter = pass_info.use_filter;
+
+ /* TODO(sergey): Some of the passes need to become denoised when the denoised pass is
+ * accessed. */
+ if (pass_info.direct_type != PASS_NONE) {
+ kfilm_convert->pass_offset = buffer_params.get_pass_offset(pass_info.direct_type);
+ }
+ kfilm_convert->pass_indirect = buffer_params.get_pass_offset(pass_info.indirect_type);
+ kfilm_convert->pass_divide = buffer_params.get_pass_offset(pass_info.divide_type);
+
+ kfilm_convert->pass_combined = buffer_params.get_pass_offset(PASS_COMBINED);
+ kfilm_convert->pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+ kfilm_convert->pass_adaptive_aux_buffer = buffer_params.get_pass_offset(
+ PASS_ADAPTIVE_AUX_BUFFER);
+ kfilm_convert->pass_motion_weight = buffer_params.get_pass_offset(PASS_MOTION_WEIGHT);
+ kfilm_convert->pass_shadow_catcher = buffer_params.get_pass_offset(PASS_SHADOW_CATCHER, mode);
+ kfilm_convert->pass_shadow_catcher_sample_count = buffer_params.get_pass_offset(
+ PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ kfilm_convert->pass_shadow_catcher_matte = buffer_params.get_pass_offset(
+ PASS_SHADOW_CATCHER_MATTE, mode);
+
+ /* Background is not denoised, so always use noisy pass. */
+ kfilm_convert->pass_background = buffer_params.get_pass_offset(PASS_BACKGROUND);
+
+ if (pass_info.use_filter) {
+ kfilm_convert->scale = num_samples_ != 0 ? 1.0f / num_samples_ : 0.0f;
+ }
+ else {
+ kfilm_convert->scale = 1.0f;
+ }
+
+ if (pass_info.use_exposure) {
+ kfilm_convert->exposure = exposure_;
+ }
+ else {
+ kfilm_convert->exposure = 1.0f;
+ }
+
+ kfilm_convert->scale_exposure = kfilm_convert->scale * kfilm_convert->exposure;
+
+ kfilm_convert->use_approximate_shadow_catcher = pass_access_info_.use_approximate_shadow_catcher;
+ kfilm_convert->use_approximate_shadow_catcher_background =
+ pass_access_info_.use_approximate_shadow_catcher_background;
+ kfilm_convert->show_active_pixels = pass_access_info_.show_active_pixels;
+
+ kfilm_convert->num_components = destination.num_components;
+ kfilm_convert->pixel_stride = destination.pixel_stride ? destination.pixel_stride :
+ destination.num_components;
+
+ kfilm_convert->is_denoised = (mode == PassMode::DENOISED);
+}
+
+bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source)
+{
+ if (render_buffers == nullptr || render_buffers->buffer.data() == nullptr) {
+ return false;
+ }
+
+ const PassInfo pass_info = Pass::get_info(pass_access_info_.type,
+ pass_access_info_.include_albedo);
+
+ const BufferParams &buffer_params = render_buffers->params;
+
+ float *buffer_data = render_buffers->buffer.data();
+ const int size = buffer_params.width * buffer_params.height;
+
+ const int out_stride = buffer_params.pass_stride;
+ const int in_stride = source.num_components;
+ const int num_components_to_copy = min(source.num_components, pass_info.num_components);
+
+ float *out = buffer_data + pass_access_info_.offset;
+ const float *in = source.pixels + source.offset * in_stride;
+
+ for (int i = 0; i < size; i++, out += out_stride, in += in_stride) {
+ memcpy(out, in, sizeof(float) * num_components_to_copy);
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
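
A small illustration of pad_pixels() above (values assumed): requesting a single-channel pass, such as depth, into a 4-component float destination replicates the scalar into RGB and sets alpha to one.

  /* After the scalar accessor wrote into component 0 of a destination pixel: */
  float pixel[4] = {2.5f, 0.0f, 0.0f, 0.0f};

  /* pad_pixels() then fills in the remaining components for convenience: */
  pixel[1] = pixel[0]; /* 2.5f */
  pixel[2] = pixel[0]; /* 2.5f */
  pixel[3] = 1.0f;     /* opaque alpha */
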
diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h
new file mode 100644
index 00000000000..624bf7d0b2c
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "render/pass.h"
+#include "util/util_half.h"
+#include "util/util_string.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class RenderBuffers;
+class BufferPass;
+class BufferParams;
+struct KernelFilmConvert;
+
+/* Helper class which allows access to pass data.
+ * It is designed so that it is created once when the pass data is known, and then pixels get
+ * progressively updated from various render buffers. */
+class PassAccessor {
+ public:
+ class PassAccessInfo {
+ public:
+ PassAccessInfo() = default;
+ explicit PassAccessInfo(const BufferPass &pass);
+
+ PassType type = PASS_NONE;
+ PassMode mode = PassMode::NOISY;
+ bool include_albedo = false;
+ int offset = -1;
+
+ /* For the shadow catcher matte pass: whether to approximate the shadow catcher pass into its
+ * matte pass, so that both artificial objects and shadows can be alpha-overed onto a
+ * backdrop. */
+ bool use_approximate_shadow_catcher = false;
+
+ /* When the approximate shadow catcher matte is used, alpha-over the result on top of the
+ * background. */
+ bool use_approximate_shadow_catcher_background = false;
+
+ bool show_active_pixels = false;
+ };
+
+ class Destination {
+ public:
+ Destination() = default;
+ Destination(float *pixels, int num_components);
+ Destination(const PassType pass_type, half4 *pixels);
+
+ /* Destination will be initialized with the number of components which is native for the given
+ * pass type. */
+ explicit Destination(const PassType pass_type);
+
+ /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+ float *pixels = nullptr;
+ half4 *pixels_half_rgba = nullptr;
+
+ /* Device-side pointers. */
+ device_ptr d_pixels = 0;
+ device_ptr d_pixels_half_rgba = 0;
+
+ /* Number of components per pixel in the floating-point destination.
+ * Is ignored for the half4 destination (where the number of components is implied to be 4). */
+ int num_components = 0;
+
+ /* Offset in pixels from the beginning of the pixels storage.
+ * Allows writing pixels of the render buffer into a partial slice of the destination. */
+ int offset = 0;
+
+ /* Number of floats per pixel. When zero it is the same as `num_components`.
+ *
+ * NOTE: Is ignored for the half4 destination, as the half4 pixels are always 4-component
+ * half-floats. */
+ int pixel_stride = 0;
+
+ /* Row stride in pixel elements:
+ * - For the float destination the stride is the number of floats per row.
+ * - For the half4 destination the stride is the number of half4 elements per row. */
+ int stride = 0;
+ };
+
+ class Source {
+ public:
+ Source() = default;
+ Source(const float *pixels, int num_components);
+
+ /* CPU-side pointers. Only usable by the `PassAccessorCPU`. */
+ const float *pixels = nullptr;
+ int num_components = 0;
+
+ /* Offset in pixels from the beginning of the pixels storage.
+ * Allows setting pixels of the render buffer from a partial slice of the source. */
+ int offset = 0;
+ };
+
+ PassAccessor(const PassAccessInfo &pass_access_info, float exposure, int num_samples);
+
+ virtual ~PassAccessor() = default;
+
+ /* Get pass data from the given render buffers, perform needed filtering, and store the
+ * result into the destination pixels.
+ * The result is stored sequentially starting from the very beginning of the pixels memory. */
+ bool get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const Destination &destination) const;
+ bool get_render_tile_pixels(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+ /* Set pass data for the given render buffers. Used for baking to read from passes. */
+ bool set_render_tile_pixels(RenderBuffers *render_buffers, const Source &source);
+
+ protected:
+ virtual void init_kernel_film_convert(KernelFilmConvert *kfilm_convert,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const = 0;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth)
+ DECLARE_PASS_ACCESSOR(mist)
+ DECLARE_PASS_ACCESSOR(sample_count)
+ DECLARE_PASS_ACCESSOR(float)
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path)
+ DECLARE_PASS_ACCESSOR(shadow_catcher)
+ DECLARE_PASS_ACCESSOR(float3)
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion)
+ DECLARE_PASS_ACCESSOR(cryptomatte)
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+ DECLARE_PASS_ACCESSOR(combined)
+ DECLARE_PASS_ACCESSOR(float4)
+
+#undef DECLARE_PASS_ACCESSOR
+
+ PassAccessInfo pass_access_info_;
+
+ float exposure_ = 0.0f;
+ int num_samples_ = 0;
+};
+
+CCL_NAMESPACE_END
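
A hedged usage sketch of the accessor API above, reading a denoised combined pass into a caller-owned float buffer via the CPU implementation from the files below (`buffers` and `num_samples` are caller-owned assumptions):

  PassAccessor::PassAccessInfo access_info;
  access_info.type = PASS_COMBINED;
  access_info.mode = PassMode::DENOISED;
  access_info.offset = buffers->params.get_pass_offset(PASS_COMBINED, PassMode::DENOISED);

  const PassAccessorCPU accessor(access_info, 1.0f /* exposure */, num_samples);

  vector<float> pixels(buffers->params.width * buffers->params.height * 4);
  PassAccessor::Destination destination(pixels.data(), 4 /* num_components */);

  accessor.get_render_tile_pixels(buffers, destination);
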
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
new file mode 100644
index 00000000000..3c6691f6d43
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor_cpu.h"
+
+#include "render/buffers.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/kernel_types.h"
+#include "kernel/kernel_film.h"
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Kernel processing.
+ */
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ KernelFilmConvert kfilm_convert;
+ init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
+
+ if (destination.pixels) {
+ /* NOTE: No overlays are applied since they are not used for final renders.
+ * Can be supported via some sort of specialization to avoid code duplication. */
+
+ run_get_pass_kernel_processor_float(
+ &kfilm_convert, render_buffers, buffer_params, destination, processor);
+ }
+
+ if (destination.pixels_half_rgba) {
+ /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */
+
+ if (destination.num_components == 1) {
+ run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
+ render_buffers,
+ buffer_params,
+ destination,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ float pixel;
+ processor(kfilm_convert, buffer, &pixel);
+
+ pixel_rgba[0] = pixel;
+ pixel_rgba[1] = pixel;
+ pixel_rgba[2] = pixel;
+ pixel_rgba[3] = 1.0f;
+ });
+ }
+ else if (destination.num_components == 3) {
+ run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
+ render_buffers,
+ buffer_params,
+ destination,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ processor(kfilm_convert, buffer, pixel_rgba);
+ pixel_rgba[3] = 1.0f;
+ });
+ }
+ else if (destination.num_components == 4) {
+ run_get_pass_kernel_processor_half_rgba(
+ &kfilm_convert, render_buffers, buffer_params, destination, processor);
+ }
+ }
+}
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
+ const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
+
+ const float *buffer_data = render_buffers->buffer.data();
+ const int pixel_stride = destination.pixel_stride ? destination.pixel_stride :
+ destination.num_components;
+
+ tbb::parallel_for(0, buffer_params.height, [&](int64_t y) {
+ int64_t pixel_index = y * buffer_params.width;
+ for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) {
+ const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
+ const float *buffer = buffer_data + input_pixel_offset;
+ float *pixel = destination.pixels + (pixel_index + destination.offset) * pixel_stride;
+
+ processor(kfilm_convert, buffer, pixel);
+ }
+ });
+}
+
+template<typename Processor>
+inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
+ const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const
+{
+ const float *buffer_data = render_buffers->buffer.data();
+
+ half4 *dst_start = destination.pixels_half_rgba + destination.offset;
+ const int destination_stride = destination.stride != 0 ? destination.stride :
+ buffer_params.width;
+
+ tbb::parallel_for(0, buffer_params.height, [&](int64_t y) {
+ int64_t pixel_index = y * buffer_params.width;
+ half4 *dst_row_start = dst_start + y * destination_stride;
+ for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) {
+ const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
+ const float *buffer = buffer_data + input_pixel_offset;
+
+ float pixel[4];
+ processor(kfilm_convert, buffer, pixel);
+
+ film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
+
+ half4 *pixel_half_rgba = dst_row_start + x;
+ float4_store_half(&pixel_half_rgba->x, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
+ }
+ });
+}
+
+/* --------------------------------------------------------------------
+ * Pass accessors.
+ */
+
+#define DEFINE_PASS_ACCESSOR(pass) \
+ void PassAccessorCPU::get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const \
+ { \
+ run_get_pass_kernel_processor( \
+ render_buffers, buffer_params, destination, film_get_pass_pixel_##pass); \
+ }
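+
+/* For example, DEFINE_PASS_ACCESSOR(depth) expands to a thin override which forwards
+ * to the templated runner with the matching film kernel:
+ *
+ *   void PassAccessorCPU::get_pass_depth(const RenderBuffers *render_buffers,
+ *                                        const BufferParams &buffer_params,
+ *                                        const Destination &destination) const
+ *   {
+ *     run_get_pass_kernel_processor(
+ *         render_buffers, buffer_params, destination, film_get_pass_pixel_depth);
+ *   }
+ */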
+
+/* Float (scalar) passes. */
+DEFINE_PASS_ACCESSOR(depth)
+DEFINE_PASS_ACCESSOR(mist)
+DEFINE_PASS_ACCESSOR(sample_count)
+DEFINE_PASS_ACCESSOR(float)
+
+/* Float3 passes. */
+DEFINE_PASS_ACCESSOR(light_path)
+DEFINE_PASS_ACCESSOR(shadow_catcher)
+DEFINE_PASS_ACCESSOR(float3)
+
+/* Float4 passes. */
+DEFINE_PASS_ACCESSOR(motion)
+DEFINE_PASS_ACCESSOR(cryptomatte)
+DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+DEFINE_PASS_ACCESSOR(combined)
+DEFINE_PASS_ACCESSOR(float4)
+
+#undef DEFINE_PASS_ACCESSOR
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor_cpu.h b/intern/cycles/integrator/pass_accessor_cpu.h
new file mode 100644
index 00000000000..0313dc5bb0d
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_cpu.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelFilmConvert;
+
+/* Pass accessor implementation for CPU side. */
+class PassAccessorCPU : public PassAccessor {
+ public:
+ using PassAccessor::PassAccessor;
+
+ protected:
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor_float(const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+ template<typename Processor>
+ inline void run_get_pass_kernel_processor_half_rgba(const KernelFilmConvert *kfilm_convert,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination,
+ const Processor &processor) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const override;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth)
+ DECLARE_PASS_ACCESSOR(mist)
+ DECLARE_PASS_ACCESSOR(sample_count)
+ DECLARE_PASS_ACCESSOR(float)
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path)
+ DECLARE_PASS_ACCESSOR(shadow_catcher)
+ DECLARE_PASS_ACCESSOR(float3)
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion)
+ DECLARE_PASS_ACCESSOR(cryptomatte)
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow)
+ DECLARE_PASS_ACCESSOR(combined)
+ DECLARE_PASS_ACCESSOR(float4)
+
+#undef DECLARE_PASS_ACCESSOR
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp
new file mode 100644
index 00000000000..eb80ba99655
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_gpu.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/pass_accessor_gpu.h"
+
+#include "device/device_queue.h"
+#include "render/buffers.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+PassAccessorGPU::PassAccessorGPU(DeviceQueue *queue,
+ const PassAccessInfo &pass_access_info,
+ float exposure,
+ int num_samples)
+ : PassAccessor(pass_access_info, exposure, num_samples), queue_(queue)
+
+{
+}
+
+/* --------------------------------------------------------------------
+ * Kernel execution.
+ */
+
+void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const
+{
+ KernelFilmConvert kfilm_convert;
+ init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
+
+ const int work_size = buffer_params.width * buffer_params.height;
+
+ const int destination_stride = destination.stride != 0 ? destination.stride :
+ buffer_params.width;
+
+ if (destination.d_pixels) {
+ DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
+
+ void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
+ const_cast<device_ptr *>(&destination.d_pixels),
+ const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
+ const_cast<int *>(&work_size),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&destination.offset),
+ const_cast<int *>(&destination_stride)};
+
+ queue_->enqueue(kernel, work_size, args);
+ }
+ if (destination.d_pixels_half_rgba) {
+ const DeviceKernel kernel_half_float = static_cast<DeviceKernel>(kernel + 1);
+
+ void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
+ const_cast<device_ptr *>(&destination.d_pixels_half_rgba),
+ const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
+ const_cast<int *>(&work_size),
+ const_cast<int *>(&buffer_params.width),
+ const_cast<int *>(&buffer_params.offset),
+ const_cast<int *>(&buffer_params.stride),
+ const_cast<int *>(&destination.offset),
+ const_cast<int *>(&destination_stride)};
+
+ queue_->enqueue(kernel_half_float, work_size, args);
+ }
+
+ queue_->synchronize();
+}
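+
+/* NOTE: The `kernel + 1` dispatch above relies on the half-float variant of each film
+ * convert kernel directly following its float variant in the DeviceKernel enum. The
+ * `args` array must also match the device-side kernel signature exactly, in both
+ * order and type, since the arguments are passed as raw pointers. */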
+
+/* --------------------------------------------------------------------
+ * Pass accessors.
+ */
+
+#define DEFINE_PASS_ACCESSOR(pass, kernel_pass) \
+ void PassAccessorGPU::get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const \
+ { \
+ run_film_convert_kernels( \
+ DEVICE_KERNEL_FILM_CONVERT_##kernel_pass, render_buffers, buffer_params, destination); \
+ }
+
+/* Float (scalar) passes. */
+DEFINE_PASS_ACCESSOR(depth, DEPTH);
+DEFINE_PASS_ACCESSOR(mist, MIST);
+DEFINE_PASS_ACCESSOR(sample_count, SAMPLE_COUNT);
+DEFINE_PASS_ACCESSOR(float, FLOAT);
+
+/* Float3 passes. */
+DEFINE_PASS_ACCESSOR(light_path, LIGHT_PATH);
+DEFINE_PASS_ACCESSOR(float3, FLOAT3);
+
+/* Float4 passes. */
+DEFINE_PASS_ACCESSOR(motion, MOTION);
+DEFINE_PASS_ACCESSOR(cryptomatte, CRYPTOMATTE);
+DEFINE_PASS_ACCESSOR(shadow_catcher, SHADOW_CATCHER);
+DEFINE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow, SHADOW_CATCHER_MATTE_WITH_SHADOW);
+DEFINE_PASS_ACCESSOR(combined, COMBINED);
+DEFINE_PASS_ACCESSOR(float4, FLOAT4);
+
+#undef DEFINE_PASS_ACCESSOR
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/pass_accessor_gpu.h b/intern/cycles/integrator/pass_accessor_gpu.h
new file mode 100644
index 00000000000..bc37e4387f3
--- /dev/null
+++ b/intern/cycles/integrator/pass_accessor_gpu.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+/* Pass accessor implementation for GPU side. */
+class PassAccessorGPU : public PassAccessor {
+ public:
+ PassAccessorGPU(DeviceQueue *queue,
+ const PassAccessInfo &pass_access_info,
+ float exposure,
+ int num_samples);
+
+ protected:
+ void run_film_convert_kernels(DeviceKernel kernel,
+ const RenderBuffers *render_buffers,
+ const BufferParams &buffer_params,
+ const Destination &destination) const;
+
+#define DECLARE_PASS_ACCESSOR(pass) \
+ virtual void get_pass_##pass(const RenderBuffers *render_buffers, \
+ const BufferParams &buffer_params, \
+ const Destination &destination) const override;
+
+ /* Float (scalar) passes. */
+ DECLARE_PASS_ACCESSOR(depth);
+ DECLARE_PASS_ACCESSOR(mist);
+ DECLARE_PASS_ACCESSOR(sample_count);
+ DECLARE_PASS_ACCESSOR(float);
+
+ /* Float3 passes. */
+ DECLARE_PASS_ACCESSOR(light_path);
+ DECLARE_PASS_ACCESSOR(float3);
+
+ /* Float4 passes. */
+ DECLARE_PASS_ACCESSOR(motion);
+ DECLARE_PASS_ACCESSOR(cryptomatte);
+ DECLARE_PASS_ACCESSOR(shadow_catcher);
+ DECLARE_PASS_ACCESSOR(shadow_catcher_matte_with_shadow);
+ DECLARE_PASS_ACCESSOR(combined);
+ DECLARE_PASS_ACCESSOR(float4);
+
+#undef DECLARE_PASS_ACCESSOR
+
+ DeviceQueue *queue_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
new file mode 100644
index 00000000000..b62a06aea43
--- /dev/null
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -0,0 +1,1144 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace.h"
+
+#include "device/cpu/device.h"
+#include "device/device.h"
+#include "integrator/pass_accessor.h"
+#include "integrator/render_scheduler.h"
+#include "render/gpu_display.h"
+#include "render/pass.h"
+#include "render/scene.h"
+#include "render/tile.h"
+#include "util/util_algorithm.h"
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_tbb.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTrace::PathTrace(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ RenderScheduler &render_scheduler,
+ TileManager &tile_manager)
+ : device_(device),
+ device_scene_(device_scene),
+ render_scheduler_(render_scheduler),
+ tile_manager_(tile_manager)
+{
+ DCHECK_NE(device_, nullptr);
+
+ {
+ vector<DeviceInfo> cpu_devices;
+ device_cpu_info(cpu_devices);
+
+ cpu_device_.reset(device_cpu_create(cpu_devices[0], device->stats, device->profiler));
+ }
+
+ /* Create path tracing work in advance, so that it can be reused by incremental sampling as much
+ * as possible. */
+ device_->foreach_device([&](Device *path_trace_device) {
+ path_trace_works_.emplace_back(PathTraceWork::create(
+ path_trace_device, film, device_scene, &render_cancel_.is_requested));
+ });
+
+ work_balance_infos_.resize(path_trace_works_.size());
+ work_balance_do_initial(work_balance_infos_);
+
+ render_scheduler.set_need_schedule_rebalance(path_trace_works_.size() > 1);
+}
+
+PathTrace::~PathTrace()
+{
+ /* Destroy any GPU resources which were used for graphics interop.
+ * Access to the GPUDisplay is needed, as it is the only source of the drawing context which is
+ * used for interop. */
+ if (gpu_display_) {
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->destroy_gpu_resources(gpu_display_.get());
+ }
+ }
+}
+
+void PathTrace::load_kernels()
+{
+ if (denoiser_) {
+ denoiser_->load_kernels(progress_);
+ }
+}
+
+void PathTrace::alloc_work_memory()
+{
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->alloc_work_memory();
+ }
+}
+
+bool PathTrace::ready_to_reset()
+{
+ /* The logic here is optimized for the best feedback in the viewport, which implies having a GPU
+ * display. If there is no such display, this logic will break. */
+ DCHECK(gpu_display_);
+
+ /* The logic here tries to provide behavior which feels the most interactive to artists. The
+ * general idea is to reset as quickly as possible, while still keeping the viewport responsive.
+ *
+ * If the render result was ever drawn after the previous reset, consider that a reset is now
+ * possible. This way camera navigation gives the quickest feedback of rendered pixels,
+ * regardless of whether the CPU or GPU drawing pipeline is used.
+ *
+ * A reset happening after a redraw is considered "slow" enough to not clog anything. This is a
+ * bit arbitrary, but seems to work very well with viewport navigation in Blender. */
+
+ if (did_draw_after_reset_) {
+ return true;
+ }
+
+ return false;
+}
+
+void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_tile_params)
+{
+ if (big_tile_params_.modified(big_tile_params)) {
+ big_tile_params_ = big_tile_params;
+ render_state_.need_reset_params = true;
+ }
+
+ full_params_ = full_params;
+
+ /* NOTE: The GPU display checks for buffer modification and avoids unnecessary re-allocation.
+ * It is still required to inform it about a reset whenever one happens, so that the redraw state
+ * tracking is properly updated. */
+ if (gpu_display_) {
+ gpu_display_->reset(full_params);
+ }
+
+ render_state_.has_denoised_result = false;
+ render_state_.tile_written = false;
+
+ did_draw_after_reset_ = false;
+}
+
+void PathTrace::device_free()
+{
+ /* Free render buffers used by the path trace work to reduce memory peak. */
+ BufferParams empty_params;
+ empty_params.pass_stride = 0;
+ empty_params.update_offset_stride();
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->get_render_buffers()->reset(empty_params);
+ }
+ render_state_.need_reset_params = true;
+}
+
+void PathTrace::set_progress(Progress *progress)
+{
+ progress_ = progress;
+}
+
+void PathTrace::render(const RenderWork &render_work)
+{
+ /* Indicate that rendering has started and that it can be requested to cancel. */
+ {
+ thread_scoped_lock lock(render_cancel_.mutex);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+ render_cancel_.is_rendering = true;
+ }
+
+ render_pipeline(render_work);
+
+ /* Indicate that rendering has finished, making it so the thread which requested `cancel()` can
+ * carry on. */
+ {
+ thread_scoped_lock lock(render_cancel_.mutex);
+ render_cancel_.is_rendering = false;
+ render_cancel_.condition.notify_one();
+ }
+}
+
+void PathTrace::render_pipeline(RenderWork render_work)
+{
+ /* NOTE: Only check for "instant" cancel here. The user-requested cancel via progress is
+ * checked in the Session, and in the event of such a cancel the current work is still finished
+ * here. */
+
+ render_scheduler_.set_need_schedule_cryptomatte(device_scene_->data.film.cryptomatte_passes !=
+ 0);
+
+ render_init_kernel_execution();
+
+ render_scheduler_.report_work_begin(render_work);
+
+ init_render_buffers(render_work);
+
+ rebalance(render_work);
+
+ path_trace(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ adaptive_sample(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ cryptomatte_postprocess(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ denoise(render_work);
+ if (render_cancel_.is_requested) {
+ return;
+ }
+
+ write_tile_buffer(render_work);
+ update_display(render_work);
+
+ progress_update_if_needed(render_work);
+
+ finalize_full_buffer_on_disk(render_work);
+}
+
+void PathTrace::render_init_kernel_execution()
+{
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->init_execution();
+ }
+}
+
+/* TODO(sergey): Look into `std::function` rather than using a template. There should not be a
+ * measurable performance impact at runtime, but it will make compilation faster and the binary
+ * somewhat smaller. */
+template<typename Callback>
+static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>> &path_trace_works,
+ const vector<WorkBalanceInfo> &work_balance_infos,
+ const BufferParams &buffer_params,
+ const Callback &callback)
+{
+ const int num_works = path_trace_works.size();
+ const int height = buffer_params.height;
+
+ int current_y = 0;
+ for (int i = 0; i < num_works; ++i) {
+ const double weight = work_balance_infos[i].weight;
+ const int slice_height = max(lround(height * weight), 1);
+
+ /* Disallow negative values to deal with situations when there are more compute devices than
+ * scanlines. */
+ const int remaining_height = max(0, height - current_y);
+
+ BufferParams slice_params = buffer_params;
+ slice_params.full_y = buffer_params.full_y + current_y;
+ if (i < num_works - 1) {
+ slice_params.height = min(slice_height, remaining_height);
+ }
+ else {
+ slice_params.height = remaining_height;
+ }
+
+ slice_params.update_offset_stride();
+
+ callback(path_trace_works[i].get(), slice_params);
+
+ current_y += slice_params.height;
+ }
+}
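+
+/* Slicing example (illustrative sketch): with two works weighted 0.8 and 0.2 over a
+ * big tile of height 100, the first work receives an 80-scanline slice starting at
+ * full_y and the last work receives the remaining 20 scanlines. The last slice always
+ * absorbs rounding, so the slices exactly cover the tile height. */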
+
+void PathTrace::update_allocated_work_buffer_params()
+{
+ foreach_sliced_buffer_params(path_trace_works_,
+ work_balance_infos_,
+ big_tile_params_,
+ [](PathTraceWork *path_trace_work, const BufferParams &params) {
+ RenderBuffers *buffers = path_trace_work->get_render_buffers();
+ buffers->reset(params);
+ });
+}
+
+static BufferParams scale_buffer_params(const BufferParams &params, int resolution_divider)
+{
+ BufferParams scaled_params = params;
+
+ scaled_params.width = max(1, params.width / resolution_divider);
+ scaled_params.height = max(1, params.height / resolution_divider);
+ scaled_params.full_x = params.full_x / resolution_divider;
+ scaled_params.full_y = params.full_y / resolution_divider;
+ scaled_params.full_width = params.full_width / resolution_divider;
+ scaled_params.full_height = params.full_height / resolution_divider;
+
+ scaled_params.update_offset_stride();
+
+ return scaled_params;
+}
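+
+/* Example: a resolution divider of 2 turns a 1920x1080 tile into an effective 960x540
+ * buffer. The max(1, ...) clamps keep the width and height from collapsing to zero
+ * when the divider exceeds the tile dimensions. */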
+
+void PathTrace::update_effective_work_buffer_params(const RenderWork &render_work)
+{
+ const int resolution_divider = render_work.resolution_divider;
+
+ const BufferParams scaled_full_params = scale_buffer_params(full_params_, resolution_divider);
+ const BufferParams scaled_big_tile_params = scale_buffer_params(big_tile_params_,
+ resolution_divider);
+
+ foreach_sliced_buffer_params(path_trace_works_,
+ work_balance_infos_,
+ scaled_big_tile_params,
+ [&](PathTraceWork *path_trace_work, const BufferParams &params) {
+ path_trace_work->set_effective_buffer_params(
+ scaled_full_params, scaled_big_tile_params, params);
+ });
+
+ render_state_.effective_big_tile_params = scaled_big_tile_params;
+}
+
+void PathTrace::update_work_buffer_params_if_needed(const RenderWork &render_work)
+{
+ if (render_state_.need_reset_params) {
+ update_allocated_work_buffer_params();
+ }
+
+ if (render_state_.need_reset_params ||
+ render_state_.resolution_divider != render_work.resolution_divider) {
+ update_effective_work_buffer_params(render_work);
+ }
+
+ render_state_.resolution_divider = render_work.resolution_divider;
+ render_state_.need_reset_params = false;
+}
+
+void PathTrace::init_render_buffers(const RenderWork &render_work)
+{
+ update_work_buffer_params_if_needed(render_work);
+
+ /* Handle initialization scheduled by the render scheduler. */
+ if (render_work.init_render_buffers) {
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->zero_render_buffers();
+ });
+
+ tile_buffer_read();
+ }
+}
+
+void PathTrace::path_trace(RenderWork &render_work)
+{
+ if (!render_work.path_trace.num_samples) {
+ return;
+ }
+
+ VLOG(3) << "Will path trace " << render_work.path_trace.num_samples
+ << " samples at the resolution divider " << render_work.resolution_divider;
+
+ const double start_time = time_dt();
+
+ const int num_works = path_trace_works_.size();
+
+ tbb::parallel_for(0, num_works, [&](int i) {
+ const double work_start_time = time_dt();
+ const int num_samples = render_work.path_trace.num_samples;
+
+ PathTraceWork *path_trace_work = path_trace_works_[i].get();
+
+ PathTraceWork::RenderStatistics statistics;
+ path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);
+
+ const double work_time = time_dt() - work_start_time;
+ work_balance_infos_[i].time_spent += work_time;
+ work_balance_infos_[i].occupancy = statistics.occupancy;
+
+ VLOG(3) << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+ << work_time / num_samples
+ << " seconds per sample), occupancy: " << statistics.occupancy;
+ });
+
+ float occupancy_accum = 0.0f;
+ for (const WorkBalanceInfo &balance_info : work_balance_infos_) {
+ occupancy_accum += balance_info.occupancy;
+ }
+ const float occupancy = occupancy_accum / num_works;
+ render_scheduler_.report_path_trace_occupancy(render_work, occupancy);
+
+ render_scheduler_.report_path_trace_time(
+ render_work, time_dt() - start_time, is_cancel_requested());
+}
+
+void PathTrace::adaptive_sample(RenderWork &render_work)
+{
+ if (!render_work.adaptive_sampling.filter) {
+ return;
+ }
+
+ bool did_reschedule_on_idle = false;
+
+ while (true) {
+ VLOG(3) << "Will filter adaptive stopping buffer, threshold "
+ << render_work.adaptive_sampling.threshold;
+ if (render_work.adaptive_sampling.reset) {
+ VLOG(3) << "Will re-calculate convergency flag for currently converged pixels.";
+ }
+
+ const double start_time = time_dt();
+
+ uint num_active_pixels = 0;
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ const uint num_active_pixels_in_work =
+ path_trace_work->adaptive_sampling_converge_filter_count_active(
+ render_work.adaptive_sampling.threshold, render_work.adaptive_sampling.reset);
+ if (num_active_pixels_in_work) {
+ atomic_add_and_fetch_u(&num_active_pixels, num_active_pixels_in_work);
+ }
+ });
+
+ render_scheduler_.report_adaptive_filter_time(
+ render_work, time_dt() - start_time, is_cancel_requested());
+
+ if (num_active_pixels == 0) {
+ VLOG(3) << "All pixels converged.";
+ if (!render_scheduler_.render_work_reschedule_on_converge(render_work)) {
+ break;
+ }
+ VLOG(3) << "Continuing with lower threshold.";
+ }
+ else if (did_reschedule_on_idle) {
+ break;
+ }
+ else if (num_active_pixels < 128 * 128) {
+ /* NOTE: The hardcoded value of 128^2 is an empirical value chosen to keep the GPU busy, so
+ * that there is no performance loss from the progressive noise floor feature.
+ *
+ * A better heuristic is possible here: for example, use maximum of 128^2 and percentage of
+ * the final resolution. */
+ if (!render_scheduler_.render_work_reschedule_on_idle(render_work)) {
+ VLOG(3) << "Rescheduling is not possible: final threshold is reached.";
+ break;
+ }
+ VLOG(3) << "Rescheduling lower threshold.";
+ did_reschedule_on_idle = true;
+ }
+ else {
+ break;
+ }
+ }
+}
+
+void PathTrace::set_denoiser_params(const DenoiseParams &params)
+{
+ render_scheduler_.set_denoiser_params(params);
+
+ if (!params.use) {
+ denoiser_.reset();
+ return;
+ }
+
+ if (denoiser_) {
+ const DenoiseParams old_denoiser_params = denoiser_->get_params();
+ if (old_denoiser_params.type == params.type) {
+ denoiser_->set_params(params);
+ return;
+ }
+ }
+
+ denoiser_ = Denoiser::create(device_, params);
+ denoiser_->is_cancelled_cb = [this]() { return is_cancel_requested(); };
+}
+
+void PathTrace::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+ render_scheduler_.set_adaptive_sampling(adaptive_sampling);
+}
+
+void PathTrace::cryptomatte_postprocess(const RenderWork &render_work)
+{
+ if (!render_work.cryptomatte.postprocess) {
+ return;
+ }
+ VLOG(3) << "Perform cryptomatte work.";
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->cryptomatte_postproces();
+ });
+}
+
+void PathTrace::denoise(const RenderWork &render_work)
+{
+ if (!render_work.tile.denoise) {
+ return;
+ }
+
+ if (!denoiser_) {
+ /* Denoiser was not configured, so nothing to do here. */
+ return;
+ }
+
+ VLOG(3) << "Perform denoising work.";
+
+ const double start_time = time_dt();
+
+ RenderBuffers *buffer_to_denoise = nullptr;
+
+ unique_ptr<RenderBuffers> multi_device_buffers;
+ bool allow_inplace_modification = false;
+
+ if (path_trace_works_.size() == 1) {
+ buffer_to_denoise = path_trace_works_.front()->get_render_buffers();
+ }
+ else {
+ Device *denoiser_device = denoiser_->get_denoiser_device();
+ if (!denoiser_device) {
+ return;
+ }
+
+ multi_device_buffers = make_unique<RenderBuffers>(denoiser_device);
+ multi_device_buffers->reset(render_state_.effective_big_tile_params);
+
+ buffer_to_denoise = multi_device_buffers.get();
+
+ copy_to_render_buffers(multi_device_buffers.get());
+
+ allow_inplace_modification = true;
+ }
+
+ if (denoiser_->denoise_buffer(render_state_.effective_big_tile_params,
+ buffer_to_denoise,
+ get_num_samples_in_buffer(),
+ allow_inplace_modification)) {
+ render_state_.has_denoised_result = true;
+ }
+
+ if (multi_device_buffers) {
+ multi_device_buffers->copy_from_device();
+ tbb::parallel_for_each(
+ path_trace_works_, [&multi_device_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_from_denoised_render_buffers(multi_device_buffers.get());
+ });
+ }
+
+ render_scheduler_.report_denoise_time(render_work, time_dt() - start_time);
+}
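+
+/* NOTE: The multi-device case above is a gather-scatter: per-device slices are first
+ * flattened into a single buffer on the denoiser device, denoised there, and the
+ * denoised passes are then copied back into every work's slice. A single-device
+ * render denoises its buffer in place. */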
+
+void PathTrace::set_gpu_display(unique_ptr<GPUDisplay> gpu_display)
+{
+ gpu_display_ = move(gpu_display);
+}
+
+void PathTrace::clear_gpu_display()
+{
+ if (gpu_display_) {
+ gpu_display_->clear();
+ }
+}
+
+void PathTrace::draw()
+{
+ if (!gpu_display_) {
+ return;
+ }
+
+ did_draw_after_reset_ |= gpu_display_->draw();
+}
+
+void PathTrace::update_display(const RenderWork &render_work)
+{
+ if (!render_work.display.update) {
+ return;
+ }
+
+ if (!gpu_display_ && !tile_buffer_update_cb) {
+ VLOG(3) << "Ignore display update.";
+ return;
+ }
+
+ if (full_params_.width == 0 || full_params_.height == 0) {
+ VLOG(3) << "Skipping GPUDisplay update due to 0 size of the render buffer.";
+ return;
+ }
+
+ const double start_time = time_dt();
+
+ if (tile_buffer_update_cb) {
+ VLOG(3) << "Invoke buffer update callback.";
+
+ tile_buffer_update_cb();
+ }
+
+ if (gpu_display_) {
+ VLOG(3) << "Perform copy to GPUDisplay work.";
+
+ const int resolution_divider = render_work.resolution_divider;
+ const int texture_width = max(1, full_params_.width / resolution_divider);
+ const int texture_height = max(1, full_params_.height / resolution_divider);
+ if (!gpu_display_->update_begin(texture_width, texture_height)) {
+ LOG(ERROR) << "Error beginning GPUDisplay update.";
+ return;
+ }
+
+ const PassMode pass_mode = render_work.display.use_denoised_result &&
+ render_state_.has_denoised_result ?
+ PassMode::DENOISED :
+ PassMode::NOISY;
+
+ /* TODO(sergey): When using multi-device rendering map the GPUDisplay once and copy data from
+ * all works in parallel. */
+ const int num_samples = get_num_samples_in_buffer();
+ for (auto &&path_trace_work : path_trace_works_) {
+ path_trace_work->copy_to_gpu_display(gpu_display_.get(), pass_mode, num_samples);
+ }
+
+ gpu_display_->update_end();
+ }
+
+ render_scheduler_.report_display_update_time(render_work, time_dt() - start_time);
+}
+
+void PathTrace::rebalance(const RenderWork &render_work)
+{
+ static const int kLogLevel = 3;
+
+ if (!render_work.rebalance) {
+ return;
+ }
+
+ const int num_works = path_trace_works_.size();
+
+ if (num_works == 1) {
+ VLOG(kLogLevel) << "Ignoring rebalance work due to single device render.";
+ return;
+ }
+
+ const double start_time = time_dt();
+
+ if (VLOG_IS_ON(kLogLevel)) {
+ VLOG(kLogLevel) << "Perform rebalance work.";
+ VLOG(kLogLevel) << "Per-device path tracing time (seconds):";
+ for (int i = 0; i < num_works; ++i) {
+ VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+ << work_balance_infos_[i].time_spent;
+ }
+ }
+
+ const bool did_rebalance = work_balance_do_rebalance(work_balance_infos_);
+
+ if (VLOG_IS_ON(kLogLevel)) {
+ VLOG(kLogLevel) << "Calculated per-device weights for works:";
+ for (int i = 0; i < num_works; ++i) {
+ VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+ << work_balance_infos_[i].weight;
+ }
+ }
+
+ if (!did_rebalance) {
+ VLOG(kLogLevel) << "Balance in path trace works did not change.";
+ render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, false);
+ return;
+ }
+
+ RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
+ big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params);
+
+ copy_to_render_buffers(&big_tile_cpu_buffers);
+
+ render_state_.need_reset_params = true;
+ update_work_buffer_params_if_needed(render_work);
+
+ copy_from_render_buffers(&big_tile_cpu_buffers);
+
+ render_scheduler_.report_rebalance_time(render_work, time_dt() - start_time, true);
+}
+
+void PathTrace::write_tile_buffer(const RenderWork &render_work)
+{
+ if (!render_work.tile.write) {
+ return;
+ }
+
+ VLOG(3) << "Write tile result.";
+
+ render_state_.tile_written = true;
+
+ const bool has_multiple_tiles = tile_manager_.has_multiple_tiles();
+
+ /* Write render tile result, but only if not using tiled rendering.
+ *
+ * Tiles are written to a file during rendering, and written to the software at the end
+ * of rendering (either when all tiles are finished, or when rendering was requested to be
+ * canceled).
+ *
+ * The important thing is that the tile is written to the software via the callback only once. */
+ if (!has_multiple_tiles) {
+ VLOG(3) << "Write tile result via buffer write callback.";
+ tile_buffer_write();
+ }
+
+ /* Write tile to disk, so that the render work's render buffer can be re-used for the next tile.
+ */
+ if (has_multiple_tiles) {
+ VLOG(3) << "Write tile result into .";
+ tile_buffer_write_to_disk();
+ }
+}
+
+void PathTrace::finalize_full_buffer_on_disk(const RenderWork &render_work)
+{
+ if (!render_work.full.write) {
+ return;
+ }
+
+ VLOG(3) << "Handle full-frame render buffer work.";
+
+ if (!tile_manager_.has_written_tiles()) {
+ VLOG(3) << "No tiles on disk.";
+ return;
+ }
+
+ /* Make sure writing to the file is fully finished.
+ * This will include writing all possible missing tiles, ensuring the validity of the file. */
+ tile_manager_.finish_write_tiles();
+
+ /* NOTE: The rest of full-frame post-processing (such as full-frame denoising) will be done after
+ * all scenes and layers are rendered by the Session (which happens after freeing Session memory,
+ * so that we never hold scene and full-frame buffer in memory at the same time). */
+}
+
+void PathTrace::cancel()
+{
+ thread_scoped_lock lock(render_cancel_.mutex);
+
+ render_cancel_.is_requested = true;
+
+ while (render_cancel_.is_rendering) {
+ render_cancel_.condition.wait(lock);
+ }
+
+ render_cancel_.is_requested = false;
+}
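+
+/* The cancel handshake pairs with `render()`: it sets `is_rendering` under the mutex
+ * before doing any work and notifies `condition` once the flag is cleared, which is
+ * what wakes the wait loop above. A minimal usage sketch (hypothetical caller code,
+ * assuming a worker thread owns the render):
+ *
+ *   thread render_thread([&]() { path_trace.render(render_work); });
+ *   ...
+ *   path_trace.cancel();  // Blocks until render_samples() is finished.
+ *   render_thread.join();
+ */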
+
+int PathTrace::get_num_samples_in_buffer()
+{
+ return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::is_cancel_requested()
+{
+ if (render_cancel_.is_requested) {
+ return true;
+ }
+
+ if (progress_ != nullptr) {
+ if (progress_->get_cancel()) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void PathTrace::tile_buffer_write()
+{
+ if (!tile_buffer_write_cb) {
+ return;
+ }
+
+ tile_buffer_write_cb();
+}
+
+void PathTrace::tile_buffer_read()
+{
+ if (!tile_buffer_read_cb) {
+ return;
+ }
+
+ if (tile_buffer_read_cb()) {
+ tbb::parallel_for_each(path_trace_works_, [](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_render_buffers_to_device();
+ });
+ }
+}
+
+void PathTrace::tile_buffer_write_to_disk()
+{
+ /* Sample count pass is required to support per-tile partial results stored in the file. */
+ DCHECK_NE(big_tile_params_.get_pass_offset(PASS_SAMPLE_COUNT), PASS_UNUSED);
+
+ const int num_rendered_samples = render_scheduler_.get_num_rendered_samples();
+
+ if (num_rendered_samples == 0) {
+ /* The tile has zero samples, no need to write it. */
+ return;
+ }
+
+ /* Get access to the CPU-side render buffers of the current big tile. */
+ RenderBuffers *buffers;
+ RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
+
+ if (path_trace_works_.size() == 1) {
+ path_trace_works_[0]->copy_render_buffers_from_device();
+ buffers = path_trace_works_[0]->get_render_buffers();
+ }
+ else {
+ big_tile_cpu_buffers.reset(render_state_.effective_big_tile_params);
+ copy_to_render_buffers(&big_tile_cpu_buffers);
+
+ buffers = &big_tile_cpu_buffers;
+ }
+
+ if (!tile_manager_.write_tile(*buffers)) {
+ LOG(ERROR) << "Error writing tile to file.";
+ }
+}
+
+void PathTrace::progress_update_if_needed(const RenderWork &render_work)
+{
+ if (progress_ != nullptr) {
+ const int2 tile_size = get_render_tile_size();
+ const int num_samples_added = tile_size.x * tile_size.y * render_work.path_trace.num_samples;
+ const int current_sample = render_work.path_trace.start_sample +
+ render_work.path_trace.num_samples;
+ progress_->add_samples(num_samples_added, current_sample);
+ }
+
+ if (progress_update_cb) {
+ progress_update_cb();
+ }
+}
+
+void PathTrace::progress_set_status(const string &status, const string &substatus)
+{
+ if (progress_ != nullptr) {
+ progress_->set_status(status, substatus);
+ }
+}
+
+void PathTrace::copy_to_render_buffers(RenderBuffers *render_buffers)
+{
+ tbb::parallel_for_each(path_trace_works_,
+ [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_to_render_buffers(render_buffers);
+ });
+ render_buffers->copy_to_device();
+}
+
+void PathTrace::copy_from_render_buffers(RenderBuffers *render_buffers)
+{
+ render_buffers->copy_from_device();
+ tbb::parallel_for_each(path_trace_works_,
+ [&render_buffers](unique_ptr<PathTraceWork> &path_trace_work) {
+ path_trace_work->copy_from_render_buffers(render_buffers);
+ });
+}
+
+bool PathTrace::copy_render_tile_from_device()
+{
+ if (full_frame_state_.render_buffers) {
+ /* Full-frame buffer is always allocated on CPU. */
+ return true;
+ }
+
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->copy_render_buffers_from_device()) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+static string get_layer_view_name(const RenderBuffers &buffers)
+{
+ string result;
+
+ if (buffers.params.layer.size()) {
+ result += string(buffers.params.layer);
+ }
+
+ if (buffers.params.view.size()) {
+ if (!result.empty()) {
+ result += ", ";
+ }
+ result += string(buffers.params.view);
+ }
+
+ return result;
+}
+
+void PathTrace::process_full_buffer_from_disk(string_view filename)
+{
+ VLOG(3) << "Processing full frame buffer file " << filename;
+
+ progress_set_status("Reading full buffer from disk");
+
+ RenderBuffers full_frame_buffers(cpu_device_.get());
+
+ DenoiseParams denoise_params;
+ if (!tile_manager_.read_full_buffer_from_disk(filename, &full_frame_buffers, &denoise_params)) {
+ LOG(ERROR) << "Error reading tiles from file.";
+ return;
+ }
+
+ const string layer_view_name = get_layer_view_name(full_frame_buffers);
+
+ render_state_.has_denoised_result = false;
+
+ if (denoise_params.use) {
+ progress_set_status(layer_view_name, "Denoising");
+
+ /* Re-use the denoiser as much as possible, avoiding possible device re-initialization.
+ *
+ * It will not conflict with the regular rendering as:
+ * - Rendering is supposed to be finished here.
+ * - The next rendering will go via Session's `run_update_for_next_iteration` which will
+ * ensure the proper denoiser is used. */
+ set_denoiser_params(denoise_params);
+
+ /* The number of samples doesn't matter too much, since the sample count pass will be used. */
+ denoiser_->denoise_buffer(full_frame_buffers.params, &full_frame_buffers, 0, false);
+
+ render_state_.has_denoised_result = true;
+ }
+
+ full_frame_state_.render_buffers = &full_frame_buffers;
+
+ progress_set_status(layer_view_name, "Finishing");
+
+ /* Write the full result pretending that there is a single tile.
+ * Requires some state change, but allows us to use the same communication API with the software. */
+ tile_buffer_write();
+
+ full_frame_state_.render_buffers = nullptr;
+}
+
+int PathTrace::get_num_render_tile_samples() const
+{
+ if (full_frame_state_.render_buffers) {
+ return full_frame_state_.render_buffers->params.samples;
+ }
+
+ return render_scheduler_.get_num_rendered_samples();
+}
+
+bool PathTrace::get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination)
+{
+ if (full_frame_state_.render_buffers) {
+ return pass_accessor.get_render_tile_pixels(full_frame_state_.render_buffers, destination);
+ }
+
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->get_render_tile_pixels(pass_accessor, destination)) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+bool PathTrace::set_render_tile_pixels(PassAccessor &pass_accessor,
+ const PassAccessor::Source &source)
+{
+ bool success = true;
+
+ tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+ if (!success) {
+ return;
+ }
+ if (!path_trace_work->set_render_tile_pixels(pass_accessor, source)) {
+ success = false;
+ }
+ });
+
+ return success;
+}
+
+int2 PathTrace::get_render_tile_size() const
+{
+ if (full_frame_state_.render_buffers) {
+ return make_int2(full_frame_state_.render_buffers->params.width,
+ full_frame_state_.render_buffers->params.height);
+ }
+
+ const Tile &tile = tile_manager_.get_current_tile();
+ return make_int2(tile.width, tile.height);
+}
+
+int2 PathTrace::get_render_tile_offset() const
+{
+ if (full_frame_state_.render_buffers) {
+ return make_int2(0, 0);
+ }
+
+ const Tile &tile = tile_manager_.get_current_tile();
+ return make_int2(tile.x, tile.y);
+}
+
+const BufferParams &PathTrace::get_render_tile_params() const
+{
+ if (full_frame_state_.render_buffers) {
+ return full_frame_state_.render_buffers->params;
+ }
+
+ return big_tile_params_;
+}
+
+bool PathTrace::has_denoised_result() const
+{
+ return render_state_.has_denoised_result;
+}
+
+/* --------------------------------------------------------------------
+ * Report generation.
+ */
+
+static const char *device_type_for_description(const DeviceType type)
+{
+ switch (type) {
+ case DEVICE_NONE:
+ return "None";
+
+ case DEVICE_CPU:
+ return "CPU";
+ case DEVICE_CUDA:
+ return "CUDA";
+ case DEVICE_OPTIX:
+ return "OptiX";
+ case DEVICE_DUMMY:
+ return "Dummy";
+ case DEVICE_MULTI:
+ return "Multi";
+ }
+
+ return "UNKNOWN";
+}
+
+/* Construct description of the device which will appear in the full report. */
+/* TODO(sergey): Consider making it more reusable utility. */
+static string full_device_info_description(const DeviceInfo &device_info)
+{
+ string full_description = device_info.description;
+
+ full_description += " (" + string(device_type_for_description(device_info.type)) + ")";
+
+ if (device_info.display_device) {
+ full_description += " (display)";
+ }
+
+ if (device_info.type == DEVICE_CPU) {
+ full_description += " (" + to_string(device_info.cpu_threads) + " threads)";
+ }
+
+ full_description += " [" + device_info.id + "]";
+
+ return full_description;
+}
+
+/* Construct a string which will contain information about the devices, possibly multiple devices.
+ *
+ * In the simple case the result looks like:
+ *
+ * Message: Full Device Description
+ *
+ * If there are multiple devices then the result looks like:
+ *
+ * Message: Full First Device Description
+ * Full Second Device Description
+ *
+ * Note that the newlines are placed in a way so that the result can be easily concatenated to the
+ * full report. */
+static string device_info_list_report(const string &message, const DeviceInfo &device_info)
+{
+ string result = "\n" + message + ": ";
+ const string pad(message.length() + 2, ' ');
+
+ if (device_info.multi_devices.empty()) {
+ result += full_device_info_description(device_info) + "\n";
+ return result;
+ }
+
+ bool is_first = true;
+ for (const DeviceInfo &sub_device_info : device_info.multi_devices) {
+ if (!is_first) {
+ result += pad;
+ }
+
+ result += full_device_info_description(sub_device_info) + "\n";
+
+ is_first = false;
+ }
+
+ return result;
+}
+
+static string path_trace_devices_report(const vector<unique_ptr<PathTraceWork>> &path_trace_works)
+{
+ DeviceInfo device_info;
+ device_info.type = DEVICE_MULTI;
+
+ for (auto &&path_trace_work : path_trace_works) {
+ device_info.multi_devices.push_back(path_trace_work->get_device()->info);
+ }
+
+ return device_info_list_report("Path tracing on", device_info);
+}
+
+static string denoiser_device_report(const Denoiser *denoiser)
+{
+ if (!denoiser) {
+ return "";
+ }
+
+ if (!denoiser->get_params().use) {
+ return "";
+ }
+
+ const Device *denoiser_device = denoiser->get_denoiser_device();
+ if (!denoiser_device) {
+ return "";
+ }
+
+ return device_info_list_report("Denoising on", denoiser_device->info);
+}
+
+string PathTrace::full_report() const
+{
+ string result = "\nFull path tracing report\n";
+
+ result += path_trace_devices_report(path_trace_works_);
+ result += denoiser_device_report(denoiser_.get());
+
+ /* Report from the render scheduler, which includes:
+ * - Render mode (interactive, offline, headless)
+ * - Adaptive sampling and denoiser parameters
+ * - Breakdown of timing. */
+ result += render_scheduler_.full_report();
+
+ return result;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
new file mode 100644
index 00000000000..fc7713e6df9
--- /dev/null
+++ b/intern/cycles/integrator/path_trace.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+#include "integrator/pass_accessor.h"
+#include "integrator/path_trace_work.h"
+#include "integrator/work_balancer.h"
+#include "render/buffers.h"
+#include "util/util_function.h"
+#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class AdaptiveSampling;
+class Device;
+class DeviceScene;
+class Film;
+class RenderBuffers;
+class RenderScheduler;
+class RenderWork;
+class Progress;
+class GPUDisplay;
+class TileManager;
+
+/* The PathTrace class takes care of the kernel graph and scheduling on a (multi-)device. It
+ * handles all the common steps of path tracing which are not device-specific. The list of tasks
+ * includes, but is not limited to:
+ * - Kernel graph.
+ * - Scheduling logic.
+ * - Queues management.
+ * - Adaptive stopping. */
+class PathTrace {
+ public:
+ /* Render scheduler is used to report timing information and access things like start/finish
+ * sample. */
+ PathTrace(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ RenderScheduler &render_scheduler,
+ TileManager &tile_manager);
+ ~PathTrace();
+
+ /* Create devices and load kernels which are created on-demand (for example, denoising devices).
+ * The progress is reported to the currently configured progress object (via `set_progress`). */
+ void load_kernels();
+
+ /* Allocate working memory. This runs before allocating scene memory so that we can estimate
+ * more accurately which scene device memory may need to be allocated on the host. */
+ void alloc_work_memory();
+
+ /* Check whether now is a good time to reset rendering.
+ * Used to avoid overly frequent resets in the viewport, giving it a chance to draw an
+ * intermediate render result. */
+ bool ready_to_reset();
+
+ void reset(const BufferParams &full_params, const BufferParams &big_tile_params);
+
+ void device_free();
+
+ /* Set progress tracker.
+ * Used to communicate details about the progress to the outer world, and to check whether
+ * rendering is to be canceled.
+ *
+ * The path tracer writes to this object, and then at a convenient moment runs the
+ * progress_update_cb() callback. */
+ void set_progress(Progress *progress);
+
+ /* NOTE: This is a blocking call, meaning it will not return until the given number of samples
+ * is rendered (or until rendering is requested to be canceled). */
+ void render(const RenderWork &render_work);
+
+ /* TODO(sergey): Decide whether the denoiser is really a part of the path tracer. Currently it
+ * is convenient to have it here because then it's easy to access the render buffer. The downside
+ * is that this adds entities which could live separately behind some clear API. */
+
+ /* Set denoiser parameters.
+ * Use this to configure the denoiser before rendering any samples. */
+ void set_denoiser_params(const DenoiseParams &params);
+
+ /* Set parameters used for adaptive sampling.
+ * Use this to configure the adaptive sampler before rendering any samples. */
+ void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+ /* Set GPU display which takes care of drawing the render result. */
+ void set_gpu_display(unique_ptr<GPUDisplay> gpu_display);
+
+ /* Clear the GPU display by filling it in with all zeroes. */
+ void clear_gpu_display();
+
+ /* Perform drawing of the current state of the GPUDisplay. */
+ void draw();
+
+ /* Cancel the rendering process as soon as possible, without waiting for the full tile to be
+ * sampled. Used in cases like a reset of the render session.
+ *
+ * This is a blocking call, which returns as soon as there is no running `render_samples()` call.
+ */
+ void cancel();
+
+ /* Copy an entire render buffer to/from the path trace. */
+
+ /* Copy happens via CPU side buffer: data will be copied from every device of the path trace, and
+ * the data will be copied to the device of the given render buffers. */
+ void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+ /* Copy happens via CPU side buffer: data will be copied from the device of the given render
+ * buffers and will be copied to all devices of the path trace. */
+ void copy_from_render_buffers(RenderBuffers *render_buffers);
+
+ /* Copy render buffers of the big tile from the device to host.
+ * Return true if all copies are successful. */
+ bool copy_render_tile_from_device();
+
+ /* Read the given full-frame file from disk, perform the needed processing and write it to the software
+ * via the write callback. */
+ void process_full_buffer_from_disk(string_view filename);
+
+ /* Get number of samples in the current big tile render buffers. */
+ int get_num_render_tile_samples() const;
+
+ /* Get pass data of the entire big tile.
+ * This call puts pass render result from all devices into the final pixels storage.
+ *
+ * NOTE: Expects buffers to be copied to the host using `copy_render_tile_from_device()`.
+ *
+ * Returns false if any of the accessor's `get_render_tile_pixels()` returned false. */
+ bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination);
+
+ /* Set pass data for baking. */
+ bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+ /* Check whether denoiser was run and denoised passes are available. */
+ bool has_denoised_result() const;
+
+ /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile.
+ * In the case of tiled rendering this will return the full frame size after all tiles have been rendered.
+ *
+ * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame
+ * instead. */
+ int2 get_render_tile_size() const;
+ int2 get_render_tile_offset() const;
+
+ /* Get buffer parameters of the current tile.
+ *
+ * NOTE: If the full-frame buffer processing is in progress, returns parameters of the full-frame
+ * instead. */
+ const BufferParams &get_render_tile_params() const;
+
+ /* Generate full multi-line report of the rendering process, including rendering parameters,
+ * times, and so on. */
+ string full_report() const;
+
+ /* Callback which communicates an updated state of the render buffer of the current big tile.
+ * Is called during path tracing to communicate work-in-progress state of the final buffer. */
+ function<void(void)> tile_buffer_update_cb;
+
+ /* Callback which communicates final rendered buffer. Is called after path-tracing is done. */
+ function<void(void)> tile_buffer_write_cb;
+
+ /* Callback which initializes rendered buffer. Is called before path-tracing starts.
+ *
+ * This is used for baking. */
+ function<bool(void)> tile_buffer_read_cb;
+
+ /* Callback which is called to report current rendering progress.
+ *
+ * It is supposed to be cheaper than buffer update/write, hence can be called more often.
+ * Additionally, it might be called from the middle of a wavefront (meaning, it is not guaranteed
+ * that the buffer is "uniformly" sampled at the moment of this callback). */
+ function<void(void)> progress_update_cb;
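+
+ /* A minimal wiring sketch for the callbacks above (hypothetical host code; the names
+ * `session_update_tile`, `session_write_tile` and `session_update_status` are
+ * assumptions for illustration only):
+ *
+ *   path_trace.tile_buffer_update_cb = [&]() { session_update_tile(); };
+ *   path_trace.tile_buffer_write_cb = [&]() { session_write_tile(); };
+ *   path_trace.progress_update_cb = [&]() { session_update_status(); };
+ */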
+
+ protected:
+ /* Actual implementation of the rendering pipeline.
+ * Calls steps in order, checking for the cancel to be requested in between.
+ *
+ * Is separate from `render()` to simplify dealing with the early returns and keeping
+ * `render_cancel_` in the consistent state. */
+ void render_pipeline(RenderWork render_work);
+
+ /* Initialize kernel execution on all integrator queues. */
+ void render_init_kernel_execution();
+
+ /* Make sure both allocated and effective buffer parameters of path tracer works are up to date
+ * with the current big tile parameters, performance-dependent slicing, and resolution divider.
+ */
+ void update_work_buffer_params_if_needed(const RenderWork &render_work);
+ void update_allocated_work_buffer_params();
+ void update_effective_work_buffer_params(const RenderWork &render_work);
+
+ /* Perform various steps of the render work.
+ *
+ * Note that some steps might modify the work, forcing some steps to happen within this iteration
+ * of rendering. */
+ void init_render_buffers(const RenderWork &render_work);
+ void path_trace(RenderWork &render_work);
+ void adaptive_sample(RenderWork &render_work);
+ void denoise(const RenderWork &render_work);
+ void cryptomatte_postprocess(const RenderWork &render_work);
+ void update_display(const RenderWork &render_work);
+ void rebalance(const RenderWork &render_work);
+ void write_tile_buffer(const RenderWork &render_work);
+ void finalize_full_buffer_on_disk(const RenderWork &render_work);
+
+ /* Get number of samples in the current state of the render buffers. */
+ int get_num_samples_in_buffer();
+
+ /* Check whether user requested to cancel rendering, so that path tracing is to be finished as
+ * soon as possible. */
+ bool is_cancel_requested();
+
+ /* Write the big tile render buffer via the write callback. */
+ void tile_buffer_write();
+
+ /* Read the big tile render buffer via the read callback. */
+ void tile_buffer_read();
+
+ /* Write current tile into the file on disk. */
+ void tile_buffer_write_to_disk();
+
+ /* Run the progress_update_cb callback if it is needed. */
+ void progress_update_if_needed(const RenderWork &render_work);
+
+ void progress_set_status(const string &status, const string &substatus = "");
+
+ /* Pointer to a device which is configured to be used for path tracing. If multiple devices
+ * are configured this is a `MultiDevice`. */
+ Device *device_ = nullptr;
+
+ /* CPU device for creating temporary render buffers on the CPU side. */
+ unique_ptr<Device> cpu_device_;
+
+ DeviceScene *device_scene_;
+
+ RenderScheduler &render_scheduler_;
+ TileManager &tile_manager_;
+
+ unique_ptr<GPUDisplay> gpu_display_;
+
+ /* Per-compute device descriptors of work which is responsible for path tracing on its configured
+ * device. */
+ vector<unique_ptr<PathTraceWork>> path_trace_works_;
+
+ /* Per-path trace work information needed for multi-device balancing. */
+ vector<WorkBalanceInfo> work_balance_infos_;
+
+ /* Render buffer parameters of the full frame and current big tile. */
+ BufferParams full_params_;
+ BufferParams big_tile_params_;
+
+ /* Denoiser which takes care of denoising the big tile. */
+ unique_ptr<Denoiser> denoiser_;
+
+ /* State which is common for all the steps of the render work.
+ * Is brought up to date in the `render()` call and is accessed from all the steps involved in
+ * rendering the work. */
+ struct {
+ /* Denotes whether render buffers parameters of path trace works are to be reset for the new
+ * value of the big tile parameters. */
+ bool need_reset_params = false;
+
+ /* Divider of the resolution for faster previews.
+ *
+ * Allows re-using the same render buffer, but with fewer pixels rendered into it. The way to
+ * think of the render buffer in this case is as an over-allocated array: the resolution divider
+ * affects both the resolution and the stride as visible by the integrator kernels. */
+ int resolution_divider = 0;
+
+ /* Parameters of the big tile with the current resolution divider applied. */
+ BufferParams effective_big_tile_params;
+
+ /* Denoiser was run and there are denoised versions of the passes in the render buffers. */
+ bool has_denoised_result = false;
+
+ /* Current tile has been written (either to disk or via the callback).
+ * Indicates that no more work will be done on this tile. */
+ bool tile_written = false;
+ } render_state_;
+
+ /* Progress object which is used to communicate sample progress. */
+ Progress *progress_;
+
+ /* Fields required for canceling render on demand, as quickly as possible. */
+ struct {
+ /* Indicates whether there is an on-going `render_samples()` call. */
+ bool is_rendering = false;
+
+ /* Indicates whether rendering is requested to be canceled by `cancel()`. */
+ bool is_requested = false;
+
+ /* Synchronization between thread which does `render_samples()` and thread which does
+ * `cancel()`. */
+ thread_mutex mutex;
+ thread_condition_variable condition;
+ } render_cancel_;
+
+ /* Indicates whether a render result was drawn after the latest session reset.
+ * Used by `ready_to_reset()` to implement logic which feels the most interactive. */
+ bool did_draw_after_reset_ = true;
+
+ /* State of the full frame processing and writing to the software. */
+ struct {
+ RenderBuffers *render_buffers = nullptr;
+ } full_frame_state_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp
new file mode 100644
index 00000000000..d9634acac10
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/device.h"
+
+#include "integrator/path_trace_work.h"
+#include "integrator/path_trace_work_cpu.h"
+#include "integrator/path_trace_work_gpu.h"
+#include "render/buffers.h"
+#include "render/film.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+unique_ptr<PathTraceWork> PathTraceWork::create(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+{
+ if (device->info.type == DEVICE_CPU) {
+ return make_unique<PathTraceWorkCPU>(device, film, device_scene, cancel_requested_flag);
+ }
+
+ return make_unique<PathTraceWorkGPU>(device, film, device_scene, cancel_requested_flag);
+}
+
+PathTraceWork::PathTraceWork(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : device_(device),
+ film_(film),
+ device_scene_(device_scene),
+ buffers_(make_unique<RenderBuffers>(device)),
+ effective_buffer_params_(buffers_->params),
+ cancel_requested_flag_(cancel_requested_flag)
+{
+}
+
+PathTraceWork::~PathTraceWork()
+{
+}
+
+RenderBuffers *PathTraceWork::get_render_buffers()
+{
+ return buffers_.get();
+}
+
+void PathTraceWork::set_effective_buffer_params(const BufferParams &effective_full_params,
+ const BufferParams &effective_big_tile_params,
+ const BufferParams &effective_buffer_params)
+{
+ effective_full_params_ = effective_full_params;
+ effective_big_tile_params_ = effective_big_tile_params;
+ effective_buffer_params_ = effective_buffer_params;
+}
+
+bool PathTraceWork::has_multiple_works() const
+{
+ /* Assume that when multiple works are rendering the same big tile, none of them covers the
+ * entire big tile. */
+ return !(effective_big_tile_params_.width == effective_buffer_params_.width &&
+ effective_big_tile_params_.height == effective_buffer_params_.height &&
+ effective_big_tile_params_.full_x == effective_buffer_params_.full_x &&
+ effective_big_tile_params_.full_y == effective_buffer_params_.full_y);
+}
+
+void PathTraceWork::copy_to_render_buffers(RenderBuffers *render_buffers)
+{
+ copy_render_buffers_from_device();
+
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t height = effective_buffer_params_.height;
+ const int64_t pass_stride = effective_buffer_params_.pass_stride;
+ const int64_t row_stride = width * pass_stride;
+ const int64_t data_size = row_stride * height * sizeof(float);
+
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset_in_floats = offset_y * row_stride;
+
+ const float *src = buffers_->buffer.data();
+ float *dst = render_buffers->buffer.data() + offset_in_floats;
+
+ memcpy(dst, src, data_size);
+}
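+
+/* A worked example of the offset arithmetic above (hypothetical numbers): a work slice which
+ * starts at full_y = 540 inside a big tile which starts at full_y = 0, with width = 1920 and
+ * pass_stride = 4, gives offset_in_floats = 540 * 1920 * 4, so the slice lands at its own rows
+ * within the big tile buffer. */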
+
+void PathTraceWork::copy_from_render_buffers(const RenderBuffers *render_buffers)
+{
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t height = effective_buffer_params_.height;
+ const int64_t pass_stride = effective_buffer_params_.pass_stride;
+ const int64_t row_stride = width * pass_stride;
+ const int64_t data_size = row_stride * height * sizeof(float);
+
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset_in_floats = offset_y * row_stride;
+
+ const float *src = render_buffers->buffer.data() + offset_in_floats;
+ float *dst = buffers_->buffer.data();
+
+ memcpy(dst, src, data_size);
+
+ copy_render_buffers_to_device();
+}
+
+void PathTraceWork::copy_from_denoised_render_buffers(const RenderBuffers *render_buffers)
+{
+ const int64_t width = effective_buffer_params_.width;
+ const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int64_t offset = offset_y * width;
+
+ render_buffers_host_copy_denoised(
+ buffers_.get(), effective_buffer_params_, render_buffers, effective_buffer_params_, offset);
+
+ copy_render_buffers_to_device();
+}
+
+bool PathTraceWork::get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination)
+{
+ const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int width = effective_buffer_params_.width;
+
+ PassAccessor::Destination slice_destination = destination;
+ slice_destination.offset += offset_y * width;
+
+ return pass_accessor.get_render_tile_pixels(buffers_.get(), slice_destination);
+}
+
+bool PathTraceWork::set_render_tile_pixels(PassAccessor &pass_accessor,
+ const PassAccessor::Source &source)
+{
+ const int offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
+ const int width = effective_buffer_params_.width;
+
+ PassAccessor::Source slice_source = source;
+ slice_source.offset += offset_y * width;
+
+ return pass_accessor.set_render_tile_pixels(buffers_.get(), slice_source);
+}
+
+PassAccessor::PassAccessInfo PathTraceWork::get_display_pass_access_info(PassMode pass_mode) const
+{
+ const KernelFilm &kfilm = device_scene_->data.film;
+ const KernelBackground &kbackground = device_scene_->data.background;
+
+ const BufferParams &params = buffers_->params;
+
+ const BufferPass *display_pass = params.get_actual_display_pass(film_->get_display_pass());
+
+ PassAccessor::PassAccessInfo pass_access_info;
+ pass_access_info.type = display_pass->type;
+ pass_access_info.offset = PASS_UNUSED;
+
+ if (pass_mode == PassMode::DENOISED) {
+ pass_access_info.mode = PassMode::DENOISED;
+ pass_access_info.offset = params.get_pass_offset(pass_access_info.type, PassMode::DENOISED);
+ }
+
+ if (pass_access_info.offset == PASS_UNUSED) {
+ pass_access_info.mode = PassMode::NOISY;
+ pass_access_info.offset = params.get_pass_offset(pass_access_info.type);
+ }
+
+ pass_access_info.use_approximate_shadow_catcher = kfilm.use_approximate_shadow_catcher;
+ pass_access_info.use_approximate_shadow_catcher_background =
+ kfilm.use_approximate_shadow_catcher && !kbackground.transparent;
+
+ return pass_access_info;
+}
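+
+/* Note on the fall-back above: requesting `PassMode::DENOISED` for a pass which has no denoised
+ * variant in the buffer leaves the offset at `PASS_UNUSED`, so the accessor silently falls back
+ * to the noisy offset and callers never need to special-case this. */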
+
+PassAccessor::Destination PathTraceWork::get_gpu_display_destination_template(
+ const GPUDisplay *gpu_display) const
+{
+ PassAccessor::Destination destination(film_->get_display_pass());
+
+ const int2 display_texture_size = gpu_display->get_texture_size();
+ const int texture_x = effective_buffer_params_.full_x - effective_full_params_.full_x;
+ const int texture_y = effective_buffer_params_.full_y - effective_full_params_.full_y;
+
+ destination.offset = texture_y * display_texture_size.x + texture_x;
+ destination.stride = display_texture_size.x;
+
+ return destination;
+}
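+
+/* A worked example of the destination template (hypothetical layout): a work slice at
+ * full_x = 0, full_y = 540 inside a 1920x1080 display texture gets offset = 540 * 1920 and
+ * stride = 1920, so row y of the slice writes into texture row 540 + y. */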
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
new file mode 100644
index 00000000000..8c9c8811199
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/pass_accessor.h"
+#include "render/buffers.h"
+#include "render/pass.h"
+#include "util/util_types.h"
+#include "util/util_unique_ptr.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+class Device;
+class DeviceScene;
+class Film;
+class GPUDisplay;
+class RenderBuffers;
+
+class PathTraceWork {
+ public:
+ struct RenderStatistics {
+ float occupancy = 1.0f;
+ };
+
+ /* Create the path trace work which best fits the device.
+ *
+ * The cancel request flag is used for a cheap check whether cancel is to be performed as soon as
+ * possible. This could be, for example, a request to cancel rendering during camera navigation
+ * in the viewport. */
+ static unique_ptr<PathTraceWork> create(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual ~PathTraceWork();
+
+ /* Access the render buffers.
+ *
+ * Only supposed to be used by the PathTrace to update the buffer allocation and slicing to
+ * correspond to the big tile size and the relative device performance. */
+ RenderBuffers *get_render_buffers();
+
+ /* Set effective parameters of the big tile and the work itself. */
+ void set_effective_buffer_params(const BufferParams &effective_full_params,
+ const BufferParams &effective_big_tile_params,
+ const BufferParams &effective_buffer_params);
+
+ /* Check whether the big tile is being worked on by multiple path trace works. */
+ bool has_multiple_works() const;
+
+ /* Allocate working memory for execution. Must be called before init_execution(). */
+ virtual void alloc_work_memory(){};
+
+ /* Initialize execution of kernels.
+ * Will ensure that all device queues are initialized for execution.
+ *
+ * This method is to be called after any change in the scene. It does not need to be called
+ * prior to every `render_samples()` call. */
+ virtual void init_execution() = 0;
+
+ /* Render given number of samples as a synchronous blocking call.
+ * The samples are added to the render buffer associated with this work. */
+ virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;
+
+ /* Copy render result from this work to the corresponding place of the GPU display.
+ *
+ * The `pass_mode` indicates whether to access the denoised or the noisy version of the display
+ * pass. The noisy pass mode will be passed here when it is known that the buffer does not have
+ * denoised passes yet (because the denoiser did not run). If the denoised pass is requested and
+ * the denoiser is not used, then this function will fall back to the noisy pass instead. */
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) = 0;
+
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) = 0;
+
+ /* Copy data from/to the given render buffers.
+ * Will copy pixels from the corresponding place (from the multi-device point of view) of the
+ * given render buffers, and copy the work's render buffers to the corresponding place of the
+ * destination. */
+
+ /* Notes:
+ * - Copies the work's render buffer from the device.
+ * - Copies into the CPU-side buffer of the given render buffers.
+ * - Does not copy the given buffer to its device. */
+ void copy_to_render_buffers(RenderBuffers *render_buffers);
+
+ /* Notes:
+ * - Does not copy the given render buffers from the device.
+ * - Copies the work's render buffer to its device. */
+ void copy_from_render_buffers(const RenderBuffers *render_buffers);
+
+ /* Special version of `copy_from_render_buffers()` which only copies the denoised passes from
+ * the given render buffers, leaving the rest of the passes as they are.
+ *
+ * The same notes about device copying apply to this call as well. */
+ void copy_from_denoised_render_buffers(const RenderBuffers *render_buffers);
+
+ /* Copy render buffers to/from the device using an appropriate device queue when needed, so
+ * that things are executed in order with `render_samples()`. */
+ virtual bool copy_render_buffers_from_device() = 0;
+ virtual bool copy_render_buffers_to_device() = 0;
+
+ /* Zero the render buffers on the device using an appropriate device queue when needed, so that
+ * things are executed in order with `render_samples()`. */
+ virtual bool zero_render_buffers() = 0;
+
+ /* Access pixels rendered by this work and copy them to the corresponding location in the
+ * destination.
+ *
+ * NOTE: Does not copy buffers from the device. Use `copy_render_buffers_from_device()` to
+ * update the host-side data. */
+ bool get_render_tile_pixels(const PassAccessor &pass_accessor,
+ const PassAccessor::Destination &destination);
+
+ /* Set pass data for baking. */
+ bool set_render_tile_pixels(PassAccessor &pass_accessor, const PassAccessor::Source &source);
+
+ /* Perform convergence test on the render buffer, and filter the convergence mask.
+ * Returns number of active pixels (the ones which did not converge yet). */
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) = 0;
+
+ /* Run cryptomatte pass post-processing kernels. */
+ virtual void cryptomatte_postproces() = 0;
+
+ /* Cheap-ish check whether cancellation is requested and rendering is to be stopped as soon as
+ * possible, without waiting for any samples to be finished. */
+ inline bool is_cancel_requested() const
+ {
+ /* NOTE: Relies on the fact that on x86 CPUs reading a scalar can happen without atomics even
+ * in a threaded environment. */
+ return *cancel_requested_flag_;
+ }
+
+ /* Access the device which this work is path traced on. */
+ Device *get_device() const
+ {
+ return device_;
+ }
+
+ protected:
+ PathTraceWork(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ PassAccessor::PassAccessInfo get_display_pass_access_info(PassMode pass_mode) const;
+
+ /* Get a destination whose offset and stride are configured so that writing to it will write to
+ * the proper location of the GPU display texture, taking the current tile and device slice into
+ * account. */
+ PassAccessor::Destination get_gpu_display_destination_template(
+ const GPUDisplay *gpu_display) const;
+
+ /* Device which will be used for path tracing.
+ * Note that it is an actual render device (never a multi-device). */
+ Device *device_;
+
+ /* Film is used to access display pass configuration for GPU display update.
+ * Note that only fields which are not a part of kernel data can be accessed via the Film. */
+ Film *film_;
+
+ /* Device-side scene storage which may be used by the integrator logic. */
+ DeviceScene *device_scene_;
+
+ /* Render buffers into which sampling is accumulated, allocated for the fraction of the big
+ * tile which is being rendered by this work.
+ * This also defines the possible subset of the big tile in the case of multi-device rendering. */
+ unique_ptr<RenderBuffers> buffers_;
+
+ /* Effective parameters of the full frame, the big tile, and the current work's render buffer.
+ * The latter might be different from `buffers_->params` when there is a resolution divider
+ * involved. */
+ BufferParams effective_full_params_;
+ BufferParams effective_big_tile_params_;
+ BufferParams effective_buffer_params_;
+
+ bool *cancel_requested_flag_ = nullptr;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
new file mode 100644
index 00000000000..b9a33b64051
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -0,0 +1,281 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_cpu.h"
+
+#include "device/cpu/kernel.h"
+#include "device/device.h"
+
+#include "integrator/pass_accessor_cpu.h"
+
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Create TBB arena for execution of path tracing and rendering tasks. */
+static inline tbb::task_arena local_tbb_arena_create(const Device *device)
+{
+ /* TODO: limit this to number of threads of CPU device, it may be smaller than
+ * the system number of threads when we reduce the number of CPU threads in
+ * CPU + GPU rendering to dedicate some cores to handling the GPU device. */
+ return tbb::task_arena(device->info.cpu_threads);
+}
+
+/* Get CPUKernelThreadGlobals for the current thread. */
+static inline CPUKernelThreadGlobals *kernel_thread_globals_get(
+ vector<CPUKernelThreadGlobals> &kernel_thread_globals)
+{
+ const int thread_index = tbb::this_task_arena::current_thread_index();
+ DCHECK_GE(thread_index, 0);
+ /* The index is used to subscript the vector, so it must be strictly smaller than its size. */
+ DCHECK_LT(thread_index, kernel_thread_globals.size());
+
+ return &kernel_thread_globals[thread_index];
+}
+
+PathTraceWorkCPU::PathTraceWorkCPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+ kernels_(*(device->get_cpu_kernels()))
+{
+ DCHECK_EQ(device->info.type, DEVICE_CPU);
+}
+
+void PathTraceWorkCPU::init_execution()
+{
+ /* Cache per-thread kernel globals. */
+ device_->get_cpu_kernel_thread_globals(kernel_thread_globals_);
+}
+
+void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num)
+{
+ const int64_t image_width = effective_buffer_params_.width;
+ const int64_t image_height = effective_buffer_params_.height;
+ const int64_t total_pixels_num = image_width * image_height;
+
+ for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+ kernel_globals.start_profiling();
+ }
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+ local_arena.execute([&]() {
+ tbb::parallel_for(int64_t(0), total_pixels_num, [&](int64_t work_index) {
+ if (is_cancel_requested()) {
+ return;
+ }
+
+ const int y = work_index / image_width;
+ const int x = work_index - y * image_width;
+
+ KernelWorkTile work_tile;
+ work_tile.x = effective_buffer_params_.full_x + x;
+ work_tile.y = effective_buffer_params_.full_y + y;
+ work_tile.w = 1;
+ work_tile.h = 1;
+ work_tile.start_sample = start_sample;
+ work_tile.num_samples = 1;
+ work_tile.offset = effective_buffer_params_.offset;
+ work_tile.stride = effective_buffer_params_.stride;
+
+ CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);
+
+ render_samples_full_pipeline(kernel_globals, work_tile, samples_num);
+ });
+ });
+
+ for (CPUKernelThreadGlobals &kernel_globals : kernel_thread_globals_) {
+ kernel_globals.stop_profiling();
+ }
+
+ statistics.occupancy = 1.0f;
+}
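+
+/* Scheduling sketch (hypothetical sizes): every pixel of the effective buffer becomes one TBB
+ * work item which renders all `samples_num` samples for that pixel, so a 960x540 preview pass
+ * spawns 960 * 540 = 518400 work items. */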
+
+void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_globals,
+ const KernelWorkTile &work_tile,
+ const int samples_num)
+{
+ const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher;
+ const bool has_bake = device_scene_->data.bake.use;
+
+ IntegratorStateCPU integrator_states[2] = {};
+
+ IntegratorStateCPU *state = &integrator_states[0];
+ IntegratorStateCPU *shadow_catcher_state = &integrator_states[1];
+
+ KernelWorkTile sample_work_tile = work_tile;
+ float *render_buffer = buffers_->buffer.data();
+
+ for (int sample = 0; sample < samples_num; ++sample) {
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ if (has_bake) {
+ if (!kernels_.integrator_init_from_bake(
+ kernel_globals, state, &sample_work_tile, render_buffer)) {
+ break;
+ }
+ }
+ else {
+ if (!kernels_.integrator_init_from_camera(
+ kernel_globals, state, &sample_work_tile, render_buffer)) {
+ break;
+ }
+ }
+
+ kernels_.integrator_megakernel(kernel_globals, state, render_buffer);
+
+ if (has_shadow_catcher) {
+ kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
+ }
+
+ ++sample_work_tile.start_sample;
+ }
+}
+
+void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ half4 *rgba_half = gpu_display->map_texture_buffer();
+ if (!rgba_half) {
+ /* TODO(sergey): Look into using copy_to_gpu_display() if mapping failed. Might be needed for
+ * some implementations of GPUDisplay which cannot map memory? */
+ return;
+ }
+
+ const KernelFilm &kfilm = device_scene_->data.film;
+
+ const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+
+ const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples);
+
+ PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+ destination.pixels_half_rgba = rgba_half;
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+ local_arena.execute([&]() {
+ pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+ });
+
+ gpu_display->unmap_texture_buffer();
+}
+
+void PathTraceWorkCPU::destroy_gpu_resources(GPUDisplay * /*gpu_display*/)
+{
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_from_device()
+{
+ return buffers_->copy_from_device();
+}
+
+bool PathTraceWorkCPU::copy_render_buffers_to_device()
+{
+ buffers_->buffer.copy_to_device();
+ return true;
+}
+
+bool PathTraceWorkCPU::zero_render_buffers()
+{
+ buffers_->zero();
+ return true;
+}
+
+int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+ const int full_x = effective_buffer_params_.full_x;
+ const int full_y = effective_buffer_params_.full_y;
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+ const int offset = effective_buffer_params_.offset;
+ const int stride = effective_buffer_params_.stride;
+
+ float *render_buffer = buffers_->buffer.data();
+
+ uint num_active_pixels = 0;
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+ /* Check convergence and do the x-filter in a single `parallel_for`, to reduce threading
+ * overhead. */
+ local_arena.execute([&]() {
+ tbb::parallel_for(full_y, full_y + height, [&](int y) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+
+ bool row_converged = true;
+ uint num_row_pixels_active = 0;
+ for (int x = 0; x < width; ++x) {
+ if (!kernels_.adaptive_sampling_convergence_check(
+ kernel_globals, render_buffer, full_x + x, y, threshold, reset, offset, stride)) {
+ ++num_row_pixels_active;
+ row_converged = false;
+ }
+ }
+
+ atomic_fetch_and_add_uint32(&num_active_pixels, num_row_pixels_active);
+
+ if (!row_converged) {
+ kernels_.adaptive_sampling_filter_x(
+ kernel_globals, render_buffer, y, full_x, width, offset, stride);
+ }
+ });
+ });
+
+ if (num_active_pixels) {
+ local_arena.execute([&]() {
+ tbb::parallel_for(full_x, full_x + width, [&](int x) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+ kernels_.adaptive_sampling_filter_y(
+ kernel_globals, render_buffer, x, full_y, height, offset, stride);
+ });
+ });
+ }
+
+ return num_active_pixels;
+}
+
+void PathTraceWorkCPU::cryptomatte_postproces()
+{
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+
+ float *render_buffer = buffers_->buffer.data();
+
+ tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+ /* Process all pixels, with one row per `parallel_for` work item, to reduce threading
+ * overhead. */
+ local_arena.execute([&]() {
+ tbb::parallel_for(0, height, [&](int y) {
+ CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+ int pixel_index = y * width;
+
+ for (int x = 0; x < width; ++x, ++pixel_index) {
+ kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index);
+ }
+ });
+ });
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
new file mode 100644
index 00000000000..ab729bbf879
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/cpu/kernel_thread_globals.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+struct KernelGlobals;
+
+class CPUKernels;
+
+/* Implementation of PathTraceWork which schedules work onto queues pixel-by-pixel,
+ * for CPU devices.
+ *
+ * NOTE: For CPU rendering there are assumptions about the TBB arena size and the number of
+ * concurrent queues on the render device which make this work only usable on the CPU. */
+class PathTraceWorkCPU : public PathTraceWork {
+ public:
+ PathTraceWorkCPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual void init_execution() override;
+
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
+
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) override;
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+ virtual bool copy_render_buffers_from_device() override;
+ virtual bool copy_render_buffers_to_device() override;
+ virtual bool zero_render_buffers() override;
+
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+ virtual void cryptomatte_postproces() override;
+
+ protected:
+ /* Core path tracing routine. Renders the given work tile. */
+ void render_samples_full_pipeline(KernelGlobals *kernel_globals,
+ const KernelWorkTile &work_tile,
+ const int samples_num);
+
+ /* CPU kernels. */
+ const CPUKernels &kernels_;
+
+ /* Copy of kernel globals which is suitable for concurrent access from multiple threads.
+ *
+ * More specifically, the `kernel_thread_globals_` is local to each thread and nobody else is
+ * accessing it, but some "localization" is required to decouple it from the kernel globals
+ * stored at the device level. */
+ vector<CPUKernelThreadGlobals> kernel_thread_globals_;
+};
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
new file mode 100644
index 00000000000..135466becc6
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -0,0 +1,933 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/path_trace_work_gpu.h"
+
+#include "device/device.h"
+
+#include "integrator/pass_accessor_gpu.h"
+#include "render/buffers.h"
+#include "render/gpu_display.h"
+#include "render/scene.h"
+#include "util/util_logging.h"
+#include "util/util_tbb.h"
+#include "util/util_time.h"
+
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag)
+ : PathTraceWork(device, film, device_scene, cancel_requested_flag),
+ queue_(device->gpu_queue_create()),
+ integrator_state_soa_kernel_features_(0),
+ integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
+ integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
+ integrator_shader_raytrace_sort_counter_(
+ device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
+ integrator_next_shadow_catcher_path_index_(
+ device, "integrator_next_shadow_catcher_path_index", MEM_READ_WRITE),
+ queued_paths_(device, "queued_paths", MEM_READ_WRITE),
+ num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
+ work_tiles_(device, "work_tiles", MEM_READ_WRITE),
+ gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
+ max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorStateCPU))),
+ min_num_active_paths_(queue_->num_concurrent_busy_states()),
+ max_active_path_index_(0)
+{
+ memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
+
+ /* Limit the number of active paths to half of the overall state. This is due to the logic in
+ * the path compaction which relies on the fact that regeneration does not happen before half of
+ * the states are available again. */
+ min_num_active_paths_ = min(min_num_active_paths_, max_num_paths_ / 2);
+}
+
+void PathTraceWorkGPU::alloc_integrator_soa()
+{
+ /* IntegrateState allocated as structure of arrays. */
+
+ /* Check if we already allocated memory for the required features. */
+ const uint kernel_features = device_scene_->data.kernel_features;
+ if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) {
+ return;
+ }
+ integrator_state_soa_kernel_features_ = kernel_features;
+
+ /* Allocate a device-only memory buffer for each struct member, and then write the pointers
+ * into a struct that resides in constant memory.
+ *
+ * TODO: store float3 in separate XYZ arrays. */
+#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
+ device_only_memory<type> *array = new device_only_memory<type>(device_, \
+ "integrator_state_" #name); \
+ array->alloc_to_device(max_num_paths_); \
+ integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
+ }
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && \
+ (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \
+ device_only_memory<type> *array = new device_only_memory<type>(device_, \
+ "integrator_state_" #name); \
+ array->alloc_to_device(max_num_paths_); \
+ integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
+ }
+#define KERNEL_STRUCT_END(name) \
+ break; \
+ }
+#define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+ if (array_index == array_size - 1) { \
+ break; \
+ } \
+ }
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+}
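+
+/* For reference, a sketch of what the X-macro expansion above produces for a single hypothetical
+ * member `path.flag` of type uint32_t (the member and feature names are illustrative, not taken
+ * from the state template):
+ *
+ *   if ((kernel_features & feature) && (integrator_state_gpu_.path.flag == nullptr)) {
+ *     device_only_memory<uint32_t> *array = new device_only_memory<uint32_t>(
+ *         device_, "integrator_state_flag");
+ *     array->alloc_to_device(max_num_paths_);
+ *     integrator_state_soa_.emplace_back(array);
+ *     integrator_state_gpu_.path.flag = (uint32_t *)array->device_pointer;
+ *   }
+ */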
+
+void PathTraceWorkGPU::alloc_integrator_queue()
+{
+ if (integrator_queue_counter_.size() == 0) {
+ integrator_queue_counter_.alloc(1);
+ integrator_queue_counter_.zero_to_device();
+ integrator_queue_counter_.copy_from_device();
+ integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *)
+ integrator_queue_counter_.device_pointer;
+ }
+
+ /* Allocate data for active path index arrays. */
+ if (num_queued_paths_.size() == 0) {
+ num_queued_paths_.alloc(1);
+ num_queued_paths_.zero_to_device();
+ }
+
+ if (queued_paths_.size() == 0) {
+ queued_paths_.alloc(max_num_paths_);
+ /* TODO: this could be skipped if we had a function to just allocate on the device. */
+ queued_paths_.zero_to_device();
+ }
+}
+
+void PathTraceWorkGPU::alloc_integrator_sorting()
+{
+ /* Allocate arrays for shader sorting. */
+ const int max_shaders = device_scene_->data.max_shaders;
+ if (integrator_shader_sort_counter_.size() < max_shaders) {
+ integrator_shader_sort_counter_.alloc(max_shaders);
+ integrator_shader_sort_counter_.zero_to_device();
+
+ integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
+ integrator_shader_raytrace_sort_counter_.zero_to_device();
+
+ integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+ (int *)integrator_shader_sort_counter_.device_pointer;
+ integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+ (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+ }
+}
+
+void PathTraceWorkGPU::alloc_integrator_path_split()
+{
+ if (integrator_next_shadow_catcher_path_index_.size() != 0) {
+ return;
+ }
+
+ integrator_next_shadow_catcher_path_index_.alloc(1);
+ /* TODO(sergey): Use queue? */
+ integrator_next_shadow_catcher_path_index_.zero_to_device();
+
+ integrator_state_gpu_.next_shadow_catcher_path_index =
+ (int *)integrator_next_shadow_catcher_path_index_.device_pointer;
+}
+
+void PathTraceWorkGPU::alloc_work_memory()
+{
+ alloc_integrator_soa();
+ alloc_integrator_queue();
+ alloc_integrator_sorting();
+ alloc_integrator_path_split();
+}
+
+void PathTraceWorkGPU::init_execution()
+{
+ queue_->init_execution();
+
+ /* Copy to device side struct in constant memory. */
+ device_->const_copy_to(
+ "__integrator_state", &integrator_state_gpu_, sizeof(integrator_state_gpu_));
+}
+
+void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num)
+{
+ /* Limit the number of states for the tile and rely on greedy scheduling of tiles. This allows
+ * more work to be added (because tiles are smaller, so there is a higher chance that more paths
+ * will become busy after adding new tiles). This is especially important for the shadow
+ * catcher, which schedules work in halves of the available number of paths. */
+ work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
+
+ work_tile_scheduler_.reset(effective_buffer_params_, start_sample, samples_num);
+
+ enqueue_reset();
+
+ int num_iterations = 0;
+ uint64_t num_busy_accum = 0;
+
+ /* TODO: set a hard limit in case of undetected kernel failures? */
+ while (true) {
+ /* Enqueue work from the scheduler, on start or when there are not enough
+ * paths to keep the device occupied. */
+ /* Initialized to false, since `enqueue_work_tiles()` may return early without writing it. */
+ bool finished = false;
+ if (enqueue_work_tiles(finished)) {
+ /* Copy stats from the device. */
+ queue_->copy_from_device(integrator_queue_counter_);
+
+ if (!queue_->synchronize()) {
+ break; /* Stop on error. */
+ }
+ }
+
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ /* Stop if no more work remaining. */
+ if (finished) {
+ break;
+ }
+
+ /* Enqueue one of the path iteration kernels. */
+ if (enqueue_path_iteration()) {
+ /* Copy stats from the device. */
+ queue_->copy_from_device(integrator_queue_counter_);
+
+ if (!queue_->synchronize()) {
+ break; /* Stop on error. */
+ }
+ }
+
+ if (is_cancel_requested()) {
+ break;
+ }
+
+ num_busy_accum += get_num_active_paths();
+ ++num_iterations;
+ }
+
+ statistics.occupancy = static_cast<float>(num_busy_accum) / num_iterations / max_num_paths_;
+}
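+
+/* Occupancy example (hypothetical numbers): with max_num_paths_ = 1000000 and
+ * num_busy_accum = 2500000 accumulated over num_iterations = 10, the average of 250000 busy
+ * states gives an occupancy of 250000 / 1000000 = 0.25 for this work. */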
+
+DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
+{
+ const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int max_num_queued = 0;
+ DeviceKernel kernel = DEVICE_KERNEL_NUM;
+
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ if (queue_counter->num_queued[i] > max_num_queued) {
+ kernel = (DeviceKernel)i;
+ max_num_queued = queue_counter->num_queued[i];
+ }
+ }
+
+ return kernel;
+}
+
+void PathTraceWorkGPU::enqueue_reset()
+{
+ void *args[] = {&max_num_paths_};
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
+ queue_->zero_to_device(integrator_queue_counter_);
+ queue_->zero_to_device(integrator_shader_sort_counter_);
+ queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+
+ /* Enqueueing tiles needs to know the number of active paths, which is based on this counter.
+ * Zero the counter on the host side because `zero_to_device()` does not do it. */
+ if (integrator_queue_counter_.host_pointer) {
+ memset(integrator_queue_counter_.data(), 0, integrator_queue_counter_.memory_size());
+ }
+}
+
+bool PathTraceWorkGPU::enqueue_path_iteration()
+{
+ /* Count the number of paths currently queued across all kernels. */
+ const IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int num_active_paths = 0;
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ num_active_paths += queue_counter->num_queued[i];
+ }
+
+ if (num_active_paths == 0) {
+ return false;
+ }
+
+ /* Find kernel to execute, with max number of queued paths. */
+ const DeviceKernel kernel = get_most_queued_kernel();
+ if (kernel == DEVICE_KERNEL_NUM) {
+ return false;
+ }
+
+ /* Finish shadows before potentially adding more shadow rays. We can only
+ * store one shadow ray in the integrator state. */
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME) {
+ if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW]) {
+ enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ return true;
+ }
+ else if (queue_counter->num_queued[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW]) {
+ enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return true;
+ }
+ }
+
+ /* Schedule kernel with maximum number of queued items. */
+ enqueue_path_iteration(kernel);
+ return true;
+}
+
+void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
+{
+ void *d_path_index = nullptr;
+
+ /* Create array of path indices for which this kernel is queued to be executed. */
+ int work_size = max_active_path_index_;
+
+ IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+ int num_queued = queue_counter->num_queued[kernel];
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ /* Compute array of active paths, sorted by shader. */
+ work_size = num_queued;
+ d_path_index = (void *)queued_paths_.device_pointer;
+
+ compute_sorted_queued_paths(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel);
+ }
+ else if (num_queued < work_size) {
+ work_size = num_queued;
+ d_path_index = (void *)queued_paths_.device_pointer;
+
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+ /* Compute array of active shadow paths for specific kernel. */
+ compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel);
+ }
+ else {
+ /* Compute array of active paths for specific kernel. */
+ compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY, kernel);
+ }
+ }
+
+ DCHECK_LE(work_size, max_num_paths_);
+
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
+ /* Ray intersection kernels with integrator state. */
+ void *args[] = {&d_path_index, const_cast<int *>(&work_size)};
+
+ queue_->enqueue(kernel, work_size, args);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
+ /* Shading kernels with integrator state and render buffer. */
+ void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+ void *args[] = {&d_path_index, &d_render_buffer, const_cast<int *>(&work_size)};
+
+ queue_->enqueue(kernel, work_size, args);
+ break;
+ }
+
+ default:
+ LOG(FATAL) << "Unhandled kernel " << device_kernel_as_string(kernel)
+ << " used for path iteration, should never happen.";
+ break;
+ }
+}
+
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+{
+ int d_queued_kernel = queued_kernel;
+ void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
+ assert(d_counter != nullptr);
+
+ /* Compute prefix sum of number of active paths with each shader. */
+ {
+ const int work_size = 1;
+ int max_shaders = device_scene_->data.max_shaders;
+ void *args[] = {&d_counter, &max_shaders};
+ queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
+ }
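+
+ /* Worked example of the prefix sum (hypothetical counts): per-shader counts [3, 1, 2] become
+ * offsets [0, 3, 4], so the first queued path of shader 0 is written to index 0, of shader 1 to
+ * index 3, and of shader 2 to index 4, which yields a path index array sorted by shader. */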
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ /* Launch kernel to fill the active paths arrays. */
+ {
+ /* TODO: this could be smaller for terminated paths based on amount of work we want
+ * to schedule. */
+ const int work_size = max_active_path_index_;
+
+ void *d_queued_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size),
+ &d_queued_paths,
+ &d_num_queued_paths,
+ &d_counter,
+ &d_queued_kernel};
+
+ queue_->enqueue(kernel, work_size, args);
+ }
+
+ if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
+ queue_->zero_to_device(integrator_shader_sort_counter_);
+ }
+ else if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+ queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+ }
+ else {
+ assert(0);
+ }
+}
+
+void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
+{
+ int d_queued_kernel = queued_kernel;
+
+ /* Launch kernel to fill the active paths arrays. */
+ const int work_size = max_active_path_index_;
+ void *d_queued_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {
+ const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel};
+
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(kernel, work_size, args);
+}
+
+void PathTraceWorkGPU::compact_states(const int num_active_paths)
+{
+ if (num_active_paths == 0) {
+ max_active_path_index_ = 0;
+ }
+
+ /* Compact fragmented path states into the start of the array, moving any paths
+ * with index higher than the number of active paths into the gaps. */
+ if (max_active_path_index_ == num_active_paths) {
+ return;
+ }
+
+ void *d_compact_paths = (void *)queued_paths_.device_pointer;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+
+ /* Create array with terminated paths that we can write to. */
+ {
+ /* TODO: can the work size be reduced here? */
+ int offset = num_active_paths;
+ int work_size = num_active_paths;
+ void *args[] = {&work_size, &d_compact_paths, &d_num_queued_paths, &offset};
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, work_size, args);
+ }
+
+ /* Create array of paths that we need to compact, where the path index is bigger
+ * than the number of active paths. */
+ {
+ int work_size = max_active_path_index_;
+ void *args[] = {
+ &work_size, &d_compact_paths, &d_num_queued_paths, const_cast<int *>(&num_active_paths)};
+ queue_->zero_to_device(num_queued_paths_);
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY, work_size, args);
+ }
+
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ int num_compact_paths = num_queued_paths_.data()[0];
+
+ /* Move paths into gaps. */
+ if (num_compact_paths > 0) {
+ int work_size = num_compact_paths;
+ int active_states_offset = 0;
+ int terminated_states_offset = num_active_paths;
+ void *args[] = {
+ &d_compact_paths, &active_states_offset, &terminated_states_offset, &work_size};
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES, work_size, args);
+ }
+
+ queue_->synchronize();
+
+ /* Adjust max active path index now we know which part of the array is actually used. */
+ max_active_path_index_ = num_active_paths;
+}
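+
+/* Compaction example (hypothetical state): with num_active_paths = 4 and
+ * max_active_path_index_ = 7, active states at indices {1, 2, 5, 6} are compacted by moving
+ * {5, 6} into the terminated slots {0, 3}, after which indices [0, 4) are dense and
+ * max_active_path_index_ becomes 4. */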
+
+bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
+{
+ /* If there are existing paths, wait for them to reach the intersect-closest kernel, which will
+ * align the wavefront of the existing and newly added paths. */
+ /* TODO: Check whether counting new intersection kernels here will have a positive effect on
+ * the performance. */
+ const DeviceKernel kernel = get_most_queued_kernel();
+ if (kernel != DEVICE_KERNEL_NUM && kernel != DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST) {
+ return false;
+ }
+
+ int num_active_paths = get_num_active_paths();
+
+ /* Don't schedule more work if canceling. */
+ if (is_cancel_requested()) {
+ if (num_active_paths == 0) {
+ finished = true;
+ }
+ return false;
+ }
+
+ finished = false;
+
+ vector<KernelWorkTile> work_tiles;
+
+ int max_num_camera_paths = max_num_paths_;
+ int num_predicted_splits = 0;
+
+ if (has_shadow_catcher()) {
+ /* When there are shadow catchers in the scene, a bounce from them will split the state. So we
+ * make sure there is enough space in the path states array to fit the split states.
+ *
+ * Basically, when adding N new paths we ensure that there are 2*N available path states, so
+ * that all the new paths can be split.
+ *
+ * Note that it is possible that some of the current states can still split, so we need to make
+ * sure there is enough space for them as well. */
+
+ /* Number of currently in-flight states which can still split. */
+ const int num_scheduled_possible_split = shadow_catcher_count_possible_splits();
+
+ const int num_available_paths = max_num_paths_ - num_active_paths;
+ const int num_new_paths = num_available_paths / 2;
+ max_num_camera_paths = max(num_active_paths,
+ num_active_paths + num_new_paths - num_scheduled_possible_split);
+ num_predicted_splits += num_scheduled_possible_split + num_new_paths;
+ }
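+
+ /* Budget example (hypothetical numbers): with max_num_paths_ = 1000000,
+ * num_active_paths = 400000 and no in-flight splittable states, num_new_paths = 300000, so
+ * camera paths are admitted up to 700000 states, leaving 300000 free states to absorb a split
+ * of every new path. */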
+
+ /* Schedule when we're out of paths or there are too few paths to keep the
+ * device occupied. */
+ int num_paths = num_active_paths;
+ if (num_paths == 0 || num_paths < min_num_active_paths_) {
+ /* Get work tiles until the maximum number of paths is reached. */
+ while (num_paths < max_num_camera_paths) {
+ KernelWorkTile work_tile;
+ if (work_tile_scheduler_.get_work(&work_tile, max_num_camera_paths - num_paths)) {
+ work_tiles.push_back(work_tile);
+ num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* If we couldn't get any more tiles, we're done. */
+ if (work_tiles.size() == 0 && num_paths == 0) {
+ finished = true;
+ return false;
+ }
+ }
+
+ /* Initialize paths from work tiles. */
+ if (work_tiles.size() == 0) {
+ return false;
+ }
+
+ /* Compact state array when number of paths becomes small relative to the
+ * known maximum path index, which makes computing active index arrays slow. */
+ compact_states(num_active_paths);
+
+ if (has_shadow_catcher()) {
+ integrator_next_shadow_catcher_path_index_.data()[0] = num_paths;
+ queue_->copy_to_device(integrator_next_shadow_catcher_path_index_);
+ }
+
+ enqueue_work_tiles((device_scene_->data.bake.use) ? DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE :
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA,
+ work_tiles.data(),
+ work_tiles.size(),
+ num_active_paths,
+ num_predicted_splits);
+
+ return true;
+}
+
+void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
+ const KernelWorkTile work_tiles[],
+ const int num_work_tiles,
+ const int num_active_paths,
+ const int num_predicted_splits)
+{
+ /* Copy work tiles to device. */
+ if (work_tiles_.size() < num_work_tiles) {
+ work_tiles_.alloc(num_work_tiles);
+ }
+
+ int path_index_offset = num_active_paths;
+ int max_tile_work_size = 0;
+ for (int i = 0; i < num_work_tiles; i++) {
+ KernelWorkTile &work_tile = work_tiles_.data()[i];
+ work_tile = work_tiles[i];
+
+ const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+ work_tile.path_index_offset = path_index_offset;
+ work_tile.work_size = tile_work_size;
+
+ path_index_offset += tile_work_size;
+
+ max_tile_work_size = max(max_tile_work_size, tile_work_size);
+ }
+
+ queue_->copy_to_device(work_tiles_);
+
+ void *d_work_tiles = (void *)work_tiles_.device_pointer;
+ void *d_render_buffer = (void *)buffers_->buffer.device_pointer;
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles,
+ const_cast<int *>(&num_work_tiles),
+ &d_render_buffer,
+ const_cast<int *>(&max_tile_work_size)};
+
+ queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
+
+ max_active_path_index_ = path_index_offset + num_predicted_splits;
+}
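+
+/* Tile packing example (hypothetical tiles): two 64x64 tiles with one sample each, scheduled
+ * after 1000 active paths, get path_index_offset 1000 and 5096 respectively, and the kernel is
+ * launched with a work size of max_tile_work_size * num_work_tiles = 4096 * 2 = 8192. */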
+
+int PathTraceWorkGPU::get_num_active_paths()
+{
+ /* TODO: this is wrong, does not account for duplicates with shadow! */
+ IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
+
+ int num_paths = 0;
+ for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
+ DCHECK_GE(queue_counter->num_queued[i], 0)
+ << "Invalid number of queued states for kernel "
+ << device_kernel_as_string(static_cast<DeviceKernel>(i));
+ num_paths += queue_counter->num_queued[i];
+ }
+
+ return num_paths;
+}
+
+bool PathTraceWorkGPU::should_use_graphics_interop()
+{
+ /* There are a few complications with graphics interop when using multiple devices, caused by
+ * the fact that the GPUDisplay has a single texture:
+ *
+ * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when
+ * attempting to register an OpenGL PBO which has been mapped. Which makes sense, because
+ * otherwise one would run into a conflict about where the source of truth is. */
+ if (has_multiple_works()) {
+ return false;
+ }
+
+ if (!interop_use_checked_) {
+ Device *device = queue_->device;
+ interop_use_ = device->should_use_graphics_interop();
+
+ if (interop_use_) {
+ VLOG(2) << "Will be using graphics interop GPU display update.";
+ }
+ else {
+ VLOG(2) << "Will be using naive GPU display update.";
+ }
+
+ interop_use_checked_ = true;
+ }
+
+ return interop_use_;
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (device_->have_error()) {
+ /* Don't attempt to update the GPU display if the device has errors: the error state may lead
+ * to wrong decisions about interop, causing more chained bugs. */
+ return;
+ }
+
+ if (!buffers_->buffer.device_pointer) {
+ LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
+ return;
+ }
+
+ if (should_use_graphics_interop()) {
+ if (copy_to_gpu_display_interop(gpu_display, pass_mode, num_samples)) {
+ return;
+ }
+
+ /* If an error happens while trying to use graphics interop, fall back to the native
+ * implementation and don't attempt to use interop for further updates. */
+ interop_use_ = false;
+ }
+
+ copy_to_gpu_display_naive(gpu_display, pass_mode, num_samples);
+}
+
+void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const int full_x = effective_buffer_params_.full_x;
+ const int full_y = effective_buffer_params_.full_y;
+ const int width = effective_buffer_params_.width;
+ const int height = effective_buffer_params_.height;
+ const int final_width = buffers_->params.width;
+ const int final_height = buffers_->params.height;
+
+ const int texture_x = full_x - effective_full_params_.full_x;
+ const int texture_y = full_y - effective_full_params_.full_y;
+
+ /* Re-allocate display memory if needed, and make sure the device pointer is allocated.
+ *
+ * NOTE: Allocation happens at the final resolution, so that no re-allocation happens on every
+ * change of the resolution divider. However, if the display becomes smaller, the allocated
+ * memory is shrunk as well. */
+ if (gpu_display_rgba_half_.data_width != final_width ||
+ gpu_display_rgba_half_.data_height != final_height) {
+ gpu_display_rgba_half_.alloc(final_width, final_height);
+ /* TODO(sergey): There should be a way to make sure device-side memory is allocated without
+ * transferring zeroes to the device. */
+ queue_->zero_to_device(gpu_display_rgba_half_);
+ }
+
+ PassAccessor::Destination destination(film_->get_display_pass());
+ destination.d_pixels_half_rgba = gpu_display_rgba_half_.device_pointer;
+
+ get_render_tile_film_pixels(destination, pass_mode, num_samples);
+
+ gpu_display_rgba_half_.copy_from_device();
+
+ gpu_display->copy_pixels_to_texture(
+ gpu_display_rgba_half_.data(), texture_x, texture_y, width, height);
+}
+
+bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples)
+{
+ if (!device_graphics_interop_) {
+ device_graphics_interop_ = queue_->graphics_interop_create();
+ }
+
+ const DeviceGraphicsInteropDestination graphics_interop_dst =
+ gpu_display->graphics_interop_get();
+ device_graphics_interop_->set_destination(graphics_interop_dst);
+
+ const device_ptr d_rgba_half = device_graphics_interop_->map();
+ if (!d_rgba_half) {
+ return false;
+ }
+
+ PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display);
+ destination.d_pixels_half_rgba = d_rgba_half;
+
+ get_render_tile_film_pixels(destination, pass_mode, num_samples);
+
+ device_graphics_interop_->unmap();
+
+ return true;
+}
+
+void PathTraceWorkGPU::destroy_gpu_resources(GPUDisplay *gpu_display)
+{
+ if (!device_graphics_interop_) {
+ return;
+ }
+ gpu_display->graphics_interop_activate();
+ device_graphics_interop_ = nullptr;
+ gpu_display->graphics_interop_deactivate();
+}
+
+void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination,
+ PassMode pass_mode,
+ int num_samples)
+{
+ const KernelFilm &kfilm = device_scene_->data.film;
+
+ const PassAccessor::PassAccessInfo pass_access_info = get_display_pass_access_info(pass_mode);
+ const PassAccessorGPU pass_accessor(queue_.get(), pass_access_info, kfilm.exposure, num_samples);
+
+ pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination);
+}
+
+int PathTraceWorkGPU::adaptive_sampling_converge_filter_count_active(float threshold, bool reset)
+{
+ const int num_active_pixels = adaptive_sampling_convergence_check_count_active(threshold, reset);
+
+ if (num_active_pixels) {
+ enqueue_adaptive_sampling_filter_x();
+ enqueue_adaptive_sampling_filter_y();
+ queue_->synchronize();
+ }
+
+ return num_active_pixels;
+}
+
+int PathTraceWorkGPU::adaptive_sampling_convergence_check_count_active(float threshold, bool reset)
+{
+ device_vector<uint> num_active_pixels(device_, "num_active_pixels", MEM_READ_WRITE);
+ num_active_pixels.alloc(1);
+
+ queue_->zero_to_device(num_active_pixels);
+
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&effective_buffer_params_.full_x),
+ const_cast<int *>(&effective_buffer_params_.full_y),
+ const_cast<int *>(&effective_buffer_params_.width),
+ const_cast<int *>(&effective_buffer_params_.height),
+ &threshold,
+ &reset,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride,
+ &num_active_pixels.device_pointer};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK, work_size, args);
+
+ queue_->copy_from_device(num_active_pixels);
+ queue_->synchronize();
+
+ return num_active_pixels.data()[0];
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_x()
+{
+ const int work_size = effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X, work_size, args);
+}
+
+void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y()
+{
+ const int work_size = effective_buffer_params_.width;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ &effective_buffer_params_.full_x,
+ &effective_buffer_params_.full_y,
+ &effective_buffer_params_.width,
+ &effective_buffer_params_.height,
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args);
+}
+
+void PathTraceWorkGPU::cryptomatte_postproces()
+{
+ const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+ void *args[] = {&buffers_->buffer.device_pointer,
+ const_cast<int *>(&work_size),
+ &effective_buffer_params_.offset,
+ &effective_buffer_params_.stride};
+
+ queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args);
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_from_device()
+{
+ queue_->copy_from_device(buffers_->buffer);
+
+ /* Synchronize so that the CPU-side buffer is available at the exit of this function. */
+ return queue_->synchronize();
+}
+
+bool PathTraceWorkGPU::copy_render_buffers_to_device()
+{
+ queue_->copy_to_device(buffers_->buffer);
+
+ /* NOTE: Direct device access to the buffers only happens within this path trace work. The rest
+ * of the communication happens via API calls which involve `copy_render_buffers_from_device()`,
+ * which performs synchronization as needed. */
+
+ return true;
+}
+
+bool PathTraceWorkGPU::zero_render_buffers()
+{
+ queue_->zero_to_device(buffers_->buffer);
+
+ return true;
+}
+
+bool PathTraceWorkGPU::has_shadow_catcher() const
+{
+ return device_scene_->data.integrator.has_shadow_catcher;
+}
+
+int PathTraceWorkGPU::shadow_catcher_count_possible_splits()
+{
+ if (max_active_path_index_ == 0) {
+ return 0;
+ }
+
+ if (!has_shadow_catcher()) {
+ return 0;
+ }
+
+ queue_->zero_to_device(num_queued_paths_);
+
+ const int work_size = max_active_path_index_;
+ void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
+ void *args[] = {const_cast<int *>(&work_size), &d_num_queued_paths};
+
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS, work_size, args);
+ queue_->copy_from_device(num_queued_paths_);
+ queue_->synchronize();
+
+ return num_queued_paths_.data()[0];
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
new file mode 100644
index 00000000000..38788122b0d
--- /dev/null
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+
+#include "device/device_graphics_interop.h"
+#include "device/device_memory.h"
+#include "device/device_queue.h"
+
+#include "integrator/path_trace_work.h"
+#include "integrator/work_tile_scheduler.h"
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct KernelWorkTile;
+
+/* Implementation of PathTraceWork which schedules work to the device in tiles sized to match the
+ * device queue's number of path states.
+ * This implementation best suits devices which have a lot of integrator states, such as GPUs. */
+class PathTraceWorkGPU : public PathTraceWork {
+ public:
+ PathTraceWorkGPU(Device *device,
+ Film *film,
+ DeviceScene *device_scene,
+ bool *cancel_requested_flag);
+
+ virtual void alloc_work_memory() override;
+ virtual void init_execution() override;
+
+ virtual void render_samples(RenderStatistics &statistics,
+ int start_sample,
+ int samples_num) override;
+
+ virtual void copy_to_gpu_display(GPUDisplay *gpu_display,
+ PassMode pass_mode,
+ int num_samples) override;
+ virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override;
+
+ virtual bool copy_render_buffers_from_device() override;
+ virtual bool copy_render_buffers_to_device() override;
+ virtual bool zero_render_buffers() override;
+
+ virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+ virtual void cryptomatte_postproces() override;
+
+ protected:
+ void alloc_integrator_soa();
+ void alloc_integrator_queue();
+ void alloc_integrator_sorting();
+ void alloc_integrator_path_split();
+
+ /* Returns DEVICE_KERNEL_NUM if there are no scheduled kernels. */
+ DeviceKernel get_most_queued_kernel() const;
+
+ void enqueue_reset();
+
+ bool enqueue_work_tiles(bool &finished);
+ void enqueue_work_tiles(DeviceKernel kernel,
+ const KernelWorkTile work_tiles[],
+ const int num_work_tiles,
+ const int num_active_paths,
+ const int num_predicted_splits);
+
+ bool enqueue_path_iteration();
+ void enqueue_path_iteration(DeviceKernel kernel);
+
+ void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+ void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+
+ void compact_states(const int num_active_paths);
+
+ int get_num_active_paths();
+
+ /* Check whether graphics interop can be used for the GPUDisplay update. */
+ bool should_use_graphics_interop();
+
+ /* Naive implementation of the `copy_to_gpu_display()` which performs film conversion on the
+ * device, then copies pixels to the host and pushes them to the `gpu_display`. */
+ void copy_to_gpu_display_naive(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+ /* Implementation of `copy_to_gpu_display()` which uses the driver's OpenGL/GPU interoperability
+ * functionality, avoiding a copy of pixels to the host. */
+ bool copy_to_gpu_display_interop(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples);
+
+ /* Synchronously run film conversion kernel and store display result in the given destination. */
+ void get_render_tile_film_pixels(const PassAccessor::Destination &destination,
+ PassMode pass_mode,
+ int num_samples);
+
+ int adaptive_sampling_convergence_check_count_active(float threshold, bool reset);
+ void enqueue_adaptive_sampling_filter_x();
+ void enqueue_adaptive_sampling_filter_y();
+
+ bool has_shadow_catcher() const;
+
+ /* Count how many currently scheduled paths can still split. */
+ int shadow_catcher_count_possible_splits();
+
+ /* Integrator queue. */
+ unique_ptr<DeviceQueue> queue_;
+
+ /* Scheduler which gives work to path tracing threads. */
+ WorkTileScheduler work_tile_scheduler_;
+
+ /* Integrator state for paths. */
+ IntegratorStateGPU integrator_state_gpu_;
+ /* SoA arrays for integrator state. */
+ vector<unique_ptr<device_memory>> integrator_state_soa_;
+ uint integrator_state_soa_kernel_features_;
+ /* Keep track of number of queued kernels. */
+ device_vector<IntegratorQueueCounter> integrator_queue_counter_;
+ /* Shader sorting. */
+ device_vector<int> integrator_shader_sort_counter_;
+ device_vector<int> integrator_shader_raytrace_sort_counter_;
+ /* Path split. */
+ device_vector<int> integrator_next_shadow_catcher_path_index_;
+
+ /* Temporary buffer to get an array of queued paths for a particular kernel. */
+ device_vector<int> queued_paths_;
+ device_vector<int> num_queued_paths_;
+
+ /* Temporary buffer for passing work tiles to kernel. */
+ device_vector<KernelWorkTile> work_tiles_;
+
+ /* Temporary buffer used by copy_to_gpu_display() whenever graphics interoperability is not
+ * available. Allocated on demand. */
+ device_vector<half4> gpu_display_rgba_half_;
+
+ unique_ptr<DeviceGraphicsInterop> device_graphics_interop_;
+
+ /* Cached result of device->should_use_graphics_interop(). */
+ bool interop_use_checked_ = false;
+ bool interop_use_ = false;
+
+ /* Maximum number of concurrent integrator states. */
+ int max_num_paths_;
+
+ /* Minimum number of paths which keeps the device busy. If the actual number of paths falls
+ * below this value, more work will be scheduled. */
+ int min_num_active_paths_;
+
+ /* Maximum path index; the effective number of paths used may be smaller than the size of the
+ * integrator_state_ buffer, so iteration over the full buffer can be avoided. */
+ int max_active_path_index_;
+};
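+
+/* NOTE: Illustrative lifetime of this class, based on the virtual API above (the
+ * owning object and the surrounding setup are assumed):
+ *
+ *   PathTraceWorkGPU work(device, film, device_scene, &cancel_requested_flag);
+ *   work.alloc_work_memory();
+ *   work.init_execution();
+ *   work.render_samples(statistics, start_sample, samples_num);
+ */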
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
new file mode 100644
index 00000000000..3e5b3417a6a
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -0,0 +1,1187 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/render_scheduler.h"
+
+#include "render/session.h"
+#include "render/tile.h"
+#include "util/util_logging.h"
+#include "util/util_math.h"
+#include "util/util_time.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Render scheduler.
+ */
+
+RenderScheduler::RenderScheduler(TileManager &tile_manager, const SessionParams &params)
+ : headless_(params.headless),
+ background_(params.background),
+ pixel_size_(params.pixel_size),
+ tile_manager_(tile_manager),
+ default_start_resolution_divider_(pixel_size_ * 8)
+{
+ use_progressive_noise_floor_ = !background_;
+}
+
+void RenderScheduler::set_need_schedule_cryptomatte(bool need_schedule_cryptomatte)
+{
+ need_schedule_cryptomatte_ = need_schedule_cryptomatte;
+}
+
+void RenderScheduler::set_need_schedule_rebalance(bool need_schedule_rebalance)
+{
+ need_schedule_rebalance_works_ = need_schedule_rebalance;
+}
+
+bool RenderScheduler::is_background() const
+{
+ return background_;
+}
+
+void RenderScheduler::set_denoiser_params(const DenoiseParams &params)
+{
+ denoiser_params_ = params;
+}
+
+void RenderScheduler::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
+{
+ adaptive_sampling_ = adaptive_sampling;
+}
+
+bool RenderScheduler::is_adaptive_sampling_used() const
+{
+ return adaptive_sampling_.use;
+}
+
+void RenderScheduler::set_start_sample(int start_sample)
+{
+ start_sample_ = start_sample;
+}
+
+int RenderScheduler::get_start_sample() const
+{
+ return start_sample_;
+}
+
+void RenderScheduler::set_num_samples(int num_samples)
+{
+ num_samples_ = num_samples;
+}
+
+int RenderScheduler::get_num_samples() const
+{
+ return num_samples_;
+}
+
+void RenderScheduler::set_time_limit(double time_limit)
+{
+ time_limit_ = time_limit;
+}
+
+double RenderScheduler::get_time_limit() const
+{
+ return time_limit_;
+}
+
+int RenderScheduler::get_rendered_sample() const
+{
+ DCHECK_GT(get_num_rendered_samples(), 0);
+
+ return start_sample_ + get_num_rendered_samples() - 1;
+}
+
+int RenderScheduler::get_num_rendered_samples() const
+{
+ return state_.num_rendered_samples;
+}
+
+void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
+{
+ buffer_params_ = buffer_params;
+
+ update_start_resolution_divider();
+
+ set_num_samples(num_samples);
+
+ /* In background mode never do a lower-resolution render preview, as it is not really supported
+ * by the software. */
+ if (background_) {
+ state_.resolution_divider = 1;
+ }
+ else {
+ /* NOTE: Divide by 2 because of how scheduling works: it advances the resolution divider first
+ * and then initializes the render work. */
+ state_.resolution_divider = start_resolution_divider_ * 2;
+ }
+
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_time = 0.0;
+ state_.last_display_update_sample = -1;
+
+ state_.last_rebalance_time = 0.0;
+ state_.num_rebalance_requested = 0;
+ state_.num_rebalance_changes = 0;
+ state_.last_rebalance_changed = false;
+ state_.need_rebalance_at_next_work = false;
+
+ /* TODO(sergey): Choose better initial value. */
+ /* NOTE: The adaptive sampling settings might not be available here yet. */
+ state_.adaptive_sampling_threshold = 0.4f;
+
+ state_.last_work_tile_was_denoised = false;
+ state_.tile_result_was_written = false;
+ state_.postprocess_work_scheduled = false;
+ state_.full_frame_work_scheduled = false;
+ state_.full_frame_was_written = false;
+
+ state_.path_trace_finished = false;
+
+ state_.start_render_time = 0.0;
+ state_.end_render_time = 0.0;
+ state_.time_limit_reached = false;
+
+ state_.occupancy_num_samples = 0;
+ state_.occupancy = 1.0f;
+
+ first_render_time_.path_trace_per_sample = 0.0;
+ first_render_time_.denoise_time = 0.0;
+ first_render_time_.display_update_time = 0.0;
+
+ path_trace_time_.reset();
+ denoise_time_.reset();
+ adaptive_filter_time_.reset();
+ display_update_time_.reset();
+ rebalance_time_.reset();
+}
+
+void RenderScheduler::reset_for_next_tile()
+{
+ reset(buffer_params_, num_samples_);
+}
+
+bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work)
+{
+ /* Still moving to the next resolution divider; adaptive filtering is assumed to not be needed
+ * during navigation. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (render_work_reschedule_on_idle(render_work)) {
+ return true;
+ }
+
+ state_.path_trace_finished = true;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ return false;
+}
+
+bool RenderScheduler::render_work_reschedule_on_idle(RenderWork &render_work)
+{
+ if (!use_progressive_noise_floor_) {
+ return false;
+ }
+
+ /* Still moving to the next resolution divider; adaptive filtering is assumed to not be needed
+ * during navigation. */
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (adaptive_sampling_.use) {
+ if (state_.adaptive_sampling_threshold > adaptive_sampling_.threshold) {
+ state_.adaptive_sampling_threshold = max(state_.adaptive_sampling_threshold / 2,
+ adaptive_sampling_.threshold);
+
+ render_work.adaptive_sampling.threshold = state_.adaptive_sampling_threshold;
+ render_work.adaptive_sampling.reset = true;
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void RenderScheduler::render_work_reschedule_on_cancel(RenderWork &render_work)
+{
+ VLOG(3) << "Schedule work for cancel.";
+
+ /* Un-schedule samples: they will not be rendered and should not be counted. */
+ state_.num_rendered_samples -= render_work.path_trace.num_samples;
+
+ const bool has_rendered_samples = get_num_rendered_samples() != 0;
+
+ /* Reset all fields of the previous work, canceling things like adaptive sampling filtering and
+ * denoising.
+ * However, write requests need to be preserved, since they can not be recovered and writes are
+ * only to happen once. */
+ const bool tile_write = render_work.tile.write;
+ const bool full_write = render_work.full.write;
+
+ render_work = RenderWork();
+
+ render_work.tile.write = tile_write;
+ render_work.full.write = full_write;
+
+ /* Do not write the tile if it has zero samples in it; treat it similarly to all other tiles
+ * which got canceled. */
+ if (!state_.tile_result_was_written && has_rendered_samples) {
+ render_work.tile.write = true;
+ }
+
+ if (!state_.full_frame_was_written) {
+ render_work.full.write = true;
+ }
+
+ /* Update the current tile, but only if any sample was rendered.
+ * This allows the latest state of the tile to be visible while the full buffer is being
+ * processed.
+ *
+ * Note that if there are no samples in the current tile its render buffer might have pixels
+ * remaining from a previous state.
+ *
+ * If the full result was written, then no further updates to the render buffers can have
+ * happened. The buffers might also have been freed from the device, so a display update is not
+ * possible. */
+ if (has_rendered_samples && !state_.full_frame_was_written) {
+ render_work.display.update = true;
+ }
+}
+
+bool RenderScheduler::done() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return false;
+ }
+
+ if (state_.path_trace_finished || state_.time_limit_reached) {
+ return true;
+ }
+
+ return get_num_rendered_samples() >= num_samples_;
+}
+
+RenderWork RenderScheduler::get_render_work()
+{
+ check_time_limit_reached();
+
+ const double time_now = time_dt();
+
+ if (done()) {
+ RenderWork render_work;
+ render_work.resolution_divider = state_.resolution_divider;
+
+ if (!set_postprocess_render_work(&render_work)) {
+ set_full_frame_render_work(&render_work);
+ }
+
+ if (!render_work) {
+ state_.end_render_time = time_now;
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+ }
+
+ RenderWork render_work;
+
+ if (state_.resolution_divider != pixel_size_) {
+ state_.resolution_divider = max(state_.resolution_divider / 2, pixel_size_);
+ state_.num_rendered_samples = 0;
+ state_.last_display_update_sample = -1;
+ }
+
+ render_work.resolution_divider = state_.resolution_divider;
+
+ render_work.path_trace.start_sample = get_start_sample_to_path_trace();
+ render_work.path_trace.num_samples = get_num_samples_to_path_trace();
+
+ render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample());
+
+ /* NOTE: Rebalance scheduler requires current number of samples to not be advanced forward. */
+ render_work.rebalance = work_need_rebalance();
+
+ /* NOTE: Advance the number of samples now, so that the filter and denoising checks can see that
+ * all the samples are rendered. */
+ state_.num_rendered_samples += render_work.path_trace.num_samples;
+
+ render_work.adaptive_sampling.filter = work_need_adaptive_filter();
+ render_work.adaptive_sampling.threshold = work_adaptive_threshold();
+ render_work.adaptive_sampling.reset = false;
+
+ bool denoiser_delayed, denoiser_ready_to_display;
+ render_work.tile.denoise = work_need_denoise(denoiser_delayed, denoiser_ready_to_display);
+
+ render_work.tile.write = done();
+
+ render_work.display.update = work_need_update_display(denoiser_delayed);
+ render_work.display.use_denoised_result = denoiser_ready_to_display;
+
+ if (done()) {
+ set_postprocess_render_work(&render_work);
+ }
+
+ update_state_for_render_work(render_work);
+
+ return render_work;
+}
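+
+/* NOTE: A minimal driving loop for the scheduler (illustrative sketch; the actual
+ * caller is the path tracer, which also handles cancellation and time reporting):
+ *
+ *   scheduler.reset(buffer_params, num_samples);
+ *   while (true) {
+ *     RenderWork render_work = scheduler.get_render_work();
+ *     if (!render_work) {
+ *       break;
+ *     }
+ *     scheduler.report_work_begin(render_work);
+ *     // ... render render_work.path_trace.num_samples samples, then filter,
+ *     // denoise and update the display as requested by the work ...
+ *   }
+ */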
+
+void RenderScheduler::update_state_for_render_work(const RenderWork &render_work)
+{
+ const double time_now = time_dt();
+
+ if (render_work.rebalance) {
+ state_.last_rebalance_time = time_now;
+ ++state_.num_rebalance_requested;
+ }
+
+ /* A fallback display update time, for the case when the display update fails or when there is
+ * no display at all. */
+ if (render_work.display.update) {
+ state_.last_display_update_time = time_now;
+ state_.last_display_update_sample = state_.num_rendered_samples;
+ }
+
+ state_.last_work_tile_was_denoised = render_work.tile.denoise;
+ state_.tile_result_was_written |= render_work.tile.write;
+ state_.full_frame_was_written |= render_work.full.write;
+}
+
+bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work)
+{
+ if (state_.postprocess_work_scheduled) {
+ return false;
+ }
+ state_.postprocess_work_scheduled = true;
+
+ bool any_scheduled = false;
+
+ if (need_schedule_cryptomatte_) {
+ render_work->cryptomatte.postprocess = true;
+ any_scheduled = true;
+ }
+
+ if (denoiser_params_.use && !state_.last_work_tile_was_denoised) {
+ render_work->tile.denoise = true;
+ any_scheduled = true;
+ }
+
+ if (!state_.tile_result_was_written) {
+ render_work->tile.write = true;
+ any_scheduled = true;
+ }
+
+ if (any_scheduled) {
+ render_work->display.update = true;
+ }
+
+ return any_scheduled;
+}
+
+void RenderScheduler::set_full_frame_render_work(RenderWork *render_work)
+{
+ if (state_.full_frame_work_scheduled) {
+ return;
+ }
+
+ if (!tile_manager_.has_multiple_tiles()) {
+ /* There is only a single tile, so all work has been performed already. */
+ return;
+ }
+
+ if (!tile_manager_.done()) {
+ /* There are still tiles to be rendered. */
+ return;
+ }
+
+ if (state_.full_frame_was_written) {
+ return;
+ }
+
+ state_.full_frame_work_scheduled = true;
+
+ render_work->full.write = true;
+}
+
+/* Knowing the time it took to complete a task at the current resolution divider, approximate how
+ * long it would have taken to complete it at the final resolution. */
+static double approximate_final_time(const RenderWork &render_work, double time)
+{
+ if (render_work.resolution_divider == 1) {
+ return time;
+ }
+
+ const double resolution_divider_sq = render_work.resolution_divider *
+ render_work.resolution_divider;
+ return time * resolution_divider_sq;
+}
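+
+/* For example, at a resolution divider of 4 only 1/16th of the final pixels are
+ * rendered, so a measured time of 0.01 seconds approximates to
+ * 0.01 * 4 * 4 = 0.16 seconds at the final resolution (illustrative numbers). */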
+
+void RenderScheduler::report_work_begin(const RenderWork &render_work)
+{
+ /* Start counting render time when rendering samples at their final resolution.
+ *
+ * NOTE: The work might have an all-zero path trace part: this happens when post-processing
+ * work is scheduled after the path tracing. Checking just the start sample doesn't work here
+ * because it might incorrectly be 0. Instead, check whether path tracing is actually happening,
+ * as it is expected to happen in the first work. */
+ if (render_work.resolution_divider == pixel_size_ && render_work.path_trace.num_samples != 0 &&
+ render_work.path_trace.start_sample == get_start_sample()) {
+ state_.start_render_time = time_dt();
+ }
+}
+
+void RenderScheduler::report_path_trace_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ path_trace_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.path_trace_per_sample = final_time_approx /
+ render_work.path_trace.num_samples;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ path_trace_time_.reset_average();
+ }
+
+ path_trace_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average path tracing time: " << path_trace_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_path_trace_occupancy(const RenderWork &render_work, float occupancy)
+{
+ state_.occupancy_num_samples = render_work.path_trace.num_samples;
+ state_.occupancy = occupancy;
+ VLOG(4) << "Measured path tracing occupancy: " << occupancy;
+}
+
+void RenderScheduler::report_adaptive_filter_time(const RenderWork &render_work,
+ double time,
+ bool is_cancelled)
+{
+ adaptive_filter_time_.add_wall(time);
+
+ if (is_cancelled) {
+ return;
+ }
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_report_reset_average(render_work)) {
+ adaptive_filter_time_.reset_average();
+ }
+
+ adaptive_filter_time_.add_average(final_time_approx, render_work.path_trace.num_samples);
+
+ VLOG(4) << "Average adaptive sampling filter time: " << adaptive_filter_time_.get_average()
+ << " seconds.";
+}
+
+void RenderScheduler::report_denoise_time(const RenderWork &render_work, double time)
+{
+ denoise_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.denoise_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ denoise_time_.reset_average();
+ }
+
+ denoise_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average denoising time: " << denoise_time_.get_average() << " seconds.";
+}
+
+void RenderScheduler::report_display_update_time(const RenderWork &render_work, double time)
+{
+ display_update_time_.add_wall(time);
+
+ const double final_time_approx = approximate_final_time(render_work, time);
+
+ if (work_is_usable_for_first_render_estimation(render_work)) {
+ first_render_time_.display_update_time = final_time_approx;
+ }
+
+ if (work_report_reset_average(render_work)) {
+ display_update_time_.reset_average();
+ }
+
+ display_update_time_.add_average(final_time_approx);
+
+ VLOG(4) << "Average display update time: " << display_update_time_.get_average() << " seconds.";
+
+ /* Move the display update moment further in time, so that the logic which checks when the last
+ * update happened has a more reliable point in time (one without the path tracing and denoising
+ * parts of the render work). */
+ state_.last_display_update_time = time_dt();
+}
+
+void RenderScheduler::report_rebalance_time(const RenderWork &render_work,
+ double time,
+ bool balance_changed)
+{
+ rebalance_time_.add_wall(time);
+
+ if (work_report_reset_average(render_work)) {
+ rebalance_time_.reset_average();
+ }
+
+ rebalance_time_.add_average(time);
+
+ if (balance_changed) {
+ ++state_.num_rebalance_changes;
+ }
+
+ state_.last_rebalance_changed = balance_changed;
+
+ VLOG(4) << "Average rebalance time: " << rebalance_time_.get_average() << " seconds.";
+}
+
+string RenderScheduler::full_report() const
+{
+ const double render_wall_time = state_.end_render_time - state_.start_render_time;
+ const int num_rendered_samples = get_num_rendered_samples();
+
+ string result = "\nRender Scheduler Summary\n\n";
+
+ {
+ string mode;
+ if (headless_) {
+ mode = "Headless";
+ }
+ else if (background_) {
+ mode = "Background";
+ }
+ else {
+ mode = "Interactive";
+ }
+ result += "Mode: " + mode + "\n";
+ }
+
+ result += "Resolution: " + to_string(buffer_params_.width) + "x" +
+ to_string(buffer_params_.height) + "\n";
+
+ result += "\nAdaptive sampling:\n";
+ result += " Use: " + string_from_bool(adaptive_sampling_.use) + "\n";
+ if (adaptive_sampling_.use) {
+ result += " Step: " + to_string(adaptive_sampling_.adaptive_step) + "\n";
+ result += " Min Samples: " + to_string(adaptive_sampling_.min_samples) + "\n";
+ result += " Threshold: " + to_string(adaptive_sampling_.threshold) + "\n";
+ }
+
+ result += "\nDenoiser:\n";
+ result += " Use: " + string_from_bool(denoiser_params_.use) + "\n";
+ if (denoiser_params_.use) {
+ result += " Type: " + string(denoiserTypeToHumanReadable(denoiser_params_.type)) + "\n";
+ result += " Start Sample: " + to_string(denoiser_params_.start_sample) + "\n";
+
+ string passes = "Color";
+ if (denoiser_params_.use_pass_albedo) {
+ passes += ", Albedo";
+ }
+ if (denoiser_params_.use_pass_normal) {
+ passes += ", Normal";
+ }
+
+ result += " Passes: " + passes + "\n";
+ }
+
+ if (state_.num_rebalance_requested) {
+ result += "\nRebalancer:\n";
+ result += " Number of requested rebalances: " + to_string(state_.num_rebalance_requested) +
+ "\n";
+ result += " Number of performed rebalances: " + to_string(state_.num_rebalance_changes) +
+ "\n";
+ }
+
+ result += "\nTime (in seconds):\n";
+ result += string_printf(" %20s %20s %20s\n", "", "Wall", "Average");
+ result += string_printf(" %20s %20f %20f\n",
+ "Path Tracing",
+ path_trace_time_.get_wall(),
+ path_trace_time_.get_average());
+
+ if (adaptive_sampling_.use) {
+ result += string_printf(" %20s %20f %20f\n",
+ "Adaptive Filter",
+ adaptive_filter_time_.get_wall(),
+ adaptive_filter_time_.get_average());
+ }
+
+ if (denoiser_params_.use) {
+ result += string_printf(
+ " %20s %20f %20f\n", "Denoiser", denoise_time_.get_wall(), denoise_time_.get_average());
+ }
+
+ result += string_printf(" %20s %20f %20f\n",
+ "Display Update",
+ display_update_time_.get_wall(),
+ display_update_time_.get_average());
+
+ if (state_.num_rebalance_requested) {
+ result += string_printf(" %20s %20f %20f\n",
+ "Rebalance",
+ rebalance_time_.get_wall(),
+ rebalance_time_.get_average());
+ }
+
+ const double total_time = path_trace_time_.get_wall() + adaptive_filter_time_.get_wall() +
+ denoise_time_.get_wall() + display_update_time_.get_wall();
+ result += "\n Total: " + to_string(total_time) + "\n";
+
+ result += string_printf(
+ "\nRendered %d samples in %f seconds\n", num_rendered_samples, render_wall_time);
+
+ /* When adaptive sampling is used the average time becomes meaningless, because different
+ * samples will likely render different numbers of pixels. */
+ if (!adaptive_sampling_.use) {
+ result += string_printf("Average time per sample: %f seconds\n",
+ render_wall_time / num_rendered_samples);
+ }
+
+ return result;
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds() const
+{
+ return guess_display_update_interval_in_seconds_for_num_samples(state_.num_rendered_samples);
+}
+
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples(
+ int num_rendered_samples) const
+{
+ double update_interval = guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ num_rendered_samples);
+
+ if (time_limit_ != 0.0 && state_.start_render_time != 0.0) {
+ const double remaining_render_time = max(0.0,
+ time_limit_ - (time_dt() - state_.start_render_time));
+
+ update_interval = min(update_interval, remaining_render_time);
+ }
+
+ return update_interval;
+}
+
+/* TODO(sergey): This is just a quick implementation; exact values might need to be tweaked based
+ * on more careful experiments with viewport rendering. */
+double RenderScheduler::guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ int num_rendered_samples) const
+{
+ /* TODO(sergey): Need a decision on whether this should be using number of samples rendered
+ * within the current render session, or use absolute number of samples with the start sample
+ * taken into account. It will depend on whether the start sample offset clears the render
+ * buffer. */
+
+ if (state_.need_rebalance_at_next_work) {
+ return 0.1;
+ }
+ if (state_.last_rebalance_changed) {
+ return 0.2;
+ }
+
+ if (headless_) {
+ /* In headless mode do rare updates, so that the device occupancy is high, but there are still
+ * progress messages printed to the logs. */
+ return 30.0;
+ }
+
+ if (background_) {
+ if (num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+ }
+
+ /* Render time and number of samples rendered are used to figure out the display update interval.
+ * Render time is used to allow for fast display updates in the first few seconds of rendering
+ * on fast devices. Number of samples rendered is used to allow for potentially quicker display
+ * updates on slow devices during the first few samples. */
+ const double render_time = path_trace_time_.get_wall();
+ if (render_time < 1) {
+ return 0.1;
+ }
+ if (render_time < 2) {
+ return 0.25;
+ }
+ if (render_time < 4) {
+ return 0.5;
+ }
+ if (render_time < 8 || num_rendered_samples < 32) {
+ return 1.0;
+ }
+ return 2.0;
+}
+
+int RenderScheduler::calculate_num_samples_per_update() const
+{
+ const double time_per_sample_average = path_trace_time_.get_average();
+ const double num_samples_in_second = pixel_size_ * pixel_size_ / time_per_sample_average;
+
+ const double update_interval_in_seconds = guess_display_update_interval_in_seconds();
+
+ return max(int(num_samples_in_second * update_interval_in_seconds), 1);
+}
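+
+/* For example (illustrative numbers): with pixel_size_ = 1 and an average of 0.05
+ * seconds per sample, 20 samples fit in one second; with a desired update interval
+ * of 0.25 seconds this schedules max(int(20 * 0.25), 1) = 5 samples per update. */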
+
+int RenderScheduler::get_start_sample_to_path_trace() const
+{
+ return start_sample_ + state_.num_rendered_samples;
+}
+
+/* Round the number of samples to the closest power of two.
+ * Rounding might go to a higher or lower value depending on which one is closer. This keeps the
+ * number of samples a power of two without diverging too much from the planned number of
+ * samples. */
+static inline uint round_num_samples_to_power_of_2(const uint num_samples)
+{
+ if (num_samples == 1) {
+ return 1;
+ }
+
+ if (is_power_of_two(num_samples)) {
+ return num_samples;
+ }
+
+ const uint num_samples_up = next_power_of_two(num_samples);
+ const uint num_samples_down = num_samples_up - (num_samples_up >> 1);
+
+ const uint delta_up = num_samples_up - num_samples;
+ const uint delta_down = num_samples - num_samples_down;
+
+ if (delta_up <= delta_down) {
+ return num_samples_up;
+ }
+
+ return num_samples_down;
+}
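+
+/* For example: 12 is equally close to 8 and 16 and rounds up to 16 (ties prefer
+ * the higher value), while 11 rounds down to 8. */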
+
+int RenderScheduler::get_num_samples_to_path_trace() const
+{
+ if (state_.resolution_divider != pixel_size_) {
+ return get_num_samples_during_navigation(state_.resolution_divider);
+ }
+
+ /* Always start the full resolution render with a single sample. This gives more instant
+ * feedback to artists, and allows gathering information for subsequent path tracing work. Do it
+ * in headless mode as well, to give some estimate of how long samples are taking. */
+ if (state_.num_rendered_samples == 0) {
+ return 1;
+ }
+
+ const int num_samples_per_update = calculate_num_samples_per_update();
+ const int path_trace_start_sample = get_start_sample_to_path_trace();
+
+ /* Round the number of samples to a power of two, so that the division of path states into tiles
+ * works out more evenly.
+ * This might make updates happen more rarely due to rounding up. In the test scenes this is not
+ * a huge deal, because rendering more than 8 samples between updates has not been observed. If
+ * that becomes a problem, extra rules can be added, such as never rounding up by more than N
+ * samples. */
+ const int num_samples_pot = round_num_samples_to_power_of_2(num_samples_per_update);
+
+ const int max_num_samples_to_render = start_sample_ + num_samples_ - path_trace_start_sample;
+
+ int num_samples_to_render = min(num_samples_pot, max_num_samples_to_render);
+
+ /* When enough statistics are available and an offline render is being done, prefer to keep the
+ * device occupied. */
+ if (state_.occupancy_num_samples && (background_ || headless_)) {
+ /* Keep occupancy at about 0.5 (this is more of an empirical figure which seems to match scenes
+ * with good performance without forcing occupancy to be higher). */
+ int num_samples_to_occupy = state_.occupancy_num_samples;
+ if (state_.occupancy < 0.5f) {
+ num_samples_to_occupy = lround(state_.occupancy_num_samples * 0.7f / state_.occupancy);
+ }
+
+ num_samples_to_render = max(num_samples_to_render,
+ min(num_samples_to_occupy, max_num_samples_to_render));
+ }
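+
+ /* For example (illustrative numbers): if 8 samples were just rendered at an
+ * occupancy of 0.35, the next work is sized at lround(8 * 0.7 / 0.35) = 16
+ * samples, clamped to the number of samples still left to render. */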
+
+ /* If adaptive sampling is not used, render as many samples per update as possible, keeping the
+ * device fully occupied without much display update overhead. */
+ if (!adaptive_sampling_.use) {
+ return num_samples_to_render;
+ }
+
+ /* TODO(sergey): Add extra "clamping" here so that none of the filtering points is missed. This
+ * is to ensure that the final render is pixel-matched regardless of how many samples per second
+ * the compute device can do. */
+
+ return adaptive_sampling_.align_samples(path_trace_start_sample, num_samples_to_render);
+}
+
+int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) const
+{
+ /* Special trick for fast navigation: schedule multiple samples during fast navigation
+ * (which will prefer to use lower resolution to keep up with refresh rate). This gives more
+ * usable visual feedback for artists. There are a couple of tricks though. */
+
+ if (is_denoise_active_during_update()) {
+ /* When denoising is used during navigation prefer using a higher resolution with fewer samples
+ * (scheduling fewer samples here will make the resolution_divider calculation use a lower value
+ * for the divider). This is because both OpenImageDenoiser and the OptiX denoiser give visually
+ * better results on a higher resolution image with fewer samples. */
+ return 1;
+ }
+
+ if (resolution_divider <= pixel_size_) {
+ /* When the resolution divider is at or below the pixel size, schedule one sample. This doesn't
+ * affect the sample count at this resolution division, but instead assists in the calculation
+ * of the resolution divider. */
+ return 1;
+ }
+
+ if (resolution_divider == pixel_size_ * 2) {
+ /* When the resolution divider is one step away from the final resolution, schedule two samples.
+ * This is so that rendering at the lower resolution does not exceed the time it takes to render
+ * the first sample at the full resolution. */
+ return 2;
+ }
+
+ /* Always render 4 samples, even if the scene is configured for less.
+ * The idea here is to have enough information on the screen. A resolution divider of 2 allows
+ * 4 times as many samples, so the overall worst-case timing is the same as one sample at the
+ * final resolution. */
+ return 4;
+}
+
+bool RenderScheduler::work_need_adaptive_filter() const
+{
+ return adaptive_sampling_.need_filter(get_rendered_sample());
+}
+
+float RenderScheduler::work_adaptive_threshold() const
+{
+ if (!use_progressive_noise_floor_) {
+ return adaptive_sampling_.threshold;
+ }
+
+ return max(state_.adaptive_sampling_threshold, adaptive_sampling_.threshold);
+}
+
+bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display)
+{
+ delayed = false;
+ ready_to_display = true;
+
+ if (!denoiser_params_.use) {
+ /* Denoising is disabled, no need to schedule work for it. */
+ return false;
+ }
+
+ if (done()) {
+ /* Always denoise at the last sample. */
+ return true;
+ }
+
+ if (background_) {
+ /* Background render, only denoise when rendering the last sample. */
+ /* TODO(sergey): Follow similar logic to viewport, giving an overview of how final denoised
+ * image looks like even for the background rendering. */
+ return false;
+ }
+
+ /* Viewport render. */
+
+ /* Navigation might render multiple samples at a lower resolution. Those are not to be counted as
+ * final samples. */
+ const int num_samples_finished = state_.resolution_divider == pixel_size_ ?
+ state_.num_rendered_samples :
+ 1;
+
+ /* Immediately denoise when we reach the start sample or last sample. */
+ if (num_samples_finished == denoiser_params_.start_sample ||
+ num_samples_finished == num_samples_) {
+ return true;
+ }
+
+ /* Do not denoise until the sample at which denoising should start is reached. */
+ if (num_samples_finished < denoiser_params_.start_sample) {
+ ready_to_display = false;
+ return false;
+ }
+
+ /* Avoid excessive denoising in viewport after reaching a certain sample count and render time.
+ */
+ /* TODO(sergey): Consider making time interval and sample configurable. */
+ delayed = (path_trace_time_.get_wall() > 4 && num_samples_finished >= 20 &&
+ (time_dt() - state_.last_display_update_time) < 1.0);
+
+ return !delayed;
+}
+
+bool RenderScheduler::work_need_update_display(const bool denoiser_delayed)
+{
+ if (headless_) {
+ /* Force disable display update in headless mode. There is nothing to display the in-progress
+ * result on. */
+ return false;
+ }
+
+ if (denoiser_delayed) {
+ /* If the denoiser has been delayed the display can not be updated, as it would not contain an
+ * up-to-date state of the render result. */
+ return false;
+ }
+
+ if (!adaptive_sampling_.use) {
+ /* When adaptive sampling is not used the work is scheduled in a way that keeps the render
+ * device busy for long enough, so that the display update can happen right after the
+ * rendering. */
+ return true;
+ }
+
+ if (done() || state_.last_display_update_sample == -1) {
+ /* Make sure the initial and final results of adaptive sampling are communicated to the
+ * display. */
+ return true;
+ }
+
+ /* For development purposes of adaptive sampling it might be very useful to see all updates of
+ * active pixels after the convergence check. However, it would cause a slowdown for regular
+ * users. Possibly, make it a debug panel option to allow rapid updates, easing development
+ * without the need to re-compile. */
+ // if (work_need_adaptive_filter()) {
+ // return true;
+ // }
+
+ /* When adaptive sampling is used, it's possible that only a handful of samples of a very simple
+ * scene will be scheduled to a powerful device (in order to not "miss" any of the filtering
+ * points). Updates are skipped here based on when the previous display update happened. */
+ const double update_interval = guess_display_update_interval_in_seconds_for_num_samples(
+ state_.last_display_update_sample);
+ return (time_dt() - state_.last_display_update_time) > update_interval;
+}
+
+bool RenderScheduler::work_need_rebalance()
+{
+ /* This is the minimum time, as the rebalancing can not happen more often than the path trace
+ * work. */
+ static const double kRebalanceIntervalInSeconds = 1;
+
+ if (!need_schedule_rebalance_works_) {
+ return false;
+ }
+
+ if (state_.resolution_divider != pixel_size_) {
+ /* Don't rebalance at a non-final resolution divider. Some reasons for this:
+ * - It will introduce unnecessary overhead during navigation.
+ * - Per-render device timing information is not very reliable yet. */
+ return false;
+ }
+
+ if (state_.num_rendered_samples == 0) {
+ state_.need_rebalance_at_next_work = true;
+ return false;
+ }
+
+ if (state_.need_rebalance_at_next_work) {
+ state_.need_rebalance_at_next_work = false;
+ return true;
+ }
+
+ if (state_.last_rebalance_changed) {
+ return true;
+ }
+
+ return (time_dt() - state_.last_rebalance_time) > kRebalanceIntervalInSeconds;
+}
+
+void RenderScheduler::update_start_resolution_divider()
+{
+ if (start_resolution_divider_ == 0) {
+ /* Resolution divider has never been calculated before: use default resolution, so that we have
+ * somewhat good initial behavior, giving a chance to collect real numbers. */
+ start_resolution_divider_ = default_start_resolution_divider_;
+ VLOG(3) << "Initial resolution divider is " << start_resolution_divider_;
+ return;
+ }
+
+ if (first_render_time_.path_trace_per_sample == 0.0) {
+ /* Not enough information to calculate better resolution, keep the existing one. */
+ return;
+ }
+
+ const double desired_update_interval_in_seconds =
+ guess_viewport_navigation_update_interval_in_seconds();
+
+ const double actual_time_per_update = first_render_time_.path_trace_per_sample +
+ first_render_time_.denoise_time +
+ first_render_time_.display_update_time;
+
+ /* Allow some percentage of tolerance, so that if the render time is close enough to the higher
+ * resolution we prefer to use it, instead of going to a much lower resolution with a render time
+ * far below the desired one. */
+ const int resolution_divider_for_update = calculate_resolution_divider_for_time(
+ desired_update_interval_in_seconds * 1.4, actual_time_per_update);
+
+ /* TODO(sergey): Need to add hysteresis to avoid resolution divider bouncing around when actual
+ * render time is somewhere on a boundary between two resolutions. */
+
+ /* Never increase resolution to higher than the pixel size (which is possible if the scene is
+ * simple and compute device is fast). */
+ start_resolution_divider_ = max(resolution_divider_for_update, pixel_size_);
+
+ VLOG(3) << "Calculated resolution divider is " << start_resolution_divider_;
+}
+
+double RenderScheduler::guess_viewport_navigation_update_interval_in_seconds() const
+{
+ if (is_denoise_active_during_update()) {
+ /* Use a lower value than in the non-denoised case, to allow having more pixels to reconstruct
+ * the image from. With faster updates and the extra compute required, the resolution becomes
+ * too low to give usable feedback. */
+ /* NOTE: Based on performance of OpenImageDenoiser on CPU. For OptiX denoiser or other denoiser
+ * on GPU the value might need to become lower for faster navigation. */
+ return 1.0 / 12.0;
+ }
+
+ /* For the best match with Blender's viewport the refresh rate should be 60fps; this avoids
+ * "jelly" effects. However, on non-trivial scenes this can only be achieved with high values of
+ * the resolution divider, which does not give very pleasant updates during navigation. Choose
+ * less frequent updates to allow more noise-free and higher-resolution updates. */
+
+ /* TODO(sergey): Could look into a heuristic which would allow 60fps if the resolution divider
+ * is not too high. Alternatively, synchronize Blender's overlay updates to Cycles updates. */
+
+ return 1.0 / 30.0;
+}
+
+bool RenderScheduler::is_denoise_active_during_update() const
+{
+ if (!denoiser_params_.use) {
+ return false;
+ }
+
+ if (denoiser_params_.start_sample > 1) {
+ return false;
+ }
+
+ return true;
+}
+
+bool RenderScheduler::work_is_usable_for_first_render_estimation(const RenderWork &render_work)
+{
+ return render_work.resolution_divider == pixel_size_ &&
+ render_work.path_trace.start_sample == start_sample_;
+}
+
+bool RenderScheduler::work_report_reset_average(const RenderWork &render_work)
+{
+ /* When rendering at a non-final resolution divider the time average is not very useful: it will
+ * either bias the average down (due to lower render times on the smaller images) or give an
+ * incorrect result when trying to estimate the time which would have been spent at the final
+ * resolution.
+ *
+ * So we only accumulate the average for the latest resolution divider which was rendered. */
+ return render_work.resolution_divider != pixel_size_;
+}
+
+void RenderScheduler::check_time_limit_reached()
+{
+ if (time_limit_ == 0.0) {
+ /* No limit is enforced. */
+ return;
+ }
+
+ if (state_.start_render_time == 0.0) {
+ /* Rendering did not start yet. */
+ return;
+ }
+
+ const double current_time = time_dt();
+
+ if (current_time - state_.start_render_time < time_limit_) {
+ /* Time limit is not reached yet. */
+ return;
+ }
+
+ state_.time_limit_reached = true;
+ state_.end_render_time = current_time;
+}
+
+/* --------------------------------------------------------------------
+ * Utility functions.
+ */
+
+int RenderScheduler::calculate_resolution_divider_for_time(double desired_time, double actual_time)
+{
+ /* TODO(sergey): There should be a non-iterative analytical formula here. */
+
+ int resolution_divider = 1;
+
+ /* This algorithm iterates through resolution dividers until a divider is found that achieves
+ * the desired render time. A limit of default_start_resolution_divider_ is put in place as the
+ * maximum resolution divider to avoid an unreadable viewport due to a low resolution.
+ * pre_resolution_division_samples and post_resolution_division_samples are used in this
+ * calculation to better predict the performance impact of changing resolution divisions as
+ * the sample count can also change between resolution divisions. */
+ while (actual_time > desired_time && resolution_divider < default_start_resolution_divider_) {
+ int pre_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ resolution_divider = resolution_divider * 2;
+ int post_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
+ actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples;
+ }
+
+ return resolution_divider;
+}
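+
+/* For example (illustrative numbers, assuming pixel_size_ = 1 and no denoising
+ * during updates, so navigation schedules 1, 2 and 4 samples at dividers 1, 2 and
+ * >= 4 respectively): with a desired time of 1/30 s and an actual time of 0.2 s,
+ * the first halving gives 0.2 / 2 = 0.1 s at divider 2, the second gives
+ * 0.1 / 2 = 0.05 s at divider 4, and the third gives 0.05 / 4 = 0.0125 s at
+ * divider 8, which meets the target. */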
+
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution)
+{
+ if (resolution == INT_MAX) {
+ return 1;
+ }
+
+ int resolution_divider = 1;
+ while (width * height > resolution * resolution) {
+ width = max(1, width / 2);
+ height = max(1, height / 2);
+
+ resolution_divider <<= 1;
+ }
+
+ return resolution_divider;
+}
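+
+/* For example, a 1920x1080 render with resolution = 512: the 1920 * 1080 pixel
+ * area is halved twice before fitting into 512 * 512, giving a divider of 4. */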
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider)
+{
+ const int pixel_area = width * height;
+ const int resolution = lround(sqrt(pixel_area));
+
+ return resolution / resolution_divider;
+}
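+
+/* Continuing the example above: for 1920x1080, lround(sqrt(2073600)) = 1440, so a
+ * divider of 4 gives an effective resolution of 360. */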
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h
new file mode 100644
index 00000000000..b7b598fb10c
--- /dev/null
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -0,0 +1,466 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/adaptive_sampling.h"
+#include "integrator/denoiser.h" /* For DenoiseParams. */
+#include "render/buffers.h"
+#include "util/util_string.h"
+
+CCL_NAMESPACE_BEGIN
+
+class SessionParams;
+class TileManager;
+
+class RenderWork {
+ public:
+ int resolution_divider = 1;
+
+ /* Initialize render buffers.
+ * Includes steps like zeroing the buffer on the device, and optional reading of pixels from the
+ * baking target. */
+ bool init_render_buffers = false;
+
+ /* Path tracing samples information. */
+ struct {
+ int start_sample = 0;
+ int num_samples = 0;
+ } path_trace;
+
+ struct {
+ /* Check for convergence and filter the mask. */
+ bool filter = false;
+
+ float threshold = 0.0f;
+
+ /* Reset the convergence flag when filtering, forcing a re-check of whether the pixel did
+ * converge. */
+ bool reset = false;
+ } adaptive_sampling;
+
+ struct {
+ bool postprocess = false;
+ } cryptomatte;
+
+ /* Work related to the current tile. */
+ struct {
+ /* Write render buffers of the current tile.
+ *
+ * It is up to the path trace to decide whether writing should happen via user-provided
+ * callback into the rendering software, or via tile manager into a partial file. */
+ bool write = false;
+
+ bool denoise = false;
+ } tile;
+
+ /* Work related to the full-frame render buffer. */
+ struct {
+ /* Write full render result.
+ * Implies reading the partial file from disk. */
+ bool write = false;
+ } full;
+
+ /* Display which is used to visualize render result. */
+ struct {
+ /* Display needs to be updated for the new render. */
+ bool update = false;
+
+ /* Display can use denoised result if available. */
+ bool use_denoised_result = true;
+ } display;
+
+ /* Re-balance multi-device scheduling after rendering this work.
+ * Note that the scheduler does not know anything about devices, so if there is only a single
+ * device used, then it is up to the PathTracer to ignore the balancing. */
+ bool rebalance = false;
+
+ /* Conversion to bool, to simplify checks about whether there is anything to be done for this
+ * work. */
+ inline operator bool() const
+ {
+ return path_trace.num_samples || adaptive_sampling.filter || display.update || tile.denoise ||
+ tile.write || full.write;
+ }
+};
+
+class RenderScheduler {
+ public:
+ RenderScheduler(TileManager &tile_manager, const SessionParams &params);
+
+ /* Specify whether cryptomatte-related work is to be scheduled. */
+ void set_need_schedule_cryptomatte(bool need_schedule_cryptomatte);
+
+ /* Allows disabling of re-balancing work, so that as much as possible is scheduled to a single
+ * device. */
+ void set_need_schedule_rebalance(bool need_schedule_rebalance);
+
+ bool is_background() const;
+
+ void set_denoiser_params(const DenoiseParams &params);
+ void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling);
+
+ bool is_adaptive_sampling_used() const;
+
+ /* Start sample for path tracing.
+ * The scheduler will schedule work using this sample as the first one. */
+ void set_start_sample(int start_sample);
+ int get_start_sample() const;
+
+ /* Number of samples to render, starting from start sample.
+ * The scheduler will schedule work in the range of
+ * [start_sample, start_sample + num_samples - 1], inclusively. */
+ void set_num_samples(int num_samples);
+ int get_num_samples() const;
+
+ /* Time limit for the path tracing tasks, in seconds.
+ * Zero disables the limit. */
+ void set_time_limit(double time_limit);
+ double get_time_limit() const;
+
+ /* Get sample up to which rendering has been done.
+ * This is an absolute 0-based value.
+ *
+ * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 14.
+ *
+ * If there were no samples rendered, then the behavior is undefined. */
+ int get_rendered_sample() const;
+
+ /* Get number of samples rendered within the current scheduling session.
+ *
+ * For example, if the start sample is 10 and 5 samples were rendered, then this call will
+ * return 5.
+ *
+ * Note that this is based on the scheduling information. In practice this means that once work
+ * has been requested for rendering, the scheduler considers it done. */
+ int get_num_rendered_samples() const;
+
+ /* Reset scheduler, indicating that rendering will happen from scratch.
+ * Resets current rendered state, as well as scheduling information. */
+ void reset(const BufferParams &buffer_params, int num_samples);
+
+ /* Reset the scheduler upon switching to the next tile.
+ * Keeps the same number of samples and full-frame render parameters, but resets progress and
+ * allows scheduling render work from the beginning of the new tile. */
+ void reset_for_next_tile();
+
+ /* Reschedule adaptive sampling work when all pixels did converge.
+ * If there is nothing else to be done for the adaptive sampling (pixels did converge to the
+ * final threshold) then false is returned and the render scheduler will stop scheduling path
+ * tracing work. Otherwise the work's adaptive sampling settings are modified to continue with a
+ * lower threshold. */
+ bool render_work_reschedule_on_converge(RenderWork &render_work);
+
+ /* Reschedule adaptive sampling work when the device is mostly idle, but not all pixels have
+ * converged yet.
+ * If re-scheduling is not possible (adaptive sampling is happening with the final threshold, and
+ * the path tracer is to finish the current pixels) then false is returned. */
+ bool render_work_reschedule_on_idle(RenderWork &render_work);
+
+ /* Reschedule work when rendering has been requested to cancel.
+ *
+ * Will skip all work which is not needed anymore because no more samples will be added (for
+ * example, adaptive sampling filtering and convergence check will be skipped).
+ * Will enable all work needed to make sure all passes are communicated to the software.
+ *
+ * NOTE: Should be used before passing work to `PathTrace::render_samples()`. */
+ void render_work_reschedule_on_cancel(RenderWork &render_work);
+
+ RenderWork get_render_work();
+
+ /* Report that the path tracer started to work, after scene update and loading kernels. */
+ void report_work_begin(const RenderWork &render_work);
+
+ /* Report time (in seconds) which corresponding part of work took. */
+ void report_path_trace_time(const RenderWork &render_work, double time, bool is_cancelled);
+ void report_path_trace_occupancy(const RenderWork &render_work, float occupancy);
+ void report_adaptive_filter_time(const RenderWork &render_work, double time, bool is_cancelled);
+ void report_denoise_time(const RenderWork &render_work, double time);
+ void report_display_update_time(const RenderWork &render_work, double time);
+ void report_rebalance_time(const RenderWork &render_work, double time, bool balance_changed);
+
+ /* Generate full multi-line report of the rendering process, including rendering parameters,
+ * times, and so on. */
+ string full_report() const;
+
+ protected:
+ /* Check whether all work has been scheduled and time limit was not exceeded.
+ *
+ * NOTE: Tricky bit: if the time limit was reached, done() is considered to be true, but some
+ * extra work still needs to be scheduled to denoise and write the final result. */
+ bool done() const;
+
+ /* Update scheduling state for a newly scheduled work.
+ * Takes care of things like checking whether the work was ever denoised, whether the tile was
+ * written, and similar state. */
+ void update_state_for_render_work(const RenderWork &render_work);
+
+ /* Returns true if any work was scheduled. */
+ bool set_postprocess_render_work(RenderWork *render_work);
+
+ /* Set work which is to be performed after all tiles have been rendered. */
+ void set_full_frame_render_work(RenderWork *render_work);
+
+ /* Update the start resolution divider based on the accumulated timing information, preserving a
+ * nice navigation feel. */
+ void update_start_resolution_divider();
+
+ /* Calculate desired update interval in seconds based on the current timings and settings.
+ * Will give an interval which provides good feeling updates during viewport navigation. */
+ double guess_viewport_navigation_update_interval_in_seconds() const;
+
+ /* Check whether denoising is active during an interactive update while the resolution divider
+ * is not 1. */
+ bool is_denoise_active_during_update() const;
+
+ /* Heuristic which aims to give a perceptually pleasant display update interval: at lower sample
+ * counts and near the beginning of rendering updates happen more often, while at higher sample
+ * counts and later in the render updates happen less often but device occupancy goes higher. */
+ double guess_display_update_interval_in_seconds() const;
+ double guess_display_update_interval_in_seconds_for_num_samples(int num_rendered_samples) const;
+ double guess_display_update_interval_in_seconds_for_num_samples_no_limit(
+ int num_rendered_samples) const;
+
+ /* Calculate the number of samples which can be rendered within the current desired update
+ * interval, as calculated by `guess_display_update_interval_in_seconds()`. */
+ int calculate_num_samples_per_update() const;
+
+ /* Get the start sample and the number of samples which are to be path traced in the current
+ * work. */
+ int get_start_sample_to_path_trace() const;
+ int get_num_samples_to_path_trace() const;
+
+ /* Calculate how many samples are to be scheduled while navigating at the given resolution
+ * divider. */
+ int get_num_samples_during_navigation(int resolution_divider) const;
+
+ /* Whether adaptive sampling convergence check and filter is to happen. */
+ bool work_need_adaptive_filter() const;
+
+ /* Calculate threshold for adaptive sampling. */
+ float work_adaptive_threshold() const;
+
+ /* Check whether the current work needs denoising.
+ * Denoising is not needed if the denoiser is not configured, or when denoising is happening too
+ * often.
+ *
+ * The `delayed` flag will be true when the denoiser is configured for use, but denoising was
+ * delayed to a later sample to reduce overhead.
+ *
+ * `ready_to_display` will be false if we may have a denoised result that is outdated due to
+ * increased samples. */
+ bool work_need_denoise(bool &delayed, bool &ready_to_display);
+
+ /* Check whether the current work needs to update the display.
+ *
+ * The `denoiser_delayed` is what `work_need_denoise()` returned as the delayed denoiser flag. */
+ bool work_need_update_display(const bool denoiser_delayed);
+
+ /* Check whether it is time to perform rebalancing for the render work. */
+ bool work_need_rebalance();
+
+ /* Check whether the timing of the given work is usable to store timings in the
+ * `first_render_time_` for the resolution divider calculation. */
+ bool work_is_usable_for_first_render_estimation(const RenderWork &render_work);
+
+ /* Check whether a timing report about the given work needs to reset the accumulated average
+ * time. */
+ bool work_report_reset_average(const RenderWork &render_work);
+
+ /* Check whether the render time limit has been reached (or exceeded), and if so store related
+ * information in the state so that rendering is considered finished and it is possible to
+ * report average render time information. */
+ void check_time_limit_reached();
+
+ /* Helper class to keep track of task timing.
+ *
+ * Contains two parts: wall time and average. The wall time is the actual wall-clock time of how
+ * long it took to complete all tasks of a given type. It is always advanced when the path
+ * tracer reports a time update.
+ *
+ * The average time is used for scheduling purposes. It is an estimate of how long it takes to
+ * perform a task at the final resolution. */
+ class TimeWithAverage {
+ public:
+ inline void reset()
+ {
+ total_wall_time_ = 0.0;
+
+ average_time_accumulator_ = 0.0;
+ num_average_times_ = 0;
+ }
+
+ inline void add_wall(double time)
+ {
+ total_wall_time_ += time;
+ }
+
+ inline void add_average(double time, int num_measurements = 1)
+ {
+ average_time_accumulator_ += time;
+ num_average_times_ += num_measurements;
+ }
+
+ inline double get_wall() const
+ {
+ return total_wall_time_;
+ }
+
+ inline double get_average() const
+ {
+ if (num_average_times_ == 0) {
+ return 0;
+ }
+ return average_time_accumulator_ / num_average_times_;
+ }
+
+ inline void reset_average()
+ {
+ average_time_accumulator_ = 0.0;
+ num_average_times_ = 0;
+ }
+
+ protected:
+ double total_wall_time_ = 0.0;
+
+ double average_time_accumulator_ = 0.0;
+ int num_average_times_ = 0;
+ };
+
+ struct {
+ int resolution_divider = 1;
+
+ /* Number of rendered samples on top of the start sample. */
+ int num_rendered_samples = 0;
+
+ /* Point in time the latest GPUDisplay work has been scheduled. */
+ double last_display_update_time = 0.0;
+ /* Value of -1 means display was never updated. */
+ int last_display_update_sample = -1;
+
+ /* Point in time at which last rebalance has been performed. */
+ double last_rebalance_time = 0.0;
+
+ /* Number of rebalance works which have been requested to be performed.
+ * The path tracer might ignore the work if there is a single device rendering. */
+ int num_rebalance_requested = 0;
+
+ /* Number of rebalance works handled which did change balance across devices. */
+ int num_rebalance_changes = 0;
+
+ bool need_rebalance_at_next_work = false;
+
+ /* Denotes whether the latest performed rebalance work caused an actual rebalance of work
+ * across devices. */
+ bool last_rebalance_changed = false;
+
+ /* Threshold for adaptive sampling which will be scheduled to work when not using progressive
+ * noise floor. */
+ float adaptive_sampling_threshold = 0.0f;
+
+ bool last_work_tile_was_denoised = false;
+ bool tile_result_was_written = false;
+ bool postprocess_work_scheduled = false;
+ bool full_frame_work_scheduled = false;
+ bool full_frame_was_written = false;
+
+ bool path_trace_finished = false;
+ bool time_limit_reached = false;
+
+ /* Time at which rendering started and finished. */
+ double start_render_time = 0.0;
+ double end_render_time = 0.0;
+
+ /* Occupancy of the render devices, normalized to the number of samples.
+ *
+ * In a way it is "trailing": when scheduling new work, this occupancy is the one measured while
+ * the previous work was rendered. */
+ int occupancy_num_samples = 0;
+ float occupancy = 1.0f;
+ } state_;
+
+ /* Timing of tasks which were performed at the very first render work at 100% of the
+ * resolution. This timing information is used to estimate the resolution divider for fast
+ * navigation. */
+ struct {
+ double path_trace_per_sample;
+ double denoise_time;
+ double display_update_time;
+ } first_render_time_;
+
+ TimeWithAverage path_trace_time_;
+ TimeWithAverage adaptive_filter_time_;
+ TimeWithAverage denoise_time_;
+ TimeWithAverage display_update_time_;
+ TimeWithAverage rebalance_time_;
+
+ /* Whether cryptomatte-related work will be scheduled. */
+ bool need_schedule_cryptomatte_ = false;
+
+ /* Whether to schedule device load rebalance works.
+ * Rebalancing requires some special treatment for update intervals and such, so if it's known
+ * that the rebalance will be ignored (e.g. due to single-device rendering) it is better to
+ * fully ignore rebalancing logic. */
+ bool need_schedule_rebalance_works_ = false;
+
+ /* Path tracing work will be scheduled for samples from within the
+ * [start_sample_, start_sample_ + num_samples_ - 1] range, inclusive. */
+ int start_sample_ = 0;
+ int num_samples_ = 0;
+
+ /* Limit in seconds for how long path tracing is allowed to happen.
+ * Zero means no limit is applied. */
+ double time_limit_ = 0.0;
+
+ /* Headless rendering without interface. */
+ bool headless_;
+
+ /* Background (offline) rendering. */
+ bool background_;
+
+ /* Pixel size is used to force a lower-resolution render for the final pass. Useful for Retina
+ * and other types of hi-DPI displays. */
+ int pixel_size_ = 1;
+
+ TileManager &tile_manager_;
+
+ BufferParams buffer_params_;
+ DenoiseParams denoiser_params_;
+
+ AdaptiveSampling adaptive_sampling_;
+
+ /* Progressively lower adaptive sampling threshold level, keeping the image at a uniform noise
+ * level. */
+ bool use_progressive_noise_floor_ = false;
+
+ /* Default value for the resolution divider which will be used when there is no render time
+ * information available yet.
+ * It is also what defines the upper limit of the automatically calculated resolution divider. */
+ int default_start_resolution_divider_ = 1;
+
+ /* Initial resolution divider which will be used on render scheduler reset. */
+ int start_resolution_divider_ = 0;
+
+ /* Calculate the smallest resolution divider which will bring the actual rendering time below
+ * the desired one. This call assumes a linear dependency of render time on the number of pixels
+ * (a quadratic dependency on the resolution divider): a resolution divider of 2 brings render
+ * time down by a factor of 4. */
+ int calculate_resolution_divider_for_time(double desired_time, double actual_time);
+};
+
+int calculate_resolution_divider_for_resolution(int width, int height, int resolution);
+
+int calculate_resolution_for_divider(int width, int height, int resolution_divider);
+
+CCL_NAMESPACE_END
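
The quadratic relationship described above for `calculate_resolution_divider_for_time()` can be checked with a small standalone sketch. This is illustrative only and not part of the patch; the actual implementation may round or clamp differently:

  /* Sketch: each doubling of the divider quarters the pixel count, so it is
   * expected to cut render time by roughly a factor of 4. */
  static int sketch_resolution_divider_for_time(double desired_time, double actual_time)
  {
    int divider = 1;
    while (actual_time > desired_time) {
      actual_time /= 4.0;
      divider *= 2;
    }
    return divider;
  }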
diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp
new file mode 100644
index 00000000000..d35ff4cd03f
--- /dev/null
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/shader_eval.h"
+
+#include "device/device.h"
+#include "device/device_queue.h"
+
+#include "device/cpu/kernel.h"
+#include "device/cpu/kernel_thread_globals.h"
+
+#include "util/util_logging.h"
+#include "util/util_progress.h"
+#include "util/util_tbb.h"
+
+CCL_NAMESPACE_BEGIN
+
+ShaderEval::ShaderEval(Device *device, Progress &progress) : device_(device), progress_(progress)
+{
+ DCHECK_NE(device_, nullptr);
+}
+
+bool ShaderEval::eval(const ShaderEvalType type,
+ const int max_num_points,
+ const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
+ const function<void(device_vector<float4> &)> &read_output)
+{
+ bool first_device = true;
+ bool success = true;
+
+ device_->foreach_device([&](Device *device) {
+ if (!first_device) {
+ LOG(ERROR) << "Multi-devices are not yet fully implemented, will evaluate shader on a "
+ "single device.";
+ return;
+ }
+ first_device = false;
+
+ device_vector<KernelShaderEvalInput> input(device, "ShaderEval input", MEM_READ_ONLY);
+ device_vector<float4> output(device, "ShaderEval output", MEM_READ_WRITE);
+
+ /* Allocate and copy device buffers. */
+ DCHECK_EQ(input.device, device);
+ DCHECK_EQ(output.device, device);
+ DCHECK_LE(output.size(), input.size());
+
+ input.alloc(max_num_points);
+ int num_points = fill_input(input);
+ if (num_points == 0) {
+ return;
+ }
+
+ input.copy_to_device();
+ output.alloc(num_points);
+ output.zero_to_device();
+
+ /* Evaluate on CPU or GPU. */
+ success = (device->info.type == DEVICE_CPU) ? eval_cpu(device, type, input, output) :
+ eval_gpu(device, type, input, output);
+
+ /* Copy data back from device if not canceled. */
+ if (success) {
+ output.copy_from_device(0, 1, output.size());
+ read_output(output);
+ }
+
+ input.free();
+ output.free();
+ });
+
+ return success;
+}
+
+bool ShaderEval::eval_cpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output)
+{
+ vector<CPUKernelThreadGlobals> kernel_thread_globals;
+ device->get_cpu_kernel_thread_globals(kernel_thread_globals);
+
+ /* Find required kernel function. */
+ const CPUKernels &kernels = *(device->get_cpu_kernels());
+
+ /* Simple parallel_for over all work items. */
+ const int64_t work_size = output.size();
+ KernelShaderEvalInput *input_data = input.data();
+ float4 *output_data = output.data();
+ bool success = true;
+
+ tbb::task_arena local_arena(device->info.cpu_threads);
+ local_arena.execute([&]() {
+ tbb::parallel_for(int64_t(0), work_size, [&](int64_t work_index) {
+ /* TODO: is this fast enough? */
+ if (progress_.get_cancel()) {
+ success = false;
+ return;
+ }
+
+ const int thread_index = tbb::this_task_arena::current_thread_index();
+ KernelGlobals *kg = &kernel_thread_globals[thread_index];
+
+ switch (type) {
+ case SHADER_EVAL_DISPLACE:
+ kernels.shader_eval_displace(kg, input_data, output_data, work_index);
+ break;
+ case SHADER_EVAL_BACKGROUND:
+ kernels.shader_eval_background(kg, input_data, output_data, work_index);
+ break;
+ }
+ });
+ });
+
+ return success;
+}
+
+bool ShaderEval::eval_gpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output)
+{
+ /* Find required kernel function. */
+ DeviceKernel kernel;
+ switch (type) {
+ case SHADER_EVAL_DISPLACE:
+ kernel = DEVICE_KERNEL_SHADER_EVAL_DISPLACE;
+ break;
+ case SHADER_EVAL_BACKGROUND:
+ kernel = DEVICE_KERNEL_SHADER_EVAL_BACKGROUND;
+ break;
+ };
+
+ /* Create device queue. */
+ unique_ptr<DeviceQueue> queue = device->gpu_queue_create();
+ queue->init_execution();
+
+ /* Execute work on the GPU in chunks, so we can cancel.
+ * TODO: query the appropriate chunk size from the device. */
+ const int chunk_size = 65536;
+
+ const int work_size = output.size();
+ void *d_input = (void *)input.device_pointer;
+ void *d_output = (void *)output.device_pointer;
+
+ for (int d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
+ int d_work_size = min(chunk_size, work_size - d_offset);
+ void *args[] = {&d_input, &d_output, &d_offset, &d_work_size};
+
+ queue->enqueue(kernel, d_work_size, args);
+ queue->synchronize();
+
+ if (progress_.get_cancel()) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/shader_eval.h b/intern/cycles/integrator/shader_eval.h
new file mode 100644
index 00000000000..7dbf334b8d7
--- /dev/null
+++ b/intern/cycles/integrator/shader_eval.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_memory.h"
+
+#include "kernel/kernel_types.h"
+
+#include "util/util_function.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class Progress;
+
+enum ShaderEvalType {
+ SHADER_EVAL_DISPLACE,
+ SHADER_EVAL_BACKGROUND,
+};
+
+/* ShaderEval class performs shader evaluation for background light and displacement. */
+class ShaderEval {
+ public:
+ ShaderEval(Device *device, Progress &progress);
+
+ /* Evaluate shader at points specified by KernelShaderEvalInput and write out
+ * RGBA colors to output. */
+ bool eval(const ShaderEvalType type,
+ const int max_num_points,
+ const function<int(device_vector<KernelShaderEvalInput> &)> &fill_input,
+ const function<void(device_vector<float4> &)> &read_output);
+
+ protected:
+ bool eval_cpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output);
+ bool eval_gpu(Device *device,
+ const ShaderEvalType type,
+ device_vector<KernelShaderEvalInput> &input,
+ device_vector<float4> &output);
+
+ Device *device_;
+ Progress &progress_;
+};
+
+CCL_NAMESPACE_END
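
To illustrate the callback-based API above, here is a hypothetical caller that bakes displacement for a set of mesh points. The `KernelShaderEvalInput` fields (object, prim, u, v) follow the kernel type; all other identifiers (`num_points`, `object_index`, `prims`, `uvs`, `displacement`) are assumptions of this sketch, not part of the patch:

  ShaderEval shader_eval(device, progress);

  const bool success = shader_eval.eval(
      SHADER_EVAL_DISPLACE,
      num_points,
      [&](device_vector<KernelShaderEvalInput> &input) {
        /* Fill one input entry per point to be displaced. */
        KernelShaderEvalInput *data = input.data();
        for (int i = 0; i < num_points; i++) {
          data[i].object = object_index;
          data[i].prim = prims[i];
          data[i].u = uvs[i].x;
          data[i].v = uvs[i].y;
        }
        return num_points; /* Number of points actually filled in. */
      },
      [&](device_vector<float4> &output) {
        /* Read back the evaluated displacement vectors. */
        const float4 *result = output.data();
        for (int i = 0; i < num_points; i++) {
          displacement[i] = make_float3(result[i].x, result[i].y, result[i].z);
        }
      });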
diff --git a/intern/cycles/integrator/tile.cpp b/intern/cycles/integrator/tile.cpp
new file mode 100644
index 00000000000..3387b7bedf1
--- /dev/null
+++ b/intern/cycles/integrator/tile.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/tile.h"
+
+#include "util/util_logging.h"
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+std::ostream &operator<<(std::ostream &os, const TileSize &tile_size)
+{
+ os << "size: (" << tile_size.width << ", " << tile_size.height << ")";
+ os << ", num_samples: " << tile_size.num_samples;
+ return os;
+}
+
+ccl_device_inline uint round_down_to_power_of_two(uint x)
+{
+ if (is_power_of_two(x)) {
+ return x;
+ }
+
+ return prev_power_of_two(x);
+}
+
+ccl_device_inline uint round_up_to_power_of_two(uint x)
+{
+ if (is_power_of_two(x)) {
+ return x;
+ }
+
+ return next_power_of_two(x);
+}
+
+TileSize tile_calculate_best_size(const int2 &image_size,
+ const int num_samples,
+ const int max_num_path_states)
+{
+ if (max_num_path_states == 1) {
+ /* Simple case: avoid any calculation, which could cause rounding issues. */
+ return TileSize(1, 1, 1);
+ }
+
+ const int64_t num_pixels = image_size.x * image_size.y;
+ const int64_t num_pixel_samples = num_pixels * num_samples;
+
+ if (max_num_path_states >= num_pixel_samples) {
+ /* Image fully fits into the state (could be border render, for example). */
+ return TileSize(image_size.x, image_size.y, num_samples);
+ }
+
+ /* The idea here is to keep the number of samples per tile as high as possible to improve
+ * coherency across threads.
+ *
+ * Some general ideas:
+ * - Prefer smaller tiles with more samples, which improves spatial coherency of paths.
+ * - Keep values a power of two, so that an integer number of tiles fits into the maximum
+ * number of paths. */
+
+ TileSize tile_size;
+
+ /* Calculate the tile size as the largest one which can fit an entire range of samples.
+ * The idea here is to keep tiles as small as possible, and keep the device occupied by
+ * scheduling multiple tiles with the same coordinates rendering different samples. */
+ const int num_path_states_per_sample = max_num_path_states / num_samples;
+ if (num_path_states_per_sample != 0) {
+ tile_size.width = round_down_to_power_of_two(lround(sqrt(num_path_states_per_sample)));
+ tile_size.height = tile_size.width;
+ }
+ else {
+ tile_size.width = tile_size.height = 1;
+ }
+
+ if (num_samples == 1) {
+ tile_size.num_samples = 1;
+ }
+ else {
+ /* The heuristic here is to have a more uniform division of the sample range: for example,
+ * prefer [32 <38 times>, 8] over [1024, 200]. This allows more tiles to be greedily added
+ * early on. */
+ tile_size.num_samples = min(round_up_to_power_of_two(lround(sqrt(num_samples / 2))),
+ static_cast<uint>(num_samples));
+
+ const int tile_area = tile_size.width * tile_size.height;
+ tile_size.num_samples = min(tile_size.num_samples, max_num_path_states / tile_area);
+ }
+
+ DCHECK_GE(tile_size.width, 1);
+ DCHECK_GE(tile_size.height, 1);
+ DCHECK_GE(tile_size.num_samples, 1);
+ DCHECK_LE(tile_size.width * tile_size.height * tile_size.num_samples, max_num_path_states);
+
+ return tile_size;
+}
+
+CCL_NAMESPACE_END
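
A worked example of the heuristic above, with assumed numbers (not part of the patch):

  /* A 1920x1080 image, 256 samples, and 2^20 path states. */
  const TileSize tile_size = tile_calculate_best_size(
      make_int2(1920, 1080), 256, 1 << 20);
  /* num_path_states_per_sample = 2^20 / 256 = 4096, so the tile is 64x64
   * (round_down_to_power_of_two(lround(sqrt(4096)))).
   * num_samples = round_up_to_power_of_two(lround(sqrt(256 / 2))) = 16.
   * The result is 64x64 tiles of 16 samples: 65536 path states per tile,
   * so up to 16 such tiles can be in flight at once. */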
diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h
new file mode 100644
index 00000000000..d0824843ddb
--- /dev/null
+++ b/intern/cycles/integrator/tile.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <ostream>
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct TileSize {
+ TileSize() = default;
+
+ inline TileSize(int width, int height, int num_samples)
+ : width(width), height(height), num_samples(num_samples)
+ {
+ }
+
+ inline bool operator==(const TileSize &other) const
+ {
+ return width == other.width && height == other.height && num_samples == other.num_samples;
+ }
+ inline bool operator!=(const TileSize &other) const
+ {
+ return !(*this == other);
+ }
+
+ int width = 0, height = 0;
+ int num_samples = 0;
+};
+
+std::ostream &operator<<(std::ostream &os, const TileSize &tile_size);
+
+/* Calculate the tile size which is best suited for rendering an image of the given size with the
+ * given number of active path states.
+ * Will attempt to provide a best guess to keep path tracing threads of a device as localized as
+ * possible, and to have as many threads active for every tile as possible. */
+TileSize tile_calculate_best_size(const int2 &image_size,
+ const int num_samples,
+ const int max_num_path_states);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_balancer.cpp b/intern/cycles/integrator/work_balancer.cpp
new file mode 100644
index 00000000000..9f96fe3632b
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_balancer.h"
+
+#include "util/util_math.h"
+
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos)
+{
+ const int num_infos = work_balance_infos.size();
+
+ if (num_infos == 1) {
+ work_balance_infos[0].weight = 1.0;
+ return;
+ }
+
+ /* There are no statistics available, so start with an equal distribution. */
+ const double weight = 1.0 / num_infos;
+ for (WorkBalanceInfo &balance_info : work_balance_infos) {
+ balance_info.weight = weight;
+ }
+}
+
+static double calculate_total_time(const vector<WorkBalanceInfo> &work_balance_infos)
+{
+ double total_time = 0;
+ for (const WorkBalanceInfo &info : work_balance_infos) {
+ total_time += info.time_spent;
+ }
+ return total_time;
+}
+
+/* The balance is based on equalizing the time which devices spent performing a task. Assume that
+ * the average of the observed times is usable for estimating whether more or less work is to be
+ * scheduled, and how big a difference in the work scheduling is needed. */
+
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos)
+{
+ const int num_infos = work_balance_infos.size();
+
+ const double total_time = calculate_total_time(work_balance_infos);
+ const double time_average = total_time / num_infos;
+
+ double total_weight = 0;
+ vector<double> new_weights;
+ new_weights.reserve(num_infos);
+
+ /* Equalize the overall average time. This means that we don't make every device perform an
+ * amount of work based on the current average, but rather that the times will equalize after
+ * the weight changes.
+ * One can think of it this way: if one of the devices is 10% faster than another, then one
+ * device needs to do 5% less of the current work, and the other needs to do 5% more. */
+ const double lerp_weight = 1.0 / num_infos;
+
+ bool has_big_difference = false;
+
+ for (const WorkBalanceInfo &info : work_balance_infos) {
+ const double time_target = lerp(info.time_spent, time_average, lerp_weight);
+ const double new_weight = info.weight * time_target / info.time_spent;
+ new_weights.push_back(new_weight);
+ total_weight += new_weight;
+
+ if (std::fabs(1.0 - time_target / time_average) > 0.02) {
+ has_big_difference = true;
+ }
+ }
+
+ if (!has_big_difference) {
+ return false;
+ }
+
+ const double total_weight_inv = 1.0 / total_weight;
+ for (int i = 0; i < num_infos; ++i) {
+ WorkBalanceInfo &info = work_balance_infos[i];
+ info.weight = new_weights[i] * total_weight_inv;
+ info.time_spent = 0;
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
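
A worked numerical example of the rebalancing step (all numbers assumed, not part of the patch):

  vector<WorkBalanceInfo> infos(2);
  infos[0].weight = 0.5; infos[0].time_spent = 9.0;  /* Faster device. */
  infos[1].weight = 0.5; infos[1].time_spent = 11.0; /* Slower device. */

  work_balance_do_rebalance(infos); /* Returns true: 5% deviation > 2%. */

  /* time_average = 10 and lerp_weight = 0.5, so the per-device targets are
   * 9.5s and 10.5s. Raw new weights: 0.5 * 9.5 / 9 = 0.528 and
   * 0.5 * 10.5 / 11 = 0.477. After normalization the faster device gets
   * ~0.525 of the work and the slower one ~0.475: the gap is halved rather
   * than jumping straight to the fully equalizing split. */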
diff --git a/intern/cycles/integrator/work_balancer.h b/intern/cycles/integrator/work_balancer.h
new file mode 100644
index 00000000000..fc5e561845e
--- /dev/null
+++ b/intern/cycles/integrator/work_balancer.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+struct WorkBalanceInfo {
+ /* Time spent performing corresponding work. */
+ double time_spent = 0;
+
+ /* Average occupancy of the device while performing the work. */
+ float occupancy = 1.0f;
+
+ /* Normalized weight, which is ready to be used for work balancing (such as calculating the
+ * fraction of the big tile which is to be rendered on the device). */
+};
+
+/* Balance work for an initial render integration, before any statistics are known. */
+void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos);
+
+/* Rebalance work after statistics have been accumulated.
+ * Returns true if the balance did change. */
+bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp
new file mode 100644
index 00000000000..e6ada2f46ee
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "integrator/work_tile_scheduler.h"
+
+#include "device/device_queue.h"
+#include "integrator/tile.h"
+#include "render/buffers.h"
+#include "util/util_atomic.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+WorkTileScheduler::WorkTileScheduler()
+{
+}
+
+void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
+{
+ max_num_path_states_ = max_num_path_states;
+}
+
+void WorkTileScheduler::reset(const BufferParams &buffer_params, int sample_start, int samples_num)
+{
+ /* Image buffer parameters. */
+ image_full_offset_px_.x = buffer_params.full_x;
+ image_full_offset_px_.y = buffer_params.full_y;
+
+ image_size_px_ = make_int2(buffer_params.width, buffer_params.height);
+
+ offset_ = buffer_params.offset;
+ stride_ = buffer_params.stride;
+
+ /* Samples parameters. */
+ sample_start_ = sample_start;
+ samples_num_ = samples_num;
+
+ /* Initialize new scheduling. */
+ reset_scheduler_state();
+}
+
+void WorkTileScheduler::reset_scheduler_state()
+{
+ tile_size_ = tile_calculate_best_size(image_size_px_, samples_num_, max_num_path_states_);
+
+ VLOG(3) << "Will schedule tiles of size " << tile_size_;
+
+ if (VLOG_IS_ON(3)) {
+ /* The logging is based on multiple tiles scheduled, ignoring overhead of multi-tile scheduling
+ * and purely focusing on the number of used path states. */
+ const int num_path_states_in_tile = tile_size_.width * tile_size_.height *
+ tile_size_.num_samples;
+ const int num_tiles = max_num_path_states_ / num_path_states_in_tile;
+ VLOG(3) << "Number of unused path states: "
+ << max_num_path_states_ - num_tiles * num_path_states_in_tile;
+ }
+
+ num_tiles_x_ = divide_up(image_size_px_.x, tile_size_.width);
+ num_tiles_y_ = divide_up(image_size_px_.y, tile_size_.height);
+
+ total_tiles_num_ = num_tiles_x_ * num_tiles_y_;
+ num_tiles_per_sample_range_ = divide_up(samples_num_, tile_size_.num_samples);
+
+ next_work_index_ = 0;
+ total_work_size_ = total_tiles_num_ * num_tiles_per_sample_range_;
+}
+
+bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_size)
+{
+ /* Note that the `max_work_size` can be higher than the `max_num_path_states_`: this is because
+ * the path trace work can decide to use smaller tile sizes and greedily schedule multiple tiles,
+ * improving overall device occupancy.
+ * So the `max_num_path_states_` is a "scheduling unit", and the `max_work_size` is a "scheduling
+ * limit". */
+
+ DCHECK_NE(max_num_path_states_, 0);
+
+ const int work_index = atomic_fetch_and_add_int32(&next_work_index_, 1);
+ if (work_index >= total_work_size_) {
+ return false;
+ }
+
+ const int sample_range_index = work_index % num_tiles_per_sample_range_;
+ const int start_sample = sample_range_index * tile_size_.num_samples;
+ const int tile_index = work_index / num_tiles_per_sample_range_;
+ const int tile_y = tile_index / num_tiles_x_;
+ const int tile_x = tile_index - tile_y * num_tiles_x_;
+
+ KernelWorkTile work_tile;
+ work_tile.x = tile_x * tile_size_.width;
+ work_tile.y = tile_y * tile_size_.height;
+ work_tile.w = tile_size_.width;
+ work_tile.h = tile_size_.height;
+ work_tile.start_sample = sample_start_ + start_sample;
+ work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample);
+ work_tile.offset = offset_;
+ work_tile.stride = stride_;
+
+ work_tile.w = min(work_tile.w, image_size_px_.x - work_tile.x);
+ work_tile.h = min(work_tile.h, image_size_px_.y - work_tile.y);
+
+ work_tile.x += image_full_offset_px_.x;
+ work_tile.y += image_full_offset_px_.y;
+
+ const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+ DCHECK_GT(tile_work_size, 0);
+
+ if (max_work_size && tile_work_size > max_work_size) {
+ /* The work did not fit into the requested limit of the work size. Unschedule the tile,
+ * allowing others (or ourselves later on) to pick it up.
+ *
+ * TODO: Such a temporary decrement is not ideal, since it might lead to a situation where
+ * another device sees there is nothing to be done, finishes its own work, and leaves all
+ * remaining work to be done by us. */
+ atomic_fetch_and_add_int32(&next_work_index_, -1);
+ return false;
+ }
+
+ *work_tile_ = work_tile;
+
+ return true;
+}
+
+CCL_NAMESPACE_END
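
The decomposition of `work_index` in `get_work()` is easiest to see with concrete numbers (all assumed, not part of the patch):

  /* With 64x64 tiles of 16 samples on a 1920x1080 image and 256 samples:
   * num_tiles_x_ = divide_up(1920, 64) = 30
   * num_tiles_y_ = divide_up(1080, 64) = 17
   * num_tiles_per_sample_range_ = divide_up(256, 16) = 16
   * total_work_size_ = 30 * 17 * 16 = 8160
   *
   * A work_index of 1000 then decomposes as:
   * sample_range_index = 1000 % 16 = 8 -> start_sample = 8 * 16 = 128
   * tile_index = 1000 / 16 = 62 -> tile_y = 62 / 30 = 2, tile_x = 2
   * i.e. the tile with origin (128, 128) rendering samples [128, 143]. */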
diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h
new file mode 100644
index 00000000000..85f11b601c7
--- /dev/null
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "integrator/tile.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+
+struct KernelWorkTile;
+
+/* Scheduler of device work tiles.
+ * Takes care of feeding work which needs to be done to multiple devices running in parallel. */
+class WorkTileScheduler {
+ public:
+ WorkTileScheduler();
+
+ /* Maximum number of path states which are allowed to be used by a single scheduled work tile.
+ *
+ * Affects the scheduled work size: the work size will be as big as possible, but will not exceed
+ * this number of states. */
+ void set_max_num_path_states(int max_num_path_states);
+
+ /* Scheduling will happen for pixels within a big tile denoted by its parameters. */
+ void reset(const BufferParams &buffer_params, int sample_start, int samples_num);
+
+ /* Get work for a device.
+ * Returns true if there is still work to be done and initializes the work tile with all
+ * parameters of this work. If there is nothing remaining to be done, returns false and the
+ * work tile is kept unchanged.
+ *
+ * Optionally pass `max_work_size` to do nothing if there is no tile small enough. */
+ bool get_work(KernelWorkTile *work_tile, const int max_work_size = 0);
+
+ protected:
+ void reset_scheduler_state();
+
+ /* Maximum allowed path states to be used.
+ *
+ * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the
+ * number of path states is kind of a detail. Is there a more generic term from the scheduler
+ * point of view? */
+ int max_num_path_states_ = 0;
+
+ /* Offset in pixels within a global buffer. */
+ int2 image_full_offset_px_ = make_int2(0, 0);
+
+ /* Dimensions of the currently rendered image, in pixels. */
+ int2 image_size_px_ = make_int2(0, 0);
+
+ /* Offset and stride of the buffer within which scheduling is happening.
+ * Will be passed over to the KernelWorkTile. */
+ int offset_, stride_;
+
+ /* Start sample index and the number of samples which are to be rendered.
+ * The scheduler will cover the sample range of [start, start + num - 1] over the entire image
+ * (splitting it into smaller work tiles). */
+ int sample_start_ = 0;
+ int samples_num_ = 0;
+
+ /* Tile size which will be scheduled for rendering. */
+ TileSize tile_size_;
+
+ /* Number of tiles in X and Y axis of the image. */
+ int num_tiles_x_, num_tiles_y_;
+
+ /* Total number of tiles on the image.
+ * Pre-calculated as `num_tiles_x_ * num_tiles_y_` and re-used in `get_work()`.
+ *
+ * TODO(sergey): Is this an over-optimization? Maybe the cost of calculating the value in
+ * `get_work()` is unmeasurable? */
+ int total_tiles_num_ = 0;
+
+ /* When the number of samples in `tile_size_` is lower than `samples_num_`, denotes how many
+ * tiles are to be "stacked" to cover the entire requested range of samples. */
+
+ int next_work_index_ = 0;
+ int total_work_size_ = 0;
+};
+
+CCL_NAMESPACE_END
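
A hypothetical render loop sketching the intended call pattern of the scheduler; everything except the `WorkTileScheduler` API is an assumption of this sketch:

  WorkTileScheduler work_tile_scheduler;
  work_tile_scheduler.set_max_num_path_states(max_num_path_states);
  work_tile_scheduler.reset(buffer_params, start_sample, num_samples);

  KernelWorkTile work_tile;
  while (work_tile_scheduler.get_work(&work_tile)) {
    /* `work_tile` now holds x/y, w/h, start_sample/num_samples and the
     * buffer offset/stride for one chunk of work. */
    render_work_tile_on_device(work_tile);
  }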
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 0ce33c51778..4196539a9b1 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -22,68 +22,22 @@ set(INC_SYS
)
-set(SRC_CPU_KERNELS
- kernels/cpu/kernel.cpp
- kernels/cpu/kernel_sse2.cpp
- kernels/cpu/kernel_sse3.cpp
- kernels/cpu/kernel_sse41.cpp
- kernels/cpu/kernel_avx.cpp
- kernels/cpu/kernel_avx2.cpp
- kernels/cpu/kernel_split.cpp
- kernels/cpu/kernel_split_sse2.cpp
- kernels/cpu/kernel_split_sse3.cpp
- kernels/cpu/kernel_split_sse41.cpp
- kernels/cpu/kernel_split_avx.cpp
- kernels/cpu/kernel_split_avx2.cpp
- kernels/cpu/filter.cpp
- kernels/cpu/filter_sse2.cpp
- kernels/cpu/filter_sse3.cpp
- kernels/cpu/filter_sse41.cpp
- kernels/cpu/filter_avx.cpp
- kernels/cpu/filter_avx2.cpp
+set(SRC_DEVICE_CPU
+ device/cpu/kernel.cpp
+ device/cpu/kernel_sse2.cpp
+ device/cpu/kernel_sse3.cpp
+ device/cpu/kernel_sse41.cpp
+ device/cpu/kernel_avx.cpp
+ device/cpu/kernel_avx2.cpp
)
-set(SRC_CUDA_KERNELS
- kernels/cuda/kernel.cu
- kernels/cuda/kernel_split.cu
- kernels/cuda/filter.cu
+set(SRC_DEVICE_CUDA
+ device/cuda/kernel.cu
)
-set(SRC_OPENCL_KERNELS
- kernels/opencl/kernel_adaptive_stopping.cl
- kernels/opencl/kernel_adaptive_filter_x.cl
- kernels/opencl/kernel_adaptive_filter_y.cl
- kernels/opencl/kernel_adaptive_adjust_samples.cl
- kernels/opencl/kernel_bake.cl
- kernels/opencl/kernel_base.cl
- kernels/opencl/kernel_displace.cl
- kernels/opencl/kernel_background.cl
- kernels/opencl/kernel_state_buffer_size.cl
- kernels/opencl/kernel_split_bundle.cl
- kernels/opencl/kernel_data_init.cl
- kernels/opencl/kernel_path_init.cl
- kernels/opencl/kernel_queue_enqueue.cl
- kernels/opencl/kernel_scene_intersect.cl
- kernels/opencl/kernel_lamp_emission.cl
- kernels/opencl/kernel_do_volume.cl
- kernels/opencl/kernel_indirect_background.cl
- kernels/opencl/kernel_shader_setup.cl
- kernels/opencl/kernel_shader_sort.cl
- kernels/opencl/kernel_shader_eval.cl
- kernels/opencl/kernel_holdout_emission_blurring_pathtermination_ao.cl
- kernels/opencl/kernel_subsurface_scatter.cl
- kernels/opencl/kernel_direct_lighting.cl
- kernels/opencl/kernel_shadow_blocked_ao.cl
- kernels/opencl/kernel_shadow_blocked_dl.cl
- kernels/opencl/kernel_enqueue_inactive.cl
- kernels/opencl/kernel_next_iteration_setup.cl
- kernels/opencl/kernel_indirect_subsurface.cl
- kernels/opencl/kernel_buffer_update.cl
- kernels/opencl/filter.cl
-)
-
-set(SRC_OPTIX_KERNELS
- kernels/optix/kernel_optix.cu
+set(SRC_DEVICE_OPTIX
+ device/optix/kernel.cu
+ device/optix/kernel_shader_raytrace.cu
)
set(SRC_BVH_HEADERS
@@ -105,63 +59,56 @@ set(SRC_HEADERS
kernel_bake.h
kernel_camera.h
kernel_color.h
- kernel_compat_cpu.h
- kernel_compat_cuda.h
- kernel_compat_optix.h
- kernel_compat_opencl.h
kernel_differential.h
kernel_emission.h
kernel_film.h
- kernel_globals.h
kernel_id_passes.h
kernel_jitter.h
kernel_light.h
kernel_light_background.h
kernel_light_common.h
+ kernel_lookup_table.h
kernel_math.h
kernel_montecarlo.h
kernel_passes.h
- kernel_path.h
- kernel_path_branched.h
- kernel_path_common.h
kernel_path_state.h
- kernel_path_surface.h
- kernel_path_subsurface.h
- kernel_path_volume.h
kernel_profiling.h
kernel_projection.h
- kernel_queues.h
kernel_random.h
kernel_shader.h
- kernel_shadow.h
- kernel_subsurface.h
+ kernel_shadow_catcher.h
kernel_textures.h
kernel_types.h
- kernel_volume.h
kernel_work_stealing.h
kernel_write_passes.h
)
-set(SRC_KERNELS_CPU_HEADERS
- kernel.h
- kernels/cpu/kernel_cpu.h
- kernels/cpu/kernel_cpu_impl.h
- kernels/cpu/kernel_cpu_image.h
- kernels/cpu/filter_cpu.h
- kernels/cpu/filter_cpu_impl.h
+set(SRC_DEVICE_CPU_HEADERS
+ device/cpu/compat.h
+ device/cpu/image.h
+ device/cpu/globals.h
+ device/cpu/kernel.h
+ device/cpu/kernel_arch.h
+ device/cpu/kernel_arch_impl.h
)
-
-set(SRC_KERNELS_CUDA_HEADERS
- kernels/cuda/kernel_config.h
- kernels/cuda/kernel_cuda_image.h
+set(SRC_DEVICE_GPU_HEADERS
+ device/gpu/image.h
+ device/gpu/kernel.h
+ device/gpu/parallel_active_index.h
+ device/gpu/parallel_prefix_sum.h
+ device/gpu/parallel_reduce.h
+ device/gpu/parallel_sorted_index.h
)
-set(SRC_KERNELS_OPTIX_HEADERS
+set(SRC_DEVICE_CUDA_HEADERS
+ device/cuda/compat.h
+ device/cuda/config.h
+ device/cuda/globals.h
)
-set(SRC_KERNELS_OPENCL_HEADERS
- kernels/opencl/kernel_split_function.h
- kernels/opencl/kernel_opencl_image.h
+set(SRC_DEVICE_OPTIX_HEADERS
+ device/optix/compat.h
+ device/optix/globals.h
)
set(SRC_CLOSURE_HEADERS
@@ -259,25 +206,32 @@ set(SRC_GEOM_HEADERS
geom/geom_object.h
geom/geom_patch.h
geom/geom_primitive.h
+ geom/geom_shader_data.h
geom/geom_subd_triangle.h
geom/geom_triangle.h
geom/geom_triangle_intersect.h
geom/geom_volume.h
)
-set(SRC_FILTER_HEADERS
- filter/filter.h
- filter/filter_defines.h
- filter/filter_features.h
- filter/filter_features_sse.h
- filter/filter_kernel.h
- filter/filter_nlm_cpu.h
- filter/filter_nlm_gpu.h
- filter/filter_prefilter.h
- filter/filter_reconstruction.h
- filter/filter_transform.h
- filter/filter_transform_gpu.h
- filter/filter_transform_sse.h
+set(SRC_INTEGRATOR_HEADERS
+ integrator/integrator_init_from_bake.h
+ integrator/integrator_init_from_camera.h
+ integrator/integrator_intersect_closest.h
+ integrator/integrator_intersect_shadow.h
+ integrator/integrator_intersect_subsurface.h
+ integrator/integrator_intersect_volume_stack.h
+ integrator/integrator_megakernel.h
+ integrator/integrator_shade_background.h
+ integrator/integrator_shade_light.h
+ integrator/integrator_shade_shadow.h
+ integrator/integrator_shade_surface.h
+ integrator/integrator_shade_volume.h
+ integrator/integrator_state.h
+ integrator/integrator_state_flow.h
+ integrator/integrator_state_template.h
+ integrator/integrator_state_util.h
+ integrator/integrator_subsurface.h
+ integrator/integrator_volume_stack.h
)
set(SRC_UTIL_HEADERS
@@ -333,36 +287,6 @@ set(SRC_UTIL_HEADERS
../util/util_types_vector3_impl.h
)
-set(SRC_SPLIT_HEADERS
- split/kernel_adaptive_adjust_samples.h
- split/kernel_adaptive_filter_x.h
- split/kernel_adaptive_filter_y.h
- split/kernel_adaptive_stopping.h
- split/kernel_branched.h
- split/kernel_buffer_update.h
- split/kernel_data_init.h
- split/kernel_direct_lighting.h
- split/kernel_do_volume.h
- split/kernel_enqueue_inactive.h
- split/kernel_holdout_emission_blurring_pathtermination_ao.h
- split/kernel_indirect_background.h
- split/kernel_indirect_subsurface.h
- split/kernel_lamp_emission.h
- split/kernel_next_iteration_setup.h
- split/kernel_path_init.h
- split/kernel_queue_enqueue.h
- split/kernel_scene_intersect.h
- split/kernel_shader_setup.h
- split/kernel_shader_sort.h
- split/kernel_shader_eval.h
- split/kernel_shadow_blocked_ao.h
- split/kernel_shadow_blocked_dl.h
- split/kernel_split_common.h
- split/kernel_split_data.h
- split/kernel_split_data_types.h
- split/kernel_subsurface_scatter.h
-)
-
set(LIB
)
@@ -393,21 +317,17 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
# build for each arch
- set(cuda_sources kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu
+ set(cuda_sources device/cuda/kernel.cu
${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
)
- set(cuda_filter_sources kernels/cuda/filter.cu
- ${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_FILTER_HEADERS}
- ${SRC_UTIL_HEADERS}
- )
set(cuda_cubins)
macro(CYCLES_CUDA_KERNEL_ADD arch prev_arch name flags sources experimental)
@@ -427,7 +347,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
endif()
- set(cuda_kernel_src "/kernels/cuda/${name}.cu")
+ set(cuda_kernel_src "/device/cuda/${name}.cu")
set(cuda_flags ${flags}
-D CCL_NAMESPACE_BEGIN=
@@ -435,7 +355,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
-D NVCC
-m ${CUDA_BITS}
-I ${CMAKE_CURRENT_SOURCE_DIR}/..
- -I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda
+ -I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda
--use_fast_math
-o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file})
@@ -523,14 +443,8 @@ if(WITH_CYCLES_CUDA_BINARIES)
endif()
if(DEFINED cuda_nvcc_executable AND DEFINED cuda_toolkit_root_dir)
# Compile regular kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} filter "" "${cuda_filter_sources}" FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${cuda_sources}" FALSE)
- if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
- # Compile split kernel
- CYCLES_CUDA_KERNEL_ADD(${arch} ${prev_arch} kernel_split "-D __SPLIT__" "${cuda_sources}" FALSE)
- endif()
-
if(WITH_CYCLES_CUDA_BUILD_SERIAL)
set(prev_arch ${arch})
endif()
@@ -547,15 +461,15 @@ endif()
# OptiX PTX modules
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
- macro(CYCLES_OPTIX_KERNEL_ADD name flags)
- set(input "kernels/optix/kernel_optix.cu")
+ macro(CYCLES_OPTIX_KERNEL_ADD name input flags)
set(output "${CMAKE_CURRENT_BINARY_DIR}/${name}.ptx")
set(cuda_flags ${flags}
-I "${OPTIX_INCLUDE_DIR}"
-I "${CMAKE_CURRENT_SOURCE_DIR}/.."
- -I "${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda"
+ -I "${CMAKE_CURRENT_SOURCE_DIR}/device/cuda"
--use_fast_math
+ -Wno-deprecated-gpu-targets
-o ${output})
if(WITH_NANOVDB)
@@ -580,11 +494,13 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
DEPENDS
${input}
${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_KERNELS_OPTIX_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
+ ${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
COMMAND ${CUBIN_CC_ENV}
@@ -603,11 +519,13 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
DEPENDS
${input}
${SRC_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_KERNELS_OPTIX_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
+ ${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
COMMAND
@@ -624,8 +542,14 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${output}" ${CYCLES_INSTALL_PATH}/lib)
endmacro()
- CYCLES_OPTIX_KERNEL_ADD(kernel_optix "-D __NO_SHADER_RAYTRACE__")
- CYCLES_OPTIX_KERNEL_ADD(kernel_optix_shader_raytrace "--keep-device-functions")
+ CYCLES_OPTIX_KERNEL_ADD(
+ kernel_optix
+ "device/optix/kernel.cu"
+ "")
+ CYCLES_OPTIX_KERNEL_ADD(
+ kernel_optix_shader_raytrace
+ "device/optix/kernel_shader_raytrace.cu"
+ "--keep-device-functions")
add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
cycles_set_solution_folder(cycles_kernel_optix)
@@ -659,62 +583,47 @@ if(WITH_COMPILER_ASAN)
endif()
endif()
-set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
-set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
-set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
+set_source_files_properties(device/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
if(CXX_HAS_SSE)
- set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
- set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
- set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
- set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+ set_source_files_properties(device/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
cycles_add_library(cycles_kernel "${LIB}"
- ${SRC_CPU_KERNELS}
- ${SRC_CUDA_KERNELS}
- ${SRC_OPTIX_KERNELS}
- ${SRC_OPENCL_KERNELS}
+ ${SRC_DEVICE_CPU}
+ ${SRC_DEVICE_CUDA}
+ ${SRC_DEVICE_OPTIX}
${SRC_HEADERS}
- ${SRC_KERNELS_CPU_HEADERS}
- ${SRC_KERNELS_CUDA_HEADERS}
- ${SRC_KERNELS_OPTIX_HEADERS}
- ${SRC_KERNELS_OPENCL_HEADERS}
+ ${SRC_DEVICE_CPU_HEADERS}
+ ${SRC_DEVICE_GPU_HEADERS}
+ ${SRC_DEVICE_CUDA_HEADERS}
+ ${SRC_DEVICE_OPTIX_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
- ${SRC_FILTER_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
- ${SRC_SPLIT_HEADERS}
+ ${SRC_INTEGRATOR_HEADERS}
)
source_group("bvh" FILES ${SRC_BVH_HEADERS})
source_group("closure" FILES ${SRC_CLOSURE_HEADERS})
-source_group("filter" FILES ${SRC_FILTER_HEADERS})
source_group("geom" FILES ${SRC_GEOM_HEADERS})
+source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS})
source_group("kernel" FILES ${SRC_HEADERS})
-source_group("kernel\\split" FILES ${SRC_SPLIT_HEADERS})
-source_group("kernels\\cpu" FILES ${SRC_CPU_KERNELS} ${SRC_KERNELS_CPU_HEADERS})
-source_group("kernels\\cuda" FILES ${SRC_CUDA_KERNELS} ${SRC_KERNELS_CUDA_HEADERS})
-source_group("kernels\\opencl" FILES ${SRC_OPENCL_KERNELS} ${SRC_KERNELS_OPENCL_HEADERS})
-source_group("kernels\\optix" FILES ${SRC_OPTIX_KERNELS} ${SRC_KERNELS_OPTIX_HEADERS})
+source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS})
+source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS})
+source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS})
+source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS})
source_group("svm" FILES ${SRC_SVM_HEADERS})
if(WITH_CYCLES_CUDA)
@@ -724,31 +633,20 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
add_dependencies(cycles_kernel cycles_kernel_optix)
endif()
-# OpenCL kernel
-
-# set(KERNEL_PREPROCESSED ${CMAKE_CURRENT_BINARY_DIR}/kernel_preprocessed.cl)
-# add_custom_command(
-# OUTPUT ${KERNEL_PREPROCESSED}
-# COMMAND gcc -x c++ -E ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cl -I ${CMAKE_CURRENT_SOURCE_DIR}/../util/ -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -o ${KERNEL_PREPROCESSED}
-# DEPENDS ${SRC_KERNEL} ${SRC_UTIL_HEADERS})
-# add_custom_target(cycles_kernel_preprocess ALL DEPENDS ${KERNEL_PREPROCESSED})
-# delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
+# Install kernel source for runtime compilation
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPENCL_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CUDA_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_OPTIX_KERNELS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPENCL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SPLIT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/split)
-
if(WITH_NANOVDB)
set(SRC_NANOVDB_HEADERS
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index acf29cf1baf..539e9fd05fb 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -25,6 +25,8 @@
* the code has been extended and modified to support more primitives and work
* with CPU/CUDA/OpenCL. */
+#pragma once
+
#ifdef __EMBREE__
# include "kernel/bvh/bvh_embree.h"
#endif
@@ -152,13 +154,11 @@ ccl_device_inline bool scene_intersect_valid(const Ray *ray)
return isfinite_safe(ray->P.x) && isfinite_safe(ray->D.x) && len_squared(ray->D) != 0.0f;
}
-ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect(const KernelGlobals *kg,
const Ray *ray,
const uint visibility,
Intersection *isect)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT);
-
#ifdef __KERNEL_OPTIX__
uint p0 = 0;
uint p1 = 0;
@@ -238,15 +238,13 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
}
#ifdef __BVH_LOCAL__
-ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg,
const Ray *ray,
LocalIntersection *local_isect,
int local_object,
uint *lcg_state,
int max_hits)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL);
-
# ifdef __KERNEL_OPTIX__
uint p0 = ((uint64_t)lcg_state) & 0xFFFFFFFF;
uint p1 = (((uint64_t)lcg_state) >> 32) & 0xFFFFFFFF;
@@ -313,8 +311,8 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
float3 dir = ray->D;
float3 idir = ray->D;
Transform ob_itfm;
- rtc_ray.tfar = bvh_instance_motion_push(
- kg, local_object, ray, &P, &dir, &idir, ray->t, &ob_itfm);
+ rtc_ray.tfar = ray->t *
+ bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
/* bvh_instance_motion_push() returns the inverse transform but
* it's not needed here. */
(void)ob_itfm;
@@ -353,15 +351,13 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
#endif
#ifdef __SHADOW_RECORD_ALL__
-ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect_shadow_all(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
uint visibility,
uint max_hits,
uint *num_hits)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL);
-
# ifdef __KERNEL_OPTIX__
uint p0 = ((uint64_t)isect) & 0xFFFFFFFF;
uint p1 = (((uint64_t)isect) >> 32) & 0xFFFFFFFF;
@@ -401,17 +397,13 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
ctx.isect_s = isect;
ctx.max_hits = max_hits;
- ctx.num_hits = 0;
IntersectContext rtc_ctx(&ctx);
RTCRay rtc_ray;
kernel_embree_setup_ray(*ray, rtc_ray, visibility);
rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
- if (ctx.num_hits > max_hits) {
- return true;
- }
*num_hits = ctx.num_hits;
- return rtc_ray.tfar == -INFINITY;
+ return ctx.opaque_hit;
}
# endif /* __EMBREE__ */
@@ -439,13 +431,11 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
#endif /* __SHADOW_RECORD_ALL__ */
#ifdef __VOLUME__
-ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
+ccl_device_intersect bool scene_intersect_volume(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME);
-
# ifdef __KERNEL_OPTIX__
uint p0 = 0;
uint p1 = 0;
@@ -498,14 +488,12 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
#endif /* __VOLUME__ */
#ifdef __VOLUME_RECORD_ALL__
-ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
+ccl_device_intersect uint scene_intersect_volume_all(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint max_hits,
const uint visibility)
{
- PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_ALL);
-
if (!scene_intersect_valid(ray)) {
return false;
}
diff --git a/intern/cycles/kernel/bvh/bvh_embree.h b/intern/cycles/kernel/bvh/bvh_embree.h
index 4605c3ea51d..092d770dcac 100644
--- a/intern/cycles/kernel/bvh/bvh_embree.h
+++ b/intern/cycles/kernel/bvh/bvh_embree.h
@@ -14,14 +14,13 @@
* limitations under the License.
*/
+#pragma once
+
#include <embree3/rtcore_ray.h>
#include <embree3/rtcore_scene.h>
-// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
-// clang-format on
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
#include "util/util_vector.h"
@@ -36,25 +35,29 @@ struct CCLIntersectContext {
RAY_VOLUME_ALL = 4,
} RayType;
- KernelGlobals *kg;
+ const KernelGlobals *kg;
RayType type;
/* for shadow rays */
Intersection *isect_s;
int max_hits;
int num_hits;
+ float max_t;
+ bool opaque_hit;
/* for SSS Rays: */
LocalIntersection *local_isect;
int local_object_id;
uint *lcg_state;
- CCLIntersectContext(KernelGlobals *kg_, RayType type_)
+ CCLIntersectContext(const KernelGlobals *kg_, RayType type_)
{
kg = kg_;
type = type_;
max_hits = 1;
num_hits = 0;
+ max_t = FLT_MAX;
+ opaque_hit = false;
isect_s = NULL;
local_isect = NULL;
local_object_id = -1;
@@ -98,7 +101,7 @@ ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray,
rayhit.hit.primID = RTC_INVALID_GEOMETRY_ID;
}
-ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg,
+ccl_device_inline void kernel_embree_convert_hit(const KernelGlobals *kg,
const RTCRay *ray,
const RTCHit *hit,
Intersection *isect)
@@ -123,7 +126,7 @@ ccl_device_inline void kernel_embree_convert_hit(KernelGlobals *kg,
isect->type = kernel_tex_fetch(__prim_type, isect->prim);
}
-ccl_device_inline void kernel_embree_convert_sss_hit(KernelGlobals *kg,
+ccl_device_inline void kernel_embree_convert_sss_hit(const KernelGlobals *kg,
const RTCRay *ray,
const RTCHit *hit,
Intersection *isect,
diff --git a/intern/cycles/kernel/bvh/bvh_local.h b/intern/cycles/kernel/bvh/bvh_local.h
index 4006c9c1632..90b9f410b29 100644
--- a/intern/cycles/kernel/bvh/bvh_local.h
+++ b/intern/cycles/kernel/bvh/bvh_local.h
@@ -36,7 +36,7 @@ ccl_device
#else
ccl_device_inline
#endif
- bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
LocalIntersection *local_isect,
int local_object,
@@ -74,9 +74,9 @@ ccl_device_inline
if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
#if BVH_FEATURE(BVH_MOTION)
Transform ob_itfm;
- isect_t = bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+ isect_t *= bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);
+ isect_t *= bvh_instance_push(kg, local_object, ray, &P, &dir, &idir);
#endif
object = local_object;
}
@@ -196,7 +196,7 @@ ccl_device_inline
return false;
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
LocalIntersection *local_isect,
int local_object,
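
The recurring change in this and the following headers is the bvh_instance_push()/bvh_instance_motion_push() calling convention: the helpers used to take the current ray distance and return it already rescaled into object space, and now return only the world-to-instance length scale, leaving the multiplication to the caller. A before/after sketch under that reading of the hunks:

    /* Old convention: the push helper rescales the distance internally. */
    isect_t = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir, isect_t);

    /* New convention: the helper returns a pure scale factor, which the
     * caller applies to whichever distance it is tracking. */
    const float t_fac = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir);
    isect_t *= t_fac;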
diff --git a/intern/cycles/kernel/bvh/bvh_nodes.h b/intern/cycles/kernel/bvh/bvh_nodes.h
index 5367bdb633c..15cd0f22213 100644
--- a/intern/cycles/kernel/bvh/bvh_nodes.h
+++ b/intern/cycles/kernel/bvh/bvh_nodes.h
@@ -16,7 +16,7 @@
// TODO(sergey): Look into avoid use of full Transform and use 3x3 matrix and
// 3-vector which might be faster.
-ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *kg,
+ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(const KernelGlobals *kg,
int node_addr,
int child)
{
@@ -28,7 +28,7 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals *k
return space;
}
-ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_aligned_node_intersect(const KernelGlobals *kg,
const float3 P,
const float3 idir,
const float t,
@@ -76,7 +76,7 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals *kg,
#endif
}
-ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg,
+ccl_device_forceinline bool bvh_unaligned_node_intersect_child(const KernelGlobals *kg,
const float3 P,
const float3 dir,
const float t,
@@ -102,7 +102,7 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals *kg
return tnear <= tfar;
}
-ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_unaligned_node_intersect(const KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
@@ -134,7 +134,7 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals *kg,
return mask;
}
-ccl_device_forceinline int bvh_node_intersect(KernelGlobals *kg,
+ccl_device_forceinline int bvh_node_intersect(const KernelGlobals *kg,
const float3 P,
const float3 dir,
const float3 idir,
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index 2e94b1d7c37..0ae36fccf9b 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -36,7 +36,7 @@ ccl_device
#else
ccl_device_inline
#endif
- bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint visibility,
@@ -68,10 +68,10 @@ ccl_device_inline
Transform ob_itfm;
#endif
- int num_hits_in_instance = 0;
+ float t_world_to_instance = 1.0f;
*num_hits = 0;
- isect_array->t = tmax;
+ Intersection *isect = isect_array;
/* traversal loop */
do {
@@ -147,13 +147,14 @@ ccl_device_inline
switch (p_type) {
case PRIMITIVE_TRIANGLE: {
- hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
+ hit = triangle_intersect(
+ kg, isect, P, dir, isect_t, visibility, object, prim_addr);
break;
}
#if BVH_FEATURE(BVH_MOTION)
case PRIMITIVE_MOTION_TRIANGLE: {
hit = motion_triangle_intersect(
- kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
+ kg, isect, P, dir, isect_t, ray->time, visibility, object, prim_addr);
break;
}
#endif
@@ -163,8 +164,16 @@ ccl_device_inline
case PRIMITIVE_CURVE_RIBBON:
case PRIMITIVE_MOTION_CURVE_RIBBON: {
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
- hit = curve_intersect(
- kg, isect_array, P, dir, visibility, object, prim_addr, ray->time, curve_type);
+ hit = curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ isect_t,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type);
break;
}
#endif
@@ -176,27 +185,49 @@ ccl_device_inline
/* shadow ray early termination */
if (hit) {
+ /* Convert intersection distance to world space. */
+ isect->t /= t_world_to_instance;
+
/* detect if this surface has a shader with transparent shadows */
/* todo: optimize so primitive visibility flag indicates if
* the primitive has a transparent shadow shader? */
- const int flags = intersection_get_shader_flags(kg, isect_array);
+ const int flags = intersection_get_shader_flags(kg, isect);
- /* if no transparent shadows, all light is blocked */
- if (!(flags & SD_HAS_TRANSPARENT_SHADOW)) {
- return true;
- }
- /* if maximum number of hits reached, block all light */
- else if (*num_hits == max_hits) {
+ if (!(flags & SD_HAS_TRANSPARENT_SHADOW) || max_hits == 0) {
+ /* If no transparent shadows, all light is blocked and we can
+ * stop immediately. */
return true;
}
- /* move on to next entry in intersections array */
- isect_array++;
+ /* Increase the number of hits, possibly beyond max_hits; extra hits
+ * are simply not recorded and only the max_hits closest are kept. */
(*num_hits)++;
- num_hits_in_instance++;
- isect_array->t = isect_t;
+ if (*num_hits >= max_hits) {
+ /* If the maximum number of hits is reached, find the recorded
+ * intersection with the largest distance so it can be replaced
+ * when a closer hit is found. */
+ const int num_recorded_hits = min(max_hits, *num_hits);
+ float max_recorded_t = isect_array[0].t;
+ int max_recorded_hit = 0;
+
+ for (int i = 1; i < num_recorded_hits; i++) {
+ if (isect_array[i].t > max_recorded_t) {
+ max_recorded_t = isect_array[i].t;
+ max_recorded_hit = i;
+ }
+ }
+
+ isect = isect_array + max_recorded_hit;
+
+ /* Limit the ray distance and stop counting hits beyond this. */
+ isect_t = max_recorded_t * t_world_to_instance;
+ }
+ else {
+ /* Still have space for intersections; record into the next slot. */
+ isect = isect + 1;
+ }
}
prim_addr++;
@@ -207,13 +238,14 @@ ccl_device_inline
object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
#if BVH_FEATURE(BVH_MOTION)
- isect_t = bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+ t_world_to_instance = bvh_instance_motion_push(
+ kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+ t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
- num_hits_in_instance = 0;
- isect_array->t = isect_t;
+ /* Convert the maximum ray distance to object space. */
+ isect_t *= t_world_to_instance;
++stack_ptr;
kernel_assert(stack_ptr < BVH_STACK_SIZE);
@@ -228,32 +260,19 @@ ccl_device_inline
kernel_assert(object != OBJECT_NONE);
/* Instance pop. */
- if (num_hits_in_instance) {
- float t_fac;
-
#if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
+ bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
#else
- bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
+ bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
#endif
- /* scale isect->t to adjust for instancing */
- for (int i = 0; i < num_hits_in_instance; i++) {
- (isect_array - i - 1)->t *= t_fac;
- }
- }
- else {
-#if BVH_FEATURE(BVH_MOTION)
- bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#else
- bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#endif
- }
-
- isect_t = tmax;
- isect_array->t = isect_t;
+ /* Restore world-space ray length. If the maximum number of hits was
+ * exceeded, this distance is reduced to record only the closest hits;
+ * otherwise use the original ray length. */
+ isect_t = (max_hits && *num_hits > max_hits) ? isect->t : tmax;
object = OBJECT_NONE;
+ t_world_to_instance = 1.0f;
node_addr = traversal_stack[stack_ptr];
--stack_ptr;
}
@@ -262,7 +281,7 @@ ccl_device_inline
return false;
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint visibility,
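
The rewritten loop above drops the old early termination at max_hits: hits keep being counted past the array size, and once the array is full the farthest recorded hit is located so the next accepted hit can overwrite it, with the ray range clamped to that distance. The same in-loop logic, restated as a standalone helper for clarity (the helper name is illustrative, not an identifier from the patch):

    /* Sketch: given a full array of recorded hits, find the farthest one.
     * It becomes the slot the next closer hit overwrites, and its distance
     * caps the ray range so farther hits can be skipped. */
    ccl_device_inline int find_farthest_hit(const Intersection *hits,
                                            const uint num_recorded_hits,
                                            float *max_recorded_t)
    {
      int farthest = 0;
      *max_recorded_t = hits[0].t;
      for (uint i = 1; i < num_recorded_hits; i++) {
        if (hits[i].t > *max_recorded_t) {
          *max_recorded_t = hits[i].t;
          farthest = i;
        }
      }
      return farthest;
    }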
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index 89250a8d60a..a26d8c514f3 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -31,7 +31,7 @@
* BVH_MOTION: motion blur rendering
*/
-ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
@@ -136,7 +136,8 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
case PRIMITIVE_TRIANGLE: {
for (; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
- if (triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr)) {
+ if (triangle_intersect(
+ kg, isect, P, dir, isect->t, visibility, object, prim_addr)) {
/* shadow ray early termination */
if (visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
@@ -149,7 +150,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
for (; prim_addr < prim_addr2; prim_addr++) {
kernel_assert(kernel_tex_fetch(__prim_type, prim_addr) == type);
if (motion_triangle_intersect(
- kg, isect, P, dir, ray->time, visibility, object, prim_addr)) {
+ kg, isect, P, dir, isect->t, ray->time, visibility, object, prim_addr)) {
/* shadow ray early termination */
if (visibility & PATH_RAY_SHADOW_OPAQUE)
return true;
@@ -166,8 +167,16 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
for (; prim_addr < prim_addr2; prim_addr++) {
const uint curve_type = kernel_tex_fetch(__prim_type, prim_addr);
kernel_assert((curve_type & PRIMITIVE_ALL) == (type & PRIMITIVE_ALL));
- const bool hit = curve_intersect(
- kg, isect, P, dir, visibility, object, prim_addr, ray->time, curve_type);
+ const bool hit = curve_intersect(kg,
+ isect,
+ P,
+ dir,
+ isect->t,
+ visibility,
+ object,
+ prim_addr,
+ ray->time,
+ curve_type);
if (hit) {
/* shadow ray early termination */
if (visibility & PATH_RAY_SHADOW_OPAQUE)
@@ -184,10 +193,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
object = kernel_tex_fetch(__prim_object, -prim_addr - 1);
#if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+ isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+ isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
++stack_ptr;
@@ -218,7 +226,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
return (isect->prim != PRIM_NONE);
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
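
All intersect call sites in this header now pass the current ray range (isect->t) explicitly instead of having the primitive tests read it out of the Intersection record. The declarations themselves are outside this diff; a hypothetical signature sketch of the shape implied by the call sites:

    /* Hypothetical: the actual declaration lives in the geometry headers,
     * which this patch does not show here. */
    ccl_device_intersect bool triangle_intersect(const KernelGlobals *kg,
                                                 Intersection *isect,
                                                 float3 P,
                                                 float3 dir,
                                                 float tmax, /* new explicit range */
                                                 uint visibility,
                                                 int object,
                                                 int prim_addr);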
diff --git a/intern/cycles/kernel/bvh/bvh_types.h b/intern/cycles/kernel/bvh/bvh_types.h
index 98e6ec25d15..6039e707fc3 100644
--- a/intern/cycles/kernel/bvh/bvh_types.h
+++ b/intern/cycles/kernel/bvh/bvh_types.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __BVH_TYPES__
-#define __BVH_TYPES__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -43,5 +42,3 @@ CCL_NAMESPACE_BEGIN
#define BVH_FEATURE(f) (((BVH_FUNCTION_FEATURES) & (f)) != 0)
CCL_NAMESPACE_END
-
-#endif /* __BVH_TYPES__ */
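
Several headers in this patch make the same mechanical swap from include guards to #pragma once; the pattern, shown once for reference:

    /* Before: classic include guard, two opening lines plus a trailing #endif. */
    #ifndef __BVH_TYPES__
    #define __BVH_TYPES__
    /* ... header body ... */
    #endif /* __BVH_TYPES__ */

    /* After: a single compiler-handled directive, no closing line. */
    #pragma once
    /* ... header body ... */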
diff --git a/intern/cycles/kernel/bvh/bvh_util.h b/intern/cycles/kernel/bvh/bvh_util.h
index b1faebce957..21384457b16 100644
--- a/intern/cycles/kernel/bvh/bvh_util.h
+++ b/intern/cycles/kernel/bvh/bvh_util.h
@@ -71,86 +71,6 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
#endif
}
-/* This function should be used to compute a modified ray start position for
- * rays leaving from a surface. The algorithm slightly distorts flat surface
- * of a triangle. Surface is lifted by amount h along normal n in the incident
- * point. */
-
-ccl_device_inline float3 smooth_surface_offset(KernelGlobals *kg, ShaderData *sd, float3 Ng)
-{
- float3 V[3], N[3];
- triangle_vertices_and_normals(kg, sd->prim, V, N);
-
- const float u = sd->u, v = sd->v;
- const float w = 1 - u - v;
- float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */
- float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */
-
- object_normal_transform(kg, sd, &n); /* Normal x scale, world space */
-
- /* Parabolic approximation */
- float a = dot(N[2] - N[0], V[0] - V[2]);
- float b = dot(N[2] - N[1], V[1] - V[2]);
- float c = dot(N[1] - N[0], V[1] - V[0]);
- float h = a * u * (u - 1) + (a + b + c) * u * v + b * v * (v - 1);
-
- /* Check flipped normals */
- if (dot(n, Ng) > 0) {
- /* Local linear envelope */
- float h0 = max(max(dot(V[1] - V[0], N[0]), dot(V[2] - V[0], N[0])), 0.0f);
- float h1 = max(max(dot(V[0] - V[1], N[1]), dot(V[2] - V[1], N[1])), 0.0f);
- float h2 = max(max(dot(V[0] - V[2], N[2]), dot(V[1] - V[2], N[2])), 0.0f);
- h0 = max(dot(V[0] - P, N[0]) + h0, 0.0f);
- h1 = max(dot(V[1] - P, N[1]) + h1, 0.0f);
- h2 = max(dot(V[2] - P, N[2]) + h2, 0.0f);
- h = max(min(min(h0, h1), h2), h * 0.5f);
- }
- else {
- float h0 = max(max(dot(V[0] - V[1], N[0]), dot(V[0] - V[2], N[0])), 0.0f);
- float h1 = max(max(dot(V[1] - V[0], N[1]), dot(V[1] - V[2], N[1])), 0.0f);
- float h2 = max(max(dot(V[2] - V[0], N[2]), dot(V[2] - V[1], N[2])), 0.0f);
- h0 = max(dot(P - V[0], N[0]) + h0, 0.0f);
- h1 = max(dot(P - V[1], N[1]) + h1, 0.0f);
- h2 = max(dot(P - V[2], N[2]) + h2, 0.0f);
- h = min(-min(min(h0, h1), h2), h * 0.5f);
- }
-
- return n * h;
-}
-
-/* Ray offset to avoid shadow terminator artifact. */
-
-ccl_device_inline float3 ray_offset_shadow(KernelGlobals *kg, ShaderData *sd, float3 L)
-{
- float NL = dot(sd->N, L);
- bool transmit = (NL < 0.0f);
- float3 Ng = (transmit ? -sd->Ng : sd->Ng);
- float3 P = ray_offset(sd->P, Ng);
-
- if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
- const float offset_cutoff =
- kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset;
- /* Do ray offset (heavy stuff) only for close to be terminated triangles:
- * offset_cutoff = 0.1f means that 10-20% of rays will be affected. Also
- * make a smooth transition near the threshold. */
- if (offset_cutoff > 0.0f) {
- float NgL = dot(Ng, L);
- float offset_amount = 0.0f;
- if (NL < offset_cutoff) {
- offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f);
- }
- else {
- offset_amount = clamp(1.0f - NgL / offset_cutoff, 0.0f, 1.0f);
- }
- if (offset_amount > 0.0f) {
- P += smooth_surface_offset(kg, sd, Ng) * offset_amount;
- }
- }
- }
-
- return P;
-}
-
#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__))
/* ToDo: Move to another file? */
ccl_device int intersections_compare(const void *a, const void *b)
@@ -193,10 +113,10 @@ ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
}
#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */
-/* Utility to quickly get a shader flags from an intersection. */
+/* Utility to quickly get flags from an intersection. */
-ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals *ccl_restrict kg,
- const Intersection *isect)
+ccl_device_forceinline int intersection_get_shader_flags(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect)
{
const int prim = kernel_tex_fetch(__prim_index, isect->prim);
int shader = 0;
@@ -217,14 +137,14 @@ ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals *ccl_rest
return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
}
-ccl_device_forceinline int intersection_get_shader(KernelGlobals *ccl_restrict kg,
- const Intersection *isect)
+ccl_device_forceinline int intersection_get_shader_from_isect_prim(
+ const KernelGlobals *ccl_restrict kg, const int isect_prim)
{
- const int prim = kernel_tex_fetch(__prim_index, isect->prim);
+ const int prim = kernel_tex_fetch(__prim_index, isect_prim);
int shader = 0;
#ifdef __HAIR__
- if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE)
+ if (kernel_tex_fetch(__prim_type, isect_prim) & PRIMITIVE_ALL_TRIANGLE)
#endif
{
shader = kernel_tex_fetch(__tri_shader, prim);
@@ -239,7 +159,13 @@ ccl_device_forceinline int intersection_get_shader(KernelGlobals *ccl_restrict k
return shader & SHADER_MASK;
}
-ccl_device_forceinline int intersection_get_object(KernelGlobals *ccl_restrict kg,
+ccl_device_forceinline int intersection_get_shader(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect)
+{
+ return intersection_get_shader_from_isect_prim(kg, isect->prim);
+}
+
+ccl_device_forceinline int intersection_get_object(const KernelGlobals *ccl_restrict kg,
const Intersection *ccl_restrict isect)
{
if (isect->object != OBJECT_NONE) {
@@ -249,4 +175,12 @@ ccl_device_forceinline int intersection_get_object(KernelGlobals *ccl_restrict k
return kernel_tex_fetch(__prim_object, isect->prim);
}
+ccl_device_forceinline int intersection_get_object_flags(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect)
+{
+ const int object = intersection_get_object(kg, isect);
+
+ return kernel_tex_fetch(__object_flag, object);
+}
+
CCL_NAMESPACE_END
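
The two helpers added above split shader lookup away from the Intersection struct and add a one-step object-flag fetch. A usage sketch based only on the definitions in this hunk:

    /* Sketch: query object flags and shader for a recorded hit. */
    const int object_flags = intersection_get_object_flags(kg, isect);
    if (object_flags & SD_OBJECT_HAS_VOLUME) {
      const int shader = intersection_get_shader_from_isect_prim(kg, isect->prim);
      /* ... dispatch on the shader ... */
    }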
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index 1f2ea47269b..0411d9c522d 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -35,7 +35,7 @@ ccl_device
#else
ccl_device_inline
#endif
- bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ bool BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
@@ -147,7 +147,7 @@ ccl_device_inline
if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
continue;
}
- triangle_intersect(kg, isect, P, dir, visibility, object, prim_addr);
+ triangle_intersect(kg, isect, P, dir, isect->t, visibility, object, prim_addr);
}
break;
}
@@ -165,7 +165,7 @@ ccl_device_inline
continue;
}
motion_triangle_intersect(
- kg, isect, P, dir, ray->time, visibility, object, prim_addr);
+ kg, isect, P, dir, isect->t, ray->time, visibility, object, prim_addr);
}
break;
}
@@ -181,10 +181,9 @@ ccl_device_inline
int object_flag = kernel_tex_fetch(__object_flag, object);
if (object_flag & SD_OBJECT_HAS_VOLUME) {
#if BVH_FEATURE(BVH_MOTION)
- isect->t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
+ isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect->t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect->t);
+ isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
++stack_ptr;
@@ -222,7 +221,7 @@ ccl_device_inline
return (isect->prim != PRIM_NONE);
}
-ccl_device_inline bool BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline bool BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect,
const uint visibility)
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index a8664cc4331..4874270f15d 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -35,7 +35,7 @@ ccl_device
#else
ccl_device_inline
#endif
- uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+ uint BVH_FUNCTION_FULL_NAME(BVH)(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint max_hits,
@@ -150,7 +150,8 @@ ccl_device_inline
if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
continue;
}
- hit = triangle_intersect(kg, isect_array, P, dir, visibility, object, prim_addr);
+ hit = triangle_intersect(
+ kg, isect_array, P, dir, isect_t, visibility, object, prim_addr);
if (hit) {
/* Move on to next entry in intersections array. */
isect_array++;
@@ -190,7 +191,7 @@ ccl_device_inline
continue;
}
hit = motion_triangle_intersect(
- kg, isect_array, P, dir, ray->time, visibility, object, prim_addr);
+ kg, isect_array, P, dir, isect_t, ray->time, visibility, object, prim_addr);
if (hit) {
/* Move on to next entry in intersections array. */
isect_array++;
@@ -228,10 +229,9 @@ ccl_device_inline
int object_flag = kernel_tex_fetch(__object_flag, object);
if (object_flag & SD_OBJECT_HAS_VOLUME) {
#if BVH_FEATURE(BVH_MOTION)
- isect_t = bvh_instance_motion_push(
- kg, object, ray, &P, &dir, &idir, isect_t, &ob_itfm);
+ isect_t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
#else
- isect_t = bvh_instance_push(kg, object, ray, &P, &dir, &idir, isect_t);
+ isect_t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
#endif
num_hits_in_instance = 0;
@@ -289,7 +289,7 @@ ccl_device_inline
return num_hits;
}
-ccl_device_inline uint BVH_FUNCTION_NAME(KernelGlobals *kg,
+ccl_device_inline uint BVH_FUNCTION_NAME(const KernelGlobals *kg,
const Ray *ray,
Intersection *isect_array,
const uint max_hits,
diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
index 99a5a675976..72a8c2ba090 100644
--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
ccl_device ShaderClosure *closure_alloc(ShaderData *sd, int size, ClosureType type, float3 weight)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index 6f2f2ebb202..4eb8bcae997 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
// clang-format off
#include "kernel/closure/bsdf_ashikhmin_velvet.h"
#include "kernel/closure/bsdf_diffuse.h"
@@ -109,7 +111,7 @@ ccl_device_inline float shift_cos_in(float cos_in, const float frequency_multipl
return val;
}
-ccl_device_inline int bsdf_sample(KernelGlobals *kg,
+ccl_device_inline int bsdf_sample(const KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
float randu,
@@ -429,21 +431,6 @@ ccl_device_inline int bsdf_sample(KernelGlobals *kg,
break;
# endif /* __PRINCIPLED__ */
#endif
-#ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- label = volume_henyey_greenstein_sample(sc,
- sd->I,
- sd->dI.dx,
- sd->dI.dy,
- randu,
- randv,
- eval,
- omega_in,
- &domega_in->dx,
- &domega_in->dy,
- pdf);
- break;
-#endif
default:
label = LABEL_NONE;
break;
@@ -482,15 +469,16 @@ ccl_device
ccl_device_inline
#endif
float3
- bsdf_eval(KernelGlobals *kg,
+ bsdf_eval(const KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
const float3 omega_in,
+ const bool is_transmission,
float *pdf)
{
- float3 eval;
+ float3 eval = zero_float3();
- if (dot(sd->N, omega_in) >= 0.0f) {
+ if (!is_transmission) {
switch (sc->type) {
case CLOSURE_BSDF_DIFFUSE_ID:
case CLOSURE_BSDF_BSSRDF_ID:
@@ -570,13 +558,7 @@ ccl_device_inline
break;
# endif /* __PRINCIPLED__ */
#endif
-#ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
- break;
-#endif
default:
- eval = make_float3(0.0f, 0.0f, 0.0f);
break;
}
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
@@ -663,13 +645,7 @@ ccl_device_inline
break;
# endif /* __PRINCIPLED__ */
#endif
-#ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
- break;
-#endif
default:
- eval = make_float3(0.0f, 0.0f, 0.0f);
break;
}
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
@@ -682,7 +658,7 @@ ccl_device_inline
return eval;
}
-ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
+ccl_device void bsdf_blur(const KernelGlobals *kg, ShaderClosure *sc, float roughness)
{
/* ToDo: do we want to blur volume closures? */
#ifdef __SVM__
@@ -715,55 +691,4 @@ ccl_device void bsdf_blur(KernelGlobals *kg, ShaderClosure *sc, float roughness)
#endif
}
-ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
-{
-#ifdef __SVM__
- switch (a->type) {
- case CLOSURE_BSDF_TRANSPARENT_ID:
- return true;
- case CLOSURE_BSDF_DIFFUSE_ID:
- case CLOSURE_BSDF_BSSRDF_ID:
- case CLOSURE_BSDF_TRANSLUCENT_ID:
- return bsdf_diffuse_merge(a, b);
- case CLOSURE_BSDF_OREN_NAYAR_ID:
- return bsdf_oren_nayar_merge(a, b);
- case CLOSURE_BSDF_REFLECTION_ID:
- case CLOSURE_BSDF_REFRACTION_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
- case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
- case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
- case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
- case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
- return bsdf_microfacet_merge(a, b);
- case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
- return bsdf_ashikhmin_velvet_merge(a, b);
- case CLOSURE_BSDF_DIFFUSE_TOON_ID:
- case CLOSURE_BSDF_GLOSSY_TOON_ID:
- return bsdf_toon_merge(a, b);
- case CLOSURE_BSDF_HAIR_REFLECTION_ID:
- case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
- return bsdf_hair_merge(a, b);
-# ifdef __PRINCIPLED__
- case CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID:
- case CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID:
- return bsdf_principled_diffuse_merge(a, b);
-# endif
-# ifdef __VOLUME__
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- return volume_henyey_greenstein_merge(a, b);
-# endif
- default:
- return false;
- }
-#else
- return false;
-#endif
-}
-
CCL_NAMESPACE_END
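
bsdf_eval() no longer derives the hemisphere from dot(sd->N, omega_in) internally; callers pass is_transmission. A sketch of the updated call, assuming the caller computes the side exactly as the removed in-function test did:

    /* The removed test was dot(sd->N, omega_in) >= 0.0f for reflection,
     * so a direct translation at the call site is: */
    const bool is_transmission = (dot(sd->N, omega_in) < 0.0f);
    float pdf;
    const float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &pdf);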
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index 9814a7cf5c9..be6383e521a 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -14,20 +14,19 @@
* limitations under the License.
*/
-#ifndef __BSDF_ASHIKHMIN_SHIRLEY_H__
-#define __BSDF_ASHIKHMIN_SHIRLEY_H__
-
/*
-ASHIKHMIN SHIRLEY BSDF
-
-Implementation of
-Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000)
-
-The Fresnel factor is missing to get a separable bsdf (intensity*color), as is
-the case with all other microfacet-based BSDF implementations in Cycles.
+ * ASHIKHMIN SHIRLEY BSDF
+ *
+ * Implementation of
+ * Michael Ashikhmin and Peter Shirley: "An Anisotropic Phong BRDF Model" (2000)
+ *
+ * The Fresnel factor is missing to get a separable bsdf (intensity*color), as is
+ * the case with all other microfacet-based BSDF implementations in Cycles.
+ *
+ * Other than that, the implementation directly follows the paper.
+ */
-Other than that, the implementation directly follows the paper.
-*/
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -240,5 +239,3 @@ ccl_device int bsdf_ashikhmin_shirley_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_ASHIKHMIN_SHIRLEY_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
index 3d3f20edab3..f51027f5701 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_velvet.h
@@ -30,8 +30,9 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_ASHIKHMIN_VELVET_H__
-#define __BSDF_ASHIKHMIN_VELVET_H__
+#pragma once
+
+#include "kernel/kernel_montecarlo.h"
CCL_NAMESPACE_BEGIN
@@ -54,14 +55,6 @@ ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_ashikhmin_velvet_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const VelvetBsdf *bsdf_a = (const VelvetBsdf *)a;
- const VelvetBsdf *bsdf_b = (const VelvetBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->sigma == bsdf_b->sigma);
-}
-
ccl_device float3 bsdf_ashikhmin_velvet_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -175,5 +168,3 @@ ccl_device int bsdf_ashikhmin_velvet_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_ASHIKHMIN_VELVET_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse.h b/intern/cycles/kernel/closure/bsdf_diffuse.h
index ea604ed0311..1555aa30304 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_DIFFUSE_H__
-#define __BSDF_DIFFUSE_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -49,14 +48,6 @@ ccl_device int bsdf_diffuse_setup(DiffuseBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const DiffuseBsdf *bsdf_a = (const DiffuseBsdf *)a;
- const DiffuseBsdf *bsdf_b = (const DiffuseBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N));
-}
-
ccl_device float3 bsdf_diffuse_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -174,5 +165,3 @@ ccl_device int bsdf_translucent_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_DIFFUSE_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
index aa62c1c7ceb..b06dd196b9e 100644
--- a/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse_ramp.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_DIFFUSE_RAMP_H__
-#define __BSDF_DIFFUSE_RAMP_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -125,5 +124,3 @@ ccl_device int bsdf_diffuse_ramp_sample(const ShaderClosure *sc,
#endif /* __OSL__ */
CCL_NAMESPACE_END
-
-#endif /* __BSDF_DIFFUSE_RAMP_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair.h b/intern/cycles/kernel/closure/bsdf_hair.h
index 7ca9424b815..f56f78aa1f0 100644
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_HAIR_H__
-#define __BSDF_HAIR_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -62,15 +61,6 @@ ccl_device int bsdf_hair_transmission_setup(HairBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_hair_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const HairBsdf *bsdf_a = (const HairBsdf *)a;
- const HairBsdf *bsdf_b = (const HairBsdf *)b;
-
- return (isequal_float3(bsdf_a->T, bsdf_b->T)) && (bsdf_a->roughness1 == bsdf_b->roughness1) &&
- (bsdf_a->roughness2 == bsdf_b->roughness2) && (bsdf_a->offset == bsdf_b->offset);
-}
-
ccl_device float3 bsdf_hair_reflection_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -309,5 +299,3 @@ ccl_device int bsdf_hair_transmission_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_HAIR_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_hair_principled.h b/intern/cycles/kernel/closure/bsdf_hair_principled.h
index f12661b3095..bfe56e5ab0e 100644
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -14,15 +14,14 @@
* limitations under the License.
*/
+#pragma once
+
#ifdef __KERNEL_CPU__
# include <fenv.h>
#endif
#include "kernel/kernel_color.h"
-#ifndef __BSDF_HAIR_PRINCIPLED_H__
-# define __BSDF_HAIR_PRINCIPLED_H__
-
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledHairExtra {
@@ -181,12 +180,12 @@ ccl_device_inline float longitudinal_scattering(
}
/* Combine the three values using their luminances. */
-ccl_device_inline float4 combine_with_energy(KernelGlobals *kg, float3 c)
+ccl_device_inline float4 combine_with_energy(const KernelGlobals *kg, float3 c)
{
return make_float4(c.x, c.y, c.z, linear_rgb_to_gray(kg, c));
}
-# ifdef __HAIR__
+#ifdef __HAIR__
/* Set up the hair closure. */
ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bsdf)
{
@@ -226,10 +225,10 @@ ccl_device int bsdf_principled_hair_setup(ShaderData *sd, PrincipledHairBSDF *bs
return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
}
-# endif /* __HAIR__ */
+#endif /* __HAIR__ */
/* Given the Fresnel term and transmittance, generate the attenuation terms for each bounce. */
-ccl_device_inline void hair_attenuation(KernelGlobals *kg, float f, float3 T, float4 *Ap)
+ccl_device_inline void hair_attenuation(const KernelGlobals *kg, float f, float3 T, float4 *Ap)
{
/* Primary specular (R). */
Ap[0] = make_float4(f, f, f, f);
@@ -278,7 +277,7 @@ ccl_device_inline void hair_alpha_angles(float sin_theta_i,
}
/* Evaluation function for our shader. */
-ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg,
+ccl_device float3 bsdf_principled_hair_eval(const KernelGlobals *kg,
const ShaderData *sd,
const ShaderClosure *sc,
const float3 omega_in,
@@ -356,7 +355,7 @@ ccl_device float3 bsdf_principled_hair_eval(KernelGlobals *kg,
}
/* Sampling function for the hair shader. */
-ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg,
+ccl_device int bsdf_principled_hair_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
ShaderData *sd,
float randu,
@@ -473,11 +472,11 @@ ccl_device int bsdf_principled_hair_sample(KernelGlobals *kg,
*omega_in = X * sin_theta_i + Y * cos_theta_i * cosf(phi_i) + Z * cos_theta_i * sinf(phi_i);
-# ifdef __RAY_DIFFERENTIALS__
+#ifdef __RAY_DIFFERENTIALS__
float3 N = safe_normalize(sd->I + *omega_in);
*domega_in_dx = (2 * dot(N, sd->dI.dx)) * N - sd->dI.dx;
*domega_in_dy = (2 * dot(N, sd->dI.dy)) * N - sd->dI.dy;
-# endif
+#endif
return LABEL_GLOSSY | ((p == 0) ? LABEL_REFLECT : LABEL_TRANSMIT);
}
@@ -501,7 +500,7 @@ ccl_device_inline float bsdf_principled_hair_albedo_roughness_scale(
return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f;
}
-ccl_device float3 bsdf_principled_hair_albedo(ShaderClosure *sc)
+ccl_device float3 bsdf_principled_hair_albedo(const ShaderClosure *sc)
{
PrincipledHairBSDF *bsdf = (PrincipledHairBSDF *)sc;
return exp3(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v));
@@ -523,5 +522,3 @@ ccl_device_inline float3 bsdf_principled_hair_sigma_from_concentration(const flo
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_HAIR_PRINCIPLED_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h b/intern/cycles/kernel/closure/bsdf_microfacet.h
index af03bab39f7..227cb448b47 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -30,8 +30,10 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_MICROFACET_H__
-#define __BSDF_MICROFACET_H__
+#pragma once
+
+#include "kernel/kernel_lookup_table.h"
+#include "kernel/kernel_random.h"
CCL_NAMESPACE_BEGIN
@@ -53,7 +55,7 @@ static_assert(sizeof(ShaderClosure) >= sizeof(MicrofacetBsdf), "MicrofacetBsdf i
/* Beckmann and GGX microfacet importance sampling. */
-ccl_device_inline void microfacet_beckmann_sample_slopes(KernelGlobals *kg,
+ccl_device_inline void microfacet_beckmann_sample_slopes(const KernelGlobals *kg,
const float cos_theta_i,
const float sin_theta_i,
float randu,
@@ -193,7 +195,7 @@ ccl_device_inline void microfacet_ggx_sample_slopes(const float cos_theta_i,
*slope_y = S * z * safe_sqrtf(1.0f + (*slope_x) * (*slope_x));
}
-ccl_device_forceinline float3 microfacet_sample_stretched(KernelGlobals *kg,
+ccl_device_forceinline float3 microfacet_sample_stretched(const KernelGlobals *kg,
const float3 omega_i,
const float alpha_x,
const float alpha_y,
@@ -352,21 +354,6 @@ ccl_device int bsdf_microfacet_ggx_clearcoat_setup(MicrofacetBsdf *bsdf, const S
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_microfacet_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const MicrofacetBsdf *bsdf_a = (const MicrofacetBsdf *)a;
- const MicrofacetBsdf *bsdf_b = (const MicrofacetBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->alpha_x == bsdf_b->alpha_x) &&
- (bsdf_a->alpha_y == bsdf_b->alpha_y) && (isequal_float3(bsdf_a->T, bsdf_b->T)) &&
- (bsdf_a->ior == bsdf_b->ior) &&
- ((bsdf_a->extra == NULL && bsdf_b->extra == NULL) ||
- ((bsdf_a->extra && bsdf_b->extra) &&
- (isequal_float3(bsdf_a->extra->color, bsdf_b->extra->color)) &&
- (isequal_float3(bsdf_a->extra->cspec0, bsdf_b->extra->cspec0)) &&
- (bsdf_a->extra->clearcoat == bsdf_b->extra->clearcoat)));
-}
-
ccl_device int bsdf_microfacet_ggx_refraction_setup(MicrofacetBsdf *bsdf)
{
bsdf->extra = NULL;
@@ -558,7 +545,7 @@ ccl_device float3 bsdf_microfacet_ggx_eval_transmit(const ShaderClosure *sc,
return make_float3(out, out, out);
}
-ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_ggx_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
@@ -986,7 +973,7 @@ ccl_device float3 bsdf_microfacet_beckmann_eval_transmit(const ShaderClosure *sc
return make_float3(out, out, out);
}
-ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_beckmann_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
@@ -1175,5 +1162,3 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals *kg,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_MICROFACET_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
index 9795c8da065..68d5071dbce 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Most of the code is based on the supplemental implementations from
@@ -466,7 +468,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_eval_reflect(const ShaderClosure *sc
bsdf->extra->cspec0);
}
-ccl_device int bsdf_microfacet_multi_ggx_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_multi_ggx_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
@@ -628,7 +630,7 @@ ccl_device float3 bsdf_microfacet_multi_ggx_glass_eval_reflect(const ShaderClosu
bsdf->extra->cspec0);
}
-ccl_device int bsdf_microfacet_multi_ggx_glass_sample(KernelGlobals *kg,
+ccl_device int bsdf_microfacet_multi_ggx_glass_sample(const KernelGlobals *kg,
const ShaderClosure *sc,
float3 Ng,
float3 I,
diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
index 41e5736bf49..be12d47f0ea 100644
--- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
+++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __BSDF_OREN_NAYAR_H__
-#define __BSDF_OREN_NAYAR_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -61,14 +60,6 @@ ccl_device int bsdf_oren_nayar_setup(OrenNayarBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_oren_nayar_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const OrenNayarBsdf *bsdf_a = (const OrenNayarBsdf *)a;
- const OrenNayarBsdf *bsdf_b = (const OrenNayarBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->roughness == bsdf_b->roughness);
-}
-
ccl_device float3 bsdf_oren_nayar_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -127,5 +118,3 @@ ccl_device int bsdf_oren_nayar_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_OREN_NAYAR_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_phong_ramp.h b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
index cf5484383f2..43f8cf71c59 100644
--- a/intern/cycles/kernel/closure/bsdf_phong_ramp.h
+++ b/intern/cycles/kernel/closure/bsdf_phong_ramp.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_PHONG_RAMP_H__
-#define __BSDF_PHONG_RAMP_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -153,5 +152,3 @@ ccl_device int bsdf_phong_ramp_sample(const ShaderClosure *sc,
#endif /* __OSL__ */
CCL_NAMESPACE_END
-
-#endif /* __BSDF_PHONG_RAMP_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
index d5d012068ff..a72af519482 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_diffuse.h
@@ -14,14 +14,15 @@
* limitations under the License.
*/
-#ifndef __BSDF_PRINCIPLED_DIFFUSE_H__
-#define __BSDF_PRINCIPLED_DIFFUSE_H__
+#pragma once
/* DISNEY PRINCIPLED DIFFUSE BRDF
*
* Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
*/
+#include "kernel/closure/bsdf_util.h"
+
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledDiffuseBsdf {
@@ -61,14 +62,6 @@ ccl_device int bsdf_principled_diffuse_setup(PrincipledDiffuseBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_principled_diffuse_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const PrincipledDiffuseBsdf *bsdf_a = (const PrincipledDiffuseBsdf *)a;
- const PrincipledDiffuseBsdf *bsdf_b = (const PrincipledDiffuseBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N) && bsdf_a->roughness == bsdf_b->roughness);
-}
-
ccl_device float3 bsdf_principled_diffuse_eval_reflect(const ShaderClosure *sc,
const float3 I,
const float3 omega_in,
@@ -136,5 +129,3 @@ ccl_device int bsdf_principled_diffuse_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_PRINCIPLED_DIFFUSE_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_principled_sheen.h b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
index 3707de29d73..60ce7e4eb75 100644
--- a/intern/cycles/kernel/closure/bsdf_principled_sheen.h
+++ b/intern/cycles/kernel/closure/bsdf_principled_sheen.h
@@ -14,14 +14,15 @@
* limitations under the License.
*/
-#ifndef __BSDF_PRINCIPLED_SHEEN_H__
-#define __BSDF_PRINCIPLED_SHEEN_H__
+#pragma once
/* DISNEY PRINCIPLED SHEEN BRDF
*
* Shading model by Brent Burley (Disney): "Physically Based Shading at Disney" (2012)
*/
+#include "kernel/closure/bsdf_util.h"
+
CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledSheenBsdf {
@@ -137,5 +138,3 @@ ccl_device int bsdf_principled_sheen_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_PRINCIPLED_SHEEN_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_reflection.h b/intern/cycles/kernel/closure/bsdf_reflection.h
index c24ba170915..31283971d5a 100644
--- a/intern/cycles/kernel/closure/bsdf_reflection.h
+++ b/intern/cycles/kernel/closure/bsdf_reflection.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_REFLECTION_H__
-#define __BSDF_REFLECTION_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -93,5 +92,3 @@ ccl_device int bsdf_reflection_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_REFLECTION_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_refraction.h b/intern/cycles/kernel/closure/bsdf_refraction.h
index d4fbe86dac0..cfedb5dfe2c 100644
--- a/intern/cycles/kernel/closure/bsdf_refraction.h
+++ b/intern/cycles/kernel/closure/bsdf_refraction.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_REFRACTION_H__
-#define __BSDF_REFRACTION_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -111,5 +110,3 @@ ccl_device int bsdf_refraction_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_REFRACTION_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_toon.h b/intern/cycles/kernel/closure/bsdf_toon.h
index cc5de21ed0e..acdafe0f735 100644
--- a/intern/cycles/kernel/closure/bsdf_toon.h
+++ b/intern/cycles/kernel/closure/bsdf_toon.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_TOON_H__
-#define __BSDF_TOON_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -55,15 +54,6 @@ ccl_device int bsdf_diffuse_toon_setup(ToonBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
-ccl_device bool bsdf_toon_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const ToonBsdf *bsdf_a = (const ToonBsdf *)a;
- const ToonBsdf *bsdf_b = (const ToonBsdf *)b;
-
- return (isequal_float3(bsdf_a->N, bsdf_b->N)) && (bsdf_a->size == bsdf_b->size) &&
- (bsdf_a->smooth == bsdf_b->smooth);
-}
-
ccl_device float3 bsdf_toon_get_intensity(float max_angle, float smooth, float angle)
{
float is;
@@ -248,5 +238,3 @@ ccl_device int bsdf_glossy_toon_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_TOON_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_transparent.h b/intern/cycles/kernel/closure/bsdf_transparent.h
index 4e5513499e8..f1dc7efb345 100644
--- a/intern/cycles/kernel/closure/bsdf_transparent.h
+++ b/intern/cycles/kernel/closure/bsdf_transparent.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_TRANSPARENT_H__
-#define __BSDF_TRANSPARENT_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -123,5 +122,3 @@ ccl_device int bsdf_transparent_sample(const ShaderClosure *sc,
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_TRANSPARENT_H__ */
diff --git a/intern/cycles/kernel/closure/bsdf_util.h b/intern/cycles/kernel/closure/bsdf_util.h
index a73dee1b045..beec5f768a1 100644
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __BSDF_UTIL_H__
-#define __BSDF_UTIL_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -150,5 +149,3 @@ interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, float3 cspec0
}
CCL_NAMESPACE_END
-
-#endif /* __BSDF_UTIL_H__ */
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index 562daf1286d..e095314678a 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_BSSRDF_H__
-#define __KERNEL_BSSRDF_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -24,310 +23,71 @@ typedef ccl_addr_space struct Bssrdf {
float3 radius;
float3 albedo;
- float sharpness;
- float texture_blur;
float roughness;
- float channels;
+ float anisotropy;
} Bssrdf;
static_assert(sizeof(ShaderClosure) >= sizeof(Bssrdf), "Bssrdf is too large!");
-/* Planar Truncated Gaussian
- *
- * Note how this is different from the typical gaussian, this one integrates
- * to 1 over the plane (where you get an extra 2*pi*x factor). We are lucky
- * that integrating x*exp(-x) gives a nice closed form solution. */
-
-/* paper suggests 1/12.46 which is much too small, suspect it's *12.46 */
-#define GAUSS_TRUNCATE 12.46f
-
-ccl_device float bssrdf_gaussian_eval(const float radius, float r)
-{
- /* integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) from 0 to Rm
- * = 1 - exp(-Rm*Rm/(2*v)) */
- const float v = radius * radius * (0.25f * 0.25f);
- const float Rm = sqrtf(v * GAUSS_TRUNCATE);
-
- if (r >= Rm)
- return 0.0f;
-
- return expf(-r * r / (2.0f * v)) / (2.0f * M_PI_F * v);
-}
-
-ccl_device float bssrdf_gaussian_pdf(const float radius, float r)
+ccl_device float bssrdf_dipole_compute_Rd(float alpha_prime, float fourthirdA)
{
- /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */
- const float area_truncated = 1.0f - expf(-0.5f * GAUSS_TRUNCATE);
-
- return bssrdf_gaussian_eval(radius, r) * (1.0f / (area_truncated));
+ float s = sqrtf(3.0f * (1.0f - alpha_prime));
+ return 0.5f * alpha_prime * (1.0f + expf(-fourthirdA * s)) * expf(-s);
}
-ccl_device void bssrdf_gaussian_sample(const float radius, float xi, float *r, float *h)
+ccl_device float bssrdf_dipole_compute_alpha_prime(float rd, float fourthirdA)
{
- /* xi = integrate (2*pi*r * exp(-r*r/(2*v)))/(2*pi*v)) = -exp(-r^2/(2*v))
- * r = sqrt(-2*v*logf(xi)) */
- const float v = radius * radius * (0.25f * 0.25f);
- const float Rm = sqrtf(v * GAUSS_TRUNCATE);
-
- /* 1.0 - expf(-Rm*Rm/(2*v)) simplified */
- const float area_truncated = 1.0f - expf(-0.5f * GAUSS_TRUNCATE);
-
- /* r(xi) */
- const float r_squared = -2.0f * v * logf(1.0f - xi * area_truncated);
- *r = sqrtf(r_squared);
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_squared);
-}
-
-/* Planar Cubic BSSRDF falloff
- *
- * This is basically (Rm - x)^3, with some factors to normalize it. For sampling
- * we integrate 2*pi*x * (Rm - x)^3, which gives us a quintic equation that as
- * far as I can tell has no closed form solution. So we get an iterative solution
- * instead with newton-raphson. */
-
-ccl_device float bssrdf_cubic_eval(const float radius, const float sharpness, float r)
-{
- if (sharpness == 0.0f) {
- const float Rm = radius;
-
- if (r >= Rm)
- return 0.0f;
-
- /* integrate (2*pi*r * 10*(R - r)^3)/(pi * R^5) from 0 to R = 1 */
- const float Rm5 = (Rm * Rm) * (Rm * Rm) * Rm;
- const float f = Rm - r;
- const float num = f * f * f;
-
- return (10.0f * num) / (Rm5 * M_PI_F);
+ /* Simple bisection solver, inverting bssrdf_dipole_compute_Rd(). */
+ if (rd < 1e-4f) {
+ return 0.0f;
+ }
+ if (rd >= 0.995f) {
+ return 0.999999f;
}
- else {
- float Rm = radius * (1.0f + sharpness);
-
- if (r >= Rm)
- return 0.0f;
- /* custom variation with extra sharpness, to match the previous code */
- const float y = 1.0f / (1.0f + sharpness);
- float Rmy, ry, ryinv;
+ float x0 = 0.0f;
+ float x1 = 1.0f;
+ float xmid, fmid;
- if (sharpness == 1.0f) {
- Rmy = sqrtf(Rm);
- ry = sqrtf(r);
- ryinv = (ry > 0.0f) ? 1.0f / ry : 0.0f;
+ constexpr const int max_num_iterations = 12;
+ for (int i = 0; i < max_num_iterations; ++i) {
+ xmid = 0.5f * (x0 + x1);
+ fmid = bssrdf_dipole_compute_Rd(xmid, fourthirdA);
+ if (fmid < rd) {
+ x0 = xmid;
}
else {
- Rmy = powf(Rm, y);
- ry = powf(r, y);
- ryinv = (r > 0.0f) ? powf(r, y - 1.0f) : 0.0f;
+ x1 = xmid;
}
-
- const float Rmy5 = (Rmy * Rmy) * (Rmy * Rmy) * Rmy;
- const float f = Rmy - ry;
- const float num = f * (f * f) * (y * ryinv);
-
- return (10.0f * num) / (Rmy5 * M_PI_F);
- }
-}
-
-ccl_device float bssrdf_cubic_pdf(const float radius, const float sharpness, float r)
-{
- return bssrdf_cubic_eval(radius, sharpness, r);
-}
-
-/* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
-ccl_device_forceinline float bssrdf_cubic_quintic_root_find(float xi)
-{
- /* newton-raphson iteration, usually succeeds in 2-4 iterations, except
- * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
- * should not be too bad */
- const float tolerance = 1e-6f;
- const int max_iteration_count = 10;
- float x = 0.25f;
- int i;
-
- for (i = 0; i < max_iteration_count; i++) {
- float x2 = x * x;
- float x3 = x2 * x;
- float nx = (1.0f - x);
-
- float f = 10.0f * x2 - 20.0f * x3 + 15.0f * x2 * x2 - 4.0f * x2 * x3 - xi;
- float f_ = 20.0f * (x * nx) * (nx * nx);
-
- if (fabsf(f) < tolerance || f_ == 0.0f)
- break;
-
- x = saturate(x - f / f_);
}
- return x;
+ return xmid;
}
-ccl_device void bssrdf_cubic_sample(
- const float radius, const float sharpness, float xi, float *r, float *h)
+ccl_device void bssrdf_setup_radius(Bssrdf *bssrdf, const ClosureType type, const float eta)
{
- float Rm = radius;
- float r_ = bssrdf_cubic_quintic_root_find(xi);
-
- if (sharpness != 0.0f) {
- r_ = powf(r_, 1.0f + sharpness);
- Rm *= (1.0f + sharpness);
- }
-
- r_ *= Rm;
- *r = r_;
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_ * r_);
-}
-
-/* Approximate Reflectance Profiles
- * http://graphics.pixar.com/library/ApproxBSSRDF/paper.pdf
- */
-
-/* This is a bit arbitrary, just need big enough radius so it matches
- * the mean free length, but still not too big so sampling is still
- * effective. Might need some further tweaks.
- */
-#define BURLEY_TRUNCATE 16.0f
-#define BURLEY_TRUNCATE_CDF 0.9963790093708328f // cdf(BURLEY_TRUNCATE)
-
-ccl_device_inline float bssrdf_burley_fitting(float A)
-{
- /* Diffuse surface transmission, equation (6). */
- return 1.9f - A + 3.5f * (A - 0.8f) * (A - 0.8f);
-}
-
-/* Scale mean free path length so it gives similar looking result
- * to Cubic and Gaussian models.
- */
-ccl_device_inline float3 bssrdf_burley_compatible_mfp(float3 r)
-{
- return 0.25f * M_1_PI_F * r;
-}
-
-ccl_device void bssrdf_burley_setup(Bssrdf *bssrdf)
-{
- /* Mean free path length. */
- const float3 l = bssrdf_burley_compatible_mfp(bssrdf->radius);
- /* Surface albedo. */
- const float3 A = bssrdf->albedo;
- const float3 s = make_float3(
- bssrdf_burley_fitting(A.x), bssrdf_burley_fitting(A.y), bssrdf_burley_fitting(A.z));
-
- bssrdf->radius = l / s;
-}
-
-ccl_device float bssrdf_burley_eval(const float d, float r)
-{
- const float Rm = BURLEY_TRUNCATE * d;
-
- if (r >= Rm)
- return 0.0f;
-
- /* Burley reflectance profile, equation (3).
- *
- * NOTES:
- * - Surface albedo is already included into sc->weight, no need to
- * multiply by this term here.
- * - This is normalized diffuse model, so the equation is multiplied
- * by 2*pi, which also matches cdf().
- */
- float exp_r_3_d = expf(-r / (3.0f * d));
- float exp_r_d = exp_r_3_d * exp_r_3_d * exp_r_3_d;
- return (exp_r_d + exp_r_3_d) / (4.0f * d);
-}
-
-ccl_device float bssrdf_burley_pdf(const float d, float r)
-{
- return bssrdf_burley_eval(d, r) * (1.0f / BURLEY_TRUNCATE_CDF);
-}
-
-/* Find the radius for desired CDF value.
- * Returns scaled radius, meaning the result is to be scaled up by d.
- * Since there's no closed form solution we do Newton-Raphson method to find it.
- */
-ccl_device_forceinline float bssrdf_burley_root_find(float xi)
-{
- const float tolerance = 1e-6f;
- const int max_iteration_count = 10;
- /* Do initial guess based on manual curve fitting, this allows us to reduce
- * number of iterations to maximum 4 across the [0..1] range. We keep maximum
- * number of iteration higher just to be sure we didn't miss root in some
- * corner case.
- */
- float r;
- if (xi <= 0.9f) {
- r = expf(xi * xi * 2.4f) - 1.0f;
+ if (type == CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID) {
+ /* Scale mean free path length so it gives a similar-looking result to
+ * the older Cubic, Gaussian and Burley models. */
+ bssrdf->radius *= 0.25f * M_1_PI_F;
}
else {
- /* TODO(sergey): Some nicer curve fit is possible here. */
- r = 15.0f;
- }
- /* Solve against scaled radius. */
- for (int i = 0; i < max_iteration_count; i++) {
- float exp_r_3 = expf(-r / 3.0f);
- float exp_r = exp_r_3 * exp_r_3 * exp_r_3;
- float f = 1.0f - 0.25f * exp_r - 0.75f * exp_r_3 - xi;
- float f_ = 0.25f * exp_r + 0.25f * exp_r_3;
+ /* Adjust radius based on IOR and albedo. */
+ const float inv_eta = 1.0f / eta;
+ const float F_dr = inv_eta * (-1.440f * inv_eta + 0.710f) + 0.668f + 0.0636f * eta;
+ const float fourthirdA = (4.0f / 3.0f) * (1.0f + F_dr) /
+ (1.0f - F_dr); /* From Jensen's `Fdr` ratio formula. */
- if (fabsf(f) < tolerance || f_ == 0.0f) {
- break;
- }
+ const float3 alpha_prime = make_float3(
+ bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.x, fourthirdA),
+ bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.y, fourthirdA),
+ bssrdf_dipole_compute_alpha_prime(bssrdf->albedo.z, fourthirdA));
- r = r - f / f_;
- if (r < 0.0f) {
- r = 0.0f;
- }
+ bssrdf->radius *= sqrt(3.0f * (one_float3() - alpha_prime));
}
- return r;
}
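
As a sanity check on the dipole constants above, the F_dr fit and the
resulting fourthirdA term can be evaluated standalone. A minimal sketch in
plain C (the IOR value is an arbitrary example, not from the patch):

#include <math.h>
#include <stdio.h>

/* Jensen's diffuse Fresnel reflectance fit and the (4/3)A term, as used in
 * bssrdf_setup_radius() above. */
int main(void)
{
  const float eta = 1.3f; /* arbitrary example IOR */
  const float inv_eta = 1.0f / eta;
  const float F_dr = inv_eta * (-1.440f * inv_eta + 0.710f) + 0.668f + 0.0636f * eta;
  const float fourthirdA = (4.0f / 3.0f) * (1.0f + F_dr) / (1.0f - F_dr);
  /* For eta = 1.3 this prints roughly F_dr = 0.44 and fourthirdA = 3.47. */
  printf("F_dr = %f, fourthirdA = %f\n", F_dr, fourthirdA);
  return 0;
}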
-ccl_device void bssrdf_burley_sample(const float d, float xi, float *r, float *h)
-{
- const float Rm = BURLEY_TRUNCATE * d;
- const float r_ = bssrdf_burley_root_find(xi * BURLEY_TRUNCATE_CDF) * d;
-
- *r = r_;
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_ * r_);
-}
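
For reference, the deleted root-find inverts the truncated Burley CDF
cdf(r) = 1 - exp(-r)/4 - 3*exp(-r/3)/4 for the scaled radius. A
self-contained restatement in plain C, with the same constants:

#include <math.h>

/* Newton-Raphson inversion of the Burley CDF, mirroring the deleted
 * bssrdf_burley_root_find(). Returns the scaled radius (to be multiplied by d). */
static float burley_root_find(float xi)
{
  /* Initial guess from manual curve fitting. */
  float r = (xi <= 0.9f) ? expf(xi * xi * 2.4f) - 1.0f : 15.0f;
  for (int i = 0; i < 10; i++) {
    const float exp_r_3 = expf(-r / 3.0f);
    const float exp_r = exp_r_3 * exp_r_3 * exp_r_3;
    const float f = 1.0f - 0.25f * exp_r - 0.75f * exp_r_3 - xi;
    const float f_deriv = 0.25f * exp_r + 0.25f * exp_r_3;
    if (fabsf(f) < 1e-6f || f_deriv == 0.0f) {
      break;
    }
    r = fmaxf(r - f / f_deriv, 0.0f); /* Newton step, clamped to r >= 0. */
  }
  return r;
}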
-
-/* None BSSRDF falloff
- *
- * Samples distributed over disk with no falloff, for reference. */
-
-ccl_device float bssrdf_none_eval(const float radius, float r)
-{
- const float Rm = radius;
- return (r < Rm) ? 1.0f : 0.0f;
-}
-
-ccl_device float bssrdf_none_pdf(const float radius, float r)
-{
- /* integrate (2*pi*r)/(pi*Rm*Rm) from 0 to Rm = 1 */
- const float Rm = radius;
- const float area = (M_PI_F * Rm * Rm);
-
- return bssrdf_none_eval(radius, r) / area;
-}
-
-ccl_device void bssrdf_none_sample(const float radius, float xi, float *r, float *h)
-{
- /* xi = integrate (2*pi*r)/(pi*Rm*Rm) = r^2/Rm^2
- * r = sqrt(xi)*Rm */
- const float Rm = radius;
- const float r_ = sqrtf(xi) * Rm;
-
- *r = r_;
-
- /* h^2 + r^2 = Rm^2 */
- *h = safe_sqrtf(Rm * Rm - r_ * r_);
-}
-
-/* Generic */
+/* Setup */
ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
{
@@ -342,7 +102,7 @@ ccl_device_inline Bssrdf *bssrdf_alloc(ShaderData *sd, float3 weight)
return (sample_weight >= CLOSURE_WEIGHT_CUTOFF) ? bssrdf : NULL;
}
-ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
+ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type, const float ior)
{
int flag = 0;
int bssrdf_channels = 3;
@@ -371,7 +131,7 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
if (bssrdf_channels < 3) {
/* Add diffuse BSDF if any radius too small. */
#ifdef __PRINCIPLED__
- if (type == CLOSURE_BSSRDF_PRINCIPLED_ID || type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) {
+ if (bssrdf->roughness != FLT_MAX) {
float roughness = bssrdf->roughness;
float3 N = bssrdf->N;
@@ -401,16 +161,9 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
/* Setup BSSRDF if radius is large enough. */
if (bssrdf_channels > 0) {
bssrdf->type = type;
- bssrdf->channels = bssrdf_channels;
- bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf->channels;
- bssrdf->texture_blur = saturate(bssrdf->texture_blur);
- bssrdf->sharpness = saturate(bssrdf->sharpness);
+ bssrdf->sample_weight = fabsf(average(bssrdf->weight)) * bssrdf_channels;
- if (type == CLOSURE_BSSRDF_BURLEY_ID || type == CLOSURE_BSSRDF_PRINCIPLED_ID ||
- type == CLOSURE_BSSRDF_RANDOM_WALK_ID ||
- type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) {
- bssrdf_burley_setup(bssrdf);
- }
+ bssrdf_setup_radius(bssrdf, type, ior);
flag |= SD_BSSRDF;
}
@@ -422,77 +175,4 @@ ccl_device int bssrdf_setup(ShaderData *sd, Bssrdf *bssrdf, ClosureType type)
return flag;
}
-ccl_device void bssrdf_sample(const ShaderClosure *sc, float xi, float *r, float *h)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
- float radius;
-
- /* Sample a color channel and reuse the random number. Only a subset of
- * channels may be used if their radius was too small to handle as BSSRDF. */
- xi *= bssrdf->channels;
-
- if (xi < 1.0f) {
- radius = (bssrdf->radius.x > 0.0f) ? bssrdf->radius.x :
- (bssrdf->radius.y > 0.0f) ? bssrdf->radius.y :
- bssrdf->radius.z;
- }
- else if (xi < 2.0f) {
- xi -= 1.0f;
- radius = (bssrdf->radius.x > 0.0f && bssrdf->radius.y > 0.0f) ? bssrdf->radius.y :
- bssrdf->radius.z;
- }
- else {
- xi -= 2.0f;
- radius = bssrdf->radius.z;
- }
-
- /* Sample BSSRDF. */
- if (bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) {
- bssrdf_cubic_sample(radius, bssrdf->sharpness, xi, r, h);
- }
- else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) {
- bssrdf_gaussian_sample(radius, xi, r, h);
- }
- else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID ||
- * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID) */
- bssrdf_burley_sample(radius, xi, r, h);
- }
-}
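
The deleted bssrdf_sample() also illustrates the standard trick of stretching
one random number into both a discrete choice and a fresh uniform sample. A
minimal standalone sketch of the generic, equal-probability version (the
function name is illustrative):

/* Pick one of n options uniformly and rescale xi back into [0, 1). */
static int pick_and_rescale(float *xi, int n)
{
  *xi *= (float)n;
  int index = (int)*xi;               /* which option was chosen */
  *xi -= (float)index;                /* remainder is again uniform in [0, 1) */
  return (index < n) ? index : n - 1; /* guard against xi == 1.0 */
}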
-
-ccl_device float bssrdf_channel_pdf(const Bssrdf *bssrdf, float radius, float r)
-{
- if (radius == 0.0f) {
- return 0.0f;
- }
- else if (bssrdf->type == CLOSURE_BSSRDF_CUBIC_ID) {
- return bssrdf_cubic_pdf(radius, bssrdf->sharpness, r);
- }
- else if (bssrdf->type == CLOSURE_BSSRDF_GAUSSIAN_ID) {
- return bssrdf_gaussian_pdf(radius, r);
- }
- else { /* if (bssrdf->type == CLOSURE_BSSRDF_BURLEY_ID ||
- * bssrdf->type == CLOSURE_BSSRDF_PRINCIPLED_ID)*/
- return bssrdf_burley_pdf(radius, r);
- }
-}
-
-ccl_device_forceinline float3 bssrdf_eval(const ShaderClosure *sc, float r)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
-
- return make_float3(bssrdf_channel_pdf(bssrdf, bssrdf->radius.x, r),
- bssrdf_channel_pdf(bssrdf, bssrdf->radius.y, r),
- bssrdf_channel_pdf(bssrdf, bssrdf->radius.z, r));
-}
-
-ccl_device_forceinline float bssrdf_pdf(const ShaderClosure *sc, float r)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
- float3 pdf = bssrdf_eval(sc, r);
-
- return (pdf.x + pdf.y + pdf.z) / bssrdf->channels;
-}
-
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_BSSRDF_H__ */
diff --git a/intern/cycles/kernel/closure/emissive.h b/intern/cycles/kernel/closure/emissive.h
index 911382e6865..a2519d97618 100644
--- a/intern/cycles/kernel/closure/emissive.h
+++ b/intern/cycles/kernel/closure/emissive.h
@@ -30,6 +30,8 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* BACKGROUND CLOSURE */
diff --git a/intern/cycles/kernel/closure/volume.h b/intern/cycles/kernel/closure/volume.h
index 1430f712701..69959a3f21b 100644
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __VOLUME_H__
-#define __VOLUME_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -62,21 +61,12 @@ ccl_device int volume_henyey_greenstein_setup(HenyeyGreensteinVolume *volume)
return SD_SCATTER;
}
-ccl_device bool volume_henyey_greenstein_merge(const ShaderClosure *a, const ShaderClosure *b)
-{
- const HenyeyGreensteinVolume *volume_a = (const HenyeyGreensteinVolume *)a;
- const HenyeyGreensteinVolume *volume_b = (const HenyeyGreensteinVolume *)b;
-
- return (volume_a->g == volume_b->g);
-}
-
-ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderClosure *sc,
+ccl_device float3 volume_henyey_greenstein_eval_phase(const ShaderVolumeClosure *svc,
const float3 I,
float3 omega_in,
float *pdf)
{
- const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume *)sc;
- float g = volume->g;
+ float g = svc->g;
/* note that I points towards the viewer */
if (fabsf(g) < 1e-3f) {
@@ -122,7 +112,7 @@ henyey_greenstrein_sample(float3 D, float g, float randu, float randv, float *pd
return dir;
}
-ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc,
+ccl_device int volume_henyey_greenstein_sample(const ShaderVolumeClosure *svc,
float3 I,
float3 dIdx,
float3 dIdy,
@@ -134,8 +124,7 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc,
float3 *domega_in_dy,
float *pdf)
{
- const HenyeyGreensteinVolume *volume = (const HenyeyGreensteinVolume *)sc;
- float g = volume->g;
+ float g = svc->g;
/* note that I points towards the viewer and so is used negated */
*omega_in = henyey_greenstrein_sample(-I, g, randu, randv, pdf);
@@ -153,17 +142,15 @@ ccl_device int volume_henyey_greenstein_sample(const ShaderClosure *sc,
/* VOLUME CLOSURE */
ccl_device float3 volume_phase_eval(const ShaderData *sd,
- const ShaderClosure *sc,
+ const ShaderVolumeClosure *svc,
float3 omega_in,
float *pdf)
{
- kernel_assert(sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID);
-
- return volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+ return volume_henyey_greenstein_eval_phase(svc, sd->I, omega_in, pdf);
}
ccl_device int volume_phase_sample(const ShaderData *sd,
- const ShaderClosure *sc,
+ const ShaderVolumeClosure *svc,
float randu,
float randv,
float3 *eval,
@@ -171,31 +158,65 @@ ccl_device int volume_phase_sample(const ShaderData *sd,
differential3 *domega_in,
float *pdf)
{
- int label;
-
- switch (sc->type) {
- case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
- label = volume_henyey_greenstein_sample(sc,
- sd->I,
- sd->dI.dx,
- sd->dI.dy,
- randu,
- randv,
- eval,
- omega_in,
- &domega_in->dx,
- &domega_in->dy,
- pdf);
- break;
- default:
- *eval = make_float3(0.0f, 0.0f, 0.0f);
- label = LABEL_NONE;
- break;
+ return volume_henyey_greenstein_sample(svc,
+ sd->I,
+ sd->dI.dx,
+ sd->dI.dy,
+ randu,
+ randv,
+ eval,
+ omega_in,
+ &domega_in->dx,
+ &domega_in->dy,
+ pdf);
+}
+
+/* Volume sampling utilities. */
+
+/* todo: this value could be tweaked or turned into a probability to avoid
+ * unnecessary work in volumes and subsurface scattering. */
+#define VOLUME_THROUGHPUT_EPSILON 1e-6f
+
+ccl_device float3 volume_color_transmittance(float3 sigma, float t)
+{
+ return exp3(-sigma * t);
+}
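+
+/* This is the Beer-Lambert law applied per color channel. For example,
+ * sigma = (0.5, 1.0, 2.0) and t = 1.0 give a transmittance of roughly
+ * (0.607, 0.368, 0.135). */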
+
+ccl_device float volume_channel_get(float3 value, int channel)
+{
+ return (channel == 0) ? value.x : ((channel == 1) ? value.y : value.z);
+}
+
+ccl_device int volume_sample_channel(float3 albedo, float3 throughput, float rand, float3 *pdf)
+{
+ /* Sample a color channel proportional to throughput and single scattering
+ * albedo, to significantly reduce noise with many bounces, following:
+ *
+ * "Practical and Controllable Subsurface Scattering for Production Path
+ * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
+ float3 weights = fabs(throughput * albedo);
+ float sum_weights = weights.x + weights.y + weights.z;
+ float3 weights_pdf;
+
+ if (sum_weights > 0.0f) {
+ weights_pdf = weights / sum_weights;
}
+ else {
+ weights_pdf = make_float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f);
+ }
+
+ *pdf = weights_pdf;
- return label;
+ /* OpenCL does not support -> on float3, so don't use pdf->x. */
+ if (rand < weights_pdf.x) {
+ return 0;
+ }
+ else if (rand < weights_pdf.x + weights_pdf.y) {
+ return 1;
+ }
+ else {
+ return 2;
+ }
}
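
A minimal sketch of how a caller can combine these helpers for distance
sampling (the variable names here are illustrative, not from the patch):

/* Pick a channel, then sample a scatter distance from its extinction. */
float3 channel_pdf;
const int channel = volume_sample_channel(albedo, throughput, rand_channel, &channel_pdf);
const float sigma = volume_channel_get(sigma_t, channel);
const float t = -logf(1.0f - rand_distance) / sigma; /* exponential sampling */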
CCL_NAMESPACE_END
-
-#endif
diff --git a/intern/cycles/kernel/kernel_compat_cpu.h b/intern/cycles/kernel/device/cpu/compat.h
index 88f6a264a5a..bfd936c7bbd 100644
--- a/intern/cycles/kernel/kernel_compat_cpu.h
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COMPAT_CPU_H__
-#define __KERNEL_COMPAT_CPU_H__
+#pragma once
#define __KERNEL_CPU__
@@ -27,14 +26,6 @@
# pragma GCC diagnostic ignored "-Wuninitialized"
#endif
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
-#endif
-
#include "util/util_half.h"
#include "util/util_math.h"
#include "util/util_simd.h"
@@ -43,15 +34,6 @@
#define ccl_addr_space
-#define ccl_local_id(d) 0
-#define ccl_global_id(d) (kg->global_id[d])
-
-#define ccl_local_size(d) 1
-#define ccl_global_size(d) (kg->global_size[d])
-
-#define ccl_group_id(d) ccl_global_id(d)
-#define ccl_num_groups(d) ccl_global_size(d)
-
/* On x86_64, versions of glibc < 2.16 have an issue where expf is
* much slower than the double version. This was fixed in glibc 2.16.
*/
@@ -72,37 +54,11 @@ CCL_NAMESPACE_BEGIN
* simple arrays and after inlining fetch hopefully revert to being a simple
* pointer lookup. */
template<typename T> struct texture {
- ccl_always_inline const T &fetch(int index)
+ ccl_always_inline const T &fetch(int index) const
{
kernel_assert(index >= 0 && index < width);
return data[index];
}
-#if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
- /* Reads 256 bytes but indexes in blocks of 128 bytes to maintain
- * compatibility with existing indices and data structures.
- */
- ccl_always_inline avxf fetch_avxf(const int index)
- {
- kernel_assert(index >= 0 && (index + 1) < width);
- ssef *ssef_data = (ssef *)data;
- ssef *ssef_node_data = &ssef_data[index];
- return _mm256_loadu_ps((float *)ssef_node_data);
- }
-#endif
-
-#ifdef __KERNEL_SSE2__
- ccl_always_inline ssef fetch_ssef(int index)
- {
- kernel_assert(index >= 0 && index < width);
- return ((ssef *)data)[index];
- }
-
- ccl_always_inline ssei fetch_ssei(int index)
- {
- kernel_assert(index >= 0 && index < width);
- return ((ssei *)data)[index];
- }
-#endif
T *data;
int width;
@@ -110,15 +66,6 @@ template<typename T> struct texture {
/* Macros to handle different memory storage on different devices */
-#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
-#define kernel_tex_fetch_avxf(tex, index) (kg->tex.fetch_avxf(index))
-#define kernel_tex_fetch_ssef(tex, index) (kg->tex.fetch_ssef(index))
-#define kernel_tex_fetch_ssei(tex, index) (kg->tex.fetch_ssei(index))
-#define kernel_tex_lookup(tex, t, offset, size) (kg->tex.lookup(t, offset, size))
-#define kernel_tex_array(tex) (kg->tex.data)
-
-#define kernel_data (kg->__data)
-
#ifdef __KERNEL_SSE2__
typedef vector3<sseb> sse3b;
typedef vector3<ssef> sse3f;
@@ -152,5 +99,3 @@ typedef vector3<avxf> avx3f;
#endif
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_COMPAT_CPU_H__ */
diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h
new file mode 100644
index 00000000000..98b036e269d
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* On the CPU, we pass the struct KernelGlobals along to nearly everywhere in
+ * the kernel, to access constant data. These are all stored as "textures", but
+ * they are really just standard arrays. We can't actually use globals because
+ * multiple renders may be running inside the same process. */
+
+#ifdef __OSL__
+struct OSLGlobals;
+struct OSLThreadData;
+struct OSLShadingSystem;
+#endif
+
+typedef struct KernelGlobals {
+#define KERNEL_TEX(type, name) texture<type> name;
+#include "kernel/kernel_textures.h"
+
+ KernelData __data;
+
+#ifdef __OSL__
+ /* On the CPU, we also have the OSL globals here. Most data structures are shared
+ * with SVM, the difference is in the shaders and object/mesh attributes. */
+ OSLGlobals *osl;
+ OSLShadingSystem *osl_ss;
+ OSLThreadData *osl_tdata;
+#endif
+
+ /* **** Run-time data **** */
+
+ ProfilingState profiler;
+} KernelGlobals;
+
+/* Abstraction macros */
+#define kernel_tex_fetch(tex, index) (kg->tex.fetch(index))
+#define kernel_tex_array(tex) (kg->tex.data)
+#define kernel_data (kg->__data)
+
+CCL_NAMESPACE_END
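
The KERNEL_TEX line above is the usual X-macro pattern; expanding a sample
entry by hand shows the abstraction (the entry name is illustrative):

/* An entry KERNEL_TEX(float4, __tri_verts) in kernel/kernel_textures.h
 * becomes a member inside KernelGlobals: */
texture<float4> __tri_verts;
/* and kernel_tex_fetch(__tri_verts, i) expands to kg->__tri_verts.fetch(i). */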
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/device/cpu/image.h
index 59b96c86c50..57e81ab186d 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/device/cpu/image.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_CPU_IMAGE_H__
-#define __KERNEL_CPU_IMAGE_H__
+#pragma once
#ifdef WITH_NANOVDB
# define NANOVDB_USE_INTRINSICS
@@ -584,7 +583,7 @@ template<typename T> struct NanoVDBInterpolator {
#undef SET_CUBIC_SPLINE_WEIGHTS
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
{
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
@@ -612,7 +611,7 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
}
}
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
+ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
int id,
float3 P,
InterpolationType interp)
@@ -656,5 +655,3 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
} /* Namespace. */
CCL_NAMESPACE_END
-
-#endif // __KERNEL_CPU_IMAGE_H__
diff --git a/intern/cycles/kernel/kernels/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp
index 8040bfb7b33..ac1cdf5fffe 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -56,9 +56,9 @@
/* do nothing */
#endif
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
CCL_NAMESPACE_BEGIN
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h
index b907c6a2bac..ae2a841835a 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/device/cpu/kernel.h
@@ -14,50 +14,49 @@
* limitations under the License.
*/
-#ifndef __KERNEL_H__
-#define __KERNEL_H__
+#pragma once
/* CPU Kernel Interface */
-#include "kernel/kernel_types.h"
#include "util/util_types.h"
+#include "kernel/kernel_types.h"
+
CCL_NAMESPACE_BEGIN
#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
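/* For example, with KERNEL_ARCH defined to cpu_avx2, the declaration
 * KERNEL_FUNCTION_FULL_NAME(bake) expands via KERNEL_NAME_EVAL(cpu_avx2, bake)
 * and KERNEL_NAME_JOIN to kernel_cpu_avx2_bake. */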
+struct IntegratorStateCPU;
struct KernelGlobals;
struct KernelData;
KernelGlobals *kernel_globals_create();
void kernel_globals_free(KernelGlobals *kg);
-void *kernel_osl_memory(KernelGlobals *kg);
-bool kernel_osl_use(KernelGlobals *kg);
+void *kernel_osl_memory(const KernelGlobals *kg);
+bool kernel_osl_use(const KernelGlobals *kg);
void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size);
void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size);
#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/kernel_cpu.h"
+#include "kernel/device/cpu/kernel_arch.h"
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_H__ */
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h
new file mode 100644
index 00000000000..81f328c710b
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common declaration part of all CPU kernels. */
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#define KERNEL_INTEGRATOR_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state)
+
+#define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ ccl_global float *render_buffer)
+
+#define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer)
+
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera);
+KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake);
+KERNEL_INTEGRATOR_FUNCTION(intersect_closest);
+KERNEL_INTEGRATOR_FUNCTION(intersect_shadow);
+KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface);
+KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_background);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_light);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_shadow);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_surface);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(shade_volume);
+KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel);
+
+#undef KERNEL_INTEGRATOR_FUNCTION
+#undef KERNEL_INTEGRATOR_INIT_FUNCTION
+#undef KERNEL_INTEGRATOR_SHADE_FUNCTION
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset);
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride);
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride);
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride);
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index);
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
+
+#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
new file mode 100644
index 00000000000..1432abfd330
--- /dev/null
+++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Templated common implementation part of all CPU kernels.
+ *
+ * The idea is that each particular .cpp file sets the needed optimization
+ * flags and simply includes this file, without worrying about copying the
+ * actual implementation over.
+ */
+
+#pragma once
+
+// clang-format off
+#include "kernel/device/cpu/compat.h"
+
+#ifndef KERNEL_STUB
+# include "kernel/device/cpu/globals.h"
+# include "kernel/device/cpu/image.h"
+
+# include "kernel/integrator/integrator_state.h"
+# include "kernel/integrator/integrator_state_flow.h"
+# include "kernel/integrator/integrator_state_util.h"
+
+# include "kernel/integrator/integrator_init_from_camera.h"
+# include "kernel/integrator/integrator_init_from_bake.h"
+# include "kernel/integrator/integrator_intersect_closest.h"
+# include "kernel/integrator/integrator_intersect_shadow.h"
+# include "kernel/integrator/integrator_intersect_subsurface.h"
+# include "kernel/integrator/integrator_intersect_volume_stack.h"
+# include "kernel/integrator/integrator_shade_background.h"
+# include "kernel/integrator/integrator_shade_light.h"
+# include "kernel/integrator/integrator_shade_shadow.h"
+# include "kernel/integrator/integrator_shade_surface.h"
+# include "kernel/integrator/integrator_shade_volume.h"
+# include "kernel/integrator/integrator_megakernel.h"
+
+# include "kernel/kernel_film.h"
+# include "kernel/kernel_adaptive_sampling.h"
+# include "kernel/kernel_bake.h"
+# include "kernel/kernel_id_passes.h"
+
+#else
+# define STUB_ASSERT(arch, name) \
+ assert(!(#name " kernel stub for architecture " #arch " was called!"))
+#endif /* KERNEL_STUB */
+// clang-format on
+
+CCL_NAMESPACE_BEGIN
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+#ifdef KERNEL_STUB
+# define KERNEL_INVOKE(name, ...) (STUB_ASSERT(KERNEL_ARCH, name), 0)
+#else
+# define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__)
+#endif
+
+#define DEFINE_INTEGRATOR_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state) \
+ { \
+ KERNEL_INVOKE(name, kg, state); \
+ }
+
+#define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \
+ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \
+ const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \
+ { \
+ KERNEL_INVOKE(name, kg, state, render_buffer); \
+ }
+
+/* TODO: Either use something like get_work_pixel(), or simplify the tile that
+ * is passed here, so that it does not contain unused fields. */
+#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \
+ bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \
+ IntegratorStateCPU *state, \
+ KernelWorkTile *tile, \
+ ccl_global float *render_buffer) \
+ { \
+ return KERNEL_INVOKE( \
+ name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \
+ }
+
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera)
+DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake)
+DEFINE_INTEGRATOR_KERNEL(intersect_closest)
+DEFINE_INTEGRATOR_KERNEL(intersect_shadow)
+DEFINE_INTEGRATOR_KERNEL(intersect_subsurface)
+DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface)
+DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume)
+DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel)
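+
+/* For instance, DEFINE_INTEGRATOR_KERNEL(intersect_closest) compiled with
+ * KERNEL_ARCH set to cpu_avx2 defines:
+ *   void kernel_cpu_avx2_integrator_intersect_closest(const KernelGlobals *kg,
+ *                                                     IntegratorStateCPU *state)
+ *   { integrator_intersect_closest(kg, state); }
+ * or the STUB_ASSERT body when KERNEL_STUB is defined. */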
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_displace);
+#else
+ kernel_displace_evaluate(kg, input, output, offset);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg,
+ const KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, shader_eval_background);
+#else
+ kernel_background_evaluate(kg, input, output, offset);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)(
+ const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_convergence_check);
+ return false;
+#else
+ return kernel_adaptive_sampling_convergence_check(
+ kg, render_buffer, x, y, threshold, reset, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_x);
+#else
+ kernel_adaptive_sampling_filter_x(kg, render_buffer, y, start_x, width, offset, stride);
+#endif
+}
+
+void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, adaptive_sampling_filter_y);
+#else
+ kernel_adaptive_sampling_filter_y(kg, render_buffer, x, start_y, height, offset, stride);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index)
+{
+#ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, cryptomatte_postprocess);
+#else
+ kernel_cryptomatte_post(kg, render_buffer, pixel_index);
+#endif
+}
+
+/* --------------------------------------------------------------------
+ * Bake.
+ */
+/* TODO(sergey): Needs to be re-implemented. Or not? Brecht did it already :) */
+
+void KERNEL_FUNCTION_FULL_NAME(bake)(
+ const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
+{
+#if 0
+# ifdef KERNEL_STUB
+ STUB_ASSERT(KERNEL_ARCH, bake);
+# else
+# ifdef __BAKING__
+ kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
+# endif
+# endif /* KERNEL_STUB */
+#endif
+}
+
+#undef KERNEL_INVOKE
+#undef DEFINE_INTEGRATOR_KERNEL
+#undef DEFINE_INTEGRATOR_SHADE_KERNEL
+#undef DEFINE_INTEGRATOR_INIT_KERNEL
+
+#undef KERNEL_STUB
+#undef STUB_ASSERT
+#undef KERNEL_ARCH
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
index 5f6b6800363..220768036ab 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp
@@ -34,6 +34,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
index 97e8fc25140..90c05113cbe 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
@@ -35,6 +35,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
index 26d7fd4de48..fb85ef5b0d0 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
@@ -29,6 +29,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
index 3f259aa4480..87baf04258a 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp
@@ -31,6 +31,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
index 68bae8c07c6..bb421d58815 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
@@ -32,6 +32,6 @@
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-#include "kernel/kernel.h"
+#include "kernel/device/cpu/kernel.h"
#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
+#include "kernel/device/cpu/kernel_arch_impl.h"
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/device/cuda/compat.h
index ea3b78b7cef..3c85a8e7bd2 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -14,20 +14,15 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COMPAT_CUDA_H__
-#define __KERNEL_COMPAT_CUDA_H__
+#pragma once
#define __KERNEL_GPU__
#define __KERNEL_CUDA__
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
+#ifndef ATTR_FALLTHROUGH
+# define ATTR_FALLTHROUGH
#endif
/* Manual definitions so we can compile without CUDA toolkit. */
@@ -38,8 +33,6 @@ typedef unsigned long long uint64_t;
#else
# include <stdint.h>
#endif
-typedef unsigned short half;
-typedef unsigned long long CUtexObject;
#ifdef CYCLES_CUBIN_CC
# define FLT_MIN 1.175494350822287507969e-38f
@@ -47,14 +40,7 @@ typedef unsigned long long CUtexObject;
# define FLT_EPSILON 1.192092896e-07F
#endif
-__device__ half __float2half(const float f)
-{
- half val;
- asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
- return val;
-}
-
-/* Qualifier wrappers for different names on different devices */
+/* Qualifiers */
#define ccl_device __device__ __inline__
#if __CUDA_ARCH__ < 500
@@ -68,104 +54,61 @@ __device__ half __float2half(const float f)
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
#define ccl_constant const
-#define ccl_local __shared__
-#define ccl_local_param
+#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
#define ccl_restrict __restrict__
#define ccl_loop_no_unroll
-/* TODO(sergey): In theory we might use references with CUDA, however the
- * performance impact is yet to be investigated.
- */
-#define ccl_ref
#define ccl_align(n) __align__(n)
#define ccl_optional_struct_init
-#define ATTR_FALLTHROUGH
-
-#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH)
-
/* No assert supported for CUDA */
#define kernel_assert(cond)
-/* Types */
+/* GPU thread, block, grid size and index */
-#include "util/util_half.h"
-#include "util/util_types.h"
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
-/* Work item functions */
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
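+
+/* E.g. thread 5 of block 2, with a block dimension of 256, gets
+ * global id 2 * 256 + 5 = 517. */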
-ccl_device_inline uint ccl_local_id(uint d)
-{
- switch (d) {
- case 0:
- return threadIdx.x;
- case 1:
- return threadIdx.y;
- case 2:
- return threadIdx.z;
- default:
- return 0;
- }
-}
+/* GPU warp synchronization. */
-#define ccl_global_id(d) (ccl_group_id(d) * ccl_local_size(d) + ccl_local_id(d))
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
+#define ccl_gpu_popc(x) __popc(x)
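+
+/* Typical use, e.g. counting the threads in a warp that pass a predicate:
+ *   const uint mask = ccl_gpu_ballot(value > 0.0f);
+ *   const int count = ccl_gpu_popc(mask);
+ * A warp-wide sum can be built from repeated ccl_gpu_shfl_down_sync() calls
+ * with offsets 16, 8, 4, 2 and 1. */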
-ccl_device_inline uint ccl_local_size(uint d)
-{
- switch (d) {
- case 0:
- return blockDim.x;
- case 1:
- return blockDim.y;
- case 2:
- return blockDim.z;
- default:
- return 0;
- }
-}
+/* GPU texture objects */
-#define ccl_global_size(d) (ccl_num_groups(d) * ccl_local_size(d))
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
-ccl_device_inline uint ccl_group_id(uint d)
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y)
{
- switch (d) {
- case 0:
- return blockIdx.x;
- case 1:
- return blockIdx.y;
- case 2:
- return blockIdx.z;
- default:
- return 0;
- }
+ return tex2D<T>(texobj, x, y);
}
-ccl_device_inline uint ccl_num_groups(uint d)
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y,
+ const float z)
{
- switch (d) {
- case 0:
- return gridDim.x;
- case 1:
- return gridDim.y;
- case 2:
- return gridDim.z;
- default:
- return 0;
- }
+ return tex3D<T>(texobj, x, y, z);
}
-/* Textures */
-
-/* Use arrays for regular data. */
-#define kernel_tex_fetch(t, index) t[(index)]
-#define kernel_tex_array(t) (t)
-
-#define kernel_data __data
-
/* Use fast math functions */
#define cosf(x) __cosf(((float)(x)))
@@ -175,4 +118,18 @@ ccl_device_inline uint ccl_num_groups(uint d)
#define logf(x) __logf(((float)(x)))
#define expf(x) __expf(((float)(x)))
-#endif /* __KERNEL_COMPAT_CUDA_H__ */
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+ half val;
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+ return val;
+}
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
new file mode 100644
index 00000000000..46196dcdb51
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Device data taken from CUDA occupancy calculator.
+ *
+ * Terminology
+ * - CUDA GPUs have multiple streaming multiprocessors
+ * - Each multiprocessor executes multiple thread blocks
+ * - Each thread block contains a number of threads, also known as the block size
+ * - Multiprocessors have a fixed number of registers, and the number of registers
+ * used by each thread limits the number of threads per block.
+ */
+
+/* 3.0 and 3.5 */
+#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 32768
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+# define GPU_KERNEL_MAX_REGISTERS 63
+
+/* 5.x, 6.x */
+#elif __CUDA_ARCH__ <= 699
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 256
+/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
+ * registers */
+# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
+# define GPU_KERNEL_MAX_REGISTERS 64
+# else
+# define GPU_KERNEL_MAX_REGISTERS 48
+# endif
+
+/* 7.x, 8.x */
+#elif __CUDA_ARCH__ <= 899
+# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
+# define GPU_BLOCK_MAX_THREADS 1024
+# define GPU_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define GPU_KERNEL_BLOCK_NUM_THREADS 512
+# define GPU_KERNEL_MAX_REGISTERS 96
+
+/* unknown architecture */
+#else
+# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+ extern "C" __global__ void __launch_bounds__(block_num_threads, \
+ GPU_MULTIPRESSOR_MAX_REGISTERS / \
+ (block_num_threads * thread_num_registers))
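+
+/* Example: with 65536 registers per multiprocessor, 512 threads per block and
+ * 96 registers per thread, this requests at least 65536 / (512 * 96) = 1
+ * resident block per multiprocessor. */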
+
+/* sanity checks */
+
+#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
+# error "Maximum number of threads per block exceeded"
+#endif
+
+#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
+ GPU_MULTIPROCESSOR_MAX_BLOCKS
+# error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h
new file mode 100644
index 00000000000..169047175f5
--- /dev/null
+++ b/intern/cycles/kernel/device/cuda/globals.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used, just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobals {
+ int unused[1];
+};
+
+/* Global scene data and textures */
+__constant__ KernelData __data;
+#define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
+#include "kernel/kernel_textures.h"
+
+/* Integrator state */
+__constant__ IntegratorStateGPU __integrator_state;
+
+/* Abstraction macros */
+#define kernel_data __data
+#define kernel_tex_fetch(t, index) t[(index)]
+#define kernel_tex_array(t) (t)
+#define kernel_integrator_state __integrator_state
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl b/intern/cycles/kernel/device/cuda/kernel.cu
index 84938b889e5..e26fe243642 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_indirect_subsurface.cl
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,11 +14,15 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_indirect_subsurface.h"
+/* CUDA kernel entry points */
-#define KERNEL_NAME indirect_subsurface
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
+#ifdef __CUDA_ARCH__
+# include "kernel/device/cuda/compat.h"
+# include "kernel/device/cuda/config.h"
+# include "kernel/device/cuda/globals.h"
+
+# include "kernel/device/gpu/image.h"
+# include "kernel/device/gpu/kernel.h"
+
+#endif
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/device/gpu/image.h
index 132653fa7ca..b015c78a8f5 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -14,6 +14,10 @@
* limitations under the License.
*/
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
#ifdef WITH_NANOVDB
# define NDEBUG /* Disable "assert" in device code */
# define NANOVDB_USE_INTRINSICS
@@ -61,9 +65,9 @@ ccl_device float cubic_h1(float a)
/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
template<typename T>
-ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
{
- CUtexObject tex = (CUtexObject)info.data;
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
x = (x * info.width) - 0.5f;
y = (y * info.height) - 0.5f;
@@ -81,15 +85,18 @@ ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, f
float y0 = (py + cubic_h0(fy) + 0.5f) / info.height;
float y1 = (py + cubic_h1(fy) + 0.5f) / info.height;
- return cubic_g0(fy) * (g0x * tex2D<T>(tex, x0, y0) + g1x * tex2D<T>(tex, x1, y0)) +
- cubic_g1(fy) * (g0x * tex2D<T>(tex, x0, y1) + g1x * tex2D<T>(tex, x1, y1));
+ return cubic_g0(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y0) +
+ g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y0)) +
+ cubic_g1(fy) * (g0x * ccl_gpu_tex_object_read_2D<T>(tex, x0, y1) +
+ g1x * ccl_gpu_tex_object_read_2D<T>(tex, x1, y1));
}
/* Fast tricubic texture lookup using 8 trilinear lookups. */
template<typename T>
-ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+ccl_device_noinline T
+kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
{
- CUtexObject tex = (CUtexObject)info.data;
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
x = (x * info.width) - 0.5f;
y = (y * info.height) - 0.5f;
@@ -117,10 +124,14 @@ ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x,
float z0 = (pz + cubic_h0(fz) + 0.5f) / info.depth;
float z1 = (pz + cubic_h1(fz) + 0.5f) / info.depth;
- return g0z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z0) + g1x * tex3D<T>(tex, x1, y0, z0)) +
- g1y * (g0x * tex3D<T>(tex, x0, y1, z0) + g1x * tex3D<T>(tex, x1, y1, z0))) +
- g1z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z1) + g1x * tex3D<T>(tex, x1, y0, z1)) +
- g1y * (g0x * tex3D<T>(tex, x0, y1, z1) + g1x * tex3D<T>(tex, x1, y1, z1)));
+ return g0z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z0) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z0)) +
+ g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z0) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z0))) +
+ g1z * (g0y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y0, z1) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y0, z1)) +
+ g1y * (g0x * ccl_gpu_tex_object_read_3D<T>(tex, x0, y1, z1) +
+ g1x * ccl_gpu_tex_object_read_3D<T>(tex, x1, y1, z1)));
}
#ifdef WITH_NANOVDB
@@ -157,7 +168,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl
}
template<typename T>
-ccl_device_inline T kernel_tex_image_interp_nanovdb(
+ccl_device_noinline T kernel_tex_image_interp_nanovdb(
const TextureInfo &info, float x, float y, float z, uint interpolation)
{
using namespace nanovdb;
@@ -178,7 +189,7 @@ ccl_device_inline T kernel_tex_image_interp_nanovdb(
}
#endif
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
+ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
{
const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
@@ -190,8 +201,8 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
return kernel_tex_image_interp_bicubic<float4>(info, x, y);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- return tex2D<float4>(tex, x, y);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ return ccl_gpu_tex_object_read_2D<float4>(tex, x, y);
}
}
/* float, byte and half */
@@ -202,15 +213,15 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, fl
f = kernel_tex_image_interp_bicubic<float>(info, x, y);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- f = tex2D<float>(tex, x, y);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ f = ccl_gpu_tex_object_read_2D<float>(tex, x, y);
}
return make_float4(f, f, f, 1.0f);
}
}
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
+ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
int id,
float3 P,
InterpolationType interp)
@@ -245,8 +256,8 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
return kernel_tex_image_interp_tricubic<float4>(info, x, y, z);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- return tex3D<float4>(tex, x, y, z);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ return ccl_gpu_tex_object_read_3D<float4>(tex, x, y, z);
}
}
else {
@@ -256,10 +267,12 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg,
f = kernel_tex_image_interp_tricubic<float>(info, x, y, z);
}
else {
- CUtexObject tex = (CUtexObject)info.data;
- f = tex3D<float>(tex, x, y, z);
+ ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
+ f = ccl_gpu_tex_object_read_3D<float>(tex, x, y, z);
}
return make_float4(f, f, f, 1.0f);
}
}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
new file mode 100644
index 00000000000..7b79c0aedfa
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -0,0 +1,843 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Common GPU kernels. */
+
+#include "kernel/device/gpu/parallel_active_index.h"
+#include "kernel/device/gpu/parallel_prefix_sum.h"
+#include "kernel/device/gpu/parallel_sorted_index.h"
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+#include "kernel/integrator/integrator_state_util.h"
+
+#include "kernel/integrator/integrator_init_from_bake.h"
+#include "kernel/integrator/integrator_init_from_camera.h"
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+#include "kernel/integrator/integrator_shade_background.h"
+#include "kernel/integrator/integrator_shade_light.h"
+#include "kernel/integrator/integrator_shade_shadow.h"
+#include "kernel/integrator/integrator_shade_surface.h"
+#include "kernel/integrator/integrator_shade_volume.h"
+
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_bake.h"
+#include "kernel/kernel_film.h"
+#include "kernel/kernel_work_stealing.h"
+
+/* --------------------------------------------------------------------
+ * Integrator.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_reset(int num_states)
+{
+ const int state = ccl_gpu_global_id_x();
+
+ if (state < num_states) {
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+ }
+}
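+
+/* Each thread handles one path state, so a host-side launch (illustrative;
+ * Cycles drives this through its device layer rather than the <<<>>> syntax)
+ * would be sized as:
+ *   const int threads = GPU_KERNEL_BLOCK_NUM_THREADS;
+ *   const int blocks = (num_states + threads - 1) / threads;
+ *   kernel_gpu_integrator_reset<<<blocks, threads>>>(num_states);
+ */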
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
+ const int num_tiles,
+ float *render_buffer,
+ const int max_tile_work_size)
+{
+ const int work_index = ccl_gpu_global_id_x();
+
+ if (work_index >= max_tile_work_size * num_tiles) {
+ return;
+ }
+
+ const int tile_index = work_index / max_tile_work_size;
+ const int tile_work_index = work_index - tile_index * max_tile_work_size;
+
+ const KernelWorkTile *tile = &tiles[tile_index];
+
+ if (tile_work_index >= tile->work_size) {
+ return;
+ }
+
+ const int state = tile->path_index_offset + tile_work_index;
+
+ uint x, y, sample;
+ get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+
+ integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles,
+ const int num_tiles,
+ float *render_buffer,
+ const int max_tile_work_size)
+{
+ const int work_index = ccl_gpu_global_id_x();
+
+ if (work_index >= max_tile_work_size * num_tiles) {
+ return;
+ }
+
+ const int tile_index = work_index / max_tile_work_size;
+ const int tile_work_index = work_index - tile_index * max_tile_work_size;
+
+ const KernelWorkTile *tile = &tiles[tile_index];
+
+ if (tile_work_index >= tile->work_size) {
+ return;
+ }
+
+ const int state = tile->path_index_offset + tile_work_index;
+
+ uint x, y, sample;
+ get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+
+ integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_closest(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_shadow(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_subsurface(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_intersect_volume_stack(NULL, state);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_background(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_background(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_light(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_light(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_shadow(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_shadow(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_surface(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_surface(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_surface_raytrace(NULL, state, render_buffer);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shade_volume(const int *path_index_array,
+ float *render_buffer,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+ integrator_shade_volume(NULL, state, render_buffer);
+ }
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_queued_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int kernel)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [kernel](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == kernel);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_queued_shadow_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int kernel)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [kernel](const int state) {
+ return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) != 0) ||
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_terminated_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int indices_offset)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices + indices_offset, num_indices, [](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == 0) &&
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0);
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_sorted_paths_array(
+ int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel)
+{
+ gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) {
+ return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ?
+ INTEGRATOR_STATE(path, shader_sort_key) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_paths_array(int num_states,
+ int *indices,
+ int *num_indices,
+ int num_active_paths)
+{
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices, num_indices, [num_active_paths](const int state) {
+ return (state >= num_active_paths) &&
+ ((INTEGRATOR_STATE(path, queued_kernel) != 0) ||
+ (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0));
+ });
+}
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_integrator_compact_states(const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
+{
+ const int global_index = ccl_gpu_global_id_x();
+
+ if (global_index < work_size) {
+ const int from_state = active_terminated_states[active_states_offset + global_index];
+ const int to_state = active_terminated_states[terminated_states_offset + global_index];
+
+ integrator_state_move(to_state, from_state);
+ }
+}
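+
+/* Host-side sketch of how the two kernels above combine into one compaction
+ * step (runtime-API launch syntax, grid sizes and argument wiring are
+ * assumptions for illustration; the actual launches go through the device
+ * abstraction):
+ *
+ *   // Gather indices of active states located at or above num_active_paths.
+ *   kernel_gpu_integrator_compact_paths_array<<<nblocks, 512, shared_bytes>>>(
+ *       num_states, indices, num_indices, num_active_paths);
+ *   // Move each of them into a terminated (free) slot below num_active_paths.
+ *   kernel_gpu_integrator_compact_states<<<nblocks2, 512>>>(
+ *       indices, active_states_offset, terminated_states_offset, work_size);
+ */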
+
+extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
+ kernel_gpu_prefix_sum(int *values, int num_values)
+{
+ gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values);
+}
+
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride,
+ uint *num_active_pixels)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / sw;
+ const int x = work_index - y * sw;
+
+ bool converged = true;
+
+ if (x < sw && y < sh) {
+ converged = kernel_adaptive_sampling_convergence_check(
+ nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride);
+ }
+
+ /* NOTE: All threads specified in the mask must execute the intrinsic. */
+ const uint num_active_pixels_mask = ccl_gpu_ballot(!converged);
+ const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
+ if (lane_id == 0) {
+ atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
+ }
+}
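+
+/* Worked example of the ballot/popcount idiom above, assuming a 32-wide warp:
+ * if 5 of the 32 lanes report !converged, ccl_gpu_ballot() returns a mask with
+ * exactly 5 bits set, __popc() reduces that mask to 5, and lane 0 issues a
+ * single atomic add of 5, i.e. one atomic per warp instead of one per thread. */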
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_filter_x(
+ float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ const int y = ccl_gpu_global_id_x();
+
+ if (y < sh) {
+ kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride);
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_adaptive_sampling_filter_y(
+ float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ const int x = ccl_gpu_global_id_x();
+
+ if (x < sw) {
+ kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Cryptomatte.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels)
+{
+ const int pixel_index = ccl_gpu_global_id_x();
+
+ if (pixel_index < num_pixels) {
+ kernel_cryptomatte_post(nullptr, render_buffer, pixel_index);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Film.
+ */
+
+/* Common implementation for float destination. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert,
+ float *pixels,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int dst_offset,
+ int dst_stride,
+ const Processor &processor)
+{
+ const int render_pixel_index = ccl_gpu_global_id_x();
+ if (render_pixel_index >= num_pixels) {
+ return;
+ }
+
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
+ ccl_global const float *buffer = render_buffer + render_buffer_offset;
+ ccl_global float *pixel = pixels +
+ (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride;
+
+ processor(kfilm_convert, buffer, pixel);
+}
+
+/* Common implementation for half4 destination and 4-channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ const int render_pixel_index = ccl_gpu_global_id_x();
+ if (render_pixel_index >= num_pixels) {
+ return;
+ }
+
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
+ ccl_global const float *buffer = render_buffer + render_buffer_offset;
+
+ float pixel[4];
+ processor(kfilm_convert, buffer, pixel);
+
+ film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
+
+ const int x = render_pixel_index % width;
+ const int y = render_pixel_index / width;
+
+ ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
+ float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
+}
+
+/* Common implementation for half4 destination and 3-channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ kernel_gpu_film_convert_half_rgba_common_rgba(
+ kfilm_convert,
+ rgba,
+ render_buffer,
+ num_pixels,
+ width,
+ offset,
+ stride,
+ rgba_offset,
+ rgba_stride,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ processor(kfilm_convert, buffer, pixel_rgba);
+ pixel_rgba[3] = 1.0f;
+ });
+}
+
+/* Common implementation for half4 destination and single channel input pass. */
+template<typename Processor>
+ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value(
+ const KernelFilmConvert *kfilm_convert,
+ uchar4 *rgba,
+ float *render_buffer,
+ int num_pixels,
+ int width,
+ int offset,
+ int stride,
+ int rgba_offset,
+ int rgba_stride,
+ const Processor &processor)
+{
+ kernel_gpu_film_convert_half_rgba_common_rgba(
+ kfilm_convert,
+ rgba,
+ render_buffer,
+ num_pixels,
+ width,
+ offset,
+ stride,
+ rgba_offset,
+ rgba_stride,
+ [&processor](const KernelFilmConvert *kfilm_convert,
+ ccl_global const float *buffer,
+ float *pixel_rgba) {
+ float value;
+ processor(kfilm_convert, buffer, &value);
+
+ pixel_rgba[0] = value;
+ pixel_rgba[1] = value;
+ pixel_rgba[2] = value;
+ pixel_rgba[3] = 1.0f;
+ });
+}
+
+#define KERNEL_FILM_CONVERT_PROC(name) \
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name
+
+#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \
+ KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \
+ (const KernelFilmConvert kfilm_convert, \
+ float *pixels, \
+ float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
+ { \
+ kernel_gpu_film_convert_common(&kfilm_convert, \
+ pixels, \
+ render_buffer, \
+ num_pixels, \
+ width, \
+ offset, \
+ stride, \
+ rgba_offset, \
+ rgba_stride, \
+ film_get_pass_pixel_##variant); \
+ } \
+ KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \
+ (const KernelFilmConvert kfilm_convert, \
+ uchar4 *rgba, \
+ float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
+ { \
+ kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \
+ rgba, \
+ render_buffer, \
+ num_pixels, \
+ width, \
+ offset, \
+ stride, \
+ rgba_offset, \
+ rgba_stride, \
+ film_get_pass_pixel_##variant); \
+ }
+
+KERNEL_FILM_CONVERT_DEFINE(depth, value)
+KERNEL_FILM_CONVERT_DEFINE(mist, value)
+KERNEL_FILM_CONVERT_DEFINE(sample_count, value)
+KERNEL_FILM_CONVERT_DEFINE(float, value)
+
+KERNEL_FILM_CONVERT_DEFINE(light_path, rgb)
+KERNEL_FILM_CONVERT_DEFINE(float3, rgb)
+
+KERNEL_FILM_CONVERT_DEFINE(motion, rgba)
+KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba)
+KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba)
+KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba)
+KERNEL_FILM_CONVERT_DEFINE(combined, rgba)
+KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
+
+#undef KERNEL_FILM_CONVERT_DEFINE
+#undef KERNEL_FILM_CONVERT_PROC
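+
+/* Hand expansion of KERNEL_FILM_CONVERT_DEFINE(depth, value), as a sketch of
+ * what the preprocessor generates (parameter lists elided for brevity):
+ *
+ *   ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ *       kernel_gpu_film_convert_depth(const KernelFilmConvert kfilm_convert, ...)
+ *   {
+ *     kernel_gpu_film_convert_common(&kfilm_convert, ..., film_get_pass_pixel_depth);
+ *   }
+ *
+ * plus a matching kernel_gpu_film_convert_depth_half_rgba kernel that routes
+ * through kernel_gpu_film_convert_half_rgba_common_value. */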
+
+/* --------------------------------------------------------------------
+ * Shader evaluation.
+ */
+
+/* Displacement */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ kernel_displace_evaluate(NULL, input, output, offset + i);
+ }
+}
+
+/* Background Shader Evaluation */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
+ float4 *output,
+ const int offset,
+ const int work_size)
+{
+ int i = ccl_gpu_global_id_x();
+ if (i < work_size) {
+ kernel_background_evaluate(NULL, input, output, offset + i);
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Denoising.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_color_preprocess(float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int pass_denoised)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+ float *buffer = render_buffer + render_pixel_index * pass_stride;
+
+ float *color_out = buffer + pass_denoised;
+ color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
+ color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
+ color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_guiding_preprocess(float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int guiding_pass_normal,
+ const float *render_buffer,
+ int render_offset,
+ int render_stride,
+ int render_pass_stride,
+ int render_pass_sample_count,
+ int render_pass_denoising_albedo,
+ int render_pass_denoising_normal,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int num_samples)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t guiding_pixel_index = x + y * width;
+ float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+
+ const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
+ const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+
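+  /* The sample-count pass stores an integer count bit-cast into the float
+   * buffer, so it is read back with __float_as_uint() rather than converted
+   * from float (inferred from how it is read here and in the postprocess). */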
+ float pixel_scale;
+ if (render_pass_sample_count == PASS_UNUSED) {
+ pixel_scale = 1.0f / num_samples;
+ }
+ else {
+ pixel_scale = 1.0f / __float_as_uint(buffer[render_pass_sample_count]);
+ }
+
+ /* Albedo pass. */
+ if (guiding_pass_albedo != PASS_UNUSED) {
+ kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
+
+    const float *albedo_in = buffer + render_pass_denoising_albedo;
+    float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+    albedo_out[0] = albedo_in[0] * pixel_scale;
+    albedo_out[1] = albedo_in[1] * pixel_scale;
+    albedo_out[2] = albedo_in[2] * pixel_scale;
+ }
+
+ /* Normal pass. */
+  if (guiding_pass_normal != PASS_UNUSED) {
+ kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
+
+ const float *normal_in = buffer + render_pass_denoising_normal;
+ float *normal_out = guiding_pixel + guiding_pass_normal;
+
+ normal_out[0] = normal_in[0] * pixel_scale;
+ normal_out[1] = normal_in[1] * pixel_scale;
+ normal_out[2] = normal_in[2] * pixel_scale;
+ }
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int width,
+ int height)
+{
+ kernel_assert(guiding_pass_albedo != PASS_UNUSED);
+
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t guiding_pixel_index = x + y * width;
+ float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+
+ float *albedo_out = guiding_pixel + guiding_pass_albedo;
+
+ albedo_out[0] = 0.5f;
+ albedo_out[1] = 0.5f;
+ albedo_out[2] = 0.5f;
+}
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_filter_color_postprocess(float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int num_samples,
+ int pass_noisy,
+ int pass_denoised,
+ int pass_sample_count,
+ int num_components,
+ bool use_compositing)
+{
+ const int work_index = ccl_gpu_global_id_x();
+ const int y = work_index / width;
+ const int x = work_index - y * width;
+
+ if (x >= width || y >= height) {
+ return;
+ }
+
+ const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
+ float *buffer = render_buffer + render_pixel_index * pass_stride;
+
+ float pixel_scale;
+ if (pass_sample_count == PASS_UNUSED) {
+ pixel_scale = num_samples;
+ }
+ else {
+ pixel_scale = __float_as_uint(buffer[pass_sample_count]);
+ }
+
+ float *denoised_pixel = buffer + pass_denoised;
+
+ denoised_pixel[0] *= pixel_scale;
+ denoised_pixel[1] *= pixel_scale;
+ denoised_pixel[2] *= pixel_scale;
+
+ if (num_components == 3) {
+ /* Pass without alpha channel. */
+ }
+ else if (!use_compositing) {
+    /* Currently compositing passes are either 3-component (derived by dividing light passes)
+     * or do not have transparency (shadow catcher). Implicitly rely on this, as it
+     * simplifies the code and avoids extra memory allocation. */
+ const float *noisy_pixel = buffer + pass_noisy;
+ denoised_pixel[3] = noisy_pixel[3];
+ }
+ else {
+    /* Assign zero since this is the default alpha value for 3-component passes, and it
+     * is an opaque pixel for 4-component passes. */
+
+ denoised_pixel[3] = 0;
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
+ kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states,
+ uint *num_possible_splits)
+{
+ const int state = ccl_gpu_global_id_x();
+
+ bool can_split = false;
+
+ if (state < num_states) {
+ can_split = kernel_shadow_catcher_path_can_split(nullptr, state);
+ }
+
+ /* NOTE: All threads specified in the mask must execute the intrinsic. */
+ const uint can_split_mask = ccl_gpu_ballot(can_split);
+ const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
+ if (lane_id == 0) {
+ atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask));
+ }
+}
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
new file mode 100644
index 00000000000..85500bf4d07
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Given an array of states, build an array of indices for which the states
+ * are active.
+ *
+ * Shared memory requirement is sizeof(int) * (number_of_warps + 1). */
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize, typename IsActiveOp>
+__device__ void gpu_parallel_active_index_array(const uint num_states,
+ int *indices,
+ int *num_indices,
+ IsActiveOp is_active_op)
+{
+ extern ccl_gpu_shared int warp_offset[];
+
+ const uint thread_index = ccl_gpu_thread_idx_x;
+ const uint thread_warp = thread_index % ccl_gpu_warp_size;
+
+ const uint warp_index = thread_index / ccl_gpu_warp_size;
+ const uint num_warps = blocksize / ccl_gpu_warp_size;
+
+ /* Test if state corresponding to this thread is active. */
+ const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
+ const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
+ }
+
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
+ }
+
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
+
+ ccl_gpu_syncthreads();
+
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ }
+}
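+
+/* Launch sketch (an assumption about the host side, for illustration): the
+ * dynamic shared memory must cover one int per warp plus one block offset, as
+ * noted above. With a 32-wide warp:
+ *
+ *   const uint block_size = GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE;
+ *   const size_t shared_bytes = sizeof(int) * (block_size / 32 + 1);
+ *   some_caller_kernel<<<num_blocks, block_size, shared_bytes>>>(...);
+ *
+ * where some_caller_kernel is any kernel wrapping this device function. */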
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
new file mode 100644
index 00000000000..f609520b8b4
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Parallel prefix sum.
+ *
+ * TODO: actually make this work in parallel.
+ *
+ * This is used for an array the size of the number of shaders in the scene,
+ * which is usually not huge, so this might not be a significant bottleneck. */
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
+{
+ if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+ return;
+ }
+
+ int offset = 0;
+ for (int i = 0; i < num_values; i++) {
+ const int new_offset = offset + values[i];
+ values[i] = offset;
+ offset = new_offset;
+ }
+}
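+
+/* Worked example: values = {3, 1, 2} is rewritten in place to the exclusive
+ * prefix sum {0, 3, 4}; each output element is the sum of all inputs strictly
+ * before it, and the final running total (6) is discarded. */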
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h
new file mode 100644
index 00000000000..65b1990dbb8
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Parallel sum of array input_data with size n into output_sum.
+ *
+ * Adapted from "Optimizing Parallel Reduction in GPU", Mark Harris.
+ *
+ * This version adds multiple elements per thread sequentially. This reduces
+ * the overall cost of the algorithm while keeping the work complexity O(n) and
+ * the step complexity O(log n). (Brent's Theorem optimization) */
+
+#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
+
+template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp>
+__device__ void gpu_parallel_sum(
+ const InputT *input_data, const uint n, OutputT *output_sum, OutputT zero, ConvertOp convert)
+{
+ extern ccl_gpu_shared OutputT shared_data[];
+
+ const uint tid = ccl_gpu_thread_idx_x;
+ const uint gridsize = blocksize * ccl_gpu_grid_dim_x();
+
+ OutputT sum = zero;
+ for (uint i = ccl_gpu_block_idx_x * blocksize + tid; i < n; i += gridsize) {
+ sum += convert(input_data[i]);
+ }
+ shared_data[tid] = sum;
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 512 && tid < 256) {
+ shared_data[tid] = sum = sum + shared_data[tid + 256];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 256 && tid < 128) {
+ shared_data[tid] = sum = sum + shared_data[tid + 128];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 128 && tid < 64) {
+ shared_data[tid] = sum = sum + shared_data[tid + 64];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (blocksize >= 64 && tid < 32) {
+ shared_data[tid] = sum = sum + shared_data[tid + 32];
+ }
+
+ ccl_gpu_syncthreads();
+
+ if (tid < 32) {
+ for (int offset = ccl_gpu_warp_size / 2; offset > 0; offset /= 2) {
+      sum += ccl_gpu_shfl_down_sync(0xFFFFFFFF, sum, offset);
+ }
+ }
+
+ if (tid == 0) {
+ output_sum[ccl_gpu_block_idx_x] = sum;
+ }
+}
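+
+/* Usage notes (assumptions, not spelled out in this header): the caller must
+ * provide sizeof(OutputT) * blocksize bytes of dynamic shared memory for
+ * shared_data[], and the function leaves one partial sum per block in
+ * output_sum[], so a second reduction pass over those per-block values (or a
+ * host-side sum) is still needed to obtain the final total. */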
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
new file mode 100644
index 00000000000..99b35468517
--- /dev/null
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Given an array of states, build an array of indices for which the states
+ * are active and sorted by a given key. The prefix sum of the number of active
+ * states per key must have already been computed.
+ *
+ * TODO: there may be ways to optimize this to avoid this many atomic ops? */
+
+#include "util/util_atomic.h"
+
+#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
+
+template<uint blocksize, typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+ int *indices,
+ int *num_indices,
+ int *key_prefix_sum,
+ GetKeyOp get_key_op)
+{
+ const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
+ const int key = (state_index < num_states) ? get_key_op(state_index) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
+
+ if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) {
+ const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1);
+ indices[index] = state_index;
+ }
+}
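+
+/* Worked example (illustrative values): for four states with keys {1, 0, 1, 0}
+ * and key_prefix_sum = {0, 2} (the exclusive prefix sum of two states per key),
+ * the atomic adds scatter states 1 and 3 into indices[0..1] and states 0 and 2
+ * into indices[2..3], i.e. states end up grouped by key; ordering within a key
+ * depends on the order the atomics land and is not deterministic. */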
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_compat_optix.h b/intern/cycles/kernel/device/optix/compat.h
index 064c99ca100..fb9e094b535 100644
--- a/intern/cycles/kernel/kernel_compat_optix.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -15,14 +15,13 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COMPAT_OPTIX_H__
-#define __KERNEL_COMPAT_OPTIX_H__
+#pragma once
#define OPTIX_DONT_INCLUDE_CUDA
#include <optix.h>
#define __KERNEL_GPU__
-#define __KERNEL_CUDA__ // OptiX kernels are implicitly CUDA kernels too
+#define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */
#define __KERNEL_OPTIX__
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END
@@ -31,14 +30,14 @@
# define ATTR_FALLTHROUGH
#endif
+/* Manual definitions so we can compile without CUDA toolkit. */
+
#ifdef __CUDACC_RTC__
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#else
# include <stdint.h>
#endif
-typedef unsigned short half;
-typedef unsigned long long CUtexObject;
#ifdef CYCLES_CUBIN_CC
# define FLT_MIN 1.175494350822287507969e-38f
@@ -46,21 +45,6 @@ typedef unsigned long long CUtexObject;
# define FLT_EPSILON 1.192092896e-07F
#endif
-__device__ half __float2half(const float f)
-{
- half val;
- asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
- return val;
-}
-
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
-#endif
-
#define ccl_device \
__device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything
#define ccl_device_inline ccl_device
@@ -69,29 +53,75 @@ __device__ half __float2half(const float f)
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
#define ccl_constant const
-#define ccl_local
-#define ccl_local_param
+#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
#define ccl_addr_space
-#define ccl_loop_no_unroll
#define ccl_restrict __restrict__
-#define ccl_ref
+#define ccl_loop_no_unroll
#define ccl_align(n) __align__(n)
-// Zero initialize structs to help the compiler figure out scoping
+/* Zero initialize structs to help the compiler figure out scoping */
#define ccl_optional_struct_init = {}
-#define kernel_data __params.data // See kernel_globals.h
-#define kernel_tex_array(t) __params.t
-#define kernel_tex_fetch(t, index) __params.t[(index)]
+/* No assert supported for CUDA */
#define kernel_assert(cond)
+/* GPU thread, block, grid size and index */
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
+
+/* GPU warp synchronization. */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+
+typedef unsigned long long CUtexObject;
+typedef CUtexObject ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y)
+{
+ return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+ const float x,
+ const float y,
+ const float z)
+{
+ return tex3D<T>(texobj, x, y, z);
+}
+
+/* Half */
+
+typedef unsigned short half;
+
+__device__ half __float2half(const float f)
+{
+ half val;
+ asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
+ return val;
+}
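+
+/* The inline PTX above is a float-to-half conversion with round-to-nearest-
+ * even (cvt.rn.f16.f32); for example __float2half(1.0f) yields the IEEE
+ * half-precision bit pattern 0x3C00. */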
+
/* Types */
#include "util/util_half.h"
#include "util/util_types.h"
-
-#endif /* __KERNEL_COMPAT_OPTIX_H__ */
diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h
new file mode 100644
index 00000000000..7d898ed5d91
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/globals.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used, just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobals {
+ int unused[1];
+};
+
+/* Launch parameters */
+struct KernelParamsOptiX {
+ /* Kernel arguments */
+ const int *path_index_array;
+ float *render_buffer;
+
+ /* Global scene data and textures */
+ KernelData data;
+#define KERNEL_TEX(type, name) const type *name;
+#include "kernel/kernel_textures.h"
+
+ /* Integrator state */
+ IntegratorStateGPU __integrator_state;
+};
+
+#ifdef __NVCC__
+extern "C" static __constant__ KernelParamsOptiX __params;
+#endif
+
+/* Abstraction macros */
+#define kernel_data __params.data
+#define kernel_tex_array(t) __params.t
+#define kernel_tex_fetch(t, index) __params.t[(index)]
+#define kernel_integrator_state __params.__integrator_state
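+
+/* Expansion sketch of the abstraction macros above: a lookup such as
+ * kernel_tex_fetch(__prim_type, i) becomes __params.__prim_type[(i)], so all
+ * scene data is reached through the single KernelParamsOptiX launch-parameter
+ * block instead of a KernelGlobals pointer. */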
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/optix/kernel_optix.cu b/intern/cycles/kernel/device/optix/kernel.cu
index 7f609eab474..c1e36febfc0 100644
--- a/intern/cycles/kernel/kernels/optix/kernel_optix.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -16,14 +16,20 @@
*/
// clang-format off
-#include "kernel/kernel_compat_optix.h"
-#include "util/util_atomic.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "../cuda/kernel_cuda_image.h" // Texture lookup uses normal CUDA intrinsics
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_bake.h"
+#include "kernel/device/optix/compat.h"
+#include "kernel/device/optix/globals.h"
+
+#include "kernel/device/gpu/image.h" // Texture lookup uses normal CUDA intrinsics
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+#include "kernel/integrator/integrator_state_util.h"
+
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+
// clang-format on
template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
@@ -53,52 +59,36 @@ template<bool always = false> ccl_device_forceinline uint get_object_id()
return OBJECT_NONE;
}
-extern "C" __global__ void __raygen__kernel_optix_path_trace()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
{
- KernelGlobals kg; // Allocate stack storage for common data
-
- const uint3 launch_index = optixGetLaunchIndex();
- // Keep threads for same pixel together to improve occupancy of warps
- uint pixel_offset = launch_index.x / __params.tile.num_samples;
- uint sample_offset = launch_index.x % __params.tile.num_samples;
-
- kernel_path_trace(&kg,
- __params.tile.buffer,
- __params.tile.start_sample + sample_offset,
- __params.tile.x + pixel_offset,
- __params.tile.y + launch_index.y,
- __params.tile.offset,
- __params.tile.stride);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_closest(nullptr, path_index);
}
-#ifdef __BAKING__
-extern "C" __global__ void __raygen__kernel_optix_bake()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow()
{
- KernelGlobals kg;
- const ShaderParams &p = __params.shader;
- kernel_bake_evaluate(&kg,
- p.input,
- p.output,
- (ShaderEvalType)p.type,
- p.filter,
- p.sx + optixGetLaunchIndex().x,
- p.offset,
- p.sample);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_shadow(nullptr, path_index);
}
-#endif
-extern "C" __global__ void __raygen__kernel_optix_displace()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_subsurface()
{
- KernelGlobals kg;
- const ShaderParams &p = __params.shader;
- kernel_displace_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_subsurface(nullptr, path_index);
}
-extern "C" __global__ void __raygen__kernel_optix_background()
+extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_stack()
{
- KernelGlobals kg;
- const ShaderParams &p = __params.shader;
- kernel_background_evaluate(&kg, p.input, p.output, p.sx + optixGetLaunchIndex().x);
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_intersect_volume_stack(nullptr, path_index);
}
extern "C" __global__ void __miss__kernel_optix_miss()
@@ -179,54 +169,91 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
{
#ifdef __SHADOW_RECORD_ALL__
+ bool ignore_intersection = false;
+
const uint prim = optixGetPrimitiveIndex();
# ifdef __VISIBILITY_FLAG__
const uint visibility = optixGetPayload_4();
if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
- return optixIgnoreIntersection();
+ ignore_intersection = true;
}
# endif
- // Offset into array with num_hits
- Intersection *const isect = get_payload_ptr_0<Intersection>() + optixGetPayload_2();
- isect->t = optixGetRayTmax();
- isect->prim = prim;
- isect->object = get_object_id();
- isect->type = kernel_tex_fetch(__prim_type, prim);
-
+ float u = 0.0f, v = 0.0f;
if (optixIsTriangleHit()) {
const float2 barycentrics = optixGetTriangleBarycentrics();
- isect->u = 1.0f - barycentrics.y - barycentrics.x;
- isect->v = barycentrics.x;
+ u = 1.0f - barycentrics.y - barycentrics.x;
+ v = barycentrics.x;
}
# ifdef __HAIR__
else {
- const float u = __uint_as_float(optixGetAttribute_0());
- isect->u = u;
- isect->v = __uint_as_float(optixGetAttribute_1());
+ u = __uint_as_float(optixGetAttribute_0());
+ v = __uint_as_float(optixGetAttribute_1());
// Filter out curve endcaps
if (u == 0.0f || u == 1.0f) {
- return optixIgnoreIntersection();
+ ignore_intersection = true;
}
}
# endif
+ int num_hits = optixGetPayload_2();
+ int record_index = num_hits;
+ const int max_hits = optixGetPayload_3();
+
+ if (!ignore_intersection) {
+ optixSetPayload_2(num_hits + 1);
+ }
+
+ Intersection *const isect_array = get_payload_ptr_0<Intersection>();
+
# ifdef __TRANSPARENT_SHADOWS__
- // Detect if this surface has a shader with transparent shadows
- if (!shader_transparent_shadow(NULL, isect) || optixGetPayload_2() >= optixGetPayload_3()) {
+ if (num_hits >= max_hits) {
+ /* If maximum number of hits reached, find a hit to replace. */
+ const int num_recorded_hits = min(max_hits, num_hits);
+ float max_recorded_t = isect_array[0].t;
+ int max_recorded_hit = 0;
+
+ for (int i = 1; i < num_recorded_hits; i++) {
+ if (isect_array[i].t > max_recorded_t) {
+ max_recorded_t = isect_array[i].t;
+ max_recorded_hit = i;
+ }
+ }
+
+ if (optixGetRayTmax() >= max_recorded_t) {
+      /* Accept the hit, so that OptiX won't consider any further hits beyond the
+       * distance of the current hit. */
+ return;
+ }
+
+ record_index = max_recorded_hit;
+ }
# endif
- // This is an opaque hit or the hit limit has been reached, abort traversal
- optixSetPayload_5(true);
- return optixTerminateRay();
+
+ if (!ignore_intersection) {
+ Intersection *const isect = isect_array + record_index;
+ isect->u = u;
+ isect->v = v;
+ isect->t = optixGetRayTmax();
+ isect->prim = prim;
+ isect->object = get_object_id();
+ isect->type = kernel_tex_fetch(__prim_type, prim);
+
+# ifdef __TRANSPARENT_SHADOWS__
+ // Detect if this surface has a shader with transparent shadows
+ if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) {
+# endif
+ // If no transparent shadows, all light is blocked and we can stop immediately
+ optixSetPayload_5(true);
+ return optixTerminateRay();
# ifdef __TRANSPARENT_SHADOWS__
+ }
+# endif
}
- optixSetPayload_2(optixGetPayload_2() + 1); // num_hits++
-
// Continue tracing
optixIgnoreIntersection();
-# endif
#endif
}
@@ -300,7 +327,7 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
if (isect.t != FLT_MAX)
isect.t *= len;
- if (curve_intersect(NULL, &isect, P, dir, visibility, object, prim, time, type)) {
+ if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) {
optixReportIntersection(isect.t / len,
type & PRIMITIVE_ALL,
__float_as_int(isect.u), // Attribute_0
@@ -317,11 +344,4 @@ extern "C" __global__ void __intersection__curve_ribbon()
optix_intersection_curve(prim, type);
}
}
-
-extern "C" __global__ void __intersection__curve_all()
-{
- const uint prim = optixGetPrimitiveIndex();
- const uint type = kernel_tex_fetch(__prim_type, prim);
- optix_intersection_curve(prim, type);
-}
#endif
diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
new file mode 100644
index 00000000000..bf787e29eaa
--- /dev/null
+++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2021, Blender Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Copy of the regular kernels with an additional shader ray-tracing kernel that
+ * takes much longer to compile. This is only loaded when needed by the scene. */
+
+#include "kernel/device/optix/kernel.cu"
+#include "kernel/integrator/integrator_shade_surface.h"
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytrace()
+{
+ const int global_index = optixGetLaunchIndex().x;
+ const int path_index = (__params.path_index_array) ? __params.path_index_array[global_index] :
+ global_index;
+ integrator_shade_surface_raytrace(nullptr, path_index, __params.render_buffer);
+}
diff --git a/intern/cycles/kernel/filter/filter.h b/intern/cycles/kernel/filter/filter.h
deleted file mode 100644
index b067e53a8bf..00000000000
--- a/intern/cycles/kernel/filter/filter.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __FILTER_H__
-#define __FILTER_H__
-
-/* CPU Filter Kernel Interface */
-
-#include "util/util_types.h"
-
-#include "kernel/filter/filter_defines.h"
-
-CCL_NAMESPACE_BEGIN
-
-#define KERNEL_NAME_JOIN(x, y, z) x##_##y##_##z
-#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
-#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
-
-#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/filter_cpu.h"
-
-CCL_NAMESPACE_END
-
-#endif /* __FILTER_H__ */
diff --git a/intern/cycles/kernel/filter/filter_defines.h b/intern/cycles/kernel/filter/filter_defines.h
deleted file mode 100644
index 1c0ac5e2cb7..00000000000
--- a/intern/cycles/kernel/filter/filter_defines.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __FILTER_DEFINES_H__
-#define __FILTER_DEFINES_H__
-
-#define DENOISE_FEATURES 11
-#define TRANSFORM_SIZE (DENOISE_FEATURES * DENOISE_FEATURES)
-#define XTWX_SIZE (((DENOISE_FEATURES + 1) * (DENOISE_FEATURES + 2)) / 2)
-#define XTWY_SIZE (DENOISE_FEATURES + 1)
-
-#define DENOISE_MAX_FRAMES 16
-
-typedef struct TileInfo {
- int offsets[9];
- int strides[9];
- int x[4];
- int y[4];
- int from_render;
- int frames[DENOISE_MAX_FRAMES];
- int num_frames;
- /* TODO(lukas): CUDA doesn't have uint64_t... */
-#ifdef __KERNEL_OPENCL__
- ccl_global float *buffers[9];
-#else
- long long int buffers[9];
-#endif
-} TileInfo;
-
-#ifdef __KERNEL_OPENCL__
-# define CCL_FILTER_TILE_INFO \
- ccl_global TileInfo *tile_info, ccl_global float *tile_buffer_1, \
- ccl_global float *tile_buffer_2, ccl_global float *tile_buffer_3, \
- ccl_global float *tile_buffer_4, ccl_global float *tile_buffer_5, \
- ccl_global float *tile_buffer_6, ccl_global float *tile_buffer_7, \
- ccl_global float *tile_buffer_8, ccl_global float *tile_buffer_9
-# define CCL_FILTER_TILE_INFO_ARG \
- tile_info, tile_buffer_1, tile_buffer_2, tile_buffer_3, tile_buffer_4, tile_buffer_5, \
- tile_buffer_6, tile_buffer_7, tile_buffer_8, tile_buffer_9
-# define ccl_get_tile_buffer(id) \
- (id == 0 ? tile_buffer_1 : \
- id == 1 ? tile_buffer_2 : \
- id == 2 ? tile_buffer_3 : \
- id == 3 ? tile_buffer_4 : \
- id == 4 ? tile_buffer_5 : \
- id == 5 ? tile_buffer_6 : \
- id == 6 ? tile_buffer_7 : \
- id == 7 ? tile_buffer_8 : \
- tile_buffer_9)
-#else
-# ifdef __KERNEL_CUDA__
-# define CCL_FILTER_TILE_INFO ccl_global TileInfo *tile_info
-# else
-# define CCL_FILTER_TILE_INFO TileInfo *tile_info
-# endif
-# define ccl_get_tile_buffer(id) (tile_info->buffers[id])
-#endif
-
-#endif /* __FILTER_DEFINES_H__*/
diff --git a/intern/cycles/kernel/filter/filter_features.h b/intern/cycles/kernel/filter/filter_features.h
deleted file mode 100644
index 8a2af957146..00000000000
--- a/intern/cycles/kernel/filter/filter_features.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#define ccl_get_feature(buffer, pass) (buffer)[(pass)*pass_stride]
-
-/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
- * pixel_buffer always points to the current pixel in the first pass.
- * Repeat the loop for every secondary frame if there are any. */
-#define FOR_PIXEL_WINDOW \
- for (int frame = 0; frame < tile_info->num_frames; frame++) { \
- pixel.z = tile_info->frames[frame]; \
- pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \
- frame * frame_stride; \
- for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- for (pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
-
-#define END_FOR_PIXEL_WINDOW \
- } \
- pixel_buffer += buffer_w - (high.x - low.x); \
- } \
- }
-
-ccl_device_inline void filter_get_features(int3 pixel,
- const ccl_global float *ccl_restrict buffer,
- float *features,
- bool use_time,
- const float *ccl_restrict mean,
- int pass_stride)
-{
- features[0] = pixel.x;
- features[1] = pixel.y;
- features[2] = fabsf(ccl_get_feature(buffer, 0));
- features[3] = ccl_get_feature(buffer, 1);
- features[4] = ccl_get_feature(buffer, 2);
- features[5] = ccl_get_feature(buffer, 3);
- features[6] = ccl_get_feature(buffer, 4);
- features[7] = ccl_get_feature(buffer, 5);
- features[8] = ccl_get_feature(buffer, 6);
- features[9] = ccl_get_feature(buffer, 7);
- if (use_time) {
- features[10] = pixel.z;
- }
- if (mean) {
- for (int i = 0; i < (use_time ? 11 : 10); i++) {
- features[i] -= mean[i];
- }
- }
-}
-
-ccl_device_inline void filter_get_feature_scales(int3 pixel,
- const ccl_global float *ccl_restrict buffer,
- float *scales,
- bool use_time,
- const float *ccl_restrict mean,
- int pass_stride)
-{
- scales[0] = fabsf(pixel.x - mean[0]);
- scales[1] = fabsf(pixel.y - mean[1]);
- scales[2] = fabsf(fabsf(ccl_get_feature(buffer, 0)) - mean[2]);
- scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
- ccl_get_feature(buffer, 2) - mean[4],
- ccl_get_feature(buffer, 3) - mean[5]));
- scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
- scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
- ccl_get_feature(buffer, 6) - mean[8],
- ccl_get_feature(buffer, 7) - mean[9]));
- if (use_time) {
- scales[6] = fabsf(pixel.z - mean[10]);
- }
-}
-
-ccl_device_inline void filter_calculate_scale(float *scale, bool use_time)
-{
- scale[0] = 1.0f / max(scale[0], 0.01f);
- scale[1] = 1.0f / max(scale[1], 0.01f);
- scale[2] = 1.0f / max(scale[2], 0.01f);
- if (use_time) {
- scale[10] = 1.0f / max(scale[6], 0.01f);
- }
- scale[6] = 1.0f / max(scale[4], 0.01f);
- scale[7] = scale[8] = scale[9] = 1.0f / max(sqrtf(scale[5]), 0.01f);
- scale[3] = scale[4] = scale[5] = 1.0f / max(sqrtf(scale[3]), 0.01f);
-}
-
-ccl_device_inline float3 filter_get_color(const ccl_global float *ccl_restrict buffer,
- int pass_stride)
-{
- return make_float3(
- ccl_get_feature(buffer, 8), ccl_get_feature(buffer, 9), ccl_get_feature(buffer, 10));
-}
-
-ccl_device_inline void design_row_add(float *design_row,
- int rank,
- const ccl_global float *ccl_restrict transform,
- int stride,
- int row,
- float feature,
- int transform_row_stride)
-{
- for (int i = 0; i < rank; i++) {
- design_row[1 + i] += transform[(row * transform_row_stride + i) * stride] * feature;
- }
-}
-
-/* Fill the design row. */
-ccl_device_inline void filter_get_design_row_transform(
- int3 p_pixel,
- const ccl_global float *ccl_restrict p_buffer,
- int3 q_pixel,
- const ccl_global float *ccl_restrict q_buffer,
- int pass_stride,
- int rank,
- float *design_row,
- const ccl_global float *ccl_restrict transform,
- int stride,
- bool use_time)
-{
- int num_features = use_time ? 11 : 10;
-
- design_row[0] = 1.0f;
- math_vector_zero(design_row + 1, rank);
-
-#define DESIGN_ROW_ADD(I, F) \
- design_row_add(design_row, rank, transform, stride, I, F, num_features);
- DESIGN_ROW_ADD(0, q_pixel.x - p_pixel.x);
- DESIGN_ROW_ADD(1, q_pixel.y - p_pixel.y);
- DESIGN_ROW_ADD(2, fabsf(ccl_get_feature(q_buffer, 0)) - fabsf(ccl_get_feature(p_buffer, 0)));
- DESIGN_ROW_ADD(3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
- DESIGN_ROW_ADD(4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
- DESIGN_ROW_ADD(5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
- DESIGN_ROW_ADD(6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
- DESIGN_ROW_ADD(7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
- DESIGN_ROW_ADD(8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
- DESIGN_ROW_ADD(9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
- if (use_time) {
- DESIGN_ROW_ADD(10, q_pixel.z - p_pixel.z)
- }
-#undef DESIGN_ROW_ADD
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_features_sse.h b/intern/cycles/kernel/filter/filter_features_sse.h
deleted file mode 100644
index 59d4ace2bef..00000000000
--- a/intern/cycles/kernel/filter/filter_features_sse.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#define ccl_get_feature_sse(pass) load_float4(buffer + (pass)*pass_stride)
-
-/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
- * pixel_buffer always points to the first of the 4 current pixel in the first pass.
- * x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set
- * for all pixels within the window. Repeat the loop for every secondary frame if there are any. */
-#define FOR_PIXEL_WINDOW_SSE \
- for (int frame = 0; frame < tile_info->num_frames; frame++) { \
- pixel.z = tile_info->frames[frame]; \
- pixel_buffer = buffer + (low.y - rect.y) * buffer_w + (low.x - rect.x) + \
- frame * frame_stride; \
- float4 t4 = make_float4(pixel.z); \
- for (pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
- float4 y4 = make_float4(pixel.y); \
- for (pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
- float4 x4 = make_float4(pixel.x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f); \
- int4 active_pixels = x4 < make_float4(high.x);
-
-#define END_FOR_PIXEL_WINDOW_SSE \
- } \
- pixel_buffer += buffer_w - (high.x - low.x); \
- } \
- }
-
-ccl_device_inline void filter_get_features_sse(float4 x,
- float4 y,
- float4 t,
- int4 active_pixels,
- const float *ccl_restrict buffer,
- float4 *features,
- bool use_time,
- const float4 *ccl_restrict mean,
- int pass_stride)
-{
- int num_features = use_time ? 11 : 10;
-
- features[0] = x;
- features[1] = y;
- features[2] = fabs(ccl_get_feature_sse(0));
- features[3] = ccl_get_feature_sse(1);
- features[4] = ccl_get_feature_sse(2);
- features[5] = ccl_get_feature_sse(3);
- features[6] = ccl_get_feature_sse(4);
- features[7] = ccl_get_feature_sse(5);
- features[8] = ccl_get_feature_sse(6);
- features[9] = ccl_get_feature_sse(7);
- if (use_time) {
- features[10] = t;
- }
-
- if (mean) {
- for (int i = 0; i < num_features; i++) {
- features[i] = features[i] - mean[i];
- }
- }
- for (int i = 0; i < num_features; i++) {
- features[i] = mask(active_pixels, features[i]);
- }
-}
-
-ccl_device_inline void filter_get_feature_scales_sse(float4 x,
- float4 y,
- float4 t,
- int4 active_pixels,
- const float *ccl_restrict buffer,
- float4 *scales,
- bool use_time,
- const float4 *ccl_restrict mean,
- int pass_stride)
-{
- scales[0] = fabs(x - mean[0]);
- scales[1] = fabs(y - mean[1]);
- scales[2] = fabs(fabs(ccl_get_feature_sse(0)) - mean[2]);
- scales[3] = sqr(ccl_get_feature_sse(1) - mean[3]) + sqr(ccl_get_feature_sse(2) - mean[4]) +
- sqr(ccl_get_feature_sse(3) - mean[5]);
- scales[4] = fabs(ccl_get_feature_sse(4) - mean[6]);
- scales[5] = sqr(ccl_get_feature_sse(5) - mean[7]) + sqr(ccl_get_feature_sse(6) - mean[8]) +
- sqr(ccl_get_feature_sse(7) - mean[9]);
- if (use_time) {
- scales[6] = fabs(t - mean[10]);
- }
-
- for (int i = 0; i < (use_time ? 7 : 6); i++)
- scales[i] = mask(active_pixels, scales[i]);
-}
-
-ccl_device_inline void filter_calculate_scale_sse(float4 *scale, bool use_time)
-{
- scale[0] = rcp(max(reduce_max(scale[0]), make_float4(0.01f)));
- scale[1] = rcp(max(reduce_max(scale[1]), make_float4(0.01f)));
- scale[2] = rcp(max(reduce_max(scale[2]), make_float4(0.01f)));
- if (use_time) {
- scale[10] = rcp(max(reduce_max(scale[6]), make_float4(0.01f)));
- }
- scale[6] = rcp(max(reduce_max(scale[4]), make_float4(0.01f)));
- scale[7] = scale[8] = scale[9] = rcp(max(reduce_max(sqrt(scale[5])), make_float4(0.01f)));
- scale[3] = scale[4] = scale[5] = rcp(max(reduce_max(sqrt(scale[3])), make_float4(0.01f)));
-}
-
-CCL_NAMESPACE_END
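
For reference: the FOR_PIXEL_WINDOW_SSE macro above walks each scanline four pixels at a time and uses the active_pixels mask to zero out lanes that fall past the right edge of the window. A minimal scalar C++ sketch of that masking pattern, without SSE intrinsics (function and variable names here are illustrative, not part of the kernel):

#include <cstdio>

/* Sum a row of pixels in [low_x, high_x) in groups of 4, masking off the
 * lanes of the last group that fall past high_x -- the same role that
 * active_pixels plays in FOR_PIXEL_WINDOW_SSE. */
float sum_row_masked(const float *row, int low_x, int high_x)
{
  float total = 0.0f;
  for (int x = low_x; x < high_x; x += 4) {
    for (int lane = 0; lane < 4; lane++) {
      /* A lane participates only while x + lane is inside the window. */
      const bool active = (x + lane) < high_x;
      total += active ? row[x + lane] : 0.0f;
    }
  }
  return total;
}

int main()
{
  float row[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
  std::printf("%f\n", sum_row_masked(row, 2, 9)); /* 7 active pixels -> 7.0 */
  return 0;
}
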
diff --git a/intern/cycles/kernel/filter/filter_kernel.h b/intern/cycles/kernel/filter/filter_kernel.h
deleted file mode 100644
index 2ef03dc0a02..00000000000
--- a/intern/cycles/kernel/filter/filter_kernel.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "util/util_color.h"
-#include "util/util_math.h"
-#include "util/util_math_fast.h"
-#include "util/util_texture.h"
-
-#include "util/util_atomic.h"
-#include "util/util_math_matrix.h"
-
-#include "kernel/filter/filter_defines.h"
-
-#include "kernel/filter/filter_features.h"
-#ifdef __KERNEL_SSE3__
-# include "kernel/filter/filter_features_sse.h"
-#endif
-
-#include "kernel/filter/filter_prefilter.h"
-
-#ifdef __KERNEL_GPU__
-# include "kernel/filter/filter_transform_gpu.h"
-#else
-# ifdef __KERNEL_SSE3__
-# include "kernel/filter/filter_transform_sse.h"
-# else
-# include "kernel/filter/filter_transform.h"
-# endif
-#endif
-
-#include "kernel/filter/filter_reconstruction.h"
-
-#ifdef __KERNEL_CPU__
-# include "kernel/filter/filter_nlm_cpu.h"
-#else
-# include "kernel/filter/filter_nlm_gpu.h"
-#endif
diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h
deleted file mode 100644
index 24200c29203..00000000000
--- a/intern/cycles/kernel/filter/filter_nlm_cpu.h
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#define load4_a(buf, ofs) (*((float4 *)((buf) + (ofs))))
-#define load4_u(buf, ofs) load_float4((buf) + (ofs))
-
-ccl_device_inline void kernel_filter_nlm_calc_difference(int dx,
- int dy,
- const float *ccl_restrict weight_image,
- const float *ccl_restrict variance_image,
- const float *ccl_restrict scale_image,
- float *difference_image,
- int4 rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- /* Strides need to be aligned to 16 bytes. */
- kernel_assert((stride % 4) == 0 && (channel_offset % 4) == 0);
-
- int aligned_lowx = rect.x & (~3);
- const int numChannels = (channel_offset > 0) ? 3 : 1;
- const float4 channel_fac = make_float4(1.0f / numChannels);
-
- for (int y = rect.y; y < rect.w; y++) {
- int idx_p = y * stride + aligned_lowx;
- int idx_q = (y + dy) * stride + aligned_lowx + dx + frame_offset;
- for (int x = aligned_lowx; x < rect.z; x += 4, idx_p += 4, idx_q += 4) {
- float4 diff = make_float4(0.0f);
- float4 scale_fac;
- if (scale_image) {
- scale_fac = clamp(load4_a(scale_image, idx_p) / load4_u(scale_image, idx_q),
- make_float4(0.25f),
- make_float4(4.0f));
- }
- else {
- scale_fac = make_float4(1.0f);
- }
- for (int c = 0, chan_ofs = 0; c < numChannels; c++, chan_ofs += channel_offset) {
- /* idx_p is guaranteed to be aligned, but idx_q isn't. */
- float4 color_p = load4_a(weight_image, idx_p + chan_ofs);
- float4 color_q = scale_fac * load4_u(weight_image, idx_q + chan_ofs);
- float4 cdiff = color_p - color_q;
- float4 var_p = load4_a(variance_image, idx_p + chan_ofs);
- float4 var_q = sqr(scale_fac) * load4_u(variance_image, idx_q + chan_ofs);
- diff += (cdiff * cdiff - a * (var_p + min(var_p, var_q))) /
- (make_float4(1e-8f) + k_2 * (var_p + var_q));
- }
- load4_a(difference_image, idx_p) = diff * channel_fac;
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_blur(
- const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
-{
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- const int low = max(rect.y, y - f);
- const int high = min(rect.w, y + f + 1);
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) = make_float4(0.0f);
- }
- for (int y1 = low; y1 < high; y1++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) += load4_a(difference_image, y1 * stride + x);
- }
- }
- float fac = 1.0f / (high - low);
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) *= fac;
- }
- }
-}
-
-ccl_device_inline void nlm_blur_horizontal(
- const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
-{
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) = make_float4(0.0f);
- }
- }
-
- for (int dx = -f; dx <= f; dx++) {
- aligned_lowx = round_down(rect.x - min(0, dx), 4);
- int highx = rect.z - max(0, dx);
- int4 lowx4 = make_int4(rect.x - min(0, dx));
- int4 highx4 = make_int4(rect.z - max(0, dx));
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < highx; x += 4) {
- int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
- int4 active = (x4 >= lowx4) & (x4 < highx4);
-
- float4 diff = load4_u(difference_image, y * stride + x + dx);
- load4_a(out_image, y * stride + x) += mask(active, diff);
- }
- }
- }
-
- aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- float4 x4 = make_float4(x) + make_float4(0.0f, 1.0f, 2.0f, 3.0f);
- float4 low = max(make_float4(rect.x), x4 - make_float4(f));
- float4 high = min(make_float4(rect.z), x4 + make_float4(f + 1));
- load4_a(out_image, y * stride + x) *= rcp(high - low);
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_calc_weight(
- const float *ccl_restrict difference_image, float *out_image, int4 rect, int stride, int f)
-{
- nlm_blur_horizontal(difference_image, out_image, rect, stride, f);
-
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- load4_a(out_image, y * stride + x) = fast_expf4(
- -max(load4_a(out_image, y * stride + x), make_float4(0.0f)));
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_update_output(int dx,
- int dy,
- const float *ccl_restrict difference_image,
- const float *ccl_restrict image,
- float *temp_image,
- float *out_image,
- float *accum_image,
- int4 rect,
- int channel_offset,
- int stride,
- int f)
-{
- nlm_blur_horizontal(difference_image, temp_image, rect, stride, f);
-
- int aligned_lowx = round_down(rect.x, 4);
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = aligned_lowx; x < rect.z; x += 4) {
- int4 x4 = make_int4(x) + make_int4(0, 1, 2, 3);
- int4 active = (x4 >= make_int4(rect.x)) & (x4 < make_int4(rect.z));
-
- int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
-
- float4 weight = load4_a(temp_image, idx_p);
- load4_a(accum_image, idx_p) += mask(active, weight);
-
- float4 val = load4_u(image, idx_q);
- if (channel_offset) {
- val += load4_u(image, idx_q + channel_offset);
- val += load4_u(image, idx_q + 2 * channel_offset);
- val *= 1.0f / 3.0f;
- }
-
- load4_a(out_image, idx_p) += mask(active, weight * val);
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx,
- int dy,
- int t,
- const float *ccl_restrict
- difference_image,
- const float *ccl_restrict buffer,
- float *transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 rect,
- int4 filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time)
-{
- int4 clip_area = rect_clip(rect, filter_window);
-  /* The storage offset computed below is in filter-window-relative coordinates,
-   * while x and y iterate in feature-window-relative coordinates. */
- for (int y = clip_area.y; y < clip_area.w; y++) {
- for (int x = clip_area.x; x < clip_area.z; x++) {
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- float sum = 0.0f;
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- float weight = sum * (1.0f / (high - low));
-
- int storage_ofs = coord_to_local_index(filter_window, x, y);
- float *l_transform = transform + storage_ofs * TRANSFORM_SIZE;
- float *l_XtWX = XtWX + storage_ofs * XTWX_SIZE;
- float3 *l_XtWY = XtWY + storage_ofs * XTWY_SIZE;
- int *l_rank = rank + storage_ofs;
-
- kernel_filter_construct_gramian(x,
- y,
- 1,
- dx,
- dy,
- t,
- stride,
- pass_stride,
- frame_offset,
- use_time,
- buffer,
- l_transform,
- l_rank,
- weight,
- l_XtWX,
- l_XtWY,
- 0);
- }
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_normalize(float *out_image,
- const float *ccl_restrict accum_image,
- int4 rect,
- int w)
-{
- for (int y = rect.y; y < rect.w; y++) {
- for (int x = rect.x; x < rect.z; x++) {
- out_image[y * w + x] /= accum_image[y * w + x];
- }
- }
-}
-
-#undef load4_a
-#undef load4_u
-
-CCL_NAMESPACE_END
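
The CPU NLM path deleted above is a pipeline: per-pixel variance-normalized squared differences, a separable box blur over the patch radius f, then weights w = exp(-max(d, 0)). A compact single-channel C++ sketch of the weight step under simplified assumptions (one scanline, no variance scaling; names are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

/* NLM weight of a neighbor for the pixel at x along one scanline:
 * average the per-pixel difference over a (2f+1)-wide window, clamp to
 * >= 0 and map through exp(-d), matching the shape of
 * kernel_filter_nlm_calc_difference + blur + calc_weight. */
float nlm_weight(const std::vector<float> &difference, int x, int f)
{
  const int low = std::max(0, x - f);
  const int high = std::min((int)difference.size(), x + f + 1);
  float sum = 0.0f;
  for (int x1 = low; x1 < high; x1++) {
    sum += difference[x1];
  }
  const float d = sum / (high - low);
  return std::exp(-std::max(d, 0.0f));
}

int main()
{
  std::vector<float> difference = {0.0f, 1.0f, 2.0f, 1.0f, 0.0f};
  std::printf("%f\n", nlm_weight(difference, 2, 1)); /* exp(-4/3) ~= 0.2636 */
  return 0;
}
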
diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h
deleted file mode 100644
index 650c743f34f..00000000000
--- a/intern/cycles/kernel/filter/filter_nlm_gpu.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Determines pixel coordinates and offset for the current thread.
- * Returns whether the thread should do any work.
- *
- * All coordinates are relative to the denoising buffer!
- *
- * Window is the rect that should be processed.
- * co is filled with (x, y, dx, dy).
- */
-ccl_device_inline bool get_nlm_coords_window(
- int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs, int4 window)
-{
- /* Determine the pixel offset that this thread should apply. */
- int s = 2 * r + 1;
- int si = ccl_global_id(1);
- int sx = si % s;
- int sy = si / s;
- if (sy >= s) {
- return false;
- }
-
- /* Pixels still need to lie inside the denoising buffer after applying the offset,
- * so determine the area for which this is the case. */
- int dx = sx - r;
- int dy = sy - r;
-
- *rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy));
-
- /* Find the intersection of the area that we want to process (window) and the area
- * that can be processed (rect) to get the final area for this offset. */
- int4 clip_area = rect_clip(window, *rect);
-
- /* If the radius is larger than one of the sides of the window,
- * there will be shifts for which there is no usable pixel at all. */
- if (!rect_is_valid(clip_area)) {
- return false;
- }
-
- /* Map the linear thread index to pixels inside the clip area. */
- int x, y;
- if (!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
- return false;
- }
-
- *co = make_int4(x, y, dx, dy);
-
- *ofs = (sy * s + sx) * stride;
-
- return true;
-}
-
-ccl_device_inline bool get_nlm_coords(
- int w, int h, int r, int stride, int4 *rect, int4 *co, int *ofs)
-{
- return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
-}
-
-ccl_device_inline void kernel_filter_nlm_calc_difference(
- int x,
- int y,
- int dx,
- int dy,
- const ccl_global float *ccl_restrict weight_image,
- const ccl_global float *ccl_restrict variance_image,
- const ccl_global float *ccl_restrict scale_image,
- ccl_global float *difference_image,
- int4 rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx) + frame_offset;
- int numChannels = channel_offset ? 3 : 1;
-
- float diff = 0.0f;
- float scale_fac = 1.0f;
- if (scale_image) {
- scale_fac = clamp(scale_image[idx_p] / scale_image[idx_q], 0.25f, 4.0f);
- }
-
- for (int c = 0; c < numChannels; c++, idx_p += channel_offset, idx_q += channel_offset) {
- float cdiff = weight_image[idx_p] - scale_fac * weight_image[idx_q];
- float pvar = variance_image[idx_p];
- float qvar = sqr(scale_fac) * variance_image[idx_q];
- diff += (cdiff * cdiff - a * (pvar + min(pvar, qvar))) / (1e-8f + k_2 * (pvar + qvar));
- }
- if (numChannels > 1) {
- diff *= 1.0f / numChannels;
- }
- difference_image[y * stride + x] = diff;
-}
-
-ccl_device_inline void kernel_filter_nlm_blur(int x,
- int y,
- const ccl_global float *ccl_restrict
- difference_image,
- ccl_global float *out_image,
- int4 rect,
- int stride,
- int f)
-{
- float sum = 0.0f;
- const int low = max(rect.y, y - f);
- const int high = min(rect.w, y + f + 1);
- for (int y1 = low; y1 < high; y1++) {
- sum += difference_image[y1 * stride + x];
- }
- sum *= 1.0f / (high - low);
- out_image[y * stride + x] = sum;
-}
-
-ccl_device_inline void kernel_filter_nlm_calc_weight(int x,
- int y,
- const ccl_global float *ccl_restrict
- difference_image,
- ccl_global float *out_image,
- int4 rect,
- int stride,
- int f)
-{
- float sum = 0.0f;
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- sum *= 1.0f / (high - low);
- out_image[y * stride + x] = fast_expf(-max(sum, 0.0f));
-}
-
-ccl_device_inline void kernel_filter_nlm_update_output(int x,
- int y,
- int dx,
- int dy,
- const ccl_global float *ccl_restrict
- difference_image,
- const ccl_global float *ccl_restrict image,
- ccl_global float *out_image,
- ccl_global float *accum_image,
- int4 rect,
- int channel_offset,
- int stride,
- int f)
-{
- float sum = 0.0f;
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- sum *= 1.0f / (high - low);
-
- int idx_p = y * stride + x, idx_q = (y + dy) * stride + (x + dx);
- if (out_image) {
- atomic_add_and_fetch_float(accum_image + idx_p, sum);
-
- float val = image[idx_q];
- if (channel_offset) {
- val += image[idx_q + channel_offset];
- val += image[idx_q + 2 * channel_offset];
- val *= 1.0f / 3.0f;
- }
- atomic_add_and_fetch_float(out_image + idx_p, sum * val);
- }
- else {
- accum_image[idx_p] = sum;
- }
-}
-
-ccl_device_inline void kernel_filter_nlm_construct_gramian(
- int x,
- int y,
- int dx,
- int dy,
- int t,
- const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict transform,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 rect,
- int4 filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time,
- int localIdx)
-{
- const int low = max(rect.x, x - f);
- const int high = min(rect.z, x + f + 1);
- float sum = 0.0f;
- for (int x1 = low; x1 < high; x1++) {
- sum += difference_image[y * stride + x1];
- }
- float weight = sum * (1.0f / (high - low));
-
- /* Reconstruction data is only stored for pixels inside the filter window,
-   * so compute the pixel's index in there. */
- int storage_ofs = coord_to_local_index(filter_window, x, y);
- transform += storage_ofs;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
-
- kernel_filter_construct_gramian(x,
- y,
- rect_size(filter_window),
- dx,
- dy,
- t,
- stride,
- pass_stride,
- frame_offset,
- use_time,
- buffer,
- transform,
- rank,
- weight,
- XtWX,
- XtWY,
- localIdx);
-}
-
-ccl_device_inline void kernel_filter_nlm_normalize(int x,
- int y,
- ccl_global float *out_image,
- const ccl_global float *ccl_restrict
- accum_image,
- int stride)
-{
- out_image[y * stride + x] /= accum_image[y * stride + x];
-}
-
-CCL_NAMESPACE_END
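
In the GPU variant above, get_nlm_coords_window decodes the second global-id dimension into a (dx, dy) pixel shift on the (2r+1) x (2r+1) offset grid. A standalone sketch of just that index-to-offset mapping (hypothetical helper, same arithmetic):

#include <cstdio>

/* Map a linear shift index si onto the (dx, dy) offset grid of radius r,
 * as get_nlm_coords_window does; returns false once si is past the last
 * offset, which is the "thread should do no work" case. */
bool shift_from_index(int si, int r, int *dx, int *dy)
{
  const int s = 2 * r + 1; /* side length of the offset grid */
  const int sx = si % s;
  const int sy = si / s;
  if (sy >= s) {
    return false;
  }
  *dx = sx - r;
  *dy = sy - r;
  return true;
}

int main()
{
  int dx, dy;
  for (int si = 0; shift_from_index(si, 1, &dx, &dy); si++) {
    std::printf("si=%d -> (%d, %d)\n", si, dx, dy); /* 9 shifts for r = 1 */
  }
  return 0;
}
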
diff --git a/intern/cycles/kernel/filter/filter_prefilter.h b/intern/cycles/kernel/filter/filter_prefilter.h
deleted file mode 100644
index 97cecba190e..00000000000
--- a/intern/cycles/kernel/filter/filter_prefilter.h
+++ /dev/null
@@ -1,303 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/**
- * First step of the shadow prefiltering, performs the shadow division and stores all data
- * in a nice and easy rectangular array that can be passed to the NLM filter.
- *
- * Calculates:
- * \param unfiltered: Contains the two half images of the shadow feature pass
- * \param sampleVariance: The sample-based variance calculated in the kernel.
- * Note: This calculation is biased in general,
- * and especially here since the variance of the ratio can only be approximated.
- * \param sampleVarianceV: Variance of the sample variance estimation, quite noisy
- * (since it's essentially the buffer variance of the two variance halves)
- * \param bufferVariance: The buffer-based variance of the shadow feature.
- * Unbiased, but quite noisy.
- */
-ccl_device void kernel_filter_divide_shadow(int sample,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- ccl_global float *unfilteredA,
- ccl_global float *unfilteredB,
- ccl_global float *sampleVariance,
- ccl_global float *sampleVarianceV,
- ccl_global float *bufferVariance,
- int4 rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
- int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
- int tile = ytile * 3 + xtile;
-
- int offset = tile_info->offsets[tile];
- int stride = tile_info->strides[tile];
- const ccl_global float *ccl_restrict center_buffer = (ccl_global float *)ccl_get_tile_buffer(
- tile);
- center_buffer += (y * stride + x + offset) * buffer_pass_stride;
- center_buffer += buffer_denoising_offset + 14;
-
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
- unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
- unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
-
- float varA = center_buffer[2];
- float varB = center_buffer[5];
- int odd_sample = (sample + 1) / 2;
- int even_sample = sample / 2;
-
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- varA = max(0.0f, varA - unfilteredA[idx] * unfilteredA[idx] * odd_sample);
- varB = max(0.0f, varB - unfilteredB[idx] * unfilteredB[idx] * even_sample);
-
- varA /= max(odd_sample - 1, 1);
- varB /= max(even_sample - 1, 1);
-
- sampleVariance[idx] = 0.5f * (varA + varB) / sample;
- sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample * sample);
- bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) *
- (unfilteredA[idx] - unfilteredB[idx]);
-}
-
-/* Load a regular feature from the render buffers into the denoise buffer.
- * Parameters:
- * - sample: The sample amount in the buffer, used to normalize the buffer.
- * - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
- * - x, y: Current pixel
- * - mean, variance: Target denoise buffers.
- * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
- */
-ccl_device void kernel_filter_get_feature(int sample,
- CCL_FILTER_TILE_INFO,
- int m_offset,
- int v_offset,
- int x,
- int y,
- ccl_global float *mean,
- ccl_global float *variance,
- float scale,
- int4 rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
- int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
- int tile = ytile * 3 + xtile;
- ccl_global float *center_buffer = ((ccl_global float *)ccl_get_tile_buffer(tile)) +
- (tile_info->offsets[tile] + y * tile_info->strides[tile] + x) *
- buffer_pass_stride +
- buffer_denoising_offset;
-
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- float val = scale * center_buffer[m_offset];
- mean[idx] = val;
-
- if (v_offset >= 0) {
- if (sample > 1) {
- /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
- * update does not work efficiently with atomics in the kernel. */
- variance[idx] = max(
- 0.0f, (center_buffer[v_offset] - val * val * sample) / (sample * (sample - 1)));
- }
- else {
-      /* Can't compute variance with a single sample, just set it very high. */
- variance[idx] = 1e10f;
- }
- }
-}
-
-ccl_device void kernel_filter_write_feature(int sample,
- int x,
- int y,
- int4 buffer_params,
- ccl_global float *from,
- ccl_global float *buffer,
- int out_offset,
- int4 rect)
-{
- ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) *
- buffer_params.z;
-
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- combined_buffer[out_offset] = from[idx];
-}
-
-#define GET_COLOR(image) \
- make_float3(image[idx], image[idx + pass_stride], image[idx + 2 * pass_stride])
-#define SET_COLOR(image, color) \
- image[idx] = color.x; \
- image[idx + pass_stride] = color.y; \
- image[idx + 2 * pass_stride] = color.z
-
-ccl_device void kernel_filter_detect_outliers(int x,
- int y,
- ccl_global float *in,
- ccl_global float *variance_out,
- ccl_global float *depth,
- ccl_global float *image_out,
- int4 rect,
- int pass_stride)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- ccl_global float *image_in = in;
- ccl_global float *variance_in = in + 3 * pass_stride;
-
- int n = 0;
- float values[25];
- float pixel_variance, max_variance = 0.0f;
- for (int y1 = max(y - 2, rect.y); y1 < min(y + 3, rect.w); y1++) {
- for (int x1 = max(x - 2, rect.x); x1 < min(x + 3, rect.z); x1++) {
- int idx = (y1 - rect.y) * buffer_w + (x1 - rect.x);
- float3 color = GET_COLOR(image_in);
- color = max(color, make_float3(0.0f, 0.0f, 0.0f));
- float L = average(color);
-
- /* Find the position of L. */
- int i;
- for (i = 0; i < n; i++) {
- if (values[i] > L)
- break;
- }
- /* Make space for L by shifting all following values to the right. */
- for (int j = n; j > i; j--) {
- values[j] = values[j - 1];
- }
- /* Insert L. */
- values[i] = L;
- n++;
-
- float3 pixel_var = GET_COLOR(variance_in);
- float var = average(pixel_var);
- if ((x1 == x) && (y1 == y)) {
- pixel_variance = (pixel_var.x < 0.0f || pixel_var.y < 0.0f || pixel_var.z < 0.0f) ? -1.0f :
- var;
- }
- else {
- max_variance = max(max_variance, var);
- }
- }
- }
-
- max_variance += 1e-4f;
-
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- float3 color = GET_COLOR(image_in);
- float3 variance = GET_COLOR(variance_in);
- color = max(color, make_float3(0.0f, 0.0f, 0.0f));
- variance = max(variance, make_float3(0.0f, 0.0f, 0.0f));
-
- float L = average(color);
-
- float ref = 2.0f * values[(int)(n * 0.75f)];
-
- /* Slightly offset values to avoid false positives in (almost) black areas. */
- max_variance += 1e-5f;
- ref -= 1e-5f;
-
- if (L > ref) {
- /* The pixel appears to be an outlier.
- * However, it may just be a legitimate highlight. Therefore, it is checked how likely it is
- * that the pixel should actually be at the reference value: If the reference is within the
- * 3-sigma interval, the pixel is assumed to be a statistical outlier. Otherwise, it is very
- * unlikely that the pixel should be darker, which indicates a legitimate highlight.
- */
-
- if (pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) {
- depth[idx] = -depth[idx];
- color *= ref / L;
- variance = make_float3(max_variance, max_variance, max_variance);
- }
- else {
- float stddev = sqrtf(pixel_variance);
- if (L - 3 * stddev < ref) {
- /* The pixel is an outlier, so negate the depth value to mark it as one.
- * Also, scale its brightness down to the outlier threshold to avoid trouble with the NLM
- * weights. */
- depth[idx] = -depth[idx];
- float fac = ref / L;
- color *= fac;
- variance *= sqr(fac);
- }
- }
- }
-
- /* Apply log(1+x) transform to compress highlights and avoid halos in the denoised results.
- * Variance is transformed accordingly - the derivative of the transform is 1/(1+x), so we
- * scale by the square of that (since we have variance instead of standard deviation). */
- color = color_highlight_compress(color, &variance);
-
- SET_COLOR(image_out, color);
- SET_COLOR(variance_out, variance);
-}
-
-#undef GET_COLOR
-#undef SET_COLOR
-
-/* Combine A/B buffers.
- * Calculates the combined mean and the buffer variance. */
-ccl_device void kernel_filter_combine_halves(int x,
- int y,
- ccl_global float *mean,
- ccl_global float *variance,
- ccl_global float *a,
- ccl_global float *b,
- int4 rect,
- int r)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
- int idx = (y - rect.y) * buffer_w + (x - rect.x);
-
- if (mean)
- mean[idx] = 0.5f * (a[idx] + b[idx]);
- if (variance) {
- if (r == 0)
- variance[idx] = 0.25f * (a[idx] - b[idx]) * (a[idx] - b[idx]);
- else {
- variance[idx] = 0.0f;
- float values[25];
- int numValues = 0;
- for (int py = max(y - r, rect.y); py < min(y + r + 1, rect.w); py++) {
- for (int px = max(x - r, rect.x); px < min(x + r + 1, rect.z); px++) {
- int pidx = (py - rect.y) * buffer_w + (px - rect.x);
- values[numValues++] = 0.25f * (a[pidx] - b[pidx]) * (a[pidx] - b[pidx]);
- }
- }
- /* Insertion-sort the variances (fast enough for 25 elements). */
- for (int i = 1; i < numValues; i++) {
- float v = values[i];
- int j;
- for (j = i - 1; j >= 0 && values[j] > v; j--)
- values[j + 1] = values[j];
- values[j + 1] = v;
- }
- variance[idx] = values[(7 * numValues) / 8];
- }
- }
-}
-
-CCL_NAMESPACE_END
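
kernel_filter_get_feature above derives a variance estimate from accumulated sums because a streaming (Welford-style) update does not combine well with atomics. A worked C++ sketch of that estimator, assuming sum_sq holds the sum of squared samples and mean is already normalized (names are illustrative):

#include <algorithm>
#include <cstdio>

/* Variance of the mean from accumulated sums, as in
 * kernel_filter_get_feature: with n samples,
 *   Var[mean] ~= max(0, (sum_sq - n * mean^2) / (n * (n - 1))). */
float variance_of_mean(float sum_sq, float mean, int n)
{
  if (n <= 1) {
    return 1e10f; /* single sample: no estimate, treat as very noisy */
  }
  return std::max(0.0f, (sum_sq - mean * mean * n) / (n * (n - 1)));
}

int main()
{
  /* Samples 1, 2, 3, 4: mean 2.5, sum of squares 30; the sample variance
   * is 5/3, so the variance of the mean is (5/3) / 4 = 5/12. */
  std::printf("%f\n", variance_of_mean(30.0f, 2.5f, 4)); /* 0.416667 */
  return 0;
}
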
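The outlier test in kernel_filter_detect_outliers asks whether a bright pixel could plausibly sit at the robust reference brightness (twice the 75th percentile of its neighborhood): if the reference lies within three standard deviations, the pixel is treated as a statistical outlier. A scalar sketch of that decision, with illustrative names:

#include <cmath>
#include <cstdio>

/* Classify a bright pixel following the 3-sigma rule above. L is the
 * pixel brightness, ref the robust reference, pixel_variance the pixel's
 * own variance estimate and max_variance the neighborhood maximum. */
bool is_outlier(float L, float ref, float pixel_variance, float max_variance)
{
  if (L <= ref) {
    return false; /* not brighter than the reference: keep as-is */
  }
  if (pixel_variance < 0.0f || pixel_variance > 9.0f * max_variance) {
    return true; /* variance estimate unusable: assume outlier */
  }
  const float stddev = std::sqrt(pixel_variance);
  /* If ref is inside the 3-sigma interval the pixel could plausibly be
   * that dark, so darken it; otherwise it is a legitimate highlight. */
  return (L - 3.0f * stddev) < ref;
}

int main()
{
  std::printf("%d\n", (int)is_outlier(10.0f, 9.0f, 1.0f, 1.0f)); /* 1 */
  return 0;
}
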
diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
deleted file mode 100644
index 17941689ad5..00000000000
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void kernel_filter_construct_gramian(int x,
- int y,
- int storage_stride,
- int dx,
- int dy,
- int t,
- int buffer_stride,
- int pass_stride,
- int frame_offset,
- bool use_time,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict
- transform,
- ccl_global int *rank,
- float weight,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int localIdx)
-{
- if (weight < 1e-3f) {
- return;
- }
-
- int p_offset = y * buffer_stride + x;
- int q_offset = (y + dy) * buffer_stride + (x + dx) + frame_offset;
-
-#ifdef __KERNEL_GPU__
- const int stride = storage_stride;
-#else
- const int stride = 1;
- (void)storage_stride;
-#endif
-
-#ifdef __KERNEL_CUDA__
- ccl_local float shared_design_row[(DENOISE_FEATURES + 1) * CCL_MAX_LOCAL_SIZE];
- ccl_local_param float *design_row = shared_design_row + localIdx * (DENOISE_FEATURES + 1);
-#else
- float design_row[DENOISE_FEATURES + 1];
-#endif
-
- float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
-
- /* If the pixel was flagged as an outlier during prefiltering, skip it. */
- if (ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
- return;
- }
-
- filter_get_design_row_transform(make_int3(x, y, t),
- buffer + p_offset,
- make_int3(x + dx, y + dy, t),
- buffer + q_offset,
- pass_stride,
- *rank,
- design_row,
- transform,
- stride,
- use_time);
-
-#ifdef __KERNEL_GPU__
- math_trimatrix_add_gramian_strided(XtWX, (*rank) + 1, design_row, weight, stride);
- math_vec3_add_strided(XtWY, (*rank) + 1, design_row, weight * q_color, stride);
-#else
- math_trimatrix_add_gramian(XtWX, (*rank) + 1, design_row, weight);
- math_vec3_add(XtWY, (*rank) + 1, design_row, weight * q_color);
-#endif
-}
-
-ccl_device_inline void kernel_filter_finalize(int x,
- int y,
- ccl_global float *buffer,
- ccl_global int *rank,
- int storage_stride,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 buffer_params,
- int sample)
-{
-#ifdef __KERNEL_GPU__
- const int stride = storage_stride;
-#else
- const int stride = 1;
- (void)storage_stride;
-#endif
-
- if (XtWX[0] < 1e-3f) {
- /* There is not enough information to determine a denoised result.
- * As a fallback, keep the original value of the pixel. */
- return;
- }
-
- /* The weighted average of pixel colors (essentially, the NLM-filtered image).
- * In case the solution of the linear model fails due to numerical issues or
- * returns nonsensical negative values, fall back to this value. */
- float3 mean_color = XtWY[0] / XtWX[0];
-
- math_trimatrix_vec3_solve(XtWX, XtWY, (*rank) + 1, stride);
-
- float3 final_color = XtWY[0];
- if (!isfinite3_safe(final_color) ||
- (final_color.x < -0.01f || final_color.y < -0.01f || final_color.z < -0.01f)) {
- final_color = mean_color;
- }
-
- /* Clamp pixel value to positive values and reverse the highlight compression transform. */
- final_color = color_highlight_uncompress(max(final_color, make_float3(0.0f, 0.0f, 0.0f)));
-
- ccl_global float *combined_buffer = buffer + (y * buffer_params.y + x + buffer_params.x) *
- buffer_params.z;
- if (buffer_params.w >= 0) {
- final_color *= sample;
- if (buffer_params.w > 0) {
- final_color.x += combined_buffer[buffer_params.w + 0];
- final_color.y += combined_buffer[buffer_params.w + 1];
- final_color.z += combined_buffer[buffer_params.w + 2];
- }
- }
- combined_buffer[0] = final_color.x;
- combined_buffer[1] = final_color.y;
- combined_buffer[2] = final_color.z;
-}
-
-CCL_NAMESPACE_END
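
kernel_filter_finalize above solves the weighted least-squares system XtWX * beta = XtWY and falls back to the weighted mean color XtWY[0] / XtWX[0] when the solution is non-finite or clearly negative. A 2x2 scalar sketch of that solve-with-fallback logic (hypothetical helper; the kernel solves a triangular system over float3 colors instead):

#include <cmath>
#include <cstdio>

/* Solve a 2x2 weighted least-squares system A * beta = b for beta[0]
 * (the reconstructed value at the center pixel) and fall back to the
 * weighted mean b[0] / A[0][0] on numerical failure, mirroring
 * kernel_filter_finalize. Assumes A[0][0] was already checked against
 * the 1e-3 "not enough information" threshold. */
float solve_with_fallback(const float A[2][2], const float b[2])
{
  const float mean = b[0] / A[0][0]; /* weighted-average fallback */
  const float det = A[0][0] * A[1][1] - A[0][1] * A[1][0];
  const float beta0 = (b[0] * A[1][1] - b[1] * A[0][1]) / det;
  if (!std::isfinite(beta0) || beta0 < -0.01f) {
    return mean;
  }
  return std::fmax(beta0, 0.0f); /* clamp to positive, as the kernel does */
}

int main()
{
  const float A[2][2] = {{4.0f, 2.0f}, {2.0f, 2.0f}};
  const float b[2] = {6.0f, 4.0f};
  std::printf("%f\n", solve_with_fallback(A, b)); /* 1.0 (mean would be 1.5) */
  return 0;
}
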
diff --git a/intern/cycles/kernel/filter/filter_transform.h b/intern/cycles/kernel/filter/filter_transform.h
deleted file mode 100644
index 880a661214e..00000000000
--- a/intern/cycles/kernel/filter/filter_transform.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- int4 rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- float *transform,
- int *rank,
- int radius,
- float pca_threshold)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- float features[DENOISE_FEATURES];
-
- const float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- int num_features = use_time ? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
-
- /* === Shift feature passes to have mean 0. === */
- float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add(feature_means, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
-
- /* === Scale the shifted feature passes to a range of [-1; 1] ===
- * Will be baked into the transform later. */
- float feature_scale[DENOISE_FEATURES];
- math_vector_zero(feature_scale, num_features);
-
- FOR_PIXEL_WINDOW
- {
- filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max(feature_scale, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- filter_calculate_scale(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimensional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions.
- * This mainly helps to prevent over-fitting. */
- float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_zero(feature_matrix, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, num_features);
- math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
- }
- END_FOR_PIXEL_WINDOW
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
- *rank = 0;
- /* Prevent over-fitting when a small window is used. */
- int max_rank = min(num_features, num_pixels / 3);
- if (pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for (int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i * num_features + i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- if (i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i * num_features + i];
- reduced_energy += s;
- }
- }
- else {
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i * num_features + i];
- if (i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- /* Bake the feature scaling into the transformation matrix. */
- for (int i = 0; i < (*rank); i++) {
- math_vector_mul(transform + i * num_features, feature_scale, num_features);
- }
- math_matrix_transpose(transform, num_features, 1);
-}
-
-CCL_NAMESPACE_END
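
The rank selection above keeps leading eigenvalues until a fixed fraction of the total energy (the trace of the feature matrix) is covered, and never stops before the first two components; for a negative pca_threshold = -t the covered fraction is 1 - t. A standalone sketch of that rule, assuming eigenvalues sorted in decreasing order (names are illustrative):

#include <cstdio>

/* Choose the PCA rank as kernel_filter_construct_transform does for a
 * negative pca_threshold: accumulate eigenvalues until the covered
 * energy reaches fraction * total, but keep at least two components. */
int select_rank(const float *eigenvalues, int n, int max_rank, float fraction)
{
  float total = 0.0f;
  for (int i = 0; i < n; i++) {
    total += eigenvalues[i];
  }
  const float target = fraction * total;

  int rank = 0;
  float energy = 0.0f;
  for (int i = 0; i < max_rank; i++, rank++) {
    if (i >= 2 && energy >= target) {
      break;
    }
    energy += eigenvalues[i];
  }
  return rank;
}

int main()
{
  const float eig[5] = {5.0f, 3.0f, 1.0f, 0.5f, 0.5f};
  std::printf("rank = %d\n", select_rank(eig, 5, 5, 0.9f)); /* rank = 3 */
  return 0;
}
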
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
deleted file mode 100644
index ec258a5212a..00000000000
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- int4 rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- ccl_global float *transform,
- ccl_global int *rank,
- int radius,
- float pca_threshold,
- int transform_stride,
- int localIdx)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
-#ifdef __KERNEL_CUDA__
- ccl_local float shared_features[DENOISE_FEATURES * CCL_MAX_LOCAL_SIZE];
- ccl_local_param float *features = shared_features + localIdx * DENOISE_FEATURES;
-#else
- float features[DENOISE_FEATURES];
-#endif
-
- int num_features = use_time ? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
- const ccl_global float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- /* === Shift feature passes to have mean 0. === */
- float feature_means[DENOISE_FEATURES];
- math_vector_zero(feature_means, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add(feature_means, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- math_vector_scale(feature_means, 1.0f / num_pixels, num_features);
-
- /* === Scale the shifted feature passes to a range of [-1; 1] ===
- * Will be baked into the transform later. */
- float feature_scale[DENOISE_FEATURES];
- math_vector_zero(feature_scale, num_features);
-
- FOR_PIXEL_WINDOW
- {
- filter_get_feature_scales(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max(feature_scale, features, num_features);
- }
- END_FOR_PIXEL_WINDOW
-
- filter_calculate_scale(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimensional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions.
- * This mainly helps to prevent over-fitting. */
- float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_zero(feature_matrix, num_features);
- FOR_PIXEL_WINDOW
- {
- filter_get_features(pixel, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul(features, feature_scale, num_features);
- math_matrix_add_gramian(feature_matrix, num_features, features, 1.0f);
- }
- END_FOR_PIXEL_WINDOW
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, transform_stride);
- *rank = 0;
- /* Prevent over-fitting when a small window is used. */
- int max_rank = min(num_features, num_pixels / 3);
- if (pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for (int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i * num_features + i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- if (i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i * num_features + i];
- reduced_energy += s;
- }
- }
- else {
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i * num_features + i];
- if (i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- math_matrix_transpose(transform, num_features, transform_stride);
-
- /* Bake the feature scaling into the transformation matrix. */
- for (int i = 0; i < num_features; i++) {
- for (int j = 0; j < (*rank); j++) {
- transform[(i * num_features + j) * transform_stride] *= feature_scale[i];
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
deleted file mode 100644
index 0304d990f9f..00000000000
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- int x,
- int y,
- int4 rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- float *transform,
- int *rank,
- int radius,
- float pca_threshold)
-{
- int buffer_w = align_up(rect.z - rect.x, 4);
-
- float4 features[DENOISE_FEATURES];
- const float *ccl_restrict pixel_buffer;
- int3 pixel;
-
- int num_features = use_time ? 11 : 10;
-
- /* === Calculate denoising window. === */
- int2 low = make_int2(max(rect.x, x - radius), max(rect.y, y - radius));
- int2 high = make_int2(min(rect.z, x + radius + 1), min(rect.w, y + radius + 1));
- int num_pixels = (high.y - low.y) * (high.x - low.x) * tile_info->num_frames;
-
- /* === Shift feature passes to have mean 0. === */
- float4 feature_means[DENOISE_FEATURES];
- math_vector_zero_sse(feature_means, num_features);
- FOR_PIXEL_WINDOW_SSE
- {
- filter_get_features_sse(
- x4, y4, t4, active_pixels, pixel_buffer, features, use_time, NULL, pass_stride);
- math_vector_add_sse(feature_means, num_features, features);
- }
- END_FOR_PIXEL_WINDOW_SSE
-
- float4 pixel_scale = make_float4(1.0f / num_pixels);
- for (int i = 0; i < num_features; i++) {
- feature_means[i] = reduce_add(feature_means[i]) * pixel_scale;
- }
-
- /* === Scale the shifted feature passes to a range of [-1; 1] ===
- * Will be baked into the transform later. */
- float4 feature_scale[DENOISE_FEATURES];
- math_vector_zero_sse(feature_scale, num_features);
- FOR_PIXEL_WINDOW_SSE
- {
- filter_get_feature_scales_sse(
- x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_max_sse(feature_scale, features, num_features);
- }
- END_FOR_PIXEL_WINDOW_SSE
-
- filter_calculate_scale_sse(feature_scale, use_time);
-
- /* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimensional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions.
- * This mainly helps to prevent over-fitting. */
- float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_zero_sse(feature_matrix_sse, num_features);
- FOR_PIXEL_WINDOW_SSE
- {
- filter_get_features_sse(
- x4, y4, t4, active_pixels, pixel_buffer, features, use_time, feature_means, pass_stride);
- math_vector_mul_sse(features, num_features, feature_scale);
- math_matrix_add_gramian_sse(feature_matrix_sse, num_features, features, make_float4(1.0f));
- }
- END_FOR_PIXEL_WINDOW_SSE
-
- float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
- math_matrix_hsum(feature_matrix, num_features, feature_matrix_sse);
-
- math_matrix_jacobi_eigendecomposition(feature_matrix, transform, num_features, 1);
-
- *rank = 0;
- /* Prevent over-fitting when a small window is used. */
- int max_rank = min(num_features, num_pixels / 3);
- if (pca_threshold < 0.0f) {
- float threshold_energy = 0.0f;
- for (int i = 0; i < num_features; i++) {
- threshold_energy += feature_matrix[i * num_features + i];
- }
- threshold_energy *= 1.0f - (-pca_threshold);
-
- float reduced_energy = 0.0f;
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- if (i >= 2 && reduced_energy >= threshold_energy)
- break;
- float s = feature_matrix[i * num_features + i];
- reduced_energy += s;
- }
- }
- else {
- for (int i = 0; i < max_rank; i++, (*rank)++) {
- float s = feature_matrix[i * num_features + i];
- if (i >= 2 && sqrtf(s) < pca_threshold)
- break;
- }
- }
-
- math_matrix_transpose(transform, num_features, 1);
-
- /* Bake the feature scaling into the transformation matrix. */
- for (int i = 0; i < num_features; i++) {
- math_vector_scale(transform + i * num_features, feature_scale[i][0], *rank);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom.h b/intern/cycles/kernel/geom/geom.h
index 5ff4d5f7053..4de824cc277 100644
--- a/intern/cycles/kernel/geom/geom.h
+++ b/intern/cycles/kernel/geom/geom.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
// clang-format off
#include "kernel/geom/geom_attribute.h"
#include "kernel/geom/geom_object.h"
@@ -31,4 +33,5 @@
#include "kernel/geom/geom_curve_intersect.h"
#include "kernel/geom/geom_volume.h"
#include "kernel/geom/geom_primitive.h"
+#include "kernel/geom/geom_shader_data.h"
// clang-format on
diff --git a/intern/cycles/kernel/geom/geom_attribute.h b/intern/cycles/kernel/geom/geom_attribute.h
index b37797ac21b..9532a21fec7 100644
--- a/intern/cycles/kernel/geom/geom_attribute.h
+++ b/intern/cycles/kernel/geom/geom_attribute.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Attributes
@@ -25,9 +27,9 @@ CCL_NAMESPACE_BEGIN
* Lookup of attributes is different between OSL and SVM, as OSL is ustring
* based while for SVM we use integer ids. */
-ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd);
+ccl_device_inline uint subd_triangle_patch(const KernelGlobals *kg, const ShaderData *sd);
-ccl_device_inline uint attribute_primitive_type(KernelGlobals *kg, const ShaderData *sd)
+ccl_device_inline uint attribute_primitive_type(const KernelGlobals *kg, const ShaderData *sd)
{
if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && subd_triangle_patch(kg, sd) != ~0) {
return ATTR_PRIM_SUBD;
@@ -46,12 +48,12 @@ ccl_device_inline AttributeDescriptor attribute_not_found()
/* Find attribute based on ID */
-ccl_device_inline uint object_attribute_map_offset(KernelGlobals *kg, int object)
+ccl_device_inline uint object_attribute_map_offset(const KernelGlobals *kg, int object)
{
return kernel_tex_fetch(__objects, object).attribute_map_offset;
}
-ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg,
+ccl_device_inline AttributeDescriptor find_attribute(const KernelGlobals *kg,
const ShaderData *sd,
uint id)
{
@@ -98,7 +100,7 @@ ccl_device_inline AttributeDescriptor find_attribute(KernelGlobals *kg,
/* Transform matrix attribute on meshes */
-ccl_device Transform primitive_attribute_matrix(KernelGlobals *kg,
+ccl_device Transform primitive_attribute_matrix(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
diff --git a/intern/cycles/kernel/geom/geom_curve.h b/intern/cycles/kernel/geom/geom_curve.h
index b5a62a31ca9..a827a67ce7a 100644
--- a/intern/cycles/kernel/geom/geom_curve.h
+++ b/intern/cycles/kernel/geom/geom_curve.h
@@ -12,6 +12,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Curve Primitive
@@ -25,8 +27,11 @@ CCL_NAMESPACE_BEGIN
/* Reading attributes on various curve elements */
-ccl_device float curve_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device float curve_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
if (desc.element & (ATTR_ELEMENT_CURVE_KEY | ATTR_ELEMENT_CURVE_KEY_MOTION)) {
float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
@@ -64,7 +69,7 @@ ccl_device float curve_attribute_float(
}
}
-ccl_device float2 curve_attribute_float2(KernelGlobals *kg,
+ccl_device float2 curve_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -110,7 +115,7 @@ ccl_device float2 curve_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device float3 curve_attribute_float3(KernelGlobals *kg,
+ccl_device float3 curve_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -152,7 +157,7 @@ ccl_device float3 curve_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device float4 curve_attribute_float4(KernelGlobals *kg,
+ccl_device float4 curve_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float4 *dx,
@@ -196,7 +201,7 @@ ccl_device float4 curve_attribute_float4(KernelGlobals *kg,
/* Curve thickness */
-ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
+ccl_device float curve_thickness(const KernelGlobals *kg, const ShaderData *sd)
{
float r = 0.0f;
@@ -224,7 +229,7 @@ ccl_device float curve_thickness(KernelGlobals *kg, ShaderData *sd)
/* Curve location for motion pass, linear interpolation between keys and
* ignoring radius because we do the same for the motion keys */
-ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 curve_motion_center_location(const KernelGlobals *kg, const ShaderData *sd)
{
float4 curvedata = kernel_tex_fetch(__curves, sd->prim);
int k0 = __float_as_int(curvedata.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
@@ -240,7 +245,7 @@ ccl_device float3 curve_motion_center_location(KernelGlobals *kg, ShaderData *sd
/* Curve tangent normal */
-ccl_device float3 curve_tangent_normal(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 curve_tangent_normal(const KernelGlobals *kg, const ShaderData *sd)
{
float3 tgN = make_float3(0.0f, 0.0f, 0.0f);
diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h
index e25bf5b4660..213f3e62ee0 100644
--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -15,6 +15,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Curve primitive intersection functions.
@@ -167,6 +169,7 @@ ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, co
}
ccl_device bool curve_intersect_iterative(const float3 ray_dir,
+ float *ray_tfar,
const float dt,
const float4 curve[4],
float u,
@@ -230,7 +233,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
if (fabsf(f) < f_err && fabsf(g) < g_err) {
t += dt;
- if (!(0.0f <= t && t <= isect->t)) {
+ if (!(0.0f <= t && t <= *ray_tfar)) {
return false; /* Rejects NaNs */
}
if (!(u >= 0.0f && u <= 1.0f)) {
@@ -247,6 +250,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
}
/* Record intersection. */
+ *ray_tfar = t;
isect->t = t;
isect->u = u;
isect->v = 0.0f;
@@ -259,6 +263,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
ccl_device bool curve_intersect_recursive(const float3 ray_orig,
const float3 ray_dir,
+ float ray_tfar,
float4 curve[4],
Intersection *isect)
{
@@ -339,7 +344,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
}
/* Intersect with cap-planes. */
- float2 tp = make_float2(-dt, isect->t - dt);
+ float2 tp = make_float2(-dt, ray_tfar - dt);
tp = make_float2(max(tp.x, tc_outer.x), min(tp.y, tc_outer.y));
const float2 h0 = half_plane_intersect(
float4_to_float3(P0), float4_to_float3(dP0du), ray_dir);
@@ -402,19 +407,19 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
CURVE_NUM_BEZIER_SUBDIVISIONS;
if (depth >= termDepth) {
found |= curve_intersect_iterative(
- ray_dir, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
+ ray_dir, &ray_tfar, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
}
else {
recurse = true;
}
}
- if (valid1 && (tp1.x + dt <= isect->t)) {
+ if (valid1 && (tp1.x + dt <= ray_tfar)) {
const int termDepth = unstable1 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE :
CURVE_NUM_BEZIER_SUBDIVISIONS;
if (depth >= termDepth) {
found |= curve_intersect_iterative(
- ray_dir, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
+ ray_dir, &ray_tfar, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
}
else {
recurse = true;
@@ -542,7 +547,7 @@ ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3],
ccl_device_inline bool ribbon_intersect(const float3 ray_org,
const float3 ray_dir,
- const float ray_tfar,
+ float ray_tfar,
const int N,
float4 curve[4],
Intersection *isect)
@@ -590,7 +595,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
/* Intersect quad. */
float vu, vv, vt;
- bool valid0 = ribbon_intersect_quad(isect->t, lp0, lp1, up1, up0, &vu, &vv, &vt);
+ bool valid0 = ribbon_intersect_quad(ray_tfar, lp0, lp1, up1, up0, &vu, &vv, &vt);
if (valid0) {
/* ignore self intersections */
@@ -604,6 +609,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
vv = 2.0f * vv - 1.0f;
/* Record intersection. */
+ ray_tfar = vt;
isect->t = vt;
isect->u = u + vu * step_size;
isect->v = vv;
@@ -619,10 +625,11 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
return false;
}
-ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
+ccl_device_forceinline bool curve_intersect(const KernelGlobals *kg,
Intersection *isect,
const float3 P,
const float3 dir,
+ const float tmax,
uint visibility,
int object,
int curveAddr,
@@ -672,7 +679,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
/* todo: adaptive number of subdivisions could help performance here. */
const int subdivisions = kernel_data.bvh.curve_subdivisions;
- if (ribbon_intersect(P, dir, isect->t, subdivisions, curve, isect)) {
+ if (ribbon_intersect(P, dir, tmax, subdivisions, curve, isect)) {
isect->prim = curveAddr;
isect->object = object;
isect->type = type;
@@ -682,7 +689,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
return false;
}
else {
- if (curve_intersect_recursive(P, dir, curve, isect)) {
+ if (curve_intersect_recursive(P, dir, tmax, curve, isect)) {
isect->prim = curveAddr;
isect->object = object;
isect->type = type;
@@ -693,28 +700,23 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals *kg,
}
}
-ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
+ccl_device_inline void curve_shader_setup(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim)
{
- float t = isect->t;
- float3 P = ray->P;
- float3 D = ray->D;
-
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D * t);
D = normalize_len(D, &t);
}
- int prim = kernel_tex_fetch(__prim_index, isect->prim);
+ int prim = kernel_tex_fetch(__prim_index, isect_prim);
float4 v00 = kernel_tex_fetch(__curves, prim);
int k0 = __float_as_int(v00.x) + PRIMITIVE_UNPACK_SEGMENT(sd->type);
@@ -735,23 +737,20 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
motion_curve_keys(kg, sd->object, sd->prim, sd->time, ka, k0, k1, kb, P_curve);
}
- sd->u = isect->u;
-
P = P + D * t;
- const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, isect->u);
+ const float4 dPdu4 = catmull_rom_basis_derivative(P_curve, sd->u);
const float3 dPdu = float4_to_float3(dPdu4);
if (sd->type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) {
/* Rounded smooth normals for ribbons, to approximate thick curve shape. */
const float3 tangent = normalize(dPdu);
const float3 bitangent = normalize(cross(tangent, -D));
- const float sine = isect->v;
+ const float sine = sd->v;
const float cosine = safe_sqrtf(1.0f - sine * sine);
sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent)));
sd->Ng = -D;
- sd->v = isect->v;
# if 0
/* This approximates the position and geometric normal of a thick curve too,
@@ -765,7 +764,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
/* Thick curves, compute normal using direction from inside the curve.
 * This could be optimized by recording the normal in the intersection;
 * however, for OptiX this would go beyond the size of the payload. */
- const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, isect->u));
+ const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, sd->u));
const float3 Ng = normalize(P - P_inside);
sd->N = Ng;
@@ -779,13 +778,8 @@ ccl_device_inline void curve_shader_setup(KernelGlobals *kg,
sd->dPdv = cross(dPdu, sd->Ng);
# endif
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
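
The hunks above all make the same change: the current closest-hit distance is threaded through as an explicit ray_tfar/tmax argument instead of being read back from isect->t, so the Intersection no longer has to be pre-seeded with the ray extent. A small self-contained sketch of that pattern, using spheres rather than curve segments (float3, dot and sqrtf are assumed from the Cycles util headers; nothing here is the actual curve code):

/* The caller owns the search interval [0, tmax) and shrinks it on every
 * accepted hit, the way ribbon_intersect() now updates ray_tfar = vt
 * instead of reading isect->t. */
struct Sphere {
  float3 center;
  float radius;
};

bool intersect_spheres(const float3 P, const float3 D, float tmax,
                       const Sphere *spheres, const int num, float *hit_t)
{
  bool found = false;
  for (int i = 0; i < num; i++) {
    const float3 oc = P - spheres[i].center;
    const float b = dot(oc, D); /* D assumed normalized. */
    const float c = dot(oc, oc) - spheres[i].radius * spheres[i].radius;
    const float disc = b * b - c;
    if (disc < 0.0f) {
      continue;
    }
    const float t = -b - sqrtf(disc); /* Nearest root. */
    if (t > 0.0f && t < tmax) {
      tmax = t; /* Later primitives must beat this distance to be accepted. */
      *hit_t = t;
      found = true;
    }
  }
  return found;
}
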
diff --git a/intern/cycles/kernel/geom/geom_motion_curve.h b/intern/cycles/kernel/geom/geom_motion_curve.h
index 0f66f4af755..5294da03145 100644
--- a/intern/cycles/kernel/geom/geom_motion_curve.h
+++ b/intern/cycles/kernel/geom/geom_motion_curve.h
@@ -12,6 +12,8 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Motion Curve Primitive
@@ -25,7 +27,7 @@ CCL_NAMESPACE_BEGIN
#ifdef __HAIR__
-ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg,
+ccl_device_inline int find_attribute_curve_motion(const KernelGlobals *kg,
int object,
uint id,
AttributeElement *elem)
@@ -50,7 +52,7 @@ ccl_device_inline int find_attribute_curve_motion(KernelGlobals *kg,
return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
}
-ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg,
+ccl_device_inline void motion_curve_keys_for_step_linear(const KernelGlobals *kg,
int offset,
int numkeys,
int numsteps,
@@ -78,7 +80,7 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals *kg,
/* return 2 curve key locations */
ccl_device_inline void motion_curve_keys_linear(
- KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2])
+ const KernelGlobals *kg, int object, int prim, float time, int k0, int k1, float4 keys[2])
{
/* get motion info */
int numsteps, numkeys;
@@ -105,7 +107,7 @@ ccl_device_inline void motion_curve_keys_linear(
keys[1] = (1.0f - t) * keys[1] + t * next_keys[1];
}
-ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
+ccl_device_inline void motion_curve_keys_for_step(const KernelGlobals *kg,
int offset,
int numkeys,
int numsteps,
@@ -138,7 +140,7 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals *kg,
}
/* return 2 curve key locations */
-ccl_device_inline void motion_curve_keys(KernelGlobals *kg,
+ccl_device_inline void motion_curve_keys(const KernelGlobals *kg,
int object,
int prim,
float time,
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h
index 53d6b92dd7e..eb4a39e062b 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -25,11 +25,13 @@
* and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Time interpolation of vertex positions and normals */
-ccl_device_inline int find_attribute_motion(KernelGlobals *kg,
+ccl_device_inline int find_attribute_motion(const KernelGlobals *kg,
int object,
uint id,
AttributeElement *elem)
@@ -49,7 +51,7 @@ ccl_device_inline int find_attribute_motion(KernelGlobals *kg,
return (attr_map.y == ATTR_ELEMENT_NONE) ? (int)ATTR_STD_NOT_FOUND : (int)attr_map.z;
}
-ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg,
+ccl_device_inline void motion_triangle_verts_for_step(const KernelGlobals *kg,
uint4 tri_vindex,
int offset,
int numverts,
@@ -76,7 +78,7 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals *kg,
}
}
-ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg,
+ccl_device_inline void motion_triangle_normals_for_step(const KernelGlobals *kg,
uint4 tri_vindex,
int offset,
int numverts,
@@ -104,7 +106,7 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals *kg,
}
ccl_device_inline void motion_triangle_vertices(
- KernelGlobals *kg, int object, int prim, float time, float3 verts[3])
+ const KernelGlobals *kg, int object, int prim, float time, float3 verts[3])
{
/* get motion info */
int numsteps, numverts;
@@ -134,7 +136,7 @@ ccl_device_inline void motion_triangle_vertices(
}
ccl_device_inline float3 motion_triangle_smooth_normal(
- KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time)
+ const KernelGlobals *kg, float3 Ng, int object, int prim, float u, float v, float time)
{
/* get motion info */
int numsteps, numverts;
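
The motion-triangle helpers in this file reduce to locating the two motion steps that bracket the ray time and lerping vertex data between them. A simplified sketch of that interpolation, with the storage flattened to a plain 2-D array instead of the ATTR_STD_MOTION_VERTEX_POSITION attribute the kernel actually reads through find_attribute_motion():

/* steps[s][v] holds vertex v of the triangle at motion step s. This is a
 * deliberate simplification of motion_triangle_verts_for_step() plus the
 * lerp in motion_triangle_vertices(). */
void motion_triangle_verts_lerp(const float3 (*steps)[3], const int numsteps,
                                const float time, float3 verts[3])
{
  const float s = time * (float)(numsteps - 1);
  int step = (int)s;
  if (step > numsteps - 2) {
    step = numsteps - 2; /* Clamp so step + 1 stays in range at time == 1. */
  }
  const float t = s - (float)step;

  for (int v = 0; v < 3; v++) {
    verts[v] = (1.0f - t) * steps[step][v] + t * steps[step + 1][v];
  }
}
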
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
index 859d919f0bb..ec7e4b07d76 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_intersect.h
@@ -25,6 +25,8 @@
* and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Refine triangle intersection to more precise hit point. For rays that travel
@@ -32,23 +34,21 @@ CCL_NAMESPACE_BEGIN
* a closer distance.
*/
-ccl_device_inline float3 motion_triangle_refine(
- KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, float3 verts[3])
+ccl_device_inline float3 motion_triangle_refine(const KernelGlobals *kg,
+ ShaderData *sd,
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim,
+ float3 verts[3])
{
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
#ifdef __INTERSECTION_REFINE__
- if (isect->object != OBJECT_NONE) {
+ if (isect_object != OBJECT_NONE) {
if (UNLIKELY(t == 0.0f)) {
return P;
}
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D * t);
@@ -70,13 +70,8 @@ ccl_device_inline float3 motion_triangle_refine(
/* Compute refined position. */
P = P + D * rt;
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
@@ -86,7 +81,7 @@ ccl_device_inline float3 motion_triangle_refine(
#endif
}
-/* Same as above, except that isect->t is assumed to be in object space
+/* Same as above, except that t is assumed to be in object space
* for instancing.
*/
@@ -97,27 +92,22 @@ ccl_device_noinline
ccl_device_inline
# endif
float3
- motion_triangle_refine_local(KernelGlobals *kg,
+ motion_triangle_refine_local(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray,
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim,
float3 verts[3])
{
# ifdef __KERNEL_OPTIX__
- /* isect->t is always in world space with OptiX. */
- return motion_triangle_refine(kg, sd, isect, ray, verts);
+ /* t is always in world space with OptiX. */
+ return motion_triangle_refine(kg, sd, P, D, t, isect_object, isect_prim, verts);
# else
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
# ifdef __INTERSECTION_REFINE__
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D);
@@ -138,13 +128,8 @@ ccl_device_inline
P = P + D * rt;
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
@@ -160,10 +145,11 @@ ccl_device_inline
* time and do a ray intersection with the resulting triangle.
*/
-ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
+ccl_device_inline bool motion_triangle_intersect(const KernelGlobals *kg,
Intersection *isect,
float3 P,
float3 dir,
+ float tmax,
float time,
uint visibility,
int object,
@@ -179,7 +165,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
float t, u, v;
if (ray_triangle_intersect(P,
dir,
- isect->t,
+ tmax,
#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
(ssef *)verts,
#else
@@ -215,7 +201,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals *kg,
* Returns whether traversal should be stopped.
*/
#ifdef __BVH_LOCAL__
-ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals *kg,
+ccl_device_inline bool motion_triangle_intersect_local(const KernelGlobals *kg,
LocalIntersection *local_isect,
float3 P,
float3 dir,
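
The refine functions above re-base the ray at the current hit and solve for a small residual distance, which is far better conditioned numerically than the original long-range t. A sketch of the idea using a plane re-intersection (the kernel reuses the same Möller–Trumbore-style terms as the initial hit and does this in object space for instances; this simplified version stays in one space and only assumes float3/dot/cross helpers):

/* Advance the origin to the current hit, then re-solve for the residual
 * distance rt against the triangle's supporting plane. Because P is already
 * near the surface, rt is tiny and the result carries little error. */
float3 refine_hit(float3 P, const float3 D, const float t,
                  const float3 tri_a, const float3 tri_b, const float3 tri_c)
{
  P = P + D * t;

  const float3 N = cross(tri_a - tri_c, tri_b - tri_c);
  const float denom = dot(D, N);
  if (denom != 0.0f) {
    const float rt = dot(tri_c - P, N) / denom;
    P = P + D * rt;
  }
  return P;
}
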
diff --git a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
index 7a91f8041f7..85c4f0ca522 100644
--- a/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle_shader.h
@@ -25,6 +25,8 @@
* and ATTR_STD_MOTION_VERTEX_NORMAL mesh attributes.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Setup of motion triangle specific parts of ShaderData, moved into this one
@@ -32,8 +34,14 @@ CCL_NAMESPACE_BEGIN
* normals */
/* return 3 triangle vertex normals */
-ccl_device_noinline void motion_triangle_shader_setup(
- KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool is_local)
+ccl_device_noinline void motion_triangle_shader_setup(const KernelGlobals *kg,
+ ShaderData *sd,
+ const float3 P,
+ const float3 D,
+ const float ray_t,
+ const int isect_object,
+ const int isect_prim,
+ bool is_local)
{
/* Get shader. */
sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
@@ -63,12 +71,12 @@ ccl_device_noinline void motion_triangle_shader_setup(
/* Compute refined position. */
#ifdef __BVH_LOCAL__
if (is_local) {
- sd->P = motion_triangle_refine_local(kg, sd, isect, ray, verts);
+ sd->P = motion_triangle_refine_local(kg, sd, P, D, ray_t, isect_object, isect_prim, verts);
}
else
#endif /* __BVH_LOCAL__*/
{
- sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
+ sd->P = motion_triangle_refine(kg, sd, P, D, ray_t, isect_object, isect_prim, verts);
}
/* Compute face normal. */
float3 Ng;
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index fe73335a335..7d6ad7b4fe3 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -22,6 +22,8 @@
* directly primitives in the BVH with world space locations applied, and the object
* ID is looked up afterwards. */
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Object attributes, for now a fixed size and contents */
@@ -35,7 +37,7 @@ enum ObjectVectorTransform { OBJECT_PASS_MOTION_PRE = 0, OBJECT_PASS_MOTION_POST
/* Object to world space transformation */
-ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_transform(const KernelGlobals *kg,
int object,
enum ObjectTransform type)
{
@@ -49,7 +51,7 @@ ccl_device_inline Transform object_fetch_transform(KernelGlobals *kg,
/* Lamp to world space transformation */
-ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bool inverse)
+ccl_device_inline Transform lamp_fetch_transform(const KernelGlobals *kg, int lamp, bool inverse)
{
if (inverse) {
return kernel_tex_fetch(__lights, lamp).itfm;
@@ -61,7 +63,7 @@ ccl_device_inline Transform lamp_fetch_transform(KernelGlobals *kg, int lamp, bo
/* Object to world space transformation for motion vectors */
-ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_motion_pass_transform(const KernelGlobals *kg,
int object,
enum ObjectVectorTransform type)
{
@@ -72,7 +74,7 @@ ccl_device_inline Transform object_fetch_motion_pass_transform(KernelGlobals *kg
/* Motion blurred object transformations */
#ifdef __OBJECT_MOTION__
-ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_transform_motion(const KernelGlobals *kg,
int object,
float time)
{
@@ -86,7 +88,7 @@ ccl_device_inline Transform object_fetch_transform_motion(KernelGlobals *kg,
return tfm;
}
-ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg,
+ccl_device_inline Transform object_fetch_transform_motion_test(const KernelGlobals *kg,
int object,
float time,
Transform *itfm)
@@ -111,45 +113,79 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
}
#endif
+/* Get transform matrix for shading point. */
+
+ccl_device_inline Transform object_get_transform(const KernelGlobals *kg, const ShaderData *sd)
+{
+#ifdef __OBJECT_MOTION__
+ return (sd->object_flag & SD_OBJECT_MOTION) ?
+ sd->ob_tfm_motion :
+ object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+#else
+ return object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+#endif
+}
+
+ccl_device_inline Transform object_get_inverse_transform(const KernelGlobals *kg,
+ const ShaderData *sd)
+{
+#ifdef __OBJECT_MOTION__
+ return (sd->object_flag & SD_OBJECT_MOTION) ?
+ sd->ob_itfm_motion :
+ object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+#else
+ return object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+#endif
+}
/* Transform position from object to world space */
-ccl_device_inline void object_position_transform(KernelGlobals *kg,
+ccl_device_inline void object_position_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *P)
{
#ifdef __OBJECT_MOTION__
- *P = transform_point_auto(&sd->ob_tfm, *P);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *P = transform_point_auto(&sd->ob_tfm_motion, *P);
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*P = transform_point(&tfm, *P);
-#endif
}
/* Transform position from world to object space */
-ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg,
+ccl_device_inline void object_inverse_position_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *P)
{
#ifdef __OBJECT_MOTION__
- *P = transform_point_auto(&sd->ob_itfm, *P);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *P = transform_point_auto(&sd->ob_itfm_motion, *P);
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
*P = transform_point(&tfm, *P);
-#endif
}
/* Transform normal from world to object space */
-ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg,
+ccl_device_inline void object_inverse_normal_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *N)
{
#ifdef __OBJECT_MOTION__
- if ((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) {
- *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm, *N));
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ if ((sd->object != OBJECT_NONE) || (sd->type == PRIMITIVE_LAMP)) {
+ *N = normalize(transform_direction_transposed_auto(&sd->ob_tfm_motion, *N));
+ }
+ return;
}
-#else
+#endif
+
if (sd->object != OBJECT_NONE) {
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*N = normalize(transform_direction_transposed(&tfm, *N));
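
The new object_get_transform()/object_get_inverse_transform() helpers, and the rewritten transform functions in this hunk, share one dispatch: use the motion-blur transform cached in ShaderData when SD_OBJECT_MOTION is set at runtime, otherwise fetch the static transform, replacing the old compile-time-only #ifdef split. A small usage sketch of the intended call pattern (hypothetical wrappers; only functions defined in this patch are called):

/* Convert a shading position between world and object space without
 * open-coding the __OBJECT_MOTION__ / object_fetch_transform() split at
 * every call site. */
float3 world_to_object_point(const KernelGlobals *kg, const ShaderData *sd, float3 P)
{
  const Transform itfm = object_get_inverse_transform(kg, sd);
  return transform_point(&itfm, P);
}

float3 object_to_world_point(const KernelGlobals *kg, const ShaderData *sd, float3 P)
{
  const Transform tfm = object_get_transform(kg, sd);
  return transform_point(&tfm, P);
}
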
@@ -158,65 +194,79 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg,
Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
*N = normalize(transform_direction_transposed(&tfm, *N));
}
-#endif
}
/* Transform normal from object to world space */
-ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
+ccl_device_inline void object_normal_transform(const KernelGlobals *kg,
+ const ShaderData *sd,
+ float3 *N)
{
#ifdef __OBJECT_MOTION__
- *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm, *N));
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *N = normalize(transform_direction_transposed_auto(&sd->ob_itfm_motion, *N));
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
*N = normalize(transform_direction_transposed(&tfm, *N));
-#endif
}
/* Transform direction vector from object to world space */
-ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
+ccl_device_inline void object_dir_transform(const KernelGlobals *kg,
+ const ShaderData *sd,
+ float3 *D)
{
#ifdef __OBJECT_MOTION__
- *D = transform_direction_auto(&sd->ob_tfm, *D);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *D = transform_direction_auto(&sd->ob_tfm_motion, *D);
+ return;
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
*D = transform_direction(&tfm, *D);
-#endif
}
/* Transform direction vector from world to object space */
-ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg,
+ccl_device_inline void object_inverse_dir_transform(const KernelGlobals *kg,
const ShaderData *sd,
float3 *D)
{
#ifdef __OBJECT_MOTION__
- *D = transform_direction_auto(&sd->ob_itfm, *D);
-#else
- Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
- *D = transform_direction(&tfm, *D);
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ *D = transform_direction_auto(&sd->ob_itfm_motion, *D);
+ return;
+ }
#endif
+
+ const Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+ *D = transform_direction(&tfm, *D);
}
/* Object center position */
-ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
+ccl_device_inline float3 object_location(const KernelGlobals *kg, const ShaderData *sd)
{
if (sd->object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
#ifdef __OBJECT_MOTION__
- return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
-#else
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ return make_float3(sd->ob_tfm_motion.x.w, sd->ob_tfm_motion.y.w, sd->ob_tfm_motion.z.w);
+ }
+#endif
+
Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
-#endif
}
/* Color of the object */
-ccl_device_inline float3 object_color(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_color(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -227,7 +277,7 @@ ccl_device_inline float3 object_color(KernelGlobals *kg, int object)
/* Pass ID number of object */
-ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
+ccl_device_inline float object_pass_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0.0f;
@@ -237,7 +287,7 @@ ccl_device_inline float object_pass_id(KernelGlobals *kg, int object)
/* Per lamp random number for shader variation */
-ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp)
+ccl_device_inline float lamp_random_number(const KernelGlobals *kg, int lamp)
{
if (lamp == LAMP_NONE)
return 0.0f;
@@ -247,7 +297,7 @@ ccl_device_inline float lamp_random_number(KernelGlobals *kg, int lamp)
/* Per object random number for shader variation */
-ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
+ccl_device_inline float object_random_number(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0.0f;
@@ -257,7 +307,7 @@ ccl_device_inline float object_random_number(KernelGlobals *kg, int object)
/* Particle ID from which this object was generated */
-ccl_device_inline int object_particle_id(KernelGlobals *kg, int object)
+ccl_device_inline int object_particle_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0;
@@ -267,7 +317,7 @@ ccl_device_inline int object_particle_id(KernelGlobals *kg, int object)
/* Generated texture coordinate on surface from where object was instanced */
-ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_dupli_generated(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -279,7 +329,7 @@ ccl_device_inline float3 object_dupli_generated(KernelGlobals *kg, int object)
/* UV texture coordinate on surface from where object was instanced */
-ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
+ccl_device_inline float3 object_dupli_uv(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return make_float3(0.0f, 0.0f, 0.0f);
@@ -291,7 +341,7 @@ ccl_device_inline float3 object_dupli_uv(KernelGlobals *kg, int object)
/* Information about mesh for motion blurred triangles and curves */
ccl_device_inline void object_motion_info(
- KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys)
+ const KernelGlobals *kg, int object, int *numsteps, int *numverts, int *numkeys)
{
if (numkeys) {
*numkeys = kernel_tex_fetch(__objects, object).numkeys;
@@ -305,7 +355,7 @@ ccl_device_inline void object_motion_info(
/* Offset to an objects patch map */
-ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
+ccl_device_inline uint object_patch_map_offset(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0;
@@ -315,7 +365,7 @@ ccl_device_inline uint object_patch_map_offset(KernelGlobals *kg, int object)
/* Volume step size */
-ccl_device_inline float object_volume_density(KernelGlobals *kg, int object)
+ccl_device_inline float object_volume_density(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE) {
return 1.0f;
@@ -324,7 +374,7 @@ ccl_device_inline float object_volume_density(KernelGlobals *kg, int object)
return kernel_tex_fetch(__objects, object).volume_density;
}
-ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object)
+ccl_device_inline float object_volume_step_size(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE) {
return kernel_data.background.volume_step_size;
@@ -335,14 +385,14 @@ ccl_device_inline float object_volume_step_size(KernelGlobals *kg, int object)
/* Pass ID for shader */
-ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
+ccl_device int shader_pass_id(const KernelGlobals *kg, const ShaderData *sd)
{
return kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).pass_id;
}
/* Cryptomatte ID */
-ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object)
+ccl_device_inline float object_cryptomatte_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0.0f;
@@ -350,7 +400,7 @@ ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object)
return kernel_tex_fetch(__objects, object).cryptomatte_object;
}
-ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int object)
+ccl_device_inline float object_cryptomatte_asset_id(const KernelGlobals *kg, int object)
{
if (object == OBJECT_NONE)
return 0;
@@ -360,42 +410,42 @@ ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int objec
/* Particle data from which object was instanced */
-ccl_device_inline uint particle_index(KernelGlobals *kg, int particle)
+ccl_device_inline uint particle_index(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).index;
}
-ccl_device float particle_age(KernelGlobals *kg, int particle)
+ccl_device float particle_age(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).age;
}
-ccl_device float particle_lifetime(KernelGlobals *kg, int particle)
+ccl_device float particle_lifetime(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).lifetime;
}
-ccl_device float particle_size(KernelGlobals *kg, int particle)
+ccl_device float particle_size(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).size;
}
-ccl_device float4 particle_rotation(KernelGlobals *kg, int particle)
+ccl_device float4 particle_rotation(const KernelGlobals *kg, int particle)
{
return kernel_tex_fetch(__particles, particle).rotation;
}
-ccl_device float3 particle_location(KernelGlobals *kg, int particle)
+ccl_device float3 particle_location(const KernelGlobals *kg, int particle)
{
return float4_to_float3(kernel_tex_fetch(__particles, particle).location);
}
-ccl_device float3 particle_velocity(KernelGlobals *kg, int particle)
+ccl_device float3 particle_velocity(const KernelGlobals *kg, int particle)
{
return float4_to_float3(kernel_tex_fetch(__particles, particle).velocity);
}
-ccl_device float3 particle_angular_velocity(KernelGlobals *kg, int particle)
+ccl_device float3 particle_angular_velocity(const KernelGlobals *kg, int particle)
{
return float4_to_float3(kernel_tex_fetch(__particles, particle).angular_velocity);
}
@@ -418,7 +468,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)
/* Transform ray into object space to enter static object in BVH */
ccl_device_inline float bvh_instance_push(
- KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float t)
+ const KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir)
{
Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
@@ -428,17 +478,18 @@ ccl_device_inline float bvh_instance_push(
*dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
*idir = bvh_inverse_direction(*dir);
- if (t != FLT_MAX) {
- t *= len;
- }
-
- return t;
+ return len;
}
/* Transform ray to exit static object in BVH. */
-ccl_device_inline float bvh_instance_pop(
- KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float t)
+ccl_device_inline float bvh_instance_pop(const KernelGlobals *kg,
+ int object,
+ const Ray *ray,
+ float3 *P,
+ float3 *dir,
+ float3 *idir,
+ float t)
{
if (t != FLT_MAX) {
Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
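
bvh_instance_push() above (and bvh_instance_motion_push() below) now return the object-space length of the transformed ray direction rather than a pre-scaled t, moving the FLT_MAX special case to the caller. A hedged sketch of what a traversal loop is then expected to do on instance entry (the caller shown is illustrative, not the actual BVH traversal code):

/* On entering an instance, scale the current search interval into object
 * space with the returned length factor; an unbounded ray stays unbounded. */
const float len = bvh_instance_push(kg, object, ray, &P, &dir, &idir);
if (isect_t != FLT_MAX) {
  isect_t *= len; /* The multiply the helper used to perform internally. */
}
/* ... traverse the instance BVH with the object-space isect_t ... */
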
@@ -454,7 +505,7 @@ ccl_device_inline float bvh_instance_pop(
/* Same as above, but returns scale factor to apply to multiple intersection distances */
-ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg,
+ccl_device_inline void bvh_instance_pop_factor(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
@@ -473,13 +524,12 @@ ccl_device_inline void bvh_instance_pop_factor(KernelGlobals *kg,
#ifdef __OBJECT_MOTION__
/* Transform ray into object space to enter motion blurred object in BVH */
-ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
+ccl_device_inline float bvh_instance_motion_push(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
float3 *dir,
float3 *idir,
- float t,
Transform *itfm)
{
object_fetch_transform_motion_test(kg, object, ray->time, itfm);
@@ -490,16 +540,12 @@ ccl_device_inline float bvh_instance_motion_push(KernelGlobals *kg,
*dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len));
*idir = bvh_inverse_direction(*dir);
- if (t != FLT_MAX) {
- t *= len;
- }
-
- return t;
+ return len;
}
/* Transform ray to exit motion blurred object in BVH. */
-ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
+ccl_device_inline float bvh_instance_motion_pop(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
@@ -521,7 +567,7 @@ ccl_device_inline float bvh_instance_motion_pop(KernelGlobals *kg,
/* Same as above, but returns scale factor to apply to multiple intersection distances */
-ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg,
+ccl_device_inline void bvh_instance_motion_pop_factor(const KernelGlobals *kg,
int object,
const Ray *ray,
float3 *P,
@@ -538,48 +584,11 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg,
#endif
-/* TODO(sergey): This is only for until we've got OpenCL 2.0
- * on all devices we consider supported. It'll be replaced with
- * generic address space.
- */
+/* TODO: This can be removed once we know that no devices require explicit
+ * address space qualifiers for this case. */
-#ifdef __KERNEL_OPENCL__
-ccl_device_inline void object_position_transform_addrspace(KernelGlobals *kg,
- const ShaderData *sd,
- ccl_addr_space float3 *P)
-{
- float3 private_P = *P;
- object_position_transform(kg, sd, &private_P);
- *P = private_P;
-}
-
-ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg,
- const ShaderData *sd,
- ccl_addr_space float3 *D)
-{
- float3 private_D = *D;
- object_dir_transform(kg, sd, &private_D);
- *D = private_D;
-}
-
-ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg,
- const ShaderData *sd,
- ccl_addr_space float3 *N)
-{
- float3 private_N = *N;
- object_normal_transform(kg, sd, &private_N);
- *N = private_N;
-}
-#endif
-
-#ifndef __KERNEL_OPENCL__
-# define object_position_transform_auto object_position_transform
-# define object_dir_transform_auto object_dir_transform
-# define object_normal_transform_auto object_normal_transform
-#else
-# define object_position_transform_auto object_position_transform_addrspace
-# define object_dir_transform_auto object_dir_transform_addrspace
-# define object_normal_transform_auto object_normal_transform_addrspace
-#endif
+#define object_position_transform_auto object_position_transform
+#define object_dir_transform_auto object_dir_transform
+#define object_normal_transform_auto object_normal_transform
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/geom/geom_patch.h b/intern/cycles/kernel/geom/geom_patch.h
index 9c1768f05db..ce0fc15f196 100644
--- a/intern/cycles/kernel/geom/geom_patch.h
+++ b/intern/cycles/kernel/geom/geom_patch.h
@@ -24,6 +24,8 @@
* language governing permissions and limitations under the Apache License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
typedef struct PatchHandle {
@@ -60,7 +62,7 @@ ccl_device_inline int patch_map_resolve_quadrant(float median, float *u, float *
/* retrieve PatchHandle from patch coords */
ccl_device_inline PatchHandle
-patch_map_find_patch(KernelGlobals *kg, int object, int patch, float u, float v)
+patch_map_find_patch(const KernelGlobals *kg, int object, int patch, float u, float v)
{
PatchHandle handle;
@@ -191,7 +193,7 @@ ccl_device_inline void patch_eval_normalize_coords(uint patch_bits, float *u, fl
/* retrieve patch control indices */
-ccl_device_inline int patch_eval_indices(KernelGlobals *kg,
+ccl_device_inline int patch_eval_indices(const KernelGlobals *kg,
const PatchHandle *handle,
int channel,
int indices[PATCH_MAX_CONTROL_VERTS])
@@ -208,7 +210,7 @@ ccl_device_inline int patch_eval_indices(KernelGlobals *kg,
/* evaluate patch basis functions */
-ccl_device_inline void patch_eval_basis(KernelGlobals *kg,
+ccl_device_inline void patch_eval_basis(const KernelGlobals *kg,
const PatchHandle *handle,
float u,
float v,
@@ -247,7 +249,7 @@ ccl_device_inline void patch_eval_basis(KernelGlobals *kg,
/* generic function for evaluating indices and weights from patch coords */
-ccl_device_inline int patch_eval_control_verts(KernelGlobals *kg,
+ccl_device_inline int patch_eval_control_verts(const KernelGlobals *kg,
int object,
int patch,
float u,
@@ -269,7 +271,7 @@ ccl_device_inline int patch_eval_control_verts(KernelGlobals *kg,
/* functions for evaluating attributes on patches */
-ccl_device float patch_eval_float(KernelGlobals *kg,
+ccl_device float patch_eval_float(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -306,7 +308,7 @@ ccl_device float patch_eval_float(KernelGlobals *kg,
return val;
}
-ccl_device float2 patch_eval_float2(KernelGlobals *kg,
+ccl_device float2 patch_eval_float2(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -343,7 +345,7 @@ ccl_device float2 patch_eval_float2(KernelGlobals *kg,
return val;
}
-ccl_device float3 patch_eval_float3(KernelGlobals *kg,
+ccl_device float3 patch_eval_float3(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -380,7 +382,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals *kg,
return val;
}
-ccl_device float4 patch_eval_float4(KernelGlobals *kg,
+ccl_device float4 patch_eval_float4(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
@@ -417,7 +419,7 @@ ccl_device float4 patch_eval_float4(KernelGlobals *kg,
return val;
}
-ccl_device float4 patch_eval_uchar4(KernelGlobals *kg,
+ccl_device float4 patch_eval_uchar4(const KernelGlobals *kg,
const ShaderData *sd,
int offset,
int patch,
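
patch_map_find_patch() above resolves (u, v) to a PatchHandle by walking a quadtree, using patch_map_resolve_quadrant() to pick a child and remap the coordinates at each level. An illustrative descent step in that spirit (the quadrant numbering here is arbitrary; the real patch map follows OpenSubdiv's ordering):

/* Pick the child quadrant containing (u, v) and remap the coordinates into
 * it. Halving median and repeating descends one quadtree level per call
 * until a leaf patch handle is reached. */
int quadrant_descend(float *u, float *v, const float median)
{
  int quadrant = 0;
  if (*u >= median) {
    *u -= median;
    quadrant |= 1;
  }
  if (*v >= median) {
    *v -= median;
    quadrant |= 2;
  }
  return quadrant;
}
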
diff --git a/intern/cycles/kernel/geom/geom_primitive.h b/intern/cycles/kernel/geom/geom_primitive.h
index aeb044c9ad3..ba31b12e817 100644
--- a/intern/cycles/kernel/geom/geom_primitive.h
+++ b/intern/cycles/kernel/geom/geom_primitive.h
@@ -19,6 +19,10 @@
* Generic functions to look up mesh, curve and volume primitive attributes for
* shading and render passes. */
+#pragma once
+
+#include "kernel/kernel_projection.h"
+
CCL_NAMESPACE_BEGIN
/* Surface Attributes
@@ -27,8 +31,11 @@ CCL_NAMESPACE_BEGIN
* attributes for performance, mainly for GPU performance to avoid bringing in
* heavy volume interpolation code. */
-ccl_device_inline float primitive_surface_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device_inline float primitive_surface_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
if (subd_triangle_patch(kg, sd) == ~0)
@@ -50,7 +57,7 @@ ccl_device_inline float primitive_surface_attribute_float(
}
}
-ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg,
+ccl_device_inline float2 primitive_surface_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -76,7 +83,7 @@ ccl_device_inline float2 primitive_surface_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals *kg,
+ccl_device_inline float3 primitive_surface_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -102,11 +109,11 @@ ccl_device_inline float3 primitive_surface_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device_inline float4 primitive_surface_attribute_float4(KernelGlobals *kg,
- const ShaderData *sd,
- const AttributeDescriptor desc,
- float4 *dx,
- float4 *dy)
+ccl_device_forceinline float4 primitive_surface_attribute_float4(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float4 *dx,
+ float4 *dy)
{
if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
if (subd_triangle_patch(kg, sd) == ~0)
@@ -141,7 +148,7 @@ ccl_device_inline bool primitive_is_volume_attribute(const ShaderData *sd,
return sd->type == PRIMITIVE_VOLUME;
}
-ccl_device_inline float primitive_volume_attribute_float(KernelGlobals *kg,
+ccl_device_inline float primitive_volume_attribute_float(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
@@ -153,7 +160,7 @@ ccl_device_inline float primitive_volume_attribute_float(KernelGlobals *kg,
}
}
-ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals *kg,
+ccl_device_inline float3 primitive_volume_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
@@ -165,7 +172,7 @@ ccl_device_inline float3 primitive_volume_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals *kg,
+ccl_device_inline float4 primitive_volume_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
@@ -180,7 +187,7 @@ ccl_device_inline float4 primitive_volume_attribute_float4(KernelGlobals *kg,
/* Default UV coordinate */
-ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float3 primitive_uv(const KernelGlobals *kg, const ShaderData *sd)
{
const AttributeDescriptor desc = find_attribute(kg, sd, ATTR_STD_UV);
@@ -193,7 +200,7 @@ ccl_device_inline float3 primitive_uv(KernelGlobals *kg, ShaderData *sd)
/* Ptex coordinates */
-ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id)
+ccl_device bool primitive_ptex(const KernelGlobals *kg, ShaderData *sd, float2 *uv, int *face_id)
{
/* storing ptex data as attributes is not memory efficient but simple for tests */
const AttributeDescriptor desc_face_id = find_attribute(kg, sd, ATTR_STD_PTEX_FACE_ID);
@@ -213,7 +220,7 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
/* Surface tangent */
-ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 primitive_tangent(const KernelGlobals *kg, ShaderData *sd)
{
#ifdef __HAIR__
if (sd->type & PRIMITIVE_ALL_CURVE)
@@ -245,7 +252,7 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
/* Motion vector for motion pass */
-ccl_device_inline float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float4 primitive_motion_vector(const KernelGlobals *kg, const ShaderData *sd)
{
/* center position */
float3 center;
diff --git a/intern/cycles/kernel/geom/geom_shader_data.h b/intern/cycles/kernel/geom/geom_shader_data.h
new file mode 100644
index 00000000000..fb2cb5cb1ea
--- /dev/null
+++ b/intern/cycles/kernel/geom/geom_shader_data.h
@@ -0,0 +1,373 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Functions to initialize ShaderData, given an incoming ray, intersection or
+ * sampled position. */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* ShaderData setup from incoming ray */
+
+#ifdef __OBJECT_MOTION__
+ccl_device void shader_setup_object_transforms(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ float time)
+{
+ if (sd->object_flag & SD_OBJECT_MOTION) {
+ sd->ob_tfm_motion = object_fetch_transform_motion(kg, sd->object, time);
+ sd->ob_itfm_motion = transform_quick_inverse(sd->ob_tfm_motion);
+ }
+}
+#endif
+
+/* TODO: break this up if it helps reduce register pressure, loading data
+ * from global memory as we write it to ShaderData. */
+ccl_device_inline void shader_setup_from_ray(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const Ray *ccl_restrict ray,
+ const Intersection *ccl_restrict isect)
+{
+ /* Read intersection data into shader globals.
+ *
+ * TODO: this is redundant, could potentially remove some of this from
+ * ShaderData but would need to ensure that it also works for shadow
+ * shader evaluation. */
+ sd->u = isect->u;
+ sd->v = isect->v;
+ sd->ray_length = isect->t;
+ sd->type = isect->type;
+ sd->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) :
+ isect->object;
+ sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
+ sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
+ sd->lamp = LAMP_NONE;
+ sd->flag = 0;
+
+ /* Read matrices and time. */
+ sd->time = ray->time;
+
+#ifdef __OBJECT_MOTION__
+ shader_setup_object_transforms(kg, sd, ray->time);
+#endif
+
+ /* Read ray data into shader globals. */
+ sd->I = -ray->D;
+
+#ifdef __HAIR__
+ if (sd->type & PRIMITIVE_ALL_CURVE) {
+ /* curve */
+ curve_shader_setup(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
+ }
+ else
+#endif
+ if (sd->type & PRIMITIVE_TRIANGLE) {
+ /* static triangle */
+ float3 Ng = triangle_normal(kg, sd);
+ sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
+
+ /* vectors */
+ sd->P = triangle_refine(kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim);
+ sd->Ng = Ng;
+ sd->N = Ng;
+
+ /* smooth normal */
+ if (sd->shader & SHADER_SMOOTH_NORMAL)
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
+
+#ifdef __DPDU__
+ /* dPdu/dPdv */
+ triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+#endif
+ }
+ else {
+ /* motion triangle */
+ motion_triangle_shader_setup(
+ kg, sd, ray->P, ray->D, isect->t, isect->object, isect->prim, false);
+ }
+
+ sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+
+ if (isect->object != OBJECT_NONE) {
+ /* instance transform */
+ object_normal_transform_auto(kg, sd, &sd->N);
+ object_normal_transform_auto(kg, sd, &sd->Ng);
+#ifdef __DPDU__
+ object_dir_transform_auto(kg, sd, &sd->dPdu);
+ object_dir_transform_auto(kg, sd, &sd->dPdv);
+#endif
+ }
+
+ /* backfacing test */
+ bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+
+ if (backfacing) {
+ sd->flag |= SD_BACKFACING;
+ sd->Ng = -sd->Ng;
+ sd->N = -sd->N;
+#ifdef __DPDU__
+ sd->dPdu = -sd->dPdu;
+ sd->dPdv = -sd->dPdv;
+#endif
+ }
+
+#ifdef __RAY_DIFFERENTIALS__
+ /* differentials */
+ differential_transfer_compact(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, sd->ray_length);
+ differential_incoming_compact(&sd->dI, ray->D, ray->dD);
+ differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
+#endif
+}
+
+/* ShaderData setup from position sampled on mesh */
+
+ccl_device_inline void shader_setup_from_sample(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const float3 P,
+ const float3 Ng,
+ const float3 I,
+ int shader,
+ int object,
+ int prim,
+ float u,
+ float v,
+ float t,
+ float time,
+ bool object_space,
+ int lamp)
+{
+ /* vectors */
+ sd->P = P;
+ sd->N = Ng;
+ sd->Ng = Ng;
+ sd->I = I;
+ sd->shader = shader;
+ if (prim != PRIM_NONE)
+ sd->type = PRIMITIVE_TRIANGLE;
+ else if (lamp != LAMP_NONE)
+ sd->type = PRIMITIVE_LAMP;
+ else
+ sd->type = PRIMITIVE_NONE;
+
+ /* primitive */
+ sd->object = object;
+ sd->lamp = LAMP_NONE;
+ /* Currently no access to bvh prim index for strand sd->prim. */
+ sd->prim = prim;
+ sd->u = u;
+ sd->v = v;
+ sd->time = time;
+ sd->ray_length = t;
+
+ sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+ sd->object_flag = 0;
+ if (sd->object != OBJECT_NONE) {
+ sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object);
+
+#ifdef __OBJECT_MOTION__
+ shader_setup_object_transforms(kg, sd, time);
+#endif
+ }
+ else if (lamp != LAMP_NONE) {
+ sd->lamp = lamp;
+ }
+
+ /* transform into world space */
+ if (object_space) {
+ object_position_transform_auto(kg, sd, &sd->P);
+ object_normal_transform_auto(kg, sd, &sd->Ng);
+ sd->N = sd->Ng;
+ object_dir_transform_auto(kg, sd, &sd->I);
+ }
+
+ if (sd->type & PRIMITIVE_TRIANGLE) {
+ /* smooth normal */
+ if (sd->shader & SHADER_SMOOTH_NORMAL) {
+ sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
+
+ if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+ object_normal_transform_auto(kg, sd, &sd->N);
+ }
+ }
+
+ /* dPdu/dPdv */
+#ifdef __DPDU__
+ triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
+
+ if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+ object_dir_transform_auto(kg, sd, &sd->dPdu);
+ object_dir_transform_auto(kg, sd, &sd->dPdv);
+ }
+#endif
+ }
+ else {
+#ifdef __DPDU__
+ sd->dPdu = zero_float3();
+ sd->dPdv = zero_float3();
+#endif
+ }
+
+ /* backfacing test */
+ if (sd->prim != PRIM_NONE) {
+ bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
+
+ if (backfacing) {
+ sd->flag |= SD_BACKFACING;
+ sd->Ng = -sd->Ng;
+ sd->N = -sd->N;
+#ifdef __DPDU__
+ sd->dPdu = -sd->dPdu;
+ sd->dPdv = -sd->dPdv;
+#endif
+ }
+ }
+
+#ifdef __RAY_DIFFERENTIALS__
+ /* no ray differentials here yet */
+ sd->dP = differential3_zero();
+ sd->dI = differential3_zero();
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
+#endif
+}
+
+/* ShaderData setup for displacement */
+
+ccl_device void shader_setup_from_displace(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ int object,
+ int prim,
+ float u,
+ float v)
+{
+ float3 P, Ng, I = zero_float3();
+ int shader;
+
+ triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
+
+ /* force smooth shading for displacement */
+ shader |= SHADER_SMOOTH_NORMAL;
+
+ shader_setup_from_sample(
+ kg,
+ sd,
+ P,
+ Ng,
+ I,
+ shader,
+ object,
+ prim,
+ u,
+ v,
+ 0.0f,
+ 0.5f,
+ !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
+ LAMP_NONE);
+}
+
+/* ShaderData setup from ray into background */
+
+ccl_device_inline void shader_setup_from_background(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const float3 ray_P,
+ const float3 ray_D,
+ const float ray_time)
+{
+ /* for NDC coordinates */
+ sd->ray_P = ray_P;
+
+ /* vectors */
+ sd->P = ray_D;
+ sd->N = -ray_D;
+ sd->Ng = -ray_D;
+ sd->I = -ray_D;
+ sd->shader = kernel_data.background.surface_shader;
+ sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
+ sd->object_flag = 0;
+ sd->time = ray_time;
+ sd->ray_length = 0.0f;
+
+ sd->object = OBJECT_NONE;
+ sd->lamp = LAMP_NONE;
+ sd->prim = PRIM_NONE;
+ sd->u = 0.0f;
+ sd->v = 0.0f;
+
+#ifdef __DPDU__
+ /* dPdu/dPdv */
+ sd->dPdu = zero_float3();
+ sd->dPdv = zero_float3();
+#endif
+
+#ifdef __RAY_DIFFERENTIALS__
+ /* differentials */
+ sd->dP = differential3_zero(); /* TODO: ray->dP */
+ differential_incoming(&sd->dI, sd->dP);
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
+#endif
+}
+
+/* ShaderData setup from point inside volume */
+
+#ifdef __VOLUME__
+ccl_device_inline void shader_setup_from_volume(const KernelGlobals *ccl_restrict kg,
+ ShaderData *ccl_restrict sd,
+ const Ray *ccl_restrict ray)
+{
+
+ /* vectors */
+ sd->P = ray->P;
+ sd->N = -ray->D;
+ sd->Ng = -ray->D;
+ sd->I = -ray->D;
+ sd->shader = SHADER_NONE;
+ sd->flag = 0;
+ sd->object_flag = 0;
+ sd->time = ray->time;
+ sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
+
+ sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */
+ sd->lamp = LAMP_NONE;
+ sd->prim = PRIM_NONE;
+ sd->type = PRIMITIVE_VOLUME;
+
+ sd->u = 0.0f;
+ sd->v = 0.0f;
+
+# ifdef __DPDU__
+ /* dPdu/dPdv */
+ sd->dPdu = zero_float3();
+ sd->dPdv = zero_float3();
+# endif
+
+# ifdef __RAY_DIFFERENTIALS__
+ /* differentials */
+ sd->dP = differential3_zero(); /* TODO ray->dD */
+ differential_incoming(&sd->dI, sd->dP);
+ sd->du = differential_zero();
+ sd->dv = differential_zero();
+# endif
+
+ /* for NDC coordinates */
+ sd->ray_P = ray->P;
+ sd->ray_dP = ray->dP;
+}
+#endif /* __VOLUME__ */
+
+CCL_NAMESPACE_END
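
The new header gathers every ShaderData initialization path in one place. A sketch of the expected call order on the hit path (the integrator-side caller shown is illustrative; only the setup functions defined in this file are assumed):

/* After BVH traversal produced an Intersection, populate ShaderData before
 * evaluating surface shaders; a miss takes the background path instead. */
if (hit) {
  shader_setup_from_ray(kg, &sd, &ray, &isect);
  /* ... evaluate surface shader, sample BSDF, continue path ... */
}
else {
  shader_setup_from_background(kg, &sd, ray.P, ray.D, ray.time);
  /* ... evaluate background shader ... */
}
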
diff --git a/intern/cycles/kernel/geom/geom_subd_triangle.h b/intern/cycles/kernel/geom/geom_subd_triangle.h
index 9eceb996926..877b2ece15b 100644
--- a/intern/cycles/kernel/geom/geom_subd_triangle.h
+++ b/intern/cycles/kernel/geom/geom_subd_triangle.h
@@ -16,18 +16,20 @@
/* Functions for retrieving attributes on triangles produced from subdivision meshes */
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Patch index for triangle, -1 if not subdivision triangle */
-ccl_device_inline uint subd_triangle_patch(KernelGlobals *kg, const ShaderData *sd)
+ccl_device_inline uint subd_triangle_patch(const KernelGlobals *kg, const ShaderData *sd)
{
return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
}
/* UV coords of triangle within patch */
-ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg,
+ccl_device_inline void subd_triangle_patch_uv(const KernelGlobals *kg,
const ShaderData *sd,
float2 uv[3])
{
@@ -40,7 +42,7 @@ ccl_device_inline void subd_triangle_patch_uv(KernelGlobals *kg,
/* Vertex indices of patch */
-ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals *kg, int patch)
+ccl_device_inline uint4 subd_triangle_patch_indices(const KernelGlobals *kg, int patch)
{
uint4 indices;
@@ -54,21 +56,23 @@ ccl_device_inline uint4 subd_triangle_patch_indices(KernelGlobals *kg, int patch
/* Originating face for patch */
-ccl_device_inline uint subd_triangle_patch_face(KernelGlobals *kg, int patch)
+ccl_device_inline uint subd_triangle_patch_face(const KernelGlobals *kg, int patch)
{
return kernel_tex_fetch(__patches, patch + 4);
}
/* Number of corners on originating face */
-ccl_device_inline uint subd_triangle_patch_num_corners(KernelGlobals *kg, int patch)
+ccl_device_inline uint subd_triangle_patch_num_corners(const KernelGlobals *kg, int patch)
{
return kernel_tex_fetch(__patches, patch + 5) & 0xffff;
}
/* Indices of the four corners that are used by the patch */
-ccl_device_inline void subd_triangle_patch_corners(KernelGlobals *kg, int patch, int corners[4])
+ccl_device_inline void subd_triangle_patch_corners(const KernelGlobals *kg,
+ int patch,
+ int corners[4])
{
uint4 data;
@@ -99,8 +103,11 @@ ccl_device_inline void subd_triangle_patch_corners(KernelGlobals *kg, int patch,
/* Reading attributes on various subdivision triangle elements */
-ccl_device_noinline float subd_triangle_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device_noinline float subd_triangle_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
int patch = subd_triangle_patch(kg, sd);
@@ -235,7 +242,7 @@ ccl_device_noinline float subd_triangle_attribute_float(
}
}
-ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg,
+ccl_device_noinline float2 subd_triangle_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -378,7 +385,7 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
+ccl_device_noinline float3 subd_triangle_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -520,7 +527,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals *kg,
+ccl_device_noinline float4 subd_triangle_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float4 *dx,
diff --git a/intern/cycles/kernel/geom/geom_triangle.h b/intern/cycles/kernel/geom/geom_triangle.h
index ff7909ca425..910fb122c6d 100644
--- a/intern/cycles/kernel/geom/geom_triangle.h
+++ b/intern/cycles/kernel/geom/geom_triangle.h
@@ -20,10 +20,12 @@
* ray intersection we use a precomputed triangle storage to accelerate
* intersection at the cost of more memory usage */
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* Normal on triangle. */
-ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
+ccl_device_inline float3 triangle_normal(const KernelGlobals *kg, ShaderData *sd)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
@@ -41,8 +43,14 @@ ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
}
/* Point and normal on triangle. */
-ccl_device_inline void triangle_point_normal(
- KernelGlobals *kg, int object, int prim, float u, float v, float3 *P, float3 *Ng, int *shader)
+ccl_device_inline void triangle_point_normal(const KernelGlobals *kg,
+ int object,
+ int prim,
+ float u,
+ float v,
+ float3 *P,
+ float3 *Ng,
+ int *shader)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -67,7 +75,7 @@ ccl_device_inline void triangle_point_normal(
/* Triangle vertex locations */
-ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3])
+ccl_device_inline void triangle_vertices(const KernelGlobals *kg, int prim, float3 P[3])
{
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
P[0] = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex.w + 0));
@@ -77,7 +85,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
/* Triangle vertex locations and vertex normals */
-ccl_device_inline void triangle_vertices_and_normals(KernelGlobals *kg,
+ccl_device_inline void triangle_vertices_and_normals(const KernelGlobals *kg,
int prim,
float3 P[3],
float3 N[3])
@@ -94,7 +102,7 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals *kg,
/* Interpolate smooth vertex normal from vertices */
ccl_device_inline float3
-triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
+triangle_smooth_normal(const KernelGlobals *kg, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -108,7 +116,7 @@ triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
}
ccl_device_inline float3 triangle_smooth_normal_unnormalized(
- KernelGlobals *kg, ShaderData *sd, float3 Ng, int prim, float u, float v)
+ const KernelGlobals *kg, const ShaderData *sd, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -130,7 +138,7 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized(
/* Ray differentials on triangle */
-ccl_device_inline void triangle_dPdudv(KernelGlobals *kg,
+ccl_device_inline void triangle_dPdudv(const KernelGlobals *kg,
int prim,
ccl_addr_space float3 *dPdu,
ccl_addr_space float3 *dPdv)
@@ -148,8 +156,11 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals *kg,
/* Reading attributes on various triangle elements */
-ccl_device float triangle_attribute_float(
- KernelGlobals *kg, const ShaderData *sd, const AttributeDescriptor desc, float *dx, float *dy)
+ccl_device float triangle_attribute_float(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const AttributeDescriptor desc,
+ float *dx,
+ float *dy)
{
if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION | ATTR_ELEMENT_CORNER)) {
float f0, f1, f2;
@@ -195,7 +206,7 @@ ccl_device float triangle_attribute_float(
}
}
-ccl_device float2 triangle_attribute_float2(KernelGlobals *kg,
+ccl_device float2 triangle_attribute_float2(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float2 *dx,
@@ -245,7 +256,7 @@ ccl_device float2 triangle_attribute_float2(KernelGlobals *kg,
}
}
-ccl_device float3 triangle_attribute_float3(KernelGlobals *kg,
+ccl_device float3 triangle_attribute_float3(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float3 *dx,
@@ -295,7 +306,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg,
}
}
-ccl_device float4 triangle_attribute_float4(KernelGlobals *kg,
+ccl_device float4 triangle_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc,
float4 *dx,
diff --git a/intern/cycles/kernel/geom/geom_triangle_intersect.h b/intern/cycles/kernel/geom/geom_triangle_intersect.h
index b0cce274b94..30b77ebd2eb 100644
--- a/intern/cycles/kernel/geom/geom_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/geom_triangle_intersect.h
@@ -20,12 +20,17 @@
* intersection at the cost of more memory usage.
*/
+#pragma once
+
+#include "kernel/kernel_random.h"
+
CCL_NAMESPACE_BEGIN
-ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
+ccl_device_inline bool triangle_intersect(const KernelGlobals *kg,
Intersection *isect,
float3 P,
float3 dir,
+ float tmax,
uint visibility,
int object,
int prim_addr)
@@ -41,7 +46,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
float t, u, v;
if (ray_triangle_intersect(P,
dir,
- isect->t,
+ tmax,
#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
ssef_verts,
#else
@@ -78,7 +83,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals *kg,
*/
#ifdef __BVH_LOCAL__
-ccl_device_inline bool triangle_intersect_local(KernelGlobals *kg,
+ccl_device_inline bool triangle_intersect_local(const KernelGlobals *kg,
LocalIntersection *local_isect,
float3 P,
float3 dir,
@@ -192,25 +197,20 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals *kg,
* http://www.cs.virginia.edu/~gfx/Courses/2003/ImageSynthesis/papers/Acceleration/Fast%20MinimumStorage%20RayTriangle%20Intersection.pdf
*/
-ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
+ccl_device_inline float3 triangle_refine(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim)
{
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
#ifdef __INTERSECTION_REFINE__
- if (isect->object != OBJECT_NONE) {
+ if (isect_object != OBJECT_NONE) {
if (UNLIKELY(t == 0.0f)) {
return P;
}
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D * t);
@@ -219,7 +219,7 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
P = P + D * t;
- const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
+ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect_prim);
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1),
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2);
@@ -239,13 +239,8 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
P = P + D * rt;
}
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
@@ -255,28 +250,23 @@ ccl_device_inline float3 triangle_refine(KernelGlobals *kg,
#endif
}
-/* Same as above, except that isect->t is assumed to be in object space for
+/* Same as above, except that t is assumed to be in object space for
* instancing.
*/
-ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
+ccl_device_inline float3 triangle_refine_local(const KernelGlobals *kg,
ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
+ float3 P,
+ float3 D,
+ float t,
+ const int isect_object,
+ const int isect_prim)
{
#ifdef __KERNEL_OPTIX__
- /* isect->t is always in world space with OptiX. */
- return triangle_refine(kg, sd, isect, ray);
+ /* t is always in world space with OptiX. */
+ return triangle_refine(kg, sd, P, D, t, isect_object, isect_prim);
#else
- float3 P = ray->P;
- float3 D = ray->D;
- float t = isect->t;
-
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_INVERSE_TRANSFORM);
-# endif
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_inverse_transform(kg, sd);
P = transform_point(&tfm, P);
D = transform_direction(&tfm, D);
@@ -286,7 +276,7 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
P = P + D * t;
# ifdef __INTERSECTION_REFINE__
- const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
+ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect_prim);
const float4 tri_a = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0),
tri_b = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1),
tri_c = kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2);
@@ -307,13 +297,8 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals *kg,
}
# endif /* __INTERSECTION_REFINE__ */
- if (isect->object != OBJECT_NONE) {
-# ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-# else
- Transform tfm = object_fetch_transform(kg, isect->object, OBJECT_TRANSFORM);
-# endif
-
+ if (isect_object != OBJECT_NONE) {
+ const Transform tfm = object_get_transform(kg, sd);
P = transform_point(&tfm, P);
}
diff --git a/intern/cycles/kernel/geom/geom_volume.h b/intern/cycles/kernel/geom/geom_volume.h
index 809b76245ba..2bcd7e56b5f 100644
--- a/intern/cycles/kernel/geom/geom_volume.h
+++ b/intern/cycles/kernel/geom/geom_volume.h
@@ -23,13 +23,15 @@
* 3D voxel textures can be assigned as attributes per mesh, which means the
* same shader can be used for volume objects with different densities, etc. */
+#pragma once
+
CCL_NAMESPACE_BEGIN
#ifdef __VOLUME__
/* Return position normalized to 0..1 in mesh bounds */
-ccl_device_inline float3 volume_normalized_position(KernelGlobals *kg,
+ccl_device_inline float3 volume_normalized_position(const KernelGlobals *kg,
const ShaderData *sd,
float3 P)
{
@@ -68,7 +70,7 @@ ccl_device float3 volume_attribute_value_to_float3(const float4 value)
}
}
-ccl_device float4 volume_attribute_float4(KernelGlobals *kg,
+ccl_device float4 volume_attribute_float4(const KernelGlobals *kg,
const ShaderData *sd,
const AttributeDescriptor desc)
{
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_bake.h b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
new file mode 100644
index 00000000000..96db606cee1
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_init_from_bake.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_random.h"
+
+#include "kernel/geom/geom.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* This helps with AA, but it is not a real solution since it does not AA the
+ * geometry itself; still, it is better than nothing, thus committed. */
+ccl_device_inline float bake_clamp_mirror_repeat(float u, float max)
+{
+ /* Use mirror repeat (like an OpenGL texture) so that when the barycentric
+ * coordinate goes past the end of the triangle it is not always clamped
+ * to the same value, which gives ugly patterns. */
+ u /= max;
+ float fu = floorf(u);
+ u = u - fu;
+
+ return ((((int)fu) & 1) ? 1.0f - u : u) * max;
+}
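+
+/* Illustrative note, not part of the original patch: with max = 1.0f,
+ * u = 1.25f maps to 0.75f and u = 2.25f maps to 0.25f, i.e. the value
+ * reflects at every integer multiple of max instead of clamping. */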
+
+/* Return false to indicate that this pixel is finished.
+ * Used by the CPU implementation to avoid sampling a pixel for multiple
+ * samples once it is known that the pixel has converged. */
+ccl_device bool integrator_init_from_bake(INTEGRATOR_STATE_ARGS,
+ const ccl_global KernelWorkTile *ccl_restrict tile,
+ ccl_global float *render_buffer,
+ const int x,
+ const int y,
+ const int scheduled_sample)
+{
+ PROFILING_INIT(kg, PROFILING_RAY_SETUP);
+
+ /* Initialize path state to give basic buffer access and allow early outputs. */
+ path_state_init(INTEGRATOR_STATE_PASS, tile, x, y);
+
+ /* Check whether the pixel has converged and should not be sampled anymore. */
+ if (!kernel_need_sample_pixel(INTEGRATOR_STATE_PASS, render_buffer)) {
+ return false;
+ }
+
+ /* Always count the sample, even if the camera sample will reject the ray. */
+ const int sample = kernel_accum_sample(INTEGRATOR_STATE_PASS, render_buffer, scheduled_sample);
+
+ /* Setup render buffers. */
+ const int index = INTEGRATOR_STATE(path, render_pixel_index);
+ const int pass_stride = kernel_data.film.pass_stride;
+ render_buffer += index * pass_stride;
+
+ ccl_global float *primitive = render_buffer + kernel_data.film.pass_bake_primitive;
+ ccl_global float *differential = render_buffer + kernel_data.film.pass_bake_differential;
+
+ const int seed = __float_as_uint(primitive[0]);
+ int prim = __float_as_uint(primitive[1]);
+ if (prim == -1) {
+ return false;
+ }
+
+ prim += kernel_data.bake.tri_offset;
+
+ /* Random number generator. */
+ const uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed;
+
+ float filter_x, filter_y;
+ if (sample == 0) {
+ filter_x = filter_y = 0.5f;
+ }
+ else {
+ path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_x, &filter_y);
+ }
+
+ /* Initialize path state for path integration. */
+ path_state_init_integrator(INTEGRATOR_STATE_PASS, sample, rng_hash);
+
+ /* Barycentric UV with sub-pixel offset. */
+ float u = primitive[2];
+ float v = primitive[3];
+
+ float dudx = differential[0];
+ float dudy = differential[1];
+ float dvdx = differential[2];
+ float dvdy = differential[3];
+
+ if (sample > 0) {
+ u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f);
+ v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f),
+ 1.0f - u);
+ }
+
+ /* Position and normal on triangle. */
+ float3 P, Ng;
+ int shader;
+ triangle_point_normal(kg, kernel_data.bake.object_index, prim, u, v, &P, &Ng, &shader);
+ if (kernel_data.film.pass_background != PASS_UNUSED) {
+ /* Environment baking. */
+
+ /* Setup and write ray. */
+ Ray ray ccl_optional_struct_init;
+ ray.P = zero_float3();
+ ray.D = normalize(P);
+ ray.t = FLT_MAX;
+ ray.time = 0.5f;
+ ray.dP = differential_zero_compact();
+ ray.dD = differential_zero_compact();
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Setup next kernel to execute. */
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ }
+ else {
+ /* Surface baking. */
+ const float3 N = (shader & SHADER_SMOOTH_NORMAL) ? triangle_smooth_normal(kg, Ng, prim, u, v) :
+ Ng;
+
+ /* Setup ray. */
+ Ray ray ccl_optional_struct_init;
+ ray.P = P + N;
+ ray.D = -N;
+ ray.t = FLT_MAX;
+ ray.time = 0.5f;
+
+ /* Setup differentials. */
+ float3 dPdu, dPdv;
+ triangle_dPdudv(kg, prim, &dPdu, &dPdv);
+ differential3 dP;
+ dP.dx = dPdu * dudx + dPdv * dvdx;
+ dP.dy = dPdu * dudy + dPdv * dvdy;
+ ray.dP = differential_make_compact(dP);
+ ray.dD = differential_zero_compact();
+
+ /* Write ray. */
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Setup and write intersection. */
+ Intersection isect ccl_optional_struct_init;
+ isect.object = kernel_data.bake.object_index;
+ isect.prim = prim;
+ isect.u = u;
+ isect.v = v;
+ isect.t = 1.0f;
+ isect.type = PRIMITIVE_TRIANGLE;
+#ifdef __EMBREE__
+ isect.Ng = Ng;
+#endif
+ integrator_state_write_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ /* Setup next kernel to execute. */
+ const int shader_index = shader & SHADER_MASK;
+ const int shader_flags = kernel_tex_fetch(__shaders, shader_index).flags;
+ if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index);
+ }
+ else {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader_index);
+ }
+ }
+
+ return true;
+}
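+
+/* Illustrative note, not part of the original patch: the bake passes above are
+ * read as primitive[0] = seed, primitive[1] = triangle index, primitive[2..3] =
+ * barycentric (u, v), and differential[0..3] = (dudx, dudy, dvdx, dvdy), which
+ * drive the sub-pixel filter jitter of the barycentric coordinate. */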
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_init_from_camera.h b/intern/cycles/kernel/integrator/integrator_init_from_camera.h
new file mode 100644
index 00000000000..58e7bde4c94
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_init_from_camera.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_adaptive_sampling.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_random.h"
+#include "kernel/kernel_shadow_catcher.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void integrate_camera_sample(const KernelGlobals *ccl_restrict kg,
+ const int sample,
+ const int x,
+ const int y,
+ const uint rng_hash,
+ Ray *ray)
+{
+ /* Filter sampling. */
+ float filter_u, filter_v;
+
+ if (sample == 0) {
+ filter_u = 0.5f;
+ filter_v = 0.5f;
+ }
+ else {
+ path_rng_2D(kg, rng_hash, sample, PRNG_FILTER_U, &filter_u, &filter_v);
+ }
+
+ /* Depth of field sampling. */
+ float lens_u = 0.0f, lens_v = 0.0f;
+ if (kernel_data.cam.aperturesize > 0.0f) {
+ path_rng_2D(kg, rng_hash, sample, PRNG_LENS_U, &lens_u, &lens_v);
+ }
+
+ /* Motion blur time sampling. */
+ float time = 0.0f;
+#ifdef __CAMERA_MOTION__
+ if (kernel_data.cam.shuttertime != -1.0f)
+ time = path_rng_1D(kg, rng_hash, sample, PRNG_TIME);
+#endif
+
+ /* Generate camera ray. */
+ camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
+}
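+
+/* Illustrative note, not part of the original patch: sample 0 always shoots
+ * through the pixel center (0.5, 0.5), giving a stable, noise-free first
+ * sample; later samples jitter the filter, lens and time dimensions through
+ * the path_rng functions above. */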
+
+/* Return false to indicate that this pixel is finished.
+ * Used by the CPU implementation to avoid sampling a pixel for multiple
+ * samples once it is known that the pixel has converged. */
+ccl_device bool integrator_init_from_camera(INTEGRATOR_STATE_ARGS,
+ const ccl_global KernelWorkTile *ccl_restrict tile,
+ ccl_global float *render_buffer,
+ const int x,
+ const int y,
+ const int scheduled_sample)
+{
+ PROFILING_INIT(kg, PROFILING_RAY_SETUP);
+
+ /* Initialize path state to give basic buffer access and allow early outputs. */
+ path_state_init(INTEGRATOR_STATE_PASS, tile, x, y);
+
+ /* Check whether the pixel has converged and should not be sampled anymore. */
+ if (!kernel_need_sample_pixel(INTEGRATOR_STATE_PASS, render_buffer)) {
+ return false;
+ }
+
+ /* Count the sample and get an effective sample for this pixel.
+ *
+ * This logic allows us both to count the actual number of samples per pixel, and to add
+ * samples to this pixel after it has converged while samples were being added elsewhere
+ * (in which case `scheduled_sample` will differ from the actual number of samples in
+ * this pixel). */
+ const int sample = kernel_accum_sample(INTEGRATOR_STATE_PASS, render_buffer, scheduled_sample);
+
+ /* Initialize random number seed for path. */
+ const uint rng_hash = path_rng_hash_init(kg, sample, x, y);
+
+ {
+ /* Generate camera ray. */
+ Ray ray;
+ integrate_camera_sample(kg, sample, x, y, rng_hash, &ray);
+ if (ray.t == 0.0f) {
+ return true;
+ }
+
+ /* Write camera ray to state. */
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+ }
+
+ /* Initialize path state for path integration. */
+ path_state_init_integrator(INTEGRATOR_STATE_PASS, sample, rng_hash);
+
+ /* Continue with intersect_closest kernel, optionally initializing volume
+ * stack before that if the camera may be inside a volume. */
+ if (kernel_data.cam.is_inside_volume) {
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+ }
+ else {
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ }
+
+ return true;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
new file mode 100644
index 00000000000..34ca6814534
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_shadow_catcher.h"
+
+#include "kernel/geom/geom.h"
+
+#include "kernel/bvh/bvh.h"
+
+CCL_NAMESPACE_BEGIN
+
+template<uint32_t current_kernel>
+ccl_device_forceinline bool integrator_intersect_terminate(INTEGRATOR_STATE_ARGS,
+ const int shader_flags)
+{
+ /* Optional AO bounce termination.
+ * We continue evaluating emissive/transparent surfaces and volumes, similar
+ * to direct lighting. Only if we know there are none can we terminate the
+ * path immediately. */
+ if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
+ if (shader_flags & (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ }
+ else if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_VOLUME;
+ }
+ else {
+ return true;
+ }
+ }
+
+ /* Load random number state. */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ /* We perform path termination in this kernel to avoid launching shade_surface
+ * and evaluating the shader when not needed. Only for emission and transparent
+ * surfaces in front of emission do we need to evaluate the shader, since we
+ * perform MIS as part of indirect rays. */
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const float probability = path_state_continuation_probability(INTEGRATOR_STATE_PASS, path_flag);
+
+ if (probability != 1.0f) {
+ const float terminate = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE);
+
+ if (probability == 0.0f || terminate >= probability) {
+ if (shader_flags & SD_HAS_EMISSION) {
+ /* Mark path to be terminated right after shader evaluation on the surface. */
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE;
+ }
+ else if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ /* TODO: only do this for emissive volumes. */
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_IN_NEXT_VOLUME;
+ }
+ else {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
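+
+/* Illustrative note, not part of the original patch: the code above implements
+ * Russian roulette termination. For example, with a continuation probability
+ * of 0.25 and a random draw of 0.6 >= 0.25 the path is terminated, or marked
+ * to terminate right after the next emissive surface or volume; surviving
+ * paths are assumed to be compensated elsewhere by dividing their throughput
+ * by the continuation probability. */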
+
+/* Note that current_kernel is a template value since making this a variable
+ * leads to poor performance with CUDA atomics. */
+template<uint32_t current_kernel>
+ccl_device_forceinline void integrator_intersect_shader_next_kernel(
+ INTEGRATOR_STATE_ARGS,
+ const Intersection *ccl_restrict isect,
+ const int shader,
+ const int shader_flags)
+{
+ /* Note on scheduling.
+ *
+ * When there is no shadow catcher split the scheduling is simple: schedule surface shading with
+ * or without raytrace support, depending on the shader used.
+ *
+ * When there is a shadow catcher split the general idea is to have the following configuration:
+ *
+ * - Schedule surface shading kernel (with corresponding raytrace support) for the ray which
+ * will trace shadow catcher object.
+ *
+ * - When no alpha-over of the approximate shadow catcher is needed, schedule surface shading for
+ * the matte ray.
+ *
+ * - Otherwise schedule background shading kernel, so that we have a background to alpha-over
+ * on. The background kernel will then schedule surface shading for the matte ray.
+ *
+ * Note that the splitting leaves kernel and sorting counters as-is, so use INIT semantic for
+ * the matte path. */
+
+ const bool use_raytrace_kernel = ((shader_flags & SD_HAS_RAYTRACE) ||
+ (kernel_data.film.pass_ao != PASS_UNUSED));
+
+ if (use_raytrace_kernel) {
+ INTEGRATOR_PATH_NEXT_SORTED(
+ current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+ }
+ else {
+ INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+ }
+
+#ifdef __SHADOW_CATCHER__
+ const int object_flags = intersection_get_object_flags(kg, isect);
+ if (kernel_shadow_catcher_split(INTEGRATOR_STATE_PASS, object_flags)) {
+ if (kernel_data.film.use_approximate_shadow_catcher && !kernel_data.background.transparent) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
+
+ /* Raytrace support does not matter here: the background kernel is
+ * scheduled either way. */
+ INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ }
+ else if (use_raytrace_kernel) {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+ }
+ else {
+ INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+ }
+ }
+#endif
+}
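+
+/* Illustrative summary, not part of the original patch: without a shadow
+ * catcher split exactly one surface kernel is scheduled; with a split, the
+ * catcher ray keeps the surface kernel scheduled above, while the matte ray
+ * gets either a background kernel (for approximate alpha-over) or its own
+ * surface kernel, both with INIT semantics. */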
+
+ccl_device void integrator_intersect_closest(INTEGRATOR_STATE_ARGS)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_CLOSEST);
+
+ /* Read ray from integrator state into local memory. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray);
+ kernel_assert(ray.t != 0.0f);
+
+ const uint visibility = path_state_ray_visibility(INTEGRATOR_STATE_PASS);
+ const int last_isect_prim = INTEGRATOR_STATE(isect, prim);
+ const int last_isect_object = INTEGRATOR_STATE(isect, object);
+
+ /* Trick to use short AO rays to approximate indirect light at the end of the path. */
+ if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
+ ray.t = kernel_data.integrator.ao_bounces_distance;
+
+ const int last_object = last_isect_object != OBJECT_NONE ?
+ last_isect_object :
+ kernel_tex_fetch(__prim_object, last_isect_prim);
+ const float object_ao_distance = kernel_tex_fetch(__objects, last_object).ao_distance;
+ if (object_ao_distance != 0.0f) {
+ ray.t = object_ao_distance;
+ }
+ }
+
+ /* Scene Intersection. */
+ Intersection isect ccl_optional_struct_init;
+ bool hit = scene_intersect(kg, &ray, visibility, &isect);
+
+ /* TODO: remove this and do it in the various intersection functions instead. */
+ if (!hit) {
+ isect.prim = PRIM_NONE;
+ }
+
+ /* Light intersection for MIS. */
+ if (kernel_data.integrator.use_lamp_mis) {
+ /* NOTE: if we make lights visible to camera rays, we'll need to initialize
+ * these in the path_state_init. */
+ const int last_type = INTEGRATOR_STATE(isect, type);
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+
+ hit = lights_intersect(
+ kg, &ray, &isect, last_isect_prim, last_isect_object, last_type, path_flag) ||
+ hit;
+ }
+
+ /* Write intersection result into global integrator state memory. */
+ integrator_state_write_isect(INTEGRATOR_STATE_PASS, &isect);
+
+#ifdef __VOLUME__
+ if (!integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ const bool hit_surface = hit && !(isect.type & PRIMITIVE_LAMP);
+ const int shader = (hit_surface) ? intersection_get_shader(kg, &isect) : SHADER_NONE;
+ const int flags = (hit_surface) ? kernel_tex_fetch(__shaders, shader).flags : 0;
+
+ if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+ INTEGRATOR_STATE_PASS, flags)) {
+ /* Continue with volume kernel if we are inside a volume, regardless
+ * if we hit anything. */
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+ }
+ else {
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ }
+ return;
+ }
+#endif
+
+ if (hit) {
+ /* Hit a surface, continue with light or surface kernel. */
+ if (isect.type & PRIMITIVE_LAMP) {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+ return;
+ }
+ else {
+ /* Hit a surface, continue with surface kernel unless terminated. */
+ const int shader = intersection_get_shader(kg, &isect);
+ const int flags = kernel_tex_fetch(__shaders, shader).flags;
+
+ if (!integrator_intersect_terminate<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+ INTEGRATOR_STATE_PASS, flags)) {
+ integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST>(
+ INTEGRATOR_STATE_PASS, &isect, shader, flags);
+ return;
+ }
+ else {
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ return;
+ }
+ }
+ }
+ else {
+ /* Nothing hit, continue with background kernel. */
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ return;
+ }
+}
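+
+/* Illustrative summary, not part of the original patch, of the scheduling
+ * above: inside a volume -> SHADE_VOLUME; hit a lamp -> SHADE_LIGHT; hit a
+ * surface -> SHADE_SURFACE or SHADE_SURFACE_RAYTRACE unless terminated;
+ * no hit -> SHADE_BACKGROUND. */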
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_shadow.h b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
new file mode 100644
index 00000000000..5bd9cfda4a4
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_intersect_shadow.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Visibility for the shadow ray. */
+ccl_device_forceinline uint integrate_intersect_shadow_visibility(INTEGRATOR_STATE_CONST_ARGS)
+{
+ uint visibility = PATH_RAY_SHADOW;
+
+#ifdef __SHADOW_CATCHER__
+ const uint32_t path_flag = INTEGRATOR_STATE(shadow_path, flag);
+ visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility);
+#endif
+
+ return visibility;
+}
+
+ccl_device bool integrate_intersect_shadow_opaque(INTEGRATOR_STATE_ARGS,
+ const Ray *ray,
+ const uint visibility)
+{
+ /* Mask which will pick only the opaque visibility bits from `visibility`.
+ * Calculate the mask at compile time: the visibility will either be the high bits for
+ * shadow catcher objects, or the low bits for regular objects (there is no need to
+ * check the path state here again). */
+ constexpr const uint opaque_mask = SHADOW_CATCHER_VISIBILITY_SHIFT(PATH_RAY_SHADOW_OPAQUE) |
+ PATH_RAY_SHADOW_OPAQUE;
+
+ Intersection isect;
+ const bool opaque_hit = scene_intersect(kg, ray, visibility & opaque_mask, &isect);
+
+ if (!opaque_hit) {
+ INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = 0;
+ }
+
+ return opaque_hit;
+}
+
+ccl_device_forceinline int integrate_shadow_max_transparent_hits(INTEGRATOR_STATE_CONST_ARGS)
+{
+ const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
+ const int transparent_bounce = INTEGRATOR_STATE(shadow_path, transparent_bounce);
+
+ return max(transparent_max_bounce - transparent_bounce - 1, 0);
+}
+
+#ifdef __TRANSPARENT_SHADOWS__
+ccl_device bool integrate_intersect_shadow_transparent(INTEGRATOR_STATE_ARGS,
+ const Ray *ray,
+ const uint visibility)
+{
+ Intersection isect[INTEGRATOR_SHADOW_ISECT_SIZE];
+
+ /* Limit the number of hits to the maximum transparent bounces allowed and the
+ * space that we have available in the integrator state. */
+ const uint max_transparent_hits = integrate_shadow_max_transparent_hits(INTEGRATOR_STATE_PASS);
+ const uint max_hits = min(max_transparent_hits, (uint)INTEGRATOR_SHADOW_ISECT_SIZE);
+ uint num_hits = 0;
+ bool opaque_hit = scene_intersect_shadow_all(kg, ray, isect, visibility, max_hits, &num_hits);
+
+ /* If number of hits exceed the transparent bounces limit, make opaque. */
+ if (num_hits > max_transparent_hits) {
+ opaque_hit = true;
+ }
+
+ if (!opaque_hit) {
+ uint num_recorded_hits = min(num_hits, max_hits);
+
+ if (num_recorded_hits > 0) {
+ sort_intersections(isect, num_recorded_hits);
+
+ /* Write intersection result into global integrator state memory. */
+ for (int hit = 0; hit < num_recorded_hits; hit++) {
+ integrator_state_write_shadow_isect(INTEGRATOR_STATE_PASS, &isect[hit], hit);
+ }
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = num_hits;
+ }
+ else {
+ INTEGRATOR_STATE_WRITE(shadow_path, num_hits) = 0;
+ }
+
+ return opaque_hit;
+}
+#endif
+
+ccl_device void integrator_intersect_shadow(INTEGRATOR_STATE_ARGS)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW);
+
+ /* Read ray from integrator state into local memory. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Compute visibility. */
+ const uint visibility = integrate_intersect_shadow_visibility(INTEGRATOR_STATE_PASS);
+
+#ifdef __TRANSPARENT_SHADOWS__
+ /* TODO: compile different kernels depending on this? Especially for OptiX,
+ * conditional trace calls are bad. */
+ const bool opaque_hit =
+ (kernel_data.integrator.transparent_shadows) ?
+ integrate_intersect_shadow_transparent(INTEGRATOR_STATE_PASS, &ray, visibility) :
+ integrate_intersect_shadow_opaque(INTEGRATOR_STATE_PASS, &ray, visibility);
+#else
+ const bool opaque_hit = integrate_intersect_shadow_opaque(
+ INTEGRATOR_STATE_PASS, &ray, visibility);
+#endif
+
+ if (opaque_hit) {
+ /* Hit an opaque surface, shadow path ends here. */
+ INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ return;
+ }
+ else {
+ /* Hit nothing or transparent surfaces, continue to shadow kernel
+ * for shading and render buffer output.
+ *
+ * TODO: could also write to render buffer directly if no transparent shadows?
+ * Could save a kernel execution for the common case. */
+ INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
index c10ecc426c6..7c090952dc7 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_state_buffer_size.cl
+++ b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
@@ -1,5 +1,5 @@
/*
- * Copyright 2011-2017 Blender Foundation
+ * Copyright 2011-2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,16 +14,23 @@
* limitations under the License.
*/
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
+#pragma once
-__kernel void kernel_ocl_path_trace_state_buffer_size(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- uint num_threads,
- ccl_global uint64_t *size)
+#include "kernel/integrator/integrator_subsurface.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void integrator_intersect_subsurface(INTEGRATOR_STATE_ARGS)
{
- ((KernelGlobals*)kg)->data = data;
- *size = split_data_buffer_size((KernelGlobals*)kg, num_threads);
+ PROFILING_INIT(kg, PROFILING_INTERSECT_SUBSURFACE);
+
+#ifdef __SUBSURFACE__
+ if (subsurface_scatter(INTEGRATOR_STATE_PASS)) {
+ return;
+ }
+#endif
+
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
}
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
new file mode 100644
index 00000000000..60d8a8e3e54
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/bvh/bvh.h"
+#include "kernel/geom/geom.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_ARGS,
+ const float3 from_P,
+ const float3 to_P)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK);
+
+ ShaderDataTinyStorage stack_sd_storage;
+ ShaderData *stack_sd = AS_SHADER_DATA(&stack_sd_storage);
+
+ kernel_assert(kernel_data.integrator.use_volumes);
+
+ Ray volume_ray ccl_optional_struct_init;
+ volume_ray.P = from_P;
+ volume_ray.D = normalize_len(to_P - from_P, &volume_ray.t);
+
+#ifdef __VOLUME_RECORD_ALL__
+ Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+ uint num_hits = scene_intersect_volume_all(
+ kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY);
+ if (num_hits > 0) {
+ Intersection *isect = hits;
+
+ qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+ for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, isect);
+ volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd);
+ }
+ }
+#else
+ Intersection isect;
+ int step = 0;
+ while (step < 2 * VOLUME_STACK_SIZE &&
+ scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
+ volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd);
+
+ /* Move ray forward. */
+ volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
+ if (volume_ray.t != FLT_MAX) {
+ volume_ray.D = normalize_len(to_P - volume_ray.P, &volume_ray.t);
+ }
+ ++step;
+ }
+#endif
+}
+
+ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS)
+{
+ PROFILING_INIT(kg, PROFILING_INTERSECT_VOLUME_STACK);
+
+ ShaderDataTinyStorage stack_sd_storage;
+ ShaderData *stack_sd = AS_SHADER_DATA(&stack_sd_storage);
+
+ Ray volume_ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &volume_ray);
+ volume_ray.t = FLT_MAX;
+
+ const uint visibility = (INTEGRATOR_STATE(path, flag) & PATH_RAY_ALL_VISIBILITY);
+ int stack_index = 0, enclosed_index = 0;
+
+ /* Write background shader. */
+ if (kernel_data.background.volume_shader != SHADER_NONE) {
+ const VolumeStack new_entry = {OBJECT_NONE, kernel_data.background.volume_shader};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+ stack_index++;
+ }
+
+#ifdef __VOLUME_RECORD_ALL__
+ Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+ uint num_hits = scene_intersect_volume_all(
+ kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility);
+ if (num_hits > 0) {
+ int enclosed_volumes[VOLUME_STACK_SIZE];
+ Intersection *isect = hits;
+
+ qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+
+ for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, isect);
+ if (stack_sd->flag & SD_BACKFACING) {
+ bool need_add = true;
+ for (int i = 0; i < enclosed_index && need_add; ++i) {
+ /* If the ray exited a volume that it never entered, it means the
+ * camera is inside that volume. */
+ if (enclosed_volumes[i] == stack_sd->object) {
+ need_add = false;
+ }
+ }
+ for (int i = 0; i < stack_index && need_add; ++i) {
+ /* Don't add intersections twice. */
+ VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ if (entry.object == stack_sd->object) {
+ need_add = false;
+ break;
+ }
+ }
+ if (need_add && stack_index < VOLUME_STACK_SIZE - 1) {
+ const VolumeStack new_entry = {stack_sd->object, stack_sd->shader};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+ ++stack_index;
+ }
+ }
+ else {
+ /* If the ray from the camera enters a volume, that volume shouldn't
+ * be added to the stack on exit. */
+ enclosed_volumes[enclosed_index++] = stack_sd->object;
+ }
+ }
+ }
+#else
+ int enclosed_volumes[VOLUME_STACK_SIZE];
+ int step = 0;
+
+ while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 &&
+ step < 2 * VOLUME_STACK_SIZE) {
+ Intersection isect;
+ if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
+ break;
+ }
+
+ shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
+ if (stack_sd->flag & SD_BACKFACING) {
+ /* If the ray exited a volume that it never entered, it means the
+ * camera is inside that volume. */
+ bool need_add = true;
+ for (int i = 0; i < enclosed_index && need_add; ++i) {
+ if (enclosed_volumes[i] == stack_sd->object) {
+ need_add = false;
+ }
+ }
+ for (int i = 0; i < stack_index && need_add; ++i) {
+ /* Don't add intersections twice. */
+ VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ if (entry.object == stack_sd->object) {
+ need_add = false;
+ break;
+ }
+ }
+ if (need_add) {
+ const VolumeStack new_entry = {stack_sd->object, stack_sd->shader};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+ ++stack_index;
+ }
+ }
+ else {
+ /* If the ray from the camera enters a volume, that volume shouldn't
+ * be added to the stack on exit. */
+ enclosed_volumes[enclosed_index++] = stack_sd->object;
+ }
+
+ /* Move ray forward. */
+ volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
+ ++step;
+ }
+#endif
+
+ /* Write terminator. */
+ const VolumeStack new_entry = {OBJECT_NONE, SHADER_NONE};
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
+
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+}
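+
+/* Illustrative note, not part of the original patch: the function above walks
+ * the camera ray across up to 2 * VOLUME_STACK_SIZE boundaries, pushing
+ * volumes whose backface is seen first (the camera starts inside them) and
+ * skipping volumes that are both entered and exited along the way; the
+ * {OBJECT_NONE, SHADER_NONE} entry terminates the stack. */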
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_megakernel.h b/intern/cycles/kernel/integrator/integrator_megakernel.h
new file mode 100644
index 00000000000..91363ea1c7f
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_megakernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_init_from_camera.h"
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_intersect_shadow.h"
+#include "kernel/integrator/integrator_intersect_subsurface.h"
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+#include "kernel/integrator/integrator_shade_background.h"
+#include "kernel/integrator/integrator_shade_light.h"
+#include "kernel/integrator/integrator_shade_shadow.h"
+#include "kernel/integrator/integrator_shade_surface.h"
+#include "kernel/integrator/integrator_shade_volume.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device void integrator_megakernel(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Each kernel indicates the next kernel to execute, so here we simply
+ * have to check what that kernel is and execute it.
+ *
+ * TODO: investigate if we can use device side enqueue for GPUs to avoid
+ * having to compile this big kernel. */
+ while (true) {
+ if (INTEGRATOR_STATE(shadow_path, queued_kernel)) {
+ /* First handle any shadow paths before we potentially create more shadow paths. */
+ switch (INTEGRATOR_STATE(shadow_path, queued_kernel)) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ integrator_intersect_shadow(INTEGRATOR_STATE_PASS);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ integrator_shade_shadow(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ default:
+ kernel_assert(0);
+ break;
+ }
+ }
+ else if (INTEGRATOR_STATE(path, queued_kernel)) {
+ /* Then handle regular path kernels. */
+ switch (INTEGRATOR_STATE(path, queued_kernel)) {
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ integrator_intersect_closest(INTEGRATOR_STATE_PASS);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ integrator_shade_background(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ integrator_shade_surface(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+ integrator_shade_volume(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ integrator_shade_surface_raytrace(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ integrator_shade_light(INTEGRATOR_STATE_PASS, render_buffer);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ integrator_intersect_subsurface(INTEGRATOR_STATE_PASS);
+ break;
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ integrator_intersect_volume_stack(INTEGRATOR_STATE_PASS);
+ break;
+ default:
+ kernel_assert(0);
+ break;
+ }
+ }
+ else {
+ break;
+ }
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_background.h b/intern/cycles/kernel/integrator/integrator_shade_background.h
new file mode 100644
index 00000000000..3e4cc837e9b
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_background.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device float3 integrator_eval_background_shader(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+#ifdef __BACKGROUND__
+ const int shader = kernel_data.background.surface_shader;
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ /* Use visibility flag to skip lights. */
+ if (shader & SHADER_EXCLUDE_ANY) {
+ if (((shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
+ ((shader & SHADER_EXCLUDE_GLOSSY) && ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
+ (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
+ ((shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
+ ((shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) ||
+ ((shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
+ return zero_float3();
+ }
+
+ /* Use fast constant background color if available. */
+ float3 L = zero_float3();
+ if (!shader_constant_emission_eval(kg, shader, &L)) {
+ /* Evaluate background shader. */
+
+ /* TODO: does aliasing like this break automatic SoA in CUDA?
+ * Should we instead store closures separate from ShaderData? */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+
+ PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP);
+ shader_setup_from_background(kg,
+ emission_sd,
+ INTEGRATOR_STATE(ray, P),
+ INTEGRATOR_STATE(ray, D),
+ INTEGRATOR_STATE(ray, time));
+
+ PROFILING_SHADER(emission_sd->object, emission_sd->shader);
+ PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL);
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
+ INTEGRATOR_STATE_PASS, emission_sd, render_buffer, path_flag | PATH_RAY_EMISSION);
+
+ L = shader_background_eval(emission_sd);
+ }
+
+ /* Background MIS weights. */
+# ifdef __BACKGROUND_MIS__
+ /* Check if background light exists or if we should skip pdf. */
+ if (!(INTEGRATOR_STATE(path, flag) & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) {
+ const float3 ray_P = INTEGRATOR_STATE(ray, P);
+ const float3 ray_D = INTEGRATOR_STATE(ray, D);
+ const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float mis_ray_t = INTEGRATOR_STATE(path, mis_ray_t);
+
+ /* multiple importance sampling, get background light pdf for ray
+ * direction, and compute weight with respect to BSDF pdf */
+ const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D);
+ const float mis_weight = power_heuristic(mis_ray_pdf, pdf);
+
+ L *= mis_weight;
+ }
+# endif
+
+ return L;
+#else
+ return make_float3(0.8f, 0.8f, 0.8f);
+#endif
+}
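+
+/* Illustrative note, not part of the original patch: power_heuristic() is
+ * assumed to compute the power heuristic a^2 / (a^2 + b^2), so a BSDF pdf
+ * much larger than the background light pdf drives the MIS weight towards 1,
+ * avoiding double counting between the two sampling strategies. */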
+
+ccl_device_inline void integrate_background(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Accumulate transparency for transparent background. We can skip background
+ * shader evaluation unless a background pass is used. */
+ bool eval_background = true;
+ float transparent = 0.0f;
+
+ const bool is_transparent_background_ray = kernel_data.background.transparent &&
+ (INTEGRATOR_STATE(path, flag) &
+ PATH_RAY_TRANSPARENT_BACKGROUND);
+
+ if (is_transparent_background_ray) {
+ transparent = average(INTEGRATOR_STATE(path, throughput));
+
+#ifdef __PASSES__
+ eval_background = (kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND));
+#else
+ eval_background = false;
+#endif
+ }
+
+ /* Evaluate background shader. */
+ float3 L = (eval_background) ?
+ integrator_eval_background_shader(INTEGRATOR_STATE_PASS, render_buffer) :
+ zero_float3();
+
+ /* When using the ao bounces approximation, adjust background
+ * shader intensity with ao factor. */
+ if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
+ L *= kernel_data.integrator.ao_bounces_factor;
+ }
+
+ /* Write to render buffer. */
+ kernel_accum_background(
+ INTEGRATOR_STATE_PASS, L, transparent, is_transparent_background_ray, render_buffer);
+}
+
+ccl_device_inline void integrate_distant_lights(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ const float3 ray_D = INTEGRATOR_STATE(ray, D);
+ const float ray_time = INTEGRATOR_STATE(ray, time);
+ LightSample ls ccl_optional_struct_init;
+ for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
+ if (light_sample_from_distant_ray(kg, ray_D, lamp, &ls)) {
+ /* Use visibility flag to skip lights. */
+#ifdef __PASSES__
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (ls.shader & SHADER_EXCLUDE_ANY) {
+ if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
+ ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
+ ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
+ (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
+ ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
+ ((ls.shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) ||
+ ((ls.shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
+ return;
+ }
+#endif
+
+ /* Evaluate light shader. */
+ /* TODO: does aliasing like this break automatic SoA in CUDA? */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ float3 light_eval = light_sample_shader_eval(
+ INTEGRATOR_STATE_PASS, emission_sd, &ls, ray_time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* MIS weighting. */
+ if (!(path_flag & PATH_RAY_MIS_SKIP)) {
+ /* multiple importance sampling, get regular light pdf,
+ * and compute weight with respect to BSDF pdf */
+ const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+ light_eval *= mis_weight;
+ }
+
+ /* Write to render buffer. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, light_eval, render_buffer);
+ }
+ }
+}
+
+ccl_device void integrator_shade_background(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_LIGHT_SETUP);
+
+ /* TODO: unify these in a single loop to only have a single shader evaluation call. */
+ integrate_distant_lights(INTEGRATOR_STATE_PASS, render_buffer);
+ integrate_background(INTEGRATOR_STATE_PASS, render_buffer);
+
+#ifdef __SHADOW_CATCHER__
+ if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_SHADOW_CATCHER_BACKGROUND;
+
+ const int isect_prim = INTEGRATOR_STATE(isect, prim);
+ const int shader = intersection_get_shader_from_isect_prim(kg, isect_prim);
+ const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
+
+ if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
+ shader);
+ }
+ else {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
+ shader);
+ }
+ return;
+ }
+#endif
+
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_light.h b/intern/cycles/kernel/integrator/integrator_shade_light.h
new file mode 100644
index 00000000000..05b530f9665
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_light.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void integrate_light(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Setup light sample. */
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ float3 ray_P = INTEGRATOR_STATE(ray, P);
+ const float3 ray_D = INTEGRATOR_STATE(ray, D);
+ const float ray_time = INTEGRATOR_STATE(ray, time);
+
+ /* Advance ray beyond light. */
+ /* TODO: can we make this more numerically robust to avoid reintersecting the
+ * same light in some cases? */
+ const float3 new_ray_P = ray_offset(ray_P + ray_D * isect.t, ray_D);
+ INTEGRATOR_STATE_WRITE(ray, P) = new_ray_P;
+ INTEGRATOR_STATE_WRITE(ray, t) -= isect.t;
+
+ /* Set position to where the BSDF was sampled, for correct MIS PDF. */
+ const float mis_ray_t = INTEGRATOR_STATE(path, mis_ray_t);
+ ray_P -= ray_D * mis_ray_t;
+ isect.t += mis_ray_t;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = mis_ray_t + isect.t;
+
+ LightSample ls ccl_optional_struct_init;
+ const bool use_light_sample = light_sample_from_intersection(kg, &isect, ray_P, ray_D, &ls);
+
+ if (!use_light_sample) {
+ return;
+ }
+
+ /* Use visibility flag to skip lights. */
+#ifdef __PASSES__
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (ls.shader & SHADER_EXCLUDE_ANY) {
+ if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (path_flag & PATH_RAY_DIFFUSE)) ||
+ ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
+ ((path_flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
+ (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
+ ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
+ ((ls.shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
+ return;
+ }
+#endif
+
+ /* Evaluate light shader. */
+ /* TODO: does aliasing like this break automatic SoA in CUDA? */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ float3 light_eval = light_sample_shader_eval(INTEGRATOR_STATE_PASS, emission_sd, &ls, ray_time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* MIS weighting. */
+ if (!(path_flag & PATH_RAY_MIS_SKIP)) {
+ /* multiple importance sampling, get regular light pdf,
+ * and compute weight with respect to BSDF pdf */
+ const float mis_ray_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+ light_eval *= mis_weight;
+ }
+
+ /* Write to render buffer. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, light_eval, render_buffer);
+}
+
+ccl_device void integrator_shade_light(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_LIGHT_SETUP);
+
+ integrate_light(INTEGRATOR_STATE_PASS, render_buffer);
+
+ /* TODO: we could get stuck in an infinite loop if there are precision issues
+ * and the same light is hit again.
+ *
+ * As a workaround count this as a transparent bounce. It makes some sense
+ * to interpret lights as transparent surfaces (and support making them opaque),
+ * but this needs to be revisited. */
+ uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, transparent_bounce) = transparent_bounce;
+
+ if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+ return;
+ }
+ else {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ return;
+ }
+
+ /* TODO: in some cases we could continue directly to SHADE_BACKGROUND, but
+ * that optimization is probably not practical if we add lights to
+ * scene geometry. */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_shadow.h b/intern/cycles/kernel/integrator/integrator_shade_shadow.h
new file mode 100644
index 00000000000..fd3c3ae1653
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_shadow.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_shade_volume.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+
+#include "kernel/kernel_shader.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline bool shadow_intersections_has_remaining(const int num_hits)
+{
+ return num_hits >= INTEGRATOR_SHADOW_ISECT_SIZE;
+}
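+
+/* Example, assuming INTEGRATOR_SHADOW_ISECT_SIZE = 4 as defined in
+ * integrator_state.h: up to 3 recorded hits can be fully shaded in one pass,
+ * while num_hits >= 4 means the intersection buffer filled up and further
+ * hits may exist, so integrate_transparent_shadow below re-traces the shadow
+ * ray from the last recorded hit. */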
+
+#ifdef __TRANSPARENT_SHADOWS__
+ccl_device_inline float3 integrate_transparent_surface_shadow(INTEGRATOR_STATE_ARGS, const int hit)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SURFACE);
+
+ /* TODO: does aliasing like this break automatic SoA in CUDA?
+ * Should we instead store closures separate from ShaderData?
+ *
+ * TODO: is it better to declare this outside the loop or keep it local
+ * so the compiler can see there is no dependency between iterations? */
+ ShaderDataTinyStorage shadow_sd_storage;
+ ShaderData *shadow_sd = AS_SHADER_DATA(&shadow_sd_storage);
+
+ /* Setup shader data at surface. */
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_shadow_isect(INTEGRATOR_STATE_PASS, &isect, hit);
+
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ shader_setup_from_ray(kg, shadow_sd, &ray, &isect);
+
+ /* Evaluate shader. */
+ if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW>(
+ INTEGRATOR_STATE_PASS, shadow_sd, NULL, PATH_RAY_SHADOW);
+ }
+
+# ifdef __VOLUME__
+ /* Exit/enter volume. */
+ shadow_volume_stack_enter_exit(INTEGRATOR_STATE_PASS, shadow_sd);
+# endif
+
+ /* Compute transparency from closures. */
+ return shader_bsdf_transparency(kg, shadow_sd);
+}
+
+# ifdef __VOLUME__
+ccl_device_inline void integrate_transparent_volume_shadow(INTEGRATOR_STATE_ARGS,
+ const int hit,
+ const int num_recorded_hits,
+ float3 *ccl_restrict throughput)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_VOLUME);
+
+ /* TODO: deduplicate with surface, or does it not matter for memory usage? */
+ ShaderDataTinyStorage shadow_sd_storage;
+ ShaderData *shadow_sd = AS_SHADER_DATA(&shadow_sd_storage);
+
+ /* Setup shader data. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Modify ray position and length to match current segment. */
+ const float start_t = (hit == 0) ? 0.0f : INTEGRATOR_STATE_ARRAY(shadow_isect, hit - 1, t);
+ const float end_t = (hit < num_recorded_hits) ? INTEGRATOR_STATE_ARRAY(shadow_isect, hit, t) :
+ ray.t;
+ ray.P += start_t * ray.D;
+ ray.t = end_t - start_t;
+
+ shader_setup_from_volume(kg, shadow_sd, &ray);
+
+ const float step_size = volume_stack_step_size(INTEGRATOR_STATE_PASS, [=](const int i) {
+ return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
+ });
+
+ volume_shadow_heterogeneous(INTEGRATOR_STATE_PASS, &ray, shadow_sd, throughput, step_size);
+}
+# endif
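+
+/* A worked example of the segment splitting above, assuming two recorded
+ * hits at t = 1.0 and t = 2.5 on a shadow ray with ray.t = 4.0: the segments
+ * shaded for hit = 0, 1, 2 are [0.0, 1.0], [1.0, 2.5] and [2.5, 4.0], so the
+ * volume attenuates the ray both between and beyond the recorded surface
+ * hits. */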
+
+ccl_device_inline bool integrate_transparent_shadow(INTEGRATOR_STATE_ARGS, const int num_hits)
+{
+ /* Accumulate shadow for transparent surfaces. */
+ const int num_recorded_hits = min(num_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+
+ for (int hit = 0; hit < num_recorded_hits + 1; hit++) {
+ /* Volume shaders. */
+ if (hit < num_recorded_hits || !shadow_intersections_has_remaining(num_hits)) {
+# ifdef __VOLUME__
+ if (!integrator_state_shadow_volume_stack_is_empty(INTEGRATOR_STATE_PASS)) {
+ float3 throughput = INTEGRATOR_STATE(shadow_path, throughput);
+ integrate_transparent_volume_shadow(
+ INTEGRATOR_STATE_PASS, hit, num_recorded_hits, &throughput);
+ if (is_zero(throughput)) {
+ return true;
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
+ }
+# endif
+ }
+
+ /* Surface shaders. */
+ if (hit < num_recorded_hits) {
+ const float3 shadow = integrate_transparent_surface_shadow(INTEGRATOR_STATE_PASS, hit);
+ const float3 throughput = INTEGRATOR_STATE(shadow_path, throughput) * shadow;
+ if (is_zero(throughput)) {
+ return true;
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
+ INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) += 1;
+ }
+
+    /* Note we do not need to check max_transparent_bounce here; the number
+     * of intersections is already limited, and the shadow ray made opaque,
+     * in the INTERSECT_SHADOW kernel. */
+ }
+
+ if (shadow_intersections_has_remaining(num_hits)) {
+    /* There are more hits than we could record due to limited storage,
+     * adjust the ray to intersect again from the last recorded hit. */
+ const float last_hit_t = INTEGRATOR_STATE_ARRAY(shadow_isect, num_recorded_hits - 1, t);
+ const float3 ray_P = INTEGRATOR_STATE(shadow_ray, P);
+ const float3 ray_D = INTEGRATOR_STATE(shadow_ray, D);
+ INTEGRATOR_STATE_WRITE(shadow_ray, P) = ray_offset(ray_P + last_hit_t * ray_D, ray_D);
+ INTEGRATOR_STATE_WRITE(shadow_ray, t) -= last_hit_t;
+ }
+
+ return false;
+}
+#endif /* __TRANSPARENT_SHADOWS__ */
+
+ccl_device void integrator_shade_shadow(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_SHADOW_SETUP);
+ const int num_hits = INTEGRATOR_STATE(shadow_path, num_hits);
+
+#ifdef __TRANSPARENT_SHADOWS__
+ /* Evaluate transparent shadows. */
+ const bool opaque = integrate_transparent_shadow(INTEGRATOR_STATE_PASS, num_hits);
+ if (opaque) {
+ INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return;
+ }
+#endif
+
+ if (shadow_intersections_has_remaining(num_hits)) {
+ /* More intersections to find, continue shadow ray. */
+ INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+ return;
+ }
+ else {
+ kernel_accum_light(INTEGRATOR_STATE_PASS, render_buffer);
+ INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+ return;
+ }
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h
new file mode 100644
index 00000000000..73b7cad32be
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h
@@ -0,0 +1,502 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shader.h"
+
+#include "kernel/integrator/integrator_subsurface.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_forceinline void integrate_surface_shader_setup(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd)
+{
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ shader_setup_from_ray(kg, sd, &ray, &isect);
+}
+
+#ifdef __HOLDOUT__
+ccl_device_forceinline bool integrate_surface_holdout(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ /* Write holdout transparency to render buffer and stop if fully holdout. */
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
+ (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
+ const float3 holdout_weight = shader_holdout_apply(kg, sd);
+ if (kernel_data.background.transparent) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ const float transparent = average(holdout_weight * throughput);
+ kernel_accum_transparent(INTEGRATOR_STATE_PASS, transparent, render_buffer);
+ }
+ if (isequal_float3(holdout_weight, one_float3())) {
+ return false;
+ }
+ }
+
+ return true;
+}
+#endif /* __HOLDOUT__ */
+
+#ifdef __EMISSION__
+ccl_device_forceinline void integrate_surface_emission(INTEGRATOR_STATE_CONST_ARGS,
+ const ShaderData *sd,
+ ccl_global float *ccl_restrict
+ render_buffer)
+{
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+
+ /* Evaluate emissive closure. */
+ float3 L = shader_emissive_eval(sd);
+
+# ifdef __HAIR__
+ if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) &&
+ (sd->type & PRIMITIVE_ALL_TRIANGLE))
+# else
+ if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
+# endif
+ {
+ const float bsdf_pdf = INTEGRATOR_STATE(path, mis_ray_pdf);
+ const float t = sd->ray_length + INTEGRATOR_STATE(path, mis_ray_t);
+
+ /* Multiple importance sampling, get triangle light pdf,
+ * and compute weight with respect to BSDF pdf. */
+ float pdf = triangle_light_pdf(kg, sd, t);
+ float mis_weight = power_heuristic(bsdf_pdf, pdf);
+
+ L *= mis_weight;
+ }
+
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_accum_emission(INTEGRATOR_STATE_PASS, throughput, L, render_buffer);
+}
+#endif /* __EMISSION__ */
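+
+/* Note on the distance t used above: mis_ray_t accumulates the distance
+ * travelled through transparent bounces (see
+ * integrate_surface_bsdf_bssrdf_bounce below), so t measures from the last
+ * real bounce to the emissive surface, which is the distance the light pdf
+ * must be evaluated against. */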
+
+#ifdef __EMISSION__
+/* Path tracing: sample point on light and evaluate light shader, then
+ * queue shadow ray to be traced. */
+ccl_device_forceinline void integrate_surface_direct_light(INTEGRATOR_STATE_ARGS,
+ ShaderData *sd,
+ const RNGState *rng_state)
+{
+ /* Test if there is a light or BSDF that needs direct light. */
+ if (!(kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL))) {
+ return;
+ }
+
+ /* Sample position on a light. */
+ LightSample ls ccl_optional_struct_init;
+ {
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const uint bounce = INTEGRATOR_STATE(path, bounce);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ if (!light_distribution_sample_from_position(
+ kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, &ls)) {
+ return;
+ }
+ }
+
+ kernel_assert(ls.pdf != 0.0f);
+
+ /* Evaluate light shader.
+ *
+ * TODO: can we reuse sd memory? In theory we can move this after
+ * integrate_surface_bounce, evaluate the BSDF, and only then evaluate
+ * the light shader. This could also move to its own kernel, for
+ * non-constant light sources. */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ const float3 light_eval = light_sample_shader_eval(
+ INTEGRATOR_STATE_PASS, emission_sd, &ls, sd->time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* Evaluate BSDF. */
+ const bool is_transmission = shader_bsdf_is_transmission(sd, ls.D);
+
+ BsdfEval bsdf_eval ccl_optional_struct_init;
+ const float bsdf_pdf = shader_bsdf_eval(kg, sd, ls.D, is_transmission, &bsdf_eval, ls.shader);
+ bsdf_eval_mul3(&bsdf_eval, light_eval / ls.pdf);
+
+ if (ls.shader & SHADER_USE_MIS) {
+ const float mis_weight = power_heuristic(ls.pdf, bsdf_pdf);
+ bsdf_eval_mul(&bsdf_eval, mis_weight);
+ }
+
+ /* Path termination. */
+ const float terminate = path_state_rng_light_termination(kg, rng_state);
+ if (light_sample_terminate(kg, &ls, &bsdf_eval, terminate)) {
+ return;
+ }
+
+ /* Create shadow ray. */
+ Ray ray ccl_optional_struct_init;
+ light_sample_to_surface_shadow_ray(kg, sd, &ls, &ray);
+ const bool is_light = light_sample_is_light(&ls);
+
+ /* Copy volume stack and enter/exit volume. */
+ integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS);
+
+ if (is_transmission) {
+# ifdef __VOLUME__
+ shadow_volume_stack_enter_exit(INTEGRATOR_STATE_PASS, sd);
+# endif
+ }
+
+ /* Write shadow ray and associated state to global memory. */
+ integrator_state_write_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Copy state from main path to shadow path. */
+ const uint16_t bounce = INTEGRATOR_STATE(path, bounce);
+ const uint16_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce);
+ uint32_t shadow_flag = INTEGRATOR_STATE(path, flag);
+ shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
+ shadow_flag |= (is_transmission) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
+ const float3 throughput = INTEGRATOR_STATE(path, throughput) * bsdf_eval_sum(&bsdf_eval);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ const float3 diffuse_glossy_ratio = (bounce == 0) ?
+ bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) :
+ INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag;
+ INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce;
+ INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce;
+ INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
+ INTEGRATOR_STATE_WRITE(shadow_path, unshadowed_throughput) = throughput;
+ }
+
+ /* Branch off shadow kernel. */
+ INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+}
+#endif
+
+/* Path tracing: bounce off or through surface with new direction. */
+ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(INTEGRATOR_STATE_ARGS,
+ ShaderData *sd,
+ const RNGState *rng_state)
+{
+ /* Sample BSDF or BSSRDF. */
+ if (!(sd->flag & (SD_BSDF | SD_BSSRDF))) {
+ return LABEL_NONE;
+ }
+
+ float bsdf_u, bsdf_v;
+ path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+ const ShaderClosure *sc = shader_bsdf_bssrdf_pick(sd, &bsdf_u);
+
+#ifdef __SUBSURFACE__
+ /* BSSRDF closure, we schedule subsurface intersection kernel. */
+ if (CLOSURE_IS_BSSRDF(sc->type)) {
+ return subsurface_bounce(INTEGRATOR_STATE_PASS, sd, sc);
+ }
+#endif
+
+ /* BSDF closure, sample direction. */
+ float bsdf_pdf;
+ BsdfEval bsdf_eval ccl_optional_struct_init;
+ float3 bsdf_omega_in ccl_optional_struct_init;
+ differential3 bsdf_domega_in ccl_optional_struct_init;
+ int label;
+
+ label = shader_bsdf_sample_closure(
+ kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
+
+ if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval)) {
+ return LABEL_NONE;
+ }
+
+ /* Setup ray. Note that clipping works through transparent bounces. */
+ INTEGRATOR_STATE_WRITE(ray, P) = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng);
+ INTEGRATOR_STATE_WRITE(ray, D) = normalize(bsdf_omega_in);
+ INTEGRATOR_STATE_WRITE(ray, t) = (label & LABEL_TRANSPARENT) ?
+ INTEGRATOR_STATE(ray, t) - sd->ray_length :
+ FLT_MAX;
+
+#ifdef __RAY_DIFFERENTIALS__
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+ INTEGRATOR_STATE_WRITE(ray, dD) = differential_make_compact(bsdf_domega_in);
+#endif
+
+ /* Update throughput. */
+ float3 throughput = INTEGRATOR_STATE(path, throughput);
+ throughput *= bsdf_eval_sum(&bsdf_eval) / bsdf_pdf;
+ INTEGRATOR_STATE_WRITE(path, throughput) = throughput;
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ if (INTEGRATOR_STATE(path, bounce) == 0) {
+ INTEGRATOR_STATE_WRITE(path,
+ diffuse_glossy_ratio) = bsdf_eval_diffuse_glossy_ratio(&bsdf_eval);
+ }
+ }
+
+ /* Update path state */
+ if (label & LABEL_TRANSPARENT) {
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) += sd->ray_length;
+ }
+ else {
+ INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = bsdf_pdf;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = fminf(bsdf_pdf,
+ INTEGRATOR_STATE(path, min_ray_pdf));
+ }
+
+ path_state_next(INTEGRATOR_STATE_PASS, label);
+ return label;
+}
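+
+/* The throughput update above is the standard path tracing estimator; as a
+ * sketch, with f the sampled closure evaluation and p its pdf:
+ *
+ *   throughput *= f(omega_in) / p(omega_in)
+ *
+ * where, as for e.g. the diffuse closure, the evaluated closure already
+ * folds in the cosine term. For pdfs proportional to the closure this ratio
+ * stays near the albedo, keeping the path throughput well behaved. */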
+
+#ifdef __VOLUME__
+ccl_device_forceinline int integrate_surface_volume_only_bounce(INTEGRATOR_STATE_ARGS,
+                                                                ShaderData *sd)
+{
+ if (!path_state_volume_next(INTEGRATOR_STATE_PASS)) {
+ return LABEL_NONE;
+ }
+
+ /* Setup ray position, direction stays unchanged. */
+ INTEGRATOR_STATE_WRITE(ray, P) = ray_offset(sd->P, -sd->Ng);
+
+ /* Clipping works through transparent. */
+ INTEGRATOR_STATE_WRITE(ray, t) -= sd->ray_length;
+
+# ifdef __RAY_DIFFERENTIALS__
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+# endif
+
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) += sd->ray_length;
+
+ return LABEL_TRANSMIT | LABEL_TRANSPARENT;
+}
+#endif
+
+#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+ccl_device_forceinline void integrate_surface_ao_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const ShaderData *ccl_restrict sd,
+ const RNGState *ccl_restrict rng_state,
+ ccl_global float *ccl_restrict render_buffer)
+{
+# ifdef __KERNEL_OPTIX__
+ optixDirectCall<void>(2, INTEGRATOR_STATE_PASS, sd, rng_state, render_buffer);
+}
+
+extern "C" __device__ void __direct_callable__ao_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const ShaderData *ccl_restrict sd,
+ const RNGState *ccl_restrict rng_state,
+ ccl_global float *ccl_restrict render_buffer)
+{
+# endif /* __KERNEL_OPTIX__ */
+ float bsdf_u, bsdf_v;
+ path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+ const float3 ao_N = shader_bsdf_ao_normal(kg, sd);
+ float3 ao_D;
+ float ao_pdf;
+ sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+ if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
+ Ray ray ccl_optional_struct_init;
+ ray.P = ray_offset(sd->P, sd->Ng);
+ ray.D = ao_D;
+ ray.t = kernel_data.integrator.ao_bounces_distance;
+ ray.time = sd->time;
+ ray.dP = differential_zero_compact();
+ ray.dD = differential_zero_compact();
+
+ Intersection isect ccl_optional_struct_init;
+ if (!scene_intersect(kg, &ray, PATH_RAY_SHADOW_OPAQUE, &isect)) {
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, throughput);
+ }
+ }
+}
+#endif /* defined(__AO__) && defined(__SHADER_RAYTRACE__) */
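+
+/* A sketch of the estimator above: directions are drawn with density
+ * cos(theta) / pi over the hemisphere around ao_N, so averaging the
+ * unoccluded samples converges to
+ *
+ *   AO(P) = (1/pi) * integral_hemisphere V(omega) * cos(theta) domega
+ *
+ * No explicit division by ao_pdf is needed when writing the pass, because
+ * the cosine weighting is supplied by the sampling distribution itself. */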
+
+template<uint node_feature_mask>
+ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_SURFACE_SETUP);
+
+ /* Setup shader data. */
+ ShaderData sd;
+ integrate_surface_shader_setup(INTEGRATOR_STATE_PASS, &sd);
+ PROFILING_SHADER(sd.object, sd.shader);
+
+ int continue_path_label = 0;
+
+ /* Skip most work for volume bounding surface. */
+#ifdef __VOLUME__
+ if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
+#endif
+
+ {
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+#ifdef __SUBSURFACE__
+ /* Can skip shader evaluation for BSSRDF exit point without bump mapping. */
+ if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP)))
+#endif
+ {
+ /* Evaluate shader. */
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL);
+ shader_eval_surface<node_feature_mask>(
+ INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag);
+ }
+ }
+
+#ifdef __SUBSURFACE__
+ if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SUBSURFACE) {
+ /* When coming from inside subsurface scattering, setup a diffuse
+ * closure to perform lighting at the exit point. */
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_SUBSURFACE;
+ subsurface_shader_data_setup(INTEGRATOR_STATE_PASS, &sd);
+ }
+#endif
+
+ shader_prepare_surface_closures(INTEGRATOR_STATE_PASS, &sd);
+
+#ifdef __HOLDOUT__
+ /* Evaluate holdout. */
+ if (!integrate_surface_holdout(INTEGRATOR_STATE_PASS, &sd, render_buffer)) {
+ return false;
+ }
+#endif
+
+#ifdef __EMISSION__
+ /* Write emission. */
+ if (sd.flag & SD_EMISSION) {
+ integrate_surface_emission(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+ }
+#endif
+
+#ifdef __PASSES__
+ /* Write render passes. */
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_PASSES);
+ kernel_write_data_passes(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+#endif
+
+ /* Load random number state. */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+  /* Perform path termination. Most paths have already been terminated in
+   * the intersect_closest kernel; this is just for emission and for dividing
+   * throughput by the probability at the right moment. */
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ?
+ 0.0f :
+ path_state_continuation_probability(INTEGRATOR_STATE_PASS,
+ path_flag);
+ if (probability == 0.0f) {
+ return false;
+ }
+ else if (probability != 1.0f) {
+ INTEGRATOR_STATE_WRITE(path, throughput) /= probability;
+ }
+
+#ifdef __DENOISING_FEATURES__
+ kernel_write_denoising_features_surface(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+#endif
+
+#ifdef __SHADOW_CATCHER__
+ kernel_write_shadow_catcher_bounce_data(INTEGRATOR_STATE_PASS, &sd, render_buffer);
+#endif
+
+ /* Direct light. */
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_DIRECT_LIGHT);
+ integrate_surface_direct_light(INTEGRATOR_STATE_PASS, &sd, &rng_state);
+
+#if defined(__AO__) && defined(__SHADER_RAYTRACE__)
+ /* Ambient occlusion pass. */
+ if (node_feature_mask & KERNEL_FEATURE_NODE_RAYTRACE) {
+ if ((kernel_data.film.pass_ao != PASS_UNUSED) &&
+ (INTEGRATOR_STATE(path, flag) & PATH_RAY_CAMERA)) {
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_AO);
+ integrate_surface_ao_pass(INTEGRATOR_STATE_PASS, &sd, &rng_state, render_buffer);
+ }
+ }
+#endif
+
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT);
+ continue_path_label = integrate_surface_bsdf_bssrdf_bounce(
+ INTEGRATOR_STATE_PASS, &sd, &rng_state);
+#ifdef __VOLUME__
+ }
+ else {
+ PROFILING_EVENT(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT);
+ continue_path_label = integrate_surface_volume_only_bounce(INTEGRATOR_STATE_PASS, &sd);
+ }
+
+ if (continue_path_label & LABEL_TRANSMIT) {
+ /* Enter/Exit volume. */
+ volume_stack_enter_exit(INTEGRATOR_STATE_PASS, &sd);
+ }
+#endif
+
+ return continue_path_label != 0;
+}
+
+template<uint node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE & ~KERNEL_FEATURE_NODE_RAYTRACE,
+ int current_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE>
+ccl_device_forceinline void integrator_shade_surface(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ if (integrate_surface<node_feature_mask>(INTEGRATOR_STATE_PASS, render_buffer)) {
+ if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SUBSURFACE) {
+ INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
+ }
+ else {
+ kernel_assert(INTEGRATOR_STATE(ray, t) != 0.0f);
+ INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ }
+ }
+ else {
+ INTEGRATOR_PATH_TERMINATE(current_kernel);
+ }
+}
+
+ccl_device_forceinline void integrator_shade_surface_raytrace(
+ INTEGRATOR_STATE_ARGS, ccl_global float *ccl_restrict render_buffer)
+{
+ integrator_shade_surface<KERNEL_FEATURE_NODE_MASK_SURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE>(INTEGRATOR_STATE_PASS,
+ render_buffer);
+}
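+
+/* Sketch of the dispatch above: the default integrator_shade_surface
+ * instantiation masks out KERNEL_FEATURE_NODE_RAYTRACE, while the raytrace
+ * variant keeps the full surface node mask and runs as a separate kernel
+ * (DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE), presumably so devices
+ * can schedule the cheaper kernel when no raytracing shader nodes are used. */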
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h
new file mode 100644
index 00000000000..095a28ac505
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h
@@ -0,0 +1,1019 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_accumulate.h"
+#include "kernel/kernel_emission.h"
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_passes.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shader.h"
+
+#include "kernel/integrator/integrator_intersect_closest.h"
+#include "kernel/integrator/integrator_volume_stack.h"
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __VOLUME__
+
+/* Events for probabilistic scattering. */
+
+typedef enum VolumeIntegrateEvent {
+ VOLUME_PATH_SCATTERED = 0,
+ VOLUME_PATH_ATTENUATED = 1,
+ VOLUME_PATH_MISSED = 2
+} VolumeIntegrateEvent;
+
+typedef struct VolumeIntegrateResult {
+ /* Throughput and offset for direct light scattering. */
+ bool direct_scatter;
+ float3 direct_throughput;
+ float direct_t;
+ ShaderVolumePhases direct_phases;
+
+ /* Throughput and offset for indirect light scattering. */
+ bool indirect_scatter;
+ float3 indirect_throughput;
+ float indirect_t;
+ ShaderVolumePhases indirect_phases;
+} VolumeIntegrateResult;
+
+/* Ignore paths that have volume throughput below this value, to avoid unnecessary work
+ * and precision issues.
+ * TODO: this value could be tweaked or turned into a probability to avoid unnecessary
+ * work in volumes and subsurface scattering. */
+# define VOLUME_THROUGHPUT_EPSILON 1e-6f
+
+/* Volume shader properties
+ *
+ * extinction coefficient = absorption coefficient + scattering coefficient
+ * sigma_t = sigma_a + sigma_s */
+
+typedef struct VolumeShaderCoefficients {
+ float3 sigma_t;
+ float3 sigma_s;
+ float3 emission;
+} VolumeShaderCoefficients;
+
+/* Evaluate shader to get extinction coefficient at P. */
+ccl_device_inline bool shadow_volume_shader_sample(INTEGRATOR_STATE_ARGS,
+ ShaderData *ccl_restrict sd,
+ float3 *ccl_restrict extinction)
+{
+ shader_eval_volume(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) {
+ return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
+ });
+
+ if (!(sd->flag & SD_EXTINCTION)) {
+ return false;
+ }
+
+ const float density = object_volume_density(kg, sd->object);
+ *extinction = sd->closure_transparent_extinction * density;
+ return true;
+}
+
+/* Evaluate shader to get absorption, scattering and emission at P. */
+ccl_device_inline bool volume_shader_sample(INTEGRATOR_STATE_ARGS,
+ ShaderData *ccl_restrict sd,
+ VolumeShaderCoefficients *coeff)
+{
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ shader_eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) {
+ return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ });
+
+ if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) {
+ return false;
+ }
+
+ coeff->sigma_s = zero_float3();
+ coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3();
+ coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_float3();
+
+ if (sd->flag & SD_SCATTER) {
+ for (int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ if (CLOSURE_IS_VOLUME(sc->type)) {
+ coeff->sigma_s += sc->weight;
+ }
+ }
+ }
+
+ const float density = object_volume_density(kg, sd->object);
+ coeff->sigma_s *= density;
+ coeff->sigma_t *= density;
+ coeff->emission *= density;
+
+ return true;
+}
+
+ccl_device_forceinline void volume_step_init(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ const float object_step_size,
+ float t,
+ float *step_size,
+ float *step_shade_offset,
+ float *steps_offset,
+ int *max_steps)
+{
+ if (object_step_size == FLT_MAX) {
+ /* Homogeneous volume. */
+ *step_size = t;
+ *step_shade_offset = 0.0f;
+ *steps_offset = 1.0f;
+ *max_steps = 1;
+ }
+ else {
+ /* Heterogeneous volume. */
+ *max_steps = kernel_data.integrator.volume_max_steps;
+ float step = min(object_step_size, t);
+
+ /* compute exact steps in advance for malloc */
+ if (t > *max_steps * step) {
+ step = t / (float)*max_steps;
+ }
+
+ *step_size = step;
+
+    /* Perform shading at this offset within a step, to integrate over
+     * the entire step segment. */
+ *step_shade_offset = path_state_rng_1D_hash(kg, rng_state, 0x1e31d8a4);
+
+    /* Shift the starting point of all segments by this random amount to
+     * avoid banding artifacts from the volume bounding shape. */
+ *steps_offset = path_state_rng_1D_hash(kg, rng_state, 0x3d22c7b3);
+ }
+}
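+
+/* A worked example, assuming kernel_data.integrator.volume_max_steps = 1024:
+ * for t = 4.0 and object_step_size = 0.1 the step size stays 0.1 and the
+ * stepping loops terminate after about 40 steps; for t = 200.0 the budget
+ * would be exceeded (1024 * 0.1 < 200), so the step size is stretched to
+ * 200 / 1024. Homogeneous volumes (object_step_size == FLT_MAX) take a
+ * single step over the full segment. */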
+
+/* Volume Shadows
+ *
+ * These functions are used to attenuate shadow rays to lights. Both absorption
+ * and scattering will block light, represented by the extinction coefficient. */
+
+# if 0
+/* Homogeneous volume: assume shader evaluation at the start gives
+ * the extinction coefficient for the entire line segment. */
+ccl_device void volume_shadow_homogeneous(INTEGRATOR_STATE_ARGS,
+ Ray *ccl_restrict ray,
+ ShaderData *ccl_restrict sd,
+ float3 *ccl_restrict throughput)
+{
+ float3 sigma_t = zero_float3();
+
+ if (shadow_volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &sigma_t)) {
+ *throughput *= volume_color_transmittance(sigma_t, ray->t);
+ }
+}
+# endif
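+
+/* In both the homogeneous and heterogeneous case, attenuation follows
+ * Beer-Lambert: volume_color_transmittance() evaluates
+ *
+ *   T(t) = exp(-sigma_t * t)
+ *
+ * per color channel. The heterogeneous version below approximates the line
+ * integral of sigma_t by accumulating -sigma_t * dt per step and
+ * exponentiating the running sum. */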
+
+/* heterogeneous volume: integrate stepping through the volume until we
+ * reach the end, get absorbed entirely, or run out of iterations */
+ccl_device void volume_shadow_heterogeneous(INTEGRATOR_STATE_ARGS,
+ Ray *ccl_restrict ray,
+ ShaderData *ccl_restrict sd,
+ float3 *ccl_restrict throughput,
+ const float object_step_size)
+{
+ /* Load random number state. */
+ RNGState rng_state;
+ shadow_path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ float3 tp = *throughput;
+
+ /* Prepare for stepping.
+ * For shadows we do not offset all segments, since the starting point is
+ * already a random distance inside the volume. It also appears to create
+ * banding artifacts for unknown reasons. */
+ int max_steps;
+ float step_size, step_shade_offset, unused;
+ volume_step_init(kg,
+ &rng_state,
+ object_step_size,
+ ray->t,
+ &step_size,
+ &step_shade_offset,
+ &unused,
+ &max_steps);
+ const float steps_offset = 1.0f;
+
+ /* compute extinction at the start */
+ float t = 0.0f;
+
+ float3 sum = zero_float3();
+
+ for (int i = 0; i < max_steps; i++) {
+ /* advance to new position */
+ float new_t = min(ray->t, (i + steps_offset) * step_size);
+ float dt = new_t - t;
+
+ float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
+ float3 sigma_t = zero_float3();
+
+ /* compute attenuation over segment */
+ sd->P = new_P;
+ if (shadow_volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &sigma_t)) {
+      /* Compute `expf()` only for every Nth step, to save some calculations
+       * because `exp(a)*exp(b) = exp(a+b)`; also do a quick
+       * #VOLUME_THROUGHPUT_EPSILON check then. */
+      sum += (-sigma_t * dt);
+      if ((i & 0x07) == 0) { /* TODO: use a different interval? */
+ tp = *throughput * exp3(sum);
+
+ /* stop if nearly all light is blocked */
+ if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
+ tp.z < VOLUME_THROUGHPUT_EPSILON)
+ break;
+ }
+ }
+
+ /* stop if at the end of the volume */
+ t = new_t;
+ if (t == ray->t) {
+ /* Update throughput in case we haven't done it above */
+ tp = *throughput * exp3(sum);
+ break;
+ }
+ }
+
+ *throughput = tp;
+}
+
+/* Equi-angular sampling as in:
+ * "Importance Sampling Techniques for Path Tracing in Participating Media" */
+
+ccl_device float volume_equiangular_sample(const Ray *ccl_restrict ray,
+ const float3 light_P,
+ const float xi,
+ float *pdf)
+{
+ const float t = ray->t;
+ const float delta = dot((light_P - ray->P), ray->D);
+ const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+ if (UNLIKELY(D == 0.0f)) {
+ *pdf = 0.0f;
+ return 0.0f;
+ }
+ const float theta_a = -atan2f(delta, D);
+ const float theta_b = atan2f(t - delta, D);
+ const float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a);
+ if (UNLIKELY(theta_b == theta_a)) {
+ *pdf = 0.0f;
+ return 0.0f;
+ }
+ *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
+
+ return min(t, delta + t_); /* min is only for float precision errors */
+}
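+
+/* Derivation sketch for the sample and pdf above: with D the distance from
+ * the light to the ray and delta its projection onto the ray direction, the
+ * substitution t_ = D * tan(theta) gives dt_ = ((D^2 + t_^2) / D) * dtheta.
+ * Drawing theta uniformly on [theta_a, theta_b] therefore yields
+ *
+ *   pdf(t_) = D / ((theta_b - theta_a) * (D^2 + t_^2))
+ *
+ * which importance-samples points on the ray with density proportional to
+ * the inverse squared distance to the light position. */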
+
+ccl_device float volume_equiangular_pdf(const Ray *ccl_restrict ray,
+ const float3 light_P,
+ const float sample_t)
+{
+ const float delta = dot((light_P - ray->P), ray->D);
+ const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+ if (UNLIKELY(D == 0.0f)) {
+ return 0.0f;
+ }
+
+ const float t = ray->t;
+ const float t_ = sample_t - delta;
+
+ const float theta_a = -atan2f(delta, D);
+ const float theta_b = atan2f(t - delta, D);
+ if (UNLIKELY(theta_b == theta_a)) {
+ return 0.0f;
+ }
+
+ const float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
+
+ return pdf;
+}
+
+ccl_device float volume_equiangular_cdf(const Ray *ccl_restrict ray,
+ const float3 light_P,
+ const float sample_t)
+{
+ float delta = dot((light_P - ray->P), ray->D);
+ float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
+ if (UNLIKELY(D == 0.0f)) {
+ return 0.0f;
+ }
+
+ const float t = ray->t;
+ const float t_ = sample_t - delta;
+
+ const float theta_a = -atan2f(delta, D);
+ const float theta_b = atan2f(t - delta, D);
+ if (UNLIKELY(theta_b == theta_a)) {
+ return 0.0f;
+ }
+
+ const float theta_sample = atan2f(t_, D);
+ const float cdf = (theta_sample - theta_a) / (theta_b - theta_a);
+
+ return cdf;
+}
+
+/* Distance sampling */
+
+ccl_device float volume_distance_sample(
+ float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
+{
+  /* xi is in [0, 1[ so log(0) should never happen; division by zero is
+   * avoided because sample_sigma_t > 0 when SD_SCATTER is set. */
+ float sample_sigma_t = volume_channel_get(sigma_t, channel);
+ float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
+ float sample_transmittance = volume_channel_get(full_transmittance, channel);
+
+ float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t);
+
+ *transmittance = volume_color_transmittance(sigma_t, sample_t);
+ *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance);
+
+  /* TODO: optimization: when taken together with the hit/miss decision,
+   * the full_transmittance cancels out and xi does not need to be
+   * remapped. */
+
+ return sample_t;
+}
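+
+/* Sketch of the inversion above: t is drawn with density proportional to
+ * sigma_t * exp(-sigma_t * t) on [0, max_t] for the chosen channel.
+ * Normalizing by (1 - T_max), with T_max the full-segment transmittance, and
+ * inverting the CDF gives
+ *
+ *   t = -log(1 - xi * (1 - T_max)) / sigma_t
+ *
+ * exactly as computed for sample_t; the min() with max_t only guards against
+ * float precision errors. */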
+
+ccl_device float3 volume_distance_pdf(float max_t, float3 sigma_t, float sample_t)
+{
+ float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
+ float3 transmittance = volume_color_transmittance(sigma_t, sample_t);
+
+ return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance);
+}
+
+/* Emission */
+
+ccl_device float3 volume_emission_integrate(VolumeShaderCoefficients *coeff,
+ int closure_flag,
+ float3 transmittance,
+ float t)
+{
+ /* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
+ * this goes to E * t as sigma_t goes to zero
+ *
+ * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
+ float3 emission = coeff->emission;
+
+ if (closure_flag & SD_EXTINCTION) {
+ float3 sigma_t = coeff->sigma_t;
+
+ emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t;
+ emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t;
+ emission.z *= (sigma_t.z > 0.0f) ? (1.0f - transmittance.z) / sigma_t.z : t;
+ }
+ else
+ emission *= t;
+
+ return emission;
+}
+
+/* Volume Integration */
+
+typedef struct VolumeIntegrateState {
+ /* Volume segment extents. */
+ float start_t;
+ float end_t;
+
+ /* If volume is absorption-only up to this point, and no probabilistic
+ * scattering or termination has been used yet. */
+ bool absorption_only;
+
+ /* Random numbers for scattering. */
+ float rscatter;
+ float rphase;
+
+ /* Multiple importance sampling. */
+ VolumeSampleMethod direct_sample_method;
+ bool use_mis;
+ float distance_pdf;
+ float equiangular_pdf;
+} VolumeIntegrateState;
+
+ccl_device_forceinline void volume_integrate_step_scattering(
+ const ShaderData *sd,
+ const Ray *ray,
+ const float3 equiangular_light_P,
+ const VolumeShaderCoefficients &ccl_restrict coeff,
+ const float3 transmittance,
+ VolumeIntegrateState &ccl_restrict vstate,
+ VolumeIntegrateResult &ccl_restrict result)
+{
+  /* Pick a random color channel; we use the Veach one-sample
+   * model with balance heuristic for the channels. */
+ const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+ float3 channel_pdf;
+ const int channel = volume_sample_channel(
+ albedo, result.indirect_throughput, vstate.rphase, &channel_pdf);
+
+ /* Equiangular sampling for direct lighting. */
+ if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR && !result.direct_scatter) {
+ if (result.direct_t >= vstate.start_t && result.direct_t <= vstate.end_t) {
+ const float new_dt = result.direct_t - vstate.start_t;
+ const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
+
+ result.direct_scatter = true;
+ result.direct_throughput *= coeff.sigma_s * new_transmittance / vstate.equiangular_pdf;
+ shader_copy_volume_phases(&result.direct_phases, sd);
+
+ /* Multiple importance sampling. */
+ if (vstate.use_mis) {
+ const float distance_pdf = vstate.distance_pdf *
+ dot(channel_pdf, coeff.sigma_t * new_transmittance);
+ const float mis_weight = 2.0f * power_heuristic(vstate.equiangular_pdf, distance_pdf);
+ result.direct_throughput *= mis_weight;
+ }
+ }
+ else {
+ result.direct_throughput *= transmittance;
+ vstate.distance_pdf *= dot(channel_pdf, transmittance);
+ }
+ }
+
+ /* Distance sampling for indirect and optional direct lighting. */
+ if (!result.indirect_scatter) {
+ /* decide if we will scatter or continue */
+ const float sample_transmittance = volume_channel_get(transmittance, channel);
+
+ if (1.0f - vstate.rscatter >= sample_transmittance) {
+ /* compute sampling distance */
+ const float sample_sigma_t = volume_channel_get(coeff.sigma_t, channel);
+ const float new_dt = -logf(1.0f - vstate.rscatter) / sample_sigma_t;
+ const float new_t = vstate.start_t + new_dt;
+
+ /* transmittance and pdf */
+ const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
+ const float distance_pdf = dot(channel_pdf, coeff.sigma_t * new_transmittance);
+
+ /* throughput */
+ result.indirect_scatter = true;
+ result.indirect_t = new_t;
+ result.indirect_throughput *= coeff.sigma_s * new_transmittance / distance_pdf;
+ shader_copy_volume_phases(&result.indirect_phases, sd);
+
+ if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
+ /* If using distance sampling for direct light, just copy parameters
+ * of indirect light since we scatter at the same point then. */
+ result.direct_scatter = true;
+ result.direct_t = result.indirect_t;
+ result.direct_throughput = result.indirect_throughput;
+ shader_copy_volume_phases(&result.direct_phases, sd);
+
+ /* Multiple importance sampling. */
+ if (vstate.use_mis) {
+ const float equiangular_pdf = volume_equiangular_pdf(ray, equiangular_light_P, new_t);
+ const float mis_weight = power_heuristic(vstate.distance_pdf * distance_pdf,
+ equiangular_pdf);
+ result.direct_throughput *= 2.0f * mis_weight;
+ }
+ }
+ }
+ else {
+ /* throughput */
+ const float pdf = dot(channel_pdf, transmittance);
+ result.indirect_throughput *= transmittance / pdf;
+ if (vstate.direct_sample_method != VOLUME_SAMPLE_EQUIANGULAR) {
+ vstate.distance_pdf *= pdf;
+ }
+
+      /* Remap rscatter so we can reuse it and keep things stratified. */
+ vstate.rscatter = 1.0f - (1.0f - vstate.rscatter) / sample_transmittance;
+ }
+ }
+}
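+
+/* About the 2.0f factors in the MIS weights above: when use_mis is set, one
+ * of the two strategies (distance or equiangular) is picked with probability
+ * 1/2 in volume_integrate_heterogeneous below. The one-sample MIS estimator
+ * divides by that selection probability, which shows up here as the factor
+ * 2.0f multiplying the power heuristic weight. */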
+
+/* heterogeneous volume distance sampling: integrate stepping through the
+ * volume until we reach the end, get absorbed entirely, or run out of
+ * iterations. this does probabilistically scatter or get transmitted through
+ * for path tracing where we don't want to branch. */
+ccl_device_forceinline void volume_integrate_heterogeneous(
+ INTEGRATOR_STATE_ARGS,
+ Ray *ccl_restrict ray,
+ ShaderData *ccl_restrict sd,
+ const RNGState *rng_state,
+ ccl_global float *ccl_restrict render_buffer,
+ const float object_step_size,
+ const VolumeSampleMethod direct_sample_method,
+ const float3 equiangular_light_P,
+ VolumeIntegrateResult &result)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INTEGRATE);
+
+ /* Prepare for stepping.
+ * Using a different step offset for the first step avoids banding artifacts. */
+ int max_steps;
+ float step_size, step_shade_offset, steps_offset;
+ volume_step_init(kg,
+ rng_state,
+ object_step_size,
+ ray->t,
+ &step_size,
+ &step_shade_offset,
+ &steps_offset,
+ &max_steps);
+
+ /* Initialize volume integration state. */
+ VolumeIntegrateState vstate ccl_optional_struct_init;
+ vstate.start_t = 0.0f;
+ vstate.end_t = 0.0f;
+ vstate.absorption_only = true;
+ vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_SCATTER_DISTANCE);
+ vstate.rphase = path_state_rng_1D(kg, rng_state, PRNG_PHASE_CHANNEL);
+
+ /* Multiple importance sampling: pick between equiangular and distance sampling strategy. */
+ vstate.direct_sample_method = direct_sample_method;
+ vstate.use_mis = (direct_sample_method == VOLUME_SAMPLE_MIS);
+ if (vstate.use_mis) {
+ if (vstate.rscatter < 0.5f) {
+ vstate.rscatter *= 2.0f;
+ vstate.direct_sample_method = VOLUME_SAMPLE_DISTANCE;
+ }
+ else {
+ vstate.rscatter = (vstate.rscatter - 0.5f) * 2.0f;
+ vstate.direct_sample_method = VOLUME_SAMPLE_EQUIANGULAR;
+ }
+ }
+ vstate.equiangular_pdf = 0.0f;
+ vstate.distance_pdf = 1.0f;
+
+ /* Initialize volume integration result. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ result.direct_throughput = throughput;
+ result.indirect_throughput = throughput;
+
+ /* Equiangular sampling: compute distance and PDF in advance. */
+ if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR) {
+ result.direct_t = volume_equiangular_sample(
+ ray, equiangular_light_P, vstate.rscatter, &vstate.equiangular_pdf);
+ }
+
+# ifdef __DENOISING_FEATURES__
+ const bool write_denoising_features = (INTEGRATOR_STATE(path, flag) &
+ PATH_RAY_DENOISING_FEATURES);
+ float3 accum_albedo = zero_float3();
+# endif
+ float3 accum_emission = zero_float3();
+
+ for (int i = 0; i < max_steps; i++) {
+ /* Advance to new position */
+ vstate.end_t = min(ray->t, (i + steps_offset) * step_size);
+ const float shade_t = vstate.start_t + (vstate.end_t - vstate.start_t) * step_shade_offset;
+ sd->P = ray->P + ray->D * shade_t;
+
+ /* compute segment */
+ VolumeShaderCoefficients coeff ccl_optional_struct_init;
+ if (volume_shader_sample(INTEGRATOR_STATE_PASS, sd, &coeff)) {
+ const int closure_flag = sd->flag;
+
+ /* Evaluate transmittance over segment. */
+ const float dt = (vstate.end_t - vstate.start_t);
+ const float3 transmittance = (closure_flag & SD_EXTINCTION) ?
+ volume_color_transmittance(coeff.sigma_t, dt) :
+ one_float3();
+
+ /* Emission. */
+ if (closure_flag & SD_EMISSION) {
+ /* Only write emission before indirect light scatter position, since we terminate
+ * stepping at that point if we have already found a direct light scatter position. */
+ if (!result.indirect_scatter) {
+ const float3 emission = volume_emission_integrate(
+ &coeff, closure_flag, transmittance, dt);
+ accum_emission += emission;
+ }
+ }
+
+ if (closure_flag & SD_EXTINCTION) {
+ if ((closure_flag & SD_SCATTER) || !vstate.absorption_only) {
+# ifdef __DENOISING_FEATURES__
+ /* Accumulate albedo for denoising features. */
+ if (write_denoising_features && (closure_flag & SD_SCATTER)) {
+ const float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
+ accum_albedo += result.indirect_throughput * albedo * (one_float3() - transmittance);
+ }
+# endif
+
+ /* Scattering and absorption. */
+ volume_integrate_step_scattering(
+ sd, ray, equiangular_light_P, coeff, transmittance, vstate, result);
+ }
+ else {
+ /* Absorption only. */
+ result.indirect_throughput *= transmittance;
+ result.direct_throughput *= transmittance;
+ }
+
+ /* Stop if nearly all light blocked. */
+ if (!result.indirect_scatter) {
+ if (max3(result.indirect_throughput) < VOLUME_THROUGHPUT_EPSILON) {
+ result.indirect_throughput = zero_float3();
+ break;
+ }
+ }
+ else if (!result.direct_scatter) {
+ if (max3(result.direct_throughput) < VOLUME_THROUGHPUT_EPSILON) {
+ break;
+ }
+ }
+ }
+
+ /* If we have scattering data for both direct and indirect, we're done. */
+ if (result.direct_scatter && result.indirect_scatter) {
+ break;
+ }
+ }
+
+ /* Stop if at the end of the volume. */
+ vstate.start_t = vstate.end_t;
+ if (vstate.start_t == ray->t) {
+ break;
+ }
+ }
+
+ /* Write accumulated emission. */
+ if (!is_zero(accum_emission)) {
+ kernel_accum_emission(
+ INTEGRATOR_STATE_PASS, result.indirect_throughput, accum_emission, render_buffer);
+ }
+
+# ifdef __DENOISING_FEATURES__
+ /* Write denoising features. */
+ if (write_denoising_features) {
+ kernel_write_denoising_features_volume(
+ INTEGRATOR_STATE_PASS, accum_albedo, result.indirect_scatter, render_buffer);
+ }
+# endif /* __DENOISING_FEATURES__ */
+}
+
+# ifdef __EMISSION__
+/* Path tracing: sample a point on a light ahead of volume stepping, for
+ * equiangular sampling. */
+ccl_device_forceinline bool integrate_volume_sample_light(INTEGRATOR_STATE_ARGS,
+ const ShaderData *ccl_restrict sd,
+ const RNGState *ccl_restrict rng_state,
+ LightSample *ccl_restrict ls)
+{
+  /* Test if direct light sampling is needed at all. */
+ if (!kernel_data.integrator.use_direct_light) {
+ return false;
+ }
+
+ /* Sample position on a light. */
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const uint bounce = INTEGRATOR_STATE(path, bounce);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ light_distribution_sample_from_volume_segment(
+ kg, light_u, light_v, sd->time, sd->P, bounce, path_flag, ls);
+
+ if (ls->shader & SHADER_EXCLUDE_SCATTER) {
+ return false;
+ }
+
+ return true;
+}
+
+/* Path tracing: sample point on light and evaluate light shader, then
+ * queue shadow ray to be traced. */
+ccl_device_forceinline void integrate_volume_direct_light(INTEGRATOR_STATE_ARGS,
+ const ShaderData *ccl_restrict sd,
+ const RNGState *ccl_restrict rng_state,
+ const float3 P,
+ const ShaderVolumePhases *ccl_restrict
+ phases,
+ const float3 throughput,
+ LightSample *ccl_restrict ls)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_DIRECT_LIGHT);
+
+ if (!kernel_data.integrator.use_direct_light) {
+ return;
+ }
+
+ /* Sample position on the same light again, now from the shading
+ * point where we scattered.
+ *
+ * TODO: decorrelate random numbers and use light_sample_new_position to
+ * avoid resampling the CDF. */
+ {
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ const uint bounce = INTEGRATOR_STATE(path, bounce);
+ float light_u, light_v;
+ path_state_rng_2D(kg, rng_state, PRNG_LIGHT_U, &light_u, &light_v);
+
+ if (!light_distribution_sample_from_position(
+ kg, light_u, light_v, sd->time, P, bounce, path_flag, ls)) {
+ return;
+ }
+ }
+
+ if (ls->shader & SHADER_EXCLUDE_SCATTER) {
+ return;
+ }
+
+ /* Evaluate light shader.
+ *
+ * TODO: can we reuse sd memory? In theory we can move this after
+ * integrate_surface_bounce, evaluate the BSDF, and only then evaluate
+ * the light shader. This could also move to its own kernel, for
+ * non-constant light sources. */
+ ShaderDataTinyStorage emission_sd_storage;
+ ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
+ const float3 light_eval = light_sample_shader_eval(
+ INTEGRATOR_STATE_PASS, emission_sd, ls, sd->time);
+ if (is_zero(light_eval)) {
+ return;
+ }
+
+ /* Evaluate BSDF. */
+ BsdfEval phase_eval ccl_optional_struct_init;
+ const float phase_pdf = shader_volume_phase_eval(kg, sd, phases, ls->D, &phase_eval);
+
+ if (ls->shader & SHADER_USE_MIS) {
+ float mis_weight = power_heuristic(ls->pdf, phase_pdf);
+ bsdf_eval_mul(&phase_eval, mis_weight);
+ }
+
+ bsdf_eval_mul3(&phase_eval, light_eval / ls->pdf);
+
+ /* Path termination. */
+ const float terminate = path_state_rng_light_termination(kg, rng_state);
+ if (light_sample_terminate(kg, ls, &phase_eval, terminate)) {
+ return;
+ }
+
+ /* Create shadow ray. */
+ Ray ray ccl_optional_struct_init;
+ light_sample_to_volume_shadow_ray(kg, sd, ls, P, &ray);
+ const bool is_light = light_sample_is_light(ls);
+
+ /* Write shadow ray and associated state to global memory. */
+ integrator_state_write_shadow_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Copy state from main path to shadow path. */
+ const uint16_t bounce = INTEGRATOR_STATE(path, bounce);
+ const uint16_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce);
+ uint32_t shadow_flag = INTEGRATOR_STATE(path, flag);
+ shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
+ shadow_flag |= PATH_RAY_VOLUME_PASS;
+ const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ const float3 diffuse_glossy_ratio = (bounce == 0) ?
+ one_float3() :
+ INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+ }
+
+ INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag;
+ INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce;
+ INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce;
+ INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput_phase;
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
+ INTEGRATOR_STATE_WRITE(shadow_path, unshadowed_throughput) = throughput;
+ }
+
+ integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS);
+
+ /* Branch off shadow kernel. */
+ INTEGRATOR_SHADOW_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+}
+# endif
+
+/* Path tracing: scatter in new direction using phase function */
+ccl_device_forceinline bool integrate_volume_phase_scatter(INTEGRATOR_STATE_ARGS,
+ ShaderData *sd,
+ const RNGState *rng_state,
+ const ShaderVolumePhases *phases)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_INDIRECT_LIGHT);
+
+ float phase_u, phase_v;
+ path_state_rng_2D(kg, rng_state, PRNG_BSDF_U, &phase_u, &phase_v);
+
+ /* Phase closure, sample direction. */
+ float phase_pdf;
+ BsdfEval phase_eval ccl_optional_struct_init;
+ float3 phase_omega_in ccl_optional_struct_init;
+ differential3 phase_domega_in ccl_optional_struct_init;
+
+ const int label = shader_volume_phase_sample(kg,
+ sd,
+ phases,
+ phase_u,
+ phase_v,
+ &phase_eval,
+ &phase_omega_in,
+ &phase_domega_in,
+ &phase_pdf);
+
+ if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval)) {
+ return false;
+ }
+
+ /* Setup ray. */
+ INTEGRATOR_STATE_WRITE(ray, P) = sd->P;
+ INTEGRATOR_STATE_WRITE(ray, D) = normalize(phase_omega_in);
+ INTEGRATOR_STATE_WRITE(ray, t) = FLT_MAX;
+
+# ifdef __RAY_DIFFERENTIALS__
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+ INTEGRATOR_STATE_WRITE(ray, dD) = differential_make_compact(phase_domega_in);
+# endif
+
+ /* Update throughput. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval) / phase_pdf;
+ INTEGRATOR_STATE_WRITE(path, throughput) = throughput_phase;
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3();
+ }
+
+ /* Update path state */
+ INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = phase_pdf;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = fminf(phase_pdf,
+ INTEGRATOR_STATE(path, min_ray_pdf));
+
+ path_state_next(INTEGRATOR_STATE_PASS, label);
+ return true;
+}
+
+/* Get the volume attenuation and emission over the line segment defined by
+ * the ray, with the assumption that there are no surfaces blocking light
+ * between the endpoints. Distance sampling is used to decide if we will
+ * scatter or not. */
+ccl_device VolumeIntegrateEvent volume_integrate(INTEGRATOR_STATE_ARGS,
+ Ray *ccl_restrict ray,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ ShaderData sd;
+ shader_setup_from_volume(kg, &sd, ray);
+
+ /* Load random number state. */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ /* Sample light ahead of volume stepping, for equiangular sampling. */
+ /* TODO: distant lights are ignored now, but could instead use even distribution. */
+ LightSample ls ccl_optional_struct_init;
+ const bool need_light_sample = !(INTEGRATOR_STATE(path, flag) & PATH_RAY_TERMINATE);
+ const bool have_equiangular_sample = need_light_sample &&
+ integrate_volume_sample_light(
+ INTEGRATOR_STATE_PASS, &sd, &rng_state, &ls) &&
+ (ls.t != FLT_MAX);
+
+ VolumeSampleMethod direct_sample_method = (have_equiangular_sample) ?
+ volume_stack_sample_method(INTEGRATOR_STATE_PASS) :
+ VOLUME_SAMPLE_DISTANCE;
+
+ /* Step through volume. */
+ const float step_size = volume_stack_step_size(INTEGRATOR_STATE_PASS, [=](const int i) {
+ return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ });
+
+ /* TODO: expensive to zero closures? */
+ VolumeIntegrateResult result = {};
+ volume_integrate_heterogeneous(INTEGRATOR_STATE_PASS,
+ ray,
+ &sd,
+ &rng_state,
+ render_buffer,
+ step_size,
+ direct_sample_method,
+ ls.P,
+ result);
+
+  /* Perform path termination. The intersect_closest kernel will have already marked this
+   * path to be terminated. That makes shader evaluation leave out any scattering closures,
+   * but emission and absorption are still handled for multiple importance sampling. */
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
+ const float probability = (path_flag & PATH_RAY_TERMINATE_IN_NEXT_VOLUME) ?
+ 0.0f :
+ path_state_continuation_probability(INTEGRATOR_STATE_PASS,
+ path_flag);
+ if (probability == 0.0f) {
+ return VOLUME_PATH_MISSED;
+ }
+
+ /* Direct light. */
+ if (result.direct_scatter) {
+ const float3 direct_P = ray->P + result.direct_t * ray->D;
+ result.direct_throughput /= probability;
+ integrate_volume_direct_light(INTEGRATOR_STATE_PASS,
+ &sd,
+ &rng_state,
+ direct_P,
+ &result.direct_phases,
+ result.direct_throughput,
+ &ls);
+ }
+
+ /* Indirect light.
+ *
+ * Only divide throughput by probability if we scatter. For the attenuation
+ * case the next surface will already do this division. */
+ if (result.indirect_scatter) {
+ result.indirect_throughput /= probability;
+ }
+ INTEGRATOR_STATE_WRITE(path, throughput) = result.indirect_throughput;
+
+ if (result.indirect_scatter) {
+ sd.P = ray->P + result.indirect_t * ray->D;
+
+ if (integrate_volume_phase_scatter(
+ INTEGRATOR_STATE_PASS, &sd, &rng_state, &result.indirect_phases)) {
+ return VOLUME_PATH_SCATTERED;
+ }
+ else {
+ return VOLUME_PATH_MISSED;
+ }
+ }
+ else {
+ return VOLUME_PATH_ATTENUATED;
+ }
+}
+
+#endif
+
+ccl_device void integrator_shade_volume(INTEGRATOR_STATE_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_SETUP);
+
+#ifdef __VOLUME__
+ /* Setup shader data. */
+ Ray ray ccl_optional_struct_init;
+ integrator_state_read_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ Intersection isect ccl_optional_struct_init;
+ integrator_state_read_isect(INTEGRATOR_STATE_PASS, &isect);
+
+ /* Set ray length to current segment. */
+ ray.t = (isect.prim != PRIM_NONE) ? isect.t : FLT_MAX;
+
+ /* Clean volume stack for background rays. */
+ if (isect.prim == PRIM_NONE) {
+ volume_stack_clean(INTEGRATOR_STATE_PASS);
+ }
+
+ VolumeIntegrateEvent event = volume_integrate(INTEGRATOR_STATE_PASS, &ray, render_buffer);
+
+ if (event == VOLUME_PATH_SCATTERED) {
+ /* Queue intersect_closest kernel. */
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+ return;
+ }
+ else if (event == VOLUME_PATH_MISSED) {
+ /* End path. */
+ INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+ return;
+ }
+ else {
+ /* Continue to background, light or surface. */
+ if (isect.prim == PRIM_NONE) {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+ return;
+ }
+ else if (isect.type & PRIMITIVE_LAMP) {
+ INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+ return;
+ }
+ else {
+ /* Hit a surface, continue with surface kernel unless terminated. */
+ const int shader = intersection_get_shader(kg, &isect);
+ const int flags = kernel_tex_fetch(__shaders, shader).flags;
+
+ integrator_intersect_shader_next_kernel<DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME>(
+ INTEGRATOR_STATE_PASS, &isect, shader, flags);
+ return;
+ }
+ }
+#endif /* __VOLUME__ */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
new file mode 100644
index 00000000000..094446be02c
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Integrator State
+ *
+ * This file defines the data structures that define the state of a path. Any state that is
+ * preserved and passed between kernel executions is part of this.
+ *
+ * The size of this state must be kept as small as possible, to reduce cache misses and keep memory
+ * usage under control on GPUs that may execute millions of kernels.
+ *
+ * Memory may be allocated and passed along in different ways depending on the device. There may
+ * be a scalar layout, or AoS or SoA layout for batches. The state may be passed along as a pointer
+ * to every kernel, or the pointer may exist at program scope or in constant memory. To abstract
+ * these differences between devices and experiment with different layouts, macros are used.
+ *
+ * INTEGRATOR_STATE_ARGS: prepend to argument definitions for every function that accesses
+ * path state.
+ * INTEGRATOR_STATE_CONST_ARGS: same as INTEGRATOR_STATE_ARGS, when state is read-only
+ * INTEGRATOR_STATE_PASS: use to pass along state to other functions that access it.
+ *
+ * INTEGRATOR_STATE(x, y): read nested struct member x.y of IntegratorState
+ * INTEGRATOR_STATE_WRITE(x, y): write to nested struct member x.y of IntegratorState
+ *
+ * INTEGRATOR_STATE_ARRAY(x, index, y): read x[index].y
+ * INTEGRATOR_STATE_ARRAY_WRITE(x, index, y): write x[index].y
+ *
+ * INTEGRATOR_STATE_COPY(to_x, from_x): copy contents of one nested struct to another
+ *
+ * INTEGRATOR_STATE_IS_NULL: test if any integrator state is available, for shader evaluation
+ * INTEGRATOR_STATE_PASS_NULL: use to pass empty state to other functions.
+ *
+ * NOTE: if we end up with a device that passes no arguments, the leading comma will be a problem.
+ * Can solve it with more macros if we encounter it, but rather ugly so postpone for now.
+ */
+
+#include "kernel/kernel_types.h"
+
+#include "util/util_types.h"
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Constants
+ *
+ * TODO: these could be made dynamic depending on the features used in the scene. */
+
+#define INTEGRATOR_VOLUME_STACK_SIZE VOLUME_STACK_SIZE
+#define INTEGRATOR_SHADOW_ISECT_SIZE 4
+
+/* Data structures */
+
+/* Integrator State
+ *
+ * CPU rendering path state with AoS layout. */
+typedef struct IntegratorStateCPU {
+#define KERNEL_STRUCT_BEGIN(name) struct {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
+#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
+#define KERNEL_STRUCT_END(name) \
+ } \
+ name;
+#define KERNEL_STRUCT_END_ARRAY(name, size) \
+ } \
+ name[size];
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+} IntegratorStateCPU;
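+
+/* For illustration (a sketch of the expansion, not additional code): with the
+ * template above, a section such as `path` expands to a plain nested struct,
+ * roughly:
+ *
+ *   struct {
+ *     uint32_t render_pixel_index;
+ *     uint16_t sample;
+ *     ...
+ *   } path;
+ *
+ * and KERNEL_STRUCT_END_ARRAY sections such as `volume_stack` expand to
+ * fixed-size arrays of such structs. */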
+
+/* Path Queue
+ *
+ * Keep track of which kernels are queued to be executed next in the path
+ * for GPU rendering. */
+typedef struct IntegratorQueueCounter {
+ int num_queued[DEVICE_KERNEL_INTEGRATOR_NUM];
+} IntegratorQueueCounter;
+
+/* Integrator State GPU
+ *
+ * GPU rendering path state with SoA layout. */
+typedef struct IntegratorStateGPU {
+#define KERNEL_STRUCT_BEGIN(name) struct {
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type *name;
+#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
+#define KERNEL_STRUCT_END(name) \
+ } \
+ name;
+#define KERNEL_STRUCT_END_ARRAY(name, size) \
+ } \
+ name[size];
+#include "kernel/integrator/integrator_state_template.h"
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
+#undef KERNEL_STRUCT_END
+#undef KERNEL_STRUCT_END_ARRAY
+
+ /* Count number of queued kernels. */
+ IntegratorQueueCounter *queue_counter;
+
+ /* Count number of kernels queued for specific shaders. */
+ int *sort_key_counter[DEVICE_KERNEL_INTEGRATOR_NUM];
+
+ /* Index of path which will be used by the next shadow catcher split. */
+ int *next_shadow_catcher_path_index;
+} IntegratorStateGPU;
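+
+/* For illustration (a sketch of the expansion): with the template above, each
+ * member becomes a device pointer to a per-member array indexed by the path
+ * state, roughly:
+ *
+ *   struct {
+ *     uint32_t *render_pixel_index;
+ *     uint16_t *sample;
+ *     ...
+ *   } path;
+ *
+ * so a member access for a given path reads `path.member[state]`. */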
+
+/* Abstraction
+ *
+ * Macros to access data structures on different devices.
+ *
+ * Note that there is a special access function for the shadow catcher state. That access must
+ * happen from a kernel which operates on a "main" path. Attempting to use shadow catcher
+ * accessors from a kernel which operates on a shadow catcher state will cause bad memory access.
+
+#ifdef __KERNEL_CPU__
+
+/* Scalar access on CPU. */
+
+typedef IntegratorStateCPU *ccl_restrict IntegratorState;
+
+# define INTEGRATOR_STATE_ARGS \
+ ccl_attr_maybe_unused const KernelGlobals *ccl_restrict kg, \
+ IntegratorStateCPU *ccl_restrict state
+# define INTEGRATOR_STATE_CONST_ARGS \
+ ccl_attr_maybe_unused const KernelGlobals *ccl_restrict kg, \
+ const IntegratorStateCPU *ccl_restrict state
+# define INTEGRATOR_STATE_PASS kg, state
+
+# define INTEGRATOR_STATE_PASS_NULL kg, NULL
+# define INTEGRATOR_STATE_IS_NULL (state == NULL)
+
+# define INTEGRATOR_STATE(nested_struct, member) \
+ (((const IntegratorStateCPU *)state)->nested_struct.member)
+# define INTEGRATOR_STATE_WRITE(nested_struct, member) (state->nested_struct.member)
+
+# define INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) \
+ (((const IntegratorStateCPU *)state)->nested_struct[array_index].member)
+# define INTEGRATOR_STATE_ARRAY_WRITE(nested_struct, array_index, member) \
+ ((state)->nested_struct[array_index].member)
+
+#else /* __KERNEL_CPU__ */
+
+/* Array access on GPU with Structure-of-Arrays. */
+
+typedef int IntegratorState;
+
+# define INTEGRATOR_STATE_ARGS const KernelGlobals *ccl_restrict kg, const IntegratorState state
+# define INTEGRATOR_STATE_CONST_ARGS \
+ const KernelGlobals *ccl_restrict kg, const IntegratorState state
+# define INTEGRATOR_STATE_PASS kg, state
+
+# define INTEGRATOR_STATE_PASS_NULL kg, -1
+# define INTEGRATOR_STATE_IS_NULL (state == -1)
+
+# define INTEGRATOR_STATE(nested_struct, member) \
+ kernel_integrator_state.nested_struct.member[state]
+# define INTEGRATOR_STATE_WRITE(nested_struct, member) INTEGRATOR_STATE(nested_struct, member)
+
+# define INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member) \
+ kernel_integrator_state.nested_struct[array_index].member[state]
+# define INTEGRATOR_STATE_ARRAY_WRITE(nested_struct, array_index, member) \
+ INTEGRATOR_STATE_ARRAY(nested_struct, array_index, member)
+
+#endif /* __KERNEL_CPU__ */
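+
+/* Illustrative sketch: hypothetical helpers (not part of the actual kernels)
+ * showing how the macros above are used. The same code compiles against both
+ * definitions, CPU AoS pointers and GPU SoA indices. */
+ccl_device_inline int integrator_state_example_stack_object(INTEGRATOR_STATE_CONST_ARGS)
+{
+  /* Read an array entry, here the object of the first volume stack slot. */
+  return INTEGRATOR_STATE_ARRAY(volume_stack, 0, object);
+}
+
+ccl_device_inline void integrator_state_example_next_bounce(INTEGRATOR_STATE_ARGS)
+{
+  /* Read and write scalar path state. */
+  INTEGRATOR_STATE_WRITE(path, bounce) = INTEGRATOR_STATE(path, bounce) + 1;
+
+  /* Pass the state along to another function. */
+  (void)integrator_state_example_stack_object(INTEGRATOR_STATE_PASS);
+}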
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_state_flow.h b/intern/cycles/kernel/integrator/integrator_state_flow.h
new file mode 100644
index 00000000000..8477efd7b66
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state_flow.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_types.h"
+#include "util/util_atomic.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Control Flow
+ *
+ * Utilities for control flow between kernels. The implementation may differ per device
+ * or even be handled on the host side. To abstract such differences, to allow
+ * experimenting with different implementations, and for debugging, this is expressed
+ * using macros.
+ *
+ * There is a main path for regular path tracing from the camera. Shadow rays for next
+ * event estimation branch off from this into their own path, which may be computed in
+ * parallel while the main path continues.
+ *
+ * Each kernel on the main path must call exactly one of these macros, and may not call
+ * it more than once:
+ *
+ * INTEGRATOR_PATH_INIT(next_kernel)
+ * INTEGRATOR_PATH_NEXT(current_kernel, next_kernel)
+ * INTEGRATOR_PATH_TERMINATE(current_kernel)
+ *
+ * The _SORTED variants additionally record a shader sort key so paths can be batched
+ * by shader on the GPU. For the shadow path similar macros are used, and again each
+ * shadow kernel must call one of them, and only once.
+ */
+
+#define INTEGRATOR_PATH_IS_TERMINATED (INTEGRATOR_STATE(path, queued_kernel) == 0)
+#define INTEGRATOR_SHADOW_PATH_IS_TERMINATED (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0)
+
+#ifdef __KERNEL_GPU__
+
+# define INTEGRATOR_PATH_INIT(next_kernel) \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel;
+# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel;
+# define INTEGRATOR_PATH_TERMINATE(current_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+
+# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel;
+# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
+ 1); \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel;
+# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+
+# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
+ { \
+ const int key_ = key; \
+ atomic_fetch_and_add_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ INTEGRATOR_STATE_WRITE(path, shader_sort_key) = key_; \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
+ 1); \
+ }
+# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
+ { \
+ const int key_ = key; \
+ atomic_fetch_and_sub_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
+ atomic_fetch_and_add_uint32( \
+ &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ INTEGRATOR_STATE_WRITE(path, shader_sort_key) = key_; \
+ atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
+ 1); \
+ }
+
+#else
+
+# define INTEGRATOR_PATH_INIT(next_kernel) \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel;
+# define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ (void)key; \
+ }
+# define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ (void)current_kernel; \
+ }
+# define INTEGRATOR_PATH_TERMINATE(current_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; \
+ (void)current_kernel; \
+ }
+# define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
+ { \
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = next_kernel; \
+ (void)key; \
+ (void)current_kernel; \
+ }
+
+# define INTEGRATOR_SHADOW_PATH_INIT(next_kernel) \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel;
+# define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = next_kernel; \
+ (void)current_kernel; \
+ }
+# define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
+ { \
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; \
+ (void)current_kernel; \
+ }
+
+#endif
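+
+/* Illustrative sketch: a hypothetical kernel tail (not part of the actual
+ * kernels) showing the intended usage. Exactly one flow macro runs before the
+ * kernel returns, forwarding the path to surface shading on a hit and
+ * terminating it otherwise. */
+ccl_device_inline void integrator_example_intersect_tail(INTEGRATOR_STATE_ARGS, const bool hit)
+{
+  if (hit) {
+    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE);
+  }
+  else {
+    INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+  }
+}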
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/integrator_state_template.h
new file mode 100644
index 00000000000..41dd1bfcdbf
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state_template.h
@@ -0,0 +1,163 @@
+
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/************************************ Path State *****************************/
+
+KERNEL_STRUCT_BEGIN(path)
+/* Index of a pixel within the device render buffer where this path will write its result.
+ * To get an actual offset within the buffer the value needs to be multiplied by the
+ * `kernel_data.film.pass_stride`.
+ *
+ * The multiplication is delayed until later, so that the state can use a 32-bit integer. */
+KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRACING)
+/* Current sample number. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING)
+/* Current ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current diffuse ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, diffuse_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current glossy ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, glossy_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transmission ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, transmission_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current volume ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current volume bounds ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, volume_bounds_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transparent ray bounce depth. */
+KERNEL_STRUCT_MEMBER(path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* DeviceKernel bit indicating queued kernels.
+ * TODO: reduce size? */
+KERNEL_STRUCT_MEMBER(path, uint32_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
+/* Random number generator seed. */
+KERNEL_STRUCT_MEMBER(path, uint32_t, rng_hash, KERNEL_FEATURE_PATH_TRACING)
+/* Random number dimension offset. */
+KERNEL_STRUCT_MEMBER(path, uint32_t, rng_offset, KERNEL_FEATURE_PATH_TRACING)
+/* enum PathRayFlag */
+KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+/* Multiple importance sampling
+ * The PDF of BSDF sampling at the last scatter point, and distance to the
+ * last scatter point minus the last ray segment. This distance lets us
+ * compute the complete distance through transparent surfaces and volumes. */
+KERNEL_STRUCT_MEMBER(path, float, mis_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(path, float, mis_ray_t, KERNEL_FEATURE_PATH_TRACING)
+/* Filter glossy. */
+KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput. */
+KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+/* Ratio of throughput to distinguish diffuse and glossy render passes. */
+KERNEL_STRUCT_MEMBER(path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
+/* Denoising. */
+KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
+/* Shader sorting. */
+/* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? */
+KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(path)
+
+/************************************** Ray ***********************************/
+
+KERNEL_STRUCT_BEGIN(ray)
+KERNEL_STRUCT_MEMBER(ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, dD, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(ray)
+
+/*************************** Intersection result ******************************/
+
+/* Result from scene intersection. */
+KERNEL_STRUCT_BEGIN(isect)
+KERNEL_STRUCT_MEMBER(isect, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, float, u, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, float, v, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, int, object, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(isect, int, type, KERNEL_FEATURE_PATH_TRACING)
+/* TODO: exclude for GPU. */
+KERNEL_STRUCT_MEMBER(isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(isect)
+
+/*************** Subsurface closure state for subsurface kernel ***************/
+
+KERNEL_STRUCT_BEGIN(subsurface)
+KERNEL_STRUCT_MEMBER(subsurface, float3, albedo, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, float3, radius, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, float, roughness, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_END(subsurface)
+
+/********************************** Volume Stack ******************************/
+
+KERNEL_STRUCT_BEGIN(volume_stack)
+KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, object, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
+
+/********************************* Shadow Path State **************************/
+
+KERNEL_STRUCT_BEGIN(shadow_path)
+/* Current ray bounce depth. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, bounce, KERNEL_FEATURE_PATH_TRACING)
+/* Current transparent ray bounce depth. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, transparent_bounce, KERNEL_FEATURE_PATH_TRACING)
+/* DeviceKernel bit indicating queued kernels.
+ * TODO: reduce size? */
+KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, queued_kernel, KERNEL_FEATURE_PATH_TRACING)
+/* enum PathRayFlag */
+KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+/* Throughput for shadow pass. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, unshadowed_throughput, KERNEL_FEATURE_SHADOW_PASS)
+/* Ratio of throughput to distinguish diffuse and glossy render passes. */
+KERNEL_STRUCT_MEMBER(shadow_path, float3, diffuse_glossy_ratio, KERNEL_FEATURE_LIGHT_PASSES)
+/* Number of intersections found by ray-tracing. */
+KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(shadow_path)
+
+/********************************** Shadow Ray *******************************/
+
+KERNEL_STRUCT_BEGIN(shadow_ray)
+KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END(shadow_ray)
+
+/*********************** Shadow Intersection result **************************/
+
+/* Result from scene intersection. */
+KERNEL_STRUCT_BEGIN(shadow_isect)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, u, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float, v, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, prim, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING)
+/* TODO: exclude for GPU. */
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_END_ARRAY(shadow_isect, INTEGRATOR_SHADOW_ISECT_SIZE)
+
+/**************************** Shadow Volume Stack *****************************/
+
+KERNEL_STRUCT_BEGIN(shadow_volume_stack)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
+KERNEL_STRUCT_END_ARRAY(shadow_volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h
new file mode 100644
index 00000000000..cdf412fe22f
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/kernel_differential.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Ray */
+
+ccl_device_forceinline void integrator_state_write_ray(INTEGRATOR_STATE_ARGS,
+ const Ray *ccl_restrict ray)
+{
+ INTEGRATOR_STATE_WRITE(ray, P) = ray->P;
+ INTEGRATOR_STATE_WRITE(ray, D) = ray->D;
+ INTEGRATOR_STATE_WRITE(ray, t) = ray->t;
+ INTEGRATOR_STATE_WRITE(ray, time) = ray->time;
+ INTEGRATOR_STATE_WRITE(ray, dP) = ray->dP;
+ INTEGRATOR_STATE_WRITE(ray, dD) = ray->dD;
+}
+
+ccl_device_forceinline void integrator_state_read_ray(INTEGRATOR_STATE_CONST_ARGS,
+ Ray *ccl_restrict ray)
+{
+ ray->P = INTEGRATOR_STATE(ray, P);
+ ray->D = INTEGRATOR_STATE(ray, D);
+ ray->t = INTEGRATOR_STATE(ray, t);
+ ray->time = INTEGRATOR_STATE(ray, time);
+ ray->dP = INTEGRATOR_STATE(ray, dP);
+ ray->dD = INTEGRATOR_STATE(ray, dD);
+}
+
+/* Shadow Ray */
+
+ccl_device_forceinline void integrator_state_write_shadow_ray(INTEGRATOR_STATE_ARGS,
+ const Ray *ccl_restrict ray)
+{
+ INTEGRATOR_STATE_WRITE(shadow_ray, P) = ray->P;
+ INTEGRATOR_STATE_WRITE(shadow_ray, D) = ray->D;
+ INTEGRATOR_STATE_WRITE(shadow_ray, t) = ray->t;
+ INTEGRATOR_STATE_WRITE(shadow_ray, time) = ray->time;
+ INTEGRATOR_STATE_WRITE(shadow_ray, dP) = ray->dP;
+}
+
+ccl_device_forceinline void integrator_state_read_shadow_ray(INTEGRATOR_STATE_CONST_ARGS,
+ Ray *ccl_restrict ray)
+{
+ ray->P = INTEGRATOR_STATE(shadow_ray, P);
+ ray->D = INTEGRATOR_STATE(shadow_ray, D);
+ ray->t = INTEGRATOR_STATE(shadow_ray, t);
+ ray->time = INTEGRATOR_STATE(shadow_ray, time);
+ ray->dP = INTEGRATOR_STATE(shadow_ray, dP);
+ ray->dD = differential_zero_compact();
+}
+
+/* Intersection */
+
+ccl_device_forceinline void integrator_state_write_isect(INTEGRATOR_STATE_ARGS,
+ const Intersection *ccl_restrict isect)
+{
+ INTEGRATOR_STATE_WRITE(isect, t) = isect->t;
+ INTEGRATOR_STATE_WRITE(isect, u) = isect->u;
+ INTEGRATOR_STATE_WRITE(isect, v) = isect->v;
+ INTEGRATOR_STATE_WRITE(isect, object) = isect->object;
+ INTEGRATOR_STATE_WRITE(isect, prim) = isect->prim;
+ INTEGRATOR_STATE_WRITE(isect, type) = isect->type;
+#ifdef __EMBREE__
+ INTEGRATOR_STATE_WRITE(isect, Ng) = isect->Ng;
+#endif
+}
+
+ccl_device_forceinline void integrator_state_read_isect(INTEGRATOR_STATE_CONST_ARGS,
+ Intersection *ccl_restrict isect)
+{
+ isect->prim = INTEGRATOR_STATE(isect, prim);
+ isect->object = INTEGRATOR_STATE(isect, object);
+ isect->type = INTEGRATOR_STATE(isect, type);
+ isect->u = INTEGRATOR_STATE(isect, u);
+ isect->v = INTEGRATOR_STATE(isect, v);
+ isect->t = INTEGRATOR_STATE(isect, t);
+#ifdef __EMBREE__
+ isect->Ng = INTEGRATOR_STATE(isect, Ng);
+#endif
+}
+
+ccl_device_forceinline VolumeStack integrator_state_read_volume_stack(INTEGRATOR_STATE_CONST_ARGS,
+ int i)
+{
+ VolumeStack entry = {INTEGRATOR_STATE_ARRAY(volume_stack, i, object),
+ INTEGRATOR_STATE_ARRAY(volume_stack, i, shader)};
+ return entry;
+}
+
+ccl_device_forceinline void integrator_state_write_volume_stack(INTEGRATOR_STATE_ARGS,
+ int i,
+ VolumeStack entry)
+{
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, i, object) = entry.object;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, i, shader) = entry.shader;
+}
+
+ccl_device_forceinline bool integrator_state_volume_stack_is_empty(INTEGRATOR_STATE_CONST_ARGS)
+{
+ return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ?
+ INTEGRATOR_STATE_ARRAY(volume_stack, 0, shader) == SHADER_NONE :
+ true;
+}
+
+/* Shadow Intersection */
+
+ccl_device_forceinline void integrator_state_write_shadow_isect(
+ INTEGRATOR_STATE_ARGS, const Intersection *ccl_restrict isect, const int index)
+{
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, t) = isect->t;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, u) = isect->u;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, v) = isect->v;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, object) = isect->object;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, prim) = isect->prim;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, type) = isect->type;
+#ifdef __EMBREE__
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_isect, index, Ng) = isect->Ng;
+#endif
+}
+
+ccl_device_forceinline void integrator_state_read_shadow_isect(INTEGRATOR_STATE_CONST_ARGS,
+ Intersection *ccl_restrict isect,
+ const int index)
+{
+ isect->prim = INTEGRATOR_STATE_ARRAY(shadow_isect, index, prim);
+ isect->object = INTEGRATOR_STATE_ARRAY(shadow_isect, index, object);
+ isect->type = INTEGRATOR_STATE_ARRAY(shadow_isect, index, type);
+ isect->u = INTEGRATOR_STATE_ARRAY(shadow_isect, index, u);
+ isect->v = INTEGRATOR_STATE_ARRAY(shadow_isect, index, v);
+ isect->t = INTEGRATOR_STATE_ARRAY(shadow_isect, index, t);
+#ifdef __EMBREE__
+ isect->Ng = INTEGRATOR_STATE_ARRAY(shadow_isect, index, Ng);
+#endif
+}
+
+ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_ARGS)
+{
+ if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
+ for (int i = 0; i < INTEGRATOR_VOLUME_STACK_SIZE; i++) {
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = INTEGRATOR_STATE_ARRAY(
+ volume_stack, i, object);
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = INTEGRATOR_STATE_ARRAY(
+ volume_stack, i, shader);
+ }
+ }
+}
+
+ccl_device_forceinline VolumeStack
+integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_CONST_ARGS, int i)
+{
+ VolumeStack entry = {INTEGRATOR_STATE_ARRAY(shadow_volume_stack, i, object),
+ INTEGRATOR_STATE_ARRAY(shadow_volume_stack, i, shader)};
+ return entry;
+}
+
+ccl_device_forceinline bool integrator_state_shadow_volume_stack_is_empty(
+ INTEGRATOR_STATE_CONST_ARGS)
+{
+ return (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) ?
+ INTEGRATOR_STATE_ARRAY(shadow_volume_stack, 0, shader) == SHADER_NONE :
+ true;
+}
+
+ccl_device_forceinline void integrator_state_write_shadow_volume_stack(INTEGRATOR_STATE_ARGS,
+ int i,
+ VolumeStack entry)
+{
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = entry.object;
+ INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = entry.shader;
+}
+
+#if defined(__KERNEL_GPU__)
+ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state,
+ const IntegratorState state)
+{
+ int index;
+
+ /* Rely on the compiler to optimize out unused assignments and `while(false)`'s. */
+
+# define KERNEL_STRUCT_BEGIN(name) \
+ index = 0; \
+ do {
+
+# define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+ if (kernel_integrator_state.parent_struct.name != nullptr) { \
+ kernel_integrator_state.parent_struct.name[to_state] = \
+ kernel_integrator_state.parent_struct.name[state]; \
+ }
+
+# define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+ if (kernel_integrator_state.parent_struct[index].name != nullptr) { \
+ kernel_integrator_state.parent_struct[index].name[to_state] = \
+ kernel_integrator_state.parent_struct[index].name[state]; \
+ }
+
+# define KERNEL_STRUCT_END(name) \
+ } \
+ while (false) \
+ ;
+
+# define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+ ++index; \
+ } \
+ while (index < array_size) \
+ ;
+
+# include "kernel/integrator/integrator_state_template.h"
+
+# undef KERNEL_STRUCT_BEGIN
+# undef KERNEL_STRUCT_MEMBER
+# undef KERNEL_STRUCT_ARRAY_MEMBER
+# undef KERNEL_STRUCT_END
+# undef KERNEL_STRUCT_END_ARRAY
+}
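+
+/* For illustration (a sketch of the expansion): for an array section such as
+ * `shadow_isect` the macros above generate a loop of the form
+ *
+ *   index = 0;
+ *   do {
+ *     if (kernel_integrator_state.shadow_isect[index].t != nullptr) {
+ *       kernel_integrator_state.shadow_isect[index].t[to_state] =
+ *           kernel_integrator_state.shadow_isect[index].t[state];
+ *     }
+ *     ...
+ *     ++index;
+ *   } while (index < INTEGRATOR_SHADOW_ISECT_SIZE);
+ */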
+
+ccl_device_inline void integrator_state_move(const IntegratorState to_state,
+ const IntegratorState state)
+{
+ integrator_state_copy_only(to_state, state);
+
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+}
+
+#endif
+
+/* NOTE: Leaves kernel scheduling information untouched. Use the INIT semantics for one of the
+ * paths after calling this function. */
+ccl_device_inline void integrator_state_shadow_catcher_split(INTEGRATOR_STATE_ARGS)
+{
+#if defined(__KERNEL_GPU__)
+ const IntegratorState to_state = atomic_fetch_and_add_uint32(
+ &kernel_integrator_state.next_shadow_catcher_path_index[0], 1);
+
+ integrator_state_copy_only(to_state, state);
+
+ kernel_integrator_state.path.flag[to_state] |= PATH_RAY_SHADOW_CATCHER_PASS;
+
+ /* Sanity check: expect to split in the intersect-closest kernel, where there is no shadow ray
+ * and no sorting yet. */
+ kernel_assert(INTEGRATOR_STATE(shadow_path, queued_kernel) == 0);
+ kernel_assert(kernel_integrator_state.sort_key_counter[INTEGRATOR_STATE(path, queued_kernel)] ==
+ nullptr);
+#else
+
+ IntegratorStateCPU *ccl_restrict split_state = state + 1;
+
+ *split_state = *state;
+
+ split_state->path.flag |= PATH_RAY_SHADOW_CATCHER_PASS;
+#endif
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_subsurface.h b/intern/cycles/kernel/integrator/integrator_subsurface.h
new file mode 100644
index 00000000000..9490738404e
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_subsurface.h
@@ -0,0 +1,623 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_shader.h"
+
+#include "kernel/bvh/bvh.h"
+
+#include "kernel/closure/alloc.h"
+#include "kernel/closure/bsdf_diffuse.h"
+#include "kernel/closure/bsdf_principled_diffuse.h"
+#include "kernel/closure/bssrdf.h"
+#include "kernel/closure/volume.h"
+
+#include "kernel/integrator/integrator_intersect_volume_stack.h"
+
+CCL_NAMESPACE_BEGIN
+
+#ifdef __SUBSURFACE__
+
+ccl_device int subsurface_bounce(INTEGRATOR_STATE_ARGS, ShaderData *sd, const ShaderClosure *sc)
+{
+ /* We should never have two consecutive BSSRDF bounces; the second one should
+ * be converted to a diffuse BSDF to avoid this. */
+ kernel_assert(!(INTEGRATOR_STATE(path, flag) & PATH_RAY_DIFFUSE_ANCESTOR));
+
+ /* Setup path state for intersect_subsurface kernel. */
+ const Bssrdf *bssrdf = (const Bssrdf *)sc;
+
+ /* Setup ray into surface. */
+ INTEGRATOR_STATE_WRITE(ray, P) = sd->P;
+ INTEGRATOR_STATE_WRITE(ray, D) = sd->N;
+ INTEGRATOR_STATE_WRITE(ray, t) = FLT_MAX;
+ INTEGRATOR_STATE_WRITE(ray, dP) = differential_make_compact(sd->dP);
+ INTEGRATOR_STATE_WRITE(ray, dD) = differential_zero_compact();
+
+ /* Pass along object info, reusing isect to save memory. */
+ INTEGRATOR_STATE_WRITE(isect, Ng) = sd->Ng;
+ INTEGRATOR_STATE_WRITE(isect, object) = sd->object;
+
+ /* Pass BSSRDF parameters. */
+ const uint32_t path_flag = INTEGRATOR_STATE_WRITE(path, flag);
+ INTEGRATOR_STATE_WRITE(path, flag) = (path_flag & ~PATH_RAY_CAMERA) | PATH_RAY_SUBSURFACE;
+ INTEGRATOR_STATE_WRITE(path, throughput) *= shader_bssrdf_sample_weight(sd, sc);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ if (INTEGRATOR_STATE(path, bounce) == 0) {
+ INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3();
+ }
+ }
+
+ INTEGRATOR_STATE_WRITE(subsurface, albedo) = bssrdf->albedo;
+ INTEGRATOR_STATE_WRITE(subsurface, radius) = bssrdf->radius;
+ INTEGRATOR_STATE_WRITE(subsurface, roughness) = bssrdf->roughness;
+ INTEGRATOR_STATE_WRITE(subsurface, anisotropy) = bssrdf->anisotropy;
+
+ return LABEL_SUBSURFACE_SCATTER;
+}
+
+ccl_device void subsurface_shader_data_setup(INTEGRATOR_STATE_ARGS, ShaderData *sd)
+{
+ /* Get bump mapped normal from shader evaluation at exit point. */
+ float3 N = sd->N;
+ if (sd->flag & SD_HAS_BSSRDF_BUMP) {
+ N = shader_bssrdf_normal(sd);
+ }
+
+ /* Setup diffuse BSDF at the exit point. This replaces shader_eval_surface. */
+ sd->flag &= ~SD_CLOSURE_FLAGS;
+ sd->num_closure = 0;
+ sd->num_closure_left = kernel_data.max_closures;
+
+ const float3 weight = one_float3();
+ const float roughness = INTEGRATOR_STATE(subsurface, roughness);
+
+# ifdef __PRINCIPLED__
+ if (roughness != FLT_MAX) {
+ PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf *)bsdf_alloc(
+ sd, sizeof(PrincipledDiffuseBsdf), weight);
+
+ if (bsdf) {
+ bsdf->N = N;
+ bsdf->roughness = roughness;
+ sd->flag |= bsdf_principled_diffuse_setup(bsdf);
+
+ /* Replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes
+ * can recognize it as not being a regular Disney principled diffuse closure. */
+ bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
+ }
+ }
+ else
+# endif /* __PRINCIPLED__ */
+ {
+ DiffuseBsdf *bsdf = (DiffuseBsdf *)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
+
+ if (bsdf) {
+ bsdf->N = N;
+ sd->flag |= bsdf_diffuse_setup(bsdf);
+
+ /* Replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
+ * can recognize it as not being a regular diffuse closure. */
+ bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
+ }
+ }
+}
+
+/* Random walk subsurface scattering.
+ *
+ * "Practical and Controllable Subsurface Scattering for Production Path
+ * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
+
+/* Support for anisotropy from:
+ * "Path Traced Subsurface Scattering using Anisotropic Phase Functions
+ * and Non-Exponential Free Flights".
+ * Magnus Wrenninge, Ryusuke Villemin, Christophe Hery.
+ * https://graphics.pixar.com/library/PathTracedSubsurface/ */
+
+ccl_device void subsurface_random_walk_remap(
+ const float albedo, const float d, float g, float *sigma_t, float *alpha)
+{
+ /* Compute attenuation and scattering coefficients from albedo. */
+ const float g2 = g * g;
+ const float g3 = g2 * g;
+ const float g4 = g3 * g;
+ const float g5 = g4 * g;
+ const float g6 = g5 * g;
+ const float g7 = g6 * g;
+
+ const float A = 1.8260523782f + -1.28451056436f * g + -1.79904629312f * g2 +
+ 9.19393289202f * g3 + -22.8215585862f * g4 + 32.0234874259f * g5 +
+ -23.6264803333f * g6 + 7.21067002658f * g7;
+ const float B = 4.98511194385f +
+ 0.127355959438f *
+ expf(31.1491581433f * g + -201.847017512f * g2 + 841.576016723f * g3 +
+ -2018.09288505f * g4 + 2731.71560286f * g5 + -1935.41424244f * g6 +
+ 559.009054474f * g7);
+ const float C = 1.09686102424f + -0.394704063468f * g + 1.05258115941f * g2 +
+ -8.83963712726f * g3 + 28.8643230661f * g4 + -46.8802913581f * g5 +
+ 38.5402837518f * g6 + -12.7181042538f * g7;
+ const float D = 0.496310210422f + 0.360146581622f * g + -2.15139309747f * g2 +
+ 17.8896899217f * g3 + -55.2984010333f * g4 + 82.065982243f * g5 +
+ -58.5106008578f * g6 + 15.8478295021f * g7;
+ const float E = 4.23190299701f +
+ 0.00310603949088f *
+ expf(76.7316253952f * g + -594.356773233f * g2 + 2448.8834203f * g3 +
+ -5576.68528998f * g4 + 7116.60171912f * g5 + -4763.54467887f * g6 +
+ 1303.5318055f * g7);
+ const float F = 2.40602999408f + -2.51814844609f * g + 9.18494908356f * g2 +
+ -79.2191708682f * g3 + 259.082868209f * g4 + -403.613804597f * g5 +
+ 302.85712436f * g6 + -87.4370473567f * g7;
+
+ const float blend = powf(albedo, 0.25f);
+
+ *alpha = (1.0f - blend) * A * powf(atanf(B * albedo), C) +
+ blend * D * powf(atanf(E * albedo), F);
+ *alpha = clamp(*alpha, 0.0f, 0.999999f); // because of numerical precision
+
+ float sigma_t_prime = 1.0f / fmaxf(d, 1e-16f);
+ *sigma_t = sigma_t_prime / (1.0f - g);
+}
+
+ccl_device void subsurface_random_walk_coefficients(const float3 albedo,
+ const float3 radius,
+ const float anisotropy,
+ float3 *sigma_t,
+ float3 *alpha,
+ float3 *throughput)
+{
+ float sigma_t_x, sigma_t_y, sigma_t_z;
+ float alpha_x, alpha_y, alpha_z;
+
+ subsurface_random_walk_remap(albedo.x, radius.x, anisotropy, &sigma_t_x, &alpha_x);
+ subsurface_random_walk_remap(albedo.y, radius.y, anisotropy, &sigma_t_y, &alpha_y);
+ subsurface_random_walk_remap(albedo.z, radius.z, anisotropy, &sigma_t_z, &alpha_z);
+
+ /* Throughput already contains closure weight at this point, which includes the
+ * albedo, as well as closure mixing and Fresnel weights. Divide out the albedo
+ * which will be added through scattering. */
+ *throughput = safe_divide_color(*throughput, albedo);
+
+ /* With low albedo values (like 0.025) we get diffusion_length 1.0 and
+ * infinite phase functions. To avoid a sharp discontinuity as we go from
+ * such values to 0.0, increase alpha and reduce the throughput to compensate. */
+ const float min_alpha = 0.2f;
+ if (alpha_x < min_alpha) {
+ (*throughput).x *= alpha_x / min_alpha;
+ alpha_x = min_alpha;
+ }
+ if (alpha_y < min_alpha) {
+ (*throughput).y *= alpha_y / min_alpha;
+ alpha_y = min_alpha;
+ }
+ if (alpha_z < min_alpha) {
+ (*throughput).z *= alpha_z / min_alpha;
+ alpha_z = min_alpha;
+ }
+
+ *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z);
+ *alpha = make_float3(alpha_x, alpha_y, alpha_z);
+}
+
+/* References for Dwivedi sampling:
+ *
+ * [1] "A Zero-variance-based Sampling Scheme for Monte Carlo Subsurface Scattering"
+ * by Jaroslav Křivánek and Eugene d'Eon (SIGGRAPH 2014)
+ * https://cgg.mff.cuni.cz/~jaroslav/papers/2014-zerovar/
+ *
+ * [2] "Improving the Dwivedi Sampling Scheme"
+ * by Johannes Meng, Johannes Hanika, and Carsten Dachsbacher (EGSR 2016)
+ * https://cg.ivd.kit.edu/1951.php
+ *
+ * [3] "Zero-Variance Theory for Efficient Subsurface Scattering"
+ * by Eugene d'Eon and Jaroslav Křivánek (SIGGRAPH 2020)
+ * https://iliyan.com/publications/RenderingCourse2020
+ */
+
+ccl_device_forceinline float eval_phase_dwivedi(float v, float phase_log, float cos_theta)
+{
+ /* Eq. 9 from [2] using precomputed log((v + 1) / (v - 1)) */
+ return 1.0f / ((v - cos_theta) * phase_log);
+}
+
+ccl_device_forceinline float sample_phase_dwivedi(float v, float phase_log, float rand)
+{
+ /* Based on Eq. 10 from [2]: `v - (v + 1) * pow((v - 1) / (v + 1), rand)`
+ * Since we're already pre-computing `phase_log = log((v + 1) / (v - 1))` for the evaluation,
+ * we can implement the power function like this. */
+ return v - (v + 1.0f) * expf(-rand * phase_log);
+}
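+
+/* For reference, the rewrite above follows from:
+ * pow((v - 1) / (v + 1), rand) = exp(rand * log((v - 1) / (v + 1)))
+ *                              = exp(-rand * phase_log). */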
+
+ccl_device_forceinline float diffusion_length_dwivedi(float alpha)
+{
+ /* Eq. 67 from [3] */
+ return 1.0f / sqrtf(1.0f - powf(alpha, 2.44294f - 0.0215813f * alpha + 0.578637f / alpha));
+}
+
+ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, float randv)
+{
+ float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
+ float phi = M_2PI_F * randv;
+ float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta);
+
+ float3 T, B;
+ make_orthonormals(D, &T, &B);
+ return dir.x * T + dir.y * B + dir.z * D;
+}
+
+ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t,
+ float t,
+ bool hit,
+ float3 *transmittance)
+{
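+ /* Distance sampling with transmittance T = exp(-sigma_t * t): if the walk ended
+ * on the surface (hit), the probability of sampling a distance beyond it is T
+ * itself; otherwise the PDF of scattering at distance t is sigma_t * T. */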
+ float3 T = volume_color_transmittance(sigma_t, t);
+ if (transmittance) {
+ *transmittance = T;
+ }
+ return hit ? T : sigma_t * T;
+}
+
+/* Define the variable below to activate the similarity code;
+ * its value is the cutoff bounce level. */
+# define SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL 9
+
+ccl_device_inline bool subsurface_random_walk(INTEGRATOR_STATE_ARGS,
+ RNGState rng_state,
+ Ray &ray,
+ LocalIntersection &ss_isect)
+{
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
+
+ const float3 P = INTEGRATOR_STATE(ray, P);
+ const float3 N = INTEGRATOR_STATE(ray, D);
+ const float ray_dP = INTEGRATOR_STATE(ray, dP);
+ const float time = INTEGRATOR_STATE(ray, time);
+ const float3 Ng = INTEGRATOR_STATE(isect, Ng);
+ const int object = INTEGRATOR_STATE(isect, object);
+
+ /* Sample diffuse surface scatter into the object. */
+ float3 D;
+ float pdf;
+ sample_cos_hemisphere(-N, bssrdf_u, bssrdf_v, &D, &pdf);
+ if (dot(-Ng, D) <= 0.0f) {
+ return false;
+ }
+
+ /* Setup ray. */
+ ray.P = ray_offset(P, -Ng);
+ ray.D = D;
+ ray.t = FLT_MAX;
+ ray.time = time;
+ ray.dP = ray_dP;
+ ray.dD = differential_zero_compact();
+
+# ifndef __KERNEL_OPTIX__
+ /* Compute or fetch object transforms. */
+ Transform ob_itfm ccl_optional_struct_init;
+ Transform ob_tfm = object_fetch_transform_motion_test(kg, object, time, &ob_itfm);
+# endif
+
+ /* Convert subsurface to volume coefficients.
+ * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. */
+ const float3 albedo = INTEGRATOR_STATE(subsurface, albedo);
+ const float3 radius = INTEGRATOR_STATE(subsurface, radius);
+ const float anisotropy = INTEGRATOR_STATE(subsurface, anisotropy);
+
+ float3 sigma_t, alpha;
+ float3 throughput = INTEGRATOR_STATE_WRITE(path, throughput);
+ subsurface_random_walk_coefficients(albedo, radius, anisotropy, &sigma_t, &alpha, &throughput);
+ float3 sigma_s = sigma_t * alpha;
+
+ /* Theoretically it should be better to use the exact alpha for the channel we're sampling at
+ * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange
+ * for making the code significantly more complex and slower (if direction sampling depends on
+ * the sampled channel, we need to compute its PDF per-channel and consider it for MIS later on).
+ *
+ * Since the strength of the guided sampling increases as alpha gets lower, using a value that
+ * is too low results in fireflies while one that's too high just gives a bit more noise.
+ * Therefore, the code here uses the highest of the three albedos to be safe. */
+ const float diffusion_length = diffusion_length_dwivedi(max3(alpha));
+
+ if (diffusion_length == 1.0f) {
+ /* With specific values of alpha the length might become 1, which in the asymptotic
+ * case makes the phase function infinite. After the first bounce this would make the
+ * throughput 0, so return early, avoiding numerical issues and extra unneeded work. */
+ return false;
+ }
+
+ /* Precompute term for phase sampling. */
+ const float phase_log = logf((diffusion_length + 1.0f) / (diffusion_length - 1.0f));
+
+ /* Modify state for RNGs, decorrelated from other paths. */
+ rng_state.rng_hash = cmj_hash(rng_state.rng_hash + rng_state.rng_offset, 0xdeadbeef);
+
+ /* Random walk until we hit the surface again. */
+ bool hit = false;
+ bool have_opposite_interface = false;
+ float opposite_distance = 0.0f;
+
+ /* TODO: Disable for alpha > 0.999 or so? */
+ /* Our heuristic, a compromise between guiding and classic. */
+ const float guided_fraction = 1.0f - fmaxf(0.5f, powf(fabsf(anisotropy), 0.125f));
+
+# ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL
+ float3 sigma_s_star = sigma_s * (1.0f - anisotropy);
+ float3 sigma_t_star = sigma_t - sigma_s + sigma_s_star;
+ float3 sigma_t_org = sigma_t;
+ float3 sigma_s_org = sigma_s;
+ const float anisotropy_org = anisotropy;
+ const float guided_fraction_org = guided_fraction;
+# endif
+
+ for (int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) {
+ /* Advance random number offset. */
+ rng_state.rng_offset += PRNG_BOUNCE_NUM;
+
+# ifdef SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL
+ /* Shadow the outer variables with local ones chosen according to depth. */
+ float anisotropy, guided_fraction;
+ float3 sigma_s, sigma_t;
+ if (bounce <= SUBSURFACE_RANDOM_WALK_SIMILARITY_LEVEL) {
+ anisotropy = anisotropy_org;
+ guided_fraction = guided_fraction_org;
+ sigma_t = sigma_t_org;
+ sigma_s = sigma_s_org;
+ }
+ else {
+ anisotropy = 0.0f;
+ guided_fraction = 0.75f; // back to isotropic heuristic from Blender
+ sigma_t = sigma_t_star;
+ sigma_s = sigma_s_star;
+ }
+# endif
+
+ /* Sample color channel, use MIS with balance heuristic. */
+ float rphase = path_state_rng_1D(kg, &rng_state, PRNG_PHASE_CHANNEL);
+ float3 channel_pdf;
+ int channel = volume_sample_channel(alpha, throughput, rphase, &channel_pdf);
+ float sample_sigma_t = volume_channel_get(sigma_t, channel);
+ float randt = path_state_rng_1D(kg, &rng_state, PRNG_SCATTER_DISTANCE);
+
+ /* We need the result of the raycast to compute the full guided PDF, so just remember the
+ * relevant terms to avoid recomputing them later. */
+ float backward_fraction = 0.0f;
+ float forward_pdf_factor = 0.0f;
+ float forward_stretching = 1.0f;
+ float backward_pdf_factor = 0.0f;
+ float backward_stretching = 1.0f;
+
+ /* For the initial ray, we already know the direction, so just do classic distance sampling. */
+ if (bounce > 0) {
+ /* Decide whether we should use guided or classic sampling. */
+ bool guided = (path_state_rng_1D(kg, &rng_state, PRNG_LIGHT_TERMINATE) < guided_fraction);
+
+ /* Determine if we want to sample away from the incoming interface.
+ * This only happens if we found a nearby opposite interface, and the probability for it
+ * depends on how close we are to it already.
+ * This probability term comes from the recorded presentation of [3]. */
+ bool guide_backward = false;
+ if (have_opposite_interface) {
+ /* Compute distance of the random walk between the tangent plane at the starting point
+ * and the assumed opposite interface (the parallel plane that contains the point we
+ * found in our ray query for the opposite side). */
+ float x = clamp(dot(ray.P - P, -N), 0.0f, opposite_distance);
+ backward_fraction = 1.0f /
+ (1.0f + expf((opposite_distance - 2.0f * x) / diffusion_length));
+ guide_backward = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE) < backward_fraction;
+ }
+
+ /* Sample scattering direction. */
+ float scatter_u, scatter_v;
+ path_state_rng_2D(kg, &rng_state, PRNG_BSDF_U, &scatter_u, &scatter_v);
+ float cos_theta;
+ float hg_pdf;
+ if (guided) {
+ cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u);
+ /* The backwards guiding distribution is just mirrored along sd->N, so swapping the
+ * sign here is enough to sample from that instead. */
+ if (guide_backward) {
+ cos_theta = -cos_theta;
+ }
+ float3 newD = direction_from_cosine(N, cos_theta, scatter_v);
+ hg_pdf = single_peaked_henyey_greenstein(dot(ray.D, newD), anisotropy);
+ ray.D = newD;
+ }
+ else {
+ float3 newD = henyey_greenstrein_sample(ray.D, anisotropy, scatter_u, scatter_v, &hg_pdf);
+ cos_theta = dot(newD, N);
+ ray.D = newD;
+ }
+
+ /* Compute PDF factor caused by phase sampling (as the ratio of guided / classic).
+ * Since phase sampling is channel-independent, we can get away with applying a factor
+ * to the guided PDF, which implicitly means pulling out the classic PDF term and letting
+ * it cancel with an equivalent term in the numerator of the full estimator.
+ * For the backward PDF, we again reuse the same probability distribution with a sign swap.
+ */
+ forward_pdf_factor = M_1_2PI_F * eval_phase_dwivedi(diffusion_length, phase_log, cos_theta) /
+ hg_pdf;
+ backward_pdf_factor = M_1_2PI_F *
+ eval_phase_dwivedi(diffusion_length, phase_log, -cos_theta) / hg_pdf;
+
+ /* Prepare distance sampling.
+ * For the backwards case, this also needs the sign swapped since now directions against
+ * sd->N (and therefore with negative cos_theta) are preferred. */
+ forward_stretching = (1.0f - cos_theta / diffusion_length);
+ backward_stretching = (1.0f + cos_theta / diffusion_length);
+ if (guided) {
+ sample_sigma_t *= guide_backward ? backward_stretching : forward_stretching;
+ }
+ }
+
+ /* Sample direction along ray. */
+ float t = -logf(1.0f - randt) / sample_sigma_t;
+
+ /* On the first bounce, we use the raycast to check if the opposite side is nearby.
+ * If yes, we will later use backwards guided sampling in order to have a decent
+ * chance of connecting to it.
+ * TODO: Maybe use less than 10 times the mean free path? */
+ ray.t = (bounce == 0) ? max(t, 10.0f / (min3(sigma_t))) : t;
+ scene_intersect_local(kg, &ray, &ss_isect, object, NULL, 1);
+ hit = (ss_isect.num_hits > 0);
+
+ if (hit) {
+# ifdef __KERNEL_OPTIX__
+ /* t is always in world space with OptiX. */
+ ray.t = ss_isect.hits[0].t;
+# else
+ /* Compute world space distance to surface hit. */
+ float3 D = transform_direction(&ob_itfm, ray.D);
+ D = normalize(D) * ss_isect.hits[0].t;
+ ray.t = len(transform_direction(&ob_tfm, D));
+# endif
+ }
+
+ if (bounce == 0) {
+ /* Check if we hit the opposite side. */
+ if (hit) {
+ have_opposite_interface = true;
+ opposite_distance = dot(ray.P + ray.t * ray.D - P, -N);
+ }
+ /* Apart from the opposite side check, we were supposed to only trace up to distance t,
+ * so check if there would have been a hit in that case. */
+ hit = ray.t < t;
+ }
+
+ /* Use the distance to the exit point for the throughput update if we found one. */
+ if (hit) {
+ t = ray.t;
+ }
+ else if (bounce == 0) {
+ /* Restore original position if nothing was hit after the first bounce,
+ * without the ray_offset() that was added to avoid self-intersection.
+ * Otherwise if that offset is relatively large compared to the scattering
+ * radius, we never go back up high enough to exit the surface. */
+ ray.P = P;
+ }
+
+ /* Advance to new scatter location. */
+ ray.P += t * ray.D;
+
+ float3 transmittance;
+ float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance);
+ if (bounce > 0) {
+ /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */
+ float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL);
+
+ if (have_opposite_interface) {
+ /* First step of MIS: Depending on geometry we might have two methods for guided
+ * sampling, so perform MIS between them. */
+ float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL);
+ guided_pdf = mix(
+ guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction);
+ }
+ else {
+ /* Just include phase sampling factor otherwise. */
+ guided_pdf *= forward_pdf_factor;
+ }
+
+ /* Now we apply the MIS balance heuristic between the classic and guided sampling. */
+ pdf = mix(pdf, guided_pdf, guided_fraction);
+ }
+
+ /* Finally, we're applying MIS again to combine the three color channels.
+ * Altogether, the MIS computation combines up to nine different estimators:
+ * {classic, guided, backward_guided} x {r, g, b} */
+ throughput *= (hit ? transmittance : sigma_s * transmittance) / dot(channel_pdf, pdf);
+
+ if (hit) {
+ /* If we hit the surface, we are done. */
+ break;
+ }
+ else if (throughput.x < VOLUME_THROUGHPUT_EPSILON &&
+ throughput.y < VOLUME_THROUGHPUT_EPSILON &&
+ throughput.z < VOLUME_THROUGHPUT_EPSILON) {
+ /* Avoid unnecessary work and precision issues when the throughput gets really small. */
+ break;
+ }
+ }
+
+ if (hit) {
+ kernel_assert(isfinite3_safe(throughput));
+ INTEGRATOR_STATE_WRITE(path, throughput) = throughput;
+ }
+
+ return hit;
+}
+
+ccl_device_inline bool subsurface_scatter(INTEGRATOR_STATE_ARGS)
+{
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
+ Ray ray ccl_optional_struct_init;
+ LocalIntersection ss_isect ccl_optional_struct_init;
+
+ if (!subsurface_random_walk(INTEGRATOR_STATE_PASS, rng_state, ray, ss_isect)) {
+ return false;
+ }
+
+# ifdef __VOLUME__
+ /* Update volume stack if needed. */
+ if (kernel_data.integrator.use_volumes) {
+ const int object = intersection_get_object(kg, &ss_isect.hits[0]);
+ const int object_flag = kernel_tex_fetch(__object_flag, object);
+
+ if (object_flag & SD_OBJECT_INTERSECTS_VOLUME) {
+ float3 P = INTEGRATOR_STATE(ray, P);
+ const float3 Ng = INTEGRATOR_STATE(isect, Ng);
+ const float3 offset_P = ray_offset(P, -Ng);
+
+ integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_PASS, offset_P, ray.P);
+ }
+ }
+# endif /* __VOLUME__ */
+
+ /* Pretend ray is coming from the outside towards the exit point. This ensures
+ * correct front/back facing normals.
+ * TODO: find a more elegant solution? */
+ ray.P += ray.D * ray.t * 2.0f;
+ ray.D = -ray.D;
+
+ integrator_state_write_isect(INTEGRATOR_STATE_PASS, &ss_isect.hits[0]);
+ integrator_state_write_ray(INTEGRATOR_STATE_PASS, &ray);
+
+ /* Advance random number offset for bounce. */
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
+
+ const int shader = intersection_get_shader(kg, &ss_isect.hits[0]);
+ const int shader_flags = kernel_tex_fetch(__shaders, shader).flags;
+ if ((shader_flags & SD_HAS_RAYTRACE) || (kernel_data.film.pass_ao != PASS_UNUSED)) {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
+ shader);
+ }
+ else {
+ INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
+ shader);
+ }
+
+ return true;
+}
+
+#endif /* __SUBSURFACE__ */
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/integrator/integrator_volume_stack.h b/intern/cycles/kernel/integrator/integrator_volume_stack.h
new file mode 100644
index 00000000000..d53070095f0
--- /dev/null
+++ b/intern/cycles/kernel/integrator/integrator_volume_stack.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Volume Stack
+ *
+ * This is an array of object/shader IDs that the current segment of the path
+ * is inside of. */
+
+template<typename StackReadOp, typename StackWriteOp>
+ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS,
+ const ShaderData *sd,
+ StackReadOp stack_read,
+ StackWriteOp stack_write)
+{
+ /* TODO: we should have some way for objects to indicate if they want the
+ * world shader to work inside them. Excluding it by default is problematic
+ * because non-volume objects can't be assumed to be closed manifolds. */
+ if (!(sd->flag & SD_HAS_VOLUME)) {
+ return;
+ }
+
+ if (sd->flag & SD_BACKFACING) {
+ /* Exit volume object: remove from stack. */
+ for (int i = 0;; i++) {
+ VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ if (entry.object == sd->object) {
+ /* Shift back next stack entries. */
+ do {
+ entry = stack_read(i + 1);
+ stack_write(i, entry);
+ i++;
+ } while (entry.shader != SHADER_NONE);
+
+ return;
+ }
+ }
+ }
+ else {
+ /* Enter volume object: add to stack. */
+ int i;
+ for (i = 0;; i++) {
+ VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+      /* Already in the stack? Then we have nothing to do. */
+ if (entry.object == sd->object) {
+ return;
+ }
+ }
+
+ /* If we exceed the stack limit, ignore. */
+ if (i >= VOLUME_STACK_SIZE - 1) {
+ return;
+ }
+
+ /* Add to the end of the stack. */
+ const VolumeStack new_entry = {sd->object, sd->shader};
+ const VolumeStack empty_entry = {OBJECT_NONE, SHADER_NONE};
+ stack_write(i, new_entry);
+ stack_write(i + 1, empty_entry);
+ }
+}
+
+ccl_device void volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, const ShaderData *sd)
+{
+ volume_stack_enter_exit(
+ INTEGRATOR_STATE_PASS,
+ sd,
+ [=](const int i) { return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); },
+ [=](const int i, const VolumeStack entry) {
+ integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, i, entry);
+ });
+}
+
+ccl_device void shadow_volume_stack_enter_exit(INTEGRATOR_STATE_ARGS, const ShaderData *sd)
+{
+ volume_stack_enter_exit(
+ INTEGRATOR_STATE_PASS,
+ sd,
+ [=](const int i) {
+ return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
+ },
+ [=](const int i, const VolumeStack entry) {
+ integrator_state_write_shadow_volume_stack(INTEGRATOR_STATE_PASS, i, entry);
+ });
+}
+
+/* Clean stack after the last bounce.
+ *
+ * It is expected that all volumes are closed manifolds, so at the time when a
+ * ray hits nothing (for example, on the last bounce, which goes to the
+ * environment) the only volume expected in the stack is the world's one. All
+ * other volume entries should have been exited already.
+ *
+ * This isn't always true because of ray intersection precision issues, which
+ * can leave a stray non-world volume in the stack indefinitely, causing render
+ * artifacts.
+ *
+ * Use this function after the last bounce to get rid of all volumes apart from
+ * the world's one.
+ */
+ccl_device_inline void volume_stack_clean(INTEGRATOR_STATE_ARGS)
+{
+ if (kernel_data.background.volume_shader != SHADER_NONE) {
+ /* Keep the world's volume in stack. */
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, shader) = SHADER_NONE;
+ }
+ else {
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, shader) = SHADER_NONE;
+ }
+}
+
+template<typename StackReadOp>
+ccl_device float volume_stack_step_size(INTEGRATOR_STATE_ARGS, StackReadOp stack_read)
+{
+ float step_size = FLT_MAX;
+
+ for (int i = 0;; i++) {
+ VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags;
+
+ bool heterogeneous = false;
+
+ if (shader_flag & SD_HETEROGENEOUS_VOLUME) {
+ heterogeneous = true;
+ }
+ else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) {
+      /* We want to render the world or objects without any volume grids
+       * as homogeneous, but can only verify this at run-time, since other
+       * heterogeneous volume objects may be using the same shader. */
+ int object = entry.object;
+ if (object != OBJECT_NONE) {
+ int object_flag = kernel_tex_fetch(__object_flag, object);
+ if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) {
+ heterogeneous = true;
+ }
+ }
+ }
+
+ if (heterogeneous) {
+ float object_step_size = object_volume_step_size(kg, entry.object);
+ object_step_size *= kernel_data.integrator.volume_step_rate;
+ step_size = fminf(object_step_size, step_size);
+ }
+ }
+
+ return step_size;
+}
+
+typedef enum VolumeSampleMethod {
+ VOLUME_SAMPLE_NONE = 0,
+ VOLUME_SAMPLE_DISTANCE = (1 << 0),
+ VOLUME_SAMPLE_EQUIANGULAR = (1 << 1),
+ VOLUME_SAMPLE_MIS = (VOLUME_SAMPLE_DISTANCE | VOLUME_SAMPLE_EQUIANGULAR),
+} VolumeSampleMethod;
+
+ccl_device VolumeSampleMethod volume_stack_sample_method(INTEGRATOR_STATE_ARGS)
+{
+ VolumeSampleMethod method = VOLUME_SAMPLE_NONE;
+
+ for (int i = 0;; i++) {
+ VolumeStack entry = integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
+ int shader_flag = kernel_tex_fetch(__shaders, (entry.shader & SHADER_MASK)).flags;
+
+ if (shader_flag & SD_VOLUME_MIS) {
+ /* Multiple importance sampling. */
+ return VOLUME_SAMPLE_MIS;
+ }
+ else if (shader_flag & SD_VOLUME_EQUIANGULAR) {
+ /* Distance + equiangular sampling -> multiple importance sampling. */
+ if (method == VOLUME_SAMPLE_DISTANCE) {
+ return VOLUME_SAMPLE_MIS;
+ }
+
+ /* Only equiangular sampling. */
+ method = VOLUME_SAMPLE_EQUIANGULAR;
+ }
+ else {
+ /* Distance + equiangular sampling -> multiple importance sampling. */
+ if (method == VOLUME_SAMPLE_EQUIANGULAR) {
+ return VOLUME_SAMPLE_MIS;
+ }
+
+ /* Distance sampling only. */
+ method = VOLUME_SAMPLE_DISTANCE;
+ }
+ }
+
+ return method;
+}
+
+CCL_NAMESPACE_END
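
The enter/exit discipline above can be mirrored host-side as a sanity check. Here is a minimal sketch under the same sentinel convention (a fixed-size array terminated by a SHADER_NONE-style entry); Entry, stack_enter and stack_exit are hypothetical stand-ins, not kernel API:

enum { OBJ_NONE = -1, SHD_NONE = -1, STACK_SIZE = 8 };

struct Entry {
  int object, shader;
};

static void stack_enter(Entry st[], int object, int shader)
{
  int i = 0;
  for (; st[i].shader != SHD_NONE; i++) {
    if (st[i].object == object) {
      return; /* Already inside this volume, nothing to do. */
    }
  }
  if (i >= STACK_SIZE - 1) {
    return; /* Stack full: ignore, matching the kernel behaviour. */
  }
  st[i] = {object, shader};
  st[i + 1] = {OBJ_NONE, SHD_NONE}; /* Keep the sentinel behind the last entry. */
}

static void stack_exit(Entry st[], int object)
{
  for (int i = 0; st[i].shader != SHD_NONE; i++) {
    if (st[i].object == object) {
      /* Shift the remaining entries, including the sentinel, back by one. */
      do {
        st[i] = st[i + 1];
        i++;
      } while (st[i - 1].shader != SHD_NONE);
      return;
    }
  }
}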
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 61653d328f1..9e12d24dcf4 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -14,751 +14,501 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel_adaptive_sampling.h"
+#include "kernel_random.h"
+#include "kernel_shadow_catcher.h"
+#include "kernel_write_passes.h"
+
CCL_NAMESPACE_BEGIN
-/* BSDF Eval
+/* --------------------------------------------------------------------
+ * BSDF Evaluation
*
- * BSDF evaluation result, split per BSDF type. This is used to accumulate
- * render passes separately. */
-
-ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd);
+ * BSDF evaluation result, split between diffuse and glossy. This is used to
+ * accumulate render passes separately. Note that reflection, transmission
+ * and volume scattering are written to different render passes, but we assume
+ * that only one of those can happen at a bounce, and so do not need to accumulate
+ * them separately. */
-ccl_device_inline void bsdf_eval_init(BsdfEval *eval,
- ClosureType type,
- float3 value,
- int use_light_pass)
+ccl_device_inline void bsdf_eval_init(BsdfEval *eval, const bool is_diffuse, float3 value)
{
-#ifdef __PASSES__
- eval->use_light_pass = use_light_pass;
-
- if (eval->use_light_pass) {
- eval->diffuse = zero_float3();
- eval->glossy = zero_float3();
- eval->transmission = zero_float3();
- eval->transparent = zero_float3();
- eval->volume = zero_float3();
-
- if (type == CLOSURE_BSDF_TRANSPARENT_ID)
- eval->transparent = value;
- else if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type))
- eval->diffuse = value;
- else if (CLOSURE_IS_BSDF_GLOSSY(type))
- eval->glossy = value;
- else if (CLOSURE_IS_BSDF_TRANSMISSION(type))
- eval->transmission = value;
- else if (CLOSURE_IS_PHASE(type))
- eval->volume = value;
- }
- else
-#endif
- {
+ eval->diffuse = zero_float3();
+ eval->glossy = zero_float3();
+
+ if (is_diffuse) {
eval->diffuse = value;
}
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis = zero_float3();
-#endif
+ else {
+ eval->glossy = value;
+ }
}
ccl_device_inline void bsdf_eval_accum(BsdfEval *eval,
- ClosureType type,
+ const bool is_diffuse,
float3 value,
float mis_weight)
{
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis += value;
-#endif
value *= mis_weight;
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type))
- eval->diffuse += value;
- else if (CLOSURE_IS_BSDF_GLOSSY(type))
- eval->glossy += value;
- else if (CLOSURE_IS_BSDF_TRANSMISSION(type))
- eval->transmission += value;
- else if (CLOSURE_IS_PHASE(type))
- eval->volume += value;
-
- /* skipping transparent, this function is used by for eval(), will be zero then */
- }
- else
-#endif
- {
- eval->diffuse += value;
- }
-}
-ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
-{
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- return is_zero(eval->diffuse) && is_zero(eval->glossy) && is_zero(eval->transmission) &&
- is_zero(eval->transparent) && is_zero(eval->volume);
+ if (is_diffuse) {
+ eval->diffuse += value;
}
- else
-#endif
- {
- return is_zero(eval->diffuse);
+ else {
+ eval->glossy += value;
}
}
-ccl_device_inline void bsdf_eval_mis(BsdfEval *eval, float value)
+ccl_device_inline bool bsdf_eval_is_zero(BsdfEval *eval)
{
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- eval->diffuse *= value;
- eval->glossy *= value;
- eval->transmission *= value;
- eval->volume *= value;
-
- /* skipping transparent, this function is used by for eval(), will be zero then */
- }
- else
-#endif
- {
- eval->diffuse *= value;
- }
+ return is_zero(eval->diffuse) && is_zero(eval->glossy);
}
ccl_device_inline void bsdf_eval_mul(BsdfEval *eval, float value)
{
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis *= value;
-#endif
- bsdf_eval_mis(eval, value);
+ eval->diffuse *= value;
+ eval->glossy *= value;
}
ccl_device_inline void bsdf_eval_mul3(BsdfEval *eval, float3 value)
{
-#ifdef __SHADOW_TRICKS__
- eval->sum_no_mis *= value;
-#endif
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- eval->diffuse *= value;
- eval->glossy *= value;
- eval->transmission *= value;
- eval->volume *= value;
-
- /* skipping transparent, this function is used by for eval(), will be zero then */
- }
- else
- eval->diffuse *= value;
-#else
eval->diffuse *= value;
-#endif
+ eval->glossy *= value;
}
ccl_device_inline float3 bsdf_eval_sum(const BsdfEval *eval)
{
-#ifdef __PASSES__
- if (eval->use_light_pass) {
- return eval->diffuse + eval->glossy + eval->transmission + eval->volume;
- }
- else
-#endif
- return eval->diffuse;
+ return eval->diffuse + eval->glossy;
}
-/* Path Radiance
- *
- * We accumulate different render passes separately. After summing at the end
- * to get the combined result, it should be identical. We definite directly
- * visible as the first non-transparent hit, while indirectly visible are the
- * bounces after that. */
-
-ccl_device_inline void path_radiance_init(KernelGlobals *kg, PathRadiance *L)
+ccl_device_inline float3 bsdf_eval_diffuse_glossy_ratio(const BsdfEval *eval)
{
- /* clear all */
-#ifdef __PASSES__
- L->use_light_pass = kernel_data.film.use_light_pass;
-
- if (kernel_data.film.use_light_pass) {
- L->indirect = zero_float3();
- L->direct_emission = zero_float3();
-
- L->color_diffuse = zero_float3();
- L->color_glossy = zero_float3();
- L->color_transmission = zero_float3();
-
- L->direct_diffuse = zero_float3();
- L->direct_glossy = zero_float3();
- L->direct_transmission = zero_float3();
- L->direct_volume = zero_float3();
-
- L->indirect_diffuse = zero_float3();
- L->indirect_glossy = zero_float3();
- L->indirect_transmission = zero_float3();
- L->indirect_volume = zero_float3();
-
- L->transparent = 0.0f;
- L->emission = zero_float3();
- L->background = zero_float3();
- L->ao = zero_float3();
- L->shadow = zero_float3();
- L->mist = 0.0f;
-
- L->state.diffuse = zero_float3();
- L->state.glossy = zero_float3();
- L->state.transmission = zero_float3();
- L->state.volume = zero_float3();
- L->state.direct = zero_float3();
- }
- else
-#endif
- {
- L->transparent = 0.0f;
- L->emission = zero_float3();
- }
-
-#ifdef __SHADOW_TRICKS__
- L->path_total = zero_float3();
- L->path_total_shaded = zero_float3();
- L->shadow_background_color = zero_float3();
- L->shadow_throughput = 0.0f;
- L->shadow_transparency = 1.0f;
- L->has_shadow_catcher = 0;
-#endif
-
-#ifdef __DENOISING_FEATURES__
- L->denoising_normal = zero_float3();
- L->denoising_albedo = zero_float3();
- L->denoising_depth = 0.0f;
-#endif
+  /* Ratio of diffuse and glossy to recover proportions for writing to render passes.
+   * We assume reflection, transmission and volume scattering to be mutually exclusive. */
+ return safe_divide_float3_float3(eval->diffuse, eval->diffuse + eval->glossy);
}
-ccl_device_inline void path_radiance_bsdf_bounce(KernelGlobals *kg,
- PathRadianceState *L_state,
- ccl_addr_space float3 *throughput,
- BsdfEval *bsdf_eval,
- float bsdf_pdf,
- int bounce,
- int bsdf_label)
-{
- float inverse_pdf = 1.0f / bsdf_pdf;
-
-#ifdef __PASSES__
- if (kernel_data.film.use_light_pass) {
- if (bounce == 0 && !(bsdf_label & LABEL_TRANSPARENT)) {
- /* first on directly visible surface */
- float3 value = *throughput * inverse_pdf;
-
- L_state->diffuse = bsdf_eval->diffuse * value;
- L_state->glossy = bsdf_eval->glossy * value;
- L_state->transmission = bsdf_eval->transmission * value;
- L_state->volume = bsdf_eval->volume * value;
-
- *throughput = L_state->diffuse + L_state->glossy + L_state->transmission + L_state->volume;
+/* --------------------------------------------------------------------
+ * Clamping
+ *
+ * Clamping is done on a per-contribution basis so that we can write directly
+ * to render buffers instead of using per-thread memory, and to avoid the
+ * impact of clamping on other contributions. */
- L_state->direct = *throughput;
- }
- else {
- /* transparent bounce before first hit, or indirectly visible through BSDF */
- float3 sum = (bsdf_eval_sum(bsdf_eval) + bsdf_eval->transparent) * inverse_pdf;
- *throughput *= sum;
- }
+ccl_device_forceinline void kernel_accum_clamp(const KernelGlobals *kg, float3 *L, int bounce)
+{
+#ifdef __KERNEL_DEBUG_NAN__
+ if (!isfinite3_safe(*L)) {
+ kernel_assert(!"Cycles sample with non-finite value detected");
}
- else
#endif
- {
- *throughput *= bsdf_eval->diffuse * inverse_pdf;
- }
-}
+  /* Make sure all components are finite, so that the contribution is usable by the
+   * adaptive sampling convergence check, and so that the render result never causes
+   * issues in post-processing. */
+ *L = ensure_finite3(*L);
#ifdef __CLAMP_SAMPLE__
-ccl_device_forceinline void path_radiance_clamp(KernelGlobals *kg, float3 *L, int bounce)
-{
float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect :
kernel_data.integrator.sample_clamp_direct;
float sum = reduce_add(fabs(*L));
if (sum > limit) {
*L *= limit / sum;
}
+#endif
}
-ccl_device_forceinline void path_radiance_clamp_throughput(KernelGlobals *kg,
- float3 *L,
- float3 *throughput,
- int bounce)
-{
- float limit = (bounce > 0) ? kernel_data.integrator.sample_clamp_indirect :
- kernel_data.integrator.sample_clamp_direct;
+/* --------------------------------------------------------------------
+ * Pass accumulation utilities.
+ */
- float sum = reduce_add(fabs(*L));
- if (sum > limit) {
- float clamp_factor = limit / sum;
- *L *= clamp_factor;
- *throughput *= clamp_factor;
- }
+/* Get pointer to pixel in render buffer. */
+ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer(
+ INTEGRATOR_STATE_CONST_ARGS, ccl_global float *ccl_restrict render_buffer)
+{
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ return render_buffer + render_buffer_offset;
}
-#endif
+/* --------------------------------------------------------------------
+ * Adaptive sampling.
+ */
-ccl_device_inline void path_radiance_accum_emission(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 value)
+ccl_device_inline int kernel_accum_sample(INTEGRATOR_STATE_CONST_ARGS,
+ ccl_global float *ccl_restrict render_buffer,
+ int sample)
{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
+ if (kernel_data.film.pass_sample_count == PASS_UNUSED) {
+ return sample;
}
-#endif
- float3 contribution = throughput * value;
-#ifdef __CLAMP_SAMPLE__
- path_radiance_clamp(kg, &contribution, state->bounce - 1);
-#endif
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
-#ifdef __PASSES__
- if (L->use_light_pass) {
- if (state->bounce == 0)
- L->emission += contribution;
- else if (state->bounce == 1)
- L->direct_emission += contribution;
- else
- L->indirect += contribution;
- }
- else
-#endif
- {
- L->emission += contribution;
- }
+ return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1);
}
-ccl_device_inline void path_radiance_accum_ao(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 alpha,
- float3 bsdf,
- float3 ao)
+ccl_device void kernel_accum_adaptive_buffer(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
{
-#ifdef __PASSES__
- /* Store AO pass. */
- if (L->use_light_pass && state->bounce == 0) {
- L->ao += alpha * throughput * ao;
- }
-#endif
-
-#ifdef __SHADOW_TRICKS__
- /* For shadow catcher, accumulate ratio. */
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- float3 light = throughput * bsdf;
- L->path_total += light;
- L->path_total_shaded += ao * light;
+  /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our
+   * stopping criterion. This is the heuristic from "A hierarchical automatic stopping
+   * condition for Monte Carlo global illumination", except that here it is applied per pixel
+   * and not in hierarchical tiles. */
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
- }
+ if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
+ return;
}
-#endif
-
- float3 contribution = throughput * bsdf * ao;
-#ifdef __PASSES__
- if (L->use_light_pass) {
- if (state->bounce == 0) {
- /* Directly visible lighting. */
- L->direct_diffuse += contribution;
- }
- else {
- /* Indirectly visible lighting after BSDF bounce. */
- L->indirect += contribution;
- }
- }
- else
-#endif
- {
- L->emission += contribution;
+ const int sample = INTEGRATOR_STATE(path, sample);
+ if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_adaptive_aux_buffer,
+ make_float4(contribution.x * 2.0f, contribution.y * 2.0f, contribution.z * 2.0f, 0.0f));
}
}
-ccl_device_inline void path_radiance_accum_total_ao(PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 bsdf)
-{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- L->path_total += throughput * bsdf;
- }
-#else
- (void)L;
- (void)state;
- (void)throughput;
- (void)bsdf;
-#endif
-}
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+#ifdef __SHADOW_CATCHER__
-ccl_device_inline void path_radiance_accum_light(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- BsdfEval *bsdf_eval,
- float3 shadow,
- float shadow_fac,
- bool is_lamp)
+/* Accumulate contribution to the Shadow Catcher pass.
+ *
+ * Returns true if the contribution is fully handled here and is not to be added to the other
+ * passes (like combined, adaptive sampling). */
+
+ccl_device bool kernel_accum_shadow_catcher(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- float3 light = throughput * bsdf_eval->sum_no_mis;
- L->path_total += light;
- L->path_total_shaded += shadow * light;
-
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
- }
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
}
-#endif
- float3 shaded_throughput = throughput * shadow;
+ kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
-#ifdef __PASSES__
- if (L->use_light_pass) {
- /* Compute the clamping based on the total contribution.
- * The resulting scale is then be applied to all individual components. */
- float3 full_contribution = shaded_throughput * bsdf_eval_sum(bsdf_eval);
-# ifdef __CLAMP_SAMPLE__
- path_radiance_clamp_throughput(kg, &full_contribution, &shaded_throughput, state->bounce);
-# endif
-
- if (state->bounce == 0) {
- /* directly visible lighting */
- L->direct_diffuse += shaded_throughput * bsdf_eval->diffuse;
- L->direct_glossy += shaded_throughput * bsdf_eval->glossy;
- L->direct_transmission += shaded_throughput * bsdf_eval->transmission;
- L->direct_volume += shaded_throughput * bsdf_eval->volume;
-
- if (is_lamp) {
- L->shadow += shadow * shadow_fac;
- }
- }
- else {
- /* indirectly visible lighting after BSDF bounce */
- L->indirect += full_contribution;
- }
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher_matte, contribution);
+    /* NOTE: Keep accumulating to the combined pass and the sample count pass, so that
+     * adaptive sampling is based on how noisy the combined pass is, as if there were no
+     * catchers in the scene. */
}
- else
-#endif
- {
- float3 contribution = shaded_throughput * bsdf_eval->diffuse;
- path_radiance_clamp(kg, &contribution, state->bounce);
- L->emission += contribution;
+
+ /* Shadow catcher pass. */
+ if (kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution);
+ return true;
}
-}
-ccl_device_inline void path_radiance_accum_total_light(PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- const BsdfEval *bsdf_eval)
-{
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- L->path_total += throughput * bsdf_eval->sum_no_mis;
- }
-#else
- (void)L;
- (void)state;
- (void)throughput;
- (void)bsdf_eval;
-#endif
+ return false;
}
-ccl_device_inline void path_radiance_accum_background(KernelGlobals *kg,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 value)
+ccl_device bool kernel_accum_shadow_catcher_transparent(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ const float transparent,
+ ccl_global float *ccl_restrict buffer)
{
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
+ }
-#ifdef __SHADOW_TRICKS__
- if (state->flag & PATH_RAY_STORE_SHADOW_INFO) {
- L->path_total += throughput * value;
- L->path_total_shaded += throughput * value * L->shadow_transparency;
+ kernel_assert(kernel_data.film.pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- return;
- }
+ if (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_BACKGROUND) {
+ return true;
}
-#endif
- float3 contribution = throughput * value;
-#ifdef __CLAMP_SAMPLE__
- path_radiance_clamp(kg, &contribution, state->bounce - 1);
-#endif
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_shadow_catcher_matte,
+ make_float4(contribution.x, contribution.y, contribution.z, transparent));
+    /* NOTE: Keep accumulating to the combined pass and the sample count pass, so that
+     * adaptive sampling is based on how noisy the combined pass is, as if there were no
+     * catchers in the scene. */
+ }
-#ifdef __PASSES__
- if (L->use_light_pass) {
- if (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)
- L->background += contribution;
- else if (state->bounce == 1)
- L->direct_emission += contribution;
- else
- L->indirect += contribution;
- }
- else
-#endif
- {
- L->emission += contribution;
+ /* Shadow catcher pass. */
+ if (kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_PASS)) {
+    /* NOTE: The transparency of the shadow catcher pass is ignored. It is not needed for the
+     * calculation, and the alpha channel of the pass contains the number of samples
+     * contributed to a pixel of the pass. */
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow_catcher, contribution);
+ return true;
}
-#ifdef __DENOISING_FEATURES__
- L->denoising_albedo += state->denoising_feature_weight * state->denoising_feature_throughput *
- value;
-#endif /* __DENOISING_FEATURES__ */
+ return false;
}
-ccl_device_inline void path_radiance_accum_transparent(PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput)
+ccl_device void kernel_accum_shadow_catcher_transparent_only(INTEGRATOR_STATE_CONST_ARGS,
+ const float transparent,
+ ccl_global float *ccl_restrict buffer)
{
- L->transparent += average(throughput);
-}
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return;
+ }
-#ifdef __SHADOW_TRICKS__
-ccl_device_inline void path_radiance_accum_shadowcatcher(PathRadiance *L,
- float3 throughput,
- float3 background)
-{
- L->shadow_throughput += average(throughput);
- L->shadow_background_color += throughput * background;
- L->has_shadow_catcher = 1;
-}
-#endif
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
-ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
-{
-#ifdef __PASSES__
- /* this division is a bit ugly, but means we only have to keep track of
- * only a single throughput further along the path, here we recover just
- * the indirect path that is not influenced by any particular BSDF type */
- if (L->use_light_pass) {
- L->direct_emission = safe_divide_color(L->direct_emission, L->state.direct);
- L->direct_diffuse += L->state.diffuse * L->direct_emission;
- L->direct_glossy += L->state.glossy * L->direct_emission;
- L->direct_transmission += L->state.transmission * L->direct_emission;
- L->direct_volume += L->state.volume * L->direct_emission;
-
- L->indirect = safe_divide_color(L->indirect, L->state.direct);
- L->indirect_diffuse += L->state.diffuse * L->indirect;
- L->indirect_glossy += L->state.glossy * L->indirect;
- L->indirect_transmission += L->state.transmission * L->indirect;
- L->indirect_volume += L->state.volume * L->indirect;
+ /* Matte pass. */
+ if (kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_PASS)) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3, transparent);
}
-#endif
}
-ccl_device_inline void path_radiance_reset_indirect(PathRadiance *L)
-{
-#ifdef __PASSES__
- if (L->use_light_pass) {
- L->state.diffuse = zero_float3();
- L->state.glossy = zero_float3();
- L->state.transmission = zero_float3();
- L->state.volume = zero_float3();
+#endif /* __SHADOW_CATCHER__ */
+
+/* --------------------------------------------------------------------
+ * Render passes.
+ */
- L->direct_emission = zero_float3();
- L->indirect = zero_float3();
+/* Write combined pass. */
+ccl_device_inline void kernel_accum_combined_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ ccl_global float *ccl_restrict buffer)
+{
+#ifdef __SHADOW_CATCHER__
+ if (kernel_accum_shadow_catcher(INTEGRATOR_STATE_PASS, contribution, buffer)) {
+ return;
}
#endif
+
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_combined, contribution);
+ }
+
+ kernel_accum_adaptive_buffer(INTEGRATOR_STATE_PASS, contribution, buffer);
}
-ccl_device_inline void path_radiance_copy_indirect(PathRadiance *L, const PathRadiance *L_src)
+/* Write combined pass with transparency. */
+ccl_device_inline void kernel_accum_combined_transparent_pass(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 contribution,
+ const float transparent,
+ ccl_global float *ccl_restrict
+ buffer)
{
-#ifdef __PASSES__
- if (L->use_light_pass) {
- L->state = L_src->state;
-
- L->direct_emission = L_src->direct_emission;
- L->indirect = L_src->indirect;
+#ifdef __SHADOW_CATCHER__
+ if (kernel_accum_shadow_catcher_transparent(
+ INTEGRATOR_STATE_PASS, contribution, transparent, buffer)) {
+ return;
}
#endif
+
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float4(
+ buffer + kernel_data.film.pass_combined,
+ make_float4(contribution.x, contribution.y, contribution.z, transparent));
+ }
+
+ kernel_accum_adaptive_buffer(INTEGRATOR_STATE_PASS, contribution, buffer);
}
-#ifdef __SHADOW_TRICKS__
-ccl_device_inline void path_radiance_sum_shadowcatcher(KernelGlobals *kg,
- PathRadiance *L,
- float3 *L_sum,
- float *alpha)
+/* Write background or emission to appropriate pass. */
+ccl_device_inline void kernel_accum_emission_or_background_pass(INTEGRATOR_STATE_CONST_ARGS,
+ float3 contribution,
+ ccl_global float *ccl_restrict
+ buffer,
+ const int pass)
{
- /* Calculate current shadow of the path. */
- float path_total = average(L->path_total);
- float shadow;
+ if (!(kernel_data.film.light_pass_flag & PASS_ANY)) {
+ return;
+ }
- if (UNLIKELY(!isfinite_safe(path_total))) {
-# ifdef __KERNEL_DEBUG_NAN__
- kernel_assert(!"Non-finite total radiance along the path");
-# endif
- shadow = 0.0f;
+#ifdef __PASSES__
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+ int pass_offset = PASS_UNUSED;
+
+ /* Denoising albedo. */
+# ifdef __DENOISING_FEATURES__
+ if (path_flag & PATH_RAY_DENOISING_FEATURES) {
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(path,
+ denoising_feature_throughput);
+ const float3 denoising_albedo = denoising_feature_throughput * contribution;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+ }
}
- else if (path_total == 0.0f) {
- shadow = L->shadow_transparency;
+# endif /* __DENOISING_FEATURES__ */
+
+ if (!(path_flag & PATH_RAY_ANY_PASS)) {
+ /* Directly visible, write to emission or background pass. */
+ pass_offset = pass;
+ }
+ else if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
+    /* Indirectly visible through reflection or transmission. */
+ const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
+ ((INTEGRATOR_STATE(path, bounce) == 1) ?
+ kernel_data.film.pass_glossy_direct :
+ kernel_data.film.pass_glossy_indirect) :
+ ((INTEGRATOR_STATE(path, bounce) == 1) ?
+ kernel_data.film.pass_transmission_direct :
+ kernel_data.film.pass_transmission_indirect);
+
+ if (glossy_pass_offset != PASS_UNUSED) {
+ /* Glossy is a subset of the throughput, reconstruct it here using the
+ * diffuse-glossy ratio. */
+ const float3 ratio = INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ const float3 glossy_contribution = (one_float3() - ratio) * contribution;
+ kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
+ }
+
+ /* Reconstruct diffuse subset of throughput. */
+ pass_offset = (INTEGRATOR_STATE(path, bounce) == 1) ? kernel_data.film.pass_diffuse_direct :
+ kernel_data.film.pass_diffuse_indirect;
+ if (pass_offset != PASS_UNUSED) {
+ contribution *= INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ }
}
- else {
- float path_total_shaded = average(L->path_total_shaded);
- shadow = path_total_shaded / path_total;
+ else if (path_flag & PATH_RAY_VOLUME_PASS) {
+ /* Indirectly visible through volume. */
+ pass_offset = (INTEGRATOR_STATE(path, bounce) == 1) ? kernel_data.film.pass_volume_direct :
+ kernel_data.film.pass_volume_indirect;
}
- /* Calculate final light sum and transparency for shadow catcher object. */
- if (kernel_data.background.transparent) {
- *alpha -= L->shadow_throughput * shadow;
- }
- else {
- L->shadow_background_color *= shadow;
- *L_sum += L->shadow_background_color;
+ /* Single write call for GPU coherence. */
+ if (pass_offset != PASS_UNUSED) {
+ kernel_write_pass_float3(buffer + pass_offset, contribution);
}
+#endif /* __PASSES__ */
}
-#endif
-ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg,
- PathRadiance *L,
- float *alpha)
+/* Write light contribution to render buffer. */
+ccl_device_inline void kernel_accum_light(INTEGRATOR_STATE_CONST_ARGS,
+ ccl_global float *ccl_restrict render_buffer)
{
- float3 L_sum;
- /* Light Passes are used */
+ /* The throughput for shadow paths already contains the light shader evaluation. */
+ float3 contribution = INTEGRATOR_STATE(shadow_path, throughput);
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce) - 1);
+
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
+
+ kernel_accum_combined_pass(INTEGRATOR_STATE_PASS, contribution, buffer);
+
#ifdef __PASSES__
- float3 L_direct, L_indirect;
- if (L->use_light_pass) {
- path_radiance_sum_indirect(L);
-
- L_direct = L->direct_diffuse + L->direct_glossy + L->direct_transmission + L->direct_volume +
- L->emission;
- L_indirect = L->indirect_diffuse + L->indirect_glossy + L->indirect_transmission +
- L->indirect_volume;
-
- if (!kernel_data.background.transparent)
- L_direct += L->background;
-
- L_sum = L_direct + L_indirect;
- float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
-
- /* Reject invalid value */
- if (!isfinite_safe(sum)) {
-# ifdef __KERNEL_DEBUG_NAN__
- kernel_assert(!"Non-finite sum in path_radiance_clamp_and_sum!");
-# endif
- L_sum = zero_float3();
-
- L->direct_diffuse = zero_float3();
- L->direct_glossy = zero_float3();
- L->direct_transmission = zero_float3();
- L->direct_volume = zero_float3();
-
- L->indirect_diffuse = zero_float3();
- L->indirect_glossy = zero_float3();
- L->indirect_transmission = zero_float3();
- L->indirect_volume = zero_float3();
-
- L->emission = zero_float3();
+ if (kernel_data.film.light_pass_flag & PASS_ANY) {
+ const int path_flag = INTEGRATOR_STATE(shadow_path, flag);
+ int pass_offset = PASS_UNUSED;
+
+ if (path_flag & (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS)) {
+      /* Indirectly visible through reflection or transmission. */
+ const int glossy_pass_offset = (path_flag & PATH_RAY_REFLECT_PASS) ?
+ ((INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_glossy_direct :
+ kernel_data.film.pass_glossy_indirect) :
+ ((INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_transmission_direct :
+ kernel_data.film.pass_transmission_indirect);
+
+ if (glossy_pass_offset != PASS_UNUSED) {
+ /* Glossy is a subset of the throughput, reconstruct it here using the
+ * diffuse-glossy ratio. */
+ const float3 ratio = INTEGRATOR_STATE(shadow_path, diffuse_glossy_ratio);
+ const float3 glossy_contribution = (one_float3() - ratio) * contribution;
+ kernel_write_pass_float3(buffer + glossy_pass_offset, glossy_contribution);
+ }
+
+ /* Reconstruct diffuse subset of throughput. */
+ pass_offset = (INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_diffuse_direct :
+ kernel_data.film.pass_diffuse_indirect;
+ if (pass_offset != PASS_UNUSED) {
+ contribution *= INTEGRATOR_STATE(shadow_path, diffuse_glossy_ratio);
+ }
+ }
+ else if (path_flag & PATH_RAY_VOLUME_PASS) {
+ /* Indirectly visible through volume. */
+ pass_offset = (INTEGRATOR_STATE(shadow_path, bounce) == 0) ?
+ kernel_data.film.pass_volume_direct :
+ kernel_data.film.pass_volume_indirect;
}
- }
- /* No Light Passes */
- else
-#endif
- {
- L_sum = L->emission;
+ /* Single write call for GPU coherence. */
+ if (pass_offset != PASS_UNUSED) {
+ kernel_write_pass_float3(buffer + pass_offset, contribution);
+ }
- /* Reject invalid value */
- float sum = fabsf((L_sum).x) + fabsf((L_sum).y) + fabsf((L_sum).z);
- if (!isfinite_safe(sum)) {
-#ifdef __KERNEL_DEBUG_NAN__
- kernel_assert(!"Non-finite final sum in path_radiance_clamp_and_sum!");
-#endif
- L_sum = zero_float3();
+ /* Write shadow pass. */
+ if (kernel_data.film.pass_shadow != PASS_UNUSED && (path_flag & PATH_RAY_SHADOW_FOR_LIGHT) &&
+ (path_flag & PATH_RAY_CAMERA)) {
+ const float3 unshadowed_throughput = INTEGRATOR_STATE(shadow_path, unshadowed_throughput);
+ const float3 shadowed_throughput = INTEGRATOR_STATE(shadow_path, throughput);
+ const float3 shadow = safe_divide_float3_float3(shadowed_throughput, unshadowed_throughput) *
+ kernel_data.film.pass_shadow_scale;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_shadow, shadow);
}
}
+#endif
+}
- /* Compute alpha. */
- *alpha = 1.0f - L->transparent;
+/* Write transparency to render buffer.
+ *
+ * Note that we accumulate transparency = 1 - alpha in the render buffer.
+ * Otherwise we'd have to write alpha on path termination, which happens
+ * in many places. */
+ccl_device_inline void kernel_accum_transparent(INTEGRATOR_STATE_CONST_ARGS,
+ const float transparent,
+ ccl_global float *ccl_restrict render_buffer)
+{
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
- /* Add shadow catcher contributions. */
-#ifdef __SHADOW_TRICKS__
- if (L->has_shadow_catcher) {
- path_radiance_sum_shadowcatcher(kg, L, &L_sum, alpha);
+ if (kernel_data.film.light_pass_flag & PASSMASK(COMBINED)) {
+ kernel_write_pass_float(buffer + kernel_data.film.pass_combined + 3, transparent);
}
-#endif /* __SHADOW_TRICKS__ */
- return L_sum;
+ kernel_accum_shadow_catcher_transparent_only(INTEGRATOR_STATE_PASS, transparent, buffer);
}
-ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg,
- PathRadiance *L,
- float3 *noisy,
- float3 *clean)
+/* Write background contribution to render buffer.
+ *
+ * Includes transparency, matching kernel_accum_transparent. */
+ccl_device_inline void kernel_accum_background(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 L,
+ const float transparent,
+ const bool is_transparent_background_ray,
+ ccl_global float *ccl_restrict render_buffer)
{
-#ifdef __PASSES__
- kernel_assert(L->use_light_pass);
-
- *clean = L->emission + L->background;
- *noisy = L->direct_volume + L->indirect_volume;
-
-# define ADD_COMPONENT(flag, component) \
- if (kernel_data.film.denoising_flags & flag) \
- *clean += component; \
- else \
- *noisy += component;
-
- ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse);
- ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse);
- ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy);
- ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
- ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
- ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
-# undef ADD_COMPONENT
-#else
- *noisy = L->emission;
- *clean = zero_float3();
-#endif
+ float3 contribution = INTEGRATOR_STATE(path, throughput) * L;
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(path, bounce) - 1);
-#ifdef __SHADOW_TRICKS__
- if (L->has_shadow_catcher) {
- *noisy += L->shadow_background_color;
- }
-#endif
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
- *noisy = ensure_finite3(*noisy);
- *clean = ensure_finite3(*clean);
+ if (is_transparent_background_ray) {
+ kernel_accum_transparent(INTEGRATOR_STATE_PASS, transparent, render_buffer);
+ }
+ else {
+ kernel_accum_combined_transparent_pass(
+ INTEGRATOR_STATE_PASS, contribution, transparent, buffer);
+ }
+ kernel_accum_emission_or_background_pass(
+ INTEGRATOR_STATE_PASS, contribution, buffer, kernel_data.film.pass_background);
}
-ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample)
+/* Write emission to render buffer. */
+ccl_device_inline void kernel_accum_emission(INTEGRATOR_STATE_CONST_ARGS,
+ const float3 throughput,
+ const float3 L,
+ ccl_global float *ccl_restrict render_buffer)
{
-#ifdef __SPLIT_KERNEL__
-# define safe_float3_add(f, v) \
- do { \
- ccl_global float *p = (ccl_global float *)(&(f)); \
- atomic_add_and_fetch_float(p + 0, (v).x); \
- atomic_add_and_fetch_float(p + 1, (v).y); \
- atomic_add_and_fetch_float(p + 2, (v).z); \
- } while (0)
-# define safe_float_add(f, v) atomic_add_and_fetch_float(&(f), (v))
-#else
-# define safe_float3_add(f, v) (f) += (v)
-# define safe_float_add(f, v) (f) += (v)
-#endif /* __SPLIT_KERNEL__ */
+ float3 contribution = throughput * L;
+ kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(path, bounce) - 1);
-#ifdef __PASSES__
- safe_float3_add(L->direct_diffuse, L_sample->direct_diffuse);
- safe_float3_add(L->direct_glossy, L_sample->direct_glossy);
- safe_float3_add(L->direct_transmission, L_sample->direct_transmission);
- safe_float3_add(L->direct_volume, L_sample->direct_volume);
-
- safe_float3_add(L->indirect_diffuse, L_sample->indirect_diffuse);
- safe_float3_add(L->indirect_glossy, L_sample->indirect_glossy);
- safe_float3_add(L->indirect_transmission, L_sample->indirect_transmission);
- safe_float3_add(L->indirect_volume, L_sample->indirect_volume);
-
- safe_float3_add(L->background, L_sample->background);
- safe_float3_add(L->ao, L_sample->ao);
- safe_float3_add(L->shadow, L_sample->shadow);
- safe_float_add(L->mist, L_sample->mist);
-#endif /* __PASSES__ */
- safe_float3_add(L->emission, L_sample->emission);
+ ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
+ render_buffer);
-#undef safe_float_add
-#undef safe_float3_add
+ kernel_accum_combined_pass(INTEGRATOR_STATE_PASS, contribution, buffer);
+ kernel_accum_emission_or_background_pass(
+ INTEGRATOR_STATE_PASS, contribution, buffer, kernel_data.film.pass_emission);
}
CCL_NAMESPACE_END
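
A detail of the accumulation rewrite above worth spelling out is the per-contribution clamp in kernel_accum_clamp: instead of clamping channels independently, the whole contribution is scaled so that the sum of its absolute components meets the limit, which preserves the color. A minimal standalone sketch with a hypothetical Color3 type in place of the kernel's float3:

#include <cmath>

struct Color3 {
  float x, y, z;
};

/* Scale the contribution so that |x| + |y| + |z| does not exceed the limit,
 * keeping the ratio between the channels unchanged. */
static void clamp_contribution(Color3 &L, float limit)
{
  const float sum = std::fabs(L.x) + std::fabs(L.y) + std::fabs(L.z);
  if (sum > limit) {
    const float scale = limit / sum;
    L.x *= scale;
    L.y *= scale;
    L.z *= scale;
  }
}

For example, (8, 1, 1) with a limit of 5 has sum 10 and is scaled by 0.5 to (4, 0.5, 0.5); clamping each channel independently would instead give (5, 1, 1) and shift the color.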
diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h
index 98b7bf7e7dc..7d71907effe 100644
--- a/intern/cycles/kernel/kernel_adaptive_sampling.h
+++ b/intern/cycles/kernel/kernel_adaptive_sampling.h
@@ -14,226 +14,146 @@
* limitations under the License.
*/
-#ifndef __KERNEL_ADAPTIVE_SAMPLING_H__
-#define __KERNEL_ADAPTIVE_SAMPLING_H__
+#pragma once
+
+#include "kernel/kernel_write_passes.h"
CCL_NAMESPACE_BEGIN
-/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
+/* Check whether the pixel has converged and should not be sampled anymore. */
-ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg,
- ccl_global float *buffer,
- int sample)
+ccl_device_forceinline bool kernel_need_sample_pixel(INTEGRATOR_STATE_CONST_ARGS,
+ ccl_global float *render_buffer)
{
- /* TODO Stefan: Is this better in linear, sRGB or something else? */
- float4 I = *((ccl_global float4 *)buffer);
- float4 A = *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- /* The per pixel error as seen in section 2.1 of
- * "A hierarchical automatic stopping condition for Monte Carlo global illumination"
- * A small epsilon is added to the divisor to prevent division by zero. */
- float error = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) /
- (sample * 0.0001f + sqrtf(I.x + I.y + I.z));
- if (error < kernel_data.integrator.adaptive_threshold * (float)sample) {
- /* Set the fourth component to non-zero value to indicate that this pixel has converged. */
- buffer[kernel_data.film.pass_adaptive_aux_buffer + 3] += 1.0f;
+ if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
+ return true;
}
-}
-
-/* Adjust the values of an adaptively sampled pixel. */
-
-ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg,
- ccl_global float *buffer,
- float sample_multiplier)
-{
- *(ccl_global float4 *)(buffer) *= sample_multiplier;
- /* Scale the aux pass too, this is necessary for progressive rendering to work properly. */
- kernel_assert(kernel_data.film.pass_adaptive_aux_buffer);
- *(ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer) *= sample_multiplier;
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset;
-#ifdef __PASSES__
- int flag = kernel_data.film.pass_flag;
-
- if (flag & PASSMASK(NORMAL))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_normal) *= sample_multiplier;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+ return buffer[aux_w_offset] == 0.0f;
+}
- if (flag & PASSMASK(UV))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_uv) *= sample_multiplier;
+/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */
- if (flag & PASSMASK(MOTION)) {
- *(ccl_global float4 *)(buffer + kernel_data.film.pass_motion) *= sample_multiplier;
- *(ccl_global float *)(buffer + kernel_data.film.pass_motion_weight) *= sample_multiplier;
+ccl_device bool kernel_adaptive_sampling_convergence_check(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int y,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride)
+{
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_sample_count != PASS_UNUSED);
+
+ const int render_pixel_index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer +
+ (uint64_t)render_pixel_index * kernel_data.film.pass_stride;
+
+ /* TODO(Stefan): Is this better in linear, sRGB or something else? */
+
+ const float4 A = kernel_read_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer);
+ if (!reset && A.w != 0.0f) {
+ /* If the pixel was considered converged, its state will not change in this kernel. Early
+     * out before doing any math.
+     *
+     * TODO(sergey): On a GPU it might be better to keep the thread alive for better coherency? */
+ return true;
}
- if (kernel_data.film.use_light_pass) {
- int light_flag = kernel_data.film.light_pass_flag;
-
- if (light_flag & PASSMASK(MIST))
- *(ccl_global float *)(buffer + kernel_data.film.pass_mist) *= sample_multiplier;
-
- /* Shadow pass omitted on purpose. It has its own scale parameter. */
-
- if (light_flag & PASSMASK(DIFFUSE_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(VOLUME_INDIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_indirect) *= sample_multiplier;
- if (light_flag & PASSMASK(DIFFUSE_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_direct) *= sample_multiplier;
- if (light_flag & PASSMASK(VOLUME_DIRECT))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_volume_direct) *= sample_multiplier;
-
- if (light_flag & PASSMASK(EMISSION))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_emission) *= sample_multiplier;
- if (light_flag & PASSMASK(BACKGROUND))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_background) *= sample_multiplier;
- if (light_flag & PASSMASK(AO))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_ao) *= sample_multiplier;
-
- if (light_flag & PASSMASK(DIFFUSE_COLOR))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_diffuse_color) *= sample_multiplier;
- if (light_flag & PASSMASK(GLOSSY_COLOR))
- *(ccl_global float3 *)(buffer + kernel_data.film.pass_glossy_color) *= sample_multiplier;
- if (light_flag & PASSMASK(TRANSMISSION_COLOR))
- *(ccl_global float3 *)(buffer +
- kernel_data.film.pass_transmission_color) *= sample_multiplier;
- }
-#endif
-
-#ifdef __DENOISING_FEATURES__
-
-# define scale_float3_variance(buffer, offset, scale) \
- *(buffer + offset) *= scale; \
- *(buffer + offset + 1) *= scale; \
- *(buffer + offset + 2) *= scale; \
- *(buffer + offset + 3) *= scale * scale; \
- *(buffer + offset + 4) *= scale * scale; \
- *(buffer + offset + 5) *= scale * scale;
-
-# define scale_shadow_variance(buffer, offset, scale) \
- *(buffer + offset) *= scale; \
- *(buffer + offset + 1) *= scale; \
- *(buffer + offset + 2) *= scale * scale;
-
- if (kernel_data.film.pass_denoising_data) {
- scale_shadow_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_A, sample_multiplier);
- scale_shadow_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_SHADOW_B, sample_multiplier);
- if (kernel_data.film.pass_denoising_clean) {
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
- *(buffer + kernel_data.film.pass_denoising_clean) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_clean + 1) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_clean + 2) *= sample_multiplier;
- }
- else {
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, sample_multiplier);
- }
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL, sample_multiplier);
- scale_float3_variance(
- buffer, kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO, sample_multiplier);
- *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH) *= sample_multiplier;
- *(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH +
- 1) *= sample_multiplier * sample_multiplier;
- }
-#endif /* __DENOISING_FEATURES__ */
-
- /* Cryptomatte. */
- if (kernel_data.film.cryptomatte_passes) {
- int num_slots = 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) ? 1 : 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) ? 1 : 0;
- num_slots += (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) ? 1 : 0;
- num_slots = num_slots * 2 * kernel_data.film.cryptomatte_depth;
- ccl_global float2 *id_buffer = (ccl_global float2 *)(buffer +
- kernel_data.film.pass_cryptomatte);
- for (int slot = 0; slot < num_slots; slot++) {
- id_buffer[slot].y *= sample_multiplier;
- }
- }
+ const float4 I = kernel_read_pass_float4(buffer + kernel_data.film.pass_combined);
- /* AOVs. */
- for (int i = 0; i < kernel_data.film.pass_aov_value_num; i++) {
- *(buffer + kernel_data.film.pass_aov_value + i) *= sample_multiplier;
- }
- for (int i = 0; i < kernel_data.film.pass_aov_color_num; i++) {
- *((ccl_global float4 *)(buffer + kernel_data.film.pass_aov_color) + i) *= sample_multiplier;
- }
+ const float sample = __float_as_uint(buffer[kernel_data.film.pass_sample_count]);
+ const float inv_sample = 1.0f / sample;
+
+ /* The per pixel error as seen in section 2.1 of
+ * "A hierarchical automatic stopping condition for Monte Carlo global illumination" */
+ const float error_difference = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) *
+ inv_sample;
+ const float error_normalize = sqrtf((I.x + I.y + I.z) * inv_sample);
+ /* A small epsilon is added to the divisor to prevent division by zero. */
+ const float error = error_difference / (0.0001f + error_normalize);
+ const bool did_converge = (error < threshold);
+
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+ buffer[aux_w_offset] = did_converge;
+
+ return did_converge;
}
/* This is a simple box filter in two passes.
* When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */
-ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile)
+ccl_device void kernel_adaptive_sampling_filter_x(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int y,
+ int start_x,
+ int width,
+ int offset,
+ int stride)
{
- bool any = false;
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+
bool prev = false;
- for (int x = tile->x; x < tile->x + tile->w; ++x) {
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- any = true;
- if (x > tile->x && !prev) {
+ for (int x = start_x; x < start_x + width; ++x) {
+ int index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+
+ if (buffer[aux_w_offset] == 0.0f) {
+ if (x > start_x && !prev) {
index = index - 1;
- buffer = tile->buffer + index * kernel_data.film.pass_stride;
- aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- (*aux).w = 0.0f;
+ buffer = render_buffer + index * kernel_data.film.pass_stride;
+ buffer[aux_w_offset] = 0.0f;
}
prev = true;
}
else {
if (prev) {
- (*aux).w = 0.0f;
+ buffer[aux_w_offset] = 0.0f;
}
prev = false;
}
}
- return any;
}
-ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile)
+ccl_device void kernel_adaptive_sampling_filter_y(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int x,
+ int start_y,
+ int height,
+ int offset,
+ int stride)
{
+ kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
+
bool prev = false;
- bool any = false;
- for (int y = tile->y; y < tile->y + tile->h; ++y) {
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- any = true;
- if (y > tile->y && !prev) {
- index = index - tile->stride;
- buffer = tile->buffer + index * kernel_data.film.pass_stride;
- aux = (ccl_global float4 *)(buffer + kernel_data.film.pass_adaptive_aux_buffer);
- (*aux).w = 0.0f;
+ for (int y = start_y; y < start_y + height; ++y) {
+ int index = offset + x + y * stride;
+ ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
+ const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
+
+ if (buffer[aux_w_offset] == 0.0f) {
+ if (y > start_y && !prev) {
+ index = index - stride;
+ buffer = render_buffer + index * kernel_data.film.pass_stride;
+ buffer[aux_w_offset] = 0.0f;
}
prev = true;
}
else {
if (prev) {
- (*aux).w = 0.0f;
+ buffer[aux_w_offset] = 0.0f;
}
prev = false;
}
}
- return any;
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_ADAPTIVE_SAMPLING_H__ */
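
The convergence check above condenses to a small amount of arithmetic once the buffer plumbing is stripped away. Here is a standalone sketch of the same heuristic with hypothetical names: I is the accumulated combined pass and A the auxiliary pass holding twice the odd samples, so |I - A| measures the spread between the odd and even halves of the samples.

#include <cmath>

static bool pixel_converged(const float I[3], const float A[3], int sample, float threshold)
{
  const float inv_sample = 1.0f / sample;
  const float difference = (std::fabs(I[0] - A[0]) + std::fabs(I[1] - A[1]) +
                            std::fabs(I[2] - A[2])) *
                           inv_sample;
  const float normalize = std::sqrt((I[0] + I[1] + I[2]) * inv_sample);
  /* The epsilon in the divisor prevents division by zero for black pixels. */
  return difference / (0.0001f + normalize) < threshold;
}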
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 7da890b908d..e025bcd6674 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -14,502 +14,62 @@
* limitations under the License.
*/
-CCL_NAMESPACE_BEGIN
-
-#ifdef __BAKING__
-
-ccl_device_noinline void compute_light_pass(
- KernelGlobals *kg, ShaderData *sd, PathRadiance *L, uint rng_hash, int pass_filter, int sample)
-{
- kernel_assert(kernel_data.film.use_light_pass);
-
- float3 throughput = one_float3();
-
- /* Emission and indirect shader data memory used by various functions. */
- ShaderDataTinyStorage emission_sd_storage;
- ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
- ShaderData indirect_sd;
-
- /* Init radiance. */
- path_radiance_init(kg, L);
-
- /* Init path state. */
- PathState state;
- path_state_init(kg, emission_sd, &state, rng_hash, sample, NULL);
-
- /* Evaluate surface shader. */
- shader_eval_surface(kg, sd, &state, NULL, state.flag);
-
- /* TODO: disable more closures we don't need besides transparent. */
- shader_bsdf_disable_transparency(kg, sd);
-
- /* Init ray. */
- Ray ray;
- ray.P = sd->P + sd->Ng;
- ray.D = -sd->Ng;
- ray.t = FLT_MAX;
-# ifdef __CAMERA_MOTION__
- ray.time = 0.5f;
-# endif
-
-# ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched) {
- /* regular path tracer */
-# endif
-
- /* sample ambient occlusion */
- if (pass_filter & BAKE_FILTER_AO) {
- kernel_path_ao(kg, sd, emission_sd, L, &state, throughput, shader_bsdf_alpha(kg, sd));
- }
-
- /* sample emission */
- if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
- float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
- path_radiance_accum_emission(kg, L, &state, throughput, emission);
- }
-
- bool is_sss_sample = false;
-
-# ifdef __SUBSURFACE__
- /* sample subsurface scattering */
- if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) {
- /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting
- * if scattering was successful. */
- SubsurfaceIndirectRays ss_indirect;
- kernel_path_subsurface_init_indirect(&ss_indirect);
- if (kernel_path_subsurface_scatter(
- kg, sd, emission_sd, L, &state, &ray, &throughput, &ss_indirect)) {
- while (ss_indirect.num_rays) {
- kernel_path_subsurface_setup_indirect(kg, &ss_indirect, &state, &ray, L, &throughput);
- kernel_path_indirect(
- kg, &indirect_sd, emission_sd, &ray, throughput, &state, L, sd->object);
- }
- is_sss_sample = true;
- }
- }
-# endif
-
- /* sample light and BSDF */
- if (!is_sss_sample && (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT))) {
- kernel_path_surface_connect_light(kg, sd, emission_sd, throughput, &state, L);
-
- if (kernel_path_surface_bounce(kg, sd, &throughput, &state, &L->state, &ray)) {
-# ifdef __LAMP_MIS__
- state.ray_t = 0.0f;
-# endif
- /* compute indirect light */
- kernel_path_indirect(
- kg, &indirect_sd, emission_sd, &ray, throughput, &state, L, sd->object);
-
- /* sum and reset indirect light pass variables for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
- }
- }
-# ifdef __BRANCHED_PATH__
- }
- else {
- /* branched path tracer */
-
- /* sample ambient occlusion */
- if (pass_filter & BAKE_FILTER_AO) {
- kernel_branched_path_ao(kg, sd, emission_sd, L, &state, throughput);
- }
-
- /* sample emission */
- if ((pass_filter & BAKE_FILTER_EMISSION) && (sd->flag & SD_EMISSION)) {
- float3 emission = indirect_primitive_emission(kg, sd, 0.0f, state.flag, state.ray_pdf);
- path_radiance_accum_emission(kg, L, &state, throughput, emission);
- }
-
-# ifdef __SUBSURFACE__
- /* sample subsurface scattering */
- if ((pass_filter & BAKE_FILTER_DIFFUSE) && (sd->flag & SD_BSSRDF)) {
- /* When mixing BSSRDF and BSDF closures we should skip BSDF lighting
- * if scattering was successful. */
- kernel_branched_path_subsurface_scatter(
- kg, sd, &indirect_sd, emission_sd, L, &state, &ray, throughput);
- }
-# endif
-
- /* sample light and BSDF */
- if (pass_filter & (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT)) {
-# if defined(__EMISSION__)
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = kernel_data.integrator.sample_all_lights_direct;
- kernel_branched_path_surface_connect_light(
- kg, sd, emission_sd, &state, throughput, 1.0f, L, all);
- }
-# endif
-
- /* indirect light */
- kernel_branched_path_surface_indirect_light(
- kg, sd, &indirect_sd, emission_sd, throughput, 1.0f, &state, L);
- }
- }
-# endif
-}
-
-/* this helps with AA but it's not the real solution as it does not AA the geometry
- * but it's better than nothing, thus committed */
-ccl_device_inline float bake_clamp_mirror_repeat(float u, float max)
-{
- /* use mirror repeat (like opengl texture) so that if the barycentric
- * coordinate goes past the end of the triangle it is not always clamped
- * to the same value, gives ugly patterns */
- u /= max;
- float fu = floorf(u);
- u = u - fu;
-
- return ((((int)fu) & 1) ? 1.0f - u : u) * max;
-}
-
-ccl_device_inline float3 kernel_bake_shader_bsdf(KernelGlobals *kg,
- ShaderData *sd,
- const ShaderEvalType type)
-{
- switch (type) {
- case SHADER_EVAL_DIFFUSE:
- return shader_bsdf_diffuse(kg, sd);
- case SHADER_EVAL_GLOSSY:
- return shader_bsdf_glossy(kg, sd);
- case SHADER_EVAL_TRANSMISSION:
- return shader_bsdf_transmission(kg, sd);
- default:
- kernel_assert(!"Unknown bake type passed to BSDF evaluate");
- return zero_float3();
- }
-}
-
-ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
- ShaderData *sd,
- PathState *state,
- float3 direct,
- float3 indirect,
- const ShaderEvalType type,
- const int pass_filter)
-{
- float3 color;
- const bool is_color = (pass_filter & BAKE_FILTER_COLOR) != 0;
- const bool is_direct = (pass_filter & BAKE_FILTER_DIRECT) != 0;
- const bool is_indirect = (pass_filter & BAKE_FILTER_INDIRECT) != 0;
- float3 out = zero_float3();
-
- if (is_color) {
- if (is_direct || is_indirect) {
- /* Leave direct and diffuse channel colored. */
- color = one_float3();
- }
- else {
- /* surface color of the pass only */
- shader_eval_surface(kg, sd, state, NULL, 0);
- return kernel_bake_shader_bsdf(kg, sd, type);
- }
- }
- else {
- shader_eval_surface(kg, sd, state, NULL, 0);
- color = kernel_bake_shader_bsdf(kg, sd, type);
- }
-
- if (is_direct) {
- out += safe_divide_even_color(direct, color);
- }
-
- if (is_indirect) {
- out += safe_divide_even_color(indirect, color);
- }
-
- return out;
-}
-
-ccl_device void kernel_bake_evaluate(
- KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
-{
- /* Setup render buffers. */
- const int index = offset + x + y * stride;
- const int pass_stride = kernel_data.film.pass_stride;
- buffer += index * pass_stride;
-
- ccl_global float *primitive = buffer + kernel_data.film.pass_bake_primitive;
- ccl_global float *differential = buffer + kernel_data.film.pass_bake_differential;
- ccl_global float *output = buffer + kernel_data.film.pass_combined;
-
- int seed = __float_as_uint(primitive[0]);
- int prim = __float_as_uint(primitive[1]);
- if (prim == -1)
- return;
-
- prim += kernel_data.bake.tri_offset;
-
- /* Random number generator. */
- uint rng_hash = hash_uint(seed) ^ kernel_data.integrator.seed;
- int num_samples = kernel_data.integrator.aa_samples;
-
- float filter_x, filter_y;
- if (sample == 0) {
- filter_x = filter_y = 0.5f;
- }
- else {
- path_rng_2D(kg, rng_hash, sample, num_samples, PRNG_FILTER_U, &filter_x, &filter_y);
- }
-
- /* Barycentric UV with sub-pixel offset. */
- float u = primitive[2];
- float v = primitive[3];
-
- float dudx = differential[0];
- float dudy = differential[1];
- float dvdx = differential[2];
- float dvdy = differential[3];
-
- if (sample > 0) {
- u = bake_clamp_mirror_repeat(u + dudx * (filter_x - 0.5f) + dudy * (filter_y - 0.5f), 1.0f);
- v = bake_clamp_mirror_repeat(v + dvdx * (filter_x - 0.5f) + dvdy * (filter_y - 0.5f),
- 1.0f - u);
- }
-
- /* Shader data setup. */
- int object = kernel_data.bake.object_index;
- int shader;
- float3 P, Ng;
-
- triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
-
- ShaderData sd;
- shader_setup_from_sample(
- kg,
- &sd,
- P,
- Ng,
- Ng,
- shader,
- object,
- prim,
- u,
- v,
- 1.0f,
- 0.5f,
- !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
- LAMP_NONE);
- sd.I = sd.N;
-
- /* Setup differentials. */
- sd.dP.dx = sd.dPdu * dudx + sd.dPdv * dvdx;
- sd.dP.dy = sd.dPdu * dudy + sd.dPdv * dvdy;
- sd.du.dx = dudx;
- sd.du.dy = dudy;
- sd.dv.dx = dvdx;
- sd.dv.dy = dvdy;
-
- /* Set RNG state for shaders that use sampling. */
- PathState state = {0};
- state.rng_hash = rng_hash;
- state.rng_offset = 0;
- state.sample = sample;
- state.num_samples = num_samples;
- state.min_ray_pdf = FLT_MAX;
-
- /* Light passes if we need more than color. */
- PathRadiance L;
- int pass_filter = kernel_data.bake.pass_filter;
-
- if (kernel_data.bake.pass_filter & ~BAKE_FILTER_COLOR)
- compute_light_pass(kg, &sd, &L, rng_hash, pass_filter, sample);
-
- float3 out = zero_float3();
-
- ShaderEvalType type = (ShaderEvalType)kernel_data.bake.type;
- switch (type) {
- /* data passes */
- case SHADER_EVAL_NORMAL:
- case SHADER_EVAL_ROUGHNESS:
- case SHADER_EVAL_EMISSION: {
- if (type != SHADER_EVAL_NORMAL || (sd.flag & SD_HAS_BUMP)) {
- int path_flag = (type == SHADER_EVAL_EMISSION) ? PATH_RAY_EMISSION : 0;
- shader_eval_surface(kg, &sd, &state, NULL, path_flag);
- }
-
- if (type == SHADER_EVAL_NORMAL) {
- float3 N = sd.N;
- if (sd.flag & SD_HAS_BUMP) {
- N = shader_bsdf_average_normal(kg, &sd);
- }
+#pragma once
- /* encoding: normal = (2 * color) - 1 */
- out = N * 0.5f + make_float3(0.5f, 0.5f, 0.5f);
- }
- else if (type == SHADER_EVAL_ROUGHNESS) {
- float roughness = shader_bsdf_average_roughness(&sd);
- out = make_float3(roughness, roughness, roughness);
- }
- else {
- out = shader_emissive_eval(&sd);
- }
- break;
- }
- case SHADER_EVAL_UV: {
- out = primitive_uv(kg, &sd);
- break;
- }
-# ifdef __PASSES__
- /* light passes */
- case SHADER_EVAL_AO: {
- out = L.ao;
- break;
- }
- case SHADER_EVAL_COMBINED: {
- if ((pass_filter & BAKE_FILTER_COMBINED) == BAKE_FILTER_COMBINED) {
- float alpha;
- out = path_radiance_clamp_and_sum(kg, &L, &alpha);
- break;
- }
+#include "kernel/kernel_differential.h"
+#include "kernel/kernel_projection.h"
+#include "kernel/kernel_shader.h"
- if ((pass_filter & BAKE_FILTER_DIFFUSE_DIRECT) == BAKE_FILTER_DIFFUSE_DIRECT)
- out += L.direct_diffuse;
- if ((pass_filter & BAKE_FILTER_DIFFUSE_INDIRECT) == BAKE_FILTER_DIFFUSE_INDIRECT)
- out += L.indirect_diffuse;
+#include "kernel/geom/geom.h"
- if ((pass_filter & BAKE_FILTER_GLOSSY_DIRECT) == BAKE_FILTER_GLOSSY_DIRECT)
- out += L.direct_glossy;
- if ((pass_filter & BAKE_FILTER_GLOSSY_INDIRECT) == BAKE_FILTER_GLOSSY_INDIRECT)
- out += L.indirect_glossy;
-
- if ((pass_filter & BAKE_FILTER_TRANSMISSION_DIRECT) == BAKE_FILTER_TRANSMISSION_DIRECT)
- out += L.direct_transmission;
- if ((pass_filter & BAKE_FILTER_TRANSMISSION_INDIRECT) == BAKE_FILTER_TRANSMISSION_INDIRECT)
- out += L.indirect_transmission;
-
- if ((pass_filter & BAKE_FILTER_EMISSION) != 0)
- out += L.emission;
-
- break;
- }
- case SHADER_EVAL_SHADOW: {
- out = L.shadow;
- break;
- }
- case SHADER_EVAL_DIFFUSE: {
- out = kernel_bake_evaluate_direct_indirect(
- kg, &sd, &state, L.direct_diffuse, L.indirect_diffuse, type, pass_filter);
- break;
- }
- case SHADER_EVAL_GLOSSY: {
- out = kernel_bake_evaluate_direct_indirect(
- kg, &sd, &state, L.direct_glossy, L.indirect_glossy, type, pass_filter);
- break;
- }
- case SHADER_EVAL_TRANSMISSION: {
- out = kernel_bake_evaluate_direct_indirect(
- kg, &sd, &state, L.direct_transmission, L.indirect_transmission, type, pass_filter);
- break;
- }
-# endif
-
- /* extra */
- case SHADER_EVAL_ENVIRONMENT: {
- /* setup ray */
- Ray ray;
-
- ray.P = zero_float3();
- ray.D = normalize(P);
- ray.t = 0.0f;
-# ifdef __CAMERA_MOTION__
- ray.time = 0.5f;
-# endif
-
-# ifdef __RAY_DIFFERENTIALS__
- ray.dD = differential3_zero();
- ray.dP = differential3_zero();
-# endif
-
- /* setup shader data */
- shader_setup_from_background(kg, &sd, &ray);
-
- /* evaluate */
- int path_flag = 0; /* we can't know which type of BSDF this is for */
- shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION);
- out = shader_background_eval(&sd);
- break;
- }
- default: {
- /* no real shader, returning the position of the verts for debugging */
- out = normalize(P);
- break;
- }
- }
-
- /* write output */
- const float4 result = make_float4(out.x, out.y, out.z, 1.0f);
- kernel_write_pass_float4(output, result);
-}
-
-#endif /* __BAKING__ */
+CCL_NAMESPACE_BEGIN
-ccl_device void kernel_displace_evaluate(KernelGlobals *kg,
- ccl_global uint4 *input,
+ccl_device void kernel_displace_evaluate(const KernelGlobals *kg,
+ ccl_global const KernelShaderEvalInput *input,
ccl_global float4 *output,
- int i)
+ const int offset)
{
- ShaderData sd;
- PathState state = {0};
- uint4 in = input[i];
+ /* Setup shader data. */
+ const KernelShaderEvalInput in = input[offset];
- /* setup shader data */
- int object = in.x;
- int prim = in.y;
- float u = __uint_as_float(in.z);
- float v = __uint_as_float(in.w);
-
- shader_setup_from_displace(kg, &sd, object, prim, u, v);
+ ShaderData sd;
+ shader_setup_from_displace(kg, &sd, in.object, in.prim, in.u, in.v);
- /* evaluate */
- float3 P = sd.P;
- shader_eval_displacement(kg, &sd, &state);
+ /* Evaluate displacement shader. */
+ const float3 P = sd.P;
+ shader_eval_displacement(INTEGRATOR_STATE_PASS_NULL, &sd);
float3 D = sd.P - P;
object_inverse_dir_transform(kg, &sd, &D);
- /* write output */
- output[i] += make_float4(D.x, D.y, D.z, 0.0f);
+ /* Write output. */
+ output[offset] += make_float4(D.x, D.y, D.z, 0.0f);
}
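/* Editorial note, not part of the patch: shader_eval_displacement() moves
 * sd.P in world space, so the delta D is mapped back through the inverse
 * object transform above because displacement results are applied to
 * object-space vertex positions. */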
-ccl_device void kernel_background_evaluate(KernelGlobals *kg,
- ccl_global uint4 *input,
+ccl_device void kernel_background_evaluate(const KernelGlobals *kg,
+ ccl_global const KernelShaderEvalInput *input,
ccl_global float4 *output,
- int i)
+ const int offset)
{
- ShaderData sd;
- PathState state = {0};
- uint4 in = input[i];
-
- /* setup ray */
- Ray ray;
- float u = __uint_as_float(in.x);
- float v = __uint_as_float(in.y);
-
- ray.P = zero_float3();
- ray.D = equirectangular_to_direction(u, v);
- ray.t = 0.0f;
-#ifdef __CAMERA_MOTION__
- ray.time = 0.5f;
-#endif
+ /* Setup ray */
+ const KernelShaderEvalInput in = input[offset];
+ const float3 ray_P = zero_float3();
+ const float3 ray_D = equirectangular_to_direction(in.u, in.v);
+ const float ray_time = 0.5f;
-#ifdef __RAY_DIFFERENTIALS__
- ray.dD = differential3_zero();
- ray.dP = differential3_zero();
-#endif
-
- /* setup shader data */
- shader_setup_from_background(kg, &sd, &ray);
+ /* Setup shader data. */
+ ShaderData sd;
+ shader_setup_from_background(kg, &sd, ray_P, ray_D, ray_time);
- /* evaluate */
- int path_flag = 0; /* we can't know which type of BSDF this is for */
- shader_eval_surface(kg, &sd, &state, NULL, path_flag | PATH_RAY_EMISSION);
- float3 color = shader_background_eval(&sd);
+ /* Evaluate shader.
+ * This is being evaluated for all BSDFs, so path flag does not contain a specific type. */
+ const int path_flag = PATH_RAY_EMISSION;
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
+ INTEGRATOR_STATE_PASS_NULL, &sd, NULL, path_flag);
+ const float3 color = shader_background_eval(&sd);
- /* write output */
- output[i] += make_float4(color.x, color.y, color.z, 0.0f);
+ /* Write output. */
+ output[offset] += make_float4(color.x, color.y, color.z, 0.0f);
}
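/* Editorial sketch, not part of the patch: a minimal serial loop showing how
 * the two kernels above could be driven over a batch of evaluation points.
 * The `example_` function and the `background` flag are illustrative only;
 * the real dispatch happens per device. */
ccl_device void example_shader_eval_loop(const KernelGlobals *kg,
                                         ccl_global const KernelShaderEvalInput *input,
                                         ccl_global float4 *output,
                                         const int num_points,
                                         const bool background)
{
  for (int i = 0; i < num_points; i++) {
    if (background) {
      kernel_background_evaluate(kg, input, output, i);
    }
    else {
      kernel_displace_evaluate(kg, input, output, i);
    }
  }
}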
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_camera.h b/intern/cycles/kernel/kernel_camera.h
index 1bfac37158d..7be5da8fe6d 100644
--- a/intern/cycles/kernel/kernel_camera.h
+++ b/intern/cycles/kernel/kernel_camera.h
@@ -14,6 +14,13 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel_differential.h"
+#include "kernel_lookup_table.h"
+#include "kernel_montecarlo.h"
+#include "kernel_projection.h"
+
CCL_NAMESPACE_BEGIN
/* Perspective Camera */
@@ -39,7 +46,7 @@ ccl_device float2 camera_sample_aperture(ccl_constant KernelCamera *cam, float u
return bokeh;
}
-ccl_device void camera_sample_perspective(KernelGlobals *kg,
+ccl_device void camera_sample_perspective(const KernelGlobals *ccl_restrict kg,
float raster_x,
float raster_y,
float lens_u,
@@ -113,10 +120,14 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
#ifdef __RAY_DIFFERENTIALS__
float3 Dcenter = transform_direction(&cameratoworld, Pcamera);
-
- ray->dP = differential3_zero();
- ray->dD.dx = normalize(Dcenter + float4_to_float3(kernel_data.cam.dx)) - normalize(Dcenter);
- ray->dD.dy = normalize(Dcenter + float4_to_float3(kernel_data.cam.dy)) - normalize(Dcenter);
+ float3 Dcenter_normalized = normalize(Dcenter);
+
+ /* TODO: can this be optimized to give compact differentials directly? */
+ ray->dP = differential_zero_compact();
+ differential3 dD;
+ dD.dx = normalize(Dcenter + float4_to_float3(kernel_data.cam.dx)) - Dcenter_normalized;
+ dD.dy = normalize(Dcenter + float4_to_float3(kernel_data.cam.dy)) - Dcenter_normalized;
+ ray->dD = differential_make_compact(dD);
#endif
}
else {
@@ -143,8 +154,10 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
Dx = normalize(transform_direction(&cameratoworld, Dx));
spherical_stereo_transform(&kernel_data.cam, &Px, &Dx);
- ray->dP.dx = Px - Pcenter;
- ray->dD.dx = Dx - Dcenter;
+ differential3 dP, dD;
+
+ dP.dx = Px - Pcenter;
+ dD.dx = Dx - Dcenter;
float3 Py = Pnostereo;
float3 Dy = transform_perspective(&rastertocamera,
@@ -152,8 +165,10 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
Dy = normalize(transform_direction(&cameratoworld, Dy));
spherical_stereo_transform(&kernel_data.cam, &Py, &Dy);
- ray->dP.dy = Py - Pcenter;
- ray->dD.dy = Dy - Dcenter;
+ dP.dy = Py - Pcenter;
+ dD.dy = Dy - Dcenter;
+ ray->dD = differential_make_compact(dD);
+ ray->dP = differential_make_compact(dP);
#endif
}
@@ -162,8 +177,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
float z_inv = 1.0f / normalize(Pcamera).z;
float nearclip = kernel_data.cam.nearclip * z_inv;
ray->P += nearclip * ray->D;
- ray->dP.dx += nearclip * ray->dD.dx;
- ray->dP.dy += nearclip * ray->dD.dy;
+ ray->dP += nearclip * ray->dD;
ray->t = kernel_data.cam.cliplength * z_inv;
#else
ray->t = FLT_MAX;
@@ -171,7 +185,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg,
}
/* Orthographic Camera */
-ccl_device void camera_sample_orthographic(KernelGlobals *kg,
+ccl_device void camera_sample_orthographic(const KernelGlobals *ccl_restrict kg,
float raster_x,
float raster_y,
float lens_u,
@@ -220,10 +234,12 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg,
#ifdef __RAY_DIFFERENTIALS__
/* ray differential */
- ray->dP.dx = float4_to_float3(kernel_data.cam.dx);
- ray->dP.dy = float4_to_float3(kernel_data.cam.dy);
+ differential3 dP;
+ dP.dx = float4_to_float3(kernel_data.cam.dx);
+ dP.dy = float4_to_float3(kernel_data.cam.dy);
- ray->dD = differential3_zero();
+ ray->dP = differential_make_compact(dP);
+ ray->dD = differential_zero_compact();
#endif
#ifdef __CAMERA_CLIPPING__
@@ -323,8 +339,9 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
spherical_stereo_transform(cam, &Px, &Dx);
}
- ray->dP.dx = Px - Pcenter;
- ray->dD.dx = Dx - Dcenter;
+ differential3 dP, dD;
+ dP.dx = Px - Pcenter;
+ dD.dx = Dx - Dcenter;
float3 Py = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y + 1.0f, 0.0f));
float3 Dy = panorama_to_direction(cam, Py.x, Py.y);
@@ -334,16 +351,17 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
spherical_stereo_transform(cam, &Py, &Dy);
}
- ray->dP.dy = Py - Pcenter;
- ray->dD.dy = Dy - Dcenter;
+ dP.dy = Py - Pcenter;
+ dD.dy = Dy - Dcenter;
+ ray->dD = differential_make_compact(dD);
+ ray->dP = differential_make_compact(dP);
#endif
#ifdef __CAMERA_CLIPPING__
/* clipping */
float nearclip = cam->nearclip;
ray->P += nearclip * ray->D;
- ray->dP.dx += nearclip * ray->dD.dx;
- ray->dP.dy += nearclip * ray->dD.dy;
+ ray->dP += nearclip * ray->dD;
ray->t = cam->cliplength;
#else
ray->t = FLT_MAX;
@@ -352,7 +370,7 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
/* Common */
-ccl_device_inline void camera_sample(KernelGlobals *kg,
+ccl_device_inline void camera_sample(const KernelGlobals *ccl_restrict kg,
int x,
int y,
float filter_u,
@@ -426,13 +444,13 @@ ccl_device_inline void camera_sample(KernelGlobals *kg,
/* Utilities */
-ccl_device_inline float3 camera_position(KernelGlobals *kg)
+ccl_device_inline float3 camera_position(const KernelGlobals *kg)
{
Transform cameratoworld = kernel_data.cam.cameratoworld;
return make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w);
}
-ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P)
+ccl_device_inline float camera_distance(const KernelGlobals *kg, float3 P)
{
Transform cameratoworld = kernel_data.cam.cameratoworld;
float3 camP = make_float3(cameratoworld.x.w, cameratoworld.y.w, cameratoworld.z.w);
@@ -446,7 +464,7 @@ ccl_device_inline float camera_distance(KernelGlobals *kg, float3 P)
}
}
-ccl_device_inline float camera_z_depth(KernelGlobals *kg, float3 P)
+ccl_device_inline float camera_z_depth(const KernelGlobals *kg, float3 P)
{
if (kernel_data.cam.type != CAMERA_PANORAMA) {
Transform worldtocamera = kernel_data.cam.worldtocamera;
@@ -459,7 +477,7 @@ ccl_device_inline float camera_z_depth(KernelGlobals *kg, float3 P)
}
}
-ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P)
+ccl_device_inline float3 camera_direction_from_point(const KernelGlobals *kg, float3 P)
{
Transform cameratoworld = kernel_data.cam.cameratoworld;
@@ -473,7 +491,7 @@ ccl_device_inline float3 camera_direction_from_point(KernelGlobals *kg, float3 P
}
}
-ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd, float3 P)
+ccl_device_inline float3 camera_world_to_ndc(const KernelGlobals *kg, ShaderData *sd, float3 P)
{
if (kernel_data.cam.type != CAMERA_PANORAMA) {
/* perspective / ortho */
diff --git a/intern/cycles/kernel/kernel_color.h b/intern/cycles/kernel/kernel_color.h
index 5eb1bdad02e..960774e0741 100644
--- a/intern/cycles/kernel/kernel_color.h
+++ b/intern/cycles/kernel/kernel_color.h
@@ -14,25 +14,22 @@
* limitations under the License.
*/
-#ifndef __KERNEL_COLOR_H__
-#define __KERNEL_COLOR_H__
+#pragma once
#include "util/util_color.h"
CCL_NAMESPACE_BEGIN
-ccl_device float3 xyz_to_rgb(KernelGlobals *kg, float3 xyz)
+ccl_device float3 xyz_to_rgb(const KernelGlobals *kg, float3 xyz)
{
return make_float3(dot(float4_to_float3(kernel_data.film.xyz_to_r), xyz),
dot(float4_to_float3(kernel_data.film.xyz_to_g), xyz),
dot(float4_to_float3(kernel_data.film.xyz_to_b), xyz));
}
-ccl_device float linear_rgb_to_gray(KernelGlobals *kg, float3 c)
+ccl_device float linear_rgb_to_gray(const KernelGlobals *kg, float3 c)
{
return dot(c, float4_to_float3(kernel_data.film.rgb_to_y));
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_COLOR_H__ */
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
deleted file mode 100644
index 4a9304a134c..00000000000
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_COMPAT_OPENCL_H__
-#define __KERNEL_COMPAT_OPENCL_H__
-
-#define __KERNEL_GPU__
-#define __KERNEL_OPENCL__
-
-/* no namespaces in opencl */
-#define CCL_NAMESPACE_BEGIN
-#define CCL_NAMESPACE_END
-
-#ifdef __CL_NOINLINE__
-# define ccl_noinline __attribute__((noinline))
-#else
-# define ccl_noinline
-#endif
-
-/* in opencl all functions are device functions, so leave this empty */
-#define ccl_device
-#define ccl_device_inline ccl_device
-#define ccl_device_forceinline ccl_device
-#define ccl_device_noinline ccl_device ccl_noinline
-#define ccl_device_noinline_cpu ccl_device
-#define ccl_may_alias
-#define ccl_static_constant static __constant
-#define ccl_constant __constant
-#define ccl_global __global
-#define ccl_local __local
-#define ccl_local_param __local
-#define ccl_private __private
-#define ccl_restrict restrict
-#define ccl_ref
-#define ccl_align(n) __attribute__((aligned(n)))
-#define ccl_optional_struct_init
-
-#if __OPENCL_VERSION__ >= 200 && !defined(__NV_CL_C_VERSION)
-# define ccl_loop_no_unroll __attribute__((opencl_unroll_hint(1)))
-#else
-# define ccl_loop_no_unroll
-#endif
-
-#ifdef __SPLIT_KERNEL__
-# define ccl_addr_space __global
-#else
-# define ccl_addr_space
-#endif
-
-#define ATTR_FALLTHROUGH
-
-#define ccl_local_id(d) get_local_id(d)
-#define ccl_global_id(d) get_global_id(d)
-
-#define ccl_local_size(d) get_local_size(d)
-#define ccl_global_size(d) get_global_size(d)
-
-#define ccl_group_id(d) get_group_id(d)
-#define ccl_num_groups(d) get_num_groups(d)
-
-/* Selective nodes compilation. */
-#ifndef __NODES_MAX_GROUP__
-# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
-#endif
-#ifndef __NODES_FEATURES__
-# define __NODES_FEATURES__ NODE_FEATURE_ALL
-#endif
-
-/* no assert in opencl */
-#define kernel_assert(cond)
-
-/* make_type definitions with opencl style element initializers */
-#ifdef make_float2
-# undef make_float2
-#endif
-#ifdef make_float3
-# undef make_float3
-#endif
-#ifdef make_float4
-# undef make_float4
-#endif
-#ifdef make_int2
-# undef make_int2
-#endif
-#ifdef make_int3
-# undef make_int3
-#endif
-#ifdef make_int4
-# undef make_int4
-#endif
-#ifdef make_uchar4
-# undef make_uchar4
-#endif
-
-#define make_float2(x, y) ((float2)(x, y))
-#define make_float3(x, y, z) ((float3)(x, y, z))
-#define make_float4(x, y, z, w) ((float4)(x, y, z, w))
-#define make_int2(x, y) ((int2)(x, y))
-#define make_int3(x, y, z) ((int3)(x, y, z))
-#define make_int4(x, y, z, w) ((int4)(x, y, z, w))
-#define make_uchar4(x, y, z, w) ((uchar4)(x, y, z, w))
-
-/* math functions */
-#define __uint_as_float(x) as_float(x)
-#define __float_as_uint(x) as_uint(x)
-#define __int_as_float(x) as_float(x)
-#define __float_as_int(x) as_int(x)
-#define powf(x, y) pow(((float)(x)), ((float)(y)))
-#define fabsf(x) fabs(((float)(x)))
-#define copysignf(x, y) copysign(((float)(x)), ((float)(y)))
-#define asinf(x) asin(((float)(x)))
-#define acosf(x) acos(((float)(x)))
-#define atanf(x) atan(((float)(x)))
-#define floorf(x) floor(((float)(x)))
-#define ceilf(x) ceil(((float)(x)))
-#define hypotf(x, y) hypot(((float)(x)), ((float)(y)))
-#define atan2f(x, y) atan2(((float)(x)), ((float)(y)))
-#define fmaxf(x, y) fmax(((float)(x)), ((float)(y)))
-#define fminf(x, y) fmin(((float)(x)), ((float)(y)))
-#define fmodf(x, y) fmod((float)(x), (float)(y))
-#define sinhf(x) sinh(((float)(x)))
-#define coshf(x) cosh(((float)(x)))
-#define tanhf(x) tanh(((float)(x)))
-
-/* Use native functions with possibly lower precision for performance,
- * no issues found so far. */
-#if 1
-# define sinf(x) native_sin(((float)(x)))
-# define cosf(x) native_cos(((float)(x)))
-# define tanf(x) native_tan(((float)(x)))
-# define expf(x) native_exp(((float)(x)))
-# define sqrtf(x) native_sqrt(((float)(x)))
-# define logf(x) native_log(((float)(x)))
-# define rcp(x) native_recip(x)
-#else
-# define sinf(x) sin(((float)(x)))
-# define cosf(x) cos(((float)(x)))
-# define tanf(x) tan(((float)(x)))
-# define expf(x) exp(((float)(x)))
-# define sqrtf(x) sqrt(((float)(x)))
-# define logf(x) log(((float)(x)))
-# define rcp(x) recip(x)
-#endif
-
-/* data lookup defines */
-#define kernel_data (*kg->data)
-#define kernel_tex_array(tex) \
- ((const ccl_global tex##_t *)(kg->buffers[kg->tex.cl_buffer] + kg->tex.data))
-#define kernel_tex_fetch(tex, index) kernel_tex_array(tex)[(index)]
-
-/* define NULL */
-#ifndef NULL
-# define NULL ((void *)0)
-#endif
-
-/* enable extensions */
-#ifdef __KERNEL_CL_KHR_FP16__
-# pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif
-
-#include "util/util_half.h"
-#include "util/util_types.h"
-
-#endif /* __KERNEL_COMPAT_OPENCL_H__ */
diff --git a/intern/cycles/kernel/kernel_differential.h b/intern/cycles/kernel/kernel_differential.h
index 3ec0cdbaccc..db4e110bd10 100644
--- a/intern/cycles/kernel/kernel_differential.h
+++ b/intern/cycles/kernel/kernel_differential.h
@@ -14,26 +14,28 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
/* See "Tracing Ray Differentials", Homan Igehy, 1999. */
-ccl_device void differential_transfer(ccl_addr_space differential3 *dP_,
- const differential3 dP,
- float3 D,
- const differential3 dD,
- float3 Ng,
- float t)
+ccl_device void differential_transfer(ccl_addr_space differential3 *surface_dP,
+ const differential3 ray_dP,
+ float3 ray_D,
+ const differential3 ray_dD,
+ float3 surface_Ng,
+ float ray_t)
{
/* ray differential transfer through homogeneous medium, to
* compute dPdx/dy at a shading point from the incoming ray */
- float3 tmp = D / dot(D, Ng);
- float3 tmpx = dP.dx + t * dD.dx;
- float3 tmpy = dP.dy + t * dD.dy;
+ float3 tmp = ray_D / dot(ray_D, surface_Ng);
+ float3 tmpx = ray_dP.dx + ray_t * ray_dD.dx;
+ float3 tmpy = ray_dP.dy + ray_t * ray_dD.dy;
- dP_->dx = tmpx - dot(tmpx, Ng) * tmp;
- dP_->dy = tmpy - dot(tmpy, Ng) * tmp;
+ surface_dP->dx = tmpx - dot(tmpx, surface_Ng) * tmp;
+ surface_dP->dy = tmpy - dot(tmpy, surface_Ng) * tmp;
}
ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD)
@@ -112,4 +114,53 @@ ccl_device differential3 differential3_zero()
return d;
}
+/* Compact ray differentials that are just a scale to reduce memory usage and
+ * access cost in GPU.
+ *
+ * See above for more accurate reference implementations.
+ *
+ * TODO: also store the more compact version in ShaderData and recompute where
+ * needed? */
+
+ccl_device_forceinline float differential_zero_compact()
+{
+ return 0.0f;
+}
+
+ccl_device_forceinline float differential_make_compact(const differential3 D)
+{
+ return 0.5f * (len(D.dx) + len(D.dy));
+}
+
+ccl_device_forceinline void differential_transfer_compact(ccl_addr_space differential3 *surface_dP,
+ const float ray_dP,
+ const float3 /* ray_D */,
+ const float ray_dD,
+ const float3 surface_Ng,
+ const float ray_t)
+{
+ /* ray differential transfer through homogeneous medium, to
+ * compute dPdx/dy at a shading point from the incoming ray */
+ float scale = ray_dP + ray_t * ray_dD;
+
+ float3 dx, dy;
+ make_orthonormals(surface_Ng, &dx, &dy);
+ surface_dP->dx = dx * scale;
+ surface_dP->dy = dy * scale;
+}
+
+ccl_device_forceinline void differential_incoming_compact(ccl_addr_space differential3 *dI,
+ const float3 D,
+ const float dD)
+{
+ /* Compute dIdx/dy at a shading point; we just need to negate the
+ * differential of the ray direction. */
+
+ float3 dx, dy;
+ make_orthonormals(D, &dx, &dy);
+
+ dI->dx = dD * dx;
+ dI->dy = dD * dy;
+}
+
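/* Editorial sketch, not part of the patch: the compact round-trip in one
 * place. Collapsing to a scalar keeps only an average footprint radius; the
 * anisotropy of the full differential is intentionally lost. The `example_`
 * function is illustrative only. */
ccl_device_inline void example_compact_round_trip(const float3 D,
                                                  const differential3 dD,
                                                  ccl_addr_space differential3 *dI)
{
  /* One float instead of two float3s: 0.5f * (len(dD.dx) + len(dD.dy)). */
  const float dD_compact = differential_make_compact(dD);

  /* Re-expand into an isotropic footprint in the plane orthogonal to D. */
  differential_incoming_compact(dI, D, dD_compact);
}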
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index aebf2ec8e28..d62285d173d 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -14,40 +14,36 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel/kernel_light.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_path_state.h"
+#include "kernel/kernel_shader.h"
+
CCL_NAMESPACE_BEGIN
-/* Direction Emission */
-ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
- ShaderData *emission_sd,
- LightSample *ls,
- ccl_addr_space PathState *state,
- float3 I,
- differential3 dI,
- float t,
- float time)
+/* Evaluate shader on light. */
+ccl_device_noinline_cpu float3 light_sample_shader_eval(INTEGRATOR_STATE_ARGS,
+ ShaderData *ccl_restrict emission_sd,
+ LightSample *ccl_restrict ls,
+ float time)
{
/* setup shading at emitter */
float3 eval = zero_float3();
if (shader_constant_emission_eval(kg, ls->shader, &eval)) {
- if ((ls->prim != PRIM_NONE) && dot(ls->Ng, I) < 0.0f) {
+ if ((ls->prim != PRIM_NONE) && dot(ls->Ng, ls->D) > 0.0f) {
ls->Ng = -ls->Ng;
}
}
else {
/* Setup shader data and call shader_eval_surface once, better
* for GPU coherence and compile times. */
+ PROFILING_INIT_FOR_SHADER(kg, PROFILING_SHADE_LIGHT_SETUP);
#ifdef __BACKGROUND_MIS__
if (ls->type == LIGHT_BACKGROUND) {
- Ray ray;
- ray.D = ls->D;
- ray.P = ls->P;
- ray.t = 1.0f;
- ray.time = time;
- ray.dP = differential3_zero();
- ray.dD = dI;
-
- shader_setup_from_background(kg, emission_sd, &ray);
+ shader_setup_from_background(kg, emission_sd, ls->P, ls->D, time);
}
else
#endif
@@ -56,13 +52,13 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
emission_sd,
ls->P,
ls->Ng,
- I,
+ -ls->D,
ls->shader,
ls->object,
ls->prim,
ls->u,
ls->v,
- t,
+ ls->t,
time,
false,
ls->lamp);
@@ -70,11 +66,13 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
ls->Ng = emission_sd->Ng;
}
+ PROFILING_SHADER(emission_sd->object, emission_sd->shader);
+ PROFILING_EVENT(PROFILING_SHADE_LIGHT_EVAL);
+
/* No proper path flag, we're evaluating this for all closures. that's
* weak but we'd have to do multiple evaluations otherwise. */
- path_state_modify_bounce(state, true);
- shader_eval_surface(kg, emission_sd, state, NULL, PATH_RAY_EMISSION);
- path_state_modify_bounce(state, false);
+ shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
+ INTEGRATOR_STATE_PASS, emission_sd, NULL, PATH_RAY_EMISSION);
/* Evaluate closures. */
#ifdef __BACKGROUND_MIS__
@@ -98,85 +96,129 @@ ccl_device_noinline_cpu float3 direct_emissive_eval(KernelGlobals *kg,
return eval;
}
-ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- LightSample *ls,
- ccl_addr_space PathState *state,
- Ray *ray,
- BsdfEval *eval,
- bool *is_lamp,
- float rand_terminate)
+/* Test if light sample is from a light or emission from geometry. */
+ccl_device_inline bool light_sample_is_light(const LightSample *ccl_restrict ls)
{
- if (ls->pdf == 0.0f)
- return false;
-
- /* todo: implement */
- differential3 dD = differential3_zero();
+ /* True for a lamp, as opposed to emissive geometry or the background (shadow pass). */
+ return (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND);
+}
- /* evaluate closure */
+/* Early path termination of shadow rays. */
+ccl_device_inline bool light_sample_terminate(const KernelGlobals *ccl_restrict kg,
+ const LightSample *ccl_restrict ls,
+ BsdfEval *ccl_restrict eval,
+ const float rand_terminate)
+{
+ if (bsdf_eval_is_zero(eval)) {
+ return true;
+ }
- float3 light_eval = direct_emissive_eval(
- kg, emission_sd, ls, state, -ls->D, dD, ls->t, sd->time);
+ if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
+ float probability = max3(fabs(bsdf_eval_sum(eval))) *
+ kernel_data.integrator.light_inv_rr_threshold;
+ if (probability < 1.0f) {
+ if (rand_terminate >= probability) {
+ return true;
+ }
+ bsdf_eval_mul(eval, 1.0f / probability);
+ }
+ }
- if (is_zero(light_eval))
- return false;
+ return false;
+}
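/* Editorial note, not part of the patch: the threshold test above is plain
 * Russian roulette. A shadow ray with survival probability p < 1 is either
 * dropped or kept with its contribution scaled by 1 / p, so the expectation
 * p * (eval / p) + (1 - p) * 0 = eval is preserved and the estimator stays
 * unbiased while dim shadow rays are culled early. */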
- /* evaluate BSDF at shading point */
+/* This function should be used to compute a modified ray start position for
+ * rays leaving a surface. The algorithm slightly distorts the flat surface
+ * of a triangle: the surface is lifted by an amount h along the normal n at
+ * the incident point. */
-#ifdef __VOLUME__
- if (sd->prim != PRIM_NONE)
- shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
+ccl_device_inline float3 shadow_ray_smooth_surface_offset(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ float3 Ng)
+{
+ float3 V[3], N[3];
+ triangle_vertices_and_normals(kg, sd->prim, V, N);
+
+ const float u = sd->u, v = sd->v;
+ const float w = 1 - u - v;
+ float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */
+ float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */
+
+ object_normal_transform(kg, sd, &n); /* Normal x scale, world space */
+
+ /* Parabolic approximation: h is a quadratic in (u, v) that vanishes at all
+ * three triangle vertices. */
+ float a = dot(N[2] - N[0], V[0] - V[2]);
+ float b = dot(N[2] - N[1], V[1] - V[2]);
+ float c = dot(N[1] - N[0], V[1] - V[0]);
+ float h = a * u * (u - 1) + (a + b + c) * u * v + b * v * (v - 1);
+
+ /* Check flipped normals */
+ if (dot(n, Ng) > 0) {
+ /* Local linear envelope */
+ float h0 = max(max(dot(V[1] - V[0], N[0]), dot(V[2] - V[0], N[0])), 0.0f);
+ float h1 = max(max(dot(V[0] - V[1], N[1]), dot(V[2] - V[1], N[1])), 0.0f);
+ float h2 = max(max(dot(V[0] - V[2], N[2]), dot(V[1] - V[2], N[2])), 0.0f);
+ h0 = max(dot(V[0] - P, N[0]) + h0, 0.0f);
+ h1 = max(dot(V[1] - P, N[1]) + h1, 0.0f);
+ h2 = max(dot(V[2] - P, N[2]) + h2, 0.0f);
+ h = max(min(min(h0, h1), h2), h * 0.5f);
+ }
else {
- float bsdf_pdf;
- shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf);
- if (ls->shader & SHADER_USE_MIS) {
- /* Multiple importance sampling. */
- float mis_weight = power_heuristic(ls->pdf, bsdf_pdf);
- light_eval *= mis_weight;
- }
+ float h0 = max(max(dot(V[0] - V[1], N[0]), dot(V[0] - V[2], N[0])), 0.0f);
+ float h1 = max(max(dot(V[1] - V[0], N[1]), dot(V[1] - V[2], N[1])), 0.0f);
+ float h2 = max(max(dot(V[2] - V[0], N[2]), dot(V[2] - V[1], N[2])), 0.0f);
+ h0 = max(dot(P - V[0], N[0]) + h0, 0.0f);
+ h1 = max(dot(P - V[1], N[1]) + h1, 0.0f);
+ h2 = max(dot(P - V[2], N[2]) + h2, 0.0f);
+ h = min(-min(min(h0, h1), h2), h * 0.5f);
}
-#else
- shader_bsdf_eval(kg, sd, ls->D, eval, ls->pdf, ls->shader & SHADER_USE_MIS);
-#endif
- bsdf_eval_mul3(eval, light_eval / ls->pdf);
-
-#ifdef __PASSES__
- /* use visibility flag to skip lights */
- if (ls->shader & SHADER_EXCLUDE_ANY) {
- if (ls->shader & SHADER_EXCLUDE_DIFFUSE)
- eval->diffuse = zero_float3();
- if (ls->shader & SHADER_EXCLUDE_GLOSSY)
- eval->glossy = zero_float3();
- if (ls->shader & SHADER_EXCLUDE_TRANSMIT)
- eval->transmission = zero_float3();
- if (ls->shader & SHADER_EXCLUDE_SCATTER)
- eval->volume = zero_float3();
- }
-#endif
+ return n * h;
+}
- if (bsdf_eval_is_zero(eval))
- return false;
+/* Ray offset to avoid shadow terminator artifact. */
- if (kernel_data.integrator.light_inv_rr_threshold > 0.0f
-#ifdef __SHADOW_TRICKS__
- && (state->flag & PATH_RAY_SHADOW_CATCHER) == 0
-#endif
- ) {
- float probability = max3(fabs(bsdf_eval_sum(eval))) *
- kernel_data.integrator.light_inv_rr_threshold;
- if (probability < 1.0f) {
- if (rand_terminate >= probability) {
- return false;
+ccl_device_inline float3 shadow_ray_offset(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ float3 L)
+{
+ float NL = dot(sd->N, L);
+ bool transmit = (NL < 0.0f);
+ float3 Ng = (transmit ? -sd->Ng : sd->Ng);
+ float3 P = ray_offset(sd->P, Ng);
+
+ if ((sd->type & PRIMITIVE_ALL_TRIANGLE) && (sd->shader & SHADER_SMOOTH_NORMAL)) {
+ const float offset_cutoff =
+ kernel_tex_fetch(__objects, sd->object).shadow_terminator_geometry_offset;
+ /* Do the (expensive) ray offset only for triangles that are close to being
+ * terminated: offset_cutoff = 0.1f means that 10-20% of rays will be
+ * affected. Also make a smooth transition near the threshold. */
+ if (offset_cutoff > 0.0f) {
+ float NgL = dot(Ng, L);
+ float offset_amount = 0.0f;
+ if (NL < offset_cutoff) {
+ offset_amount = clamp(2.0f - (NgL + NL) / offset_cutoff, 0.0f, 1.0f);
+ }
+ else {
+ offset_amount = clamp(1.0f - NgL / offset_cutoff, 0.0f, 1.0f);
+ }
+ if (offset_amount > 0.0f) {
+ P += shadow_ray_smooth_surface_offset(kg, sd, Ng) * offset_amount;
}
- bsdf_eval_mul(eval, 1.0f / probability);
}
}
+ return P;
+}
+
+ccl_device_inline void shadow_ray_setup(const ShaderData *ccl_restrict sd,
+ const LightSample *ccl_restrict ls,
+ const float3 P,
+ Ray *ray)
+{
if (ls->shader & SHADER_CAST_SHADOW) {
/* setup ray */
- ray->P = ray_offset_shadow(kg, sd, ls->D);
+ ray->P = P;
if (ls->t == FLT_MAX) {
/* distant light */
@@ -185,160 +227,40 @@ ccl_device_noinline_cpu bool direct_emission(KernelGlobals *kg,
}
else {
/* other lights, avoid self-intersection */
- ray->D = ray_offset(ls->P, ls->Ng) - ray->P;
+ ray->D = ray_offset(ls->P, ls->Ng) - P;
ray->D = normalize_len(ray->D, &ray->t);
}
-
- ray->dP = sd->dP;
- ray->dD = differential3_zero();
}
else {
/* signal to not cast shadow ray */
+ ray->P = zero_float3();
+ ray->D = zero_float3();
ray->t = 0.0f;
}
- /* return if it's a lamp for shadow pass */
- *is_lamp = (ls->prim == PRIM_NONE && ls->type != LIGHT_BACKGROUND);
-
- return true;
+ ray->dP = differential_make_compact(sd->dP);
+ ray->dD = differential_zero_compact();
+ ray->time = sd->time;
}
-/* Indirect Primitive Emission */
-
-ccl_device_noinline_cpu float3 indirect_primitive_emission(
- KernelGlobals *kg, ShaderData *sd, float t, int path_flag, float bsdf_pdf)
+/* Create shadow ray towards light sample. */
+ccl_device_inline void light_sample_to_surface_shadow_ray(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ const LightSample *ccl_restrict ls,
+ Ray *ray)
{
- /* evaluate emissive closure */
- float3 L = shader_emissive_eval(sd);
-
-#ifdef __HAIR__
- if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) &&
- (sd->type & PRIMITIVE_ALL_TRIANGLE))
-#else
- if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
-#endif
- {
- /* multiple importance sampling, get triangle light pdf,
- * and compute weight with respect to BSDF pdf */
- float pdf = triangle_light_pdf(kg, sd, t);
- float mis_weight = power_heuristic(bsdf_pdf, pdf);
-
- return L * mis_weight;
- }
-
- return L;
+ const float3 P = shadow_ray_offset(kg, sd, ls->D);
+ shadow_ray_setup(sd, ls, P, ray);
}
-/* Indirect Lamp Emission */
-
-ccl_device_noinline_cpu void indirect_lamp_emission(KernelGlobals *kg,
- ShaderData *emission_sd,
- ccl_addr_space PathState *state,
- PathRadiance *L,
- Ray *ray,
- float3 throughput)
+/* Create shadow ray towards light sample. */
+ccl_device_inline void light_sample_to_volume_shadow_ray(const KernelGlobals *ccl_restrict kg,
+ const ShaderData *ccl_restrict sd,
+ const LightSample *ccl_restrict ls,
+ const float3 P,
+ Ray *ray)
{
- for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
- LightSample ls ccl_optional_struct_init;
-
- if (!lamp_light_eval(kg, lamp, ray->P, ray->D, ray->t, &ls))
- continue;
-
-#ifdef __PASSES__
- /* use visibility flag to skip lights */
- if (ls.shader & SHADER_EXCLUDE_ANY) {
- if (((ls.shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
- ((ls.shader & SHADER_EXCLUDE_GLOSSY) &&
- ((state->flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
- (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
- ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
- ((ls.shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
- continue;
- }
-#endif
-
- float3 lamp_L = direct_emissive_eval(
- kg, emission_sd, &ls, state, -ray->D, ray->dD, ls.t, ray->time);
-
-#ifdef __VOLUME__
- if (state->volume_stack[0].shader != SHADER_NONE) {
- /* shadow attenuation */
- Ray volume_ray = *ray;
- volume_ray.t = ls.t;
- float3 volume_tp = one_float3();
- kernel_volume_shadow(kg, emission_sd, state, &volume_ray, &volume_tp);
- lamp_L *= volume_tp;
- }
-#endif
-
- if (!(state->flag & PATH_RAY_MIS_SKIP)) {
- /* multiple importance sampling, get regular light pdf,
- * and compute weight with respect to BSDF pdf */
- float mis_weight = power_heuristic(state->ray_pdf, ls.pdf);
- lamp_L *= mis_weight;
- }
-
- path_radiance_accum_emission(kg, L, state, throughput, lamp_L);
- }
-}
-
-/* Indirect Background */
-
-ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg,
- ShaderData *emission_sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
- ccl_addr_space Ray *ray)
-{
-#ifdef __BACKGROUND__
- int shader = kernel_data.background.surface_shader;
-
- /* Use visibility flag to skip lights. */
- if (shader & SHADER_EXCLUDE_ANY) {
- if (((shader & SHADER_EXCLUDE_DIFFUSE) && (state->flag & PATH_RAY_DIFFUSE)) ||
- ((shader & SHADER_EXCLUDE_GLOSSY) &&
- ((state->flag & (PATH_RAY_GLOSSY | PATH_RAY_REFLECT)) ==
- (PATH_RAY_GLOSSY | PATH_RAY_REFLECT))) ||
- ((shader & SHADER_EXCLUDE_TRANSMIT) && (state->flag & PATH_RAY_TRANSMIT)) ||
- ((shader & SHADER_EXCLUDE_CAMERA) && (state->flag & PATH_RAY_CAMERA)) ||
- ((shader & SHADER_EXCLUDE_SCATTER) && (state->flag & PATH_RAY_VOLUME_SCATTER)))
- return zero_float3();
- }
-
- /* Evaluate background shader. */
- float3 L = zero_float3();
- if (!shader_constant_emission_eval(kg, shader, &L)) {
-# ifdef __SPLIT_KERNEL__
- Ray priv_ray = *ray;
- shader_setup_from_background(kg, emission_sd, &priv_ray);
-# else
- shader_setup_from_background(kg, emission_sd, ray);
-# endif
-
- path_state_modify_bounce(state, true);
- shader_eval_surface(kg, emission_sd, state, buffer, state->flag | PATH_RAY_EMISSION);
- path_state_modify_bounce(state, false);
-
- L = shader_background_eval(emission_sd);
- }
-
- /* Background MIS weights. */
-# ifdef __BACKGROUND_MIS__
- /* Check if background light exists or if we should skip pdf. */
- if (!(state->flag & PATH_RAY_MIS_SKIP) && kernel_data.background.use_mis) {
- /* multiple importance sampling, get background light pdf for ray
- * direction, and compute weight with respect to BSDF pdf */
- float pdf = background_light_pdf(kg, ray->P, ray->D);
- float mis_weight = power_heuristic(state->ray_pdf, pdf);
-
- return L * mis_weight;
- }
-# endif
-
- return L;
-#else
- return make_float3(0.8f, 0.8f, 0.8f);
-#endif
+ shadow_ray_setup(sd, ls, P, ray);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index a6fd4f1dc7e..715d764fb31 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -14,119 +14,516 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
-ccl_device float4 film_get_pass_result(KernelGlobals *kg,
- ccl_global float *buffer,
- float sample_scale,
- int index,
- bool use_display_sample_scale)
-{
- float4 pass_result;
-
- int display_pass_stride = kernel_data.film.display_pass_stride;
- int display_pass_components = kernel_data.film.display_pass_components;
-
- if (display_pass_components == 4) {
- float4 in = *(ccl_global float4 *)(buffer + display_pass_stride +
- index * kernel_data.film.pass_stride);
- float alpha = use_display_sample_scale ?
- (kernel_data.film.use_display_pass_alpha ? in.w : 1.0f / sample_scale) :
- 1.0f;
-
- pass_result = make_float4(in.x, in.y, in.z, alpha);
-
- int display_divide_pass_stride = kernel_data.film.display_divide_pass_stride;
- if (display_divide_pass_stride != -1) {
- ccl_global float4 *divide_in = (ccl_global float4 *)(buffer + display_divide_pass_stride +
- index * kernel_data.film.pass_stride);
- float3 divided = safe_divide_even_color(float4_to_float3(pass_result),
- float4_to_float3(*divide_in));
- pass_result = make_float4(divided.x, divided.y, divided.z, pass_result.w);
- }
+/* --------------------------------------------------------------------
+ * Common utilities.
+ */
- if (kernel_data.film.use_display_exposure) {
- float exposure = kernel_data.film.exposure;
- pass_result *= make_float4(exposure, exposure, exposure, 1.0f);
- }
+/* The input buffer contains transparency = 1 - alpha; this converts it to
+ * alpha. Also clamp, since alpha might end up outside of 0..1 due to Russian
+ * roulette. */
+ccl_device_forceinline float film_transparency_to_alpha(float transparency)
+{
+ return saturate(1.0f - transparency);
+}
+
+ccl_device_inline float film_get_scale(const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ return kfilm_convert->scale;
+ }
+
+ if (kfilm_convert->pass_use_filter) {
+ const uint sample_count = *((const uint *)(buffer + kfilm_convert->pass_sample_count));
+ return 1.0f / sample_count;
+ }
+
+ return 1.0f;
+}
+
+ccl_device_inline float film_get_scale_exposure(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ return kfilm_convert->scale_exposure;
+ }
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+
+ if (kfilm_convert->pass_use_exposure) {
+ return scale * kfilm_convert->exposure;
+ }
+
+ return scale;
+}
+
+ccl_device_inline bool film_get_scale_and_scale_exposure(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict scale,
+ float *ccl_restrict scale_exposure)
+{
+ if (kfilm_convert->pass_sample_count == PASS_UNUSED) {
+ *scale = kfilm_convert->scale;
+ *scale_exposure = kfilm_convert->scale_exposure;
+ return true;
+ }
+
+ const uint sample_count = *((const uint *)(buffer + kfilm_convert->pass_sample_count));
+ if (!sample_count) {
+ *scale = 0.0f;
+ *scale_exposure = 0.0f;
+ return false;
+ }
+
+ if (kfilm_convert->pass_use_filter) {
+ *scale = 1.0f / sample_count;
}
- else if (display_pass_components == 1) {
- ccl_global float *in = (ccl_global float *)(buffer + display_pass_stride +
- index * kernel_data.film.pass_stride);
- pass_result = make_float4(*in, *in, *in, 1.0f / sample_scale);
+ else {
+ *scale = 1.0f;
+ }
+
+ if (kfilm_convert->pass_use_exposure) {
+ *scale_exposure = *scale * kfilm_convert->exposure;
+ }
+ else {
+ *scale_exposure = *scale;
+ }
+
+ return true;
+}
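/* Editorial note, not part of the patch: a worked example of the scaling
 * helpers above. With a per-pixel sample count pass holding 16 accumulated
 * filter-weighted samples, scale = 1 / 16; with film exposure 2.0, color
 * passes then use scale_exposure = 2.0 / 16 = 0.125, while non-color data
 * (such as alpha or sample counts) uses the plain scale. */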
+
+/* --------------------------------------------------------------------
+ * Float (scalar) passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_depth(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = (f == 0.0f) ? 1e10f : f * scale_exposure;
+}
+
+ccl_device_inline void film_get_pass_pixel_mist(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ /* Note that we accumulate 1 - mist in the kernel to avoid having to
+ * track the mist values in the integrator state. */
+ pixel[0] = saturate(1.0f - f * scale_exposure);
+}
+
+ccl_device_inline void film_get_pass_pixel_sample_count(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ /* TODO(sergey): Consider normalizing into the [0..1] range, so that it is possible to see
+ * a meaningful value when the adaptive sampler stopped rendering the image well before the
+ * maximum number of samples was reached (for example, when the number of samples is set to
+ * 0 in the viewport). */
+
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = __float_as_uint(f) * kfilm_convert->scale;
+}
+
+ccl_device_inline void film_get_pass_pixel_float(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 1);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float f = *in;
+
+ pixel[0] = f * scale_exposure;
+}
+
+/* --------------------------------------------------------------------
+ * Float 3 passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_light_path(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ /* Read light pass. */
+ const float *in = buffer + kfilm_convert->pass_offset;
+ float3 f = make_float3(in[0], in[1], in[2]);
+
+ /* Optionally add indirect light pass. */
+ if (kfilm_convert->pass_indirect != PASS_UNUSED) {
+ const float *in_indirect = buffer + kfilm_convert->pass_indirect;
+ const float3 f_indirect = make_float3(in_indirect[0], in_indirect[1], in_indirect[2]);
+ f += f_indirect;
+ }
+
+ /* Optionally divide out color. */
+ if (kfilm_convert->pass_divide != PASS_UNUSED) {
+ const float *in_divide = buffer + kfilm_convert->pass_divide;
+ const float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]);
+ f = safe_divide_even_color(f, f_divide);
+
+ /* Exposure only, sample scale cancels out. */
+ f *= kfilm_convert->exposure;
+ }
+ else {
+ /* Sample scale and exposure. */
+ f *= film_get_scale_exposure(kfilm_convert, buffer);
+ }
+
+ pixel[0] = f.x;
+ pixel[1] = f.y;
+ pixel[2] = f.z;
+}
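/* Editorial note, not part of the patch: a worked example of the branches
 * above. For a "diffuse"-style output, pass_offset would point at the direct
 * diffuse light, pass_indirect at the indirect diffuse light, and pass_divide
 * at the diffuse color (albedo) pass; the division removes the albedo so the
 * result is pure lighting. Since numerator and denominator are sums over the
 * same samples, the 1 / sample_count factor cancels in the ratio, which is
 * why only exposure is applied on that branch. */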
+
+ccl_device_inline void film_get_pass_pixel_float3(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale_exposure = film_get_scale_exposure(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 f = make_float3(in[0], in[1], in[2]) * scale_exposure;
+
+ pixel[0] = f.x;
+ pixel[1] = f.y;
+ pixel[2] = f.z;
+}
+
+/* --------------------------------------------------------------------
+ * Float4 passes.
+ */
+
+ccl_device_inline void film_get_pass_pixel_motion(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_motion_weight != PASS_UNUSED);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+ const float *in_weight = buffer + kfilm_convert->pass_motion_weight;
+
+ const float weight = in_weight[0];
+ const float weight_inv = (weight > 0.0f) ? 1.0f / weight : 0.0f;
+
+ const float4 motion = make_float4(in[0], in[1], in[2], in[3]) * weight_inv;
+
+ pixel[0] = motion.x;
+ pixel[1] = motion.y;
+ pixel[2] = motion.z;
+ pixel[3] = motion.w;
+}
+
+ccl_device_inline void film_get_pass_pixel_cryptomatte(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float4 f = make_float4(in[0], in[1], in[2], in[3]);
+
+ /* x and z contain integer IDs, don't rescale them.
+ * y and w contain matte weights, they get scaled. */
+ pixel[0] = f.x;
+ pixel[1] = f.y * scale;
+ pixel[2] = f.z;
+ pixel[3] = f.w * scale;
+}
+
+ccl_device_inline void film_get_pass_pixel_float4(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure);
+
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure;
+ const float alpha = in[3] * scale;
+
+ pixel[0] = color.x;
+ pixel[1] = color.y;
+ pixel[2] = color.z;
+ pixel[3] = alpha;
+}
+
+ccl_device_inline void film_get_pass_pixel_combined(const KernelFilmConvert *ccl_restrict
+ kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 4);
+
+ /* 3rd channel contains transparency = 1 - alpha for the combined pass. */
+
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) {
+ pixel[0] = 0.0f;
+ pixel[1] = 0.0f;
+ pixel[2] = 0.0f;
+ pixel[3] = 0.0f;
+ return;
}
- return pass_result;
+ const float *in = buffer + kfilm_convert->pass_offset;
+
+ const float3 color = make_float3(in[0], in[1], in[2]) * scale_exposure;
+ const float alpha = in[3] * scale;
+
+ pixel[0] = color.x;
+ pixel[1] = color.y;
+ pixel[2] = color.z;
+ pixel[3] = film_transparency_to_alpha(alpha);
}
-ccl_device float4 film_map(KernelGlobals *kg, float4 rgba_in, float scale)
+/* --------------------------------------------------------------------
+ * Shadow catcher.
+ */
+
+ccl_device_inline float3
+film_calculate_shadow_catcher_denoised(const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
{
- float4 result;
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
- /* Conversion to SRGB. */
- result.x = color_linear_to_srgb(rgba_in.x * scale);
- result.y = color_linear_to_srgb(rgba_in.y * scale);
- result.z = color_linear_to_srgb(rgba_in.z * scale);
+ float scale, scale_exposure;
+ film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure);
- /* Clamp since alpha might be > 1.0 due to Russian roulette. */
- result.w = saturate(rgba_in.w * scale);
+ ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher;
- return result;
+ const float3 pixel = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]) * scale_exposure;
+
+ return pixel;
}
-ccl_device uchar4 film_float_to_byte(float4 color)
+ccl_device_inline float3 safe_divide_shadow_catcher(float3 a, float3 b)
{
- uchar4 result;
+ float x, y, z;
- /* simple float to byte conversion */
- result.x = (uchar)(saturate(color.x) * 255.0f);
- result.y = (uchar)(saturate(color.y) * 255.0f);
- result.z = (uchar)(saturate(color.z) * 255.0f);
- result.w = (uchar)(saturate(color.w) * 255.0f);
+ x = (b.x != 0.0f) ? a.x / b.x : 1.0f;
+ y = (b.y != 0.0f) ? a.y / b.y : 1.0f;
+ z = (b.z != 0.0f) ? a.z / b.z : 1.0f;
- return result;
+ return make_float3(x, y, z);
}
-ccl_device void kernel_film_convert_to_byte(KernelGlobals *kg,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
+ccl_device_inline float3
+film_calculate_shadow_catcher(const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
{
- /* buffer offset */
- int index = offset + x + y * stride;
+ /* For the shadow catcher pass we divide the combined pass by the shadow catcher.
+ * Note that the denoised shadow catcher pass contains a value which only needs to be scaled
+ * (and does not need to be computed as a division). */
- bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1);
- float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale);
+ if (kfilm_convert->is_denoised) {
+ return film_calculate_shadow_catcher_denoised(kfilm_convert, buffer);
+ }
- /* map colors */
- float4 float_result = film_map(kg, rgba_in, use_display_sample_scale ? sample_scale : 1.0f);
- uchar4 uchar_result = film_float_to_byte(float_result);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_sample_count != PASS_UNUSED);
- rgba += index;
- *rgba = uchar_result;
+ /* If there is no shadow catcher object in this pixel, no modification of the light is needed,
+ * so return one. */
+ ccl_global const float *in_catcher_sample_count =
+ buffer + kfilm_convert->pass_shadow_catcher_sample_count;
+ const float num_samples = in_catcher_sample_count[0];
+ if (num_samples == 0.0f) {
+ return one_float3();
+ }
+
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
+ ccl_global const float *in_catcher = buffer + kfilm_convert->pass_shadow_catcher;
+
+ /* NOTE: It is possible that the Shadow Catcher pass is requested as an output without any
+ * actual shadow catcher objects in the scene. In this case the auxiliary passes needed for the
+ * division are not allocated (to save memory). So delay the asserts to this point, letting the
+ * sample-count check above handle such a configuration. */
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_combined != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED);
+
+ ccl_global const float *in_combined = buffer + kfilm_convert->pass_combined;
+ ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte;
+
+ /* No scaling needed. The integration works in such a way that the number of samples in the
+ * combined and shadow catcher passes is the same, and exposure cancels out during the
+ * division. */
+ const float3 color_catcher = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]);
+ const float3 color_combined = make_float3(in_combined[0], in_combined[1], in_combined[2]);
+ const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]);
+
+ /* The contribution of the matte object needs to be ignored when doing the division (otherwise
+ * anti-aliasing causes artifacts). Since the combined pass is used for adaptive sampling and
+ * needs to contain matte objects, we subtract their contribution here. This is the same as if
+ * the matte objects had never been accumulated into the combined pass. */
+ const float3 combined_no_matte = color_combined - color_matte;
+
+ const float3 shadow_catcher = safe_divide_shadow_catcher(combined_no_matte, color_catcher);
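+
+ /* Illustrative example: if the combined pass minus the matte is (0.25, 0.25, 0.25) and the
+ * shadow catcher pass is (0.5, 0.5, 0.5), the division yields (0.5, 0.5, 0.5), i.e. the
+ * catcher received half of the unoccluded light. */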
+
+ const float scale = film_get_scale(kfilm_convert, buffer);
+ const float transparency = in_combined[3] * scale;
+ const float alpha = film_transparency_to_alpha(transparency);
+
+ /* Alpha-over on white using the transparency of the combined pass. This eliminates artifacts
+ * which appear at the edge of a shadow catcher when using transparent film. Note that the
+ * shadow catcher is treated as straight alpha here because the alpha canceled out during the
+ * division. */
+ const float3 pixel = (1.0f - alpha) * one_float3() + alpha * shadow_catcher;
+
+ return pixel;
}
-ccl_device void kernel_film_convert_to_half_float(KernelGlobals *kg,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
+ccl_device_inline float4 film_calculate_shadow_catcher_matte_with_shadow(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer)
{
- /* buffer offset */
- int index = offset + x + y * stride;
+ /* The shadow is approximated as 1 - average(shadow_catcher_pass); a better approximation is
+ * possible.
+ *
+ * The matte is alpha-overed onto the shadow (which is roughly equivalent to alpha-overing the
+ * shadow onto the footage, and then alpha-overing the synthetic objects on top). */
- bool use_display_sample_scale = (kernel_data.film.display_divide_pass_stride == -1);
- float4 rgba_in = film_get_pass_result(kg, buffer, sample_scale, index, use_display_sample_scale);
+ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher != PASS_UNUSED);
+ kernel_assert(kfilm_convert->pass_shadow_catcher_matte != PASS_UNUSED);
+
+ float scale, scale_exposure;
+ if (!film_get_scale_and_scale_exposure(kfilm_convert, buffer, &scale, &scale_exposure)) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
+ ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte;
+
+ const float3 shadow_catcher = film_calculate_shadow_catcher(kfilm_convert, buffer);
+ const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]) * scale_exposure;
+
+ const float transparency = in_matte[3] * scale;
+ const float alpha = saturate(1.0f - transparency);
+
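+ /* Matte alpha combined with the approximated shadow density in the region where the matte is
+ * transparent. */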
+ const float alpha_matte = (1.0f - alpha) * (1.0f - average(shadow_catcher)) + alpha;
+
+ if (kfilm_convert->use_approximate_shadow_catcher_background) {
+ kernel_assert(kfilm_convert->pass_background != PASS_UNUSED);
+
+ ccl_global const float *in_background = buffer + kfilm_convert->pass_background;
+ const float3 color_background = make_float3(
+ in_background[0], in_background[1], in_background[2]) *
+ scale_exposure;
+ const float3 alpha_over = color_matte + color_background * (1.0f - alpha_matte);
+ return make_float4(alpha_over.x, alpha_over.y, alpha_over.z, 1.0f);
+ }
- ccl_global half *out = (ccl_global half *)rgba + index * 4;
- float4_store_half(out, rgba_in, use_display_sample_scale ? sample_scale : 1.0f);
+ return make_float4(color_matte.x, color_matte.y, color_matte.z, alpha_matte);
+}
+
+ccl_device_inline void film_get_pass_pixel_shadow_catcher(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components >= 3);
+
+ const float3 pixel_value = film_calculate_shadow_catcher(kfilm_convert, buffer);
+
+ pixel[0] = pixel_value.x;
+ pixel[1] = pixel_value.y;
+ pixel[2] = pixel_value.z;
+}
+
+ccl_device_inline void film_get_pass_pixel_shadow_catcher_matte_with_shadow(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ kernel_assert(kfilm_convert->num_components == 3 || kfilm_convert->num_components == 4);
+
+ const float4 pixel_value = film_calculate_shadow_catcher_matte_with_shadow(kfilm_convert,
+ buffer);
+
+ pixel[0] = pixel_value.x;
+ pixel[1] = pixel_value.y;
+ pixel[2] = pixel_value.z;
+ if (kfilm_convert->num_components == 4) {
+ pixel[3] = pixel_value.w;
+ }
+}
+
+/* --------------------------------------------------------------------
+ * Compositing and overlays.
+ */
+
+ccl_device_inline void film_apply_pass_pixel_overlays_rgba(
+ const KernelFilmConvert *ccl_restrict kfilm_convert,
+ ccl_global const float *ccl_restrict buffer,
+ float *ccl_restrict pixel)
+{
+ if (kfilm_convert->show_active_pixels &&
+ kfilm_convert->pass_adaptive_aux_buffer != PASS_UNUSED) {
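+ /* A zero in the 4th channel of the adaptive sampling aux pass marks a pixel which has not yet
+ * converged; tint such still-active pixels red. */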
+ if (buffer[kfilm_convert->pass_adaptive_aux_buffer + 3] == 0.0f) {
+ const float3 active_rgb = make_float3(1.0f, 0.0f, 0.0f);
+ const float3 mix_rgb = interp(make_float3(pixel[0], pixel[1], pixel[2]), active_rgb, 0.5f);
+ pixel[0] = mix_rgb.x;
+ pixel[1] = mix_rgb.y;
+ pixel[2] = mix_rgb.z;
+ }
+ }
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
deleted file mode 100644
index 70aed6d54ed..00000000000
--- a/intern/cycles/kernel/kernel_globals.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Constant Globals */
-
-#ifndef __KERNEL_GLOBALS_H__
-#define __KERNEL_GLOBALS_H__
-
-#include "kernel/kernel_profiling.h"
-
-#ifdef __KERNEL_CPU__
-# include "util/util_map.h"
-# include "util/util_vector.h"
-#endif
-
-#ifdef __KERNEL_OPENCL__
-# include "util/util_atomic.h"
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-/* On the CPU, we pass along the struct KernelGlobals to nearly everywhere in
- * the kernel, to access constant data. These are all stored as "textures", but
- * these are really just standard arrays. We can't use actually globals because
- * multiple renders may be running inside the same process. */
-
-#ifdef __KERNEL_CPU__
-
-# ifdef __OSL__
-struct OSLGlobals;
-struct OSLThreadData;
-struct OSLShadingSystem;
-# endif
-
-typedef unordered_map<float, float> CoverageMap;
-
-struct Intersection;
-struct VolumeStep;
-
-typedef struct KernelGlobals {
-# define KERNEL_TEX(type, name) texture<type> name;
-# include "kernel/kernel_textures.h"
-
- KernelData __data;
-
-# ifdef __OSL__
- /* On the CPU, we also have the OSL globals here. Most data structures are shared
- * with SVM, the difference is in the shaders and object/mesh attributes. */
- OSLGlobals *osl;
- OSLShadingSystem *osl_ss;
- OSLThreadData *osl_tdata;
-# endif
-
- /* **** Run-time data **** */
-
- /* Heap-allocated storage for transparent shadows intersections. */
- Intersection *transparent_shadow_intersections;
-
- /* Storage for decoupled volume steps. */
- VolumeStep *decoupled_volume_steps[2];
- int decoupled_volume_steps_index;
-
- /* A buffer for storing per-pixel coverage for Cryptomatte. */
- CoverageMap *coverage_object;
- CoverageMap *coverage_material;
- CoverageMap *coverage_asset;
-
- /* split kernel */
- SplitData split_data;
- SplitParams split_param_data;
-
- int2 global_size;
- int2 global_id;
-
- ProfilingState profiler;
-} KernelGlobals;
-
-#endif /* __KERNEL_CPU__ */
-
-#ifdef __KERNEL_OPTIX__
-
-typedef struct ShaderParams {
- uint4 *input;
- float4 *output;
- int type;
- int filter;
- int sx;
- int offset;
- int sample;
-} ShaderParams;
-
-typedef struct KernelParams {
- WorkTile tile;
- KernelData data;
- ShaderParams shader;
-# define KERNEL_TEX(type, name) const type *name;
-# include "kernel/kernel_textures.h"
-} KernelParams;
-
-typedef struct KernelGlobals {
-# ifdef __VOLUME__
- VolumeState volume_state;
-# endif
- Intersection hits_stack[64];
-} KernelGlobals;
-
-extern "C" __constant__ KernelParams __params;
-
-#else /* __KERNEL_OPTIX__ */
-
-/* For CUDA, constant memory textures must be globals, so we can't put them
- * into a struct. As a result we don't actually use this struct and use actual
- * globals and simply pass along a NULL pointer everywhere, which we hope gets
- * optimized out. */
-
-# ifdef __KERNEL_CUDA__
-
-__constant__ KernelData __data;
-typedef struct KernelGlobals {
- /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
- Intersection hits_stack[64];
-} KernelGlobals;
-
-# define KERNEL_TEX(type, name) const __constant__ __device__ type *name;
-# include "kernel/kernel_textures.h"
-
-# endif /* __KERNEL_CUDA__ */
-
-#endif /* __KERNEL_OPTIX__ */
-
-/* OpenCL */
-
-#ifdef __KERNEL_OPENCL__
-
-# define KERNEL_TEX(type, name) typedef type name##_t;
-# include "kernel/kernel_textures.h"
-
-typedef ccl_addr_space struct KernelGlobals {
- ccl_constant KernelData *data;
- ccl_global char *buffers[8];
-
-# define KERNEL_TEX(type, name) TextureInfo name;
-# include "kernel/kernel_textures.h"
-
-# ifdef __SPLIT_KERNEL__
- SplitData split_data;
- SplitParams split_param_data;
-# endif
-} KernelGlobals;
-
-# define KERNEL_BUFFER_PARAMS \
- ccl_global char *buffer0, ccl_global char *buffer1, ccl_global char *buffer2, \
- ccl_global char *buffer3, ccl_global char *buffer4, ccl_global char *buffer5, \
- ccl_global char *buffer6, ccl_global char *buffer7
-
-# define KERNEL_BUFFER_ARGS buffer0, buffer1, buffer2, buffer3, buffer4, buffer5, buffer6, buffer7
-
-ccl_device_inline void kernel_set_buffer_pointers(KernelGlobals *kg, KERNEL_BUFFER_PARAMS)
-{
-# ifdef __SPLIT_KERNEL__
- if (ccl_local_id(0) + ccl_local_id(1) == 0)
-# endif
- {
- kg->buffers[0] = buffer0;
- kg->buffers[1] = buffer1;
- kg->buffers[2] = buffer2;
- kg->buffers[3] = buffer3;
- kg->buffers[4] = buffer4;
- kg->buffers[5] = buffer5;
- kg->buffers[6] = buffer6;
- kg->buffers[7] = buffer7;
- }
-
-# ifdef __SPLIT_KERNEL__
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-# endif
-}
-
-ccl_device_inline void kernel_set_buffer_info(KernelGlobals *kg)
-{
-# ifdef __SPLIT_KERNEL__
- if (ccl_local_id(0) + ccl_local_id(1) == 0)
-# endif
- {
- ccl_global TextureInfo *info = (ccl_global TextureInfo *)kg->buffers[0];
-
-# define KERNEL_TEX(type, name) kg->name = *(info++);
-# include "kernel/kernel_textures.h"
- }
-
-# ifdef __SPLIT_KERNEL__
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-# endif
-}
-
-#endif /* __KERNEL_OPENCL__ */
-
-/* Interpolated lookup table access */
-
-ccl_device float lookup_table_read(KernelGlobals *kg, float x, int offset, int size)
-{
- x = saturate(x) * (size - 1);
-
- int index = min(float_to_int(x), size - 1);
- int nindex = min(index + 1, size - 1);
- float t = x - index;
-
- float data0 = kernel_tex_fetch(__lookup_table, index + offset);
- if (t == 0.0f)
- return data0;
-
- float data1 = kernel_tex_fetch(__lookup_table, nindex + offset);
- return (1.0f - t) * data0 + t * data1;
-}
-
-ccl_device float lookup_table_read_2D(
- KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize)
-{
- y = saturate(y) * (ysize - 1);
-
- int index = min(float_to_int(y), ysize - 1);
- int nindex = min(index + 1, ysize - 1);
- float t = y - index;
-
- float data0 = lookup_table_read(kg, x, offset + xsize * index, xsize);
- if (t == 0.0f)
- return data0;
-
- float data1 = lookup_table_read(kg, x, offset + xsize * nindex, xsize);
- return (1.0f - t) * data0 + t * data1;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_GLOBALS_H__ */
diff --git a/intern/cycles/kernel/kernel_id_passes.h b/intern/cycles/kernel/kernel_id_passes.h
index 1ca42e933d1..ed01f494f98 100644
--- a/intern/cycles/kernel/kernel_id_passes.h
+++ b/intern/cycles/kernel/kernel_id_passes.h
@@ -14,8 +14,18 @@
* limitations under the License.
*/
+#pragma once
+
CCL_NAMESPACE_BEGIN
+/* Element of an ID pass stored in the render buffers.
+ * It is `float2` semantically, but it must be unaligned since the offset of the ID passes in the
+ * render buffers might not satisfy the alignment expected by the compiler. */
+typedef struct IDPassBufferElement {
+ float x;
+ float y;
+} IDPassBufferElement;
+
ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
int num_slots,
float id,
@@ -27,7 +37,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
}
for (int slot = 0; slot < num_slots; slot++) {
- ccl_global float2 *id_buffer = (ccl_global float2 *)buffer;
+ ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer;
#ifdef __ATOMIC_PASS_WRITE__
/* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
if (id_buffer[slot].x == ID_NONE) {
@@ -65,7 +75,7 @@ ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_slots)
{
- ccl_global float2 *id_buffer = (ccl_global float2 *)buffer;
+ ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer;
for (int slot = 1; slot < num_slots; ++slot) {
if (id_buffer[slot].x == ID_NONE) {
return;
@@ -73,7 +83,7 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl
/* Since we're dealing with a tiny number of elements, insertion sort should be fine. */
int i = slot;
while (i > 0 && id_buffer[i].y > id_buffer[i - 1].y) {
- float2 swap = id_buffer[i];
+ const IDPassBufferElement swap = id_buffer[i];
id_buffer[i] = id_buffer[i - 1];
id_buffer[i - 1] = swap;
--i;
@@ -81,19 +91,16 @@ ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_sl
}
}
-#ifdef __KERNEL_GPU__
/* post-sorting for Cryptomatte */
-ccl_device void kernel_cryptomatte_post(
- KernelGlobals *kg, ccl_global float *buffer, uint sample, int x, int y, int offset, int stride)
+ccl_device_inline void kernel_cryptomatte_post(const KernelGlobals *kg,
+ ccl_global float *render_buffer,
+ int pixel_index)
{
- if (sample - 1 == kernel_data.integrator.aa_samples) {
- int index = offset + x + y * stride;
- int pass_stride = kernel_data.film.pass_stride;
- ccl_global float *cryptomatte_buffer = buffer + index * pass_stride +
- kernel_data.film.pass_cryptomatte;
- kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
- }
+ const int pass_stride = kernel_data.film.pass_stride;
+ const uint64_t render_buffer_offset = (uint64_t)pixel_index * pass_stride;
+ ccl_global float *cryptomatte_buffer = render_buffer + render_buffer_offset +
+ kernel_data.film.pass_cryptomatte;
+ kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
}
-#endif
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h
index f4e60a807f7..354e8115538 100644
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -14,93 +14,27 @@
* limitations under the License.
*/
-/* TODO(sergey): Consider moving portable ctz/clz stuff to util. */
-
+#pragma once
CCL_NAMESPACE_BEGIN
-/* "Correlated Multi-Jittered Sampling"
- * Andrew Kensler, Pixar Technical Memo 13-01, 2013 */
-
-/* TODO: find good value, suggested 64 gives pattern on cornell box ceiling. */
-#define CMJ_RANDOM_OFFSET_LIMIT 4096
-
-ccl_device_inline bool cmj_is_pow2(int i)
+ccl_device_inline uint32_t laine_karras_permutation(uint32_t x, uint32_t seed)
{
- return (i > 1) && ((i & (i - 1)) == 0);
-}
+ x += seed;
+ x ^= (x * 0x6c50b47cu);
+ x ^= x * 0xb82f1e52u;
+ x ^= x * 0xc7afe638u;
+ x ^= x * 0x8d22f6e6u;
-ccl_device_inline int cmj_fast_mod_pow2(int a, int b)
-{
- return (a & (b - 1));
+ return x;
}
-/* b must be > 1 */
-ccl_device_inline int cmj_fast_div_pow2(int a, int b)
+ccl_device_inline uint32_t nested_uniform_scramble(uint32_t x, uint32_t seed)
{
- kernel_assert(b > 1);
- return a >> count_trailing_zeros(b);
-}
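+ /* In the hash each bit is influenced only by equal-or-lower-order bits, so sandwiching it
+ * between bit reversals scrambles every bit based on the higher-order bits, as required for
+ * base-2 Owen scrambling. */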
+ x = reverse_integer_bits(x);
+ x = laine_karras_permutation(x, seed);
+ x = reverse_integer_bits(x);
-ccl_device_inline uint cmj_w_mask(uint w)
-{
- kernel_assert(w > 1);
- return ((1 << (32 - count_leading_zeros(w))) - 1);
-}
-
-ccl_device_inline uint cmj_permute(uint i, uint l, uint p)
-{
- uint w = l - 1;
-
- if ((l & w) == 0) {
- /* l is a power of two (fast) */
- i ^= p;
- i *= 0xe170893d;
- i ^= p >> 16;
- i ^= (i & w) >> 4;
- i ^= p >> 8;
- i *= 0x0929eb3f;
- i ^= p >> 23;
- i ^= (i & w) >> 1;
- i *= 1 | p >> 27;
- i *= 0x6935fa69;
- i ^= (i & w) >> 11;
- i *= 0x74dcb303;
- i ^= (i & w) >> 2;
- i *= 0x9e501cc3;
- i ^= (i & w) >> 2;
- i *= 0xc860a3df;
- i &= w;
- i ^= i >> 5;
-
- return (i + p) & w;
- }
- else {
- /* l is not a power of two (slow) */
- w = cmj_w_mask(w);
-
- do {
- i ^= p;
- i *= 0xe170893d;
- i ^= p >> 16;
- i ^= (i & w) >> 4;
- i ^= p >> 8;
- i *= 0x0929eb3f;
- i ^= p >> 23;
- i ^= (i & w) >> 1;
- i *= 1 | p >> 27;
- i *= 0x6935fa69;
- i ^= (i & w) >> 11;
- i *= 0x74dcb303;
- i ^= (i & w) >> 2;
- i *= 0x9e501cc3;
- i ^= (i & w) >> 2;
- i *= 0xc860a3df;
- i &= w;
- i ^= i >> 5;
- } while (i >= l);
-
- return (i + p) % l;
- }
+ return x;
}
ccl_device_inline uint cmj_hash(uint i, uint p)
@@ -133,99 +67,101 @@ ccl_device_inline float cmj_randfloat(uint i, uint p)
return cmj_hash(i, p) * (1.0f / 4294967808.0f);
}
-#ifdef __CMJ__
-ccl_device float cmj_sample_1D(int s, int N, int p)
+ccl_device_inline float cmj_randfloat_simple(uint i, uint p)
{
- kernel_assert(s < N);
-
- uint x = cmj_permute(s, N, p * 0x68bc21eb);
- float jx = cmj_randfloat(s, p * 0x967a889b);
-
- float invN = 1.0f / N;
- return (x + jx) * invN;
+ return cmj_hash_simple(i, p) * (1.0f / (float)0xFFFFFFFF);
}
-/* TODO(sergey): Do some extra tests and consider moving to util_math.h. */
-ccl_device_inline int cmj_isqrt(int value)
+ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension)
{
-# if defined(__KERNEL_CUDA__)
- return float_to_int(__fsqrt_ru(value));
-# elif defined(__KERNEL_GPU__)
- return float_to_int(sqrtf(value));
-# else
- /* This is a work around for fast-math on CPU which might replace sqrtf()
- * with am approximated version.
- */
- return float_to_int(sqrtf(value) + 1e-6f);
-# endif
-}
+ /* The PMJ sample sets contain NUM_PMJ_SAMPLES (x, y) samples, so for 1D only
+ * the x part is used as the sample (TODO(@leesonw): Add support for using the x and y parts
+ * independently). */
+
+ /* Perform an Owen shuffle of the sample number to reorder the samples. */
+#ifdef _SIMPLE_HASH_
+ const uint rv = cmj_hash_simple(dimension, rng_hash);
+#else /* Use a _REGULAR_HASH_. */
+ const uint rv = cmj_hash(dimension, rng_hash);
+#endif
+#ifdef _XOR_SHUFFLE_
+# warning "Using XOR shuffle."
+ const uint s = sample ^ rv;
+#else /* Use _OWEN_SHUFFLE_ for reordering. */
+ const uint s = nested_uniform_scramble(sample, rv);
+#endif
-ccl_device void cmj_sample_2D(int s, int N, int p, float *fx, float *fy)
-{
- kernel_assert(s < N);
+ /* Based on the sample number, a sample pattern is selected and offset by the dimension. */
+ const uint sample_set = s / NUM_PMJ_SAMPLES;
+ const uint d = (dimension + sample_set);
+ const uint dim = d % NUM_PMJ_PATTERNS;
+ int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
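+ /* The LUT stores interleaved (x, y) pairs, hence the factor of two; index + 1 (the y part) is
+ * unused in the 1D case. */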
+
+ float fx = kernel_tex_fetch(__sample_pattern_lut, index);
- int m = cmj_isqrt(N);
- int n = (N - 1) / m + 1;
- float invN = 1.0f / N;
- float invm = 1.0f / m;
- float invn = 1.0f / n;
+#ifndef _NO_CRANLEY_PATTERSON_ROTATION_
+ /* Use Cranley-Patterson rotation to displace the sample pattern. */
+# ifdef _SIMPLE_HASH_
+ float dx = cmj_randfloat_simple(d, rng_hash);
+# else
+ /* Only jitter within the grid interval. */
+ float dx = cmj_randfloat(d, rng_hash);
+# endif
+ fx = fx + dx * (1.0f / NUM_PMJ_SAMPLES);
+ fx = fx - floorf(fx);
- s = cmj_permute(s, N, p * 0x51633e2d);
+#else
+# warning "Not using Cranley-Patterson Rotation."
+#endif
- int sdivm, smodm;
+ return fx;
+}
- if (cmj_is_pow2(m)) {
- sdivm = cmj_fast_div_pow2(s, m);
- smodm = cmj_fast_mod_pow2(s, m);
- }
- else {
- /* Doing `s * inmv` gives precision issues here. */
- sdivm = s / m;
- smodm = s - sdivm * m;
- }
+ccl_device void pmj_sample_2D(
+ const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension, float *x, float *y)
+{
+ /* Perform a shuffle on the sample number to reorder the samples. */
+#ifdef _SIMPLE_HASH_
+ const uint rv = cmj_hash_simple(dimension, rng_hash);
+#else /* Use a _REGULAR_HASH_. */
+ const uint rv = cmj_hash(dimension, rng_hash);
+#endif
+#ifdef _XOR_SHUFFLE_
+# warning "Using XOR shuffle."
+ const uint s = sample ^ rv;
+#else /* Use _OWEN_SHUFFLE_ for reordering. */
+ const uint s = nested_uniform_scramble(sample, rv);
+#endif
- uint sx = cmj_permute(smodm, m, p * 0x68bc21eb);
- uint sy = cmj_permute(sdivm, n, p * 0x02e5be93);
+ /* Based on the sample number, a sample pattern is selected and offset by the dimension. */
+ const uint sample_set = s / NUM_PMJ_SAMPLES;
+ const uint d = (dimension + sample_set);
+ const uint dim = d % NUM_PMJ_PATTERNS;
+ int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
- float jx = cmj_randfloat(s, p * 0x967a889b);
- float jy = cmj_randfloat(s, p * 0x368cc8b7);
+ float fx = kernel_tex_fetch(__sample_pattern_lut, index);
+ float fy = kernel_tex_fetch(__sample_pattern_lut, index + 1);
- *fx = (sx + (sy + jx) * invn) * invm;
- *fy = (s + jy) * invN;
-}
+#ifndef _NO_CRANLEY_PATTERSON_ROTATION_
+ /* Use Cranley-Patterson rotation to displace the sample pattern. */
+# ifdef _SIMPLE_HASH_
+ float dx = cmj_randfloat_simple(d, rng_hash);
+ float dy = cmj_randfloat_simple(d + 1, rng_hash);
+# else
+ float dx = cmj_randfloat(d, rng_hash);
+ float dy = cmj_randfloat(d + 1, rng_hash);
+# endif
+ /* Only jitter within the grid cells. */
+ fx = fx + dx * (1.0f / NUM_PMJ_DIVISIONS);
+ fy = fy + dy * (1.0f / NUM_PMJ_DIVISIONS);
+ fx = fx - floorf(fx);
+ fy = fy - floorf(fy);
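+ /* The floorf() subtraction wraps the displaced samples back into [0, 1). */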
+#else
+# warning "Not using Cranley-Patterson Rotation."
#endif
-ccl_device float pmj_sample_1D(KernelGlobals *kg, int sample, int rng_hash, int dimension)
-{
- /* Fallback to random */
- if (sample >= NUM_PMJ_SAMPLES) {
- const int p = rng_hash + dimension;
- return cmj_randfloat(sample, p);
- }
- else {
- const uint mask = cmj_hash_simple(dimension, rng_hash) & 0x007fffff;
- const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
- return __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ mask) - 1.0f;
- }
-}
-
-ccl_device float2 pmj_sample_2D(KernelGlobals *kg, int sample, int rng_hash, int dimension)
-{
- if (sample >= NUM_PMJ_SAMPLES) {
- const int p = rng_hash + dimension;
- const float fx = cmj_randfloat(sample, p);
- const float fy = cmj_randfloat(sample, p + 1);
- return make_float2(fx, fy);
- }
- else {
- const int index = ((dimension % NUM_PMJ_PATTERNS) * NUM_PMJ_SAMPLES + sample) * 2;
- const uint maskx = cmj_hash_simple(dimension, rng_hash) & 0x007fffff;
- const uint masky = cmj_hash_simple(dimension + 1, rng_hash) & 0x007fffff;
- const float fx = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index) ^ maskx) - 1.0f;
- const float fy = __uint_as_float(kernel_tex_fetch(__sample_pattern_lut, index + 1) ^ masky) -
- 1.0f;
- return make_float2(fx, fy);
- }
+ (*x) = fx;
+ (*y) = fy;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 42a834d2ce3..52f641634b9 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -14,7 +14,14 @@
* limitations under the License.
*/
+#pragma once
+
+#include "geom/geom.h"
+
#include "kernel_light_background.h"
+#include "kernel_montecarlo.h"
+#include "kernel_projection.h"
+#include "kernel_types.h"
CCL_NAMESPACE_BEGIN
@@ -37,10 +44,22 @@ typedef struct LightSample {
/* Regular Light */
-ccl_device_inline bool lamp_light_sample(
- KernelGlobals *kg, int lamp, float randu, float randv, float3 P, LightSample *ls)
+template<bool in_volume_segment>
+ccl_device_inline bool light_sample(const KernelGlobals *kg,
+ const int lamp,
+ const float randu,
+ const float randv,
+ const float3 P,
+ const int path_flag,
+ LightSample *ls)
{
const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+ if (klight->shader_id & SHADER_EXCLUDE_SHADOW_CATCHER) {
+ return false;
+ }
+ }
+
LightType type = (LightType)klight->type;
ls->type = type;
ls->shader = klight->shader_id;
@@ -50,6 +69,18 @@ ccl_device_inline bool lamp_light_sample(
ls->u = randu;
ls->v = randv;
+ if (in_volume_segment && (type == LIGHT_DISTANT || type == LIGHT_BACKGROUND)) {
+ /* Distant lights in a volume get a dummy sample; the position will not actually
+ * be used in that case. Only when sampling from a specific scatter position
+ * do we actually need to evaluate these. */
+ ls->P = zero_float3();
+ ls->Ng = zero_float3();
+ ls->D = zero_float3();
+ ls->pdf = 1.0f;
+ ls->t = FLT_MAX;
+ return true;
+ }
+
if (type == LIGHT_DISTANT) {
/* distant light */
float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
@@ -123,13 +154,15 @@ ccl_device_inline bool lamp_light_sample(
float invarea = fabsf(klight->area.invarea);
bool is_round = (klight->area.invarea < 0.0f);
- if (dot(ls->P - P, Ng) > 0.0f) {
- return false;
+ if (!in_volume_segment) {
+ if (dot(ls->P - P, Ng) > 0.0f) {
+ return false;
+ }
}
float3 inplane;
- if (is_round) {
+ if (is_round || in_volume_segment) {
inplane = ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv);
ls->P += inplane;
ls->pdf = invarea;
@@ -176,79 +209,180 @@ ccl_device_inline bool lamp_light_sample(
return (ls->pdf > 0.0f);
}
-ccl_device bool lamp_light_eval(
- KernelGlobals *kg, int lamp, float3 P, float3 D, float t, LightSample *ls)
+ccl_device bool lights_intersect(const KernelGlobals *ccl_restrict kg,
+ const Ray *ccl_restrict ray,
+ Intersection *ccl_restrict isect,
+ const int last_prim,
+ const int last_object,
+ const int last_type,
+ const int path_flag)
{
- const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
- LightType type = (LightType)klight->type;
- ls->type = type;
- ls->shader = klight->shader_id;
- ls->object = PRIM_NONE;
- ls->prim = PRIM_NONE;
- ls->lamp = lamp;
- /* todo: missing texture coordinates */
- ls->u = 0.0f;
- ls->v = 0.0f;
+ for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
- if (!(ls->shader & SHADER_USE_MIS))
- return false;
+ if (path_flag & PATH_RAY_CAMERA) {
+ if (klight->shader_id & SHADER_EXCLUDE_CAMERA) {
+ continue;
+ }
+ }
+ else {
+ if (!(klight->shader_id & SHADER_USE_MIS)) {
+ continue;
+ }
+ }
- if (type == LIGHT_DISTANT) {
- /* distant light */
- float radius = klight->distant.radius;
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+ if (klight->shader_id & SHADER_EXCLUDE_SHADOW_CATCHER) {
+ continue;
+ }
+ }
- if (radius == 0.0f)
- return false;
- if (t != FLT_MAX)
- return false;
+ LightType type = (LightType)klight->type;
+ float t = 0.0f, u = 0.0f, v = 0.0f;
- /* a distant light is infinitely far away, but equivalent to a disk
- * shaped light exactly 1 unit away from the current shading point.
- *
- * radius t^2/cos(theta)
- * <----------> t = sqrt(1^2 + tan(theta)^2)
- * tan(th) area = radius*radius*pi
- * <----->
- * \ | (1 + tan(theta)^2)/cos(theta)
- * \ | (1 + tan(acos(cos(theta)))^2)/cos(theta)
- * t \th| 1 simplifies to
- * \-| 1/(cos(theta)^3)
- * \| magic!
- * P
- */
+ if (type == LIGHT_POINT || type == LIGHT_SPOT) {
+ /* Sphere light. */
+ const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+ const float radius = klight->spot.radius;
+ if (radius == 0.0f) {
+ continue;
+ }
- float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
- float costheta = dot(-lightD, D);
- float cosangle = klight->distant.cosangle;
+ float3 P;
+ if (!ray_aligned_disk_intersect(ray->P, ray->D, ray->t, lightP, radius, &P, &t)) {
+ continue;
+ }
+ }
+ else if (type == LIGHT_AREA) {
+ /* Area light. */
+ const float invarea = fabsf(klight->area.invarea);
+ const bool is_round = (klight->area.invarea < 0.0f);
+ if (invarea == 0.0f) {
+ continue;
+ }
- if (costheta < cosangle)
- return false;
+ const float3 axisu = make_float3(
+ klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
+ const float3 axisv = make_float3(
+ klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
+ const float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
- ls->P = -D;
- ls->Ng = -D;
- ls->D = D;
- ls->t = FLT_MAX;
+ /* One sided. */
+ if (dot(ray->D, Ng) >= 0.0f) {
+ continue;
+ }
- /* compute pdf */
- float invarea = klight->distant.invarea;
- ls->pdf = invarea / (costheta * costheta * costheta);
- ls->eval_fac = ls->pdf;
+ const float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+
+ float3 P;
+ if (!ray_quad_intersect(
+ ray->P, ray->D, 0.0f, ray->t, light_P, axisu, axisv, Ng, &P, &t, &u, &v, is_round)) {
+ continue;
+ }
+ }
+ else {
+ continue;
+ }
+
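+ /* Keep only the closest hit, and avoid re-intersecting the lamp the ray was continued from. */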
+ if (t < isect->t &&
+ !(last_prim == lamp && last_object == OBJECT_NONE && last_type == PRIMITIVE_LAMP)) {
+ isect->t = t;
+ isect->u = u;
+ isect->v = v;
+ isect->type = PRIMITIVE_LAMP;
+ isect->prim = lamp;
+ isect->object = OBJECT_NONE;
+ }
+ }
+
+ return isect->prim != PRIM_NONE;
+}
+
+ccl_device bool light_sample_from_distant_ray(const KernelGlobals *ccl_restrict kg,
+ const float3 ray_D,
+ const int lamp,
+ LightSample *ccl_restrict ls)
+{
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+ const int shader = klight->shader_id;
+ const float radius = klight->distant.radius;
+ const LightType type = (LightType)klight->type;
+
+ if (type != LIGHT_DISTANT) {
+ return false;
+ }
+ if (!(shader & SHADER_USE_MIS)) {
+ return false;
+ }
+ if (radius == 0.0f) {
+ return false;
}
- else if (type == LIGHT_POINT || type == LIGHT_SPOT) {
- float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
- float radius = klight->spot.radius;
+ /* a distant light is infinitely far away, but equivalent to a disk
+ * shaped light exactly 1 unit away from the current shading point.
+ *
+ * radius t^2/cos(theta)
+ * <----------> t = sqrt(1^2 + tan(theta)^2)
+ * tan(th) area = radius*radius*pi
+ * <----->
+ * \ | (1 + tan(theta)^2)/cos(theta)
+ * \ | (1 + tan(acos(cos(theta)))^2)/cos(theta)
+ * t \th| 1 simplifies to
+ * \-| 1/(cos(theta)^3)
+ * \| magic!
+ * P
+ */
+
+ float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
+ float costheta = dot(-lightD, ray_D);
+ float cosangle = klight->distant.cosangle;
+
+ if (costheta < cosangle)
+ return false;
- /* sphere light */
- if (radius == 0.0f)
- return false;
+ ls->type = type;
+ ls->shader = klight->shader_id;
+ ls->object = PRIM_NONE;
+ ls->prim = PRIM_NONE;
+ ls->lamp = lamp;
+ /* TODO: Missing texture coordinates. */
+ ls->u = 0.0f;
+ ls->v = 0.0f;
+ ls->t = FLT_MAX;
+ ls->P = -ray_D;
+ ls->Ng = -ray_D;
+ ls->D = ray_D;
+
+ /* compute pdf */
+ float invarea = klight->distant.invarea;
+ ls->pdf = invarea / (costheta * costheta * costheta);
+ ls->pdf *= kernel_data.integrator.pdf_lights;
+ ls->eval_fac = ls->pdf;
- if (!ray_aligned_disk_intersect(P, D, t, lightP, radius, &ls->P, &ls->t)) {
- return false;
- }
+ return true;
+}
- ls->Ng = -D;
- ls->D = D;
+ccl_device bool light_sample_from_intersection(const KernelGlobals *ccl_restrict kg,
+ const Intersection *ccl_restrict isect,
+ const float3 ray_P,
+ const float3 ray_D,
+ LightSample *ccl_restrict ls)
+{
+ const int lamp = isect->prim;
+ const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, lamp);
+ LightType type = (LightType)klight->type;
+ ls->type = type;
+ ls->shader = klight->shader_id;
+ ls->object = PRIM_NONE;
+ ls->prim = PRIM_NONE;
+ ls->lamp = lamp;
+ /* TODO: Missing texture coordinates. */
+ ls->t = isect->t;
+ ls->P = ray_P + ray_D * ls->t;
+ ls->D = ray_D;
+
+ if (type == LIGHT_POINT || type == LIGHT_SPOT) {
+ ls->Ng = -ray_D;
float invarea = klight->spot.invarea;
ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
@@ -260,8 +394,9 @@ ccl_device bool lamp_light_eval(
ls->eval_fac *= spot_light_attenuation(
dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng);
- if (ls->eval_fac == 0.0f)
+ if (ls->eval_fac == 0.0f) {
return false;
+ }
}
float2 uv = map_to_sphere(ls->Ng);
ls->u = uv.x;
@@ -274,31 +409,22 @@ ccl_device bool lamp_light_eval(
else if (type == LIGHT_AREA) {
/* area light */
float invarea = fabsf(klight->area.invarea);
- bool is_round = (klight->area.invarea < 0.0f);
- if (invarea == 0.0f)
- return false;
float3 axisu = make_float3(
klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
float3 axisv = make_float3(
klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
-
- /* one sided */
- if (dot(D, Ng) >= 0.0f)
- return false;
-
float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
- if (!ray_quad_intersect(
- P, D, 0.0f, t, light_P, axisu, axisv, Ng, &ls->P, &ls->t, &ls->u, &ls->v, is_round)) {
- return false;
- }
-
- ls->D = D;
+ ls->u = isect->u;
+ ls->v = isect->v;
+ ls->D = ray_D;
ls->Ng = Ng;
+
+ const bool is_round = (klight->area.invarea < 0.0f);
if (is_round) {
- ls->pdf = invarea * lamp_light_pdf(kg, Ng, -D, ls->t);
+ ls->pdf = invarea * lamp_light_pdf(kg, Ng, -ray_D, ls->t);
}
else {
float3 sample_axisu = axisu;
@@ -306,12 +432,12 @@ ccl_device bool lamp_light_eval(
if (klight->area.tan_spread > 0.0f) {
if (!light_spread_clamp_area_light(
- P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) {
+ ray_P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) {
return false;
}
}
- ls->pdf = rect_light_sample(P, &light_P, sample_axisu, sample_axisv, 0, 0, false);
+ ls->pdf = rect_light_sample(ray_P, &light_P, sample_axisu, sample_axisv, 0, 0, false);
}
ls->eval_fac = 0.25f * invarea;
@@ -325,6 +451,7 @@ ccl_device bool lamp_light_eval(
}
}
else {
+ kernel_assert(!"Invalid lamp type in light_sample_from_intersection");
return false;
}
@@ -337,7 +464,7 @@ ccl_device bool lamp_light_eval(
/* Returns true if the triangle has motion blur or an instancing transform applied. */
ccl_device_inline bool triangle_world_space_vertices(
- KernelGlobals *kg, int object, int prim, float time, float3 V[3])
+ const KernelGlobals *kg, int object, int prim, float time, float3 V[3])
{
bool has_motion = false;
const int object_flag = kernel_tex_fetch(__object_flag, object);
@@ -365,7 +492,7 @@ ccl_device_inline bool triangle_world_space_vertices(
return has_motion;
}
-ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg,
+ccl_device_inline float triangle_light_pdf_area(const KernelGlobals *kg,
const float3 Ng,
const float3 I,
float t)
@@ -379,7 +506,9 @@ ccl_device_inline float triangle_light_pdf_area(KernelGlobals *kg,
return t * t * pdf / cos_pi;
}
-ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *sd, float t)
+ccl_device_forceinline float triangle_light_pdf(const KernelGlobals *kg,
+ const ShaderData *sd,
+ float t)
{
/* A naive heuristic to decide between costly solid angle sampling
* and simple area sampling, comparing the distance to the triangle plane
@@ -448,7 +577,8 @@ ccl_device_forceinline float triangle_light_pdf(KernelGlobals *kg, ShaderData *s
}
}
-ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg,
+template<bool in_volume_segment>
+ccl_device_forceinline void triangle_light_sample(const KernelGlobals *kg,
int prim,
int object,
float randu,
@@ -488,7 +618,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg,
float distance_to_plane = fabsf(dot(N0, V[0] - P) / dot(N0, N0));
- if (longest_edge_squared > distance_to_plane * distance_to_plane) {
+ if (!in_volume_segment && (longest_edge_squared > distance_to_plane * distance_to_plane)) {
/* see James Arvo, "Stratified Sampling of Spherical Triangles"
* http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */
@@ -617,7 +747,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals *kg,
/* Light Distribution */
-ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
+ccl_device int light_distribution_sample(const KernelGlobals *kg, float *randu)
{
/* This is basically std::upper_bound as used by PBRT, to find a point light or
* triangle to emit from, proportional to area. A good improvement would be to
@@ -655,51 +785,93 @@ ccl_device int light_distribution_sample(KernelGlobals *kg, float *randu)
/* Generic Light */
-ccl_device_inline bool light_select_reached_max_bounces(KernelGlobals *kg, int index, int bounce)
+ccl_device_inline bool light_select_reached_max_bounces(const KernelGlobals *kg,
+ int index,
+ int bounce)
{
return (bounce > kernel_tex_fetch(__lights, index).max_bounces);
}
-ccl_device_noinline bool light_sample(KernelGlobals *kg,
- int lamp,
- float randu,
- float randv,
- float time,
- float3 P,
- int bounce,
- LightSample *ls)
+template<bool in_volume_segment>
+ccl_device_noinline bool light_distribution_sample(const KernelGlobals *kg,
+ float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ const int bounce,
+ const int path_flag,
+ LightSample *ls)
{
- if (lamp < 0) {
- /* sample index */
- int index = light_distribution_sample(kg, &randu);
-
- /* fetch light data */
- const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch(
- __light_distribution, index);
- int prim = kdistribution->prim;
-
- if (prim >= 0) {
- int object = kdistribution->mesh_light.object_id;
- int shader_flag = kdistribution->mesh_light.shader_flag;
-
- triangle_light_sample(kg, prim, object, randu, randv, time, ls, P);
- ls->shader |= shader_flag;
- return (ls->pdf > 0.0f);
+ /* Sample light index from distribution. */
+ const int index = light_distribution_sample(kg, &randu);
+ const ccl_global KernelLightDistribution *kdistribution = &kernel_tex_fetch(__light_distribution,
+ index);
+ const int prim = kdistribution->prim;
+
+ if (prim >= 0) {
+ /* Mesh light. */
+ const int object = kdistribution->mesh_light.object_id;
+
+ /* Exclude synthetic meshes from shadow catcher pass. */
+ if ((path_flag & PATH_RAY_SHADOW_CATCHER_PASS) &&
+ !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_SHADOW_CATCHER)) {
+ return false;
}
- lamp = -prim - 1;
+ const int shader_flag = kdistribution->mesh_light.shader_flag;
+ triangle_light_sample<in_volume_segment>(kg, prim, object, randu, randv, time, ls, P);
+ ls->shader |= shader_flag;
+ return (ls->pdf > 0.0f);
}
+ const int lamp = -prim - 1;
+
if (UNLIKELY(light_select_reached_max_bounces(kg, lamp, bounce))) {
return false;
}
- return lamp_light_sample(kg, lamp, randu, randv, P, ls);
+ return light_sample<in_volume_segment>(kg, lamp, randu, randv, P, path_flag, ls);
+}
+
+ccl_device_inline bool light_distribution_sample_from_volume_segment(const KernelGlobals *kg,
+ float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ const int bounce,
+ const int path_flag,
+ LightSample *ls)
+{
+ return light_distribution_sample<true>(kg, randu, randv, time, P, bounce, path_flag, ls);
+}
+
+ccl_device_inline bool light_distribution_sample_from_position(const KernelGlobals *kg,
+ float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ const int bounce,
+ const int path_flag,
+ LightSample *ls)
+{
+ return light_distribution_sample<false>(kg, randu, randv, time, P, bounce, path_flag, ls);
}
-ccl_device_inline int light_select_num_samples(KernelGlobals *kg, int index)
+ccl_device_inline bool light_distribution_sample_new_position(const KernelGlobals *kg,
+ const float randu,
+ const float randv,
+ const float time,
+ const float3 P,
+ LightSample *ls)
{
- return kernel_tex_fetch(__lights, index).samples;
+ /* Sample a new position on the same light, for volume sampling. */
+ if (ls->type == LIGHT_TRIANGLE) {
+ triangle_light_sample<false>(kg, ls->prim, ls->object, randu, randv, time, ls, P);
+ return (ls->pdf > 0.0f);
+ }
+ else {
+ return light_sample<false>(kg, ls->lamp, randu, randv, P, 0, ls);
+ }
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_light_background.h b/intern/cycles/kernel/kernel_light_background.h
index f0f64ce8704..493ed560bc6 100644
--- a/intern/cycles/kernel/kernel_light_background.h
+++ b/intern/cycles/kernel/kernel_light_background.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#pragma once
+
#include "kernel_light_common.h"
CCL_NAMESPACE_BEGIN
@@ -22,7 +24,10 @@ CCL_NAMESPACE_BEGIN
#ifdef __BACKGROUND_MIS__
-ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float randv, float *pdf)
+ccl_device float3 background_map_sample(const KernelGlobals *kg,
+ float randu,
+ float randv,
+ float *pdf)
{
/* for the following, the CDF values are actually a pair of floats, with the
* function value as X and the actual CDF as Y. The last entry's function
@@ -104,7 +109,7 @@ ccl_device float3 background_map_sample(KernelGlobals *kg, float randu, float ra
/* TODO(sergey): Same as above, after the release we should consider using
* 'noinline' for all devices.
*/
-ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
+ccl_device float background_map_pdf(const KernelGlobals *kg, float3 direction)
{
float2 uv = direction_to_equirectangular(direction);
int res_x = kernel_data.background.map_res_x;
@@ -138,7 +143,7 @@ ccl_device float background_map_pdf(KernelGlobals *kg, float3 direction)
}
ccl_device_inline bool background_portal_data_fetch_and_check_side(
- KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir)
+ const KernelGlobals *kg, float3 P, int index, float3 *lightpos, float3 *dir)
{
int portal = kernel_data.background.portal_offset + index;
const ccl_global KernelLight *klight = &kernel_tex_fetch(__lights, portal);
@@ -154,7 +159,7 @@ ccl_device_inline bool background_portal_data_fetch_and_check_side(
}
ccl_device_inline float background_portal_pdf(
- KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible)
+ const KernelGlobals *kg, float3 P, float3 direction, int ignore_portal, bool *is_possible)
{
float portal_pdf = 0.0f;
@@ -214,7 +219,7 @@ ccl_device_inline float background_portal_pdf(
return (num_possible > 0) ? portal_pdf / num_possible : 0.0f;
}
-ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
+ccl_device int background_num_possible_portals(const KernelGlobals *kg, float3 P)
{
int num_possible_portals = 0;
for (int p = 0; p < kernel_data.background.num_portals; p++) {
@@ -225,7 +230,7 @@ ccl_device int background_num_possible_portals(KernelGlobals *kg, float3 P)
return num_possible_portals;
}
-ccl_device float3 background_portal_sample(KernelGlobals *kg,
+ccl_device float3 background_portal_sample(const KernelGlobals *kg,
float3 P,
float randu,
float randv,
@@ -280,7 +285,7 @@ ccl_device float3 background_portal_sample(KernelGlobals *kg,
return zero_float3();
}
-ccl_device_inline float3 background_sun_sample(KernelGlobals *kg,
+ccl_device_inline float3 background_sun_sample(const KernelGlobals *kg,
float randu,
float randv,
float *pdf)
@@ -292,7 +297,7 @@ ccl_device_inline float3 background_sun_sample(KernelGlobals *kg,
return D;
}
-ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D)
+ccl_device_inline float background_sun_pdf(const KernelGlobals *kg, float3 D)
{
const float3 N = float4_to_float3(kernel_data.background.sun);
const float angle = kernel_data.background.sun.w;
@@ -300,7 +305,7 @@ ccl_device_inline float background_sun_pdf(KernelGlobals *kg, float3 D)
}
ccl_device_inline float3
-background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
+background_light_sample(const KernelGlobals *kg, float3 P, float randu, float randv, float *pdf)
{
float portal_method_pdf = kernel_data.background.portal_weight;
float sun_method_pdf = kernel_data.background.sun_weight;
@@ -400,7 +405,7 @@ background_light_sample(KernelGlobals *kg, float3 P, float randu, float randv, f
return D;
}
-ccl_device float background_light_pdf(KernelGlobals *kg, float3 P, float3 direction)
+ccl_device float background_light_pdf(const KernelGlobals *kg, float3 P, float3 direction)
{
float portal_method_pdf = kernel_data.background.portal_weight;
float sun_method_pdf = kernel_data.background.sun_weight;
diff --git a/intern/cycles/kernel/kernel_light_common.h b/intern/cycles/kernel/kernel_light_common.h
index 4a683d36226..765d8f5338e 100644
--- a/intern/cycles/kernel/kernel_light_common.h
+++ b/intern/cycles/kernel/kernel_light_common.h
@@ -14,6 +14,10 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel_montecarlo.h"
+
CCL_NAMESPACE_BEGIN
/* Area light sampling */
@@ -210,7 +214,7 @@ ccl_device bool light_spread_clamp_area_light(const float3 P,
return true;
}
-ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
+ccl_device float lamp_light_pdf(const KernelGlobals *kg, const float3 Ng, const float3 I, float t)
{
float cos_pi = dot(Ng, I);
diff --git a/intern/cycles/kernel/kernel_lookup_table.h b/intern/cycles/kernel/kernel_lookup_table.h
new file mode 100644
index 00000000000..33d9d5ae1f0
--- /dev/null
+++ b/intern/cycles/kernel/kernel_lookup_table.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Interpolated lookup table access */
+
+ccl_device float lookup_table_read(const KernelGlobals *kg, float x, int offset, int size)
+{
+ x = saturate(x) * (size - 1);
+
+ int index = min(float_to_int(x), size - 1);
+ int nindex = min(index + 1, size - 1);
+ float t = x - index;
+
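+ /* Linearly interpolate between the two nearest entries; the early return avoids a second
+ * fetch when x falls exactly on an entry. */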
+ float data0 = kernel_tex_fetch(__lookup_table, index + offset);
+ if (t == 0.0f)
+ return data0;
+
+ float data1 = kernel_tex_fetch(__lookup_table, nindex + offset);
+ return (1.0f - t) * data0 + t * data1;
+}
+
+ccl_device float lookup_table_read_2D(
+ const KernelGlobals *kg, float x, float y, int offset, int xsize, int ysize)
+{
+ y = saturate(y) * (ysize - 1);
+
+ int index = min(float_to_int(y), ysize - 1);
+ int nindex = min(index + 1, ysize - 1);
+ float t = y - index;
+
+ float data0 = lookup_table_read(kg, x, offset + xsize * index, xsize);
+ if (t == 0.0f)
+ return data0;
+
+ float data1 = lookup_table_read(kg, x, offset + xsize * nindex, xsize);
+ return (1.0f - t) * data0 + t * data1;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_math.h b/intern/cycles/kernel/kernel_math.h
index 96391db7649..3c5ab95bbc8 100644
--- a/intern/cycles/kernel/kernel_math.h
+++ b/intern/cycles/kernel/kernel_math.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_MATH_H__
-#define __KERNEL_MATH_H__
+#pragma once
#include "util/util_color.h"
#include "util/util_math.h"
@@ -24,5 +23,3 @@
#include "util/util_projection.h"
#include "util/util_texture.h"
#include "util/util_transform.h"
-
-#endif /* __KERNEL_MATH_H__ */
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index ce37bd0b15e..b158f4c4fd3 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __KERNEL_MONTECARLO_CL__
-#define __KERNEL_MONTECARLO_CL__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -300,5 +299,3 @@ ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_MONTECARLO_CL__ */
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 8f58b8c3079..67466b28170 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -14,61 +14,52 @@
* limitations under the License.
*/
+#pragma once
+
+#include "kernel/geom/geom.h"
+
#include "kernel/kernel_id_passes.h"
+#include "kernel/kernel_write_passes.h"
CCL_NAMESPACE_BEGIN
-#ifdef __DENOISING_FEATURES__
-
-ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg,
- ccl_global float *buffer,
- int sample,
- float path_total,
- float path_total_shaded)
+/* Get pointer to pixel in render buffer. */
+ccl_device_forceinline ccl_global float *kernel_pass_pixel_render_buffer(
+ INTEGRATOR_STATE_CONST_ARGS, ccl_global float *ccl_restrict render_buffer)
{
- if (kernel_data.film.pass_denoising_data == 0)
- return;
-
- buffer += sample_is_even(kernel_data.integrator.sampling_pattern, sample) ?
- DENOISING_PASS_SHADOW_B :
- DENOISING_PASS_SHADOW_A;
-
- path_total = ensure_finite(path_total);
- path_total_shaded = ensure_finite(path_total_shaded);
-
- kernel_write_pass_float(buffer, path_total);
- kernel_write_pass_float(buffer + 1, path_total_shaded);
-
- float value = path_total_shaded / max(path_total, 1e-7f);
- kernel_write_pass_float(buffer + 2, value * value);
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
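+ /* Compute the offset in 64 bits: pixel index times pass stride can overflow 32 bits on large
+ * renders with many passes. */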
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ return render_buffer + render_buffer_offset;
}
-ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- PathRadiance *L)
+#ifdef __DENOISING_FEATURES__
+
+ccl_device_forceinline void kernel_write_denoising_features_surface(
+ INTEGRATOR_STATE_ARGS, const ShaderData *sd, ccl_global float *ccl_restrict render_buffer)
{
- if (state->denoising_feature_weight == 0.0f) {
+ if (!(INTEGRATOR_STATE(path, flag) & PATH_RAY_DENOISING_FEATURES)) {
return;
}
- L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
-
/* Skip implicitly transparent surfaces. */
if (sd->flag & SD_HAS_ONLY_VOLUME) {
return;
}
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
+
float3 normal = zero_float3();
float3 diffuse_albedo = zero_float3();
float3 specular_albedo = zero_float3();
float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
- if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
+ if (!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
continue;
+ }
/* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
normal += sc->N * sc->sample_weight;
@@ -106,140 +97,208 @@ ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
normal /= sum_weight;
}
- /* Transform normal into camera space. */
- const Transform worldtocamera = kernel_data.cam.worldtocamera;
- normal = transform_direction(&worldtocamera, normal);
+ if (kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
+ /* Transform normal into camera space. */
+ const Transform worldtocamera = kernel_data.cam.worldtocamera;
+ normal = transform_direction(&worldtocamera, normal);
+
+ const float3 denoising_normal = ensure_finite3(normal);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
+ }
- L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
- L->denoising_albedo += ensure_finite3(state->denoising_feature_weight *
- state->denoising_feature_throughput * diffuse_albedo);
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(path,
+ denoising_feature_throughput);
+ const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput *
+ diffuse_albedo);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
+ }
- state->denoising_feature_weight = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
}
else {
- state->denoising_feature_throughput *= specular_albedo;
+ INTEGRATOR_STATE_WRITE(path, denoising_feature_throughput) *= specular_albedo;
+ }
+}
+
+ccl_device_forceinline void kernel_write_denoising_features_volume(INTEGRATOR_STATE_ARGS,
+ const float3 albedo,
+ const bool scatter,
+ ccl_global float *ccl_restrict
+ render_buffer)
+{
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
+ const float3 denoising_feature_throughput = INTEGRATOR_STATE(path, denoising_feature_throughput);
+
+ if (scatter && kernel_data.film.pass_denoising_normal != PASS_UNUSED) {
+ /* Assume scatter is sufficiently diffuse to stop writing denoising features. */
+ INTEGRATOR_STATE_WRITE(path, flag) &= ~PATH_RAY_DENOISING_FEATURES;
+
+ /* Write view direction as normal. */
+ const float3 denoising_normal = make_float3(0.0f, 0.0f, -1.0f);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
+ }
+
+ if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
+ /* Write albedo. */
+ const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput * albedo);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
}
}
#endif /* __DENOISING_FEATURES__ */
-#ifdef __KERNEL_CPU__
-# define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) \
- kernel_write_id_pass_cpu(buffer, depth * 2, id, matte_weight, kg->coverage_##name)
-ccl_device_inline size_t kernel_write_id_pass_cpu(
- float *buffer, size_t depth, float id, float matte_weight, CoverageMap *map)
+#ifdef __SHADOW_CATCHER__
+
+/* Write shadow catcher passes on a bounce from the shadow catcher object. */
+ccl_device_forceinline void kernel_write_shadow_catcher_bounce_data(
+ INTEGRATOR_STATE_ARGS, const ShaderData *sd, ccl_global float *ccl_restrict render_buffer)
{
- if (map) {
- (*map)[id] += matte_weight;
- return 0;
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return;
+ }
+
+ kernel_assert(kernel_data.film.pass_shadow_catcher_sample_count != PASS_UNUSED);
+ kernel_assert(kernel_data.film.pass_shadow_catcher_matte != PASS_UNUSED);
+
+ if (!kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_PASS, sd->object_flag)) {
+ return;
}
-#else /* __KERNEL_CPU__ */
-# define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) \
- kernel_write_id_slots_gpu(buffer, depth * 2, id, matte_weight)
-ccl_device_inline size_t kernel_write_id_slots_gpu(ccl_global float *buffer,
- size_t depth,
- float id,
- float matte_weight)
+
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
+
+ /* Count sample for the shadow catcher object. */
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_sample_count, 1.0f);
+
+ /* Since the split is done, this sample no longer contributes to the matte, so accumulate
+ * it as transparency in the matte pass instead. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_shadow_catcher_matte + 3,
+ average(throughput));
+}
+
+#endif /* __SHADOW_CATCHER__ */
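The + 3 offset above relies on the matte being stored as an RGBA pass, so the write lands in its alpha component; accumulating average(throughput) there books the split-off sample as transparency. A minimal sketch of that bookkeeping (the four-float pass layout is inferred from the offset, not spelled out in this hunk):

/* Record a split-off shadow catcher sample as matte transparency, assuming an
 * RGBA pass layout where component 3 is alpha. */
static inline void write_matte_transparency(float *matte_pass, const float average_throughput)
{
  matte_pass[3] += average_throughput;
}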
+
+ccl_device_inline size_t kernel_write_id_pass(float *ccl_restrict buffer,
+ size_t depth,
+ float id,
+ float matte_weight)
{
-#endif /* __KERNEL_CPU__ */
- kernel_write_id_slots(buffer, depth, id, matte_weight);
- return depth * 2;
+ kernel_write_id_slots(buffer, depth * 2, id, matte_weight);
+ return depth * 4;
}
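The depth * 2 argument and the depth * 4 return value are both intentional: kernel_write_id_slots counts id/weight slots, while the caller advances its buffer pointer in floats, two per slot. A hedged sketch of the slot accumulation (the real kernel additionally keeps slots sorted by weight and uses atomics on the GPU):

/* Sketch only: accumulate (id, weight) into up to num_slots id/weight pairs,
 * matching an existing id or claiming the first empty pair. */
static void write_id_slots(float *buffer, const int num_slots, const float id, const float weight)
{
  for (int slot = 0; slot < num_slots; slot++) {
    float *pair = buffer + slot * 2;
    if (pair[1] == 0.0f) { /* Unused pair: claim it. */
      pair[0] = id;
      pair[1] = weight;
      return;
    }
    if (pair[0] == id) { /* Id seen before: accumulate its coverage weight. */
      pair[1] += weight;
      return;
    }
  }
  /* All pairs taken by other ids: the sample is dropped, which is the cost of a
   * fixed cryptomatte depth. */
}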
-ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg,
- ccl_global float *buffer,
- PathRadiance *L,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float3 throughput)
+ccl_device_inline void kernel_write_data_passes(INTEGRATOR_STATE_ARGS,
+ const ShaderData *sd,
+ ccl_global float *ccl_restrict render_buffer)
{
#ifdef __PASSES__
- int path_flag = state->flag;
+ const int path_flag = INTEGRATOR_STATE(path, flag);
- if (!(path_flag & PATH_RAY_CAMERA))
+ if (!(path_flag & PATH_RAY_CAMERA)) {
return;
+ }
- int flag = kernel_data.film.pass_flag;
- int light_flag = kernel_data.film.light_pass_flag;
+ const int flag = kernel_data.film.pass_flag;
- if (!((flag | light_flag) & PASS_ANY))
+ if (!(flag & PASS_ANY)) {
return;
+ }
+
+ ccl_global float *buffer = kernel_pass_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer);
if (!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
if (!(sd->flag & SD_TRANSPARENT) || kernel_data.film.pass_alpha_threshold == 0.0f ||
average(shader_bsdf_alpha(kg, sd)) >= kernel_data.film.pass_alpha_threshold) {
- if (state->sample == 0) {
+ if (INTEGRATOR_STATE(path, sample) == 0) {
if (flag & PASSMASK(DEPTH)) {
- float depth = camera_z_depth(kg, sd->P);
+ const float depth = camera_z_depth(kg, sd->P);
kernel_write_pass_float(buffer + kernel_data.film.pass_depth, depth);
}
if (flag & PASSMASK(OBJECT_ID)) {
- float id = object_pass_id(kg, sd->object);
+ const float id = object_pass_id(kg, sd->object);
kernel_write_pass_float(buffer + kernel_data.film.pass_object_id, id);
}
if (flag & PASSMASK(MATERIAL_ID)) {
- float id = shader_pass_id(kg, sd);
+ const float id = shader_pass_id(kg, sd);
kernel_write_pass_float(buffer + kernel_data.film.pass_material_id, id);
}
}
+ if (flag & PASSMASK(POSITION)) {
+ const float3 position = sd->P;
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_position, position);
+ }
if (flag & PASSMASK(NORMAL)) {
- float3 normal = shader_bsdf_average_normal(kg, sd);
+ const float3 normal = shader_bsdf_average_normal(kg, sd);
kernel_write_pass_float3(buffer + kernel_data.film.pass_normal, normal);
}
+ if (flag & PASSMASK(ROUGHNESS)) {
+ const float roughness = shader_bsdf_average_roughness(sd);
+ kernel_write_pass_float(buffer + kernel_data.film.pass_roughness, roughness);
+ }
if (flag & PASSMASK(UV)) {
- float3 uv = primitive_uv(kg, sd);
+ const float3 uv = primitive_uv(kg, sd);
kernel_write_pass_float3(buffer + kernel_data.film.pass_uv, uv);
}
if (flag & PASSMASK(MOTION)) {
- float4 speed = primitive_motion_vector(kg, sd);
+ const float4 speed = primitive_motion_vector(kg, sd);
kernel_write_pass_float4(buffer + kernel_data.film.pass_motion, speed);
kernel_write_pass_float(buffer + kernel_data.film.pass_motion_weight, 1.0f);
}
- state->flag |= PATH_RAY_SINGLE_PASS_DONE;
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SINGLE_PASS_DONE;
}
}
if (kernel_data.film.cryptomatte_passes) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
const float matte_weight = average(throughput) *
(1.0f - average(shader_bsdf_transparency(kg, sd)));
if (matte_weight > 0.0f) {
ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
- float id = object_cryptomatte_id(kg, sd->object);
- cryptomatte_buffer += WRITE_ID_SLOT(
- cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, object);
+ const float id = object_cryptomatte_id(kg, sd->object);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
}
if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
- float id = shader_cryptomatte_id(kg, sd->shader);
- cryptomatte_buffer += WRITE_ID_SLOT(
- cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, material);
+ const float id = shader_cryptomatte_id(kg, sd->shader);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
}
if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
- float id = object_cryptomatte_asset_id(kg, sd->object);
- cryptomatte_buffer += WRITE_ID_SLOT(
- cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, asset);
+ const float id = object_cryptomatte_asset_id(kg, sd->object);
+ cryptomatte_buffer += kernel_write_id_pass(
+ cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight);
}
}
}
- if (light_flag & PASSMASK_COMPONENT(DIFFUSE))
- L->color_diffuse += shader_bsdf_diffuse(kg, sd) * throughput;
- if (light_flag & PASSMASK_COMPONENT(GLOSSY))
- L->color_glossy += shader_bsdf_glossy(kg, sd) * throughput;
- if (light_flag & PASSMASK_COMPONENT(TRANSMISSION))
- L->color_transmission += shader_bsdf_transmission(kg, sd) * throughput;
-
- if (light_flag & PASSMASK(MIST)) {
- /* bring depth into 0..1 range */
- float mist_start = kernel_data.film.mist_start;
- float mist_inv_depth = kernel_data.film.mist_inv_depth;
+ if (flag & PASSMASK(DIFFUSE_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color,
+ shader_bsdf_diffuse(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(GLOSSY_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color,
+ shader_bsdf_glossy(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(TRANSMISSION_COLOR)) {
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color,
+ shader_bsdf_transmission(kg, sd) * throughput);
+ }
+ if (flag & PASSMASK(MIST)) {
+ /* Bring depth into 0..1 range. */
+ const float mist_start = kernel_data.film.mist_start;
+ const float mist_inv_depth = kernel_data.film.mist_inv_depth;
- float depth = camera_distance(kg, sd->P);
+ const float depth = camera_distance(kg, sd->P);
float mist = saturate((depth - mist_start) * mist_inv_depth);
- /* falloff */
- float mist_falloff = kernel_data.film.mist_falloff;
+ /* Falloff. */
+ const float mist_falloff = kernel_data.film.mist_falloff;
if (mist_falloff == 1.0f)
;
@@ -250,158 +309,17 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg,
else
mist = powf(mist, mist_falloff);
- /* modulate by transparency */
- float3 alpha = shader_bsdf_alpha(kg, sd);
- L->mist += (1.0f - mist) * average(throughput * alpha);
- }
-#endif
-}
+ /* Modulate by transparency. */
+ const float3 throughput = INTEGRATOR_STATE(path, throughput);
+ const float3 alpha = shader_bsdf_alpha(kg, sd);
+ const float mist_output = (1.0f - mist) * average(throughput * alpha);
-ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg,
- ccl_global float *buffer,
- PathRadiance *L)
-{
-#ifdef __PASSES__
- int light_flag = kernel_data.film.light_pass_flag;
-
- if (!kernel_data.film.use_light_pass)
- return;
-
- if (light_flag & PASSMASK(DIFFUSE_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_indirect, L->indirect_diffuse);
- if (light_flag & PASSMASK(GLOSSY_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_indirect, L->indirect_glossy);
- if (light_flag & PASSMASK(TRANSMISSION_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_indirect,
- L->indirect_transmission);
- if (light_flag & PASSMASK(VOLUME_INDIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_indirect, L->indirect_volume);
- if (light_flag & PASSMASK(DIFFUSE_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_direct, L->direct_diffuse);
- if (light_flag & PASSMASK(GLOSSY_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_direct, L->direct_glossy);
- if (light_flag & PASSMASK(TRANSMISSION_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_direct,
- L->direct_transmission);
- if (light_flag & PASSMASK(VOLUME_DIRECT))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_volume_direct, L->direct_volume);
-
- if (light_flag & PASSMASK(EMISSION))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_emission, L->emission);
- if (light_flag & PASSMASK(BACKGROUND))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_background, L->background);
- if (light_flag & PASSMASK(AO))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_ao, L->ao);
-
- if (light_flag & PASSMASK(DIFFUSE_COLOR))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_diffuse_color, L->color_diffuse);
- if (light_flag & PASSMASK(GLOSSY_COLOR))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_glossy_color, L->color_glossy);
- if (light_flag & PASSMASK(TRANSMISSION_COLOR))
- kernel_write_pass_float3(buffer + kernel_data.film.pass_transmission_color,
- L->color_transmission);
- if (light_flag & PASSMASK(SHADOW)) {
- float3 shadow = L->shadow;
- kernel_write_pass_float4(
- buffer + kernel_data.film.pass_shadow,
- make_float4(shadow.x, shadow.y, shadow.z, kernel_data.film.pass_shadow_scale));
+ /* Note that the final value we want in the render buffer is 1 - mist_output. To avoid
+ * having to track this in the integrator state, we do the negation after rendering. */
+ kernel_write_pass_float(buffer + kernel_data.film.pass_mist, mist_output);
}
- if (light_flag & PASSMASK(MIST))
- kernel_write_pass_float(buffer + kernel_data.film.pass_mist, 1.0f - L->mist);
#endif
}
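Together with the two fast-path branches elided by the hunk header above (in the surrounding source they special-case falloff values of 2.0 and 0.5; an assumption here), the mist mapping amounts to this self-contained function:

#include <cmath>

static inline float saturate(const float v)
{
  return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v);
}

/* Map camera distance into a 0..1 mist factor shaped by a falloff exponent. */
static float mist_value(const float depth,
                        const float mist_start,
                        const float mist_inv_depth,
                        const float mist_falloff)
{
  float mist = saturate((depth - mist_start) * mist_inv_depth);
  if (mist_falloff == 2.0f) {
    mist = mist * mist; /* Fast path for the common quadratic falloff. */
  }
  else if (mist_falloff == 0.5f) {
    mist = std::sqrt(mist);
  }
  else if (mist_falloff != 1.0f) {
    mist = std::pow(mist, mist_falloff);
  }
  return mist;
}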
-ccl_device_inline void kernel_write_result(KernelGlobals *kg,
- ccl_global float *buffer,
- int sample,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_WRITE_RESULT);
- PROFILING_OBJECT(PRIM_NONE);
-
- float alpha;
- float3 L_sum = path_radiance_clamp_and_sum(kg, L, &alpha);
-
- if (kernel_data.film.pass_flag & PASSMASK(COMBINED)) {
- kernel_write_pass_float4(buffer, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
- }
-
- kernel_write_light_passes(kg, buffer, L);
-
-#ifdef __DENOISING_FEATURES__
- if (kernel_data.film.pass_denoising_data) {
-# ifdef __SHADOW_TRICKS__
- kernel_write_denoising_shadow(kg,
- buffer + kernel_data.film.pass_denoising_data,
- sample,
- average(L->path_total),
- average(L->path_total_shaded));
-# else
- kernel_write_denoising_shadow(
- kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
-# endif
- if (kernel_data.film.pass_denoising_clean) {
- float3 noisy, clean;
- path_radiance_split_denoising(kg, L, &noisy, &clean);
- kernel_write_pass_float3_variance(
- buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR, noisy);
- kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean, clean);
- }
- else {
- kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data +
- DENOISING_PASS_COLOR,
- ensure_finite3(L_sum));
- }
-
- kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data +
- DENOISING_PASS_NORMAL,
- L->denoising_normal);
- kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data +
- DENOISING_PASS_ALBEDO,
- L->denoising_albedo);
- kernel_write_pass_float_variance(
- buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH, L->denoising_depth);
- }
-#endif /* __DENOISING_FEATURES__ */
-
- /* Adaptive Sampling. Fill the additional buffer with the odd samples and calculate our stopping
- criteria. This is the heuristic from "A hierarchical automatic stopping condition for Monte
- Carlo global illumination" except that here it is applied per pixel and not in hierarchical
- tiles. */
- if (kernel_data.film.pass_adaptive_aux_buffer &&
- kernel_data.integrator.adaptive_threshold > 0.0f) {
- if (sample_is_even(kernel_data.integrator.sampling_pattern, sample)) {
- kernel_write_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer,
- make_float4(L_sum.x * 2.0f, L_sum.y * 2.0f, L_sum.z * 2.0f, 0.0f));
- }
-#ifdef __KERNEL_CPU__
- if ((sample > kernel_data.integrator.adaptive_min_samples) &&
- kernel_data.integrator.adaptive_stop_per_sample) {
- const int step = kernel_data.integrator.adaptive_step;
-
- if ((sample & (step - 1)) == (step - 1)) {
- kernel_do_adaptive_stopping(kg, buffer, sample);
- }
- }
-#endif
- }
-
- /* Write the sample count as negative numbers initially to mark the samples as in progress.
- * Once the tile has finished rendering, the sign gets flipped and all the pixel values
- * are scaled as if they were taken at a uniform sample count. */
- if (kernel_data.film.pass_sample_count) {
- /* Make sure it's a negative number. In progressive refine mode, this bit gets flipped between
- * passes. */
-#ifdef __ATOMIC_PASS_WRITE__
- atomic_fetch_and_or_uint32((ccl_global uint *)(buffer + kernel_data.film.pass_sample_count),
- 0x80000000);
-#else
- if (buffer[kernel_data.film.pass_sample_count] > 0) {
- buffer[kernel_data.film.pass_sample_count] *= -1.0f;
- }
-#endif
- kernel_write_pass_float(buffer + kernel_data.film.pass_sample_count, -1.0f);
- }
-}
-
CCL_NAMESPACE_END
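Two details of the deleted kernel_write_result are worth spelling out. Even samples are written doubled into the adaptive-sampling aux buffer so a half-sample mean can be compared against the full mean for the stopping criterion, and an in-progress pixel is marked by forcing its float sample count negative. The sign trick in isolation (a sketch; the bit-aliasing mirrors the atomic_fetch_and_or_uint32 call above):

#include <atomic>
#include <cstdint>
#include <cstring>

/* Mark a float-typed sample counter negative by atomically setting its IEEE-754
 * sign bit. Sketch only; assumes 32-bit floats aliased as uint32_t. */
static void mark_sample_count_in_progress(std::atomic<uint32_t> *count_bits)
{
  count_bits->fetch_or(0x80000000u);
}

static float sample_count_value(const uint32_t bits)
{
  float value;
  std::memcpy(&value, &bits, sizeof(value)); /* Reinterpret without aliasing UB. */
  return value; /* Negative while in progress; the sign is flipped back when the tile finishes. */
}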
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
deleted file mode 100644
index 92a097de9e1..00000000000
--- a/intern/cycles/kernel/kernel_path.h
+++ /dev/null
@@ -1,709 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef __OSL__
-# include "kernel/osl/osl_shader.h"
-#endif
-
-// clang-format off
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_projection.h"
-#include "kernel/kernel_montecarlo.h"
-#include "kernel/kernel_differential.h"
-#include "kernel/kernel_camera.h"
-
-#include "kernel/geom/geom.h"
-#include "kernel/bvh/bvh.h"
-
-#include "kernel/kernel_write_passes.h"
-#include "kernel/kernel_accumulate.h"
-#include "kernel/kernel_shader.h"
-#include "kernel/kernel_light.h"
-#include "kernel/kernel_adaptive_sampling.h"
-#include "kernel/kernel_passes.h"
-
-#if defined(__VOLUME__) || defined(__SUBSURFACE__)
-# include "kernel/kernel_volume.h"
-#endif
-
-#ifdef __SUBSURFACE__
-# include "kernel/kernel_subsurface.h"
-#endif
-
-#include "kernel/kernel_path_state.h"
-#include "kernel/kernel_shadow.h"
-#include "kernel/kernel_emission.h"
-#include "kernel/kernel_path_common.h"
-#include "kernel/kernel_path_surface.h"
-#include "kernel/kernel_path_volume.h"
-#include "kernel/kernel_path_subsurface.h"
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_forceinline bool kernel_path_scene_intersect(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- Intersection *isect,
- PathRadiance *L,
- const int last_object)
-{
- PROFILING_INIT(kg, PROFILING_SCENE_INTERSECT);
-
- uint visibility = path_state_ray_visibility(kg, state);
-
- if (path_state_ao_bounce(kg, state)) {
- ray->t = kernel_data.background.ao_distance;
- if (last_object != OBJECT_NONE) {
- const float object_ao_distance = kernel_tex_fetch(__objects, last_object).ao_distance;
- if (object_ao_distance != 0.0f) {
- ray->t = object_ao_distance;
- }
- }
- }
-
- bool hit = scene_intersect(kg, ray, visibility, isect);
-
- return hit;
-}
-
-ccl_device_forceinline void kernel_path_lamp_emission(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 throughput,
- ccl_addr_space Intersection *isect,
- ShaderData *emission_sd,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_INDIRECT_EMISSION);
-
-#ifdef __LAMP_MIS__
- if (kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
- /* ray starting from previous non-transparent bounce */
- Ray light_ray ccl_optional_struct_init;
-
- light_ray.P = ray->P - state->ray_t * ray->D;
- state->ray_t += isect->t;
- light_ray.D = ray->D;
- light_ray.t = state->ray_t;
- light_ray.time = ray->time;
- light_ray.dD = ray->dD;
- light_ray.dP = ray->dP;
-
- /* intersect with lamp */
- indirect_lamp_emission(kg, emission_sd, state, L, &light_ray, throughput);
- }
-#endif /* __LAMP_MIS__ */
-}
-
-ccl_device_forceinline void kernel_path_background(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- float3 throughput,
- ShaderData *sd,
- ccl_global float *buffer,
- PathRadiance *L)
-{
- /* eval background shader if nothing hit */
- if (kernel_data.background.transparent && (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
- L->transparent += average(throughput);
-
-#ifdef __PASSES__
- if (!(kernel_data.film.light_pass_flag & PASSMASK(BACKGROUND)))
-#endif /* __PASSES__ */
- return;
- }
-
- /* When using the ao bounces approximation, adjust background
- * shader intensity with ao factor. */
- if (path_state_ao_bounce(kg, state)) {
- throughput *= kernel_data.background.ao_bounces_factor;
- }
-
-#ifdef __BACKGROUND__
- /* sample background shader */
- float3 L_background = indirect_background(kg, sd, state, buffer, ray);
- path_radiance_accum_background(kg, L, state, throughput, L_background);
-#endif /* __BACKGROUND__ */
-}
-
-#ifndef __SPLIT_KERNEL__
-
-# ifdef __VOLUME__
-ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(KernelGlobals *kg,
- ShaderData *sd,
- PathState *state,
- Ray *ray,
- float3 *throughput,
- ccl_addr_space Intersection *isect,
- bool hit,
- ShaderData *emission_sd,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_VOLUME);
-
- /* Sanitize volume stack. */
- if (!hit) {
- kernel_volume_clean_stack(kg, state->volume_stack);
- }
-
- if (state->volume_stack[0].shader == SHADER_NONE) {
- return VOLUME_PATH_ATTENUATED;
- }
-
- /* volume attenuation, emission, scatter */
- Ray volume_ray = *ray;
- volume_ray.t = (hit) ? isect->t : FLT_MAX;
-
- float step_size = volume_stack_step_size(kg, state->volume_stack);
-
-# ifdef __VOLUME_DECOUPLED__
- int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
- bool direct = (state->flag & PATH_RAY_CAMERA) != 0;
- bool decoupled = kernel_volume_use_decoupled(kg, step_size, direct, sampling_method);
-
- if (decoupled) {
- /* cache steps along volume for repeated sampling */
- VolumeSegment volume_segment;
-
- shader_setup_from_volume(kg, sd, &volume_ray);
- kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size);
-
- volume_segment.sampling_method = sampling_method;
-
- /* emission */
- if (volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission);
-
- /* scattering */
- VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
-
- if (volume_segment.closure_flag & SD_SCATTER) {
- int all = kernel_data.integrator.sample_all_lights_indirect;
-
- /* direct light sampling */
- kernel_branched_path_volume_connect_light(
- kg, sd, emission_sd, *throughput, state, L, all, &volume_ray, &volume_segment);
-
- /* indirect sample. if we use distance sampling and take just
- * one sample for direct and indirect light, we could share
- * this computation, but makes code a bit complex */
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
-
- result = kernel_volume_decoupled_scatter(
- kg, state, &volume_ray, sd, throughput, rphase, rscatter, &volume_segment, NULL, true);
- }
-
- /* free cached steps */
- kernel_volume_decoupled_free(kg, &volume_segment);
-
- if (result == VOLUME_PATH_SCATTERED) {
- if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
- return VOLUME_PATH_SCATTERED;
- else
- return VOLUME_PATH_MISSED;
- }
- else {
- *throughput *= volume_segment.accum_transmittance;
- }
- }
- else
-# endif /* __VOLUME_DECOUPLED__ */
- {
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
-
- /* indirect light bounce */
- if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray))
- return VOLUME_PATH_SCATTERED;
- else
- return VOLUME_PATH_MISSED;
- }
-# endif /* __VOLUME_SCATTER__ */
- }
-
- return VOLUME_PATH_ATTENUATED;
-}
-# endif /* __VOLUME__ */
-
-#endif /* __SPLIT_KERNEL__ */
-
-ccl_device_forceinline bool kernel_path_shader_apply(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- float3 throughput,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_global float *buffer)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_APPLY);
-
-#ifdef __SHADOW_TRICKS__
- if (sd->object_flag & SD_OBJECT_SHADOW_CATCHER) {
- if (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND) {
- state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_STORE_SHADOW_INFO);
-
- float3 bg = zero_float3();
- if (!kernel_data.background.transparent) {
- bg = indirect_background(kg, emission_sd, state, NULL, ray);
- }
- path_radiance_accum_shadowcatcher(L, throughput, bg);
- }
- }
- else if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- /* Only update transparency after shadow catcher bounce. */
- L->shadow_transparency *= average(shader_bsdf_transparency(kg, sd));
- }
-#endif /* __SHADOW_TRICKS__ */
-
- /* holdout */
-#ifdef __HOLDOUT__
- if (((sd->flag & SD_HOLDOUT) || (sd->object_flag & SD_OBJECT_HOLDOUT_MASK)) &&
- (state->flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
- const float3 holdout_weight = shader_holdout_apply(kg, sd);
- if (kernel_data.background.transparent) {
- L->transparent += average(holdout_weight * throughput);
- }
- if (isequal_float3(holdout_weight, one_float3())) {
- return false;
- }
- }
-#endif /* __HOLDOUT__ */
-
- /* holdout mask objects do not write data passes */
- kernel_write_data_passes(kg, buffer, L, sd, state, throughput);
-
- /* blurring of bsdf after bounces, for rays that have a small likelihood
- * of following this particular path (diffuse, rough glossy) */
- if (kernel_data.integrator.filter_glossy != FLT_MAX) {
- float blur_pdf = kernel_data.integrator.filter_glossy * state->min_ray_pdf;
-
- if (blur_pdf < 1.0f) {
- float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f;
- shader_bsdf_blur(kg, sd, blur_roughness);
- }
- }
-
-#ifdef __EMISSION__
- /* emission */
- if (sd->flag & SD_EMISSION) {
- float3 emission = indirect_primitive_emission(
- kg, sd, sd->ray_length, state->flag, state->ray_pdf);
- path_radiance_accum_emission(kg, L, state, throughput, emission);
- }
-#endif /* __EMISSION__ */
-
- return true;
-}
-
-#ifdef __KERNEL_OPTIX__
-ccl_device_inline /* inline trace calls */
-#else
-ccl_device_noinline
-#endif
- void
- kernel_path_ao(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput,
- float3 ao_alpha)
-{
- PROFILING_INIT(kg, PROFILING_AO);
-
- /* todo: solve correlation */
- float bsdf_u, bsdf_v;
-
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- float3 ao_D;
- float ao_pdf;
-
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray light_ray;
- float3 ao_shadow;
-
- light_ray.P = ray_offset(sd->P, sd->Ng);
- light_ray.D = ao_D;
- light_ray.t = kernel_data.background.ao_distance;
- light_ray.time = sd->time;
- light_ray.dP = sd->dP;
- light_ray.dD = differential3_zero();
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(kg, L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
- }
- else {
- path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
- }
- }
-}
-
-#ifndef __SPLIT_KERNEL__
-
-# if defined(__BRANCHED_PATH__) || defined(__BAKING__)
-
-ccl_device void kernel_path_indirect(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- Ray *ray,
- float3 throughput,
- PathState *state,
- PathRadiance *L,
- const int last_object)
-{
-# ifdef __SUBSURFACE__
- SubsurfaceIndirectRays ss_indirect;
- kernel_path_subsurface_init_indirect(&ss_indirect);
-
- for (;;) {
-# endif /* __SUBSURFACE__ */
-
- /* path iteration */
- for (;;) {
- /* Find intersection with objects in scene. */
- Intersection isect;
- bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L, last_object);
-
- /* Find intersection with lamps and compute emission for MIS. */
- kernel_path_lamp_emission(kg, state, ray, throughput, &isect, sd, L);
-
-# ifdef __VOLUME__
- /* Volume integration. */
- VolumeIntegrateResult result = kernel_path_volume(
- kg, sd, state, ray, &throughput, &isect, hit, emission_sd, L);
-
- if (result == VOLUME_PATH_SCATTERED) {
- continue;
- }
- else if (result == VOLUME_PATH_MISSED) {
- break;
- }
-# endif /* __VOLUME__*/
-
- /* Shade background. */
- if (!hit) {
- kernel_path_background(kg, state, ray, throughput, sd, NULL, L);
- break;
- }
- else if (path_state_ao_bounce(kg, state)) {
- if (intersection_get_shader_flags(kg, &isect) &
- (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
- }
- else {
- break;
- }
- }
-
- /* Setup shader data. */
- shader_setup_from_ray(kg, sd, &isect, ray);
-
- /* Skip most work for volume bounding surface. */
-# ifdef __VOLUME__
- if (!(sd->flag & SD_HAS_ONLY_VOLUME)) {
-# endif
-
- /* Evaluate shader. */
- shader_eval_surface(kg, sd, state, NULL, state->flag);
- shader_prepare_closures(sd, state);
-
- /* Apply shadow catcher, holdout, emission. */
- if (!kernel_path_shader_apply(kg, sd, state, ray, throughput, emission_sd, L, NULL)) {
- break;
- }
-
- /* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_continuation_probability(kg, state, throughput);
-
- if (probability == 0.0f) {
- break;
- }
- else if (probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
-
- if (terminate >= probability)
- break;
-
- throughput /= probability;
- }
-
-# ifdef __DENOISING_FEATURES__
- kernel_update_denoising_features(kg, sd, state, L);
-# endif
-
-# ifdef __AO__
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- kernel_path_ao(kg, sd, emission_sd, L, state, throughput, zero_float3());
- }
-# endif /* __AO__ */
-
-# ifdef __SUBSURFACE__
- /* bssrdf scatter to a different location on the same object, replacing
- * the closures with a diffuse BSDF */
- if (sd->flag & SD_BSSRDF) {
- if (kernel_path_subsurface_scatter(
- kg, sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-
-# if defined(__EMISSION__)
- int all = (kernel_data.integrator.sample_all_lights_indirect) ||
- (state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(
- kg, sd, emission_sd, state, throughput, 1.0f, L, all);
-# endif /* defined(__EMISSION__) */
-
-# ifdef __VOLUME__
- }
-# endif
-
- if (!kernel_path_surface_bounce(kg, sd, &throughput, state, &L->state, ray))
- break;
- }
-
-# ifdef __SUBSURFACE__
- /* Trace indirect subsurface rays by restarting the loop. this uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if (ss_indirect.num_rays) {
- kernel_path_subsurface_setup_indirect(kg, &ss_indirect, state, ray, L, &throughput);
- }
- else {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-}
-
-# endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
-
-ccl_device_forceinline void kernel_path_integrate(KernelGlobals *kg,
- PathState *state,
- float3 throughput,
- Ray *ray,
- PathRadiance *L,
- ccl_global float *buffer,
- ShaderData *emission_sd)
-{
- PROFILING_INIT(kg, PROFILING_PATH_INTEGRATE);
-
- /* Shader data memory used for both volumes and surfaces, saves stack space. */
- ShaderData sd;
-
-# ifdef __SUBSURFACE__
- SubsurfaceIndirectRays ss_indirect;
- kernel_path_subsurface_init_indirect(&ss_indirect);
-
- for (;;) {
-# endif /* __SUBSURFACE__ */
-
- /* path iteration */
- for (;;) {
- /* Find intersection with objects in scene. */
- Intersection isect;
- bool hit = kernel_path_scene_intersect(kg, state, ray, &isect, L, sd.object);
-
- /* Find intersection with lamps and compute emission for MIS. */
- kernel_path_lamp_emission(kg, state, ray, throughput, &isect, &sd, L);
-
-# ifdef __VOLUME__
- /* Volume integration. */
- VolumeIntegrateResult result = kernel_path_volume(
- kg, &sd, state, ray, &throughput, &isect, hit, emission_sd, L);
-
- if (result == VOLUME_PATH_SCATTERED) {
- continue;
- }
- else if (result == VOLUME_PATH_MISSED) {
- break;
- }
-# endif /* __VOLUME__*/
-
- /* Shade background. */
- if (!hit) {
- kernel_path_background(kg, state, ray, throughput, &sd, buffer, L);
- break;
- }
- else if (path_state_ao_bounce(kg, state)) {
- if (intersection_get_shader_flags(kg, &isect) &
- (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
- }
- else {
- break;
- }
- }
-
- /* Setup shader data. */
- shader_setup_from_ray(kg, &sd, &isect, ray);
-
- /* Skip most work for volume bounding surface. */
-# ifdef __VOLUME__
- if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-# endif
-
- /* Evaluate shader. */
- shader_eval_surface(kg, &sd, state, buffer, state->flag);
- shader_prepare_closures(&sd, state);
-
- /* Apply shadow catcher, holdout, emission. */
- if (!kernel_path_shader_apply(kg, &sd, state, ray, throughput, emission_sd, L, buffer)) {
- break;
- }
-
- /* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_continuation_probability(kg, state, throughput);
-
- if (probability == 0.0f) {
- break;
- }
- else if (probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
- if (terminate >= probability)
- break;
-
- throughput /= probability;
- }
-
-# ifdef __DENOISING_FEATURES__
- kernel_update_denoising_features(kg, &sd, state, L);
-# endif
-
-# ifdef __AO__
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- kernel_path_ao(kg, &sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, &sd));
- }
-# endif /* __AO__ */
-
-# ifdef __SUBSURFACE__
- /* bssrdf scatter to a different location on the same object, replacing
- * the closures with a diffuse BSDF */
- if (sd.flag & SD_BSSRDF) {
- if (kernel_path_subsurface_scatter(
- kg, &sd, emission_sd, L, state, ray, &throughput, &ss_indirect)) {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-
-# ifdef __EMISSION__
- /* direct lighting */
- kernel_path_surface_connect_light(kg, &sd, emission_sd, throughput, state, L);
-# endif /* __EMISSION__ */
-
-# ifdef __VOLUME__
- }
-# endif
-
- /* compute direct lighting and next bounce */
- if (!kernel_path_surface_bounce(kg, &sd, &throughput, state, &L->state, ray))
- break;
- }
-
-# ifdef __SUBSURFACE__
- /* Trace indirect subsurface rays by restarting the loop. this uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if (ss_indirect.num_rays) {
- kernel_path_subsurface_setup_indirect(kg, &ss_indirect, state, ray, L, &throughput);
- }
- else {
- break;
- }
- }
-# endif /* __SUBSURFACE__ */
-}
-
-ccl_device void kernel_path_trace(
- KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
-{
- PROFILING_INIT(kg, PROFILING_RAY_SETUP);
-
- /* buffer offset */
- int index = offset + x + y * stride;
- int pass_stride = kernel_data.film.pass_stride;
-
- buffer += index * pass_stride;
-
- if (kernel_data.film.pass_adaptive_aux_buffer) {
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w > 0.0f) {
- return;
- }
- }
-
- /* Initialize random numbers and sample ray. */
- uint rng_hash;
- Ray ray;
-
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
-
- if (ray.t == 0.0f) {
- return;
- }
-
- /* Initialize state. */
- float3 throughput = one_float3();
-
- PathRadiance L;
- path_radiance_init(kg, &L);
-
- ShaderDataTinyStorage emission_sd_storage;
- ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
-
- PathState state;
- path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
-
-# ifdef __KERNEL_OPTIX__
- /* Force struct into local memory to avoid costly spilling on trace calls. */
- if (pass_stride < 0) /* This is never executed and just prevents the compiler from doing SROA. */
- for (int i = 0; i < sizeof(L); ++i)
- reinterpret_cast<unsigned char *>(&L)[-pass_stride + i] = 0;
-# endif
-
- /* Integrate. */
- kernel_path_integrate(kg, &state, throughput, &ray, &L, buffer, emission_sd);
-
- kernel_write_result(kg, buffer, sample, &L);
-}
-
-#endif /* __SPLIT_KERNEL__ */
-
-CCL_NAMESPACE_END
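Both deleted path loops share the same termination idiom; restated in isolation it is plain Russian roulette (a sketch, with rng_value standing in for the PRNG_TERMINATE draw):

/* Continue a path with probability p and boost survivors by 1/p, so the expected
 * contribution is unchanged and termination stays unbiased. */
static bool russian_roulette(const float probability, const float rng_value, float *throughput_scale)
{
  *throughput_scale = 1.0f;
  if (probability == 0.0f) {
    return false; /* Terminate unconditionally. */
  }
  if (probability < 1.0f) {
    if (rng_value >= probability) {
      return false; /* Terminated by the roulette. */
    }
    *throughput_scale = 1.0f / probability; /* Caller multiplies throughput by this. */
  }
  return true;
}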
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
deleted file mode 100644
index a1ee1bc107e..00000000000
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __BRANCHED_PATH__
-
-ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- float3 throughput)
-{
- int num_samples = kernel_data.integrator.ao_samples;
- float num_samples_inv = 1.0f / num_samples;
- float ao_factor = kernel_data.background.ao_factor;
- float3 ao_N;
- float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
- float3 ao_alpha = shader_bsdf_alpha(kg, sd);
-
- for (int j = 0; j < num_samples; j++) {
- float bsdf_u, bsdf_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, j, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
-
- float3 ao_D;
- float ao_pdf;
-
- sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
-
- if (dot(sd->Ng, ao_D) > 0.0f && ao_pdf != 0.0f) {
- Ray light_ray;
- float3 ao_shadow;
-
- light_ray.P = ray_offset(sd->P, sd->Ng);
- light_ray.D = ao_D;
- light_ray.t = kernel_data.background.ao_distance;
- light_ray.time = sd->time;
- light_ray.dP = sd->dP;
- light_ray.dD = differential3_zero();
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &light_ray, &ao_shadow)) {
- path_radiance_accum_ao(
- kg, L, state, throughput * num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
- }
- else {
- path_radiance_accum_total_ao(L, state, throughput * num_samples_inv, ao_bsdf);
- }
- }
- }
-}
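The AO loop above leans on sample_cos_hemisphere, whose importance-sampled pdf of cos(theta)/pi cancels the cosine term of the estimator, leaving a plain visibility average. A self-contained sketch of that sampling (the basis construction is an assumption; Cycles has its own make_orthonormals helper):

#include <cmath>

struct vec3 {
  float x, y, z;
};

/* Build an orthonormal basis around a unit normal n (branch-free Frisvad variant). */
static void make_basis(const vec3 n, vec3 *t, vec3 *b)
{
  const float sign = n.z >= 0.0f ? 1.0f : -1.0f;
  const float a = -1.0f / (sign + n.z);
  const float c = n.x * n.y * a;
  *t = {1.0f + sign * n.x * n.x * a, sign * c, -sign * n.x};
  *b = {c, sign + n.y * n.y * a, -n.y};
}

/* Cosine-weighted hemisphere sample around n, with pdf = cos(theta) / pi. */
static vec3 cos_hemisphere_sample(const vec3 n, const float u, const float v, float *pdf)
{
  const float k_pi = 3.14159265358979323846f;
  const float r = std::sqrt(u);
  const float phi = 2.0f * k_pi * v;
  const float cos_theta = std::sqrt(1.0f - u); /* r^2 + cos_theta^2 == 1. */
  const float lx = r * std::cos(phi);
  const float ly = r * std::sin(phi);
  vec3 t, b;
  make_basis(n, &t, &b);
  *pdf = cos_theta / k_pi;
  return {t.x * lx + b.x * ly + n.x * cos_theta,
          t.y * lx + b.y * ly + n.y * cos_theta,
          t.z * lx + b.z * ly + n.z * cos_theta};
}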
-
-# ifndef __SPLIT_KERNEL__
-
-# ifdef __VOLUME__
-ccl_device_forceinline void kernel_branched_path_volume(KernelGlobals *kg,
- ShaderData *sd,
- PathState *state,
- Ray *ray,
- float3 *throughput,
- ccl_addr_space Intersection *isect,
- bool hit,
- ShaderData *indirect_sd,
- ShaderData *emission_sd,
- PathRadiance *L)
-{
- /* Sanitize volume stack. */
- if (!hit) {
- kernel_volume_clean_stack(kg, state->volume_stack);
- }
-
- if (state->volume_stack[0].shader == SHADER_NONE) {
- return;
- }
-
- /* volume attenuation, emission, scatter */
- Ray volume_ray = *ray;
- volume_ray.t = (hit) ? isect->t : FLT_MAX;
-
- float step_size = volume_stack_step_size(kg, state->volume_stack);
- const int object = sd->object;
-
-# ifdef __VOLUME_DECOUPLED__
- /* decoupled ray marching only supported on CPU */
- if (kernel_data.integrator.volume_decoupled) {
- /* cache steps along volume for repeated sampling */
- VolumeSegment volume_segment;
-
- shader_setup_from_volume(kg, sd, &volume_ray);
- kernel_volume_decoupled_record(kg, state, &volume_ray, sd, &volume_segment, step_size);
-
- /* direct light sampling */
- if (volume_segment.closure_flag & SD_SCATTER) {
- volume_segment.sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
-
- int all = kernel_data.integrator.sample_all_lights_direct;
-
- kernel_branched_path_volume_connect_light(
- kg, sd, emission_sd, *throughput, state, L, all, &volume_ray, &volume_segment);
-
- /* indirect light sampling */
- int num_samples = kernel_data.integrator.volume_samples;
- float num_samples_inv = 1.0f / num_samples;
-
- for (int j = 0; j < num_samples; j++) {
- PathState ps = *state;
- Ray pray = *ray;
- float3 tp = *throughput;
-
- /* branch RNG state */
- path_state_branch(&ps, j, num_samples);
-
- /* scatter sample. if we use distance sampling and take just one
- * sample for direct and indirect light, we could share this
- * computation, but makes code a bit complex */
- float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL);
- float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE);
-
- VolumeIntegrateResult result = kernel_volume_decoupled_scatter(
- kg, &ps, &pray, sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
-
- if (result == VOLUME_PATH_SCATTERED &&
- kernel_path_volume_bounce(kg, sd, &tp, &ps, &L->state, &pray)) {
- kernel_path_indirect(
- kg, indirect_sd, emission_sd, &pray, tp * num_samples_inv, &ps, L, object);
-
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
- }
- }
- }
-
- /* emission and transmittance */
- if (volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(kg, L, state, *throughput, volume_segment.accum_emission);
- *throughput *= volume_segment.accum_transmittance;
-
- /* free cached steps */
- kernel_volume_decoupled_free(kg, &volume_segment);
- }
- else
-# endif /* __VOLUME_DECOUPLED__ */
- {
- /* GPU: no decoupled ray marching, scatter probabilistically. */
- int num_samples = kernel_data.integrator.volume_samples;
- float num_samples_inv = 1.0f / num_samples;
-
- /* todo: we should cache the shader evaluations from stepping
- * through the volume, for now we redo them multiple times */
-
- for (int j = 0; j < num_samples; j++) {
- PathState ps = *state;
- Ray pray = *ray;
- float3 tp = (*throughput) * num_samples_inv;
-
- /* branch RNG state */
- path_state_branch(&ps, j, num_samples);
-
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &ps, sd, &volume_ray, L, &tp, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* todo: support equiangular, MIS and all light sampling.
- * alternatively get decoupled ray marching working on the GPU */
- kernel_path_volume_connect_light(kg, sd, emission_sd, tp, state, L);
-
- if (kernel_path_volume_bounce(kg, sd, &tp, &ps, &L->state, &pray)) {
- kernel_path_indirect(kg, indirect_sd, emission_sd, &pray, tp, &ps, L, object);
-
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
- }
- }
-# endif /* __VOLUME_SCATTER__ */
- }
-
- /* todo: avoid this calculation using decoupled ray marching */
- kernel_volume_shadow(kg, emission_sd, state, &volume_ray, throughput);
- }
-}
-# endif /* __VOLUME__ */
-
-/* bounce off surface and integrate indirect light */
-ccl_device_noinline_cpu void kernel_branched_path_surface_indirect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *indirect_sd,
- ShaderData *emission_sd,
- float3 throughput,
- float num_samples_adjust,
- PathState *state,
- PathRadiance *L)
-{
- float sum_sample_weight = 0.0f;
-# ifdef __DENOISING_FEATURES__
- if (state->denoising_feature_weight > 0.0f) {
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- /* transparency is not handled here, but in outer loop */
- if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
- continue;
- }
-
- sum_sample_weight += sc->sample_weight;
- }
- }
- else {
- sum_sample_weight = 1.0f;
- }
-# endif /* __DENOISING_FEATURES__ */
-
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- /* transparency is not handled here, but in outer loop */
- if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
- continue;
- }
-
- int num_samples;
-
- if (CLOSURE_IS_BSDF_DIFFUSE(sc->type))
- num_samples = kernel_data.integrator.diffuse_samples;
- else if (CLOSURE_IS_BSDF_BSSRDF(sc->type))
- num_samples = 1;
- else if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
- num_samples = kernel_data.integrator.glossy_samples;
- else
- num_samples = kernel_data.integrator.transmission_samples;
-
- num_samples = ceil_to_int(num_samples_adjust * num_samples);
-
- float num_samples_inv = num_samples_adjust / num_samples;
-
- for (int j = 0; j < num_samples; j++) {
- PathState ps = *state;
- float3 tp = throughput;
- Ray bsdf_ray;
-# ifdef __SHADOW_TRICKS__
- float shadow_transparency = L->shadow_transparency;
-# endif
-
- ps.rng_hash = cmj_hash(state->rng_hash, i);
-
- if (!kernel_branched_path_surface_bounce(
- kg, sd, sc, j, num_samples, &tp, &ps, &L->state, &bsdf_ray, sum_sample_weight)) {
- continue;
- }
-
- ps.rng_hash = state->rng_hash;
-
- kernel_path_indirect(
- kg, indirect_sd, emission_sd, &bsdf_ray, tp * num_samples_inv, &ps, L, sd->object);
-
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
-
-# ifdef __SHADOW_TRICKS__
- L->shadow_transparency = shadow_transparency;
-# endif
- }
- }
-}
-
-# ifdef __SUBSURFACE__
-ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *indirect_sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- PathState *state,
- Ray *ray,
- float3 throughput)
-{
- for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if (!CLOSURE_IS_BSSRDF(sc->type))
- continue;
-
- /* set up random number generator */
- uint lcg_state = lcg_state_init(state, 0x68bc21eb);
- int num_samples = kernel_data.integrator.subsurface_samples * 3;
- float num_samples_inv = 1.0f / num_samples;
- uint bssrdf_rng_hash = cmj_hash(state->rng_hash, i);
-
- /* do subsurface scatter step with copy of shader data, this will
- * replace the BSSRDF with a diffuse BSDF closure */
- for (int j = 0; j < num_samples; j++) {
- PathState hit_state = *state;
- path_state_branch(&hit_state, j, num_samples);
- hit_state.rng_hash = bssrdf_rng_hash;
-
- LocalIntersection ss_isect;
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg, &hit_state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
- int num_hits = subsurface_scatter_multi_intersect(
- kg, &ss_isect, sd, &hit_state, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
-
- hit_state.rng_offset += PRNG_BOUNCE_NUM;
-
-# ifdef __VOLUME__
- Ray volume_ray = *ray;
- bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
-# endif /* __VOLUME__ */
-
- /* compute lighting with the BSDF closure */
- for (int hit = 0; hit < num_hits; hit++) {
- ShaderData bssrdf_sd = *sd;
- Bssrdf *bssrdf = (Bssrdf *)sc;
- ClosureType bssrdf_type = sc->type;
- float bssrdf_roughness = bssrdf->roughness;
- subsurface_scatter_multi_setup(
- kg, &ss_isect, hit, &bssrdf_sd, &hit_state, bssrdf_type, bssrdf_roughness);
-
-# ifdef __VOLUME__
- if (need_update_volume_stack) {
- /* Setup ray from previous surface point to the new one. */
- float3 P = ray_offset(bssrdf_sd.P, -bssrdf_sd.Ng);
- volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
-
- for (int k = 0; k < VOLUME_STACK_SIZE; k++) {
- hit_state.volume_stack[k] = state->volume_stack[k];
- }
-
- kernel_volume_stack_update_for_subsurface(
- kg, emission_sd, &volume_ray, hit_state.volume_stack);
- }
-# endif /* __VOLUME__ */
-
-# ifdef __EMISSION__
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = (kernel_data.integrator.sample_all_lights_direct) ||
- (hit_state.flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(
- kg, &bssrdf_sd, emission_sd, &hit_state, throughput, num_samples_inv, L, all);
- }
-# endif /* __EMISSION__ */
-
- /* indirect light */
- kernel_branched_path_surface_indirect_light(
- kg, &bssrdf_sd, indirect_sd, emission_sd, throughput, num_samples_inv, &hit_state, L);
- }
- }
- }
-}
-# endif /* __SUBSURFACE__ */
-
-ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
- uint rng_hash,
- int sample,
- Ray ray,
- ccl_global float *buffer,
- PathRadiance *L)
-{
- /* initialize */
- float3 throughput = one_float3();
-
- path_radiance_init(kg, L);
-
- /* shader data memory used for both volumes and surfaces, saves stack space */
- ShaderData sd;
- /* shader data used by emission, shadows, volume stacks, indirect path */
- ShaderDataTinyStorage emission_sd_storage;
- ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
- ShaderData indirect_sd;
-
- PathState state;
- path_state_init(kg, emission_sd, &state, rng_hash, sample, &ray);
-
- /* Main Loop
- * Here we only handle transparency intersections from the camera ray.
- * Indirect bounces are handled in kernel_branched_path_surface_indirect_light().
- */
- for (;;) {
- /* Find intersection with objects in scene. */
- Intersection isect;
- bool hit = kernel_path_scene_intersect(kg, &state, &ray, &isect, L, sd.object);
-
-# ifdef __VOLUME__
- /* Volume integration. */
- kernel_branched_path_volume(
- kg, &sd, &state, &ray, &throughput, &isect, hit, &indirect_sd, emission_sd, L);
-# endif /* __VOLUME__ */
-
- /* Shade background. */
- if (!hit) {
- kernel_path_background(kg, &state, &ray, throughput, &sd, buffer, L);
- break;
- }
-
- /* Setup and evaluate shader. */
- shader_setup_from_ray(kg, &sd, &isect, &ray);
-
- /* Skip most work for volume bounding surface. */
-# ifdef __VOLUME__
- if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
-# endif
-
- shader_eval_surface(kg, &sd, &state, buffer, state.flag);
- shader_merge_closures(&sd);
-
- /* Apply shadow catcher, holdout, emission. */
- if (!kernel_path_shader_apply(kg, &sd, &state, &ray, throughput, emission_sd, L, buffer)) {
- break;
- }
-
- /* transparency termination */
- if (state.flag & PATH_RAY_TRANSPARENT) {
- /* path termination. this is a strange place to put the termination, it's
- * mainly due to the mixed in MIS that we use. gives too many unneeded
- * shader evaluations, only need emission if we are going to terminate */
- float probability = path_state_continuation_probability(kg, &state, throughput);
-
- if (probability == 0.0f) {
- break;
- }
- else if (probability != 1.0f) {
- float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE);
-
- if (terminate >= probability)
- break;
-
- throughput /= probability;
- }
- }
-
-# ifdef __DENOISING_FEATURES__
- kernel_update_denoising_features(kg, &sd, &state, L);
-# endif
-
-# ifdef __AO__
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- kernel_branched_path_ao(kg, &sd, emission_sd, L, &state, throughput);
- }
-# endif /* __AO__ */
-
-# ifdef __SUBSURFACE__
- /* bssrdf scatter to a different location on the same object */
- if (sd.flag & SD_BSSRDF) {
- kernel_branched_path_subsurface_scatter(
- kg, &sd, &indirect_sd, emission_sd, L, &state, &ray, throughput);
- }
-# endif /* __SUBSURFACE__ */
-
- PathState hit_state = state;
-
-# ifdef __EMISSION__
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = (kernel_data.integrator.sample_all_lights_direct) ||
- (state.flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(
- kg, &sd, emission_sd, &hit_state, throughput, 1.0f, L, all);
- }
-# endif /* __EMISSION__ */
-
- /* indirect light */
- kernel_branched_path_surface_indirect_light(
- kg, &sd, &indirect_sd, emission_sd, throughput, 1.0f, &hit_state, L);
-
- /* continue in case of transparency */
- throughput *= shader_bsdf_transparency(kg, &sd);
-
- if (is_zero(throughput))
- break;
-
- /* Update Path State */
- path_state_next(kg, &state, LABEL_TRANSPARENT);
-
-# ifdef __VOLUME__
- }
- else {
- if (!path_state_volume_next(kg, &state)) {
- break;
- }
- }
-# endif
-
- ray.P = ray_offset(sd.P, -sd.Ng);
- ray.t -= sd.ray_length; /* clipping works through transparent */
-
-# ifdef __RAY_DIFFERENTIALS__
- ray.dP = sd.dP;
- ray.dD.dx = -sd.dI.dx;
- ray.dD.dy = -sd.dI.dy;
-# endif /* __RAY_DIFFERENTIALS__ */
-
-# ifdef __VOLUME__
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, &sd, state.volume_stack);
-# endif /* __VOLUME__ */
- }
-}
-
-ccl_device void kernel_branched_path_trace(
- KernelGlobals *kg, ccl_global float *buffer, int sample, int x, int y, int offset, int stride)
-{
- /* buffer offset */
- int index = offset + x + y * stride;
- int pass_stride = kernel_data.film.pass_stride;
-
- buffer += index * pass_stride;
-
- if (kernel_data.film.pass_adaptive_aux_buffer) {
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w > 0.0f) {
- return;
- }
- }
-
- /* initialize random numbers and ray */
- uint rng_hash;
- Ray ray;
-
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &ray);
-
- /* integrate */
- PathRadiance L;
-
- if (ray.t != 0.0f) {
- kernel_branched_path_integrate(kg, rng_hash, sample, ray, buffer, &L);
- kernel_write_result(kg, buffer, sample, &L);
- }
-}
-
-# endif /* __SPLIT_KERNEL__ */
-
-#endif /* __BRANCHED_PATH__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path_common.h b/intern/cycles/kernel/kernel_path_common.h
deleted file mode 100644
index 815767595a9..00000000000
--- a/intern/cycles/kernel/kernel_path_common.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "util/util_hash.h"
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void kernel_path_trace_setup(
- KernelGlobals *kg, int sample, int x, int y, uint *rng_hash, ccl_addr_space Ray *ray)
-{
- float filter_u;
- float filter_v;
-
- int num_samples = kernel_data.integrator.aa_samples;
-
- path_rng_init(kg, sample, num_samples, rng_hash, x, y, &filter_u, &filter_v);
-
- /* sample camera ray */
-
- float lens_u = 0.0f, lens_v = 0.0f;
-
- if (kernel_data.cam.aperturesize > 0.0f)
- path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v);
-
- float time = 0.0f;
-
-#ifdef __CAMERA_MOTION__
- if (kernel_data.cam.shuttertime != -1.0f)
- time = path_rng_1D(kg, *rng_hash, sample, num_samples, PRNG_TIME);
-#endif
-
- camera_sample(kg, x, y, filter_u, filter_v, lens_u, lens_v, time, ray);
-}
-
-CCL_NAMESPACE_END
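kernel_path_trace_setup above leans on dimension-indexed random numbers: the pixel filter, lens and shutter time each draw from their own fixed dimension of the sequence, so the draws stay decorrelated across samples. A sketch of that interface, with a hash mix standing in for the Sobol/CMJ samplers Cycles actually uses (the mixing constants are an assumption):

#include <cstdint>

/* Map (pixel hash, sample, dimension) to an independent value in [0, 1). */
static float rng_1d(const uint32_t rng_hash, const uint32_t sample, const uint32_t dimension)
{
  uint32_t s = rng_hash ^ (sample * 0x9e3779b9u) ^ (dimension * 0x85ebca6bu);
  s ^= s >> 16;
  s *= 0x7feb352du;
  s ^= s >> 15;
  s *= 0x846ca68bu;
  s ^= s >> 16;
  return (s >> 8) * (1.0f / 16777216.0f); /* Top 24 bits into a float mantissa. */
}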
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index bf601580cd0..ebb2c0df4f1 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -14,99 +14,116 @@
* limitations under the License.
*/
-CCL_NAMESPACE_BEGIN
+#pragma once
-ccl_device_inline void path_state_init(KernelGlobals *kg,
- ShaderData *stack_sd,
- ccl_addr_space PathState *state,
- uint rng_hash,
- int sample,
- ccl_addr_space Ray *ray)
-{
- state->flag = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP | PATH_RAY_TRANSPARENT_BACKGROUND;
+#include "kernel_random.h"
- state->rng_hash = rng_hash;
- state->rng_offset = PRNG_BASE_NUM;
- state->sample = sample;
- state->num_samples = kernel_data.integrator.aa_samples;
- state->branch_factor = 1.0f;
+CCL_NAMESPACE_BEGIN
- state->bounce = 0;
- state->diffuse_bounce = 0;
- state->glossy_bounce = 0;
- state->transmission_bounce = 0;
- state->transparent_bounce = 0;
+/* Initialize queues, so that this path is considered terminated.
+ * Used for early outputs in the camera ray initialization, as well as for initialization of
+ * split states for the shadow catcher. */
+ccl_device_inline void path_state_init_queues(INTEGRATOR_STATE_ARGS)
+{
+ INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
+ INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
+}
-#ifdef __DENOISING_FEATURES__
- if (kernel_data.film.pass_denoising_data) {
- state->flag |= PATH_RAY_STORE_SHADOW_INFO;
- state->denoising_feature_weight = 1.0f;
- state->denoising_feature_throughput = one_float3();
- }
- else {
- state->denoising_feature_weight = 0.0f;
- state->denoising_feature_throughput = zero_float3();
- }
-#endif /* __DENOISING_FEATURES__ */
+/* Minimal initialization of the path state: just enough for the early outputs in the
+ * integrator initialization to work. */
+ccl_device_inline void path_state_init(INTEGRATOR_STATE_ARGS,
+ const ccl_global KernelWorkTile *ccl_restrict tile,
+ const int x,
+ const int y)
+{
+ const uint render_pixel_index = (uint)tile->offset + x + y * tile->stride;
- state->min_ray_pdf = FLT_MAX;
- state->ray_pdf = 0.0f;
-#ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-#endif
+ INTEGRATOR_STATE_WRITE(path, render_pixel_index) = render_pixel_index;
-#ifdef __VOLUME__
- state->volume_bounce = 0;
- state->volume_bounds_bounce = 0;
+ path_state_init_queues(INTEGRATOR_STATE_PASS);
+}
- if (kernel_data.integrator.use_volumes) {
- /* Initialize volume stack with volume we are inside of. */
- kernel_volume_stack_init(kg, stack_sd, state, ray, state->volume_stack);
+/* Initialize the rest of the path state needed to continue the path integration. */
+ccl_device_inline void path_state_init_integrator(INTEGRATOR_STATE_ARGS,
+ const int sample,
+ const uint rng_hash)
+{
+ INTEGRATOR_STATE_WRITE(path, sample) = sample;
+ INTEGRATOR_STATE_WRITE(path, bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, diffuse_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, glossy_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, transmission_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, transparent_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, volume_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, volume_bounds_bounce) = 0;
+ INTEGRATOR_STATE_WRITE(path, rng_hash) = rng_hash;
+ INTEGRATOR_STATE_WRITE(path, rng_offset) = PRNG_BASE_NUM;
+ INTEGRATOR_STATE_WRITE(path, flag) = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP |
+ PATH_RAY_TRANSPARENT_BACKGROUND;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, mis_ray_t) = 0.0f;
+ INTEGRATOR_STATE_WRITE(path, min_ray_pdf) = FLT_MAX;
+ INTEGRATOR_STATE_WRITE(path, throughput) = make_float3(1.0f, 1.0f, 1.0f);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, object) = OBJECT_NONE;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 0, shader) = kernel_data.background.volume_shader;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, object) = OBJECT_NONE;
+ INTEGRATOR_STATE_ARRAY_WRITE(volume_stack, 1, shader) = SHADER_NONE;
}
- else {
- state->volume_stack[0].shader = SHADER_NONE;
+
+#ifdef __DENOISING_FEATURES__
+ if (kernel_data.kernel_features & KERNEL_FEATURE_DENOISING) {
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_DENOISING_FEATURES;
+ INTEGRATOR_STATE_WRITE(path, denoising_feature_throughput) = one_float3();
}
#endif
}
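
Note on the volume-stack convention established above: entry 0 holds the background volume and the stack is terminated by a SHADER_NONE sentinel. A minimal standalone C++ sketch of that layout follows; VolumeStackEntry and the fixed capacity are illustrative stand-ins, not the kernel's actual types.

#include <cstdio>

enum { OBJECT_NONE = -1, SHADER_NONE = -1 };

struct VolumeStackEntry {
  int object;
  int shader;
};

/* Count active entries by scanning until the SHADER_NONE sentinel. */
static int volume_stack_size(const VolumeStackEntry *stack, int capacity)
{
  int size = 0;
  while (size < capacity && stack[size].shader != SHADER_NONE) {
    size++;
  }
  return size;
}

int main()
{
  /* Mirrors the initialization above: background volume at slot 0,
   * sentinel at slot 1. The shader index 7 is illustrative. */
  VolumeStackEntry stack[2] = {{OBJECT_NONE, /*background shader*/ 7},
                               {OBJECT_NONE, SHADER_NONE}};
  printf("active volume entries: %d\n", volume_stack_size(stack, 2));
  return 0;
}
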
-ccl_device_inline void path_state_next(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- int label)
+ccl_device_inline void path_state_next(INTEGRATOR_STATE_ARGS, int label)
{
+ uint32_t flag = INTEGRATOR_STATE(path, flag);
+
/* ray through transparent keeps same flags from previous ray and is
* not counted as a regular bounce, transparent has separate max */
if (label & LABEL_TRANSPARENT) {
- state->flag |= PATH_RAY_TRANSPARENT;
- state->transparent_bounce++;
- if (state->transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
- state->flag |= PATH_RAY_TERMINATE_IMMEDIATE;
+ uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce) + 1;
+
+ flag |= PATH_RAY_TRANSPARENT;
+ if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
+ flag |= PATH_RAY_TERMINATE_ON_NEXT_SURFACE;
}
if (!kernel_data.integrator.transparent_shadows)
- state->flag |= PATH_RAY_MIS_SKIP;
-
- /* random number generator next bounce */
- state->rng_offset += PRNG_BOUNCE_NUM;
+ flag |= PATH_RAY_MIS_SKIP;
+ INTEGRATOR_STATE_WRITE(path, flag) = flag;
+ INTEGRATOR_STATE_WRITE(path, transparent_bounce) = transparent_bounce;
+ /* Random number generator next bounce. */
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
return;
}
- state->bounce++;
- if (state->bounce >= kernel_data.integrator.max_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ uint32_t bounce = INTEGRATOR_STATE(path, bounce) + 1;
+ if (bounce >= kernel_data.integrator.max_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
- state->flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP);
+ flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP);
#ifdef __VOLUME__
if (label & LABEL_VOLUME_SCATTER) {
/* volume scatter */
- state->flag |= PATH_RAY_VOLUME_SCATTER;
- state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ flag |= PATH_RAY_VOLUME_SCATTER;
+ flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ if (bounce == 1) {
+ flag |= PATH_RAY_VOLUME_PASS;
+ }
- state->volume_bounce++;
- if (state->volume_bounce >= kernel_data.integrator.max_volume_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int volume_bounce = INTEGRATOR_STATE(path, volume_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, volume_bounce) = volume_bounce;
+ if (volume_bounce >= kernel_data.integrator.max_volume_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
else
@@ -114,163 +131,237 @@ ccl_device_inline void path_state_next(KernelGlobals *kg,
{
/* surface reflection/transmission */
if (label & LABEL_REFLECT) {
- state->flag |= PATH_RAY_REFLECT;
- state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ flag |= PATH_RAY_REFLECT;
+ flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
if (label & LABEL_DIFFUSE) {
- state->diffuse_bounce++;
- if (state->diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int diffuse_bounce = INTEGRATOR_STATE(path, diffuse_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, diffuse_bounce) = diffuse_bounce;
+ if (diffuse_bounce >= kernel_data.integrator.max_diffuse_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
else {
- state->glossy_bounce++;
- if (state->glossy_bounce >= kernel_data.integrator.max_glossy_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int glossy_bounce = INTEGRATOR_STATE(path, glossy_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, glossy_bounce) = glossy_bounce;
+ if (glossy_bounce >= kernel_data.integrator.max_glossy_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
}
else {
kernel_assert(label & LABEL_TRANSMIT);
- state->flag |= PATH_RAY_TRANSMIT;
+ flag |= PATH_RAY_TRANSMIT;
if (!(label & LABEL_TRANSMIT_TRANSPARENT)) {
- state->flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
+ flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
}
- state->transmission_bounce++;
- if (state->transmission_bounce >= kernel_data.integrator.max_transmission_bounce) {
- state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+ const int transmission_bounce = INTEGRATOR_STATE(path, transmission_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, transmission_bounce) = transmission_bounce;
+ if (transmission_bounce >= kernel_data.integrator.max_transmission_bounce) {
+ flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
}
/* diffuse/glossy/singular */
if (label & LABEL_DIFFUSE) {
- state->flag |= PATH_RAY_DIFFUSE | PATH_RAY_DIFFUSE_ANCESTOR;
+ flag |= PATH_RAY_DIFFUSE | PATH_RAY_DIFFUSE_ANCESTOR;
}
else if (label & LABEL_GLOSSY) {
- state->flag |= PATH_RAY_GLOSSY;
+ flag |= PATH_RAY_GLOSSY;
}
else {
kernel_assert(label & LABEL_SINGULAR);
- state->flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP;
+ flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP;
+ }
+
+ /* Render pass categories. */
+ if (bounce == 1) {
+ flag |= (label & LABEL_TRANSMIT) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
}
}
- /* random number generator next bounce */
- state->rng_offset += PRNG_BOUNCE_NUM;
+ INTEGRATOR_STATE_WRITE(path, flag) = flag;
+ INTEGRATOR_STATE_WRITE(path, bounce) = bounce;
-#ifdef __DENOISING_FEATURES__
- if ((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
- state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
- }
-#endif
+ /* Random number generator next bounce. */
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
}
#ifdef __VOLUME__
-ccl_device_inline bool path_state_volume_next(KernelGlobals *kg, ccl_addr_space PathState *state)
+ccl_device_inline bool path_state_volume_next(INTEGRATOR_STATE_ARGS)
{
/* For volume bounding meshes we pass through without counting transparent
* bounces, only sanity check in case self intersection gets us stuck. */
- state->volume_bounds_bounce++;
- if (state->volume_bounds_bounce > VOLUME_BOUNDS_MAX) {
+ uint32_t volume_bounds_bounce = INTEGRATOR_STATE(path, volume_bounds_bounce) + 1;
+ INTEGRATOR_STATE_WRITE(path, volume_bounds_bounce) = volume_bounds_bounce;
+ if (volume_bounds_bounce > VOLUME_BOUNDS_MAX) {
return false;
}
/* Random number generator next bounce. */
- if (state->volume_bounds_bounce > 1) {
- state->rng_offset += PRNG_BOUNCE_NUM;
+ if (volume_bounds_bounce > 1) {
+ INTEGRATOR_STATE_WRITE(path, rng_offset) += PRNG_BOUNCE_NUM;
}
return true;
}
#endif
-ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg,
- ccl_addr_space PathState *state)
+ccl_device_inline uint path_state_ray_visibility(INTEGRATOR_STATE_CONST_ARGS)
{
- uint flag = state->flag & PATH_RAY_ALL_VISIBILITY;
+ const uint32_t path_flag = INTEGRATOR_STATE(path, flag);
- /* for visibility, diffuse/glossy are for reflection only */
- if (flag & PATH_RAY_TRANSMIT)
- flag &= ~(PATH_RAY_DIFFUSE | PATH_RAY_GLOSSY);
- /* todo: this is not supported as its own ray visibility yet */
- if (state->flag & PATH_RAY_VOLUME_SCATTER)
- flag |= PATH_RAY_DIFFUSE;
+ uint32_t visibility = path_flag & PATH_RAY_ALL_VISIBILITY;
- return flag;
+ /* For visibility, diffuse/glossy are for reflection only. */
+ if (visibility & PATH_RAY_TRANSMIT) {
+ visibility &= ~(PATH_RAY_DIFFUSE | PATH_RAY_GLOSSY);
+ }
+
+ /* TODO: this is not supported as its own ray visibility yet. */
+ if (path_flag & PATH_RAY_VOLUME_SCATTER) {
+ visibility |= PATH_RAY_DIFFUSE;
+ }
+
+ visibility = SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility);
+
+ return visibility;
}
-ccl_device_inline float path_state_continuation_probability(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- const float3 throughput)
+ccl_device_inline float path_state_continuation_probability(INTEGRATOR_STATE_CONST_ARGS,
+ const uint32_t path_flag)
{
- if (state->flag & PATH_RAY_TERMINATE_IMMEDIATE) {
- /* Ray is to be terminated immediately. */
- return 0.0f;
- }
- else if (state->flag & PATH_RAY_TRANSPARENT) {
+ if (path_flag & PATH_RAY_TRANSPARENT) {
+ const uint32_t transparent_bounce = INTEGRATOR_STATE(path, transparent_bounce);
/* Do at least specified number of bounces without RR. */
- if (state->transparent_bounce <= kernel_data.integrator.transparent_min_bounce) {
- return 1.0f;
- }
-#ifdef __SHADOW_TRICKS__
- /* Exception for shadow catcher not working correctly with RR. */
- else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->transparent_bounce <= 8)) {
+ if (transparent_bounce <= kernel_data.integrator.transparent_min_bounce) {
return 1.0f;
}
-#endif
}
else {
+ const uint32_t bounce = INTEGRATOR_STATE(path, bounce);
/* Do at least specified number of bounces without RR. */
- if (state->bounce <= kernel_data.integrator.min_bounce) {
+ if (bounce <= kernel_data.integrator.min_bounce) {
return 1.0f;
}
-#ifdef __SHADOW_TRICKS__
- /* Exception for shadow catcher not working correctly with RR. */
- else if ((state->flag & PATH_RAY_SHADOW_CATCHER) && (state->bounce <= 3)) {
- return 1.0f;
- }
-#endif
}
/* Probabilistic termination: use sqrt() to roughly match typical view
* transform and do path termination a bit later on average. */
- return min(sqrtf(max3(fabs(throughput)) * state->branch_factor), 1.0f);
+ return min(sqrtf(max3(fabs(INTEGRATOR_STATE(path, throughput)))), 1.0f);
}
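
The continuation probability above is the whole Russian-roulette rule: the square root of the largest absolute throughput component, clamped to 1. A self-contained sketch of how a path tracer would apply it, using plain C++ in place of the kernel's float3 helpers:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>

struct float3 { float x, y, z; };

/* Continuation probability: sqrt of the largest absolute throughput
 * component, clamped to 1. The sqrt roughly matches a typical view
 * transform so paths terminate a bit later on average. */
static float continuation_probability(const float3 &throughput)
{
  const float m = std::max({std::fabs(throughput.x),
                            std::fabs(throughput.y),
                            std::fabs(throughput.z)});
  return std::min(std::sqrt(m), 1.0f);
}

int main()
{
  std::mt19937 rng(42);
  std::uniform_real_distribution<float> u01(0.0f, 1.0f);

  float3 throughput = {0.25f, 0.16f, 0.09f};
  const float p = continuation_probability(throughput);
  if (u01(rng) >= p) {
    printf("path terminated (p = %.3f)\n", p);
  }
  else {
    /* Surviving paths are reweighted to keep the estimator unbiased. */
    throughput.x /= p;
    throughput.y /= p;
    throughput.z /= p;
    printf("path continues, throughput.x = %.3f\n", throughput.x);
  }
  return 0;
}
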
-/* TODO(DingTo): Find more meaningful name for this */
-ccl_device_inline void path_state_modify_bounce(ccl_addr_space PathState *state, bool increase)
+ccl_device_inline bool path_state_ao_bounce(INTEGRATOR_STATE_CONST_ARGS)
{
- /* Modify bounce temporarily for shader eval */
- if (increase)
- state->bounce += 1;
- else
- state->bounce -= 1;
-}
-
-ccl_device_inline bool path_state_ao_bounce(KernelGlobals *kg, ccl_addr_space PathState *state)
-{
- if (state->bounce <= kernel_data.integrator.ao_bounces) {
+ if (!kernel_data.integrator.ao_bounces) {
return false;
}
- int bounce = state->bounce - state->transmission_bounce - (state->glossy_bounce > 0);
+ const int bounce = INTEGRATOR_STATE(path, bounce) - INTEGRATOR_STATE(path, transmission_bounce) -
+ (INTEGRATOR_STATE(path, glossy_bounce) > 0) + 1;
return (bounce > kernel_data.integrator.ao_bounces);
}
-ccl_device_inline void path_state_branch(ccl_addr_space PathState *state,
- int branch,
- int num_branches)
+/* Random Number Sampling Utility Functions
+ *
+ * For each random number in each step of the path we must have a unique
+ * dimension to avoid using the same sequence twice.
+ *
+ * For branches in the path we must be careful not to reuse the same number
+ * in a sequence and offset accordingly.
+ */
+
+/* RNG State loaded onto stack. */
+typedef struct RNGState {
+ uint rng_hash;
+ uint rng_offset;
+ int sample;
+} RNGState;
+
+ccl_device_inline void path_state_rng_load(INTEGRATOR_STATE_CONST_ARGS, RNGState *rng_state)
+{
+ rng_state->rng_hash = INTEGRATOR_STATE(path, rng_hash);
+ rng_state->rng_offset = INTEGRATOR_STATE(path, rng_offset);
+ rng_state->sample = INTEGRATOR_STATE(path, sample);
+}
+
+ccl_device_inline void shadow_path_state_rng_load(INTEGRATOR_STATE_CONST_ARGS, RNGState *rng_state)
+{
+ const uint shadow_bounces = INTEGRATOR_STATE(shadow_path, transparent_bounce) -
+ INTEGRATOR_STATE(path, transparent_bounce);
+
+ rng_state->rng_hash = INTEGRATOR_STATE(path, rng_hash);
+ rng_state->rng_offset = INTEGRATOR_STATE(path, rng_offset) + PRNG_BOUNCE_NUM * shadow_bounces;
+ rng_state->sample = INTEGRATOR_STATE(path, sample);
+}
+
+ccl_device_inline float path_state_rng_1D(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ int dimension)
+{
+ return path_rng_1D(
+ kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension);
+}
+
+ccl_device_inline void path_state_rng_2D(
+ const KernelGlobals *kg, const RNGState *rng_state, int dimension, float *fx, float *fy)
+{
+ path_rng_2D(
+ kg, rng_state->rng_hash, rng_state->sample, rng_state->rng_offset + dimension, fx, fy);
+}
+
+ccl_device_inline float path_state_rng_1D_hash(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ uint hash)
+{
+ /* Use a hash instead of a dimension. This is not great, but it avoids adding
+ * more dimensions to each bounce, which would reduce the quality of the
+ * dimensions we are already using. */
+ return path_rng_1D(
+ kg, cmj_hash_simple(rng_state->rng_hash, hash), rng_state->sample, rng_state->rng_offset);
+}
+
+ccl_device_inline float path_branched_rng_1D(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ int branch,
+ int num_branches,
+ int dimension)
+{
+ return path_rng_1D(kg,
+ rng_state->rng_hash,
+ rng_state->sample * num_branches + branch,
+ rng_state->rng_offset + dimension);
+}
+
+ccl_device_inline void path_branched_rng_2D(const KernelGlobals *kg,
+ const RNGState *rng_state,
+ int branch,
+ int num_branches,
+ int dimension,
+ float *fx,
+ float *fy)
+{
+ path_rng_2D(kg,
+ rng_state->rng_hash,
+ rng_state->sample * num_branches + branch,
+ rng_state->rng_offset + dimension,
+ fx,
+ fy);
+}
+
+/* Utility function to get the light termination value,
+ * since it might not be needed in many cases.
+ */
+ccl_device_inline float path_state_rng_light_termination(const KernelGlobals *kg,
+ const RNGState *state)
{
- if (num_branches > 1) {
- /* Path is splitting into a branch, adjust so that each branch
- * still gets a unique sample from the same sequence. */
- state->sample = state->sample * num_branches + branch;
- state->num_samples = state->num_samples * num_branches;
- state->branch_factor *= num_branches;
+ if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
+ return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
}
+ return 0.0f;
}
CCL_NAMESPACE_END
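
To see how rng_offset plus a per-use dimension keeps every random draw on its own sequence dimension, and how branched sampling remaps the sample index, here is a standalone sketch. The hash inside fake_rng_1D is only a stand-in for the PMJ/Sobol samplers the kernel actually calls:

#include <cstdint>
#include <cstdio>

/* Stand-in for the kernel's sampler: any decorrelating hash works for
 * illustration. The real code would call pmj_sample_1D or the Sobol path. */
static float fake_rng_1D(uint32_t rng_hash, int sample, int dimension)
{
  uint32_t n = rng_hash ^ (uint32_t)(sample * 9781) ^ (uint32_t)(dimension * 6271);
  n = (n << 13U) ^ n;
  n = n * (n * n * 15731U + 789221U) + 1376312589U;
  return (float)(n & 0x00ffffff) / (float)0x01000000;
}

struct RNGState {
  uint32_t rng_hash;
  uint32_t rng_offset;
  int sample;
};

static float state_rng_1D(const RNGState &s, int dimension)
{
  /* Each use of a random number gets its own dimension on top of the
   * per-bounce offset, so no sequence value is reused along the path. */
  return fake_rng_1D(s.rng_hash, s.sample, (int)s.rng_offset + dimension);
}

static float branched_rng_1D(const RNGState &s, int branch, int num_branches, int dimension)
{
  /* A split path keeps the dimension but remaps the sample index so each
   * branch draws a distinct member of the same sequence. */
  return fake_rng_1D(s.rng_hash, s.sample * num_branches + branch, (int)s.rng_offset + dimension);
}

int main()
{
  RNGState s = {0xdeadbeefu, /*rng_offset*/ 0, /*sample*/ 3};
  printf("dim 0: %f  dim 1: %f\n", state_rng_1D(s, 0), state_rng_1D(s, 1));
  printf("branch 0/4: %f  branch 1/4: %f\n",
         branched_rng_1D(s, 0, 4, 0),
         branched_rng_1D(s, 1, 4, 0));
  return 0;
}
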
diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h
deleted file mode 100644
index 97d3f292ca3..00000000000
--- a/intern/cycles/kernel/kernel_path_subsurface.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright 2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __SUBSURFACE__
-# ifndef __KERNEL_CUDA__
-ccl_device
-# else
-ccl_device_inline
-# endif
- bool
- kernel_path_subsurface_scatter(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- PathRadiance *L,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- ccl_addr_space float3 *throughput,
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
-{
- PROFILING_INIT(kg, PROFILING_SUBSURFACE);
-
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-
- const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
-
- /* do bssrdf scatter step if we picked a bssrdf closure */
- if (sc) {
- /* We should never have two consecutive BSSRDF bounces,
- * the second one should be converted to a diffuse BSDF to
- * avoid this.
- */
- kernel_assert(!(state->flag & PATH_RAY_DIFFUSE_ANCESTOR));
-
- uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
-
- LocalIntersection ss_isect;
- int num_hits = subsurface_scatter_multi_intersect(
- kg, &ss_isect, sd, state, sc, &lcg_state, bssrdf_u, bssrdf_v, false);
-# ifdef __VOLUME__
- bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
-# endif /* __VOLUME__ */
-
- /* Closure memory will be overwritten, so read required variables now. */
- Bssrdf *bssrdf = (Bssrdf *)sc;
- ClosureType bssrdf_type = sc->type;
- float bssrdf_roughness = bssrdf->roughness;
-
- /* compute lighting with the BSDF closure */
- for (int hit = 0; hit < num_hits; hit++) {
- /* NOTE: We reuse the existing ShaderData, we assume the path
- * integration loop stops when this function returns true.
- */
- subsurface_scatter_multi_setup(kg, &ss_isect, hit, sd, state, bssrdf_type, bssrdf_roughness);
-
- kernel_path_surface_connect_light(kg, sd, emission_sd, *throughput, state, L);
-
- ccl_addr_space PathState *hit_state = &ss_indirect->state[ss_indirect->num_rays];
- ccl_addr_space Ray *hit_ray = &ss_indirect->rays[ss_indirect->num_rays];
- ccl_addr_space float3 *hit_tp = &ss_indirect->throughputs[ss_indirect->num_rays];
- PathRadianceState *hit_L_state = &ss_indirect->L_state[ss_indirect->num_rays];
-
- *hit_state = *state;
- *hit_ray = *ray;
- *hit_tp = *throughput;
- *hit_L_state = L->state;
-
- hit_state->rng_offset += PRNG_BOUNCE_NUM;
-
- if (kernel_path_surface_bounce(kg, sd, hit_tp, hit_state, hit_L_state, hit_ray)) {
-# ifdef __LAMP_MIS__
- hit_state->ray_t = 0.0f;
-# endif /* __LAMP_MIS__ */
-
-# ifdef __VOLUME__
- if (need_update_volume_stack) {
- Ray volume_ray = *ray;
- /* Setup ray from previous surface point to the new one. */
- volume_ray.D = normalize_len(hit_ray->P - volume_ray.P, &volume_ray.t);
-
- kernel_volume_stack_update_for_subsurface(
- kg, emission_sd, &volume_ray, hit_state->volume_stack);
- }
-# endif /* __VOLUME__ */
- ss_indirect->num_rays++;
- }
- }
- return true;
- }
- return false;
-}
-
-ccl_device_inline void kernel_path_subsurface_init_indirect(
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
-{
- ss_indirect->num_rays = 0;
-}
-
-ccl_device void kernel_path_subsurface_setup_indirect(
- KernelGlobals *kg,
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect,
- ccl_addr_space PathState *state,
- ccl_addr_space Ray *ray,
- PathRadiance *L,
- ccl_addr_space float3 *throughput)
-{
- /* Setup state, ray and throughput for indirect SSS rays. */
- ss_indirect->num_rays--;
-
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
-
- *state = ss_indirect->state[ss_indirect->num_rays];
- *ray = ss_indirect->rays[ss_indirect->num_rays];
- L->state = ss_indirect->L_state[ss_indirect->num_rays];
- *throughput = ss_indirect->throughputs[ss_indirect->num_rays];
-
- state->rng_offset += ss_indirect->num_rays * PRNG_BOUNCE_NUM;
-}
-
-#endif /* __SUBSURFACE__ */
-
-CCL_NAMESPACE_END
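
The bookkeeping in the file removed above amounts to a small fixed-capacity LIFO of saved path states: one push per accepted BSSRDF hit, one pop when an indirect ray is resumed. A sketch of that pattern with illustrative struct names (SavedState is not a kernel type):

#include <cassert>
#include <cstdio>

/* Illustrative stand-in for the saved per-ray data. */
struct SavedState {
  int bounce;
  float throughput;
};

constexpr int MAX_INDIRECT_RAYS = 4;

struct IndirectRays {
  int num_rays = 0;
  SavedState states[MAX_INDIRECT_RAYS];

  /* Like kernel_path_subsurface_scatter: push one saved state per
   * accepted BSSRDF hit. */
  void push(const SavedState &s)
  {
    assert(num_rays < MAX_INDIRECT_RAYS);
    states[num_rays++] = s;
  }

  /* Like kernel_path_subsurface_setup_indirect: pop the most recently
   * saved state and resume the path from it. */
  SavedState pop()
  {
    assert(num_rays > 0);
    return states[--num_rays];
  }
};

int main()
{
  IndirectRays ss;
  ss.push({2, 0.8f});
  ss.push({2, 0.5f});
  while (ss.num_rays > 0) {
    SavedState s = ss.pop();
    printf("resume at bounce %d, throughput %.2f\n", s.bounce, s.throughput);
  }
  return 0;
}
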
diff --git a/intern/cycles/kernel/kernel_path_surface.h b/intern/cycles/kernel/kernel_path_surface.h
deleted file mode 100644
index ba48c0bdfc4..00000000000
--- a/intern/cycles/kernel/kernel_path_surface.h
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#if defined(__BRANCHED_PATH__) || defined(__SUBSURFACE__) || defined(__SHADOW_TRICKS__) || \
- defined(__BAKING__)
-/* branched path tracing: connect path directly to position on one or more lights and add it to L
- */
-ccl_device_noinline_cpu void kernel_branched_path_surface_connect_light(
- KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- ccl_addr_space PathState *state,
- float3 throughput,
- float num_samples_adjust,
- PathRadiance *L,
- int sample_all_lights)
-{
-# ifdef __EMISSION__
- /* sample illumination from lights to find path contribution */
- BsdfEval L_light ccl_optional_struct_init;
-
- int num_lights = 0;
- if (kernel_data.integrator.use_direct_light) {
- if (sample_all_lights) {
- num_lights = kernel_data.integrator.num_all_lights;
- if (kernel_data.integrator.pdf_triangles != 0.0f) {
- num_lights += 1;
- }
- }
- else {
- num_lights = 1;
- }
- }
-
- for (int i = 0; i < num_lights; i++) {
- /* sample one light at random */
- int num_samples = 1;
- int num_all_lights = 1;
- uint lamp_rng_hash = state->rng_hash;
- bool double_pdf = false;
- bool is_mesh_light = false;
- bool is_lamp = false;
-
- if (sample_all_lights) {
- /* lamp sampling */
- is_lamp = i < kernel_data.integrator.num_all_lights;
- if (is_lamp) {
- if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce))) {
- continue;
- }
- num_samples = ceil_to_int(num_samples_adjust * light_select_num_samples(kg, i));
- num_all_lights = kernel_data.integrator.num_all_lights;
- lamp_rng_hash = cmj_hash(state->rng_hash, i);
- double_pdf = kernel_data.integrator.pdf_triangles != 0.0f;
- }
- /* mesh light sampling */
- else {
- num_samples = ceil_to_int(num_samples_adjust * kernel_data.integrator.mesh_light_samples);
- double_pdf = kernel_data.integrator.num_all_lights != 0;
- is_mesh_light = true;
- }
- }
-
- float num_samples_inv = num_samples_adjust / (num_samples * num_all_lights);
-
- for (int j = 0; j < num_samples; j++) {
- Ray light_ray ccl_optional_struct_init;
- light_ray.t = 0.0f; /* reset ray */
-# ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-# endif
- bool has_emission = false;
-
- if (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)) {
- float light_u, light_v;
- path_branched_rng_2D(
- kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
- float terminate = path_branched_rng_light_termination(
- kg, lamp_rng_hash, state, j, num_samples);
-
- /* only sample triangle lights */
- if (is_mesh_light && double_pdf) {
- light_u = 0.5f * light_u;
- }
-
- LightSample ls ccl_optional_struct_init;
- const int lamp = is_lamp ? i : -1;
- if (light_sample(kg, lamp, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- /* The sampling probability returned by lamp_light_sample assumes that all lights were
- * sampled. However, this code only samples lamps, so if the scene also had mesh lights,
- * the real probability is twice as high. */
- if (double_pdf) {
- ls.pdf *= 2.0f;
- }
-
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission) {
- if (!blocked) {
- /* accumulate */
- path_radiance_accum_light(kg,
- L,
- state,
- throughput * num_samples_inv,
- &L_light,
- shadow,
- num_samples_inv,
- is_lamp);
- }
- else {
- path_radiance_accum_total_light(L, state, throughput * num_samples_inv, &L_light);
- }
- }
- }
- }
-# endif
-}
-
-/* branched path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device bool kernel_branched_path_surface_bounce(KernelGlobals *kg,
- ShaderData *sd,
- const ShaderClosure *sc,
- int sample,
- int num_samples,
- ccl_addr_space float3 *throughput,
- ccl_addr_space PathState *state,
- PathRadianceState *L_state,
- ccl_addr_space Ray *ray,
- float sum_sample_weight)
-{
- /* sample BSDF */
- float bsdf_pdf;
- BsdfEval bsdf_eval ccl_optional_struct_init;
- float3 bsdf_omega_in ccl_optional_struct_init;
- differential3 bsdf_domega_in ccl_optional_struct_init;
- float bsdf_u, bsdf_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, sample, num_samples, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
- int label;
-
- label = shader_bsdf_sample_closure(
- kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
- if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
-
-# ifdef __DENOISING_FEATURES__
- state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
-# endif
-
- /* modify path state */
- path_state_next(kg, state, label);
-
- /* setup ray */
- ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng);
- ray->D = normalize(bsdf_omega_in);
- ray->t = FLT_MAX;
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = bsdf_domega_in;
-# endif
-# ifdef __OBJECT_MOTION__
- ray->time = sd->time;
-# endif
-
-# ifdef __VOLUME__
- /* enter/exit volume */
- if (label & LABEL_TRANSMIT)
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-# endif
-
- /* branch RNG state */
- path_state_branch(state, sample, num_samples);
-
- /* set MIS state */
- state->min_ray_pdf = fminf(bsdf_pdf, FLT_MAX);
- state->ray_pdf = bsdf_pdf;
-# ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-# endif
-
- return true;
-}
-
-#endif
-
-/* path tracing: connect path directly to position on a light and add it to L */
-ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- float3 throughput,
- ccl_addr_space PathState *state,
- PathRadiance *L)
-{
- PROFILING_INIT(kg, PROFILING_CONNECT_LIGHT);
-
-#ifdef __EMISSION__
-# ifdef __SHADOW_TRICKS__
- int all = (state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(kg, sd, emission_sd, state, throughput, 1.0f, L, all);
-# else
- /* sample illumination from lights to find path contribution */
- Ray light_ray ccl_optional_struct_init;
- BsdfEval L_light ccl_optional_struct_init;
- bool is_lamp = false;
- bool has_emission = false;
-
- light_ray.t = 0.0f;
-# ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-# endif
-
- if (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL)) {
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
-
- LightSample ls ccl_optional_struct_init;
- if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- float terminate = path_state_rng_light_termination(kg, state);
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission) {
- if (!blocked) {
- /* accumulate */
- path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
- }
- else {
- path_radiance_accum_total_light(L, state, throughput, &L_light);
- }
- }
-# endif
-#endif
-}
-
-/* path tracing: bounce off or through surface to with new direction stored in ray */
-ccl_device bool kernel_path_surface_bounce(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space float3 *throughput,
- ccl_addr_space PathState *state,
- PathRadianceState *L_state,
- ccl_addr_space Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SURFACE_BOUNCE);
-
- /* no BSDF? we can stop here */
- if (sd->flag & SD_BSDF) {
- /* sample BSDF */
- float bsdf_pdf;
- BsdfEval bsdf_eval ccl_optional_struct_init;
- float3 bsdf_omega_in ccl_optional_struct_init;
- differential3 bsdf_domega_in ccl_optional_struct_init;
- float bsdf_u, bsdf_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
- int label;
-
- label = shader_bsdf_sample(
- kg, sd, bsdf_u, bsdf_v, &bsdf_eval, &bsdf_omega_in, &bsdf_domega_in, &bsdf_pdf);
-
- if (bsdf_pdf == 0.0f || bsdf_eval_is_zero(&bsdf_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(kg, L_state, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
-
- /* set labels */
- if (!(label & LABEL_TRANSPARENT)) {
- state->ray_pdf = bsdf_pdf;
-#ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-#endif
- state->min_ray_pdf = fminf(bsdf_pdf, state->min_ray_pdf);
- }
-
- /* update path state */
- path_state_next(kg, state, label);
-
- /* setup ray */
- ray->P = ray_offset(sd->P, (label & LABEL_TRANSMIT) ? -sd->Ng : sd->Ng);
- ray->D = normalize(bsdf_omega_in);
-
- if (state->bounce == 0)
- ray->t -= sd->ray_length; /* clipping works through transparent */
- else
- ray->t = FLT_MAX;
-
-#ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = bsdf_domega_in;
-#endif
-
-#ifdef __VOLUME__
- /* enter/exit volume */
- if (label & LABEL_TRANSMIT)
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-#endif
- return true;
- }
-#ifdef __VOLUME__
- else if (sd->flag & SD_HAS_ONLY_VOLUME) {
- if (!path_state_volume_next(kg, state)) {
- return false;
- }
-
- if (state->bounce == 0)
- ray->t -= sd->ray_length; /* clipping works through transparent */
- else
- ray->t = FLT_MAX;
-
- /* setup ray position, direction stays unchanged */
- ray->P = ray_offset(sd->P, -sd->Ng);
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
-# endif
-
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
- return true;
- }
-#endif
- else {
- /* no bsdf or volume? */
- return false;
- }
-}
-
-CCL_NAMESPACE_END
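
The weighting in the removed kernel_branched_path_surface_connect_light reduces to two rules: each of N samples taken for one of L lights carries weight 1/(N*L), and when lamps and mesh lights coexist the reported pdf is doubled, because each strategy only covers half the lights. A toy arithmetic sketch under assumed scene numbers:

#include <cstdio>

int main()
{
  /* Assumed toy scene: 2 lamps plus mesh lights, 4 samples per lamp.
   * All numbers are illustrative only. */
  const int num_all_lights = 2;
  const int num_samples = 4;
  const bool has_mesh_lights = true;

  /* Each sample is averaged over both the per-light sample count and the
   * number of lights looped over, mirroring num_samples_inv above. */
  const float num_samples_inv = 1.0f / (num_samples * num_all_lights);

  float ls_pdf = 0.5f; /* pdf reported as if all lights were sampled */
  if (has_mesh_lights) {
    /* Only lamps were sampled in this iteration, so the true probability
     * of this sample is twice the reported one. */
    ls_pdf *= 2.0f;
  }

  const float emission = 3.0f; /* illustrative light contribution */
  const float contribution = emission / ls_pdf * num_samples_inv;
  printf("weighted contribution per sample: %f\n", contribution);
  return 0;
}
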
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
deleted file mode 100644
index a787910e65c..00000000000
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __VOLUME_SCATTER__
-
-ccl_device_inline void kernel_path_volume_connect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- float3 throughput,
- ccl_addr_space PathState *state,
- PathRadiance *L)
-{
-# ifdef __EMISSION__
- /* sample illumination from lights to find path contribution */
- Ray light_ray ccl_optional_struct_init;
- BsdfEval L_light ccl_optional_struct_init;
- bool is_lamp = false;
- bool has_emission = false;
-
- light_ray.t = 0.0f;
-# ifdef __OBJECT_MOTION__
- /* connect to light from given point where shader has been evaluated */
- light_ray.time = sd->time;
-# endif
-
- if (kernel_data.integrator.use_direct_light) {
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
-
- LightSample ls ccl_optional_struct_init;
- if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- float terminate = path_state_rng_light_termination(kg, state);
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission && !blocked) {
- /* accumulate */
- path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
- }
-# endif /* __EMISSION__ */
-}
-
-ccl_device_noinline_cpu bool kernel_path_volume_bounce(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space float3 *throughput,
- ccl_addr_space PathState *state,
- PathRadianceState *L_state,
- ccl_addr_space Ray *ray)
-{
- /* sample phase function */
- float phase_pdf;
- BsdfEval phase_eval ccl_optional_struct_init;
- float3 phase_omega_in ccl_optional_struct_init;
- differential3 phase_domega_in ccl_optional_struct_init;
- float phase_u, phase_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v);
- int label;
-
- label = shader_volume_phase_sample(
- kg, sd, phase_u, phase_v, &phase_eval, &phase_omega_in, &phase_domega_in, &phase_pdf);
-
- if (phase_pdf == 0.0f || bsdf_eval_is_zero(&phase_eval))
- return false;
-
- /* modify throughput */
- path_radiance_bsdf_bounce(kg, L_state, throughput, &phase_eval, phase_pdf, state->bounce, label);
-
- /* set labels */
- state->ray_pdf = phase_pdf;
-# ifdef __LAMP_MIS__
- state->ray_t = 0.0f;
-# endif
- state->min_ray_pdf = fminf(phase_pdf, state->min_ray_pdf);
-
- /* update path state */
- path_state_next(kg, state, label);
-
- /* Russian roulette termination of volume ray scattering. */
- float probability = path_state_continuation_probability(kg, state, *throughput);
-
- if (probability == 0.0f) {
- return false;
- }
- else if (probability != 1.0f) {
- /* Use dimension from the previous bounce, has not been used yet. */
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE - PRNG_BOUNCE_NUM);
-
- if (terminate >= probability) {
- return false;
- }
-
- *throughput /= probability;
- }
-
- /* setup ray */
- ray->P = sd->P;
- ray->D = phase_omega_in;
- ray->t = FLT_MAX;
-
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD = phase_domega_in;
-# endif
-
- return true;
-}
-
-# if !defined(__SPLIT_KERNEL__) && (defined(__BRANCHED_PATH__) || defined(__VOLUME_DECOUPLED__))
-ccl_device void kernel_branched_path_volume_connect_light(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *emission_sd,
- float3 throughput,
- ccl_addr_space PathState *state,
- PathRadiance *L,
- bool sample_all_lights,
- Ray *ray,
- const VolumeSegment *segment)
-{
-# ifdef __EMISSION__
- BsdfEval L_light ccl_optional_struct_init;
-
- int num_lights = 1;
- if (sample_all_lights) {
- num_lights = kernel_data.integrator.num_all_lights;
- if (kernel_data.integrator.pdf_triangles != 0.0f) {
- num_lights += 1;
- }
- }
-
- for (int i = 0; i < num_lights; ++i) {
- /* sample one light at random */
- int num_samples = 1;
- int num_all_lights = 1;
- uint lamp_rng_hash = state->rng_hash;
- bool double_pdf = false;
- bool is_mesh_light = false;
- bool is_lamp = false;
-
- if (sample_all_lights) {
- /* lamp sampling */
- is_lamp = i < kernel_data.integrator.num_all_lights;
- if (is_lamp) {
- if (UNLIKELY(light_select_reached_max_bounces(kg, i, state->bounce))) {
- continue;
- }
- num_samples = light_select_num_samples(kg, i);
- num_all_lights = kernel_data.integrator.num_all_lights;
- lamp_rng_hash = cmj_hash(state->rng_hash, i);
- double_pdf = kernel_data.integrator.pdf_triangles != 0.0f;
- }
- /* mesh light sampling */
- else {
- num_samples = kernel_data.integrator.mesh_light_samples;
- double_pdf = kernel_data.integrator.num_all_lights != 0;
- is_mesh_light = true;
- }
- }
-
- float num_samples_inv = 1.0f / (num_samples * num_all_lights);
-
- for (int j = 0; j < num_samples; j++) {
- Ray light_ray ccl_optional_struct_init;
- light_ray.t = 0.0f; /* reset ray */
-# ifdef __OBJECT_MOTION__
- light_ray.time = sd->time;
-# endif
- bool has_emission = false;
-
- float3 tp = throughput;
-
- if (kernel_data.integrator.use_direct_light) {
- /* sample random position on random light/triangle */
- float light_u, light_v;
- path_branched_rng_2D(
- kg, lamp_rng_hash, state, j, num_samples, PRNG_LIGHT_U, &light_u, &light_v);
-
- /* only sample triangle lights */
- if (is_mesh_light && double_pdf) {
- light_u = 0.5f * light_u;
- }
-
- LightSample ls ccl_optional_struct_init;
- const int lamp = is_lamp ? i : -1;
- light_sample(kg, lamp, light_u, light_v, sd->time, ray->P, state->bounce, &ls);
-
- /* sample position on volume segment */
- float rphase = path_branched_rng_1D(
- kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
- float rscatter = path_branched_rng_1D(
- kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
-
- VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
- state,
- ray,
- sd,
- &tp,
- rphase,
- rscatter,
- segment,
- (ls.t != FLT_MAX) ? &ls.P :
- NULL,
- false);
-
- if (result == VOLUME_PATH_SCATTERED) {
- /* todo: split up light_sample so we don't have to call it again with new position */
- if (light_sample(kg, lamp, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- if (double_pdf) {
- ls.pdf *= 2.0f;
- }
-
- /* sample random light */
- float terminate = path_branched_rng_light_termination(
- kg, state->rng_hash, state, j, num_samples);
- has_emission = direct_emission(
- kg, sd, emission_sd, &ls, state, &light_ray, &L_light, &is_lamp, terminate);
- }
- }
- }
-
- /* trace shadow ray */
- float3 shadow;
-
- const bool blocked = shadow_blocked(kg, sd, emission_sd, state, &light_ray, &shadow);
-
- if (has_emission && !blocked) {
- /* accumulate */
- path_radiance_accum_light(
- kg, L, state, tp * num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
- }
- }
- }
-# endif /* __EMISSION__ */
-}
-# endif /* __SPLIT_KERNEL__ */
-
-#endif /* __VOLUME_SCATTER__ */
-
-CCL_NAMESPACE_END
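
The core estimator step in the removed kernel_path_volume_bounce is: sample a direction from the phase function, reject zero-pdf samples, and scale the throughput by eval/pdf. A self-contained sketch using an isotropic phase function, for which eval/pdf is exactly 1:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>

int main()
{
  const float kPi = 3.14159265358979f;
  std::mt19937 rng(7);
  std::uniform_real_distribution<float> u01(0.0f, 1.0f);

  /* Isotropic phase function: sample a uniform direction on the sphere. */
  const float u = u01(rng), v = u01(rng);
  const float cos_theta = 1.0f - 2.0f * u;
  const float sin_theta = std::sqrt(std::max(0.0f, 1.0f - cos_theta * cos_theta));
  const float phi = 2.0f * kPi * v;
  const float dir[3] = {sin_theta * std::cos(phi), sin_theta * std::sin(phi), cos_theta};

  const float pdf = 1.0f / (4.0f * kPi);
  const float eval = 1.0f / (4.0f * kPi);
  if (pdf == 0.0f) {
    return 0; /* mirrors the early-out when sampling fails */
  }

  float throughput = 1.0f;
  throughput *= eval / pdf; /* exactly 1 for isotropic scattering */
  printf("dir = (%f, %f, %f), throughput = %f\n", dir[0], dir[1], dir[2], throughput);
  return 0;
}
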
diff --git a/intern/cycles/kernel/kernel_profiling.h b/intern/cycles/kernel/kernel_profiling.h
index 780830879d8..db8644005ea 100644
--- a/intern/cycles/kernel/kernel_profiling.h
+++ b/intern/cycles/kernel/kernel_profiling.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_PROFILING_H__
-#define __KERNEL_PROFILING_H__
+#pragma once
#ifdef __KERNEL_CPU__
# include "util/util_profiling.h"
@@ -24,23 +23,18 @@
CCL_NAMESPACE_BEGIN
#ifdef __KERNEL_CPU__
-# define PROFILING_INIT(kg, event) ProfilingHelper profiling_helper(&kg->profiler, event)
+# define PROFILING_INIT(kg, event) \
+ ProfilingHelper profiling_helper((ProfilingState *)&kg->profiler, event)
# define PROFILING_EVENT(event) profiling_helper.set_event(event)
-# define PROFILING_SHADER(shader) \
- if ((shader) != SHADER_NONE) { \
- profiling_helper.set_shader((shader)&SHADER_MASK); \
- }
-# define PROFILING_OBJECT(object) \
- if ((object) != PRIM_NONE) { \
- profiling_helper.set_object(object); \
- }
+# define PROFILING_INIT_FOR_SHADER(kg, event) \
+ ProfilingWithShaderHelper profiling_helper((ProfilingState *)&kg->profiler, event)
+# define PROFILING_SHADER(object, shader) \
+ profiling_helper.set_shader(object, (shader)&SHADER_MASK);
#else
# define PROFILING_INIT(kg, event)
# define PROFILING_EVENT(event)
-# define PROFILING_SHADER(shader)
-# define PROFILING_OBJECT(object)
+# define PROFILING_INIT_FOR_SHADER(kg, event)
+# define PROFILING_SHADER(object, shader)
#endif /* __KERNEL_CPU__ */
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_PROFILING_H__ */
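
PROFILING_INIT declares a stack object, so profiling scopes open and close via RAII. The following is one plausible shape of such a helper, written as an illustration; it is not the actual ProfilingHelper from util_profiling:

#include <chrono>
#include <cstdio>

/* Hypothetical RAII helper in the spirit of ProfilingHelper: the
 * constructor starts attributing time to an event, set_event() switches
 * events mid-scope, and the destructor accounts the tail on scope exit. */
class ScopedProfiling {
 public:
  explicit ScopedProfiling(const char *event) : event_(event), start_(clock::now()) {}

  void set_event(const char *event)
  {
    flush();
    event_ = event;
  }

  ~ScopedProfiling()
  {
    flush();
  }

 private:
  using clock = std::chrono::steady_clock;

  /* Report time spent on the current event and restart the timer. */
  void flush()
  {
    const double ms = std::chrono::duration<double, std::milli>(clock::now() - start_).count();
    printf("%s: %.3f ms\n", event_, ms);
    start_ = clock::now();
  }

  const char *event_;
  clock::time_point start_;
};

int main()
{
  ScopedProfiling profiling("intersect"); /* like PROFILING_INIT(kg, event) */
  /* ... trace rays ... */
  profiling.set_event("shade"); /* like PROFILING_EVENT(event) */
  /* ... run shaders ... */
  return 0; /* destructor accounts the remainder of the scope */
}
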
diff --git a/intern/cycles/kernel/kernel_projection.h b/intern/cycles/kernel/kernel_projection.h
index c33d7150b5c..192bf7ca5aa 100644
--- a/intern/cycles/kernel/kernel_projection.h
+++ b/intern/cycles/kernel/kernel_projection.h
@@ -30,8 +30,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef __KERNEL_PROJECTION_CL__
-#define __KERNEL_PROJECTION_CL__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -257,5 +256,3 @@ ccl_device_inline void spherical_stereo_transform(ccl_constant KernelCamera *cam
}
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_PROJECTION_CL__ */
diff --git a/intern/cycles/kernel/kernel_queues.h b/intern/cycles/kernel/kernel_queues.h
deleted file mode 100644
index d8cc08b3e85..00000000000
--- a/intern/cycles/kernel/kernel_queues.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_QUEUE_H__
-#define __KERNEL_QUEUE_H__
-
-CCL_NAMESPACE_BEGIN
-
-/*
- * Queue utility functions for split kernel
- */
-#ifdef __KERNEL_OPENCL__
-# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-# pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
-#endif
-
-/*
- * Enqueue ray index into the queue
- */
-ccl_device void enqueue_ray_index(
- int ray_index, /* Ray index to be enqueued. */
- int queue_number, /* Queue in which the ray index should be enqueued. */
- ccl_global int *queues, /* Buffer of all queues. */
- int queue_size, /* Size of each queue. */
- ccl_global int *queue_index) /* Array of size num_queues; Used for atomic increment. */
-{
- /* This thread's queue index. */
- int my_queue_index = atomic_fetch_and_inc_uint32((ccl_global uint *)&queue_index[queue_number]) +
- (queue_number * queue_size);
- queues[my_queue_index] = ray_index;
-}
-
-/*
- * Get the ray index for this thread
- * Returns a positive ray_index for threads that have to do some work;
- * Returns 'QUEUE_EMPTY_SLOT' for threads that don't have any work
- * i.e All ray's in the queue has been successfully allocated and there
- * is no more ray to allocate to other threads.
- */
-ccl_device int get_ray_index(
- KernelGlobals *kg,
- int thread_index, /* Global thread index. */
- int queue_number, /* Queue to operate on. */
- ccl_global int *queues, /* Buffer of all queues. */
- int queuesize, /* Size of a queue. */
- int empty_queue) /* Empty the queue slot as soon as we fetch the ray index. */
-{
- int ray_index = queues[queue_number * queuesize + thread_index];
- if (empty_queue && ray_index != QUEUE_EMPTY_SLOT) {
- queues[queue_number * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- }
- return ray_index;
-}
-
-/* The following functions are to realize Local memory variant of enqueue ray index function. */
-
-/* All threads should call this function. */
-ccl_device void enqueue_ray_index_local(
- int ray_index, /* Ray index to enqueue. */
- int queue_number, /* Queue in which to enqueue ray index. */
- char enqueue_flag, /* True for threads whose ray index has to be enqueued. */
- int queuesize, /* queue size. */
- ccl_local_param unsigned int *local_queue_atomics, /* To do local queue atomics. */
- ccl_global int *Queue_data, /* Queues. */
- ccl_global int *Queue_index) /* To do global queue atomics. */
-{
- int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
-
- /* Get local queue id. */
- unsigned int lqidx;
- if (enqueue_flag) {
- lqidx = atomic_fetch_and_inc_uint32(local_queue_atomics);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- /* Get global queue offset. */
- if (lidx == 0) {
- *local_queue_atomics = atomic_fetch_and_add_uint32(
- (ccl_global uint *)&Queue_index[queue_number], *local_queue_atomics);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- /* Get global queue index and enqueue ray. */
- if (enqueue_flag) {
- unsigned int my_gqidx = queue_number * queuesize + (*local_queue_atomics) + lqidx;
- Queue_data[my_gqidx] = ray_index;
- }
-}
-
-ccl_device unsigned int get_local_queue_index(
- int queue_number, /* Queue in which to enqueue the ray; -1 if no queue */
- ccl_local_param unsigned int *local_queue_atomics)
-{
- int my_lqidx = atomic_fetch_and_inc_uint32(&local_queue_atomics[queue_number]);
- return my_lqidx;
-}
-
-ccl_device unsigned int get_global_per_queue_offset(
- int queue_number,
- ccl_local_param unsigned int *local_queue_atomics,
- ccl_global int *global_queue_atomics)
-{
- unsigned int queue_offset = atomic_fetch_and_add_uint32(
- (ccl_global uint *)&global_queue_atomics[queue_number], local_queue_atomics[queue_number]);
- return queue_offset;
-}
-
-ccl_device unsigned int get_global_queue_index(
- int queue_number,
- int queuesize,
- unsigned int lqidx,
- ccl_local_param unsigned int *global_per_queue_offset)
-{
- int my_gqidx = queuesize * queue_number + lqidx + global_per_queue_offset[queue_number];
- return my_gqidx;
-}
-
-ccl_device int dequeue_ray_index(int queue_number,
- ccl_global int *queues,
- int queue_size,
- ccl_global int *queue_index)
-{
- int index = atomic_fetch_and_dec_uint32((ccl_global uint *)&queue_index[queue_number]) - 1;
-
- if (index < 0) {
- return QUEUE_EMPTY_SLOT;
- }
-
- return queues[index + queue_number * queue_size];
-}
-
-CCL_NAMESPACE_END
-
-#endif // __KERNEL_QUEUE_H__
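
The deleted queue helpers share one pattern: all queues live back to back in a flat buffer, and an atomic per-queue counter hands out slots for enqueue and dequeue. A CPU sketch of that pattern with std::atomic, leaving out the split-kernel's local-memory staging:

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

constexpr int QUEUE_EMPTY_SLOT = -1;

struct Queues {
  std::vector<int> data;               /* num_queues * queue_size entries */
  std::vector<std::atomic<int>> index; /* one fill counter per queue */
  int queue_size;

  Queues(int num_queues, int size)
      : data(num_queues * size, QUEUE_EMPTY_SLOT), index(num_queues), queue_size(size)
  {
  }

  /* Like enqueue_ray_index: an atomic increment reserves a unique slot. */
  void enqueue(int ray_index, int queue_number)
  {
    const int slot = index[queue_number].fetch_add(1);
    data[queue_number * queue_size + slot] = ray_index;
  }

  /* Like dequeue_ray_index: an atomic decrement pops the last entry. */
  int dequeue(int queue_number)
  {
    const int slot = index[queue_number].fetch_sub(1) - 1;
    if (slot < 0) {
      return QUEUE_EMPTY_SLOT;
    }
    return data[queue_number * queue_size + slot];
  }
};

int main()
{
  Queues queues(/*num_queues*/ 2, /*queue_size*/ 8);
  std::thread a([&] { queues.enqueue(10, 0); });
  std::thread b([&] { queues.enqueue(11, 0); });
  a.join();
  b.join();
  for (int r; (r = queues.dequeue(0)) != QUEUE_EMPTY_SLOT;) {
    printf("ray %d\n", r);
  }
  return 0;
}
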
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index 49e5e25c2e0..240c92bf9d0 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+#pragma once
#include "kernel/kernel_jitter.h"
#include "util/util_hash.h"
@@ -37,38 +38,34 @@ CCL_NAMESPACE_BEGIN
*/
# define SOBOL_SKIP 64
-ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
+ccl_device uint sobol_dimension(const KernelGlobals *kg, int index, int dimension)
{
uint result = 0;
uint i = index + SOBOL_SKIP;
for (int j = 0, x; (x = find_first_set(i)); i >>= x) {
j += x;
- result ^= kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1);
+ result ^= __float_as_uint(kernel_tex_fetch(__sample_pattern_lut, 32 * dimension + j - 1));
}
return result;
}
#endif /* __SOBOL__ */
-ccl_device_forceinline float path_rng_1D(
- KernelGlobals *kg, uint rng_hash, int sample, int num_samples, int dimension)
+ccl_device_forceinline float path_rng_1D(const KernelGlobals *kg,
+ uint rng_hash,
+ int sample,
+ int dimension)
{
#ifdef __DEBUG_CORRELATION__
return (float)drand48();
#endif
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) {
- return pmj_sample_1D(kg, sample, rng_hash, dimension);
- }
-#ifdef __CMJ__
-# ifdef __SOBOL__
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
-# endif
+
+#ifdef __SOBOL__
+ if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ)
+#endif
{
- /* Correlated multi-jitter. */
- int p = rng_hash + dimension;
- return cmj_sample_1D(sample, num_samples, p);
+ return pmj_sample_1D(kg, sample, rng_hash, dimension);
}
-#endif
#ifdef __SOBOL__
/* Sobol sequence value using direction vectors. */
@@ -88,68 +85,72 @@ ccl_device_forceinline float path_rng_1D(
#endif
}
-ccl_device_forceinline void path_rng_2D(KernelGlobals *kg,
- uint rng_hash,
- int sample,
- int num_samples,
- int dimension,
- float *fx,
- float *fy)
+ccl_device_forceinline void path_rng_2D(
+ const KernelGlobals *kg, uint rng_hash, int sample, int dimension, float *fx, float *fy)
{
#ifdef __DEBUG_CORRELATION__
*fx = (float)drand48();
*fy = (float)drand48();
return;
#endif
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ) {
- const float2 f = pmj_sample_2D(kg, sample, rng_hash, dimension);
- *fx = f.x;
- *fy = f.y;
- return;
- }
-#ifdef __CMJ__
-# ifdef __SOBOL__
- if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ)
-# endif
+
+#ifdef __SOBOL__
+ if (kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_PMJ)
+#endif
{
- /* Correlated multi-jitter. */
- int p = rng_hash + dimension;
- cmj_sample_2D(sample, num_samples, p, fx, fy);
+ pmj_sample_2D(kg, sample, rng_hash, dimension, fx, fy);
+
return;
}
-#endif
#ifdef __SOBOL__
/* Sobol. */
- *fx = path_rng_1D(kg, rng_hash, sample, num_samples, dimension);
- *fy = path_rng_1D(kg, rng_hash, sample, num_samples, dimension + 1);
+ *fx = path_rng_1D(kg, rng_hash, sample, dimension);
+ *fy = path_rng_1D(kg, rng_hash, sample, dimension + 1);
#endif
}
-ccl_device_inline void path_rng_init(KernelGlobals *kg,
- int sample,
- int num_samples,
- uint *rng_hash,
- int x,
- int y,
- float *fx,
- float *fy)
+/**
+ * 1D hash recommended in "Hash Functions for GPU Rendering", JCGT Vol. 9, No. 3, 2020.
+ * See https://www.shadertoy.com/view/4tXyWN and https://www.shadertoy.com/view/XlGcRh
+ * http://www.jcgt.org/published/0009/03/02/paper.pdf
+ */
+ccl_device_inline uint hash_iqint1(uint n)
+{
+ n = (n << 13U) ^ n;
+ n = n * (n * n * 15731U + 789221U) + 1376312589U;
+
+ return n;
+}
+
+/**
+ * 2D hash recommended in "Hash Functions for GPU Rendering", JCGT Vol. 9, No. 3, 2020.
+ * See https://www.shadertoy.com/view/4tXyWN and https://www.shadertoy.com/view/XlGcRh
+ * http://www.jcgt.org/published/0009/03/02/paper.pdf
+ */
+ccl_device_inline uint hash_iqnt2d(const uint x, const uint y)
{
- /* load state */
- *rng_hash = hash_uint2(x, y);
- *rng_hash ^= kernel_data.integrator.seed;
+ const uint qx = 1103515245U * ((x >> 1U) ^ (y));
+ const uint qy = 1103515245U * ((y >> 1U) ^ (x));
+ const uint n = 1103515245U * ((qx) ^ (qy >> 3U));
+
+ return n;
+}
+
+ccl_device_inline uint path_rng_hash_init(const KernelGlobals *ccl_restrict kg,
+ const int sample,
+ const int x,
+ const int y)
+{
+ const uint rng_hash = hash_iqnt2d(x, y) ^ kernel_data.integrator.seed;
#ifdef __DEBUG_CORRELATION__
- srand48(*rng_hash + sample);
+ srand48(rng_hash + sample);
+#else
+ (void)sample;
#endif
- if (sample == 0) {
- *fx = 0.5f;
- *fy = 0.5f;
- }
- else {
- path_rng_2D(kg, *rng_hash, sample, num_samples, PRNG_FILTER_U, fx, fy);
- }
+ return rng_hash;
}
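
A standalone copy of the 2D hash added above, showing how the per-pixel rng_hash is derived by XOR-ing with the integrator seed; the seed value here is illustrative:

#include <cstdint>
#include <cstdio>

/* Same construction as hash_iqnt2d above ("Hash Functions for GPU
 * Rendering", JCGT 9(3), 2020). */
static uint32_t hash_iqnt2d(uint32_t x, uint32_t y)
{
  const uint32_t qx = 1103515245U * ((x >> 1U) ^ y);
  const uint32_t qy = 1103515245U * ((y >> 1U) ^ x);
  return 1103515245U * (qx ^ (qy >> 3U));
}

int main()
{
  const uint32_t seed = 0x12345678u; /* stands in for kernel_data.integrator.seed */
  /* Neighboring pixels get decorrelated hashes, which in turn scramble the
   * low-discrepancy sequence differently per pixel. */
  for (uint32_t y = 0; y < 2; y++) {
    for (uint32_t x = 0; x < 2; x++) {
      printf("pixel (%u, %u): rng_hash = 0x%08x\n", x, y, hash_iqnt2d(x, y) ^ seed);
    }
  }
  return 0;
}
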
/* Linear Congruential Generator */
@@ -175,113 +176,12 @@ ccl_device uint lcg_init(uint seed)
return rng;
}
-/* Path Tracing Utility Functions
- *
- * For each random number in each step of the path we must have a unique
- * dimension to avoid using the same sequence twice.
- *
- * For branches in the path we must be careful not to reuse the same number
- * in a sequence and offset accordingly.
- */
-
-ccl_device_inline float path_state_rng_1D(KernelGlobals *kg,
- const ccl_addr_space PathState *state,
- int dimension)
-{
- return path_rng_1D(
- kg, state->rng_hash, state->sample, state->num_samples, state->rng_offset + dimension);
-}
-
-ccl_device_inline void path_state_rng_2D(
- KernelGlobals *kg, const ccl_addr_space PathState *state, int dimension, float *fx, float *fy)
-{
- path_rng_2D(kg,
- state->rng_hash,
- state->sample,
- state->num_samples,
- state->rng_offset + dimension,
- fx,
- fy);
-}
-
-ccl_device_inline float path_state_rng_1D_hash(KernelGlobals *kg,
- const ccl_addr_space PathState *state,
- uint hash)
-{
- /* Use a hash instead of dimension, this is not great but avoids adding
- * more dimensions to each bounce which reduces quality of dimensions we
- * are already using. */
- return path_rng_1D(kg,
- cmj_hash_simple(state->rng_hash, hash),
- state->sample,
- state->num_samples,
- state->rng_offset);
-}
-
-ccl_device_inline float path_branched_rng_1D(KernelGlobals *kg,
- uint rng_hash,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches,
- int dimension)
-{
- return path_rng_1D(kg,
- rng_hash,
- state->sample * num_branches + branch,
- state->num_samples * num_branches,
- state->rng_offset + dimension);
-}
-
-ccl_device_inline void path_branched_rng_2D(KernelGlobals *kg,
- uint rng_hash,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches,
- int dimension,
- float *fx,
- float *fy)
-{
- path_rng_2D(kg,
- rng_hash,
- state->sample * num_branches + branch,
- state->num_samples * num_branches,
- state->rng_offset + dimension,
- fx,
- fy);
-}
-
-/* Utility functions to get light termination value,
- * since it might not be needed in many cases.
- */
-ccl_device_inline float path_state_rng_light_termination(KernelGlobals *kg,
- const ccl_addr_space PathState *state)
-{
- if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
- }
- return 0.0f;
-}
-
-ccl_device_inline float path_branched_rng_light_termination(KernelGlobals *kg,
- uint rng_hash,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches)
-{
- if (kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_branched_rng_1D(kg, rng_hash, state, branch, num_branches, PRNG_LIGHT_TERMINATE);
- }
- return 0.0f;
-}
-
-ccl_device_inline uint lcg_state_init(PathState *state, uint scramble)
-{
- return lcg_init(state->rng_hash + state->rng_offset + state->sample * scramble);
-}
-
-ccl_device_inline uint lcg_state_init_addrspace(ccl_addr_space PathState *state, uint scramble)
+ccl_device_inline uint lcg_state_init(const uint rng_hash,
+ const uint rng_offset,
+ const uint sample,
+ const uint scramble)
{
- return lcg_init(state->rng_hash + state->rng_offset + state->sample * scramble);
+ return lcg_init(rng_hash + rng_offset + sample * scramble);
}
ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng)
@@ -301,8 +201,6 @@ ccl_device_inline bool sample_is_even(int pattern, int sample)
return __builtin_popcount(sample & 0xaaaaaaaa) & 1;
#elif defined(__NVCC__)
return __popc(sample & 0xaaaaaaaa) & 1;
-#elif defined(__KERNEL_OPENCL__)
- return popcount(sample & 0xaaaaaaaa) & 1;
#else
/* TODO(Stefan): pop-count intrinsic for Windows with fallback for older CPUs. */
int i = sample & 0xaaaaaaaa;
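
One portable way to fill in the fallback branch the TODO asks for is a classic SWAR popcount; this is a sketch of that option, not code from the patch:

#include <cstdint>
#include <cstdio>

/* Branch-free 32-bit population count, usable where no intrinsic exists. */
static int popcount32(uint32_t i)
{
  i = i - ((i >> 1) & 0x55555555u);
  i = (i & 0x33333333u) + ((i >> 2) & 0x33333333u);
  i = (i + (i >> 4)) & 0x0f0f0f0fu;
  return (int)((i * 0x01010101u) >> 24);
}

/* Parity of the odd-numbered bits, matching the intrinsic-based paths of
 * sample_is_even above. */
static bool sample_is_even(int sample)
{
  return popcount32((uint32_t)sample & 0xaaaaaaaau) & 1;
}

int main()
{
  for (int sample = 0; sample < 8; sample++) {
    printf("sample %d -> %s\n", sample, sample_is_even(sample) ? "even" : "odd");
  }
  return 0;
}
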
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 7f02e6fc7b3..3052bb53040 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -14,14 +14,9 @@
* limitations under the License.
*/
-/*
- * ShaderData, used in four steps:
- *
- * Setup from incoming ray, sampled position and background.
- * Execute for surface, volume or displacement.
- * Evaluate one or more closures.
- * Release.
- */
+/* Functions to evaluate shaders and use the resulting shader closures. */
+
+#pragma once
// clang-format off
#include "kernel/closure/alloc.h"
@@ -30,479 +25,39 @@
#include "kernel/closure/emissive.h"
// clang-format on
+#include "kernel/kernel_accumulate.h"
#include "kernel/svm/svm.h"
-CCL_NAMESPACE_BEGIN
-
-/* ShaderData setup from incoming ray */
-
-#ifdef __OBJECT_MOTION__
-ccl_device void shader_setup_object_transforms(KernelGlobals *kg, ShaderData *sd, float time)
-{
- if (sd->object_flag & SD_OBJECT_MOTION) {
- sd->ob_tfm = object_fetch_transform_motion(kg, sd->object, time);
- sd->ob_itfm = transform_quick_inverse(sd->ob_tfm);
- }
- else {
- sd->ob_tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
- sd->ob_itfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
- }
-}
-#endif
-
-#ifdef __KERNEL_OPTIX__
-ccl_device_inline
-#else
-ccl_device_noinline
-#endif
- void
- shader_setup_from_ray(KernelGlobals *kg,
- ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- sd->object = (isect->object == OBJECT_NONE) ? kernel_tex_fetch(__prim_object, isect->prim) :
- isect->object;
- sd->lamp = LAMP_NONE;
-
- sd->type = isect->type;
- sd->flag = 0;
- sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
-
- /* matrices and time */
-#ifdef __OBJECT_MOTION__
- shader_setup_object_transforms(kg, sd, ray->time);
-#endif
- sd->time = ray->time;
-
- sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
- sd->ray_length = isect->t;
-
- sd->u = isect->u;
- sd->v = isect->v;
-
-#ifdef __HAIR__
- if (sd->type & PRIMITIVE_ALL_CURVE) {
- /* curve */
- curve_shader_setup(kg, sd, isect, ray);
- }
- else
-#endif
- if (sd->type & PRIMITIVE_TRIANGLE) {
- /* static triangle */
- float3 Ng = triangle_normal(kg, sd);
- sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
-
- /* vectors */
- sd->P = triangle_refine(kg, sd, isect, ray);
- sd->Ng = Ng;
- sd->N = Ng;
-
- /* smooth normal */
- if (sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
-
-#ifdef __DPDU__
- /* dPdu/dPdv */
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
-#endif
- }
- else {
- /* motion triangle */
- motion_triangle_shader_setup(kg, sd, isect, ray, false);
- }
-
- sd->I = -ray->D;
-
- sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
-
- if (isect->object != OBJECT_NONE) {
- /* instance transform */
- object_normal_transform_auto(kg, sd, &sd->N);
- object_normal_transform_auto(kg, sd, &sd->Ng);
-#ifdef __DPDU__
- object_dir_transform_auto(kg, sd, &sd->dPdu);
- object_dir_transform_auto(kg, sd, &sd->dPdv);
-#endif
- }
-
- /* backfacing test */
- bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
-
- if (backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
-#ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
-#endif
- }
-
-#ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- differential_transfer(&sd->dP, ray->dP, ray->D, ray->dD, sd->Ng, isect->t);
- differential_incoming(&sd->dI, ray->dD);
- differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
-#endif
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-
-/* ShaderData setup from BSSRDF scatter */
-
-#ifdef __SUBSURFACE__
-# ifndef __KERNEL_CUDA__
-ccl_device
-# else
-ccl_device_inline
-# endif
- void
- shader_setup_from_subsurface(KernelGlobals *kg,
- ShaderData *sd,
- const Intersection *isect,
- const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- const bool backfacing = sd->flag & SD_BACKFACING;
-
- /* object, matrices, time, ray_length stay the same */
- sd->flag = 0;
- sd->object_flag = kernel_tex_fetch(__object_flag, sd->object);
- sd->prim = kernel_tex_fetch(__prim_index, isect->prim);
- sd->type = isect->type;
-
- sd->u = isect->u;
- sd->v = isect->v;
-
- /* fetch triangle data */
- if (sd->type == PRIMITIVE_TRIANGLE) {
- float3 Ng = triangle_normal(kg, sd);
- sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
-
- /* static triangle */
- sd->P = triangle_refine_local(kg, sd, isect, ray);
- sd->Ng = Ng;
- sd->N = Ng;
-
- if (sd->shader & SHADER_SMOOTH_NORMAL)
- sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
-
-# ifdef __DPDU__
- /* dPdu/dPdv */
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
-# endif
- }
- else {
- /* motion triangle */
- motion_triangle_shader_setup(kg, sd, isect, ray, true);
- }
-
- sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
-
- if (isect->object != OBJECT_NONE) {
- /* instance transform */
- object_normal_transform_auto(kg, sd, &sd->N);
- object_normal_transform_auto(kg, sd, &sd->Ng);
-# ifdef __DPDU__
- object_dir_transform_auto(kg, sd, &sd->dPdu);
- object_dir_transform_auto(kg, sd, &sd->dPdv);
-# endif
- }
-
- /* backfacing test */
- if (backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
-# ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
-# endif
- }
-
- /* should not get used in principle as the shading will only use a diffuse
- * BSDF, but the shader might still access it */
- sd->I = sd->N;
-
-# ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- differential_dudv(&sd->du, &sd->dv, sd->dPdu, sd->dPdv, sd->dP, sd->Ng);
- /* don't modify dP and dI */
-# endif
-
- PROFILING_SHADER(sd->shader);
-}
-#endif
-
-/* ShaderData setup from position sampled on mesh */
-
-ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
- ShaderData *sd,
- const float3 P,
- const float3 Ng,
- const float3 I,
- int shader,
- int object,
- int prim,
- float u,
- float v,
- float t,
- float time,
- bool object_space,
- int lamp)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- /* vectors */
- sd->P = P;
- sd->N = Ng;
- sd->Ng = Ng;
- sd->I = I;
- sd->shader = shader;
- if (prim != PRIM_NONE)
- sd->type = PRIMITIVE_TRIANGLE;
- else if (lamp != LAMP_NONE)
- sd->type = PRIMITIVE_LAMP;
- else
- sd->type = PRIMITIVE_NONE;
-
- /* primitive */
- sd->object = object;
- sd->lamp = LAMP_NONE;
- /* Currently no access to bvh prim index for strand sd->prim. */
- sd->prim = prim;
- sd->u = u;
- sd->v = v;
- sd->time = time;
- sd->ray_length = t;
-
- sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
- sd->object_flag = 0;
- if (sd->object != OBJECT_NONE) {
- sd->object_flag |= kernel_tex_fetch(__object_flag, sd->object);
-
-#ifdef __OBJECT_MOTION__
- shader_setup_object_transforms(kg, sd, time);
- }
- else if (lamp != LAMP_NONE) {
- sd->ob_tfm = lamp_fetch_transform(kg, lamp, false);
- sd->ob_itfm = lamp_fetch_transform(kg, lamp, true);
- sd->lamp = lamp;
-#else
- }
- else if (lamp != LAMP_NONE) {
- sd->lamp = lamp;
-#endif
- }
-
- /* transform into world space */
- if (object_space) {
- object_position_transform_auto(kg, sd, &sd->P);
- object_normal_transform_auto(kg, sd, &sd->Ng);
- sd->N = sd->Ng;
- object_dir_transform_auto(kg, sd, &sd->I);
- }
-
- if (sd->type & PRIMITIVE_TRIANGLE) {
- /* smooth normal */
- if (sd->shader & SHADER_SMOOTH_NORMAL) {
- sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
-
- if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
- object_normal_transform_auto(kg, sd, &sd->N);
- }
- }
-
- /* dPdu/dPdv */
-#ifdef __DPDU__
- triangle_dPdudv(kg, sd->prim, &sd->dPdu, &sd->dPdv);
-
- if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
- object_dir_transform_auto(kg, sd, &sd->dPdu);
- object_dir_transform_auto(kg, sd, &sd->dPdv);
- }
-#endif
- }
- else {
-#ifdef __DPDU__
- sd->dPdu = zero_float3();
- sd->dPdv = zero_float3();
-#endif
- }
-
- /* backfacing test */
- if (sd->prim != PRIM_NONE) {
- bool backfacing = (dot(sd->Ng, sd->I) < 0.0f);
-
- if (backfacing) {
- sd->flag |= SD_BACKFACING;
- sd->Ng = -sd->Ng;
- sd->N = -sd->N;
-#ifdef __DPDU__
- sd->dPdu = -sd->dPdu;
- sd->dPdv = -sd->dPdv;
-#endif
- }
- }
-
-#ifdef __RAY_DIFFERENTIALS__
- /* no ray differentials here yet */
- sd->dP = differential3_zero();
- sd->dI = differential3_zero();
- sd->du = differential_zero();
- sd->dv = differential_zero();
-#endif
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-
-/* ShaderData setup for displacement */
-
-ccl_device void shader_setup_from_displace(
- KernelGlobals *kg, ShaderData *sd, int object, int prim, float u, float v)
-{
- float3 P, Ng, I = zero_float3();
- int shader;
-
- triangle_point_normal(kg, object, prim, u, v, &P, &Ng, &shader);
-
- /* force smooth shading for displacement */
- shader |= SHADER_SMOOTH_NORMAL;
-
- shader_setup_from_sample(
- kg,
- sd,
- P,
- Ng,
- I,
- shader,
- object,
- prim,
- u,
- v,
- 0.0f,
- 0.5f,
- !(kernel_tex_fetch(__object_flag, object) & SD_OBJECT_TRANSFORM_APPLIED),
- LAMP_NONE);
-}
-
-/* ShaderData setup from ray into background */
-
-ccl_device_inline void shader_setup_from_background(KernelGlobals *kg,
- ShaderData *sd,
- const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- /* vectors */
- sd->P = ray->D;
- sd->N = -ray->D;
- sd->Ng = -ray->D;
- sd->I = -ray->D;
- sd->shader = kernel_data.background.surface_shader;
- sd->flag = kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
- sd->object_flag = 0;
- sd->time = ray->time;
- sd->ray_length = 0.0f;
-
- sd->object = OBJECT_NONE;
- sd->lamp = LAMP_NONE;
- sd->prim = PRIM_NONE;
- sd->u = 0.0f;
- sd->v = 0.0f;
-
-#ifdef __DPDU__
- /* dPdu/dPdv */
- sd->dPdu = zero_float3();
- sd->dPdv = zero_float3();
-#endif
-
-#ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- sd->dP = ray->dD;
- differential_incoming(&sd->dI, sd->dP);
- sd->du = differential_zero();
- sd->dv = differential_zero();
+#ifdef __OSL__
+# include "kernel/osl/osl_shader.h"
#endif
- /* for NDC coordinates */
- sd->ray_P = ray->P;
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-
-/* ShaderData setup from point inside volume */
-
-#ifdef __VOLUME__
-ccl_device_inline void shader_setup_from_volume(KernelGlobals *kg, ShaderData *sd, const Ray *ray)
-{
- PROFILING_INIT(kg, PROFILING_SHADER_SETUP);
-
- /* vectors */
- sd->P = ray->P;
- sd->N = -ray->D;
- sd->Ng = -ray->D;
- sd->I = -ray->D;
- sd->shader = SHADER_NONE;
- sd->flag = 0;
- sd->object_flag = 0;
- sd->time = ray->time;
- sd->ray_length = 0.0f; /* todo: can we set this to some useful value? */
-
- sd->object = OBJECT_NONE; /* todo: fill this for texture coordinates */
- sd->lamp = LAMP_NONE;
- sd->prim = PRIM_NONE;
- sd->type = PRIMITIVE_NONE;
-
- sd->u = 0.0f;
- sd->v = 0.0f;
-
-# ifdef __DPDU__
- /* dPdu/dPdv */
- sd->dPdu = zero_float3();
- sd->dPdv = zero_float3();
-# endif
-
-# ifdef __RAY_DIFFERENTIALS__
- /* differentials */
- sd->dP = ray->dD;
- differential_incoming(&sd->dI, sd->dP);
- sd->du = differential_zero();
- sd->dv = differential_zero();
-# endif
-
- /* for NDC coordinates */
- sd->ray_P = ray->P;
- sd->ray_dP = ray->dP;
-
- PROFILING_SHADER(sd->shader);
- PROFILING_OBJECT(sd->object);
-}
-#endif /* __VOLUME__ */
+CCL_NAMESPACE_BEGIN
/* Merging */
-#if defined(__BRANCHED_PATH__) || defined(__VOLUME__)
-ccl_device_inline void shader_merge_closures(ShaderData *sd)
+#if defined(__VOLUME__)
+ccl_device_inline void shader_merge_volume_closures(ShaderData *sd)
{
- /* merge identical closures, better when we sample a single closure at a time */
+ /* Merge identical closures to save closure space with stacked volumes. */
for (int i = 0; i < sd->num_closure; i++) {
ShaderClosure *sci = &sd->closure[i];
+ if (sci->type != CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+ continue;
+ }
+
for (int j = i + 1; j < sd->num_closure; j++) {
ShaderClosure *scj = &sd->closure[j];
-
- if (sci->type != scj->type)
+ if (sci->type != scj->type) {
continue;
- if (!bsdf_merge(sci, scj))
+ }
+
+ const HenyeyGreensteinVolume *hgi = (const HenyeyGreensteinVolume *)sci;
+ const HenyeyGreensteinVolume *hgj = (const HenyeyGreensteinVolume *)scj;
+ if (!(hgi->g == hgj->g)) {
continue;
+ }
sci->weight += scj->weight;
sci->sample_weight += scj->sample_weight;
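For intuition, the merge above is lossless because a Henyey-Greenstein lobe is fully determined by its anisotropy g; two closures with equal g describe the same distribution, so only their weights need to add. A minimal standalone sketch (generic C++, not the Cycles API) demonstrating this:

    #include <cmath>
    #include <cstdio>

    /* Henyey-Greenstein phase function; depends only on g and the angle. */
    static float phase_hg(float cos_theta, float g)
    {
      const float k_pi = 3.14159265358979f;
      const float denom = 1.0f + g * g - 2.0f * g * cos_theta;
      return (1.0f - g * g) / (4.0f * k_pi * denom * std::sqrt(denom));
    }

    int main()
    {
      /* Two stacked volumes with the same g, e.g. overlapping smoke domains. */
      const float g = 0.2f, cos_theta = 0.5f, w0 = 0.7f, w1 = 0.3f;
      const float separate = w0 * phase_hg(cos_theta, g) + w1 * phase_hg(cos_theta, g);
      const float merged = (w0 + w1) * phase_hg(cos_theta, g);
      std::printf("separate %g == merged %g\n", separate, merged);
      return 0;
    }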
@@ -520,16 +75,40 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd)
}
}
}
-#endif /* __BRANCHED_PATH__ || __VOLUME__ */
-/* Defensive sampling. */
+ccl_device_inline void shader_copy_volume_phases(ShaderVolumePhases *ccl_restrict phases,
+ const ShaderData *ccl_restrict sd)
+{
+ phases->num_closure = 0;
+
+ for (int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *from_sc = &sd->closure[i];
+ const HenyeyGreensteinVolume *from_hg = (const HenyeyGreensteinVolume *)from_sc;
+
+ if (from_sc->type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID) {
+ ShaderVolumeClosure *to_sc = &phases->closure[phases->num_closure];
+
+ to_sc->weight = from_sc->weight;
+ to_sc->sample_weight = from_sc->sample_weight;
+ to_sc->g = from_hg->g;
+ phases->num_closure++;
+ if (phases->num_closure >= MAX_VOLUME_CLOSURE) {
+ break;
+ }
+ }
+ }
+}
+#endif /* __VOLUME__ */
-ccl_device_inline void shader_prepare_closures(ShaderData *sd, ccl_addr_space PathState *state)
+ccl_device_inline void shader_prepare_surface_closures(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd)
{
- /* We can likely also do defensive sampling at deeper bounces, particularly
+ /* Defensive sampling.
+ *
+ * We can likely also do defensive sampling at deeper bounces, particularly
* for cases like a perfect mirror but possibly also others. This will need
* a good heuristic. */
- if (state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) {
+ if (INTEGRATOR_STATE(path, bounce) + INTEGRATOR_STATE(path, transparent_bounce) == 0 &&
+ sd->num_closure > 1) {
float sum = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
@@ -546,98 +125,119 @@ ccl_device_inline void shader_prepare_closures(ShaderData *sd, ccl_addr_space Pa
}
}
}
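The defensive-sampling block above is cut by the hunk boundary, so only its opening and closing braces are visible. The visible intent is to keep every closure sampled often enough at the first bounce; a generic sketch of that idea (hypothetical helper and constant, not the elided Cycles code) is to clamp each sample weight to a minimum fraction of the total, trading a little noise in the dominant closure for much less noise in the rare ones:

    #include <algorithm>

    struct ClosureWeights {
      float sample_weight;
    };

    /* Clamp each closure's sample weight to a minimum share of the total, so
     * low-weight closures still receive samples at the first bounce. The 1/8
     * fraction is an illustrative choice, not a confirmed constant. */
    void defensive_clamp(ClosureWeights *closures, int num, float min_fraction = 0.125f)
    {
      float sum = 0.0f;
      for (int i = 0; i < num; i++)
        sum += closures[i].sample_weight;
      for (int i = 0; i < num; i++)
        closures[i].sample_weight = std::max(closures[i].sample_weight, min_fraction * sum);
    }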
+
+ /* Filter glossy.
+ *
+ * Blurring of BSDF after bounces, for rays that have a small likelihood
+ * of following this particular path (diffuse, rough glossy). */
+ if (kernel_data.integrator.filter_glossy != FLT_MAX) {
+ float blur_pdf = kernel_data.integrator.filter_glossy * INTEGRATOR_STATE(path, min_ray_pdf);
+
+ if (blur_pdf < 1.0f) {
+ float blur_roughness = sqrtf(1.0f - blur_pdf) * 0.5f;
+
+ for (int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+ if (CLOSURE_IS_BSDF(sc->type)) {
+ bsdf_blur(kg, sc, blur_roughness);
+ }
+ }
+ }
+ }
}
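A hypothetical numeric example of the filter-glossy clamp above: with kernel_data.integrator.filter_glossy = 1.0 and a path whose min_ray_pdf is 0.36, blur_pdf = 1.0 * 0.36 = 0.36 < 1.0, so every BSDF is blurred with blur_roughness = sqrtf(1.0 - 0.36) * 0.5 = 0.4. Paths with a higher minimum pdf get blur_pdf >= 1.0 and are left sharp, which is what confines the blur to unlikely caustic-like paths.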
/* BSDF */
-ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg,
- ShaderData *sd,
- const float3 omega_in,
- float *pdf,
- const ShaderClosure *skip_sc,
- BsdfEval *result_eval,
- float sum_pdf,
- float sum_sample_weight)
+ccl_device_inline bool shader_bsdf_is_transmission(const ShaderData *sd, const float3 omega_in)
+{
+ return dot(sd->N, omega_in) < 0.0f;
+}
+
+ccl_device_forceinline bool _shader_bsdf_exclude(ClosureType type, uint light_shader_flags)
+{
+ if (!(light_shader_flags & SHADER_EXCLUDE_ANY)) {
+ return false;
+ }
+ if (light_shader_flags & SHADER_EXCLUDE_DIFFUSE) {
+ if (CLOSURE_IS_BSDF_DIFFUSE(type) || CLOSURE_IS_BSDF_BSSRDF(type)) {
+ return true;
+ }
+ }
+ if (light_shader_flags & SHADER_EXCLUDE_GLOSSY) {
+ if (CLOSURE_IS_BSDF_GLOSSY(type)) {
+ return true;
+ }
+ }
+ if (light_shader_flags & SHADER_EXCLUDE_TRANSMIT) {
+ if (CLOSURE_IS_BSDF_TRANSMISSION(type)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+ccl_device_inline float _shader_bsdf_multi_eval(const KernelGlobals *kg,
+ ShaderData *sd,
+ const float3 omega_in,
+ const bool is_transmission,
+ const ShaderClosure *skip_sc,
+ BsdfEval *result_eval,
+ float sum_pdf,
+ float sum_sample_weight,
+ const uint light_shader_flags)
{
/* This is the Veach one-sample model with balance heuristic; some pdf
* factors drop out when using balance heuristic weighting. */
for (int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
- if (sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) {
- float bsdf_pdf = 0.0f;
- float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
+ if (sc == skip_sc) {
+ continue;
+ }
+
+ if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+ if (CLOSURE_IS_BSDF(sc->type) && !_shader_bsdf_exclude(sc->type, light_shader_flags)) {
+ float bsdf_pdf = 0.0f;
+ float3 eval = bsdf_eval(kg, sd, sc, omega_in, is_transmission, &bsdf_pdf);
- if (bsdf_pdf != 0.0f) {
- bsdf_eval_accum(result_eval, sc->type, eval * sc->weight, 1.0f);
- sum_pdf += bsdf_pdf * sc->sample_weight;
+ if (bsdf_pdf != 0.0f) {
+ const bool is_diffuse = (CLOSURE_IS_BSDF_DIFFUSE(sc->type) ||
+ CLOSURE_IS_BSDF_BSSRDF(sc->type));
+ bsdf_eval_accum(result_eval, is_diffuse, eval * sc->weight, 1.0f);
+ sum_pdf += bsdf_pdf * sc->sample_weight;
+ }
}
sum_sample_weight += sc->sample_weight;
}
}
- *pdf = (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
-}
-
-#ifdef __BRANCHED_PATH__
-ccl_device_inline void _shader_bsdf_multi_eval_branched(KernelGlobals *kg,
- ShaderData *sd,
- const float3 omega_in,
- BsdfEval *result_eval,
- float light_pdf,
- bool use_mis)
-{
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
- if (CLOSURE_IS_BSDF(sc->type)) {
- float bsdf_pdf = 0.0f;
- float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
- if (bsdf_pdf != 0.0f) {
- float mis_weight = use_mis ? power_heuristic(light_pdf, bsdf_pdf) : 1.0f;
- bsdf_eval_accum(result_eval, sc->type, eval * sc->weight, mis_weight);
- }
- }
- }
+ return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
}
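The loop above implements the combined pdf of the one-sample model: each closure i is chosen with probability proportional to its sample weight w_i, so the pdf that enters MIS is the weighted average sum_i(w_i * pdf_i) / sum_i(w_i). A self-contained sketch of just that combination (generic C++, not the Cycles API):

    /* Combined pdf of the Veach one-sample model with balance heuristic:
     * the sample-weight-weighted average of the per-closure pdfs. */
    float one_sample_pdf(const float *pdf, const float *sample_weight, int num)
    {
      float sum_pdf = 0.0f, sum_weight = 0.0f;
      for (int i = 0; i < num; i++) {
        sum_pdf += sample_weight[i] * pdf[i];
        sum_weight += sample_weight[i];
      }
      return (sum_weight > 0.0f) ? sum_pdf / sum_weight : 0.0f;
    }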
-#endif /* __BRANCHED_PATH__ */
#ifndef __KERNEL_CUDA__
ccl_device
#else
ccl_device_inline
#endif
- void
- shader_bsdf_eval(KernelGlobals *kg,
+ float
+ shader_bsdf_eval(const KernelGlobals *kg,
ShaderData *sd,
const float3 omega_in,
- BsdfEval *eval,
- float light_pdf,
- bool use_mis)
+ const bool is_transmission,
+ BsdfEval *bsdf_eval,
+ const uint light_shader_flags)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_EVAL);
-
- bsdf_eval_init(eval, NBUILTIN_CLOSURES, zero_float3(), kernel_data.film.use_light_pass);
+ bsdf_eval_init(bsdf_eval, false, zero_float3());
-#ifdef __BRANCHED_PATH__
- if (kernel_data.integrator.branched)
- _shader_bsdf_multi_eval_branched(kg, sd, omega_in, eval, light_pdf, use_mis);
- else
-#endif
- {
- float pdf;
- _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f);
- if (use_mis) {
- float weight = power_heuristic(light_pdf, pdf);
- bsdf_eval_mis(eval, weight);
- }
- }
+ return _shader_bsdf_multi_eval(
+ kg, sd, omega_in, is_transmission, NULL, bsdf_eval, 0.0f, 0.0f, light_shader_flags);
}
-ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, float *randu)
+/* Randomly sample a BSSRDF or BSDF proportional to ShaderClosure.sample_weight. */
+ccl_device_inline const ShaderClosure *shader_bsdf_bssrdf_pick(const ShaderData *ccl_restrict sd,
+ float *randu)
{
- /* Note the sampling here must match shader_bssrdf_pick,
- * since we reuse the same random number. */
int sampled = 0;
if (sd->num_closure > 1) {
@@ -674,106 +274,33 @@ ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, float *r
}
}
- const ShaderClosure *sc = &sd->closure[sampled];
- return CLOSURE_IS_BSDF(sc->type) ? sc : NULL;
+ return &sd->closure[sampled];
}
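The elided body of this function follows the same pattern that is fully visible in shader_volume_phase_sample further down: pick an index proportionally to the sample weights, then rescale the random number back into [0, 1) so it can be reused for the direction sample without losing stratification. A standalone sketch of that pattern (illustrative only):

    /* Pick index i with probability w[i] / sum(w); rescale *randu for reuse. */
    int pick_weighted(const float *w, int num, float *randu)
    {
      float sum = 0.0f;
      for (int i = 0; i < num; i++)
        sum += w[i];
      const float r = *randu * sum;
      float partial_sum = 0.0f;
      for (int i = 0; i < num; i++) {
        const float next_sum = partial_sum + w[i];
        if (r < next_sum) {
          /* Map r back to [0, 1) within the picked closure's interval. */
          *randu = (r - partial_sum) / w[i];
          return i;
        }
        partial_sum = next_sum;
      }
      return num - 1; /* numerical safety fallback */
    }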
-ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd,
- ccl_addr_space float3 *throughput,
- float *randu)
+/* Return weight for picked BSSRDF. */
+ccl_device_inline float3 shader_bssrdf_sample_weight(const ShaderData *ccl_restrict sd,
+ const ShaderClosure *ccl_restrict bssrdf_sc)
{
- /* Note the sampling here must match shader_bsdf_pick,
- * since we reuse the same random number. */
- int sampled = 0;
+ float3 weight = bssrdf_sc->weight;
if (sd->num_closure > 1) {
- /* Pick a BSDF or BSSRDF or based on sample weights. */
- float sum_bsdf = 0.0f;
- float sum_bssrdf = 0.0f;
-
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_BSDF(sc->type)) {
- sum_bsdf += sc->sample_weight;
- }
- else if (CLOSURE_IS_BSSRDF(sc->type)) {
- sum_bssrdf += sc->sample_weight;
- }
- }
-
- float r = (*randu) * (sum_bsdf + sum_bssrdf);
- float partial_sum = 0.0f;
-
+ float sum = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
- float next_sum = partial_sum + sc->sample_weight;
-
- if (r < next_sum) {
- if (CLOSURE_IS_BSDF(sc->type)) {
- *throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf;
- return NULL;
- }
- else {
- *throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf;
- sampled = i;
-
- /* Rescale to reuse for direction sample, to better preserve stratification. */
- *randu = (r - partial_sum) / sc->sample_weight;
- break;
- }
- }
-
- partial_sum = next_sum;
+ sum += sc->sample_weight;
}
}
+ weight *= sum / bssrdf_sc->sample_weight;
}
- const ShaderClosure *sc = &sd->closure[sampled];
- return CLOSURE_IS_BSSRDF(sc->type) ? sc : NULL;
-}
-
-ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
- ShaderData *sd,
- float randu,
- float randv,
- BsdfEval *bsdf_eval,
- float3 *omega_in,
- differential3 *domega_in,
- float *pdf)
-{
- PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE);
-
- const ShaderClosure *sc = shader_bsdf_pick(sd, &randu);
- if (sc == NULL) {
- *pdf = 0.0f;
- return LABEL_NONE;
- }
-
- /* BSSRDF should already have been handled elsewhere. */
- kernel_assert(CLOSURE_IS_BSDF(sc->type));
-
- int label;
- float3 eval = zero_float3();
-
- *pdf = 0.0f;
- label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
-
- if (*pdf != 0.0f) {
- bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight, kernel_data.film.use_light_pass);
-
- if (sd->num_closure > 1) {
- float sweight = sc->sample_weight;
- _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf * sweight, sweight);
- }
- }
-
- return label;
+ return weight;
}
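For intuition, with hypothetical numbers: if a surface has one BSDF with sample_weight 0.6 and one BSSRDF with sample_weight 0.4, the BSSRDF is picked with probability 0.4, and the scaling above multiplies its weight by (0.6 + 0.4) / 0.4 = 2.5. That exactly cancels the pick probability, keeping the estimator unbiased.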
-ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg,
+/* Sample direction for picked BSDF, and return evaluation and pdf for all
+ * BSDFs combined using MIS. */
+ccl_device int shader_bsdf_sample_closure(const KernelGlobals *kg,
ShaderData *sd,
const ShaderClosure *sc,
float randu,
@@ -783,7 +310,8 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_SAMPLE);
+ /* BSSRDF should already have been handled elsewhere. */
+ kernel_assert(CLOSURE_IS_BSDF(sc->type));
int label;
float3 eval = zero_float3();
@@ -791,19 +319,29 @@ ccl_device int shader_bsdf_sample_closure(KernelGlobals *kg,
*pdf = 0.0f;
label = bsdf_sample(kg, sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
- if (*pdf != 0.0f)
- bsdf_eval_init(bsdf_eval, sc->type, eval * sc->weight, kernel_data.film.use_light_pass);
+ if (*pdf != 0.0f) {
+ const bool is_diffuse = (CLOSURE_IS_BSDF_DIFFUSE(sc->type) ||
+ CLOSURE_IS_BSDF_BSSRDF(sc->type));
+ bsdf_eval_init(bsdf_eval, is_diffuse, eval * sc->weight);
+
+ if (sd->num_closure > 1) {
+ const bool is_transmission = shader_bsdf_is_transmission(sd, *omega_in);
+ float sweight = sc->sample_weight;
+ *pdf = _shader_bsdf_multi_eval(
+ kg, sd, *omega_in, is_transmission, sc, bsdf_eval, *pdf * sweight, sweight, 0);
+ }
+ }
return label;
}
-ccl_device float shader_bsdf_average_roughness(ShaderData *sd)
+ccl_device float shader_bsdf_average_roughness(const ShaderData *sd)
{
float roughness = 0.0f;
float sum_weight = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF(sc->type)) {
/* sqrt once to undo the squaring from multiplying roughness on the
@@ -817,17 +355,7 @@ ccl_device float shader_bsdf_average_roughness(ShaderData *sd)
return (sum_weight > 0.0f) ? roughness / sum_weight : 0.0f;
}
-ccl_device void shader_bsdf_blur(KernelGlobals *kg, ShaderData *sd, float roughness)
-{
- for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_BSDF(sc->type))
- bsdf_blur(kg, sc, roughness);
- }
-}
-
-ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *sd)
+ccl_device float3 shader_bsdf_transparency(const KernelGlobals *kg, const ShaderData *sd)
{
if (sd->flag & SD_HAS_ONLY_VOLUME) {
return one_float3();
@@ -840,7 +368,7 @@ ccl_device float3 shader_bsdf_transparency(KernelGlobals *kg, const ShaderData *
}
}
-ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *sd)
+ccl_device void shader_bsdf_disable_transparency(const KernelGlobals *kg, ShaderData *sd)
{
if (sd->flag & SD_TRANSPARENT) {
for (int i = 0; i < sd->num_closure; i++) {
@@ -856,7 +384,7 @@ ccl_device void shader_bsdf_disable_transparency(KernelGlobals *kg, ShaderData *
}
}
-ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_alpha(const KernelGlobals *kg, const ShaderData *sd)
{
float3 alpha = one_float3() - shader_bsdf_transparency(kg, sd);
@@ -866,12 +394,12 @@ ccl_device float3 shader_bsdf_alpha(KernelGlobals *kg, ShaderData *sd)
return alpha;
}
-ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_diffuse(const KernelGlobals *kg, const ShaderData *sd)
{
float3 eval = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type) ||
CLOSURE_IS_BSDF_BSSRDF(sc->type))
@@ -881,12 +409,12 @@ ccl_device float3 shader_bsdf_diffuse(KernelGlobals *kg, ShaderData *sd)
return eval;
}
-ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_glossy(const KernelGlobals *kg, const ShaderData *sd)
{
float3 eval = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
eval += sc->weight;
@@ -895,12 +423,12 @@ ccl_device float3 shader_bsdf_glossy(KernelGlobals *kg, ShaderData *sd)
return eval;
}
-ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_transmission(const KernelGlobals *kg, const ShaderData *sd)
{
float3 eval = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
eval += sc->weight;
@@ -909,12 +437,12 @@ ccl_device float3 shader_bsdf_transmission(KernelGlobals *kg, ShaderData *sd)
return eval;
}
-ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_bsdf_average_normal(const KernelGlobals *kg, const ShaderData *sd)
{
float3 N = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
N += sc->N * fabsf(average(sc->weight));
}
@@ -922,59 +450,44 @@ ccl_device float3 shader_bsdf_average_normal(KernelGlobals *kg, ShaderData *sd)
return (is_zero(N)) ? sd->N : normalize(N);
}
-ccl_device float3 shader_bsdf_ao(KernelGlobals *kg, ShaderData *sd, float ao_factor, float3 *N_)
+ccl_device float3 shader_bsdf_ao_normal(const KernelGlobals *kg, const ShaderData *sd)
{
- float3 eval = zero_float3();
float3 N = zero_float3();
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type)) {
const DiffuseBsdf *bsdf = (const DiffuseBsdf *)sc;
- eval += sc->weight * ao_factor;
N += bsdf->N * fabsf(average(sc->weight));
}
}
- *N_ = (is_zero(N)) ? sd->N : normalize(N);
- return eval;
+ return (is_zero(N)) ? sd->N : normalize(N);
}
#ifdef __SUBSURFACE__
-ccl_device float3 shader_bssrdf_sum(ShaderData *sd, float3 *N_, float *texture_blur_)
+ccl_device float3 shader_bssrdf_normal(const ShaderData *sd)
{
- float3 eval = zero_float3();
float3 N = zero_float3();
- float texture_blur = 0.0f, weight_sum = 0.0f;
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSSRDF(sc->type)) {
const Bssrdf *bssrdf = (const Bssrdf *)sc;
float avg_weight = fabsf(average(sc->weight));
N += bssrdf->N * avg_weight;
- eval += sc->weight;
- texture_blur += bssrdf->texture_blur * avg_weight;
- weight_sum += avg_weight;
}
}
- if (N_)
- *N_ = (is_zero(N)) ? sd->N : normalize(N);
-
- if (texture_blur_)
- *texture_blur_ = safe_divide(texture_blur, weight_sum);
-
- return eval;
+ return (is_zero(N)) ? sd->N : normalize(N);
}
#endif /* __SUBSURFACE__ */
/* Constant emission optimization */
-ccl_device bool shader_constant_emission_eval(KernelGlobals *kg, int shader, float3 *eval)
+ccl_device bool shader_constant_emission_eval(const KernelGlobals *kg, int shader, float3 *eval)
{
int shader_index = shader & SHADER_MASK;
int shader_flag = kernel_tex_fetch(__shaders, shader_index).flags;
@@ -992,7 +505,7 @@ ccl_device bool shader_constant_emission_eval(KernelGlobals *kg, int shader, flo
/* Background */
-ccl_device float3 shader_background_eval(ShaderData *sd)
+ccl_device float3 shader_background_eval(const ShaderData *sd)
{
if (sd->flag & SD_EMISSION) {
return sd->closure_emission_background;
@@ -1004,7 +517,7 @@ ccl_device float3 shader_background_eval(ShaderData *sd)
/* Emission */
-ccl_device float3 shader_emissive_eval(ShaderData *sd)
+ccl_device float3 shader_emissive_eval(const ShaderData *sd)
{
if (sd->flag & SD_EMISSION) {
return emissive_simple_eval(sd->Ng, sd->I) * sd->closure_emission_background;
@@ -1016,7 +529,7 @@ ccl_device float3 shader_emissive_eval(ShaderData *sd)
/* Holdout */
-ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd)
+ccl_device float3 shader_holdout_apply(const KernelGlobals *kg, ShaderData *sd)
{
float3 weight = zero_float3();
@@ -1041,7 +554,7 @@ ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd)
}
else {
for (int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
+ const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_HOLDOUT(sc->type)) {
weight += sc->weight;
}
@@ -1053,14 +566,12 @@ ccl_device float3 shader_holdout_apply(KernelGlobals *kg, ShaderData *sd)
/* Surface Evaluation */
-ccl_device void shader_eval_surface(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
+template<uint node_feature_mask>
+ccl_device void shader_eval_surface(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *ccl_restrict sd,
+ ccl_global float *ccl_restrict buffer,
int path_flag)
{
- PROFILING_INIT(kg, PROFILING_SHADER_EVAL);
-
/* If the path is being terminated, we are tracing a shadow ray, or we are
* evaluating emission, then we don't need to store closures. The emission and
* shadow shader data also do not have a closure array, to save GPU memory. */
@@ -1069,7 +580,7 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
max_closures = 0;
}
else {
- max_closures = kernel_data.integrator.max_closures;
+ max_closures = kernel_data.max_closures;
}
sd->num_closure = 0;
@@ -1078,17 +589,18 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#ifdef __OSL__
if (kg->osl) {
if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
- OSLShader::eval_background(kg, sd, state, path_flag);
+ OSLShader::eval_background(INTEGRATOR_STATE_PASS, sd, path_flag);
}
else {
- OSLShader::eval_surface(kg, sd, state, path_flag);
+ OSLShader::eval_surface(INTEGRATOR_STATE_PASS, sd, path_flag);
}
}
else
#endif
{
#ifdef __SVM__
- svm_eval_nodes(kg, sd, state, buffer, SHADER_TYPE_SURFACE, path_flag);
+ svm_eval_nodes<node_feature_mask, SHADER_TYPE_SURFACE>(
+ INTEGRATOR_STATE_PASS, sd, buffer, path_flag);
#else
if (sd->object == OBJECT_NONE) {
sd->closure_emission_background = make_float3(0.8f, 0.8f, 0.8f);
@@ -1105,8 +617,11 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#endif
}
- if (sd->flag & SD_BSDF_NEEDS_LCG) {
- sd->lcg_state = lcg_state_init_addrspace(state, 0xb4bc3953);
+ if (KERNEL_NODES_FEATURE(BSDF) && (sd->flag & SD_BSDF_NEEDS_LCG)) {
+ sd->lcg_state = lcg_state_init(INTEGRATOR_STATE(path, rng_hash),
+ INTEGRATOR_STATE(path, rng_offset),
+ INTEGRATOR_STATE(path, sample),
+ 0xb4bc3953);
}
}
@@ -1114,48 +629,47 @@ ccl_device void shader_eval_surface(KernelGlobals *kg,
#ifdef __VOLUME__
-ccl_device_inline void _shader_volume_phase_multi_eval(const ShaderData *sd,
- const float3 omega_in,
- float *pdf,
- int skip_phase,
- BsdfEval *result_eval,
- float sum_pdf,
- float sum_sample_weight)
+ccl_device_inline float _shader_volume_phase_multi_eval(const ShaderData *sd,
+ const ShaderVolumePhases *phases,
+ const float3 omega_in,
+ int skip_phase,
+ BsdfEval *result_eval,
+ float sum_pdf,
+ float sum_sample_weight)
{
- for (int i = 0; i < sd->num_closure; i++) {
+ for (int i = 0; i < phases->num_closure; i++) {
if (i == skip_phase)
continue;
- const ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_PHASE(sc->type)) {
- float phase_pdf = 0.0f;
- float3 eval = volume_phase_eval(sd, sc, omega_in, &phase_pdf);
+ const ShaderVolumeClosure *svc = &phases->closure[i];
+ float phase_pdf = 0.0f;
+ float3 eval = volume_phase_eval(sd, svc, omega_in, &phase_pdf);
- if (phase_pdf != 0.0f) {
- bsdf_eval_accum(result_eval, sc->type, eval, 1.0f);
- sum_pdf += phase_pdf * sc->sample_weight;
- }
-
- sum_sample_weight += sc->sample_weight;
+ if (phase_pdf != 0.0f) {
+ bsdf_eval_accum(result_eval, false, eval, 1.0f);
+ sum_pdf += phase_pdf * svc->sample_weight;
}
+
+ sum_sample_weight += svc->sample_weight;
}
- *pdf = (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
+ return (sum_sample_weight > 0.0f) ? sum_pdf / sum_sample_weight : 0.0f;
}
-ccl_device void shader_volume_phase_eval(
- KernelGlobals *kg, const ShaderData *sd, const float3 omega_in, BsdfEval *eval, float *pdf)
+ccl_device float shader_volume_phase_eval(const KernelGlobals *kg,
+ const ShaderData *sd,
+ const ShaderVolumePhases *phases,
+ const float3 omega_in,
+ BsdfEval *phase_eval)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_EVAL);
+ bsdf_eval_init(phase_eval, false, zero_float3());
- bsdf_eval_init(eval, NBUILTIN_CLOSURES, zero_float3(), kernel_data.film.use_light_pass);
-
- _shader_volume_phase_multi_eval(sd, omega_in, pdf, -1, eval, 0.0f, 0.0f);
+ return _shader_volume_phase_multi_eval(sd, phases, omega_in, -1, phase_eval, 0.0f, 0.0f);
}
-ccl_device int shader_volume_phase_sample(KernelGlobals *kg,
+ccl_device int shader_volume_phase_sample(const KernelGlobals *kg,
const ShaderData *sd,
+ const ShaderVolumePhases *phases,
float randu,
float randv,
BsdfEval *phase_eval,
@@ -1163,41 +677,34 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE);
-
int sampled = 0;
- if (sd->num_closure > 1) {
+ if (phases->num_closure > 1) {
/* pick a phase closure based on sample weights */
float sum = 0.0f;
- for (sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
-
- if (CLOSURE_IS_PHASE(sc->type))
- sum += sc->sample_weight;
+ for (sampled = 0; sampled < phases->num_closure; sampled++) {
+ const ShaderVolumeClosure *svc = &phases->closure[sampled];
+ sum += svc->sample_weight;
}
float r = randu * sum;
float partial_sum = 0.0f;
- for (sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
+ for (sampled = 0; sampled < phases->num_closure; sampled++) {
+ const ShaderVolumeClosure *svc = &phases->closure[sampled];
+ float next_sum = partial_sum + svc->sample_weight;
- if (CLOSURE_IS_PHASE(sc->type)) {
- float next_sum = partial_sum + sc->sample_weight;
-
- if (r <= next_sum) {
- /* Rescale to reuse for BSDF direction sample. */
- randu = (r - partial_sum) / sc->sample_weight;
- break;
- }
-
- partial_sum = next_sum;
+ if (r <= next_sum) {
+ /* Rescale to reuse for BSDF direction sample. */
+ randu = (r - partial_sum) / svc->sample_weight;
+ break;
}
+
+ partial_sum = next_sum;
}
- if (sampled == sd->num_closure) {
+ if (sampled == phases->num_closure) {
*pdf = 0.0f;
return LABEL_NONE;
}
@@ -1205,23 +712,23 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg,
/* TODO: this isn't quite correct, we don't weight anisotropy properly
* depending on color channels, even though this is perhaps not a common case. */
- const ShaderClosure *sc = &sd->closure[sampled];
+ const ShaderVolumeClosure *svc = &phases->closure[sampled];
int label;
float3 eval = zero_float3();
*pdf = 0.0f;
- label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
+ label = volume_phase_sample(sd, svc, randu, randv, &eval, omega_in, domega_in, pdf);
if (*pdf != 0.0f) {
- bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass);
+ bsdf_eval_init(phase_eval, false, eval);
}
return label;
}
-ccl_device int shader_phase_sample_closure(KernelGlobals *kg,
+ccl_device int shader_phase_sample_closure(const KernelGlobals *kg,
const ShaderData *sd,
- const ShaderClosure *sc,
+ const ShaderVolumeClosure *sc,
float randu,
float randv,
BsdfEval *phase_eval,
@@ -1229,8 +736,6 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg,
differential3 *domega_in,
float *pdf)
{
- PROFILING_INIT(kg, PROFILING_CLOSURE_VOLUME_SAMPLE);
-
int label;
float3 eval = zero_float3();
@@ -1238,18 +743,18 @@ ccl_device int shader_phase_sample_closure(KernelGlobals *kg,
label = volume_phase_sample(sd, sc, randu, randv, &eval, omega_in, domega_in, pdf);
if (*pdf != 0.0f)
- bsdf_eval_init(phase_eval, sc->type, eval, kernel_data.film.use_light_pass);
+ bsdf_eval_init(phase_eval, false, eval);
return label;
}
/* Volume Evaluation */
-ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_addr_space VolumeStack *stack,
- int path_flag)
+template<typename StackReadOp>
+ccl_device_inline void shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *ccl_restrict sd,
+ const int path_flag,
+ StackReadOp stack_read)
{
/* If the path is being terminated, we are tracing a shadow ray, or we are
* evaluating emission, then we don't need to store closures. The emission and shadow
@@ -1259,7 +764,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
max_closures = 0;
}
else {
- max_closures = kernel_data.integrator.max_closures;
+ max_closures = kernel_data.max_closures;
}
/* reset closures once at the start, we will be accumulating the closures
@@ -1268,14 +773,18 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
sd->num_closure_left = max_closures;
sd->flag = 0;
sd->object_flag = 0;
- sd->type = PRIMITIVE_VOLUME;
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
+ for (int i = 0;; i++) {
+ const VolumeStack entry = stack_read(i);
+ if (entry.shader == SHADER_NONE) {
+ break;
+ }
+
/* Setup ShaderData from the stack. It's mostly set up already in
* shader_setup_from_volume, so this switching should be quick. */
- sd->object = stack[i].object;
+ sd->object = entry.object;
sd->lamp = LAMP_NONE;
- sd->shader = stack[i].shader;
+ sd->shader = entry.shader;
sd->flag &= ~SD_SHADER_FLAGS;
sd->flag |= kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).flags;
@@ -1295,18 +804,19 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
# ifdef __SVM__
# ifdef __OSL__
if (kg->osl) {
- OSLShader::eval_volume(kg, sd, state, path_flag);
+ OSLShader::eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag);
}
else
# endif
{
- svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_VOLUME, path_flag);
+ svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_VOLUME, SHADER_TYPE_VOLUME>(
+ INTEGRATOR_STATE_PASS, sd, NULL, path_flag);
}
# endif
- /* merge closures to avoid exceeding number of closures limit */
+ /* Merge closures to avoid exceeding number of closures limit. */
if (i > 0)
- shader_merge_closures(sd);
+ shader_merge_volume_closures(sd);
}
}
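The loop above walks a sentinel-terminated stack through the stack_read callback, which abstracts where the volume stack is stored. A self-contained illustration of that pattern with generic stand-in types (not the Cycles ones):

    #include <cstdio>

    struct StackEntry {
      int object;
      int shader; /* -1 plays the role of SHADER_NONE */
    };

    template<typename StackReadOp> void for_each_entry(StackReadOp stack_read)
    {
      for (int i = 0;; i++) {
        const StackEntry entry = stack_read(i);
        if (entry.shader == -1) {
          break; /* sentinel terminates the stack */
        }
        std::printf("object %d, shader %d\n", entry.object, entry.shader);
      }
    }

    int main()
    {
      const StackEntry stack[] = {{0, 3}, {1, 7}, {-1, -1}};
      /* Any callable mapping an index to an entry works, e.g. a lambda. */
      for_each_entry([&](const int i) { return stack[i]; });
      return 0;
    }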
@@ -1314,9 +824,7 @@ ccl_device_inline void shader_eval_volume(KernelGlobals *kg,
/* Displacement Evaluation */
-ccl_device void shader_eval_displacement(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state)
+ccl_device void shader_eval_displacement(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd)
{
sd->num_closure = 0;
sd->num_closure_left = 0;
@@ -1325,11 +833,12 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg,
#ifdef __SVM__
# ifdef __OSL__
if (kg->osl)
- OSLShader::eval_displacement(kg, sd, state);
+ OSLShader::eval_displacement(INTEGRATOR_STATE_PASS, sd);
else
# endif
{
- svm_eval_nodes(kg, sd, state, NULL, SHADER_TYPE_DISPLACEMENT, 0);
+ svm_eval_nodes<KERNEL_FEATURE_NODE_MASK_DISPLACEMENT, SHADER_TYPE_DISPLACEMENT>(
+ INTEGRATOR_STATE_PASS, sd, NULL, 0);
}
#endif
}
@@ -1337,29 +846,13 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg,
/* Transparent Shadows */
#ifdef __TRANSPARENT_SHADOWS__
-ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect)
+ccl_device bool shader_transparent_shadow(const KernelGlobals *kg, Intersection *isect)
{
- int prim = kernel_tex_fetch(__prim_index, isect->prim);
- int shader = 0;
-
-# ifdef __HAIR__
- if (isect->type & PRIMITIVE_ALL_TRIANGLE) {
-# endif
- shader = kernel_tex_fetch(__tri_shader, prim);
-# ifdef __HAIR__
- }
- else {
- float4 str = kernel_tex_fetch(__curves, prim);
- shader = __float_as_int(str.z);
- }
-# endif
- int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
-
- return (flag & SD_HAS_TRANSPARENT_SHADOW) != 0;
+ return (intersection_get_shader_flags(kg, isect) & SD_HAS_TRANSPARENT_SHADOW) != 0;
}
#endif /* __TRANSPARENT_SHADOWS__ */
-ccl_device float shader_cryptomatte_id(KernelGlobals *kg, int shader)
+ccl_device float shader_cryptomatte_id(const KernelGlobals *kg, int shader)
{
return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).cryptomatte_id;
}
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
deleted file mode 100644
index 3b124122fba..00000000000
--- a/intern/cycles/kernel/kernel_shadow.h
+++ /dev/null
@@ -1,466 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __VOLUME__
-/* Get PathState ready for use for volume stack evaluation. */
-# ifdef __SPLIT_KERNEL__
-ccl_addr_space
-# endif
- ccl_device_inline PathState *
- shadow_blocked_volume_path_state(KernelGlobals *kg,
- VolumeState *volume_state,
- ccl_addr_space PathState *state,
- ShaderData *sd,
- Ray *ray)
-{
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space PathState *ps =
- &kernel_split_state.state_shadow[ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)];
-# else
- PathState *ps = &volume_state->ps;
-# endif
- *ps = *state;
- /* We are checking for shadow on the "other" side of the surface, so we need
- * to discard the volume we are currently in.
- */
- if (dot(sd->Ng, ray->D) < 0.0f) {
- kernel_volume_stack_enter_exit(kg, sd, ps->volume_stack);
- }
- return ps;
-}
-#endif /* __VOLUME__ */
-
-/* Attenuate throughput according to the given intersection event.
- * Returns true if the throughput is zero and traversal can be aborted.
- */
-ccl_device_forceinline bool shadow_handle_transparent_isect(KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
-#ifdef __VOLUME__
- ccl_addr_space PathState *volume_state,
-#endif
- Intersection *isect,
- Ray *ray,
- float3 *throughput)
-{
-#ifdef __VOLUME__
- /* Attenuation between last surface and next surface. */
- if (volume_state->volume_stack[0].shader != SHADER_NONE) {
- Ray segment_ray = *ray;
- segment_ray.t = isect->t;
- kernel_volume_shadow(kg, shadow_sd, volume_state, &segment_ray, throughput);
- }
-#endif
- /* Setup shader data at surface. */
- shader_setup_from_ray(kg, shadow_sd, isect, ray);
- /* Attenuation from transparent surface. */
- if (!(shadow_sd->flag & SD_HAS_ONLY_VOLUME)) {
- path_state_modify_bounce(state, true);
- shader_eval_surface(kg, shadow_sd, state, NULL, PATH_RAY_SHADOW);
- path_state_modify_bounce(state, false);
- *throughput *= shader_bsdf_transparency(kg, shadow_sd);
- }
- /* Stop if all light is blocked. */
- if (is_zero(*throughput)) {
- return true;
- }
-#ifdef __VOLUME__
- /* Exit/enter volume. */
- kernel_volume_stack_enter_exit(kg, shadow_sd, volume_state->volume_stack);
-#endif
- return false;
-}
-
-/* Special version which only handles opaque shadows. */
-ccl_device bool shadow_blocked_opaque(KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *isect,
- float3 *shadow)
-{
- const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect);
-#ifdef __VOLUME__
- if (!blocked && state->volume_stack[0].shader != SHADER_NONE) {
- /* Apply attenuation from current volume shader. */
- kernel_volume_shadow(kg, shadow_sd, state, ray, shadow);
- }
-#endif
- return blocked;
-}
-
-#ifdef __TRANSPARENT_SHADOWS__
-# ifdef __SHADOW_RECORD_ALL__
-/* Shadow function to compute how much light is blocked,
- *
- * We trace a single ray. If it hits any opaque surface, or more than a given
- * number of transparent surfaces is hit, then we consider the geometry to be
- * entirely blocked. If not, all transparent surfaces will be recorded and we
- * will shade them one by one to determine how much light is blocked. This all
- * happens in one scene intersection function.
- *
- * Recording all hits works well in some cases but may be slower in others. If
- * we have many semi-transparent hairs, one intersection may be faster because
- * you'd be reintersecting the same hairs a lot with each step otherwise. If
- * however there is mostly binary transparency then we may be recording many
- * unnecessary intersections when one of the first surfaces blocks all light.
- *
- * From tests in real scenes it seems the performance loss is either minimal,
- * or there is a performance increase anyway due to avoiding the need to send
- * two rays with transparent shadows.
- *
- * On CPU it'll handle all transparent bounces (by allocating storage for
- * intersections when they don't fit into the stack storage).
- *
- * On GPU it'll only handle SHADOW_STACK_MAX_HITS-1 intersections, so this
- * is something to be kept an eye on.
- */
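The strategy the comment describes boils down to: record every transparent hit in one traversal, sort the hits front to back, and multiply each surface's transparency into a running throughput until it reaches zero. A simplified standalone sketch (a scalar transparency stands in for shader_bsdf_transparency(), volumes omitted):

    #include <algorithm>
    #include <vector>

    struct TransparentHit {
      float t;            /* distance along the shadow ray */
      float transparency; /* stand-in for the shader's transparency */
    };

    /* Returns true when the accumulated throughput reaches zero. */
    bool blocked_record_all(std::vector<TransparentHit> hits, float *throughput_out)
    {
      std::sort(hits.begin(), hits.end(), [](const TransparentHit &a, const TransparentHit &b) {
        return a.t < b.t;
      });
      float throughput = 1.0f;
      for (const TransparentHit &hit : hits) {
        throughput *= hit.transparency;
        if (throughput == 0.0f) {
          break;
        }
      }
      *throughput_out = throughput;
      return throughput == 0.0f;
    }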
-
-# define SHADOW_STACK_MAX_HITS 64
-
-/* Actual traversal loop implementation, free from device-specific tweaks.
- *
- * Note that the hits array should be as big as max_hits + 1.
- */
-ccl_device bool shadow_blocked_transparent_all_loop(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *hits,
- uint max_hits,
- float3 *shadow)
-{
- /* Intersect to find an opaque surface, or record all transparent
- * surface hits.
- */
- uint num_hits;
- const bool blocked = scene_intersect_shadow_all(kg, ray, hits, visibility, max_hits, &num_hits);
-# ifdef __VOLUME__
-# ifdef __KERNEL_OPTIX__
- VolumeState &volume_state = kg->volume_state;
-# else
- VolumeState volume_state;
-# endif
-# endif
- /* If no opaque surface found but we did find transparent hits,
- * shade them.
- */
- if (!blocked && num_hits > 0) {
- float3 throughput = one_float3();
- float3 Pend = ray->P + ray->D * ray->t;
- float last_t = 0.0f;
- int bounce = state->transparent_bounce;
- Intersection *isect = hits;
-# ifdef __VOLUME__
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
-# endif
- sort_intersections(hits, num_hits);
- for (int hit = 0; hit < num_hits; hit++, isect++) {
- /* Adjust intersection distance for moving ray forward. */
- float new_t = isect->t;
- isect->t -= last_t;
- /* Skip the hit if we did not move forward; step-by-step ray tracing
- * would have skipped it as well.
- */
- if (last_t == new_t) {
- continue;
- }
- last_t = new_t;
- /* Attenuate the throughput. */
- if (shadow_handle_transparent_isect(kg,
- shadow_sd,
- state,
-# ifdef __VOLUME__
- ps,
-# endif
- isect,
- ray,
- &throughput)) {
- return true;
- }
- /* Move ray forward. */
- ray->P = shadow_sd->P;
- if (ray->t != FLT_MAX) {
- ray->D = normalize_len(Pend - ray->P, &ray->t);
- }
- bounce++;
- }
-# ifdef __VOLUME__
- /* Attenuation for last line segment towards light. */
- if (ps->volume_stack[0].shader != SHADER_NONE) {
- kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
- }
-# endif
- *shadow = throughput;
- return is_zero(throughput);
- }
-# ifdef __VOLUME__
- if (!blocked && state->volume_stack[0].shader != SHADER_NONE) {
- /* Apply attenuation from current volume shader. */
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
- kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
- }
-# endif
- return blocked;
-}
-
-/* Here we do all device-specific trickery before invoking the actual traversal
- * loop, to help readability of the actual logic.
- */
-ccl_device bool shadow_blocked_transparent_all(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- uint max_hits,
- float3 *shadow)
-{
-# ifdef __SPLIT_KERNEL__
- Intersection hits_[SHADOW_STACK_MAX_HITS];
- Intersection *hits = &hits_[0];
-# elif defined(__KERNEL_CUDA__)
- Intersection *hits = kg->hits_stack;
-# else
- Intersection hits_stack[SHADOW_STACK_MAX_HITS];
- Intersection *hits = hits_stack;
-# endif
-# ifndef __KERNEL_GPU__
- /* Prefer to use the stack, but fall back to dynamic allocation if max hits
- * is too deep. We need max_hits + 1 storage space due to the logic in
- * scene_intersect_shadow_all which will first store and then check if
- * the limit is exceeded.
- *
- * Ignore this on GPU because of slow/unavailable malloc().
- */
- if (max_hits + 1 > SHADOW_STACK_MAX_HITS) {
- if (kg->transparent_shadow_intersections == NULL) {
- const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
- kg->transparent_shadow_intersections = (Intersection *)malloc(sizeof(Intersection) *
- (transparent_max_bounce + 1));
- }
- hits = kg->transparent_shadow_intersections;
- }
-# endif /* __KERNEL_GPU__ */
- /* Invoke actual traversal. */
- return shadow_blocked_transparent_all_loop(
- kg, sd, shadow_sd, state, visibility, ray, hits, max_hits, shadow);
-}
-# endif /* __SHADOW_RECORD_ALL__ */
-
-# if defined(__KERNEL_GPU__) || !defined(__SHADOW_RECORD_ALL__)
-/* Shadow function to compute how much light is blocked,
- *
- * Here we raytrace from one transparent surface to the next step by step.
- * To minimize overhead in cases where we don't need transparent shadows, we
- * first trace a regular shadow ray. We check if the hit primitive was
- * potentially transparent, and only in that case start marching. This gives
- * one extra ray cast for the cases where we do want transparency.
- */
-
-/* This function only implements the device-independent traversal logic,
- * which requires some precalculation.
- */
-ccl_device bool shadow_blocked_transparent_stepped_loop(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *isect,
- const bool blocked,
- const bool is_transparent_isect,
- float3 *shadow)
-{
-# ifdef __VOLUME__
-# ifdef __KERNEL_OPTIX__
- VolumeState &volume_state = kg->volume_state;
-# else
- VolumeState volume_state;
-# endif
-# endif
- if (blocked && is_transparent_isect) {
- float3 throughput = one_float3();
- float3 Pend = ray->P + ray->D * ray->t;
- int bounce = state->transparent_bounce;
-# ifdef __VOLUME__
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
-# endif
- for (;;) {
- if (bounce >= kernel_data.integrator.transparent_max_bounce) {
- return true;
- }
- if (!scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_TRANSPARENT, isect)) {
- break;
- }
- if (!shader_transparent_shadow(kg, isect)) {
- return true;
- }
- /* Attenuate the throughput. */
- if (shadow_handle_transparent_isect(kg,
- shadow_sd,
- state,
-# ifdef __VOLUME__
- ps,
-# endif
- isect,
- ray,
- &throughput)) {
- return true;
- }
- /* Move ray forward. */
- ray->P = ray_offset(shadow_sd->P, -shadow_sd->Ng);
- if (ray->t != FLT_MAX) {
- ray->D = normalize_len(Pend - ray->P, &ray->t);
- }
- bounce++;
- }
-# ifdef __VOLUME__
- /* Attenuation for last line segment towards light. */
- if (ps->volume_stack[0].shader != SHADER_NONE) {
- kernel_volume_shadow(kg, shadow_sd, ps, ray, &throughput);
- }
-# endif
- *shadow *= throughput;
- return is_zero(throughput);
- }
-# ifdef __VOLUME__
- if (!blocked && state->volume_stack[0].shader != SHADER_NONE) {
- /* Apply attenuation from current volume shader. */
-# ifdef __SPLIT_KERNEL__
- ccl_addr_space
-# endif
- PathState *ps = shadow_blocked_volume_path_state(kg, &volume_state, state, sd, ray);
- kernel_volume_shadow(kg, shadow_sd, ps, ray, shadow);
- }
-# endif
- return blocked;
-}
-
-ccl_device bool shadow_blocked_transparent_stepped(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- const uint visibility,
- Ray *ray,
- Intersection *isect,
- float3 *shadow)
-{
- bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, isect);
- bool is_transparent_isect = blocked ? shader_transparent_shadow(kg, isect) : false;
- return shadow_blocked_transparent_stepped_loop(
- kg, sd, shadow_sd, state, visibility, ray, isect, blocked, is_transparent_isect, shadow);
-}
-
-# endif /* __KERNEL_GPU__ || !__SHADOW_RECORD_ALL__ */
-#endif /* __TRANSPARENT_SHADOWS__ */
-
-ccl_device_inline bool shadow_blocked(KernelGlobals *kg,
- ShaderData *sd,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 *shadow)
-{
- *shadow = one_float3();
-#if !defined(__KERNEL_OPTIX__)
- /* Some common early checks.
- * Avoid conditional trace call in OptiX though, since those hurt performance there.
- */
- if (ray->t == 0.0f) {
- return false;
- }
-#endif
-#ifdef __SHADOW_TRICKS__
- const uint visibility = (state->flag & PATH_RAY_SHADOW_CATCHER) ? PATH_RAY_SHADOW_NON_CATCHER :
- PATH_RAY_SHADOW;
-#else
- const uint visibility = PATH_RAY_SHADOW;
-#endif
- /* Do actual shadow shading.
- * First of all, we check if the integrator requires transparent shadows.
- * If not, we use the simplest and fastest way to calculate occlusion.
- * Do not do this in OptiX to avoid the additional trace call.
- */
-#if !defined(__KERNEL_OPTIX__) || !defined(__TRANSPARENT_SHADOWS__)
- Intersection isect;
-# ifdef __TRANSPARENT_SHADOWS__
- if (!kernel_data.integrator.transparent_shadows)
-# endif
- {
- return shadow_blocked_opaque(kg, shadow_sd, state, visibility, ray, &isect, shadow);
- }
-#endif
-#ifdef __TRANSPARENT_SHADOWS__
-# ifdef __SHADOW_RECORD_ALL__
- /* For the transparent shadows we try to use record-all logic on the
- * devices which support this.
- */
- const int transparent_max_bounce = kernel_data.integrator.transparent_max_bounce;
- /* Check transparent bounces here, for volume scatter which can do
- * lighting before surface path termination is checked.
- */
- if (state->transparent_bounce >= transparent_max_bounce) {
- return true;
- }
- uint max_hits = transparent_max_bounce - state->transparent_bounce - 1;
-# if defined(__KERNEL_OPTIX__)
- /* Always use record-all behavior in OptiX, but ensure there are no out of bounds
- * accesses to the hit stack.
- */
- max_hits = min(max_hits, SHADOW_STACK_MAX_HITS - 1);
-# elif defined(__KERNEL_GPU__)
- /* On GPU we use the trick of tracing an opaque ray first; this avoids speed
- * regressions in some files.
- *
- * TODO(sergey): Check why using record-all behavior causes slowdown in such
- * cases. Could that be caused by a higher spill pressure?
- */
- const bool blocked = scene_intersect(kg, ray, visibility & PATH_RAY_SHADOW_OPAQUE, &isect);
- const bool is_transparent_isect = blocked ? shader_transparent_shadow(kg, &isect) : false;
- if (!blocked || !is_transparent_isect || max_hits + 1 >= SHADOW_STACK_MAX_HITS) {
- return shadow_blocked_transparent_stepped_loop(
- kg, sd, shadow_sd, state, visibility, ray, &isect, blocked, is_transparent_isect, shadow);
- }
-# endif /* __KERNEL_GPU__ */
- return shadow_blocked_transparent_all(
- kg, sd, shadow_sd, state, visibility, ray, max_hits, shadow);
-# else /* __SHADOW_RECORD_ALL__ */
- /* Fallback to a slowest version which works on all devices. */
- return shadow_blocked_transparent_stepped(
- kg, sd, shadow_sd, state, visibility, ray, &isect, shadow);
-# endif /* __SHADOW_RECORD_ALL__ */
-#endif /* __TRANSPARENT_SHADOWS__ */
-}
-
-#undef SHADOW_STACK_MAX_HITS
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_shadow_catcher.h b/intern/cycles/kernel/kernel_shadow_catcher.h
new file mode 100644
index 00000000000..824749818a4
--- /dev/null
+++ b/intern/cycles/kernel/kernel_shadow_catcher.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "kernel/integrator/integrator_state_util.h"
+#include "kernel/kernel_path_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Check whether current surface bounce is where path is to be split for the shadow catcher. */
+ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_ARGS,
+ const int object_flag)
+{
+#ifdef __SHADOW_CATCHER__
+ if (!kernel_data.integrator.has_shadow_catcher) {
+ return false;
+ }
+
+ /* Check the flag first, avoiding fetches from global memory. */
+ if ((object_flag & SD_OBJECT_SHADOW_CATCHER) == 0) {
+ return false;
+ }
+ if (object_flag & SD_OBJECT_HOLDOUT_MASK) {
+ return false;
+ }
+
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+
+ if ((path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) == 0) {
+ /* Split only on primary rays; on secondary bounces the shadow catcher is treated as a
+ * regular object. */
+ return false;
+ }
+
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
+ return false;
+ }
+
+ return true;
+#else
+ (void)object_flag;
+ return false;
+#endif
+}
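Roughly, the predicate above allows a split only on the first visible hit of a catcher: PATH_RAY_TRANSPARENT_BACKGROUND stays set while the path has only crossed transparent surfaces, so a camera ray hitting a shadow-catcher plane splits, while the same plane seen after a diffuse or glossy bounce is shaded as a regular object. This reading follows from the flag checks in the code; the flag semantics themselves are defined elsewhere.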
+
+/* Check whether the current path can still split. */
+ccl_device_inline bool kernel_shadow_catcher_path_can_split(INTEGRATOR_STATE_CONST_ARGS)
+{
+ if (INTEGRATOR_PATH_IS_TERMINATED && INTEGRATOR_SHADOW_PATH_IS_TERMINATED) {
+ return false;
+ }
+
+ const int path_flag = INTEGRATOR_STATE(path, flag);
+
+ if (path_flag & PATH_RAY_SHADOW_CATCHER_HIT) {
+ /* Shadow catcher was already hit and the state was split. No further split is allowed. */
+ return false;
+ }
+
+ return (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND) != 0;
+}
+
+/* NOTE: Leaves kernel scheduling information untouched. Use INIT semantic for one of the paths
+ * after this function. */
+ccl_device_inline bool kernel_shadow_catcher_split(INTEGRATOR_STATE_ARGS, const int object_flags)
+{
+#ifdef __SHADOW_CATCHER__
+
+ if (!kernel_shadow_catcher_is_path_split_bounce(INTEGRATOR_STATE_PASS, object_flags)) {
+ return false;
+ }
+
+ /* The split is to be done. Mark the current state as such, so that it stops contributing to the
+ * shadow catcher matte pass, but keeps contributing to the combined pass. */
+ INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_SHADOW_CATCHER_HIT;
+
+ /* Split new state from the current one. This new state will only track contribution of shadow
+ * catcher objects, ignoring non-catcher objects. */
+ integrator_state_shadow_catcher_split(INTEGRATOR_STATE_PASS);
+
+ return true;
+#else
+ (void)object_flags;
+ return false;
+#endif
+}
+
+#ifdef __SHADOW_CATCHER__
+
+ccl_device_forceinline bool kernel_shadow_catcher_is_matte_path(INTEGRATOR_STATE_CONST_ARGS)
+{
+ return (INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_HIT) == 0;
+}
+
+ccl_device_forceinline bool kernel_shadow_catcher_is_object_pass(INTEGRATOR_STATE_CONST_ARGS)
+{
+ return INTEGRATOR_STATE(path, flag) & PATH_RAY_SHADOW_CATCHER_PASS;
+}
+
+#endif /* __SHADOW_CATCHER__ */
+
+CCL_NAMESPACE_END
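
To make the intended call pattern of this new header concrete, here is a hedged sketch of a caller; the function name and surrounding logic are invented for illustration, and only the kernel_shadow_catcher_* helpers plus the INIT note come from the header itself:

/* Hypothetical call site -- not actual integrator code. */
ccl_device void example_handle_surface_hit(INTEGRATOR_STATE_ARGS, const int object_flag)
{
  if (kernel_shadow_catcher_split(INTEGRATOR_STATE_PASS, object_flag)) {
    /* The current state is now flagged PATH_RAY_SHADOW_CATCHER_HIT and a copy was made by
     * integrator_state_shadow_catcher_split(). Per the NOTE above, kernel scheduling is left
     * untouched, so the caller must still apply the INIT semantic to one of the two paths. */
  }
}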
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
deleted file mode 100644
index 677504a4045..00000000000
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ /dev/null
@@ -1,724 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* BSSRDF using disk based importance sampling.
- *
- * BSSRDF Importance Sampling, SIGGRAPH 2013
- * http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf
- */
-
-ccl_device_inline float3
-subsurface_scatter_eval(ShaderData *sd, const ShaderClosure *sc, float disk_r, float r, bool all)
-{
- /* This is the Veach one-sample model with balance heuristic; some pdf
- * factors drop out when using balance heuristic weighting. For branched
- * path tracing (all) we sample all closures and don't use MIS. */
- float3 eval_sum = zero_float3();
- float pdf_sum = 0.0f;
- float sample_weight_inv = 0.0f;
-
- if (!all) {
- float sample_weight_sum = 0.0f;
-
- for (int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if (CLOSURE_IS_DISK_BSSRDF(sc->type)) {
- sample_weight_sum += sc->sample_weight;
- }
- }
-
- sample_weight_inv = 1.0f / sample_weight_sum;
- }
-
- for (int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if (CLOSURE_IS_DISK_BSSRDF(sc->type)) {
- /* In case of branched path integration we sample all BSSRDFs once;
- * for path tracing we pick one, so adjust the pdf for that. */
- float sample_weight = (all) ? 1.0f : sc->sample_weight * sample_weight_inv;
-
- /* compute pdf */
- float3 eval = bssrdf_eval(sc, r);
- float pdf = bssrdf_pdf(sc, disk_r);
-
- eval_sum += sc->weight * eval;
- pdf_sum += sample_weight * pdf;
- }
- }
-
- return (pdf_sum > 0.0f) ? eval_sum / pdf_sum : zero_float3();
-}
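
For reference, the estimator computed by subsurface_scatter_eval above can be written as (a restatement of the code, with $f_i$ = bssrdf_eval, $p_i$ = bssrdf_pdf, $\mathbf{w}_i$ the closure weight and $s_i$ the sample weight):

$$ \langle f \rangle = \frac{\sum_i \mathbf{w}_i \, f_i(r)}{\sum_i \tilde{w}_i \, p_i(r_{\mathrm{disk}})}, \qquad \tilde{w}_i = \begin{cases} 1 & \text{branched path (all)} \\ s_i / \sum_j s_j & \text{path tracing} \end{cases} $$

i.e. exactly eval_sum / pdf_sum, returning zero when pdf_sum vanishes.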
-
-ccl_device_inline float3 subsurface_scatter_walk_eval(ShaderData *sd,
- const ShaderClosure *sc,
- float3 throughput,
- bool all)
-{
- /* This is the Veach one-sample model with balance heuristic; some pdf
- * factors drop out when using balance heuristic weighting. For branched
- * path tracing (all) we sample all closures and don't use MIS. */
- if (!all) {
- float bssrdf_weight = 0.0f;
- float weight = sc->sample_weight;
-
- for (int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if (CLOSURE_IS_BSSRDF(sc->type)) {
- bssrdf_weight += sc->sample_weight;
- }
- }
- throughput *= bssrdf_weight / weight;
- }
- return throughput;
-}
-
-/* replace closures with a single diffuse bsdf closure after scatter step */
-ccl_device void subsurface_scatter_setup_diffuse_bsdf(
- KernelGlobals *kg, ShaderData *sd, ClosureType type, float roughness, float3 weight, float3 N)
-{
- sd->flag &= ~SD_CLOSURE_FLAGS;
- sd->num_closure = 0;
- sd->num_closure_left = kernel_data.integrator.max_closures;
-
-#ifdef __PRINCIPLED__
- if (type == CLOSURE_BSSRDF_PRINCIPLED_ID || type == CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID) {
- PrincipledDiffuseBsdf *bsdf = (PrincipledDiffuseBsdf *)bsdf_alloc(
- sd, sizeof(PrincipledDiffuseBsdf), weight);
-
- if (bsdf) {
- bsdf->N = N;
- bsdf->roughness = roughness;
- sd->flag |= bsdf_principled_diffuse_setup(bsdf);
-
- /* replace CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID with this special ID so render passes
- * can recognize it as not being a regular Disney principled diffuse closure */
- bsdf->type = CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID;
- }
- }
- else if (CLOSURE_IS_BSDF_BSSRDF(type) || CLOSURE_IS_BSSRDF(type))
-#endif /* __PRINCIPLED__ */
- {
- DiffuseBsdf *bsdf = (DiffuseBsdf *)bsdf_alloc(sd, sizeof(DiffuseBsdf), weight);
-
- if (bsdf) {
- bsdf->N = N;
- sd->flag |= bsdf_diffuse_setup(bsdf);
-
- /* replace CLOSURE_BSDF_DIFFUSE_ID with this special ID so render passes
- * can recognize it as not being a regular diffuse closure */
- bsdf->type = CLOSURE_BSDF_BSSRDF_ID;
- }
- }
-}
-
-/* optionally do blurring of color and/or bump mapping, at the cost of a shader evaluation */
-ccl_device float3 subsurface_color_pow(float3 color, float exponent)
-{
- color = max(color, zero_float3());
-
- if (exponent == 1.0f) {
- /* nothing to do */
- }
- else if (exponent == 0.5f) {
- color.x = sqrtf(color.x);
- color.y = sqrtf(color.y);
- color.z = sqrtf(color.z);
- }
- else {
- color.x = powf(color.x, exponent);
- color.y = powf(color.y, exponent);
- color.z = powf(color.z, exponent);
- }
-
- return color;
-}
-
-ccl_device void subsurface_color_bump_blur(
- KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float3 *eval, float3 *N)
-{
- /* average color and texture blur at outgoing point */
- float texture_blur;
- float3 out_color = shader_bssrdf_sum(sd, NULL, &texture_blur);
-
- /* do we have bump mapping? */
- bool bump = (sd->flag & SD_HAS_BSSRDF_BUMP) != 0;
-
- if (bump || texture_blur > 0.0f) {
- /* average color and normal at incoming point */
- shader_eval_surface(kg, sd, state, NULL, state->flag);
- float3 in_color = shader_bssrdf_sum(sd, (bump) ? N : NULL, NULL);
-
- /* We simply divide out the average color and multiply with the average
- * of the other one. We could try to do this per closure, but it's quite
- * tricky to match closures between shader evaluations; their number and
- * order may change, so this is simpler. */
- if (texture_blur > 0.0f) {
- out_color = subsurface_color_pow(out_color, texture_blur);
- in_color = subsurface_color_pow(in_color, texture_blur);
-
- *eval *= safe_divide_color(in_color, out_color);
- }
- }
-}
-
-/* Subsurface scattering step, from a point on the surface to other
- * nearby points on the same object.
- */
-ccl_device_inline int subsurface_scatter_disk(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- ShaderData *sd,
- const ShaderClosure *sc,
- uint *lcg_state,
- float disk_u,
- float disk_v,
- bool all)
-{
- /* pick random axis in local frame and point on disk */
- float3 disk_N, disk_T, disk_B;
- float pick_pdf_N, pick_pdf_T, pick_pdf_B;
-
- disk_N = sd->Ng;
- make_orthonormals(disk_N, &disk_T, &disk_B);
-
- if (disk_v < 0.5f) {
- pick_pdf_N = 0.5f;
- pick_pdf_T = 0.25f;
- pick_pdf_B = 0.25f;
- disk_v *= 2.0f;
- }
- else if (disk_v < 0.75f) {
- float3 tmp = disk_N;
- disk_N = disk_T;
- disk_T = tmp;
- pick_pdf_N = 0.25f;
- pick_pdf_T = 0.5f;
- pick_pdf_B = 0.25f;
- disk_v = (disk_v - 0.5f) * 4.0f;
- }
- else {
- float3 tmp = disk_N;
- disk_N = disk_B;
- disk_B = tmp;
- pick_pdf_N = 0.25f;
- pick_pdf_T = 0.25f;
- pick_pdf_B = 0.5f;
- disk_v = (disk_v - 0.75f) * 4.0f;
- }
-
- /* sample point on disk */
- float phi = M_2PI_F * disk_v;
- float disk_height, disk_r;
-
- bssrdf_sample(sc, disk_u, &disk_r, &disk_height);
-
- float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B;
-
- /* create ray */
-#ifdef __SPLIT_KERNEL__
- Ray ray_object = ss_isect->ray;
- Ray *ray = &ray_object;
-#else
- Ray *ray = &ss_isect->ray;
-#endif
- ray->P = sd->P + disk_N * disk_height + disk_P;
- ray->D = -disk_N;
- ray->t = 2.0f * disk_height;
- ray->dP = sd->dP;
- ray->dD = differential3_zero();
- ray->time = sd->time;
-
- /* Intersect with the same object. If multiple intersections are found,
- * at most BSSRDF_MAX_HITS of them are used, a random subset of all hits. */
- scene_intersect_local(kg, ray, ss_isect, sd->object, lcg_state, BSSRDF_MAX_HITS);
- int num_eval_hits = min(ss_isect->num_hits, BSSRDF_MAX_HITS);
-
- for (int hit = 0; hit < num_eval_hits; hit++) {
- /* Quickly retrieve P and Ng without setting up ShaderData. */
- float3 hit_P;
- if (sd->type & PRIMITIVE_TRIANGLE) {
- hit_P = triangle_refine_local(kg, sd, &ss_isect->hits[hit], ray);
- }
-#ifdef __OBJECT_MOTION__
- else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) {
- float3 verts[3];
- motion_triangle_vertices(kg,
- sd->object,
- kernel_tex_fetch(__prim_index, ss_isect->hits[hit].prim),
- sd->time,
- verts);
- hit_P = motion_triangle_refine_local(kg, sd, &ss_isect->hits[hit], ray, verts);
- }
-#endif /* __OBJECT_MOTION__ */
- else {
- ss_isect->weight[hit] = zero_float3();
- continue;
- }
-
- float3 hit_Ng = ss_isect->Ng[hit];
- if (ss_isect->hits[hit].object != OBJECT_NONE) {
- object_normal_transform(kg, sd, &hit_Ng);
- }
-
- /* Probability densities for local frame axes. */
- float pdf_N = pick_pdf_N * fabsf(dot(disk_N, hit_Ng));
- float pdf_T = pick_pdf_T * fabsf(dot(disk_T, hit_Ng));
- float pdf_B = pick_pdf_B * fabsf(dot(disk_B, hit_Ng));
-
- /* Multiple importance sample between 3 axes, power heuristic
- * found to be slightly better than balance heuristic. pdf_N
- * in the MIS weight and denominator cancelled out. */
- float w = pdf_N / (sqr(pdf_N) + sqr(pdf_T) + sqr(pdf_B));
- if (ss_isect->num_hits > BSSRDF_MAX_HITS) {
- w *= ss_isect->num_hits / (float)BSSRDF_MAX_HITS;
- }
-
- /* Real distance to sampled point. */
- float r = len(hit_P - sd->P);
-
- /* Evaluate profiles. */
- float3 eval = subsurface_scatter_eval(sd, sc, disk_r, r, all) * w;
-
- ss_isect->weight[hit] = eval;
- }
-
-#ifdef __SPLIT_KERNEL__
- ss_isect->ray = *ray;
-#endif
-
- return num_eval_hits;
-}
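
The per-hit weight computed in the loop above is the three-axis MIS power-heuristic weight; writing $p_A = p_{\mathrm{pick}}(A)\,\lvert A \cdot N_g \rvert$ for each local frame axis $A \in \{N, T, B\}$, it is

$$ w = \frac{p_N}{p_N^2 + p_T^2 + p_B^2}, $$

scaled by $\mathrm{num\_hits} / \mathrm{BSSRDF\_MAX\_HITS}$ when only a random subset of hits was recorded, and applied with the true hit distance $r = \lVert P_{\mathrm{hit}} - P \rVert$ in the profile evaluation.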
-
-#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
-ccl_device_inline void subsurface_scatter_multi_setup(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- int hit,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ClosureType type,
- float roughness)
-{
- optixDirectCall<void>(2, kg, ss_isect, hit, sd, state, type, roughness);
-}
-extern "C" __device__ void __direct_callable__subsurface_scatter_multi_setup(
-#else
-ccl_device_noinline void subsurface_scatter_multi_setup(
-#endif
- KernelGlobals *kg,
- LocalIntersection *ss_isect,
- int hit,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ClosureType type,
- float roughness)
-{
-#ifdef __SPLIT_KERNEL__
- Ray ray_object = ss_isect->ray;
- Ray *ray = &ray_object;
-#else
- Ray *ray = &ss_isect->ray;
-#endif
-
- /* Workaround for AMD GPU OpenCL compiler. Most probably cache bypass issue. */
-#if defined(__SPLIT_KERNEL__) && defined(__KERNEL_OPENCL_AMD__) && defined(__KERNEL_GPU__)
- kernel_split_params.dummy_sd_flag = sd->flag;
-#endif
-
- /* Setup new shading point. */
- shader_setup_from_subsurface(kg, sd, &ss_isect->hits[hit], ray);
-
- /* Optionally blur colors and bump mapping. */
- float3 weight = ss_isect->weight[hit];
- float3 N = sd->N;
- subsurface_color_bump_blur(kg, sd, state, &weight, &N);
-
- /* Setup diffuse BSDF. */
- subsurface_scatter_setup_diffuse_bsdf(kg, sd, type, roughness, weight, N);
-}
-
-/* Random walk subsurface scattering.
- *
- * "Practical and Controllable Subsurface Scattering for Production Path
- * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
-
-ccl_device void subsurface_random_walk_remap(const float A,
- const float d,
- float *sigma_t,
- float *alpha)
-{
- /* Compute attenuation and scattering coefficients from albedo. */
- *alpha = 1.0f - expf(A * (-5.09406f + A * (2.61188f - A * 4.31805f)));
- const float s = 1.9f - A + 3.5f * sqr(A - 0.8f);
-
- *sigma_t = 1.0f / fmaxf(d * s, 1e-16f);
-}
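
Written out, the remapping above converts a per-channel albedo $A$ and radius $d$ into volume coefficients:

$$ \alpha(A) = 1 - e^{A\,(-5.09406 + A\,(2.61188 - 4.31805\,A))}, \quad s(A) = 1.9 - A + 3.5\,(A - 0.8)^2, \quad \sigma_t = \frac{1}{\max(d\,s,\,10^{-16})}, $$

with the scattering coefficient recovered as $\sigma_s = \alpha\,\sigma_t$ where needed.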
-
-ccl_device void subsurface_random_walk_coefficients(const ShaderClosure *sc,
- float3 *sigma_t,
- float3 *alpha,
- float3 *weight)
-{
- const Bssrdf *bssrdf = (const Bssrdf *)sc;
- const float3 A = bssrdf->albedo;
- const float3 d = bssrdf->radius;
- float sigma_t_x, sigma_t_y, sigma_t_z;
- float alpha_x, alpha_y, alpha_z;
-
- subsurface_random_walk_remap(A.x, d.x, &sigma_t_x, &alpha_x);
- subsurface_random_walk_remap(A.y, d.y, &sigma_t_y, &alpha_y);
- subsurface_random_walk_remap(A.z, d.z, &sigma_t_z, &alpha_z);
-
- *sigma_t = make_float3(sigma_t_x, sigma_t_y, sigma_t_z);
- *alpha = make_float3(alpha_x, alpha_y, alpha_z);
-
- /* Closure mixing and Fresnel weights separate from albedo. */
- *weight = safe_divide_color(bssrdf->weight, A);
-}
-
-/* References for Dwivedi sampling:
- *
- * [1] "A Zero-variance-based Sampling Scheme for Monte Carlo Subsurface Scattering"
- * by Jaroslav Křivánek and Eugene d'Eon (SIGGRAPH 2014)
- * https://cgg.mff.cuni.cz/~jaroslav/papers/2014-zerovar/
- *
- * [2] "Improving the Dwivedi Sampling Scheme"
- * by Johannes Meng, Johannes Hanika, and Carsten Dachsbacher (EGSR 2016)
- * https://cg.ivd.kit.edu/1951.php
- *
- * [3] "Zero-Variance Theory for Efficient Subsurface Scattering"
- * by Eugene d'Eon and Jaroslav Křivánek (SIGGRAPH 2020)
- * https://iliyan.com/publications/RenderingCourse2020
- */
-
-ccl_device_forceinline float eval_phase_dwivedi(float v, float phase_log, float cos_theta)
-{
- /* Eq. 9 from [2] using precomputed log((v + 1) / (v - 1)) */
- return 1.0f / ((v - cos_theta) * phase_log);
-}
-
-ccl_device_forceinline float sample_phase_dwivedi(float v, float phase_log, float rand)
-{
- /* Based on Eq. 10 from [2]: `v - (v + 1) * pow((v - 1) / (v + 1), rand)`
- * Since we're already pre-computing `phase_log = log((v + 1) / (v - 1))` for the evaluation,
- * we can implement the power function like this. */
- return v - (v + 1) * expf(-rand * phase_log);
-}
-
-ccl_device_forceinline float diffusion_length_dwivedi(float alpha)
-{
- /* Eq. 67 from [3] */
- return 1.0f / sqrtf(1.0f - powf(alpha, 2.44294f - 0.0215813f * alpha + 0.578637f / alpha));
-}
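
Collecting the three Dwivedi helpers above in one place: the diffusion length is

$$ \nu = \Big(1 - \alpha^{\,2.44294 - 0.0215813\,\alpha + 0.578637/\alpha}\Big)^{-1/2} \quad \text{(Eq. 67 of [3])}, $$

the guided phase pdf is $p(\cos\theta) = \big((\nu - \cos\theta)\log\frac{\nu+1}{\nu-1}\big)^{-1}$ (Eq. 9 of [2]), and inverting its CDF with a uniform $\xi$ gives $\cos\theta = \nu - (\nu + 1)\,e^{-\xi \log\frac{\nu+1}{\nu-1}}$ (Eq. 10 of [2], using the precomputed logarithm).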
-
-ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, float randv)
-{
- float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
- float phi = M_2PI_F * randv;
- float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta);
-
- float3 T, B;
- make_orthonormals(D, &T, &B);
- return dir.x * T + dir.y * B + dir.z * D;
-}
-
-ccl_device_forceinline float3 subsurface_random_walk_pdf(float3 sigma_t,
- float t,
- bool hit,
- float3 *transmittance)
-{
- float3 T = volume_color_transmittance(sigma_t, t);
- if (transmittance) {
- *transmittance = T;
- }
- return hit ? T : sigma_t * T;
-}
-
-#ifdef __KERNEL_OPTIX__
-ccl_device_inline /* inline trace calls */
-#else
-ccl_device_noinline
-#endif
- bool
- subsurface_random_walk(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- const ShaderClosure *sc,
- const float bssrdf_u,
- const float bssrdf_v,
- bool all)
-{
- /* Sample diffuse surface scatter into the object. */
- float3 D;
- float pdf;
- sample_cos_hemisphere(-sd->N, bssrdf_u, bssrdf_v, &D, &pdf);
- if (dot(-sd->Ng, D) <= 0.0f) {
- return 0;
- }
-
- /* Convert subsurface to volume coefficients.
- * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. */
- float3 sigma_t, alpha;
- float3 throughput = one_float3();
- subsurface_random_walk_coefficients(sc, &sigma_t, &alpha, &throughput);
- float3 sigma_s = sigma_t * alpha;
-
- /* Theoretically it should be better to use the exact alpha for the channel we're sampling at
- * each bounce, but in practice there doesn't seem to be a noticeable difference in exchange
- * for making the code significantly more complex and slower (if direction sampling depends on
- * the sampled channel, we need to compute its PDF per-channel and consider it for MIS later on).
- *
- * Since the strength of the guided sampling increases as alpha gets lower, using a value that
- * is too low results in fireflies while one that's too high just gives a bit more noise.
- * Therefore, the code here uses the highest of the three albedos to be safe. */
- float diffusion_length = diffusion_length_dwivedi(max3(alpha));
- /* Precompute term for phase sampling. */
- float phase_log = logf((diffusion_length + 1) / (diffusion_length - 1));
-
- /* Setup ray. */
-#ifdef __SPLIT_KERNEL__
- Ray ray_object = ss_isect->ray;
- Ray *ray = &ray_object;
-#else
- Ray *ray = &ss_isect->ray;
-#endif
- ray->P = ray_offset(sd->P, -sd->Ng);
- ray->D = D;
- ray->t = FLT_MAX;
- ray->time = sd->time;
-
- /* Modify state for RNGs, decorrelated from other paths. */
- uint prev_rng_offset = state->rng_offset;
- uint prev_rng_hash = state->rng_hash;
- state->rng_hash = cmj_hash(state->rng_hash + state->rng_offset, 0xdeadbeef);
-
- /* Random walk until we hit the surface again. */
- bool hit = false;
- bool have_opposite_interface = false;
- float opposite_distance = 0.0f;
-
- /* Todo: Disable for alpha>0.999 or so? */
- const float guided_fraction = 0.75f;
-
- for (int bounce = 0; bounce < BSSRDF_MAX_BOUNCES; bounce++) {
- /* Advance random number offset. */
- state->rng_offset += PRNG_BOUNCE_NUM;
-
- /* Sample color channel, use MIS with balance heuristic. */
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(alpha, throughput, rphase, &channel_pdf);
- float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
- float randt = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
-
- /* We need the result of the raycast to compute the full guided PDF, so just remember the
- * relevant terms to avoid recomputing them later. */
- float backward_fraction = 0.0f;
- float forward_pdf_factor = 0.0f;
- float forward_stretching = 1.0f;
- float backward_pdf_factor = 0.0f;
- float backward_stretching = 1.0f;
-
- /* For the initial ray, we already know the direction, so just do classic distance sampling. */
- if (bounce > 0) {
- /* Decide whether we should use guided or classic sampling. */
- bool guided = (path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE) < guided_fraction);
-
- /* Determine if we want to sample away from the incoming interface.
- * This only happens if we found a nearby opposite interface, and the probability for it
- * depends on how close we are to it already.
- * This probability term comes from the recorded presentation of [3]. */
- bool guide_backward = false;
- if (have_opposite_interface) {
- /* Compute distance of the random walk between the tangent plane at the starting point
- * and the assumed opposite interface (the parallel plane that contains the point we
- * found in our ray query for the opposite side). */
- float x = clamp(dot(ray->P - sd->P, -sd->N), 0.0f, opposite_distance);
- backward_fraction = 1.0f / (1.0f + expf((opposite_distance - 2 * x) / diffusion_length));
- guide_backward = path_state_rng_1D(kg, state, PRNG_TERMINATE) < backward_fraction;
- }
-
- /* Sample scattering direction. */
- float scatter_u, scatter_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &scatter_u, &scatter_v);
- float cos_theta;
- if (guided) {
- cos_theta = sample_phase_dwivedi(diffusion_length, phase_log, scatter_u);
- /* The backwards guiding distribution is just mirrored along sd->N, so swapping the
- * sign here is enough to sample from that instead. */
- if (guide_backward) {
- cos_theta = -cos_theta;
- }
- }
- else {
- cos_theta = 2.0f * scatter_u - 1.0f;
- }
- ray->D = direction_from_cosine(sd->N, cos_theta, scatter_v);
-
- /* Compute PDF factor caused by phase sampling (as the ratio of guided / classic).
- * Since phase sampling is channel-independent, we can get away with applying a factor
- * to the guided PDF, which implicitly means pulling out the classic PDF term and letting
- * it cancel with an equivalent term in the numerator of the full estimator.
- * For the backward PDF, we again reuse the same probability distribution with a sign swap.
- */
- forward_pdf_factor = 2.0f * eval_phase_dwivedi(diffusion_length, phase_log, cos_theta);
- backward_pdf_factor = 2.0f * eval_phase_dwivedi(diffusion_length, phase_log, -cos_theta);
-
- /* Prepare distance sampling.
- * For the backwards case, this also needs the sign swapped since now directions against
- * sd->N (and therefore with negative cos_theta) are preferred. */
- forward_stretching = (1.0f - cos_theta / diffusion_length);
- backward_stretching = (1.0f + cos_theta / diffusion_length);
- if (guided) {
- sample_sigma_t *= guide_backward ? backward_stretching : forward_stretching;
- }
- }
-
- /* Sample direction along ray. */
- float t = -logf(1.0f - randt) / sample_sigma_t;
-
- /* On the first bounce, we use the raycast to check if the opposite side is nearby.
- * If yes, we will later use backwards guided sampling in order to have a decent
- * chance of connecting to it.
- * Todo: Maybe use less than 10 times the mean free path? */
- ray->t = (bounce == 0) ? max(t, 10.0f / (min3(sigma_t))) : t;
- scene_intersect_local(kg, ray, ss_isect, sd->object, NULL, 1);
- hit = (ss_isect->num_hits > 0);
-
- if (hit) {
-#ifdef __KERNEL_OPTIX__
- /* t is always in world space with OptiX. */
- ray->t = ss_isect->hits[0].t;
-#else
- /* Compute world space distance to surface hit. */
- float3 D = ray->D;
- object_inverse_dir_transform(kg, sd, &D);
- D = normalize(D) * ss_isect->hits[0].t;
- object_dir_transform(kg, sd, &D);
- ray->t = len(D);
-#endif
- }
-
- if (bounce == 0) {
- /* Check if we hit the opposite side. */
- if (hit) {
- have_opposite_interface = true;
- opposite_distance = dot(ray->P + ray->t * ray->D - sd->P, -sd->N);
- }
- /* Apart from the opposite side check, we were supposed to only trace up to distance t,
- * so check if there would have been a hit in that case. */
- hit = ray->t < t;
- }
-
- /* Use the distance to the exit point for the throughput update if we found one. */
- if (hit) {
- t = ray->t;
- }
- else if (bounce == 0) {
- /* Restore original position if nothing was hit after the first bounce,
- * without the ray_offset() that was added to avoid self-intersection.
- * Otherwise if that offset is relatively large compared to the scattering
- * radius, we never go back up high enough to exit the surface. */
- ray->P = sd->P;
- }
-
- /* Advance to new scatter location. */
- ray->P += t * ray->D;
-
- float3 transmittance;
- float3 pdf = subsurface_random_walk_pdf(sigma_t, t, hit, &transmittance);
- if (bounce > 0) {
- /* Compute PDF just like we do for classic sampling, but with the stretched sigma_t. */
- float3 guided_pdf = subsurface_random_walk_pdf(forward_stretching * sigma_t, t, hit, NULL);
-
- if (have_opposite_interface) {
- /* First step of MIS: Depending on geometry we might have two methods for guided
- * sampling, so perform MIS between them. */
- float3 back_pdf = subsurface_random_walk_pdf(backward_stretching * sigma_t, t, hit, NULL);
- guided_pdf = mix(
- guided_pdf * forward_pdf_factor, back_pdf * backward_pdf_factor, backward_fraction);
- }
- else {
- /* Just include phase sampling factor otherwise. */
- guided_pdf *= forward_pdf_factor;
- }
-
- /* Now we apply the MIS balance heuristic between the classic and guided sampling. */
- pdf = mix(pdf, guided_pdf, guided_fraction);
- }
-
- /* Finally, we're applying MIS again to combine the three color channels.
- * Altogether, the MIS computation combines up to nine different estimators:
- * {classic, guided, backward_guided} x {r, g, b} */
- throughput *= (hit ? transmittance : sigma_s * transmittance) / dot(channel_pdf, pdf);
-
- if (hit) {
- /* If we hit the surface, we are done. */
- break;
- }
- else if (throughput.x < VOLUME_THROUGHPUT_EPSILON &&
- throughput.y < VOLUME_THROUGHPUT_EPSILON &&
- throughput.z < VOLUME_THROUGHPUT_EPSILON) {
- /* Avoid unnecessary work and precision issues when throughput gets really small. */
- break;
- }
- }
-
- kernel_assert(isfinite_safe(throughput.x) && isfinite_safe(throughput.y) &&
- isfinite_safe(throughput.z));
-
- state->rng_offset = prev_rng_offset;
- state->rng_hash = prev_rng_hash;
-
- /* Return number of hits in ss_isect. */
- if (!hit) {
- return 0;
- }
-
- /* TODO: gain back performance lost from merging with disk BSSRDF. We
- * only need to return on hit so this indirect ray push/pop overhead
- * is not actually needed, but it does keep the code simpler. */
- ss_isect->weight[0] = subsurface_scatter_walk_eval(sd, sc, throughput, all);
-#ifdef __SPLIT_KERNEL__
- ss_isect->ray = *ray;
-#endif
-
- return 1;
-}
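
Summarizing the MIS bookkeeping of the walk above: with guided fraction $g = 0.75$ and backward probability $b$ (backward_fraction), the per-step pdf combined in the code is

$$ p = (1 - g)\,p_{\mathrm{classic}} + g\,\big[(1 - b)\,p_{\mathrm{fwd}} F_{\mathrm{fwd}} + b\,p_{\mathrm{bwd}} F_{\mathrm{bwd}}\big], $$

where $p_{\mathrm{fwd}}$ and $p_{\mathrm{bwd}}$ are the classic pdfs with the stretched $\sigma_t$ and $F$ the phase-sampling factors; the throughput update then divides $f$ ($T$ on a hit, $\sigma_s T$ on a scatter) by the channel-MIS combination dot(channel_pdf, pdf), giving the up-to-nine-estimator combination mentioned in the comment.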
-
-ccl_device_inline int subsurface_scatter_multi_intersect(KernelGlobals *kg,
- LocalIntersection *ss_isect,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- const ShaderClosure *sc,
- uint *lcg_state,
- float bssrdf_u,
- float bssrdf_v,
- bool all)
-{
- if (CLOSURE_IS_DISK_BSSRDF(sc->type)) {
- return subsurface_scatter_disk(kg, ss_isect, sd, sc, lcg_state, bssrdf_u, bssrdf_v, all);
- }
- else {
- return subsurface_random_walk(kg, ss_isect, sd, state, sc, bssrdf_u, bssrdf_v, all);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_textures.h b/intern/cycles/kernel/kernel_textures.h
index c8e01677d09..bf9b94c1753 100644
--- a/intern/cycles/kernel/kernel_textures.h
+++ b/intern/cycles/kernel/kernel_textures.h
@@ -78,7 +78,7 @@ KERNEL_TEX(KernelShader, __shaders)
KERNEL_TEX(float, __lookup_table)
/* sobol */
-KERNEL_TEX(uint, __sample_pattern_lut)
+KERNEL_TEX(float, __sample_pattern_lut)
/* image textures */
KERNEL_TEX(TextureInfo, __texture_info)
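
The uint-to-float change above suggests the sampling LUT now stores pre-generated floating-point sample values (consistent with the sampler changes elsewhere in this patch) instead of integer Sobol data. Under that assumption, a kernel-side lookup reduces to a plain float fetch; the index expression below is invented purely for illustration:

/* Hypothetical lookup, assuming the LUT stores the sample values directly. */
const float u = kernel_tex_fetch(__sample_pattern_lut, sample_index);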
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 7cbe18acf28..66b7310ab65 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_TYPES_H__
-#define __KERNEL_TYPES_H__
+#pragma once
#if !defined(__KERNEL_GPU__) && defined(WITH_EMBREE)
# include <embree3/rtcore.h>
@@ -60,27 +59,9 @@ CCL_NAMESPACE_BEGIN
#define PRIM_NONE (~0)
#define LAMP_NONE (~0)
#define ID_NONE (0.0f)
+#define PASS_UNUSED (~0)
-#define VOLUME_STACK_SIZE 32
-
-/* Split kernel constants */
-#define WORK_POOL_SIZE_GPU 64
-#define WORK_POOL_SIZE_CPU 1
-#ifdef __KERNEL_GPU__
-# define WORK_POOL_SIZE WORK_POOL_SIZE_GPU
-#else
-# define WORK_POOL_SIZE WORK_POOL_SIZE_CPU
-#endif
-
-#define SHADER_SORT_BLOCK_SIZE 2048
-
-#ifdef __KERNEL_OPENCL__
-# define SHADER_SORT_LOCAL_SIZE 64
-#elif defined(__KERNEL_CUDA__)
-# define SHADER_SORT_LOCAL_SIZE 32
-#else
-# define SHADER_SORT_LOCAL_SIZE 1
-#endif
+#define VOLUME_STACK_SIZE 4
/* Kernel features */
#define __SOBOL__
@@ -93,7 +74,7 @@ CCL_NAMESPACE_BEGIN
#define __INTERSECTION_REFINE__
#define __CLAMP_SAMPLE__
#define __PATCH_EVAL__
-#define __SHADOW_TRICKS__
+#define __SHADOW_CATCHER__
#define __DENOISING_FEATURES__
#define __SHADER_RAYTRACE__
#define __AO__
@@ -102,7 +83,6 @@ CCL_NAMESPACE_BEGIN
#define __SVM__
#define __EMISSION__
#define __HOLDOUT__
-#define __MULTI_CLOSURE__
#define __TRANSPARENT_SHADOWS__
#define __BACKGROUND_MIS__
#define __LAMP_MIS__
@@ -112,7 +92,6 @@ CCL_NAMESPACE_BEGIN
#define __PRINCIPLED__
#define __SUBSURFACE__
#define __VOLUME__
-#define __VOLUME_SCATTER__
#define __CMJ__
#define __SHADOW_RECORD_ALL__
#define __BRANCHED_PATH__
@@ -122,106 +101,60 @@ CCL_NAMESPACE_BEGIN
# ifdef WITH_OSL
# define __OSL__
# endif
-# define __VOLUME_DECOUPLED__
# define __VOLUME_RECORD_ALL__
#endif /* __KERNEL_CPU__ */
-#ifdef __KERNEL_CUDA__
-# ifdef __SPLIT_KERNEL__
-# undef __BRANCHED_PATH__
-# endif
-#endif /* __KERNEL_CUDA__ */
-
#ifdef __KERNEL_OPTIX__
# undef __BAKING__
-# undef __BRANCHED_PATH__
#endif /* __KERNEL_OPTIX__ */
-#ifdef __KERNEL_OPENCL__
-#endif /* __KERNEL_OPENCL__ */
-
/* Scene-based selective features compilation. */
-#ifdef __NO_CAMERA_MOTION__
-# undef __CAMERA_MOTION__
-#endif
-#ifdef __NO_OBJECT_MOTION__
-# undef __OBJECT_MOTION__
-#endif
-#ifdef __NO_HAIR__
-# undef __HAIR__
-#endif
-#ifdef __NO_VOLUME__
-# undef __VOLUME__
-# undef __VOLUME_SCATTER__
-#endif
-#ifdef __NO_SUBSURFACE__
-# undef __SUBSURFACE__
-#endif
-#ifdef __NO_BAKING__
-# undef __BAKING__
-#endif
-#ifdef __NO_BRANCHED_PATH__
-# undef __BRANCHED_PATH__
-#endif
-#ifdef __NO_PATCH_EVAL__
-# undef __PATCH_EVAL__
-#endif
-#ifdef __NO_TRANSPARENT__
-# undef __TRANSPARENT_SHADOWS__
-#endif
-#ifdef __NO_SHADOW_TRICKS__
-# undef __SHADOW_TRICKS__
-#endif
-#ifdef __NO_PRINCIPLED__
-# undef __PRINCIPLED__
-#endif
-#ifdef __NO_DENOISING__
-# undef __DENOISING_FEATURES__
-#endif
-#ifdef __NO_SHADER_RAYTRACE__
-# undef __SHADER_RAYTRACE__
+#ifdef __KERNEL_FEATURES__
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_CAMERA_MOTION)
+# undef __CAMERA_MOTION__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_OBJECT_MOTION)
+# undef __OBJECT_MOTION__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_HAIR)
+# undef __HAIR__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_VOLUME)
+# undef __VOLUME__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_SUBSURFACE)
+# undef __SUBSURFACE__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_BAKING)
+# undef __BAKING__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_PATCH_EVALUATION)
+# undef __PATCH_EVAL__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_TRANSPARENT)
+# undef __TRANSPARENT_SHADOWS__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_SHADOW_CATCHER)
+# undef __SHADOW_CATCHER__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_PRINCIPLED)
+# undef __PRINCIPLED__
+# endif
+# if !(__KERNEL_FEATURES & KERNEL_FEATURE_DENOISING)
+# undef __DENOISING_FEATURES__
+# endif
#endif
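
The table of #undef-s above replaces the old per-feature __NO_*__ defines with a single feature bitmask: the host compiles the kernel with __KERNEL_FEATURES set to an OR of KERNEL_FEATURE_* bits, and every unset bit strips the corresponding code path. An illustrative (not actual) build configuration:

/* Sketch only: compile-time feature selection for a minimal kernel. */
#define __KERNEL_FEATURES (KERNEL_FEATURE_CAMERA_MOTION | KERNEL_FEATURE_SUBSURFACE)
/* With the guard defined, every unset bit above #undef-s its __FEATURE__ flag,
 * so e.g. __VOLUME__ and __HAIR__ code would be stripped from this build. */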
#ifdef WITH_CYCLES_DEBUG_NAN
# define __KERNEL_DEBUG_NAN__
#endif
+/* Features that enable others */
+
#if defined(__SUBSURFACE__) || defined(__SHADER_RAYTRACE__)
# define __BVH_LOCAL__
#endif
-/* Shader Evaluation */
-
-typedef enum ShaderEvalType {
- SHADER_EVAL_DISPLACE,
- SHADER_EVAL_BACKGROUND,
- /* bake types */
- SHADER_EVAL_BAKE, /* no real shade, it's used in the code to
- * differentiate the type of shader eval from the above
- */
- /* data passes */
- SHADER_EVAL_NORMAL,
- SHADER_EVAL_UV,
- SHADER_EVAL_ROUGHNESS,
- SHADER_EVAL_DIFFUSE_COLOR,
- SHADER_EVAL_GLOSSY_COLOR,
- SHADER_EVAL_TRANSMISSION_COLOR,
- SHADER_EVAL_EMISSION,
- SHADER_EVAL_AOV_COLOR,
- SHADER_EVAL_AOV_VALUE,
-
- /* light passes */
- SHADER_EVAL_AO,
- SHADER_EVAL_COMBINED,
- SHADER_EVAL_SHADOW,
- SHADER_EVAL_DIFFUSE,
- SHADER_EVAL_GLOSSY,
- SHADER_EVAL_TRANSMISSION,
-
- /* extra */
- SHADER_EVAL_ENVIRONMENT,
-} ShaderEvalType;
-
/* Path Tracing
* note we need to keep the u/v pairs at even values */
@@ -252,8 +185,7 @@ enum PathTraceDimension {
enum SamplingPattern {
SAMPLING_PATTERN_SOBOL = 0,
- SAMPLING_PATTERN_CMJ = 1,
- SAMPLING_PATTERN_PMJ = 2,
+ SAMPLING_PATTERN_PMJ = 1,
SAMPLING_NUM_PATTERNS,
};
@@ -261,7 +193,12 @@ enum SamplingPattern {
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
enum PathRayFlag {
- /* Ray visibility. */
+ /* --------------------------------------------------------------------
+ * Ray visibility.
+ *
+ * NOTE: Recalculated after a surface bounce.
+ */
+
PATH_RAY_CAMERA = (1 << 0),
PATH_RAY_REFLECT = (1 << 1),
PATH_RAY_TRANSMIT = (1 << 2),
@@ -269,57 +206,106 @@ enum PathRayFlag {
PATH_RAY_GLOSSY = (1 << 4),
PATH_RAY_SINGULAR = (1 << 5),
PATH_RAY_TRANSPARENT = (1 << 6),
+ PATH_RAY_VOLUME_SCATTER = (1 << 7),
/* Shadow ray visibility. */
- PATH_RAY_SHADOW_OPAQUE_NON_CATCHER = (1 << 7),
- PATH_RAY_SHADOW_OPAQUE_CATCHER = (1 << 8),
- PATH_RAY_SHADOW_OPAQUE = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER | PATH_RAY_SHADOW_OPAQUE_CATCHER),
- PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER = (1 << 9),
- PATH_RAY_SHADOW_TRANSPARENT_CATCHER = (1 << 10),
- PATH_RAY_SHADOW_TRANSPARENT = (PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER |
- PATH_RAY_SHADOW_TRANSPARENT_CATCHER),
- PATH_RAY_SHADOW_NON_CATCHER = (PATH_RAY_SHADOW_OPAQUE_NON_CATCHER |
- PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER),
+ PATH_RAY_SHADOW_OPAQUE = (1 << 8),
+ PATH_RAY_SHADOW_TRANSPARENT = (1 << 9),
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE | PATH_RAY_SHADOW_TRANSPARENT),
- /* Unused, free to reuse. */
- PATH_RAY_UNUSED = (1 << 11),
+ /* Special flag to tag unaligned BVH nodes.
+ * Only set and used in BVH nodes, to distinguish whether the bounding box stored in the node
+ * should be intersected as an AABB or as an OBB. */
+ PATH_RAY_NODE_UNALIGNED = (1 << 10),
- /* Ray visibility for volume scattering. */
- PATH_RAY_VOLUME_SCATTER = (1 << 12),
-
- /* Special flag to tag unaligned BVH nodes. */
- PATH_RAY_NODE_UNALIGNED = (1 << 13),
+ /* Subset of flags used for ray visibility for intersection.
+ *
+ * NOTE: SHADOW_CATCHER macros below assume there are no more than
+ * 16 visibility bits. */
+ PATH_RAY_ALL_VISIBILITY = ((1 << 11) - 1),
- PATH_RAY_ALL_VISIBILITY = ((1 << 14) - 1),
+ /* --------------------------------------------------------------------
+ * Path flags.
+ */
/* Don't apply multiple importance sampling weights to emission from
* lamp or surface hits, because they were not direct light sampled. */
- PATH_RAY_MIS_SKIP = (1 << 14),
+ PATH_RAY_MIS_SKIP = (1 << 11),
+
/* Diffuse bounce earlier in the path, skip SSS to improve performance
* and avoid branching twice with disk sampling SSS. */
- PATH_RAY_DIFFUSE_ANCESTOR = (1 << 15),
+ PATH_RAY_DIFFUSE_ANCESTOR = (1 << 12),
+
/* Single pass has been written. */
- PATH_RAY_SINGLE_PASS_DONE = (1 << 16),
- /* Ray is behind a shadow catcher. */
- PATH_RAY_SHADOW_CATCHER = (1 << 17),
- /* Store shadow data for shadow catcher or denoising. */
- PATH_RAY_STORE_SHADOW_INFO = (1 << 18),
+ PATH_RAY_SINGLE_PASS_DONE = (1 << 13),
+
/* Zero background alpha, for camera or transparent glass rays. */
- PATH_RAY_TRANSPARENT_BACKGROUND = (1 << 19),
+ PATH_RAY_TRANSPARENT_BACKGROUND = (1 << 14),
+
/* Terminate ray immediately at next bounce. */
- PATH_RAY_TERMINATE_IMMEDIATE = (1 << 20),
+ PATH_RAY_TERMINATE_ON_NEXT_SURFACE = (1 << 15),
+ PATH_RAY_TERMINATE_IN_NEXT_VOLUME = (1 << 16),
+
/* Ray is to be terminated, but continue with transparent bounces and
* emission as long as we encounter them. This is required to make the
* MIS between direct and indirect light rays match, as shadow rays go
* through transparent surfaces to reach emission too. */
- PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 21),
+ PATH_RAY_TERMINATE_AFTER_TRANSPARENT = (1 << 17),
+
+ /* Terminate ray immediately after volume shading. */
+ PATH_RAY_TERMINATE_AFTER_VOLUME = (1 << 18),
+
/* Ray is to be terminated. */
- PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_IMMEDIATE | PATH_RAY_TERMINATE_AFTER_TRANSPARENT),
+ PATH_RAY_TERMINATE = (PATH_RAY_TERMINATE_ON_NEXT_SURFACE | PATH_RAY_TERMINATE_IN_NEXT_VOLUME |
+ PATH_RAY_TERMINATE_AFTER_TRANSPARENT | PATH_RAY_TERMINATE_AFTER_VOLUME),
+
/* Path and shader is being evaluated for direct lighting emission. */
- PATH_RAY_EMISSION = (1 << 22)
+ PATH_RAY_EMISSION = (1 << 19),
+
+ /* Perform subsurface scattering. */
+ PATH_RAY_SUBSURFACE = (1 << 20),
+
+ /* Contribute to denoising features. */
+ PATH_RAY_DENOISING_FEATURES = (1 << 21),
+
+ /* Render pass categories. */
+ PATH_RAY_REFLECT_PASS = (1 << 22),
+ PATH_RAY_TRANSMISSION_PASS = (1 << 23),
+ PATH_RAY_VOLUME_PASS = (1 << 24),
+ PATH_RAY_ANY_PASS = (PATH_RAY_REFLECT_PASS | PATH_RAY_TRANSMISSION_PASS | PATH_RAY_VOLUME_PASS),
+
+ /* Shadow ray is for a light or surface. */
+ PATH_RAY_SHADOW_FOR_LIGHT = (1 << 25),
+
+ /* A shadow catcher object was hit and the path was split into two. */
+ PATH_RAY_SHADOW_CATCHER_HIT = (1 << 26),
+
+ /* A shadow catcher object was hit and this path traces only shadow catchers, writing them into
+ * their dedicated pass for later division.
+ *
+ * NOTE: Not covered by `PATH_RAY_ANY_PASS` because the shadow catcher gets special handling,
+ * separate from the light passes. */
+ PATH_RAY_SHADOW_CATCHER_PASS = (1 << 27),
+
+ /* Path is evaluating background for an approximate shadow catcher with non-transparent film. */
+ PATH_RAY_SHADOW_CATCHER_BACKGROUND = (1 << 28),
};
+/* Configure ray visibility bits for rays and objects respectively,
+ * to make shadow catchers work.
+ *
+ * On shadow catcher paths we want to ignore any intersections with non-catchers,
+ * whereas on regular paths we want to intersect all objects. */
+
+#define SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) ((visibility) << 16)
+
+#define SHADOW_CATCHER_PATH_VISIBILITY(path_flag, visibility) \
+ (((path_flag)&PATH_RAY_SHADOW_CATCHER_PASS) ? SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) : \
+ (visibility))
+
+#define SHADOW_CATCHER_OBJECT_VISIBILITY(is_shadow_catcher, visibility) \
+ (((is_shadow_catcher) ? SHADOW_CATCHER_VISIBILITY_SHIFT(visibility) : 0) | (visibility))
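
A worked example of these macros (values illustrative; the visibility flags all fit in the low 16 bits per the NOTE above), showing why the standard `ray_visibility & object_visibility` test makes catcher-pass rays skip non-catchers automatically:

/* Example ray visibility: a camera ray. */
const uint vis = PATH_RAY_CAMERA;

const uint vis_regular = SHADOW_CATCHER_PATH_VISIBILITY(0, vis);                            /* == vis */
const uint vis_catcher = SHADOW_CATCHER_PATH_VISIBILITY(PATH_RAY_SHADOW_CATCHER_PASS, vis); /* == vis << 16 */

const uint obj_plain = SHADOW_CATCHER_OBJECT_VISIBILITY(false, vis);  /* low half only */
const uint obj_catcher = SHADOW_CATCHER_OBJECT_VISIBILITY(true, vis); /* both halves */

/* (vis_regular & obj_plain) != 0: regular paths intersect everything.
 * (vis_catcher & obj_plain) == 0: catcher-pass rays ignore non-catchers.
 * (vis_catcher & obj_catcher) != 0: catcher-pass rays still hit catchers. */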
+
/* Closure Label */
typedef enum ClosureLabel {
@@ -332,6 +318,7 @@ typedef enum ClosureLabel {
LABEL_TRANSPARENT = 32,
LABEL_VOLUME_SCATTER = 64,
LABEL_TRANSMIT_TRANSPARENT = 128,
+ LABEL_SUBSURFACE_SCATTER = 256,
} ClosureLabel;
/* Render Passes */
@@ -339,17 +326,35 @@ typedef enum ClosureLabel {
#define PASS_NAME_JOIN(a, b) a##_##b
#define PASSMASK(pass) (1 << ((PASS_NAME_JOIN(PASS, pass)) % 32))
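
The modulo in PASSMASK folds each pass category into its own 32-bit mask word, matching the category boundaries in the enum below (light passes end at 31, data passes at 63). For instance:

/* PASS_COMBINED == 1 (light category), PASS_DEPTH == 32 (data category). */
PASSMASK(COMBINED) /* == 1 << (1 % 32) == 1 << 1, in the light-pass word */
PASSMASK(DEPTH)    /* == 1 << (32 % 32) == 1 << 0, in the data-pass word */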
-#define PASSMASK_COMPONENT(comp) \
- (PASSMASK(PASS_NAME_JOIN(comp, DIRECT)) | PASSMASK(PASS_NAME_JOIN(comp, INDIRECT)) | \
- PASSMASK(PASS_NAME_JOIN(comp, COLOR)))
-
+// NOTE: Keep in sync with `Pass::get_type_enum()`.
typedef enum PassType {
PASS_NONE = 0,
- /* Main passes */
+ /* Light Passes */
PASS_COMBINED = 1,
- PASS_DEPTH,
+ PASS_EMISSION,
+ PASS_BACKGROUND,
+ PASS_AO,
+ PASS_SHADOW,
+ PASS_DIFFUSE,
+ PASS_DIFFUSE_DIRECT,
+ PASS_DIFFUSE_INDIRECT,
+ PASS_GLOSSY,
+ PASS_GLOSSY_DIRECT,
+ PASS_GLOSSY_INDIRECT,
+ PASS_TRANSMISSION,
+ PASS_TRANSMISSION_DIRECT,
+ PASS_TRANSMISSION_INDIRECT,
+ PASS_VOLUME,
+ PASS_VOLUME_DIRECT,
+ PASS_VOLUME_INDIRECT,
+ PASS_CATEGORY_LIGHT_END = 31,
+
+ /* Data passes */
+ PASS_DEPTH = 32,
+ PASS_POSITION,
PASS_NORMAL,
+ PASS_ROUGHNESS,
PASS_UV,
PASS_OBJECT_ID,
PASS_MATERIAL_ID,
@@ -361,31 +366,35 @@ typedef enum PassType {
PASS_AOV_VALUE,
PASS_ADAPTIVE_AUX_BUFFER,
PASS_SAMPLE_COUNT,
- PASS_CATEGORY_MAIN_END = 31,
-
- PASS_MIST = 32,
- PASS_EMISSION,
- PASS_BACKGROUND,
- PASS_AO,
- PASS_SHADOW,
- PASS_LIGHT, /* no real pass, used to force use_light_pass */
- PASS_DIFFUSE_DIRECT,
- PASS_DIFFUSE_INDIRECT,
PASS_DIFFUSE_COLOR,
- PASS_GLOSSY_DIRECT,
- PASS_GLOSSY_INDIRECT,
PASS_GLOSSY_COLOR,
- PASS_TRANSMISSION_DIRECT,
- PASS_TRANSMISSION_INDIRECT,
PASS_TRANSMISSION_COLOR,
- PASS_VOLUME_DIRECT = 50,
- PASS_VOLUME_INDIRECT,
/* No Scatter color since it's tricky to define what it would even mean. */
- PASS_CATEGORY_LIGHT_END = 63,
+ PASS_MIST,
+ PASS_DENOISING_NORMAL,
+ PASS_DENOISING_ALBEDO,
+
+ /* PASS_SHADOW_CATCHER accumulates the contribution of shadow catcher objects that is not
+ * affected by any other object. The pass accessor will divide the combined pass by the shadow
+ * catcher; the result of this division is then to be multiplied with the backdrop. The alpha
+ * channel of this pass contains the number of samples that contributed to its color components.
+ *
+ * PASS_SHADOW_CATCHER_SAMPLE_COUNT contains the number of samples for which the path split
+ * happened.
+ *
+ * PASS_SHADOW_CATCHER_MATTE contains the contribution of non-catcher objects. This pass is to
+ * be alpha-overed onto the backdrop (after multiplication). */
+ PASS_SHADOW_CATCHER,
+ PASS_SHADOW_CATCHER_SAMPLE_COUNT,
+ PASS_SHADOW_CATCHER_MATTE,
+
+ PASS_CATEGORY_DATA_END = 63,
PASS_BAKE_PRIMITIVE,
PASS_BAKE_DIFFERENTIAL,
- PASS_CATEGORY_BAKE_END = 95
+ PASS_CATEGORY_BAKE_END = 95,
+
+ PASS_NUM,
} PassType;
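
The shadow catcher comments above amount to the following compositing recipe for the approximate catcher (restated, with "over" denoting the usual alpha-over operation):

$$ \mathrm{ratio} = \frac{C_{\mathrm{combined}}}{C_{\mathrm{shadow\_catcher}}}, \qquad \mathrm{final} = M_{\mathrm{matte}} \ \mathrm{over}\ \big(B_{\mathrm{backdrop}} \times \mathrm{ratio}\big), $$

where the division is performed by the pass accessor and the alpha channels carry the sample counts described above.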
#define PASS_ANY (~0)
@@ -398,158 +407,9 @@ typedef enum CryptomatteType {
CRYPT_ACCURATE = (1 << 3),
} CryptomatteType;
-typedef enum DenoisingPassOffsets {
- DENOISING_PASS_NORMAL = 0,
- DENOISING_PASS_NORMAL_VAR = 3,
- DENOISING_PASS_ALBEDO = 6,
- DENOISING_PASS_ALBEDO_VAR = 9,
- DENOISING_PASS_DEPTH = 12,
- DENOISING_PASS_DEPTH_VAR = 13,
- DENOISING_PASS_SHADOW_A = 14,
- DENOISING_PASS_SHADOW_B = 17,
- DENOISING_PASS_COLOR = 20,
- DENOISING_PASS_COLOR_VAR = 23,
- DENOISING_PASS_CLEAN = 26,
-
- DENOISING_PASS_PREFILTERED_DEPTH = 0,
- DENOISING_PASS_PREFILTERED_NORMAL = 1,
- DENOISING_PASS_PREFILTERED_SHADOWING = 4,
- DENOISING_PASS_PREFILTERED_ALBEDO = 5,
- DENOISING_PASS_PREFILTERED_COLOR = 8,
- DENOISING_PASS_PREFILTERED_VARIANCE = 11,
- DENOISING_PASS_PREFILTERED_INTENSITY = 14,
-
- DENOISING_PASS_SIZE_BASE = 26,
- DENOISING_PASS_SIZE_CLEAN = 3,
- DENOISING_PASS_SIZE_PREFILTERED = 15,
-} DenoisingPassOffsets;
-
-typedef enum eBakePassFilter {
- BAKE_FILTER_NONE = 0,
- BAKE_FILTER_DIRECT = (1 << 0),
- BAKE_FILTER_INDIRECT = (1 << 1),
- BAKE_FILTER_COLOR = (1 << 2),
- BAKE_FILTER_DIFFUSE = (1 << 3),
- BAKE_FILTER_GLOSSY = (1 << 4),
- BAKE_FILTER_TRANSMISSION = (1 << 5),
- BAKE_FILTER_EMISSION = (1 << 6),
- BAKE_FILTER_AO = (1 << 7),
-} eBakePassFilter;
-
-typedef enum BakePassFilterCombos {
- BAKE_FILTER_COMBINED = (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE |
- BAKE_FILTER_GLOSSY | BAKE_FILTER_TRANSMISSION | BAKE_FILTER_EMISSION |
- BAKE_FILTER_AO),
- BAKE_FILTER_DIFFUSE_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_DIFFUSE),
- BAKE_FILTER_GLOSSY_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_GLOSSY),
- BAKE_FILTER_TRANSMISSION_DIRECT = (BAKE_FILTER_DIRECT | BAKE_FILTER_TRANSMISSION),
- BAKE_FILTER_DIFFUSE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_DIFFUSE),
- BAKE_FILTER_GLOSSY_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_GLOSSY),
- BAKE_FILTER_TRANSMISSION_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_TRANSMISSION),
-} BakePassFilterCombos;
-
-typedef enum DenoiseFlag {
- DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0),
- DENOISING_CLEAN_DIFFUSE_IND = (1 << 1),
- DENOISING_CLEAN_GLOSSY_DIR = (1 << 2),
- DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
- DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
- DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
- DENOISING_CLEAN_ALL_PASSES = (1 << 6) - 1,
-} DenoiseFlag;
-
-typedef ccl_addr_space struct PathRadianceState {
-#ifdef __PASSES__
- float3 diffuse;
- float3 glossy;
- float3 transmission;
- float3 volume;
-
- float3 direct;
-#endif
-} PathRadianceState;
-
-typedef ccl_addr_space struct PathRadiance {
-#ifdef __PASSES__
- int use_light_pass;
-#endif
-
- float transparent;
- float3 emission;
-#ifdef __PASSES__
- float3 background;
- float3 ao;
-
- float3 indirect;
- float3 direct_emission;
-
- float3 color_diffuse;
- float3 color_glossy;
- float3 color_transmission;
-
- float3 direct_diffuse;
- float3 direct_glossy;
- float3 direct_transmission;
- float3 direct_volume;
-
- float3 indirect_diffuse;
- float3 indirect_glossy;
- float3 indirect_transmission;
- float3 indirect_volume;
-
- float3 shadow;
- float mist;
-#endif
-
- struct PathRadianceState state;
-
-#ifdef __SHADOW_TRICKS__
- /* Total light reachable across the path, ignoring shadow blocked queries. */
- float3 path_total;
- /* Total light reachable across the path with shadow blocked queries
- * applied here.
- *
- * Dividing this figure by path_total will give estimate of shadow pass.
- */
- float3 path_total_shaded;
-
- /* Color of the background on which shadow is alpha-overed. */
- float3 shadow_background_color;
-
- /* Path radiance sum and throughput at the moment when ray hits shadow
- * catcher object.
- */
- float shadow_throughput;
-
- /* Accumulated transparency along the path after shadow catcher bounce. */
- float shadow_transparency;
-
- /* Indicate if any shadow catcher data is set. */
- int has_shadow_catcher;
-#endif
-
-#ifdef __DENOISING_FEATURES__
- float3 denoising_normal;
- float3 denoising_albedo;
- float denoising_depth;
-#endif /* __DENOISING_FEATURES__ */
-} PathRadiance;
-
typedef struct BsdfEval {
-#ifdef __PASSES__
- int use_light_pass;
-#endif
-
float3 diffuse;
-#ifdef __PASSES__
float3 glossy;
- float3 transmission;
- float3 transparent;
- float3 volume;
-#endif
-#ifdef __SHADOW_TRICKS__
- float3 sum_no_mis;
-#endif
} BsdfEval;
/* Shader Flag */
@@ -564,8 +424,10 @@ typedef enum ShaderFlag {
SHADER_EXCLUDE_TRANSMIT = (1 << 25),
SHADER_EXCLUDE_CAMERA = (1 << 24),
SHADER_EXCLUDE_SCATTER = (1 << 23),
+ SHADER_EXCLUDE_SHADOW_CATCHER = (1 << 22),
SHADER_EXCLUDE_ANY = (SHADER_EXCLUDE_DIFFUSE | SHADER_EXCLUDE_GLOSSY | SHADER_EXCLUDE_TRANSMIT |
- SHADER_EXCLUDE_CAMERA | SHADER_EXCLUDE_SCATTER),
+ SHADER_EXCLUDE_CAMERA | SHADER_EXCLUDE_SCATTER |
+ SHADER_EXCLUDE_SHADOW_CATCHER),
SHADER_MASK = ~(SHADER_SMOOTH_NORMAL | SHADER_CAST_SHADOW | SHADER_AREA_LIGHT | SHADER_USE_MIS |
SHADER_EXCLUDE_ANY)
@@ -612,29 +474,14 @@ typedef struct differential {
/* Ray */
typedef struct Ray {
-/* TODO(sergey): This is only needed because current AMD
- * compiler has hard time building the kernel with this
- * reshuffle. And at the same time reshuffle will cause
- * less optimal CPU code in certain places.
- *
- * We'll get rid of this nasty exception once AMD compiler
- * is fixed.
- */
-#ifndef __KERNEL_OPENCL_AMD__
float3 P; /* origin */
float3 D; /* direction */
float t; /* length of the ray */
float time; /* time (for motion blur) */
-#else
- float t; /* length of the ray */
- float time; /* time (for motion blur) */
- float3 P; /* origin */
- float3 D; /* direction */
-#endif
#ifdef __RAY_DIFFERENTIALS__
- differential3 dP;
- differential3 dD;
+ float dP;
+ float dD;
#endif
} Ray;
@@ -661,9 +508,6 @@ typedef enum PrimitiveType {
PRIMITIVE_CURVE_RIBBON = (1 << 4),
PRIMITIVE_MOTION_CURVE_RIBBON = (1 << 5),
PRIMITIVE_VOLUME = (1 << 6),
- /* Lamp primitive is not included below on purpose,
- * since it is no real traceable primitive.
- */
PRIMITIVE_LAMP = (1 << 7),
PRIMITIVE_ALL_TRIANGLE = (PRIMITIVE_TRIANGLE | PRIMITIVE_MOTION_TRIANGLE),
@@ -672,16 +516,14 @@ typedef enum PrimitiveType {
PRIMITIVE_ALL_VOLUME = (PRIMITIVE_VOLUME),
PRIMITIVE_ALL_MOTION = (PRIMITIVE_MOTION_TRIANGLE | PRIMITIVE_MOTION_CURVE_THICK |
PRIMITIVE_MOTION_CURVE_RIBBON),
- PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_VOLUME),
+ PRIMITIVE_ALL = (PRIMITIVE_ALL_TRIANGLE | PRIMITIVE_ALL_CURVE | PRIMITIVE_ALL_VOLUME |
+ PRIMITIVE_LAMP),
- /* Total number of different traceable primitives.
- * NOTE: This is an actual value, not a bitflag.
- */
- PRIMITIVE_NUM_TOTAL = 7,
+ PRIMITIVE_NUM = 8,
} PrimitiveType;
-#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM_TOTAL) | (type))
-#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM_TOTAL)
+#define PRIMITIVE_PACK_SEGMENT(type, segment) ((segment << PRIMITIVE_NUM) | (type))
+#define PRIMITIVE_UNPACK_SEGMENT(type) (type >> PRIMITIVE_NUM)
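
With PRIMITIVE_NUM == 8, curve segments now live in the bits above the eight type-flag bits; a small worked example (assuming PRIMITIVE_CURVE_THICK == (1 << 2), consistent with the enum above):

const int packed = PRIMITIVE_PACK_SEGMENT(PRIMITIVE_CURVE_THICK, 3); /* (3 << 8) | (1 << 2) == 0x304 */
const int segment = PRIMITIVE_UNPACK_SEGMENT(packed);                /* == 3 */
const int type = packed & ((1 << PRIMITIVE_NUM) - 1);                /* low 8 bits hold the type */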
typedef enum CurveShapeType {
CURVE_RIBBON = 0,
@@ -760,20 +602,14 @@ typedef struct AttributeDescriptor {
/* Closure data */
-#ifdef __MULTI_CLOSURE__
-# ifdef __SPLIT_KERNEL__
-# define MAX_CLOSURE 1
-# else
-# ifndef __MAX_CLOSURE__
-# define MAX_CLOSURE 64
-# else
-# define MAX_CLOSURE __MAX_CLOSURE__
-# endif
-# endif
+#ifndef __MAX_CLOSURE__
+# define MAX_CLOSURE 64
#else
-# define MAX_CLOSURE 1
+# define MAX_CLOSURE __MAX_CLOSURE__
#endif
+#define MAX_VOLUME_CLOSURE 8
+
/* This struct is the base class for all closures. The common members are
* duplicated in all derived classes since we don't have C++ in the kernel
* yet, and because it lets us lay out the members to minimize padding. The
@@ -866,11 +702,14 @@ enum ShaderDataFlag {
SD_NEED_VOLUME_ATTRIBUTES = (1 << 28),
/* Shader has emission */
SD_HAS_EMISSION = (1 << 29),
+ /* Shader has raytracing */
+ SD_HAS_RAYTRACE = (1 << 30),
SD_SHADER_FLAGS = (SD_USE_MIS | SD_HAS_TRANSPARENT_SHADOW | SD_HAS_VOLUME | SD_HAS_ONLY_VOLUME |
SD_HETEROGENEOUS_VOLUME | SD_HAS_BSSRDF_BUMP | SD_VOLUME_EQUIANGULAR |
SD_VOLUME_MIS | SD_VOLUME_CUBIC | SD_HAS_BUMP | SD_HAS_DISPLACEMENT |
- SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES)
+ SD_HAS_CONSTANT_EMISSION | SD_NEED_VOLUME_ATTRIBUTES | SD_HAS_EMISSION |
+ SD_HAS_RAYTRACE)
};
/* Object flags. */
@@ -955,19 +794,19 @@ typedef ccl_addr_space struct ccl_align(16) ShaderData
#endif
#ifdef __OBJECT_MOTION__
- /* object <-> world space transformations, cached to avoid
- * re-interpolating them constantly for shading */
- Transform ob_tfm;
- Transform ob_itfm;
+ /* Object <-> world space transformations for motion blur, cached to avoid
+ * re-interpolating them constantly for shading. */
+ Transform ob_tfm_motion;
+ Transform ob_itfm_motion;
#endif
/* ray start position, only set for backgrounds */
float3 ray_P;
- differential3 ray_dP;
+ float ray_dP;
#ifdef __OSL__
- struct KernelGlobals *osl_globals;
- struct PathState *osl_path_state;
+ const struct KernelGlobals *osl_globals;
+ const struct IntegratorStateCPU *osl_path_state;
#endif
/* LCG state for closures that require additional random numbers. */
@@ -976,7 +815,6 @@ typedef ccl_addr_space struct ccl_align(16) ShaderData
/* Closure data, we store a fixed array of closures */
int num_closure;
int num_closure_left;
- float randb_closure;
float3 svm_closure_weight;
/* Closure weights summed directly, so we can evaluate
@@ -998,7 +836,22 @@ typedef ccl_addr_space struct ccl_align(16) ShaderDataTinyStorage
ShaderDataTinyStorage;
#define AS_SHADER_DATA(shader_data_tiny_storage) ((ShaderData *)shader_data_tiny_storage)
-/* Path State */
+/* Compact volume closures storage.
+ *
+ * Used for decoupled direct/indirect light closure storage. */
+
+ccl_addr_space struct ShaderVolumeClosure {
+ float3 weight;
+ float sample_weight;
+ float g;
+};
+
+ccl_addr_space struct ShaderVolumePhases {
+ ShaderVolumeClosure closure[MAX_VOLUME_CLOSURE];
+ int num_closure;
+};
+
+/* Volume Stack */
#ifdef __VOLUME__
typedef struct VolumeStack {
@@ -1007,53 +860,6 @@ typedef struct VolumeStack {
} VolumeStack;
#endif
-typedef struct PathState {
- /* see enum PathRayFlag */
- int flag;
-
- /* random number generator state */
- uint rng_hash; /* per pixel hash */
- int rng_offset; /* dimension offset */
- int sample; /* path sample number */
- int num_samples; /* total number of times this path will be sampled */
- float branch_factor; /* number of branches in indirect paths */
-
- /* bounce counting */
- int bounce;
- int diffuse_bounce;
- int glossy_bounce;
- int transmission_bounce;
- int transparent_bounce;
-
-#ifdef __DENOISING_FEATURES__
- float denoising_feature_weight;
- float3 denoising_feature_throughput;
-#endif /* __DENOISING_FEATURES__ */
-
- /* multiple importance sampling */
- float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
- float ray_pdf; /* last bounce pdf */
-#ifdef __LAMP_MIS__
- float ray_t; /* accumulated distance through transparent surfaces */
-#endif
-
- /* volume rendering */
-#ifdef __VOLUME__
- int volume_bounce;
- int volume_bounds_bounce;
- VolumeStack volume_stack[VOLUME_STACK_SIZE];
-#endif
-} PathState;
-
-#ifdef __VOLUME__
-typedef struct VolumeState {
-# ifdef __SPLIT_KERNEL__
-# else
- PathState ps;
-# endif
-} VolumeState;
-#endif
-
/* Struct to gather multiple nearby intersections. */
typedef struct LocalIntersection {
Ray ray;
@@ -1064,20 +870,6 @@ typedef struct LocalIntersection {
float3 Ng[LOCAL_MAX_HITS];
} LocalIntersection;
-/* Subsurface */
-
-/* Struct to gather SSS indirect rays and delay tracing them. */
-typedef struct SubsurfaceIndirectRays {
- PathState state[BSSRDF_MAX_HITS];
-
- int num_rays;
-
- struct Ray rays[BSSRDF_MAX_HITS];
- float3 throughputs[BSSRDF_MAX_HITS];
- struct PathRadianceState L_state[BSSRDF_MAX_HITS];
-} SubsurfaceIndirectRays;
-static_assert(BSSRDF_MAX_HITS <= LOCAL_MAX_HITS, "BSSRDF hits too high.");
-
/* Constant Kernel Data
*
* These structs are passed from CPU to various devices, and the struct layout
@@ -1128,7 +920,7 @@ typedef struct KernelCamera {
/* render size */
float width, height;
- int resolution;
+ int pad1;
/* anamorphic lens bokeh */
float inv_aperture_ratio;
@@ -1169,11 +961,12 @@ typedef struct KernelFilm {
int light_pass_flag;
int pass_stride;
- int use_light_pass;
int pass_combined;
int pass_depth;
+ int pass_position;
int pass_normal;
+ int pass_roughness;
int pass_motion;
int pass_motion_weight;
@@ -1202,7 +995,13 @@ typedef struct KernelFilm {
int pass_shadow;
float pass_shadow_scale;
+
+ int pass_shadow_catcher;
+ int pass_shadow_catcher_sample_count;
+ int pass_shadow_catcher_matte;
+
int filter_table_offset;
+
int cryptomatte_passes;
int cryptomatte_depth;
int pass_cryptomatte;
@@ -1215,15 +1014,11 @@ typedef struct KernelFilm {
float mist_inv_depth;
float mist_falloff;
- int pass_denoising_data;
- int pass_denoising_clean;
- int denoising_flags;
+ int pass_denoising_normal;
+ int pass_denoising_albedo;
int pass_aov_color;
int pass_aov_value;
- int pass_aov_color_num;
- int pass_aov_value_num;
- int pad1, pad2, pad3;
/* XYZ to rendering color space transform. float4 instead of float3 to
* ensure consistent padding/alignment across devices. */
@@ -1234,19 +1029,54 @@ typedef struct KernelFilm {
int pass_bake_primitive;
int pass_bake_differential;
- int pad;
- /* viewport rendering options */
- int display_pass_stride;
- int display_pass_components;
- int display_divide_pass_stride;
- int use_display_exposure;
- int use_display_pass_alpha;
+ int use_approximate_shadow_catcher;
- int pad4, pad5, pad6;
+ int pad1, pad2, pad3;
} KernelFilm;
static_assert_align(KernelFilm, 16);
+typedef struct KernelFilmConvert {
+ int pass_offset;
+ int pass_stride;
+
+ int pass_use_exposure;
+ int pass_use_filter;
+
+ int pass_divide;
+ int pass_indirect;
+
+ int pass_combined;
+ int pass_sample_count;
+ int pass_adaptive_aux_buffer;
+ int pass_motion_weight;
+ int pass_shadow_catcher;
+ int pass_shadow_catcher_sample_count;
+ int pass_shadow_catcher_matte;
+ int pass_background;
+
+ float scale;
+ float exposure;
+ float scale_exposure;
+
+ int use_approximate_shadow_catcher;
+ int use_approximate_shadow_catcher_background;
+ int show_active_pixels;
+
+ /* Number of components to write to. */
+ int num_components;
+
+ /* Number of floats per pixel. When zero, it is the same as `num_components`.
+ * NOTE: Ignored for half4 destinations. */
+ int pixel_stride;
+
+ int is_denoised;
+
+ /* Padding. */
+ int pad1;
+} KernelFilmConvert;
+static_assert_align(KernelFilmConvert, 16);
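
How these fields are consumed is not shown in this patch; as a rough sketch only, a pass accessor might apply them along these lines (the function and the exact semantics of scale_exposure are assumptions, only the field names come from the struct above):

/* Hypothetical use of KernelFilmConvert when reading one pass value. */
ccl_device float film_convert_example(const KernelFilmConvert *kfc, const float *buffer)
{
  float value = buffer[kfc->pass_offset];
  /* Assumed: scale_exposure == scale * exposure, precomputed on the host. */
  value *= kfc->pass_use_exposure ? kfc->scale_exposure : kfc->scale;
  return value;
}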
+
typedef struct KernelBackground {
/* only shader index */
int surface_shader;
@@ -1255,11 +1085,6 @@ typedef struct KernelBackground {
int transparent;
float transparent_roughness_squared_threshold;
- /* ambient occlusion */
- float ao_factor;
- float ao_distance;
- float ao_bounces_factor;
-
/* portal sampling */
float portal_weight;
int num_portals;
@@ -1277,13 +1102,15 @@ typedef struct KernelBackground {
int map_res_y;
int use_mis;
+
+ /* Padding */
+ int pad1, pad2, pad3;
} KernelBackground;
static_assert_align(KernelBackground, 16);
typedef struct KernelIntegrator {
/* emission */
int use_direct_light;
- int use_ambient_occlusion;
int num_distribution;
int num_all_lights;
float pdf_triangles;
@@ -1299,7 +1126,10 @@ typedef struct KernelIntegrator {
int max_transmission_bounce;
int max_volume_bounce;
+ /* AO bounces */
int ao_bounces;
+ float ao_bounces_distance;
+ float ao_bounces_factor;
/* transparent */
int transparent_min_bounce;
@@ -1318,39 +1148,20 @@ typedef struct KernelIntegrator {
float sample_clamp_direct;
float sample_clamp_indirect;
- /* branched path */
- int branched;
- int volume_decoupled;
- int diffuse_samples;
- int glossy_samples;
- int transmission_samples;
- int ao_samples;
- int mesh_light_samples;
- int subsurface_samples;
- int sample_all_lights_direct;
- int sample_all_lights_indirect;
-
/* mis */
int use_lamp_mis;
/* sampler */
int sampling_pattern;
- int aa_samples;
- int adaptive_min_samples;
- int adaptive_step;
- int adaptive_stop_per_sample;
- float adaptive_threshold;
/* volume render */
int use_volumes;
int volume_max_steps;
float volume_step_rate;
- int volume_samples;
-
- int start_sample;
- int max_closures;
+ int has_shadow_catcher;
+ /* padding */
int pad1, pad2;
} KernelIntegrator;
static_assert_align(KernelIntegrator, 16);
@@ -1401,14 +1212,19 @@ typedef struct KernelTables {
static_assert_align(KernelTables, 16);
typedef struct KernelBake {
+ int use;
int object_index;
int tri_offset;
- int type;
- int pass_filter;
+ int pad1;
} KernelBake;
static_assert_align(KernelBake, 16);
typedef struct KernelData {
+ uint kernel_features;
+ uint max_closures;
+ uint max_shaders;
+ uint pad;
+
KernelCamera cam;
KernelFilm film;
KernelBackground background;
@@ -1485,11 +1301,10 @@ typedef struct KernelLight {
int type;
float co[3];
int shader_id;
- int samples;
float max_bounces;
float random;
float strength[3];
- float pad1;
+ float pad1, pad2;
Transform tfm;
Transform itfm;
union {
@@ -1539,110 +1354,6 @@ typedef struct KernelShader {
} KernelShader;
static_assert_align(KernelShader, 16);
-/* Declarations required for split kernel */
-
-/* Macro for queues */
-/* Value marking queue's empty slot */
-#define QUEUE_EMPTY_SLOT -1
-
-/*
- * Queue 1 - Active rays
- * Queue 2 - Background queue
- * Queue 3 - Shadow ray cast kernel - AO
- * Queue 4 - Shadow ray cast kernel - direct lighting
- */
-
-/* Queue names */
-enum QueueNumber {
- /* All active rays and regenerated rays are enqueued here. */
- QUEUE_ACTIVE_AND_REGENERATED_RAYS = 0,
-
- /* All
- * 1. Background-hit rays,
- * 2. Rays that have exited path-iteration but need to update the output buffer,
- * 3. Rays to be regenerated
- * are enqueued here.
- */
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
-
- /* All rays for which a shadow ray should be cast to determine radiance
- * contribution for AO are enqueued here.
- */
- QUEUE_SHADOW_RAY_CAST_AO_RAYS,
-
- /* All rays for which a shadow ray should be cast to determine radiance
- * contributing for direct lighting are enqueued here.
- */
- QUEUE_SHADOW_RAY_CAST_DL_RAYS,
-
- /* Rays sorted according to shader->id */
- QUEUE_SHADER_SORTED_RAYS,
-
-#ifdef __BRANCHED_PATH__
- /* All rays moving to next iteration of the indirect loop for light */
- QUEUE_LIGHT_INDIRECT_ITER,
- /* Queue of all inactive rays. These are candidates for sharing work of indirect loops */
- QUEUE_INACTIVE_RAYS,
-# ifdef __VOLUME__
- /* All rays moving to next iteration of the indirect loop for volumes */
- QUEUE_VOLUME_INDIRECT_ITER,
-# endif
-# ifdef __SUBSURFACE__
- /* All rays moving to next iteration of the indirect loop for subsurface */
- QUEUE_SUBSURFACE_INDIRECT_ITER,
-# endif
-#endif /* __BRANCHED_PATH__ */
-
- NUM_QUEUES
-};
-
-/* We use RAY_STATE_MASK to get ray_state */
-#define RAY_STATE_MASK 0x0F
-#define RAY_FLAG_MASK 0xF0
-enum RayState {
- RAY_INVALID = 0,
- /* Denotes ray is actively involved in path-iteration. */
- RAY_ACTIVE,
- /* Denotes ray has completed processing all samples and is inactive. */
- RAY_INACTIVE,
- /* Denotes ray has exited path-iteration and needs to update output buffer. */
- RAY_UPDATE_BUFFER,
- /* Denotes ray needs to skip most surface shader work. */
- RAY_HAS_ONLY_VOLUME,
- /* Denotes ray has hit background */
- RAY_HIT_BACKGROUND,
- /* Denotes ray has to be regenerated */
- RAY_TO_REGENERATE,
- /* Denotes ray has been regenerated */
- RAY_REGENERATED,
- /* Denotes ray is moving to next iteration of the branched indirect loop */
- RAY_LIGHT_INDIRECT_NEXT_ITER,
- RAY_VOLUME_INDIRECT_NEXT_ITER,
- RAY_SUBSURFACE_INDIRECT_NEXT_ITER,
-
- /* Ray flags */
-
- /* Flags to denote that the ray is currently evaluating the branched indirect loop */
- RAY_BRANCHED_LIGHT_INDIRECT = (1 << 4),
- RAY_BRANCHED_VOLUME_INDIRECT = (1 << 5),
- RAY_BRANCHED_SUBSURFACE_INDIRECT = (1 << 6),
- RAY_BRANCHED_INDIRECT = (RAY_BRANCHED_LIGHT_INDIRECT | RAY_BRANCHED_VOLUME_INDIRECT |
- RAY_BRANCHED_SUBSURFACE_INDIRECT),
-
- /* Ray is evaluating an iteration of an indirect loop for another thread */
- RAY_BRANCHED_INDIRECT_SHARED = (1 << 7),
-};
-
-#define ASSIGN_RAY_STATE(ray_state, ray_index, state) \
- (ray_state[ray_index] = ((ray_state[ray_index] & RAY_FLAG_MASK) | state))
-#define IS_STATE(ray_state, ray_index, state) \
- ((ray_index) != QUEUE_EMPTY_SLOT && ((ray_state)[(ray_index)] & RAY_STATE_MASK) == (state))
-#define ADD_RAY_FLAG(ray_state, ray_index, flag) \
- (ray_state[ray_index] = (ray_state[ray_index] | flag))
-#define REMOVE_RAY_FLAG(ray_state, ray_index, flag) \
- (ray_state[ray_index] = (ray_state[ray_index] & (~flag)))
-#define IS_FLAG(ray_state, ray_index, flag) (ray_state[ray_index] & flag)
-
/* Patches */
#define PATCH_MAX_CONTROL_VERTS 16
@@ -1655,7 +1366,7 @@ enum RayState {
/* Work Tiles */
-typedef struct WorkTile {
+typedef struct KernelWorkTile {
uint x, y, w, h;
uint start_sample;
@@ -1664,13 +1375,172 @@ typedef struct WorkTile {
int offset;
uint stride;
- ccl_global float *buffer;
-} WorkTile;
+ /* Precalculated parameters used by init_from_camera kernel on GPU. */
+ int path_index_offset;
+ int work_size;
+} KernelWorkTile;
+
+/* Shader Evaluation.
+ *
+ * Position on a primitive of an object at which we want to evaluate the
+ * shader, e.g. for mesh displacement or a light importance map. */
+
+typedef struct KernelShaderEvalInput {
+ int object;
+ int prim;
+ float u, v;
+} KernelShaderEvalInput;
+static_assert_align(KernelShaderEvalInput, 16);
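
As a hedged illustration of how these inputs might be filled for displacement, consider this sketch; the helper and the (u, v) corner convention are illustrative only, not the actual shader-eval code path.

/* Illustrative: queue one shader evaluation per triangle corner. */
static void fill_displace_input(KernelShaderEvalInput *input, int object, int prim)
{
  /* Hypothetical barycentric corner convention. */
  const float corner_u[3] = {0.0f, 1.0f, 0.0f};
  const float corner_v[3] = {0.0f, 0.0f, 1.0f};
  for (int i = 0; i < 3; i++) {
    input[i].object = object;
    input[i].prim = prim;
    input[i].u = corner_u[i];
    input[i].v = corner_v[i];
  }
}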
/* Pre-computed sample table sizes for PMJ02 sampler. */
-#define NUM_PMJ_SAMPLES (64 * 64)
-#define NUM_PMJ_PATTERNS 48
+#define NUM_PMJ_DIVISIONS 32
+#define NUM_PMJ_SAMPLES ((NUM_PMJ_DIVISIONS) * (NUM_PMJ_DIVISIONS))
+#define NUM_PMJ_PATTERNS 1
-CCL_NAMESPACE_END
+/* Device kernels.
+ *
+ * Identifiers for kernels that can be executed in device queues.
+ *
+ * Implementation notes:
+ *
+ * If a kernel uses shared CUDA memory, `CUDADeviceQueue::enqueue` must be modified accordingly.
+ * The path iteration kernels are handled in `PathTraceWorkGPU::enqueue_path_iteration`. */
+
+typedef enum DeviceKernel {
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA = 0,
+ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+ DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW,
+ DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL,
+
+ DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
+ DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
+ DEVICE_KERNEL_INTEGRATOR_RESET,
+ DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS,
+
+ DEVICE_KERNEL_SHADER_EVAL_DISPLACE,
+ DEVICE_KERNEL_SHADER_EVAL_BACKGROUND,
+
+#define DECLARE_FILM_CONVERT_KERNEL(variant) \
+ DEVICE_KERNEL_FILM_CONVERT_##variant, DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA
+
+ DECLARE_FILM_CONVERT_KERNEL(DEPTH),
+ DECLARE_FILM_CONVERT_KERNEL(MIST),
+ DECLARE_FILM_CONVERT_KERNEL(SAMPLE_COUNT),
+ DECLARE_FILM_CONVERT_KERNEL(FLOAT),
+ DECLARE_FILM_CONVERT_KERNEL(LIGHT_PATH),
+ DECLARE_FILM_CONVERT_KERNEL(FLOAT3),
+ DECLARE_FILM_CONVERT_KERNEL(MOTION),
+ DECLARE_FILM_CONVERT_KERNEL(CRYPTOMATTE),
+ DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER),
+ DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER_MATTE_WITH_SHADOW),
+ DECLARE_FILM_CONVERT_KERNEL(COMBINED),
+ DECLARE_FILM_CONVERT_KERNEL(FLOAT4),
+
+#undef DECLARE_FILM_CONVERT_KERNEL
+
+ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK,
+ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X,
+ DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y,
+
+ DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS,
+ DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO,
+ DEVICE_KERNEL_FILTER_COLOR_PREPROCESS,
+ DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS,
+
+ DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS,
+
+ DEVICE_KERNEL_PREFIX_SUM,
+
+ DEVICE_KERNEL_NUM,
+} DeviceKernel;
+
+enum {
+ DEVICE_KERNEL_INTEGRATOR_NUM = DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL + 1,
+};
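
For reference, each DECLARE_FILM_CONVERT_KERNEL entry pastes a plain/half-float pair of enum values; for example:

/* DECLARE_FILM_CONVERT_KERNEL(DEPTH) expands to: */
DEVICE_KERNEL_FILM_CONVERT_DEPTH,
DEVICE_KERNEL_FILM_CONVERT_DEPTH_HALF_RGBA,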
+
+/* Kernel Features */
+
+enum KernelFeatureFlag : unsigned int {
+ /* Shader nodes. */
+ KERNEL_FEATURE_NODE_BSDF = (1U << 0U),
+ KERNEL_FEATURE_NODE_EMISSION = (1U << 1U),
+ KERNEL_FEATURE_NODE_VOLUME = (1U << 2U),
+ KERNEL_FEATURE_NODE_HAIR = (1U << 3U),
+ KERNEL_FEATURE_NODE_BUMP = (1U << 4U),
+ KERNEL_FEATURE_NODE_BUMP_STATE = (1U << 5U),
+ KERNEL_FEATURE_NODE_VORONOI_EXTRA = (1U << 6U),
+ KERNEL_FEATURE_NODE_RAYTRACE = (1U << 7U),
+
+ /* Use denoising kernels and output denoising passes. */
+ KERNEL_FEATURE_DENOISING = (1U << 8U),
+
+ /* Use path tracing kernels. */
+ KERNEL_FEATURE_PATH_TRACING = (1U << 9U),
-#endif /* __KERNEL_TYPES_H__ */
+ /* BVH/sampling kernel features. */
+ KERNEL_FEATURE_HAIR = (1U << 10U),
+ KERNEL_FEATURE_HAIR_THICK = (1U << 11U),
+ KERNEL_FEATURE_OBJECT_MOTION = (1U << 12U),
+ KERNEL_FEATURE_CAMERA_MOTION = (1U << 13U),
+
+ /* Denotes whether baking functionality is needed. */
+ KERNEL_FEATURE_BAKING = (1U << 14U),
+
+ /* Use subsurface scattering materials. */
+ KERNEL_FEATURE_SUBSURFACE = (1U << 15U),
+
+ /* Use volume materials. */
+ KERNEL_FEATURE_VOLUME = (1U << 16U),
+
+ /* Use OpenSubdiv patch evaluation. */
+ KERNEL_FEATURE_PATCH_EVALUATION = (1U << 17U),
+
+ /* Use transparent shadows. */
+ KERNEL_FEATURE_TRANSPARENT = (1U << 18U),
+
+ /* Use shadow catcher. */
+ KERNEL_FEATURE_SHADOW_CATCHER = (1U << 19U),
+
+ /* Per-uber shader usage flags. */
+ KERNEL_FEATURE_PRINCIPLED = (1U << 20U),
+
+ /* Light render passes. */
+ KERNEL_FEATURE_LIGHT_PASSES = (1U << 21U),
+
+ /* Shadow render pass. */
+ KERNEL_FEATURE_SHADOW_PASS = (1U << 22U),
+};
+
+/* Shader node feature mask, to specialize shader evaluation for kernels. */
+
+#define KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT \
+ (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+#define KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW \
+ (KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | \
+ KERNEL_FEATURE_NODE_HAIR | KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE | \
+ KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+#define KERNEL_FEATURE_NODE_MASK_SURFACE \
+ (KERNEL_FEATURE_NODE_MASK_SURFACE_SHADOW | KERNEL_FEATURE_NODE_RAYTRACE)
+#define KERNEL_FEATURE_NODE_MASK_VOLUME \
+ (KERNEL_FEATURE_NODE_EMISSION | KERNEL_FEATURE_NODE_VOLUME | KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+#define KERNEL_FEATURE_NODE_MASK_DISPLACEMENT \
+ (KERNEL_FEATURE_NODE_VORONOI_EXTRA | KERNEL_FEATURE_NODE_BUMP | KERNEL_FEATURE_NODE_BUMP_STATE)
+#define KERNEL_FEATURE_NODE_MASK_BUMP KERNEL_FEATURE_NODE_MASK_DISPLACEMENT
+
+#define KERNEL_NODES_FEATURE(feature) ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U)
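
A hedged usage sketch of this macro; the function and the way `node_feature_mask` reaches it are illustrative (in the kernel it is typically a compile-time parameter):

/* Illustrative only: skip emission work when the compiled kernel
 * variant does not include emission nodes. */
ccl_device void eval_node_sketch(const uint node_feature_mask)
{
  if (KERNEL_NODES_FEATURE(EMISSION)) {
    /* ... evaluate the emission node ... */
  }
}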
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
deleted file mode 100644
index f6b34be040e..00000000000
--- a/intern/cycles/kernel/kernel_volume.h
+++ /dev/null
@@ -1,1440 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Ignore paths that have volume throughput below this value, to avoid unnecessary work
- * and precision issues.
- * todo: this value could be tweaked or turned into a probability to avoid unnecessary
- * work in volumes and subsurface scattering. */
-#define VOLUME_THROUGHPUT_EPSILON 1e-6f
-
-/* Events for probabilistic scattering */
-
-typedef enum VolumeIntegrateResult {
- VOLUME_PATH_SCATTERED = 0,
- VOLUME_PATH_ATTENUATED = 1,
- VOLUME_PATH_MISSED = 2
-} VolumeIntegrateResult;
-
-/* Volume shader properties
- *
- * extinction coefficient = absorption coefficient + scattering coefficient
- * sigma_t = sigma_a + sigma_s */
-
-typedef struct VolumeShaderCoefficients {
- float3 sigma_t;
- float3 sigma_s;
- float3 emission;
-} VolumeShaderCoefficients;
-
-#ifdef __VOLUME__
-
-/* evaluate shader to get extinction coefficient at P */
-ccl_device_inline bool volume_shader_extinction_sample(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float3 P,
- float3 *extinction)
-{
- sd->P = P;
- shader_eval_volume(kg, sd, state, state->volume_stack, PATH_RAY_SHADOW);
-
- if (sd->flag & SD_EXTINCTION) {
- const float density = object_volume_density(kg, sd->object);
- *extinction = sd->closure_transparent_extinction * density;
- return true;
- }
- else {
- return false;
- }
-}
-
-/* evaluate shader to get absorption, scattering and emission at P */
-ccl_device_inline bool volume_shader_sample(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float3 P,
- VolumeShaderCoefficients *coeff)
-{
- sd->P = P;
- shader_eval_volume(kg, sd, state, state->volume_stack, state->flag);
-
- if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION)))
- return false;
-
- coeff->sigma_s = zero_float3();
- coeff->sigma_t = (sd->flag & SD_EXTINCTION) ? sd->closure_transparent_extinction : zero_float3();
- coeff->emission = (sd->flag & SD_EMISSION) ? sd->closure_emission_background : zero_float3();
-
- if (sd->flag & SD_SCATTER) {
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- if (CLOSURE_IS_VOLUME(sc->type))
- coeff->sigma_s += sc->weight;
- }
- }
-
- const float density = object_volume_density(kg, sd->object);
- coeff->sigma_s *= density;
- coeff->sigma_t *= density;
- coeff->emission *= density;
-
- return true;
-}
-
-#endif /* __VOLUME__ */
-
-ccl_device float3 volume_color_transmittance(float3 sigma, float t)
-{
- return exp3(-sigma * t);
-}
-
-ccl_device float kernel_volume_channel_get(float3 value, int channel)
-{
- return (channel == 0) ? value.x : ((channel == 1) ? value.y : value.z);
-}
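
volume_color_transmittance is the Beer-Lambert law T(t) = exp(-sigma * t) applied per RGB channel; a quick worked example with illustrative values:

/* With sigma = (0.5, 1.0, 2.0) and t = 1.0, the transmittance is roughly
 * (0.607, 0.368, 0.135): denser channels attenuate faster. */
float3 T = volume_color_transmittance(make_float3(0.5f, 1.0f, 2.0f), 1.0f);
float T_green = kernel_volume_channel_get(T, 1); /* ~0.368 = exp(-1) */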
-
-#ifdef __VOLUME__
-
-ccl_device float volume_stack_step_size(KernelGlobals *kg, ccl_addr_space VolumeStack *stack)
-{
- float step_size = FLT_MAX;
-
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
- int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags;
-
- bool heterogeneous = false;
-
- if (shader_flag & SD_HETEROGENEOUS_VOLUME) {
- heterogeneous = true;
- }
- else if (shader_flag & SD_NEED_VOLUME_ATTRIBUTES) {
- /* We want to render world or objects without any volume grids
- * as homogeneous, but can only verify this at run-time since other
- * heterogeneous volume objects may be using the same shader. */
- int object = stack[i].object;
- if (object != OBJECT_NONE) {
- int object_flag = kernel_tex_fetch(__object_flag, object);
- if (object_flag & SD_OBJECT_HAS_VOLUME_ATTRIBUTES) {
- heterogeneous = true;
- }
- }
- }
-
- if (heterogeneous) {
- float object_step_size = object_volume_step_size(kg, stack[i].object);
- object_step_size *= kernel_data.integrator.volume_step_rate;
- step_size = fminf(object_step_size, step_size);
- }
- }
-
- return step_size;
-}
-
-ccl_device int volume_stack_sampling_method(KernelGlobals *kg, VolumeStack *stack)
-{
- if (kernel_data.integrator.num_all_lights == 0)
- return 0;
-
- int method = -1;
-
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
- int shader_flag = kernel_tex_fetch(__shaders, (stack[i].shader & SHADER_MASK)).flags;
-
- if (shader_flag & SD_VOLUME_MIS) {
- return SD_VOLUME_MIS;
- }
- else if (shader_flag & SD_VOLUME_EQUIANGULAR) {
- if (method == 0)
- return SD_VOLUME_MIS;
-
- method = SD_VOLUME_EQUIANGULAR;
- }
- else {
- if (method == SD_VOLUME_EQUIANGULAR)
- return SD_VOLUME_MIS;
-
- method = 0;
- }
- }
-
- return method;
-}
-
-ccl_device_inline void kernel_volume_step_init(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- const float object_step_size,
- float t,
- float *step_size,
- float *step_shade_offset,
- float *steps_offset)
-{
- const int max_steps = kernel_data.integrator.volume_max_steps;
- float step = min(object_step_size, t);
-
- /* compute exact steps in advance for malloc */
- if (t > max_steps * step) {
- step = t / (float)max_steps;
- }
-
- *step_size = step;
-
- /* Perform shading at this offset within a step, to integrate
- * over the entire step segment. */
- *step_shade_offset = path_state_rng_1D_hash(kg, state, 0x1e31d8a4);
-
- /* Shift starting point of all segments by this random amount to avoid
- * banding artifacts from the volume bounding shape. */
- *steps_offset = path_state_rng_1D_hash(kg, state, 0x3d22c7b3);
-}
-
-/* Volume Shadows
- *
- * These functions are used to attenuate shadow rays to lights. Both absorption
- * and scattering will block light, represented by the extinction coefficient. */
-
-/* homogeneous volume: assume shader evaluation at the start gives
- * the extinction coefficient for the entire line segment */
-ccl_device void kernel_volume_shadow_homogeneous(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- float3 *throughput)
-{
- float3 sigma_t = zero_float3();
-
- if (volume_shader_extinction_sample(kg, sd, state, ray->P, &sigma_t))
- *throughput *= volume_color_transmittance(sigma_t, ray->t);
-}
-
-/* heterogeneous volume: integrate stepping through the volume until we
- * reach the end, get absorbed entirely, or run out of iterations */
-ccl_device void kernel_volume_shadow_heterogeneous(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- float3 *throughput,
- const float object_step_size)
-{
- float3 tp = *throughput;
-
- /* Prepare for stepping.
- * For shadows we do not offset all segments, since the starting point is
- * already a random distance inside the volume. Offsetting also appears to
- * create banding artifacts for unknown reasons. */
- int max_steps = kernel_data.integrator.volume_max_steps;
- float step_size, step_shade_offset, unused;
- kernel_volume_step_init(
- kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &unused);
- const float steps_offset = 1.0f;
-
- /* compute extinction at the start */
- float t = 0.0f;
-
- float3 sum = zero_float3();
-
- for (int i = 0; i < max_steps; i++) {
- /* advance to new position */
- float new_t = min(ray->t, (i + steps_offset) * step_size);
- float dt = new_t - t;
-
- float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
- float3 sigma_t = zero_float3();
-
- /* compute attenuation over segment */
- if (volume_shader_extinction_sample(kg, sd, state, new_P, &sigma_t)) {
- /* Compute expf() only for every Nth step, to save some calculations
- * because exp(a)*exp(b) = exp(a+b); also do a quick VOLUME_THROUGHPUT_EPSILON
- * check then. */
- sum += (-sigma_t * dt);
- if ((i & 0x07) == 0) { /* ToDo: Other interval? */
- tp = *throughput * exp3(sum);
-
- /* stop if nearly all light is blocked */
- if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
- tp.z < VOLUME_THROUGHPUT_EPSILON)
- break;
- }
- }
-
- /* stop if at the end of the volume */
- t = new_t;
- if (t == ray->t) {
- /* Update throughput in case we haven't done it above */
- tp = *throughput * exp3(sum);
- break;
- }
- }
-
- *throughput = tp;
-}
-
-/* get the volume attenuation over the line segment defined by the ray, with
- * the assumption that there are no surfaces blocking light between the endpoints */
-# if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
-ccl_device_inline void kernel_volume_shadow(KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 *throughput)
-{
- optixDirectCall<void>(1, kg, shadow_sd, state, ray, throughput);
-}
-extern "C" __device__ void __direct_callable__kernel_volume_shadow(
-# else
-ccl_device_noinline void kernel_volume_shadow(
-# endif
- KernelGlobals *kg,
- ShaderData *shadow_sd,
- ccl_addr_space PathState *state,
- Ray *ray,
- float3 *throughput)
-{
- shader_setup_from_volume(kg, shadow_sd, ray);
-
- float step_size = volume_stack_step_size(kg, state->volume_stack);
- if (step_size != FLT_MAX)
- kernel_volume_shadow_heterogeneous(kg, state, ray, shadow_sd, throughput, step_size);
- else
- kernel_volume_shadow_homogeneous(kg, state, ray, shadow_sd, throughput);
-}
-
-#endif /* __VOLUME__ */
-
-/* Equi-angular sampling as in:
- * "Importance Sampling Techniques for Path Tracing in Participating Media" */
-
-ccl_device float kernel_volume_equiangular_sample(Ray *ray, float3 light_P, float xi, float *pdf)
-{
- float t = ray->t;
-
- float delta = dot((light_P - ray->P), ray->D);
- float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
- if (UNLIKELY(D == 0.0f)) {
- *pdf = 0.0f;
- return 0.0f;
- }
- float theta_a = -atan2f(delta, D);
- float theta_b = atan2f(t - delta, D);
- float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a);
- if (UNLIKELY(theta_b == theta_a)) {
- *pdf = 0.0f;
- return 0.0f;
- }
- *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
-
- return min(t, delta + t_); /* min is only for float precision errors */
-}
-
-ccl_device float kernel_volume_equiangular_pdf(Ray *ray, float3 light_P, float sample_t)
-{
- float delta = dot((light_P - ray->P), ray->D);
- float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
- if (UNLIKELY(D == 0.0f)) {
- return 0.0f;
- }
-
- float t = ray->t;
- float t_ = sample_t - delta;
-
- float theta_a = -atan2f(delta, D);
- float theta_b = atan2f(t - delta, D);
- if (UNLIKELY(theta_b == theta_a)) {
- return 0.0f;
- }
-
- float pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));
-
- return pdf;
-}
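
Written out, the relations these two functions implement (a restatement of the code: \delta is the projection of light_P onto the ray, D the perpendicular distance, t' the signed offset from the closest point):

\theta_a = -\arctan(\delta / D), \qquad \theta_b = \arctan((t_{\max} - \delta) / D)

t'(\xi) = D \tan\big(\xi\,\theta_b + (1 - \xi)\,\theta_a\big), \qquad
\mathrm{pdf}(t') = \frac{D}{(\theta_b - \theta_a)\,(D^2 + t'^2)}, \qquad t = \delta + t'

This samples uniformly in the angle subtended at the light, hence "equi-angular".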
-
-/* Distance sampling */
-
-ccl_device float kernel_volume_distance_sample(
- float max_t, float3 sigma_t, int channel, float xi, float3 *transmittance, float3 *pdf)
-{
- /* xi is in [0, 1), so log(0) should never happen; division by zero is
- * avoided because sample_sigma_t > 0 when SD_SCATTER is set */
- float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
- float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
- float sample_transmittance = kernel_volume_channel_get(full_transmittance, channel);
-
- float sample_t = min(max_t, -logf(1.0f - xi * (1.0f - sample_transmittance)) / sample_sigma_t);
-
- *transmittance = volume_color_transmittance(sigma_t, sample_t);
- *pdf = safe_divide_color(sigma_t * *transmittance, one_float3() - full_transmittance);
-
- /* todo: optimization: when taken together with the hit/miss decision,
- * the full_transmittance cancels out and xi does not
- * need to be remapped */
-
- return sample_t;
-}
-
-ccl_device float3 kernel_volume_distance_pdf(float max_t, float3 sigma_t, float sample_t)
-{
- float3 full_transmittance = volume_color_transmittance(sigma_t, max_t);
- float3 transmittance = volume_color_transmittance(sigma_t, sample_t);
-
- return safe_divide_color(sigma_t * transmittance, one_float3() - full_transmittance);
-}
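
For a single channel, the two functions above implement the normalized free-flight density on [0, t_max] and its inverse-CDF sample (matching the code's use of the full transmittance):

p(t) = \frac{\sigma_t\, e^{-\sigma_t t}}{1 - e^{-\sigma_t t_{\max}}}, \quad 0 \le t \le t_{\max},
\qquad
t(\xi) = -\frac{1}{\sigma_t}\,\ln\!\big(1 - \xi\,(1 - e^{-\sigma_t t_{\max}})\big)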
-
-/* Emission */
-
-ccl_device float3 kernel_volume_emission_integrate(VolumeShaderCoefficients *coeff,
- int closure_flag,
- float3 transmittance,
- float t)
-{
- /* integral E * exp(-sigma_t * t) from 0 to t = E * (1 - exp(-sigma_t * t))/sigma_t
- * this goes to E * t as sigma_t goes to zero
- *
- * todo: we should use an epsilon to avoid precision issues near zero sigma_t */
- float3 emission = coeff->emission;
-
- if (closure_flag & SD_EXTINCTION) {
- float3 sigma_t = coeff->sigma_t;
-
- emission.x *= (sigma_t.x > 0.0f) ? (1.0f - transmittance.x) / sigma_t.x : t;
- emission.y *= (sigma_t.y > 0.0f) ? (1.0f - transmittance.y) / sigma_t.y : t;
- emission.z *= (sigma_t.z > 0.0f) ? (1.0f - transmittance.z) / sigma_t.z : t;
- }
- else
- emission *= t;
-
- return emission;
-}
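
The integral from the comment, written out; it matches the per-channel branches above:

\int_0^{t} E\, e^{-\sigma_t s}\, ds \;=\; E\, \frac{1 - e^{-\sigma_t t}}{\sigma_t}
\;\longrightarrow\; E\, t \quad (\sigma_t \to 0)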
-
-/* Volume Path */
-
-ccl_device int kernel_volume_sample_channel(float3 albedo,
- float3 throughput,
- float rand,
- float3 *pdf)
-{
- /* Sample color channel proportional to throughput and single scattering
- * albedo, to significantly reduce noise with many bounces, following:
- *
- * "Practical and Controllable Subsurface Scattering for Production Path
- * Tracing". Matt Jen-Yuan Chiang, Peter Kutz, Brent Burley. SIGGRAPH 2016. */
- float3 weights = fabs(throughput * albedo);
- float sum_weights = weights.x + weights.y + weights.z;
- float3 weights_pdf;
-
- if (sum_weights > 0.0f) {
- weights_pdf = weights / sum_weights;
- }
- else {
- weights_pdf = make_float3(1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f);
- }
-
- *pdf = weights_pdf;
-
- /* OpenCL does not support -> on float3, so don't use pdf->x. */
- if (rand < weights_pdf.x) {
- return 0;
- }
- else if (rand < weights_pdf.x + weights_pdf.y) {
- return 1;
- }
- else {
- return 2;
- }
-}
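
A hedged sketch of how callers combine this with the per-channel transmittance; the wrapper function is illustrative, the integrators below do the equivalent inline:

/* Illustrative: single-sample MIS over channels with the balance
 * heuristic; the effective pdf is sum_c channel_pdf[c] * T[c]. */
ccl_device float demo_channel_pdf(float3 albedo, float3 throughput, float rphase, float3 transmittance)
{
  float3 channel_pdf;
  (void)kernel_volume_sample_channel(albedo, throughput, rphase, &channel_pdf);
  return dot(channel_pdf, transmittance);
}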
-
-#ifdef __VOLUME__
-
-/* homogeneous volume: assume shader evaluation at the start gives
- * the volume shading coefficient for the entire line segment */
-ccl_device VolumeIntegrateResult
-kernel_volume_integrate_homogeneous(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- PathRadiance *L,
- ccl_addr_space float3 *throughput,
- bool probalistic_scatter)
-{
- VolumeShaderCoefficients coeff ccl_optional_struct_init;
-
- if (!volume_shader_sample(kg, sd, state, ray->P, &coeff))
- return VOLUME_PATH_MISSED;
-
- int closure_flag = sd->flag;
- float t = ray->t;
- float3 new_tp;
-
-# ifdef __VOLUME_SCATTER__
- /* randomly scatter, and if we do, t is shortened */
- if (closure_flag & SD_SCATTER) {
- /* Sample channel, use MIS with balance heuristic. */
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(albedo, *throughput, rphase, &channel_pdf);
-
- /* decide if we will hit or miss */
- bool scatter = true;
- float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
-
- if (probalistic_scatter) {
- float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel);
- float sample_transmittance = expf(-sample_sigma_t * t);
-
- if (1.0f - xi >= sample_transmittance) {
- scatter = true;
-
- /* rescale random number so we can reuse it */
- xi = 1.0f - (1.0f - xi - sample_transmittance) / (1.0f - sample_transmittance);
- }
- else
- scatter = false;
- }
-
- if (scatter) {
- /* scattering */
- float3 pdf;
- float3 transmittance;
- float sample_t;
-
- /* distance sampling */
- sample_t = kernel_volume_distance_sample(
- ray->t, coeff.sigma_t, channel, xi, &transmittance, &pdf);
-
- /* modify pdf for hit/miss decision */
- if (probalistic_scatter)
- pdf *= one_float3() - volume_color_transmittance(coeff.sigma_t, t);
-
- new_tp = *throughput * coeff.sigma_s * transmittance / dot(channel_pdf, pdf);
- t = sample_t;
- }
- else {
- /* no scattering */
- float3 transmittance = volume_color_transmittance(coeff.sigma_t, t);
- float pdf = dot(channel_pdf, transmittance);
- new_tp = *throughput * transmittance / pdf;
- }
- }
- else
-# endif
- if (closure_flag & SD_EXTINCTION) {
- /* absorption only, no sampling needed */
- float3 transmittance = volume_color_transmittance(coeff.sigma_t, t);
- new_tp = *throughput * transmittance;
- }
- else {
- new_tp = *throughput;
- }
-
- /* integrate emission attenuated by extinction */
- if (L && (closure_flag & SD_EMISSION)) {
- float3 transmittance = volume_color_transmittance(coeff.sigma_t, ray->t);
- float3 emission = kernel_volume_emission_integrate(
- &coeff, closure_flag, transmittance, ray->t);
- path_radiance_accum_emission(kg, L, state, *throughput, emission);
- }
-
- /* modify throughput */
- if (closure_flag & SD_EXTINCTION) {
- *throughput = new_tp;
-
- /* prepare to scatter to new direction */
- if (t < ray->t) {
- /* adjust throughput and move to new location */
- sd->P = ray->P + t * ray->D;
-
- return VOLUME_PATH_SCATTERED;
- }
- }
-
- return VOLUME_PATH_ATTENUATED;
-}
-
-/* heterogeneous volume distance sampling: integrate stepping through the
- * volume until we reach the end, get absorbed entirely, or run out of
- * iterations. This probabilistically scatters or transmits through the
- * volume, for path tracing where we don't want to branch. */
-ccl_device VolumeIntegrateResult
-kernel_volume_integrate_heterogeneous_distance(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- Ray *ray,
- ShaderData *sd,
- PathRadiance *L,
- ccl_addr_space float3 *throughput,
- const float object_step_size)
-{
- float3 tp = *throughput;
-
- /* Prepare for stepping.
- * Using a different step offset for the first step avoids banding artifacts. */
- int max_steps = kernel_data.integrator.volume_max_steps;
- float step_size, step_shade_offset, steps_offset;
- kernel_volume_step_init(
- kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &steps_offset);
-
- /* compute coefficients at the start */
- float t = 0.0f;
- float3 accum_transmittance = one_float3();
-
- /* Pick a random color channel; we use the Veach one-sample
- * model with the balance heuristic for the channels. */
- float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
- float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
- bool has_scatter = false;
-
- for (int i = 0; i < max_steps; i++) {
- /* advance to new position */
- float new_t = min(ray->t, (i + steps_offset) * step_size);
- float dt = new_t - t;
-
- float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
- VolumeShaderCoefficients coeff ccl_optional_struct_init;
-
- /* compute segment */
- if (volume_shader_sample(kg, sd, state, new_P, &coeff)) {
- int closure_flag = sd->flag;
- float3 new_tp;
- float3 transmittance;
- bool scatter = false;
-
- /* distance sampling */
-# ifdef __VOLUME_SCATTER__
- if ((closure_flag & SD_SCATTER) || (has_scatter && (closure_flag & SD_EXTINCTION))) {
- has_scatter = true;
-
- /* Sample channel, use MIS with balance heuristic. */
- float3 albedo = safe_divide_color(coeff.sigma_s, coeff.sigma_t);
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(albedo, tp, rphase, &channel_pdf);
-
- /* compute transmittance over full step */
- transmittance = volume_color_transmittance(coeff.sigma_t, dt);
-
- /* decide if we will scatter or continue */
- float sample_transmittance = kernel_volume_channel_get(transmittance, channel);
-
- if (1.0f - xi >= sample_transmittance) {
- /* compute sampling distance */
- float sample_sigma_t = kernel_volume_channel_get(coeff.sigma_t, channel);
- float new_dt = -logf(1.0f - xi) / sample_sigma_t;
- new_t = t + new_dt;
-
- /* transmittance and pdf */
- float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
- float3 pdf = coeff.sigma_t * new_transmittance;
-
- /* throughput */
- new_tp = tp * coeff.sigma_s * new_transmittance / dot(channel_pdf, pdf);
- scatter = true;
- }
- else {
- /* throughput */
- float pdf = dot(channel_pdf, transmittance);
- new_tp = tp * transmittance / pdf;
-
- /* remap xi so we can reuse it and keep things stratified */
- xi = 1.0f - (1.0f - xi) / sample_transmittance;
- }
- }
- else
-# endif
- if (closure_flag & SD_EXTINCTION) {
- /* absorption only, no sampling needed */
- transmittance = volume_color_transmittance(coeff.sigma_t, dt);
- new_tp = tp * transmittance;
- }
- else {
- transmittance = zero_float3();
- new_tp = tp;
- }
-
- /* integrate emission attenuated by absorption */
- if (L && (closure_flag & SD_EMISSION)) {
- float3 emission = kernel_volume_emission_integrate(
- &coeff, closure_flag, transmittance, dt);
- path_radiance_accum_emission(kg, L, state, tp, emission);
- }
-
- /* modify throughput */
- if (closure_flag & SD_EXTINCTION) {
- tp = new_tp;
-
- /* stop if nearly all light blocked */
- if (tp.x < VOLUME_THROUGHPUT_EPSILON && tp.y < VOLUME_THROUGHPUT_EPSILON &&
- tp.z < VOLUME_THROUGHPUT_EPSILON) {
- tp = zero_float3();
- break;
- }
- }
-
- /* prepare to scatter to new direction */
- if (scatter) {
- /* adjust throughput and move to new location */
- sd->P = ray->P + new_t * ray->D;
- *throughput = tp;
-
- return VOLUME_PATH_SCATTERED;
- }
- else {
- /* accumulate transmittance */
- accum_transmittance *= transmittance;
- }
- }
-
- /* stop if at the end of the volume */
- t = new_t;
- if (t == ray->t)
- break;
- }
-
- *throughput = tp;
-
- return VOLUME_PATH_ATTENUATED;
-}
-
-/* get the volume attenuation and emission over the line segment defined by
- * the ray, with the assumption that there are no surfaces blocking light
- * between the endpoints. distance sampling is used to decide if we will
- * scatter or not. */
-ccl_device_noinline_cpu VolumeIntegrateResult
-kernel_volume_integrate(KernelGlobals *kg,
- ccl_addr_space PathState *state,
- ShaderData *sd,
- Ray *ray,
- PathRadiance *L,
- ccl_addr_space float3 *throughput,
- float step_size)
-{
- shader_setup_from_volume(kg, sd, ray);
-
- if (step_size != FLT_MAX)
- return kernel_volume_integrate_heterogeneous_distance(
- kg, state, ray, sd, L, throughput, step_size);
- else
- return kernel_volume_integrate_homogeneous(kg, state, ray, sd, L, throughput, true);
-}
-
-# ifndef __SPLIT_KERNEL__
-/* Decoupled Volume Sampling
- *
- * VolumeSegment is list of coefficients and transmittance stored at all steps
- * through a volume. This can then later be used for decoupled sampling as in:
- * "Importance Sampling Techniques for Path Tracing in Participating Media"
- *
- * On the GPU this is only supported (but currently not enabled)
- * for homogeneous volumes (1 step), due to no support for malloc/free
- * and excessive stack usage with a fixed-size array. */
-
-typedef struct VolumeStep {
- float3 sigma_s; /* scatter coefficient */
- float3 sigma_t; /* extinction coefficient */
- float3 accum_transmittance; /* accumulated transmittance including this step */
- float3 cdf_distance; /* cumulative density function for distance sampling */
- float t; /* distance at end of this step */
- float shade_t; /* jittered distance where shading was done in step */
- int closure_flag; /* shader evaluation closure flags */
-} VolumeStep;
-
-typedef struct VolumeSegment {
- VolumeStep stack_step; /* stack storage for homogeneous step, to avoid malloc */
- VolumeStep *steps; /* recorded steps */
- int numsteps; /* number of steps */
- int closure_flag; /* accumulated closure flags from all steps */
-
- float3 accum_emission; /* accumulated emission at end of segment */
- float3 accum_transmittance; /* accumulated transmittance at end of segment */
- float3 accum_albedo; /* accumulated average albedo over segment */
-
- int sampling_method; /* volume sampling method */
-} VolumeSegment;
-
-/* Record volume steps to the end of the volume.
- *
- * It would be nice if we could record only up to the point where we need to
- * scatter, but the entire segment is needed when always scattering, rather
- * than probabilistically hitting or missing the volume. If we don't know the
- * transmittance at the end of the volume, we can't generate stratified
- * distance samples up to that transmittance. */
-# ifdef __VOLUME_DECOUPLED__
-ccl_device void kernel_volume_decoupled_record(KernelGlobals *kg,
- PathState *state,
- Ray *ray,
- ShaderData *sd,
- VolumeSegment *segment,
- const float object_step_size)
-{
- /* prepare for volume stepping */
- int max_steps;
- float step_size, step_shade_offset, steps_offset;
-
- if (object_step_size != FLT_MAX) {
- max_steps = kernel_data.integrator.volume_max_steps;
- kernel_volume_step_init(
- kg, state, object_step_size, ray->t, &step_size, &step_shade_offset, &steps_offset);
-
-# ifdef __KERNEL_CPU__
- /* NOTE: For branched path tracing it's possible for direct and
- * indirect light integration to both have volume segments allocated.
- * We detect this using an index into the pre-allocated memory. Currently
- * we only support two segments allocated at a time; if more are needed,
- * some modifications to KernelGlobals will be required.
- *
- * This restricts decoupled record to stack-like usage: a subsequent
- * decoupled record call must free its memory before its caller frees
- * its own.
- */
- const int index = kg->decoupled_volume_steps_index;
- assert(index < sizeof(kg->decoupled_volume_steps) / sizeof(*kg->decoupled_volume_steps));
- if (kg->decoupled_volume_steps[index] == NULL) {
- kg->decoupled_volume_steps[index] = (VolumeStep *)malloc(sizeof(VolumeStep) * max_steps);
- }
- segment->steps = kg->decoupled_volume_steps[index];
- ++kg->decoupled_volume_steps_index;
-# else
- segment->steps = (VolumeStep *)malloc(sizeof(VolumeStep) * max_steps);
-# endif
- }
- else {
- max_steps = 1;
- step_size = ray->t;
- step_shade_offset = 0.0f;
- steps_offset = 1.0f;
- segment->steps = &segment->stack_step;
- }
-
- /* init accumulation variables */
- float3 accum_emission = zero_float3();
- float3 accum_transmittance = one_float3();
- float3 accum_albedo = zero_float3();
- float3 cdf_distance = zero_float3();
- float t = 0.0f;
-
- segment->numsteps = 0;
- segment->closure_flag = 0;
- bool is_last_step_empty = false;
-
- VolumeStep *step = segment->steps;
-
- for (int i = 0; i < max_steps; i++, step++) {
- /* advance to new position */
- float new_t = min(ray->t, (i + steps_offset) * step_size);
- float dt = new_t - t;
-
- float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
- VolumeShaderCoefficients coeff ccl_optional_struct_init;
-
- /* compute segment */
- if (volume_shader_sample(kg, sd, state, new_P, &coeff)) {
- int closure_flag = sd->flag;
- float3 sigma_t = coeff.sigma_t;
-
- /* compute average albedo for channel sampling */
- if (closure_flag & SD_SCATTER) {
- accum_albedo += (dt / ray->t) * safe_divide_color(coeff.sigma_s, sigma_t);
- }
-
- /* compute accumulated transmittance */
- float3 transmittance = volume_color_transmittance(sigma_t, dt);
-
- /* compute emission attenuated by absorption */
- if (closure_flag & SD_EMISSION) {
- float3 emission = kernel_volume_emission_integrate(
- &coeff, closure_flag, transmittance, dt);
- accum_emission += accum_transmittance * emission;
- }
-
- accum_transmittance *= transmittance;
-
- /* compute pdf for distance sampling */
- float3 pdf_distance = dt * accum_transmittance * coeff.sigma_s;
- cdf_distance = cdf_distance + pdf_distance;
-
- /* write step data */
- step->sigma_t = sigma_t;
- step->sigma_s = coeff.sigma_s;
- step->closure_flag = closure_flag;
-
- segment->closure_flag |= closure_flag;
-
- is_last_step_empty = false;
- segment->numsteps++;
- }
- else {
- if (is_last_step_empty) {
- /* consecutive empty step, merge */
- step--;
- }
- else {
- /* store empty step */
- step->sigma_t = zero_float3();
- step->sigma_s = zero_float3();
- step->closure_flag = 0;
-
- segment->numsteps++;
- is_last_step_empty = true;
- }
- }
-
- step->accum_transmittance = accum_transmittance;
- step->cdf_distance = cdf_distance;
- step->t = new_t;
- step->shade_t = t + dt * step_shade_offset;
-
- /* stop if at the end of the volume */
- t = new_t;
- if (t == ray->t)
- break;
-
- /* stop if nearly all light blocked */
- if (accum_transmittance.x < VOLUME_THROUGHPUT_EPSILON &&
- accum_transmittance.y < VOLUME_THROUGHPUT_EPSILON &&
- accum_transmittance.z < VOLUME_THROUGHPUT_EPSILON)
- break;
- }
-
- /* store total emission and transmittance */
- segment->accum_emission = accum_emission;
- segment->accum_transmittance = accum_transmittance;
- segment->accum_albedo = accum_albedo;
-
- /* normalize cumulative density function for distance sampling */
- VolumeStep *last_step = segment->steps + segment->numsteps - 1;
-
- if (!is_zero(last_step->cdf_distance)) {
- VolumeStep *step = &segment->steps[0];
- int numsteps = segment->numsteps;
- float3 inv_cdf_distance_sum = safe_invert_color(last_step->cdf_distance);
-
- for (int i = 0; i < numsteps; i++, step++)
- step->cdf_distance *= inv_cdf_distance_sum;
- }
-}
-
-ccl_device void kernel_volume_decoupled_free(KernelGlobals *kg, VolumeSegment *segment)
-{
- if (segment->steps != &segment->stack_step) {
-# ifdef __KERNEL_CPU__
- /* NOTE: We only allow freeing the last allocated segment.
- * No arbitrary order of alloc/free is supported.
- */
- assert(kg->decoupled_volume_steps_index > 0);
- assert(segment->steps == kg->decoupled_volume_steps[kg->decoupled_volume_steps_index - 1]);
- --kg->decoupled_volume_steps_index;
-# else
- free(segment->steps);
-# endif
- }
-}
-# endif /* __VOLUME_DECOUPLED__ */
-
-/* Scattering for homogeneous and heterogeneous volumes, using decoupled ray
- * marching.
- *
- * The function is expected to return VOLUME_PATH_SCATTERED when probalistic_scatter is false */
-ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(KernelGlobals *kg,
- PathState *state,
- Ray *ray,
- ShaderData *sd,
- float3 *throughput,
- float rphase,
- float rscatter,
- const VolumeSegment *segment,
- const float3 *light_P,
- bool probalistic_scatter)
-{
- kernel_assert(segment->closure_flag & SD_SCATTER);
-
- /* Sample color channel, use MIS with balance heuristic. */
- float3 channel_pdf;
- int channel = kernel_volume_sample_channel(
- segment->accum_albedo, *throughput, rphase, &channel_pdf);
-
- float xi = rscatter;
-
- /* probabilistic scattering decision based on transmittance */
- if (probalistic_scatter) {
- float sample_transmittance = kernel_volume_channel_get(segment->accum_transmittance, channel);
-
- if (1.0f - xi >= sample_transmittance) {
- /* rescale random number so we can reuse it */
- xi = 1.0f - (1.0f - xi - sample_transmittance) / (1.0f - sample_transmittance);
- }
- else {
- *throughput /= sample_transmittance;
- return VOLUME_PATH_MISSED;
- }
- }
-
- VolumeStep *step;
- float3 transmittance;
- float pdf, sample_t;
- float mis_weight = 1.0f;
- bool distance_sample = true;
- bool use_mis = false;
-
- if (segment->sampling_method && light_P) {
- if (segment->sampling_method == SD_VOLUME_MIS) {
- /* multiple importance sample: randomly pick between
- * equiangular and distance sampling strategy */
- if (xi < 0.5f) {
- xi *= 2.0f;
- }
- else {
- xi = (xi - 0.5f) * 2.0f;
- distance_sample = false;
- }
-
- use_mis = true;
- }
- else {
- /* only equiangular sampling */
- distance_sample = false;
- }
- }
-
- /* distance sampling */
- if (distance_sample) {
- /* find step in cdf */
- step = segment->steps;
-
- float prev_t = 0.0f;
- float3 step_pdf_distance = one_float3();
-
- if (segment->numsteps > 1) {
- float prev_cdf = 0.0f;
- float step_cdf = 1.0f;
- float3 prev_cdf_distance = zero_float3();
-
- for (int i = 0;; i++, step++) {
- /* todo: optimize using binary search */
- step_cdf = kernel_volume_channel_get(step->cdf_distance, channel);
-
- if (xi < step_cdf || i == segment->numsteps - 1)
- break;
-
- prev_cdf = step_cdf;
- prev_t = step->t;
- prev_cdf_distance = step->cdf_distance;
- }
-
- /* remap xi so we can reuse it */
- xi = (xi - prev_cdf) / (step_cdf - prev_cdf);
-
- /* pdf for picking step */
- step_pdf_distance = step->cdf_distance - prev_cdf_distance;
- }
-
- /* determine range in which we will sample */
- float step_t = step->t - prev_t;
-
- /* sample distance and compute transmittance */
- float3 distance_pdf;
- sample_t = prev_t + kernel_volume_distance_sample(
- step_t, step->sigma_t, channel, xi, &transmittance, &distance_pdf);
-
- /* modify pdf for hit/miss decision */
- if (probalistic_scatter)
- distance_pdf *= one_float3() - segment->accum_transmittance;
-
- pdf = dot(channel_pdf, distance_pdf * step_pdf_distance);
-
- /* multiple importance sampling */
- if (use_mis) {
- float equi_pdf = kernel_volume_equiangular_pdf(ray, *light_P, sample_t);
- mis_weight = 2.0f * power_heuristic(pdf, equi_pdf);
- }
- }
- /* equi-angular sampling */
- else {
- /* sample distance */
- sample_t = kernel_volume_equiangular_sample(ray, *light_P, xi, &pdf);
-
- /* find step in which sampled distance is located */
- step = segment->steps;
-
- float prev_t = 0.0f;
- float3 step_pdf_distance = one_float3();
-
- if (segment->numsteps > 1) {
- float3 prev_cdf_distance = zero_float3();
-
- int numsteps = segment->numsteps;
- int high = numsteps - 1;
- int low = 0;
- int mid;
-
- while (low < high) {
- mid = (low + high) >> 1;
-
- if (sample_t < step[mid].t)
- high = mid;
- else if (sample_t >= step[mid + 1].t)
- low = mid + 1;
- else {
- /* found our interval in step[mid] .. step[mid+1] */
- prev_t = step[mid].t;
- prev_cdf_distance = step[mid].cdf_distance;
- step += mid + 1;
- break;
- }
- }
-
- if (low >= numsteps - 1) {
- prev_t = step[numsteps - 1].t;
- prev_cdf_distance = step[numsteps - 1].cdf_distance;
- step += numsteps - 1;
- }
-
- /* pdf for picking step with distance sampling */
- step_pdf_distance = step->cdf_distance - prev_cdf_distance;
- }
-
- /* determine range in which we will sample */
- float step_t = step->t - prev_t;
- float step_sample_t = sample_t - prev_t;
-
- /* compute transmittance */
- transmittance = volume_color_transmittance(step->sigma_t, step_sample_t);
-
- /* multiple importance sampling */
- if (use_mis) {
- float3 distance_pdf3 = kernel_volume_distance_pdf(step_t, step->sigma_t, step_sample_t);
- float distance_pdf = dot(channel_pdf, distance_pdf3 * step_pdf_distance);
- mis_weight = 2.0f * power_heuristic(pdf, distance_pdf);
- }
- }
- if (sample_t < 0.0f || pdf == 0.0f) {
- return VOLUME_PATH_MISSED;
- }
-
- /* compute transmittance up to this step */
- if (step != segment->steps)
- transmittance *= (step - 1)->accum_transmittance;
-
- /* modify throughput */
- *throughput *= step->sigma_s * transmittance * (mis_weight / pdf);
-
- /* evaluate shader to create closures at shading point */
- if (segment->numsteps > 1) {
- sd->P = ray->P + step->shade_t * ray->D;
-
- VolumeShaderCoefficients coeff;
- volume_shader_sample(kg, sd, state, sd->P, &coeff);
- }
-
- /* move to new position */
- sd->P = ray->P + sample_t * ray->D;
-
- return VOLUME_PATH_SCATTERED;
-}
-# endif /* __SPLIT_KERNEL__ */
-
-/* decide if we need to use decoupled or not */
-ccl_device bool kernel_volume_use_decoupled(KernelGlobals *kg,
- bool heterogeneous,
- bool direct,
- int sampling_method)
-{
- /* decoupled ray marching for heterogeneous volumes is not supported on the
- * GPU, which also means equiangular and multiple importance sampling are not
- * supported in that case */
- if (!kernel_data.integrator.volume_decoupled)
- return false;
-
-# ifdef __KERNEL_GPU__
- if (heterogeneous)
- return false;
-# endif
-
- /* equiangular and multiple importance sampling only implemented for decoupled */
- if (sampling_method != 0)
- return true;
-
- /* when sampling all lights, use decoupled ray marching; reusing shader
- * evaluations is typically faster in that case */
- if (direct)
- return kernel_data.integrator.sample_all_lights_direct;
- else
- return kernel_data.integrator.sample_all_lights_indirect;
-}
-
-/* Volume Stack
- *
- * This is an array of object/shader IDs that the current segment of the path
- * is inside of. */
-
-ccl_device void kernel_volume_stack_init(KernelGlobals *kg,
- ShaderData *stack_sd,
- ccl_addr_space const PathState *state,
- ccl_addr_space const Ray *ray,
- ccl_addr_space VolumeStack *stack)
-{
- /* A NULL ray happens in the baker; does it need proper initialization of
- * the camera in volume?
- */
- if (!kernel_data.cam.is_inside_volume || ray == NULL) {
- /* Camera is guaranteed to be in the air, only take background volume
- * into account in this case.
- */
- if (kernel_data.background.volume_shader != SHADER_NONE) {
- stack[0].shader = kernel_data.background.volume_shader;
- stack[0].object = PRIM_NONE;
- stack[1].shader = SHADER_NONE;
- }
- else {
- stack[0].shader = SHADER_NONE;
- }
- return;
- }
-
- kernel_assert(state->flag & PATH_RAY_CAMERA);
-
- Ray volume_ray = *ray;
- volume_ray.t = FLT_MAX;
-
- const uint visibility = (state->flag & PATH_RAY_ALL_VISIBILITY);
- int stack_index = 0, enclosed_index = 0;
-
-# ifdef __VOLUME_RECORD_ALL__
- Intersection hits[2 * VOLUME_STACK_SIZE + 1];
- uint num_hits = scene_intersect_volume_all(
- kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility);
- if (num_hits > 0) {
- int enclosed_volumes[VOLUME_STACK_SIZE];
- Intersection *isect = hits;
-
- qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-
- for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
- shader_setup_from_ray(kg, stack_sd, isect, &volume_ray);
- if (stack_sd->flag & SD_BACKFACING) {
- bool need_add = true;
- for (int i = 0; i < enclosed_index && need_add; ++i) {
- /* If the ray exited a volume it never entered, the
- * camera must be inside that volume.
- */
- if (enclosed_volumes[i] == stack_sd->object) {
- need_add = false;
- }
- }
- for (int i = 0; i < stack_index && need_add; ++i) {
- /* Don't add intersections twice. */
- if (stack[i].object == stack_sd->object) {
- need_add = false;
- break;
- }
- }
- if (need_add && stack_index < VOLUME_STACK_SIZE - 1) {
- stack[stack_index].object = stack_sd->object;
- stack[stack_index].shader = stack_sd->shader;
- ++stack_index;
- }
- }
- else {
- /* If the ray from the camera enters a volume, that volume shouldn't
- * be added to the stack on exit.
- */
- enclosed_volumes[enclosed_index++] = stack_sd->object;
- }
- }
- }
-# else
- int enclosed_volumes[VOLUME_STACK_SIZE];
- int step = 0;
-
- while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 &&
- step < 2 * VOLUME_STACK_SIZE) {
- Intersection isect;
- if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
- break;
- }
-
- shader_setup_from_ray(kg, stack_sd, &isect, &volume_ray);
- if (stack_sd->flag & SD_BACKFACING) {
- bool need_add = true;
- for (int i = 0; i < enclosed_index && need_add; ++i) {
- /* If the ray exited a volume it never entered, the
- * camera must be inside that volume.
- */
- if (enclosed_volumes[i] == stack_sd->object) {
- need_add = false;
- }
- }
- for (int i = 0; i < stack_index && need_add; ++i) {
- /* Don't add intersections twice. */
- if (stack[i].object == stack_sd->object) {
- need_add = false;
- break;
- }
- }
- if (need_add) {
- stack[stack_index].object = stack_sd->object;
- stack[stack_index].shader = stack_sd->shader;
- ++stack_index;
- }
- }
- else {
- /* If the ray from the camera enters a volume, that volume shouldn't
- * be added to the stack on exit.
- */
- enclosed_volumes[enclosed_index++] = stack_sd->object;
- }
-
- /* Move ray forward. */
- volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
- ++step;
- }
-# endif
- /* A stack_index of 0 means the quick checks outside of the kernel gave a
- * false positive; nothing to worry about, we have just wasted a few ticks
- * to conclude that the camera is in the air.
- *
- * In this case we do the same as above -- check whether the background
- * has a volume.
- */
- if (stack_index == 0 && kernel_data.background.volume_shader == SHADER_NONE) {
- stack[0].shader = kernel_data.background.volume_shader;
- stack[0].object = OBJECT_NONE;
- stack[1].shader = SHADER_NONE;
- }
- else {
- stack[stack_index].shader = SHADER_NONE;
- }
-}
-
-ccl_device void kernel_volume_stack_enter_exit(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space VolumeStack *stack)
-{
- /* todo: we should have some way for objects to indicate if they want the
- * world shader to work inside them. excluding it by default is problematic
- * because non-volume objects can't be assumed to be closed manifolds */
-
- if (!(sd->flag & SD_HAS_VOLUME))
- return;
-
- if (sd->flag & SD_BACKFACING) {
- /* exit volume object: remove from stack */
- for (int i = 0; stack[i].shader != SHADER_NONE; i++) {
- if (stack[i].object == sd->object) {
- /* shift back next stack entries */
- do {
- stack[i] = stack[i + 1];
- i++;
- } while (stack[i].shader != SHADER_NONE);
-
- return;
- }
- }
- }
- else {
- /* enter volume object: add to stack */
- int i;
-
- for (i = 0; stack[i].shader != SHADER_NONE; i++) {
- /* already in the stack? then we have nothing to do */
- if (stack[i].object == sd->object)
- return;
- }
-
- /* if we exceed the stack limit, ignore */
- if (i >= VOLUME_STACK_SIZE - 1)
- return;
-
- /* add to the end of the stack */
- stack[i].shader = sd->shader;
- stack[i].object = sd->object;
- stack[i + 1].shader = SHADER_NONE;
- }
-}
-
-# ifdef __SUBSURFACE__
-ccl_device void kernel_volume_stack_update_for_subsurface(KernelGlobals *kg,
- ShaderData *stack_sd,
- Ray *ray,
- ccl_addr_space VolumeStack *stack)
-{
- kernel_assert(kernel_data.integrator.use_volumes);
-
- Ray volume_ray = *ray;
-
-# ifdef __VOLUME_RECORD_ALL__
- Intersection hits[2 * VOLUME_STACK_SIZE + 1];
- uint num_hits = scene_intersect_volume_all(
- kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY);
- if (num_hits > 0) {
- Intersection *isect = hits;
-
- qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-
- for (uint hit = 0; hit < num_hits; ++hit, ++isect) {
- shader_setup_from_ray(kg, stack_sd, isect, &volume_ray);
- kernel_volume_stack_enter_exit(kg, stack_sd, stack);
- }
- }
-# else
- Intersection isect;
- int step = 0;
- float3 Pend = ray->P + ray->D * ray->t;
- while (step < 2 * VOLUME_STACK_SIZE &&
- scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
- shader_setup_from_ray(kg, stack_sd, &isect, &volume_ray);
- kernel_volume_stack_enter_exit(kg, stack_sd, stack);
-
- /* Move ray forward. */
- volume_ray.P = ray_offset(stack_sd->P, -stack_sd->Ng);
- if (volume_ray.t != FLT_MAX) {
- volume_ray.D = normalize_len(Pend - volume_ray.P, &volume_ray.t);
- }
- ++step;
- }
-# endif
-}
-# endif
-
-/* Clean stack after the last bounce.
- *
- * It is expected that all volumes are closed manifolds, so at the time when
- * the ray hits nothing (for example, on the last bounce which goes to the
- * environment) the only expected volume in the stack is the world's one. All
- * other volume entries should have been exited already.
- *
- * This isn't always true because of ray intersection precision issues, which
- * could leave an infinite non-world volume in the stack, causing render
- * artifacts.
- *
- * Use this function after the last bounce to get rid of all volumes apart
- * from the world's one, to avoid such artifacts.
- */
-ccl_device_inline void kernel_volume_clean_stack(KernelGlobals *kg,
- ccl_addr_space VolumeStack *volume_stack)
-{
- if (kernel_data.background.volume_shader != SHADER_NONE) {
- /* Keep the world's volume in stack. */
- volume_stack[1].shader = SHADER_NONE;
- }
- else {
- volume_stack[0].shader = SHADER_NONE;
- }
-}
-
-#endif /* __VOLUME__ */
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index d1602744f1d..fab0915c38e 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -14,8 +14,7 @@
* limitations under the License.
*/
-#ifndef __KERNEL_WORK_STEALING_H__
-#define __KERNEL_WORK_STEALING_H__
+#pragma once
CCL_NAMESPACE_BEGIN
@@ -24,21 +23,24 @@ CCL_NAMESPACE_BEGIN
*/
/* Map global work index to tile, pixel X/Y and sample. */
-ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
+ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
uint global_work_index,
ccl_private uint *x,
ccl_private uint *y,
ccl_private uint *sample)
{
-#ifdef __KERNEL_CUDA__
- /* Keeping threads for the same pixel together improves performance on CUDA. */
- uint sample_offset = global_work_index % tile->num_samples;
- uint pixel_offset = global_work_index / tile->num_samples;
-#else /* __KERNEL_CUDA__ */
+#if 0
+ /* Keep threads for the same sample together. */
uint tile_pixels = tile->w * tile->h;
uint sample_offset = global_work_index / tile_pixels;
uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#endif /* __KERNEL_CUDA__ */
+#else
+ /* Keep threads for the same pixel together.
+ * Appears to improve performance by a few percent on CUDA and OptiX. */
+ uint sample_offset = global_work_index % tile->num_samples;
+ uint pixel_offset = global_work_index / tile->num_samples;
+#endif
+
uint y_offset = pixel_offset / tile->w;
uint x_offset = pixel_offset - y_offset * tile->w;
@@ -47,71 +49,4 @@ ccl_device_inline void get_work_pixel(ccl_global const WorkTile *tile,
*sample = tile->start_sample + sample_offset;
}
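
A worked example of the pixel-major mapping now used (values illustrative):

/* For a tile with num_samples = 4 and w = 8, global_work_index = 10 gives:
 *   sample_offset = 10 % 4 = 2
 *   pixel_offset  = 10 / 4 = 2
 *   y_offset = 2 / 8 = 0, x_offset = 2
 * so threads 8..11 all shade pixel (2, 0), covering samples 0..3. */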
-#ifdef __KERNEL_OPENCL__
-# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
-#endif
-
-#ifdef __SPLIT_KERNEL__
-/* Returns true if there is work */
-ccl_device bool get_next_work_item(KernelGlobals *kg,
- ccl_global uint *work_pools,
- uint total_work_size,
- uint ray_index,
- ccl_private uint *global_work_index)
-{
- /* With a small amount of work there may be more threads than work due to
- * rounding up of the global size; stop such threads immediately. */
- if (ray_index >= total_work_size) {
- return false;
- }
-
- /* Increase atomic work index counter in pool. */
- uint pool = ray_index / WORK_POOL_SIZE;
- uint work_index = atomic_fetch_and_inc_uint32(&work_pools[pool]);
-
- /* Map per-pool work index to a global work index. */
- uint global_size = ccl_global_size(0) * ccl_global_size(1);
- kernel_assert(global_size % WORK_POOL_SIZE == 0);
- kernel_assert(ray_index < global_size);
-
- *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + (pool * WORK_POOL_SIZE) +
- (work_index % WORK_POOL_SIZE);
-
- /* Test if all work for this pool is done. */
- return (*global_work_index < total_work_size);
-}
-
-ccl_device bool get_next_work(KernelGlobals *kg,
- ccl_global uint *work_pools,
- uint total_work_size,
- uint ray_index,
- ccl_private uint *global_work_index)
-{
- bool got_work = false;
- if (kernel_data.film.pass_adaptive_aux_buffer) {
- do {
- got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
- if (got_work) {
- ccl_global WorkTile *tile = &kernel_split_params.tile;
- uint x, y, sample;
- get_work_pixel(tile, *global_work_index, &x, &y, &sample);
- uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- ccl_global float4 *aux = (ccl_global float4 *)(buffer +
- kernel_data.film.pass_adaptive_aux_buffer);
- if ((*aux).w == 0.0f) {
- break;
- }
- }
- } while (got_work);
- }
- else {
- got_work = get_next_work_item(kg, work_pools, total_work_size, ray_index, global_work_index);
- }
- return got_work;
-}
-#endif
-
CCL_NAMESPACE_END
-
-#endif /* __KERNEL_WORK_STEALING_H__ */
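Note: the branch kept above is the pixel-major mapping, where consecutive work indices cover all samples of one pixel before moving to the next. A minimal standalone sketch of that index math, using a hypothetical Tile struct rather than the real KernelWorkTile:

    #include <cassert>

    /* Hypothetical stand-in for KernelWorkTile. */
    struct Tile {
      unsigned w, h, num_samples, start_sample;
    };

    /* Pixel-major mapping, mirroring the kept branch of get_work_pixel(). */
    static void work_pixel(const Tile &t, unsigned i,
                           unsigned *x, unsigned *y, unsigned *sample)
    {
      unsigned sample_offset = i % t.num_samples;
      unsigned pixel_offset = i / t.num_samples;
      *y = pixel_offset / t.w;
      *x = pixel_offset - *y * t.w;
      *sample = t.start_sample + sample_offset;
    }

    int main()
    {
      Tile t = {4, 2, 8, 0};
      unsigned x, y, s;
      work_pixel(t, 9, &x, &y, &s);
      assert(x == 1 && y == 0 && s == 1); /* index 9 -> pixel (1, 0), sample 1 */
      return 0;
    }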
diff --git a/intern/cycles/kernel/kernel_write_passes.h b/intern/cycles/kernel/kernel_write_passes.h
index 410218d91d4..9d379495629 100644
--- a/intern/cycles/kernel/kernel_write_passes.h
+++ b/intern/cycles/kernel/kernel_write_passes.h
@@ -14,23 +14,25 @@
* limitations under the License.
*/
-#if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
+#pragma once
+
+#ifdef __KERNEL_GPU__
# define __ATOMIC_PASS_WRITE__
#endif
CCL_NAMESPACE_BEGIN
-ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
+ccl_device_inline void kernel_write_pass_float(ccl_global float *ccl_restrict buffer, float value)
{
- ccl_global float *buf = buffer;
#ifdef __ATOMIC_PASS_WRITE__
- atomic_add_and_fetch_float(buf, value);
+ atomic_add_and_fetch_float(buffer, value);
#else
- *buf += value;
+ *buffer += value;
#endif
}
-ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3 value)
+ccl_device_inline void kernel_write_pass_float3(ccl_global float *ccl_restrict buffer,
+ float3 value)
{
#ifdef __ATOMIC_PASS_WRITE__
ccl_global float *buf_x = buffer + 0;
@@ -41,12 +43,14 @@ ccl_device_inline void kernel_write_pass_float3(ccl_global float *buffer, float3
atomic_add_and_fetch_float(buf_y, value.y);
atomic_add_and_fetch_float(buf_z, value.z);
#else
- ccl_global float3 *buf = (ccl_global float3 *)buffer;
- *buf += value;
+ buffer[0] += value.x;
+ buffer[1] += value.y;
+ buffer[2] += value.z;
#endif
}
-ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4 value)
+ccl_device_inline void kernel_write_pass_float4(ccl_global float *ccl_restrict buffer,
+ float4 value)
{
#ifdef __ATOMIC_PASS_WRITE__
ccl_global float *buf_x = buffer + 0;
@@ -59,37 +63,26 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, float4
atomic_add_and_fetch_float(buf_z, value.z);
atomic_add_and_fetch_float(buf_w, value.w);
#else
- ccl_global float4 *buf = (ccl_global float4 *)buffer;
- *buf += value;
+ buffer[0] += value.x;
+ buffer[1] += value.y;
+ buffer[2] += value.z;
+ buffer[3] += value.w;
#endif
}
-#ifdef __DENOISING_FEATURES__
-ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, float value)
+ccl_device_inline float kernel_read_pass_float(ccl_global float *ccl_restrict buffer)
{
- kernel_write_pass_float(buffer, value);
-
- /* The online one-pass variance update that's used for the megakernel can't easily be implemented
- * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
- kernel_write_pass_float(buffer + 1, value * value);
+ return *buffer;
}
-# ifdef __ATOMIC_PASS_WRITE__
-# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
-# else
-ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, float3 value)
+ccl_device_inline float3 kernel_read_pass_float3(ccl_global float *ccl_restrict buffer)
{
- buffer[0] += value.x;
- buffer[1] += value.y;
- buffer[2] += value.z;
+ return make_float3(buffer[0], buffer[1], buffer[2]);
}
-# endif
-ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, float3 value)
+ccl_device_inline float4 kernel_read_pass_float4(ccl_global float *ccl_restrict buffer)
{
- kernel_write_pass_float3_unaligned(buffer, value);
- kernel_write_pass_float3_unaligned(buffer + 3, value * value);
+ return make_float4(buffer[0], buffer[1], buffer[2], buffer[3]);
}
-#endif /* __DENOISING_FEATURES__ */
CCL_NAMESPACE_END
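For context on the __ATOMIC_PASS_WRITE__ path: an atomic float add is commonly built from a compare-and-swap loop on the value's bit pattern. A hedged sketch of that idea in portable C++ (the real atomic_add_and_fetch_float lives in Cycles' util headers and may differ):

    #include <atomic>
    #include <cstdint>
    #include <cstring>

    /* Atomically add `value` to the float whose bits are stored in `*slot`. */
    static float atomic_add_float(std::atomic<uint32_t> *slot, float value)
    {
      uint32_t old_bits = slot->load();
      for (;;) {
        float old_f, new_f;
        std::memcpy(&old_f, &old_bits, sizeof(old_f));
        new_f = old_f + value;
        uint32_t new_bits;
        std::memcpy(&new_bits, &new_f, sizeof(new_bits));
        /* On failure, old_bits is refreshed with the currently stored value. */
        if (slot->compare_exchange_weak(old_bits, new_bits))
          return new_f;
      }
    }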
diff --git a/intern/cycles/kernel/kernels/cpu/filter.cpp b/intern/cycles/kernel/kernels/cpu/filter.cpp
deleted file mode 100644
index 145a6b6ac40..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CPU kernel entry points */
-
-/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
- * one with SSE2 intrinsics.
- */
-#if defined(__x86_64__) || defined(_M_X64)
-# define __KERNEL_SSE2__
-#endif
-
-/* When building the kernel for the native machine, detect kernel features from
- * the flags set by the compiler.
- */
-#ifdef WITH_KERNEL_NATIVE
-# ifdef __SSE2__
-# ifndef __KERNEL_SSE2__
-# define __KERNEL_SSE2__
-# endif
-# endif
-# ifdef __SSE3__
-# define __KERNEL_SSE3__
-# endif
-# ifdef __SSSE3__
-# define __KERNEL_SSSE3__
-# endif
-# ifdef __SSE4_1__
-# define __KERNEL_SSE41__
-# endif
-# ifdef __AVX__
-# define __KERNEL_SSE__
-# define __KERNEL_AVX__
-# endif
-# ifdef __AVX2__
-# define __KERNEL_SSE__
-# define __KERNEL_AVX2__
-# endif
-#endif
-
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__)
-/* do nothing */
-#endif
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
deleted file mode 100644
index 012daba62d8..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_avx.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without them for other CPUs. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp b/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
deleted file mode 100644
index 16351a7f949..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without them for other CPUs. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
deleted file mode 100644
index 1423b182ab8..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common declaration part of all CPU kernels. */
-
-void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
- TileInfo *tile_info,
- int x,
- int y,
- float *unfilteredA,
- float *unfilteredB,
- float *sampleV,
- float *sampleVV,
- float *bufferV,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
- TileInfo *tile_info,
- int m_offset,
- int v_offset,
- int x,
- int y,
- float *mean,
- float *variance,
- float scale,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
- int x,
- int y,
- int *buffer_params,
- float *from,
- float *buffer,
- int out_offset,
- int *prefilter_rect);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x,
- int y,
- ccl_global float *image,
- ccl_global float *variance,
- ccl_global float *depth,
- ccl_global float *output,
- int *rect,
- int pass_stride);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(
- int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer,
- TileInfo *tiles,
- int x,
- int y,
- int storage_ofs,
- float *transform,
- int *rank,
- int *rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- int radius,
- float pca_threshold);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
- int dy,
- float *weight_image,
- float *variance_image,
- float *scale_image,
- float *difference_image,
- int *rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(
- float *difference_image, float *out_image, int *rect, int stride, int f);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(
- float *difference_image, float *out_image, int *rect, int stride, int f);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
- int dy,
- float *difference_image,
- float *image,
- float *temp_image,
- float *out_image,
- float *accum_image,
- int *rect,
- int channel_offset,
- int stride,
- int f);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
- int dy,
- int t,
- float *difference_image,
- float *buffer,
- float *transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *rect,
- int *filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
- float *accum_image,
- int *rect,
- int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
- int y,
- int storage_ofs,
- float *buffer,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *buffer_params,
- int sample);
-
-#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
deleted file mode 100644
index 3d4cb87e104..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common implementation part of all CPU kernels.
- *
- * The idea is that each particular .cpp file sets the needed optimization flags and
- * simply includes this file, without worrying about copying the actual implementation over.
- */
-
-#include "kernel/kernel_compat_cpu.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-#ifdef KERNEL_STUB
-# define STUB_ASSERT(arch, name) \
- assert(!(#name " kernel stub for architecture " #arch " was called!"))
-#endif
-
-CCL_NAMESPACE_BEGIN
-
-/* Denoise filter */
-
-void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
- TileInfo *tile_info,
- int x,
- int y,
- float *unfilteredA,
- float *unfilteredB,
- float *sampleVariance,
- float *sampleVarianceV,
- float *bufferVariance,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
-#else
- kernel_filter_divide_shadow(sample,
- tile_info,
- x,
- y,
- unfilteredA,
- unfilteredB,
- sampleVariance,
- sampleVarianceV,
- bufferVariance,
- load_int4(prefilter_rect),
- buffer_pass_stride,
- buffer_denoising_offset);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
- TileInfo *tile_info,
- int m_offset,
- int v_offset,
- int x,
- int y,
- float *mean,
- float *variance,
- float scale,
- int *prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
-#else
- kernel_filter_get_feature(sample,
- tile_info,
- m_offset,
- v_offset,
- x,
- y,
- mean,
- variance,
- scale,
- load_int4(prefilter_rect),
- buffer_pass_stride,
- buffer_denoising_offset);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_write_feature)(int sample,
- int x,
- int y,
- int *buffer_params,
- float *from,
- float *buffer,
- int out_offset,
- int *prefilter_rect)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_write_feature);
-#else
- kernel_filter_write_feature(
- sample, x, y, load_int4(buffer_params), from, buffer, out_offset, load_int4(prefilter_rect));
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x,
- int y,
- ccl_global float *image,
- ccl_global float *variance,
- ccl_global float *depth,
- ccl_global float *output,
- int *rect,
- int pass_stride)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers);
-#else
- kernel_filter_detect_outliers(
- x, y, image, variance, depth, output, load_int4(rect), pass_stride);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(
- int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
-#else
- kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float *buffer,
- TileInfo *tile_info,
- int x,
- int y,
- int storage_ofs,
- float *transform,
- int *rank,
- int *prefilter_rect,
- int pass_stride,
- int frame_stride,
- bool use_time,
- int radius,
- float pca_threshold)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
-#else
- rank += storage_ofs;
- transform += storage_ofs * TRANSFORM_SIZE;
- kernel_filter_construct_transform(buffer,
- tile_info,
- x,
- y,
- load_int4(prefilter_rect),
- pass_stride,
- frame_stride,
- use_time,
- transform,
- rank,
- radius,
- pca_threshold);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
- int dy,
- float *weight_image,
- float *variance_image,
- float *scale_image,
- float *difference_image,
- int *rect,
- int stride,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
-#else
- kernel_filter_nlm_calc_difference(dx,
- dy,
- weight_image,
- variance_image,
- scale_image,
- difference_image,
- load_int4(rect),
- stride,
- channel_offset,
- frame_offset,
- a,
- k_2);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(
- float *difference_image, float *out_image, int *rect, int stride, int f)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
-#else
- kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(
- float *difference_image, float *out_image, int *rect, int stride, int f)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
-#else
- kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
- int dy,
- float *difference_image,
- float *image,
- float *temp_image,
- float *out_image,
- float *accum_image,
- int *rect,
- int channel_offset,
- int stride,
- int f)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
-#else
- kernel_filter_nlm_update_output(dx,
- dy,
- difference_image,
- image,
- temp_image,
- out_image,
- accum_image,
- load_int4(rect),
- channel_offset,
- stride,
- f);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
- int dy,
- int t,
- float *difference_image,
- float *buffer,
- float *transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *rect,
- int *filter_window,
- int stride,
- int f,
- int pass_stride,
- int frame_offset,
- bool use_time)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
-#else
- kernel_filter_nlm_construct_gramian(dx,
- dy,
- t,
- difference_image,
- buffer,
- transform,
- rank,
- XtWX,
- XtWY,
- load_int4(rect),
- load_int4(filter_window),
- stride,
- f,
- pass_stride,
- frame_offset,
- use_time);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
- float *accum_image,
- int *rect,
- int stride)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
-#else
- kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride);
-#endif
-}
-
-void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
- int y,
- int storage_ofs,
- float *buffer,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int *buffer_params,
- int sample)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, filter_finalize);
-#else
- XtWX += storage_ofs * XTWX_SIZE;
- XtWY += storage_ofs * XTWY_SIZE;
- rank += storage_ofs;
- kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
-#endif
-}
-
-#undef KERNEL_STUB
-#undef STUB_ASSERT
-#undef KERNEL_ARCH
-
-CCL_NAMESPACE_END
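All of the per-ISA .cpp files deleted in this commit use the include trick the header comment describes: each translation unit defines KERNEL_ARCH and then includes the shared implementation header, so token pasting emits arch-suffixed symbols compiled under that unit's optimization flags. A self-contained sketch with hypothetical macro names:

    /* Normally kernel_sse2.cpp would do this before including the shared header. */
    #define KERNEL_ARCH cpu_sse2

    /* Shared implementation header (no include guard on purpose). */
    #define ARCH_CONCAT2(a, b) a##_##b
    #define ARCH_CONCAT(a, b) ARCH_CONCAT2(a, b)
    #define KERNEL_NAME(name) ARCH_CONCAT(kernel, ARCH_CONCAT(KERNEL_ARCH, name))

    void KERNEL_NAME(path_trace)() /* expands to kernel_cpu_sse2_path_trace() */
    {
      /* Body gets compiled with whatever flags this translation unit enables. */
    }

    #undef KERNEL_ARCH /* mirrors the #undef at the end of the real headers */

    int main()
    {
      kernel_cpu_sse2_path_trace();
      return 0;
    }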
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
deleted file mode 100644
index 75833d83648..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without them for other CPUs. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
deleted file mode 100644
index c998cd54d3a..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without them for other CPUs. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp b/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
deleted file mode 100644
index fc4ef1fca5b..00000000000
--- a/intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without them for other CPUs. */
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#include "kernel/filter/filter.h"
-#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/filter_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
deleted file mode 100644
index ea3103f12c3..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common declaration part of all CPU kernels. */
-
-void KERNEL_FUNCTION_FULL_NAME(path_trace)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride);
-
-void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
- uint4 *input,
- float4 *output,
- int type,
- int filter,
- int i,
- int offset,
- int sample);
-
-void KERNEL_FUNCTION_FULL_NAME(bake)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride);
-
-/* Split kernels */
-
-void KERNEL_FUNCTION_FULL_NAME(data_init)(KernelGlobals *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- int start_sample,
- int end_sample,
- int sx,
- int sy,
- int sw,
- int sh,
- int offset,
- int stride,
- ccl_global int *Queue_index,
- int queuesize,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
- ccl_global float *buffer);
-
-#define DECLARE_SPLIT_KERNEL_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * data);
-
-DECLARE_SPLIT_KERNEL_FUNCTION(path_init)
-DECLARE_SPLIT_KERNEL_FUNCTION(scene_intersect)
-DECLARE_SPLIT_KERNEL_FUNCTION(lamp_emission)
-DECLARE_SPLIT_KERNEL_FUNCTION(do_volume)
-DECLARE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
-DECLARE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DECLARE_SPLIT_KERNEL_FUNCTION(shader_setup)
-DECLARE_SPLIT_KERNEL_FUNCTION(shader_sort)
-DECLARE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
-DECLARE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
-DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
-DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DECLARE_SPLIT_KERNEL_FUNCTION(enqueue_inactive)
-DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
-DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
-DECLARE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
-
-#undef KERNEL_ARCH
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
deleted file mode 100644
index 51d6c23f72f..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Templated common implementation part of all CPU kernels.
- *
- * The idea is that each particular .cpp file sets the needed optimization flags and
- * simply includes this file, without worrying about copying the actual implementation over.
- */
-
-// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-
-#ifndef KERNEL_STUB
-# ifndef __SPLIT_KERNEL__
-# include "kernel/kernel_math.h"
-# include "kernel/kernel_types.h"
-
-# include "kernel/split/kernel_split_data.h"
-# include "kernel/kernel_globals.h"
-
-# include "kernel/kernel_color.h"
-# include "kernel/kernels/cpu/kernel_cpu_image.h"
-# include "kernel/kernel_film.h"
-# include "kernel/kernel_path.h"
-# include "kernel/kernel_path_branched.h"
-# include "kernel/kernel_bake.h"
-# else
-# include "kernel/split/kernel_split_common.h"
-
-# include "kernel/split/kernel_data_init.h"
-# include "kernel/split/kernel_path_init.h"
-# include "kernel/split/kernel_scene_intersect.h"
-# include "kernel/split/kernel_lamp_emission.h"
-# include "kernel/split/kernel_do_volume.h"
-# include "kernel/split/kernel_queue_enqueue.h"
-# include "kernel/split/kernel_indirect_background.h"
-# include "kernel/split/kernel_shader_setup.h"
-# include "kernel/split/kernel_shader_sort.h"
-# include "kernel/split/kernel_shader_eval.h"
-# include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-# include "kernel/split/kernel_subsurface_scatter.h"
-# include "kernel/split/kernel_direct_lighting.h"
-# include "kernel/split/kernel_shadow_blocked_ao.h"
-# include "kernel/split/kernel_shadow_blocked_dl.h"
-# include "kernel/split/kernel_enqueue_inactive.h"
-# include "kernel/split/kernel_next_iteration_setup.h"
-# include "kernel/split/kernel_indirect_subsurface.h"
-# include "kernel/split/kernel_buffer_update.h"
-# include "kernel/split/kernel_adaptive_stopping.h"
-# include "kernel/split/kernel_adaptive_filter_x.h"
-# include "kernel/split/kernel_adaptive_filter_y.h"
-# include "kernel/split/kernel_adaptive_adjust_samples.h"
-# endif /* __SPLIT_KERNEL__ */
-#else
-# define STUB_ASSERT(arch, name) \
- assert(!(#name " kernel stub for architecture " #arch " was called!"))
-
-# ifdef __SPLIT_KERNEL__
-# include "kernel/split/kernel_data_init.h"
-# endif /* __SPLIT_KERNEL__ */
-#endif /* KERNEL_STUB */
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-#ifndef __SPLIT_KERNEL__
-
-/* Path Tracing */
-
-void KERNEL_FUNCTION_FULL_NAME(path_trace)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, path_trace);
-# else
-# ifdef __BRANCHED_PATH__
- if (kernel_data.integrator.branched) {
- kernel_branched_path_trace(kg, buffer, sample, x, y, offset, stride);
- }
- else
-# endif
- {
- kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
- }
-# endif /* KERNEL_STUB */
-}
-
-/* Film */
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
-# else
- kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-# endif /* KERNEL_STUB */
-}
-
-void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
- uchar4 *rgba,
- float *buffer,
- float sample_scale,
- int x,
- int y,
- int offset,
- int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
-# else
- kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-# endif /* KERNEL_STUB */
-}
-
-/* Bake */
-
-void KERNEL_FUNCTION_FULL_NAME(bake)(
- KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, bake);
-# else
-# ifdef __BAKING__
- kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
-# endif
-# endif /* KERNEL_STUB */
-}
-
-/* Shader Evaluate */
-
-void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
- uint4 *input,
- float4 *output,
- int type,
- int filter,
- int i,
- int offset,
- int sample)
-{
-# ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, shader);
-# else
- if (type == SHADER_EVAL_DISPLACE) {
- kernel_displace_evaluate(kg, input, output, i);
- }
- else {
- kernel_background_evaluate(kg, input, output, i);
- }
-# endif /* KERNEL_STUB */
-}
-
-#else /* __SPLIT_KERNEL__ */
-
-/* Split Kernel Path Tracing */
-
-# ifdef KERNEL_STUB
-# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- STUB_ASSERT(KERNEL_ARCH, name); \
- }
-
-# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- STUB_ASSERT(KERNEL_ARCH, name); \
- }
-# else
-# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- kernel_##name(kg); \
- }
-
-# define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
- void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals * kg, KernelData * /*data*/) \
- { \
- ccl_local type locals; \
- kernel_##name(kg, &locals); \
- }
-# endif /* KERNEL_STUB */
-
-DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
-DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
-DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
-DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao,
- BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
-#endif /* __SPLIT_KERNEL__ */
-
-#undef KERNEL_STUB
-#undef STUB_ASSERT
-#undef KERNEL_ARCH
-
-CCL_NAMESPACE_END
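The split-kernel entry points above are produced by a declare/define macro pair: DECLARE_SPLIT_KERNEL_FUNCTION in the header emits prototypes, while DEFINE_SPLIT_KERNEL_FUNCTION here emits the bodies. A minimal hedged sketch of that pairing with simplified, hypothetical signatures:

    /* Simplified stand-ins for the real DECLARE/DEFINE macros. */
    #define DECLARE_KERNEL(name) void kernel_##name(int *state);
    #define DEFINE_KERNEL(name) \
      void kernel_##name(int *state) \
      { \
        *state += 1; \
      }

    DECLARE_KERNEL(path_init) /* header side: prototype only */
    DEFINE_KERNEL(path_init)  /* source side: the out-of-line definition */

    int main()
    {
      int state = 0;
      kernel_path_init(&state); /* stand-in body just bumps the counter */
      return state == 1 ? 0 : 1;
    }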
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
deleted file mode 100644
index 989f5e5aaa8..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CPU kernel entry points */
-
-/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
- * one with SSE2 intrinsics.
- */
-#if defined(__x86_64__) || defined(_M_X64)
-# define __KERNEL_SSE2__
-#endif
-
-#define __SPLIT_KERNEL__
-
-/* When building the kernel for the native machine, detect kernel features from
- * the flags set by the compiler.
- */
-#ifdef WITH_KERNEL_NATIVE
-# ifdef __SSE2__
-# ifndef __KERNEL_SSE2__
-# define __KERNEL_SSE2__
-# endif
-# endif
-# ifdef __SSE3__
-# define __KERNEL_SSE3__
-# endif
-# ifdef __SSSE3__
-# define __KERNEL_SSSE3__
-# endif
-# ifdef __SSE4_1__
-# define __KERNEL_SSE41__
-# endif
-# ifdef __AVX__
-# define __KERNEL_AVX__
-# endif
-# ifdef __AVX2__
-# define __KERNEL_SSE__
-# define __KERNEL_AVX2__
-# endif
-#endif
-
-/* quiet unused define warnings */
-#if defined(__KERNEL_SSE2__)
-/* do nothing */
-#endif
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
deleted file mode 100644
index 40e485d27c0..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without them for other CPUs. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_avx
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
deleted file mode 100644
index 8c44238470e..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_avx2.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright 2011-2014 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with AVX2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without them for other CPUs. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE__
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# define __KERNEL_AVX__
-# define __KERNEL_AVX2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_avx2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
deleted file mode 100644
index 7a3f218d5fc..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse2.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE2
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without them for other CPUs. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_sse2
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
deleted file mode 100644
index 1cab59e0ea0..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse3.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without them for other CPUs. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_sse3
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp b/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
deleted file mode 100644
index 637126d9d4c..00000000000
--- a/intern/cycles/kernel/kernels/cpu/kernel_split_sse41.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
- * optimization flags and nearly all functions inlined, while kernel.cpp
- * is compiled without them for other CPUs. */
-
-#define __SPLIT_KERNEL__
-
-#include "util/util_optimization.h"
-
-#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-# define KERNEL_STUB
-#else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
-# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
-# define __KERNEL_SSE2__
-# define __KERNEL_SSE3__
-# define __KERNEL_SSSE3__
-# define __KERNEL_SSE41__
-# endif
-#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
-
-#include "kernel/kernel.h"
-#define KERNEL_ARCH cpu_sse41
-#include "kernel/kernels/cpu/kernel_cpu_impl.h"
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
deleted file mode 100644
index 6c9642d1f03..00000000000
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#include "kernel_config.h"
-
-#include "kernel/kernel_compat_cuda.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-/* kernels */
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_copy_input(float *buffer,
- CCL_FILTER_TILE_INFO,
- int4 prefilter_rect,
- int buffer_pass_stride)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2);
- int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2);
- int itile = ytile * 3 + xtile;
- float *const in = ((float *)ccl_get_tile_buffer(itile)) +
- (tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride;
- buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) + (x - prefilter_rect.x)) * buffer_pass_stride;
- for (int i = 0; i < buffer_pass_stride; ++i)
- buffer[i] = in[i];
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int stride, int pass_stride, int3 pass_offset, int num_inputs, int num_samples)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sw && y < sh) {
- if (num_inputs > 0) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3;
- out[0] = clamp(in[0] / num_samples, 0.0f, 10000.0f);
- out[1] = clamp(in[1] / num_samples, 0.0f, 10000.0f);
- out[2] = clamp(in[2] / num_samples, 0.0f, 10000.0f);
- }
- if (num_inputs > 1) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3;
- out[0] = in[0] / num_samples;
- out[1] = in[1] / num_samples;
- out[2] = in[2] / num_samples;
- }
- if (num_inputs > 2) {
- float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float);
- float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3;
- out[0] = in[0] / num_samples;
- out[1] = in[1] / num_samples;
- out[2] = in[2] / num_samples;
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf, int ix, int iy, int iw, int ih, int sx, int sy, int sw, int sh, int offset, int stride, int pass_stride, int num_samples)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < sw && y < sh) {
- float *in = rgb + ((ix + x) + (iy + y) * iw) * 3;
- float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride;
- out[0] = in[0] * num_samples;
- out[1] = in[1] * num_samples;
- out[2] = in[2] * num_samples;
- }
-}
-
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_divide_shadow(int sample,
- CCL_FILTER_TILE_INFO,
- float *unfilteredA,
- float *unfilteredB,
- float *sampleVariance,
- float *sampleVarianceV,
- float *bufferVariance,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_divide_shadow(sample,
- tile_info,
- x, y,
- unfilteredA,
- unfilteredB,
- sampleVariance,
- sampleVarianceV,
- bufferVariance,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_get_feature(int sample,
- CCL_FILTER_TILE_INFO,
- int m_offset,
- int v_offset,
- float *mean,
- float *variance,
- float scale,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_get_feature(sample,
- tile_info,
- m_offset, v_offset,
- x, y,
- mean, variance,
- scale,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_write_feature(int sample,
- int4 buffer_params,
- int4 filter_area,
- float *from,
- float *buffer,
- int out_offset,
- int4 prefilter_rect)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- kernel_filter_write_feature(sample,
- x + filter_area.x,
- y + filter_area.y,
- buffer_params,
- from,
- buffer,
- out_offset,
- prefilter_rect);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_detect_outliers(float *image,
- float *variance,
- float *depth,
- float *output,
- int4 prefilter_rect,
- int pass_stride)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
-{
- int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
- int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
- CCL_FILTER_TILE_INFO,
- float *transform, int *rank,
- int4 filter_area, int4 rect,
- int radius, float pca_threshold,
- int pass_stride, int frame_stride,
- bool use_time)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- int *l_rank = rank + y*filter_area.z + x;
- float *l_transform = transform + y*filter_area.z + x;
- kernel_filter_construct_transform(buffer,
- tile_info,
- x + filter_area.x, y + filter_area.y,
- rect,
- pass_stride, frame_stride,
- use_time,
- l_transform, l_rank,
- radius, pca_threshold,
- filter_area.z*filter_area.w,
- threadIdx.y*blockDim.x + threadIdx.x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
- const float *ccl_restrict variance_image,
- const float *ccl_restrict scale_image,
- float *difference_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
- weight_image,
- variance_image,
- scale_image,
- difference_image + ofs,
- rect, stride,
- channel_offset,
- frame_offset,
- a, k_2);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
- float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_blur(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
- float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_weight(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
- const float *ccl_restrict image,
- float *out_image,
- float *accum_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int channel_offset,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
- difference_image + ofs,
- image,
- out_image,
- accum_image,
- rect,
- channel_offset,
- stride, f);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_normalize(float *out_image,
- const float *ccl_restrict accum_image,
- int w,
- int h,
- int stride)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < w && y < h) {
- kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_construct_gramian(int t,
- const float *ccl_restrict difference_image,
- const float *ccl_restrict buffer,
- float const* __restrict__ transform,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 filter_window,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f,
- int frame_offset,
- bool use_time)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
- kernel_filter_nlm_construct_gramian(co.x, co.y,
- co.z, co.w,
- t,
- difference_image + ofs,
- buffer,
- transform, rank,
- XtWX, XtWY,
- rect, filter_window,
- stride, f,
- pass_stride,
- frame_offset,
- use_time,
- threadIdx.y*blockDim.x + threadIdx.x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_finalize(float *buffer,
- int *rank,
- float *XtWX,
- float3 *XtWY,
- int4 filter_area,
- int4 buffer_params,
- int sample)
-{
- int x = blockDim.x*blockIdx.x + threadIdx.x;
- int y = blockDim.y*blockIdx.y + threadIdx.y;
- if(x < filter_area.z && y < filter_area.w) {
- int storage_ofs = y*filter_area.z+x;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
- kernel_filter_finalize(x, y, buffer, rank,
- filter_area.z*filter_area.w,
- XtWX, XtWY,
- buffer_params, sample);
- }
-}
-
-#endif
-
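For orientation, the NLM kernels deleted above form a short pipeline that the denoising driver runs once per shifted-neighbour offset: squared per-pixel differences, a box blur that turns them into patch distances, conversion to weights, weighted accumulation into the output, and a final normalization. A hedged sketch of that ordering follows; launch2d() is not a real API, only shorthand for a 2D CUDA launch, and the exact pass sequence (including repeated blur passes) lives in the driver code, not in this file:

    // Illustrative driver loop only; argument lists elided.
    for (int offset = 0; offset < num_neighbour_offsets; offset++) {
      launch2d(kernel_cuda_filter_nlm_calc_difference);  // per-pixel squared difference
      launch2d(kernel_cuda_filter_nlm_blur);             // box filter -> patch distance
      launch2d(kernel_cuda_filter_nlm_calc_weight);      // distance -> weight
      launch2d(kernel_cuda_filter_nlm_update_output);    // accumulate weighted pixels
    }
    launch2d(kernel_cuda_filter_nlm_normalize);          // divide by summed weights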
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
deleted file mode 100644
index cf62b6e781e..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#include "kernel/kernel_compat_cuda.h"
-#include "kernel_config.h"
-
-#include "util/util_atomic.h"
-
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/cuda/kernel_cuda_image.h"
-#include "kernel/kernel_film.h"
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-#include "kernel/kernel_bake.h"
-#include "kernel/kernel_work_stealing.h"
-#include "kernel/kernel_adaptive_sampling.h"
-
-/* kernels */
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- uint x, y, sample;
- KernelGlobals kg;
- if(thread_is_active) {
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
-
- if(kernel_data.film.cryptomatte_passes) {
- __syncthreads();
- if(thread_is_active) {
- kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
- }
-}
-
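Note the barrier placement in kernel_cuda_path_trace above: __syncthreads() sits outside the thread_is_active guard (the surrounding cryptomatte check is uniform across the block because kernel_data is constant), since every thread in a block must reach a barrier even when it has no pixel to shade; a barrier inside the divergent branch could hang the block. The same shape in miniature:

    // Safe barrier pattern, as used above: inactive threads still reach the barrier.
    bool thread_is_active = work_index < total_work_size;
    if (thread_is_active) {
      /* main shading work */
    }
    __syncthreads();  /* reached by every thread, active or not */
    if (thread_is_active) {
      /* post-pass (here: cryptomatte sorting) that needs the block synced */
    }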
-#ifdef __BRANCHED_PATH__
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
-kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- uint x, y, sample;
- KernelGlobals kg;
- if(thread_is_active) {
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
-
- if(kernel_data.film.cryptomatte_passes) {
- __syncthreads();
- if(thread_is_active) {
- kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
- }
-}
-#endif
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- KernelGlobals kg;
- if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) {
- uint x = tile->x + work_index % tile->w;
- uint y = tile->y + work_index / tile->w;
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- kernel_do_adaptive_stopping(&kg, buffer, sample);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint)
-{
- KernelGlobals kg;
- if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
- if(ccl_global_id(0) < tile->h) {
- int y = tile->y + ccl_global_id(0);
- kernel_do_adaptive_filter_x(&kg, y, tile);
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint)
-{
- KernelGlobals kg;
- if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) {
- if(ccl_global_id(0) < tile->w) {
- int x = tile->x + ccl_global_id(0);
- kernel_do_adaptive_filter_y(&kg, x, tile);
- }
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size)
-{
- if(kernel_data.film.pass_adaptive_aux_buffer) {
- int work_index = ccl_global_id(0);
- bool thread_is_active = work_index < total_work_size;
- KernelGlobals kg;
- if(thread_is_active) {
- uint x = tile->x + work_index % tile->w;
- uint y = tile->y + work_index / tile->w;
- int index = tile->offset + x + y * tile->stride;
- ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride;
- if(buffer[kernel_data.film.pass_sample_count] < 0.0f) {
- buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
- float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
- if(sample_multiplier != 1.0f) {
- kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier);
- }
- }
- else {
- kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f));
- }
- }
- }
-}
-
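The sign trick in kernel_cuda_adaptive_scale_samples above deserves spelling out: a negative pass_sample_count marks a pixel that adaptive sampling stopped early, storing the negated count at which it stopped. A worked example with illustrative values:

    // Pixel stopped at 64 samples while the frame ran to 256.
    float stored = -64.0f;            /* negative marks "stopped early" */
    int   sample = 256;
    if (stored < 0.0f) {
      float count = -stored;          /* recover the real count: 64     */
      float mult  = sample / count;   /* 256 / 64 = 4                   */
      /* kernel_adaptive_post_adjust multiplies the accumulated passes
       * by 4 so every pixel represents the same nominal sample count. */
    }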
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh) {
- kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh) {
- kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_displace(uint4 *input,
- float4 *output,
- int type,
- int sx,
- int sw,
- int offset,
- int sample)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-
- if(x < sx + sw) {
- KernelGlobals kg;
- kernel_displace_evaluate(&kg, input, output, x);
- }
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_background(uint4 *input,
- float4 *output,
- int type,
- int sx,
- int sw,
- int offset,
- int sample)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
-
- if(x < sx + sw) {
- KernelGlobals kg;
- kernel_background_evaluate(&kg, input, output, x);
- }
-}
-
-#ifdef __BAKING__
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_bake(WorkTile *tile, uint total_work_size)
-{
- int work_index = ccl_global_id(0);
-
- if(work_index < total_work_size) {
- uint x, y, sample;
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- KernelGlobals kg;
- kernel_bake_evaluate(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
- }
-}
-#endif
-
-#endif
-
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
deleted file mode 100644
index 2e47ce2de6c..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel_config.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* device data taken from CUDA occupancy calculator */
-
-/* 3.0 and 3.5 */
-#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.7 */
-#elif __CUDA_ARCH__ == 370
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.x, 6.x */
-#elif __CUDA_ARCH__ <= 699
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
- * registers */
-# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
-# define CUDA_KERNEL_MAX_REGISTERS 64
-# else
-# define CUDA_KERNEL_MAX_REGISTERS 48
-# endif
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 7.x, 8.x */
-#elif __CUDA_ARCH__ <= 899
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 64
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 72
-
-/* unknown architecture */
-#else
-# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
-
-/* For the split kernel, using all registers seems fastest for now, but this
- * is unlikely to be optimal once we resolve other bottlenecks. */
-
-#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
-
-/* Compute the number of threads per block and the minimum number of blocks
- * per multiprocessor, given the maximum number of registers per thread. */
-
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
- __launch_bounds__(threads_block_width *threads_block_width, \
- CUDA_MULTIPRESSOR_MAX_REGISTERS / \
- (threads_block_width * threads_block_width * thread_num_registers))
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-# error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS / \
- (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH * CUDA_KERNEL_MAX_REGISTERS) > \
- CUDA_MULTIPROCESSOR_MAX_BLOCKS
-# error "Maximum number of blocks per multiprocessor exceeded"
-#endif
-
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
-
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
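As a worked example of the launch-bounds arithmetic above, take the 5.x/6.x branch with CUDA older than 9.0: 65536 registers per multiprocessor, 16 x 16 = 256 threads per block, and 48 registers per kernel thread. CUDA_LAUNCH_BOUNDS(16, 48) then expands to:

    __launch_bounds__(16 * 16,                 /* 256 threads per block       */
                      65536 / (16 * 16 * 48))  /* 65536 / 12288 = 5 blocks/SM */

The integer division floors to 5 resident blocks per multiprocessor, comfortably under the 32-block cap verified by the sanity checks.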
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
deleted file mode 100644
index 95ad7599cf1..00000000000
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* CUDA split kernel entry points */
-
-#ifdef __CUDA_ARCH__
-
-#define __SPLIT_KERNEL__
-
-#include "kernel/kernel_compat_cuda.h"
-#include "kernel_config.h"
-
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_data_init.h"
-#include "kernel/split/kernel_path_init.h"
-#include "kernel/split/kernel_scene_intersect.h"
-#include "kernel/split/kernel_lamp_emission.h"
-#include "kernel/split/kernel_do_volume.h"
-#include "kernel/split/kernel_queue_enqueue.h"
-#include "kernel/split/kernel_indirect_background.h"
-#include "kernel/split/kernel_shader_setup.h"
-#include "kernel/split/kernel_shader_sort.h"
-#include "kernel/split/kernel_shader_eval.h"
-#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h"
-#include "kernel/split/kernel_subsurface_scatter.h"
-#include "kernel/split/kernel_direct_lighting.h"
-#include "kernel/split/kernel_shadow_blocked_ao.h"
-#include "kernel/split/kernel_shadow_blocked_dl.h"
-#include "kernel/split/kernel_enqueue_inactive.h"
-#include "kernel/split/kernel_next_iteration_setup.h"
-#include "kernel/split/kernel_indirect_subsurface.h"
-#include "kernel/split/kernel_buffer_update.h"
-#include "kernel/split/kernel_adaptive_stopping.h"
-#include "kernel/split/kernel_adaptive_filter_x.h"
-#include "kernel/split/kernel_adaptive_filter_y.h"
-#include "kernel/split/kernel_adaptive_adjust_samples.h"
-
-#include "kernel/kernel_film.h"
-
-/* kernels */
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size)
-{
- *size = split_data_buffer_size(NULL, num_threads);
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_path_trace_data_init(
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- int start_sample,
- int end_sample,
- int sx, int sy, int sw, int sh, int offset, int stride,
- ccl_global int *Queue_index,
- int queuesize,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pool_wgs,
- unsigned int num_samples,
- ccl_global float *buffer)
-{
- kernel_data_init(NULL,
- NULL,
- split_data_buffer,
- num_elements,
- ray_state,
- start_sample,
- end_sample,
- sx, sy, sw, sh, offset, stride,
- Queue_index,
- queuesize,
- use_queues_flag,
- work_pool_wgs,
- num_samples,
- buffer);
-}
-
-#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
- extern "C" __global__ void \
- CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
- kernel_cuda_##name() \
- { \
- kernel_##name(NULL); \
- }
-
-#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
- extern "C" __global__ void \
- CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \
- kernel_cuda_##name() \
- { \
- ccl_local type locals; \
- kernel_##name(NULL, &locals); \
- }
-
-DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
-DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
-DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
-DEFINE_SPLIT_KERNEL_FUNCTION(do_volume)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals)
-DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao)
-DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
-DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y)
-DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples)
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-extern "C" __global__ void
-CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
-{
- int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
- int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-#endif
-
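For readers tracing the split-kernel entry points: each DEFINE_SPLIT_KERNEL_FUNCTION(_LOCALS) line above stamps out a thin __global__ wrapper around the corresponding device function. Expanding the queue_enqueue instance by hand, mechanically from the macro definition above, gives:

    extern "C" __global__ void
    CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS)
    kernel_cuda_queue_enqueue()
    {
      ccl_local QueueEnqueueLocals locals;
      kernel_queue_enqueue(NULL, &locals);
    }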
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
deleted file mode 100644
index 996bc27f71b..00000000000
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ /dev/null
@@ -1,321 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* OpenCL kernel entry points */
-
-#include "kernel/kernel_compat_opencl.h"
-
-#include "kernel/filter/filter_kernel.h"
-
-/* kernels */
-
-__kernel void kernel_ocl_filter_divide_shadow(int sample,
- CCL_FILTER_TILE_INFO,
- ccl_global float *unfilteredA,
- ccl_global float *unfilteredB,
- ccl_global float *sampleVariance,
- ccl_global float *sampleVarianceV,
- ccl_global float *bufferVariance,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_divide_shadow(sample,
- CCL_FILTER_TILE_INFO_ARG,
- x, y,
- unfilteredA,
- unfilteredB,
- sampleVariance,
- sampleVarianceV,
- bufferVariance,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-__kernel void kernel_ocl_filter_get_feature(int sample,
- CCL_FILTER_TILE_INFO,
- int m_offset,
- int v_offset,
- ccl_global float *mean,
- ccl_global float *variance,
- float scale,
- int4 prefilter_rect,
- int buffer_pass_stride,
- int buffer_denoising_offset)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_get_feature(sample,
- CCL_FILTER_TILE_INFO_ARG,
- m_offset, v_offset,
- x, y,
- mean, variance,
- scale,
- prefilter_rect,
- buffer_pass_stride,
- buffer_denoising_offset);
- }
-}
-
-__kernel void kernel_ocl_filter_write_feature(int sample,
- int4 buffer_params,
- int4 filter_area,
- ccl_global float *from,
- ccl_global float *buffer,
- int out_offset,
- int4 prefilter_rect)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < filter_area.z && y < filter_area.w) {
- kernel_filter_write_feature(sample,
- x + filter_area.x,
- y + filter_area.y,
- buffer_params,
- from,
- buffer,
- out_offset,
- prefilter_rect);
- }
-}
-
-__kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
- ccl_global float *variance,
- ccl_global float *depth,
- ccl_global float *output,
- int4 prefilter_rect,
- int pass_stride)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
- }
-}
-
-__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
- ccl_global float *variance,
- ccl_global float *a,
- ccl_global float *b,
- int4 prefilter_rect,
- int r)
-{
- int x = prefilter_rect.x + get_global_id(0);
- int y = prefilter_rect.y + get_global_id(1);
- if(x < prefilter_rect.z && y < prefilter_rect.w) {
- kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
- }
-}
-
-__kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
- CCL_FILTER_TILE_INFO,
- ccl_global float *transform,
- ccl_global int *rank,
- int4 filter_area,
- int4 rect,
- int pass_stride,
- int frame_stride,
- char use_time,
- int radius,
- float pca_threshold)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < filter_area.z && y < filter_area.w) {
- ccl_global int *l_rank = rank + y*filter_area.z + x;
- ccl_global float *l_transform = transform + y*filter_area.z + x;
- kernel_filter_construct_transform(buffer,
- CCL_FILTER_TILE_INFO_ARG,
- x + filter_area.x, y + filter_area.y,
- rect,
- pass_stride, frame_stride,
- use_time,
- l_transform, l_rank,
- radius, pca_threshold,
- filter_area.z*filter_area.w,
- get_local_id(1)*get_local_size(0) + get_local_id(0));
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image,
- const ccl_global float *ccl_restrict variance_image,
- const ccl_global float *ccl_restrict scale_image,
- ccl_global float *difference_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int channel_offset,
- int frame_offset,
- float a,
- float k_2)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
- weight_image,
- variance_image,
- scale_image,
- difference_image + ofs,
- rect, stride,
- channel_offset,
- frame_offset,
- a, k_2);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image,
- ccl_global float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_blur(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image,
- ccl_global float *out_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_calc_weight(co.x, co.y,
- difference_image + ofs,
- out_image + ofs,
- rect, stride, f);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict image,
- ccl_global float *out_image,
- ccl_global float *accum_image,
- int w,
- int h,
- int stride,
- int pass_stride,
- int channel_offset,
- int r,
- int f)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
- kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
- difference_image + ofs,
- image,
- out_image,
- accum_image,
- rect,
- channel_offset,
- stride, f);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
- const ccl_global float *ccl_restrict accum_image,
- int w,
- int h,
- int stride)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < w && y < h) {
- kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
- }
-}
-
-__kernel void kernel_ocl_filter_nlm_construct_gramian(int t,
- const ccl_global float *ccl_restrict difference_image,
- const ccl_global float *ccl_restrict buffer,
- const ccl_global float *ccl_restrict transform,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 filter_window,
- int w,
- int h,
- int stride,
- int pass_stride,
- int r,
- int f,
- int frame_offset,
- char use_time)
-{
- int4 co, rect;
- int ofs;
- if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
- kernel_filter_nlm_construct_gramian(co.x, co.y,
- co.z, co.w,
- t,
- difference_image + ofs,
- buffer,
- transform, rank,
- XtWX, XtWY,
- rect, filter_window,
- stride, f,
- pass_stride,
- frame_offset,
- use_time,
- get_local_id(1)*get_local_size(0) + get_local_id(0));
- }
-}
-
-__kernel void kernel_ocl_filter_finalize(ccl_global float *buffer,
- ccl_global int *rank,
- ccl_global float *XtWX,
- ccl_global float3 *XtWY,
- int4 filter_area,
- int4 buffer_params,
- int sample)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- if(x < filter_area.z && y < filter_area.w) {
- int storage_ofs = y*filter_area.z+x;
- rank += storage_ofs;
- XtWX += storage_ofs;
- XtWY += storage_ofs;
- kernel_filter_finalize(x, y, buffer, rank,
- filter_area.z*filter_area.w,
- XtWX, XtWY,
- buffer_params, sample);
- }
-}
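The OpenCL filter kernels above are line-for-line ports of the CUDA ones in filter.cu deleted earlier in this commit; the systematic differences are the index built-ins, the address-space qualifiers, and the boolean parameter type. An informal correspondence:

    /* CUDA                                   OpenCL
     * blockDim.x*blockIdx.x + threadIdx.x    get_global_id(0)
     * blockDim.y*blockIdx.y + threadIdx.y    get_global_id(1)
     * threadIdx.y*blockDim.x + threadIdx.x   get_local_id(1)*get_local_size(0) + get_local_id(0)
     * (implicit global pointers)             ccl_global-qualified pointer arguments
     * bool use_time                          char use_time (bool is not a valid
     *                                        OpenCL kernel argument type)        */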
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
deleted file mode 100644
index ebdb99d4730..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_adjust_samples.h"
-
-#define KERNEL_NAME adaptive_adjust_samples
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
deleted file mode 100644
index 76d82d4184e..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_x.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_filter_x.h"
-
-#define KERNEL_NAME adaptive_filter_x
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
deleted file mode 100644
index 1e6d15ba0f2..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_filter_y.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_filter_y.h"
-
-#define KERNEL_NAME adaptive_filter_y
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl b/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
deleted file mode 100644
index 51de0059667..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_adaptive_stopping.cl
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_adaptive_stopping.h"
-
-#define KERNEL_NAME adaptive_stopping
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_background.cl b/intern/cycles/kernel/kernels/opencl/kernel_background.cl
deleted file mode 100644
index 0e600676e82..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_background.cl
+++ /dev/null
@@ -1,35 +0,0 @@
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/opencl/kernel_opencl_image.h"
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-
-#include "kernel/kernel_bake.h"
-
-__kernel void kernel_ocl_background(
- ccl_constant KernelData *data,
- ccl_global uint4 *input,
- ccl_global float4 *output,
-
- KERNEL_BUFFER_PARAMS,
-
- int type, int sx, int sw, int offset, int sample)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
-
- if(x < sx + sw) {
- kernel_background_evaluate(kg, input, output, x);
- }
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_bake.cl b/intern/cycles/kernel/kernels/opencl/kernel_bake.cl
deleted file mode 100644
index 7b81e387467..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_bake.cl
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/opencl/kernel_opencl_image.h"
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-
-#include "kernel/kernel_bake.h"
-
-__kernel void kernel_ocl_bake(
- ccl_constant KernelData *data,
- ccl_global float *buffer,
-
- KERNEL_BUFFER_PARAMS,
-
- int sx, int sy, int sw, int sh, int offset, int stride, int sample)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
- int y = sy + ccl_global_id(1);
-
- if(x < sx + sw && y < sy + sh) {
-#ifndef __NO_BAKING__
- kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride);
-#endif
- }
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_base.cl b/intern/cycles/kernel/kernels/opencl/kernel_base.cl
deleted file mode 100644
index 1c2d89e8a92..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_base.cl
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright 2011-2013 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/* OpenCL base kernel entry points */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-
-#include "kernel/kernel_film.h"
-
-
-__kernel void kernel_ocl_convert_to_byte(
- ccl_constant KernelData *data,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
-
- KERNEL_BUFFER_PARAMS,
-
- float sample_scale,
- int sx, int sy, int sw, int sh, int offset, int stride)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
- int y = sy + ccl_global_id(1);
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-__kernel void kernel_ocl_convert_to_half_float(
- ccl_constant KernelData *data,
- ccl_global uchar4 *rgba,
- ccl_global float *buffer,
-
- KERNEL_BUFFER_PARAMS,
-
- float sample_scale,
- int sx, int sy, int sw, int sh, int offset, int stride)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
- int y = sy + ccl_global_id(1);
-
- if(x < sx + sw && y < sy + sh)
- kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
-}
-
-__kernel void kernel_ocl_zero_buffer(ccl_global float4 *buffer, uint64_t size, uint64_t offset)
-{
- size_t i = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
-
- if(i < size / sizeof(float4)) {
- buffer[i+offset/sizeof(float4)] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- else if(i == size / sizeof(float4)) {
- ccl_global uchar *b = (ccl_global uchar*)&buffer[i+offset/sizeof(float4)];
-
- for(i = 0; i < size % sizeof(float4); i++) {
- *(b++) = 0;
- }
- }
-}
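kernel_ocl_zero_buffer above clears the buffer in float4-sized strides and dedicates exactly one extra work item to the tail when size is not a multiple of sizeof(float4). A worked example, assuming sizeof(float4) == 16:

    size_t size = 70;          /* bytes to clear                            */
    size_t full = size / 16;   /* 4: work items 0..3 each store one float4  */
    size_t tail = size % 16;   /* 6: work item i == 4 takes the else-branch */
                               /* and zeroes the last 6 bytes, one uchar at */
                               /* a time                                    */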
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl b/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
deleted file mode 100644
index 7125348a49f..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_data_init.cl
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_data_init.h"
-
-__kernel void kernel_ocl_path_trace_data_init(
- ccl_global char *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
- KERNEL_BUFFER_PARAMS,
- int start_sample,
- int end_sample,
- int sx, int sy, int sw, int sh, int offset, int stride,
- ccl_global int *Queue_index, /* Tracks the number of elements in each queue */
- int queuesize, /* Size (capacity) of each queue */
- ccl_global char *use_queues_flag, /* Flag deciding whether the scene-intersect kernel fetches ray indices from queues */
- ccl_global unsigned int *work_pool_wgs, /* Work pool for each work group */
- unsigned int num_samples, /* Total number of samples per pixel */
- ccl_global float *buffer)
-{
- kernel_data_init((KernelGlobals*)kg,
- data,
- split_data_buffer,
- num_elements,
- ray_state,
- KERNEL_BUFFER_ARGS,
- start_sample,
- end_sample,
- sx, sy, sw, sh, offset, stride,
- Queue_index,
- queuesize,
- use_queues_flag,
- work_pool_wgs,
- num_samples,
- buffer);
-}
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_displace.cl b/intern/cycles/kernel/kernels/opencl/kernel_displace.cl
deleted file mode 100644
index 76cc36971f5..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_displace.cl
+++ /dev/null
@@ -1,36 +0,0 @@
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernels/opencl/kernel_opencl_image.h"
-
-#include "kernel/kernel_path.h"
-#include "kernel/kernel_path_branched.h"
-
-#include "kernel/kernel_bake.h"
-
-__kernel void kernel_ocl_displace(
- ccl_constant KernelData *data,
- ccl_global uint4 *input,
- ccl_global float4 *output,
-
- KERNEL_BUFFER_PARAMS,
-
- int type, int sx, int sw, int offset, int sample)
-{
- KernelGlobals kglobals, *kg = &kglobals;
-
- kg->data = data;
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-
- int x = sx + ccl_global_id(0);
-
- if(x < sx + sw) {
- kernel_displace_evaluate(kg, input, output, x);
- }
-}
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
deleted file mode 100644
index 8b1332bf013..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_next_iteration_setup.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_next_iteration_setup.h"
-
-#define KERNEL_NAME next_iteration_setup
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
deleted file mode 100644
index bb6b8a40e8e..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
- * Copyright 2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifdef WITH_NANOVDB
-/* Data type to replace `double` used in the NanoVDB headers. Cycles doesn't need doubles, and it
- * is safer and more portable to never use the double datatype on the GPU.
- * Use a special structure, so that the following holds:
- * - No unnoticed implicit casts or mathematical operations are used on a scalar 64-bit type
- *   (which rules out tricks like using `uint64_t` as a drop-in replacement for double).
- * - Padding rules match those of `double` exactly
- *   (which rules out an array of `uint8_t`). */
-typedef struct ccl_vdb_double_t {
- uint64_t i;
-} ccl_vdb_double_t;
-
-# define double ccl_vdb_double_t
-# include "nanovdb/CNanoVDB.h"
-# undef double
-#endif
-
-/* For OpenCL we do manual lookup and interpolation. */
-
-ccl_device_inline ccl_global TextureInfo *kernel_tex_info(KernelGlobals *kg, uint id)
-{
- const uint tex_offset = id
-#define KERNEL_TEX(type, name) +1
-#include "kernel/kernel_textures.h"
- ;
-
- return &((ccl_global TextureInfo *)kg->buffers[0])[tex_offset];
-}
-
-#define tex_fetch(type, info, index) \
- ((ccl_global type *)(kg->buffers[info->cl_buffer] + info->data))[(index)]
-
-ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
-{
- x %= width;
- if (x < 0)
- x += width;
- return x;
-}
-
-ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
-{
- return clamp(x, 0, width - 1);
-}
-
-ccl_device_inline float4 svm_image_texture_read(
- KernelGlobals *kg, const ccl_global TextureInfo *info, void *acc, int x, int y, int z)
-{
- const int data_offset = x + info->width * y + info->width * info->height * z;
- const int texture_type = info->data_type;
-
- /* Float4 */
- if (texture_type == IMAGE_DATA_TYPE_FLOAT4) {
- return tex_fetch(float4, info, data_offset);
- }
- /* Byte4 */
- else if (texture_type == IMAGE_DATA_TYPE_BYTE4) {
- uchar4 r = tex_fetch(uchar4, info, data_offset);
- float f = 1.0f / 255.0f;
- return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
- }
- /* Ushort4 */
- else if (texture_type == IMAGE_DATA_TYPE_USHORT4) {
- ushort4 r = tex_fetch(ushort4, info, data_offset);
- float f = 1.0f / 65535.f;
- return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
- }
- /* Float */
- else if (texture_type == IMAGE_DATA_TYPE_FLOAT) {
- float f = tex_fetch(float, info, data_offset);
- return make_float4(f, f, f, 1.0f);
- }
- /* UShort */
- else if (texture_type == IMAGE_DATA_TYPE_USHORT) {
- ushort r = tex_fetch(ushort, info, data_offset);
- float f = r * (1.0f / 65535.0f);
- return make_float4(f, f, f, 1.0f);
- }
-#ifdef WITH_NANOVDB
- /* NanoVDB Float */
- else if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) {
- cnanovdb_coord coord;
- coord.mVec[0] = x;
- coord.mVec[1] = y;
- coord.mVec[2] = z;
- float f = cnanovdb_readaccessor_getValueF((cnanovdb_readaccessor *)acc, &coord);
- return make_float4(f, f, f, 1.0f);
- }
- /* NanoVDB Float3 */
- else if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
- cnanovdb_coord coord;
- coord.mVec[0] = x;
- coord.mVec[1] = y;
- coord.mVec[2] = z;
- cnanovdb_Vec3F f = cnanovdb_readaccessor_getValueF3((cnanovdb_readaccessor *)acc, &coord);
- return make_float4(f.mVec[0], f.mVec[1], f.mVec[2], 1.0f);
- }
-#endif
-#ifdef __KERNEL_CL_KHR_FP16__
- /* Half and Half4 are optional in OpenCL */
- else if (texture_type == IMAGE_DATA_TYPE_HALF) {
- float f = tex_fetch(half, info, data_offset);
- return make_float4(f, f, f, 1.0f);
- }
- else if (texture_type == IMAGE_DATA_TYPE_HALF4) {
- half4 r = tex_fetch(half4, info, data_offset);
- return make_float4(r.x, r.y, r.z, r.w);
- }
-#endif
- /* Byte */
- else {
- uchar r = tex_fetch(uchar, info, data_offset);
- float f = r * (1.0f / 255.0f);
- return make_float4(f, f, f, 1.0f);
- }
-}
-
-ccl_device_inline float4
-svm_image_texture_read_2d(KernelGlobals *kg, int id, void *acc, int x, int y)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
-#ifdef WITH_NANOVDB
- if (info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
- info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
-#endif
- /* Wrap */
- if (info->extension == EXTENSION_REPEAT) {
- x = svm_image_texture_wrap_periodic(x, info->width);
- y = svm_image_texture_wrap_periodic(y, info->height);
- }
- else {
- x = svm_image_texture_wrap_clamp(x, info->width);
- y = svm_image_texture_wrap_clamp(y, info->height);
- }
-#ifdef WITH_NANOVDB
- }
-#endif
-
- return svm_image_texture_read(kg, info, acc, x, y, 0);
-}
-
-ccl_device_inline float4
-svm_image_texture_read_3d(KernelGlobals *kg, int id, void *acc, int x, int y, int z)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
-#ifdef WITH_NANOVDB
- if (info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
- info->data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
-#endif
- /* Wrap */
- if (info->extension == EXTENSION_REPEAT) {
- x = svm_image_texture_wrap_periodic(x, info->width);
- y = svm_image_texture_wrap_periodic(y, info->height);
- z = svm_image_texture_wrap_periodic(z, info->depth);
- }
- else {
- x = svm_image_texture_wrap_clamp(x, info->width);
- y = svm_image_texture_wrap_clamp(y, info->height);
- z = svm_image_texture_wrap_clamp(z, info->depth);
- }
-#ifdef WITH_NANOVDB
- }
-#endif
-
- return svm_image_texture_read(kg, info, acc, x, y, z);
-}
-
-ccl_device_inline float svm_image_texture_frac(float x, int *ix)
-{
- int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0);
- *ix = i;
- return x - (float)i;
-}
-
-#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
- { \
- u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
- u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
- u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
- u[3] = (1.0f / 6.0f) * t * t * t; \
- } \
- (void)0
-
-ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
- if (info->extension == EXTENSION_CLIP) {
- if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
-
- if (info->interpolation == INTERPOLATION_CLOSEST) {
- /* Closest interpolation. */
- int ix, iy;
- svm_image_texture_frac(x * info->width, &ix);
- svm_image_texture_frac(y * info->height, &iy);
-
- return svm_image_texture_read_2d(kg, id, NULL, ix, iy);
- }
- else if (info->interpolation == INTERPOLATION_LINEAR) {
- /* Bilinear interpolation. */
- int ix, iy;
- float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix);
- float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy);
-
- float4 r;
- r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(kg, id, NULL, ix, iy);
- r += (1.0f - ty) * tx * svm_image_texture_read_2d(kg, id, NULL, ix + 1, iy);
- r += ty * (1.0f - tx) * svm_image_texture_read_2d(kg, id, NULL, ix, iy + 1);
- r += ty * tx * svm_image_texture_read_2d(kg, id, NULL, ix + 1, iy + 1);
- return r;
- }
- else {
- /* Bicubic interpolation. */
- int ix, iy;
- float tx = svm_image_texture_frac(x * info->width - 0.5f, &ix);
- float ty = svm_image_texture_frac(y * info->height - 0.5f, &iy);
-
- float u[4], v[4];
- SET_CUBIC_SPLINE_WEIGHTS(u, tx);
- SET_CUBIC_SPLINE_WEIGHTS(v, ty);
-
- float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
- for (int y = 0; y < 4; y++) {
- for (int x = 0; x < 4; x++) {
- float weight = u[x] * v[y];
- r += weight * svm_image_texture_read_2d(kg, id, NULL, ix + x - 1, iy + y - 1);
- }
- }
- return r;
- }
-}
-
-ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float3 P, int interp)
-{
- const ccl_global TextureInfo *info = kernel_tex_info(kg, id);
-
- if (info->use_transform_3d) {
- Transform tfm = info->transform_3d;
- P = transform_point(&tfm, P);
- }
-
- float x = P.x;
- float y = P.y;
- float z = P.z;
-
- uint interpolation = (interp == INTERPOLATION_NONE) ? info->interpolation : interp;
-
-#ifdef WITH_NANOVDB
- cnanovdb_readaccessor acc;
- if (info->data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
- info->data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
- ccl_global cnanovdb_griddata *grid =
- (ccl_global cnanovdb_griddata *)(kg->buffers[info->cl_buffer] + info->data);
- cnanovdb_readaccessor_init(&acc, cnanovdb_treedata_rootF(cnanovdb_griddata_tree(grid)));
- }
- else {
- if (info->extension == EXTENSION_CLIP) {
- if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
-
- x *= info->width;
- y *= info->height;
- z *= info->depth;
- }
-# define NANOVDB_ACCESS_POINTER &acc
-#else
-# define NANOVDB_ACCESS_POINTER NULL
-#endif
-
- if (interpolation == INTERPOLATION_CLOSEST) {
- /* Closest interpolation. */
- int ix, iy, iz;
- svm_image_texture_frac(x, &ix);
- svm_image_texture_frac(y, &iy);
- svm_image_texture_frac(z, &iz);
-
- return svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz);
- }
- else if (interpolation == INTERPOLATION_LINEAR) {
- /* Trilinear interpolation. */
- int ix, iy, iz;
- float tx = svm_image_texture_frac(x - 0.5f, &ix);
- float ty = svm_image_texture_frac(y - 0.5f, &iy);
- float tz = svm_image_texture_frac(z - 0.5f, &iz);
-
- float4 r;
- r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz);
- r += (1.0f - tz) * (1.0f - ty) * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy, iz);
- r += (1.0f - tz) * ty * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy + 1, iz);
- r += (1.0f - tz) * ty * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy + 1, iz);
-
- r += tz * (1.0f - ty) * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy, iz + 1);
- r += tz * (1.0f - ty) * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy, iz + 1);
- r += tz * ty * (1.0f - tx) *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix, iy + 1, iz + 1);
- r += tz * ty * tx *
- svm_image_texture_read_3d(kg, id, NANOVDB_ACCESS_POINTER, ix + 1, iy + 1, iz + 1);
- return r;
- }
- else {
- /* Tricubic interpolation. */
- int ix, iy, iz;
- float tx = svm_image_texture_frac(x - 0.5f, &ix);
- float ty = svm_image_texture_frac(y - 0.5f, &iy);
- float tz = svm_image_texture_frac(z - 0.5f, &iz);
-
- float u[4], v[4], w[4];
- SET_CUBIC_SPLINE_WEIGHTS(u, tx);
- SET_CUBIC_SPLINE_WEIGHTS(v, ty);
- SET_CUBIC_SPLINE_WEIGHTS(w, tz);
-
- float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
- for (int z = 0; z < 4; z++) {
- for (int y = 0; y < 4; y++) {
- for (int x = 0; x < 4; x++) {
- float weight = u[x] * v[y] * w[z];
- r += weight * svm_image_texture_read_3d(
- kg, id, NANOVDB_ACCESS_POINTER, ix + x - 1, iy + y - 1, iz + z - 1);
- }
- }
- }
- return r;
- }
-#undef NANOVDB_ACCESS_POINTER
-}
-
-#undef SET_CUBIC_SPLINE_WEIGHTS
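The SET_CUBIC_SPLINE_WEIGHTS macro above evaluates the four uniform cubic B-spline basis functions at the fractional coordinate t; in particular u[0] reduces to (1 - t)^3 / 6 and the four weights sum to 1, so the bicubic and tricubic loops need no renormalization. A standalone C++ check of both identities (illustrative, not Cycles code):

    #include <cassert>
    #include <cmath>

    int main() {
      for (float t = 0.0f; t <= 1.0f; t += 0.125f) {
        float u[4];
        u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f);
        u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f);
        u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f);
        u[3] = (1.0f / 6.0f) * t * t * t;
        /* Partition of unity: the four weights sum to 1. */
        assert(std::fabs(u[0] + u[1] + u[2] + u[3] - 1.0f) < 1e-5f);
        /* u[0] equals (1 - t)^3 / 6. */
        float c = (1.0f - t) * (1.0f - t) * (1.0f - t) / 6.0f;
        assert(std::fabs(u[0] - c) < 1e-5f);
      }
      return 0;
    }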
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl b/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
deleted file mode 100644
index 68ee6f1d536..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_queue_enqueue.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_queue_enqueue.h"
-
-#define KERNEL_NAME queue_enqueue
-#define LOCALS_TYPE QueueEnqueueLocals
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl b/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
deleted file mode 100644
index 10d09377ba9..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_scene_intersect.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_scene_intersect.h"
-
-#define KERNEL_NAME scene_intersect
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
deleted file mode 100644
index 40eaa561863..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_eval.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shader_eval.h"
-
-#define KERNEL_NAME shader_eval
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
deleted file mode 100644
index 8c36100f762..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_setup.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shader_setup.h"
-
-#define KERNEL_NAME shader_setup
-#define LOCALS_TYPE unsigned int
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl b/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
deleted file mode 100644
index bcacaa4a054..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shader_sort.cl
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shader_sort.h"
-
-__attribute__((reqd_work_group_size(64, 1, 1)))
-#define KERNEL_NAME shader_sort
-#define LOCALS_TYPE ShaderSortLocals
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-#undef LOCALS_TYPE
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
deleted file mode 100644
index 8de250a375c..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_ao.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shadow_blocked_ao.h"
-
-#define KERNEL_NAME shadow_blocked_ao
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl b/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
deleted file mode 100644
index 29da77022ed..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_shadow_blocked_dl.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_shadow_blocked_dl.h"
-
-#define KERNEL_NAME shadow_blocked_dl
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl b/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
deleted file mode 100644
index c3b7b09460a..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_bundle.cl
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h" // PRECOMPILED
-#include "kernel/split/kernel_split_common.h" // PRECOMPILED
-
-#include "kernel/kernels/opencl/kernel_data_init.cl"
-#include "kernel/kernels/opencl/kernel_path_init.cl"
-#include "kernel/kernels/opencl/kernel_state_buffer_size.cl"
-#include "kernel/kernels/opencl/kernel_scene_intersect.cl"
-#include "kernel/kernels/opencl/kernel_queue_enqueue.cl"
-#include "kernel/kernels/opencl/kernel_shader_setup.cl"
-#include "kernel/kernels/opencl/kernel_shader_sort.cl"
-#include "kernel/kernels/opencl/kernel_enqueue_inactive.cl"
-#include "kernel/kernels/opencl/kernel_next_iteration_setup.cl"
-#include "kernel/kernels/opencl/kernel_indirect_subsurface.cl"
-#include "kernel/kernels/opencl/kernel_buffer_update.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_stopping.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_filter_x.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_filter_y.cl"
-#include "kernel/kernels/opencl/kernel_adaptive_adjust_samples.cl"
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h b/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
deleted file mode 100644
index e123b4cd6ec..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_split_function.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define KERNEL_NAME_JOIN(a, b) a##_##b
-#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)
-
-__kernel void KERNEL_NAME_EVAL(kernel_ocl_path_trace,
- KERNEL_NAME)(ccl_global char *kg_global,
- ccl_constant KernelData *data,
-
- ccl_global void *split_data_buffer,
- ccl_global char *ray_state,
-
- KERNEL_BUFFER_PARAMS,
-
- ccl_global int *queue_index,
- ccl_global char *use_queues_flag,
- ccl_global unsigned int *work_pools,
- ccl_global float *buffer)
-{
-#ifdef LOCALS_TYPE
- ccl_local LOCALS_TYPE locals;
-#endif
-
- KernelGlobals *kg = (KernelGlobals *)kg_global;
-
- if (ccl_local_id(0) + ccl_local_id(1) == 0) {
- kg->data = data;
-
- kernel_split_params.queue_index = queue_index;
- kernel_split_params.use_queues_flag = use_queues_flag;
- kernel_split_params.work_pools = work_pools;
- kernel_split_params.tile.buffer = buffer;
-
- split_data_init(kg,
- &kernel_split_state,
- ccl_global_size(0) * ccl_global_size(1),
- split_data_buffer,
- ray_state);
- }
-
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
-
- KERNEL_NAME_EVAL(kernel, KERNEL_NAME)
- (kg
-#ifdef LOCALS_TYPE
- ,
- &locals
-#endif
- );
-}
-
-#undef KERNEL_NAME_JOIN
-#undef KERNEL_NAME_EVAL
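
The deleted header above is the heart of the split-kernel build: each per-kernel `.cl` file defines `KERNEL_NAME` (and optionally `LOCALS_TYPE`) and then includes this header, which stamps out one `__kernel` entry point via token pasting. The two-level `KERNEL_NAME_EVAL` indirection forces `KERNEL_NAME` to be macro-expanded before `##` pastes it; it also explains why `kernel_shader_sort.cl` placed its `reqd_work_group_size` attribute just before the `#define`, so that after preprocessing the attribute immediately precedes the emitted `__kernel`. A minimal host-side C++ sketch of the same expansion (macro names copied from the header, function body hypothetical):

```cpp
#include <cstdio>

// Two-level paste: EVAL expands its arguments first, JOIN then pastes them.
#define KERNEL_NAME_JOIN(a, b) a##_##b
#define KERNEL_NAME_EVAL(a, b) KERNEL_NAME_JOIN(a, b)

#define KERNEL_NAME shader_sort
// Expands to: void kernel_ocl_path_trace_shader_sort(void)
void KERNEL_NAME_EVAL(kernel_ocl_path_trace, KERNEL_NAME)(void)
{
  std::printf("entry point: kernel_ocl_path_trace_shader_sort\n");
}
#undef KERNEL_NAME

int main()
{
  kernel_ocl_path_trace_shader_sort();  // name produced by the expansion above
  return 0;
}
```
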
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl b/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
deleted file mode 100644
index 2b3be38df84..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_subsurface_scatter.cl
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel/kernel_compat_opencl.h"
-#include "kernel/split/kernel_split_common.h"
-#include "kernel/split/kernel_subsurface_scatter.h"
-
-#define KERNEL_NAME subsurface_scatter
-#include "kernel/kernels/opencl/kernel_split_function.h"
-#undef KERNEL_NAME
-
diff --git a/intern/cycles/kernel/osl/background.cpp b/intern/cycles/kernel/osl/background.cpp
index 3f9de5ab33d..8e497986dcc 100644
--- a/intern/cycles/kernel/osl/background.cpp
+++ b/intern/cycles/kernel/osl/background.cpp
@@ -37,7 +37,7 @@
#include "kernel/osl/osl_closures.h"
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/closure/alloc.h"
#include "kernel/closure/emissive.h"
// clang-format on
diff --git a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
index 76a2e41abfa..a2f9d3f759a 100644
--- a/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_diffuse_ramp.cpp
@@ -34,7 +34,7 @@
#include <OSL/genclosure.h>
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/osl/osl_closures.h"
// clang-format off
diff --git a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
index b78dc8a3a67..812c3b6e71b 100644
--- a/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
+++ b/intern/cycles/kernel/osl/bsdf_phong_ramp.cpp
@@ -34,7 +34,7 @@
#include <OSL/genclosure.h>
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/osl/osl_closures.h"
// clang-format off
diff --git a/intern/cycles/kernel/osl/emissive.cpp b/intern/cycles/kernel/osl/emissive.cpp
index d656723bac2..80dfbee879e 100644
--- a/intern/cycles/kernel/osl/emissive.cpp
+++ b/intern/cycles/kernel/osl/emissive.cpp
@@ -37,7 +37,7 @@
#include "kernel/osl/osl_closures.h"
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/kernel_types.h"
#include "kernel/closure/alloc.h"
#include "kernel/closure/emissive.h"
diff --git a/intern/cycles/kernel/osl/osl_bssrdf.cpp b/intern/cycles/kernel/osl/osl_bssrdf.cpp
index c5ca8616fbd..5d968ed85e0 100644
--- a/intern/cycles/kernel/osl/osl_bssrdf.cpp
+++ b/intern/cycles/kernel/osl/osl_bssrdf.cpp
@@ -32,7 +32,7 @@
#include <OSL/genclosure.h>
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
#include "kernel/osl/osl_closures.h"
// clang-format off
@@ -50,45 +50,30 @@ CCL_NAMESPACE_BEGIN
using namespace OSL;
-static ustring u_cubic("cubic");
-static ustring u_gaussian("gaussian");
-static ustring u_burley("burley");
-static ustring u_principled("principled");
+static ustring u_random_walk_fixed_radius("random_walk_fixed_radius");
static ustring u_random_walk("random_walk");
-static ustring u_principled_random_walk("principled_random_walk");
class CBSSRDFClosure : public CClosurePrimitive {
public:
Bssrdf params;
+ float ior;
ustring method;
CBSSRDFClosure()
{
- params.texture_blur = 0.0f;
- params.sharpness = 0.0f;
- params.roughness = 0.0f;
+ params.roughness = FLT_MAX;
+ params.anisotropy = 1.0f;
+ ior = 1.4f;
}
void setup(ShaderData *sd, int path_flag, float3 weight)
{
- if (method == u_cubic) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_CUBIC_ID);
- }
- else if (method == u_gaussian) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_GAUSSIAN_ID);
- }
- else if (method == u_burley) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_BURLEY_ID);
- }
- else if (method == u_principled) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_ID);
+ if (method == u_random_walk_fixed_radius) {
+ alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
}
else if (method == u_random_walk) {
alloc(sd, path_flag, weight, CLOSURE_BSSRDF_RANDOM_WALK_ID);
}
- else if (method == u_principled_random_walk) {
- alloc(sd, path_flag, weight, CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID);
- }
}
void alloc(ShaderData *sd, int path_flag, float3 weight, ClosureType type)
@@ -106,11 +91,10 @@ class CBSSRDFClosure : public CClosurePrimitive {
/* create one closure per color channel */
bssrdf->radius = params.radius;
bssrdf->albedo = params.albedo;
- bssrdf->texture_blur = params.texture_blur;
- bssrdf->sharpness = params.sharpness;
bssrdf->N = params.N;
bssrdf->roughness = params.roughness;
- sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type);
+ bssrdf->anisotropy = clamp(params.anisotropy, 0.0f, 0.9f);
+ sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type, clamp(ior, 1.01f, 3.8f));
}
}
};
@@ -122,9 +106,9 @@ ClosureParam *closure_bssrdf_params()
CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.N),
CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.radius),
CLOSURE_FLOAT3_PARAM(CBSSRDFClosure, params.albedo),
- CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.texture_blur, "texture_blur"),
- CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.sharpness, "sharpness"),
CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.roughness, "roughness"),
+ CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, ior, "ior"),
+ CLOSURE_FLOAT_KEYPARAM(CBSSRDFClosure, params.anisotropy, "anisotropy"),
CLOSURE_STRING_KEYPARAM(CBSSRDFClosure, label, "label"),
CLOSURE_FINISH_PARAM(CBSSRDFClosure)};
return params;
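
The rewritten `setup()` above reduces the method dispatch to the two remaining random-walk closures and clamps the new scalar inputs before `bssrdf_setup()` sees them. A standalone sketch of that logic, with illustrative enum and function names (only the string keys and clamp ranges are taken from this diff):

```cpp
#include <algorithm>
#include <iostream>
#include <string>

// Illustrative stand-ins for the closure IDs selected in setup() above.
enum ClosureID { BSSRDF_RANDOM_WALK_FIXED_RADIUS, BSSRDF_RANDOM_WALK, BSSRDF_NONE };

ClosureID pick_bssrdf(const std::string &method)
{
  if (method == "random_walk_fixed_radius")
    return BSSRDF_RANDOM_WALK_FIXED_RADIUS;
  if (method == "random_walk")
    return BSSRDF_RANDOM_WALK;
  return BSSRDF_NONE;  // unknown methods allocate nothing, as in setup() above
}

int main()
{
  float anisotropy = 1.2f, ior = 5.0f;
  anisotropy = std::clamp(anisotropy, 0.0f, 0.9f);  // matches the clamp in alloc()
  ior = std::clamp(ior, 1.01f, 3.8f);               // matches the clamp on ior
  std::cout << pick_bssrdf("random_walk") << " " << anisotropy << " " << ior << "\n";
  return 0;
}
```
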
diff --git a/intern/cycles/kernel/osl/osl_closures.cpp b/intern/cycles/kernel/osl/osl_closures.cpp
index 7ee467a46dd..e814fcca246 100644
--- a/intern/cycles/kernel/osl/osl_closures.cpp
+++ b/intern/cycles/kernel/osl/osl_closures.cpp
@@ -40,10 +40,10 @@
#include "util/util_param.h"
// clang-format off
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
#include "kernel/kernel_types.h"
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
#include "kernel/kernel_montecarlo.h"
#include "kernel/kernel_random.h"
@@ -500,7 +500,7 @@ bool CBSDFClosure::skip(const ShaderData *sd, int path_flag, int scattering)
{
/* caustic options */
if ((scattering & LABEL_GLOSSY) && (path_flag & PATH_RAY_DIFFUSE)) {
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if ((!kernel_data.integrator.caustics_reflective && (scattering & LABEL_REFLECT)) ||
(!kernel_data.integrator.caustics_refractive && (scattering & LABEL_TRANSMIT))) {
diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp
index 2b7c21d0bc4..396f42080e4 100644
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -40,22 +40,22 @@
#include "util/util_string.h"
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-#include "kernel/kernel_random.h"
-#include "kernel/kernel_write_passes.h"
-#include "kernel/kernel_projection.h"
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+#include "kernel/device/cpu/image.h"
+
#include "kernel/kernel_differential.h"
-#include "kernel/kernel_montecarlo.h"
-#include "kernel/kernel_camera.h"
-#include "kernel/kernels/cpu/kernel_cpu_image.h"
+
+#include "kernel/integrator/integrator_state.h"
+#include "kernel/integrator/integrator_state_flow.h"
+
#include "kernel/geom/geom.h"
#include "kernel/bvh/bvh.h"
+#include "kernel/kernel_color.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_path_state.h"
#include "kernel/kernel_projection.h"
-#include "kernel/kernel_accumulate.h"
#include "kernel/kernel_shader.h"
// clang-format on
@@ -147,7 +147,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
@@ -155,18 +155,19 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
Transform tfm;
if (time == sd->time)
- tfm = sd->ob_tfm;
+ tfm = object_get_transform(kg, sd);
else
tfm = object_fetch_transform_motion_test(kg, object, time, NULL);
#else
- Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+ const Transform tfm = object_get_transform(kg, sd);
#endif
copy_matrix(result, tfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_tfm);
+ const Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
+ copy_matrix(result, tfm);
return true;
}
@@ -184,7 +185,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
@@ -192,18 +193,19 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
Transform itfm;
if (time == sd->time)
- itfm = sd->ob_itfm;
+ itfm = object_get_inverse_transform(kg, sd);
else
object_fetch_transform_motion_test(kg, object, time, &itfm);
#else
- Transform itfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
+ const Transform itfm = object_get_inverse_transform(kg, sd);
#endif
copy_matrix(result, itfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_itfm);
+ const Transform itfm = lamp_fetch_transform(kg, sd->lamp, true);
+ copy_matrix(result, itfm);
return true;
}
@@ -218,7 +220,7 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
float time)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (from == u_ndc) {
copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -250,7 +252,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
float time)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (to == u_ndc) {
copy_matrix(result, kernel_data.cam.worldtondc);
@@ -284,21 +286,18 @@ bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_tfm;
-#else
- KernelGlobals *kg = sd->osl_globals;
- Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
-#endif
+ const Transform tfm = object_get_transform(kg, sd);
copy_matrix(result, tfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_tfm);
+ const Transform tfm = lamp_fetch_transform(kg, sd->lamp, false);
+ copy_matrix(result, tfm);
return true;
}
@@ -315,21 +314,18 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
* a concept of shader space, so we just use object space for both. */
if (xform) {
const ShaderData *sd = (const ShaderData *)xform;
+ const KernelGlobals *kg = sd->osl_globals;
int object = sd->object;
if (object != OBJECT_NONE) {
-#ifdef __OBJECT_MOTION__
- Transform tfm = sd->ob_itfm;
-#else
- KernelGlobals *kg = sd->osl_globals;
- Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-#endif
+ const Transform tfm = object_get_inverse_transform(kg, sd);
copy_matrix(result, tfm);
return true;
}
else if (sd->type == PRIMITIVE_LAMP) {
- copy_matrix(result, sd->ob_itfm);
+ const Transform itfm = lamp_fetch_transform(kg, sd->lamp, true);
+ copy_matrix(result, itfm);
return true;
}
@@ -341,7 +337,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg, OSL::Matrix44 &result, ustring from)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (from == u_ndc) {
copy_matrix(result, kernel_data.cam.ndctoworld);
@@ -368,7 +364,7 @@ bool OSLRenderServices::get_inverse_matrix(OSL::ShaderGlobals *sg,
ustring to)
{
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (to == u_ndc) {
copy_matrix(result, kernel_data.cam.worldtondc);
@@ -747,7 +743,7 @@ static bool set_attribute_matrix(const Transform &tfm, TypeDesc type, void *val)
return false;
}
-static bool get_primitive_attribute(KernelGlobals *kg,
+static bool get_primitive_attribute(const KernelGlobals *kg,
const ShaderData *sd,
const OSLGlobals::Attribute &attr,
const TypeDesc &type,
@@ -808,7 +804,7 @@ static bool get_primitive_attribute(KernelGlobals *kg,
}
}
-static bool get_mesh_attribute(KernelGlobals *kg,
+static bool get_mesh_attribute(const KernelGlobals *kg,
const ShaderData *sd,
const OSLGlobals::Attribute &attr,
const TypeDesc &type,
@@ -857,8 +853,12 @@ static bool get_object_attribute(const OSLGlobals::Attribute &attr,
}
}
-bool OSLRenderServices::get_object_standard_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val)
+bool OSLRenderServices::get_object_standard_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val)
{
  /* todo: turn this into a hash table? */
@@ -988,8 +988,12 @@ bool OSLRenderServices::get_object_standard_attribute(
return false;
}
-bool OSLRenderServices::get_background_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val)
+bool OSLRenderServices::get_background_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val)
{
if (name == u_path_ray_length) {
/* Ray Length */
@@ -998,38 +1002,32 @@ bool OSLRenderServices::get_background_attribute(
}
else if (name == u_path_ray_depth) {
/* Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_diffuse_depth) {
/* Diffuse Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->diffuse_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.diffuse_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_glossy_depth) {
/* Glossy Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->glossy_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.glossy_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_transmission_depth) {
/* Transmission Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->transmission_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.transmission_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_path_transparent_depth) {
/* Transparent Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->transparent_bounce;
- return set_attribute_int(f, type, derivatives, val);
- }
- else if (name == u_path_transmission_depth) {
- /* Transmission Ray Depth */
- PathState *state = sd->osl_path_state;
- int f = state->transmission_bounce;
+ const IntegratorStateCPU *state = sd->osl_path_state;
+ int f = state->path.transparent_bounce;
return set_attribute_int(f, type, derivatives, val);
}
else if (name == u_ndc) {
@@ -1043,8 +1041,10 @@ bool OSLRenderServices::get_background_attribute(
ndc[0] = camera_world_to_ndc(kg, sd, sd->ray_P);
if (derivatives) {
- ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx) - ndc[0];
- ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy) - ndc[0];
+ ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f)) -
+ ndc[0];
+ ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f)) -
+ ndc[0];
}
}
else {
@@ -1079,7 +1079,7 @@ bool OSLRenderServices::get_attribute(OSL::ShaderGlobals *sg,
bool OSLRenderServices::get_attribute(
ShaderData *sd, bool derivatives, ustring object_name, TypeDesc type, ustring name, void *val)
{
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
int prim_type = 0;
int object;
@@ -1208,17 +1208,17 @@ bool OSLRenderServices::texture(ustring filename,
OSLTextureHandle *handle = (OSLTextureHandle *)texture_handle;
OSLTextureHandle::Type texture_type = (handle) ? handle->type : OSLTextureHandle::OIIO;
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
bool status = false;
switch (texture_type) {
case OSLTextureHandle::BEVEL: {
/* Bevel shader hack. */
if (nchannels >= 3) {
- PathState *state = sd->osl_path_state;
+ const IntegratorStateCPU *state = sd->osl_path_state;
int num_samples = (int)s;
float radius = t;
- float3 N = svm_bevel(kernel_globals, sd, state, radius, num_samples);
+ float3 N = svm_bevel(kernel_globals, state, sd, radius, num_samples);
result[0] = N.x;
result[1] = N.y;
result[2] = N.z;
@@ -1228,7 +1228,7 @@ bool OSLRenderServices::texture(ustring filename,
}
case OSLTextureHandle::AO: {
/* AO shader hack. */
- PathState *state = sd->osl_path_state;
+ const IntegratorStateCPU *state = sd->osl_path_state;
int num_samples = (int)s;
float radius = t;
float3 N = make_float3(dsdx, dtdx, dsdy);
@@ -1242,7 +1242,7 @@ bool OSLRenderServices::texture(ustring filename,
if ((int)options.tblur) {
flags |= NODE_AO_GLOBAL_RADIUS;
}
- result[0] = svm_ao(kernel_globals, sd, N, state, radius, num_samples, flags);
+ result[0] = svm_ao(kernel_globals, state, sd, N, radius, num_samples, flags);
status = true;
break;
}
@@ -1355,7 +1355,7 @@ bool OSLRenderServices::texture3d(ustring filename,
case OSLTextureHandle::SVM: {
/* Packed texture. */
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
int slot = handle->svm_slot;
float3 P_float3 = make_float3(P.x, P.y, P.z);
float4 rgba = kernel_tex_image_interp_3d(kernel_globals, slot, P_float3, INTERPOLATION_NONE);
@@ -1377,7 +1377,7 @@ bool OSLRenderServices::texture3d(ustring filename,
if (handle && handle->oiio_handle) {
if (texture_thread_info == NULL) {
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
OSLThreadData *tdata = kernel_globals->osl_tdata;
texture_thread_info = tdata->oiio_thread_info;
}
@@ -1462,7 +1462,7 @@ bool OSLRenderServices::environment(ustring filename,
if (handle && handle->oiio_handle) {
if (thread_info == NULL) {
ShaderData *sd = (ShaderData *)(sg->renderstate);
- KernelGlobals *kernel_globals = sd->osl_globals;
+ const KernelGlobals *kernel_globals = sd->osl_globals;
OSLThreadData *tdata = kernel_globals->osl_tdata;
thread_info = tdata->oiio_thread_info;
}
@@ -1600,10 +1600,14 @@ bool OSLRenderServices::trace(TraceOpt &options,
}
/* ray differentials */
- ray.dP.dx = TO_FLOAT3(dPdx);
- ray.dP.dy = TO_FLOAT3(dPdy);
- ray.dD.dx = TO_FLOAT3(dRdx);
- ray.dD.dy = TO_FLOAT3(dRdy);
+ differential3 dP;
+ dP.dx = TO_FLOAT3(dPdx);
+ dP.dy = TO_FLOAT3(dPdy);
+ ray.dP = differential_make_compact(dP);
+ differential3 dD;
+ dD.dx = TO_FLOAT3(dRdx);
+ dD.dy = TO_FLOAT3(dRdy);
+ ray.dD = differential_make_compact(dD);
/* allocate trace data */
OSLTraceData *tracedata = (OSLTraceData *)sg->tracedata;
@@ -1613,7 +1617,7 @@ bool OSLRenderServices::trace(TraceOpt &options,
tracedata->hit = false;
tracedata->sd.osl_globals = sd->osl_globals;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
/* Can't raytrace from shaders like displacement, before BVH exists. */
if (kernel_data.bvh.bvh_layout == BVH_LAYOUT_NONE) {
@@ -1646,11 +1650,11 @@ bool OSLRenderServices::getmessage(OSL::ShaderGlobals *sg,
}
else {
ShaderData *sd = &tracedata->sd;
- KernelGlobals *kg = sd->osl_globals;
+ const KernelGlobals *kg = sd->osl_globals;
if (!tracedata->setup) {
/* lazy shader data setup */
- shader_setup_from_ray(kg, sd, &tracedata->isect, &tracedata->ray);
+ shader_setup_from_ray(kg, sd, &tracedata->ray, &tracedata->isect);
tracedata->setup = true;
}
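
The `ray.dP`/`ray.dD` members switch here from a full `differential3` to a single compact float, which is why the NDC derivative code above now reconstructs offsets with `make_float3(sd->ray_dP, 0.0f, 0.0f)`. A sketch of the presumed compaction (an assumption based on this diff, not a copy of the Cycles header): the two axis differentials collapse to one average footprint radius that consumers re-expand isotropically:

```cpp
#include <cmath>
#include <cstdio>

// Minimal local types; the real ones live in the Cycles util headers.
struct float3 { float x, y, z; };
struct differential3 { float3 dx, dy; };

static float len(const float3 &v)
{
  return std::sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
}

// Assumption from this diff: the compact form averages the two footprints.
float differential_make_compact(const differential3 &d)
{
  return 0.5f * (len(d.dx) + len(d.dy));
}

int main()
{
  differential3 dP = {{0.02f, 0.0f, 0.0f}, {0.0f, 0.01f, 0.0f}};
  std::printf("compact dP = %.4f\n", differential_make_compact(dP));  // 0.0150
  return 0;
}
```
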
diff --git a/intern/cycles/kernel/osl/osl_services.h b/intern/cycles/kernel/osl/osl_services.h
index 891b9172dd4..58accb46e7d 100644
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -250,10 +250,18 @@ class OSLRenderServices : public OSL::RendererServices {
void *data) override;
#endif
- static bool get_background_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val);
- static bool get_object_standard_attribute(
- KernelGlobals *kg, ShaderData *sd, ustring name, TypeDesc type, bool derivatives, void *val);
+ static bool get_background_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val);
+ static bool get_object_standard_attribute(const KernelGlobals *kg,
+ ShaderData *sd,
+ ustring name,
+ TypeDesc type,
+ bool derivatives,
+ void *val);
static ustring u_distance;
static ustring u_index;
diff --git a/intern/cycles/kernel/osl/osl_shader.cpp b/intern/cycles/kernel/osl/osl_shader.cpp
index 389c854c495..880ef635c76 100644
--- a/intern/cycles/kernel/osl/osl_shader.cpp
+++ b/intern/cycles/kernel/osl/osl_shader.cpp
@@ -17,14 +17,16 @@
#include <OSL/oslexec.h>
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
#include "kernel/kernel_montecarlo.h"
#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data_types.h"
-#include "kernel/kernel_globals.h"
#include "kernel/geom/geom_object.h"
+#include "kernel/integrator/integrator_state.h"
+
#include "kernel/osl/osl_closures.h"
#include "kernel/osl/osl_globals.h"
#include "kernel/osl/osl_services.h"
@@ -39,9 +41,7 @@ CCL_NAMESPACE_BEGIN
/* Threads */
-void OSLShader::thread_init(KernelGlobals *kg,
- KernelGlobals *kernel_globals,
- OSLGlobals *osl_globals)
+void OSLShader::thread_init(KernelGlobals *kg, OSLGlobals *osl_globals)
{
/* no osl used? */
if (!osl_globals->use) {
@@ -87,8 +87,11 @@ void OSLShader::thread_free(KernelGlobals *kg)
/* Globals */
-static void shaderdata_to_shaderglobals(
- KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag, OSLThreadData *tdata)
+static void shaderdata_to_shaderglobals(const KernelGlobals *kg,
+ ShaderData *sd,
+ const IntegratorStateCPU *state,
+ int path_flag,
+ OSLThreadData *tdata)
{
OSL::ShaderGlobals *globals = &tdata->globals;
@@ -171,7 +174,10 @@ static void flatten_surface_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
+void OSLShader::eval_surface(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -276,7 +282,10 @@ static void flatten_background_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
+void OSLShader::eval_background(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -331,7 +340,10 @@ static void flatten_volume_closure_tree(ShaderData *sd,
}
}
-void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag)
+void OSLShader::eval_volume(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -354,7 +366,9 @@ void OSLShader::eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state,
/* Displacement */
-void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state)
+void OSLShader::eval_displacement(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd)
{
/* setup shader globals from shader data */
OSLThreadData *tdata = kg->osl_tdata;
@@ -377,7 +391,7 @@ void OSLShader::eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *
/* Attributes */
-int OSLShader::find_attribute(KernelGlobals *kg,
+int OSLShader::find_attribute(const KernelGlobals *kg,
const ShaderData *sd,
uint id,
AttributeDescriptor *desc)
diff --git a/intern/cycles/kernel/osl/osl_shader.h b/intern/cycles/kernel/osl/osl_shader.h
index a4fa24d0a90..f1f17b141eb 100644
--- a/intern/cycles/kernel/osl/osl_shader.h
+++ b/intern/cycles/kernel/osl/osl_shader.h
@@ -37,6 +37,7 @@ class Scene;
struct ShaderClosure;
struct ShaderData;
+struct IntegratorStateCPU;
struct differential3;
struct KernelGlobals;
@@ -49,19 +50,28 @@ class OSLShader {
static void register_closures(OSLShadingSystem *ss);
/* per thread data */
- static void thread_init(KernelGlobals *kg,
- KernelGlobals *kernel_globals,
- OSLGlobals *osl_globals);
+ static void thread_init(KernelGlobals *kg, OSLGlobals *osl_globals);
static void thread_free(KernelGlobals *kg);
/* eval */
- static void eval_surface(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
- static void eval_background(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
- static void eval_volume(KernelGlobals *kg, ShaderData *sd, PathState *state, int path_flag);
- static void eval_displacement(KernelGlobals *kg, ShaderData *sd, PathState *state);
+ static void eval_surface(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag);
+ static void eval_background(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag);
+ static void eval_volume(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd,
+ int path_flag);
+ static void eval_displacement(const KernelGlobals *kg,
+ const IntegratorStateCPU *state,
+ ShaderData *sd);
/* attributes */
- static int find_attribute(KernelGlobals *kg,
+ static int find_attribute(const KernelGlobals *kg,
const ShaderData *sd,
uint id,
AttributeDescriptor *desc);
diff --git a/intern/cycles/kernel/shaders/node_principled_bsdf.osl b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
index 23949f406c7..55afb892d36 100644
--- a/intern/cycles/kernel/shaders/node_principled_bsdf.osl
+++ b/intern/cycles/kernel/shaders/node_principled_bsdf.osl
@@ -18,11 +18,13 @@
#include "stdcycles.h"
shader node_principled_bsdf(string distribution = "Multiscatter GGX",
- string subsurface_method = "burley",
+ string subsurface_method = "random_walk",
color BaseColor = color(0.8, 0.8, 0.8),
float Subsurface = 0.0,
vector SubsurfaceRadius = vector(1.0, 1.0, 1.0),
color SubsurfaceColor = color(0.7, 0.1, 0.1),
+ float SubsurfaceIOR = 1.4,
+ float SubsurfaceAnisotropy = 0.0,
float Metallic = 0.0,
float Specular = 0.5,
float SpecularTint = 0.0,
@@ -59,22 +61,17 @@ shader node_principled_bsdf(string distribution = "Multiscatter GGX",
if (diffuse_weight > 1e-5) {
if (Subsurface > 1e-5) {
color mixed_ss_base_color = SubsurfaceColor * Subsurface + BaseColor * (1.0 - Subsurface);
- if (subsurface_method == "burley") {
- BSDF = mixed_ss_base_color * bssrdf("principled",
- Normal,
- Subsurface * SubsurfaceRadius,
- SubsurfaceColor,
- "roughness",
- Roughness);
- }
- else {
- BSDF = mixed_ss_base_color * bssrdf("principled_random_walk",
- Normal,
- Subsurface * SubsurfaceRadius,
- mixed_ss_base_color,
- "roughness",
- Roughness);
- }
+
+ BSDF = mixed_ss_base_color * bssrdf(subsurface_method,
+ Normal,
+ Subsurface * SubsurfaceRadius,
+ mixed_ss_base_color,
+ "roughness",
+ Roughness,
+ "ior",
+ SubsurfaceIOR,
+ "anisotropy",
+ SubsurfaceAnisotropy);
}
else {
BSDF = BaseColor * principled_diffuse(Normal, Roughness);
diff --git a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
index b1e854150ab..f55e38c54ff 100644
--- a/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
+++ b/intern/cycles/kernel/shaders/node_subsurface_scattering.osl
@@ -19,27 +19,12 @@
shader node_subsurface_scattering(color Color = 0.8,
float Scale = 1.0,
vector Radius = vector(0.1, 0.1, 0.1),
- float TextureBlur = 0.0,
- float Sharpness = 0.0,
- string falloff = "cubic",
+ float IOR = 1.4,
+ float Anisotropy = 0.0,
+ string method = "random_walk",
normal Normal = N,
output closure color BSSRDF = 0)
{
- if (falloff == "gaussian")
- BSSRDF = Color *
- bssrdf("gaussian", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
- else if (falloff == "cubic")
- BSSRDF = Color * bssrdf("cubic",
- Normal,
- Scale * Radius,
- Color,
- "texture_blur",
- TextureBlur,
- "sharpness",
- Sharpness);
- else if (falloff == "burley")
- BSSRDF = Color * bssrdf("burley", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
- else
- BSSRDF = Color *
- bssrdf("random_walk", Normal, Scale * Radius, Color, "texture_blur", TextureBlur);
+ BSSRDF = Color *
+ bssrdf(method, Normal, Scale * Radius, Color, "ior", IOR, "anisotropy", Anisotropy);
}
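
Both shaders now pass `ior` and `anisotropy` as name/value keyword pairs to `bssrdf()`, matched against the `CLOSURE_FLOAT_KEYPARAM` table added to `osl_bssrdf.cpp` earlier in this diff. A hypothetical sketch of that matching mechanism (names and table layout are illustrative, not the OSL implementation):

```cpp
#include <cstdio>
#include <cstring>

// One table row per keyword parameter, pointing at the closure field to fill.
struct KeyParam {
  const char *name;
  float *dest;
};

void apply_keyparams(KeyParam *table, int n, const char **names, const float *vals, int m)
{
  for (int i = 0; i < m; i++)
    for (int j = 0; j < n; j++)
      if (std::strcmp(names[i], table[j].name) == 0)
        *table[j].dest = vals[i];  // copy matched value into the closure field
}

int main()
{
  float ior = 1.4f, anisotropy = 0.0f;  // defaults, as in the OSL shaders above
  KeyParam table[] = {{"ior", &ior}, {"anisotropy", &anisotropy}};
  const char *names[] = {"ior", "anisotropy"};
  const float vals[] = {1.33f, 0.2f};
  apply_keyparams(table, 2, names, vals, 2);
  std::printf("ior=%.2f anisotropy=%.2f\n", ior, anisotropy);
  return 0;
}
```
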
diff --git a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h b/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
deleted file mode 100644
index 437a5c9581b..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_adjust_samples.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_adjust_samples(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h) {
- int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
- int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
- int buffer_offset = (kernel_split_params.tile.offset + x +
- y * kernel_split_params.tile.stride) *
- kernel_data.film.pass_stride;
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- int sample = kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples;
- if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
- buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
- float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count];
- if (sample_multiplier != 1.0f) {
- kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
- }
- }
- else {
- kernel_adaptive_post_adjust(kg, buffer, sample / (sample - 1.0f));
- }
- }
-}
-
-CCL_NAMESPACE_END
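
The deleted kernel rescaled converged pixels so that a buffer averaged over the full sample count stays correct when adaptive sampling stopped some pixels early. A worked sketch of just that arithmetic, with made-up numbers:

```cpp
#include <cstdio>

int main()
{
  // A pixel that converged after 64 of 128 samples has its accumulated
  // passes scaled by 128/64 = 2, so all pixels average as if they had
  // received the full sample count.
  const float full_samples = 128.0f;
  const float pixel_samples = 64.0f;  // stored (negated) in the sample-count pass
  const float multiplier = full_samples / pixel_samples;

  float accumulated_pass = 12.5f;  // some accumulated pass value
  if (multiplier != 1.0f)
    accumulated_pass *= multiplier;  // role of kernel_adaptive_post_adjust()

  std::printf("adjusted pass = %.2f\n", accumulated_pass);  // 25.00
  return 0;
}
```
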
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h b/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
deleted file mode 100644
index 93f41f7ced4..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_filter_x.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_filter_x(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.h &&
- kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
- kernel_data.integrator.adaptive_min_samples) {
- int y = kernel_split_params.tile.y + pixel_index;
- kernel_do_adaptive_filter_x(kg, y, &kernel_split_params.tile);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h b/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
deleted file mode 100644
index eca53d079ec..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_filter_y.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_filter_y(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.w &&
- kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
- kernel_data.integrator.adaptive_min_samples) {
- int x = kernel_split_params.tile.x + pixel_index;
- kernel_do_adaptive_filter_y(kg, x, &kernel_split_params.tile);
- }
-}
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_adaptive_stopping.h b/intern/cycles/kernel/split/kernel_adaptive_stopping.h
deleted file mode 100644
index c8eb1ebd705..00000000000
--- a/intern/cycles/kernel/split/kernel_adaptive_stopping.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright 2019 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_adaptive_stopping(KernelGlobals *kg)
-{
- int pixel_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (pixel_index < kernel_split_params.tile.w * kernel_split_params.tile.h &&
- kernel_split_params.tile.start_sample + kernel_split_params.tile.num_samples >=
- kernel_data.integrator.adaptive_min_samples) {
- int x = kernel_split_params.tile.x + pixel_index % kernel_split_params.tile.w;
- int y = kernel_split_params.tile.y + pixel_index / kernel_split_params.tile.w;
- int buffer_offset = (kernel_split_params.tile.offset + x +
- y * kernel_split_params.tile.stride) *
- kernel_data.film.pass_stride;
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- kernel_do_adaptive_stopping(kg,
- buffer,
- kernel_split_params.tile.start_sample +
- kernel_split_params.tile.num_samples - 1);
- }
-}
-CCL_NAMESPACE_END
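
These deleted adaptive kernels all share the same tile addressing: a flat work-item index is split into tile-local x/y and converted to a float-buffer offset via the tile stride and the per-pixel pass stride. A standalone sketch of that arithmetic with illustrative values:

```cpp
#include <cstdio>

int main()
{
  const int tile_x = 8, tile_y = 16, tile_w = 32;  // tile placement and width
  const int tile_offset = 0, tile_stride = 256;    // tile origin and row stride
  const int pass_stride = 4;                       // floats per pixel

  const int pixel_index = 70;  // ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0)
  const int x = tile_x + pixel_index % tile_w;
  const int y = tile_y + pixel_index / tile_w;
  const int buffer_offset = (tile_offset + x + y * tile_stride) * pass_stride;

  std::printf("pixel %d -> (%d, %d), buffer offset %d\n", pixel_index, x, y, buffer_offset);
  return 0;
}
```
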
diff --git a/intern/cycles/kernel/split/kernel_branched.h b/intern/cycles/kernel/split/kernel_branched.h
deleted file mode 100644
index 45f5037d321..00000000000
--- a/intern/cycles/kernel/split/kernel_branched.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#ifdef __BRANCHED_PATH__
-
-/* sets up the various state needed to do an indirect loop */
-ccl_device_inline void kernel_split_branched_path_indirect_loop_init(KernelGlobals *kg,
- int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- /* save a copy of the state to restore later */
-# define BRANCHED_STORE(name) branched_state->name = kernel_split_state.name[ray_index];
-
- BRANCHED_STORE(path_state);
- BRANCHED_STORE(throughput);
- BRANCHED_STORE(ray);
- BRANCHED_STORE(isect);
- BRANCHED_STORE(ray_state);
-
- *kernel_split_sd(branched_state_sd, ray_index) = *kernel_split_sd(sd, ray_index);
- for (int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) {
- kernel_split_sd(branched_state_sd, ray_index)->closure[i] =
- kernel_split_sd(sd, ray_index)->closure[i];
- }
-
-# undef BRANCHED_STORE
-
- /* Set loop counters to initial position. */
- branched_state->next_closure = 0;
- branched_state->next_sample = 0;
-}
-
-/* ends an indirect loop and restores the previous state */
-ccl_device_inline void kernel_split_branched_path_indirect_loop_end(KernelGlobals *kg,
- int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- /* restore state */
-# define BRANCHED_RESTORE(name) kernel_split_state.name[ray_index] = branched_state->name;
-
- BRANCHED_RESTORE(path_state);
- BRANCHED_RESTORE(throughput);
- BRANCHED_RESTORE(ray);
- BRANCHED_RESTORE(isect);
- BRANCHED_RESTORE(ray_state);
-
- *kernel_split_sd(sd, ray_index) = *kernel_split_sd(branched_state_sd, ray_index);
- for (int i = 0; i < kernel_split_sd(branched_state_sd, ray_index)->num_closure; i++) {
- kernel_split_sd(sd, ray_index)->closure[i] =
- kernel_split_sd(branched_state_sd, ray_index)->closure[i];
- }
-
-# undef BRANCHED_RESTORE
-
- /* leave indirect loop */
- REMOVE_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT);
-}
-
-ccl_device_inline bool kernel_split_branched_indirect_start_shared(KernelGlobals *kg,
- int ray_index)
-{
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- int inactive_ray = dequeue_ray_index(QUEUE_INACTIVE_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- kernel_split_params.queue_index);
-
- if (!IS_STATE(ray_state, inactive_ray, RAY_INACTIVE)) {
- return false;
- }
-
-# define SPLIT_DATA_ENTRY(type, name, num) \
- if (num) { \
- kernel_split_state.name[inactive_ray] = kernel_split_state.name[ray_index]; \
- }
- SPLIT_DATA_ENTRIES_BRANCHED_SHARED
-# undef SPLIT_DATA_ENTRY
-
- *kernel_split_sd(sd, inactive_ray) = *kernel_split_sd(sd, ray_index);
- for (int i = 0; i < kernel_split_sd(sd, ray_index)->num_closure; i++) {
- kernel_split_sd(sd, inactive_ray)->closure[i] = kernel_split_sd(sd, ray_index)->closure[i];
- }
-
- kernel_split_state.branched_state[inactive_ray].shared_sample_count = 0;
- kernel_split_state.branched_state[inactive_ray].original_ray = ray_index;
- kernel_split_state.branched_state[inactive_ray].waiting_on_shared_samples = false;
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- PathRadiance *inactive_L = &kernel_split_state.path_radiance[inactive_ray];
-
- path_radiance_init(kg, inactive_L);
- path_radiance_copy_indirect(inactive_L, L);
-
- ray_state[inactive_ray] = RAY_REGENERATED;
- ADD_RAY_FLAG(ray_state, inactive_ray, RAY_BRANCHED_INDIRECT_SHARED);
- ADD_RAY_FLAG(ray_state, inactive_ray, IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT));
-
- atomic_fetch_and_inc_uint32(
- (ccl_global uint *)&kernel_split_state.branched_state[ray_index].shared_sample_count);
-
- return true;
-}
-
-/* bounce off surface and integrate indirect light */
-ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
- KernelGlobals *kg,
- int ray_index,
- float num_samples_adjust,
- ShaderData *saved_sd,
- bool reset_path_state,
- bool wait_for_shared)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- ShaderData *sd = saved_sd;
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- float3 throughput = branched_state->throughput;
- ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
-
- float sum_sample_weight = 0.0f;
-# ifdef __DENOISING_FEATURES__
- if (ps->denoising_feature_weight > 0.0f) {
- for (int i = 0; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- /* transparency is not handled here, but in outer loop */
- if (!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
- continue;
- }
-
- sum_sample_weight += sc->sample_weight;
- }
- }
- else {
- sum_sample_weight = 1.0f;
- }
-# endif /* __DENOISING_FEATURES__ */
-
- for (int i = branched_state->next_closure; i < sd->num_closure; i++) {
- const ShaderClosure *sc = &sd->closure[i];
-
- if (!CLOSURE_IS_BSDF(sc->type))
- continue;
- /* transparency is not handled here, but in outer loop */
- if (sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
- continue;
-
- int num_samples;
-
- if (CLOSURE_IS_BSDF_DIFFUSE(sc->type))
- num_samples = kernel_data.integrator.diffuse_samples;
- else if (CLOSURE_IS_BSDF_BSSRDF(sc->type))
- num_samples = 1;
- else if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
- num_samples = kernel_data.integrator.glossy_samples;
- else
- num_samples = kernel_data.integrator.transmission_samples;
-
- num_samples = ceil_to_int(num_samples_adjust * num_samples);
-
- float num_samples_inv = num_samples_adjust / num_samples;
-
- for (int j = branched_state->next_sample; j < num_samples; j++) {
- if (reset_path_state) {
- *ps = branched_state->path_state;
- }
-
- ps->rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
-
- ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
- *tp = throughput;
-
- ccl_global Ray *bsdf_ray = &kernel_split_state.ray[ray_index];
-
- if (!kernel_branched_path_surface_bounce(
- kg, sd, sc, j, num_samples, tp, ps, &L->state, bsdf_ray, sum_sample_weight)) {
- continue;
- }
-
- ps->rng_hash = branched_state->path_state.rng_hash;
-
- /* update state for next iteration */
- branched_state->next_closure = i;
- branched_state->next_sample = j + 1;
-
- /* start the indirect path */
- *tp *= num_samples_inv;
-
- if (kernel_split_branched_indirect_start_shared(kg, ray_index)) {
- continue;
- }
-
- return true;
- }
-
- branched_state->next_sample = 0;
- }
-
- branched_state->next_closure = sd->num_closure;
-
- if (wait_for_shared) {
- branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
- if (branched_state->waiting_on_shared_samples) {
- return true;
- }
- }
-
- return false;
-}
-
-#endif /* __BRANCHED_PATH__ */
-
-CCL_NAMESPACE_END
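
The `BRANCHED_STORE`/`BRANCHED_RESTORE` macros above keep the save and restore lists in lockstep by expanding one field-name macro per member and undefining it afterwards. A minimal host-side re-creation of the pattern (struct and fields are placeholders):

```cpp
#include <cstdio>

struct State {
  int path_state;
  float throughput;
};

State live, saved;

void save_state()
{
#define BRANCHED_STORE(name) saved.name = live.name;
  BRANCHED_STORE(path_state)
  BRANCHED_STORE(throughput)
#undef BRANCHED_STORE
}

void restore_state()
{
#define BRANCHED_RESTORE(name) live.name = saved.name;
  BRANCHED_RESTORE(path_state)
  BRANCHED_RESTORE(throughput)
#undef BRANCHED_RESTORE
}

int main()
{
  live = {1, 0.5f};
  save_state();
  live = {9, 0.0f};  // the indirect bounce scribbles over the live state
  restore_state();
  std::printf("%d %.1f\n", live.path_state, live.throughput);  // 1 0.5
  return 0;
}
```
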
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
deleted file mode 100644
index b96feca582f..00000000000
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of rays that hit the background (scene_intersect
- * kernel), and for the rays of state RAY_UPDATE_BUFFER it updates the ray's
- * accumulated radiance in the output buffer. This kernel also takes care of
- * rays that have been determined to be regenerated.
- *
- * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel.
- *
- * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
- * will be eventually set to RAY_TO_REGENERATE state in this kernel.
- * Finally all rays of ray_state RAY_TO_REGENERATE will be regenerated and put
- * in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * State of queues when this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
- * RAY_REGENERATED rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty.
- */
-ccl_device void kernel_buffer_update(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (ray_index == 0) {
- /* We will empty this queue in this kernel. */
- kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- }
- char enqueue_flag = 0;
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (ray_index != QUEUE_EMPTY_SLOT) {
- ccl_global char *ray_state = kernel_split_state.ray_state;
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- bool ray_was_updated = false;
-
- if (IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- ray_was_updated = true;
- uint sample = state->sample;
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- /* accumulate result in output buffer */
- kernel_write_result(kg, buffer, sample, L);
-
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
- }
-
- if (kernel_data.film.cryptomatte_passes) {
- /* Make sure no thread is writing to the buffers. */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ray_was_updated && state->sample - 1 == kernel_data.integrator.aa_samples) {
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
- ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
- kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
- }
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
- /* We have completed the current work, so get the next work. */
- ccl_global uint *work_pools = kernel_split_params.work_pools;
- uint total_work_size = kernel_split_params.total_work_size;
- uint work_index;
-
- if (!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
- /* If work is invalid, this means no more work is available and the thread may exit */
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
- ccl_global WorkTile *tile = &kernel_split_params.tile;
- uint x, y, sample;
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- /* Store buffer offset for writing to passes. */
- uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
- kernel_split_state.buffer_offset[ray_index] = buffer_offset;
-
- /* Initialize random numbers and ray. */
- uint rng_hash;
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, ray);
-
- if (ray->t != 0.0f) {
- /* Initialize throughput, path radiance, Ray, PathState;
- * These rays proceed with path-iteration.
- */
- *throughput = make_float3(1.0f, 1.0f, 1.0f);
- path_radiance_init(kg, L);
- path_state_init(kg,
- AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
- state,
- rng_hash,
- sample,
- ray);
-#ifdef __SUBSURFACE__
- kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
-#endif
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- enqueue_flag = 1;
- }
- else {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
- }
- }
- }
- }
-
- /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS;
- * these rays will be made active during the next scene_intersect kernel.
- */
- enqueue_ray_index_local(ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-}
-
-CCL_NAMESPACE_END
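
For reference, the per-ray state machine that kernel_buffer_update drives can be summarized with a minimal host-side C++ sketch. The enum values mirror the Cycles names, but fetch_next_work() and the serial driver loop are illustrative stand-ins for get_next_work() and the GPU dispatch, not the actual API:

    #include <cstdio>
    #include <vector>

    /* Illustrative subset of the split-kernel ray states. */
    enum RayState { RAY_ACTIVE, RAY_UPDATE_BUFFER, RAY_TO_REGENERATE, RAY_REGENERATED, RAY_INACTIVE };

    /* Hypothetical stand-in for get_next_work(): hands out two work items. */
    static bool fetch_next_work(int *work_index)
    {
      static int remaining = 2;
      if (remaining == 0)
        return false;
      *work_index = --remaining;
      return true;
    }

    int main()
    {
      std::vector<RayState> state = {RAY_UPDATE_BUFFER, RAY_TO_REGENERATE, RAY_TO_REGENERATE};
      for (size_t i = 0; i < state.size(); i++) {
        if (state[i] == RAY_UPDATE_BUFFER) {
          /* The real kernel writes radiance via kernel_write_result() here. */
          state[i] = RAY_TO_REGENERATE;
        }
        if (state[i] == RAY_TO_REGENERATE) {
          int work;
          /* Regenerate the ray from new work, or retire the thread. */
          state[i] = fetch_next_work(&work) ? RAY_REGENERATED : RAY_INACTIVE;
        }
        printf("ray %zu ends in state %d\n", i, (int)state[i]);
      }
      return 0;
    }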
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
deleted file mode 100644
index 2f83a10316d..00000000000
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel initializes structures needed in the path-iteration kernels.
- *
- * Note on queues:
- * All slots in the queues are initialized to QUEUE_EMPTY_SLOT, and the
- * number of elements in each queue is initialized to 0.
- */
-
-#ifndef __KERNEL_CPU__
-ccl_device void kernel_data_init(
-#else
-void KERNEL_FUNCTION_FULL_NAME(data_init)(
-#endif
- KernelGlobals *kg,
- ccl_constant KernelData *data,
- ccl_global void *split_data_buffer,
- int num_elements,
- ccl_global char *ray_state,
-
-#ifdef __KERNEL_OPENCL__
- KERNEL_BUFFER_PARAMS,
-#endif
-
- int start_sample,
- int end_sample,
- int sx,
- int sy,
- int sw,
- int sh,
- int offset,
- int stride,
- ccl_global int *Queue_index, /* Tracks the number of elements in queues */
- int queuesize, /* size (capacity) of the queue */
- ccl_global char *use_queues_flag, /* flag to decide if scene-intersect kernel should use queues
- to fetch ray index */
- ccl_global unsigned int *work_pools, /* Work pool for each work group */
- unsigned int num_samples,
- ccl_global float *buffer)
-{
-#ifdef KERNEL_STUB
- STUB_ASSERT(KERNEL_ARCH, data_init);
-#else
-
-# ifdef __KERNEL_OPENCL__
- kg->data = data;
-# endif
-
- kernel_split_params.tile.x = sx;
- kernel_split_params.tile.y = sy;
- kernel_split_params.tile.w = sw;
- kernel_split_params.tile.h = sh;
-
- kernel_split_params.tile.start_sample = start_sample;
- kernel_split_params.tile.num_samples = num_samples;
-
- kernel_split_params.tile.offset = offset;
- kernel_split_params.tile.stride = stride;
-
- kernel_split_params.tile.buffer = buffer;
-
- kernel_split_params.total_work_size = sw * sh * num_samples;
-
- kernel_split_params.work_pools = work_pools;
-
- kernel_split_params.queue_index = Queue_index;
- kernel_split_params.queue_size = queuesize;
- kernel_split_params.use_queues_flag = use_queues_flag;
-
- split_data_init(kg, &kernel_split_state, num_elements, split_data_buffer, ray_state);
-
-# ifdef __KERNEL_OPENCL__
- kernel_set_buffer_pointers(kg, KERNEL_BUFFER_ARGS);
- kernel_set_buffer_info(kg);
-# endif
-
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- /* Initialize queue data and queue index. */
- if (thread_index < queuesize) {
- for (int i = 0; i < NUM_QUEUES; i++) {
- kernel_split_state.queue_data[i * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
- }
- }
-
- if (thread_index == 0) {
- for (int i = 0; i < NUM_QUEUES; i++) {
- Queue_index[i] = 0;
- }
-
- /* The scene-intersect kernel should not use the queues the very first
- * time, since the queues would be empty.
- */
- *use_queues_flag = 0;
- }
-#endif /* KERNEL_STUB */
-}
-
-CCL_NAMESPACE_END
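
The queue layout that data_init establishes is easy to model on the host. This sketch assumes the flat layout implied by the i * queuesize + thread_index indexing above (one contiguous segment per queue, one counter per queue); the concrete sizes are made up for illustration:

    #include <cstdio>
    #include <vector>

    int main()
    {
      const int NUM_QUEUES = 2;
      const int queue_size = 4;
      const int QUEUE_EMPTY_SLOT = -1;

      /* All queues live back to back in one array; counters start at 0. */
      std::vector<int> queue_data(NUM_QUEUES * queue_size, QUEUE_EMPTY_SLOT);
      std::vector<int> queue_index(NUM_QUEUES, 0);

      /* Slot t of queue q is at flat index q * queue_size + t. */
      printf("queue 1, slot 2 -> flat index %d, value %d, count %d\n",
             1 * queue_size + 2, queue_data[1 * queue_size + 2], queue_index[1]);
      return 0;
    }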
diff --git a/intern/cycles/kernel/split/kernel_direct_lighting.h b/intern/cycles/kernel/split/kernel_direct_lighting.h
deleted file mode 100644
index 3be2b35812f..00000000000
--- a/intern/cycles/kernel/split/kernel_direct_lighting.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of direct lighting logic.
- * However, the "shadow ray cast" part of direct lighting is handled
- * in the next kernel.
- *
- * This kernel determines the rays for which a shadow_blocked() function
- * associated with direct lighting should be executed. Those rays are
- * marked with the flag RAY_SHADOW_RAY_CAST_DL and enqueued into the
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue.
- *
- * Note on Queues:
- * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue
- * and processes only rays of state RAY_ACTIVE; if a ray needs to execute
- * the corresponding shadow_blocked part after direct lighting, the ray is
- * marked with the RAY_SHADOW_RAY_CAST_DL flag.
- *
- * State of queues when this kernel is called:
- * - The state of the QUEUE_ACTIVE_AND_REGENERATED_RAYS and
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queues will be the same before and
- * after this kernel call.
- * - Before this kernel call, the QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will
- * be empty; after the call it will be filled with rays for which a
- * shadow_blocked() function must be executed.
- */
-ccl_device void kernel_direct_lighting(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- char enqueue_flag = 0;
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- /* direct lighting */
-#ifdef __EMISSION__
- bool flag = (kernel_data.integrator.use_direct_light && (sd->flag & SD_BSDF_HAS_EVAL));
-
-# ifdef __BRANCHED_PATH__
- if (flag && kernel_data.integrator.branched) {
- flag = false;
- enqueue_flag = 1;
- }
-# endif /* __BRANCHED_PATH__ */
-
-# ifdef __SHADOW_TRICKS__
- if (flag && state->flag & PATH_RAY_SHADOW_CATCHER) {
- flag = false;
- enqueue_flag = 1;
- }
-# endif /* __SHADOW_TRICKS__ */
-
- if (flag) {
- /* Sample illumination from lights to find path contribution. */
- float light_u, light_v;
- path_state_rng_2D(kg, state, PRNG_LIGHT_U, &light_u, &light_v);
- float terminate = path_state_rng_light_termination(kg, state);
-
- LightSample ls;
- if (light_sample(kg, -1, light_u, light_v, sd->time, sd->P, state->bounce, &ls)) {
- Ray light_ray;
- light_ray.time = sd->time;
-
- BsdfEval L_light;
- bool is_lamp;
- if (direct_emission(kg,
- sd,
- AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
- &ls,
- state,
- &light_ray,
- &L_light,
- &is_lamp,
- terminate)) {
- /* Write intermediate data to global memory to access from
- * the next kernel.
- */
- kernel_split_state.light_ray[ray_index] = light_ray;
- kernel_split_state.bsdf_eval[ray_index] = L_light;
- kernel_split_state.is_lamp[ray_index] = is_lamp;
- /* Mark ray state for next shadow kernel. */
- enqueue_flag = 1;
- }
- }
- }
-#endif /* __EMISSION__ */
- }
-
-#ifdef __EMISSION__
- /* Enqueue RAY_SHADOW_RAY_CAST_DL rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_SHADOW_RAY_CAST_DL_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-#endif
-
-#ifdef __BRANCHED_PATH__
- /* Enqueue RAY_LIGHT_INDIRECT_NEXT_ITER rays.
- * This is the last kernel before next_iteration_setup that uses local
- * atomics, so we do this here.
- */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- enqueue_ray_index_local(
- ray_index,
- QUEUE_LIGHT_INDIRECT_ITER,
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER),
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
-#endif /* __BRANCHED_PATH__ */
-}
-
-CCL_NAMESPACE_END
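
The enqueue_ray_index_local() calls used throughout these kernels implement flagged compaction: every thread decides whether its ray belongs in a queue, then flagged indices are packed into contiguous slots via an atomic counter. A simplified single-level CPU analogy with std::atomic (the helper name is mine; the real kernel splits the counter into local and global stages, modeled after kernel_queue_enqueue below):

    #include <atomic>
    #include <cstdio>
    #include <vector>

    /* Append every flagged ray index to the queue; the atomic counter plays
     * the role of the per-queue index that the kernel bumps atomically. */
    static void enqueue_flagged(const std::vector<int> &flags,
                                std::vector<int> &queue,
                                std::atomic<int> &queue_count)
    {
      for (int ray = 0; ray < (int)flags.size(); ray++) {
        if (flags[ray]) {
          int slot = queue_count.fetch_add(1); /* reserve one slot */
          queue[slot] = ray;
        }
      }
    }

    int main()
    {
      std::vector<int> flags = {0, 1, 1, 0, 1}; /* e.g. needs a DL shadow ray */
      std::vector<int> queue(flags.size(), -1);
      std::atomic<int> count{0};

      enqueue_flagged(flags, queue, count);

      printf("enqueued %d rays:", count.load());
      for (int i = 0; i < count.load(); i++)
        printf(" %d", queue[i]);
      printf("\n");
      return 0;
    }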
diff --git a/intern/cycles/kernel/split/kernel_do_volume.h b/intern/cycles/kernel/split/kernel_do_volume.h
deleted file mode 100644
index 1775e870f07..00000000000
--- a/intern/cycles/kernel/split/kernel_do_volume.h
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#if defined(__BRANCHED_PATH__) && defined(__VOLUME__)
-
-ccl_device_inline void kernel_split_branched_path_volume_indirect_light_init(KernelGlobals *kg,
- int ray_index)
-{
- kernel_split_branched_path_indirect_loop_init(kg, ray_index);
-
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT);
-}
-
-ccl_device_noinline bool kernel_split_branched_path_volume_indirect_light_iter(KernelGlobals *kg,
- int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- /* GPU: no decoupled ray marching, scatter probabilistically. */
- int num_samples = kernel_data.integrator.volume_samples;
- float num_samples_inv = 1.0f / num_samples;
-
- Ray volume_ray = branched_state->ray;
- volume_ray.t = (!IS_STATE(&branched_state->ray_state, 0, RAY_HIT_BACKGROUND)) ?
- branched_state->isect.t :
- FLT_MAX;
-
- float step_size = volume_stack_step_size(kg, branched_state->path_state.volume_stack);
-
- for (int j = branched_state->next_sample; j < num_samples; j++) {
- ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
- *ps = branched_state->path_state;
-
- ccl_global Ray *pray = &kernel_split_state.ray[ray_index];
- *pray = branched_state->ray;
-
- ccl_global float3 *tp = &kernel_split_state.throughput[ray_index];
- *tp = branched_state->throughput * num_samples_inv;
-
- /* branch RNG state */
- path_state_branch(ps, j, num_samples);
-
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, ps, sd, &volume_ray, L, tp, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, sd, emission_sd, *tp, &branched_state->path_state, L);
-
- /* indirect light bounce */
- if (!kernel_path_volume_bounce(kg, sd, tp, ps, &L->state, pray)) {
- continue;
- }
-
- /* start the indirect path */
- branched_state->next_closure = 0;
- branched_state->next_sample = j + 1;
-
- /* Attempting to share too many samples is slow for volumes, as it causes
- * us to loop here more and make many calls to kernel_volume_integrate,
- * which evaluates shaders. The many expensive shader evaluations cause the
- * workload to become unbalanced and many threads to become idle in this
- * kernel. Limiting the number of shared samples here helps quite a lot.
- */
- if (branched_state->shared_sample_count < 2) {
- if (kernel_split_branched_indirect_start_shared(kg, ray_index)) {
- continue;
- }
- }
-
- return true;
- }
-# endif
- }
-
- branched_state->next_sample = num_samples;
-
- branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
- if (branched_state->waiting_on_shared_samples) {
- return true;
- }
-
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
-
- /* todo: avoid this calculation using decoupled ray marching */
- float3 throughput = kernel_split_state.throughput[ray_index];
- kernel_volume_shadow(
- kg, emission_sd, &kernel_split_state.path_state[ray_index], &volume_ray, &throughput);
- kernel_split_state.throughput[ray_index] = throughput;
-
- return false;
-}
-
-#endif /* __BRANCHED_PATH__ && __VOLUME__ */
-
-ccl_device void kernel_do_volume(KernelGlobals *kg)
-{
-#ifdef __VOLUME__
- /* We will empty this queue in this kernel. */
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
-# ifdef __BRANCHED_PATH__
- kernel_split_params.queue_index[QUEUE_VOLUME_INDIRECT_ITER] = 0;
-# endif /* __BRANCHED_PATH__ */
- }
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- if (*kernel_split_params.use_queues_flag) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- }
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- bool hit = !IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND);
-
- /* Sanitize volume stack. */
- if (!hit) {
- kernel_volume_clean_stack(kg, state->volume_stack);
- }
- /* volume attenuation, emission, scatter */
- if (state->volume_stack[0].shader != SHADER_NONE) {
- Ray volume_ray = *ray;
- volume_ray.t = (hit) ? isect->t : FLT_MAX;
-
-# ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched ||
- IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-# endif /* __BRANCHED_PATH__ */
- float step_size = volume_stack_step_size(kg, state->volume_stack);
-
- {
- /* integrate along volume segment with distance sampling */
- VolumeIntegrateResult result = kernel_volume_integrate(
- kg, state, sd, &volume_ray, L, throughput, step_size);
-
-# ifdef __VOLUME_SCATTER__
- if (result == VOLUME_PATH_SCATTERED) {
- /* direct lighting */
- kernel_path_volume_connect_light(kg, sd, emission_sd, *throughput, state, L);
-
- /* indirect light bounce */
- if (kernel_path_volume_bounce(kg, sd, throughput, state, &L->state, ray)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- kernel_split_path_end(kg, ray_index);
- }
- }
-# endif /* __VOLUME_SCATTER__ */
- }
-
-# ifdef __BRANCHED_PATH__
- }
- else {
- kernel_split_branched_path_volume_indirect_light_init(kg, ray_index);
-
- if (kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif /* __BRANCHED_PATH__ */
- }
- }
-
-# ifdef __BRANCHED_PATH__
- /* iter loop */
- ray_index = get_ray_index(kg,
- ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
- QUEUE_VOLUME_INDIRECT_ITER,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (IS_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER)) {
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
- path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
-
- if (kernel_split_branched_path_volume_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif /* __BRANCHED_PATH__ */
-
-#endif /* __VOLUME__ */
-}
-
-CCL_NAMESPACE_END
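
A detail worth noting in the branched volume loop above: each of the num_samples branches carries throughput * num_samples_inv, so together the branches contribute what one full-throughput sample would, keeping the estimator's scale unchanged. A toy C++ version of that splitting, where scatter() is a made-up stand-in for the volume integration step:

    #include <cstdio>

    /* Made-up stand-in for one volume-integration sub-sample: returns the
     * contribution added along the segment for the given throughput. */
    static float scatter(float throughput)
    {
      return throughput * 0.5f; /* pretend half the energy scatters */
    }

    int main()
    {
      const int num_samples = 4;
      const float throughput = 1.0f;
      const float num_samples_inv = 1.0f / num_samples;

      float total = 0.0f;
      for (int j = 0; j < num_samples; j++) {
        /* Each branch gets an equal share of the parent throughput. */
        total += scatter(throughput * num_samples_inv);
      }
      printf("total %f vs single full-throughput sample %f\n",
             total, scatter(throughput));
      return 0;
    }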
diff --git a/intern/cycles/kernel/split/kernel_enqueue_inactive.h b/intern/cycles/kernel/split/kernel_enqueue_inactive.h
deleted file mode 100644
index 745313f89f1..00000000000
--- a/intern/cycles/kernel/split/kernel_enqueue_inactive.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_enqueue_inactive(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
-#ifdef __BRANCHED_PATH__
- /* Enqueue RAY_INACTIVE rays into QUEUE_INACTIVE_RAYS queue. */
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- char enqueue_flag = 0;
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_INACTIVE)) {
- enqueue_flag = 1;
- }
-
- enqueue_ray_index_local(ray_index,
- QUEUE_INACTIVE_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-#endif /* __BRANCHED_PATH__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
deleted file mode 100644
index 61722840b0b..00000000000
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of the logic to process holdout materials,
- * indirect primitive emission, BSDF blurring, probabilistic path
- * termination and AO.
- *
- * This kernel determines the rays for which a shadow_blocked() function
- * associated with AO should be executed. Those rays are marked with the
- * flag RAY_SHADOW_RAY_CAST_AO and enqueued into the
- * QUEUE_SHADOW_RAY_CAST_AO_RAYS queue.
- *
- * The ray state of rays terminated in this kernel is changed to RAY_UPDATE_BUFFER.
- *
- * Note on Queues:
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
- * and processes only the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and
- * reach RAY_UPDATE_BUFFER state. These rays are enqueued into
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will still be present
- * in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has
- * been changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and
- * RAY_REGENERATED rays
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE rays.
- * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
- * RAY_REGENERATED and RAY_UPDATE_BUFFER rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * - QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with
- * flag RAY_SHADOW_RAY_CAST_AO
- */
-
-ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
- KernelGlobals *kg, ccl_local_param BackgroundAOLocals *locals)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- locals->queue_atomics_bg = 0;
- locals->queue_atomics_ao = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
-#ifdef __AO__
- char enqueue_flag = 0;
-#endif
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index != QUEUE_EMPTY_SLOT) {
- ccl_global PathState *state = 0x0;
- float3 throughput;
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
- throughput = kernel_split_state.throughput[ray_index];
- state = &kernel_split_state.path_state[ray_index];
-
- if (!kernel_path_shader_apply(kg, sd, state, ray, throughput, emission_sd, L, buffer)) {
- kernel_split_path_end(kg, ray_index);
- }
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- /* Path termination. This is a strange place to put the termination; it is
- * mainly due to the mixed-in MIS that we use. It gives too many unneeded
- * shader evaluations; we only need emission if we are going to terminate.
- */
- float probability = path_state_continuation_probability(kg, state, throughput);
-
- if (probability == 0.0f) {
- kernel_split_path_end(kg, ray_index);
- }
- else if (probability < 1.0f) {
- float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
- if (terminate >= probability) {
- kernel_split_path_end(kg, ray_index);
- }
- else {
- kernel_split_state.throughput[ray_index] = throughput / probability;
- }
- }
-
-#ifdef __DENOISING_FEATURES__
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- kernel_update_denoising_features(kg, sd, state, L);
- }
-#endif
- }
-
-#ifdef __AO__
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- /* ambient occlusion */
- if (kernel_data.integrator.use_ambient_occlusion) {
- enqueue_flag = 1;
- }
- }
-#endif /* __AO__ */
- }
-
-#ifdef __AO__
- /* Enqueue to-shadow-ray-cast rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_SHADOW_RAY_CAST_AO_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- &locals->queue_atomics_ao,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-#endif
-}
-
-CCL_NAMESPACE_END
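
The probabilistic path termination above is standard Russian roulette: a path is killed with probability 1 - p, and survivors are divided by p so the estimator stays unbiased. A small self-contained C++ check of that identity (continuation_probability() here is a trivial clamp, not the heuristic Cycles uses):

    #include <cstdio>
    #include <cstdlib>

    static float continuation_probability(float throughput)
    {
      /* Trivial stand-in: clamp the throughput to [0, 1]. */
      if (throughput < 0.0f) return 0.0f;
      if (throughput > 1.0f) return 1.0f;
      return throughput;
    }

    int main()
    {
      srand(42);
      const int N = 1000000;
      const float throughput = 0.25f;
      double sum = 0.0;

      for (int i = 0; i < N; i++) {
        float p = continuation_probability(throughput);
        float r = (float)rand() / (float)RAND_MAX;
        if (r < p)
          sum += throughput / p; /* survivor, reweighted to stay unbiased */
        /* else: terminated, contributes 0 */
      }
      printf("mean contribution %f (expected %f)\n", sum / N, throughput);
      return 0;
    }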
diff --git a/intern/cycles/kernel/split/kernel_indirect_background.h b/intern/cycles/kernel/split/kernel_indirect_background.h
deleted file mode 100644
index 6d500650cc0..00000000000
--- a/intern/cycles/kernel/split/kernel_indirect_background.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_indirect_background(KernelGlobals *kg)
-{
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- int ray_index;
-
- if (kernel_data.integrator.ao_bounces != INT_MAX) {
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index != QUEUE_EMPTY_SLOT) {
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- if (path_state_ao_bounce(kg, state)) {
- kernel_split_path_end(kg, ray_index);
- }
- }
- }
- }
-
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- if (IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- float3 throughput = kernel_split_state.throughput[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- kernel_path_background(kg, state, ray, throughput, sd, buffer, L);
- kernel_split_path_end(kg, ray_index);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_indirect_subsurface.h b/intern/cycles/kernel/split/kernel_indirect_subsurface.h
deleted file mode 100644
index 3f48f8d6f56..00000000000
--- a/intern/cycles/kernel/split/kernel_indirect_subsurface.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_indirect_subsurface(KernelGlobals *kg)
-{
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index == 0) {
- /* We will empty both queues in this kernel. */
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- }
-
- int ray_index;
- get_ray_index(kg,
- thread_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
-#ifdef __SUBSURFACE__
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
-
- if (IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-
- /* Trace indirect subsurface rays by restarting the loop. This uses less
- * stack memory than invoking kernel_path_indirect.
- */
- if (ss_indirect->num_rays) {
- kernel_path_subsurface_setup_indirect(kg, ss_indirect, state, ray, L, throughput);
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-#endif /* __SUBSURFACE__ */
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_lamp_emission.h b/intern/cycles/kernel/split/kernel_lamp_emission.h
deleted file mode 100644
index 7ecb099208d..00000000000
--- a/intern/cycles/kernel/split/kernel_lamp_emission.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- * It processes rays of state RAY_ACTIVE and RAY_HIT_BACKGROUND.
- * We will empty the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
- */
-ccl_device void kernel_lamp_emission(KernelGlobals *kg)
-{
-#ifndef __VOLUME__
- /* We will empty this queue in this kernel. */
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- }
-#endif
- /* Fetch use_queues_flag. */
- char local_use_queues_flag = *kernel_split_params.use_queues_flag;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (local_use_queues_flag) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
-#ifndef __VOLUME__
- 1
-#else
- 0
-#endif
- );
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
- }
-
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND)) {
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-
- float3 throughput = kernel_split_state.throughput[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- ccl_global Intersection *isect = &kernel_split_state.isect[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- kernel_path_lamp_emission(kg, state, &ray, throughput, isect, sd, L);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_next_iteration_setup.h b/intern/cycles/kernel/split/kernel_next_iteration_setup.h
deleted file mode 100644
index 320f6a414bf..00000000000
--- a/intern/cycles/kernel/split/kernel_next_iteration_setup.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of setting up the ray for the next iteration of
- * path-iteration and accumulating radiance corresponding to AO and
- * direct lighting.
- *
- * The ray state of rays that are terminated in this kernel is changed
- * to RAY_UPDATE_BUFFER.
- *
- * Note on queues:
- * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS
- * and processes only the rays of state RAY_ACTIVE.
- * There are different points in this kernel where a ray may terminate and
- * reach the RAY_UPDATE_BUFFER state. These rays are enqueued into the
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. They will still be present
- * in the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state
- * has been changed to RAY_UPDATE_BUFFER, there is no problem.
- *
- * State of queues when this kernel is called:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
- * RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE,
- * RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays.
- */
-
-#ifdef __BRANCHED_PATH__
-ccl_device_inline void kernel_split_branched_indirect_light_init(KernelGlobals *kg, int ray_index)
-{
- kernel_split_branched_path_indirect_loop_init(kg, ray_index);
-
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT);
-}
-
-ccl_device void kernel_split_branched_transparent_bounce(KernelGlobals *kg, int ray_index)
-{
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
-
-# ifdef __VOLUME__
- if (!(sd->flag & SD_HAS_ONLY_VOLUME)) {
-# endif
- /* continue in case of transparency */
- *throughput *= shader_bsdf_transparency(kg, sd);
-
- if (is_zero(*throughput)) {
- kernel_split_path_end(kg, ray_index);
- return;
- }
-
- /* Update Path State */
- path_state_next(kg, state, LABEL_TRANSPARENT);
-# ifdef __VOLUME__
- }
- else {
- if (!path_state_volume_next(kg, state)) {
- kernel_split_path_end(kg, ray_index);
- return;
- }
- }
-# endif
-
- ray->P = ray_offset(sd->P, -sd->Ng);
- ray->t -= sd->ray_length; /* clipping works through transparent */
-
-# ifdef __RAY_DIFFERENTIALS__
- ray->dP = sd->dP;
- ray->dD.dx = -sd->dI.dx;
- ray->dD.dy = -sd->dI.dy;
-# endif /* __RAY_DIFFERENTIALS__ */
-
-# ifdef __VOLUME__
- /* enter/exit volume */
- kernel_volume_stack_enter_exit(kg, sd, state->volume_stack);
-# endif /* __VOLUME__ */
-}
-#endif /* __BRANCHED_PATH__ */
-
-ccl_device void kernel_next_iteration_setup(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- /* If we are here, the scene-intersect kernel has already been executed
- * at least once. From now on, the scene-intersect kernel may operate on
- * queues to fetch the ray index.
- */
- *kernel_split_params.use_queues_flag = 1;
-
- /* Reset the queue indices of the QUEUE_SHADOW_RAY_CAST_AO_RAYS and
- * QUEUE_SHADOW_RAY_CAST_DL_RAYS queues, which were emptied during the
- * previous kernel.
- */
- kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
- kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
- }
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
-#ifdef __VOLUME__
- /* Reactivate only volume rays here, most surface work was skipped. */
- if (IS_STATE(ray_state, ray_index, RAY_HAS_ONLY_VOLUME)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_ACTIVE);
- }
-#endif
-
- bool active = IS_STATE(ray_state, ray_index, RAY_ACTIVE);
- if (active) {
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
-#ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-#endif
- /* Compute direct lighting and next bounce. */
- if (!kernel_path_surface_bounce(kg, sd, throughput, state, &L->state, ray)) {
- kernel_split_path_end(kg, ray_index);
- }
-#ifdef __BRANCHED_PATH__
- }
- else if (sd->flag & SD_HAS_ONLY_VOLUME) {
- kernel_split_branched_transparent_bounce(kg, ray_index);
- }
- else {
- kernel_split_branched_indirect_light_init(kg, ray_index);
-
- if (kernel_split_branched_path_surface_indirect_light_iter(
- kg, ray_index, 1.0f, kernel_split_sd(branched_state_sd, ray_index), true, true)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
- kernel_split_branched_transparent_bounce(kg, ray_index);
- }
- }
-#endif /* __BRANCHED_PATH__ */
- }
-
- /* Enqueue RAY_UPDATE_BUFFER rays. */
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER) && active,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
-#ifdef __BRANCHED_PATH__
- /* iter loop */
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_LIGHT_INDIRECT_ITER] = 0;
- }
-
- ray_index = get_ray_index(kg,
- ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
- QUEUE_LIGHT_INDIRECT_ITER,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (IS_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER)) {
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
- path_radiance_sum_indirect(L);
- path_radiance_reset_indirect(L);
-
- if (kernel_split_branched_path_surface_indirect_light_iter(
- kg, ray_index, 1.0f, kernel_split_sd(branched_state_sd, ray_index), true, true)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- else {
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
- kernel_split_branched_transparent_bounce(kg, ray_index);
- }
- }
-
-# ifdef __VOLUME__
- /* Enqueue RAY_VOLUME_INDIRECT_NEXT_ITER rays */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- enqueue_ray_index_local(
- ray_index,
- QUEUE_VOLUME_INDIRECT_ITER,
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER),
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
-# endif /* __VOLUME__ */
-
-# ifdef __SUBSURFACE__
- /* Enqueue RAY_SUBSURFACE_INDIRECT_NEXT_ITER rays */
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- enqueue_ray_index_local(
- ray_index,
- QUEUE_SUBSURFACE_INDIRECT_ITER,
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER),
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-# endif /* __SUBSURFACE__ */
-#endif /* __BRANCHED_PATH__ */
-}
-
-CCL_NAMESPACE_END
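
kernel_split_branched_transparent_bounce() above is the usual recipe for continuing through a transparent hit: attenuate the throughput by the surface transparency, move the ray origin just past the hit point so the same surface is not hit again, and shorten the remaining distance by the segment already travelled. A schematic version (the fixed epsilon and the bare float3 struct are simplifications; Cycles uses a robust ray_offset()):

    #include <cstdio>

    struct float3 { float x, y, z; };

    /* Simplified ray_offset(): step a fixed epsilon along -Ng. */
    static float3 offset_past_surface(float3 p, float3 ng)
    {
      const float eps = 1e-4f;
      return {p.x - ng.x * eps, p.y - ng.y * eps, p.z - ng.z * eps};
    }

    int main()
    {
      float3 hit_p = {1.0f, 2.0f, 3.0f};
      float3 normal = {0.0f, 0.0f, 1.0f};
      float throughput = 1.0f;
      float remaining_t = 10.0f; /* ray->t at the hit */
      float ray_length = 4.0f;   /* distance travelled to this hit */
      float transparency = 0.8f;

      throughput *= transparency; /* continue with attenuated energy */
      float3 p = offset_past_surface(hit_p, normal);
      remaining_t -= ray_length;  /* clipping works through transparent */

      printf("throughput %.2f, t %.2f, P (%.4f, %.4f, %.4f)\n",
             throughput, remaining_t, p.x, p.y, p.z);
      return 0;
    }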
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
deleted file mode 100644
index c686f46a0cd..00000000000
--- a/intern/cycles/kernel/split/kernel_path_init.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel initializes structures needed in the path-iteration kernels.
- * It is the first kernel in the ray-tracing logic.
- *
- * The ray state of rays outside the tile boundary will be marked RAY_INACTIVE.
- */
-ccl_device void kernel_path_init(KernelGlobals *kg)
-{
- int ray_index = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0);
-
- /* This is the first assignment to ray_state, so we don't use the
- * ASSIGN_RAY_STATE macro.
- */
- kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
-
- /* Get work. */
- ccl_global uint *work_pools = kernel_split_params.work_pools;
- uint total_work_size = kernel_split_params.total_work_size;
- uint work_index;
-
- if (!get_next_work(kg, work_pools, total_work_size, ray_index, &work_index)) {
- /* No more work, mark ray as inactive */
- kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
-
- return;
- }
-
- ccl_global WorkTile *tile = &kernel_split_params.tile;
- uint x, y, sample;
- get_work_pixel(tile, work_index, &x, &y, &sample);
-
- /* Store buffer offset for writing to passes. */
- uint buffer_offset = (tile->offset + x + y * tile->stride) * kernel_data.film.pass_stride;
- kernel_split_state.buffer_offset[ray_index] = buffer_offset;
-
- /* Initialize random numbers and ray. */
- uint rng_hash;
- kernel_path_trace_setup(kg, sample, x, y, &rng_hash, &kernel_split_state.ray[ray_index]);
-
- if (kernel_split_state.ray[ray_index].t != 0.0f) {
- /* Initialize throughput, path radiance, Ray, PathState;
- * These rays proceed with path-iteration.
- */
- kernel_split_state.throughput[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
- path_radiance_init(kg, &kernel_split_state.path_radiance[ray_index]);
- path_state_init(kg,
- AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]),
- &kernel_split_state.path_state[ray_index],
- rng_hash,
- sample,
- &kernel_split_state.ray[ray_index]);
-#ifdef __SUBSURFACE__
- kernel_path_subsurface_init_indirect(&kernel_split_state.ss_rays[ray_index]);
-#endif
- }
- else {
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
- }
-}
-
-CCL_NAMESPACE_END
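
The buffer_offset computation above, (tile->offset + x + y * tile->stride) * pass_stride, is the whole tile-to-framebuffer mapping. The sketch below pairs it with one plausible decode of work_index into (x, y, sample); the decode order is an assumption for illustration and not necessarily the exact layout of get_work_pixel():

    #include <cstdio>

    int main()
    {
      /* Hypothetical 4x2 tile starting at (16, 8) in the full frame. */
      const int tile_x = 16, tile_y = 8, tile_w = 4, tile_h = 2;
      const int tile_offset = 0, stride = 64, pass_stride = 16;

      /* One plausible decode: sample varies slowest, then row, then column. */
      int work_index = 13;
      int sample = work_index / (tile_w * tile_h);
      int pixel = work_index % (tile_w * tile_h);
      int x = tile_x + pixel % tile_w;
      int y = tile_y + pixel / tile_w;

      /* Same buffer_offset arithmetic as the kernel. */
      int buffer_offset = (tile_offset + x + y * stride) * pass_stride;

      printf("work %d -> sample %d, pixel (%d, %d), buffer offset %d\n",
             work_index, sample, x, y, buffer_offset);
      return 0;
    }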
diff --git a/intern/cycles/kernel/split/kernel_queue_enqueue.h b/intern/cycles/kernel/split/kernel_queue_enqueue.h
deleted file mode 100644
index 2db87f7a671..00000000000
--- a/intern/cycles/kernel/split/kernel_queue_enqueue.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel enqueues rays of different ray states into their
- * appropriate queues:
- *
- * 1. Rays that have been determined to hit the background from the
- * "kernel_scene_intersect" kernel are enqueued in
- * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- * 2. Rays that have been determined to be actively participating in
- * path-iteration will be enqueued into QUEUE_ACTIVE_AND_REGENERATED_RAYS.
- *
- * State of the queues during subsequent calls to this kernel:
- * At entry,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be empty.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will contain RAY_TO_REGENERATE
- * and RAY_UPDATE_BUFFER rays.
- * At exit,
- * - QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays.
- * - QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with
- * RAY_TO_REGENERATE, RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND rays.
- */
-ccl_device void kernel_queue_enqueue(KernelGlobals *kg, ccl_local_param QueueEnqueueLocals *locals)
-{
- /* We have only 2 cases (Hit/Not-Hit) */
- int lidx = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
-
- if (lidx == 0) {
- locals->queue_atomics[0] = 0;
- locals->queue_atomics[1] = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int queue_number = -1;
-
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_UPDATE_BUFFER) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) {
- queue_number = QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS;
- }
- else if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME) ||
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
- queue_number = QUEUE_ACTIVE_AND_REGENERATED_RAYS;
- }
-
- unsigned int my_lqidx;
- if (queue_number != -1) {
- my_lqidx = get_local_queue_index(queue_number, locals->queue_atomics);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- if (lidx == 0) {
- locals->queue_atomics[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = get_global_per_queue_offset(
- QUEUE_ACTIVE_AND_REGENERATED_RAYS, locals->queue_atomics, kernel_split_params.queue_index);
- locals->queue_atomics[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = get_global_per_queue_offset(
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- locals->queue_atomics,
- kernel_split_params.queue_index);
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- unsigned int my_gqidx;
- if (queue_number != -1) {
- my_gqidx = get_global_queue_index(
- queue_number, kernel_split_params.queue_size, my_lqidx, locals->queue_atomics);
- kernel_split_state.queue_data[my_gqidx] = ray_index;
- }
-}
-
-CCL_NAMESPACE_END
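
The barrier-separated phases above deserve spelling out: each thread first takes a slot from a work-group-local counter, one thread then converts every local count into a global base offset (one atomic per queue), and finally each thread writes its ray index at base + local slot. A serial C++ model of the three phases (the group partitioning and sizes are illustrative):

    #include <cstdio>
    #include <vector>

    int main()
    {
      /* Ray -> queue classification for 8 rays in two groups of 4;
       * -1 means "no queue" (e.g. an inactive ray). */
      int queue_of[8] = {0, 1, -1, 0, 1, 1, 0, -1};
      const int num_groups = 2, group_size = 4, num_queues = 2;

      std::vector<int> global_count(num_queues, 0);
      std::vector<int> slot(8, -1); /* final global queue slot per ray */

      for (int g = 0; g < num_groups; g++) {
        /* Phase 1: local atomics, one counter per queue within the group. */
        std::vector<int> local_idx(8, -1), local_count(num_queues, 0);
        for (int t = 0; t < group_size; t++) {
          int ray = g * group_size + t, q = queue_of[ray];
          if (q != -1)
            local_idx[ray] = local_count[q]++;
        }
        /* Phase 2: one thread reserves a global range per queue. */
        std::vector<int> base(num_queues);
        for (int q = 0; q < num_queues; q++) {
          base[q] = global_count[q];
          global_count[q] += local_count[q];
        }
        /* Phase 3: every thread writes at base + local index. */
        for (int t = 0; t < group_size; t++) {
          int ray = g * group_size + t, q = queue_of[ray];
          if (q != -1)
            slot[ray] = base[q] + local_idx[ray];
        }
      }

      for (int ray = 0; ray < 8; ray++)
        printf("ray %d -> queue %d slot %d\n", ray, queue_of[ray], slot[ray]);
      return 0;
    }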
diff --git a/intern/cycles/kernel/split/kernel_scene_intersect.h b/intern/cycles/kernel/split/kernel_scene_intersect.h
deleted file mode 100644
index 9ac95aafd2f..00000000000
--- a/intern/cycles/kernel/split/kernel_scene_intersect.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel takes care of the scene_intersect() function.
- *
- * This kernel changes the ray_state of RAY_REGENERATED rays to RAY_ACTIVE.
- * It processes rays of ray state RAY_ACTIVE, determines the rays that have
- * hit the background, and changes their ray state to RAY_HIT_BACKGROUND.
- */
-ccl_device void kernel_scene_intersect(KernelGlobals *kg)
-{
- /* Fetch use_queues_flag */
- char local_use_queues_flag = *kernel_split_params.use_queues_flag;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (local_use_queues_flag) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
- }
-
- /* All regenerated rays become active here */
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_REGENERATED)) {
-#ifdef __BRANCHED_PATH__
- if (kernel_split_state.branched_state[ray_index].waiting_on_shared_samples) {
- kernel_split_path_end(kg, ray_index);
- }
- else
-#endif /* __BRANCHED_PATH__ */
- {
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
- }
- }
-
- if (!IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- return;
- }
-
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
-
- Intersection isect;
- const int last_object = state->bounce > 0 ?
- intersection_get_object(kg, &kernel_split_state.isect[ray_index]) :
- OBJECT_NONE;
- bool hit = kernel_path_scene_intersect(kg, state, &ray, &isect, L, last_object);
- kernel_split_state.isect[ray_index] = isect;
-
- if (!hit) {
- /* Change the state of rays that hit the background;
- * these rays undergo special processing in the
- * buffer-update kernel.
- */
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HIT_BACKGROUND);
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
deleted file mode 100644
index c760a2b2049..00000000000
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel evaluates the ShaderData structure from the values computed
- * by the previous kernels.
- */
-ccl_device void kernel_shader_eval(KernelGlobals *kg)
-{
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- /* Sorting on the CUDA split kernel is not implemented. */
-#ifdef __KERNEL_CUDA__
- int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
-#else
- int queue_index = kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS];
-#endif
- if (ray_index >= queue_index) {
- return;
- }
- ray_index = get_ray_index(kg,
- ray_index,
-#ifdef __KERNEL_CUDA__
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
-#else
- QUEUE_SHADER_SORTED_RAYS,
-#endif
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- ccl_global char *ray_state = kernel_split_state.ray_state;
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
- ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
-
- shader_eval_surface(kg, kernel_split_sd(sd, ray_index), state, buffer, state->flag);
-#ifdef __BRANCHED_PATH__
- if (kernel_data.integrator.branched) {
- shader_merge_closures(kernel_split_sd(sd, ray_index));
- }
- else
-#endif
- {
- shader_prepare_closures(kernel_split_sd(sd, ray_index), state);
- }
- }
-}
-
-CCL_NAMESPACE_END
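
kernel_shader_eval reads from QUEUE_SHADER_SORTED_RAYS, which kernel_shader_sort (deleted further below) fills by grouping rays by their shader index, so that neighbouring SIMD threads run the same shader code path. On the GPU this is a bitonic sort in local memory; a host-side equivalent of the idea is an ordinary stable key sort:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main()
    {
      /* Shader index per queued ray, as the kernel extracts with
       * sd->shader & SHADER_MASK. */
      std::vector<int> ray_index = {0, 1, 2, 3, 4, 5};
      std::vector<int> shader_of = {3, 1, 3, 2, 1, 3};

      /* Order ray indices by shader key so coherent work ends up adjacent. */
      std::stable_sort(ray_index.begin(), ray_index.end(),
                       [&](int a, int b) { return shader_of[a] < shader_of[b]; });

      printf("sorted ray order:");
      for (int r : ray_index)
        printf(" %d(shader %d)", r, shader_of[r]);
      printf("\n");
      return 0;
    }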
diff --git a/intern/cycles/kernel/split/kernel_shader_setup.h b/intern/cycles/kernel/split/kernel_shader_setup.h
deleted file mode 100644
index 551836d1653..00000000000
--- a/intern/cycles/kernel/split/kernel_shader_setup.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* This kernel sets up the ShaderData structure from the values computed
- * by the previous kernels.
- *
- * It also identifies the rays of state RAY_TO_REGENERATE and enqueues them
- * in the QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
- */
-ccl_device void kernel_shader_setup(KernelGlobals *kg,
- ccl_local_param unsigned int *local_queue_atomics)
-{
- /* Enqueue RAY_TO_REGENERATE rays into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. */
- if (ccl_local_id(0) == 0 && ccl_local_id(1) == 0) {
- *local_queue_atomics = 0;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- int queue_index = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
- if (ray_index < queue_index) {
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 0);
- }
- else {
- ray_index = QUEUE_EMPTY_SLOT;
- }
-
- char enqueue_flag = (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE)) ? 1 :
- 0;
- enqueue_ray_index_local(ray_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- enqueue_flag,
- kernel_split_params.queue_size,
- local_queue_atomics,
- kernel_split_state.queue_data,
- kernel_split_params.queue_index);
-
- /* Continue on with shader evaluation. */
- if (IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE)) {
- Intersection isect = kernel_split_state.isect[ray_index];
- Ray ray = kernel_split_state.ray[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
-
- shader_setup_from_ray(kg, sd, &isect, &ray);
-
-#ifdef __VOLUME__
- if (sd->flag & SD_HAS_ONLY_VOLUME) {
- ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_HAS_ONLY_VOLUME);
- }
-#endif
- }
-}
-
-CCL_NAMESPACE_END
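
Note: enqueue_ray_index_local above follows the usual two-phase enqueue: each thread first claims a compact slot with a work-group-local atomic, then a single representative reserves a contiguous block in the global queue index. A serialized CPU-side sketch of that pattern, with std::atomic standing in for the device atomics (function and variable names here are illustrative only, not the kernel API):

#include <atomic>
#include <cstddef>
#include <vector>

/* Phase 1: "threads" with enqueue_flag set claim compact local slots.
 * Phase 2: one thread reserves a contiguous block in the global queue.
 * Phase 3: flagged threads write into their reserved slots. */
static void enqueue_local_then_global(const std::vector<char> &enqueue_flag,
                                      std::vector<int> &queue,
                                      std::atomic<int> &global_queue_index)
{
  std::atomic<int> local_count{0};
  std::vector<int> local_slot(enqueue_flag.size(), -1);

  for (size_t tid = 0; tid < enqueue_flag.size(); tid++) {
    if (enqueue_flag[tid]) {
      local_slot[tid] = local_count.fetch_add(1);
    }
  }

  const int base = global_queue_index.fetch_add(local_count.load());

  for (size_t tid = 0; tid < enqueue_flag.size(); tid++) {
    if (local_slot[tid] >= 0) {
      queue[base + local_slot[tid]] = (int)tid;
    }
  }
}

The single global fetch_add per work group is the point of the design: it avoids one global atomic per enqueued ray.
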
diff --git a/intern/cycles/kernel/split/kernel_shader_sort.h b/intern/cycles/kernel/split/kernel_shader_sort.h
deleted file mode 100644
index 95d33a42014..00000000000
--- a/intern/cycles/kernel/split/kernel_shader_sort.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device void kernel_shader_sort(KernelGlobals *kg, ccl_local_param ShaderSortLocals *locals)
-{
-#ifndef __KERNEL_CUDA__
- int tid = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- uint qsize = kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS];
- if (tid == 0) {
- kernel_split_params.queue_index[QUEUE_SHADER_SORTED_RAYS] = qsize;
- }
-
- uint offset = (tid / SHADER_SORT_LOCAL_SIZE) * SHADER_SORT_BLOCK_SIZE;
- if (offset >= qsize) {
- return;
- }
-
- int lid = ccl_local_id(1) * ccl_local_size(0) + ccl_local_id(0);
- uint input = QUEUE_ACTIVE_AND_REGENERATED_RAYS * (kernel_split_params.queue_size);
- uint output = QUEUE_SHADER_SORTED_RAYS * (kernel_split_params.queue_size);
- ccl_local uint *local_value = &locals->local_value[0];
- ccl_local ushort *local_index = &locals->local_index[0];
-
- /* copy to local memory */
- for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
- uint idx = offset + i + lid;
- uint add = input + idx;
- uint value = (~0);
- if (idx < qsize) {
- int ray_index = kernel_split_state.queue_data[add];
- bool valid = (ray_index != QUEUE_EMPTY_SLOT) &&
- IS_STATE(kernel_split_state.ray_state, ray_index, RAY_ACTIVE);
- if (valid) {
- value = kernel_split_sd(sd, ray_index)->shader & SHADER_MASK;
- }
- }
- local_value[i + lid] = value;
- local_index[i + lid] = i + lid;
- }
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- /* Skip sorting for the CPU split kernel. */
-# ifdef __KERNEL_OPENCL__
-
- /* bitonic sort */
- for (uint length = 1; length < SHADER_SORT_BLOCK_SIZE; length <<= 1) {
- for (uint inc = length; inc > 0; inc >>= 1) {
- for (uint ii = 0; ii < SHADER_SORT_BLOCK_SIZE; ii += SHADER_SORT_LOCAL_SIZE) {
- uint i = lid + ii;
- bool direction = ((i & (length << 1)) != 0);
- uint j = i ^ inc;
- ushort ioff = local_index[i];
- ushort joff = local_index[j];
- uint iKey = local_value[ioff];
- uint jKey = local_value[joff];
- bool smaller = (jKey < iKey) || (jKey == iKey && j < i);
- bool swap = smaller ^ (j < i) ^ direction;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- local_index[i] = (swap) ? joff : ioff;
- local_index[j] = (swap) ? ioff : joff;
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
- }
- }
- }
-# endif /* __KERNEL_OPENCL__ */
-
- /* copy to destination */
- for (uint i = 0; i < SHADER_SORT_BLOCK_SIZE; i += SHADER_SORT_LOCAL_SIZE) {
- uint idx = offset + i + lid;
- uint lidx = local_index[i + lid];
- uint outi = output + idx;
- uint ini = input + offset + lidx;
- uint value = local_value[lidx];
- if (idx < qsize) {
- kernel_split_state.queue_data[outi] = (value == (~0)) ? QUEUE_EMPTY_SLOT :
- kernel_split_state.queue_data[ini];
- }
- }
-#endif /* __KERNEL_CUDA__ */
-}
-
-CCL_NAMESPACE_END
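
Note: the compare-exchange network above is a standard bitonic sort on shader keys, so rays hitting the same shader land in adjacent queue slots and execute coherently. A freestanding serial sketch of the same network (assumes n is a power of two, as SHADER_SORT_BLOCK_SIZE is):

#include <cstdint>
#include <utility>

/* Serial bitonic sort over n keys; n must be a power of two. */
static void bitonic_sort(uint32_t *key, int n)
{
  for (int length = 1; length < n; length <<= 1) {
    for (int inc = length; inc > 0; inc >>= 1) {
      for (int i = 0; i < n; i++) {
        const int j = i ^ inc;
        if (j <= i) {
          continue; /* Each compare-exchange is done once, by its lower index. */
        }
        /* Blocks of size 2*length alternate ascending/descending. */
        const bool descending = (i & (length << 1)) != 0;
        const bool out_of_order = descending ? (key[i] < key[j]) : (key[i] > key[j]);
        if (out_of_order) {
          std::swap(key[i], key[j]);
        }
      }
    }
  }
}

The kernel version sorts an index array rather than the keys themselves, but the network structure (length and inc loops, partner j = i ^ inc) is the same.
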
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h b/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
deleted file mode 100644
index 5d772fc597b..00000000000
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_ao.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Shadow ray cast for AO. */
-ccl_device void kernel_shadow_blocked_ao(KernelGlobals *kg)
-{
- unsigned int ao_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS];
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = QUEUE_EMPTY_SLOT;
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index < ao_queue_length) {
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_SHADOW_RAY_CAST_AO_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- }
-
- if (ray_index == QUEUE_EMPTY_SLOT) {
- return;
- }
-
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- float3 throughput = kernel_split_state.throughput[ray_index];
-
-#ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched ||
- IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-#endif
- kernel_path_ao(kg, sd, emission_sd, L, state, throughput, shader_bsdf_alpha(kg, sd));
-#ifdef __BRANCHED_PATH__
- }
- else {
- kernel_branched_path_ao(kg, sd, emission_sd, L, state, throughput);
- }
-#endif
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h b/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
deleted file mode 100644
index 5e46d300bca..00000000000
--- a/intern/cycles/kernel/split/kernel_shadow_blocked_dl.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Shadow ray cast for direct visible light. */
-ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
-{
- unsigned int dl_queue_length = kernel_split_params.queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS];
- ccl_barrier(CCL_LOCAL_MEM_FENCE);
-
- int ray_index = QUEUE_EMPTY_SLOT;
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index < dl_queue_length) {
- ray_index = get_ray_index(kg,
- thread_index,
- QUEUE_SHADOW_RAY_CAST_DL_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- }
-
-#ifdef __BRANCHED_PATH__
- /* TODO(mai): move this somewhere else? */
- if (thread_index == 0) {
- /* Clear QUEUE_INACTIVE_RAYS before next kernel. */
- kernel_split_params.queue_index[QUEUE_INACTIVE_RAYS] = 0;
- }
-#endif /* __BRANCHED_PATH__ */
-
- if (ray_index == QUEUE_EMPTY_SLOT)
- return;
-
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- Ray ray = kernel_split_state.light_ray[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- float3 throughput = kernel_split_state.throughput[ray_index];
-
- BsdfEval L_light = kernel_split_state.bsdf_eval[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
- bool is_lamp = kernel_split_state.is_lamp[ray_index];
-
-#if defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)
- bool use_branched = false;
- int all = 0;
-
- if (state->flag & PATH_RAY_SHADOW_CATCHER) {
- use_branched = true;
- all = 1;
- }
-# if defined(__BRANCHED_PATH__)
- else if (kernel_data.integrator.branched) {
- use_branched = true;
-
- if (IS_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
- all = (kernel_data.integrator.sample_all_lights_indirect);
- }
- else {
- all = (kernel_data.integrator.sample_all_lights_direct);
- }
- }
-# endif /* __BRANCHED_PATH__ */
-
- if (use_branched) {
- kernel_branched_path_surface_connect_light(
- kg, sd, emission_sd, state, throughput, 1.0f, L, all);
- }
- else
-#endif /* defined(__BRANCHED_PATH__) || defined(__SHADOW_TRICKS__)*/
- {
- /* trace shadow ray */
- float3 shadow;
-
- if (!shadow_blocked(kg, sd, emission_sd, state, &ray, &shadow)) {
- /* accumulate */
- path_radiance_accum_light(kg, L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
- }
- else {
- path_radiance_accum_total_light(L, state, throughput, &L_light);
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_split_common.h b/intern/cycles/kernel/split/kernel_split_common.h
deleted file mode 100644
index 5114f2b03e5..00000000000
--- a/intern/cycles/kernel/split/kernel_split_common.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_SPLIT_H__
-#define __KERNEL_SPLIT_H__
-
-// clang-format off
-#include "kernel/kernel_math.h"
-#include "kernel/kernel_types.h"
-
-#include "kernel/split/kernel_split_data.h"
-
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_color.h"
-
-#ifdef __OSL__
-# include "kernel/osl/osl_shader.h"
-#endif
-
-#ifdef __KERNEL_OPENCL__
-# include "kernel/kernels/opencl/kernel_opencl_image.h"
-#endif
-#ifdef __KERNEL_CUDA__
-# include "kernel/kernels/cuda/kernel_cuda_image.h"
-#endif
-#ifdef __KERNEL_CPU__
-# include "kernel/kernels/cpu/kernel_cpu_image.h"
-#endif
-
-#include "util/util_atomic.h"
-
-#include "kernel/kernel_path.h"
-#ifdef __BRANCHED_PATH__
-# include "kernel/kernel_path_branched.h"
-#endif
-
-#include "kernel/kernel_queues.h"
-#include "kernel/kernel_work_stealing.h"
-
-#ifdef __BRANCHED_PATH__
-# include "kernel/split/kernel_branched.h"
-#endif
-// clang-format on
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline void kernel_split_path_end(KernelGlobals *kg, int ray_index)
-{
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
-#ifdef __BRANCHED_PATH__
-# ifdef __SUBSURFACE__
- ccl_addr_space SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
-
- if (ss_indirect->num_rays) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- }
- else
-# endif /* __SUBSURFACE__ */
- if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT_SHARED)) {
- int orig_ray = kernel_split_state.branched_state[ray_index].original_ray;
-
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- PathRadiance *orig_ray_L = &kernel_split_state.path_radiance[orig_ray];
-
- path_radiance_sum_indirect(L);
- path_radiance_accum_sample(orig_ray_L, L);
-
- atomic_fetch_and_dec_uint32(
- (ccl_global uint *)&kernel_split_state.branched_state[orig_ray].shared_sample_count);
-
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
- }
- else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_LIGHT_INDIRECT)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_LIGHT_INDIRECT_NEXT_ITER);
- }
- else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_VOLUME_INDIRECT)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_VOLUME_INDIRECT_NEXT_ITER);
- }
- else if (IS_FLAG(ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER);
- }
- else {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
- }
-#else
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
-#endif
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_SPLIT_H__ */
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
deleted file mode 100644
index decc537b39b..00000000000
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_SPLIT_DATA_H__
-#define __KERNEL_SPLIT_DATA_H__
-
-#include "kernel/split/kernel_split_data_types.h"
-
-#include "kernel/kernel_globals.h"
-
-CCL_NAMESPACE_BEGIN
-
-ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
-{
- (void)kg; /* Unused on CPU. */
-
- uint64_t size = 0;
-#define SPLIT_DATA_ENTRY(type, name, num) +align_up(num_elements *num * sizeof(type), 16)
- size = size SPLIT_DATA_ENTRIES;
-#undef SPLIT_DATA_ENTRY
-
- uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1);
-
-#ifdef __BRANCHED_PATH__
- size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-#endif
-
- size += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-
- return size;
-}
-
-ccl_device_inline void split_data_init(KernelGlobals *kg,
- ccl_global SplitData *split_data,
- size_t num_elements,
- ccl_global void *data,
- ccl_global char *ray_state)
-{
- (void)kg; /* Unused on CPU. */
-
- ccl_global char *p = (ccl_global char *)data;
-
-#define SPLIT_DATA_ENTRY(type, name, num) \
- split_data->name = (type *)p; \
- p += align_up(num_elements * num * sizeof(type), 16);
- SPLIT_DATA_ENTRIES;
-#undef SPLIT_DATA_ENTRY
-
- uint64_t closure_size = sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1);
-
-#ifdef __BRANCHED_PATH__
- split_data->_branched_state_sd = (ShaderData *)p;
- p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-#endif
-
- split_data->_sd = (ShaderData *)p;
- p += align_up(num_elements * (sizeof(ShaderData) + closure_size), 16);
-
- split_data->ray_state = ray_state;
-}
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_SPLIT_DATA_H__ */
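
Note: split_data_buffer_size and split_data_init both lean on the X-macro idiom: SPLIT_DATA_ENTRY is redefined around each expansion of SPLIT_DATA_ENTRIES, once to sum sizes and once to carve typed pointers out of a single allocation. A stripped-down illustration of the idiom (the entry list below is invented for the example):

#include <cstddef>

/* Invented two-entry list; the real SPLIT_DATA_ENTRIES is far longer. */
#define MY_ENTRIES \
  MY_ENTRY(float, throughput, 1) \
  MY_ENTRY(int, is_lamp, 1)

struct MyData {
#define MY_ENTRY(type, name, num) type *name;
  MY_ENTRIES
#undef MY_ENTRY
};

/* Expansion 1: accumulate the total size of all entries. */
static size_t my_buffer_size(size_t num_elements)
{
  size_t size = 0;
#define MY_ENTRY(type, name, num) size += num_elements * (num) * sizeof(type);
  MY_ENTRIES
#undef MY_ENTRY
  return size;
}

/* Expansion 2: carve a typed pointer for each entry out of one buffer. */
static void my_data_init(MyData *data, char *buffer, size_t num_elements)
{
  char *p = buffer;
#define MY_ENTRY(type, name, num) \
  data->name = (type *)p; \
  p += num_elements * (num) * sizeof(type);
  MY_ENTRIES
#undef MY_ENTRY
}

Because both functions expand the same list, the size computation and the pointer layout cannot drift apart.
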
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
deleted file mode 100644
index 06bdce9947d..00000000000
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright 2011-2016 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __KERNEL_SPLIT_DATA_TYPES_H__
-#define __KERNEL_SPLIT_DATA_TYPES_H__
-
-CCL_NAMESPACE_BEGIN
-
-/* Parameters used by the split kernels. A single struct avoids passing each
- * of these to every kernel separately. */
-
-typedef struct SplitParams {
- WorkTile tile;
- uint total_work_size;
-
- ccl_global unsigned int *work_pools;
-
- ccl_global int *queue_index;
- int queue_size;
- ccl_global char *use_queues_flag;
-
- /* Place for storing sd->flag. AMD GPU OpenCL compiler workaround */
- int dummy_sd_flag;
-} SplitParams;
-
-/* Global memory variables [porting]. This memory is used for
- * cooperation between different kernels: data written by one
- * kernel becomes available to other kernels via this global
- * memory.
- */
-
-/* SPLIT_DATA_ENTRY(type, name, num) */
-
-#ifdef __BRANCHED_PATH__
-
-typedef ccl_global struct SplitBranchedState {
- /* various state that must be kept and restored after an indirect loop */
- PathState path_state;
- float3 throughput;
- Ray ray;
-
- Intersection isect;
-
- char ray_state;
-
- /* indirect loop state */
- int next_closure;
- int next_sample;
-
-# ifdef __SUBSURFACE__
- int ss_next_closure;
- int ss_next_sample;
- int next_hit;
- int num_hits;
-
- uint lcg_state;
- LocalIntersection ss_isect;
-# endif /* __SUBSURFACE__ */
-
- int shared_sample_count; /* number of branched samples shared with other threads */
- int original_ray; /* index of original ray when sharing branched samples */
- bool waiting_on_shared_samples;
-} SplitBranchedState;
-
-# define SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_ENTRY(SplitBranchedState, branched_state, 1) \
- SPLIT_DATA_ENTRY(ShaderData, _branched_state_sd, 0)
-#else
-# define SPLIT_DATA_BRANCHED_ENTRIES
-#endif /* __BRANCHED_PATH__ */
-
-#ifdef __SUBSURFACE__
-# define SPLIT_DATA_SUBSURFACE_ENTRIES \
- SPLIT_DATA_ENTRY(ccl_global SubsurfaceIndirectRays, ss_rays, 1)
-#else
-# define SPLIT_DATA_SUBSURFACE_ENTRIES
-#endif /* __SUBSURFACE__ */
-
-#ifdef __VOLUME__
-# define SPLIT_DATA_VOLUME_ENTRIES SPLIT_DATA_ENTRY(ccl_global PathState, state_shadow, 1)
-#else
-# define SPLIT_DATA_VOLUME_ENTRIES
-#endif /* __VOLUME__ */
-
-#define SPLIT_DATA_ENTRIES \
- SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
- SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
- SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
- SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
- SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
- SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
- SPLIT_DATA_ENTRY( \
- ccl_global int, queue_data, (NUM_QUEUES * 2)) /* TODO(mai): this is too large? */ \
- SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
- SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
- SPLIT_DATA_SUBSURFACE_ENTRIES \
- SPLIT_DATA_VOLUME_ENTRIES \
- SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_ENTRY(ShaderData, _sd, 0)
-
-/* Entries to be copied to inactive rays when sharing branched samples
- * (TODO: which are actually needed?) */
-#define SPLIT_DATA_ENTRIES_BRANCHED_SHARED \
- SPLIT_DATA_ENTRY(ccl_global float3, throughput, 1) \
- SPLIT_DATA_ENTRY(PathRadiance, path_radiance, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, ray, 1) \
- SPLIT_DATA_ENTRY(ccl_global PathState, path_state, 1) \
- SPLIT_DATA_ENTRY(ccl_global Intersection, isect, 1) \
- SPLIT_DATA_ENTRY(ccl_global BsdfEval, bsdf_eval, 1) \
- SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
- SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
- SPLIT_DATA_ENTRY(ShaderDataTinyStorage, sd_DL_shadow, 1) \
- SPLIT_DATA_SUBSURFACE_ENTRIES \
- SPLIT_DATA_VOLUME_ENTRIES \
- SPLIT_DATA_BRANCHED_ENTRIES \
- SPLIT_DATA_ENTRY(ShaderData, _sd, 0)
-
-/* struct that holds pointers to data in the shared state buffer */
-typedef struct SplitData {
-#define SPLIT_DATA_ENTRY(type, name, num) type *name;
- SPLIT_DATA_ENTRIES
-#undef SPLIT_DATA_ENTRY
-
- /* This is kept in a separate buffer from the rest of the split state data
- * (so it can be read back from the host easily), but it is used in the same
- * way as the other data, so it is kept in this struct as well. */
- ccl_global char *ray_state;
-} SplitData;
-
-#ifndef __KERNEL_CUDA__
-# define kernel_split_state (kg->split_data)
-# define kernel_split_params (kg->split_param_data)
-#else
-__device__ SplitData __split_data;
-# define kernel_split_state (__split_data)
-__device__ SplitParams __split_param_data;
-# define kernel_split_params (__split_param_data)
-#endif /* __KERNEL_CUDA__ */
-
-#define kernel_split_sd(sd, ray_index) \
- ((ShaderData *)(((ccl_global char *)kernel_split_state._##sd) + \
- (sizeof(ShaderData) + \
- sizeof(ShaderClosure) * (kernel_data.integrator.max_closures - 1)) * \
- (ray_index)))
-
-/* Local storage for queue_enqueue kernel. */
-typedef struct QueueEnqueueLocals {
- uint queue_atomics[2];
-} QueueEnqueueLocals;
-
-/* Local storage for holdout_emission_blurring_pathtermination_ao kernel. */
-typedef struct BackgroundAOLocals {
- uint queue_atomics_bg;
- uint queue_atomics_ao;
-} BackgroundAOLocals;
-
-typedef struct ShaderSortLocals {
- uint local_value[SHADER_SORT_BLOCK_SIZE];
- ushort local_index[SHADER_SORT_BLOCK_SIZE];
-} ShaderSortLocals;
-
-CCL_NAMESPACE_END
-
-#endif /* __KERNEL_SPLIT_DATA_TYPES_H__ */
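
Note: the kernel_split_sd macro above has to hand-roll its indexing because each ShaderData is immediately followed in memory by (max_closures - 1) extra ShaderClosure slots, so the per-ray stride is larger than sizeof(ShaderData). A sketch of the same arithmetic with stand-in types (names are illustrative):

#include <cstddef>

struct Closure { float data[16]; };
struct Shader {
  int flag;
  Closure closure[1]; /* Trailing slots are allocated past the struct. */
};

/* Per-ray stride: one Shader plus (max_closures - 1) extra Closure slots. */
static size_t shader_stride(int max_closures)
{
  return sizeof(Shader) + sizeof(Closure) * (max_closures - 1);
}

static Shader *shader_at(char *base, int max_closures, int ray_index)
{
  return (Shader *)(base + shader_stride(max_closures) * ray_index);
}
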
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
deleted file mode 100644
index ba06ae3bc53..00000000000
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright 2011-2017 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-#if defined(__BRANCHED_PATH__) && defined(__SUBSURFACE__)
-
-ccl_device_inline void kernel_split_branched_path_subsurface_indirect_light_init(KernelGlobals *kg,
- int ray_index)
-{
- kernel_split_branched_path_indirect_loop_init(kg, ray_index);
-
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- branched_state->ss_next_closure = 0;
- branched_state->ss_next_sample = 0;
-
- branched_state->num_hits = 0;
- branched_state->next_hit = 0;
-
- ADD_RAY_FLAG(kernel_split_state.ray_state, ray_index, RAY_BRANCHED_SUBSURFACE_INDIRECT);
-}
-
-ccl_device_noinline bool kernel_split_branched_path_subsurface_indirect_light_iter(
- KernelGlobals *kg, int ray_index)
-{
- SplitBranchedState *branched_state = &kernel_split_state.branched_state[ray_index];
-
- ShaderData *sd = kernel_split_sd(branched_state_sd, ray_index);
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- for (int i = branched_state->ss_next_closure; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if (!CLOSURE_IS_BSSRDF(sc->type))
- continue;
-
- /* Closure memory will be overwritten, so read required variables now. */
- Bssrdf *bssrdf = (Bssrdf *)sc;
- ClosureType bssrdf_type = sc->type;
- float bssrdf_roughness = bssrdf->roughness;
-
- /* set up random number generator */
- if (branched_state->ss_next_sample == 0 && branched_state->next_hit == 0 &&
- branched_state->next_closure == 0 && branched_state->next_sample == 0) {
- branched_state->lcg_state = lcg_state_init_addrspace(&branched_state->path_state,
- 0x68bc21eb);
- }
- int num_samples = kernel_data.integrator.subsurface_samples * 3;
- float num_samples_inv = 1.0f / num_samples;
- uint bssrdf_rng_hash = cmj_hash(branched_state->path_state.rng_hash, i);
-
- /* Do the subsurface scatter step with a copy of the shader data; this
- * will replace the BSSRDF with a diffuse BSDF closure. */
- for (int j = branched_state->ss_next_sample; j < num_samples; j++) {
- ccl_global PathState *hit_state = &kernel_split_state.path_state[ray_index];
- *hit_state = branched_state->path_state;
- hit_state->rng_hash = bssrdf_rng_hash;
- path_state_branch(hit_state, j, num_samples);
-
- ccl_global LocalIntersection *ss_isect = &branched_state->ss_isect;
- float bssrdf_u, bssrdf_v;
- path_branched_rng_2D(
- kg, bssrdf_rng_hash, hit_state, j, num_samples, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
-
- /* Intersection is expensive, so avoid repeating it for the same input. */
- if (branched_state->next_hit == 0 && branched_state->next_closure == 0 &&
- branched_state->next_sample == 0) {
- uint lcg_state = branched_state->lcg_state;
- LocalIntersection ss_isect_private;
-
- branched_state->num_hits = subsurface_scatter_multi_intersect(
- kg, &ss_isect_private, sd, hit_state, sc, &lcg_state, bssrdf_u, bssrdf_v, true);
-
- branched_state->lcg_state = lcg_state;
- *ss_isect = ss_isect_private;
- }
-
- hit_state->rng_offset += PRNG_BOUNCE_NUM;
-
-# ifdef __VOLUME__
- Ray volume_ray = branched_state->ray;
- bool need_update_volume_stack = kernel_data.integrator.use_volumes &&
- sd->object_flag & SD_OBJECT_INTERSECTS_VOLUME;
-# endif /* __VOLUME__ */
-
- /* compute lighting with the BSDF closure */
- for (int hit = branched_state->next_hit; hit < branched_state->num_hits; hit++) {
- ShaderData *bssrdf_sd = kernel_split_sd(sd, ray_index);
- *bssrdf_sd = *sd; /* NOTE: the copy happens on each iteration of the inner
- * loop; this matters because the indirect path will
- * write into bssrdf_sd. */
-
- LocalIntersection ss_isect_private = *ss_isect;
- subsurface_scatter_multi_setup(
- kg, &ss_isect_private, hit, bssrdf_sd, hit_state, bssrdf_type, bssrdf_roughness);
- *ss_isect = ss_isect_private;
-
-# ifdef __VOLUME__
- if (need_update_volume_stack) {
- /* Setup ray from previous surface point to the new one. */
- float3 P = ray_offset(bssrdf_sd->P, -bssrdf_sd->Ng);
- volume_ray.D = normalize_len(P - volume_ray.P, &volume_ray.t);
-
- for (int k = 0; k < VOLUME_STACK_SIZE; k++) {
- hit_state->volume_stack[k] = branched_state->path_state.volume_stack[k];
- }
-
- kernel_volume_stack_update_for_subsurface(
- kg, emission_sd, &volume_ray, hit_state->volume_stack);
- }
-# endif /* __VOLUME__ */
-
-# ifdef __EMISSION__
- if (branched_state->next_closure == 0 && branched_state->next_sample == 0) {
- /* direct light */
- if (kernel_data.integrator.use_direct_light) {
- int all = (kernel_data.integrator.sample_all_lights_direct) ||
- (hit_state->flag & PATH_RAY_SHADOW_CATCHER);
- kernel_branched_path_surface_connect_light(kg,
- bssrdf_sd,
- emission_sd,
- hit_state,
- branched_state->throughput,
- num_samples_inv,
- L,
- all);
- }
- }
-# endif /* __EMISSION__ */
-
- /* indirect light */
- if (kernel_split_branched_path_surface_indirect_light_iter(
- kg, ray_index, num_samples_inv, bssrdf_sd, false, false)) {
- branched_state->ss_next_closure = i;
- branched_state->ss_next_sample = j;
- branched_state->next_hit = hit;
-
- return true;
- }
-
- branched_state->next_closure = 0;
- }
-
- branched_state->next_hit = 0;
- }
-
- branched_state->ss_next_sample = 0;
- }
-
- branched_state->ss_next_closure = sd->num_closure;
-
- branched_state->waiting_on_shared_samples = (branched_state->shared_sample_count > 0);
- if (branched_state->waiting_on_shared_samples) {
- return true;
- }
-
- kernel_split_branched_path_indirect_loop_end(kg, ray_index);
-
- return false;
-}
-
-#endif /* __BRANCHED_PATH__ && __SUBSURFACE__ */
-
-ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
-{
- int thread_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- if (thread_index == 0) {
- /* We will empty both queues in this kernel. */
- kernel_split_params.queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
- kernel_split_params.queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
- }
-
- int ray_index = ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0);
- ray_index = get_ray_index(kg,
- ray_index,
- QUEUE_ACTIVE_AND_REGENERATED_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
- get_ray_index(kg,
- thread_index,
- QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
-#ifdef __SUBSURFACE__
- ccl_global char *ray_state = kernel_split_state.ray_state;
-
- if (IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
- PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
- ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
- ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
- ccl_global SubsurfaceIndirectRays *ss_indirect = &kernel_split_state.ss_rays[ray_index];
- ShaderData *sd = kernel_split_sd(sd, ray_index);
- ShaderData *emission_sd = AS_SHADER_DATA(&kernel_split_state.sd_DL_shadow[ray_index]);
-
- if (sd->flag & SD_BSSRDF) {
-
-# ifdef __BRANCHED_PATH__
- if (!kernel_data.integrator.branched ||
- IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
-# endif
- if (kernel_path_subsurface_scatter(
- kg, sd, emission_sd, L, state, ray, throughput, ss_indirect)) {
- kernel_split_path_end(kg, ray_index);
- }
-# ifdef __BRANCHED_PATH__
- }
- else {
- kernel_split_branched_path_subsurface_indirect_light_init(kg, ray_index);
-
- if (kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif
- }
- }
-
-# ifdef __BRANCHED_PATH__
- if (ccl_global_id(0) == 0 && ccl_global_id(1) == 0) {
- kernel_split_params.queue_index[QUEUE_SUBSURFACE_INDIRECT_ITER] = 0;
- }
-
- /* iter loop */
- ray_index = get_ray_index(kg,
- ccl_global_id(1) * ccl_global_size(0) + ccl_global_id(0),
- QUEUE_SUBSURFACE_INDIRECT_ITER,
- kernel_split_state.queue_data,
- kernel_split_params.queue_size,
- 1);
-
- if (IS_STATE(ray_state, ray_index, RAY_SUBSURFACE_INDIRECT_NEXT_ITER)) {
- /* for render passes, sum and reset indirect light pass variables
- * for the next samples */
- path_radiance_sum_indirect(&kernel_split_state.path_radiance[ray_index]);
- path_radiance_reset_indirect(&kernel_split_state.path_radiance[ray_index]);
-
- if (kernel_split_branched_path_subsurface_indirect_light_iter(kg, ray_index)) {
- ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
- }
- }
-# endif /* __BRANCHED_PATH__ */
-
-#endif /* __SUBSURFACE__ */
-}
-
-CCL_NAMESPACE_END
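
Note: the ss_next_closure / ss_next_sample / next_hit counters in the branched state implement resumable nested loops: the kernel can suspend mid-iteration (so other split kernels can run) and later re-enter exactly where it stopped. The shape of that pattern, reduced to two loop levels (names are illustrative):

struct LoopState {
  int next_i;
  int next_j;
};

/* Returns true when suspended mid-way, false when the loops completed.
 * On resume, the suspended (i, j) step is re-entered. */
static bool run_resumable(LoopState *state,
                          int ni,
                          int nj,
                          bool (*step_should_suspend)(int i, int j))
{
  for (int i = state->next_i; i < ni; i++) {
    for (int j = state->next_j; j < nj; j++) {
      if (step_should_suspend(i, j)) {
        state->next_i = i; /* Save progress and yield. */
        state->next_j = j;
        return true;
      }
    }
    state->next_j = 0; /* Inner loop restarts from 0 on the next outer step. */
  }
  return false;
}
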
diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h
index 000da1fa615..4aee1ef11b3 100644
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -48,16 +48,18 @@ ccl_device_inline float3 stack_load_float3(float *stack, uint a)
{
kernel_assert(a + 2 < SVM_STACK_SIZE);
- return make_float3(stack[a + 0], stack[a + 1], stack[a + 2]);
+ float *stack_a = stack + a;
+ return make_float3(stack_a[0], stack_a[1], stack_a[2]);
}
ccl_device_inline void stack_store_float3(float *stack, uint a, float3 f)
{
kernel_assert(a + 2 < SVM_STACK_SIZE);
- stack[a + 0] = f.x;
- stack[a + 1] = f.y;
- stack[a + 2] = f.z;
+ float *stack_a = stack + a;
+ stack_a[0] = f.x;
+ stack_a[1] = f.y;
+ stack_a[2] = f.z;
}
ccl_device_inline float stack_load_float(float *stack, uint a)
@@ -105,14 +107,14 @@ ccl_device_inline bool stack_valid(uint a)
/* Reading Nodes */
-ccl_device_inline uint4 read_node(KernelGlobals *kg, int *offset)
+ccl_device_inline uint4 read_node(const KernelGlobals *kg, int *offset)
{
uint4 node = kernel_tex_fetch(__svm_nodes, *offset);
(*offset)++;
return node;
}
-ccl_device_inline float4 read_node_float(KernelGlobals *kg, int *offset)
+ccl_device_inline float4 read_node_float(const KernelGlobals *kg, int *offset)
{
uint4 node = kernel_tex_fetch(__svm_nodes, *offset);
float4 f = make_float4(__uint_as_float(node.x),
@@ -123,7 +125,7 @@ ccl_device_inline float4 read_node_float(KernelGlobals *kg, int *offset)
return f;
}
-ccl_device_inline float4 fetch_node_float(KernelGlobals *kg, int offset)
+ccl_device_inline float4 fetch_node_float(const KernelGlobals *kg, int offset)
{
uint4 node = kernel_tex_fetch(__svm_nodes, offset);
return make_float4(__uint_as_float(node.x),
@@ -217,26 +219,11 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN
/* Main Interpreter Loop */
-#if defined(__KERNEL_OPTIX__) && defined(__SHADER_RAYTRACE__)
-ccl_device_inline void svm_eval_nodes(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
- ShaderType type,
- int path_flag)
-{
- optixDirectCall<void>(0, kg, sd, state, buffer, type, path_flag);
-}
-extern "C" __device__ void __direct_callable__svm_eval_nodes(
-#else
-ccl_device_noinline void svm_eval_nodes(
-#endif
- KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- ccl_global float *buffer,
- ShaderType type,
- int path_flag)
+template<uint node_feature_mask, ShaderType type>
+ccl_device void svm_eval_nodes(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ ccl_global float *render_buffer,
+ int path_flag)
{
float stack[SVM_STACK_SIZE];
int offset = sd->shader & SHADER_MASK;
@@ -247,7 +234,6 @@ ccl_device_noinline void svm_eval_nodes(
switch (node.x) {
case NODE_END:
return;
-#if NODES_GROUP(NODE_GROUP_LEVEL_0)
case NODE_SHADER_JUMP: {
if (type == SHADER_TYPE_SURFACE)
offset = node.y;
@@ -260,13 +246,18 @@ ccl_device_noinline void svm_eval_nodes(
break;
}
case NODE_CLOSURE_BSDF:
- svm_node_closure_bsdf(kg, sd, stack, node, type, path_flag, &offset);
+ offset = svm_node_closure_bsdf<node_feature_mask, type>(
+ kg, sd, stack, node, path_flag, offset);
break;
case NODE_CLOSURE_EMISSION:
- svm_node_closure_emission(sd, stack, node);
+ if (KERNEL_NODES_FEATURE(EMISSION)) {
+ svm_node_closure_emission(sd, stack, node);
+ }
break;
case NODE_CLOSURE_BACKGROUND:
- svm_node_closure_background(sd, stack, node);
+ if (KERNEL_NODES_FEATURE(EMISSION)) {
+ svm_node_closure_background(sd, stack, node);
+ }
break;
case NODE_CLOSURE_SET_WEIGHT:
svm_node_closure_set_weight(sd, node.y, node.z, node.w);
@@ -275,7 +266,9 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_closure_weight(sd, stack, node.y);
break;
case NODE_EMISSION_WEIGHT:
- svm_node_emission_weight(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(EMISSION)) {
+ svm_node_emission_weight(kg, sd, stack, node);
+ }
break;
case NODE_MIX_CLOSURE:
svm_node_mix_closure(sd, stack, node);
@@ -295,86 +288,108 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_convert(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_TEX_COORD:
- svm_node_tex_coord(kg, sd, path_flag, stack, node, &offset);
+ offset = svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
break;
case NODE_VALUE_F:
svm_node_value_f(kg, sd, stack, node.y, node.z);
break;
case NODE_VALUE_V:
- svm_node_value_v(kg, sd, stack, node.y, &offset);
+ offset = svm_node_value_v(kg, sd, stack, node.y, offset);
break;
case NODE_ATTR:
- svm_node_attr(kg, sd, stack, node);
+ svm_node_attr<node_feature_mask>(kg, sd, stack, node);
break;
case NODE_VERTEX_COLOR:
svm_node_vertex_color(kg, sd, stack, node.y, node.z, node.w);
break;
-# if NODES_FEATURE(NODE_FEATURE_BUMP)
case NODE_GEOMETRY_BUMP_DX:
- svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
+ }
break;
case NODE_GEOMETRY_BUMP_DY:
- svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
+ }
break;
case NODE_SET_DISPLACEMENT:
- svm_node_set_displacement(kg, sd, stack, node.y);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_set_displacement(kg, sd, stack, node.y);
+ }
break;
case NODE_DISPLACEMENT:
- svm_node_displacement(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_displacement(kg, sd, stack, node);
+ }
break;
case NODE_VECTOR_DISPLACEMENT:
- svm_node_vector_displacement(kg, sd, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ offset = svm_node_vector_displacement(kg, sd, stack, node, offset);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */
case NODE_TEX_IMAGE:
- svm_node_tex_image(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_image(kg, sd, stack, node, offset);
break;
case NODE_TEX_IMAGE_BOX:
svm_node_tex_image_box(kg, sd, stack, node);
break;
case NODE_TEX_NOISE:
- svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, offset);
break;
-# if NODES_FEATURE(NODE_FEATURE_BUMP)
case NODE_SET_BUMP:
- svm_node_set_bump(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_set_bump(kg, sd, stack, node);
+ }
break;
case NODE_ATTR_BUMP_DX:
- svm_node_attr_bump_dx(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_attr_bump_dx(kg, sd, stack, node);
+ }
break;
case NODE_ATTR_BUMP_DY:
- svm_node_attr_bump_dy(kg, sd, stack, node);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_attr_bump_dy(kg, sd, stack, node);
+ }
break;
case NODE_VERTEX_COLOR_BUMP_DX:
- svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w);
+ }
break;
case NODE_VERTEX_COLOR_BUMP_DY:
- svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w);
+ }
break;
case NODE_TEX_COORD_BUMP_DX:
- svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ offset = svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, offset);
+ }
break;
case NODE_TEX_COORD_BUMP_DY:
- svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ offset = svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, offset);
+ }
break;
case NODE_CLOSURE_SET_NORMAL:
- svm_node_set_normal(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(BUMP)) {
+ svm_node_set_normal(kg, sd, stack, node.y, node.z);
+ }
break;
-# if NODES_FEATURE(NODE_FEATURE_BUMP_STATE)
case NODE_ENTER_BUMP_EVAL:
- svm_node_enter_bump_eval(kg, sd, stack, node.y);
+ if (KERNEL_NODES_FEATURE(BUMP_STATE)) {
+ svm_node_enter_bump_eval(kg, sd, stack, node.y);
+ }
break;
case NODE_LEAVE_BUMP_EVAL:
- svm_node_leave_bump_eval(kg, sd, stack, node.y);
+ if (KERNEL_NODES_FEATURE(BUMP_STATE)) {
+ svm_node_leave_bump_eval(kg, sd, stack, node.y);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP_STATE) */
-# endif /* NODES_FEATURE(NODE_FEATURE_BUMP) */
case NODE_HSV:
- svm_node_hsv(kg, sd, stack, node, &offset);
+ svm_node_hsv(kg, sd, stack, node);
break;
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_0) */
-#if NODES_GROUP(NODE_GROUP_LEVEL_1)
case NODE_CLOSURE_HOLDOUT:
svm_node_closure_holdout(sd, stack, node);
break;
@@ -384,22 +399,24 @@ ccl_device_noinline void svm_eval_nodes(
case NODE_LAYER_WEIGHT:
svm_node_layer_weight(sd, stack, node);
break;
-# if NODES_FEATURE(NODE_FEATURE_VOLUME)
case NODE_CLOSURE_VOLUME:
- svm_node_closure_volume(kg, sd, stack, node, type);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ svm_node_closure_volume<type>(kg, sd, stack, node);
+ }
break;
case NODE_PRINCIPLED_VOLUME:
- svm_node_principled_volume(kg, sd, stack, node, type, path_flag, &offset);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ offset = svm_node_principled_volume<type>(kg, sd, stack, node, path_flag, offset);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
case NODE_MATH:
- svm_node_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+ svm_node_math(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_VECTOR_MATH:
- svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_RGB_RAMP:
- svm_node_rgb_ramp(kg, sd, stack, node, &offset);
+ offset = svm_node_rgb_ramp(kg, sd, stack, node, offset);
break;
case NODE_GAMMA:
svm_node_gamma(sd, stack, node.y, node.z, node.w);
@@ -408,7 +425,7 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_brightness(sd, stack, node.y, node.z, node.w);
break;
case NODE_LIGHT_PATH:
- svm_node_light_path(sd, state, stack, node.y, node.z, path_flag);
+ svm_node_light_path(INTEGRATOR_STATE_PASS, sd, stack, node.y, node.z, path_flag);
break;
case NODE_OBJECT_INFO:
svm_node_object_info(kg, sd, stack, node.y, node.z);
@@ -416,22 +433,22 @@ ccl_device_noinline void svm_eval_nodes(
case NODE_PARTICLE_INFO:
svm_node_particle_info(kg, sd, stack, node.y, node.z);
break;
-# if defined(__HAIR__) && NODES_FEATURE(NODE_FEATURE_HAIR)
+#if defined(__HAIR__)
case NODE_HAIR_INFO:
- svm_node_hair_info(kg, sd, stack, node.y, node.z);
+ if (KERNEL_NODES_FEATURE(HAIR)) {
+ svm_node_hair_info(kg, sd, stack, node.y, node.z);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_HAIR) */
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_1) */
+#endif
-#if NODES_GROUP(NODE_GROUP_LEVEL_2)
case NODE_TEXTURE_MAPPING:
- svm_node_texture_mapping(kg, sd, stack, node.y, node.z, &offset);
+ offset = svm_node_texture_mapping(kg, sd, stack, node.y, node.z, offset);
break;
case NODE_MAPPING:
- svm_node_mapping(kg, sd, stack, node.y, node.z, node.w, &offset);
+ svm_node_mapping(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_MIN_MAX:
- svm_node_min_max(kg, sd, stack, node.y, node.z, &offset);
+ offset = svm_node_min_max(kg, sd, stack, node.y, node.z, offset);
break;
case NODE_CAMERA:
svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
@@ -440,47 +457,46 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_tex_environment(kg, sd, stack, node);
break;
case NODE_TEX_SKY:
- svm_node_tex_sky(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_sky(kg, sd, stack, node, offset);
break;
case NODE_TEX_GRADIENT:
svm_node_tex_gradient(sd, stack, node);
break;
case NODE_TEX_VORONOI:
- svm_node_tex_voronoi(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_tex_voronoi<node_feature_mask>(
+ kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_TEX_MUSGRAVE:
- svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_TEX_WAVE:
- svm_node_tex_wave(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_wave(kg, sd, stack, node, offset);
break;
case NODE_TEX_MAGIC:
- svm_node_tex_magic(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_magic(kg, sd, stack, node, offset);
break;
case NODE_TEX_CHECKER:
svm_node_tex_checker(kg, sd, stack, node);
break;
case NODE_TEX_BRICK:
- svm_node_tex_brick(kg, sd, stack, node, &offset);
+ offset = svm_node_tex_brick(kg, sd, stack, node, offset);
break;
case NODE_TEX_WHITE_NOISE:
- svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w, &offset);
+ svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w);
break;
case NODE_NORMAL:
- svm_node_normal(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_normal(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_LIGHT_FALLOFF:
svm_node_light_falloff(sd, stack, node);
break;
case NODE_IES:
- svm_node_ies(kg, sd, stack, node, &offset);
+ svm_node_ies(kg, sd, stack, node);
break;
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_2) */
-#if NODES_GROUP(NODE_GROUP_LEVEL_3)
case NODE_RGB_CURVES:
case NODE_VECTOR_CURVES:
- svm_node_curves(kg, sd, stack, node, &offset);
+ offset = svm_node_curves(kg, sd, stack, node, offset);
break;
case NODE_TANGENT:
svm_node_tangent(kg, sd, stack, node);
@@ -492,7 +508,7 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_invert(sd, stack, node.y, node.z, node.w);
break;
case NODE_MIX:
- svm_node_mix(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_mix(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_SEPARATE_VECTOR:
svm_node_separate_vector(sd, stack, node.y, node.z, node.w);
@@ -501,10 +517,10 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_combine_vector(sd, stack, node.y, node.z, node.w);
break;
case NODE_SEPARATE_HSV:
- svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_COMBINE_HSV:
- svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_VECTOR_ROTATE:
svm_node_vector_rotate(sd, stack, node.y, node.z, node.w);
@@ -522,39 +538,36 @@ ccl_device_noinline void svm_eval_nodes(
svm_node_blackbody(kg, sd, stack, node.y, node.z);
break;
case NODE_MAP_RANGE:
- svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, offset);
break;
case NODE_CLAMP:
- svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, &offset);
+ offset = svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, offset);
break;
-# ifdef __SHADER_RAYTRACE__
+#ifdef __SHADER_RAYTRACE__
case NODE_BEVEL:
- svm_node_bevel(kg, sd, state, stack, node);
+ svm_node_bevel<node_feature_mask>(INTEGRATOR_STATE_PASS, sd, stack, node);
break;
case NODE_AMBIENT_OCCLUSION:
- svm_node_ao(kg, sd, state, stack, node);
+ svm_node_ao<node_feature_mask>(INTEGRATOR_STATE_PASS, sd, stack, node);
break;
-# endif /* __SHADER_RAYTRACE__ */
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_3) */
+#endif
-#if NODES_GROUP(NODE_GROUP_LEVEL_4)
-# if NODES_FEATURE(NODE_FEATURE_VOLUME)
case NODE_TEX_VOXEL:
- svm_node_tex_voxel(kg, sd, stack, node, &offset);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ offset = svm_node_tex_voxel(kg, sd, stack, node, offset);
+ }
break;
-# endif /* NODES_FEATURE(NODE_FEATURE_VOLUME) */
case NODE_AOV_START:
- if (!svm_node_aov_check(state, buffer)) {
+ if (!svm_node_aov_check(path_flag, render_buffer)) {
return;
}
break;
case NODE_AOV_COLOR:
- svm_node_aov_color(kg, sd, stack, node, buffer);
+ svm_node_aov_color(INTEGRATOR_STATE_PASS, sd, stack, node, render_buffer);
break;
case NODE_AOV_VALUE:
- svm_node_aov_value(kg, sd, stack, node, buffer);
+ svm_node_aov_value(INTEGRATOR_STATE_PASS, sd, stack, node, render_buffer);
break;
-#endif /* NODES_GROUP(NODE_GROUP_LEVEL_4) */
default:
kernel_assert(!"Unknown node type was passed to the SVM machine");
return;
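
Note: the restructured loop above replaces the old NODES_GROUP/NODES_FEATURE preprocessor gating with a node_feature_mask template parameter, so unused cases are dropped by dead-code elimination per kernel instantiation. A minimal sketch of that feature-masked stack-machine shape (node set and feature bits invented for the example, not Cycles' actual ones):

#include <cstdint>

enum NodeType { NODE_END = 0, NODE_VALUE = 1, NODE_EMISSION = 2 };
enum FeatureFlag : uint32_t { FEATURE_EMISSION = 1u << 0 };

struct Node {
  uint32_t x, y, z, w;
};

/* Cases guarded by the compile-time mask are removed entirely from
 * instantiations that do not request the feature. */
template<uint32_t feature_mask> static float eval_nodes(const Node *nodes, int offset)
{
  float result = 0.0f;
  for (;;) {
    const Node node = nodes[offset++];
    switch (node.x) {
      case NODE_END:
        return result;
      case NODE_VALUE:
        result += (float)node.y;
        break;
      case NODE_EMISSION:
        if ((feature_mask & FEATURE_EMISSION) != 0) {
          result += 10.0f * (float)node.y;
        }
        break;
    }
  }
}

Returning the updated offset from multi-word nodes (instead of passing int *offset) follows the same diff: it keeps the instruction pointer in a register rather than forcing it through memory.
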
diff --git a/intern/cycles/kernel/svm/svm_ao.h b/intern/cycles/kernel/svm/svm_ao.h
index 4cb986b897a..34ac2cb8fbf 100644
--- a/intern/cycles/kernel/svm/svm_ao.h
+++ b/intern/cycles/kernel/svm/svm_ao.h
@@ -14,20 +14,25 @@
* limitations under the License.
*/
+#include "kernel/bvh/bvh.h"
+
CCL_NAMESPACE_BEGIN
#ifdef __SHADER_RAYTRACE__
-ccl_device_noinline float svm_ao(KernelGlobals *kg,
- ShaderData *sd,
- float3 N,
- ccl_addr_space PathState *state,
- float max_dist,
- int num_samples,
- int flags)
+# ifdef __KERNEL_OPTIX__
+extern "C" __device__ float __direct_callable__svm_node_ao(INTEGRATOR_STATE_CONST_ARGS,
+# else
+ccl_device float svm_ao(INTEGRATOR_STATE_CONST_ARGS,
+# endif
+ ShaderData *sd,
+ float3 N,
+ float max_dist,
+ int num_samples,
+ int flags)
{
if (flags & NODE_AO_GLOBAL_RADIUS) {
- max_dist = kernel_data.background.ao_distance;
+ max_dist = kernel_data.integrator.ao_bounces_distance;
}
/* Early out if no sampling needed. */
@@ -47,11 +52,14 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg,
float3 T, B;
make_orthonormals(N, &T, &B);
+ /* TODO: support ray-tracing in shadow shader evaluation? */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
int unoccluded = 0;
for (int sample = 0; sample < num_samples; sample++) {
float disk_u, disk_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
+ path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
float2 d = concentric_sample_disk(disk_u, disk_v);
float3 D = make_float3(d.x, d.y, safe_sqrtf(1.0f - dot(d, d)));
@@ -62,8 +70,8 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg,
ray.D = D.x * T + D.y * B + D.z * N;
ray.t = max_dist;
ray.time = sd->time;
- ray.dP = sd->dP;
- ray.dD = differential3_zero();
+ ray.dP = differential_zero_compact();
+ ray.dD = differential_zero_compact();
if (flags & NODE_AO_ONLY_LOCAL) {
if (!scene_intersect_local(kg, &ray, NULL, sd->object, NULL, 0)) {
@@ -81,8 +89,14 @@ ccl_device_noinline float svm_ao(KernelGlobals *kg,
return ((float)unoccluded) / num_samples;
}
-ccl_device void svm_node_ao(
- KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node)
+template<uint node_feature_mask>
+# if defined(__KERNEL_OPTIX__)
+ccl_device_inline
+# else
+ccl_device_noinline
+# endif
+ void
+ svm_node_ao(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd, float *stack, uint4 node)
{
uint flags, dist_offset, normal_offset, out_ao_offset;
svm_unpack_node_uchar4(node.y, &flags, &dist_offset, &normal_offset, &out_ao_offset);
@@ -92,7 +106,16 @@ ccl_device void svm_node_ao(
float dist = stack_load_float_default(stack, dist_offset, node.w);
float3 normal = stack_valid(normal_offset) ? stack_load_float3(stack, normal_offset) : sd->N;
- float ao = svm_ao(kg, sd, normal, state, dist, samples, flags);
+
+ float ao = 1.0f;
+
+ if (KERNEL_NODES_FEATURE(RAYTRACE)) {
+# ifdef __KERNEL_OPTIX__
+ ao = optixDirectCall<float>(0, INTEGRATOR_STATE_PASS, sd, normal, dist, samples, flags);
+# else
+ ao = svm_ao(INTEGRATOR_STATE_PASS, sd, normal, dist, samples, flags);
+# endif
+ }
if (stack_valid(out_ao_offset)) {
stack_store_float(stack, out_ao_offset, ao);
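
Note: the AO estimate above is plain Monte Carlo: concentric disk samples are lifted onto the hemisphere around N (which makes the directions cosine-weighted), and the result is the unoccluded fraction of rays. For reference, a standalone version of the disk mapping and hemisphere lift; concentric_sample_disk here is written out as the usual Shirley-Chiu construction, under the assumption that the kernel's helper does the same:

#include <cmath>

/* Shirley-Chiu concentric mapping from [0,1)^2 to the unit disk. */
static void concentric_sample_disk(float u1, float u2, float *dx, float *dy)
{
  const float a = 2.0f * u1 - 1.0f;
  const float b = 2.0f * u2 - 1.0f;
  if (a == 0.0f && b == 0.0f) {
    *dx = 0.0f;
    *dy = 0.0f;
    return;
  }
  float r, phi;
  if (a * a > b * b) {
    r = a;
    phi = 0.78539816f * (b / a); /* (pi/4) * (b/a) */
  }
  else {
    r = b;
    phi = 1.57079633f - 0.78539816f * (a / b); /* (pi/2) - (pi/4) * (a/b) */
  }
  *dx = r * std::cos(phi);
  *dy = r * std::sin(phi);
}

/* Lift the disk point to the hemisphere (z up); projecting uniform disk
 * samples upward yields cosine-weighted directions. */
static void disk_to_hemisphere(float dx, float dy, float dir[3])
{
  dir[0] = dx;
  dir[1] = dy;
  dir[2] = std::sqrt(std::fmax(0.0f, 1.0f - dx * dx - dy * dy));
}
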
diff --git a/intern/cycles/kernel/svm/svm_aov.h b/intern/cycles/kernel/svm/svm_aov.h
index 899e466d099..26dec9717b3 100644
--- a/intern/cycles/kernel/svm/svm_aov.h
+++ b/intern/cycles/kernel/svm/svm_aov.h
@@ -14,36 +14,50 @@
* limitations under the License.
*/
+#include "kernel/kernel_write_passes.h"
+
CCL_NAMESPACE_BEGIN
-ccl_device_inline bool svm_node_aov_check(ccl_addr_space PathState *state,
- ccl_global float *buffer)
+ccl_device_inline bool svm_node_aov_check(const int path_flag, ccl_global float *render_buffer)
{
- int path_flag = state->flag;
-
bool is_primary = (path_flag & PATH_RAY_CAMERA) && (!(path_flag & PATH_RAY_SINGLE_PASS_DONE));
- return ((buffer != NULL) && is_primary);
+ return ((render_buffer != NULL) && is_primary);
}
-ccl_device void svm_node_aov_color(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer)
+ccl_device void svm_node_aov_color(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ float *stack,
+ uint4 node,
+ ccl_global float *render_buffer)
{
float3 val = stack_load_float3(stack, node.y);
- if (buffer) {
- kernel_write_pass_float4(buffer + kernel_data.film.pass_aov_color + 4 * node.z,
- make_float4(val.x, val.y, val.z, 1.0f));
+ if (render_buffer && !INTEGRATOR_STATE_IS_NULL) {
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset +
+ (kernel_data.film.pass_aov_color + node.z);
+ kernel_write_pass_float3(buffer, make_float3(val.x, val.y, val.z));
}
}
-ccl_device void svm_node_aov_value(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ccl_global float *buffer)
+ccl_device void svm_node_aov_value(INTEGRATOR_STATE_CONST_ARGS,
+ ShaderData *sd,
+ float *stack,
+ uint4 node,
+ ccl_global float *render_buffer)
{
float val = stack_load_float(stack, node.y);
- if (buffer) {
- kernel_write_pass_float(buffer + kernel_data.film.pass_aov_value + node.z, val);
+ if (render_buffer && !INTEGRATOR_STATE_IS_NULL) {
+ const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
+ const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
+ kernel_data.film.pass_stride;
+ ccl_global float *buffer = render_buffer + render_buffer_offset +
+ (kernel_data.film.pass_aov_value + node.z);
+ kernel_write_pass_float(buffer, val);
}
}
CCL_NAMESPACE_END
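
Note: both AOV writers now address the render buffer pixel-major: each pixel owns a pass_stride-sized strip of floats, and a given pass sits at a fixed offset inside that strip. The layout math as a small helper (names illustrative):

#include <cstdint>

/* Layout: | pixel 0: pass_stride floats | pixel 1: pass_stride floats | ... */
static float *pass_pointer(float *render_buffer,
                           uint32_t render_pixel_index,
                           uint32_t pass_stride,
                           uint32_t pass_offset)
{
  const uint64_t pixel_base = (uint64_t)render_pixel_index * pass_stride;
  return render_buffer + pixel_base + pass_offset;
}

The 64-bit multiply matters: for large frames, pixel_index * pass_stride can overflow 32 bits.
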
diff --git a/intern/cycles/kernel/svm/svm_attribute.h b/intern/cycles/kernel/svm/svm_attribute.h
index 62740824ad1..5f94b20af73 100644
--- a/intern/cycles/kernel/svm/svm_attribute.h
+++ b/intern/cycles/kernel/svm/svm_attribute.h
@@ -18,8 +18,11 @@ CCL_NAMESPACE_BEGIN
/* Attribute Node */
-ccl_device AttributeDescriptor svm_node_attr_init(
- KernelGlobals *kg, ShaderData *sd, uint4 node, NodeAttributeOutputType *type, uint *out_offset)
+ccl_device AttributeDescriptor svm_node_attr_init(const KernelGlobals *kg,
+ ShaderData *sd,
+ uint4 node,
+ NodeAttributeOutputType *type,
+ uint *out_offset)
{
*out_offset = node.z;
*type = (NodeAttributeOutputType)node.w;
@@ -44,31 +47,37 @@ ccl_device AttributeDescriptor svm_node_attr_init(
return desc;
}
-ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+template<uint node_feature_mask>
+ccl_device_noinline void svm_node_attr(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT;
uint out_offset = 0;
AttributeDescriptor desc = svm_node_attr_init(kg, sd, node, &type, &out_offset);
#ifdef __VOLUME__
- /* Volumes
- * NOTE: moving this into its own node type might help improve performance. */
- if (primitive_is_volume_attribute(sd, desc)) {
- const float4 value = volume_attribute_float4(kg, sd, desc);
+ if (KERNEL_NODES_FEATURE(VOLUME)) {
+ /* Volumes
+ * NOTE: moving this into its own node type might help improve performance. */
+ if (primitive_is_volume_attribute(sd, desc)) {
+ const float4 value = volume_attribute_float4(kg, sd, desc);
- if (type == NODE_ATTR_OUTPUT_FLOAT) {
- const float f = volume_attribute_value_to_float(value);
- stack_store_float(stack, out_offset, f);
- }
- else if (type == NODE_ATTR_OUTPUT_FLOAT3) {
- const float3 f = volume_attribute_value_to_float3(value);
- stack_store_float3(stack, out_offset, f);
+ if (type == NODE_ATTR_OUTPUT_FLOAT) {
+ const float f = volume_attribute_value_to_float(value);
+ stack_store_float(stack, out_offset, f);
+ }
+ else if (type == NODE_ATTR_OUTPUT_FLOAT3) {
+ const float3 f = volume_attribute_value_to_float3(value);
+ stack_store_float3(stack, out_offset, f);
+ }
+ else {
+ const float f = volume_attribute_value_to_alpha(value);
+ stack_store_float(stack, out_offset, f);
+ }
+ return;
}
- else {
- const float f = volume_attribute_value_to_alpha(value);
- stack_store_float(stack, out_offset, f);
- }
- return;
}
#endif
@@ -139,7 +148,10 @@ ccl_device void svm_node_attr(KernelGlobals *kg, ShaderData *sd, float *stack, u
}
}
-ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_attr_bump_dx(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT;
uint out_offset = 0;
@@ -232,7 +244,10 @@ ccl_device void svm_node_attr_bump_dx(KernelGlobals *kg, ShaderData *sd, float *
}
}
-ccl_device void svm_node_attr_bump_dy(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_attr_bump_dy(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
NodeAttributeOutputType type = NODE_ATTR_OUTPUT_FLOAT;
uint out_offset = 0;
diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h
index bf5957ec9e4..9d7ce202d49 100644
--- a/intern/cycles/kernel/svm/svm_bevel.h
+++ b/intern/cycles/kernel/svm/svm_bevel.h
@@ -14,21 +14,95 @@
* limitations under the License.
*/
+#include "kernel/bvh/bvh.h"
+#include "kernel/kernel_montecarlo.h"
+#include "kernel/kernel_random.h"
+
CCL_NAMESPACE_BEGIN
#ifdef __SHADER_RAYTRACE__
+/* Planar Cubic BSSRDF falloff, reused for bevel.
+ *
+ * This is basically (Rm - x)^3, with some factors to normalize it. For sampling
+ * we integrate 2*pi*x * (Rm - x)^3, which gives us a quintic equation that as
+ * far as I can tell has no closed-form solution. So we use an iterative
+ * Newton-Raphson solution instead. */
+
+ccl_device float svm_bevel_cubic_eval(const float radius, float r)
+{
+ const float Rm = radius;
+
+ if (r >= Rm)
+ return 0.0f;
+
+ /* integrating (2*pi*r * 10*(R - r)^3)/(pi * R^5) from 0 to R gives 1 */
+ const float Rm5 = (Rm * Rm) * (Rm * Rm) * Rm;
+ const float f = Rm - r;
+ const float num = f * f * f;
+
+ return (10.0f * num) / (Rm5 * M_PI_F);
+}
+
+ccl_device float svm_bevel_cubic_pdf(const float radius, float r)
+{
+ return svm_bevel_cubic_eval(radius, r);
+}
+
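
For reference, a short derivation of where the quintic below comes from. Writing x = r / Rm, the CDF of the normalized falloff sampled over the disk is

  F(x) = \int_0^{x R_m} \frac{2\pi r \cdot 10 (R_m - r)^3}{\pi R_m^5} \, dr = 10x^2 - 20x^3 + 15x^4 - 4x^5,

so drawing a radius means solving F(x) = xi for a uniform xi in [0, 1], and F'(x) = 20x(1 - x)^3 is exactly the derivative used in the Newton step below.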
+/* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
+ccl_device_forceinline float svm_bevel_cubic_quintic_root_find(float xi)
+{
+ /* Newton-Raphson iteration, usually succeeds in 2-4 iterations, except
+ * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
+ * should not be too bad */
+ const float tolerance = 1e-6f;
+ const int max_iteration_count = 10;
+ float x = 0.25f;
+ int i;
+
+ for (i = 0; i < max_iteration_count; i++) {
+ float x2 = x * x;
+ float x3 = x2 * x;
+ float nx = (1.0f - x);
+
+ float f = 10.0f * x2 - 20.0f * x3 + 15.0f * x2 * x2 - 4.0f * x2 * x3 - xi;
+ float f_ = 20.0f * (x * nx) * (nx * nx);
+
+ if (fabsf(f) < tolerance || f_ == 0.0f)
+ break;
+
+ x = saturate(x - f / f_);
+ }
+
+ return x;
+}
+
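
A standalone sanity check of the root finder (plain host-side C++, not kernel code; all names are local to this sketch):

#include <cmath>
#include <cstdio>

/* CDF of the normalized cubic falloff: F(x) = 10x^2 - 20x^3 + 15x^4 - 4x^5. */
static float cubic_cdf(float x)
{
  float x2 = x * x, x3 = x2 * x;
  return 10.0f * x2 - 20.0f * x3 + 15.0f * x2 * x2 - 4.0f * x2 * x3;
}

static float quintic_root_find(float xi)
{
  float x = 0.25f;
  for (int i = 0; i < 10; i++) {
    float f = cubic_cdf(x) - xi;
    float nx = 1.0f - x;
    float df = 20.0f * x * nx * nx * nx; /* F'(x) = 20x(1 - x)^3 */
    if (fabsf(f) < 1e-6f || df == 0.0f)
      break;
    x = x - f / df;
    x = x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x); /* saturate, as above */
  }
  return x;
}

int main()
{
  for (float xi = 0.05f; xi <= 0.95f; xi += 0.15f) {
    float x = quintic_root_find(xi);
    printf("xi=%.2f -> x=%.4f, F(x)=%.4f\n", (double)xi, (double)x, (double)cubic_cdf(x));
  }
  return 0;
}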
+ccl_device void svm_bevel_cubic_sample(const float radius, float xi, float *r, float *h)
+{
+ float Rm = radius;
+ float r_ = svm_bevel_cubic_quintic_root_find(xi);
+
+ r_ *= Rm;
+ *r = r_;
+
+ /* h^2 + r^2 = Rm^2 */
+ *h = safe_sqrtf(Rm * Rm - r_ * r_);
+}
+
/* Bevel shader averaging normals from nearby surfaces.
*
* Sampling strategy from: BSSRDF Importance Sampling, SIGGRAPH 2013
* http://library.imageworks.com/pdfs/imageworks-library-BSSRDF-sampling.pdf
*/
-ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
- ShaderData *sd,
- ccl_addr_space PathState *state,
- float radius,
- int num_samples)
+# ifdef __KERNEL_OPTIX__
+extern "C" __device__ float3 __direct_callable__svm_node_bevel(INTEGRATOR_STATE_CONST_ARGS,
+# else
+ccl_device float3 svm_bevel(INTEGRATOR_STATE_CONST_ARGS,
+# endif
+ ShaderData *sd,
+ float radius,
+ int num_samples)
{
/* Early out if no sampling needed. */
if (radius <= 0.0f || num_samples < 1 || sd->object == OBJECT_NONE) {
@@ -41,21 +115,27 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
}
/* Don't bevel for blurry indirect rays. */
- if (state->min_ray_pdf < 8.0f) {
+ if (INTEGRATOR_STATE(path, min_ray_pdf) < 8.0f) {
return sd->N;
}
/* Setup for multi intersection. */
LocalIntersection isect;
- uint lcg_state = lcg_state_init_addrspace(state, 0x64c6a40e);
+ uint lcg_state = lcg_state_init(INTEGRATOR_STATE(path, rng_hash),
+ INTEGRATOR_STATE(path, rng_offset),
+ INTEGRATOR_STATE(path, sample),
+ 0x64c6a40e);
/* Sample normals from surrounding points on surface. */
float3 sum_N = make_float3(0.0f, 0.0f, 0.0f);
+ /* TODO: support ray-tracing in shadow shader evaluation? */
+ RNGState rng_state;
+ path_state_rng_load(INTEGRATOR_STATE_PASS, &rng_state);
+
for (int sample = 0; sample < num_samples; sample++) {
float disk_u, disk_v;
- path_branched_rng_2D(
- kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
+ path_branched_rng_2D(kg, &rng_state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);
/* Pick random axis in local frame and point on disk. */
float3 disk_N, disk_T, disk_B;
@@ -97,7 +177,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
float disk_height;
/* Perhaps find something better than Cubic BSSRDF, but happens to work well. */
- bssrdf_cubic_sample(radius, 0.0f, disk_r, &disk_r, &disk_height);
+ svm_bevel_cubic_sample(radius, disk_r, &disk_r, &disk_height);
float3 disk_P = (disk_r * cosf(phi)) * disk_T + (disk_r * sinf(phi)) * disk_B;
@@ -106,8 +186,8 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
ray->P = sd->P + disk_N * disk_height + disk_P;
ray->D = -disk_N;
ray->t = 2.0f * disk_height;
- ray->dP = sd->dP;
- ray->dD = differential3_zero();
+ ray->dP = differential_zero_compact();
+ ray->dD = differential_zero_compact();
ray->time = sd->time;
/* Intersect with the same object. If multiple intersections are found it
@@ -120,14 +200,16 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
/* Quickly retrieve P and Ng without setting up ShaderData. */
float3 hit_P;
if (sd->type & PRIMITIVE_TRIANGLE) {
- hit_P = triangle_refine_local(kg, sd, &isect.hits[hit], ray);
+ hit_P = triangle_refine_local(
+ kg, sd, ray->P, ray->D, ray->t, isect.hits[hit].object, isect.hits[hit].prim);
}
# ifdef __OBJECT_MOTION__
else if (sd->type & PRIMITIVE_MOTION_TRIANGLE) {
float3 verts[3];
motion_triangle_vertices(
kg, sd->object, kernel_tex_fetch(__prim_index, isect.hits[hit].prim), sd->time, verts);
- hit_P = motion_triangle_refine_local(kg, sd, &isect.hits[hit], ray, verts);
+ hit_P = motion_triangle_refine_local(
+ kg, sd, ray->P, ray->D, ray->t, isect.hits[hit].object, isect.hits[hit].prim, verts);
}
# endif /* __OBJECT_MOTION__ */
@@ -173,7 +255,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
/* Multiple importance sample between 3 axes, power heuristic
* found to be slightly better than balance heuristic. pdf_N
- * in the MIS weight and denominator cancelled out. */
+ * in the MIS weight and denominator canceled out. */
float w = pdf_N / (sqr(pdf_N) + sqr(pdf_T) + sqr(pdf_B));
if (isect.num_hits > LOCAL_MAX_HITS) {
w *= isect.num_hits / (float)LOCAL_MAX_HITS;
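
Spelled out, the weight computed just above is the beta = 2 power heuristic over the three disk axes with the estimator's own pdf divided out:

  w = \frac{p_N^2}{p_N^2 + p_T^2 + p_B^2} \cdot \frac{1}{p_N} = \frac{p_N}{p_N^2 + p_T^2 + p_B^2},

which is why pdf_N appears only once in the code: the copy in the MIS numerator and the copy in the estimator denominator cancel.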
@@ -183,8 +265,8 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
float r = len(hit_P - sd->P);
/* Compute weight. */
- float pdf = bssrdf_cubic_pdf(radius, 0.0f, r);
- float disk_pdf = bssrdf_cubic_pdf(radius, 0.0f, disk_r);
+ float pdf = svm_bevel_cubic_pdf(radius, r);
+ float disk_pdf = svm_bevel_cubic_pdf(radius, disk_r);
w *= pdf / disk_pdf;
@@ -198,19 +280,34 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
return is_zero(N) ? sd->N : (sd->flag & SD_BACKFACING) ? -N : N;
}
-ccl_device void svm_node_bevel(
- KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, float *stack, uint4 node)
+template<uint node_feature_mask>
+# if defined(__KERNEL_OPTIX__)
+ccl_device_inline
+# else
+ccl_device_noinline
+# endif
+ void
+ svm_node_bevel(INTEGRATOR_STATE_CONST_ARGS, ShaderData *sd, float *stack, uint4 node)
{
uint num_samples, radius_offset, normal_offset, out_offset;
svm_unpack_node_uchar4(node.y, &num_samples, &radius_offset, &normal_offset, &out_offset);
float radius = stack_load_float(stack, radius_offset);
- float3 bevel_N = svm_bevel(kg, sd, state, radius, num_samples);
- if (stack_valid(normal_offset)) {
- /* Preserve input normal. */
- float3 ref_N = stack_load_float3(stack, normal_offset);
- bevel_N = normalize(ref_N + (bevel_N - sd->N));
+ float3 bevel_N = sd->N;
+
+ if (KERNEL_NODES_FEATURE(RAYTRACE)) {
+# ifdef __KERNEL_OPTIX__
+ bevel_N = optixDirectCall<float3>(1, INTEGRATOR_STATE_PASS, sd, radius, num_samples);
+# else
+ bevel_N = svm_bevel(INTEGRATOR_STATE_PASS, sd, radius, num_samples);
+# endif
+
+ if (stack_valid(normal_offset)) {
+ /* Preserve input normal. */
+ float3 ref_N = stack_load_float3(stack, normal_offset);
+ bevel_N = normalize(ref_N + (bevel_N - sd->N));
+ }
}
stack_store_float3(stack, out_offset, bevel_N);
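
On OptiX the raytrace path is compiled as a direct-callable program (the extern "C" __device__ __direct_callable__svm_node_bevel above) and invoked through optixDirectCall, which dispatches via the shader binding table. The generic shape of that API, for orientation (from optix_device.h; the SBT index 1 mirrors the call site above):

/* template<typename ReturnT, typename... ArgTypes>
 * ReturnT optixDirectCall(unsigned int sbtIndex, ArgTypes... args);
 * The callee must be an extern "C" __device__ function whose name begins
 * with __direct_callable__ and must be registered in the SBT at sbtIndex. */
float3 bevel_N = optixDirectCall<float3>(1, INTEGRATOR_STATE_PASS, sd, radius, num_samples);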
diff --git a/intern/cycles/kernel/svm/svm_blackbody.h b/intern/cycles/kernel/svm/svm_blackbody.h
index adfc50d961e..96b3703b954 100644
--- a/intern/cycles/kernel/svm/svm_blackbody.h
+++ b/intern/cycles/kernel/svm/svm_blackbody.h
@@ -34,8 +34,11 @@ CCL_NAMESPACE_BEGIN
/* Blackbody Node */
-ccl_device void svm_node_blackbody(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint temperature_offset, uint col_offset)
+ccl_device_noinline void svm_node_blackbody(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint temperature_offset,
+ uint col_offset)
{
/* Input */
float temperature = stack_load_float(stack, temperature_offset);
diff --git a/intern/cycles/kernel/svm/svm_brick.h b/intern/cycles/kernel/svm/svm_brick.h
index 6984afa30a5..dca1b220dd5 100644
--- a/intern/cycles/kernel/svm/svm_brick.h
+++ b/intern/cycles/kernel/svm/svm_brick.h
@@ -72,12 +72,12 @@ ccl_device_noinline_cpu float2 svm_brick(float3 p,
return make_float2(tint, mortar);
}
-ccl_device void svm_node_tex_brick(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_brick(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
- uint4 node2 = read_node(kg, offset);
- uint4 node3 = read_node(kg, offset);
- uint4 node4 = read_node(kg, offset);
+ uint4 node2 = read_node(kg, &offset);
+ uint4 node3 = read_node(kg, &offset);
+ uint4 node4 = read_node(kg, &offset);
/* Input and Output Sockets */
uint co_offset, color1_offset, color2_offset, mortar_offset, scale_offset;
@@ -133,6 +133,7 @@ ccl_device void svm_node_tex_brick(
stack_store_float3(stack, color_offset, color1 * (1.0f - f) + mortar * f);
if (stack_valid(fac_offset))
stack_store_float(stack, fac_offset, f);
+ return offset;
}
CCL_NAMESPACE_END
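
A calling-convention change recurs across these hunks: nodes that consume extra instruction words used to advance a shared int *offset in place, and now take offset by value and return the advanced value instead. A hypothetical fragment of the interpreter loop under the new convention (the dispatch shape is assumed for illustration):

int offset = start_offset; /* assumed: start of the shader's SVM program */
for (;;) {
  uint4 node = read_node(kg, &offset);
  switch (node.x) {
    case NODE_TEX_BRICK:
      /* Multi-word nodes report how far they advanced. */
      offset = svm_node_tex_brick(kg, sd, stack, node, offset);
      break;
    /* ... single-word nodes leave offset untouched ... */
  }
}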
diff --git a/intern/cycles/kernel/svm/svm_brightness.h b/intern/cycles/kernel/svm/svm_brightness.h
index 9554b5946fb..2ed812acd71 100644
--- a/intern/cycles/kernel/svm/svm_brightness.h
+++ b/intern/cycles/kernel/svm/svm_brightness.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_brightness(
+ccl_device_noinline void svm_node_brightness(
ShaderData *sd, float *stack, uint in_color, uint out_color, uint node)
{
uint bright_offset, contrast_offset;
diff --git a/intern/cycles/kernel/svm/svm_bump.h b/intern/cycles/kernel/svm/svm_bump.h
index c9d430a2bba..8672839dbab 100644
--- a/intern/cycles/kernel/svm/svm_bump.h
+++ b/intern/cycles/kernel/svm/svm_bump.h
@@ -18,10 +18,10 @@ CCL_NAMESPACE_BEGIN
/* Bump Eval Nodes */
-ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint offset)
+ccl_device_noinline void svm_node_enter_bump_eval(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint offset)
{
/* save state */
stack_store_float3(stack, offset + 0, sd->P);
@@ -45,10 +45,10 @@ ccl_device void svm_node_enter_bump_eval(KernelGlobals *kg,
}
}
-ccl_device void svm_node_leave_bump_eval(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint offset)
+ccl_device_noinline void svm_node_leave_bump_eval(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint offset)
{
/* restore state */
sd->P = stack_load_float3(stack, offset + 0);
diff --git a/intern/cycles/kernel/svm/svm_camera.h b/intern/cycles/kernel/svm/svm_camera.h
index 21a17acf5f1..40c0edcdad0 100644
--- a/intern/cycles/kernel/svm/svm_camera.h
+++ b/intern/cycles/kernel/svm/svm_camera.h
@@ -16,12 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_camera(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint out_vector,
- uint out_zdepth,
- uint out_distance)
+ccl_device_noinline void svm_node_camera(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint out_vector,
+ uint out_zdepth,
+ uint out_distance)
{
float distance;
float zdepth;
diff --git a/intern/cycles/kernel/svm/svm_checker.h b/intern/cycles/kernel/svm/svm_checker.h
index d54cb73df91..a9919c9ddc9 100644
--- a/intern/cycles/kernel/svm/svm_checker.h
+++ b/intern/cycles/kernel/svm/svm_checker.h
@@ -32,7 +32,10 @@ ccl_device float svm_checker(float3 p)
return ((xi % 2 == yi % 2) == (zi % 2)) ? 1.0f : 0.0f;
}
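
The parity test in svm_checker above is terse but equivalent to asking whether xi + yi + zi is odd. A quick host-side sanity sketch (assuming non-negative cell indices):

#include <cassert>

int main()
{
  for (int xi = 0; xi < 4; xi++)
    for (int yi = 0; yi < 4; yi++)
      for (int zi = 0; zi < 4; zi++) {
        bool a = ((xi % 2 == yi % 2) == (zi % 2));
        bool b = ((xi + yi + zi) % 2 == 1);
        assert(a == b); /* both mark cells with an odd coordinate sum */
      }
  return 0;
}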
-ccl_device void svm_node_tex_checker(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tex_checker(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint co_offset, color1_offset, color2_offset, scale_offset;
uint color_offset, fac_offset;
diff --git a/intern/cycles/kernel/svm/svm_clamp.h b/intern/cycles/kernel/svm/svm_clamp.h
index a85fd82754e..656bd31c085 100644
--- a/intern/cycles/kernel/svm/svm_clamp.h
+++ b/intern/cycles/kernel/svm/svm_clamp.h
@@ -18,18 +18,18 @@ CCL_NAMESPACE_BEGIN
/* Clamp Node */
-ccl_device void svm_node_clamp(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint value_stack_offset,
- uint parameters_stack_offsets,
- uint result_stack_offset,
- int *offset)
+ccl_device_noinline int svm_node_clamp(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint value_stack_offset,
+ uint parameters_stack_offsets,
+ uint result_stack_offset,
+ int offset)
{
uint min_stack_offset, max_stack_offset, type;
svm_unpack_node_uchar3(parameters_stack_offsets, &min_stack_offset, &max_stack_offset, &type);
- uint4 defaults = read_node(kg, offset);
+ uint4 defaults = read_node(kg, &offset);
float value = stack_load_float(stack, value_stack_offset);
float min = stack_load_float_default(stack, min_stack_offset, defaults.x);
@@ -41,6 +41,7 @@ ccl_device void svm_node_clamp(KernelGlobals *kg,
else {
stack_store_float(stack, result_stack_offset, clamp(value, min, max));
}
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_closure.h b/intern/cycles/kernel/svm/svm_closure.h
index bbe8d72edf0..e2f6dde4ace 100644
--- a/intern/cycles/kernel/svm/svm_closure.h
+++ b/intern/cycles/kernel/svm/svm_closure.h
@@ -57,13 +57,9 @@ ccl_device void svm_node_glass_setup(
}
}
-ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node,
- ShaderType shader_type,
- int path_flag,
- int *offset)
+template<uint node_feature_mask, ShaderType shader_type>
+ccl_device_noinline int svm_node_closure_bsdf(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int offset)
{
uint type, param1_offset, param2_offset;
@@ -73,19 +69,19 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
1.0f);
/* note we read this extra node before weight check, so offset is added */
- uint4 data_node = read_node(kg, offset);
+ uint4 data_node = read_node(kg, &offset);
/* Only compute BSDF for surfaces, transparent variable is shared with volume extinction. */
- if (mix_weight == 0.0f || shader_type != SHADER_TYPE_SURFACE) {
+ if ((!KERNEL_NODES_FEATURE(BSDF) || shader_type != SHADER_TYPE_SURFACE) || mix_weight == 0.0f) {
if (type == CLOSURE_BSDF_PRINCIPLED_ID) {
/* Read all principled BSDF extra data to get the right offset. */
- read_node(kg, offset);
- read_node(kg, offset);
- read_node(kg, offset);
- read_node(kg, offset);
+ read_node(kg, &offset);
+ read_node(kg, &offset);
+ read_node(kg, &offset);
+ read_node(kg, &offset);
}
- return;
+ return offset;
}
float3 N = stack_valid(data_node.x) ? stack_load_float3(stack, data_node.x) : sd->N;
@@ -102,7 +98,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
sheen_offset, sheen_tint_offset, clearcoat_offset, clearcoat_roughness_offset,
eta_offset, transmission_offset, anisotropic_rotation_offset,
transmission_roughness_offset;
- uint4 data_node2 = read_node(kg, offset);
+ uint4 data_node2 = read_node(kg, &offset);
float3 T = stack_load_float3(stack, data_node.y);
svm_unpack_node_uchar4(data_node.z,
@@ -158,7 +154,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
float specular_weight = (1.0f - final_transmission);
// get the base color
- uint4 data_base_color = read_node(kg, offset);
+ uint4 data_base_color = read_node(kg, &offset);
float3 base_color = stack_valid(data_base_color.x) ?
stack_load_float3(stack, data_base_color.x) :
make_float3(__uint_as_float(data_base_color.y),
@@ -166,16 +162,21 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
__uint_as_float(data_base_color.w));
// get the additional clearcoat normal and subsurface scattering radius
- uint4 data_cn_ssr = read_node(kg, offset);
+ uint4 data_cn_ssr = read_node(kg, &offset);
float3 clearcoat_normal = stack_valid(data_cn_ssr.x) ?
stack_load_float3(stack, data_cn_ssr.x) :
sd->N;
float3 subsurface_radius = stack_valid(data_cn_ssr.y) ?
stack_load_float3(stack, data_cn_ssr.y) :
make_float3(1.0f, 1.0f, 1.0f);
+ float subsurface_ior = stack_valid(data_cn_ssr.z) ? stack_load_float(stack, data_cn_ssr.z) :
+ 1.4f;
+ float subsurface_anisotropy = stack_valid(data_cn_ssr.w) ?
+ stack_load_float(stack, data_cn_ssr.w) :
+ 0.0f;
// get the subsurface color
- uint4 data_subsurface_color = read_node(kg, offset);
+ uint4 data_subsurface_color = read_node(kg, &offset);
float3 subsurface_color = stack_valid(data_subsurface_color.x) ?
stack_load_float3(stack, data_subsurface_color.x) :
make_float3(__uint_as_float(data_subsurface_color.y),
@@ -222,16 +223,16 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
if (bssrdf) {
bssrdf->radius = subsurface_radius * subsurface;
- bssrdf->albedo = (subsurface_method == CLOSURE_BSSRDF_PRINCIPLED_ID) ?
- subsurface_color :
- mixed_ss_base_color;
- bssrdf->texture_blur = 0.0f;
- bssrdf->sharpness = 0.0f;
+ bssrdf->albedo = mixed_ss_base_color;
bssrdf->N = N;
bssrdf->roughness = roughness;
+ /* Clamps protecting against bad/extreme and non-physical values. */
+ subsurface_ior = clamp(subsurface_ior, 1.01f, 3.8f);
+ bssrdf->anisotropy = clamp(subsurface_anisotropy, 0.0f, 0.9f);
+
/* setup bsdf */
- sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method);
+ sd->flag |= bssrdf_setup(sd, bssrdf, subsurface_method, subsurface_ior);
}
}
}
@@ -733,9 +734,9 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
}
#ifdef __HAIR__
case CLOSURE_BSDF_HAIR_PRINCIPLED_ID: {
- uint4 data_node2 = read_node(kg, offset);
- uint4 data_node3 = read_node(kg, offset);
- uint4 data_node4 = read_node(kg, offset);
+ uint4 data_node2 = read_node(kg, &offset);
+ uint4 data_node3 = read_node(kg, &offset);
+ uint4 data_node4 = read_node(kg, &offset);
float3 weight = sd->svm_closure_weight * mix_weight;
@@ -878,10 +879,8 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
#endif /* __HAIR__ */
#ifdef __SUBSURFACE__
- case CLOSURE_BSSRDF_CUBIC_ID:
- case CLOSURE_BSSRDF_GAUSSIAN_ID:
- case CLOSURE_BSSRDF_BURLEY_ID:
- case CLOSURE_BSSRDF_RANDOM_WALK_ID: {
+ case CLOSURE_BSSRDF_RANDOM_WALK_ID:
+ case CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID: {
float3 weight = sd->svm_closure_weight * mix_weight;
Bssrdf *bssrdf = bssrdf_alloc(sd, weight);
@@ -894,11 +893,14 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
bssrdf->radius = stack_load_float3(stack, data_node.z) * param1;
bssrdf->albedo = sd->svm_closure_weight;
- bssrdf->texture_blur = param2;
- bssrdf->sharpness = stack_load_float(stack, data_node.w);
bssrdf->N = N;
- bssrdf->roughness = 0.0f;
- sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type);
+ bssrdf->roughness = FLT_MAX;
+
+ const float subsurface_ior = clamp(param2, 1.01f, 3.8f);
+ const float subsurface_anisotropy = stack_load_float(stack, data_node.w);
+ bssrdf->anisotropy = clamp(subsurface_anisotropy, 0.0f, 0.9f);
+
+ sd->flag |= bssrdf_setup(sd, bssrdf, (ClosureType)type, subsurface_ior);
}
break;
@@ -907,10 +909,15 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg,
default:
break;
}
+
+ return offset;
}
-ccl_device void svm_node_closure_volume(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, ShaderType shader_type)
+template<ShaderType shader_type>
+ccl_device_noinline void svm_node_closure_volume(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
#ifdef __VOLUME__
/* Only sum extinction for volumes, variable is shared with surface transparency. */
@@ -961,21 +968,17 @@ ccl_device void svm_node_closure_volume(
#endif
}
-ccl_device void svm_node_principled_volume(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node,
- ShaderType shader_type,
- int path_flag,
- int *offset)
+template<ShaderType shader_type>
+ccl_device_noinline int svm_node_principled_volume(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int path_flag, int offset)
{
#ifdef __VOLUME__
- uint4 value_node = read_node(kg, offset);
- uint4 attr_node = read_node(kg, offset);
+ uint4 value_node = read_node(kg, &offset);
+ uint4 attr_node = read_node(kg, &offset);
/* Only sum extinction for volumes, variable is shared with surface transparency. */
if (shader_type != SHADER_TYPE_VOLUME) {
- return;
+ return offset;
}
uint density_offset, anisotropy_offset, absorption_color_offset, mix_weight_offset;
@@ -985,7 +988,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
1.0f);
if (mix_weight == 0.0f) {
- return;
+ return offset;
}
/* Compute density. */
@@ -1034,7 +1037,7 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
/* Compute emission. */
if (path_flag & PATH_RAY_SHADOW) {
/* Don't need emission for shadows. */
- return;
+ return offset;
}
uint emission_offset, emission_color_offset, blackbody_offset, temperature_offset;
@@ -1074,9 +1077,10 @@ ccl_device void svm_node_principled_volume(KernelGlobals *kg,
}
}
#endif
+ return offset;
}
-ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 node)
{
uint mix_weight_offset = node.y;
float3 weight = sd->svm_closure_weight;
@@ -1093,7 +1097,7 @@ ccl_device void svm_node_closure_emission(ShaderData *sd, float *stack, uint4 no
emission_setup(sd, weight);
}
-ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_closure_background(ShaderData *sd, float *stack, uint4 node)
{
uint mix_weight_offset = node.y;
float3 weight = sd->svm_closure_weight;
@@ -1110,7 +1114,7 @@ ccl_device void svm_node_closure_background(ShaderData *sd, float *stack, uint4
background_setup(sd, weight);
}
-ccl_device void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_closure_holdout(ShaderData *sd, float *stack, uint4 node)
{
uint mix_weight_offset = node.y;
@@ -1145,14 +1149,13 @@ ccl_device void svm_node_closure_set_weight(ShaderData *sd, uint r, uint g, uint
ccl_device void svm_node_closure_weight(ShaderData *sd, float *stack, uint weight_offset)
{
float3 weight = stack_load_float3(stack, weight_offset);
-
svm_node_closure_store_weight(sd, weight);
}
-ccl_device void svm_node_emission_weight(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node)
+ccl_device_noinline void svm_node_emission_weight(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint color_offset = node.y;
uint strength_offset = node.z;
@@ -1163,7 +1166,7 @@ ccl_device void svm_node_emission_weight(KernelGlobals *kg,
svm_node_closure_store_weight(sd, weight);
}
-ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
{
/* fetch weight from blend input, previous mix closures,
* and write to stack to be used by closure nodes later */
@@ -1186,7 +1189,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd, float *stack, uint4 node)
/* (Bump) normal */
ccl_device void svm_node_set_normal(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint in_direction, uint out_normal)
{
float3 normal = stack_load_float3(stack, in_direction);
sd->N = normal;
diff --git a/intern/cycles/kernel/svm/svm_convert.h b/intern/cycles/kernel/svm/svm_convert.h
index 5df6c9fb755..37d40167ccc 100644
--- a/intern/cycles/kernel/svm/svm_convert.h
+++ b/intern/cycles/kernel/svm/svm_convert.h
@@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN
/* Conversion Nodes */
-ccl_device void svm_node_convert(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint from, uint to)
+ccl_device_noinline void svm_node_convert(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint from, uint to)
{
switch (type) {
case NODE_CONVERT_FI: {
diff --git a/intern/cycles/kernel/svm/svm_displace.h b/intern/cycles/kernel/svm/svm_displace.h
index 250fac6bcb8..a1d952173d8 100644
--- a/intern/cycles/kernel/svm/svm_displace.h
+++ b/intern/cycles/kernel/svm/svm_displace.h
@@ -14,11 +14,16 @@
* limitations under the License.
*/
+#include "kernel/kernel_montecarlo.h"
+
CCL_NAMESPACE_BEGIN
/* Bump Node */
-ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_set_bump(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
#ifdef __RAY_DIFFERENTIALS__
/* get normal input */
@@ -83,7 +88,7 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
/* Displacement Node */
-ccl_device void svm_node_set_displacement(KernelGlobals *kg,
+ccl_device void svm_node_set_displacement(const KernelGlobals *kg,
ShaderData *sd,
float *stack,
uint fac_offset)
@@ -92,7 +97,10 @@ ccl_device void svm_node_set_displacement(KernelGlobals *kg,
sd->P += dP;
}
-ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_displacement(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint height_offset, midlevel_offset, scale_offset, normal_offset;
svm_unpack_node_uchar4(node.y, &height_offset, &midlevel_offset, &scale_offset, &normal_offset);
@@ -119,10 +127,10 @@ ccl_device void svm_node_displacement(KernelGlobals *kg, ShaderData *sd, float *
stack_store_float3(stack, node.z, dP);
}
-ccl_device void svm_node_vector_displacement(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_vector_displacement(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
- uint4 data_node = read_node(kg, offset);
+ uint4 data_node = read_node(kg, &offset);
uint space = data_node.x;
uint vector_offset, midlevel_offset, scale_offset, displacement_offset;
@@ -164,6 +172,7 @@ ccl_device void svm_node_vector_displacement(
}
stack_store_float3(stack, displacement_offset, dP);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_fresnel.h b/intern/cycles/kernel/svm/svm_fresnel.h
index 96d602e35bf..b5ecdbe2abf 100644
--- a/intern/cycles/kernel/svm/svm_fresnel.h
+++ b/intern/cycles/kernel/svm/svm_fresnel.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
/* Fresnel Node */
-ccl_device void svm_node_fresnel(
+ccl_device_noinline void svm_node_fresnel(
ShaderData *sd, float *stack, uint ior_offset, uint ior_value, uint node)
{
uint normal_offset, out_offset;
@@ -37,7 +37,7 @@ ccl_device void svm_node_fresnel(
/* Layer Weight Node */
-ccl_device void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_layer_weight(ShaderData *sd, float *stack, uint4 node)
{
uint blend_offset = node.y;
uint blend_value = node.z;
diff --git a/intern/cycles/kernel/svm/svm_gamma.h b/intern/cycles/kernel/svm/svm_gamma.h
index 65eb08eb0eb..f6fafdee941 100644
--- a/intern/cycles/kernel/svm/svm_gamma.h
+++ b/intern/cycles/kernel/svm/svm_gamma.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_gamma(
+ccl_device_noinline void svm_node_gamma(
ShaderData *sd, float *stack, uint in_gamma, uint in_color, uint out_color)
{
float3 color = stack_load_float3(stack, in_color);
diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h
index e48e96dcfa4..10e9f291d0e 100644
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -18,8 +18,8 @@ CCL_NAMESPACE_BEGIN
/* Geometry Node */
-ccl_device_inline void svm_node_geometry(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_geometry(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
float3 data;
@@ -51,8 +51,8 @@ ccl_device_inline void svm_node_geometry(
stack_store_float3(stack, out_offset, data);
}
-ccl_device void svm_node_geometry_bump_dx(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_geometry_bump_dx(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -75,8 +75,8 @@ ccl_device void svm_node_geometry_bump_dx(
#endif
}
-ccl_device void svm_node_geometry_bump_dy(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_geometry_bump_dy(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -101,8 +101,8 @@ ccl_device void svm_node_geometry_bump_dy(
/* Object Info */
-ccl_device void svm_node_object_info(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_object_info(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
float data;
@@ -140,8 +140,8 @@ ccl_device void svm_node_object_info(
/* Particle Info */
-ccl_device void svm_node_particle_info(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_particle_info(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
switch (type) {
case NODE_INFO_PAR_INDEX: {
@@ -199,8 +199,8 @@ ccl_device void svm_node_particle_info(
/* Hair Info */
-ccl_device void svm_node_hair_info(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
+ccl_device_noinline void svm_node_hair_info(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint type, uint out_offset)
{
float data;
float3 data3;
diff --git a/intern/cycles/kernel/svm/svm_gradient.h b/intern/cycles/kernel/svm/svm_gradient.h
index 08304bc47e8..cd15f7097e7 100644
--- a/intern/cycles/kernel/svm/svm_gradient.h
+++ b/intern/cycles/kernel/svm/svm_gradient.h
@@ -60,7 +60,7 @@ ccl_device float svm_gradient(float3 p, NodeGradientType type)
return 0.0f;
}
-ccl_device void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tex_gradient(ShaderData *sd, float *stack, uint4 node)
{
uint type, co_offset, color_offset, fac_offset;
diff --git a/intern/cycles/kernel/svm/svm_hsv.h b/intern/cycles/kernel/svm/svm_hsv.h
index c299cf58c7f..6f49a8385aa 100644
--- a/intern/cycles/kernel/svm/svm_hsv.h
+++ b/intern/cycles/kernel/svm/svm_hsv.h
@@ -19,8 +19,10 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_hsv(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline void svm_node_hsv(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint in_color_offset, fac_offset, out_color_offset;
uint hue_offset, sat_offset, val_offset;
diff --git a/intern/cycles/kernel/svm/svm_ies.h b/intern/cycles/kernel/svm/svm_ies.h
index 56c804b44d0..9c13734ecf0 100644
--- a/intern/cycles/kernel/svm/svm_ies.h
+++ b/intern/cycles/kernel/svm/svm_ies.h
@@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN
/* IES Light */
ccl_device_inline float interpolate_ies_vertical(
- KernelGlobals *kg, int ofs, int v, int v_num, float v_frac, int h)
+ const KernelGlobals *kg, int ofs, int v, int v_num, float v_frac, int h)
{
/* Since lookups are performed in spherical coordinates, clamping the coordinates at the low end
* of v (corresponding to the north pole) would result in artifacts. The proper way of dealing
@@ -39,7 +39,7 @@ ccl_device_inline float interpolate_ies_vertical(
return cubic_interp(a, b, c, d, v_frac);
}
-ccl_device_inline float kernel_ies_interp(KernelGlobals *kg,
+ccl_device_inline float kernel_ies_interp(const KernelGlobals *kg,
int slot,
float h_angle,
float v_angle)
@@ -98,8 +98,10 @@ ccl_device_inline float kernel_ies_interp(KernelGlobals *kg,
return max(cubic_interp(a, b, c, d, h_frac), 0.0f);
}
-ccl_device void svm_node_ies(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline void svm_node_ies(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint vector_offset, strength_offset, fac_offset, slot = node.z;
svm_unpack_node_uchar3(node.y, &strength_offset, &vector_offset, &fac_offset);
diff --git a/intern/cycles/kernel/svm/svm_image.h b/intern/cycles/kernel/svm/svm_image.h
index 9348ddabde5..a344f36977a 100644
--- a/intern/cycles/kernel/svm/svm_image.h
+++ b/intern/cycles/kernel/svm/svm_image.h
@@ -16,7 +16,7 @@
CCL_NAMESPACE_BEGIN
-ccl_device float4 svm_image_texture(KernelGlobals *kg, int id, float x, float y, uint flags)
+ccl_device float4 svm_image_texture(const KernelGlobals *kg, int id, float x, float y, uint flags)
{
if (id == -1) {
return make_float4(
@@ -44,8 +44,8 @@ ccl_device_inline float3 texco_remap_square(float3 co)
return (co - make_float3(0.5f, 0.5f, 0.5f)) * 2.0f;
}
-ccl_device void svm_node_tex_image(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_image(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint co_offset, out_offset, alpha_offset, flags;
@@ -71,7 +71,7 @@ ccl_device void svm_node_tex_image(
int num_nodes = (int)node.y;
if (num_nodes > 0) {
/* Remember the offset of the node following the tile nodes. */
- int next_offset = (*offset) + num_nodes;
+ int next_offset = offset + num_nodes;
/* Find the tile that the UV lies in. */
int tx = (int)tex_co.x;
@@ -83,7 +83,7 @@ ccl_device void svm_node_tex_image(
/* Find the index of the tile. */
for (int i = 0; i < num_nodes; i++) {
- uint4 tile_node = read_node(kg, offset);
+ uint4 tile_node = read_node(kg, &offset);
if (tile_node.x == tile) {
id = tile_node.y;
break;
@@ -102,7 +102,7 @@ ccl_device void svm_node_tex_image(
}
/* Skip over the remaining nodes. */
- *offset = next_offset;
+ offset = next_offset;
}
else {
id = -num_nodes;
@@ -114,9 +114,13 @@ ccl_device void svm_node_tex_image(
stack_store_float3(stack, out_offset, make_float3(f.x, f.y, f.z));
if (stack_valid(alpha_offset))
stack_store_float(stack, alpha_offset, f.w);
+ return offset;
}
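
The tile loop above compares each packed tile node against a tile number derived from the integer part of the UV. Assuming the standard UDIM numbering (an assumption; the computation itself sits outside this hunk), that number is:

/* Standard UDIM convention (assumed): tile 1001 covers u,v in [0,1)x[0,1);
 * columns advance the number by 1, rows by 10. */
int udim_tile_from_uv(float u, float v)
{
  int tx = (int)u;
  int ty = (int)v;
  return 1001 + tx + 10 * ty;
}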
-ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tex_image_box(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
/* get object space normal */
float3 N = sd->N;
@@ -215,10 +219,10 @@ ccl_device void svm_node_tex_image_box(KernelGlobals *kg, ShaderData *sd, float
stack_store_float(stack, alpha_offset, f.w);
}
-ccl_device void svm_node_tex_environment(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node)
+ccl_device_noinline void svm_node_tex_environment(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint id = node.y;
uint co_offset, out_offset, alpha_offset, flags;
diff --git a/intern/cycles/kernel/svm/svm_invert.h b/intern/cycles/kernel/svm/svm_invert.h
index 02024742b13..27cdaaff473 100644
--- a/intern/cycles/kernel/svm/svm_invert.h
+++ b/intern/cycles/kernel/svm/svm_invert.h
@@ -21,7 +21,7 @@ ccl_device float invert(float color, float factor)
return factor * (1.0f - color) + (1.0f - factor) * color;
}
-ccl_device void svm_node_invert(
+ccl_device_noinline void svm_node_invert(
ShaderData *sd, float *stack, uint in_fac, uint in_color, uint out_color)
{
float factor = stack_load_float(stack, in_fac);
diff --git a/intern/cycles/kernel/svm/svm_light_path.h b/intern/cycles/kernel/svm/svm_light_path.h
index 768c65918cd..49fabad1cc5 100644
--- a/intern/cycles/kernel/svm/svm_light_path.h
+++ b/intern/cycles/kernel/svm/svm_light_path.h
@@ -18,12 +18,12 @@ CCL_NAMESPACE_BEGIN
/* Light Path Node */
-ccl_device void svm_node_light_path(ShaderData *sd,
- ccl_addr_space PathState *state,
- float *stack,
- uint type,
- uint out_offset,
- int path_flag)
+ccl_device_noinline void svm_node_light_path(INTEGRATOR_STATE_CONST_ARGS,
+ const ShaderData *sd,
+ float *stack,
+ uint type,
+ uint out_offset,
+ int path_flag)
{
float info = 0.0f;
@@ -58,21 +58,47 @@ ccl_device void svm_node_light_path(ShaderData *sd,
case NODE_LP_ray_length:
info = sd->ray_length;
break;
- case NODE_LP_ray_depth:
- info = (float)state->bounce;
+ case NODE_LP_ray_depth: {
+ /* Read bounce from a different location depending on whether this is a
+ * shadow path. It's a bit dubious to have integrator state details leak
+ * into this function but hard to avoid currently. */
+ int bounce = (INTEGRATOR_STATE_IS_NULL) ? 0 :
+ (path_flag & PATH_RAY_SHADOW) ? INTEGRATOR_STATE(shadow_path, bounce) :
+ INTEGRATOR_STATE(path, bounce);
+
+ /* For background, light emission and shadow evaluation from a surface
+ * or volume we are effectively one bounce further. */
+ if (path_flag & (PATH_RAY_SHADOW | PATH_RAY_EMISSION)) {
+ bounce++;
+ }
+
+ info = (float)bounce;
break;
+ }
+ /* TODO */
+ case NODE_LP_ray_transparent: {
+ const int bounce = (INTEGRATOR_STATE_IS_NULL) ?
+ 0 :
+ (path_flag & PATH_RAY_SHADOW) ?
+ INTEGRATOR_STATE(shadow_path, transparent_bounce) :
+ INTEGRATOR_STATE(path, transparent_bounce);
+
+ info = (float)bounce;
+ break;
+ }
+#if 0
case NODE_LP_ray_diffuse:
info = (float)state->diffuse_bounce;
break;
case NODE_LP_ray_glossy:
info = (float)state->glossy_bounce;
break;
- case NODE_LP_ray_transparent:
- info = (float)state->transparent_bounce;
- break;
+#endif
+#if 0
case NODE_LP_ray_transmission:
info = (float)state->transmission_bounce;
break;
+#endif
}
stack_store_float(stack, out_offset, info);
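
The INTEGRATOR_STATE(...) reads above come from the new integrator state macros; how they expand depends on the kernel being built. As a rough, assumed illustration only:

/* Assumed expansions, for orientation (the real macros live in the
 * integrator headers and differ between CPU and GPU kernels):
 *   CPU, array-of-structs:  INTEGRATOR_STATE(path, bounce) -> state->path.bounce
 *   GPU, struct-of-arrays:  INTEGRATOR_STATE(path, bounce)
 *                             -> kernel_integrator_state.path.bounce[state]
 * INTEGRATOR_STATE_IS_NULL covers callers that evaluate a shader with no
 * path state at all, hence the bounce of 0 above. */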
@@ -80,7 +106,7 @@ ccl_device void svm_node_light_path(ShaderData *sd,
/* Light Falloff Node */
-ccl_device void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_light_falloff(ShaderData *sd, float *stack, uint4 node)
{
uint strength_offset, out_offset, smooth_offset;
diff --git a/intern/cycles/kernel/svm/svm_magic.h b/intern/cycles/kernel/svm/svm_magic.h
index 9c160e6d8cc..8784c760860 100644
--- a/intern/cycles/kernel/svm/svm_magic.h
+++ b/intern/cycles/kernel/svm/svm_magic.h
@@ -87,8 +87,8 @@ ccl_device_noinline_cpu float3 svm_magic(float3 p, int n, float distortion)
return make_float3(0.5f - x, 0.5f - y, 0.5f - z);
}
-ccl_device void svm_node_tex_magic(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_magic(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint depth;
uint scale_offset, distortion_offset, co_offset, fac_offset, color_offset;
@@ -96,7 +96,7 @@ ccl_device void svm_node_tex_magic(
svm_unpack_node_uchar3(node.y, &depth, &color_offset, &fac_offset);
svm_unpack_node_uchar3(node.z, &co_offset, &scale_offset, &distortion_offset);
- uint4 node2 = read_node(kg, offset);
+ uint4 node2 = read_node(kg, &offset);
float3 co = stack_load_float3(stack, co_offset);
float scale = stack_load_float_default(stack, scale_offset, node2.x);
float distortion = stack_load_float_default(stack, distortion_offset, node2.y);
@@ -107,6 +107,7 @@ ccl_device void svm_node_tex_magic(
stack_store_float(stack, fac_offset, average(color));
if (stack_valid(color_offset))
stack_store_float3(stack, color_offset, color);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_map_range.h b/intern/cycles/kernel/svm/svm_map_range.h
index 533a631c837..c8684981e31 100644
--- a/intern/cycles/kernel/svm/svm_map_range.h
+++ b/intern/cycles/kernel/svm/svm_map_range.h
@@ -24,13 +24,13 @@ ccl_device_inline float smootherstep(float edge0, float edge1, float x)
return x * x * x * (x * (x * 6.0f - 15.0f) + 10.0f);
}
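
The polynomial above is the standard quintic smoother-step,

  s(t) = 6t^5 - 15t^4 + 10t^3,  with  s'(t) = 30 t^2 (1 - t)^2,

so both the first and second derivatives vanish at t = 0 and t = 1, giving the node's SMOOTHERSTEP mode a C2-continuous ramp. (The edge0/edge1 remap and clamp presumably happen earlier in the function, outside the lines shown here.)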
-ccl_device void svm_node_map_range(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint value_stack_offset,
- uint parameters_stack_offsets,
- uint results_stack_offsets,
- int *offset)
+ccl_device_noinline int svm_node_map_range(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint value_stack_offset,
+ uint parameters_stack_offsets,
+ uint results_stack_offsets,
+ int offset)
{
uint from_min_stack_offset, from_max_stack_offset, to_min_stack_offset, to_max_stack_offset;
uint type_stack_offset, steps_stack_offset, result_stack_offset;
@@ -42,8 +42,8 @@ ccl_device void svm_node_map_range(KernelGlobals *kg,
svm_unpack_node_uchar3(
results_stack_offsets, &type_stack_offset, &steps_stack_offset, &result_stack_offset);
- uint4 defaults = read_node(kg, offset);
- uint4 defaults2 = read_node(kg, offset);
+ uint4 defaults = read_node(kg, &offset);
+ uint4 defaults2 = read_node(kg, &offset);
float value = stack_load_float(stack, value_stack_offset);
float from_min = stack_load_float_default(stack, from_min_stack_offset, defaults.x);
@@ -83,6 +83,7 @@ ccl_device void svm_node_map_range(KernelGlobals *kg,
result = 0.0f;
}
stack_store_float(stack, result_stack_offset, result);
+ return offset;
}
CCL_NAMESPACE_END
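
For orientation, the LINEAR variant of Map Range computes the usual remap; the hunk above only shows the plumbing, so this is a sketch of the assumed formula, consistent with the result = 0.0f fallback when the source range is degenerate:

float map_range_linear(float value, float from_min, float from_max,
                       float to_min, float to_max)
{
  if (from_max == from_min)
    return 0.0f; /* degenerate source range, mirrors the fallback above */
  float factor = (value - from_min) / (from_max - from_min);
  return to_min + factor * (to_max - to_min);
}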
diff --git a/intern/cycles/kernel/svm/svm_mapping.h b/intern/cycles/kernel/svm/svm_mapping.h
index 6e19c859e19..fcc724405f5 100644
--- a/intern/cycles/kernel/svm/svm_mapping.h
+++ b/intern/cycles/kernel/svm/svm_mapping.h
@@ -18,13 +18,12 @@ CCL_NAMESPACE_BEGIN
/* Mapping Node */
-ccl_device void svm_node_mapping(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint type,
- uint inputs_stack_offsets,
- uint result_stack_offset,
- int *offset)
+ccl_device_noinline void svm_node_mapping(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint type,
+ uint inputs_stack_offsets,
+ uint result_stack_offset)
{
uint vector_stack_offset, location_stack_offset, rotation_stack_offset, scale_stack_offset;
svm_unpack_node_uchar4(inputs_stack_offsets,
@@ -44,30 +43,40 @@ ccl_device void svm_node_mapping(KernelGlobals *kg,
/* Texture Mapping */
-ccl_device void svm_node_texture_mapping(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset)
+ccl_device_noinline int svm_node_texture_mapping(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint vec_offset,
+ uint out_offset,
+ int offset)
{
float3 v = stack_load_float3(stack, vec_offset);
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
float3 r = transform_point(&tfm, v);
stack_store_float3(stack, out_offset, r);
+ return offset;
}
-ccl_device void svm_node_min_max(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint vec_offset, uint out_offset, int *offset)
+ccl_device_noinline int svm_node_min_max(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint vec_offset,
+ uint out_offset,
+ int offset)
{
float3 v = stack_load_float3(stack, vec_offset);
- float3 mn = float4_to_float3(read_node_float(kg, offset));
- float3 mx = float4_to_float3(read_node_float(kg, offset));
+ float3 mn = float4_to_float3(read_node_float(kg, &offset));
+ float3 mx = float4_to_float3(read_node_float(kg, &offset));
float3 r = min(max(mn, v), mx);
stack_store_float3(stack, out_offset, r);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h
index 733ea28f9e5..99e7a8f2bda 100644
--- a/intern/cycles/kernel/svm/svm_math.h
+++ b/intern/cycles/kernel/svm/svm_math.h
@@ -16,13 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_math(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint type,
- uint inputs_stack_offsets,
- uint result_stack_offset,
- int *offset)
+ccl_device_noinline void svm_node_math(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint type,
+ uint inputs_stack_offsets,
+ uint result_stack_offset)
{
uint a_stack_offset, b_stack_offset, c_stack_offset;
svm_unpack_node_uchar3(inputs_stack_offsets, &a_stack_offset, &b_stack_offset, &c_stack_offset);
@@ -35,13 +34,13 @@ ccl_device void svm_node_math(KernelGlobals *kg,
stack_store_float(stack, result_stack_offset, result);
}
-ccl_device void svm_node_vector_math(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint type,
- uint inputs_stack_offsets,
- uint outputs_stack_offsets,
- int *offset)
+ccl_device_noinline int svm_node_vector_math(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint type,
+ uint inputs_stack_offsets,
+ uint outputs_stack_offsets,
+ int offset)
{
uint value_stack_offset, vector_stack_offset;
uint a_stack_offset, b_stack_offset, param1_stack_offset;
@@ -60,7 +59,7 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg,
/* 3 Vector Operators */
if (type == NODE_VECTOR_MATH_WRAP || type == NODE_VECTOR_MATH_FACEFORWARD ||
type == NODE_VECTOR_MATH_MULTIPLY_ADD) {
- uint4 extra_node = read_node(kg, offset);
+ uint4 extra_node = read_node(kg, &offset);
c = stack_load_float3(stack, extra_node.x);
}
@@ -70,6 +69,7 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg,
stack_store_float(stack, value_stack_offset, value);
if (stack_valid(vector_stack_offset))
stack_store_float3(stack, vector_stack_offset, vector);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_mix.h b/intern/cycles/kernel/svm/svm_mix.h
index 15114bfd5e4..3e38080977f 100644
--- a/intern/cycles/kernel/svm/svm_mix.h
+++ b/intern/cycles/kernel/svm/svm_mix.h
@@ -18,16 +18,16 @@ CCL_NAMESPACE_BEGIN
/* Node */
-ccl_device void svm_node_mix(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint fac_offset,
- uint c1_offset,
- uint c2_offset,
- int *offset)
+ccl_device_noinline int svm_node_mix(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint fac_offset,
+ uint c1_offset,
+ uint c2_offset,
+ int offset)
{
/* read extra data */
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
float fac = stack_load_float(stack, fac_offset);
float3 c1 = stack_load_float3(stack, c1_offset);
@@ -35,6 +35,7 @@ ccl_device void svm_node_mix(KernelGlobals *kg,
float3 result = svm_mix((NodeMix)node1.y, fac, c1, c2);
stack_store_float3(stack, node1.z, result);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h
index 571f62fe27f..03a8b68b3ef 100644
--- a/intern/cycles/kernel/svm/svm_musgrave.h
+++ b/intern/cycles/kernel/svm/svm_musgrave.h
@@ -700,13 +700,13 @@ ccl_device_noinline_cpu float noise_musgrave_ridged_multi_fractal_4d(
return value;
}
-ccl_device void svm_node_tex_musgrave(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint offsets1,
- uint offsets2,
- uint offsets3,
- int *offset)
+ccl_device_noinline int svm_node_tex_musgrave(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint offsets1,
+ uint offsets2,
+ uint offsets3,
+ int offset)
{
uint type, dimensions, co_stack_offset, w_stack_offset;
uint scale_stack_offset, detail_stack_offset, dimension_stack_offset, lacunarity_stack_offset;
@@ -720,8 +720,8 @@ ccl_device void svm_node_tex_musgrave(KernelGlobals *kg,
&lacunarity_stack_offset);
svm_unpack_node_uchar3(offsets3, &offset_stack_offset, &gain_stack_offset, &fac_stack_offset);
- uint4 defaults1 = read_node(kg, offset);
- uint4 defaults2 = read_node(kg, offset);
+ uint4 defaults1 = read_node(kg, &offset);
+ uint4 defaults2 = read_node(kg, &offset);
float3 co = stack_load_float3(stack, co_stack_offset);
float w = stack_load_float_default(stack, w_stack_offset, defaults1.x);
@@ -844,6 +844,7 @@ ccl_device void svm_node_tex_musgrave(KernelGlobals *kg,
}
stack_store_float(stack, fac_stack_offset, fac);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index 94d8bfde555..ecb4df6afdf 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -330,7 +330,7 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
* |__________________________|
*
*/
-ccl_device_noinline float perlin_2d(float x, float y)
+ccl_device_noinline_cpu float perlin_2d(float x, float y)
{
ssei XY;
ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
@@ -447,7 +447,7 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
* v7 (1, 1, 1)
*
*/
-ccl_device_noinline float perlin_3d(float x, float y, float z)
+ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
ssei XYZ;
ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
@@ -501,7 +501,7 @@ ccl_device_noinline float perlin_3d(float x, float y, float z)
* v15 (1, 1, 1, 1)
*
*/
-ccl_device_noinline float perlin_4d(float x, float y, float z, float w)
+ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
ssei XYZW;
ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
@@ -585,7 +585,7 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
* |__________________________|
*
*/
-ccl_device_noinline float perlin_3d(float x, float y, float z)
+ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
{
ssei XYZ;
ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
@@ -637,7 +637,7 @@ ccl_device_noinline float perlin_3d(float x, float y, float z)
* v15 (1, 1, 1, 1)
*
*/
-ccl_device_noinline float perlin_4d(float x, float y, float z, float w)
+ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
{
ssei XYZW;
ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h
index 61fd9553802..29b262ac06e 100644
--- a/intern/cycles/kernel/svm/svm_noisetex.h
+++ b/intern/cycles/kernel/svm/svm_noisetex.h
@@ -140,13 +140,13 @@ ccl_device void noise_texture_4d(float4 co,
}
}
-ccl_device void svm_node_tex_noise(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint dimensions,
- uint offsets1,
- uint offsets2,
- int *offset)
+ccl_device_noinline int svm_node_tex_noise(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint offsets1,
+ uint offsets2,
+ int offset)
{
uint vector_stack_offset, w_stack_offset, scale_stack_offset;
uint detail_stack_offset, roughness_stack_offset, distortion_stack_offset;
@@ -160,8 +160,8 @@ ccl_device void svm_node_tex_noise(KernelGlobals *kg,
&value_stack_offset,
&color_stack_offset);
- uint4 defaults1 = read_node(kg, offset);
- uint4 defaults2 = read_node(kg, offset);
+ uint4 defaults1 = read_node(kg, &offset);
+ uint4 defaults2 = read_node(kg, &offset);
float3 vector = stack_load_float3(stack, vector_stack_offset);
float w = stack_load_float_default(stack, w_stack_offset, defaults1.x);
@@ -212,6 +212,7 @@ ccl_device void svm_node_tex_noise(KernelGlobals *kg,
if (stack_valid(color_stack_offset)) {
stack_store_float3(stack, color_stack_offset, color);
}
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_normal.h b/intern/cycles/kernel/svm/svm_normal.h
index 4cd3eab0ed2..724b5f281f9 100644
--- a/intern/cycles/kernel/svm/svm_normal.h
+++ b/intern/cycles/kernel/svm/svm_normal.h
@@ -16,16 +16,16 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_normal(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint in_normal_offset,
- uint out_normal_offset,
- uint out_dot_offset,
- int *offset)
+ccl_device_noinline int svm_node_normal(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint in_normal_offset,
+ uint out_normal_offset,
+ uint out_dot_offset,
+ int offset)
{
/* read extra data */
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
float3 normal = stack_load_float3(stack, in_normal_offset);
float3 direction;
@@ -39,6 +39,7 @@ ccl_device void svm_node_normal(KernelGlobals *kg,
if (stack_valid(out_dot_offset))
stack_store_float(stack, out_dot_offset, dot(direction, normalize(normal)));
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h
index 85ccf39144b..e92df3c093c 100644
--- a/intern/cycles/kernel/svm/svm_ramp.h
+++ b/intern/cycles/kernel/svm/svm_ramp.h
@@ -21,8 +21,12 @@ CCL_NAMESPACE_BEGIN
/* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */
-ccl_device_inline float4 rgb_ramp_lookup(
- KernelGlobals *kg, int offset, float f, bool interpolate, bool extrapolate, int table_size)
+ccl_device_inline float4 rgb_ramp_lookup(const KernelGlobals *kg,
+ int offset,
+ float f,
+ bool interpolate,
+ bool extrapolate,
+ int table_size)
{
if ((f < 0.0f || f > 1.0f) && extrapolate) {
float4 t0, dy;
@@ -53,34 +57,35 @@ ccl_device_inline float4 rgb_ramp_lookup(
return a;
}
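
Outside the extrapolation branch shown above, the lookup itself is a linear table fetch. A minimal standalone sketch under assumed conventions (entries sampled uniformly over f in [0, 1]; the kernel version reads float4 entries via read_node_float):

#include <algorithm>

float ramp_lookup(const float *table, int table_size, float f, bool interpolate)
{
  f = std::max(0.0f, std::min(f, 1.0f)); /* extrapolation handled elsewhere */
  float i_f = f * (float)(table_size - 1);
  int i = std::min((int)i_f, table_size - 1);
  if (!interpolate)
    return table[i];
  int j = std::min(i + 1, table_size - 1);
  float t = i_f - (float)i;
  return table[i] * (1.0f - t) + table[j] * t; /* lerp between neighbors */
}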
-ccl_device void svm_node_rgb_ramp(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_rgb_ramp(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint fac_offset, color_offset, alpha_offset;
uint interpolate = node.z;
svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &alpha_offset);
- uint table_size = read_node(kg, offset).x;
+ uint table_size = read_node(kg, &offset).x;
float fac = stack_load_float(stack, fac_offset);
- float4 color = rgb_ramp_lookup(kg, *offset, fac, interpolate, false, table_size);
+ float4 color = rgb_ramp_lookup(kg, offset, fac, interpolate, false, table_size);
if (stack_valid(color_offset))
stack_store_float3(stack, color_offset, float4_to_float3(color));
if (stack_valid(alpha_offset))
stack_store_float(stack, alpha_offset, color.w);
- *offset += table_size;
+ offset += table_size;
+ return offset;
}
-ccl_device void svm_node_curves(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_curves(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint fac_offset, color_offset, out_offset;
svm_unpack_node_uchar3(node.y, &fac_offset, &color_offset, &out_offset);
- uint table_size = read_node(kg, offset).x;
+ uint table_size = read_node(kg, &offset).x;
float fac = stack_load_float(stack, fac_offset);
float3 color = stack_load_float3(stack, color_offset);
@@ -89,14 +94,15 @@ ccl_device void svm_node_curves(
const float range_x = max_x - min_x;
const float3 relpos = (color - make_float3(min_x, min_x, min_x)) / range_x;
- float r = rgb_ramp_lookup(kg, *offset, relpos.x, true, true, table_size).x;
- float g = rgb_ramp_lookup(kg, *offset, relpos.y, true, true, table_size).y;
- float b = rgb_ramp_lookup(kg, *offset, relpos.z, true, true, table_size).z;
+ float r = rgb_ramp_lookup(kg, offset, relpos.x, true, true, table_size).x;
+ float g = rgb_ramp_lookup(kg, offset, relpos.y, true, true, table_size).y;
+ float b = rgb_ramp_lookup(kg, offset, relpos.z, true, true, table_size).z;
color = (1.0f - fac) * color + fac * make_float3(r, g, b);
stack_store_float3(stack, out_offset, color);
- *offset += table_size;
+ offset += table_size;
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
index f501252062e..8d52845ea3d 100644
--- a/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
+++ b/intern/cycles/kernel/svm/svm_sepcomb_hsv.h
@@ -16,15 +16,15 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_combine_hsv(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint hue_in,
- uint saturation_in,
- uint value_in,
- int *offset)
+ccl_device_noinline int svm_node_combine_hsv(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint hue_in,
+ uint saturation_in,
+ uint value_in,
+ int offset)
{
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
uint color_out = node1.y;
float hue = stack_load_float(stack, hue_in);
@@ -36,17 +36,18 @@ ccl_device void svm_node_combine_hsv(KernelGlobals *kg,
if (stack_valid(color_out))
stack_store_float3(stack, color_out, color);
+ return offset;
}
-ccl_device void svm_node_separate_hsv(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint color_in,
- uint hue_out,
- uint saturation_out,
- int *offset)
+ccl_device_noinline int svm_node_separate_hsv(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint color_in,
+ uint hue_out,
+ uint saturation_out,
+ int offset)
{
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
uint value_out = node1.y;
float3 color = stack_load_float3(stack, color_in);
@@ -60,6 +61,7 @@ ccl_device void svm_node_separate_hsv(KernelGlobals *kg,
stack_store_float(stack, saturation_out, color.y);
if (stack_valid(value_out))
stack_store_float(stack, value_out, color.z);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_sky.h b/intern/cycles/kernel/svm/svm_sky.h
index b908732f026..b77c4311e72 100644
--- a/intern/cycles/kernel/svm/svm_sky.h
+++ b/intern/cycles/kernel/svm/svm_sky.h
@@ -37,7 +37,7 @@ ccl_device float sky_perez_function(float *lam, float theta, float gamma)
(1.0f + lam[2] * expf(lam[3] * gamma) + lam[4] * cgamma * cgamma);
}
-ccl_device float3 sky_radiance_preetham(KernelGlobals *kg,
+ccl_device float3 sky_radiance_preetham(const KernelGlobals *kg,
float3 dir,
float sunphi,
float suntheta,
@@ -90,7 +90,7 @@ ccl_device float sky_radiance_internal(float *configuration, float theta, float
configuration[6] * mieM + configuration[7] * zenith);
}
-ccl_device float3 sky_radiance_hosek(KernelGlobals *kg,
+ccl_device float3 sky_radiance_hosek(const KernelGlobals *kg,
float3 dir,
float sunphi,
float suntheta,
@@ -127,7 +127,7 @@ ccl_device float3 geographical_to_direction(float lat, float lon)
return make_float3(cos(lat) * cos(lon), cos(lat) * sin(lon), sin(lat));
}
-ccl_device float3 sky_radiance_nishita(KernelGlobals *kg,
+ccl_device float3 sky_radiance_nishita(const KernelGlobals *kg,
float3 dir,
float *nishita_data,
uint texture_id)
@@ -209,8 +209,8 @@ ccl_device float3 sky_radiance_nishita(KernelGlobals *kg,
return xyz_to_rgb(kg, xyz);
}
-ccl_device void svm_node_tex_sky(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_sky(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
/* Load data */
uint dir_offset = node.y;
@@ -226,49 +226,49 @@ ccl_device void svm_node_tex_sky(
float sunphi, suntheta, radiance_x, radiance_y, radiance_z;
float config_x[9], config_y[9], config_z[9];
- float4 data = read_node_float(kg, offset);
+ float4 data = read_node_float(kg, &offset);
sunphi = data.x;
suntheta = data.y;
radiance_x = data.z;
radiance_y = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
radiance_z = data.x;
config_x[0] = data.y;
config_x[1] = data.z;
config_x[2] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_x[3] = data.x;
config_x[4] = data.y;
config_x[5] = data.z;
config_x[6] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_x[7] = data.x;
config_x[8] = data.y;
config_y[0] = data.z;
config_y[1] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_y[2] = data.x;
config_y[3] = data.y;
config_y[4] = data.z;
config_y[5] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_y[6] = data.x;
config_y[7] = data.y;
config_y[8] = data.z;
config_z[0] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_z[1] = data.x;
config_z[2] = data.y;
config_z[3] = data.z;
config_z[4] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
config_z[5] = data.x;
config_z[6] = data.y;
config_z[7] = data.z;
@@ -305,19 +305,19 @@ ccl_device void svm_node_tex_sky(
/* Define variables */
float nishita_data[10];
- float4 data = read_node_float(kg, offset);
+ float4 data = read_node_float(kg, &offset);
nishita_data[0] = data.x;
nishita_data[1] = data.y;
nishita_data[2] = data.z;
nishita_data[3] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
nishita_data[4] = data.x;
nishita_data[5] = data.y;
nishita_data[6] = data.z;
nishita_data[7] = data.w;
- data = read_node_float(kg, offset);
+ data = read_node_float(kg, &offset);
nishita_data[8] = data.x;
nishita_data[9] = data.y;
uint texture_id = __float_as_uint(data.z);
@@ -327,6 +327,7 @@ ccl_device void svm_node_tex_sky(
}
stack_store_float3(stack, out_offset, f);
+ return offset;
}
CCL_NAMESPACE_END
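
Note: the sky node keeps its fitted coefficients inline in the SVM instruction
stream; each read_node_float() consumes one uint4 word reinterpreted as four
floats, and the nine-entry config arrays deliberately straddle word boundaries
(a single word can end config_x and begin config_y). A sketch of the general
unpacking idea, using a hypothetical helper rather than the hand-unrolled code
above:

    /* Unpack `count` floats from consecutive 4-float instruction words. */
    ccl_device void unpack_node_floats(const KernelGlobals *kg,
                                       int *offset,
                                       float *dst,
                                       int count)
    {
      int i = 0;
      while (i < count) {
        const float4 data = read_node_float(kg, offset);
        const float src[4] = {data.x, data.y, data.z, data.w};
        for (int j = 0; j < 4 && i < count; ++j) {
          dst[i++] = src[j];
        }
      }
    }

The real node unpacks by hand precisely because adjacent arrays share words; a
per-array helper like this sketch would waste the tail of each word.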
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index 46600551cc4..a35253080da 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -14,12 +14,16 @@
* limitations under the License.
*/
+#include "kernel/geom/geom.h"
+#include "kernel/kernel_camera.h"
+#include "kernel/kernel_montecarlo.h"
+
CCL_NAMESPACE_BEGIN
/* Texture Coordinate Node */
-ccl_device void svm_node_tex_coord(
- KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_coord(
+ const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset)
{
float3 data;
uint type = node.y;
@@ -35,9 +39,9 @@ ccl_device void svm_node_tex_coord(
}
else {
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
data = transform_point(&tfm, data);
}
break;
@@ -92,10 +96,11 @@ ccl_device void svm_node_tex_coord(
}
stack_store_float3(stack, out_offset, data);
+ return offset;
}
-ccl_device void svm_node_tex_coord_bump_dx(
- KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_coord_bump_dx(
+ const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -112,9 +117,9 @@ ccl_device void svm_node_tex_coord_bump_dx(
}
else {
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
data = transform_point(&tfm, data);
}
break;
@@ -136,7 +141,7 @@ ccl_device void svm_node_tex_coord_bump_dx(
case NODE_TEXCO_WINDOW: {
if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dx);
+ data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f));
else
data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
data.z = 0.0f;
@@ -169,13 +174,14 @@ ccl_device void svm_node_tex_coord_bump_dx(
}
stack_store_float3(stack, out_offset, data);
+ return offset;
#else
- svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
+ return svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
#endif
}
-ccl_device void svm_node_tex_coord_bump_dy(
- KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_coord_bump_dy(
+ const KernelGlobals *kg, ShaderData *sd, int path_flag, float *stack, uint4 node, int offset)
{
#ifdef __RAY_DIFFERENTIALS__
float3 data;
@@ -192,9 +198,9 @@ ccl_device void svm_node_tex_coord_bump_dy(
}
else {
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
data = transform_point(&tfm, data);
}
break;
@@ -216,7 +222,7 @@ ccl_device void svm_node_tex_coord_bump_dy(
case NODE_TEXCO_WINDOW: {
if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
- data = camera_world_to_ndc(kg, sd, sd->ray_P + sd->ray_dP.dy);
+ data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f));
else
data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
data.z = 0.0f;
@@ -249,12 +255,16 @@ ccl_device void svm_node_tex_coord_bump_dy(
}
stack_store_float3(stack, out_offset, data);
+ return offset;
#else
- svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
+ return svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
#endif
}
-ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_normal_map(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint color_offset, strength_offset, normal_offset, space;
svm_unpack_node_uchar4(node.y, &color_offset, &strength_offset, &normal_offset, &space);
@@ -346,7 +356,10 @@ ccl_device void svm_node_normal_map(KernelGlobals *kg, ShaderData *sd, float *st
stack_store_float3(stack, normal_offset, N);
}
-ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_tangent(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint tangent_offset, direction_type, axis;
svm_unpack_node_uchar3(node.y, &tangent_offset, &direction_type, &axis);
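
Note: in the two bump hunks above, the removed lines index `sd->ray_dP.dx` and
`sd->ray_dP.dy`, while the added lines build the offset with make_float3(),
i.e. the camera-ray differential has become a single scalar radius rather than
a per-axis vector pair. A sketch of the resulting window-space offsets:

    /* Orthographic-camera bump offsets with a scalar ray differential. */
    const float3 P_dx = sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f);
    const float3 P_dy = sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f);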
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 062afcfa5ac..c053be96c51 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -30,37 +30,6 @@ CCL_NAMESPACE_BEGIN
/* Nodes */
-/* Known frequencies of used nodes, used for selective nodes compilation
- * in the kernel. Currently only affects split OpenCL kernel.
- *
- * Keep as defines so it's easy to check which nodes are to be compiled
- * from preprocessor.
- *
- * Lower the number of group more often the node is used.
- */
-#define NODE_GROUP_LEVEL_0 0
-#define NODE_GROUP_LEVEL_1 1
-#define NODE_GROUP_LEVEL_2 2
-#define NODE_GROUP_LEVEL_3 3
-#define NODE_GROUP_LEVEL_4 4
-#define NODE_GROUP_LEVEL_MAX NODE_GROUP_LEVEL_4
-
-#define NODE_FEATURE_VOLUME (1 << 0)
-#define NODE_FEATURE_HAIR (1 << 1)
-#define NODE_FEATURE_BUMP (1 << 2)
-#define NODE_FEATURE_BUMP_STATE (1 << 3)
-#define NODE_FEATURE_VORONOI_EXTRA (1 << 4)
-/* TODO(sergey): Consider using something like ((uint)(-1)).
- * Need to check carefully operand types around usage of this
- * define first.
- */
-#define NODE_FEATURE_ALL \
- (NODE_FEATURE_VOLUME | NODE_FEATURE_HAIR | NODE_FEATURE_BUMP | NODE_FEATURE_BUMP_STATE | \
- NODE_FEATURE_VORONOI_EXTRA)
-
-#define NODES_GROUP(group) ((group) <= __NODES_MAX_GROUP__)
-#define NODES_FEATURE(feature) ((__NODES_FEATURES__ & (feature)) != 0)
-
typedef enum ShaderNodeType {
NODE_END = 0,
NODE_SHADER_JUMP,
@@ -572,12 +541,8 @@ typedef enum ClosureType {
CLOSURE_BSDF_TRANSPARENT_ID,
/* BSSRDF */
- CLOSURE_BSSRDF_CUBIC_ID,
- CLOSURE_BSSRDF_GAUSSIAN_ID,
- CLOSURE_BSSRDF_PRINCIPLED_ID,
- CLOSURE_BSSRDF_BURLEY_ID,
CLOSURE_BSSRDF_RANDOM_WALK_ID,
- CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID,
+ CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID,
/* Other */
CLOSURE_HOLDOUT_ID,
@@ -620,11 +585,9 @@ typedef enum ClosureType {
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID)
-#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
+#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)
#define CLOSURE_IS_BSSRDF(type) \
- (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID)
-#define CLOSURE_IS_DISK_BSSRDF(type) \
- (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
+ (type >= CLOSURE_BSSRDF_RANDOM_WALK_ID && type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)
#define CLOSURE_IS_VOLUME(type) \
(type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
#define CLOSURE_IS_VOLUME_SCATTER(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
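
Note: the CLOSURE_IS_* macros are plain range checks over ClosureType, so they
stay correct only while related closures remain contiguous in the enum. With
the cubic/Gaussian/Burley/principled entries dropped, the BSSRDF range
collapses to the two random-walk variants. A sketch of the invariant (Cycles
itself does not carry this assert; it is shown only to make the ordering
dependency explicit):

    static_assert(CLOSURE_BSSRDF_RANDOM_WALK_ID + 1 ==
                      CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID,
                  "range macros assume BSSRDF closures are contiguous");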
diff --git a/intern/cycles/kernel/svm/svm_value.h b/intern/cycles/kernel/svm/svm_value.h
index 5b76f2c8832..d0478660094 100644
--- a/intern/cycles/kernel/svm/svm_value.h
+++ b/intern/cycles/kernel/svm/svm_value.h
@@ -19,20 +19,21 @@ CCL_NAMESPACE_BEGIN
/* Value Nodes */
ccl_device void svm_node_value_f(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint ivalue, uint out_offset)
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint ivalue, uint out_offset)
{
stack_store_float(stack, out_offset, __uint_as_float(ivalue));
}
-ccl_device void svm_node_value_v(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int *offset)
+ccl_device int svm_node_value_v(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint out_offset, int offset)
{
/* read extra data */
- uint4 node1 = read_node(kg, offset);
+ uint4 node1 = read_node(kg, &offset);
float3 p = make_float3(
__uint_as_float(node1.y), __uint_as_float(node1.z), __uint_as_float(node1.w));
stack_store_float3(stack, out_offset, p);
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_vector_rotate.h b/intern/cycles/kernel/svm/svm_vector_rotate.h
index 50045752484..55e1bce0158 100644
--- a/intern/cycles/kernel/svm/svm_vector_rotate.h
+++ b/intern/cycles/kernel/svm/svm_vector_rotate.h
@@ -18,11 +18,11 @@ CCL_NAMESPACE_BEGIN
/* Vector Rotate */
-ccl_device void svm_node_vector_rotate(ShaderData *sd,
- float *stack,
- uint input_stack_offsets,
- uint axis_stack_offsets,
- uint result_stack_offset)
+ccl_device_noinline void svm_node_vector_rotate(ShaderData *sd,
+ float *stack,
+ uint input_stack_offsets,
+ uint axis_stack_offsets,
+ uint result_stack_offset)
{
uint type, vector_stack_offset, rotation_stack_offset, center_stack_offset, axis_stack_offset,
angle_stack_offset, invert;
diff --git a/intern/cycles/kernel/svm/svm_vector_transform.h b/intern/cycles/kernel/svm/svm_vector_transform.h
index 1e95492cf1b..8aedb7e0f54 100644
--- a/intern/cycles/kernel/svm/svm_vector_transform.h
+++ b/intern/cycles/kernel/svm/svm_vector_transform.h
@@ -18,10 +18,10 @@ CCL_NAMESPACE_BEGIN
/* Vector Transform */
-ccl_device void svm_node_vector_transform(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint4 node)
+ccl_device_noinline void svm_node_vector_transform(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint itype, ifrom, ito;
uint vector_in, vector_out;
diff --git a/intern/cycles/kernel/svm/svm_vertex_color.h b/intern/cycles/kernel/svm/svm_vertex_color.h
index 0aa45835522..986ea244f3a 100644
--- a/intern/cycles/kernel/svm/svm_vertex_color.h
+++ b/intern/cycles/kernel/svm/svm_vertex_color.h
@@ -16,12 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_vertex_color(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint layer_id,
- uint color_offset,
- uint alpha_offset)
+ccl_device_noinline void svm_node_vertex_color(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
{
AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
if (descriptor.offset != ATTR_STD_NOT_FOUND) {
@@ -35,18 +35,12 @@ ccl_device void svm_node_vertex_color(KernelGlobals *kg,
}
}
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_noinline
-#endif
- void
- svm_node_vertex_color_bump_dx(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint layer_id,
- uint color_offset,
- uint alpha_offset)
+ccl_device_noinline void svm_node_vertex_color_bump_dx(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
{
AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
if (descriptor.offset != ATTR_STD_NOT_FOUND) {
@@ -62,18 +56,12 @@ ccl_device_noinline
}
}
-#ifndef __KERNEL_CUDA__
-ccl_device
-#else
-ccl_device_noinline
-#endif
- void
- svm_node_vertex_color_bump_dy(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint layer_id,
- uint color_offset,
- uint alpha_offset)
+ccl_device_noinline void svm_node_vertex_color_bump_dy(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint layer_id,
+ uint color_offset,
+ uint alpha_offset)
{
AttributeDescriptor descriptor = find_attribute(kg, sd, layer_id);
if (descriptor.offset != ATTR_STD_NOT_FOUND) {
diff --git a/intern/cycles/kernel/svm/svm_voronoi.h b/intern/cycles/kernel/svm/svm_voronoi.h
index d0e7db35fab..b1d2eff7f37 100644
--- a/intern/cycles/kernel/svm/svm_voronoi.h
+++ b/intern/cycles/kernel/svm/svm_voronoi.h
@@ -902,16 +902,17 @@ ccl_device void voronoi_n_sphere_radius_4d(float4 coord, float randomness, float
*outRadius = distance(closestPointToClosestPoint, closestPoint) / 2.0f;
}
-ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint dimensions,
- uint feature,
- uint metric,
- int *offset)
+template<uint node_feature_mask>
+ccl_device_noinline int svm_node_tex_voronoi(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint feature,
+ uint metric,
+ int offset)
{
- uint4 stack_offsets = read_node(kg, offset);
- uint4 defaults = read_node(kg, offset);
+ uint4 stack_offsets = read_node(kg, &offset);
+ uint4 defaults = read_node(kg, &offset);
uint coord_stack_offset, w_stack_offset, scale_stack_offset, smoothness_stack_offset;
uint exponent_stack_offset, randomness_stack_offset, distance_out_stack_offset,
@@ -997,18 +998,18 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
&color_out,
&position_out_2d);
break;
-#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
case NODE_VORONOI_SMOOTH_F1:
- voronoi_smooth_f1_2d(coord_2d,
- smoothness,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_2d);
+ if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) {
+ voronoi_smooth_f1_2d(coord_2d,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_2d);
+ }
break;
-#endif
case NODE_VORONOI_F2:
voronoi_f2_2d(coord_2d,
exponent,
@@ -1042,18 +1043,18 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
&color_out,
&position_out);
break;
-#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
case NODE_VORONOI_SMOOTH_F1:
- voronoi_smooth_f1_3d(coord,
- smoothness,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out);
+ if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) {
+ voronoi_smooth_f1_3d(coord,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out);
+ }
break;
-#endif
case NODE_VORONOI_F2:
voronoi_f2_3d(coord,
exponent,
@@ -1076,54 +1077,54 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
break;
}
-#if NODES_FEATURE(NODE_FEATURE_VORONOI_EXTRA)
case 4: {
- float4 coord_4d = make_float4(coord.x, coord.y, coord.z, w);
- float4 position_out_4d;
- switch (voronoi_feature) {
- case NODE_VORONOI_F1:
- voronoi_f1_4d(coord_4d,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_4d);
- break;
- case NODE_VORONOI_SMOOTH_F1:
- voronoi_smooth_f1_4d(coord_4d,
- smoothness,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_4d);
- break;
- case NODE_VORONOI_F2:
- voronoi_f2_4d(coord_4d,
- exponent,
- randomness,
- voronoi_metric,
- &distance_out,
- &color_out,
- &position_out_4d);
- break;
- case NODE_VORONOI_DISTANCE_TO_EDGE:
- voronoi_distance_to_edge_4d(coord_4d, randomness, &distance_out);
- break;
- case NODE_VORONOI_N_SPHERE_RADIUS:
- voronoi_n_sphere_radius_4d(coord_4d, randomness, &radius_out);
- break;
- default:
- kernel_assert(0);
+ if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) {
+ float4 coord_4d = make_float4(coord.x, coord.y, coord.z, w);
+ float4 position_out_4d;
+ switch (voronoi_feature) {
+ case NODE_VORONOI_F1:
+ voronoi_f1_4d(coord_4d,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_SMOOTH_F1:
+ voronoi_smooth_f1_4d(coord_4d,
+ smoothness,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_F2:
+ voronoi_f2_4d(coord_4d,
+ exponent,
+ randomness,
+ voronoi_metric,
+ &distance_out,
+ &color_out,
+ &position_out_4d);
+ break;
+ case NODE_VORONOI_DISTANCE_TO_EDGE:
+ voronoi_distance_to_edge_4d(coord_4d, randomness, &distance_out);
+ break;
+ case NODE_VORONOI_N_SPHERE_RADIUS:
+ voronoi_n_sphere_radius_4d(coord_4d, randomness, &radius_out);
+ break;
+ default:
+ kernel_assert(0);
+ }
+ position_out_4d = safe_divide_float4_float(position_out_4d, scale);
+ position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z);
+ w_out = position_out_4d.w;
}
- position_out_4d = safe_divide_float4_float(position_out_4d, scale);
- position_out = make_float3(position_out_4d.x, position_out_4d.y, position_out_4d.z);
- w_out = position_out_4d.w;
break;
}
-#endif
default:
kernel_assert(0);
}
@@ -1138,6 +1139,7 @@ ccl_device void svm_node_tex_voronoi(KernelGlobals *kg,
stack_store_float(stack, w_out_stack_offset, w_out);
if (stack_valid(radius_out_stack_offset))
stack_store_float(stack, radius_out_stack_offset, radius_out);
+ return offset;
}
CCL_NAMESPACE_END
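
Note: the Voronoi hunks replace preprocessor gating (#if NODES_FEATURE(...))
with a node_feature_mask template parameter tested via KERNEL_NODES_FEATURE().
Since the mask is a template argument, the test is a compile-time constant and
disabled branches are dead-code-eliminated per kernel variant. A sketch of the
pattern, assuming KERNEL_NODES_FEATURE expands to a bit test on the mask (the
macro's exact definition is not part of this diff):

    /* Assumed: KERNEL_NODES_FEATURE(tag) ~=
     *   ((node_feature_mask & KERNEL_FEATURE_NODE_##tag) != 0) */
    template<uint node_feature_mask>
    ccl_device_noinline int svm_node_example(const KernelGlobals *kg,
                                             ShaderData *sd,
                                             float *stack,
                                             int offset)
    {
      if (KERNEL_NODES_FEATURE(VORONOI_EXTRA)) {
        /* Extra-feature path: compiled out when the bit is not in the mask. */
      }
      return offset;
    }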
diff --git a/intern/cycles/kernel/svm/svm_voxel.h b/intern/cycles/kernel/svm/svm_voxel.h
index 4bc14f82382..78b75405356 100644
--- a/intern/cycles/kernel/svm/svm_voxel.h
+++ b/intern/cycles/kernel/svm/svm_voxel.h
@@ -19,8 +19,8 @@ CCL_NAMESPACE_BEGIN
/* TODO(sergey): Think of making it more generic volume-type attribute
* sampler.
*/
-ccl_device void svm_node_tex_voxel(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_voxel(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
uint co_offset, density_out_offset, color_out_offset, space;
svm_unpack_node_uchar4(node.z, &co_offset, &density_out_offset, &color_out_offset, &space);
@@ -33,9 +33,9 @@ ccl_device void svm_node_tex_voxel(
else {
kernel_assert(space == NODE_TEX_VOXEL_SPACE_WORLD);
Transform tfm;
- tfm.x = read_node_float(kg, offset);
- tfm.y = read_node_float(kg, offset);
- tfm.z = read_node_float(kg, offset);
+ tfm.x = read_node_float(kg, &offset);
+ tfm.y = read_node_float(kg, &offset);
+ tfm.z = read_node_float(kg, &offset);
co = transform_point(&tfm, co);
}
@@ -47,6 +47,7 @@ ccl_device void svm_node_tex_voxel(
stack_store_float(stack, density_out_offset, r.w);
if (stack_valid(color_out_offset))
stack_store_float3(stack, color_out_offset, make_float3(r.x, r.y, r.z));
+ return offset;
}
CCL_NAMESPACE_END
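
Note: svm_node_tex_voxel above shows the same embedded-transform idiom used by
the texture-coordinate nodes: a 3x4 Transform is stored as three float4 rows
directly in the instruction stream and read row by row, each row consuming one
word:

    Transform tfm;
    tfm.x = read_node_float(kg, &offset);
    tfm.y = read_node_float(kg, &offset);
    tfm.z = read_node_float(kg, &offset);
    co = transform_point(&tfm, co);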
diff --git a/intern/cycles/kernel/svm/svm_wave.h b/intern/cycles/kernel/svm/svm_wave.h
index c4763475b47..00f980c16df 100644
--- a/intern/cycles/kernel/svm/svm_wave.h
+++ b/intern/cycles/kernel/svm/svm_wave.h
@@ -82,11 +82,11 @@ ccl_device_noinline_cpu float svm_wave(NodeWaveType type,
}
}
-ccl_device void svm_node_tex_wave(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int *offset)
+ccl_device_noinline int svm_node_tex_wave(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset)
{
- uint4 node2 = read_node(kg, offset);
- uint4 node3 = read_node(kg, offset);
+ uint4 node2 = read_node(kg, &offset);
+ uint4 node3 = read_node(kg, &offset);
/* RNA properties */
uint type_offset, bands_dir_offset, rings_dir_offset, profile_offset;
@@ -125,6 +125,7 @@ ccl_device void svm_node_tex_wave(
stack_store_float(stack, fac_offset, f);
if (stack_valid(color_offset))
stack_store_float3(stack, color_offset, make_float3(f, f, f));
+ return offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/svm/svm_wavelength.h b/intern/cycles/kernel/svm/svm_wavelength.h
index d6144802559..fba8aa63d31 100644
--- a/intern/cycles/kernel/svm/svm_wavelength.h
+++ b/intern/cycles/kernel/svm/svm_wavelength.h
@@ -69,8 +69,8 @@ ccl_static_constant float cie_colour_match[81][3] = {
{0.0002f, 0.0001f, 0.0000f}, {0.0002f, 0.0001f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f},
{0.0001f, 0.0000f, 0.0000f}, {0.0001f, 0.0000f, 0.0000f}, {0.0000f, 0.0000f, 0.0000f}};
-ccl_device void svm_node_wavelength(
- KernelGlobals *kg, ShaderData *sd, float *stack, uint wavelength, uint color_out)
+ccl_device_noinline void svm_node_wavelength(
+ const KernelGlobals *kg, ShaderData *sd, float *stack, uint wavelength, uint color_out)
{
float lambda_nm = stack_load_float(stack, wavelength);
float ii = (lambda_nm - 380.0f) * (1.0f / 5.0f); // scaled 0..80
diff --git a/intern/cycles/kernel/svm/svm_white_noise.h b/intern/cycles/kernel/svm/svm_white_noise.h
index b30d85acaec..0306d2e7b9c 100644
--- a/intern/cycles/kernel/svm/svm_white_noise.h
+++ b/intern/cycles/kernel/svm/svm_white_noise.h
@@ -16,13 +16,12 @@
CCL_NAMESPACE_BEGIN
-ccl_device void svm_node_tex_white_noise(KernelGlobals *kg,
- ShaderData *sd,
- float *stack,
- uint dimensions,
- uint inputs_stack_offsets,
- uint ouptuts_stack_offsets,
- int *offset)
+ccl_device_noinline void svm_node_tex_white_noise(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint dimensions,
+ uint inputs_stack_offsets,
+ uint ouptuts_stack_offsets)
{
uint vector_stack_offset, w_stack_offset, value_stack_offset, color_stack_offset;
svm_unpack_node_uchar2(inputs_stack_offsets, &vector_stack_offset, &w_stack_offset);
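
Note: svm_node_tex_white_noise loses its offset parameter entirely because it
reads everything from stack offsets packed into its single instruction word;
only nodes that consume extra words need the take-and-return offset convention.
A sketch of the resulting split at the dispatch site (the call shapes are an
assumption, since the dispatcher is not part of these hunks):

    offset = svm_node_tex_wave(kg, sd, stack, node, offset);      /* 2 extra words */
    svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w); /* 1 word */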
diff --git a/intern/cycles/kernel/svm/svm_wireframe.h b/intern/cycles/kernel/svm/svm_wireframe.h
index 49158bd86d5..7ec913789d2 100644
--- a/intern/cycles/kernel/svm/svm_wireframe.h
+++ b/intern/cycles/kernel/svm/svm_wireframe.h
@@ -35,7 +35,7 @@ CCL_NAMESPACE_BEGIN
/* Wireframe Node */
ccl_device_inline float wireframe(
- KernelGlobals *kg, ShaderData *sd, float size, int pixel_size, float3 *P)
+ const KernelGlobals *kg, ShaderData *sd, float size, int pixel_size, float3 *P)
{
#ifdef __HAIR__
if (sd->prim != PRIM_NONE && sd->type & PRIMITIVE_ALL_TRIANGLE)
@@ -88,7 +88,10 @@ ccl_device_inline float wireframe(
return 0.0f;
}
-ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node)
+ccl_device_noinline void svm_node_wireframe(const KernelGlobals *kg,
+ ShaderData *sd,
+ float *stack,
+ uint4 node)
{
uint in_size = node.y;
uint out_fac = node.z;
@@ -100,18 +103,7 @@ ccl_device void svm_node_wireframe(KernelGlobals *kg, ShaderData *sd, float *sta
int pixel_size = (int)use_pixel_size;
/* Calculate wireframe */
-#ifdef __SPLIT_KERNEL__
- /* TODO(sergey): This is because sd is actually a global space,
- * which makes it difficult to re-use same wireframe() function.
- *
- * With OpenCL 2.0 it's possible to avoid this change, but for until
- * then we'll be living with such an exception.
- */
- float3 P = sd->P;
- float f = wireframe(kg, sd, size, pixel_size, &P);
-#else
float f = wireframe(kg, sd, size, pixel_size, &sd->P);
-#endif
/* TODO(sergey): Think of faster way to calculate derivatives. */
if (bump_offset == NODE_BUMP_OFFSET_DX) {
diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt
index feead27c5ca..6edb5261b32 100644
--- a/intern/cycles/render/CMakeLists.txt
+++ b/intern/cycles/render/CMakeLists.txt
@@ -32,10 +32,10 @@ set(SRC
camera.cpp
colorspace.cpp
constant_fold.cpp
- coverage.cpp
denoising.cpp
film.cpp
geometry.cpp
+ gpu_display.cpp
graph.cpp
hair.cpp
image.cpp
@@ -54,6 +54,7 @@ set(SRC
object.cpp
osl.cpp
particles.cpp
+ pass.cpp
curves.cpp
scene.cpp
session.cpp
@@ -76,10 +77,10 @@ set(SRC_HEADERS
camera.h
colorspace.h
constant_fold.h
- coverage.h
denoising.h
film.h
geometry.h
+ gpu_display.h
graph.h
hair.h
image.h
@@ -95,6 +96,7 @@ set(SRC_HEADERS
object.h
osl.h
particles.h
+ pass.h
procedural.h
curves.h
scene.h
@@ -111,6 +113,7 @@ set(SRC_HEADERS
set(LIB
cycles_bvh
cycles_device
+ cycles_integrator
cycles_subd
cycles_util
)
diff --git a/intern/cycles/render/background.cpp b/intern/cycles/render/background.cpp
index b925e755434..ae6290ac27b 100644
--- a/intern/cycles/render/background.cpp
+++ b/intern/cycles/render/background.cpp
@@ -34,11 +34,7 @@ NODE_DEFINE(Background)
{
NodeType *type = NodeType::add("background", create);
- SOCKET_FLOAT(ao_factor, "AO Factor", 0.0f);
- SOCKET_FLOAT(ao_distance, "AO Distance", FLT_MAX);
-
SOCKET_BOOLEAN(use_shader, "Use Shader", true);
- SOCKET_BOOLEAN(use_ao, "Use AO", false);
SOCKET_UINT(visibility, "Visibility", PATH_RAY_ALL_VISIBILITY);
SOCKET_BOOLEAN(transparent, "Transparent", false);
@@ -80,10 +76,6 @@ void Background::device_update(Device *device, DeviceScene *dscene, Scene *scene
/* set shader index and transparent option */
KernelBackground *kbackground = &dscene->data.background;
- kbackground->ao_factor = (use_ao) ? ao_factor : 0.0f;
- kbackground->ao_bounces_factor = ao_factor;
- kbackground->ao_distance = ao_distance;
-
kbackground->transparent = transparent;
kbackground->surface_shader = scene->shader_manager->get_shader_id(bg_shader);
@@ -138,10 +130,6 @@ void Background::tag_update(Scene *scene)
* and to avoid doing unnecessary updates anywhere else. */
tag_use_shader_modified();
}
-
- if (ao_factor_is_modified() || use_ao_is_modified()) {
- scene->integrator->tag_update(scene, Integrator::BACKGROUND_AO_MODIFIED);
- }
}
Shader *Background::get_shader(const Scene *scene)
diff --git a/intern/cycles/render/background.h b/intern/cycles/render/background.h
index e89ffbc2445..2f7ef0f7737 100644
--- a/intern/cycles/render/background.h
+++ b/intern/cycles/render/background.h
@@ -32,11 +32,7 @@ class Background : public Node {
public:
NODE_DECLARE
- NODE_SOCKET_API(float, ao_factor)
- NODE_SOCKET_API(float, ao_distance)
-
NODE_SOCKET_API(bool, use_shader)
- NODE_SOCKET_API(bool, use_ao)
NODE_SOCKET_API(uint, visibility)
NODE_SOCKET_API(Shader *, shader)
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index 317a3937cab..54e496caed6 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -26,58 +26,8 @@
CCL_NAMESPACE_BEGIN
-static int aa_samples(Scene *scene, Object *object, ShaderEvalType type)
-{
- if (type == SHADER_EVAL_UV || type == SHADER_EVAL_ROUGHNESS) {
- return 1;
- }
- else if (type == SHADER_EVAL_NORMAL) {
- /* Only antialias normal if mesh has bump mapping. */
- if (object->get_geometry()) {
- foreach (Node *node, object->get_geometry()->get_used_shaders()) {
- Shader *shader = static_cast<Shader *>(node);
- if (shader->has_bump) {
- return scene->integrator->get_aa_samples();
- }
- }
- }
-
- return 1;
- }
- else {
- return scene->integrator->get_aa_samples();
- }
-}
-
-/* Keep it synced with kernel_bake.h logic */
-static int shader_type_to_pass_filter(ShaderEvalType type, int pass_filter)
-{
- const int component_flags = pass_filter &
- (BAKE_FILTER_DIRECT | BAKE_FILTER_INDIRECT | BAKE_FILTER_COLOR);
-
- switch (type) {
- case SHADER_EVAL_AO:
- return BAKE_FILTER_AO;
- case SHADER_EVAL_SHADOW:
- return BAKE_FILTER_DIRECT;
- case SHADER_EVAL_DIFFUSE:
- return BAKE_FILTER_DIFFUSE | component_flags;
- case SHADER_EVAL_GLOSSY:
- return BAKE_FILTER_GLOSSY | component_flags;
- case SHADER_EVAL_TRANSMISSION:
- return BAKE_FILTER_TRANSMISSION | component_flags;
- case SHADER_EVAL_COMBINED:
- return pass_filter;
- default:
- return 0;
- }
-}
-
BakeManager::BakeManager()
{
- type = SHADER_EVAL_BAKE;
- pass_filter = 0;
-
need_update_ = true;
}
@@ -85,32 +35,14 @@ BakeManager::~BakeManager()
{
}
-bool BakeManager::get_baking()
+bool BakeManager::get_baking() const
{
return !object_name.empty();
}
-void BakeManager::set(Scene *scene,
- const std::string &object_name_,
- ShaderEvalType type_,
- int pass_filter_)
+void BakeManager::set(Scene *scene, const std::string &object_name_)
{
object_name = object_name_;
- type = type_;
- pass_filter = shader_type_to_pass_filter(type_, pass_filter_);
-
- Pass::add(PASS_BAKE_PRIMITIVE, scene->passes);
- Pass::add(PASS_BAKE_DIFFERENTIAL, scene->passes);
-
- if (type == SHADER_EVAL_UV) {
- /* force UV to be available */
- Pass::add(PASS_UV, scene->passes);
- }
-
- /* force use_light_pass to be true if we bake more than just colors */
- if (pass_filter & ~BAKE_FILTER_COLOR) {
- Pass::add(PASS_LIGHT, scene->passes);
- }
/* create device and update scene */
scene->film->tag_modified();
@@ -127,29 +59,29 @@ void BakeManager::device_update(Device * /*device*/,
if (!need_update())
return;
- scoped_callback_timer timer([scene](double time) {
- if (scene->update_stats) {
- scene->update_stats->bake.times.add_entry({"device_update", time});
- }
- });
-
- KernelIntegrator *kintegrator = &dscene->data.integrator;
KernelBake *kbake = &dscene->data.bake;
+ memset(kbake, 0, sizeof(*kbake));
- kbake->type = type;
- kbake->pass_filter = pass_filter;
-
- int object_index = 0;
- foreach (Object *object, scene->objects) {
- const Geometry *geom = object->get_geometry();
- if (object->name == object_name && geom->geometry_type == Geometry::MESH) {
- kbake->object_index = object_index;
- kbake->tri_offset = geom->prim_offset;
- kintegrator->aa_samples = aa_samples(scene, object, type);
- break;
- }
+ if (!object_name.empty()) {
+ scoped_callback_timer timer([scene](double time) {
+ if (scene->update_stats) {
+ scene->update_stats->bake.times.add_entry({"device_update", time});
+ }
+ });
+
+ kbake->use = true;
- object_index++;
+ int object_index = 0;
+ foreach (Object *object, scene->objects) {
+ const Geometry *geom = object->get_geometry();
+ if (object->name == object_name && geom->geometry_type == Geometry::MESH) {
+ kbake->object_index = object_index;
+ kbake->tri_offset = geom->prim_offset;
+ break;
+ }
+
+ object_index++;
+ }
}
need_update_ = false;
diff --git a/intern/cycles/render/bake.h b/intern/cycles/render/bake.h
index 655b9b1cf7e..39e504490c2 100644
--- a/intern/cycles/render/bake.h
+++ b/intern/cycles/render/bake.h
@@ -30,8 +30,8 @@ class BakeManager {
BakeManager();
~BakeManager();
- void set(Scene *scene, const std::string &object_name, ShaderEvalType type, int pass_filter);
- bool get_baking();
+ void set(Scene *scene, const std::string &object_name);
+ bool get_baking() const;
void device_update(Device *device, DeviceScene *dscene, Scene *scene, Progress &progress);
void device_free(Device *device, DeviceScene *dscene);
@@ -42,8 +42,6 @@ class BakeManager {
private:
bool need_update_;
- ShaderEvalType type;
- int pass_filter;
std::string object_name;
};
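
Note: with the eval type and pass filter gone, BakeManager reduces to tracking
the baked object's name; device_update() zeroes KernelBake and only fills it in
when a name is set. A usage sketch under that assumption (the object name is
hypothetical):

    scene->bake_manager->set(scene, "BakedMesh");
    if (scene->bake_manager->get_baking()) {
      /* kbake->use, object_index and tri_offset are filled in device_update. */
    }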
diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp
index fcfad58995e..1882510cd70 100644
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -28,537 +28,335 @@
CCL_NAMESPACE_BEGIN
-/* Buffer Params */
+/* --------------------------------------------------------------------
+ * Convert part information to an index of `BufferParams::pass_offset_`.
+ */
-BufferParams::BufferParams()
+static int pass_type_mode_to_index(PassType pass_type, PassMode mode)
{
- width = 0;
- height = 0;
-
- full_x = 0;
- full_y = 0;
- full_width = 0;
- full_height = 0;
+ int index = static_cast<int>(pass_type) * 2;
- denoising_data_pass = false;
- denoising_clean_pass = false;
- denoising_prefiltered_pass = false;
+ if (mode == PassMode::DENOISED) {
+ ++index;
+ }
- Pass::add(PASS_COMBINED, passes);
+ return index;
}
-void BufferParams::get_offset_stride(int &offset, int &stride)
+static int pass_to_index(const BufferPass &pass)
{
- offset = -(full_x + full_y * width);
- stride = width;
+ return pass_type_mode_to_index(pass.type, pass.mode);
}
-bool BufferParams::modified(const BufferParams &params)
-{
- return !(full_x == params.full_x && full_y == params.full_y && width == params.width &&
- height == params.height && full_width == params.full_width &&
- full_height == params.full_height && Pass::equals(passes, params.passes) &&
- denoising_data_pass == params.denoising_data_pass &&
- denoising_clean_pass == params.denoising_clean_pass &&
- denoising_prefiltered_pass == params.denoising_prefiltered_pass);
-}
+/* --------------------------------------------------------------------
+ * Buffer pass.
+ */
-int BufferParams::get_passes_size()
+NODE_DEFINE(BufferPass)
{
- int size = 0;
+ NodeType *type = NodeType::add("buffer_pass", create);
- for (size_t i = 0; i < passes.size(); i++)
- size += passes[i].components;
+ const NodeEnum *pass_type_enum = Pass::get_type_enum();
+ const NodeEnum *pass_mode_enum = Pass::get_mode_enum();
- if (denoising_data_pass) {
- size += DENOISING_PASS_SIZE_BASE;
- if (denoising_clean_pass)
- size += DENOISING_PASS_SIZE_CLEAN;
- if (denoising_prefiltered_pass)
- size += DENOISING_PASS_SIZE_PREFILTERED;
- }
+ SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED);
+ SOCKET_ENUM(mode, "Mode", *pass_mode_enum, static_cast<int>(PassMode::DENOISED));
+ SOCKET_STRING(name, "Name", ustring());
+ SOCKET_BOOLEAN(include_albedo, "Include Albedo", false);
- return align_up(size, 4);
-}
+ SOCKET_INT(offset, "Offset", -1);
-int BufferParams::get_denoising_offset()
-{
- int offset = 0;
-
- for (size_t i = 0; i < passes.size(); i++)
- offset += passes[i].components;
-
- return offset;
+ return type;
}
-int BufferParams::get_denoising_prefiltered_offset()
+BufferPass::BufferPass() : Node(get_node_type())
{
- assert(denoising_prefiltered_pass);
-
- int offset = get_denoising_offset();
-
- offset += DENOISING_PASS_SIZE_BASE;
- if (denoising_clean_pass) {
- offset += DENOISING_PASS_SIZE_CLEAN;
- }
-
- return offset;
}
-/* Render Buffer Task */
-
-RenderTile::RenderTile()
+BufferPass::BufferPass(const Pass *scene_pass)
+ : Node(get_node_type()),
+ type(scene_pass->get_type()),
+ mode(scene_pass->get_mode()),
+ name(scene_pass->get_name()),
+ include_albedo(scene_pass->get_include_albedo())
{
- x = 0;
- y = 0;
- w = 0;
- h = 0;
-
- sample = 0;
- start_sample = 0;
- num_samples = 0;
- resolution = 0;
-
- offset = 0;
- stride = 0;
-
- buffer = 0;
-
- buffers = NULL;
- stealing_state = NO_STEALING;
}
-/* Render Buffers */
-
-RenderBuffers::RenderBuffers(Device *device)
- : buffer(device, "RenderBuffers", MEM_READ_WRITE),
- map_neighbor_copied(false),
- render_time(0.0f)
+PassInfo BufferPass::get_info() const
{
+ return Pass::get_info(type, include_albedo);
}
-RenderBuffers::~RenderBuffers()
-{
- buffer.free();
-}
+/* --------------------------------------------------------------------
+ * Buffer Params.
+ */
-void RenderBuffers::reset(BufferParams &params_)
+NODE_DEFINE(BufferParams)
{
- params = params_;
-
- /* re-allocate buffer */
- buffer.alloc(params.width * params.get_passes_size(), params.height);
- buffer.zero_to_device();
+ NodeType *type = NodeType::add("buffer_params", create);
+
+ SOCKET_INT(width, "Width", 0);
+ SOCKET_INT(height, "Height", 0);
+
+ SOCKET_INT(full_x, "Full X", 0);
+ SOCKET_INT(full_y, "Full Y", 0);
+ SOCKET_INT(full_width, "Full Width", 0);
+ SOCKET_INT(full_height, "Full Height", 0);
+
+ SOCKET_STRING(layer, "Layer", ustring());
+ SOCKET_STRING(view, "View", ustring());
+ SOCKET_INT(samples, "Samples", 0);
+ SOCKET_FLOAT(exposure, "Exposure", 1.0f);
+ SOCKET_BOOLEAN(use_approximate_shadow_catcher, "Use Approximate Shadow Catcher", false);
+ SOCKET_BOOLEAN(use_transparent_background, "Transparent Background", false);
+
+ /* Notes:
+ * - Skip passes since they do not follow typical container socket definition.
+ * Might look into covering those as a socket in the future.
+ *
+ * - Skip offset, stride, and pass stride since those can be derived from the passes
+ * and the rest of the sockets. */
+
+ return type;
}
-void RenderBuffers::zero()
+BufferParams::BufferParams() : Node(get_node_type())
{
- buffer.zero_to_device();
+ reset_pass_offset();
}
-bool RenderBuffers::copy_from_device()
+void BufferParams::update_passes()
{
- if (!buffer.device_pointer)
- return false;
-
- buffer.copy_from_device(0, params.width * params.get_passes_size(), params.height);
-
- return true;
-}
-
-bool RenderBuffers::get_denoising_pass_rect(
- int type, float exposure, int sample, int components, float *pixels)
-{
- if (buffer.data() == NULL) {
- return false;
- }
-
- float scale = 1.0f;
- float alpha_scale = 1.0f / sample;
- if (type == DENOISING_PASS_PREFILTERED_COLOR || type == DENOISING_PASS_CLEAN ||
- type == DENOISING_PASS_PREFILTERED_INTENSITY) {
- scale *= exposure;
- }
- else if (type == DENOISING_PASS_PREFILTERED_VARIANCE) {
- scale *= exposure * exposure * (sample - 1);
- }
+ update_offset_stride();
+ reset_pass_offset();
+
+ pass_stride = 0;
+ for (const BufferPass &pass : passes) {
+ if (pass.offset != PASS_UNUSED) {
+ const int index = pass_to_index(pass);
+ if (pass_offset_[index] == PASS_UNUSED) {
+ pass_offset_[index] = pass_stride;
+ }
- int offset;
- if (type == DENOISING_PASS_CLEAN) {
- /* The clean pass isn't changed by prefiltering, so we use the original one there. */
- offset = type + params.get_denoising_offset();
- scale /= sample;
- }
- else if (params.denoising_prefiltered_pass) {
- offset = type + params.get_denoising_prefiltered_offset();
- }
- else {
- switch (type) {
- case DENOISING_PASS_PREFILTERED_DEPTH:
- offset = params.get_denoising_offset() + DENOISING_PASS_DEPTH;
- break;
- case DENOISING_PASS_PREFILTERED_NORMAL:
- offset = params.get_denoising_offset() + DENOISING_PASS_NORMAL;
- break;
- case DENOISING_PASS_PREFILTERED_ALBEDO:
- offset = params.get_denoising_offset() + DENOISING_PASS_ALBEDO;
- break;
- case DENOISING_PASS_PREFILTERED_COLOR:
- /* If we're not saving the prefiltering result, return the original noisy pass. */
- offset = params.get_denoising_offset() + DENOISING_PASS_COLOR;
- break;
- default:
- return false;
+ pass_stride += pass.get_info().num_components;
}
- scale /= sample;
}
+}
- int pass_stride = params.get_passes_size();
- int size = params.width * params.height;
+void BufferParams::update_passes(const vector<Pass *> &scene_passes)
+{
+ passes.clear();
- float *in = buffer.data() + offset;
+ pass_stride = 0;
+ for (const Pass *scene_pass : scene_passes) {
+ BufferPass buffer_pass(scene_pass);
- if (components == 1) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- pixels[0] = in[0] * scale;
+ if (scene_pass->is_written()) {
+ buffer_pass.offset = pass_stride;
+ pass_stride += scene_pass->get_info().num_components;
}
- }
- else if (components == 3) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
- pixels[0] = in[0] * scale;
- pixels[1] = in[1] * scale;
- pixels[2] = in[2] * scale;
- }
- }
- else if (components == 4) {
- /* Since the alpha channel is not involved in denoising, output the Combined alpha channel. */
- assert(params.passes[0].type == PASS_COMBINED);
- float *in_combined = buffer.data();
-
- for (int i = 0; i < size; i++, in += pass_stride, in_combined += pass_stride, pixels += 4) {
- float3 val = make_float3(in[0], in[1], in[2]);
- if (type == DENOISING_PASS_PREFILTERED_COLOR && params.denoising_prefiltered_pass) {
- /* Remove highlight compression from the image. */
- val = color_highlight_uncompress(val);
- }
- pixels[0] = val.x * scale;
- pixels[1] = val.y * scale;
- pixels[2] = val.z * scale;
- pixels[3] = saturate(in_combined[3] * alpha_scale);
+ else {
+ buffer_pass.offset = PASS_UNUSED;
}
- }
- else {
- return false;
+
+ passes.emplace_back(std::move(buffer_pass));
}
- return true;
+ update_passes();
}
-bool RenderBuffers::get_pass_rect(
- const string &name, float exposure, int sample, int components, float *pixels)
+void BufferParams::reset_pass_offset()
{
- if (buffer.data() == NULL) {
- return false;
+ for (int i = 0; i < kNumPassOffsets; ++i) {
+ pass_offset_[i] = PASS_UNUSED;
}
+}
- float *sample_count = NULL;
- if (name == "Combined") {
- int sample_offset = 0;
- for (size_t j = 0; j < params.passes.size(); j++) {
- Pass &pass = params.passes[j];
- if (pass.type != PASS_SAMPLE_COUNT) {
- sample_offset += pass.components;
- continue;
- }
- else {
- sample_count = buffer.data() + sample_offset;
- break;
- }
- }
+int BufferParams::get_pass_offset(PassType pass_type, PassMode mode) const
+{
+ if (pass_type == PASS_NONE || pass_type == PASS_UNUSED) {
+ return PASS_UNUSED;
}
- int pass_offset = 0;
-
- for (size_t j = 0; j < params.passes.size(); j++) {
- Pass &pass = params.passes[j];
+ const int index = pass_type_mode_to_index(pass_type, mode);
+ return pass_offset_[index];
+}
- /* Pass is identified by both type and name, multiple of the same type
- * may exist with a different name. */
- if (pass.name != name) {
- pass_offset += pass.components;
- continue;
+const BufferPass *BufferParams::find_pass(string_view name) const
+{
+ for (const BufferPass &pass : passes) {
+ if (pass.name == name) {
+ return &pass;
}
+ }
- PassType type = pass.type;
-
- float *in = buffer.data() + pass_offset;
- int pass_stride = params.get_passes_size();
-
- float scale = (pass.filter) ? 1.0f / (float)sample : 1.0f;
- float scale_exposure = (pass.exposure) ? scale * exposure : scale;
-
- int size = params.width * params.height;
+ return nullptr;
+}
- if (components == 1 && type == PASS_RENDER_TIME) {
- /* Render time is not stored by kernel, but measured per tile. */
- float val = (float)(1000.0 * render_time / (params.width * params.height * sample));
- for (int i = 0; i < size; i++, pixels++) {
- pixels[0] = val;
- }
- }
- else if (components == 1) {
- assert(pass.components == components);
-
- /* Scalar */
- if (type == PASS_DEPTH) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- float f = *in;
- pixels[0] = (f == 0.0f) ? 1e10f : f * scale_exposure;
- }
- }
- else if (type == PASS_MIST) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- float f = *in;
- pixels[0] = saturate(f * scale_exposure);
- }
- }
- else {
- for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
- float f = *in;
- pixels[0] = f * scale_exposure;
- }
- }
- }
- else if (components == 3) {
- assert(pass.components == 4);
-
- /* RGBA */
- if (type == PASS_SHADOW) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
- float invw = (f.w > 0.0f) ? 1.0f / f.w : 1.0f;
-
- pixels[0] = f.x * invw;
- pixels[1] = f.y * invw;
- pixels[2] = f.z * invw;
- }
- }
- else if (pass.divide_type != PASS_NONE) {
- /* RGB lighting passes that need to divide out color */
- pass_offset = 0;
- for (size_t k = 0; k < params.passes.size(); k++) {
- Pass &color_pass = params.passes[k];
- if (color_pass.type == pass.divide_type)
- break;
- pass_offset += color_pass.components;
- }
-
- float *in_divide = buffer.data() + pass_offset;
-
- for (int i = 0; i < size; i++, in += pass_stride, in_divide += pass_stride, pixels += 3) {
- float3 f = make_float3(in[0], in[1], in[2]);
- float3 f_divide = make_float3(in_divide[0], in_divide[1], in_divide[2]);
-
- f = safe_divide_even_color(f * exposure, f_divide);
-
- pixels[0] = f.x;
- pixels[1] = f.y;
- pixels[2] = f.z;
- }
- }
- else {
- /* RGB/vector */
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
- float3 f = make_float3(in[0], in[1], in[2]);
-
- pixels[0] = f.x * scale_exposure;
- pixels[1] = f.y * scale_exposure;
- pixels[2] = f.z * scale_exposure;
- }
- }
- }
- else if (components == 4) {
- assert(pass.components == components);
-
- /* RGBA */
- if (type == PASS_SHADOW) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
- float invw = (f.w > 0.0f) ? 1.0f / f.w : 1.0f;
-
- pixels[0] = f.x * invw;
- pixels[1] = f.y * invw;
- pixels[2] = f.z * invw;
- pixels[3] = 1.0f;
- }
- }
- else if (type == PASS_MOTION) {
- /* need to normalize by number of samples accumulated for motion */
- pass_offset = 0;
- for (size_t k = 0; k < params.passes.size(); k++) {
- Pass &color_pass = params.passes[k];
- if (color_pass.type == PASS_MOTION_WEIGHT)
- break;
- pass_offset += color_pass.components;
- }
-
- float *in_weight = buffer.data() + pass_offset;
-
- for (int i = 0; i < size; i++, in += pass_stride, in_weight += pass_stride, pixels += 4) {
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
- float w = in_weight[0];
- float invw = (w > 0.0f) ? 1.0f / w : 0.0f;
-
- pixels[0] = f.x * invw;
- pixels[1] = f.y * invw;
- pixels[2] = f.z * invw;
- pixels[3] = f.w * invw;
- }
- }
- else if (type == PASS_CRYPTOMATTE) {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
- /* x and z contain integer IDs, don't rescale them.
- y and w contain matte weights, they get scaled. */
- pixels[0] = f.x;
- pixels[1] = f.y * scale;
- pixels[2] = f.z;
- pixels[3] = f.w * scale;
- }
- }
- else {
- for (int i = 0; i < size; i++, in += pass_stride, pixels += 4) {
- if (sample_count && sample_count[i * pass_stride] < 0.0f) {
- scale = (pass.filter) ? -1.0f / (sample_count[i * pass_stride]) : 1.0f;
- scale_exposure = (pass.exposure) ? scale * exposure : scale;
- }
-
- float4 f = make_float4(in[0], in[1], in[2], in[3]);
-
- pixels[0] = f.x * scale_exposure;
- pixels[1] = f.y * scale_exposure;
- pixels[2] = f.z * scale_exposure;
-
- /* Clamp since alpha might be > 1.0 due to Russian roulette. */
- pixels[3] = saturate(f.w * scale);
- }
- }
+const BufferPass *BufferParams::find_pass(PassType type, PassMode mode) const
+{
+ for (const BufferPass &pass : passes) {
+ if (pass.type == type && pass.mode == mode) {
+ return &pass;
}
-
- return true;
}
- return false;
+ return nullptr;
}
-bool RenderBuffers::set_pass_rect(PassType type, int components, float *pixels, int samples)
+const BufferPass *BufferParams::get_actual_display_pass(PassType type, PassMode mode) const
{
- if (buffer.data() == NULL) {
- return false;
- }
-
- int pass_offset = 0;
+ const BufferPass *pass = find_pass(type, mode);
+ return get_actual_display_pass(pass);
+}
- for (size_t j = 0; j < params.passes.size(); j++) {
- Pass &pass = params.passes[j];
+const BufferPass *BufferParams::get_actual_display_pass(const BufferPass *pass) const
+{
+ if (!pass) {
+ return nullptr;
+ }
- if (pass.type != type) {
- pass_offset += pass.components;
- continue;
+ if (pass->type == PASS_COMBINED) {
+ const BufferPass *shadow_catcher_matte_pass = find_pass(PASS_SHADOW_CATCHER_MATTE, pass->mode);
+ if (shadow_catcher_matte_pass) {
+ pass = shadow_catcher_matte_pass;
}
+ }
- float *out = buffer.data() + pass_offset;
- int pass_stride = params.get_passes_size();
- int size = params.width * params.height;
-
- assert(pass.components == components);
+ return pass;
+}
- for (int i = 0; i < size; i++, out += pass_stride, pixels += components) {
- if (pass.filter) {
- /* Scale by the number of samples, inverse of what we do in get_pass_rect.
- * A better solution would be to remove the need for set_pass_rect entirely,
- * and change baking to bake multiple objects in a tile at once. */
- for (int j = 0; j < components; j++) {
- out[j] = pixels[j] * samples;
- }
- }
- else {
- /* For non-filtered passes just straight copy, these may contain non-float data. */
- memcpy(out, pixels, sizeof(float) * components);
- }
- }
+void BufferParams::update_offset_stride()
+{
+ offset = -(full_x + full_y * width);
+ stride = width;
+}
+bool BufferParams::modified(const BufferParams &other) const
+{
+ if (!(width == other.width && height == other.height && full_x == other.full_x &&
+ full_y == other.full_y && full_width == other.full_width &&
+ full_height == other.full_height && offset == other.offset && stride == other.stride &&
+ pass_stride == other.pass_stride && layer == other.layer && view == other.view &&
+ exposure == other.exposure &&
+ use_approximate_shadow_catcher == other.use_approximate_shadow_catcher &&
+ use_transparent_background == other.use_transparent_background)) {
return true;
}
- return false;
+ return !(passes == other.passes);
}
-/* Display Buffer */
+/* --------------------------------------------------------------------
+ * Render Buffers.
+ */
-DisplayBuffer::DisplayBuffer(Device *device, bool linear)
- : draw_width(0),
- draw_height(0),
- transparent(true), /* todo: determine from background */
- half_float(linear),
- rgba_byte(device, "display buffer byte"),
- rgba_half(device, "display buffer half")
+RenderBuffers::RenderBuffers(Device *device) : buffer(device, "RenderBuffers", MEM_READ_WRITE)
{
}
-DisplayBuffer::~DisplayBuffer()
+RenderBuffers::~RenderBuffers()
{
- rgba_byte.free();
- rgba_half.free();
+ buffer.free();
}
-void DisplayBuffer::reset(BufferParams &params_)
+void RenderBuffers::reset(const BufferParams &params_)
{
- draw_width = 0;
- draw_height = 0;
+ DCHECK(params_.pass_stride != -1);
params = params_;
- /* allocate display pixels */
- if (half_float) {
- rgba_half.alloc_to_device(params.width, params.height);
- }
- else {
- rgba_byte.alloc_to_device(params.width, params.height);
- }
+ /* re-allocate buffer */
+ buffer.alloc(params.width * params.pass_stride, params.height);
}
-void DisplayBuffer::draw_set(int width, int height)
+void RenderBuffers::zero()
{
- assert(width <= params.width && height <= params.height);
+ buffer.zero_to_device();
+}
- draw_width = width;
- draw_height = height;
+bool RenderBuffers::copy_from_device()
+{
+ DCHECK(params.pass_stride != -1);
+
+ if (!buffer.device_pointer)
+ return false;
+
+ buffer.copy_from_device(0, params.width * params.pass_stride, params.height);
+
+ return true;
}
-void DisplayBuffer::draw(Device *device, const DeviceDrawParams &draw_params)
+void RenderBuffers::copy_to_device()
{
- if (draw_width != 0 && draw_height != 0) {
- device_memory &rgba = (half_float) ? (device_memory &)rgba_half : (device_memory &)rgba_byte;
-
- device->draw_pixels(rgba,
- 0,
- draw_width,
- draw_height,
- params.width,
- params.height,
- params.full_x,
- params.full_y,
- params.full_width,
- params.full_height,
- transparent,
- draw_params);
- }
+ buffer.copy_to_device();
}
-bool DisplayBuffer::draw_ready()
+void render_buffers_host_copy_denoised(RenderBuffers *dst,
+ const BufferParams &dst_params,
+ const RenderBuffers *src,
+ const BufferParams &src_params,
+ const size_t src_offset)
{
- return (draw_width != 0 && draw_height != 0);
+ DCHECK_EQ(dst_params.width, src_params.width);
+ /* TODO(sergey): More sanity checks to avoid buffer overrun. */
+
+ /* Create a map of pass offsets to be copied.
+ * Offsets are not assumed to match, which allows passes to be copied between
+ * buffers with different sets of passes. */
+
+ struct {
+ int dst_offset;
+ int src_offset;
+ } pass_offsets[PASS_NUM];
+
+ int num_passes = 0;
+
+ for (int i = 0; i < PASS_NUM; ++i) {
+ const PassType pass_type = static_cast<PassType>(i);
+
+ const int dst_pass_offset = dst_params.get_pass_offset(pass_type, PassMode::DENOISED);
+ if (dst_pass_offset == PASS_UNUSED) {
+ continue;
+ }
+
+ const int src_pass_offset = src_params.get_pass_offset(pass_type, PassMode::DENOISED);
+ if (src_pass_offset == PASS_UNUSED) {
+ continue;
+ }
+
+ pass_offsets[num_passes].dst_offset = dst_pass_offset;
+ pass_offsets[num_passes].src_offset = src_pass_offset;
+ ++num_passes;
+ }
+
+ /* Copy passes. */
+ /* TODO(sergey): Make it more reusable, allowing implement copy of noisy passes. */
+
+ const int64_t dst_width = dst_params.width;
+ const int64_t dst_height = dst_params.height;
+ const int64_t dst_pass_stride = dst_params.pass_stride;
+ const int64_t dst_num_pixels = dst_width * dst_height;
+
+ const int64_t src_pass_stride = src_params.pass_stride;
+ const int64_t src_offset_in_floats = src_offset * src_pass_stride;
+
+ const float *src_pixel = src->buffer.data() + src_offset_in_floats;
+ float *dst_pixel = dst->buffer.data();
+
+ for (int i = 0; i < dst_num_pixels;
+ ++i, src_pixel += src_pass_stride, dst_pixel += dst_pass_stride) {
+ for (int pass_offset_idx = 0; pass_offset_idx < num_passes; ++pass_offset_idx) {
+ const int dst_pass_offset = pass_offsets[pass_offset_idx].dst_offset;
+ const int src_pass_offset = pass_offsets[pass_offset_idx].src_offset;
+
+ /* TODO(sergey): Support non-RGBA passes. */
+ dst_pixel[dst_pass_offset + 0] = src_pixel[src_pass_offset + 0];
+ dst_pixel[dst_pass_offset + 1] = src_pixel[src_pass_offset + 1];
+ dst_pixel[dst_pass_offset + 2] = src_pixel[src_pass_offset + 2];
+ dst_pixel[dst_pass_offset + 3] = src_pixel[src_pass_offset + 3];
+ }
+ }
}
CCL_NAMESPACE_END
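
Note: pass_type_mode_to_index() above lays out the pass_offset_ lookup table as
two consecutive slots per PassType: noisy at type * 2, denoised at type * 2 + 1.
This implies kNumPassOffsets is 2 * PASS_NUM (an inference; the constant's
definition sits in buffers.h outside the shown hunks). A sketch of the mapping:

    /* pass_offset_[2 * type + 0] -> offset of the noisy pass, or PASS_UNUSED
     * pass_offset_[2 * type + 1] -> offset of the denoised pass, or PASS_UNUSED */
    inline int pass_index(PassType type, PassMode mode)
    {
      return static_cast<int>(type) * 2 + (mode == PassMode::DENOISED ? 1 : 0);
    }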
diff --git a/intern/cycles/render/buffers.h b/intern/cycles/render/buffers.h
index 4ffc628bb52..184ac7197af 100644
--- a/intern/cycles/render/buffers.h
+++ b/intern/cycles/render/buffers.h
@@ -18,8 +18,8 @@
#define __BUFFERS_H__
#include "device/device_memory.h"
-
-#include "render/film.h"
+#include "graph/node.h"
+#include "render/pass.h"
#include "kernel/kernel_types.h"
@@ -34,170 +34,157 @@ class Device;
struct DeviceDrawParams;
struct float4;
+/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. */
+class BufferPass : public Node {
+ public:
+ NODE_DECLARE
+
+ PassType type = PASS_NONE;
+ PassMode mode = PassMode::NOISY;
+ ustring name;
+ bool include_albedo = false;
+
+ int offset = -1;
+
+ BufferPass();
+ explicit BufferPass(const Pass *scene_pass);
+
+ BufferPass(BufferPass &&other) noexcept = default;
+ BufferPass(const BufferPass &other) = default;
+
+ BufferPass &operator=(BufferPass &&other) = default;
+ BufferPass &operator=(const BufferPass &other) = default;
+
+ ~BufferPass() = default;
+
+ PassInfo get_info() const;
+
+ inline bool operator==(const BufferPass &other) const
+ {
+ return type == other.type && mode == other.mode && name == other.name &&
+ include_albedo == other.include_albedo && offset == other.offset;
+ }
+ inline bool operator!=(const BufferPass &other) const
+ {
+ return !(*this == other);
+ }
+};
+
/* Buffer Parameters
* Size of render buffer and how it fits in the full image (border render). */
-class BufferParams {
+/* NOTE: Is not a real scene node. Using Node API for ease of (de)serialization. */
+class BufferParams : public Node {
public:
- /* width/height of the physical buffer */
- int width;
- int height;
-
- /* offset into and width/height of the full buffer */
- int full_x;
- int full_y;
- int full_width;
- int full_height;
-
- /* passes */
- vector<Pass> passes;
- bool denoising_data_pass;
- /* If only some light path types should be target, an additional pass is needed. */
- bool denoising_clean_pass;
- /* When we're prefiltering the passes during rendering, we need to keep both the
- * original and the prefiltered data around because neighboring tiles might still
- * need the original data. */
- bool denoising_prefiltered_pass;
-
- /* functions */
- BufferParams();
+ NODE_DECLARE
- void get_offset_stride(int &offset, int &stride);
- bool modified(const BufferParams &params);
- int get_passes_size();
- int get_denoising_offset();
- int get_denoising_prefiltered_offset();
-};
+ /* Width/height of the physical buffer. */
+ int width = 0;
+ int height = 0;
-/* Render Buffers */
+ /* Offset into and width/height of the full buffer. */
+ int full_x = 0;
+ int full_y = 0;
+ int full_width = 0;
+ int full_height = 0;
-class RenderBuffers {
- public:
- /* buffer parameters */
- BufferParams params;
+ /* Runtime fields, only valid after `update_passes()` or `update_offset_stride()`. */
+ int offset = -1, stride = -1;
- /* float buffer */
- device_vector<float> buffer;
- bool map_neighbor_copied;
- double render_time;
+ /* Runtime fields, only valid after `update_passes()`. */
+ int pass_stride = -1;
- explicit RenderBuffers(Device *device);
- ~RenderBuffers();
+ /* Properties which are used for accessing buffer pixels outside of the scene graph. */
+ vector<BufferPass> passes;
+ ustring layer;
+ ustring view;
+ int samples = 0;
+ float exposure = 1.0f;
+ bool use_approximate_shadow_catcher = false;
+ bool use_transparent_background = false;
- void reset(BufferParams &params);
- void zero();
+ BufferParams();
- bool copy_from_device();
- bool get_pass_rect(
- const string &name, float exposure, int sample, int components, float *pixels);
- bool get_denoising_pass_rect(
- int offset, float exposure, int sample, int components, float *pixels);
- bool set_pass_rect(PassType type, int components, float *pixels, int samples);
-};
+ BufferParams(BufferParams &&other) noexcept = default;
+ BufferParams(const BufferParams &other) = default;
-/* Display Buffer
- *
- * The buffer used for drawing during render, filled by converting the render
- * buffers to byte of half float storage */
+ BufferParams &operator=(BufferParams &&other) = default;
+ BufferParams &operator=(const BufferParams &other) = default;
-class DisplayBuffer {
- public:
- /* buffer parameters */
- BufferParams params;
- /* dimensions for how much of the buffer is actually ready for display.
- * with progressive render we can be using only a subset of the buffer.
- * if these are zero, it means nothing can be drawn yet */
- int draw_width, draw_height;
- /* draw alpha channel? */
- bool transparent;
- /* use half float? */
- bool half_float;
- /* byte buffer for converted result */
- device_pixels<uchar4> rgba_byte;
- device_pixels<half4> rgba_half;
-
- DisplayBuffer(Device *device, bool linear = false);
- ~DisplayBuffer();
-
- void reset(BufferParams &params);
-
- void draw_set(int width, int height);
- void draw(Device *device, const DeviceDrawParams &draw_params);
- bool draw_ready();
-};
+ ~BufferParams() = default;
-/* Render Tile
- * Rendering task on a buffer */
+ /* Pre-calculate all fields which depend on the passes.
+ *
+ * When scene passes are given, buffer passes are created from them, stored in these params,
+ * and the params are then updated for those passes.
+ * The parameter-less `update_passes()` only updates offsets and strides which are stored
+ * outside of the passes. */
+ void update_passes();
+ void update_passes(const vector<Pass *> &scene_passes);
-class RenderTile {
- public:
- typedef enum { PATH_TRACE = (1 << 0), BAKE = (1 << 1), DENOISE = (1 << 2) } Task;
+ /* Returns PASS_UNUSED if there is no such pass in the buffer. */
+ int get_pass_offset(PassType type, PassMode mode = PassMode::NOISY) const;
- Task task;
- int x, y, w, h;
- int start_sample;
- int num_samples;
- int sample;
- int resolution;
- int offset;
- int stride;
- int tile_index;
+ /* Returns nullptr if pass with given name does not exist. */
+ const BufferPass *find_pass(string_view name) const;
+ const BufferPass *find_pass(PassType type, PassMode mode = PassMode::NOISY) const;
- device_ptr buffer;
- int device_size;
+ /* Get the display pass from its name.
+ * Applies special logic to replace the combined pass with the shadow catcher matte. */
+ const BufferPass *get_actual_display_pass(PassType type, PassMode mode = PassMode::NOISY) const;
+ const BufferPass *get_actual_display_pass(const BufferPass *pass) const;
- typedef enum { NO_STEALING = 0, CAN_BE_STOLEN = 1, WAS_STOLEN = 2 } StealingState;
- StealingState stealing_state;
+ void update_offset_stride();
- RenderBuffers *buffers;
+ bool modified(const BufferParams &other) const;
- RenderTile();
+ protected:
+ void reset_pass_offset();
- int4 bounds() const
- {
- return make_int4(x, /* xmin */
- y, /* ymin */
- x + w, /* xmax */
- y + h); /* ymax */
- }
+ /* Multiplied by 2 to be able to store noisy and denoised pass types. */
+ static constexpr int kNumPassOffsets = PASS_NUM * 2;
+
+ /* Indexed by a value derived from pass type and mode; indicates the offset of the
+ * corresponding pass in the buffer.
+ * If there are multiple passes with the same type and mode, contains the lowest offset of
+ * all of them. */
+ int pass_offset_[kNumPassOffsets];
};
-/* Render Tile Neighbors
- * Set of neighboring tiles used for denoising. Tile order:
- * 0 1 2
- * 3 4 5
- * 6 7 8 */
+/* Render Buffers */
-class RenderTileNeighbors {
+class RenderBuffers {
public:
- static const int SIZE = 9;
- static const int CENTER = 4;
+ /* buffer parameters */
+ BufferParams params;
- RenderTile tiles[SIZE];
- RenderTile target;
+ /* float buffer */
+ device_vector<float> buffer;
- RenderTileNeighbors(const RenderTile &center)
- {
- tiles[CENTER] = center;
- }
+ explicit RenderBuffers(Device *device);
+ ~RenderBuffers();
- int4 bounds() const
- {
- return make_int4(tiles[3].x, /* xmin */
- tiles[1].y, /* ymin */
- tiles[5].x + tiles[5].w, /* xmax */
- tiles[7].y + tiles[7].h); /* ymax */
- }
+ void reset(const BufferParams &params);
+ void zero();
- void set_bounds_from_center()
- {
- tiles[3].x = tiles[CENTER].x;
- tiles[1].y = tiles[CENTER].y;
- tiles[5].x = tiles[CENTER].x + tiles[CENTER].w;
- tiles[7].y = tiles[CENTER].y + tiles[CENTER].h;
- }
+ bool copy_from_device();
+ void copy_to_device();
};
+/* Copy denoised passes from source to destination.
+ *
+ * Buffer parameters are provided explicitly, which allows copying pixels between render buffers
+ * whose content corresponds to a render result at a non-unit resolution divider.
+ *
+ * `src_offset` offsets the source pixel index, which is used when only a fraction of the source
+ * buffer is to be copied.
+ *
+ * The copy covers the number of pixels in the destination. */
+void render_buffers_host_copy_denoised(RenderBuffers *dst,
+ const BufferParams &dst_params,
+ const RenderBuffers *src,
+ const BufferParams &src_params,
+ const size_t src_offset = 0);
+
CCL_NAMESPACE_END
#endif /* __BUFFERS_H__ */
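A minimal sketch of driving the reworked buffer API above; `device` and `scene` are assumed to exist, and the sizes are illustrative rather than taken from this patch:

    /* Copy the denoised passes of one render result into another buffer with
     * the same layout, using the helper declared above. */
    BufferParams params;
    params.width = 640;
    params.height = 480;
    params.update_passes(scene->passes); /* Build the BufferPass list and offsets. */

    RenderBuffers src(device), dst(device);
    src.reset(params);
    dst.reset(params);

    /* ... render into src and denoise it ... */

    render_buffers_host_copy_denoised(&dst, params, &src, params);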
diff --git a/intern/cycles/render/camera.cpp b/intern/cycles/render/camera.cpp
index 327f166f9d8..8b69c971991 100644
--- a/intern/cycles/render/camera.cpp
+++ b/intern/cycles/render/camera.cpp
@@ -33,9 +33,9 @@
/* needed for calculating differentials */
// clang-format off
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/split/kernel_split_data.h"
-#include "kernel/kernel_globals.h"
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
#include "kernel/kernel_projection.h"
#include "kernel/kernel_differential.h"
#include "kernel/kernel_montecarlo.h"
@@ -169,7 +169,6 @@ Camera::Camera() : Node(get_node_type())
width = 1024;
height = 512;
- resolution = 1;
use_perspective_motion = false;
@@ -455,7 +454,6 @@ void Camera::update(Scene *scene)
/* render size */
kcam->width = width;
kcam->height = height;
- kcam->resolution = resolution;
/* store differentials */
kcam->dx = float3_to_float4(dx);
@@ -776,9 +774,11 @@ float Camera::world_to_raster_size(float3 P)
&ray);
#endif
- differential_transfer(&ray.dP, ray.dP, ray.D, ray.dD, ray.D, dist);
+ /* TODO: would it help to use more accurate differentials here? */
+ differential3 dP;
+ differential_transfer_compact(&dP, ray.dP, ray.D, ray.dD, ray.D, dist);
- return max(len(ray.dP.dx), len(ray.dP.dy));
+ return max(len(dP.dx), len(dP.dy));
}
return res;
@@ -789,12 +789,11 @@ bool Camera::use_motion() const
return motion.size() > 1;
}
-void Camera::set_screen_size_and_resolution(int width_, int height_, int resolution_)
+void Camera::set_screen_size(int width_, int height_)
{
- if (width_ != width || height_ != height || resolution_ != resolution) {
+ if (width_ != width || height_ != height) {
width = width_;
height = height_;
- resolution = resolution_;
tag_modified();
}
}
diff --git a/intern/cycles/render/camera.h b/intern/cycles/render/camera.h
index 5abb4750764..cb8ecac1a7e 100644
--- a/intern/cycles/render/camera.h
+++ b/intern/cycles/render/camera.h
@@ -199,7 +199,6 @@ class Camera : public Node {
private:
int width;
int height;
- int resolution;
public:
/* functions */
@@ -225,7 +224,7 @@ class Camera : public Node {
int motion_step(float time) const;
bool use_motion() const;
- void set_screen_size_and_resolution(int width_, int height_, int resolution_);
+ void set_screen_size(int width_, int height_);
private:
/* Private utility functions. */
diff --git a/intern/cycles/render/coverage.cpp b/intern/cycles/render/coverage.cpp
deleted file mode 100644
index 99d4daa6961..00000000000
--- a/intern/cycles/render/coverage.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright 2018 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "render/coverage.h"
-#include "render/buffers.h"
-
-#include "kernel/kernel_compat_cpu.h"
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data.h"
-
-#include "kernel/kernel_globals.h"
-#include "kernel/kernel_id_passes.h"
-
-#include "util/util_map.h"
-
-CCL_NAMESPACE_BEGIN
-
-static bool crypomatte_comp(const pair<float, float> &i, const pair<float, float> j)
-{
- return i.first > j.first;
-}
-
-void Coverage::finalize()
-{
- int pass_offset = 0;
- if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
- finalize_buffer(coverage_object, pass_offset);
- pass_offset += kernel_data.film.cryptomatte_depth * 4;
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
- finalize_buffer(coverage_material, pass_offset);
- pass_offset += kernel_data.film.cryptomatte_depth * 4;
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
- finalize_buffer(coverage_asset, pass_offset);
- }
-}
-
-void Coverage::init_path_trace()
-{
- kg->coverage_object = kg->coverage_material = kg->coverage_asset = NULL;
-
- if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) {
- if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
- coverage_object.clear();
- coverage_object.resize(tile.w * tile.h);
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
- coverage_material.clear();
- coverage_material.resize(tile.w * tile.h);
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
- coverage_asset.clear();
- coverage_asset.resize(tile.w * tile.h);
- }
- }
-}
-
-void Coverage::init_pixel(int x, int y)
-{
- if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) {
- const int pixel_index = tile.w * (y - tile.y) + x - tile.x;
- if (kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
- kg->coverage_object = &coverage_object[pixel_index];
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
- kg->coverage_material = &coverage_material[pixel_index];
- }
- if (kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
- kg->coverage_asset = &coverage_asset[pixel_index];
- }
- }
-}
-
-void Coverage::finalize_buffer(vector<CoverageMap> &coverage, const int pass_offset)
-{
- if (kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE) {
- flatten_buffer(coverage, pass_offset);
- }
- else {
- sort_buffer(pass_offset);
- }
-}
-
-void Coverage::flatten_buffer(vector<CoverageMap> &coverage, const int pass_offset)
-{
- /* Sort the coverage map and write it to the output */
- int pixel_index = 0;
- int pass_stride = tile.buffers->params.get_passes_size();
- for (int y = 0; y < tile.h; ++y) {
- for (int x = 0; x < tile.w; ++x) {
- const CoverageMap &pixel = coverage[pixel_index];
- if (!pixel.empty()) {
- /* buffer offset */
- int index = x + y * tile.stride;
- float *buffer = (float *)tile.buffer + index * pass_stride;
-
- /* sort the cryptomatte pixel */
- vector<pair<float, float>> sorted_pixel;
- for (CoverageMap::const_iterator it = pixel.begin(); it != pixel.end(); ++it) {
- sorted_pixel.push_back(std::make_pair(it->second, it->first));
- }
- sort(sorted_pixel.begin(), sorted_pixel.end(), crypomatte_comp);
- int num_slots = 2 * (kernel_data.film.cryptomatte_depth);
- if (sorted_pixel.size() > num_slots) {
- float leftover = 0.0f;
- for (vector<pair<float, float>>::iterator it = sorted_pixel.begin() + num_slots;
- it != sorted_pixel.end();
- ++it) {
- leftover += it->first;
- }
- sorted_pixel[num_slots - 1].first += leftover;
- }
- int limit = min(num_slots, sorted_pixel.size());
- for (int i = 0; i < limit; ++i) {
- kernel_write_id_slots(buffer + kernel_data.film.pass_cryptomatte + pass_offset,
- 2 * (kernel_data.film.cryptomatte_depth),
- sorted_pixel[i].second,
- sorted_pixel[i].first);
- }
- }
- ++pixel_index;
- }
- }
-}
-
-void Coverage::sort_buffer(const int pass_offset)
-{
- /* Sort the coverage map and write it to the output */
- int pass_stride = tile.buffers->params.get_passes_size();
- for (int y = 0; y < tile.h; ++y) {
- for (int x = 0; x < tile.w; ++x) {
- /* buffer offset */
- int index = x + y * tile.stride;
- float *buffer = (float *)tile.buffer + index * pass_stride;
- kernel_sort_id_slots(buffer + kernel_data.film.pass_cryptomatte + pass_offset,
- 2 * (kernel_data.film.cryptomatte_depth));
- }
- }
-}
-
-CCL_NAMESPACE_END
diff --git a/intern/cycles/render/coverage.h b/intern/cycles/render/coverage.h
deleted file mode 100644
index 12182c614da..00000000000
--- a/intern/cycles/render/coverage.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright 2018 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __COVERAGE_H__
-#define __COVERAGE_H__
-
-#include "util/util_map.h"
-#include "util/util_vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-struct KernelGlobals;
-class RenderTile;
-
-typedef unordered_map<float, float> CoverageMap;
-
-class Coverage {
- public:
- Coverage(KernelGlobals *kg_, RenderTile &tile_) : kg(kg_), tile(tile_)
- {
- }
- void init_path_trace();
- void init_pixel(int x, int y);
- void finalize();
-
- private:
- vector<CoverageMap> coverage_object;
- vector<CoverageMap> coverage_material;
- vector<CoverageMap> coverage_asset;
- KernelGlobals *kg;
- RenderTile &tile;
- void finalize_buffer(vector<CoverageMap> &coverage, const int pass_offset);
- void flatten_buffer(vector<CoverageMap> &coverage, const int pass_offset);
- void sort_buffer(const int pass_offset);
-};
-
-CCL_NAMESPACE_END
-
-#endif /* __COVERAGE_H__ */
diff --git a/intern/cycles/render/denoising.cpp b/intern/cycles/render/denoising.cpp
index ddbe7484800..bcf8d3fa204 100644
--- a/intern/cycles/render/denoising.cpp
+++ b/intern/cycles/render/denoising.cpp
@@ -16,15 +16,17 @@
#include "render/denoising.h"
-#include "kernel/filter/filter_defines.h"
+#if 0
-#include "util/util_foreach.h"
-#include "util/util_map.h"
-#include "util/util_system.h"
-#include "util/util_task.h"
-#include "util/util_time.h"
+# include "kernel/filter/filter_defines.h"
-#include <OpenImageIO/filesystem.h>
+# include "util/util_foreach.h"
+# include "util/util_map.h"
+# include "util/util_system.h"
+# include "util/util_task.h"
+# include "util/util_time.h"
+
+# include <OpenImageIO/filesystem.h>
CCL_NAMESPACE_BEGIN
@@ -225,7 +227,7 @@ bool DenoiseImageLayer::match_channels(int neighbor,
/* Denoise Task */
DenoiseTask::DenoiseTask(Device *device,
- Denoiser *denoiser,
+ DenoiserPipeline *denoiser,
int frame,
const vector<int> &neighbor_frames)
: denoiser(denoiser),
@@ -386,7 +388,6 @@ void DenoiseTask::create_task(DeviceTask &task)
task.denoising = denoiser->params;
task.denoising.type = DENOISER_NLM;
task.denoising.use = true;
- task.denoising.store_passes = false;
task.denoising_from_render = false;
task.denoising_frames.resize(neighbor_frames.size());
@@ -863,7 +864,7 @@ bool DenoiseImage::save_output(const string &out_filepath, string &error)
/* File pattern handling and outer loop over frames */
-Denoiser::Denoiser(DeviceInfo &device_info)
+DenoiserPipeline::DenoiserPipeline(DeviceInfo &device_info)
{
samples_override = 0;
tile_size = make_int2(64, 64);
@@ -876,18 +877,16 @@ Denoiser::Denoiser(DeviceInfo &device_info)
/* Initialize device. */
device = Device::create(device_info, stats, profiler, true);
- DeviceRequestedFeatures req;
- req.use_denoising = true;
- device->load_kernels(req);
+ device->load_kernels(KERNEL_FEATURE_DENOISING);
}
-Denoiser::~Denoiser()
+DenoiserPipeline::~DenoiserPipeline()
{
delete device;
TaskScheduler::exit();
}
-bool Denoiser::run()
+bool DenoiserPipeline::run()
{
assert(input.size() == output.size());
@@ -931,3 +930,5 @@ bool Denoiser::run()
}
CCL_NAMESPACE_END
+
+#endif
diff --git a/intern/cycles/render/denoising.h b/intern/cycles/render/denoising.h
index c1b4d0a5596..097cc570d06 100644
--- a/intern/cycles/render/denoising.h
+++ b/intern/cycles/render/denoising.h
@@ -17,27 +17,31 @@
#ifndef __DENOISING_H__
#define __DENOISING_H__
-#include "device/device.h"
-#include "device/device_denoising.h"
+#if 0
-#include "render/buffers.h"
+/* TODO(sergey): Make it explicit and clear when something is a denoiser, its pipeline or
+ * parameters. Currently it is an annoying mixture of terms used interchangeably. */
-#include "util/util_string.h"
-#include "util/util_unique_ptr.h"
-#include "util/util_vector.h"
+# include "device/device.h"
-#include <OpenImageIO/imageio.h>
+# include "render/buffers.h"
+
+# include "util/util_string.h"
+# include "util/util_unique_ptr.h"
+# include "util/util_vector.h"
+
+# include <OpenImageIO/imageio.h>
OIIO_NAMESPACE_USING
CCL_NAMESPACE_BEGIN
-/* Denoiser */
+/* Denoiser pipeline */
-class Denoiser {
+class DenoiserPipeline {
public:
- Denoiser(DeviceInfo &device_info);
- ~Denoiser();
+ DenoiserPipeline(DeviceInfo &device_info);
+ ~DenoiserPipeline();
bool run();
@@ -155,7 +159,10 @@ class DenoiseImage {
class DenoiseTask {
public:
- DenoiseTask(Device *device, Denoiser *denoiser, int frame, const vector<int> &neighbor_frames);
+ DenoiseTask(Device *device,
+ DenoiserPipeline *denoiser,
+ int frame,
+ const vector<int> &neighbor_frames);
~DenoiseTask();
/* Task stages */
@@ -168,7 +175,7 @@ class DenoiseTask {
protected:
/* Denoiser parameters and device */
- Denoiser *denoiser;
+ DenoiserPipeline *denoiser;
Device *device;
/* Frame number to be denoised */
@@ -204,4 +211,6 @@ class DenoiseTask {
CCL_NAMESPACE_END
+#endif
+
#endif /* __DENOISING_H__ */
diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp
index 5df396394c4..8e14b338bd3 100644
--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -16,9 +16,12 @@
#include "render/film.h"
#include "device/device.h"
+#include "render/background.h"
+#include "render/bake.h"
#include "render/camera.h"
#include "render/integrator.h"
#include "render/mesh.h"
+#include "render/object.h"
#include "render/scene.h"
#include "render/stats.h"
#include "render/tables.h"
@@ -31,261 +34,6 @@
CCL_NAMESPACE_BEGIN
-/* Pass */
-
-static bool compare_pass_order(const Pass &a, const Pass &b)
-{
- if (a.components == b.components)
- return (a.type < b.type);
- return (a.components > b.components);
-}
-
-static NodeEnum *get_pass_type_enum()
-{
- static NodeEnum pass_type_enum;
- pass_type_enum.insert("combined", PASS_COMBINED);
- pass_type_enum.insert("depth", PASS_DEPTH);
- pass_type_enum.insert("normal", PASS_NORMAL);
- pass_type_enum.insert("uv", PASS_UV);
- pass_type_enum.insert("object_id", PASS_OBJECT_ID);
- pass_type_enum.insert("material_id", PASS_MATERIAL_ID);
- pass_type_enum.insert("motion", PASS_MOTION);
- pass_type_enum.insert("motion_weight", PASS_MOTION_WEIGHT);
- pass_type_enum.insert("render_time", PASS_RENDER_TIME);
- pass_type_enum.insert("cryptomatte", PASS_CRYPTOMATTE);
- pass_type_enum.insert("aov_color", PASS_AOV_COLOR);
- pass_type_enum.insert("aov_value", PASS_AOV_VALUE);
- pass_type_enum.insert("adaptive_aux_buffer", PASS_ADAPTIVE_AUX_BUFFER);
- pass_type_enum.insert("sample_count", PASS_SAMPLE_COUNT);
- pass_type_enum.insert("mist", PASS_MIST);
- pass_type_enum.insert("emission", PASS_EMISSION);
- pass_type_enum.insert("background", PASS_BACKGROUND);
- pass_type_enum.insert("ambient_occlusion", PASS_AO);
- pass_type_enum.insert("shadow", PASS_SHADOW);
- pass_type_enum.insert("diffuse_direct", PASS_DIFFUSE_DIRECT);
- pass_type_enum.insert("diffuse_indirect", PASS_DIFFUSE_INDIRECT);
- pass_type_enum.insert("diffuse_color", PASS_DIFFUSE_COLOR);
- pass_type_enum.insert("glossy_direct", PASS_GLOSSY_DIRECT);
- pass_type_enum.insert("glossy_indirect", PASS_GLOSSY_INDIRECT);
- pass_type_enum.insert("glossy_color", PASS_GLOSSY_COLOR);
- pass_type_enum.insert("transmission_direct", PASS_TRANSMISSION_DIRECT);
- pass_type_enum.insert("transmission_indirect", PASS_TRANSMISSION_INDIRECT);
- pass_type_enum.insert("transmission_color", PASS_TRANSMISSION_COLOR);
- pass_type_enum.insert("volume_direct", PASS_VOLUME_DIRECT);
- pass_type_enum.insert("volume_indirect", PASS_VOLUME_INDIRECT);
- pass_type_enum.insert("bake_primitive", PASS_BAKE_PRIMITIVE);
- pass_type_enum.insert("bake_differential", PASS_BAKE_DIFFERENTIAL);
-
- return &pass_type_enum;
-}
-
-NODE_DEFINE(Pass)
-{
- NodeType *type = NodeType::add("pass", create);
-
- NodeEnum *pass_type_enum = get_pass_type_enum();
- SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED);
- SOCKET_STRING(name, "Name", ustring());
-
- return type;
-}
-
-Pass::Pass() : Node(get_node_type())
-{
-}
-
-void Pass::add(PassType type, vector<Pass> &passes, const char *name)
-{
- for (size_t i = 0; i < passes.size(); i++) {
- if (passes[i].type != type) {
- continue;
- }
-
- /* An empty name is used as a placeholder to signal that any pass of
- * that type is fine (because the content always is the same).
- * This is important to support divide_type: If the pass that has a
- * divide_type is added first, a pass for divide_type with an empty
- * name will be added. Then, if a matching pass with a name is later
- * requested, the existing placeholder will be renamed to that.
- * If the divide_type is explicitly allocated with a name first and
- * then again as part of another pass, the second one will just be
- * skipped because that type already exists. */
-
- /* If no name is specified, any pass of the correct type will match. */
- if (name == NULL) {
- return;
- }
-
- /* If we already have a placeholder pass, rename that one. */
- if (passes[i].name.empty()) {
- passes[i].name = name;
- return;
- }
-
- /* If neither existing nor requested pass have placeholder name, they
- * must match. */
- if (name == passes[i].name) {
- return;
- }
- }
-
- Pass pass;
-
- pass.type = type;
- pass.filter = true;
- pass.exposure = false;
- pass.divide_type = PASS_NONE;
- if (name) {
- pass.name = name;
- }
-
- switch (type) {
- case PASS_NONE:
- pass.components = 0;
- break;
- case PASS_COMBINED:
- pass.components = 4;
- pass.exposure = true;
- break;
- case PASS_DEPTH:
- pass.components = 1;
- pass.filter = false;
- break;
- case PASS_MIST:
- pass.components = 1;
- break;
- case PASS_NORMAL:
- pass.components = 4;
- break;
- case PASS_UV:
- pass.components = 4;
- break;
- case PASS_MOTION:
- pass.components = 4;
- pass.divide_type = PASS_MOTION_WEIGHT;
- break;
- case PASS_MOTION_WEIGHT:
- pass.components = 1;
- break;
- case PASS_OBJECT_ID:
- case PASS_MATERIAL_ID:
- pass.components = 1;
- pass.filter = false;
- break;
-
- case PASS_EMISSION:
- case PASS_BACKGROUND:
- pass.components = 4;
- pass.exposure = true;
- break;
- case PASS_AO:
- pass.components = 4;
- break;
- case PASS_SHADOW:
- pass.components = 4;
- pass.exposure = false;
- break;
- case PASS_LIGHT:
- /* This isn't a real pass, used by baking to see whether
- * light data is needed or not.
- *
- * Set components to 0 so pass sort below happens in a
- * determined way.
- */
- pass.components = 0;
- break;
- case PASS_RENDER_TIME:
- /* This pass is handled entirely on the host side. */
- pass.components = 0;
- break;
-
- case PASS_DIFFUSE_COLOR:
- case PASS_GLOSSY_COLOR:
- case PASS_TRANSMISSION_COLOR:
- pass.components = 4;
- break;
- case PASS_DIFFUSE_DIRECT:
- case PASS_DIFFUSE_INDIRECT:
- pass.components = 4;
- pass.exposure = true;
- pass.divide_type = PASS_DIFFUSE_COLOR;
- break;
- case PASS_GLOSSY_DIRECT:
- case PASS_GLOSSY_INDIRECT:
- pass.components = 4;
- pass.exposure = true;
- pass.divide_type = PASS_GLOSSY_COLOR;
- break;
- case PASS_TRANSMISSION_DIRECT:
- case PASS_TRANSMISSION_INDIRECT:
- pass.components = 4;
- pass.exposure = true;
- pass.divide_type = PASS_TRANSMISSION_COLOR;
- break;
- case PASS_VOLUME_DIRECT:
- case PASS_VOLUME_INDIRECT:
- pass.components = 4;
- pass.exposure = true;
- break;
- case PASS_CRYPTOMATTE:
- pass.components = 4;
- break;
- case PASS_ADAPTIVE_AUX_BUFFER:
- pass.components = 4;
- break;
- case PASS_SAMPLE_COUNT:
- pass.components = 1;
- pass.exposure = false;
- break;
- case PASS_AOV_COLOR:
- pass.components = 4;
- break;
- case PASS_AOV_VALUE:
- pass.components = 1;
- break;
- case PASS_BAKE_PRIMITIVE:
- case PASS_BAKE_DIFFERENTIAL:
- pass.components = 4;
- pass.exposure = false;
- pass.filter = false;
- break;
- default:
- assert(false);
- break;
- }
-
- passes.push_back(pass);
-
- /* Order from by components, to ensure alignment so passes with size 4
- * come first and then passes with size 1. Note this must use stable sort
- * so cryptomatte passes remain in the right order. */
- stable_sort(&passes[0], &passes[0] + passes.size(), compare_pass_order);
-
- if (pass.divide_type != PASS_NONE)
- Pass::add(pass.divide_type, passes);
-}
-
-bool Pass::equals(const vector<Pass> &A, const vector<Pass> &B)
-{
- if (A.size() != B.size())
- return false;
-
- for (int i = 0; i < A.size(); i++)
- if (A[i].type != B[i].type || A[i].name != B[i].name)
- return false;
-
- return true;
-}
-
-bool Pass::contains(const vector<Pass> &passes, PassType type)
-{
- for (size_t i = 0; i < passes.size(); i++)
- if (passes[i].type == type)
- return true;
-
- return false;
-}
-
/* Pixel Filter */
static float filter_func_box(float /*v*/, float /*width*/)
@@ -368,17 +116,11 @@ NODE_DEFINE(Film)
SOCKET_FLOAT(mist_depth, "Mist Depth", 100.0f);
SOCKET_FLOAT(mist_falloff, "Mist Falloff", 1.0f);
- SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false);
- SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
- SOCKET_BOOLEAN(denoising_prefiltered_pass, "Generate Denoising Prefiltered Pass", false);
- SOCKET_INT(denoising_flags, "Denoising Flags", 0);
- SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false);
-
- SOCKET_BOOLEAN(use_light_visibility, "Use Light Visibility", false);
-
- NodeEnum *pass_type_enum = get_pass_type_enum();
+ const NodeEnum *pass_type_enum = Pass::get_type_enum();
SOCKET_ENUM(display_pass, "Display Pass", *pass_type_enum, PASS_COMBINED);
+ SOCKET_BOOLEAN(show_active_pixels, "Show Active Pixels", false);
+
static NodeEnum cryptomatte_passes_enum;
cryptomatte_passes_enum.insert("none", CRYPT_NONE);
cryptomatte_passes_enum.insert("object", CRYPT_OBJECT);
@@ -389,15 +131,13 @@ NODE_DEFINE(Film)
SOCKET_INT(cryptomatte_depth, "Cryptomatte Depth", 0);
+ SOCKET_BOOLEAN(use_approximate_shadow_catcher, "Use Approximate Shadow Catcher", false);
+
return type;
}
-Film::Film() : Node(get_node_type())
+Film::Film() : Node(get_node_type()), filter_table_offset_(TABLE_OFFSET_INVALID)
{
- use_light_visibility = false;
- filter_table_offset = TABLE_OFFSET_INVALID;
- cryptomatte_passes = CRYPT_NONE;
- display_pass = PASS_COMBINED;
}
Film::~Film()
@@ -406,7 +146,8 @@ Film::~Film()
void Film::add_default(Scene *scene)
{
- Pass::add(PASS_COMBINED, scene->passes);
+ Pass *pass = scene->create_node<Pass>();
+ pass->set_type(PASS_COMBINED);
}
void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
@@ -426,50 +167,77 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
/* update __data */
kfilm->exposure = exposure;
+ kfilm->pass_alpha_threshold = pass_alpha_threshold;
kfilm->pass_flag = 0;
- kfilm->display_pass_stride = -1;
- kfilm->display_pass_components = 0;
- kfilm->display_divide_pass_stride = -1;
- kfilm->use_display_exposure = false;
- kfilm->use_display_pass_alpha = (display_pass == PASS_COMBINED);
+ kfilm->use_approximate_shadow_catcher = get_use_approximate_shadow_catcher();
kfilm->light_pass_flag = 0;
kfilm->pass_stride = 0;
- kfilm->use_light_pass = use_light_visibility;
- kfilm->pass_aov_value_num = 0;
- kfilm->pass_aov_color_num = 0;
+
+ /* Mark with PASS_UNUSED to avoid mask test in the kernel. */
+ kfilm->pass_background = PASS_UNUSED;
+ kfilm->pass_emission = PASS_UNUSED;
+ kfilm->pass_ao = PASS_UNUSED;
+ kfilm->pass_diffuse_direct = PASS_UNUSED;
+ kfilm->pass_diffuse_indirect = PASS_UNUSED;
+ kfilm->pass_glossy_direct = PASS_UNUSED;
+ kfilm->pass_glossy_indirect = PASS_UNUSED;
+ kfilm->pass_transmission_direct = PASS_UNUSED;
+ kfilm->pass_transmission_indirect = PASS_UNUSED;
+ kfilm->pass_volume_direct = PASS_UNUSED;
+ kfilm->pass_volume_indirect = PASS_UNUSED;
+ kfilm->pass_shadow = PASS_UNUSED;
+
+ /* Mark passes as unused so that the kernel knows the pass is inaccessible. */
+ kfilm->pass_denoising_normal = PASS_UNUSED;
+ kfilm->pass_denoising_albedo = PASS_UNUSED;
+ kfilm->pass_sample_count = PASS_UNUSED;
+ kfilm->pass_adaptive_aux_buffer = PASS_UNUSED;
+ kfilm->pass_shadow_catcher = PASS_UNUSED;
+ kfilm->pass_shadow_catcher_sample_count = PASS_UNUSED;
+ kfilm->pass_shadow_catcher_matte = PASS_UNUSED;
bool have_cryptomatte = false;
+ bool have_aov_color = false;
+ bool have_aov_value = false;
for (size_t i = 0; i < scene->passes.size(); i++) {
- Pass &pass = scene->passes[i];
+ const Pass *pass = scene->passes[i];
- if (pass.type == PASS_NONE) {
+ if (pass->get_type() == PASS_NONE || !pass->is_written()) {
+ continue;
+ }
+
+ if (pass->get_mode() == PassMode::DENOISED) {
+ /* Generally we only store offsets of the noisy passes. The display pass is an exception
+ * since it is a read operation and not a write. */
+ kfilm->pass_stride += pass->get_info().num_components;
continue;
}
/* Can't do motion pass if no motion vectors are available. */
- if (pass.type == PASS_MOTION || pass.type == PASS_MOTION_WEIGHT) {
+ if (pass->get_type() == PASS_MOTION || pass->get_type() == PASS_MOTION_WEIGHT) {
if (scene->need_motion() != Scene::MOTION_PASS) {
- kfilm->pass_stride += pass.components;
+ kfilm->pass_stride += pass->get_info().num_components;
continue;
}
}
- int pass_flag = (1 << (pass.type % 32));
- if (pass.type <= PASS_CATEGORY_MAIN_END) {
- kfilm->pass_flag |= pass_flag;
- }
- else if (pass.type <= PASS_CATEGORY_LIGHT_END) {
- kfilm->use_light_pass = 1;
+ const int pass_flag = (1 << (pass->get_type() % 32));
+ if (pass->get_type() <= PASS_CATEGORY_LIGHT_END) {
kfilm->light_pass_flag |= pass_flag;
}
+ else if (pass->get_type() <= PASS_CATEGORY_DATA_END) {
+ kfilm->pass_flag |= pass_flag;
+ }
else {
- assert(pass.type <= PASS_CATEGORY_BAKE_END);
+ assert(pass->get_type() <= PASS_CATEGORY_BAKE_END);
}
- switch (pass.type) {
+ switch (pass->get_type()) {
case PASS_COMBINED:
kfilm->pass_combined = kfilm->pass_stride;
break;
@@ -479,6 +247,12 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
case PASS_NORMAL:
kfilm->pass_normal = kfilm->pass_stride;
break;
+ case PASS_POSITION:
+ kfilm->pass_position = kfilm->pass_stride;
+ break;
+ case PASS_ROUGHNESS:
+ kfilm->pass_roughness = kfilm->pass_stride;
+ break;
case PASS_UV:
kfilm->pass_uv = kfilm->pass_stride;
break;
@@ -511,9 +285,6 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_shadow = kfilm->pass_stride;
break;
- case PASS_LIGHT:
- break;
-
case PASS_DIFFUSE_COLOR:
kfilm->pass_diffuse_color = kfilm->pass_stride;
break;
@@ -563,78 +334,56 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_stride;
have_cryptomatte = true;
break;
+
+ case PASS_DENOISING_NORMAL:
+ kfilm->pass_denoising_normal = kfilm->pass_stride;
+ break;
+ case PASS_DENOISING_ALBEDO:
+ kfilm->pass_denoising_albedo = kfilm->pass_stride;
+ break;
+
+ case PASS_SHADOW_CATCHER:
+ kfilm->pass_shadow_catcher = kfilm->pass_stride;
+ break;
+ case PASS_SHADOW_CATCHER_SAMPLE_COUNT:
+ kfilm->pass_shadow_catcher_sample_count = kfilm->pass_stride;
+ break;
+ case PASS_SHADOW_CATCHER_MATTE:
+ kfilm->pass_shadow_catcher_matte = kfilm->pass_stride;
+ break;
+
case PASS_ADAPTIVE_AUX_BUFFER:
kfilm->pass_adaptive_aux_buffer = kfilm->pass_stride;
break;
case PASS_SAMPLE_COUNT:
kfilm->pass_sample_count = kfilm->pass_stride;
break;
+
case PASS_AOV_COLOR:
- if (kfilm->pass_aov_color_num == 0) {
+ if (!have_aov_color) {
kfilm->pass_aov_color = kfilm->pass_stride;
+ have_aov_color = true;
}
- kfilm->pass_aov_color_num++;
break;
case PASS_AOV_VALUE:
- if (kfilm->pass_aov_value_num == 0) {
+ if (!have_aov_value) {
kfilm->pass_aov_value = kfilm->pass_stride;
+ have_aov_value = true;
}
- kfilm->pass_aov_value_num++;
break;
default:
assert(false);
break;
}
- if (pass.type == display_pass) {
- kfilm->display_pass_stride = kfilm->pass_stride;
- kfilm->display_pass_components = pass.components;
- kfilm->use_display_exposure = pass.exposure && (kfilm->exposure != 1.0f);
- }
- else if (pass.type == PASS_DIFFUSE_COLOR || pass.type == PASS_TRANSMISSION_COLOR ||
- pass.type == PASS_GLOSSY_COLOR) {
- kfilm->display_divide_pass_stride = kfilm->pass_stride;
- }
-
- kfilm->pass_stride += pass.components;
- }
-
- kfilm->pass_denoising_data = 0;
- kfilm->pass_denoising_clean = 0;
- kfilm->denoising_flags = 0;
- if (denoising_data_pass) {
- kfilm->pass_denoising_data = kfilm->pass_stride;
- kfilm->pass_stride += DENOISING_PASS_SIZE_BASE;
- kfilm->denoising_flags = denoising_flags;
- if (denoising_clean_pass) {
- kfilm->pass_denoising_clean = kfilm->pass_stride;
- kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
- kfilm->use_light_pass = 1;
- }
- if (denoising_prefiltered_pass) {
- kfilm->pass_stride += DENOISING_PASS_SIZE_PREFILTERED;
- }
- }
-
- kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
-
- /* When displaying the normal/uv pass in the viewport we need to disable
- * transparency.
- *
- * We also don't need to perform light accumulations. Later we want to optimize this to suppress
- * light calculations. */
- if (display_pass == PASS_NORMAL || display_pass == PASS_UV) {
- kfilm->use_light_pass = 0;
- }
- else {
- kfilm->pass_alpha_threshold = pass_alpha_threshold;
+ kfilm->pass_stride += pass->get_info().num_components;
}
/* update filter table */
vector<float> table = filter_table(filter_type, filter_width);
- scene->lookup_tables->remove_table(&filter_table_offset);
- filter_table_offset = scene->lookup_tables->add_table(dscene, table);
- kfilm->filter_table_offset = (int)filter_table_offset;
+ scene->lookup_tables->remove_table(&filter_table_offset_);
+ filter_table_offset_ = scene->lookup_tables->add_table(dscene, table);
+ kfilm->filter_table_offset = (int)filter_table_offset_;
/* mist pass parameters */
kfilm->mist_start = mist_start;
@@ -644,79 +393,298 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->cryptomatte_passes = cryptomatte_passes;
kfilm->cryptomatte_depth = cryptomatte_depth;
- pass_stride = kfilm->pass_stride;
- denoising_data_offset = kfilm->pass_denoising_data;
- denoising_clean_offset = kfilm->pass_denoising_clean;
-
clear_modified();
}
void Film::device_free(Device * /*device*/, DeviceScene * /*dscene*/, Scene *scene)
{
- scene->lookup_tables->remove_table(&filter_table_offset);
+ scene->lookup_tables->remove_table(&filter_table_offset_);
}
-void Film::tag_passes_update(Scene *scene, const vector<Pass> &passes_, bool update_passes)
+int Film::get_aov_offset(Scene *scene, string name, bool &is_color)
{
- if (Pass::contains(scene->passes, PASS_UV) != Pass::contains(passes_, PASS_UV)) {
- scene->geometry_manager->tag_update(scene, GeometryManager::UV_PASS_NEEDED);
+ int offset_color = 0, offset_value = 0;
+ foreach (const Pass *pass, scene->passes) {
+ if (pass->get_name() == name) {
+ if (pass->get_type() == PASS_AOV_VALUE) {
+ is_color = false;
+ return offset_value;
+ }
+ else if (pass->get_type() == PASS_AOV_COLOR) {
+ is_color = true;
+ return offset_color;
+ }
+ }
+
+ if (pass->get_type() == PASS_AOV_VALUE) {
+ offset_value += pass->get_info().num_components;
+ }
+ else if (pass->get_type() == PASS_AOV_COLOR) {
+ offset_color += pass->get_info().num_components;
+ }
+ }
+
+ return -1;
+}
+
+void Film::update_passes(Scene *scene, bool add_sample_count_pass)
+{
+ const Background *background = scene->background;
+ const BakeManager *bake_manager = scene->bake_manager;
+ const ObjectManager *object_manager = scene->object_manager;
+ Integrator *integrator = scene->integrator;
+
+ if (!is_modified() && !object_manager->need_update() && !integrator->is_modified()) {
+ return;
+ }
+
+ /* Remove auto-generated passes and recreate them. */
+ remove_auto_passes(scene);
+
+ /* Display pass for viewport. */
+ const PassType display_pass = get_display_pass();
+ add_auto_pass(scene, display_pass);
+
+ /* The assumption for now is that a combined pass always exists: for example,
+ * adaptive sampling is always based on the combined pass. We should try to
+ * lift this limitation in the future for faster rendering of individual
+ * passes. */
+ if (display_pass != PASS_COMBINED) {
+ add_auto_pass(scene, PASS_COMBINED);
+ }
+
+ /* Create passes needed for adaptive sampling. */
+ const AdaptiveSampling adaptive_sampling = integrator->get_adaptive_sampling();
+ if (adaptive_sampling.use) {
+ add_auto_pass(scene, PASS_SAMPLE_COUNT);
+ add_auto_pass(scene, PASS_ADAPTIVE_AUX_BUFFER);
+ }
+
+ /* Create passes needed for denoising. */
+ const bool use_denoise = integrator->get_use_denoise();
+ if (use_denoise) {
+ if (integrator->get_use_denoise_pass_normal()) {
+ add_auto_pass(scene, PASS_DENOISING_NORMAL);
+ }
+ if (integrator->get_use_denoise_pass_albedo()) {
+ add_auto_pass(scene, PASS_DENOISING_ALBEDO);
+ }
+ }
+
+ /* Create passes for shadow catcher. */
+ if (scene->has_shadow_catcher()) {
+ const bool need_background = get_use_approximate_shadow_catcher() &&
+ !background->get_transparent();
+
+ add_auto_pass(scene, PASS_SHADOW_CATCHER);
+ add_auto_pass(scene, PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ add_auto_pass(scene, PASS_SHADOW_CATCHER_MATTE);
+
+ if (need_background) {
+ add_auto_pass(scene, PASS_BACKGROUND);
+ }
+ }
+ else if (Pass::contains(scene->passes, PASS_SHADOW_CATCHER)) {
+ add_auto_pass(scene, PASS_SHADOW_CATCHER);
+ add_auto_pass(scene, PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ }
+
+ const vector<Pass *> passes_immutable = scene->passes;
+ for (const Pass *pass : passes_immutable) {
+ const PassInfo info = pass->get_info();
+ /* Add utility passes needed to generate some light passes. */
+ if (info.divide_type != PASS_NONE) {
+ add_auto_pass(scene, info.divide_type);
+ }
+ if (info.direct_type != PASS_NONE) {
+ add_auto_pass(scene, info.direct_type);
+ }
+ if (info.indirect_type != PASS_NONE) {
+ add_auto_pass(scene, info.indirect_type);
+ }
+
+ /* NOTE: Enable all denoised passes when storage is requested.
+ * This way it is possible to tweak denoiser parameters later on. */
+ if (info.support_denoise && use_denoise) {
+ add_auto_pass(scene, pass->get_type(), PassMode::DENOISED);
+ }
+ }
+
+ if (bake_manager->get_baking()) {
+ add_auto_pass(scene, PASS_BAKE_PRIMITIVE, "BakePrimitive");
+ add_auto_pass(scene, PASS_BAKE_DIFFERENTIAL, "BakeDifferential");
+ }
+
+ if (add_sample_count_pass) {
+ if (!Pass::contains(scene->passes, PASS_SAMPLE_COUNT)) {
+ add_auto_pass(scene, PASS_SAMPLE_COUNT);
+ }
+ }
+
+ /* Remove duplicates and initialize internal pass info. */
+ finalize_passes(scene, use_denoise);
+ /* Flush scene updates. */
+ const bool have_uv_pass = Pass::contains(scene->passes, PASS_UV);
+ const bool have_motion_pass = Pass::contains(scene->passes, PASS_MOTION);
+ const bool have_ao_pass = Pass::contains(scene->passes, PASS_AO);
+
+ if (have_uv_pass != prev_have_uv_pass) {
+ scene->geometry_manager->tag_update(scene, GeometryManager::UV_PASS_NEEDED);
foreach (Shader *shader, scene->shaders)
shader->need_update_uvs = true;
}
- else if (Pass::contains(scene->passes, PASS_MOTION) != Pass::contains(passes_, PASS_MOTION)) {
+ if (have_motion_pass != prev_have_motion_pass) {
scene->geometry_manager->tag_update(scene, GeometryManager::MOTION_PASS_NEEDED);
}
- else if (Pass::contains(scene->passes, PASS_AO) != Pass::contains(passes_, PASS_AO)) {
+ if (have_ao_pass != prev_have_ao_pass) {
scene->integrator->tag_update(scene, Integrator::AO_PASS_MODIFIED);
}
- if (update_passes) {
- scene->passes = passes_;
+ prev_have_uv_pass = have_uv_pass;
+ prev_have_motion_pass = have_motion_pass;
+ prev_have_ao_pass = have_ao_pass;
+
+ tag_modified();
+
+ /* Debug logging. */
+ if (VLOG_IS_ON(2)) {
+ VLOG(2) << "Effective scene passes:";
+ for (const Pass *pass : scene->passes) {
+ VLOG(2) << "- " << *pass;
+ }
}
}
-int Film::get_aov_offset(Scene *scene, string name, bool &is_color)
+void Film::add_auto_pass(Scene *scene, PassType type, const char *name)
{
- int num_color = 0, num_value = 0;
- foreach (const Pass &pass, scene->passes) {
- if (pass.type == PASS_AOV_COLOR) {
- num_color++;
- }
- else if (pass.type == PASS_AOV_VALUE) {
- num_value++;
+ add_auto_pass(scene, type, PassMode::NOISY, name);
+}
+
+void Film::add_auto_pass(Scene *scene, PassType type, PassMode mode, const char *name)
+{
+ Pass *pass = new Pass();
+ pass->set_type(type);
+ pass->set_mode(mode);
+ pass->set_name(ustring((name) ? name : ""));
+ pass->is_auto_ = true;
+
+ pass->set_owner(scene);
+ scene->passes.push_back(pass);
+}
+
+void Film::remove_auto_passes(Scene *scene)
+{
+ /* Remove all passes which were automatically created. */
+ vector<Pass *> new_passes;
+
+ for (Pass *pass : scene->passes) {
+ if (!pass->is_auto_) {
+ new_passes.push_back(pass);
}
else {
- continue;
- }
-
- if (pass.name == name) {
- is_color = (pass.type == PASS_AOV_COLOR);
- return (is_color ? num_color : num_value) - 1;
+ delete pass;
}
}
- return -1;
+ scene->passes = new_passes;
}
-int Film::get_pass_stride() const
+static bool compare_pass_order(const Pass *a, const Pass *b)
{
- return pass_stride;
-}
+ const int num_components_a = a->get_info().num_components;
+ const int num_components_b = b->get_info().num_components;
-int Film::get_denoising_data_offset() const
-{
- return denoising_data_offset;
+ if (num_components_a == num_components_b) {
+ return (a->get_type() < b->get_type());
+ }
+
+ return num_components_a > num_components_b;
}
-int Film::get_denoising_clean_offset() const
+void Film::finalize_passes(Scene *scene, const bool use_denoise)
{
- return denoising_clean_offset;
+ /* Remove duplicate passes. */
+ vector<Pass *> new_passes;
+
+ for (Pass *pass : scene->passes) {
+ /* Disable denoising on passes if denoising is disabled, or if the
+ * pass does not support it. */
+ pass->set_mode((use_denoise && pass->get_info().support_denoise) ? pass->get_mode() :
+ PassMode::NOISY);
+
+ /* Merge duplicate passes. */
+ bool duplicate_found = false;
+ for (Pass *new_pass : new_passes) {
+ /* If different type or denoising, don't merge. */
+ if (new_pass->get_type() != pass->get_type() || new_pass->get_mode() != pass->get_mode()) {
+ continue;
+ }
+
+ /* If both passes have a name and the names are different, don't merge.
+ * If either pass has a name, we'll use that name. */
+ if (!pass->get_name().empty() && !new_pass->get_name().empty() &&
+ pass->get_name() != new_pass->get_name()) {
+ continue;
+ }
+
+ if (!pass->get_name().empty() && new_pass->get_name().empty()) {
+ new_pass->set_name(pass->get_name());
+ }
+
+ new_pass->is_auto_ &= pass->is_auto_;
+ duplicate_found = true;
+ break;
+ }
+
+ if (!duplicate_found) {
+ new_passes.push_back(pass);
+ }
+ else {
+ delete pass;
+ }
+ }
+
+ /* Order by components and type. This is required for AOVs and cryptomatte passes,
+ * which the kernel assumes to be in order. Note this must use a stable sort so cryptomatte
+ * passes remain in the right order. */
+ stable_sort(new_passes.begin(), new_passes.end(), compare_pass_order);
+
+ scene->passes = new_passes;
}
-size_t Film::get_filter_table_offset() const
+uint Film::get_kernel_features(const Scene *scene) const
{
- return filter_table_offset;
+ uint kernel_features = 0;
+
+ for (const Pass *pass : scene->passes) {
+ if (!pass->is_written()) {
+ continue;
+ }
+
+ const PassType pass_type = pass->get_type();
+ const PassMode pass_mode = pass->get_mode();
+
+ if (pass_mode == PassMode::DENOISED || pass_type == PASS_DENOISING_NORMAL ||
+ pass_type == PASS_DENOISING_ALBEDO) {
+ kernel_features |= KERNEL_FEATURE_DENOISING;
+ }
+
+ if (pass_type != PASS_NONE && pass_type != PASS_COMBINED &&
+ pass_type <= PASS_CATEGORY_LIGHT_END) {
+ kernel_features |= KERNEL_FEATURE_LIGHT_PASSES;
+
+ if (pass_type == PASS_SHADOW) {
+ kernel_features |= KERNEL_FEATURE_SHADOW_PASS;
+ }
+ }
+
+ if (pass_type == PASS_AO) {
+ kernel_features |= KERNEL_FEATURE_NODE_RAYTRACE;
+ }
+ }
+
+ return kernel_features;
}
CCL_NAMESPACE_END
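The offset bookkeeping in `Film::device_update` above boils down to a simple accumulation scheme. A distilled sketch follows, where `record_offset` is a hypothetical stand-in for the per-pass `kfilm->pass_*` assignments:

    /* Each written pass claims the current stride as its offset, then advances
     * the stride by its component count (sketch; record_offset is hypothetical). */
    int pass_stride = 0;
    for (const Pass *pass : scene->passes) {
      if (pass->get_type() == PASS_NONE || !pass->is_written()) {
        continue;
      }
      record_offset(pass->get_type(), pass->get_mode(), pass_stride);
      pass_stride += pass->get_info().num_components;
    }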
diff --git a/intern/cycles/render/film.h b/intern/cycles/render/film.h
index 462a7275491..5d327353361 100644
--- a/intern/cycles/render/film.h
+++ b/intern/cycles/render/film.h
@@ -17,6 +17,7 @@
#ifndef __FILM_H__
#define __FILM_H__
+#include "render/pass.h"
#include "util/util_string.h"
#include "util/util_vector.h"
@@ -38,36 +39,15 @@ typedef enum FilterType {
FILTER_NUM_TYPES,
} FilterType;
-class Pass : public Node {
- public:
- NODE_DECLARE
-
- Pass();
-
- PassType type;
- int components;
- bool filter;
- bool exposure;
- PassType divide_type;
- ustring name;
-
- static void add(PassType type, vector<Pass> &passes, const char *name = NULL);
- static bool equals(const vector<Pass> &A, const vector<Pass> &B);
- static bool contains(const vector<Pass> &passes, PassType);
-};
-
class Film : public Node {
public:
NODE_DECLARE
NODE_SOCKET_API(float, exposure)
- NODE_SOCKET_API(bool, denoising_data_pass)
- NODE_SOCKET_API(bool, denoising_clean_pass)
- NODE_SOCKET_API(bool, denoising_prefiltered_pass)
- NODE_SOCKET_API(int, denoising_flags)
NODE_SOCKET_API(float, pass_alpha_threshold)
NODE_SOCKET_API(PassType, display_pass)
+ NODE_SOCKET_API(bool, show_active_pixels)
NODE_SOCKET_API(FilterType, filter_type)
NODE_SOCKET_API(float, filter_width)
@@ -76,17 +56,18 @@ class Film : public Node {
NODE_SOCKET_API(float, mist_depth)
NODE_SOCKET_API(float, mist_falloff)
- NODE_SOCKET_API(bool, use_light_visibility)
NODE_SOCKET_API(CryptomatteType, cryptomatte_passes)
NODE_SOCKET_API(int, cryptomatte_depth)
- NODE_SOCKET_API(bool, use_adaptive_sampling)
+ /* Approximate the shadow catcher pass into its matte pass, so that both artificial objects
+ * and shadows can be alpha-overed onto a backdrop. */
+ NODE_SOCKET_API(bool, use_approximate_shadow_catcher)
private:
- int pass_stride;
- int denoising_data_offset;
- int denoising_clean_offset;
- size_t filter_table_offset;
+ size_t filter_table_offset_;
+ bool prev_have_uv_pass = false;
+ bool prev_have_motion_pass = false;
+ bool prev_have_ao_pass = false;
public:
Film();
@@ -98,14 +79,20 @@ class Film : public Node {
void device_update(Device *device, DeviceScene *dscene, Scene *scene);
void device_free(Device *device, DeviceScene *dscene, Scene *scene);
- void tag_passes_update(Scene *scene, const vector<Pass> &passes_, bool update_passes = true);
-
int get_aov_offset(Scene *scene, string name, bool &is_color);
- int get_pass_stride() const;
- int get_denoising_data_offset() const;
- int get_denoising_clean_offset() const;
- size_t get_filter_table_offset() const;
+ /* Update passes so that they contain all passes required for the configured functionality.
+ *
+ * If `add_sample_count_pass` is true, the SAMPLE_COUNT pass is added if it is missing. */
+ void update_passes(Scene *scene, bool add_sample_count_pass);
+
+ uint get_kernel_features(const Scene *scene) const;
+
+ private:
+ void add_auto_pass(Scene *scene, PassType type, const char *name = nullptr);
+ void add_auto_pass(Scene *scene, PassType type, PassMode mode, const char *name = nullptr);
+ void remove_auto_passes(Scene *scene);
+ void finalize_passes(Scene *scene, const bool use_denoise);
};
CCL_NAMESPACE_END
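A rough sketch of the intended sync-time sequence with the reworked API, assuming `film`, `scene` and `device` are already set up:

    /* Regenerate auto passes, derive kernel features, then let the resulting
     * pass list drive kernel loading and the render buffer layout. */
    film->update_passes(scene, /*add_sample_count_pass=*/false);
    const uint kernel_features = film->get_kernel_features(scene);
    device->load_kernels(kernel_features);

    BufferParams buffer_params;
    buffer_params.update_passes(scene->passes);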
diff --git a/intern/cycles/render/geometry.cpp b/intern/cycles/render/geometry.cpp
index 7ec1d2d9abb..6804a006fe6 100644
--- a/intern/cycles/render/geometry.cpp
+++ b/intern/cycles/render/geometry.cpp
@@ -215,6 +215,12 @@ void Geometry::compute_bvh(
msg += string_printf("%s %u/%u", name.c_str(), (uint)(n + 1), (uint)total);
Object object;
+
+ /* Ensure all visibility bits are set at the geometry-level BVH. The
+ * object-level BVH is where actual visibility is tested. */
+ object.set_is_shadow_catcher(true);
+ object.set_visibility(~0);
+
object.set_geometry(this);
vector<Geometry *> geometry;
@@ -315,7 +321,7 @@ void GeometryManager::update_osl_attributes(Device *device,
{
#ifdef WITH_OSL
/* for OSL, a hash map is used to lookup the attribute by name. */
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+ OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
og->object_name_map.clear();
og->attribute_map.clear();
@@ -1855,8 +1861,8 @@ void GeometryManager::device_update(Device *device,
});
Camera *dicing_camera = scene->dicing_camera;
- dicing_camera->set_screen_size_and_resolution(
- dicing_camera->get_full_width(), dicing_camera->get_full_height(), 1);
+ dicing_camera->set_screen_size(dicing_camera->get_full_width(),
+ dicing_camera->get_full_height());
dicing_camera->update(scene);
size_t i = 0;
@@ -2157,7 +2163,7 @@ void GeometryManager::device_free(Device *device, DeviceScene *dscene, bool forc
dscene->data.bvh.bvh_layout = BVH_LAYOUT_NONE;
#ifdef WITH_OSL
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+ OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
if (og) {
og->object_name_map.clear();
diff --git a/intern/cycles/render/gpu_display.cpp b/intern/cycles/render/gpu_display.cpp
new file mode 100644
index 00000000000..a8f0cc50583
--- /dev/null
+++ b/intern/cycles/render/gpu_display.cpp
@@ -0,0 +1,227 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/gpu_display.h"
+
+#include "render/buffers.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+void GPUDisplay::reset(const BufferParams &buffer_params)
+{
+ thread_scoped_lock lock(mutex_);
+
+ const GPUDisplayParams old_params = params_;
+
+ params_.offset = make_int2(buffer_params.full_x, buffer_params.full_y);
+ params_.full_size = make_int2(buffer_params.full_width, buffer_params.full_height);
+ params_.size = make_int2(buffer_params.width, buffer_params.height);
+
+ /* If the parameters changed, tag the texture as unusable. This avoids drawing old texture
+ * content in an updated configuration of the viewport; for example, it avoids drawing an old
+ * frame after the render border changed.
+ * If the parameters did not change, allow drawing the current state of the texture, which will
+ * not count as an up-to-date redraw. This avoids flickering during camera navigation by
+ * showing the previously rendered frame until the new one is ready. */
+ if (old_params.modified(params_)) {
+ texture_state_.is_usable = false;
+ }
+
+ texture_state_.is_outdated = true;
+}
+
+void GPUDisplay::mark_texture_updated()
+{
+ texture_state_.is_outdated = false;
+ texture_state_.is_usable = true;
+}
+
+/* --------------------------------------------------------------------
+ * Update procedure.
+ */
+
+bool GPUDisplay::update_begin(int texture_width, int texture_height)
+{
+ DCHECK(!update_state_.is_active);
+
+ if (update_state_.is_active) {
+ LOG(ERROR) << "Attempt to re-activate update process.";
+ return false;
+ }
+
+ /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time.
+ * The update itself is non-blocking however, for better performance and to avoid
+ * potential deadlocks due to locks held by the subclass. */
+ GPUDisplayParams params;
+ {
+ thread_scoped_lock lock(mutex_);
+ params = params_;
+ texture_state_.size = make_int2(texture_width, texture_height);
+ }
+
+ if (!do_update_begin(params, texture_width, texture_height)) {
+ LOG(ERROR) << "GPUDisplay implementation could not begin update.";
+ return false;
+ }
+
+ update_state_.is_active = true;
+
+ return true;
+}
+
+void GPUDisplay::update_end()
+{
+ DCHECK(update_state_.is_active);
+
+ if (!update_state_.is_active) {
+ LOG(ERROR) << "Attempt to deactivate inactive update process.";
+ return;
+ }
+
+ do_update_end();
+
+ update_state_.is_active = false;
+}
+
+int2 GPUDisplay::get_texture_size() const
+{
+ return texture_state_.size;
+}
+
+/* --------------------------------------------------------------------
+ * Texture update from CPU buffer.
+ */
+
+void GPUDisplay::copy_pixels_to_texture(
+ const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height)
+{
+ DCHECK(update_state_.is_active);
+
+ if (!update_state_.is_active) {
+ LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update.";
+ return;
+ }
+
+ mark_texture_updated();
+ do_copy_pixels_to_texture(rgba_pixels, texture_x, texture_y, pixels_width, pixels_height);
+}
+
+/* --------------------------------------------------------------------
+ * Texture buffer mapping.
+ */
+
+half4 *GPUDisplay::map_texture_buffer()
+{
+ DCHECK(!texture_buffer_state_.is_mapped);
+ DCHECK(update_state_.is_active);
+
+ if (texture_buffer_state_.is_mapped) {
+ LOG(ERROR) << "Attempt to re-map an already mapped texture buffer.";
+ return nullptr;
+ }
+
+ if (!update_state_.is_active) {
+ LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update.";
+ return nullptr;
+ }
+
+ half4 *mapped_rgba_pixels = do_map_texture_buffer();
+
+ if (mapped_rgba_pixels) {
+ texture_buffer_state_.is_mapped = true;
+ }
+
+ return mapped_rgba_pixels;
+}
+
+void GPUDisplay::unmap_texture_buffer()
+{
+ DCHECK(texture_buffer_state_.is_mapped);
+
+ if (!texture_buffer_state_.is_mapped) {
+ LOG(ERROR) << "Attempt to unmap non-mapped texture buffer.";
+ return;
+ }
+
+ texture_buffer_state_.is_mapped = false;
+
+ mark_texture_updated();
+ do_unmap_texture_buffer();
+}
+
+/* --------------------------------------------------------------------
+ * Graphics interoperability.
+ */
+
+DeviceGraphicsInteropDestination GPUDisplay::graphics_interop_get()
+{
+ DCHECK(!texture_buffer_state_.is_mapped);
+ DCHECK(update_state_.is_active);
+
+ if (texture_buffer_state_.is_mapped) {
+ LOG(ERROR)
+ << "Attempt to use graphics interoperability mode while the texture buffer is mapped.";
+ return DeviceGraphicsInteropDestination();
+ }
+
+ if (!update_state_.is_active) {
+ LOG(ERROR) << "Attempt to use graphics interoperability outside of GPUDisplay update.";
+ return DeviceGraphicsInteropDestination();
+ }
+
+ /* Assume that interop will write new values to the texture. */
+ mark_texture_updated();
+
+ return do_graphics_interop_get();
+}
+
+void GPUDisplay::graphics_interop_activate()
+{
+}
+
+void GPUDisplay::graphics_interop_deactivate()
+{
+}
+
+/* --------------------------------------------------------------------
+ * Drawing.
+ */
+
+bool GPUDisplay::draw()
+{
+ /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time.
+ * The drawing itself is non-blocking however, for better performance and to avoid
+ * potential deadlocks due to locks held by the subclass. */
+ GPUDisplayParams params;
+ bool is_usable;
+ bool is_outdated;
+
+ {
+ thread_scoped_lock lock(mutex_);
+ params = params_;
+ is_usable = texture_state_.is_usable;
+ is_outdated = texture_state_.is_outdated;
+ }
+
+ if (is_usable) {
+ do_draw(params);
+ }
+
+ return !is_outdated;
+}
+
+CCL_NAMESPACE_END
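
For orientation, here is a minimal sketch of how a caller is expected to drive the update procedure above. The `update_display()` helper and its arguments are hypothetical; only the GPUDisplay calls themselves are from this patch.

    /* Minimal sketch, assuming `display` points at some concrete GPUDisplay
     * subclass and `pixels` holds width * height half4 values. */
    void update_display(GPUDisplay *display, const half4 *pixels, int width, int height)
    {
      /* When false is returned no update is possible and update_end() must
       * not be called. */
      if (!display->update_begin(width, height)) {
        return;
      }

      /* Full-texture update from a CPU-side buffer. Partial updates work the
       * same way, with a non-zero texture_x/texture_y offset. */
      display->copy_pixels_to_texture(pixels, 0, 0, width, height);

      display->update_end();
    }
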
diff --git a/intern/cycles/render/gpu_display.h b/intern/cycles/render/gpu_display.h
new file mode 100644
index 00000000000..a01348d28d5
--- /dev/null
+++ b/intern/cycles/render/gpu_display.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "device/device_graphics_interop.h"
+#include "util/util_half.h"
+#include "util/util_thread.h"
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+class BufferParams;
+
+/* The GPUDisplay class takes care of drawing the render result in a viewport. The render result
+ * is stored in a GPU-side texture, which is updated from a path tracer and drawn by an
+ * application.
+ *
+ * The base GPUDisplay performs the texture state tracking which allows the render Session to
+ * decide whether a reset for an updated state is possible or not. This state is only tracked in
+ * the base class, so a particular implementation does not need to worry about it.
+ *
+ * Subclasses only need to implement the pure virtual methods, without worrying about calls into
+ * the parent class, which helps keep them as small and reliable as possible. */
+
+class GPUDisplayParams {
+ public:
+ /* Offset of the display within a viewport.
+ * For example, set to the bottom-left corner of a border render in Blender's viewport. */
+ int2 offset = make_int2(0, 0);
+
+ /* Full viewport size.
+ *
+ * NOTE: Is not affected by the resolution divider. */
+ int2 full_size = make_int2(0, 0);
+
+ /* Effective viewport size.
+ * In the case of border render, size of the border rectangle.
+ *
+ * NOTE: Is not affected by the resolution divider. */
+ int2 size = make_int2(0, 0);
+
+ bool modified(const GPUDisplayParams &other) const
+ {
+ return !(offset == other.offset && full_size == other.full_size && size == other.size);
+ }
+};
+
+class GPUDisplay {
+ public:
+ GPUDisplay() = default;
+ virtual ~GPUDisplay() = default;
+
+ /* Reset the display for the new state of the render session. Called whenever the session is
+ * reset, which happens on changes like viewport navigation or a viewport dimension change.
+ *
+ * This call will configure parameters for the changed buffer and reset the texture state. */
+ void reset(const BufferParams &buffer_params);
+
+ const GPUDisplayParams &get_params() const
+ {
+ return params_;
+ }
+
+ /* --------------------------------------------------------------------
+ * Update procedure.
+ *
+ * These calls indicate the caller's desire to update the content of the displayed texture. */
+
+ /* Returns true when the update process is ready to begin. The update should then be finished
+ * with update_end().
+ *
+ * If false is returned then no update is possible, and no update_end() call is needed.
+ *
+ * The texture width and height denote the actual resolution of the underlying render result. */
+ bool update_begin(int texture_width, int texture_height);
+
+ void update_end();
+
+ /* Get the currently configured texture size of the display (as configured by `update_begin()`). */
+ int2 get_texture_size() const;
+
+ /* --------------------------------------------------------------------
+ * Texture update from CPU buffer.
+ *
+ * NOTE: The GPUDisplay must have an update in progress, started with `update_begin()`.
+ *
+ * This is the most portable implementation, which must be supported by all platforms, although
+ * it might not be the most efficient one.
+ */
+
+ /* Copy buffer of rendered pixels of a given size into a given position of the texture.
+ *
+ * This function does not acquire a lock. The reason for this is to allow use of this function
+ * for partial updates from different devices. In this case the caller will acquire the lock
+ * once, update all the slices, and release the lock once. This ensures that draw() will never
+ * use a partially updated texture. */
+ void copy_pixels_to_texture(
+ const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height);
+
+ /* --------------------------------------------------------------------
+ * Texture buffer mapping.
+ *
+ * This functionality is used to update the GPU-side texture content without the need to
+ * maintain a CPU-side buffer on the caller.
+ *
+ * NOTE: The GPUDisplay must have an update in progress, started with `update_begin()`.
+ *
+ * NOTE: The texture buffer can not be mapped while graphics interoperability is active. This
+ * means that `map_texture_buffer()` is not allowed between `graphics_interop_activate()` and
+ * `graphics_interop_deactivate()` calls.
+ */
+
+ /* Map pixel memory from the texture to a buffer available for write from the CPU. Width and
+ * height define the requested size of the texture to write to.
+ * Upon success a non-null pointer is returned and the texture buffer is to be unmapped.
+ * If an error happens during mapping, or if mapping is not supported by this GPU display, a
+ * null pointer is returned and the buffer is NOT to be unmapped.
+ *
+ * NOTE: Usually the implementation will rely on a GPU context of some sort, and the GPU context
+ * often can not be bound to two threads simultaneously, and can not be released from a
+ * different thread. This means that the mapping API should be used from a single thread only. */
+ half4 *map_texture_buffer();
+ void unmap_texture_buffer();
+
+ /* --------------------------------------------------------------------
+ * Graphics interoperability.
+ *
+ * A special code path which allows updating the texture content directly from the GPU compute
+ * device. Complementary part of DeviceGraphicsInterop.
+ *
+ * NOTE: Graphics interoperability can not be used while the texture buffer is mapped. This means
+ * that `graphics_interop_get()` is not allowed between `map_texture_buffer()` and
+ * `unmap_texture_buffer()` calls. */
+
+ /* Get GPUDisplay graphics interoperability information which acts as a destination for the
+ * device API. */
+ DeviceGraphicsInteropDestination graphics_interop_get();
+
+ /* (De)activate GPU display for graphics interoperability outside of regular display update
+ * routines. */
+ virtual void graphics_interop_activate();
+ virtual void graphics_interop_deactivate();
+
+ /* --------------------------------------------------------------------
+ * Drawing.
+ */
+
+ /* Clear the texture by filling it with all zeroes.
+ *
+ * This call might happen in parallel with draw, but can never happen in parallel with the
+ * update.
+ *
+ * The actual zero-ing can be deferred to a later moment. What is important is that after the
+ * clear and before the next pixels update the drawn texture is fully empty, and that a partial
+ * update after the clear writes new pixel values only for the updated area, leaving everything
+ * else zeroed.
+ *
+ * If the GPU display supports graphics interoperability then zeroing the display is to be
+ * delegated to the device via the `DeviceGraphicsInteropDestination`. */
+ virtual void clear() = 0;
+
+ /* Draw the current state of the texture.
+ *
+ * Returns true if this call did draw an updated state of the texture. */
+ bool draw();
+
+ protected:
+ /* Implementation-specific calls which subclasses are to implement.
+ * Each `do_foo()` method corresponds to its public `foo()` counterpart, but is pure virtual to
+ * simplify its implementation in subclasses. */
+ virtual bool do_update_begin(const GPUDisplayParams &params,
+ int texture_width,
+ int texture_height) = 0;
+ virtual void do_update_end() = 0;
+
+ virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels,
+ int texture_x,
+ int texture_y,
+ int pixels_width,
+ int pixels_height) = 0;
+
+ virtual half4 *do_map_texture_buffer() = 0;
+ virtual void do_unmap_texture_buffer() = 0;
+
+ /* Note that this might be called in parallel to do_update_begin() and do_update_end(); the
+ * subclass is responsible for appropriate mutex locks to avoid multiple threads editing and
+ * drawing the texture at the same time. */
+ virtual void do_draw(const GPUDisplayParams &params) = 0;
+
+ virtual DeviceGraphicsInteropDestination do_graphics_interop_get() = 0;
+
+ private:
+ thread_mutex mutex_;
+ GPUDisplayParams params_;
+
+ /* Mark the texture content as updated.
+ * Used from places which know that the texture content has been brought up-to-date, so that the
+ * drawing knows whether it can be performed, and whether drawing happened with an up-to-date
+ * texture state. */
+ void mark_texture_updated();
+
+ /* State of the update process. */
+ struct {
+ /* True when update is in process, indicated by `update_begin()` / `update_end()`. */
+ bool is_active = false;
+ } update_state_;
+
+ /* State of the texture, which is needed for an integration with render session and interactive
+ * updates and navigation. */
+ struct {
+ /* Denotes whether the possibly existing state of the GPU-side texture is still usable.
+ * It will not be usable in cases like a change of the render border (in this case we don't
+ * want the previous texture to be rendered at all).
+ *
+ * However, if only the navigation or an object in the scene changed, then the outdated state
+ * of the texture is still usable for drawing, preventing viewport flickering on navigation
+ * and object modifications. */
+ bool is_usable = false;
+
+ /* Texture is considered outdated after `reset()` until the next content update, such as
+ * `copy_pixels_to_texture()`. */
+ bool is_outdated = true;
+
+ /* Texture size in pixels. */
+ int2 size = make_int2(0, 0);
+ } texture_state_;
+
+ /* State of the texture buffer. Is tracked to perform sanity checks. */
+ struct {
+ /* True when the texture buffer is mapped with `map_texture_buffer()`. */
+ bool is_mapped = false;
+ } texture_buffer_state_;
+};
+
+CCL_NAMESPACE_END
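
To illustrate the subclass contract described in this header, below is a rough sketch of a trivial implementation which backs the "texture" with a CPU-side vector. The `DummyGPUDisplay` class is hypothetical and for illustration only; all sanity checks and state tracking stay in the base class.

    #include <algorithm>
    #include <cstring>
    #include <vector>

    class DummyGPUDisplay : public GPUDisplay {
     public:
      void clear() override
      {
        /* Zero the whole buffer; a real implementation may defer this. */
        std::fill(buffer_.begin(), buffer_.end(), half4());
      }

     protected:
      bool do_update_begin(const GPUDisplayParams & /*params*/,
                           int texture_width,
                           int texture_height) override
      {
        buffer_.resize(size_t(texture_width) * texture_height);
        return true;
      }
      void do_update_end() override {}

      void do_copy_pixels_to_texture(const half4 *rgba_pixels,
                                     int texture_x,
                                     int texture_y,
                                     int pixels_width,
                                     int pixels_height) override
      {
        /* Copy the given slice row by row into the full-size buffer. */
        const int2 size = get_texture_size();
        for (int y = 0; y < pixels_height; y++) {
          half4 *dst = buffer_.data() + size_t(texture_y + y) * size.x + texture_x;
          std::memcpy(dst, rgba_pixels + size_t(y) * pixels_width, sizeof(half4) * pixels_width);
        }
      }

      half4 *do_map_texture_buffer() override
      {
        return buffer_.data();
      }
      void do_unmap_texture_buffer() override {}

      void do_draw(const GPUDisplayParams & /*params*/) override {}

      DeviceGraphicsInteropDestination do_graphics_interop_get() override
      {
        return DeviceGraphicsInteropDestination();
      }

     private:
      std::vector<half4> buffer_;
    };
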
diff --git a/intern/cycles/render/graph.h b/intern/cycles/render/graph.h
index 5102b182593..3584754fad1 100644
--- a/intern/cycles/render/graph.h
+++ b/intern/cycles/render/graph.h
@@ -224,10 +224,6 @@ class ShaderNode : public Node {
{
return false;
}
- virtual bool has_raytrace()
- {
- return false;
- }
vector<ShaderInput *> inputs;
vector<ShaderOutput *> outputs;
@@ -242,22 +238,13 @@ class ShaderNode : public Node {
* that those functions are for selective compilation only?
*/
- /* Nodes are split into several groups, group of level 0 contains
- * nodes which are most commonly used, further levels are extension
- * of previous one and includes less commonly used nodes.
- */
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_0;
- }
-
/* Node feature are used to disable huge nodes inside the group,
* so it's possible to disable huge nodes inside of the required
* nodes group.
*/
virtual int get_feature()
{
- return bump == SHADER_BUMP_NONE ? 0 : NODE_FEATURE_BUMP;
+ return bump == SHADER_BUMP_NONE ? 0 : KERNEL_FEATURE_NODE_BUMP;
}
/* Get closure ID to which the node compiles into. */
diff --git a/intern/cycles/render/integrator.cpp b/intern/cycles/render/integrator.cpp
index d8749cec9fa..d74d14242bb 100644
--- a/intern/cycles/render/integrator.cpp
+++ b/intern/cycles/render/integrator.cpp
@@ -53,6 +53,8 @@ NODE_DEFINE(Integrator)
SOCKET_INT(transparent_max_bounce, "Transparent Max Bounce", 7);
SOCKET_INT(ao_bounces, "AO Bounces", 0);
+ SOCKET_FLOAT(ao_factor, "AO Factor", 0.0f);
+ SOCKET_FLOAT(ao_distance, "AO Distance", FLT_MAX);
SOCKET_INT(volume_max_steps, "Volume Max Steps", 1024);
SOCKET_FLOAT(volume_step_rate, "Volume Step Rate", 1.0f);
@@ -66,33 +68,39 @@ NODE_DEFINE(Integrator)
SOCKET_BOOLEAN(motion_blur, "Motion Blur", false);
SOCKET_INT(aa_samples, "AA Samples", 0);
- SOCKET_INT(diffuse_samples, "Diffuse Samples", 1);
- SOCKET_INT(glossy_samples, "Glossy Samples", 1);
- SOCKET_INT(transmission_samples, "Transmission Samples", 1);
- SOCKET_INT(ao_samples, "AO Samples", 1);
- SOCKET_INT(mesh_light_samples, "Mesh Light Samples", 1);
- SOCKET_INT(subsurface_samples, "Subsurface Samples", 1);
- SOCKET_INT(volume_samples, "Volume Samples", 1);
SOCKET_INT(start_sample, "Start Sample", 0);
+ SOCKET_BOOLEAN(use_adaptive_sampling, "Use Adaptive Sampling", false);
SOCKET_FLOAT(adaptive_threshold, "Adaptive Threshold", 0.0f);
SOCKET_INT(adaptive_min_samples, "Adaptive Min Samples", 0);
- SOCKET_BOOLEAN(sample_all_lights_direct, "Sample All Lights Direct", true);
- SOCKET_BOOLEAN(sample_all_lights_indirect, "Sample All Lights Indirect", true);
SOCKET_FLOAT(light_sampling_threshold, "Light Sampling Threshold", 0.05f);
- static NodeEnum method_enum;
- method_enum.insert("path", PATH);
- method_enum.insert("branched_path", BRANCHED_PATH);
- SOCKET_ENUM(method, "Method", method_enum, PATH);
-
static NodeEnum sampling_pattern_enum;
sampling_pattern_enum.insert("sobol", SAMPLING_PATTERN_SOBOL);
- sampling_pattern_enum.insert("cmj", SAMPLING_PATTERN_CMJ);
sampling_pattern_enum.insert("pmj", SAMPLING_PATTERN_PMJ);
SOCKET_ENUM(sampling_pattern, "Sampling Pattern", sampling_pattern_enum, SAMPLING_PATTERN_SOBOL);
+ static NodeEnum denoiser_type_enum;
+ denoiser_type_enum.insert("optix", DENOISER_OPTIX);
+ denoiser_type_enum.insert("openimagedenoise", DENOISER_OPENIMAGEDENOISE);
+
+ static NodeEnum denoiser_prefilter_enum;
+ denoiser_prefilter_enum.insert("none", DENOISER_PREFILTER_NONE);
+ denoiser_prefilter_enum.insert("fast", DENOISER_PREFILTER_FAST);
+ denoiser_prefilter_enum.insert("accurate", DENOISER_PREFILTER_ACCURATE);
+
+ /* Default to accurate denoising with OpenImageDenoise. For the interactive viewport
+ * it's best to use OptiX and disable the normal pass, since it does not always have
+ * the desired effect for that denoiser. */
+ SOCKET_BOOLEAN(use_denoise, "Use Denoiser", false);
+ SOCKET_ENUM(denoiser_type, "Denoiser Type", denoiser_type_enum, DENOISER_OPENIMAGEDENOISE);
+ SOCKET_INT(denoise_start_sample, "Start Sample to Denoise", 0);
+ SOCKET_BOOLEAN(use_denoise_pass_albedo, "Use Albedo Pass for Denoiser", true);
+ SOCKET_BOOLEAN(use_denoise_pass_normal, "Use Normal Pass for Denoiser", true);
+ SOCKET_ENUM(
+ denoiser_prefilter, "Denoiser Prefilter", denoiser_prefilter_enum, DENOISER_PREFILTER_ACCURATE);
+
return type;
}
@@ -115,13 +123,20 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
}
});
- const bool need_update_lut = ao_samples_is_modified() || diffuse_samples_is_modified() ||
- glossy_samples_is_modified() || max_bounce_is_modified() ||
- max_transmission_bounce_is_modified() ||
- mesh_light_samples_is_modified() || method_is_modified() ||
- sampling_pattern_is_modified() ||
- subsurface_samples_is_modified() ||
- transmission_samples_is_modified() || volume_samples_is_modified();
+ KernelIntegrator *kintegrator = &dscene->data.integrator;
+
+ /* Adaptive sampling requires PMJ samples.
+ *
+ * This also makes detection of the sampling pattern a bit more involved: we can not rely on the
+ * changed state of the socket, since its value might differ from the effective value used here.
+ * So instead compare with the previous value in the KernelIntegrator. Only do it if the device
+ * was updated at least once (in which case the `sample_pattern_lut` is allocated to a non-zero
+ * size). */
+ const SamplingPattern new_sampling_pattern = (use_adaptive_sampling) ? SAMPLING_PATTERN_PMJ :
+ sampling_pattern;
+
+ const bool need_update_lut = max_bounce_is_modified() || max_transmission_bounce_is_modified() ||
+ dscene->sample_pattern_lut.size() == 0 ||
+ kintegrator->sampling_pattern != new_sampling_pattern;
if (need_update_lut) {
dscene->sample_pattern_lut.tag_realloc();
@@ -129,8 +144,6 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
device_free(device, dscene);
- KernelIntegrator *kintegrator = &dscene->data.integrator;
-
/* integrator parameters */
kintegrator->min_bounce = min_bounce + 1;
kintegrator->max_bounce = max_bounce + 1;
@@ -143,12 +156,9 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->transparent_min_bounce = transparent_min_bounce + 1;
kintegrator->transparent_max_bounce = transparent_max_bounce + 1;
- if (ao_bounces == 0) {
- kintegrator->ao_bounces = INT_MAX;
- }
- else {
- kintegrator->ao_bounces = ao_bounces - 1;
- }
+ kintegrator->ao_bounces = ao_bounces;
+ kintegrator->ao_bounces_distance = ao_distance;
+ kintegrator->ao_bounces_factor = ao_factor;
/* Transparent Shadows
* We only need to enable transparent shadows, if we actually have
@@ -171,10 +181,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
kintegrator->caustics_refractive = caustics_refractive;
kintegrator->filter_glossy = (filter_glossy == 0.0f) ? FLT_MAX : 1.0f / filter_glossy;
- kintegrator->seed = hash_uint2(seed, 0);
-
- kintegrator->use_ambient_occlusion = ((Pass::contains(scene->passes, PASS_AO)) ||
- dscene->data.background.ao_factor != 0.0f);
+ kintegrator->seed = seed;
kintegrator->sample_clamp_direct = (sample_clamp_direct == 0.0f) ? FLT_MAX :
sample_clamp_direct * 3.0f;
@@ -182,51 +189,7 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
FLT_MAX :
sample_clamp_indirect * 3.0f;
- kintegrator->branched = (method == BRANCHED_PATH) && device->info.has_branched_path;
- kintegrator->volume_decoupled = device->info.has_volume_decoupled;
- kintegrator->diffuse_samples = diffuse_samples;
- kintegrator->glossy_samples = glossy_samples;
- kintegrator->transmission_samples = transmission_samples;
- kintegrator->ao_samples = ao_samples;
- kintegrator->mesh_light_samples = mesh_light_samples;
- kintegrator->subsurface_samples = subsurface_samples;
- kintegrator->volume_samples = volume_samples;
- kintegrator->start_sample = start_sample;
-
- if (kintegrator->branched) {
- kintegrator->sample_all_lights_direct = sample_all_lights_direct;
- kintegrator->sample_all_lights_indirect = sample_all_lights_indirect;
- }
- else {
- kintegrator->sample_all_lights_direct = false;
- kintegrator->sample_all_lights_indirect = false;
- }
-
- kintegrator->sampling_pattern = sampling_pattern;
- kintegrator->aa_samples = aa_samples;
- if (aa_samples > 0 && adaptive_min_samples == 0) {
- kintegrator->adaptive_min_samples = max(4, (int)sqrtf(aa_samples));
- VLOG(1) << "Cycles adaptive sampling: automatic min samples = "
- << kintegrator->adaptive_min_samples;
- }
- else {
- kintegrator->adaptive_min_samples = max(4, adaptive_min_samples);
- }
-
- kintegrator->adaptive_step = 4;
- kintegrator->adaptive_stop_per_sample = device->info.has_adaptive_stop_per_sample;
-
- /* Adaptive step must be a power of two for bitwise operations to work. */
- assert((kintegrator->adaptive_step & (kintegrator->adaptive_step - 1)) == 0);
-
- if (aa_samples > 0 && adaptive_threshold == 0.0f) {
- kintegrator->adaptive_threshold = max(0.001f, 1.0f / (float)aa_samples);
- VLOG(1) << "Cycles adaptive sampling: automatic threshold = "
- << kintegrator->adaptive_threshold;
- }
- else {
- kintegrator->adaptive_threshold = adaptive_threshold;
- }
+ kintegrator->sampling_pattern = new_sampling_pattern;
if (light_sampling_threshold > 0.0f) {
kintegrator->light_inv_rr_threshold = 1.0f / light_sampling_threshold;
@@ -236,29 +199,15 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
}
/* sobol directions table */
- int max_samples = 1;
-
- if (kintegrator->branched) {
- foreach (Light *light, scene->lights)
- max_samples = max(max_samples, light->get_samples());
-
- max_samples = max(max_samples,
- max(diffuse_samples, max(glossy_samples, transmission_samples)));
- max_samples = max(max_samples, max(ao_samples, max(mesh_light_samples, subsurface_samples)));
- max_samples = max(max_samples, volume_samples);
- }
-
- uint total_bounces = max_bounce + transparent_max_bounce + 3 + VOLUME_BOUNDS_MAX +
- max(BSSRDF_MAX_HITS, BSSRDF_MAX_BOUNCES);
-
- max_samples *= total_bounces;
+ int max_samples = max_bounce + transparent_max_bounce + 3 + VOLUME_BOUNDS_MAX +
+ max(BSSRDF_MAX_HITS, BSSRDF_MAX_BOUNCES);
int dimensions = PRNG_BASE_NUM + max_samples * PRNG_BOUNCE_NUM;
dimensions = min(dimensions, SOBOL_MAX_DIMENSIONS);
if (need_update_lut) {
- if (sampling_pattern == SAMPLING_PATTERN_SOBOL) {
- uint *directions = dscene->sample_pattern_lut.alloc(SOBOL_BITS * dimensions);
+ if (kintegrator->sampling_pattern == SAMPLING_PATTERN_SOBOL) {
+ uint *directions = (uint *)dscene->sample_pattern_lut.alloc(SOBOL_BITS * dimensions);
sobol_generate_direction_vectors((uint(*)[SOBOL_BITS])directions, dimensions);
@@ -276,10 +225,13 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
function_bind(&progressive_multi_jitter_02_generate_2D, sequence, sequence_size, j));
}
pool.wait_work();
+
dscene->sample_pattern_lut.copy_to_device();
}
}
+ kintegrator->has_shadow_catcher = scene->has_shadow_catcher();
+
dscene->sample_pattern_lut.clear_modified();
clear_modified();
}
@@ -295,17 +247,12 @@ void Integrator::tag_update(Scene *scene, uint32_t flag)
tag_modified();
}
- if (flag & (AO_PASS_MODIFIED | BACKGROUND_AO_MODIFIED)) {
+ if (flag & AO_PASS_MODIFIED) {
/* tag only the ao_bounces socket as modified so we avoid updating sample_pattern_lut
* unnecessarily */
tag_ao_bounces_modified();
}
- if ((flag & LIGHT_SAMPLES_MODIFIED) && (method == BRANCHED_PATH)) {
- /* the number of light samples may affect the size of the sample_pattern_lut */
- tag_sampling_pattern_modified();
- }
-
if (filter_glossy_is_modified()) {
foreach (Shader *shader, scene->shaders) {
if (shader->has_integrator_dependency) {
@@ -321,4 +268,65 @@ void Integrator::tag_update(Scene *scene, uint32_t flag)
}
}
+AdaptiveSampling Integrator::get_adaptive_sampling() const
+{
+ AdaptiveSampling adaptive_sampling;
+
+ adaptive_sampling.use = use_adaptive_sampling;
+
+ if (!adaptive_sampling.use) {
+ return adaptive_sampling;
+ }
+
+ if (aa_samples > 0 && adaptive_threshold == 0.0f) {
+ adaptive_sampling.threshold = max(0.001f, 1.0f / (float)aa_samples);
+ VLOG(1) << "Cycles adaptive sampling: automatic threshold = " << adaptive_sampling.threshold;
+ }
+ else {
+ adaptive_sampling.threshold = adaptive_threshold;
+ }
+
+ if (adaptive_sampling.threshold > 0 && adaptive_min_samples == 0) {
+ /* Threshold 0.1 -> 32, 0.01 -> 64, 0.001 -> 128.
+ * This is highly scene dependent; we make a guess that seemed to work well
+ * in various test scenes. */
+ const int min_samples = (int)ceilf(16.0f / powf(adaptive_sampling.threshold, 0.3f));
+ adaptive_sampling.min_samples = max(4, min_samples);
+ VLOG(1) << "Cycles adaptive sampling: automatic min samples = "
+ << adaptive_sampling.min_samples;
+ }
+ else {
+ adaptive_sampling.min_samples = max(4, adaptive_min_samples);
+ }
+
+ /* Arbitrary factor that makes the threshold more similar to what it was before,
+ * and gives arguably more intuitive values. */
+ adaptive_sampling.threshold *= 5.0f;
+
+ adaptive_sampling.adaptive_step = 16;
+
+ DCHECK(is_power_of_two(adaptive_sampling.adaptive_step))
+ << "Adaptive step must be a power of two for bitwise operations to work";
+
+ return adaptive_sampling;
+}
+
+DenoiseParams Integrator::get_denoise_params() const
+{
+ DenoiseParams denoise_params;
+
+ denoise_params.use = use_denoise;
+
+ denoise_params.type = denoiser_type;
+
+ denoise_params.start_sample = denoise_start_sample;
+
+ denoise_params.use_pass_albedo = use_denoise_pass_albedo;
+ denoise_params.use_pass_normal = use_denoise_pass_normal;
+
+ denoise_params.prefilter = denoiser_prefilter;
+
+ return denoise_params;
+}
+
CCL_NAMESPACE_END
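
The automatic defaults in `get_adaptive_sampling()` are easy to sanity-check standalone. The sketch below reproduces just the threshold and min-samples formulas from above; the scene values are made up for illustration.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main()
    {
      /* Hypothetical scene: 128 AA samples, no explicit threshold or min samples. */
      const int aa_samples = 128;

      float threshold = std::max(0.001f, 1.0f / float(aa_samples)); /* 0.0078125 */

      /* Threshold 0.1 -> 32, 0.01 -> 64, 0.001 -> 128; here ceil(16 / 0.0078^0.3) = 69. */
      const int min_samples = std::max(4, (int)std::ceil(16.0f / std::pow(threshold, 0.3f)));

      threshold *= 5.0f; /* Arbitrary factor, as in the patch above. */

      std::printf("threshold = %f, min samples = %d\n", threshold, min_samples);
      return 0;
    }
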
diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h
index 4eeeda92d41..32e108d62ca 100644
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -19,7 +19,9 @@
#include "kernel/kernel_types.h"
+#include "device/device_denoise.h" /* For the paramaters and type enum. */
#include "graph/node.h"
+#include "integrator/adaptive_sampling.h"
CCL_NAMESPACE_BEGIN
@@ -43,6 +45,8 @@ class Integrator : public Node {
NODE_SOCKET_API(int, transparent_max_bounce)
NODE_SOCKET_API(int, ao_bounces)
+ NODE_SOCKET_API(float, ao_factor)
+ NODE_SOCKET_API(float, ao_distance)
NODE_SOCKET_API(int, volume_max_steps)
NODE_SOCKET_API(float, volume_step_rate)
@@ -62,37 +66,26 @@ class Integrator : public Node {
static const int MAX_SAMPLES = (1 << 24);
NODE_SOCKET_API(int, aa_samples)
- NODE_SOCKET_API(int, diffuse_samples)
- NODE_SOCKET_API(int, glossy_samples)
- NODE_SOCKET_API(int, transmission_samples)
- NODE_SOCKET_API(int, ao_samples)
- NODE_SOCKET_API(int, mesh_light_samples)
- NODE_SOCKET_API(int, subsurface_samples)
- NODE_SOCKET_API(int, volume_samples)
NODE_SOCKET_API(int, start_sample)
- NODE_SOCKET_API(bool, sample_all_lights_direct)
- NODE_SOCKET_API(bool, sample_all_lights_indirect)
NODE_SOCKET_API(float, light_sampling_threshold)
+ NODE_SOCKET_API(bool, use_adaptive_sampling)
NODE_SOCKET_API(int, adaptive_min_samples)
NODE_SOCKET_API(float, adaptive_threshold)
- enum Method {
- BRANCHED_PATH = 0,
- PATH = 1,
-
- NUM_METHODS,
- };
-
- NODE_SOCKET_API(Method, method)
-
NODE_SOCKET_API(SamplingPattern, sampling_pattern)
+ NODE_SOCKET_API(bool, use_denoise);
+ NODE_SOCKET_API(DenoiserType, denoiser_type);
+ NODE_SOCKET_API(int, denoise_start_sample);
+ NODE_SOCKET_API(bool, use_denoise_pass_albedo);
+ NODE_SOCKET_API(bool, use_denoise_pass_normal);
+ NODE_SOCKET_API(DenoiserPrefilter, denoiser_prefilter);
+
enum : uint32_t {
AO_PASS_MODIFIED = (1 << 0),
- BACKGROUND_AO_MODIFIED = (1 << 1),
- LIGHT_SAMPLES_MODIFIED = (1 << 2),
+ OBJECT_MANAGER = (1 << 1),
/* tag everything in the manager for an update */
UPDATE_ALL = ~0u,
@@ -107,6 +100,9 @@ class Integrator : public Node {
void device_free(Device *device, DeviceScene *dscene, bool force_free = false);
void tag_update(Scene *scene, uint32_t flag);
+
+ AdaptiveSampling get_adaptive_sampling() const;
+ DenoiseParams get_denoise_params() const;
};
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/jitter.cpp b/intern/cycles/render/jitter.cpp
index fc47b0e8f0a..e31f8abd446 100644
--- a/intern/cycles/render/jitter.cpp
+++ b/intern/cycles/render/jitter.cpp
@@ -242,12 +242,6 @@ class PMJ02_Generator : public PMJ_Generator {
static void shuffle(float2 points[], int size, int rng_seed)
{
- /* Offset samples by 1.0 for faster scrambling in kernel_random.h */
- for (int i = 0; i < size; ++i) {
- points[i].x += 1.0f;
- points[i].y += 1.0f;
- }
-
if (rng_seed == 0) {
return;
}
diff --git a/intern/cycles/render/light.cpp b/intern/cycles/render/light.cpp
index 15aa4e047b5..ae1150fc07b 100644
--- a/intern/cycles/render/light.cpp
+++ b/intern/cycles/render/light.cpp
@@ -14,12 +14,13 @@
* limitations under the License.
*/
-#include "render/light.h"
#include "device/device.h"
+
#include "render/background.h"
#include "render/film.h"
#include "render/graph.h"
#include "render/integrator.h"
+#include "render/light.h"
#include "render/mesh.h"
#include "render/nodes.h"
#include "render/object.h"
@@ -27,6 +28,8 @@
#include "render/shader.h"
#include "render/stats.h"
+#include "integrator/shader_eval.h"
+
#include "util/util_foreach.h"
#include "util/util_hash.h"
#include "util/util_logging.h"
@@ -43,63 +46,49 @@ static void shade_background_pixels(Device *device,
vector<float3> &pixels,
Progress &progress)
{
- /* create input */
- device_vector<uint4> d_input(device, "background_input", MEM_READ_ONLY);
- device_vector<float4> d_output(device, "background_output", MEM_READ_WRITE);
-
- uint4 *d_input_data = d_input.alloc(width * height);
-
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x++) {
- float u = (x + 0.5f) / width;
- float v = (y + 0.5f) / height;
-
- uint4 in = make_uint4(__float_as_int(u), __float_as_int(v), 0, 0);
- d_input_data[x + y * width] = in;
- }
- }
-
- /* compute on device */
- d_output.alloc(width * height);
- d_output.zero_to_device();
- d_input.copy_to_device();
-
+ /* Needs to be up to date for attribute access. */
device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
- DeviceTask main_task(DeviceTask::SHADER);
- main_task.shader_input = d_input.device_pointer;
- main_task.shader_output = d_output.device_pointer;
- main_task.shader_eval_type = SHADER_EVAL_BACKGROUND;
- main_task.shader_x = 0;
- main_task.shader_w = width * height;
- main_task.num_samples = 1;
- main_task.get_cancel = function_bind(&Progress::get_cancel, &progress);
-
- /* disabled splitting for now, there's an issue with multi-GPU mem_copy_from */
- list<DeviceTask> split_tasks;
- main_task.split(split_tasks, 1, 128 * 128);
-
- foreach (DeviceTask &task, split_tasks) {
- device->task_add(task);
- device->task_wait();
- d_output.copy_from_device(task.shader_x, 1, task.shader_w);
- }
-
- d_input.free();
-
- float4 *d_output_data = d_output.data();
-
- pixels.resize(width * height);
-
- for (int y = 0; y < height; y++) {
- for (int x = 0; x < width; x++) {
- pixels[y * width + x].x = d_output_data[y * width + x].x;
- pixels[y * width + x].y = d_output_data[y * width + x].y;
- pixels[y * width + x].z = d_output_data[y * width + x].z;
- }
- }
+ const int size = width * height;
+ pixels.resize(size);
+
+ /* Evaluate shader on device. */
+ ShaderEval shader_eval(device, progress);
+ shader_eval.eval(
+ SHADER_EVAL_BACKGROUND,
+ size,
+ [&](device_vector<KernelShaderEvalInput> &d_input) {
+ /* Fill coordinates for shading. */
+ KernelShaderEvalInput *d_input_data = d_input.data();
+
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ float u = (x + 0.5f) / width;
+ float v = (y + 0.5f) / height;
+
+ KernelShaderEvalInput in;
+ in.object = OBJECT_NONE;
+ in.prim = PRIM_NONE;
+ in.u = u;
+ in.v = v;
+ d_input_data[x + y * width] = in;
+ }
+ }
- d_output.free();
+ return size;
+ },
+ [&](device_vector<float4> &d_output) {
+ /* Copy output to pixel buffer. */
+ float4 *d_output_data = d_output.data();
+
+ for (int y = 0; y < height; y++) {
+ for (int x = 0; x < width; x++) {
+ pixels[y * width + x].x = d_output_data[y * width + x].x;
+ pixels[y * width + x].y = d_output_data[y * width + x].y;
+ pixels[y * width + x].z = d_output_data[y * width + x].z;
+ }
+ }
+ });
}
/* Light */
@@ -140,15 +129,16 @@ NODE_DEFINE(Light)
SOCKET_BOOLEAN(cast_shadow, "Cast Shadow", true);
SOCKET_BOOLEAN(use_mis, "Use Mis", false);
+ SOCKET_BOOLEAN(use_camera, "Use Camera", true);
SOCKET_BOOLEAN(use_diffuse, "Use Diffuse", true);
SOCKET_BOOLEAN(use_glossy, "Use Glossy", true);
SOCKET_BOOLEAN(use_transmission, "Use Transmission", true);
SOCKET_BOOLEAN(use_scatter, "Use Scatter", true);
- SOCKET_INT(samples, "Samples", 1);
SOCKET_INT(max_bounces, "Max Bounces", 1024);
SOCKET_UINT(random_id, "Random ID", 0);
+ SOCKET_BOOLEAN(is_shadow_catcher, "Shadow Catcher", true);
SOCKET_BOOLEAN(is_portal, "Is Portal", false);
SOCKET_BOOLEAN(is_enabled, "Is Enabled", true);
@@ -166,10 +156,6 @@ void Light::tag_update(Scene *scene)
{
if (is_modified()) {
scene->light_manager->tag_update(scene, LightManager::LIGHT_MODIFIED);
-
- if (samples_is_modified()) {
- scene->integrator->tag_update(scene, Integrator::LIGHT_SAMPLES_MODIFIED);
- }
}
}
@@ -193,7 +179,6 @@ LightManager::LightManager()
{
update_flags = UPDATE_ALL;
need_update_background = true;
- use_light_visibility = false;
last_background_enabled = false;
last_background_resolution = 0;
}
@@ -357,21 +342,23 @@ void LightManager::device_update_distribution(Device *,
int object_id = j;
int shader_flag = 0;
+ if (!(object->get_visibility() & PATH_RAY_CAMERA)) {
+ shader_flag |= SHADER_EXCLUDE_CAMERA;
+ }
if (!(object->get_visibility() & PATH_RAY_DIFFUSE)) {
shader_flag |= SHADER_EXCLUDE_DIFFUSE;
- use_light_visibility = true;
}
if (!(object->get_visibility() & PATH_RAY_GLOSSY)) {
shader_flag |= SHADER_EXCLUDE_GLOSSY;
- use_light_visibility = true;
}
if (!(object->get_visibility() & PATH_RAY_TRANSMIT)) {
shader_flag |= SHADER_EXCLUDE_TRANSMIT;
- use_light_visibility = true;
}
if (!(object->get_visibility() & PATH_RAY_VOLUME_SCATTER)) {
shader_flag |= SHADER_EXCLUDE_SCATTER;
- use_light_visibility = true;
+ }
+ if (!(object->get_is_shadow_catcher())) {
+ shader_flag |= SHADER_EXCLUDE_SHADOW_CATCHER;
}
size_t mesh_num_triangles = mesh->num_triangles();
@@ -496,10 +483,10 @@ void LightManager::device_update_distribution(Device *,
kfilm->pass_shadow_scale = 1.0f;
if (kintegrator->pdf_triangles != 0.0f)
- kfilm->pass_shadow_scale *= 0.5f;
+ kfilm->pass_shadow_scale *= 0.5f;
if (num_background_lights < num_lights)
- kfilm->pass_shadow_scale *= (float)(num_lights - num_background_lights) / (float)num_lights;
+ kfilm->pass_shadow_scale *= (float)(num_lights - num_background_lights) / (float)num_lights;
/* CDF */
dscene->light_distribution.copy_to_device();
@@ -766,25 +753,26 @@ void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *sc
if (!light->cast_shadow)
shader_id &= ~SHADER_CAST_SHADOW;
+ if (!light->use_camera) {
+ shader_id |= SHADER_EXCLUDE_CAMERA;
+ }
if (!light->use_diffuse) {
shader_id |= SHADER_EXCLUDE_DIFFUSE;
- use_light_visibility = true;
}
if (!light->use_glossy) {
shader_id |= SHADER_EXCLUDE_GLOSSY;
- use_light_visibility = true;
}
if (!light->use_transmission) {
shader_id |= SHADER_EXCLUDE_TRANSMIT;
- use_light_visibility = true;
}
if (!light->use_scatter) {
shader_id |= SHADER_EXCLUDE_SCATTER;
- use_light_visibility = true;
+ }
+ if (!light->is_shadow_catcher) {
+ shader_id |= SHADER_EXCLUDE_SHADOW_CATCHER;
}
klights[light_index].type = light->light_type;
- klights[light_index].samples = light->samples;
klights[light_index].strength[0] = light->strength.x;
klights[light_index].strength[1] = light->strength.y;
klights[light_index].strength[2] = light->strength.z;
@@ -836,19 +824,15 @@ void LightManager::device_update_points(Device *, DeviceScene *dscene, Scene *sc
if (!(visibility & PATH_RAY_DIFFUSE)) {
shader_id |= SHADER_EXCLUDE_DIFFUSE;
- use_light_visibility = true;
}
if (!(visibility & PATH_RAY_GLOSSY)) {
shader_id |= SHADER_EXCLUDE_GLOSSY;
- use_light_visibility = true;
}
if (!(visibility & PATH_RAY_TRANSMIT)) {
shader_id |= SHADER_EXCLUDE_TRANSMIT;
- use_light_visibility = true;
}
if (!(visibility & PATH_RAY_VOLUME_SCATTER)) {
shader_id |= SHADER_EXCLUDE_SCATTER;
- use_light_visibility = true;
}
}
else if (light->light_type == LIGHT_AREA) {
@@ -998,8 +982,6 @@ void LightManager::device_update(Device *device,
device_free(device, dscene, need_update_background);
- use_light_visibility = false;
-
device_update_points(device, dscene, scene);
if (progress.get_cancel())
return;
@@ -1018,8 +1000,6 @@ void LightManager::device_update(Device *device,
if (progress.get_cancel())
return;
- scene->film->set_use_light_visibility(use_light_visibility);
-
update_flags = UPDATE_NONE;
need_update_background = false;
}
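
The visibility handling above follows one pattern in both `device_update_distribution()` and `device_update_points()`: every disabled ray-visibility bit, plus the new camera and shadow-catcher toggles, sets a matching SHADER_EXCLUDE_* bit which the kernel later tests to skip the light for such rays. A condensed sketch of that mapping (the helper function is hypothetical; the patch inlines this logic at each call site):

    /* `visibility` is the object's PATH_RAY_* visibility mask. */
    static uint shader_exclude_flags(uint visibility, bool is_shadow_catcher)
    {
      uint flags = 0;
      if (!(visibility & PATH_RAY_CAMERA))
        flags |= SHADER_EXCLUDE_CAMERA;
      if (!(visibility & PATH_RAY_DIFFUSE))
        flags |= SHADER_EXCLUDE_DIFFUSE;
      if (!(visibility & PATH_RAY_GLOSSY))
        flags |= SHADER_EXCLUDE_GLOSSY;
      if (!(visibility & PATH_RAY_TRANSMIT))
        flags |= SHADER_EXCLUDE_TRANSMIT;
      if (!(visibility & PATH_RAY_VOLUME_SCATTER))
        flags |= SHADER_EXCLUDE_SCATTER;
      if (!is_shadow_catcher)
        flags |= SHADER_EXCLUDE_SHADOW_CATCHER;
      return flags;
    }
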
diff --git a/intern/cycles/render/light.h b/intern/cycles/render/light.h
index fbd709125ff..7f86237c8b3 100644
--- a/intern/cycles/render/light.h
+++ b/intern/cycles/render/light.h
@@ -69,16 +69,17 @@ class Light : public Node {
NODE_SOCKET_API(bool, cast_shadow)
NODE_SOCKET_API(bool, use_mis)
+ NODE_SOCKET_API(bool, use_camera)
NODE_SOCKET_API(bool, use_diffuse)
NODE_SOCKET_API(bool, use_glossy)
NODE_SOCKET_API(bool, use_transmission)
NODE_SOCKET_API(bool, use_scatter)
+ NODE_SOCKET_API(bool, is_shadow_catcher)
NODE_SOCKET_API(bool, is_portal)
NODE_SOCKET_API(bool, is_enabled)
NODE_SOCKET_API(Shader *, shader)
- NODE_SOCKET_API(int, samples)
NODE_SOCKET_API(int, max_bounces)
NODE_SOCKET_API(uint, random_id)
@@ -108,8 +109,6 @@ class LightManager {
UPDATE_NONE = 0u,
};
- bool use_light_visibility;
-
/* Need to update background (including multiple importance map) */
bool need_update_background;
diff --git a/intern/cycles/render/mesh_displace.cpp b/intern/cycles/render/mesh_displace.cpp
index b39d81023d9..c00c4c24211 100644
--- a/intern/cycles/render/mesh_displace.cpp
+++ b/intern/cycles/render/mesh_displace.cpp
@@ -16,6 +16,8 @@
#include "device/device.h"
+#include "integrator/shader_eval.h"
+
#include "render/mesh.h"
#include "render/object.h"
#include "render/scene.h"
@@ -43,40 +45,28 @@ static float3 compute_face_normal(const Mesh::Triangle &t, float3 *verts)
return norm / normlen;
}
-bool GeometryManager::displace(
- Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress)
+/* Fill in coordinates for mesh displacement shader evaluation on device. */
+static int fill_shader_input(const Scene *scene,
+ const Mesh *mesh,
+ const int object_index,
+ device_vector<KernelShaderEvalInput> &d_input)
{
- /* verify if we have a displacement shader */
- if (!mesh->has_true_displacement()) {
- return false;
- }
-
- string msg = string_printf("Computing Displacement %s", mesh->name.c_str());
- progress.set_status("Updating Mesh", msg);
+ int d_input_size = 0;
+ KernelShaderEvalInput *d_input_data = d_input.data();
- /* find object index. todo: is arbitrary */
- size_t object_index = OBJECT_NONE;
+ const array<int> &mesh_shaders = mesh->get_shader();
+ const array<Node *> &mesh_used_shaders = mesh->get_used_shaders();
+ const array<float3> &mesh_verts = mesh->get_verts();
- for (size_t i = 0; i < scene->objects.size(); i++) {
- if (scene->objects[i]->get_geometry() == mesh) {
- object_index = i;
- break;
- }
- }
-
- /* setup input for device task */
- const size_t num_verts = mesh->verts.size();
+ const int num_verts = mesh_verts.size();
vector<bool> done(num_verts, false);
- device_vector<uint4> d_input(device, "displace_input", MEM_READ_ONLY);
- uint4 *d_input_data = d_input.alloc(num_verts);
- size_t d_input_size = 0;
- size_t num_triangles = mesh->num_triangles();
- for (size_t i = 0; i < num_triangles; i++) {
+ int num_triangles = mesh->num_triangles();
+ for (int i = 0; i < num_triangles; i++) {
Mesh::Triangle t = mesh->get_triangle(i);
- int shader_index = mesh->shader[i];
- Shader *shader = (shader_index < mesh->used_shaders.size()) ?
- static_cast<Shader *>(mesh->used_shaders[shader_index]) :
+ int shader_index = mesh_shaders[i];
+ Shader *shader = (shader_index < mesh_used_shaders.size()) ?
+ static_cast<Shader *>(mesh_used_shaders[shader_index]) :
scene->default_surface;
if (!shader->has_displacement || shader->get_displacement_method() == DISPLACE_BUMP) {
@@ -110,57 +100,41 @@ bool GeometryManager::displace(
}
/* back */
- uint4 in = make_uint4(object, prim, __float_as_int(u), __float_as_int(v));
+ KernelShaderEvalInput in;
+ in.object = object;
+ in.prim = prim;
+ in.u = u;
+ in.v = v;
d_input_data[d_input_size++] = in;
}
}
- if (d_input_size == 0)
- return false;
-
- /* run device task */
- device_vector<float4> d_output(device, "displace_output", MEM_READ_WRITE);
- d_output.alloc(d_input_size);
- d_output.zero_to_device();
- d_input.copy_to_device();
-
- /* needs to be up to data for attribute access */
- device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
-
- DeviceTask task(DeviceTask::SHADER);
- task.shader_input = d_input.device_pointer;
- task.shader_output = d_output.device_pointer;
- task.shader_eval_type = SHADER_EVAL_DISPLACE;
- task.shader_x = 0;
- task.shader_w = d_output.size();
- task.num_samples = 1;
- task.get_cancel = function_bind(&Progress::get_cancel, &progress);
-
- device->task_add(task);
- device->task_wait();
-
- if (progress.get_cancel()) {
- d_input.free();
- d_output.free();
- return false;
- }
+ return d_input_size;
+}
- d_output.copy_from_device(0, 1, d_output.size());
- d_input.free();
+/* Read back mesh displacement shader output. */
+static void read_shader_output(const Scene *scene,
+ Mesh *mesh,
+ const device_vector<float4> &d_output)
+{
+ const array<int> &mesh_shaders = mesh->get_shader();
+ const array<Node *> &mesh_used_shaders = mesh->get_used_shaders();
+ array<float3> &mesh_verts = mesh->get_verts();
- /* read result */
- done.clear();
- done.resize(num_verts, false);
- int k = 0;
+ const int num_verts = mesh_verts.size();
+ const int num_motion_steps = mesh->get_motion_steps();
+ vector<bool> done(num_verts, false);
- float4 *offset = d_output.data();
+ const float4 *d_output_data = d_output.data();
+ int d_output_index = 0;
Attribute *attr_mP = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
- for (size_t i = 0; i < num_triangles; i++) {
+ int num_triangles = mesh->num_triangles();
+ for (int i = 0; i < num_triangles; i++) {
Mesh::Triangle t = mesh->get_triangle(i);
- int shader_index = mesh->shader[i];
- Shader *shader = (shader_index < mesh->used_shaders.size()) ?
- static_cast<Shader *>(mesh->used_shaders[shader_index]) :
+ int shader_index = mesh_shaders[i];
+ Shader *shader = (shader_index < mesh_used_shaders.size()) ?
+ static_cast<Shader *>(mesh_used_shaders[shader_index]) :
scene->default_surface;
if (!shader->has_displacement || shader->get_displacement_method() == DISPLACE_BUMP) {
@@ -170,12 +144,12 @@ bool GeometryManager::displace(
for (int j = 0; j < 3; j++) {
if (!done[t.v[j]]) {
done[t.v[j]] = true;
- float3 off = float4_to_float3(offset[k++]);
+ float3 off = float4_to_float3(d_output_data[d_output_index++]);
/* Avoid illegal vertex coordinates. */
off = ensure_finite3(off);
- mesh->verts[t.v[j]] += off;
+ mesh_verts[t.v[j]] += off;
if (attr_mP != NULL) {
- for (int step = 0; step < mesh->motion_steps - 1; step++) {
+ for (int step = 0; step < num_motion_steps - 1; step++) {
float3 *mP = attr_mP->data_float3() + step * num_verts;
mP[t.v[j]] += off;
}
@@ -183,8 +157,47 @@ bool GeometryManager::displace(
}
}
}
+}
- d_output.free();
+bool GeometryManager::displace(
+ Device *device, DeviceScene *dscene, Scene *scene, Mesh *mesh, Progress &progress)
+{
+ /* verify if we have a displacement shader */
+ if (!mesh->has_true_displacement()) {
+ return false;
+ }
+
+ const size_t num_verts = mesh->verts.size();
+ const size_t num_triangles = mesh->num_triangles();
+
+ if (num_triangles == 0) {
+ return false;
+ }
+
+ string msg = string_printf("Computing Displacement %s", mesh->name.c_str());
+ progress.set_status("Updating Mesh", msg);
+
+ /* Find object index. TODO: this is arbitrary. */
+ size_t object_index = OBJECT_NONE;
+
+ for (size_t i = 0; i < scene->objects.size(); i++) {
+ if (scene->objects[i]->get_geometry() == mesh) {
+ object_index = i;
+ break;
+ }
+ }
+
+ /* Needs to be up to date for attribute access. */
+ device->const_copy_to("__data", &dscene->data, sizeof(dscene->data));
+
+ /* Evaluate shader on device. */
+ ShaderEval shader_eval(device, progress);
+ if (!shader_eval.eval(SHADER_EVAL_DISPLACE,
+ num_verts,
+ function_bind(&fill_shader_input, scene, mesh, object_index, _1),
+ function_bind(&read_shader_output, scene, mesh, _1))) {
+ return false;
+ }
/* stitch */
unordered_set<int> stitch_keys;
@@ -297,8 +310,7 @@ bool GeometryManager::displace(
}
/* normalize vertex normals */
- done.clear();
- done.resize(num_verts, false);
+ vector<bool> done(num_verts, false);
for (size_t i = 0; i < num_triangles; i++) {
if (tri_has_true_disp[i]) {
@@ -368,8 +380,7 @@ bool GeometryManager::displace(
}
/* normalize vertex normals */
- done.clear();
- done.resize(num_verts, false);
+ vector<bool> done(num_verts, false);
for (size_t i = 0; i < num_triangles; i++) {
if (tri_has_true_disp[i]) {
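
Both this file and light.cpp above now funnel device-side shader evaluation through the same ShaderEval interface: one callback fills the `KernelShaderEvalInput` array and returns the number of inputs, a second one consumes the `float4` output. Below is a condensed sketch of that calling convention; the surrounding variables (`num_points`, `prims`, `us`, `vs`, `results`) are hypothetical stand-ins for the call-site data.

    ShaderEval shader_eval(device, progress);
    shader_eval.eval(
        SHADER_EVAL_DISPLACE,
        num_points,
        [&](device_vector<KernelShaderEvalInput> &d_input) {
          /* Fill evaluation coordinates. */
          KernelShaderEvalInput *d_input_data = d_input.data();
          for (int i = 0; i < num_points; i++) {
            d_input_data[i].object = object_index;
            d_input_data[i].prim = prims[i];
            d_input_data[i].u = us[i];
            d_input_data[i].v = vs[i];
          }
          return num_points; /* Number of inputs actually filled. */
        },
        [&](device_vector<float4> &d_output) {
          /* Read back evaluation results. */
          const float4 *d_output_data = d_output.data();
          for (int i = 0; i < num_points; i++) {
            results[i] = float4_to_float3(d_output_data[i]);
          }
        });
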
diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp
index 795166bcf4c..5303d55242e 100644
--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -2736,18 +2736,21 @@ NODE_DEFINE(PrincipledBsdfNode)
distribution, "Distribution", distribution_enum, CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID);
static NodeEnum subsurface_method_enum;
- subsurface_method_enum.insert("burley", CLOSURE_BSSRDF_PRINCIPLED_ID);
- subsurface_method_enum.insert("random_walk", CLOSURE_BSSRDF_PRINCIPLED_RANDOM_WALK_ID);
+ subsurface_method_enum.insert("random_walk_fixed_radius",
+ CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
+ subsurface_method_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID);
SOCKET_ENUM(subsurface_method,
"Subsurface Method",
subsurface_method_enum,
- CLOSURE_BSSRDF_PRINCIPLED_ID);
+ CLOSURE_BSSRDF_RANDOM_WALK_ID);
SOCKET_IN_COLOR(base_color, "Base Color", make_float3(0.8f, 0.8f, 0.8f));
SOCKET_IN_COLOR(subsurface_color, "Subsurface Color", make_float3(0.8f, 0.8f, 0.8f));
SOCKET_IN_FLOAT(metallic, "Metallic", 0.0f);
SOCKET_IN_FLOAT(subsurface, "Subsurface", 0.0f);
SOCKET_IN_VECTOR(subsurface_radius, "Subsurface Radius", make_float3(0.1f, 0.1f, 0.1f));
+ SOCKET_IN_FLOAT(subsurface_ior, "Subsurface IOR", 1.4f);
+ SOCKET_IN_FLOAT(subsurface_anisotropy, "Subsurface Anisotropy", 0.0f);
SOCKET_IN_FLOAT(specular, "Specular", 0.0f);
SOCKET_IN_FLOAT(roughness, "Roughness", 0.5f);
SOCKET_IN_FLOAT(specular_tint, "Specular Tint", 0.0f);
@@ -2857,6 +2860,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler,
ShaderInput *p_metallic,
ShaderInput *p_subsurface,
ShaderInput *p_subsurface_radius,
+ ShaderInput *p_subsurface_ior,
+ ShaderInput *p_subsurface_anisotropy,
ShaderInput *p_specular,
ShaderInput *p_roughness,
ShaderInput *p_specular_tint,
@@ -2896,6 +2901,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler,
int transmission_roughness_offset = compiler.stack_assign(p_transmission_roughness);
int anisotropic_rotation_offset = compiler.stack_assign(p_anisotropic_rotation);
int subsurface_radius_offset = compiler.stack_assign(p_subsurface_radius);
+ int subsurface_ior_offset = compiler.stack_assign(p_subsurface_ior);
+ int subsurface_anisotropy_offset = compiler.stack_assign(p_subsurface_anisotropy);
compiler.add_node(NODE_CLOSURE_BSDF,
compiler.encode_uchar4(closure,
@@ -2929,8 +2936,10 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler,
__float_as_int(bc_default.y),
__float_as_int(bc_default.z));
- compiler.add_node(
- clearcoat_normal_offset, subsurface_radius_offset, SVM_STACK_INVALID, SVM_STACK_INVALID);
+ compiler.add_node(clearcoat_normal_offset,
+ subsurface_radius_offset,
+ subsurface_ior_offset,
+ subsurface_anisotropy_offset);
float3 ss_default = get_float3(subsurface_color_in->socket_type);
@@ -2953,6 +2962,8 @@ void PrincipledBsdfNode::compile(SVMCompiler &compiler)
input("Metallic"),
input("Subsurface"),
input("Subsurface Radius"),
+ input("Subsurface IOR"),
+ input("Subsurface Anisotropy"),
input("Specular"),
input("Roughness"),
input("Specular Tint"),
@@ -3048,16 +3059,16 @@ NODE_DEFINE(SubsurfaceScatteringNode)
SOCKET_IN_NORMAL(normal, "Normal", zero_float3(), SocketType::LINK_NORMAL);
SOCKET_IN_FLOAT(surface_mix_weight, "SurfaceMixWeight", 0.0f, SocketType::SVM_INTERNAL);
- static NodeEnum falloff_enum;
- falloff_enum.insert("cubic", CLOSURE_BSSRDF_CUBIC_ID);
- falloff_enum.insert("gaussian", CLOSURE_BSSRDF_GAUSSIAN_ID);
- falloff_enum.insert("burley", CLOSURE_BSSRDF_BURLEY_ID);
- falloff_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID);
- SOCKET_ENUM(falloff, "Falloff", falloff_enum, CLOSURE_BSSRDF_BURLEY_ID);
+ static NodeEnum method_enum;
+ method_enum.insert("random_walk_fixed_radius", CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID);
+ method_enum.insert("random_walk", CLOSURE_BSSRDF_RANDOM_WALK_ID);
+ SOCKET_ENUM(method, "Method", method_enum, CLOSURE_BSSRDF_RANDOM_WALK_ID);
+
SOCKET_IN_FLOAT(scale, "Scale", 0.01f);
SOCKET_IN_VECTOR(radius, "Radius", make_float3(0.1f, 0.1f, 0.1f));
- SOCKET_IN_FLOAT(sharpness, "Sharpness", 0.0f);
- SOCKET_IN_FLOAT(texture_blur, "Texture Blur", 1.0f);
+
+ SOCKET_IN_FLOAT(subsurface_ior, "IOR", 1.4f);
+ SOCKET_IN_FLOAT(subsurface_anisotropy, "Anisotropy", 0.0f);
SOCKET_OUT_CLOSURE(BSSRDF, "BSSRDF");
@@ -3066,20 +3077,19 @@ NODE_DEFINE(SubsurfaceScatteringNode)
SubsurfaceScatteringNode::SubsurfaceScatteringNode() : BsdfNode(get_node_type())
{
- closure = falloff;
+ closure = method;
}
void SubsurfaceScatteringNode::compile(SVMCompiler &compiler)
{
- closure = falloff;
- BsdfNode::compile(
- compiler, input("Scale"), input("Texture Blur"), input("Radius"), input("Sharpness"));
+ closure = method;
+ BsdfNode::compile(compiler, input("Scale"), input("IOR"), input("Radius"), input("Anisotropy"));
}
void SubsurfaceScatteringNode::compile(OSLCompiler &compiler)
{
- closure = falloff;
- compiler.parameter(this, "falloff");
+ closure = method;
+ compiler.parameter(this, "method");
compiler.add(this, "node_subsurface_scattering");
}
@@ -3786,20 +3796,6 @@ void GeometryNode::compile(OSLCompiler &compiler)
compiler.add(this, "node_geometry");
}
-int GeometryNode::get_group()
-{
- ShaderOutput *out;
- int result = ShaderNode::get_group();
-
- /* Backfacing uses NODE_LIGHT_PATH */
- out = output("Backfacing");
- if (!out->links.empty()) {
- result = max(result, NODE_GROUP_LEVEL_1);
- }
-
- return result;
-}
-
/* TextureCoordinate */
NODE_DEFINE(TextureCoordinateNode)
@@ -5926,33 +5922,33 @@ NODE_DEFINE(OutputAOVNode)
OutputAOVNode::OutputAOVNode() : ShaderNode(get_node_type())
{
special_type = SHADER_SPECIAL_TYPE_OUTPUT_AOV;
- slot = -1;
+ offset = -1;
}
void OutputAOVNode::simplify_settings(Scene *scene)
{
- slot = scene->film->get_aov_offset(scene, name.string(), is_color);
- if (slot == -1) {
- slot = scene->film->get_aov_offset(scene, name.string(), is_color);
+ offset = scene->film->get_aov_offset(scene, name.string(), is_color);
+ if (offset == -1) {
+ offset = scene->film->get_aov_offset(scene, name.string(), is_color);
}
- if (slot == -1 || is_color) {
+ if (offset == -1 || is_color) {
input("Value")->disconnect();
}
- if (slot == -1 || !is_color) {
+ if (offset == -1 || !is_color) {
input("Color")->disconnect();
}
}
void OutputAOVNode::compile(SVMCompiler &compiler)
{
- assert(slot >= 0);
+ assert(offset >= 0);
if (is_color) {
- compiler.add_node(NODE_AOV_COLOR, compiler.stack_assign(input("Color")), slot);
+ compiler.add_node(NODE_AOV_COLOR, compiler.stack_assign(input("Color")), offset);
}
else {
- compiler.add_node(NODE_AOV_VALUE, compiler.stack_assign(input("Value")), slot);
+ compiler.add_node(NODE_AOV_VALUE, compiler.stack_assign(input("Value")), offset);
}
}
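
A recurring theme in these node changes (and in nodes.h below) is that the per-node `get_group()` levels are gone: selective kernel compilation is now driven by OR-ing per-node KERNEL_FEATURE_NODE_* bits reported through `get_feature()`. A sketch of how such a mask would be accumulated over a graph's nodes (the helper is hypothetical):

    /* The feature mask for a set of nodes is just the union of the
     * per-node feature bits exposed via get_feature(). */
    static int required_node_features(const vector<ShaderNode *> &nodes)
    {
      int features = 0;
      for (ShaderNode *node : nodes) {
        features |= node->get_feature(); /* e.g. KERNEL_FEATURE_NODE_BSDF. */
      }
      return features;
    }
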
diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h
index 3013e9b1866..22bdb06b059 100644
--- a/intern/cycles/render/nodes.h
+++ b/intern/cycles/render/nodes.h
@@ -143,10 +143,6 @@ class EnvironmentTextureNode : public ImageSlotTextureNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
virtual bool equals(const ShaderNode &other)
{
@@ -170,11 +166,6 @@ class SkyTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(SkyTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(NodeSkyType, sky_type)
NODE_SOCKET_API(float3, sun_direction)
NODE_SOCKET_API(float, turbidity)
@@ -224,18 +215,13 @@ class OutputAOVNode : public ShaderNode {
NODE_SOCKET_API(ustring, name)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_4;
- }
-
/* Don't allow output node de-duplication. */
virtual bool equals(const ShaderNode & /*other*/)
{
return false;
}
- int slot;
+ int offset;
bool is_color;
};
@@ -243,11 +229,6 @@ class GradientTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(GradientTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(NodeGradientType, gradient_type)
NODE_SOCKET_API(float3, vector)
};
@@ -269,19 +250,14 @@ class VoronoiTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(VoronoiTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
virtual int get_feature()
{
int result = ShaderNode::get_feature();
if (dimensions == 4) {
- result |= NODE_FEATURE_VORONOI_EXTRA;
+ result |= KERNEL_FEATURE_NODE_VORONOI_EXTRA;
}
else if (dimensions >= 2 && feature == NODE_VORONOI_SMOOTH_F1) {
- result |= NODE_FEATURE_VORONOI_EXTRA;
+ result |= KERNEL_FEATURE_NODE_VORONOI_EXTRA;
}
return result;
}
@@ -301,11 +277,6 @@ class MusgraveTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(MusgraveTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(int, dimensions)
NODE_SOCKET_API(NodeMusgraveType, musgrave_type)
NODE_SOCKET_API(float, w)
@@ -322,11 +293,6 @@ class WaveTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(WaveTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(NodeWaveType, wave_type)
NODE_SOCKET_API(NodeWaveBandsDirection, bands_direction)
NODE_SOCKET_API(NodeWaveRingsDirection, rings_direction)
@@ -345,11 +311,6 @@ class MagicTextureNode : public TextureNode {
public:
SHADER_NODE_CLASS(MagicTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
-
NODE_SOCKET_API(int, depth)
NODE_SOCKET_API(float3, vector)
NODE_SOCKET_API(float, scale)
@@ -364,11 +325,6 @@ class CheckerTextureNode : public TextureNode {
NODE_SOCKET_API(float3, color1)
NODE_SOCKET_API(float3, color2)
NODE_SOCKET_API(float, scale)
-
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
};
class BrickTextureNode : public TextureNode {
@@ -390,20 +346,11 @@ class BrickTextureNode : public TextureNode {
NODE_SOCKET_API(float, brick_width)
NODE_SOCKET_API(float, row_height)
NODE_SOCKET_API(float3, vector)
-
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
};
class PointDensityTextureNode : public ShaderNode {
public:
SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_4;
- }
~PointDensityTextureNode();
ShaderNode *clone(ShaderGraph *graph) const;
@@ -443,10 +390,6 @@ class IESLightNode : public TextureNode {
~IESLightNode();
ShaderNode *clone(ShaderGraph *graph) const;
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
NODE_SOCKET_API(ustring, filename)
NODE_SOCKET_API(ustring, ies)
@@ -464,10 +407,6 @@ class IESLightNode : public TextureNode {
class WhiteNoiseTextureNode : public ShaderNode {
public:
SHADER_NODE_CLASS(WhiteNoiseTextureNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
NODE_SOCKET_API(int, dimensions)
NODE_SOCKET_API(float3, vector)
@@ -477,10 +416,6 @@ class WhiteNoiseTextureNode : public ShaderNode {
class MappingNode : public ShaderNode {
public:
SHADER_NODE_CLASS(MappingNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
void constant_fold(const ConstantFolder &folder);
NODE_SOCKET_API(float3, vector)
@@ -546,6 +481,11 @@ class BsdfBaseNode : public ShaderNode {
return false;
}
+ virtual int get_feature()
+ {
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_BSDF;
+ }
+
protected:
ClosureType closure;
};
@@ -606,6 +546,8 @@ class PrincipledBsdfNode : public BsdfBaseNode {
ShaderInput *metallic,
ShaderInput *subsurface,
ShaderInput *subsurface_radius,
+ ShaderInput *subsurface_ior,
+ ShaderInput *subsurface_anisotropy,
ShaderInput *specular,
ShaderInput *roughness,
ShaderInput *specular_tint,
@@ -622,6 +564,8 @@ class PrincipledBsdfNode : public BsdfBaseNode {
NODE_SOCKET_API(float3, base_color)
NODE_SOCKET_API(float3, subsurface_color)
NODE_SOCKET_API(float3, subsurface_radius)
+ NODE_SOCKET_API(float, subsurface_ior)
+ NODE_SOCKET_API(float, subsurface_anisotropy)
NODE_SOCKET_API(float, metallic)
NODE_SOCKET_API(float, subsurface)
NODE_SOCKET_API(float, specular)
@@ -758,14 +702,14 @@ class SubsurfaceScatteringNode : public BsdfNode {
bool has_bssrdf_bump();
ClosureType get_closure_type()
{
- return falloff;
+ return method;
}
NODE_SOCKET_API(float, scale)
NODE_SOCKET_API(float3, radius)
- NODE_SOCKET_API(float, sharpness)
- NODE_SOCKET_API(float, texture_blur)
- NODE_SOCKET_API(ClosureType, falloff)
+ NODE_SOCKET_API(float, subsurface_ior)
+ NODE_SOCKET_API(float, subsurface_anisotropy)
+ NODE_SOCKET_API(ClosureType, method)
};
class EmissionNode : public ShaderNode {
@@ -782,6 +726,11 @@ class EmissionNode : public ShaderNode {
return true;
}
+ virtual int get_feature()
+ {
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_EMISSION;
+ }
+
NODE_SOCKET_API(float3, color)
NODE_SOCKET_API(float, strength)
NODE_SOCKET_API(float, surface_mix_weight)
@@ -792,6 +741,11 @@ class BackgroundNode : public ShaderNode {
SHADER_NODE_CLASS(BackgroundNode)
void constant_fold(const ConstantFolder &folder);
+ virtual int get_feature()
+ {
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_EMISSION;
+ }
+
NODE_SOCKET_API(float3, color)
NODE_SOCKET_API(float, strength)
NODE_SOCKET_API(float, surface_mix_weight)
@@ -800,10 +754,6 @@ class BackgroundNode : public ShaderNode {
class HoldoutNode : public ShaderNode {
public:
SHADER_NODE_CLASS(HoldoutNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
virtual ClosureType get_closure_type()
{
return CLOSURE_HOLDOUT_ID;
@@ -821,13 +771,9 @@ class AmbientOcclusionNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
- virtual bool has_raytrace()
+ virtual int get_feature()
{
- return true;
+ return KERNEL_FEATURE_NODE_RAYTRACE;
}
NODE_SOCKET_API(float3, color)
@@ -845,13 +791,9 @@ class VolumeNode : public ShaderNode {
SHADER_NODE_BASE_CLASS(VolumeNode)
void compile(SVMCompiler &compiler, ShaderInput *param1, ShaderInput *param2);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
virtual int get_feature()
{
- return ShaderNode::get_feature() | NODE_FEATURE_VOLUME;
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_VOLUME;
}
virtual ClosureType get_closure_type()
{
@@ -1013,10 +955,6 @@ class UVMapNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(ustring, attribute)
NODE_SOCKET_API(bool, from_dupli)
@@ -1025,10 +963,6 @@ class UVMapNode : public ShaderNode {
class LightPathNode : public ShaderNode {
public:
SHADER_NODE_CLASS(LightPathNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
};
class LightFalloffNode : public ShaderNode {
@@ -1038,10 +972,6 @@ class LightFalloffNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
NODE_SOCKET_API(float, strength)
NODE_SOCKET_API(float, smooth)
@@ -1050,10 +980,6 @@ class LightFalloffNode : public ShaderNode {
class ObjectInfoNode : public ShaderNode {
public:
SHADER_NODE_CLASS(ObjectInfoNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
};
class ParticleInfoNode : public ShaderNode {
@@ -1064,10 +990,6 @@ class ParticleInfoNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
};
class HairInfoNode : public ShaderNode {
@@ -1083,13 +1005,9 @@ class HairInfoNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
virtual int get_feature()
{
- return ShaderNode::get_feature() | NODE_FEATURE_HAIR;
+ return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_HAIR;
}
};
@@ -1168,10 +1086,6 @@ class InvertNode : public ShaderNode {
public:
SHADER_NODE_CLASS(InvertNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, fac)
NODE_SOCKET_API(float3, color)
@@ -1182,11 +1096,6 @@ class MixNode : public ShaderNode {
SHADER_NODE_CLASS(MixNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
-
NODE_SOCKET_API(NodeMix, mix_type)
NODE_SOCKET_API(bool, use_clamp)
NODE_SOCKET_API(float3, color1)
@@ -1198,10 +1107,6 @@ class CombineRGBNode : public ShaderNode {
public:
SHADER_NODE_CLASS(CombineRGBNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, r)
NODE_SOCKET_API(float, g)
@@ -1212,10 +1117,6 @@ class CombineHSVNode : public ShaderNode {
public:
SHADER_NODE_CLASS(CombineHSVNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, h)
NODE_SOCKET_API(float, s)
@@ -1226,10 +1127,6 @@ class CombineXYZNode : public ShaderNode {
public:
SHADER_NODE_CLASS(CombineXYZNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, x)
NODE_SOCKET_API(float, y)
@@ -1240,10 +1137,6 @@ class GammaNode : public ShaderNode {
public:
SHADER_NODE_CLASS(GammaNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(float3, color)
NODE_SOCKET_API(float, gamma)
@@ -1253,10 +1146,6 @@ class BrightContrastNode : public ShaderNode {
public:
SHADER_NODE_CLASS(BrightContrastNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(float3, color)
NODE_SOCKET_API(float, bright)
@@ -1267,10 +1156,6 @@ class SeparateRGBNode : public ShaderNode {
public:
SHADER_NODE_CLASS(SeparateRGBNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float3, color)
};
@@ -1279,10 +1164,6 @@ class SeparateHSVNode : public ShaderNode {
public:
SHADER_NODE_CLASS(SeparateHSVNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float3, color)
};
@@ -1291,10 +1172,6 @@ class SeparateXYZNode : public ShaderNode {
public:
SHADER_NODE_CLASS(SeparateXYZNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float3, vector)
};
@@ -1333,10 +1210,6 @@ class CameraNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
};
class FresnelNode : public ShaderNode {
@@ -1346,10 +1219,6 @@ class FresnelNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(float3, normal)
NODE_SOCKET_API(float, IOR)
@@ -1362,10 +1231,6 @@ class LayerWeightNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API(float3, normal)
NODE_SOCKET_API(float, blend)
@@ -1378,10 +1243,6 @@ class WireframeNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, size)
NODE_SOCKET_API(bool, use_pixel_size)
@@ -1390,10 +1251,6 @@ class WireframeNode : public ShaderNode {
class WavelengthNode : public ShaderNode {
public:
SHADER_NODE_CLASS(WavelengthNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, wavelength)
};
@@ -1402,10 +1259,6 @@ class BlackbodyNode : public ShaderNode {
public:
SHADER_NODE_CLASS(BlackbodyNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, temperature)
};
@@ -1413,10 +1266,6 @@ class BlackbodyNode : public ShaderNode {
class MapRangeNode : public ShaderNode {
public:
SHADER_NODE_CLASS(MapRangeNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
void expand(ShaderGraph *graph);
NODE_SOCKET_API(float, value)
@@ -1433,10 +1282,6 @@ class ClampNode : public ShaderNode {
public:
SHADER_NODE_CLASS(ClampNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(float, value)
NODE_SOCKET_API(float, min)
NODE_SOCKET_API(float, max)
@@ -1446,10 +1291,6 @@ class ClampNode : public ShaderNode {
class MathNode : public ShaderNode {
public:
SHADER_NODE_CLASS(MathNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
void expand(ShaderGraph *graph);
void constant_fold(const ConstantFolder &folder);
@@ -1463,10 +1304,6 @@ class MathNode : public ShaderNode {
class NormalNode : public ShaderNode {
public:
SHADER_NODE_CLASS(NormalNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_2;
- }
NODE_SOCKET_API(float3, direction)
NODE_SOCKET_API(float3, normal)
@@ -1475,10 +1312,6 @@ class NormalNode : public ShaderNode {
class VectorMathNode : public ShaderNode {
public:
SHADER_NODE_CLASS(VectorMathNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
void constant_fold(const ConstantFolder &folder);
NODE_SOCKET_API(float3, vector1)
@@ -1492,10 +1325,6 @@ class VectorRotateNode : public ShaderNode {
public:
SHADER_NODE_CLASS(VectorRotateNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(NodeVectorRotateType, rotate_type)
NODE_SOCKET_API(bool, invert)
NODE_SOCKET_API(float3, vector)
@@ -1509,11 +1338,6 @@ class VectorTransformNode : public ShaderNode {
public:
SHADER_NODE_CLASS(VectorTransformNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
-
NODE_SOCKET_API(NodeVectorTransformType, transform_type)
NODE_SOCKET_API(NodeVectorTransformConvertSpace, convert_from)
NODE_SOCKET_API(NodeVectorTransformConvertSpace, convert_to)
@@ -1530,7 +1354,7 @@ class BumpNode : public ShaderNode {
}
virtual int get_feature()
{
- return NODE_FEATURE_BUMP;
+ return KERNEL_FEATURE_NODE_BUMP;
}
NODE_SOCKET_API(bool, invert)
@@ -1549,11 +1373,6 @@ class CurvesNode : public ShaderNode {
explicit CurvesNode(const NodeType *node_type);
SHADER_NODE_BASE_CLASS(CurvesNode)
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
-
NODE_SOCKET_API_ARRAY(array<float3>, curves)
NODE_SOCKET_API(float, min_x)
NODE_SOCKET_API(float, max_x)
@@ -1583,10 +1402,6 @@ class RGBRampNode : public ShaderNode {
public:
SHADER_NODE_CLASS(RGBRampNode)
void constant_fold(const ConstantFolder &folder);
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_1;
- }
NODE_SOCKET_API_ARRAY(array<float3>, ramp)
NODE_SOCKET_API_ARRAY(array<float>, ramp_alpha)
@@ -1656,10 +1471,6 @@ class NormalMapNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(NodeNormalMapSpace, space)
NODE_SOCKET_API(ustring, attribute)
@@ -1680,10 +1491,6 @@ class TangentNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
NODE_SOCKET_API(NodeTangentDirectionType, direction_type)
NODE_SOCKET_API(NodeTangentAxis, axis)
@@ -1698,13 +1505,9 @@ class BevelNode : public ShaderNode {
{
return true;
}
- virtual int get_group()
- {
- return NODE_GROUP_LEVEL_3;
- }
- virtual bool has_raytrace()
+ virtual int get_feature()
{
- return true;
+ return KERNEL_FEATURE_NODE_RAYTRACE;
}
NODE_SOCKET_API(float, radius)
@@ -1718,7 +1521,7 @@ class DisplacementNode : public ShaderNode {
void constant_fold(const ConstantFolder &folder);
virtual int get_feature()
{
- return NODE_FEATURE_BUMP;
+ return KERNEL_FEATURE_NODE_BUMP;
}
NODE_SOCKET_API(NodeNormalMapSpace, space)
@@ -1739,7 +1542,7 @@ class VectorDisplacementNode : public ShaderNode {
void constant_fold(const ConstantFolder &folder);
virtual int get_feature()
{
- return NODE_FEATURE_BUMP;
+ return KERNEL_FEATURE_NODE_BUMP;
}
NODE_SOCKET_API(NodeNormalMapSpace, space)
diff --git a/intern/cycles/render/object.cpp b/intern/cycles/render/object.cpp
index c88d94fe4c2..4637f8fe989 100644
--- a/intern/cycles/render/object.cpp
+++ b/intern/cycles/render/object.cpp
@@ -216,6 +216,10 @@ void Object::tag_update(Scene *scene)
if (use_holdout_is_modified()) {
flag |= ObjectManager::HOLDOUT_MODIFIED;
}
+
+ if (is_shadow_catcher_is_modified()) {
+ scene->tag_shadow_catcher_modified();
+ }
}
if (geometry) {
@@ -273,14 +277,7 @@ bool Object::is_traceable() const
uint Object::visibility_for_tracing() const
{
- uint trace_visibility = visibility;
- if (is_shadow_catcher) {
- trace_visibility &= ~PATH_RAY_SHADOW_NON_CATCHER;
- }
- else {
- trace_visibility &= ~PATH_RAY_SHADOW_CATCHER;
- }
- return trace_visibility;
+ return SHADOW_CATCHER_OBJECT_VISIBILITY(is_shadow_catcher, visibility & PATH_RAY_ALL_VISIBILITY);
}
float Object::compute_volume_step_size() const
@@ -680,7 +677,7 @@ void ObjectManager::device_update(Device *device,
/* prepare for static BVH building */
/* todo: do before to support getting object level coords? */
- if (scene->params.bvh_type == SceneParams::BVH_STATIC) {
+ if (scene->params.bvh_type == BVH_TYPE_STATIC) {
scoped_callback_timer timer([scene](double time) {
if (scene->update_stats) {
scene->update_stats->object.times.add_entry(
@@ -932,6 +929,11 @@ void ObjectManager::tag_update(Scene *scene, uint32_t flag)
}
scene->light_manager->tag_update(scene, LightManager::OBJECT_MANAGER);
+
+ /* Integrator's shadow catcher settings depend on object visibility settings. */
+ if (flag & (OBJECT_ADDED | OBJECT_REMOVED | OBJECT_MODIFIED)) {
+ scene->integrator->tag_update(scene, Integrator::OBJECT_MANAGER);
+ }
}
bool ObjectManager::need_update() const
diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp
index 7dc79f48145..d28b222c10e 100644
--- a/intern/cycles/render/osl.cpp
+++ b/intern/cycles/render/osl.cpp
@@ -113,7 +113,7 @@ void OSLShaderManager::device_update_specific(Device *device,
scene->image_manager->set_osl_texture_system((void *)ts);
/* create shaders */
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+ OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
Shader *background_shader = scene->background->get_shader(scene);
foreach (Shader *shader, scene->shaders) {
@@ -174,7 +174,7 @@ void OSLShaderManager::device_update_specific(Device *device,
void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *scene)
{
- OSLGlobals *og = (OSLGlobals *)device->osl_memory();
+ OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
device_free_common(device, dscene, scene);
@@ -257,25 +257,36 @@ void OSLShaderManager::shading_system_init()
/* our own ray types */
static const char *raytypes[] = {
- "camera", /* PATH_RAY_CAMERA */
- "reflection", /* PATH_RAY_REFLECT */
- "refraction", /* PATH_RAY_TRANSMIT */
- "diffuse", /* PATH_RAY_DIFFUSE */
- "glossy", /* PATH_RAY_GLOSSY */
- "singular", /* PATH_RAY_SINGULAR */
- "transparent", /* PATH_RAY_TRANSPARENT */
-
- "shadow", /* PATH_RAY_SHADOW_OPAQUE_NON_CATCHER */
- "shadow", /* PATH_RAY_SHADOW_OPAQUE_CATCHER */
- "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_NON_CATCHER */
- "shadow", /* PATH_RAY_SHADOW_TRANSPARENT_CATCHER */
-
- "__unused__", "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
- "__unused__",
-
- "__unused__", "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
- "__unused__", "__unused__", "__unused__", "__unused__",
- "__unused__", "__unused__", "__unused__",
+ "camera", /* PATH_RAY_CAMERA */
+ "reflection", /* PATH_RAY_REFLECT */
+ "refraction", /* PATH_RAY_TRANSMIT */
+ "diffuse", /* PATH_RAY_DIFFUSE */
+ "glossy", /* PATH_RAY_GLOSSY */
+ "singular", /* PATH_RAY_SINGULAR */
+ "transparent", /* PATH_RAY_TRANSPARENT */
+ "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
+
+ "shadow", /* PATH_RAY_SHADOW_OPAQUE */
+ "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */
+
+ "__unused__", /* PATH_RAY_NODE_UNALIGNED */
+ "__unused__", /* PATH_RAY_MIS_SKIP */
+
+ "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
+
+ "__unused__", /* PATH_RAY_SINGLE_PASS_DONE */
+ "__unused__", /* PATH_RAY_TRANSPARENT_BACKGROUND */
+ "__unused__", /* PATH_RAY_TERMINATE_IMMEDIATE */
+ "__unused__", /* PATH_RAY_TERMINATE_AFTER_TRANSPARENT */
+ "__unused__", /* PATH_RAY_EMISSION */
+ "__unused__", /* PATH_RAY_SUBSURFACE */
+ "__unused__", /* PATH_RAY_DENOISING_FEATURES */
+ "__unused__", /* PATH_RAY_REFLECT_PASS */
+ "__unused__", /* PATH_RAY_TRANSMISSION_PASS */
+ "__unused__", /* PATH_RAY_VOLUME_PASS */
+ "__unused__", /* PATH_RAY_SHADOW_FOR_LIGHT */
+ "__unused__", /* PATH_RAY_SHADOW_CATCHER_HIT */
+ "__unused__", /* PATH_RAY_SHADOW_CATCHER_PASS */
};
const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
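The order of this table matters: OSL resolves a ray type name by the bit index of the corresponding PATH_RAY_* flag, so the array must stay aligned with the PathRayFlag enum. A small sketch of that lookup, with the bit positions assumed for illustration:

static const char *ray_type_name(const char *const names[], const int num_names, uint flag)
{
  if (flag == 0) {
    return "__unused__";
  }
  /* Index of the lowest set bit in `flag`. */
  int index = 0;
  while ((flag & 1) == 0) {
    flag >>= 1;
    index++;
  }
  return (index < num_names) ? names[index] : "__unused__";
}

/* With PATH_RAY_DIFFUSE == (1 << 3), ray_type_name(raytypes, nraytypes,
 * PATH_RAY_DIFFUSE) returns "diffuse". */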
@@ -758,7 +769,8 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath)
current_shader->has_surface_bssrdf = true;
current_shader->has_bssrdf_bump = true; /* can't detect yet */
}
- current_shader->has_bump = true; /* can't detect yet */
+ current_shader->has_bump = true; /* can't detect yet */
+ current_shader->has_surface_raytrace = true; /* can't detect yet */
}
if (node->has_spatial_varying()) {
@@ -1054,6 +1066,8 @@ void OSLCompiler::generate_nodes(const ShaderNodeSet &nodes)
current_shader->has_surface_emission = true;
if (node->has_surface_transparent())
current_shader->has_surface_transparent = true;
+ if (node->get_feature() & KERNEL_FEATURE_NODE_RAYTRACE)
+ current_shader->has_surface_raytrace = true;
if (node->has_spatial_varying())
current_shader->has_surface_spatial_varying = true;
if (node->has_surface_bssrdf()) {
diff --git a/intern/cycles/render/pass.cpp b/intern/cycles/render/pass.cpp
new file mode 100644
index 00000000000..27ad7c0db97
--- /dev/null
+++ b/intern/cycles/render/pass.cpp
@@ -0,0 +1,427 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "render/pass.h"
+
+#include "util/util_algorithm.h"
+#include "util/util_logging.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *pass_type_as_string(const PassType type)
+{
+ const int type_int = static_cast<int>(type);
+
+ const NodeEnum *type_enum = Pass::get_type_enum();
+
+ if (!type_enum->exists(type_int)) {
+ LOG(DFATAL) << "Unhandled pass type " << static_cast<int>(type) << ", not supposed to happen.";
+ return "UNKNOWN";
+ }
+
+ return (*type_enum)[type_int].c_str();
+}
+
+const char *pass_mode_as_string(PassMode mode)
+{
+ switch (mode) {
+ case PassMode::NOISY:
+ return "NOISY";
+ case PassMode::DENOISED:
+ return "DENOISED";
+ }
+
+ LOG(DFATAL) << "Unhandled pass mode " << static_cast<int>(mode) << ", should never happen.";
+ return "UNKNOWN";
+}
+
+std::ostream &operator<<(std::ostream &os, PassMode mode)
+{
+ os << pass_mode_as_string(mode);
+ return os;
+}
+
+const NodeEnum *Pass::get_type_enum()
+{
+ static NodeEnum pass_type_enum;
+
+ if (pass_type_enum.empty()) {
+
+ /* Light Passes. */
+ pass_type_enum.insert("combined", PASS_COMBINED);
+ pass_type_enum.insert("emission", PASS_EMISSION);
+ pass_type_enum.insert("background", PASS_BACKGROUND);
+ pass_type_enum.insert("ao", PASS_AO);
+ pass_type_enum.insert("shadow", PASS_SHADOW);
+ pass_type_enum.insert("diffuse", PASS_DIFFUSE);
+ pass_type_enum.insert("diffuse_direct", PASS_DIFFUSE_DIRECT);
+ pass_type_enum.insert("diffuse_indirect", PASS_DIFFUSE_INDIRECT);
+ pass_type_enum.insert("glossy", PASS_GLOSSY);
+ pass_type_enum.insert("glossy_direct", PASS_GLOSSY_DIRECT);
+ pass_type_enum.insert("glossy_indirect", PASS_GLOSSY_INDIRECT);
+ pass_type_enum.insert("transmission", PASS_TRANSMISSION);
+ pass_type_enum.insert("transmission_direct", PASS_TRANSMISSION_DIRECT);
+ pass_type_enum.insert("transmission_indirect", PASS_TRANSMISSION_INDIRECT);
+ pass_type_enum.insert("volume", PASS_VOLUME);
+ pass_type_enum.insert("volume_direct", PASS_VOLUME_DIRECT);
+ pass_type_enum.insert("volume_indirect", PASS_VOLUME_INDIRECT);
+
+ /* Data passes. */
+ pass_type_enum.insert("depth", PASS_DEPTH);
+ pass_type_enum.insert("position", PASS_POSITION);
+ pass_type_enum.insert("normal", PASS_NORMAL);
+ pass_type_enum.insert("roughness", PASS_ROUGHNESS);
+ pass_type_enum.insert("uv", PASS_UV);
+ pass_type_enum.insert("object_id", PASS_OBJECT_ID);
+ pass_type_enum.insert("material_id", PASS_MATERIAL_ID);
+ pass_type_enum.insert("motion", PASS_MOTION);
+ pass_type_enum.insert("motion_weight", PASS_MOTION_WEIGHT);
+ pass_type_enum.insert("render_time", PASS_RENDER_TIME);
+ pass_type_enum.insert("cryptomatte", PASS_CRYPTOMATTE);
+ pass_type_enum.insert("aov_color", PASS_AOV_COLOR);
+ pass_type_enum.insert("aov_value", PASS_AOV_VALUE);
+ pass_type_enum.insert("adaptive_aux_buffer", PASS_ADAPTIVE_AUX_BUFFER);
+ pass_type_enum.insert("sample_count", PASS_SAMPLE_COUNT);
+ pass_type_enum.insert("diffuse_color", PASS_DIFFUSE_COLOR);
+ pass_type_enum.insert("glossy_color", PASS_GLOSSY_COLOR);
+ pass_type_enum.insert("transmission_color", PASS_TRANSMISSION_COLOR);
+ pass_type_enum.insert("mist", PASS_MIST);
+ pass_type_enum.insert("denoising_normal", PASS_DENOISING_NORMAL);
+ pass_type_enum.insert("denoising_albedo", PASS_DENOISING_ALBEDO);
+
+ pass_type_enum.insert("shadow_catcher", PASS_SHADOW_CATCHER);
+ pass_type_enum.insert("shadow_catcher_sample_count", PASS_SHADOW_CATCHER_SAMPLE_COUNT);
+ pass_type_enum.insert("shadow_catcher_matte", PASS_SHADOW_CATCHER_MATTE);
+
+ pass_type_enum.insert("bake_primitive", PASS_BAKE_PRIMITIVE);
+ pass_type_enum.insert("bake_differential", PASS_BAKE_DIFFERENTIAL);
+ }
+
+ return &pass_type_enum;
+}
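A usage sketch for the enum above, assuming NodeEnum's string-indexed operator[] returns the registered value; this is how a pass name coming from the UI or from a file could be resolved back to a PassType:

const NodeEnum *types = Pass::get_type_enum();
if (types->exists("shadow_catcher")) {
  const PassType type = static_cast<PassType>((*types)["shadow_catcher"]);
  /* type == PASS_SHADOW_CATCHER */
}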
+
+const NodeEnum *Pass::get_mode_enum()
+{
+ static NodeEnum pass_mode_enum;
+
+ if (pass_mode_enum.empty()) {
+ pass_mode_enum.insert("noisy", static_cast<int>(PassMode::NOISY));
+ pass_mode_enum.insert("denoised", static_cast<int>(PassMode::DENOISED));
+ }
+
+ return &pass_mode_enum;
+}
+
+NODE_DEFINE(Pass)
+{
+ NodeType *type = NodeType::add("pass", create);
+
+ const NodeEnum *pass_type_enum = get_type_enum();
+ const NodeEnum *pass_mode_enum = get_mode_enum();
+
+ SOCKET_ENUM(type, "Type", *pass_type_enum, PASS_COMBINED);
+ SOCKET_ENUM(mode, "Mode", *pass_mode_enum, static_cast<int>(PassMode::DENOISED));
+ SOCKET_STRING(name, "Name", ustring());
+ SOCKET_BOOLEAN(include_albedo, "Include Albedo", false);
+
+ return type;
+}
+
+Pass::Pass() : Node(get_node_type()), is_auto_(false)
+{
+}
+
+PassInfo Pass::get_info() const
+{
+ return get_info(type, include_albedo);
+}
+
+bool Pass::is_written() const
+{
+ return get_info().is_written;
+}
+
+PassInfo Pass::get_info(const PassType type, const bool include_albedo)
+{
+ PassInfo pass_info;
+
+ pass_info.use_filter = true;
+ pass_info.use_exposure = false;
+ pass_info.divide_type = PASS_NONE;
+ pass_info.use_compositing = false;
+ pass_info.use_denoising_albedo = true;
+
+ switch (type) {
+ case PASS_NONE:
+ pass_info.num_components = 0;
+ break;
+ case PASS_COMBINED:
+ pass_info.num_components = 4;
+ pass_info.use_exposure = true;
+ pass_info.support_denoise = true;
+ break;
+ case PASS_DEPTH:
+ pass_info.num_components = 1;
+ pass_info.use_filter = false;
+ break;
+ case PASS_MIST:
+ pass_info.num_components = 1;
+ break;
+ case PASS_POSITION:
+ pass_info.num_components = 3;
+ break;
+ case PASS_NORMAL:
+ pass_info.num_components = 3;
+ break;
+ case PASS_ROUGHNESS:
+ pass_info.num_components = 1;
+ break;
+ case PASS_UV:
+ pass_info.num_components = 3;
+ break;
+ case PASS_MOTION:
+ pass_info.num_components = 4;
+ pass_info.divide_type = PASS_MOTION_WEIGHT;
+ break;
+ case PASS_MOTION_WEIGHT:
+ pass_info.num_components = 1;
+ break;
+ case PASS_OBJECT_ID:
+ case PASS_MATERIAL_ID:
+ pass_info.num_components = 1;
+ pass_info.use_filter = false;
+ break;
+
+ case PASS_EMISSION:
+ case PASS_BACKGROUND:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ break;
+ case PASS_AO:
+ pass_info.num_components = 3;
+ break;
+ case PASS_SHADOW:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = false;
+ break;
+ case PASS_RENDER_TIME:
+ /* This pass is handled entirely on the host side. */
+ pass_info.num_components = 0;
+ break;
+
+ case PASS_DIFFUSE_COLOR:
+ case PASS_GLOSSY_COLOR:
+ case PASS_TRANSMISSION_COLOR:
+ pass_info.num_components = 3;
+ break;
+ case PASS_DIFFUSE:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.direct_type = PASS_DIFFUSE_DIRECT;
+ pass_info.indirect_type = PASS_DIFFUSE_INDIRECT;
+ pass_info.divide_type = (!include_albedo) ? PASS_DIFFUSE_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ pass_info.is_written = false;
+ break;
+ case PASS_DIFFUSE_DIRECT:
+ case PASS_DIFFUSE_INDIRECT:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.divide_type = (!include_albedo) ? PASS_DIFFUSE_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ break;
+ case PASS_GLOSSY:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.direct_type = PASS_GLOSSY_DIRECT;
+ pass_info.indirect_type = PASS_GLOSSY_INDIRECT;
+ pass_info.divide_type = (!include_albedo) ? PASS_GLOSSY_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ pass_info.is_written = false;
+ break;
+ case PASS_GLOSSY_DIRECT:
+ case PASS_GLOSSY_INDIRECT:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.divide_type = (!include_albedo) ? PASS_GLOSSY_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ break;
+ case PASS_TRANSMISSION:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.direct_type = PASS_TRANSMISSION_DIRECT;
+ pass_info.indirect_type = PASS_TRANSMISSION_INDIRECT;
+ pass_info.divide_type = (!include_albedo) ? PASS_TRANSMISSION_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ pass_info.is_written = false;
+ break;
+ case PASS_TRANSMISSION_DIRECT:
+ case PASS_TRANSMISSION_INDIRECT:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.divide_type = (!include_albedo) ? PASS_TRANSMISSION_COLOR : PASS_NONE;
+ pass_info.use_compositing = true;
+ break;
+ case PASS_VOLUME:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.direct_type = PASS_VOLUME_DIRECT;
+ pass_info.indirect_type = PASS_VOLUME_INDIRECT;
+ pass_info.use_compositing = true;
+ pass_info.is_written = false;
+ break;
+ case PASS_VOLUME_DIRECT:
+ case PASS_VOLUME_INDIRECT:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ break;
+
+ case PASS_CRYPTOMATTE:
+ pass_info.num_components = 4;
+ break;
+
+ case PASS_DENOISING_NORMAL:
+ pass_info.num_components = 3;
+ break;
+ case PASS_DENOISING_ALBEDO:
+ pass_info.num_components = 3;
+ break;
+
+ case PASS_SHADOW_CATCHER:
+ pass_info.num_components = 3;
+ pass_info.use_exposure = true;
+ pass_info.use_compositing = true;
+ pass_info.use_denoising_albedo = false;
+ pass_info.support_denoise = true;
+ break;
+ case PASS_SHADOW_CATCHER_SAMPLE_COUNT:
+ pass_info.num_components = 1;
+ break;
+ case PASS_SHADOW_CATCHER_MATTE:
+ pass_info.num_components = 4;
+ pass_info.use_exposure = true;
+ pass_info.support_denoise = true;
+ /* Without the shadow catcher approximation, compositing is not needed.
+ * Since we don't know here whether the approximation is used or not, leave the decision up
+ * to the caller, which does know. */
+ break;
+
+ case PASS_ADAPTIVE_AUX_BUFFER:
+ pass_info.num_components = 4;
+ break;
+ case PASS_SAMPLE_COUNT:
+ pass_info.num_components = 1;
+ pass_info.use_exposure = false;
+ break;
+
+ case PASS_AOV_COLOR:
+ pass_info.num_components = 3;
+ break;
+ case PASS_AOV_VALUE:
+ pass_info.num_components = 1;
+ break;
+
+ case PASS_BAKE_PRIMITIVE:
+ case PASS_BAKE_DIFFERENTIAL:
+ pass_info.num_components = 4;
+ pass_info.use_exposure = false;
+ pass_info.use_filter = false;
+ break;
+
+ case PASS_CATEGORY_LIGHT_END:
+ case PASS_CATEGORY_DATA_END:
+ case PASS_CATEGORY_BAKE_END:
+ case PASS_NUM:
+ LOG(DFATAL) << "Unexpected pass type is used " << type;
+ pass_info.num_components = 0;
+ break;
+ }
+
+ return pass_info;
+}
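A sketch of how a caller might consume get_info(); needs_albedo_divide is a hypothetical helper, and its behavior follows directly from the PASS_DIFFUSE case above:

static bool needs_albedo_divide(const PassType type, const bool include_albedo)
{
  const PassInfo info = Pass::get_info(type, include_albedo);
  return info.divide_type != PASS_NONE;
}

/* needs_albedo_divide(PASS_DIFFUSE, false) -> true, divide by PASS_DIFFUSE_COLOR.
 * needs_albedo_divide(PASS_DIFFUSE, true)  -> false, albedo is kept in the pass. */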
+
+bool Pass::contains(const vector<Pass *> &passes, PassType type)
+{
+ for (const Pass *pass : passes) {
+ if (pass->get_type() != type) {
+ continue;
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+const Pass *Pass::find(const vector<Pass *> &passes, const string &name)
+{
+ for (const Pass *pass : passes) {
+ if (pass->get_name() == name) {
+ return pass;
+ }
+ }
+
+ return nullptr;
+}
+
+const Pass *Pass::find(const vector<Pass *> &passes, PassType type, PassMode mode)
+{
+ for (const Pass *pass : passes) {
+ if (pass->get_type() != type || pass->get_mode() != mode) {
+ continue;
+ }
+
+ return pass;
+ }
+
+ return nullptr;
+}
+
+int Pass::get_offset(const vector<Pass *> &passes, const Pass *pass)
+{
+ int pass_offset = 0;
+
+ for (const Pass *current_pass : passes) {
+ /* Note that pass name is allowed to be empty. This is why we check for type and mode. */
+ if (current_pass->get_type() == pass->get_type() &&
+ current_pass->get_mode() == pass->get_mode() &&
+ current_pass->get_name() == pass->get_name()) {
+ if (current_pass->is_written()) {
+ return pass_offset;
+ }
+ else {
+ return PASS_UNUSED;
+ }
+ }
+ if (current_pass->is_written()) {
+ pass_offset += current_pass->get_info().num_components;
+ }
+ }
+
+ return PASS_UNUSED;
+}
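The offset returned above is a component offset within a single pixel of the flat render buffer. A hedged sketch of reading one pass's data, where `pass_stride` (the sum of num_components over all written passes) and the buffer layout are assumptions:

static const float *pixel_pass_data(const float *buffer,
                                    const int pass_stride,
                                    const int pixel_index,
                                    const int pass_offset)
{
  if (pass_offset == PASS_UNUSED) {
    return nullptr; /* Pass has no pixels allocated. */
  }
  return buffer + (size_t)pixel_index * pass_stride + pass_offset;
}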
+
+std::ostream &operator<<(std::ostream &os, const Pass &pass)
+{
+ os << "type: " << pass_type_as_string(pass.get_type());
+ os << ", name: \"" << pass.get_name() << "\"";
+ os << ", mode: " << pass.get_mode();
+ os << ", is_written: " << string_from_bool(pass.is_written());
+
+ return os;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/pass.h b/intern/cycles/render/pass.h
new file mode 100644
index 00000000000..82230c62cb0
--- /dev/null
+++ b/intern/cycles/render/pass.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <ostream> // NOLINT
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+#include "kernel/kernel_types.h"
+
+#include "graph/node.h"
+
+CCL_NAMESPACE_BEGIN
+
+const char *pass_type_as_string(const PassType type);
+
+enum class PassMode {
+ NOISY,
+ DENOISED,
+};
+const char *pass_mode_as_string(PassMode mode);
+std::ostream &operator<<(std::ostream &os, PassMode mode);
+
+struct PassInfo {
+ int num_components = -1;
+ bool use_filter = false;
+ bool use_exposure = false;
+ bool is_written = true;
+ PassType divide_type = PASS_NONE;
+ PassType direct_type = PASS_NONE;
+ PassType indirect_type = PASS_NONE;
+
+ /* Reading the pass cannot happen directly and needs some sort of compositing
+ * (for example, light passes due to divide_type, or the shadow catcher pass). */
+ bool use_compositing = false;
+
+ /* Used to disable albedo pass for denoising.
+ * Light and shadow catcher passes should not have discontinuity in the denoised result based on
+ * the underlying albedo. */
+ bool use_denoising_albedo = true;
+
+ /* Pass supports denoising. */
+ bool support_denoise = false;
+};
+
+class Pass : public Node {
+ public:
+ NODE_DECLARE
+
+ NODE_SOCKET_API(PassType, type)
+ NODE_SOCKET_API(PassMode, mode)
+ NODE_SOCKET_API(ustring, name)
+ NODE_SOCKET_API(bool, include_albedo)
+
+ Pass();
+
+ PassInfo get_info() const;
+
+ /* The pass is written by the render pipeline (kernel or denoiser). If the pass is written, it
+ * will have pixels allocated in a RenderBuffer. Passes which are not written do not have
+ * pixels allocated, which saves memory. */
+ bool is_written() const;
+
+ protected:
+ /* The pass has been created automatically as a requirement for various rendering
+ * functionality (such as adaptive sampling). */
+ bool is_auto_;
+
+ public:
+ static const NodeEnum *get_type_enum();
+ static const NodeEnum *get_mode_enum();
+
+ static PassInfo get_info(PassType type, const bool include_albedo = false);
+
+ static bool contains(const vector<Pass *> &passes, PassType type);
+
+ /* Returns nullptr if there is no pass with the given name or type+mode. */
+ static const Pass *find(const vector<Pass *> &passes, const string &name);
+ static const Pass *find(const vector<Pass *> &passes,
+ PassType type,
+ PassMode mode = PassMode::NOISY);
+
+ /* Returns PASS_UNUSED if there is no corresponding pass. */
+ static int get_offset(const vector<Pass *> &passes, const Pass *pass);
+
+ friend class Film;
+};
+
+std::ostream &operator<<(std::ostream &os, const Pass &pass);
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/render/scene.cpp b/intern/cycles/render/scene.cpp
index c4e7d2c79d6..a4b030190dc 100644
--- a/intern/cycles/render/scene.cpp
+++ b/intern/cycles/render/scene.cpp
@@ -163,12 +163,15 @@ void Scene::free_memory(bool final)
delete p;
foreach (Light *l, lights)
delete l;
+ foreach (Pass *p, passes)
+ delete p;
geometry.clear();
objects.clear();
lights.clear();
particle_systems.clear();
procedurals.clear();
+ passes.clear();
if (device) {
camera->device_free(device, &dscene, this);
@@ -253,7 +256,6 @@ void Scene::device_update(Device *device_, Progress &progress)
* - Camera may be used for adaptive subdivision.
* - Displacement shader must have all shader data available.
* - Light manager needs lookup tables and final mesh data to compute emission CDF.
- * - Film needs light manager to run for use_light_visibility
* - Lookup tables are done a second time to handle film tables
*/
@@ -469,88 +471,110 @@ void Scene::enable_update_stats()
}
}
-DeviceRequestedFeatures Scene::get_requested_device_features()
+void Scene::update_kernel_features()
{
- DeviceRequestedFeatures requested_features;
+ if (!need_update()) {
+ return;
+ }
- shader_manager->get_requested_features(this, &requested_features);
+ /* These features are not tweaked as often as shaders,
+ * so selective optimization could be done for the viewport as well. */
+ uint kernel_features = shader_manager->get_kernel_features(this);
- /* This features are not being tweaked as often as shaders,
- * so could be done selective magic for the viewport as well.
- */
bool use_motion = need_motion() == Scene::MotionType::MOTION_BLUR;
- requested_features.use_hair = false;
- requested_features.use_hair_thick = (params.hair_shape == CURVE_THICK);
- requested_features.use_object_motion = false;
- requested_features.use_camera_motion = use_motion && camera->use_motion();
+ kernel_features |= KERNEL_FEATURE_PATH_TRACING;
+ if (params.hair_shape == CURVE_THICK) {
+ kernel_features |= KERNEL_FEATURE_HAIR_THICK;
+ }
+ if (use_motion && camera->use_motion()) {
+ kernel_features |= KERNEL_FEATURE_CAMERA_MOTION;
+ }
foreach (Object *object, objects) {
Geometry *geom = object->get_geometry();
if (use_motion) {
- requested_features.use_object_motion |= object->use_motion() | geom->get_use_motion_blur();
- requested_features.use_camera_motion |= geom->get_use_motion_blur();
+ if (object->use_motion() || geom->get_use_motion_blur()) {
+ kernel_features |= KERNEL_FEATURE_OBJECT_MOTION;
+ }
+ if (geom->get_use_motion_blur()) {
+ kernel_features |= KERNEL_FEATURE_CAMERA_MOTION;
+ }
}
if (object->get_is_shadow_catcher()) {
- requested_features.use_shadow_tricks = true;
+ kernel_features |= KERNEL_FEATURE_SHADOW_CATCHER;
}
if (geom->is_mesh()) {
Mesh *mesh = static_cast<Mesh *>(geom);
#ifdef WITH_OPENSUBDIV
if (mesh->get_subdivision_type() != Mesh::SUBDIVISION_NONE) {
- requested_features.use_patch_evaluation = true;
+ kernel_features |= KERNEL_FEATURE_PATCH_EVALUATION;
}
#endif
- requested_features.use_true_displacement |= mesh->has_true_displacement();
}
else if (geom->is_hair()) {
- requested_features.use_hair = true;
+ kernel_features |= KERNEL_FEATURE_HAIR;
}
}
- requested_features.use_background_light = light_manager->has_background_light(this);
-
- requested_features.use_baking = bake_manager->get_baking();
- requested_features.use_integrator_branched = (integrator->get_method() ==
- Integrator::BRANCHED_PATH);
- if (film->get_denoising_data_pass()) {
- requested_features.use_denoising = true;
- requested_features.use_shadow_tricks = true;
+ if (bake_manager->get_baking()) {
+ kernel_features |= KERNEL_FEATURE_BAKING;
}
- return requested_features;
-}
+ kernel_features |= film->get_kernel_features(this);
-bool Scene::update(Progress &progress, bool &kernel_switch_needed)
-{
- /* update scene */
- if (need_update()) {
- /* Update max_closures. */
- KernelIntegrator *kintegrator = &dscene.data.integrator;
- if (params.background) {
- kintegrator->max_closures = get_max_closure_count();
- }
- else {
- /* Currently viewport render is faster with higher max_closures, needs investigating. */
- kintegrator->max_closures = MAX_CLOSURE;
- }
-
- /* Load render kernels, before device update where we upload data to the GPU. */
- bool new_kernels_needed = load_kernels(progress, false);
-
- progress.set_status("Updating Scene");
- MEM_GUARDED_CALL(&progress, device_update, device, progress);
+ dscene.data.kernel_features = kernel_features;
- DeviceKernelStatus kernel_switch_status = device->get_active_kernel_switch_state();
- kernel_switch_needed = kernel_switch_status == DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE ||
- kernel_switch_status == DEVICE_KERNEL_FEATURE_KERNEL_INVALID;
- if (new_kernels_needed || kernel_switch_needed) {
- progress.set_kernel_status("Compiling render kernels");
- device->wait_for_availability(loaded_kernel_features);
- progress.set_kernel_status("");
- }
+ /* Currently viewport render is faster with higher max_closures, needs investigating. */
+ const uint max_closures = (params.background) ? get_max_closure_count() : MAX_CLOSURE;
+ dscene.data.max_closures = max_closures;
+ dscene.data.max_shaders = shaders.size();
+}
- return true;
+bool Scene::update(Progress &progress)
+{
+ if (!need_update()) {
+ return false;
}
- return false;
+
+ /* Load render kernels, before device update where we upload data to the GPU. */
+ load_kernels(progress, false);
+
+ /* Upload scene data to the GPU. */
+ progress.set_status("Updating Scene");
+ MEM_GUARDED_CALL(&progress, device_update, device, progress);
+
+ return true;
+}
+
+static void log_kernel_features(const uint features)
+{
+ VLOG(2) << "Requested features:\n";
+ VLOG(2) << "Use BSDF " << string_from_bool(features & KERNEL_FEATURE_NODE_BSDF) << "\n";
+ VLOG(2) << "Use Principled BSDF " << string_from_bool(features & KERNEL_FEATURE_PRINCIPLED)
+ << "\n";
+ VLOG(2) << "Use Emission " << string_from_bool(features & KERNEL_FEATURE_NODE_EMISSION) << "\n";
+ VLOG(2) << "Use Volume " << string_from_bool(features & KERNEL_FEATURE_NODE_VOLUME) << "\n";
+ VLOG(2) << "Use Hair " << string_from_bool(features & KERNEL_FEATURE_NODE_HAIR) << "\n";
+ VLOG(2) << "Use Bump " << string_from_bool(features & KERNEL_FEATURE_NODE_BUMP) << "\n";
+ VLOG(2) << "Use Voronoi " << string_from_bool(features & KERNEL_FEATURE_NODE_VORONOI_EXTRA)
+ << "\n";
+ VLOG(2) << "Use Shader Raytrace " << string_from_bool(features & KERNEL_FEATURE_NODE_RAYTRACE)
+ << "\n";
+ VLOG(2) << "Use Transparent " << string_from_bool(features & KERNEL_FEATURE_TRANSPARENT) << "\n";
+ VLOG(2) << "Use Denoising " << string_from_bool(features & KERNEL_FEATURE_DENOISING) << "\n";
+ VLOG(2) << "Use Path Tracing " << string_from_bool(features & KERNEL_FEATURE_PATH_TRACING)
+ << "\n";
+ VLOG(2) << "Use Hair " << string_from_bool(features & KERNEL_FEATURE_HAIR) << "\n";
+ VLOG(2) << "Use Object Motion " << string_from_bool(features & KERNEL_FEATURE_OBJECT_MOTION)
+ << "\n";
+ VLOG(2) << "Use Camera Motion " << string_from_bool(features & KERNEL_FEATURE_CAMERA_MOTION)
+ << "\n";
+ VLOG(2) << "Use Baking " << string_from_bool(features & KERNEL_FEATURE_BAKING) << "\n";
+ VLOG(2) << "Use Subsurface " << string_from_bool(features & KERNEL_FEATURE_SUBSURFACE) << "\n";
+ VLOG(2) << "Use Volume " << string_from_bool(features & KERNEL_FEATURE_VOLUME) << "\n";
+ VLOG(2) << "Use Patch Evaluation "
+ << string_from_bool(features & KERNEL_FEATURE_PATCH_EVALUATION) << "\n";
+ VLOG(2) << "Use Shadow Catcher " << string_from_bool(features & KERNEL_FEATURE_SHADOW_CATCHER)
+ << "\n";
}
bool Scene::load_kernels(Progress &progress, bool lock_scene)
@@ -560,15 +584,15 @@ bool Scene::load_kernels(Progress &progress, bool lock_scene)
scene_lock = thread_scoped_lock(mutex);
}
- DeviceRequestedFeatures requested_features = get_requested_device_features();
+ const uint kernel_features = dscene.data.kernel_features;
- if (!kernels_loaded || loaded_kernel_features.modified(requested_features)) {
+ if (!kernels_loaded || loaded_kernel_features != kernel_features) {
progress.set_status("Loading render kernels (may take a few minutes the first time)");
scoped_timer timer;
- VLOG(2) << "Requested features:\n" << requested_features;
- if (!device->load_kernels(requested_features)) {
+ log_kernel_features(kernel_features);
+ if (!device->load_kernels(kernel_features)) {
string message = device->error_message();
if (message.empty())
message = "Failed loading render kernel, see console for errors";
@@ -580,7 +604,7 @@ bool Scene::load_kernels(Progress &progress, bool lock_scene)
}
kernels_loaded = true;
- loaded_kernel_features = requested_features;
+ loaded_kernel_features = kernel_features;
return true;
}
return false;
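With features packed into a uint bitmask, the reload check above reduces to a plain integer compare instead of the old field-by-field DeviceRequestedFeatures::modified() test; for example:

const uint requested = KERNEL_FEATURE_PATH_TRACING | KERNEL_FEATURE_HAIR;
const uint loaded = KERNEL_FEATURE_PATH_TRACING;
const bool reload_needed = (loaded != requested); /* true: the hair bit is missing */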
@@ -618,6 +642,28 @@ int Scene::get_max_closure_count()
return max_closure_global;
}
+bool Scene::has_shadow_catcher()
+{
+ if (shadow_catcher_modified_) {
+ has_shadow_catcher_ = false;
+ for (Object *object : objects) {
+ if (object->get_is_shadow_catcher()) {
+ has_shadow_catcher_ = true;
+ break;
+ }
+ }
+
+ shadow_catcher_modified_ = false;
+ }
+
+ return has_shadow_catcher_;
+}
+
+void Scene::tag_shadow_catcher_modified()
+{
+ shadow_catcher_modified_ = true;
+}
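The two functions above form a lazily invalidated cache: tagging marks the cached flag stale, and the next query rescans the object list exactly once. Hypothetical usage, assuming the NODE_SOCKET_API-generated setter on Object:

object->set_is_shadow_catcher(true);
object->tag_update(scene);                  /* sets shadow_catcher_modified_ */
bool catcher = scene->has_shadow_catcher(); /* rescans the object list once */
catcher = scene->has_shadow_catcher();      /* cheap cached lookup afterwards */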
+
template<> Light *Scene::create_node<Light>()
{
Light *node = new Light();
@@ -694,6 +740,15 @@ template<> AlembicProcedural *Scene::create_node<AlembicProcedural>()
#endif
}
+template<> Pass *Scene::create_node<Pass>()
+{
+ Pass *node = new Pass();
+ node->set_owner(this);
+ passes.push_back(node);
+ film->tag_modified();
+ return node;
+}
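A sketch of requesting a pass through the new node-based API; the set_* calls assume the accessors generated by the NODE_SOCKET_API declarations in pass.h:

Pass *pass = scene->create_node<Pass>();
pass->set_type(PASS_SHADOW_CATCHER_MATTE);
pass->set_mode(PassMode::DENOISED);
pass->set_name(ustring("Shadow Catcher Matte"));
/* create_node() already tagged the film as modified, so the next
 * Scene::update() reallocates render buffers to include the pass. */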
+
template<typename T> void delete_node_from_array(vector<T> &nodes, T node)
{
for (size_t i = 0; i < nodes.size(); ++i) {
@@ -779,6 +834,12 @@ template<> void Scene::delete_node_impl(AlembicProcedural *node)
#endif
}
+template<> void Scene::delete_node_impl(Pass *node)
+{
+ delete_node_from_array(passes, node);
+ film->tag_modified();
+}
+
template<typename T>
static void remove_nodes_in_set(const set<T *> &nodes_set,
vector<T *> &nodes_array,
@@ -842,4 +903,10 @@ template<> void Scene::delete_nodes(const set<Procedural *> &nodes, const NodeOw
procedural_manager->tag_update();
}
+template<> void Scene::delete_nodes(const set<Pass *> &nodes, const NodeOwner *owner)
+{
+ remove_nodes_in_set(nodes, passes, owner);
+ film->tag_modified();
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/scene.h b/intern/cycles/render/scene.h
index 7d8a6774381..cf4a3ba6b12 100644
--- a/intern/cycles/render/scene.h
+++ b/intern/cycles/render/scene.h
@@ -128,7 +128,7 @@ class DeviceScene {
device_vector<float> lookup_table;
/* integrator */
- device_vector<uint> sample_pattern_lut;
+ device_vector<float> sample_pattern_lut;
/* ies lights */
device_vector<float> ies_lights;
@@ -142,27 +142,6 @@ class DeviceScene {
class SceneParams {
public:
- /* Type of BVH, in terms whether it is supported dynamic updates of meshes
- * or whether modifying geometry requires full BVH rebuild.
- */
- enum BVHType {
- /* BVH supports dynamic updates of geometry.
- *
- * Faster for updating BVH tree when doing modifications in viewport,
- * but slower for rendering.
- */
- BVH_DYNAMIC = 0,
- /* BVH tree is calculated for specific scene, updates in geometry
- * requires full tree rebuild.
- *
- * Slower to update BVH tree when modifying objects in viewport, also
- * slower to build final BVH tree but gives best possible render speed.
- */
- BVH_STATIC = 1,
-
- BVH_NUM_TYPES,
- };
-
ShadingSystem shadingsystem;
/* Requested BVH layout.
@@ -186,7 +165,7 @@ class SceneParams {
{
shadingsystem = SHADINGSYSTEM_SVM;
bvh_layout = BVH_LAYOUT_BVH2;
- bvh_type = BVH_DYNAMIC;
+ bvh_type = BVH_TYPE_DYNAMIC;
use_bvh_spatial_split = false;
use_bvh_unaligned_nodes = true;
num_bvh_time_steps = 0;
@@ -196,7 +175,7 @@ class SceneParams {
background = true;
}
- bool modified(const SceneParams &params)
+ bool modified(const SceneParams &params) const
{
return !(shadingsystem == params.shadingsystem && bvh_layout == params.bvh_layout &&
bvh_type == params.bvh_type &&
@@ -236,7 +215,7 @@ class Scene : public NodeOwner {
vector<Shader *> shaders;
vector<Light *> lights;
vector<ParticleSystem *> particle_systems;
- vector<Pass> passes;
+ vector<Pass *> passes;
vector<Procedural *> procedurals;
/* data managers */
@@ -291,7 +270,11 @@ class Scene : public NodeOwner {
void enable_update_stats();
- bool update(Progress &progress, bool &kernel_switch_needed);
+ void update_kernel_features();
+ bool update(Progress &progress);
+
+ bool has_shadow_catcher();
+ void tag_shadow_catcher_modified();
/* This function is used to create a node of a specified type instead of
* calling 'new', and sets the scene as the owner of the node.
@@ -348,13 +331,12 @@ class Scene : public NodeOwner {
void free_memory(bool final);
bool kernels_loaded;
- DeviceRequestedFeatures loaded_kernel_features;
+ uint loaded_kernel_features;
bool load_kernels(Progress &progress, bool lock_scene = true);
- /* ** Split kernel routines ** */
-
- DeviceRequestedFeatures get_requested_device_features();
+ bool has_shadow_catcher_ = false;
+ bool shadow_catcher_modified_ = true;
/* Maximum number of closure during session lifetime. */
int max_closure_global;
@@ -384,6 +366,8 @@ template<> Shader *Scene::create_node<Shader>();
template<> AlembicProcedural *Scene::create_node<AlembicProcedural>();
+template<> Pass *Scene::create_node<Pass>();
+
template<> void Scene::delete_node_impl(Light *node);
template<> void Scene::delete_node_impl(Mesh *node);
@@ -404,6 +388,8 @@ template<> void Scene::delete_node_impl(Procedural *node);
template<> void Scene::delete_node_impl(AlembicProcedural *node);
+template<> void Scene::delete_node_impl(Pass *node);
+
template<> void Scene::delete_nodes(const set<Light *> &nodes, const NodeOwner *owner);
template<> void Scene::delete_nodes(const set<Geometry *> &nodes, const NodeOwner *owner);
@@ -416,6 +402,8 @@ template<> void Scene::delete_nodes(const set<Shader *> &nodes, const NodeOwner
template<> void Scene::delete_nodes(const set<Procedural *> &nodes, const NodeOwner *owner);
+template<> void Scene::delete_nodes(const set<Pass *> &nodes, const NodeOwner *owner);
+
CCL_NAMESPACE_END
#endif /* __SCENE_H__ */
diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp
index 1b91c49f0ea..47eeffd97fe 100644
--- a/intern/cycles/render/session.cpp
+++ b/intern/cycles/render/session.cpp
@@ -17,10 +17,15 @@
#include <limits.h>
#include <string.h>
+#include "device/cpu/device.h"
#include "device/device.h"
+#include "integrator/pass_accessor_cpu.h"
+#include "integrator/path_trace.h"
+#include "render/background.h"
#include "render/bake.h"
#include "render/buffers.h"
#include "render/camera.h"
+#include "render/gpu_display.h"
#include "render/graph.h"
#include "render/integrator.h"
#include "render/light.h"
@@ -39,70 +44,63 @@
CCL_NAMESPACE_BEGIN
-/* Note about preserve_tile_device option for tile manager:
- * progressive refine and viewport rendering does requires tiles to
- * always be allocated for the same device
- */
-Session::Session(const SessionParams &params_)
- : params(params_),
- tile_manager(params.progressive,
- params.samples,
- params.tile_size,
- params.start_resolution,
- params.background == false || params.progressive_refine,
- params.background,
- params.tile_order,
- max(params.device.multi_devices.size(), 1),
- params.pixel_size),
- stats(),
- profiler()
+Session::Session(const SessionParams &params_, const SceneParams &scene_params)
+ : params(params_), render_scheduler_(tile_manager_, params)
{
- device_use_gl_ = ((params.device.type != DEVICE_CPU) && !params.background);
-
TaskScheduler::init(params.threads);
- session_thread_ = NULL;
- scene = NULL;
-
- reset_time_ = 0.0;
- last_update_time_ = 0.0;
+ session_thread_ = nullptr;
delayed_reset_.do_reset = false;
- delayed_reset_.samples = 0;
-
- display_outdated_ = false;
- gpu_draw_ready_ = false;
- gpu_need_display_buffer_update_ = false;
pause_ = false;
cancel_ = false;
new_work_added_ = false;
- buffers = NULL;
- display = NULL;
+ device = Device::create(params.device, stats, profiler);
- /* Validate denoising parameters. */
- set_denoising(params.denoising);
+ scene = new Scene(scene_params, device);
- /* Create CPU/GPU devices. */
- device = Device::create(params.device, stats, profiler, params.background);
-
- if (!device->error_message().empty()) {
- progress.set_error(device->error_message());
- return;
- }
+ /* Configure path tracer. */
+ path_trace_ = make_unique<PathTrace>(
+ device, scene->film, &scene->dscene, render_scheduler_, tile_manager_);
+ path_trace_->set_progress(&progress);
+ path_trace_->tile_buffer_update_cb = [&]() {
+ if (!update_render_tile_cb) {
+ return;
+ }
+ update_render_tile_cb();
+ };
+ path_trace_->tile_buffer_write_cb = [&]() {
+ if (!write_render_tile_cb) {
+ return;
+ }
+ write_render_tile_cb();
+ };
+ path_trace_->tile_buffer_read_cb = [&]() -> bool {
+ if (!read_render_tile_cb) {
+ return false;
+ }
+ read_render_tile_cb();
+ return true;
+ };
+ path_trace_->progress_update_cb = [&]() { update_status_time(); };
- /* Create buffers for interactive rendering. */
- if (!(params.background && !params.write_render_cb)) {
- buffers = new RenderBuffers(device);
- display = new DisplayBuffer(device, params.display_buffer_linear);
- }
+ tile_manager_.full_buffer_written_cb = [&](string_view filename) {
+ if (!full_buffer_written_cb) {
+ return;
+ }
+ full_buffer_written_cb(filename);
+ };
}
Session::~Session()
{
cancel();
+ /* TODO(sergey): Bring the passes in viewport back.
+ * It is unclear why such an exception is needed though. */
+#if 0
if (buffers && params.write_render_cb) {
/* Copy to display buffer and write out image if requested */
delete display;
@@ -116,12 +114,14 @@ Session::~Session()
uchar4 *pixels = display->rgba_byte.copy_from_device(0, w, h);
params.write_render_cb((uchar *)pixels, w, h, 4);
}
+#endif
- /* clean up */
- tile_manager.device_free();
+ /* Make sure the path tracer is destroyed before the device, because its destruction might
+ * need to access the device to free device memory. */
+ /* TODO(sergey): Convert device to be unique_ptr, and rely on C++ to destruct objects in the
+ * pre-defined order. */
+ path_trace_.reset();
- delete buffers;
- delete display;
delete scene;
delete device;
@@ -135,15 +135,16 @@ void Session::start()
}
}
-void Session::cancel()
+void Session::cancel(bool quick)
{
+ if (quick && path_trace_) {
+ path_trace_->cancel();
+ }
+
if (session_thread_) {
/* wait for session thread to end */
progress.set_cancel("Exiting");
- gpu_need_display_buffer_update_ = false;
- gpu_need_display_buffer_update_cond_.notify_all();
-
{
thread_scoped_lock pause_lock(pause_mutex_);
pause_ = false;
@@ -157,570 +158,43 @@ void Session::cancel()
bool Session::ready_to_reset()
{
- double dt = time_dt() - reset_time_;
-
- if (!display_outdated_)
- return (dt > params.reset_timeout);
- else
- return (dt > params.cancel_timeout);
+ return path_trace_->ready_to_reset();
}
-/* GPU Session */
-
-void Session::reset_gpu(BufferParams &buffer_params, int samples)
+void Session::run_main_render_loop()
{
- thread_scoped_lock pause_lock(pause_mutex_);
-
- /* block for buffer access and reset immediately. we can't do this
- * in the thread, because we need to allocate an OpenGL buffer, and
- * that only works in the main thread */
- thread_scoped_lock display_lock(display_mutex_);
- thread_scoped_lock buffers_lock(buffers_mutex_);
+ path_trace_->clear_gpu_display();
- display_outdated_ = true;
- reset_time_ = time_dt();
+ while (true) {
+ RenderWork render_work = run_update_for_next_iteration();
- reset_(buffer_params, samples);
-
- gpu_need_display_buffer_update_ = false;
- gpu_need_display_buffer_update_cond_.notify_all();
-
- new_work_added_ = true;
-
- pause_cond_.notify_all();
-}
-
-bool Session::draw_gpu(BufferParams &buffer_params, DeviceDrawParams &draw_params)
-{
- /* block for buffer access */
- thread_scoped_lock display_lock(display_mutex_);
-
- /* first check we already rendered something */
- if (gpu_draw_ready_) {
- /* then verify the buffers have the expected size, so we don't
- * draw previous results in a resized window */
- if (buffer_params.width == display->params.width &&
- buffer_params.height == display->params.height) {
- /* for CUDA we need to do tone-mapping still, since we can
- * only access GL buffers from the main thread. */
- if (gpu_need_display_buffer_update_) {
- thread_scoped_lock buffers_lock(buffers_mutex_);
- copy_to_display_buffer(tile_manager.state.sample);
- gpu_need_display_buffer_update_ = false;
- gpu_need_display_buffer_update_cond_.notify_all();
+ if (!render_work) {
+ if (VLOG_IS_ON(2)) {
+ double total_time, render_time;
+ progress.get_time(total_time, render_time);
+ VLOG(2) << "Rendering in main loop is done in " << render_time << " seconds.";
+ VLOG(2) << path_trace_->full_report();
}
- display->draw(device, draw_params);
-
- if (display_outdated_ && (time_dt() - reset_time_) > params.text_timeout)
- return false;
-
- return true;
- }
- }
-
- return false;
-}
-
-void Session::run_gpu()
-{
- bool tiles_written = false;
-
- reset_time_ = time_dt();
- last_update_time_ = time_dt();
- last_display_time_ = last_update_time_;
-
- progress.set_render_start_time();
-
- while (!progress.get_cancel()) {
- const bool no_tiles = !run_update_for_next_iteration();
-
- if (no_tiles) {
if (params.background) {
- /* if no work left and in background mode, we can stop immediately */
+ /* if no work left and in background mode, we can stop immediately. */
progress.set_status("Finished");
break;
}
}
- if (run_wait_for_work(no_tiles)) {
- continue;
- }
-
- if (progress.get_cancel()) {
- break;
- }
-
- if (!no_tiles) {
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
-
- if (progress.get_cancel())
- break;
-
- /* buffers mutex is locked entirely while rendering each
- * sample, and released/reacquired on each iteration to allow
- * reset and draw in between */
- thread_scoped_lock buffers_lock(buffers_mutex_);
-
- /* update status and timing */
- update_status_time();
-
- /* render */
- bool delayed_denoise = false;
- const bool need_denoise = render_need_denoise(delayed_denoise);
- render(need_denoise);
-
- device->task_wait();
-
- if (!device->error_message().empty())
- progress.set_cancel(device->error_message());
-
- /* update status and timing */
- update_status_time();
-
- gpu_need_display_buffer_update_ = !delayed_denoise;
- gpu_draw_ready_ = true;
- progress.set_update();
-
- /* wait for until display buffer is updated */
- if (!params.background) {
- while (gpu_need_display_buffer_update_) {
- if (progress.get_cancel())
- break;
-
- gpu_need_display_buffer_update_cond_.wait(buffers_lock);
- }
- }
-
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
-
- tiles_written = update_progressive_refine(progress.get_cancel());
-
- if (progress.get_cancel())
- break;
- }
- }
-
- if (!tiles_written)
- update_progressive_refine(true);
-}
-
-/* CPU Session */
-
-void Session::reset_cpu(BufferParams &buffer_params, int samples)
-{
- thread_scoped_lock reset_lock(delayed_reset_.mutex);
- thread_scoped_lock pause_lock(pause_mutex_);
-
- display_outdated_ = true;
- reset_time_ = time_dt();
-
- delayed_reset_.params = buffer_params;
- delayed_reset_.samples = samples;
- delayed_reset_.do_reset = true;
- device->task_cancel();
-
- pause_cond_.notify_all();
-}
-
-bool Session::draw_cpu(BufferParams &buffer_params, DeviceDrawParams &draw_params)
-{
- thread_scoped_lock display_lock(display_mutex_);
-
- /* first check we already rendered something */
- if (display->draw_ready()) {
- /* then verify the buffers have the expected size, so we don't
- * draw previous results in a resized window */
- if (buffer_params.width == display->params.width &&
- buffer_params.height == display->params.height) {
- display->draw(device, draw_params);
-
- if (display_outdated_ && (time_dt() - reset_time_) > params.text_timeout)
- return false;
-
- return true;
- }
- }
-
- return false;
-}
-
-bool Session::steal_tile(RenderTile &rtile, Device *tile_device, thread_scoped_lock &tile_lock)
-{
- /* Devices that can get their tiles stolen don't steal tiles themselves.
- * Additionally, if there are no stealable tiles in flight, give up here. */
- if (tile_device->info.type == DEVICE_CPU || stealable_tiles_ == 0) {
- return false;
- }
-
- /* Wait until no other thread is trying to steal a tile. */
- while (tile_stealing_state_ != NOT_STEALING && stealable_tiles_ > 0) {
- /* Someone else is currently trying to get a tile.
- * Wait on the condition variable and try later. */
- tile_steal_cond_.wait(tile_lock);
- }
- /* If another thread stole the last stealable tile in the meantime, give up. */
- if (stealable_tiles_ == 0) {
- return false;
- }
-
- /* There are stealable tiles in flight, so signal that one should be released. */
- tile_stealing_state_ = WAITING_FOR_TILE;
-
- /* Wait until a device notices the signal and releases its tile. */
- while (tile_stealing_state_ != GOT_TILE && stealable_tiles_ > 0) {
- tile_steal_cond_.wait(tile_lock);
- }
- /* If the last stealable tile finished on its own, give up. */
- if (tile_stealing_state_ != GOT_TILE) {
- tile_stealing_state_ = NOT_STEALING;
- return false;
- }
-
- /* Successfully stole a tile, now move it to the new device. */
- rtile = stolen_tile_;
- rtile.buffers->buffer.move_device(tile_device);
- rtile.buffer = rtile.buffers->buffer.device_pointer;
- rtile.stealing_state = RenderTile::NO_STEALING;
- rtile.num_samples -= (rtile.sample - rtile.start_sample);
- rtile.start_sample = rtile.sample;
-
- tile_stealing_state_ = NOT_STEALING;
-
- /* Poke any threads which might be waiting for NOT_STEALING above. */
- tile_steal_cond_.notify_one();
-
- return true;
-}
-
-bool Session::get_tile_stolen()
-{
- /* If tile_stealing_state is WAITING_FOR_TILE, atomically set it to RELEASING_TILE
- * and return true. */
- TileStealingState expected = WAITING_FOR_TILE;
- return tile_stealing_state_.compare_exchange_weak(expected, RELEASING_TILE);
-}
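The handoff above hinges on compare_exchange: the state is swapped to RELEASING_TILE only if it still reads WAITING_FOR_TILE, so exactly one device wins the transition even when several call get_tile_stolen() concurrently. A minimal sketch of the same idiom in standard C++, with hypothetical names:

  #include <atomic>

  enum State { IDLE, REQUESTED, SERVING };
  std::atomic<State> state{IDLE};

  /* Only one caller can observe REQUESTED and move the state to SERVING;
   * the others see `expected` overwritten with the current value and return false. */
  bool try_begin_serving()
  {
    State expected = REQUESTED;
    return state.compare_exchange_strong(expected, SERVING);
  }

The weak variant used here may additionally fail spuriously, which is harmless in a polling context: the device simply re-checks on its next call.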
-
-bool Session::acquire_tile(RenderTile &rtile, Device *tile_device, uint tile_types)
-{
- if (progress.get_cancel()) {
- if (params.progressive_refine == false) {
- /* for progressive refine current sample should be finished for all tiles */
- return false;
- }
- }
-
- thread_scoped_lock tile_lock(tile_mutex_);
-
- /* get next tile from manager */
- Tile *tile;
- int device_num = device->device_number(tile_device);
-
- while (!tile_manager.next_tile(tile, device_num, tile_types)) {
- /* Can only steal tiles on devices that support rendering
- * This is because denoising tiles cannot be stolen (see below)
- */
- if ((tile_types & (RenderTile::PATH_TRACE | RenderTile::BAKE)) &&
- steal_tile(rtile, tile_device, tile_lock)) {
- return true;
- }
-
- /* Wait for denoising tiles to become available */
- if ((tile_types & RenderTile::DENOISE) && !progress.get_cancel() && tile_manager.has_tiles()) {
- denoising_cond_.wait(tile_lock);
- continue;
- }
-
- return false;
- }
-
- /* fill render tile */
- rtile.x = tile_manager.state.buffer.full_x + tile->x;
- rtile.y = tile_manager.state.buffer.full_y + tile->y;
- rtile.w = tile->w;
- rtile.h = tile->h;
- rtile.start_sample = tile_manager.state.sample;
- rtile.num_samples = tile_manager.state.num_samples;
- rtile.resolution = tile_manager.state.resolution_divider;
- rtile.tile_index = tile->index;
- rtile.stealing_state = RenderTile::NO_STEALING;
-
- if (tile->state == Tile::DENOISE) {
- rtile.task = RenderTile::DENOISE;
- }
- else {
- if (tile_device->info.type == DEVICE_CPU) {
- stealable_tiles_++;
- rtile.stealing_state = RenderTile::CAN_BE_STOLEN;
- }
-
- if (read_bake_tile_cb) {
- rtile.task = RenderTile::BAKE;
- }
- else {
- rtile.task = RenderTile::PATH_TRACE;
- }
- }
-
- tile_lock.unlock();
-
- /* in case of a permanent buffer, return it, otherwise we will allocate
- * a new temporary buffer */
- if (buffers) {
- tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride);
-
- rtile.buffer = buffers->buffer.device_pointer;
- rtile.buffers = buffers;
-
- device->map_tile(tile_device, rtile);
-
- /* Reset copy state, since buffer contents change after the tile was acquired */
- buffers->map_neighbor_copied = false;
-
- /* This hack ensures that the copy in 'MultiDevice::map_neighbor_tiles' accounts
- * for the buffer resolution divider. */
- buffers->buffer.data_width = (buffers->params.width * buffers->params.get_passes_size()) /
- tile_manager.state.resolution_divider;
- buffers->buffer.data_height = buffers->params.height / tile_manager.state.resolution_divider;
-
- return true;
- }
-
- if (tile->buffers == NULL) {
- /* fill buffer parameters */
- BufferParams buffer_params = tile_manager.params;
- buffer_params.full_x = rtile.x;
- buffer_params.full_y = rtile.y;
- buffer_params.width = rtile.w;
- buffer_params.height = rtile.h;
-
- /* allocate buffers */
- tile->buffers = new RenderBuffers(tile_device);
- tile->buffers->reset(buffer_params);
- }
- else if (tile->buffers->buffer.device != tile_device) {
- /* Move buffer to current tile device again in case it was stolen before.
- * Not needed for denoising since that already handles mapping of tiles and
- * neighbors to its own device. */
- if (rtile.task != RenderTile::DENOISE) {
- tile->buffers->buffer.move_device(tile_device);
- }
- }
-
- tile->buffers->map_neighbor_copied = false;
-
- tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride);
-
- rtile.buffer = tile->buffers->buffer.device_pointer;
- rtile.buffers = tile->buffers;
- rtile.sample = tile_manager.state.sample;
-
- if (read_bake_tile_cb) {
- /* This will read any passes needed as input for baking. */
- if (tile_manager.state.sample == tile_manager.range_start_sample) {
- {
- thread_scoped_lock tile_lock(tile_mutex_);
- read_bake_tile_cb(rtile);
- }
- rtile.buffers->buffer.copy_to_device();
- }
- }
- else {
- /* This will tag tile as IN PROGRESS in blender-side render pipeline,
- * which is needed to highlight currently rendering tile before first
- * sample was processed for it. */
- update_tile_sample(rtile);
- }
-
- return true;
-}
-
-void Session::update_tile_sample(RenderTile &rtile)
-{
- thread_scoped_lock tile_lock(tile_mutex_);
-
- if (update_render_tile_cb) {
- if (params.progressive_refine == false) {
- /* todo: optimize this by making it thread safe and removing lock */
-
- update_render_tile_cb(rtile, true);
- }
- }
-
- update_status_time();
-}
-
-void Session::release_tile(RenderTile &rtile, const bool need_denoise)
-{
- thread_scoped_lock tile_lock(tile_mutex_);
-
- if (rtile.stealing_state != RenderTile::NO_STEALING) {
- stealable_tiles_--;
- if (rtile.stealing_state == RenderTile::WAS_STOLEN) {
- /* If the tile is being stolen, don't release it here - the new device will pick up where
- * the old one left off. */
-
- assert(tile_stealing_state_ == RELEASING_TILE);
- assert(rtile.sample < rtile.start_sample + rtile.num_samples);
-
- tile_stealing_state_ = GOT_TILE;
- stolen_tile_ = rtile;
- tile_steal_cond_.notify_all();
- return;
- }
- else if (stealable_tiles_ == 0) {
- /* If this was the last stealable tile, wake up any threads still waiting for one. */
- tile_steal_cond_.notify_all();
- }
- }
-
- progress.add_finished_tile(rtile.task == RenderTile::DENOISE);
-
- bool delete_tile;
-
- if (tile_manager.finish_tile(rtile.tile_index, need_denoise, delete_tile)) {
- /* Finished tile pixels write. */
- if (write_render_tile_cb && params.progressive_refine == false) {
- write_render_tile_cb(rtile);
- }
-
- if (delete_tile) {
- delete rtile.buffers;
- tile_manager.state.tiles[rtile.tile_index].buffers = NULL;
- }
- }
- else {
- /* In progress tile pixels update. */
- if (update_render_tile_cb && params.progressive_refine == false) {
- update_render_tile_cb(rtile, false);
- }
- }
-
- update_status_time();
-
- /* Notify denoising thread that a tile was finished. */
- denoising_cond_.notify_all();
-}
-
-void Session::map_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device)
-{
- thread_scoped_lock tile_lock(tile_mutex_);
-
- const int4 image_region = make_int4(
- tile_manager.state.buffer.full_x,
- tile_manager.state.buffer.full_y,
- tile_manager.state.buffer.full_x + tile_manager.state.buffer.width,
- tile_manager.state.buffer.full_y + tile_manager.state.buffer.height);
-
- RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
-
- if (!tile_manager.schedule_denoising) {
- /* Fix up tile slices with overlap. */
- if (tile_manager.slice_overlap != 0) {
- int y = max(center_tile.y - tile_manager.slice_overlap, image_region.y);
- center_tile.h = min(center_tile.y + center_tile.h + tile_manager.slice_overlap,
- image_region.w) -
- y;
- center_tile.y = y;
- }
-
- /* Tiles are not being denoised individually, which means the entire image is processed. */
- neighbors.set_bounds_from_center();
- }
- else {
- int center_idx = center_tile.tile_index;
- assert(tile_manager.state.tiles[center_idx].state == Tile::DENOISE);
-
- for (int dy = -1, i = 0; dy <= 1; dy++) {
- for (int dx = -1; dx <= 1; dx++, i++) {
- RenderTile &rtile = neighbors.tiles[i];
- int nindex = tile_manager.get_neighbor_index(center_idx, i);
- if (nindex >= 0) {
- Tile *tile = &tile_manager.state.tiles[nindex];
-
- rtile.x = image_region.x + tile->x;
- rtile.y = image_region.y + tile->y;
- rtile.w = tile->w;
- rtile.h = tile->h;
-
- if (buffers) {
- tile_manager.state.buffer.get_offset_stride(rtile.offset, rtile.stride);
-
- rtile.buffer = buffers->buffer.device_pointer;
- rtile.buffers = buffers;
- }
- else {
- assert(tile->buffers);
- tile->buffers->params.get_offset_stride(rtile.offset, rtile.stride);
-
- rtile.buffer = tile->buffers->buffer.device_pointer;
- rtile.buffers = tile->buffers;
- }
- }
- else {
- int px = center_tile.x + dx * params.tile_size.x;
- int py = center_tile.y + dy * params.tile_size.y;
-
- rtile.x = clamp(px, image_region.x, image_region.z);
- rtile.y = clamp(py, image_region.y, image_region.w);
- rtile.w = rtile.h = 0;
-
- rtile.buffer = (device_ptr)NULL;
- rtile.buffers = NULL;
- }
- }
- }
- }
-
- assert(center_tile.buffers);
- device->map_neighbor_tiles(tile_device, neighbors);
-
- /* The denoised result is written back to the original tile. */
- neighbors.target = center_tile;
-}
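The double loop above walks the 3x3 neighborhood in row-major order, so the flat tile index i and the offset (dx, dy) are related by a simple formula; the center tile lands at index 4, which is what RenderTileNeighbors::CENTER denotes. As a sketch:

  /* Flat index of a neighbor at offset (dx, dy), each in {-1, 0, 1}. */
  inline int neighbor_flat_index(int dx, int dy)
  {
    return (dy + 1) * 3 + (dx + 1); /* 0..8, center == 4. */
  }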
-
-void Session::unmap_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device)
-{
- thread_scoped_lock tile_lock(tile_mutex_);
- device->unmap_neighbor_tiles(tile_device, neighbors);
-}
-
-void Session::run_cpu()
-{
- bool tiles_written = false;
-
- last_update_time_ = time_dt();
- last_display_time_ = last_update_time_;
-
- while (!progress.get_cancel()) {
- const bool no_tiles = !run_update_for_next_iteration();
- bool need_copy_to_display_buffer = false;
-
- if (no_tiles) {
- if (params.background) {
- /* if no work left and in background mode, we can stop immediately */
- progress.set_status("Finished");
+ const bool did_cancel = progress.get_cancel();
+ if (did_cancel) {
+ render_scheduler_.render_work_reschedule_on_cancel(render_work);
+ if (!render_work) {
break;
}
}
-
- if (run_wait_for_work(no_tiles)) {
+ else if (run_wait_for_work(render_work)) {
continue;
}
- if (progress.get_cancel()) {
- break;
- }
-
- if (!no_tiles) {
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
-
- if (progress.get_cancel())
- break;
-
+ {
/* buffers mutex is locked entirely while rendering each
* sample, and released/reacquired on each iteration to allow
* reset and draw in between */
@@ -730,49 +204,25 @@ void Session::run_cpu()
update_status_time();
/* render */
- bool delayed_denoise = false;
- const bool need_denoise = render_need_denoise(delayed_denoise);
- render(need_denoise);
+ path_trace_->render(render_work);
/* update status and timing */
update_status_time();
- if (!params.background)
- need_copy_to_display_buffer = !delayed_denoise;
-
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
- }
-
- device->task_wait();
-
- {
- thread_scoped_lock reset_lock(delayed_reset_.mutex);
- thread_scoped_lock buffers_lock(buffers_mutex_);
- thread_scoped_lock display_lock(display_mutex_);
-
- if (delayed_reset_.do_reset) {
- /* reset rendering if request from main thread */
- delayed_reset_.do_reset = false;
- reset_(delayed_reset_.params, delayed_reset_.samples);
- }
- else if (need_copy_to_display_buffer) {
- /* Only copy to display_buffer if we do not reset, we don't
- * want to show the result of an incomplete sample */
- copy_to_display_buffer(tile_manager.state.sample);
+ if (device->have_error()) {
+ const string &error_message = device->error_message();
+ progress.set_error(error_message);
+ progress.set_cancel(error_message);
+ break;
}
-
- if (!device->error_message().empty())
- progress.set_error(device->error_message());
-
- tiles_written = update_progressive_refine(progress.get_cancel());
}
progress.set_update();
- }
- if (!tiles_written)
- update_progressive_refine(true);
+ if (did_cancel) {
+ break;
+ }
+ }
}
void Session::run()
@@ -789,10 +239,7 @@ void Session::run()
/* reset number of rendered samples */
progress.reset_sample();
- if (device_use_gl_)
- run_gpu();
- else
- run_cpu();
+ run_main_render_loop();
}
profiler.stop();
@@ -804,31 +251,92 @@ void Session::run()
progress.set_update();
}
-bool Session::run_update_for_next_iteration()
+RenderWork Session::run_update_for_next_iteration()
{
+ RenderWork render_work;
+
thread_scoped_lock scene_lock(scene->mutex);
thread_scoped_lock reset_lock(delayed_reset_.mutex);
+ bool have_tiles = true;
+ bool switched_to_new_tile = false;
+
if (delayed_reset_.do_reset) {
thread_scoped_lock buffers_lock(buffers_mutex_);
- reset_(delayed_reset_.params, delayed_reset_.samples);
- delayed_reset_.do_reset = false;
+ do_delayed_reset();
+
+ /* After reset make sure the tile manager is at the first big tile. */
+ have_tiles = tile_manager_.next();
+ switched_to_new_tile = true;
+ }
+
+ /* Update number of samples in the integrator.
+   * Ideally this would happen once in `Session::set_samples()`, but that would miss the initial
+   * configuration when the Session is created and `set_samples()` is never called. */
+ scene->integrator->set_aa_samples(params.samples);
+
+ /* Update denoiser settings. */
+ {
+ const DenoiseParams denoise_params = scene->integrator->get_denoise_params();
+ path_trace_->set_denoiser_params(denoise_params);
+ }
+
+ /* Update adaptive sampling. */
+ {
+ const AdaptiveSampling adaptive_sampling = scene->integrator->get_adaptive_sampling();
+ path_trace_->set_adaptive_sampling(adaptive_sampling);
}
- const bool have_tiles = tile_manager.next();
+ render_scheduler_.set_num_samples(params.samples);
+ render_scheduler_.set_time_limit(params.time_limit);
+
+ while (have_tiles) {
+ render_work = render_scheduler_.get_render_work();
+ if (render_work) {
+ break;
+ }
- if (have_tiles) {
+ progress.add_finished_tile(false);
+
+ have_tiles = tile_manager_.next();
+ if (have_tiles) {
+ render_scheduler_.reset_for_next_tile();
+ switched_to_new_tile = true;
+ }
+ }
+
+ if (render_work) {
scoped_timer update_timer;
- if (update_scene()) {
+
+ if (switched_to_new_tile) {
+ BufferParams tile_params = buffer_params_;
+
+ const Tile &tile = tile_manager_.get_current_tile();
+ tile_params.width = tile.width;
+ tile_params.height = tile.height;
+ tile_params.full_x = tile.x + buffer_params_.full_x;
+ tile_params.full_y = tile.y + buffer_params_.full_y;
+ tile_params.full_width = buffer_params_.full_width;
+ tile_params.full_height = buffer_params_.full_height;
+ tile_params.update_offset_stride();
+
+ path_trace_->reset(buffer_params_, tile_params);
+ }
+
+ const int resolution = render_work.resolution_divider;
+ const int width = max(1, buffer_params_.full_width / resolution);
+ const int height = max(1, buffer_params_.full_height / resolution);
+
+ if (update_scene(width, height)) {
profiler.reset(scene->shaders.size(), scene->objects.size());
}
progress.add_skip_time(update_timer, params.background);
}
- return have_tiles;
+ return render_work;
}
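Note that render_work is tested in boolean context throughout the new loop, which implies RenderWork converts to bool by reporting whether any work was actually scheduled. A minimal sketch of such a conversion, with illustrative fields rather than the actual struct layout:

  struct RenderWork {
    int resolution_divider = 1;
    struct { int start_sample = 0; int num_samples = 0; } path_trace;
    bool denoise = false;
    bool update_display = false;

    /* True when this iteration has anything left to do. */
    explicit operator bool() const
    {
      return path_trace.num_samples != 0 || denoise || update_display;
    }
  };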
-bool Session::run_wait_for_work(bool no_tiles)
+bool Session::run_wait_for_work(const RenderWork &render_work)
{
/* In an offline rendering there is no pause, and no tiles will mean the job is fully done. */
if (params.background) {
@@ -837,19 +345,20 @@ bool Session::run_wait_for_work(bool no_tiles)
thread_scoped_lock pause_lock(pause_mutex_);
- if (!pause_ && !no_tiles) {
+ if (!pause_ && render_work) {
/* Rendering is not paused and there is work to be done. No need to wait for anything. */
return false;
}
- update_status_time(pause_, no_tiles);
+ const bool no_work = !render_work;
+ update_status_time(pause_, no_work);
/* Only leave the loop when rendering is not paused. Even if the current render is un-paused,
 * keep waiting until new work is added when there is nothing to render. */
while (!cancel_) {
scoped_timer pause_timer;
- if (!pause_ && (!no_tiles || new_work_added_ || delayed_reset_.do_reset)) {
+ if (!pause_ && (render_work || new_work_added_ || delayed_reset_.do_reset)) {
break;
}
@@ -860,52 +369,89 @@ bool Session::run_wait_for_work(bool no_tiles)
progress.add_skip_time(pause_timer, params.background);
}
- update_status_time(pause_, no_tiles);
+ update_status_time(pause_, no_work);
progress.set_update();
}
new_work_added_ = false;
- return no_tiles;
+ return no_work;
}
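The wait above is the standard wait-until-predicate idiom: the condition is re-evaluated after every wake-up, since condition variables may wake spuriously and the pause flag can be toggled repeatedly. Reduced to standard C++ with hypothetical names:

  #include <condition_variable>
  #include <mutex>

  std::mutex pause_mutex;
  std::condition_variable pause_cond;
  bool paused = false;
  bool has_work = false;

  void wait_until_runnable()
  {
    std::unique_lock<std::mutex> lock(pause_mutex);
    /* The predicate guards against spurious wake-ups. */
    pause_cond.wait(lock, [] { return !paused && has_work; });
  }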
-bool Session::draw(BufferParams &buffer_params, DeviceDrawParams &draw_params)
+void Session::draw()
{
- if (device_use_gl_)
- return draw_gpu(buffer_params, draw_params);
- else
- return draw_cpu(buffer_params, draw_params);
+ path_trace_->draw();
}
-void Session::reset_(BufferParams &buffer_params, int samples)
+int2 Session::get_effective_tile_size() const
{
- if (buffers && buffer_params.modified(tile_manager.params)) {
- gpu_draw_ready_ = false;
- buffers->reset(buffer_params);
- if (display) {
- display->reset(buffer_params);
- }
+ /* No support yet for baking with tiles. */
+ if (!params.use_auto_tile || scene->bake_manager->get_baking()) {
+ return make_int2(buffer_params_.width, buffer_params_.height);
}
- tile_manager.reset(buffer_params, samples);
- stealable_tiles_ = 0;
- tile_stealing_state_ = NOT_STEALING;
- progress.reset_sample();
+ /* TODO(sergey): Take available memory into account, and if there is enough memory do not tile
+ * and prefer optimal performance. */
+
+ return make_int2(params.tile_size, params.tile_size);
+}
+
+void Session::do_delayed_reset()
+{
+ if (!delayed_reset_.do_reset) {
+ return;
+ }
+ delayed_reset_.do_reset = false;
+
+ params = delayed_reset_.session_params;
+ buffer_params_ = delayed_reset_.buffer_params;
+
+ /* Store parameters used for buffers access outside of scene graph. */
+ buffer_params_.samples = params.samples;
+ buffer_params_.exposure = scene->film->get_exposure();
+ buffer_params_.use_approximate_shadow_catcher =
+ scene->film->get_use_approximate_shadow_catcher();
+ buffer_params_.use_transparent_background = scene->background->get_transparent();
- bool show_progress = params.background || tile_manager.get_num_effective_samples() != INT_MAX;
- progress.set_total_pixel_samples(show_progress ? tile_manager.state.total_pixel_samples : 0);
+ /* Tile and work scheduling. */
+ tile_manager_.reset_scheduling(buffer_params_, get_effective_tile_size());
+ render_scheduler_.reset(buffer_params_, params.samples);
- if (!params.background)
+ /* Passes. */
+  /* When multiple tiles are used, the SAMPLE_COUNT pass is used to keep track of possible
+   * partial tile results. It is safe to use the generic update function here, which only checks
+   * for changes, because a change in tile settings re-creates the session, which ensures the
+   * film is fully updated on tile changes. */
+ scene->film->update_passes(scene, tile_manager_.has_multiple_tiles());
+
+ /* Update for new state of scene and passes. */
+ buffer_params_.update_passes(scene->passes);
+ tile_manager_.update(buffer_params_, scene);
+
+ /* Progress. */
+ progress.reset_sample();
+ progress.set_total_pixel_samples(buffer_params_.width * buffer_params_.height * params.samples);
+
+ if (!params.background) {
progress.set_start_time();
+ }
progress.set_render_start_time();
}
-void Session::reset(BufferParams &buffer_params, int samples)
+void Session::reset(const SessionParams &session_params, const BufferParams &buffer_params)
{
- if (device_use_gl_)
- reset_gpu(buffer_params, samples);
- else
- reset_cpu(buffer_params, samples);
+ {
+ thread_scoped_lock reset_lock(delayed_reset_.mutex);
+ thread_scoped_lock pause_lock(pause_mutex_);
+
+ delayed_reset_.do_reset = true;
+ delayed_reset_.session_params = session_params;
+ delayed_reset_.buffer_params = buffer_params;
+
+ path_trace_->cancel();
+ }
+
+ pause_cond_.notify_all();
}
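The reset is deliberately asynchronous: the caller only records the request under the locks, cancels in-flight path tracing and wakes the render thread, which applies the new parameters at a safe point via do_delayed_reset(). The handshake, stripped to its essentials (simplified, hypothetical types):

  #include <condition_variable>
  #include <mutex>

  struct Params { int samples = 0; };

  struct DelayedRequest {
    std::mutex mutex;
    bool pending = false;
    Params params;
  };

  /* Producer side: publish the request, then wake the consumer, which
   * checks `pending` at a safe point in its own loop. */
  void request_reset(DelayedRequest &req, const Params &p,
                     std::condition_variable &wake_cond)
  {
    {
      std::lock_guard<std::mutex> lock(req.mutex);
      req.pending = true;
      req.params = p;
    }
    wake_cond.notify_all();
  }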
void Session::set_samples(int samples)
@@ -915,7 +461,22 @@ void Session::set_samples(int samples)
}
params.samples = samples;
- tile_manager.set_samples(samples);
+
+ {
+ thread_scoped_lock pause_lock(pause_mutex_);
+ new_work_added_ = true;
+ }
+
+ pause_cond_.notify_all();
+}
+
+void Session::set_time_limit(double time_limit)
+{
+ if (time_limit == params.time_limit) {
+ return;
+ }
+
+ params.time_limit = time_limit;
{
thread_scoped_lock pause_lock(pause_mutex_);
@@ -948,38 +509,9 @@ void Session::set_pause(bool pause)
}
}
-void Session::set_denoising(const DenoiseParams &denoising)
+void Session::set_gpu_display(unique_ptr<GPUDisplay> gpu_display)
{
- bool need_denoise = denoising.need_denoising_task();
-
- /* Lock buffers so no denoising operation is triggered while the settings are changed here. */
- thread_scoped_lock buffers_lock(buffers_mutex_);
- params.denoising = denoising;
-
- if (!(params.device.denoisers & denoising.type)) {
- if (need_denoise) {
- progress.set_error("Denoiser type not supported by compute device");
- }
-
- params.denoising.use = false;
- need_denoise = false;
- }
-
- // TODO(pmours): Query the required overlap value for denoising from the device?
- tile_manager.slice_overlap = need_denoise && !params.background ? 64 : 0;
-
- /* Schedule per tile denoising for final renders if we are either denoising or
- * need prefiltered passes for the native denoiser. */
- tile_manager.schedule_denoising = need_denoise && !buffers;
-}
-
-void Session::set_denoising_start_sample(int sample)
-{
- if (sample != params.denoising.start_sample) {
- params.denoising.start_sample = sample;
-
- pause_cond_.notify_all();
- }
+ path_trace_->set_gpu_display(move(gpu_display));
}
void Session::wait()
@@ -989,81 +521,67 @@ void Session::wait()
delete session_thread_;
}
- session_thread_ = NULL;
+ session_thread_ = nullptr;
}
-bool Session::update_scene()
+bool Session::update_scene(int width, int height)
{
- /* update camera if dimensions changed for progressive render. the camera
+  /* Update camera if dimensions changed for progressive render. The camera
* knows nothing about progressive or cropped rendering, it just gets the
- * image dimensions passed in */
+ * image dimensions passed in. */
Camera *cam = scene->camera;
- int width = tile_manager.state.buffer.full_width;
- int height = tile_manager.state.buffer.full_height;
- int resolution = tile_manager.state.resolution_divider;
-
- cam->set_screen_size_and_resolution(width, height, resolution);
+ cam->set_screen_size(width, height);
- /* number of samples is needed by multi jittered
- * sampling pattern and by baking */
- Integrator *integrator = scene->integrator;
- BakeManager *bake_manager = scene->bake_manager;
+ /* First detect which kernel features are used and allocate working memory.
+   * This helps estimate how much device memory is available for the scene and
+ * how much we need to allocate on the host instead. */
+ scene->update_kernel_features();
- if (integrator->get_sampling_pattern() != SAMPLING_PATTERN_SOBOL || bake_manager->get_baking()) {
- integrator->set_aa_samples(tile_manager.num_samples);
- }
+ path_trace_->load_kernels();
+ path_trace_->alloc_work_memory();
- bool kernel_switch_needed = false;
- if (scene->update(progress, kernel_switch_needed)) {
- if (kernel_switch_needed) {
- reset(tile_manager.params, params.samples);
- }
+ if (scene->update(progress)) {
return true;
}
+
return false;
}
+static string status_append(const string &status, const string &suffix)
+{
+ string prefix = status;
+ if (!prefix.empty()) {
+ prefix += ", ";
+ }
+ return prefix + suffix;
+}
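status_append() joins non-empty fragments with a comma separator, which keeps the substatus assembly below uniform regardless of which parts apply. For example:

  string s;
  s = status_append(s, "Rendered 3/9 Tiles"); /* "Rendered 3/9 Tiles" */
  s = status_append(s, "Sample 12/1024");     /* "Rendered 3/9 Tiles, Sample 12/1024" */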
+
void Session::update_status_time(bool show_pause, bool show_done)
{
- int progressive_sample = tile_manager.state.sample;
- int num_samples = tile_manager.get_num_effective_samples();
+ string status, substatus;
- int tile = progress.get_rendered_tiles();
- int num_tiles = tile_manager.state.num_tiles;
+ const int current_tile = progress.get_rendered_tiles();
+ const int num_tiles = tile_manager_.get_num_tiles();
- /* update status */
- string status, substatus;
+ const int current_sample = progress.get_current_sample();
+ const int num_samples = render_scheduler_.get_num_samples();
- if (!params.progressive) {
- const bool is_cpu = params.device.type == DEVICE_CPU;
- const bool rendering_finished = (tile == num_tiles);
- const bool is_last_tile = (tile + 1) == num_tiles;
-
- substatus = string_printf("Rendered %d/%d Tiles", tile, num_tiles);
-
- if (!rendering_finished && (device->show_samples() || (is_cpu && is_last_tile))) {
- /* Some devices automatically support showing the sample number:
- * - CUDADevice
- * - OpenCLDevice when using the megakernel (the split kernel renders multiple
- * samples at the same time, so the current sample isn't really defined)
- * - CPUDevice when using one thread
- * For these devices, the current sample is always shown.
- *
- * The other option is when the last tile is currently being rendered by the CPU.
- */
- substatus += string_printf(", Sample %d/%d", progress.get_current_sample(), num_samples);
- }
- if (params.denoising.use && params.denoising.type != DENOISER_OPENIMAGEDENOISE) {
- substatus += string_printf(", Denoised %d tiles", progress.get_denoised_tiles());
- }
- else if (params.denoising.store_passes && params.denoising.type == DENOISER_NLM) {
- substatus += string_printf(", Prefiltered %d tiles", progress.get_denoised_tiles());
- }
+  /* Tile. */
+ if (tile_manager_.has_multiple_tiles()) {
+ substatus = status_append(substatus,
+ string_printf("Rendered %d/%d Tiles", current_tile, num_tiles));
}
- else if (tile_manager.num_samples == Integrator::MAX_SAMPLES)
- substatus = string_printf("Path Tracing Sample %d", progressive_sample + 1);
- else
- substatus = string_printf("Path Tracing Sample %d/%d", progressive_sample + 1, num_samples);
+
+ /* Sample. */
+ if (num_samples == Integrator::MAX_SAMPLES) {
+ substatus = status_append(substatus, string_printf("Sample %d", current_sample));
+ }
+ else {
+ substatus = status_append(substatus,
+ string_printf("Sample %d/%d", current_sample, num_samples));
+ }
+
+ /* TODO(sergey): Denoising status from the path trace. */
if (show_pause) {
status = "Rendering Paused";
@@ -1080,210 +598,122 @@ void Session::update_status_time(bool show_pause, bool show_done)
progress.set_status(status, substatus);
}
-bool Session::render_need_denoise(bool &delayed)
+void Session::device_free()
{
- delayed = false;
-
- /* Not supported yet for baking. */
- if (read_bake_tile_cb) {
- return false;
- }
-
- /* Denoising enabled? */
- if (!params.denoising.need_denoising_task()) {
- return false;
- }
-
- if (params.background) {
- /* Background render, only denoise when rendering the last sample. */
- return tile_manager.done();
- }
-
- /* Viewport render. */
-
- /* It can happen that denoising was already enabled, but the scene still needs an update. */
- if (scene->film->is_modified() || !scene->film->get_denoising_data_offset()) {
- return false;
- }
+ scene->device_free();
+ path_trace_->device_free();
+}
- /* Immediately denoise when we reach the start sample or last sample. */
- const int num_samples_finished = tile_manager.state.sample + 1;
- if (num_samples_finished == params.denoising.start_sample ||
- num_samples_finished == params.samples) {
- return true;
+void Session::collect_statistics(RenderStats *render_stats)
+{
+ scene->collect_statistics(render_stats);
+ if (params.use_profiling && (params.device.type == DEVICE_CPU)) {
+ render_stats->collect_profiling(scene, profiler);
}
+}
- /* Do not denoise until the sample at which denoising should start is reached. */
- if (num_samples_finished < params.denoising.start_sample) {
- return false;
- }
+/* --------------------------------------------------------------------
+ * Tile and tile pixels access.
+ */
- /* Avoid excessive denoising in viewport after reaching a certain amount of samples. */
- delayed = (tile_manager.state.sample >= 20 &&
- (time_dt() - last_display_time_) < params.progressive_update_timeout);
- return !delayed;
+bool Session::has_multiple_render_tiles() const
+{
+ return tile_manager_.has_multiple_tiles();
}
-void Session::render(bool need_denoise)
+int2 Session::get_render_tile_size() const
{
- if (buffers && tile_manager.state.sample == tile_manager.range_start_sample) {
- /* Clear buffers. */
- buffers->zero();
- }
-
- if (tile_manager.state.buffer.width == 0 || tile_manager.state.buffer.height == 0) {
- return; /* Avoid empty launches. */
- }
+ return path_trace_->get_render_tile_size();
+}
- /* Add path trace task. */
- DeviceTask task(DeviceTask::RENDER);
-
- task.acquire_tile = function_bind(&Session::acquire_tile, this, _2, _1, _3);
- task.release_tile = function_bind(&Session::release_tile, this, _1, need_denoise);
- task.map_neighbor_tiles = function_bind(&Session::map_neighbor_tiles, this, _1, _2);
- task.unmap_neighbor_tiles = function_bind(&Session::unmap_neighbor_tiles, this, _1, _2);
- task.get_cancel = function_bind(&Progress::get_cancel, &this->progress);
- task.update_tile_sample = function_bind(&Session::update_tile_sample, this, _1);
- task.update_progress_sample = function_bind(&Progress::add_samples, &this->progress, _1, _2);
- task.get_tile_stolen = function_bind(&Session::get_tile_stolen, this);
- task.need_finish_queue = params.progressive_refine;
- task.integrator_branched = scene->integrator->get_method() == Integrator::BRANCHED_PATH;
-
- task.adaptive_sampling.use = (scene->integrator->get_sampling_pattern() ==
- SAMPLING_PATTERN_PMJ) &&
- scene->dscene.data.film.pass_adaptive_aux_buffer;
- task.adaptive_sampling.min_samples = scene->dscene.data.integrator.adaptive_min_samples;
- task.adaptive_sampling.adaptive_step = scene->dscene.data.integrator.adaptive_step;
-
- /* Acquire render tiles by default. */
- task.tile_types = RenderTile::PATH_TRACE;
-
- if (need_denoise) {
- task.denoising = params.denoising;
-
- task.pass_stride = scene->film->get_pass_stride();
- task.target_pass_stride = task.pass_stride;
- task.pass_denoising_data = scene->film->get_denoising_data_offset();
- task.pass_denoising_clean = scene->film->get_denoising_clean_offset();
-
- task.denoising_from_render = true;
-
- if (tile_manager.schedule_denoising) {
- /* Acquire denoising tiles during rendering. */
- task.tile_types |= RenderTile::DENOISE;
- }
- else {
- assert(buffers);
-
- /* Schedule rendering and wait for it to finish. */
- device->task_add(task);
- device->task_wait();
-
- /* Then run denoising on the whole image at once. */
- task.type = DeviceTask::DENOISE_BUFFER;
- task.x = tile_manager.state.buffer.full_x;
- task.y = tile_manager.state.buffer.full_y;
- task.w = tile_manager.state.buffer.width;
- task.h = tile_manager.state.buffer.height;
- task.buffer = buffers->buffer.device_pointer;
- task.sample = tile_manager.state.sample;
- task.num_samples = tile_manager.state.num_samples;
- tile_manager.state.buffer.get_offset_stride(task.offset, task.stride);
- task.buffers = buffers;
- }
- }
+int2 Session::get_render_tile_offset() const
+{
+ return path_trace_->get_render_tile_offset();
+}
- device->task_add(task);
+string_view Session::get_render_tile_layer() const
+{
+ const BufferParams &buffer_params = path_trace_->get_render_tile_params();
+ return buffer_params.layer;
}
-void Session::copy_to_display_buffer(int sample)
+string_view Session::get_render_tile_view() const
{
- /* add film conversion task */
- DeviceTask task(DeviceTask::FILM_CONVERT);
-
- task.x = tile_manager.state.buffer.full_x;
- task.y = tile_manager.state.buffer.full_y;
- task.w = tile_manager.state.buffer.width;
- task.h = tile_manager.state.buffer.height;
- task.rgba_byte = display->rgba_byte.device_pointer;
- task.rgba_half = display->rgba_half.device_pointer;
- task.buffer = buffers->buffer.device_pointer;
- task.sample = sample;
- tile_manager.state.buffer.get_offset_stride(task.offset, task.stride);
-
- if (task.w > 0 && task.h > 0) {
- device->task_add(task);
- device->task_wait();
-
- /* set display to new size */
- display->draw_set(task.w, task.h);
-
- last_display_time_ = time_dt();
- }
+ const BufferParams &buffer_params = path_trace_->get_render_tile_params();
+ return buffer_params.view;
+}
- display_outdated_ = false;
+bool Session::copy_render_tile_from_device()
+{
+ return path_trace_->copy_render_tile_from_device();
}
-bool Session::update_progressive_refine(bool cancel)
+bool Session::get_render_tile_pixels(const string &pass_name, int num_components, float *pixels)
{
- int sample = tile_manager.state.sample + 1;
- bool write = sample == tile_manager.num_samples || cancel;
+  /* NOTE: The code relies on the fact that the session is fully updated and no scene/buffer modification
+ * is happening while this function runs. */
- double current_time = time_dt();
+ const BufferParams &buffer_params = path_trace_->get_render_tile_params();
- if (current_time - last_update_time_ < params.progressive_update_timeout) {
- /* If last sample was processed, we need to write buffers anyway. */
- if (!write && sample != 1)
- return false;
+ const BufferPass *pass = buffer_params.find_pass(pass_name);
+ if (pass == nullptr) {
+ return false;
}
- if (params.progressive_refine) {
- foreach (Tile &tile, tile_manager.state.tiles) {
- if (!tile.buffers) {
- continue;
- }
-
- RenderTile rtile;
- rtile.x = tile_manager.state.buffer.full_x + tile.x;
- rtile.y = tile_manager.state.buffer.full_y + tile.y;
- rtile.w = tile.w;
- rtile.h = tile.h;
- rtile.sample = sample;
- rtile.buffers = tile.buffers;
-
- if (write) {
- if (write_render_tile_cb)
- write_render_tile_cb(rtile);
- }
- else {
- if (update_render_tile_cb)
- update_render_tile_cb(rtile, true);
- }
+ const bool has_denoised_result = path_trace_->has_denoised_result();
+ if (pass->mode == PassMode::DENOISED && !has_denoised_result) {
+ pass = buffer_params.find_pass(pass->type);
+ if (pass == nullptr) {
+ /* Happens when denoised result pass is requested but is never written by the kernel. */
+ return false;
}
}
- last_update_time_ = current_time;
+ pass = buffer_params.get_actual_display_pass(pass);
+
+ const float exposure = buffer_params.exposure;
+ const int num_samples = path_trace_->get_num_render_tile_samples();
- return write;
+ PassAccessor::PassAccessInfo pass_access_info(*pass);
+ pass_access_info.use_approximate_shadow_catcher = buffer_params.use_approximate_shadow_catcher;
+ pass_access_info.use_approximate_shadow_catcher_background =
+ pass_access_info.use_approximate_shadow_catcher && !buffer_params.use_transparent_background;
+
+ const PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples);
+ const PassAccessor::Destination destination(pixels, num_components);
+
+ return path_trace_->get_render_tile_pixels(pass_accessor, destination);
}
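Taken together with copy_render_tile_from_device(), the accessor above gives host code a two-step read path: copy the tile once, then extract passes. A sketch of a caller, assuming a 4-component "combined" pass exists in the render buffer:

  #include <vector>

  bool read_combined_pass(Session &session, std::vector<float> &pixels)
  {
    const int2 size = session.get_render_tile_size();
    pixels.resize(size_t(size.x) * size.y * 4);

    if (!session.copy_render_tile_from_device()) {
      return false;
    }
    return session.get_render_tile_pixels("combined", 4, pixels.data());
  }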
-void Session::device_free()
+bool Session::set_render_tile_pixels(const string &pass_name,
+ int num_components,
+ const float *pixels)
{
- scene->device_free();
+  /* NOTE: The code relies on the fact that the session is fully updated and no scene/buffer modification
+ * is happening while this function runs. */
+
+ const BufferPass *pass = buffer_params_.find_pass(pass_name);
+ if (!pass) {
+ return false;
+ }
+
+ const float exposure = scene->film->get_exposure();
+ const int num_samples = render_scheduler_.get_num_rendered_samples();
- tile_manager.device_free();
+ const PassAccessor::PassAccessInfo pass_access_info(*pass);
+ PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples);
+ PassAccessor::Source source(pixels, num_components);
- /* used from background render only, so no need to
- * re-create render/display buffers here
- */
+ return path_trace_->set_render_tile_pixels(pass_accessor, source);
}
-void Session::collect_statistics(RenderStats *render_stats)
+/* --------------------------------------------------------------------
+ * Full-frame on-disk storage.
+ */
+
+void Session::process_full_buffer_from_disk(string_view filename)
{
- scene->collect_statistics(render_stats);
- if (params.use_profiling && (params.device.type == DEVICE_CPU)) {
- render_stats->collect_profiling(scene, profiler);
- }
+ path_trace_->process_full_buffer_from_disk(filename);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h
index 05025c10f9c..5623604bfe8 100644
--- a/intern/cycles/render/session.h
+++ b/intern/cycles/render/session.h
@@ -18,6 +18,7 @@
#define __SESSION_H__
#include "device/device.h"
+#include "integrator/render_scheduler.h"
#include "render/buffers.h"
#include "render/shader.h"
#include "render/stats.h"
@@ -26,6 +27,7 @@
#include "util/util_progress.h"
#include "util/util_stats.h"
#include "util/util_thread.h"
+#include "util/util_unique_ptr.h"
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
@@ -33,41 +35,35 @@ CCL_NAMESPACE_BEGIN
class BufferParams;
class Device;
class DeviceScene;
-class DeviceRequestedFeatures;
-class DisplayBuffer;
+class PathTrace;
class Progress;
+class GPUDisplay;
class RenderBuffers;
class Scene;
+class SceneParams;
/* Session Parameters */
class SessionParams {
public:
DeviceInfo device;
+
+ bool headless;
bool background;
- bool progressive_refine;
- bool progressive;
bool experimental;
int samples;
- int2 tile_size;
- TileOrder tile_order;
- int start_resolution;
- int denoising_start_sample;
int pixel_size;
int threads;
- bool adaptive_sampling;
-
- bool use_profiling;
- bool display_buffer_linear;
+ /* Limit in seconds for how long path tracing is allowed to happen.
+ * Zero means no limit is applied. */
+ double time_limit;
- DenoiseParams denoising;
+ bool use_profiling;
- double cancel_timeout;
- double reset_timeout;
- double text_timeout;
- double progressive_update_timeout;
+ bool use_auto_tile;
+ int tile_size;
ShadingSystem shadingsystem;
@@ -75,50 +71,32 @@ class SessionParams {
SessionParams()
{
+ headless = false;
background = false;
- progressive_refine = false;
- progressive = false;
experimental = false;
samples = 1024;
- tile_size = make_int2(64, 64);
- start_resolution = INT_MAX;
- denoising_start_sample = 0;
pixel_size = 1;
threads = 0;
- adaptive_sampling = false;
+ time_limit = 0.0;
use_profiling = false;
- display_buffer_linear = false;
-
- cancel_timeout = 0.1;
- reset_timeout = 0.1;
- text_timeout = 1.0;
- progressive_update_timeout = 1.0;
+ use_auto_tile = true;
+ tile_size = 2048;
shadingsystem = SHADINGSYSTEM_SVM;
- tile_order = TILE_CENTER;
}
- bool modified(const SessionParams &params)
+ bool modified(const SessionParams &params) const
{
/* Modified means we have to recreate the session; any parameter changes
* that can be handled by an existing Session are omitted. */
- return !(device == params.device && background == params.background &&
- progressive_refine == params.progressive_refine &&
- progressive == params.progressive && experimental == params.experimental &&
- tile_size == params.tile_size && start_resolution == params.start_resolution &&
+ return !(device == params.device && headless == params.headless &&
+ background == params.background && experimental == params.experimental &&
pixel_size == params.pixel_size && threads == params.threads &&
- adaptive_sampling == params.adaptive_sampling &&
- use_profiling == params.use_profiling &&
- display_buffer_linear == params.display_buffer_linear &&
- cancel_timeout == params.cancel_timeout && reset_timeout == params.reset_timeout &&
- text_timeout == params.text_timeout &&
- progressive_update_timeout == params.progressive_update_timeout &&
- tile_order == params.tile_order && shadingsystem == params.shadingsystem &&
- denoising.type == params.denoising.type &&
- (denoising.use == params.denoising.use || (device.denoisers & denoising.type)));
+ use_profiling == params.use_profiling && shadingsystem == params.shadingsystem &&
+ use_auto_tile == params.use_auto_tile && tile_size == params.tile_size);
}
};
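As the comment notes, modified() only reports differences that force a full Session rebuild; cheap changes such as the sample count or time limit go through setters on a live session instead. A caller would typically follow this sketch (helper name and flow are illustrative):

  #include <memory>

  void sync_session(std::unique_ptr<Session> &session,
                    const SessionParams &new_params,
                    const SceneParams &scene_params)
  {
    if (session && session->params.modified(new_params)) {
      session.reset(); /* Requires a full re-create. */
    }
    if (!session) {
      session = std::make_unique<Session>(new_params, scene_params);
    }
    else {
      session->set_samples(new_params.samples);       /* Cheap updates. */
      session->set_time_limit(new_params.time_limit);
    }
  }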
@@ -131,34 +109,41 @@ class Session {
public:
Device *device;
Scene *scene;
- RenderBuffers *buffers;
- DisplayBuffer *display;
Progress progress;
SessionParams params;
- TileManager tile_manager;
Stats stats;
Profiler profiler;
- function<void(RenderTile &)> write_render_tile_cb;
- function<void(RenderTile &, bool)> update_render_tile_cb;
- function<void(RenderTile &)> read_bake_tile_cb;
+ function<void(void)> write_render_tile_cb;
+ function<void(void)> update_render_tile_cb;
+ function<void(void)> read_render_tile_cb;
+
+  /* Callback invoked by the tile manager whenever an on-disk tile storage file is closed after
+   * writing. Allows an engine integration to keep track of those files without worrying about
+   * transferring the information when it needs to re-create the session during rendering. */
+ function<void(string_view)> full_buffer_written_cb;
- explicit Session(const SessionParams &params);
+ explicit Session(const SessionParams &params, const SceneParams &scene_params);
~Session();
void start();
- void cancel();
- bool draw(BufferParams &params, DeviceDrawParams &draw_params);
+
+  /* When quick cancel is requested, path tracing is canceled as soon as possible, without waiting
+ * for the buffer to be uniformly sampled. */
+ void cancel(bool quick = false);
+
+ void draw();
void wait();
bool ready_to_reset();
- void reset(BufferParams &params, int samples);
+ void reset(const SessionParams &session_params, const BufferParams &buffer_params);
+
void set_pause(bool pause);
+
void set_samples(int samples);
- void set_denoising(const DenoiseParams &denoising);
- void set_denoising_start_sample(int sample);
+ void set_time_limit(double time_limit);
- bool update_scene();
+ void set_gpu_display(unique_ptr<GPUDisplay> gpu_display);
void device_free();
@@ -168,83 +153,95 @@ class Session {
void collect_statistics(RenderStats *stats);
- protected:
- struct DelayedReset {
- thread_mutex mutex;
- bool do_reset;
- BufferParams params;
- int samples;
- } delayed_reset_;
+ /* --------------------------------------------------------------------
+ * Tile and tile pixels access.
+ */
- void run();
+ bool has_multiple_render_tiles() const;
- bool run_update_for_next_iteration();
- bool run_wait_for_work(bool no_tiles);
+ /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile. */
+ int2 get_render_tile_size() const;
+ int2 get_render_tile_offset() const;
- void update_status_time(bool show_pause = false, bool show_done = false);
+ string_view get_render_tile_layer() const;
+ string_view get_render_tile_view() const;
- void render(bool use_denoise);
- void copy_to_display_buffer(int sample);
+ bool copy_render_tile_from_device();
- void reset_(BufferParams &params, int samples);
+ bool get_render_tile_pixels(const string &pass_name, int num_components, float *pixels);
+ bool set_render_tile_pixels(const string &pass_name, int num_components, const float *pixels);
- void run_cpu();
- bool draw_cpu(BufferParams &params, DeviceDrawParams &draw_params);
- void reset_cpu(BufferParams &params, int samples);
+ /* --------------------------------------------------------------------
+ * Full-frame on-disk storage.
+ */
- void run_gpu();
- bool draw_gpu(BufferParams &params, DeviceDrawParams &draw_params);
- void reset_gpu(BufferParams &params, int samples);
+  /* Read the given full-frame file from disk, perform any needed processing, and pass the
+   * result to the host software via the write callback. */
+ void process_full_buffer_from_disk(string_view filename);
- bool render_need_denoise(bool &delayed);
+ protected:
+ struct DelayedReset {
+ thread_mutex mutex;
+ bool do_reset;
+ SessionParams session_params;
+ BufferParams buffer_params;
+ } delayed_reset_;
- bool steal_tile(RenderTile &tile, Device *tile_device, thread_scoped_lock &tile_lock);
- bool get_tile_stolen();
- bool acquire_tile(RenderTile &tile, Device *tile_device, uint tile_types);
- void update_tile_sample(RenderTile &tile);
- void release_tile(RenderTile &tile, const bool need_denoise);
+ void run();
- void map_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device);
- void unmap_neighbor_tiles(RenderTileNeighbors &neighbors, Device *tile_device);
+  /* Update for the new iteration of the main render loop (`run_main_render_loop`).
+ *
+ * Will take care of the following things:
+ * - Delayed reset
+ * - Scene update
+ * - Tile manager advance
+ * - Render scheduler work request
+ *
+ * The updates are done in a proper order with proper locking around them, which guarantees
+ * that the device side of scene and render buffers are always in a consistent state.
+ *
+ * Returns render work which is to be rendered next. */
+ RenderWork run_update_for_next_iteration();
+
+ /* Wait for rendering to be unpaused, or for new tiles for render to arrive.
+ * Returns true if new main render loop iteration is required after this function call.
+ *
+ * The `render_work` is the work which was scheduled by the render scheduler right before
+ * checking the pause. */
+ bool run_wait_for_work(const RenderWork &render_work);
+
+ void run_main_render_loop();
+
+ bool update_scene(int width, int height);
- bool device_use_gl_;
+ void update_status_time(bool show_pause = false, bool show_done = false);
- thread *session_thread_;
+ void do_delayed_reset();
- volatile bool display_outdated_;
+ int2 get_effective_tile_size() const;
- volatile bool gpu_draw_ready_;
- volatile bool gpu_need_display_buffer_update_;
- thread_condition_variable gpu_need_display_buffer_update_cond_;
+ thread *session_thread_;
- bool pause_;
- bool cancel_;
- bool new_work_added_;
+ bool pause_ = false;
+ bool cancel_ = false;
+ bool new_work_added_ = false;
thread_condition_variable pause_cond_;
thread_mutex pause_mutex_;
thread_mutex tile_mutex_;
thread_mutex buffers_mutex_;
- thread_mutex display_mutex_;
- thread_condition_variable denoising_cond_;
- thread_condition_variable tile_steal_cond_;
-
- double reset_time_;
- double last_update_time_;
- double last_display_time_;
-
- RenderTile stolen_tile_;
- typedef enum {
- NOT_STEALING, /* There currently is no tile stealing in progress. */
- WAITING_FOR_TILE, /* A device is waiting for another device to release a tile. */
- RELEASING_TILE, /* A device has releasing a stealable tile. */
- GOT_TILE /* A device has released a stealable tile, which is now stored in stolen_tile. */
- } TileStealingState;
- std::atomic<TileStealingState> tile_stealing_state_;
- int stealable_tiles_;
-
- /* progressive refine */
- bool update_progressive_refine(bool cancel);
+
+ TileManager tile_manager_;
+ BufferParams buffer_params_;
+
+ /* Render scheduler is used to get work to be rendered with the current big tile. */
+ RenderScheduler render_scheduler_;
+
+ /* Path tracer object.
+ *
+   * For interactive viewport rendering it covers the single full frame;
+   * for an offline rendering it covers the current big tile. */
+ unique_ptr<PathTrace> path_trace_;
};
CCL_NAMESPACE_END
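With the tile, display and denoiser plumbing moved behind PathTrace, driving a render comes down to a handful of calls on this header. A minimal offline-style driver, assuming the scene is populated elsewhere:

  int main()
  {
    SessionParams session_params;
    session_params.background = true;
    session_params.samples = 128;

    SceneParams scene_params;
    Session session(session_params, scene_params);

    /* ... populate session.scene here ... */

    BufferParams buffer_params;
    buffer_params.width = buffer_params.full_width = 1920;
    buffer_params.height = buffer_params.full_height = 1080;

    session.reset(session_params, buffer_params);
    session.start();
    session.wait(); /* Blocks until rendering finishes or is cancelled. */
    return 0;
  }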
diff --git a/intern/cycles/render/shader.cpp b/intern/cycles/render/shader.cpp
index 59b60904746..f6b23606e58 100644
--- a/intern/cycles/render/shader.cpp
+++ b/intern/cycles/render/shader.cpp
@@ -203,6 +203,7 @@ Shader::Shader() : Node(get_node_type())
has_surface = false;
has_surface_transparent = false;
has_surface_emission = false;
+ has_surface_raytrace = false;
has_surface_bssrdf = false;
has_volume = false;
has_displacement = false;
@@ -485,7 +486,7 @@ void ShaderManager::device_update(Device *device,
device_update_specific(device, dscene, scene, progress);
}
-void ShaderManager::device_update_common(Device *device,
+void ShaderManager::device_update_common(Device * /*device*/,
DeviceScene *dscene,
Scene *scene,
Progress & /*progress*/)
@@ -508,6 +509,8 @@ void ShaderManager::device_update_common(Device *device,
flag |= SD_HAS_EMISSION;
if (shader->has_surface_transparent && shader->get_use_transparent_shadow())
flag |= SD_HAS_TRANSPARENT_SHADOW;
+ if (shader->has_surface_raytrace)
+ flag |= SD_HAS_RAYTRACE;
if (shader->has_volume) {
flag |= SD_HAS_VOLUME;
has_volumes = true;
@@ -528,12 +531,10 @@ void ShaderManager::device_update_common(Device *device,
flag |= SD_NEED_VOLUME_ATTRIBUTES;
if (shader->has_bssrdf_bump)
flag |= SD_HAS_BSSRDF_BUMP;
- if (device->info.has_volume_decoupled) {
- if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_EQUIANGULAR)
- flag |= SD_VOLUME_EQUIANGULAR;
- if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
- flag |= SD_VOLUME_MIS;
- }
+ if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_EQUIANGULAR)
+ flag |= SD_VOLUME_EQUIANGULAR;
+ if (shader->get_volume_sampling_method() == VOLUME_SAMPLING_MULTIPLE_IMPORTANCE)
+ flag |= SD_VOLUME_MIS;
if (shader->get_volume_interpolation_method() == VOLUME_INTERPOLATION_CUBIC)
flag |= SD_VOLUME_CUBIC;
if (shader->has_bump)
@@ -682,39 +683,35 @@ void ShaderManager::add_default(Scene *scene)
}
}
-void ShaderManager::get_requested_graph_features(ShaderGraph *graph,
- DeviceRequestedFeatures *requested_features)
+uint ShaderManager::get_graph_kernel_features(ShaderGraph *graph)
{
+ uint kernel_features = 0;
+
foreach (ShaderNode *node, graph->nodes) {
- requested_features->max_nodes_group = max(requested_features->max_nodes_group,
- node->get_group());
- requested_features->nodes_features |= node->get_feature();
+ kernel_features |= node->get_feature();
if (node->special_type == SHADER_SPECIAL_TYPE_CLOSURE) {
BsdfBaseNode *bsdf_node = static_cast<BsdfBaseNode *>(node);
if (CLOSURE_IS_VOLUME(bsdf_node->get_closure_type())) {
- requested_features->nodes_features |= NODE_FEATURE_VOLUME;
+ kernel_features |= KERNEL_FEATURE_NODE_VOLUME;
}
else if (CLOSURE_IS_PRINCIPLED(bsdf_node->get_closure_type())) {
- requested_features->use_principled = true;
+ kernel_features |= KERNEL_FEATURE_PRINCIPLED;
}
}
if (node->has_surface_bssrdf()) {
- requested_features->use_subsurface = true;
+ kernel_features |= KERNEL_FEATURE_SUBSURFACE;
}
if (node->has_surface_transparent()) {
- requested_features->use_transparent = true;
- }
- if (node->has_raytrace()) {
- requested_features->use_shader_raytrace = true;
+ kernel_features |= KERNEL_FEATURE_TRANSPARENT;
}
}
+
+ return kernel_features;
}
-void ShaderManager::get_requested_features(Scene *scene,
- DeviceRequestedFeatures *requested_features)
+uint ShaderManager::get_kernel_features(Scene *scene)
{
- requested_features->max_nodes_group = NODE_GROUP_LEVEL_0;
- requested_features->nodes_features = 0;
+ uint kernel_features = KERNEL_FEATURE_NODE_BSDF | KERNEL_FEATURE_NODE_EMISSION;
for (int i = 0; i < scene->shaders.size(); i++) {
Shader *shader = scene->shaders[i];
if (!shader->reference_count()) {
@@ -722,21 +719,22 @@ void ShaderManager::get_requested_features(Scene *scene,
}
/* Gather requested features from all the nodes from the graph nodes. */
- get_requested_graph_features(shader->graph, requested_features);
+ kernel_features |= get_graph_kernel_features(shader->graph);
ShaderNode *output_node = shader->graph->output();
if (output_node->input("Displacement")->link != NULL) {
- requested_features->nodes_features |= NODE_FEATURE_BUMP;
+ kernel_features |= KERNEL_FEATURE_NODE_BUMP;
if (shader->get_displacement_method() == DISPLACE_BOTH) {
- requested_features->nodes_features |= NODE_FEATURE_BUMP_STATE;
- requested_features->max_nodes_group = max(requested_features->max_nodes_group,
- NODE_GROUP_LEVEL_1);
+ kernel_features |= KERNEL_FEATURE_NODE_BUMP_STATE;
}
}
/* On top of volume nodes, also check if we need volume sampling because
- * e.g. an Emission node would slip through the NODE_FEATURE_VOLUME check */
- if (shader->has_volume)
- requested_features->use_volume |= true;
+ * e.g. an Emission node would slip through the KERNEL_FEATURE_NODE_VOLUME check */
+ if (shader->has_volume) {
+ kernel_features |= KERNEL_FEATURE_VOLUME;
+ }
}
+
+ return kernel_features;
}
void ShaderManager::free_memory()
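With DeviceRequestedFeatures gone, feature requests are now a plain bitmask: collection composes with |= (as above) and consumers test with &. For instance, a hypothetical caller deciding which kernel variants are needed, assuming the usual Scene::shader_manager member:

  const uint kernel_features = scene->shader_manager->get_kernel_features(scene);

  const bool need_raytrace = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) != 0;
  const bool need_volume = (kernel_features & KERNEL_FEATURE_VOLUME) != 0;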
diff --git a/intern/cycles/render/shader.h b/intern/cycles/render/shader.h
index c65cac351a4..5f9adea3949 100644
--- a/intern/cycles/render/shader.h
+++ b/intern/cycles/render/shader.h
@@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
class Device;
class DeviceScene;
-class DeviceRequestedFeatures;
class Mesh;
class Progress;
class Scene;
@@ -117,6 +116,7 @@ class Shader : public Node {
bool has_surface;
bool has_surface_emission;
bool has_surface_transparent;
+ bool has_surface_raytrace;
bool has_volume;
bool has_displacement;
bool has_surface_bssrdf;
@@ -216,7 +216,7 @@ class ShaderManager {
static void add_default(Scene *scene);
/* Selective nodes compilation. */
- void get_requested_features(Scene *scene, DeviceRequestedFeatures *requested_features);
+ uint get_kernel_features(Scene *scene);
static void free_memory();
@@ -244,8 +244,7 @@ class ShaderManager {
size_t beckmann_table_offset;
- void get_requested_graph_features(ShaderGraph *graph,
- DeviceRequestedFeatures *requested_features);
+ uint get_graph_kernel_features(ShaderGraph *graph);
thread_spin_lock attribute_lock_;
diff --git a/intern/cycles/render/stats.cpp b/intern/cycles/render/stats.cpp
index 2c6273842e2..73eb7e21ff9 100644
--- a/intern/cycles/render/stats.cpp
+++ b/intern/cycles/render/stats.cpp
@@ -264,53 +264,34 @@ void RenderStats::collect_profiling(Scene *scene, Profiler &prof)
has_profiling = true;
kernel = NamedNestedSampleStats("Total render time", prof.get_event(PROFILING_UNKNOWN));
-
kernel.add_entry("Ray setup", prof.get_event(PROFILING_RAY_SETUP));
- kernel.add_entry("Result writing", prof.get_event(PROFILING_WRITE_RESULT));
-
- NamedNestedSampleStats &integrator = kernel.add_entry("Path integration",
- prof.get_event(PROFILING_PATH_INTEGRATE));
- integrator.add_entry("Scene intersection", prof.get_event(PROFILING_SCENE_INTERSECT));
- integrator.add_entry("Indirect emission", prof.get_event(PROFILING_INDIRECT_EMISSION));
- integrator.add_entry("Volumes", prof.get_event(PROFILING_VOLUME));
-
- NamedNestedSampleStats &shading = integrator.add_entry("Shading", 0);
- shading.add_entry("Shader Setup", prof.get_event(PROFILING_SHADER_SETUP));
- shading.add_entry("Shader Eval", prof.get_event(PROFILING_SHADER_EVAL));
- shading.add_entry("Shader Apply", prof.get_event(PROFILING_SHADER_APPLY));
- shading.add_entry("Ambient Occlusion", prof.get_event(PROFILING_AO));
- shading.add_entry("Subsurface", prof.get_event(PROFILING_SUBSURFACE));
-
- integrator.add_entry("Connect Light", prof.get_event(PROFILING_CONNECT_LIGHT));
- integrator.add_entry("Surface Bounce", prof.get_event(PROFILING_SURFACE_BOUNCE));
-
- NamedNestedSampleStats &intersection = kernel.add_entry("Intersection", 0);
- intersection.add_entry("Full Intersection", prof.get_event(PROFILING_INTERSECT));
- intersection.add_entry("Local Intersection", prof.get_event(PROFILING_INTERSECT_LOCAL));
- intersection.add_entry("Shadow All Intersection",
- prof.get_event(PROFILING_INTERSECT_SHADOW_ALL));
- intersection.add_entry("Volume Intersection", prof.get_event(PROFILING_INTERSECT_VOLUME));
- intersection.add_entry("Volume All Intersection",
- prof.get_event(PROFILING_INTERSECT_VOLUME_ALL));
-
- NamedNestedSampleStats &closure = kernel.add_entry("Closures", 0);
- closure.add_entry("Surface Closure Evaluation", prof.get_event(PROFILING_CLOSURE_EVAL));
- closure.add_entry("Surface Closure Sampling", prof.get_event(PROFILING_CLOSURE_SAMPLE));
- closure.add_entry("Volume Closure Evaluation", prof.get_event(PROFILING_CLOSURE_VOLUME_EVAL));
- closure.add_entry("Volume Closure Sampling", prof.get_event(PROFILING_CLOSURE_VOLUME_SAMPLE));
-
- NamedNestedSampleStats &denoising = kernel.add_entry("Denoising",
- prof.get_event(PROFILING_DENOISING));
- denoising.add_entry("Construct Transform",
- prof.get_event(PROFILING_DENOISING_CONSTRUCT_TRANSFORM));
- denoising.add_entry("Reconstruct", prof.get_event(PROFILING_DENOISING_RECONSTRUCT));
-
- NamedNestedSampleStats &prefilter = denoising.add_entry("Prefiltering", 0);
- prefilter.add_entry("Divide Shadow", prof.get_event(PROFILING_DENOISING_DIVIDE_SHADOW));
- prefilter.add_entry("Non-Local means", prof.get_event(PROFILING_DENOISING_NON_LOCAL_MEANS));
- prefilter.add_entry("Get Feature", prof.get_event(PROFILING_DENOISING_GET_FEATURE));
- prefilter.add_entry("Detect Outliers", prof.get_event(PROFILING_DENOISING_DETECT_OUTLIERS));
- prefilter.add_entry("Combine Halves", prof.get_event(PROFILING_DENOISING_COMBINE_HALVES));
+ kernel.add_entry("Intersect Closest", prof.get_event(PROFILING_INTERSECT_CLOSEST));
+ kernel.add_entry("Intersect Shadow", prof.get_event(PROFILING_INTERSECT_SHADOW));
+ kernel.add_entry("Intersect Subsurface", prof.get_event(PROFILING_INTERSECT_SUBSURFACE));
+ kernel.add_entry("Intersect Volume Stack", prof.get_event(PROFILING_INTERSECT_VOLUME_STACK));
+
+ NamedNestedSampleStats &surface = kernel.add_entry("Shade Surface", 0);
+ surface.add_entry("Setup", prof.get_event(PROFILING_SHADE_SURFACE_SETUP));
+ surface.add_entry("Shader Evaluation", prof.get_event(PROFILING_SHADE_SURFACE_EVAL));
+ surface.add_entry("Render Passes", prof.get_event(PROFILING_SHADE_SURFACE_PASSES));
+ surface.add_entry("Direct Light", prof.get_event(PROFILING_SHADE_SURFACE_DIRECT_LIGHT));
+ surface.add_entry("Indirect Light", prof.get_event(PROFILING_SHADE_SURFACE_INDIRECT_LIGHT));
+ surface.add_entry("Ambient Occlusion", prof.get_event(PROFILING_SHADE_SURFACE_AO));
+
+ NamedNestedSampleStats &volume = kernel.add_entry("Shade Volume", 0);
+ volume.add_entry("Setup", prof.get_event(PROFILING_SHADE_VOLUME_SETUP));
+ volume.add_entry("Integrate", prof.get_event(PROFILING_SHADE_VOLUME_INTEGRATE));
+ volume.add_entry("Direct Light", prof.get_event(PROFILING_SHADE_VOLUME_DIRECT_LIGHT));
+ volume.add_entry("Indirect Light", prof.get_event(PROFILING_SHADE_VOLUME_INDIRECT_LIGHT));
+
+ NamedNestedSampleStats &shadow = kernel.add_entry("Shade Shadow", 0);
+ shadow.add_entry("Setup", prof.get_event(PROFILING_SHADE_SHADOW_SETUP));
+ shadow.add_entry("Surface", prof.get_event(PROFILING_SHADE_SHADOW_SURFACE));
+ shadow.add_entry("Volume", prof.get_event(PROFILING_SHADE_SHADOW_VOLUME));
+
+ NamedNestedSampleStats &light = kernel.add_entry("Shade Light", 0);
+ light.add_entry("Setup", prof.get_event(PROFILING_SHADE_LIGHT_SETUP));
+ light.add_entry("Shader Evaluation", prof.get_event(PROFILING_SHADE_LIGHT_EVAL));
shaders.entries.clear();
foreach (Shader *shader, scene->shaders) {
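The new event tree above is considerably flatter than the one it replaces: per-kernel entries, with nested groups for surface, volume, shadow and light shading. As an illustration of the nesting pattern, here is a minimal compilable sketch; `NamedStatsSketch` is a hypothetical stand-in that only mimics the `add_entry()` chaining of Cycles' `NamedNestedSampleStats`, which records profiler events rather than plain counts.

    // Hypothetical stand-in mimicking the add_entry() chaining used above.
    #include <cstdint>
    #include <iostream>
    #include <list>
    #include <string>

    struct NamedStatsSketch {
      std::string name;
      uint64_t samples = 0;
      // std::list keeps references to entries stable across later insertions.
      std::list<NamedStatsSketch> entries;

      NamedStatsSketch &add_entry(const std::string &entry_name, uint64_t entry_samples)
      {
        entries.push_back(NamedStatsSketch{entry_name, entry_samples, {}});
        return entries.back();
      }

      void print(int indent = 0) const
      {
        std::cout << std::string(indent, ' ') << name << ": " << samples << "\n";
        for (const NamedStatsSketch &entry : entries) {
          entry.print(indent + 2);
        }
      }
    };

    int main()
    {
      NamedStatsSketch kernel{"Total render time", 1000, {}};
      kernel.add_entry("Intersect Closest", 400);

      NamedStatsSketch &surface = kernel.add_entry("Shade Surface", 0);
      surface.add_entry("Shader Evaluation", 250);
      surface.add_entry("Direct Light", 150);

      kernel.print();
      return 0;
    }

A list-based container is the natural choice here: `add_entry()` hands out references to children, and `std::list` guarantees those references stay valid as more siblings are appended.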
diff --git a/intern/cycles/render/svm.cpp b/intern/cycles/render/svm.cpp
index dcb3976e15c..2379eb775a0 100644
--- a/intern/cycles/render/svm.cpp
+++ b/intern/cycles/render/svm.cpp
@@ -446,6 +446,8 @@ void SVMCompiler::generate_node(ShaderNode *node, ShaderNodeSet &done)
if (current_type == SHADER_TYPE_SURFACE) {
if (node->has_spatial_varying())
current_shader->has_surface_spatial_varying = true;
+ if (node->get_feature() & KERNEL_FEATURE_NODE_RAYTRACE)
+ current_shader->has_surface_raytrace = true;
}
else if (current_type == SHADER_TYPE_VOLUME) {
if (node->has_spatial_varying())
@@ -492,6 +494,13 @@ void SVMCompiler::generate_svm_nodes(const ShaderNodeSet &nodes, CompilerState *
void SVMCompiler::generate_closure_node(ShaderNode *node, CompilerState *state)
{
+  /* Skip generating closures that are not supported or needed for a particular
+   * type of shader, for example a BSDF in a volume shader. */
+ const int node_feature = node->get_feature();
+ if ((state->node_feature_mask & node_feature) != node_feature) {
+ return;
+ }
+
/* execute dependencies for closure */
foreach (ShaderInput *in, node->inputs) {
if (in->link != NULL) {
@@ -555,7 +564,7 @@ void SVMCompiler::find_aov_nodes_and_dependencies(ShaderNodeSet &aov_nodes,
foreach (ShaderNode *node, graph->nodes) {
if (node->special_type == SHADER_SPECIAL_TYPE_OUTPUT_AOV) {
OutputAOVNode *aov_node = static_cast<OutputAOVNode *>(node);
- if (aov_node->slot >= 0) {
+ if (aov_node->offset >= 0) {
aov_nodes.insert(aov_node);
foreach (ShaderInput *in, node->inputs) {
if (in->link != NULL) {
@@ -785,17 +794,21 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty
case SHADER_TYPE_SURFACE: /* generate surface shader */
generate = true;
shader->has_surface = true;
+ state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE;
break;
case SHADER_TYPE_VOLUME: /* generate volume shader */
generate = true;
shader->has_volume = true;
+ state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_VOLUME;
break;
case SHADER_TYPE_DISPLACEMENT: /* generate displacement shader */
generate = true;
shader->has_displacement = true;
+ state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_DISPLACEMENT;
break;
case SHADER_TYPE_BUMP: /* generate bump shader */
generate = true;
+ state.node_feature_mask = KERNEL_FEATURE_NODE_MASK_BUMP;
break;
default:
break;
@@ -867,6 +880,7 @@ void SVMCompiler::compile(Shader *shader, array<int4> &svm_nodes, int index, Sum
shader->has_surface = false;
shader->has_surface_emission = false;
shader->has_surface_transparent = false;
+ shader->has_surface_raytrace = false;
shader->has_surface_bssrdf = false;
shader->has_bump = has_bump;
shader->has_bssrdf_bump = has_bump;
@@ -964,6 +978,7 @@ SVMCompiler::CompilerState::CompilerState(ShaderGraph *graph)
max_id = max(node->id, max_id);
}
nodes_done_flag.resize(max_id + 1, false);
+ node_feature_mask = 0;
}
CCL_NAMESPACE_END
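The gating added to `generate_closure_node()` above is a plain bitmask subset test: a node is emitted only when every feature bit it requires is present in the shader type's mask. A self-contained sketch with made-up flag values (the real flags are the `KERNEL_FEATURE_NODE_*` values from the Cycles kernel headers):

    #include <cassert>
    #include <cstdint>

    // Made-up feature flags, standing in for KERNEL_FEATURE_NODE_*.
    enum : uint32_t {
      NODE_FEATURE_BSDF = 1u << 0,
      NODE_FEATURE_VOLUME = 1u << 1,
      NODE_FEATURE_RAYTRACE = 1u << 2,
    };

    // Mirrors the test above: generate the node only if every feature bit it
    // needs is present in the mask chosen for the current shader type.
    static bool node_supported(uint32_t state_mask, uint32_t node_features)
    {
      return (state_mask & node_features) == node_features;
    }

    int main()
    {
      const uint32_t surface_mask = NODE_FEATURE_BSDF | NODE_FEATURE_RAYTRACE;
      const uint32_t volume_mask = NODE_FEATURE_VOLUME;

      // A BSDF closure compiles into a surface shader but is skipped in a volume shader.
      assert(node_supported(surface_mask, NODE_FEATURE_BSDF));
      assert(!node_supported(volume_mask, NODE_FEATURE_BSDF));
      return 0;
    }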
diff --git a/intern/cycles/render/svm.h b/intern/cycles/render/svm.h
index d23ff3e2a47..0353c393ae4 100644
--- a/intern/cycles/render/svm.h
+++ b/intern/cycles/render/svm.h
@@ -192,6 +192,9 @@ class SVMCompiler {
* all areas to use this flags array.
*/
vector<bool> nodes_done_flag;
+
+ /* Node features that can be compiled. */
+ uint node_feature_mask;
};
void stack_clear_temporary(ShaderNode *node);
diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp
index 375c9fd8e09..28910bffa7b 100644
--- a/intern/cycles/render/tile.cpp
+++ b/intern/cycles/render/tile.cpp
@@ -16,601 +16,559 @@
#include "render/tile.h"
+#include <atomic>
+
+#include "graph/node.h"
+#include "render/background.h"
+#include "render/film.h"
+#include "render/integrator.h"
+#include "render/scene.h"
#include "util/util_algorithm.h"
#include "util/util_foreach.h"
+#include "util/util_logging.h"
+#include "util/util_path.h"
+#include "util/util_string.h"
+#include "util/util_system.h"
#include "util/util_types.h"
CCL_NAMESPACE_BEGIN
-namespace {
+/* --------------------------------------------------------------------
+ * Internal functions.
+ */
-class TileComparator {
- public:
- TileComparator(TileOrder order_, int2 center_, Tile *tiles_)
- : order(order_), center(center_), tiles(tiles_)
- {
- }
+static const char *ATTR_PASSES_COUNT = "cycles.passes.count";
+static const char *ATTR_PASS_SOCKET_PREFIX_FORMAT = "cycles.passes.%d.";
+static const char *ATTR_BUFFER_SOCKET_PREFIX = "cycles.buffer.";
+static const char *ATTR_DENOISE_SOCKET_PREFIX = "cycles.denoise.";
- bool operator()(int a, int b)
- {
- switch (order) {
- case TILE_CENTER: {
- float2 dist_a = make_float2(center.x - (tiles[a].x + tiles[a].w / 2),
- center.y - (tiles[a].y + tiles[a].h / 2));
- float2 dist_b = make_float2(center.x - (tiles[b].x + tiles[b].w / 2),
- center.y - (tiles[b].y + tiles[b].h / 2));
- return dot(dist_a, dist_a) < dot(dist_b, dist_b);
- }
- case TILE_LEFT_TO_RIGHT:
- return (tiles[a].x == tiles[b].x) ? (tiles[a].y < tiles[b].y) : (tiles[a].x < tiles[b].x);
- case TILE_RIGHT_TO_LEFT:
- return (tiles[a].x == tiles[b].x) ? (tiles[a].y < tiles[b].y) : (tiles[a].x > tiles[b].x);
- case TILE_TOP_TO_BOTTOM:
- return (tiles[a].y == tiles[b].y) ? (tiles[a].x < tiles[b].x) : (tiles[a].y > tiles[b].y);
- case TILE_BOTTOM_TO_TOP:
- default:
- return (tiles[a].y == tiles[b].y) ? (tiles[a].x < tiles[b].x) : (tiles[a].y < tiles[b].y);
+/* Global counter of TileManager object instances. */
+static std::atomic<uint64_t> g_instance_index = 0;
+
+/* Construct names of EXR channels in an order which matches the exact channel offsets in the
+ * render buffers corresponding to the given passes.
+ *
+ * Returns `std` datatypes so that the result can be assigned directly to OIIO's `ImageSpec`. */
+static std::vector<std::string> exr_channel_names_for_passes(const BufferParams &buffer_params)
+{
+ static const char *component_suffixes[] = {"R", "G", "B", "A"};
+
+ int pass_index = 0;
+ int num_channels = 0;
+ std::vector<std::string> channel_names;
+ for (const BufferPass &pass : buffer_params.passes) {
+ if (pass.offset == PASS_UNUSED) {
+ continue;
}
- }
- protected:
- TileOrder order;
- int2 center;
- Tile *tiles;
-};
+ const PassInfo pass_info = pass.get_info();
+ num_channels += pass_info.num_components;
-inline int2 hilbert_index_to_pos(int n, int d)
-{
- int2 r, xy = make_int2(0, 0);
- for (int s = 1; s < n; s *= 2) {
- r.x = (d >> 1) & 1;
- r.y = (d ^ r.x) & 1;
- if (!r.y) {
- if (r.x) {
- xy = make_int2(s - 1, s - 1) - xy;
- }
- swap(xy.x, xy.y);
+    /* EXR canonically expects the first part of channel names to be sorted alphabetically, which
+     * is not guaranteed to be the case with pass names. Assign a prefix based on the pass index
+     * with a fixed width to ensure ordering. This makes it possible to dump existing render
+     * buffer memory to disk and read it back without doing extra mapping. */
+ const string prefix = string_printf("%08d", pass_index);
+
+ const string channel_name_prefix = prefix + string(pass.name) + ".";
+
+ for (int i = 0; i < pass_info.num_components; ++i) {
+ channel_names.push_back(channel_name_prefix + component_suffixes[i]);
}
- xy += r * make_int2(s, s);
- d >>= 2;
+
+ ++pass_index;
}
- return xy;
+
+ return channel_names;
}
-enum SpiralDirection {
- DIRECTION_UP,
- DIRECTION_LEFT,
- DIRECTION_DOWN,
- DIRECTION_RIGHT,
-};
-
-} /* namespace */
-
-TileManager::TileManager(bool progressive_,
- int num_samples_,
- int2 tile_size_,
- int start_resolution_,
- bool preserve_tile_device_,
- bool background_,
- TileOrder tile_order_,
- int num_devices_,
- int pixel_size_)
+inline string node_socket_attribute_name(const SocketType &socket, const string &attr_name_prefix)
{
- progressive = progressive_;
- tile_size = tile_size_;
- tile_order = tile_order_;
- start_resolution = start_resolution_;
- pixel_size = pixel_size_;
- slice_overlap = 0;
- num_samples = num_samples_;
- num_devices = num_devices_;
- preserve_tile_device = preserve_tile_device_;
- background = background_;
- schedule_denoising = false;
-
- range_start_sample = 0;
- range_num_samples = -1;
-
- BufferParams buffer_params;
- reset(buffer_params, 0);
+ return attr_name_prefix + string(socket.name);
}
-TileManager::~TileManager()
+template<typename ValidateValueFunc, typename GetValueFunc>
+static bool node_socket_generic_to_image_spec_atttributes(
+ ImageSpec *image_spec,
+ const Node *node,
+ const SocketType &socket,
+ const string &attr_name_prefix,
+ const ValidateValueFunc &validate_value_func,
+ const GetValueFunc &get_value_func)
{
+ if (!validate_value_func(node, socket)) {
+ return false;
+ }
+
+ image_spec->attribute(node_socket_attribute_name(socket, attr_name_prefix),
+ get_value_func(node, socket));
+
+ return true;
}
-void TileManager::device_free()
+static bool node_socket_to_image_spec_atttributes(ImageSpec *image_spec,
+ const Node *node,
+ const SocketType &socket,
+ const string &attr_name_prefix)
{
- if (schedule_denoising || progressive) {
- for (int i = 0; i < state.tiles.size(); i++) {
- delete state.tiles[i].buffers;
- state.tiles[i].buffers = NULL;
+ const string attr_name = node_socket_attribute_name(socket, attr_name_prefix);
+
+ switch (socket.type) {
+ case SocketType::ENUM: {
+ const ustring value = node->get_string(socket);
+
+ /* Validate that the node is consistent with the node type definition. */
+ const NodeEnum &enum_values = *socket.enum_values;
+ if (!enum_values.exists(value)) {
+ LOG(DFATAL) << "Node enum contains invalid value " << value;
+ return false;
+ }
+
+ image_spec->attribute(attr_name, value);
+
+ return true;
}
- }
- state.tiles.clear();
+ case SocketType::STRING:
+ image_spec->attribute(attr_name, node->get_string(socket));
+ return true;
+
+ case SocketType::INT:
+ image_spec->attribute(attr_name, node->get_int(socket));
+ return true;
+
+ case SocketType::FLOAT:
+ image_spec->attribute(attr_name, node->get_float(socket));
+ return true;
+
+ case SocketType::BOOLEAN:
+ image_spec->attribute(attr_name, node->get_bool(socket));
+ return true;
+
+ default:
+ LOG(DFATAL) << "Unhandled socket type " << socket.type << ", should never happen.";
+ return false;
+ }
}
-static int get_divider(int w, int h, int start_resolution)
+static bool node_socket_from_image_spec_atttributes(Node *node,
+ const SocketType &socket,
+ const ImageSpec &image_spec,
+ const string &attr_name_prefix)
{
- int divider = 1;
- if (start_resolution != INT_MAX) {
- while (w * h > start_resolution * start_resolution) {
- w = max(1, w / 2);
- h = max(1, h / 2);
+ const string attr_name = node_socket_attribute_name(socket, attr_name_prefix);
+
+ switch (socket.type) {
+ case SocketType::ENUM: {
+ /* TODO(sergey): Avoid construction of `ustring` by using `string_view` in the Node API. */
+ const ustring value(image_spec.get_string_attribute(attr_name, ""));
+
+ /* Validate that the node is consistent with the node type definition. */
+ const NodeEnum &enum_values = *socket.enum_values;
+ if (!enum_values.exists(value)) {
+ LOG(ERROR) << "Invalid enumerator value " << value;
+ return false;
+ }
- divider <<= 1;
+ node->set(socket, enum_values[value]);
+
+ return true;
}
+
+ case SocketType::STRING:
+ /* TODO(sergey): Avoid construction of `ustring` by using `string_view` in the Node API. */
+ node->set(socket, ustring(image_spec.get_string_attribute(attr_name, "")));
+ return true;
+
+ case SocketType::INT:
+ node->set(socket, image_spec.get_int_attribute(attr_name, 0));
+ return true;
+
+ case SocketType::FLOAT:
+ node->set(socket, image_spec.get_float_attribute(attr_name, 0));
+ return true;
+
+ case SocketType::BOOLEAN:
+ node->set(socket, static_cast<bool>(image_spec.get_int_attribute(attr_name, 0)));
+ return true;
+
+ default:
+ LOG(DFATAL) << "Unhandled socket type " << socket.type << ", should never happen.";
+ return false;
}
- return divider;
}
-void TileManager::reset(BufferParams &params_, int num_samples_)
+static bool node_to_image_spec_atttributes(ImageSpec *image_spec,
+ const Node *node,
+ const string &attr_name_prefix)
{
- params = params_;
-
- set_samples(num_samples_);
-
- state.buffer = BufferParams();
- state.sample = range_start_sample - 1;
- state.num_tiles = 0;
- state.num_samples = 0;
- state.resolution_divider = get_divider(params.width, params.height, start_resolution);
- state.render_tiles.clear();
- state.denoising_tiles.clear();
- device_free();
+ for (const SocketType &socket : node->type->inputs) {
+ if (!node_socket_to_image_spec_atttributes(image_spec, node, socket, attr_name_prefix)) {
+ return false;
+ }
+ }
+
+ return true;
}
-void TileManager::set_samples(int num_samples_)
+static bool node_from_image_spec_atttributes(Node *node,
+ const ImageSpec &image_spec,
+ const string &attr_name_prefix)
{
- num_samples = num_samples_;
+ for (const SocketType &socket : node->type->inputs) {
+ if (!node_socket_from_image_spec_atttributes(node, socket, image_spec, attr_name_prefix)) {
+ return false;
+ }
+ }
+
+ return true;
+}
- /* No real progress indication is possible when using unlimited samples. */
- if (num_samples == INT_MAX) {
- state.total_pixel_samples = 0;
+static bool buffer_params_to_image_spec_atttributes(ImageSpec *image_spec,
+ const BufferParams &buffer_params)
+{
+ if (!node_to_image_spec_atttributes(image_spec, &buffer_params, ATTR_BUFFER_SOCKET_PREFIX)) {
+ return false;
}
- else {
- uint64_t pixel_samples = 0;
- /* While rendering in the viewport, the initial preview resolution is increased to the native
- * resolution before the actual rendering begins. Therefore, additional pixel samples will be
- * rendered. */
- int divider = max(get_divider(params.width, params.height, start_resolution) / 2, pixel_size);
- while (divider > pixel_size) {
- int image_w = max(1, params.width / divider);
- int image_h = max(1, params.height / divider);
- pixel_samples += image_w * image_h;
- divider >>= 1;
- }
- int image_w = max(1, params.width / divider);
- int image_h = max(1, params.height / divider);
- state.total_pixel_samples = pixel_samples +
- (uint64_t)get_num_effective_samples() * image_w * image_h;
- if (schedule_denoising) {
- state.total_pixel_samples += params.width * params.height;
+  /* Passes storage is not covered by the node sockets, so "expand" the loop manually. */
+
+ const int num_passes = buffer_params.passes.size();
+ image_spec->attribute(ATTR_PASSES_COUNT, num_passes);
+
+ for (int pass_index = 0; pass_index < num_passes; ++pass_index) {
+ const string attr_name_prefix = string_printf(ATTR_PASS_SOCKET_PREFIX_FORMAT, pass_index);
+
+ const BufferPass *pass = &buffer_params.passes[pass_index];
+ if (!node_to_image_spec_atttributes(image_spec, pass, attr_name_prefix)) {
+ return false;
}
}
+
+ return true;
}
-/* If sliced is false, splits image into tiles and assigns equal amount of tiles to every render
- * device. If sliced is true, slice image into as much pieces as how many devices are rendering
- * this image. */
-int TileManager::gen_tiles(bool sliced)
+static bool buffer_params_from_image_spec_atttributes(BufferParams *buffer_params,
+ const ImageSpec &image_spec)
{
- int resolution = state.resolution_divider;
- int image_w = max(1, params.width / resolution);
- int image_h = max(1, params.height / resolution);
- int2 center = make_int2(image_w / 2, image_h / 2);
-
- int num = preserve_tile_device || sliced ? min(image_h, num_devices) : 1;
- int slice_num = sliced ? num : 1;
- int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
-
- device_free();
- state.render_tiles.clear();
- state.denoising_tiles.clear();
- state.render_tiles.resize(num);
- state.denoising_tiles.resize(num);
- state.tile_stride = tile_w;
- vector<list<int>>::iterator tile_list;
- tile_list = state.render_tiles.begin();
-
- if (tile_order == TILE_HILBERT_SPIRAL) {
- assert(!sliced && slice_overlap == 0);
-
- int tile_h = (tile_size.y >= image_h) ? 1 : divide_up(image_h, tile_size.y);
- state.tiles.resize(tile_w * tile_h);
-
- /* Size of blocks in tiles, must be a power of 2 */
- const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12) ? 8 : 4;
-
- int tiles_per_device = divide_up(tile_w * tile_h, num);
- int cur_device = 0, cur_tiles = 0;
-
- int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size);
- /* Number of blocks to fill the image */
- int blocks_x = (block_size.x >= image_w) ? 1 : divide_up(image_w, block_size.x);
- int blocks_y = (block_size.y >= image_h) ? 1 : divide_up(image_h, block_size.y);
- int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */
- /* Offset of spiral (to keep it centered) */
- int2 offset = make_int2((image_w - n * block_size.x) / 2, (image_h - n * block_size.y) / 2);
- offset = (offset / tile_size) * tile_size; /* Round to tile border. */
-
- int2 block = make_int2(0, 0); /* Current block */
- SpiralDirection prev_dir = DIRECTION_UP, dir = DIRECTION_UP;
- for (int i = 0;;) {
- /* Generate the tiles in the current block. */
- for (int hilbert_index = 0; hilbert_index < hilbert_size * hilbert_size; hilbert_index++) {
- int2 tile, hilbert_pos = hilbert_index_to_pos(hilbert_size, hilbert_index);
- /* Rotate block according to spiral direction. */
- if (prev_dir == DIRECTION_UP && dir == DIRECTION_UP) {
- tile = make_int2(hilbert_pos.y, hilbert_pos.x);
- }
- else if (dir == DIRECTION_LEFT || prev_dir == DIRECTION_LEFT) {
- tile = hilbert_pos;
- }
- else if (dir == DIRECTION_DOWN) {
- tile = make_int2(hilbert_size - 1 - hilbert_pos.y, hilbert_size - 1 - hilbert_pos.x);
- }
- else {
- tile = make_int2(hilbert_size - 1 - hilbert_pos.x, hilbert_size - 1 - hilbert_pos.y);
- }
-
- int2 pos = block * block_size + tile * tile_size + offset;
- /* Only add tiles which are in the image (tiles outside of the image can be generated since
- * the spiral is always square). */
- if (pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) {
- int w = min(tile_size.x, image_w - pos.x);
- int h = min(tile_size.y, image_h - pos.y);
- int2 ipos = pos / tile_size;
- int idx = ipos.y * tile_w + ipos.x;
- state.tiles[idx] = Tile(idx, pos.x, pos.y, w, h, cur_device, Tile::RENDER);
- tile_list->push_front(idx);
- cur_tiles++;
-
- if (cur_tiles == tiles_per_device) {
- tile_list++;
- cur_tiles = 0;
- cur_device++;
- }
- }
- }
+ if (!node_from_image_spec_atttributes(buffer_params, image_spec, ATTR_BUFFER_SOCKET_PREFIX)) {
+ return false;
+ }
- /* Stop as soon as the spiral has reached the center block. */
- if (block.x == (n - 1) / 2 && block.y == (n - 1) / 2)
- break;
-
- /* Advance to next block. */
- prev_dir = dir;
- switch (dir) {
- case DIRECTION_UP:
- block.y++;
- if (block.y == (n - i - 1)) {
- dir = DIRECTION_LEFT;
- }
- break;
- case DIRECTION_LEFT:
- block.x++;
- if (block.x == (n - i - 1)) {
- dir = DIRECTION_DOWN;
- }
- break;
- case DIRECTION_DOWN:
- block.y--;
- if (block.y == i) {
- dir = DIRECTION_RIGHT;
- }
- break;
- case DIRECTION_RIGHT:
- block.x--;
- if (block.x == i + 1) {
- dir = DIRECTION_UP;
- i++;
- }
- break;
- }
- }
- return tile_w * tile_h;
+  /* Passes storage is not covered by the node sockets, so "expand" the loop manually. */
+
+ const int num_passes = image_spec.get_int_attribute(ATTR_PASSES_COUNT, 0);
+ if (num_passes == 0) {
+ LOG(ERROR) << "Missing passes count attribute.";
+ return false;
}
- int idx = 0;
- for (int slice = 0; slice < slice_num; slice++) {
- int slice_y = (image_h / slice_num) * slice;
- int slice_h = (slice == slice_num - 1) ? image_h - slice * (image_h / slice_num) :
- image_h / slice_num;
+ for (int pass_index = 0; pass_index < num_passes; ++pass_index) {
+ const string attr_name_prefix = string_printf(ATTR_PASS_SOCKET_PREFIX_FORMAT, pass_index);
- if (slice_overlap != 0) {
- int slice_y_offset = max(slice_y - slice_overlap, 0);
- slice_h = min(slice_y + slice_h + slice_overlap, image_h) - slice_y_offset;
- slice_y = slice_y_offset;
- }
+ BufferPass pass;
- int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y);
-
- int tiles_per_device = divide_up(tile_w * tile_h, num);
- int cur_device = 0, cur_tiles = 0;
-
- for (int tile_y = 0; tile_y < tile_h; tile_y++) {
- for (int tile_x = 0; tile_x < tile_w; tile_x++, idx++) {
- int x = tile_x * tile_size.x;
- int y = tile_y * tile_size.y;
- int w = (tile_x == tile_w - 1) ? image_w - x : tile_size.x;
- int h = (tile_y == tile_h - 1) ? slice_h - y : tile_size.y;
-
- state.tiles.push_back(
- Tile(idx, x, y + slice_y, w, h, sliced ? slice : cur_device, Tile::RENDER));
- tile_list->push_back(idx);
-
- if (!sliced) {
- cur_tiles++;
-
- if (cur_tiles == tiles_per_device) {
- /* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that
- * case. */
- if (tile_order != TILE_BOTTOM_TO_TOP) {
- tile_list->sort(TileComparator(tile_order, center, &state.tiles[0]));
- }
- tile_list++;
- cur_tiles = 0;
- cur_device++;
- }
- }
- }
- }
- if (sliced) {
- tile_list++;
+ if (!node_from_image_spec_atttributes(&pass, image_spec, attr_name_prefix)) {
+ return false;
}
+
+ buffer_params->passes.emplace_back(std::move(pass));
}
- return idx;
+ buffer_params->update_passes();
+
+ return true;
}
-void TileManager::gen_render_tiles()
+/* Configure image specification for the given buffer parameters and passes.
+ *
+ * Image channels will be strictly ordered to match the content of the corresponding buffer, and
+ * the metadata will be set so that the render buffers and passes can be reconstructed from it.
+ *
+ * If the tile size is different from (0, 0), the image specification will be configured to use
+ * the given tile size for tiled IO. */
+static bool configure_image_spec_from_buffer(ImageSpec *image_spec,
+ const BufferParams &buffer_params,
+ const int2 tile_size = make_int2(0, 0))
{
- /* Regenerate just the render tiles for progressive render. */
- foreach (Tile &tile, state.tiles) {
- tile.state = Tile::RENDER;
- state.render_tiles[tile.device].push_back(tile.index);
+ const std::vector<std::string> channel_names = exr_channel_names_for_passes(buffer_params);
+ const int num_channels = channel_names.size();
+
+ *image_spec = ImageSpec(
+ buffer_params.width, buffer_params.height, num_channels, TypeDesc::FLOAT);
+
+ image_spec->channelnames = move(channel_names);
+
+ if (!buffer_params_to_image_spec_atttributes(image_spec, buffer_params)) {
+ return false;
+ }
+
+ if (tile_size.x != 0 || tile_size.y != 0) {
+ DCHECK_GT(tile_size.x, 0);
+ DCHECK_GT(tile_size.y, 0);
+
+ image_spec->tile_width = tile_size.x;
+ image_spec->tile_height = tile_size.y;
}
+
+ return true;
}
-void TileManager::set_tiles()
+/* --------------------------------------------------------------------
+ * Tile Manager.
+ */
+
+TileManager::TileManager()
{
- int resolution = state.resolution_divider;
- int image_w = max(1, params.width / resolution);
- int image_h = max(1, params.height / resolution);
+ /* Use process ID to separate different processes.
+   * To ensure uniqueness within a process, use a combination of the object address and an
+   * instance index. This solves the problem of possible object re-allocation at the same
+   * address, and avoids conflicts when the counter overflows while there are still active
+   * instances of the class. */
+ const int tile_manager_id = g_instance_index.fetch_add(1, std::memory_order_relaxed);
+ tile_file_unique_part_ = to_string(system_self_process_id()) + "-" +
+ to_string(reinterpret_cast<uintptr_t>(this)) + "-" +
+ to_string(tile_manager_id);
+}
- state.num_tiles = gen_tiles(!background);
+TileManager::~TileManager()
+{
+}
+
+void TileManager::reset_scheduling(const BufferParams &params, int2 tile_size)
+{
+ VLOG(3) << "Using tile size of " << tile_size;
+
+ close_tile_output();
+
+ tile_size_ = tile_size;
+
+ tile_state_.num_tiles_x = divide_up(params.width, tile_size_.x);
+ tile_state_.num_tiles_y = divide_up(params.height, tile_size_.y);
+ tile_state_.num_tiles = tile_state_.num_tiles_x * tile_state_.num_tiles_y;
+
+ tile_state_.next_tile_index = 0;
+
+ tile_state_.current_tile = Tile();
+}
+
+void TileManager::update(const BufferParams &params, const Scene *scene)
+{
+ DCHECK_NE(params.pass_stride, -1);
+
+ buffer_params_ = params;
- state.buffer.width = image_w;
- state.buffer.height = image_h;
+  /* TODO(sergey): Proper error handling, so that if configuration has failed we don't attempt to
+ * write to a partially configured file. */
+ configure_image_spec_from_buffer(&write_state_.image_spec, buffer_params_, tile_size_);
- state.buffer.full_x = params.full_x / resolution;
- state.buffer.full_y = params.full_y / resolution;
- state.buffer.full_width = max(1, params.full_width / resolution);
- state.buffer.full_height = max(1, params.full_height / resolution);
+ const DenoiseParams denoise_params = scene->integrator->get_denoise_params();
+ node_to_image_spec_atttributes(
+ &write_state_.image_spec, &denoise_params, ATTR_DENOISE_SOCKET_PREFIX);
}
-int TileManager::get_neighbor_index(int index, int neighbor)
+bool TileManager::done()
{
- /* Neighbor indices:
- * 0 1 2
- * 3 4 5
- * 6 7 8
- */
- static const int dx[] = {-1, 0, 1, -1, 0, 1, -1, 0, 1};
- static const int dy[] = {-1, -1, -1, 0, 0, 0, 1, 1, 1};
-
- int resolution = state.resolution_divider;
- int image_w = max(1, params.width / resolution);
- int image_h = max(1, params.height / resolution);
-
- int num = min(image_h, num_devices);
- int slice_num = !background ? num : 1;
- int slice_h = image_h / slice_num;
-
- int tile_w = (tile_size.x >= image_w) ? 1 : divide_up(image_w, tile_size.x);
- int tile_h = (tile_size.y >= slice_h) ? 1 : divide_up(slice_h, tile_size.y);
-
- /* Tiles in the state tile list are always indexed from left to right, top to bottom. */
- int nx = (index % tile_w) + dx[neighbor];
- int ny = (index / tile_w) + dy[neighbor];
- if (nx < 0 || ny < 0 || nx >= tile_w || ny >= tile_h * slice_num)
- return -1;
-
- return ny * state.tile_stride + nx;
+ return tile_state_.next_tile_index == tile_state_.num_tiles;
}
-/* Checks whether all neighbors of a tile (as well as the tile itself) are at least at state
- * min_state. */
-bool TileManager::check_neighbor_state(int index, Tile::State min_state)
+bool TileManager::next()
{
- if (index < 0 || state.tiles[index].state < min_state) {
+ if (done()) {
return false;
}
- for (int neighbor = 0; neighbor < 9; neighbor++) {
- int nindex = get_neighbor_index(index, neighbor);
- /* Out-of-bounds tiles don't matter. */
- if (nindex >= 0 && state.tiles[nindex].state < min_state) {
- return false;
- }
- }
+
+ tile_state_.current_tile = get_tile_for_index(tile_state_.next_tile_index);
+
+ ++tile_state_.next_tile_index;
return true;
}
-/* Returns whether the tile should be written (and freed if no denoising is used) instead of
- * updating. */
-bool TileManager::finish_tile(const int index, const bool need_denoise, bool &delete_tile)
+Tile TileManager::get_tile_for_index(int index) const
{
- delete_tile = false;
-
- switch (state.tiles[index].state) {
- case Tile::RENDER: {
- if (!(schedule_denoising && need_denoise)) {
- state.tiles[index].state = Tile::DONE;
- delete_tile = !progressive;
- return true;
- }
- state.tiles[index].state = Tile::RENDERED;
- /* For each neighbor and the tile itself, check whether all of its neighbors have been
- * rendered. If yes, it can be denoised. */
- for (int neighbor = 0; neighbor < 9; neighbor++) {
- int nindex = get_neighbor_index(index, neighbor);
- if (check_neighbor_state(nindex, Tile::RENDERED)) {
- state.tiles[nindex].state = Tile::DENOISE;
- state.denoising_tiles[state.tiles[nindex].device].push_back(nindex);
- }
- }
- return false;
- }
- case Tile::DENOISE: {
- state.tiles[index].state = Tile::DENOISED;
- /* For each neighbor and the tile itself, check whether all of its neighbors have been
- * denoised. If yes, it can be freed. */
- for (int neighbor = 0; neighbor < 9; neighbor++) {
- int nindex = get_neighbor_index(index, neighbor);
- if (check_neighbor_state(nindex, Tile::DENOISED)) {
- state.tiles[nindex].state = Tile::DONE;
- /* Do not delete finished tiles in progressive mode. */
- if (!progressive) {
- /* It can happen that the tile just finished denoising and already can be freed here.
- * However, in that case it still has to be written before deleting, so we can't delete
- * it yet. */
- if (neighbor == 4) {
- delete_tile = true;
- }
- else {
- delete state.tiles[nindex].buffers;
- state.tiles[nindex].buffers = NULL;
- }
- }
- }
- }
- return true;
- }
- default:
- assert(false);
- return true;
+  /* TODO(sergey): Consider using a Hilbert spiral or, maybe, even making it configurable. Not
+   * sure this brings a lot of value since this is only applicable to big tiles. */
+
+ const int tile_y = index / tile_state_.num_tiles_x;
+ const int tile_x = index - tile_y * tile_state_.num_tiles_x;
+
+ Tile tile;
+
+ tile.x = tile_x * tile_size_.x;
+ tile.y = tile_y * tile_size_.y;
+ tile.width = tile_size_.x;
+ tile.height = tile_size_.y;
+
+ tile.width = min(tile.width, buffer_params_.width - tile.x);
+ tile.height = min(tile.height, buffer_params_.height - tile.y);
+
+ return tile;
+}
+
+const Tile &TileManager::get_current_tile() const
+{
+ return tile_state_.current_tile;
+}
+
+bool TileManager::open_tile_output()
+{
+ write_state_.filename = path_temp_get("cycles-tile-buffer-" + tile_file_unique_part_ + "-" +
+ to_string(write_state_.tile_file_index) + ".exr");
+
+ write_state_.tile_out = ImageOutput::create(write_state_.filename);
+ if (!write_state_.tile_out) {
+ LOG(ERROR) << "Error creating image output for " << write_state_.filename;
+ return false;
+ }
+
+ if (!write_state_.tile_out->supports("tiles")) {
+ LOG(ERROR) << "Progress tile file format does not support tiling.";
+ return false;
}
+
+ write_state_.tile_out->open(write_state_.filename, write_state_.image_spec);
+ write_state_.num_tiles_written = 0;
+
+ VLOG(3) << "Opened tile file " << write_state_.filename;
+
+ return true;
}
-bool TileManager::next_tile(Tile *&tile, int device, uint tile_types)
+bool TileManager::close_tile_output()
{
- /* Preserve device if requested, unless this is a separate denoising device that just wants to
- * grab any available tile. */
- const bool preserve_device = preserve_tile_device && device < num_devices;
-
- if (tile_types & RenderTile::DENOISE) {
- int tile_index = -1;
- int logical_device = preserve_device ? device : 0;
-
- while (logical_device < state.denoising_tiles.size()) {
- if (state.denoising_tiles[logical_device].empty()) {
- if (preserve_device) {
- break;
- }
- else {
- logical_device++;
- continue;
- }
- }
+ if (!write_state_.tile_out) {
+ return true;
+ }
- tile_index = state.denoising_tiles[logical_device].front();
- state.denoising_tiles[logical_device].pop_front();
- break;
- }
+ const bool success = write_state_.tile_out->close();
+ write_state_.tile_out = nullptr;
- if (tile_index >= 0) {
- tile = &state.tiles[tile_index];
- return true;
- }
+ if (!success) {
+ LOG(ERROR) << "Error closing tile file.";
+ return false;
}
- if (tile_types & RenderTile::PATH_TRACE) {
- int tile_index = -1;
- int logical_device = preserve_device ? device : 0;
-
- while (logical_device < state.render_tiles.size()) {
- if (state.render_tiles[logical_device].empty()) {
- if (preserve_device) {
- break;
- }
- else {
- logical_device++;
- continue;
- }
- }
+ VLOG(3) << "Tile output is closed.";
- tile_index = state.render_tiles[logical_device].front();
- state.render_tiles[logical_device].pop_front();
- break;
+ return true;
+}
+
+bool TileManager::write_tile(const RenderBuffers &tile_buffers)
+{
+ if (!write_state_.tile_out) {
+ if (!open_tile_output()) {
+ return false;
}
+ }
- if (tile_index >= 0) {
- tile = &state.tiles[tile_index];
- return true;
+ DCHECK_EQ(tile_buffers.params.pass_stride, buffer_params_.pass_stride);
+
+ const BufferParams &tile_params = tile_buffers.params;
+
+ vector<float> pixel_storage;
+ const float *pixels = tile_buffers.buffer.data();
+
+ /* Tiled writing expects pixels to contain data for an entire tile. Pad the render buffers with
+ * empty pixels for tiles which are on the image boundary. */
+ if (tile_params.width != tile_size_.x || tile_params.height != tile_size_.y) {
+ const int64_t pass_stride = tile_params.pass_stride;
+ const int64_t src_row_stride = tile_params.width * pass_stride;
+
+ const int64_t dst_row_stride = tile_size_.x * pass_stride;
+ pixel_storage.resize(dst_row_stride * tile_size_.y);
+
+ const float *src = tile_buffers.buffer.data();
+ float *dst = pixel_storage.data();
+ pixels = dst;
+
+ for (int y = 0; y < tile_params.height; ++y, src += src_row_stride, dst += dst_row_stride) {
+ memcpy(dst, src, src_row_stride * sizeof(float));
}
}
- return false;
-}
+ const int tile_x = tile_params.full_x - buffer_params_.full_x;
+ const int tile_y = tile_params.full_y - buffer_params_.full_y;
-bool TileManager::done()
-{
- int end_sample = (range_num_samples == -1) ? num_samples :
- range_start_sample + range_num_samples;
- return (state.resolution_divider == pixel_size) &&
- (state.sample + state.num_samples >= end_sample);
+ VLOG(3) << "Write tile at " << tile_x << ", " << tile_y;
+ if (!write_state_.tile_out->write_tile(tile_x, tile_y, 0, TypeDesc::FLOAT, pixels)) {
+ LOG(ERROR) << "Error writing tile " << write_state_.tile_out->geterror();
+ }
+
+ ++write_state_.num_tiles_written;
+
+ return true;
}
-bool TileManager::has_tiles()
+void TileManager::finish_write_tiles()
{
- foreach (Tile &tile, state.tiles) {
- if (tile.state != Tile::DONE) {
- return true;
+ if (!write_state_.tile_out) {
+    /* None of the tiles were written, hence the file was not created.
+     * Avoid creating a fully empty file since it is redundant. */
+ return;
+ }
+
+  /* EXR expects all tiles to be present in the file, so explicitly write missing tiles as all-zero. */
+ if (write_state_.num_tiles_written < tile_state_.num_tiles) {
+ vector<float> pixel_storage(tile_size_.x * tile_size_.y * buffer_params_.pass_stride);
+
+ for (int tile_index = write_state_.num_tiles_written; tile_index < tile_state_.num_tiles;
+ ++tile_index) {
+ const Tile tile = get_tile_for_index(tile_index);
+
+ VLOG(3) << "Write dummy tile at " << tile.x << ", " << tile.y;
+
+ write_state_.tile_out->write_tile(tile.x, tile.y, 0, TypeDesc::FLOAT, pixel_storage.data());
}
}
- return false;
+
+ close_tile_output();
+
+ if (full_buffer_written_cb) {
+ full_buffer_written_cb(write_state_.filename);
+ }
+
+  /* Advance the counter upon explicit finish of the file.
+   * Makes it possible to re-use the tile manager for another scene, and avoids unnecessary
+   * increments of the tile-file-within-session index. */
+ ++write_state_.tile_file_index;
+
+ write_state_.filename = "";
}
-bool TileManager::next()
+bool TileManager::read_full_buffer_from_disk(const string_view filename,
+ RenderBuffers *buffers,
+ DenoiseParams *denoise_params)
{
- if (done())
+ unique_ptr<ImageInput> in(ImageInput::open(filename));
+ if (!in) {
+ LOG(ERROR) << "Error opening tile file " << filename;
return false;
+ }
+
+ const ImageSpec &image_spec = in->spec();
- if (progressive && state.resolution_divider > pixel_size) {
- state.sample = 0;
- state.resolution_divider = max(state.resolution_divider / 2, pixel_size);
- state.num_samples = 1;
- set_tiles();
+ BufferParams buffer_params;
+ if (!buffer_params_from_image_spec_atttributes(&buffer_params, image_spec)) {
+ return false;
}
- else {
- state.sample++;
+ buffers->reset(buffer_params);
- if (progressive)
- state.num_samples = 1;
- else if (range_num_samples == -1)
- state.num_samples = num_samples;
- else
- state.num_samples = range_num_samples;
+ if (!node_from_image_spec_atttributes(denoise_params, image_spec, ATTR_DENOISE_SOCKET_PREFIX)) {
+ return false;
+ }
- state.resolution_divider = pixel_size;
+ if (!in->read_image(TypeDesc::FLOAT, buffers->buffer.data())) {
+ LOG(ERROR) << "Error reading pixels from the tile file " << in->geterror();
+ return false;
+ }
- if (state.sample == range_start_sample) {
- set_tiles();
- }
- else {
- gen_render_tiles();
- }
+ if (!in->close()) {
+ LOG(ERROR) << "Error closing tile file " << in->geterror();
+ return false;
}
return true;
}
-int TileManager::get_num_effective_samples()
-{
- return (range_num_samples == -1) ? num_samples : range_num_samples;
-}
-
CCL_NAMESPACE_END
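The channel-naming scheme in `exr_channel_names_for_passes()` is the key trick in this file: EXR sorts channels alphabetically by the first part of their name, so a fixed-width, zero-padded pass index is prepended to make alphabetical order coincide with buffer order. A standalone sketch of just that scheme, using hypothetical pass names and component counts:

    #include <cstdio>
    #include <string>
    #include <utility>
    #include <vector>

    int main()
    {
      const char *component_suffixes[] = {"R", "G", "B", "A"};

      // Hypothetical pass list: name and number of components.
      const std::vector<std::pair<std::string, int>> passes = {{"combined", 4}, {"depth", 1}};

      std::vector<std::string> channel_names;
      int pass_index = 0;
      for (const auto &pass : passes) {
        // Fixed-width prefix: "00000000", "00000001", ... sorts exactly the way
        // the passes are laid out in the render buffer.
        char prefix[16];
        std::snprintf(prefix, sizeof(prefix), "%08d", pass_index++);
        for (int i = 0; i < pass.second; ++i) {
          channel_names.push_back(std::string(prefix) + pass.first + "." + component_suffixes[i]);
        }
      }

      for (const std::string &channel : channel_names) {
        std::printf("%s\n", channel.c_str()); /* 00000000combined.R ... 00000001depth.R */
      }
      return 0;
    }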
diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h
index 790a56f9445..71b9e966278 100644
--- a/intern/cycles/render/tile.h
+++ b/intern/cycles/render/tile.h
@@ -14,159 +14,151 @@
* limitations under the License.
*/
-#ifndef __TILE_H__
-#define __TILE_H__
-
-#include <limits.h>
+#pragma once
#include "render/buffers.h"
-#include "util/util_list.h"
+#include "util/util_image.h"
+#include "util/util_string.h"
+#include "util/util_unique_ptr.h"
CCL_NAMESPACE_BEGIN
-/* Tile */
+class DenoiseParams;
+class Scene;
+
+/* --------------------------------------------------------------------
+ * Tile.
+ */
class Tile {
public:
- int index;
- int x, y, w, h;
- int device;
- /* RENDER: The tile has to be rendered.
- * RENDERED: The tile has been rendered, but can't be denoised yet (waiting for neighbors).
- * DENOISE: The tile can be denoised now.
- * DENOISED: The tile has been denoised, but can't be freed yet (waiting for neighbors).
- * DONE: The tile is finished and has been freed. */
- typedef enum { RENDER = 0, RENDERED, DENOISE, DENOISED, DONE } State;
- State state;
- RenderBuffers *buffers;
+ int x = 0, y = 0;
+ int width = 0, height = 0;
Tile()
{
}
-
- Tile(int index_, int x_, int y_, int w_, int h_, int device_, State state_ = RENDER)
- : index(index_), x(x_), y(y_), w(w_), h(h_), device(device_), state(state_), buffers(NULL)
- {
- }
};
-/* Tile order */
-
-/* Note: this should match enum_tile_order in properties.py */
-enum TileOrder {
- TILE_CENTER = 0,
- TILE_RIGHT_TO_LEFT = 1,
- TILE_LEFT_TO_RIGHT = 2,
- TILE_TOP_TO_BOTTOM = 3,
- TILE_BOTTOM_TO_TOP = 4,
- TILE_HILBERT_SPIRAL = 5,
-};
-
-/* Tile Manager */
+/* --------------------------------------------------------------------
+ * Tile Manager.
+ */
class TileManager {
public:
- BufferParams params;
-
- struct State {
- vector<Tile> tiles;
- int tile_stride;
- BufferParams buffer;
- int sample;
- int num_samples;
- int resolution_divider;
- int num_tiles;
-
- /* Total samples over all pixels: Generally num_samples*num_pixels,
- * but can be higher due to the initial resolution division for previews. */
- uint64_t total_pixel_samples;
-
- /* These lists contain the indices of the tiles to be rendered/denoised and are used
- * when acquiring a new tile for the device.
- * Each list in each vector is for one logical device. */
- vector<list<int>> render_tiles;
- vector<list<int>> denoising_tiles;
- } state;
-
- int num_samples;
- int slice_overlap;
-
- TileManager(bool progressive,
- int num_samples,
- int2 tile_size,
- int start_resolution,
- bool preserve_tile_device,
- bool background,
- TileOrder tile_order,
- int num_devices = 1,
- int pixel_size = 1);
+  /* This callback is invoked whenever the on-disk tile storage file is closed after writing. */
+ function<void(string_view)> full_buffer_written_cb;
+
+ TileManager();
~TileManager();
- void device_free();
- void reset(BufferParams &params, int num_samples);
- void set_samples(int num_samples);
+ TileManager(const TileManager &other) = delete;
+ TileManager(TileManager &&other) noexcept = delete;
+ TileManager &operator=(const TileManager &other) = delete;
+ TileManager &operator=(TileManager &&other) = delete;
+
+  /* Reset current progress and start a new render of the full frame, split into tiles of the
+   * given size.
+   * Only touches the scheduling-related state of the tile manager. */
+  /* TODO(sergey): Consider using tile area instead of exact size to help deal with extreme
+   * cases of stretched renders. */
+ void reset_scheduling(const BufferParams &params, int2 tile_size);
+
+ /* Update for the known buffer passes and scene parameters.
+   * Will store all parameters needed for buffer access outside of the scene graph. */
+ void update(const BufferParams &params, const Scene *scene);
+
+ inline int get_num_tiles() const
+ {
+ return tile_state_.num_tiles;
+ }
+
+ inline bool has_multiple_tiles() const
+ {
+ return tile_state_.num_tiles > 1;
+ }
+
bool next();
- bool next_tile(Tile *&tile, int device, uint tile_types);
- bool finish_tile(const int index, const bool need_denoise, bool &delete_tile);
bool done();
- bool has_tiles();
- void set_tile_order(TileOrder tile_order_)
+ const Tile &get_current_tile() const;
+
+ /* Write render buffer of a tile to a file on disk.
+ *
+   * Opens the file for writing when the first tile is written.
+ *
+ * Returns true on success. */
+ bool write_tile(const RenderBuffers &tile_buffers);
+
+ /* Inform the tile manager that no more tiles will be written to disk.
+   * The file will be considered final, and all handles to it will be closed. */
+ void finish_write_tiles();
+
+ /* Check whether any tile has been written to disk. */
+ inline bool has_written_tiles() const
{
- tile_order = tile_order_;
+ return write_state_.num_tiles_written != 0;
}
- int get_neighbor_index(int index, int neighbor);
- bool check_neighbor_state(int index, Tile::State state);
+ /* Read full frame render buffer from tiles file on disk.
+ *
+ * Returns true on success. */
+ bool read_full_buffer_from_disk(string_view filename,
+ RenderBuffers *buffers,
+ DenoiseParams *denoise_params);
- /* ** Sample range rendering. ** */
+ protected:
+  /* Get the tile configuration for the given index.
+   * The tile index must be within [0, tile_state_.num_tiles). */
+ Tile get_tile_for_index(int index) const;
- /* Start sample in the range. */
- int range_start_sample;
+ bool open_tile_output();
+ bool close_tile_output();
- /* Number to samples in the rendering range. */
- int range_num_samples;
+ /* Part of an on-disk tile file name which avoids conflicts between several Cycles instances or
+ * several sessions. */
+ string tile_file_unique_part_;
- /* Get number of actual samples to render. */
- int get_num_effective_samples();
+ int2 tile_size_ = make_int2(0, 0);
- /* Schedule tiles for denoising after they've been rendered. */
- bool schedule_denoising;
+ BufferParams buffer_params_;
- protected:
- void set_tiles();
-
- bool progressive;
- int2 tile_size;
- TileOrder tile_order;
- int start_resolution;
- int pixel_size;
- int num_devices;
-
- /* in some cases it is important that the same tile will be returned for the same
- * device it was originally generated for (i.e. viewport rendering when buffer is
- * allocating once for tile and then always used by it)
- *
- * in other cases any tile could be handled by any device (i.e. final rendering
- * without progressive refine)
- */
- bool preserve_tile_device;
-
- /* for background render tiles should exactly match render parts generated from
- * blender side, which means image first gets split into tiles and then tiles are
- * assigning to render devices
- *
- * however viewport rendering expects tiles to be allocated in a special way,
- * meaning image is being sliced horizontally first and every device handles
- * its own slice
- */
- bool background;
-
- /* Generate tile list, return number of tiles. */
- int gen_tiles(bool sliced);
- void gen_render_tiles();
+ /* Tile scheduling state. */
+ struct {
+ int num_tiles_x = 0;
+ int num_tiles_y = 0;
+ int num_tiles = 0;
+
+ int next_tile_index;
+
+ Tile current_tile;
+ } tile_state_;
+
+  /* State of writing tiles to a file on disk. */
+ struct {
+ /* Index of a tile file used during the current session.
+ * This number is used for the file name construction, making it possible to render several
+     * scenes throughout the duration of the session and keep all results available for later read
+ * access. */
+ int tile_file_index = 0;
+
+ string filename;
+
+ /* Specification of the tile image which corresponds to the buffer parameters.
+     * Contains channels configured according to the pass configuration of the path tracer.
+ *
+     * Output images are saved using this specification; input images are expected to have a
+     * matching specification. */
+ ImageSpec image_spec;
+
+ /* Output handle for the tile file.
+ *
+     * This file cannot be closed until all tiles have been provided, so the handle is stored in
+ * the state and is created whenever writing is requested. */
+ unique_ptr<ImageOutput> tile_out;
+
+ int num_tiles_written = 0;
+ } write_state_;
};
CCL_NAMESPACE_END
-
-#endif /* __TILE_H__ */
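The scheduler declared above is deliberately simple: a row-major grid of `num_tiles_x * num_tiles_y` tiles, with border tiles clamped to the image size. A compilable sketch of that grid math, using simplified stand-in types rather than the Cycles API itself:

    #include <algorithm>
    #include <cstdio>

    struct TileSketch {
      int x = 0, y = 0;
      int width = 0, height = 0;
    };

    static int divide_up(int x, int y)
    {
      return (x + y - 1) / y;
    }

    // Mirrors reset_scheduling() + get_tile_for_index() above.
    static TileSketch tile_for_index(int index, int image_w, int image_h, int tile_w, int tile_h)
    {
      const int num_tiles_x = divide_up(image_w, tile_w);

      const int tile_y = index / num_tiles_x;
      const int tile_x = index - tile_y * num_tiles_x;

      TileSketch tile;
      tile.x = tile_x * tile_w;
      tile.y = tile_y * tile_h;

      // Border tiles are clamped so they never extend past the image.
      tile.width = std::min(tile_w, image_w - tile.x);
      tile.height = std::min(tile_h, image_h - tile.y);
      return tile;
    }

    int main()
    {
      // A 100x80 image in 64x64 tiles: a 2x2 grid whose right and bottom tiles shrink.
      const int num_tiles = divide_up(100, 64) * divide_up(80, 64);
      for (int index = 0; index < num_tiles; ++index) {
        const TileSketch t = tile_for_index(index, 100, 80, 64, 64);
        std::printf("tile %d: origin (%d, %d), size %dx%d\n", index, t.x, t.y, t.width, t.height);
      }
      return 0;
    }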
diff --git a/intern/cycles/test/CMakeLists.txt b/intern/cycles/test/CMakeLists.txt
index 65a692acd03..0f6b435813f 100644
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -32,6 +32,7 @@ set(INC
set(ALL_CYCLES_LIBRARIES
cycles_device
cycles_kernel
+ cycles_integrator
cycles_render
cycles_bvh
cycles_graph
@@ -45,8 +46,12 @@ include_directories(${INC})
cycles_link_directories()
set(SRC
+ integrator_adaptive_sampling_test.cpp
+ integrator_render_scheduler_test.cpp
+ integrator_tile_test.cpp
render_graph_finalize_test.cpp
util_aligned_malloc_test.cpp
+ util_math_test.cpp
util_path_test.cpp
util_string_test.cpp
util_task_test.cpp
diff --git a/intern/cycles/test/integrator_adaptive_sampling_test.cpp b/intern/cycles/test/integrator_adaptive_sampling_test.cpp
new file mode 100644
index 00000000000..3ed6a23125d
--- /dev/null
+++ b/intern/cycles/test/integrator_adaptive_sampling_test.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+
+#include "integrator/adaptive_sampling.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+TEST(AdaptiveSampling, schedule_samples)
+{
+ AdaptiveSampling adaptive_sampling;
+ adaptive_sampling.use = true;
+ adaptive_sampling.min_samples = 0;
+ adaptive_sampling.adaptive_step = 4;
+
+ for (int sample = 2; sample < 32; ++sample) {
+ for (int num_samples = 8; num_samples < 32; ++num_samples) {
+ const int num_samples_aligned = adaptive_sampling.align_samples(sample, num_samples);
+      /* NOTE: `sample + num_samples_aligned` is the number of samples after rendering, so we need
+ * to convert this to the 0-based index of the last sample. */
+ EXPECT_TRUE(adaptive_sampling.need_filter(sample + num_samples_aligned - 1));
+ }
+ }
+}
+
+TEST(AdaptiveSampling, align_samples)
+{
+ AdaptiveSampling adaptive_sampling;
+ adaptive_sampling.use = true;
+  adaptive_sampling.min_samples = 11 /* rounded sqrt(128) */;
+ adaptive_sampling.adaptive_step = 4;
+
+ /* Filtering will happen at the following samples:
+ * 15, 19, 23, 27, 31, 35, 39, 43 */
+
+  /* Requested sample and number of samples will result in a number of samples lower than
+ * `min_samples`. */
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 4), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 7), 7);
+
+  /* Request a number of samples higher than the minimum samples before filtering, but prior to
+   * the first sample at which filtering will happen. */
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 15), 15);
+
+  /* When rendering many samples from the very beginning, limit the number of samples by the
+   * first sample at which filtering is to happen. */
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 16), 16);
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 17), 16);
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 20), 16);
+ EXPECT_EQ(adaptive_sampling.align_samples(0, 60), 16);
+
+ /* Similar to above, but start sample is not 0. */
+ EXPECT_EQ(adaptive_sampling.align_samples(9, 8), 7);
+ EXPECT_EQ(adaptive_sampling.align_samples(9, 20), 7);
+ EXPECT_EQ(adaptive_sampling.align_samples(9, 60), 7);
+
+ /* Start sample is past the minimum required samples, but prior to the first filter sample. */
+ EXPECT_EQ(adaptive_sampling.align_samples(12, 6), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(12, 20), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(12, 60), 4);
+
+ /* Start sample is the sample which is to be filtered. */
+ EXPECT_EQ(adaptive_sampling.align_samples(15, 4), 1);
+ EXPECT_EQ(adaptive_sampling.align_samples(15, 6), 1);
+ EXPECT_EQ(adaptive_sampling.align_samples(15, 10), 1);
+ EXPECT_EQ(adaptive_sampling.align_samples(58, 2), 2);
+
+ /* Start sample is past the sample which is to be filtered. */
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 3), 3);
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 4), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 5), 4);
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 10), 4);
+
+ /* Should never exceed requested number of samples. */
+ EXPECT_EQ(adaptive_sampling.align_samples(15, 2), 1);
+ EXPECT_EQ(adaptive_sampling.align_samples(16, 2), 2);
+ EXPECT_EQ(adaptive_sampling.align_samples(17, 2), 2);
+ EXPECT_EQ(adaptive_sampling.align_samples(18, 2), 2);
+}
+
+TEST(AdaptiveSampling, need_filter)
+{
+ AdaptiveSampling adaptive_sampling;
+ adaptive_sampling.use = true;
+  adaptive_sampling.min_samples = 11 /* rounded sqrt(128) */;
+ adaptive_sampling.adaptive_step = 4;
+
+ const vector<int> expected_samples_to_filter = {
+ {15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59}};
+
+ vector<int> actual_samples_to_filter;
+ for (int sample = 0; sample < 60; ++sample) {
+ if (adaptive_sampling.need_filter(sample)) {
+ actual_samples_to_filter.push_back(sample);
+ }
+ }
+
+ EXPECT_EQ(actual_samples_to_filter, expected_samples_to_filter);
+}
+
+CCL_NAMESPACE_END
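The filter schedule these tests encode (samples 15, 19, 23, ... for `min_samples = 11` and `adaptive_step = 4`) is consistent with a power-of-two step predicate. The sketch below is an assumption reconstructed from the expected values, not a copy of the Cycles `need_filter()` implementation:

    #include <cassert>

    // Assumed reconstruction of the filter schedule, not the Cycles code.
    struct AdaptiveSamplingSketch {
      int min_samples = 0;
      int adaptive_step = 4; /* Assumed to be a power of two. */

      bool need_filter(int sample) const
      {
        // Filter only past the minimum sample count, on every step-th sample.
        if (sample <= min_samples) {
          return false;
        }
        return (sample & (adaptive_step - 1)) == (adaptive_step - 1);
      }
    };

    int main()
    {
      AdaptiveSamplingSketch sampling;
      sampling.min_samples = 11;

      assert(sampling.need_filter(15));  /* First filtered sample. */
      assert(sampling.need_filter(19));
      assert(!sampling.need_filter(11)); /* At or below min_samples: never. */
      assert(!sampling.need_filter(14)); /* Not on the step boundary. */
      return 0;
    }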
diff --git a/intern/cycles/test/integrator_render_scheduler_test.cpp b/intern/cycles/test/integrator_render_scheduler_test.cpp
new file mode 100644
index 00000000000..b4efbc2d1a7
--- /dev/null
+++ b/intern/cycles/test/integrator_render_scheduler_test.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+
+#include "integrator/render_scheduler.h"
+
+CCL_NAMESPACE_BEGIN
+
+TEST(IntegratorRenderScheduler, calculate_resolution_divider_for_resolution)
+{
+ EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 1920), 1);
+ EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 960), 2);
+ EXPECT_EQ(calculate_resolution_divider_for_resolution(1920, 1080, 480), 4);
+}
+
+TEST(IntegratorRenderScheduler, calculate_resolution_for_divider)
+{
+ EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 1), 1440);
+ EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 2), 720);
+ EXPECT_EQ(calculate_resolution_for_divider(1920, 1080, 4), 360);
+}
+
+CCL_NAMESPACE_END
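The expected values pin down the relation between resolution and divider: `sqrt(1920 * 1080) == 1440`, which suggests the effective resolution is the geometric mean of width and height divided by a power-of-two divider. The sketch below is inferred from the test values only and is not a copy of the scheduler code:

    #include <cassert>
    #include <cmath>

    // Assumed relation, reconstructed from the expected test values.
    static int calculate_resolution_for_divider_sketch(int width, int height, int divider)
    {
      return int(std::lround(std::sqrt(double(width) * height))) / divider;
    }

    static int calculate_resolution_divider_for_resolution_sketch(int width, int height, int target)
    {
      // Smallest power-of-two divider which brings the effective resolution
      // down to the requested target.
      int divider = 1;
      while (calculate_resolution_for_divider_sketch(width, height, divider) > target) {
        divider *= 2;
      }
      return divider;
    }

    int main()
    {
      assert(calculate_resolution_for_divider_sketch(1920, 1080, 2) == 720);
      assert(calculate_resolution_divider_for_resolution_sketch(1920, 1080, 960) == 2);
      return 0;
    }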
diff --git a/intern/cycles/test/integrator_tile_test.cpp b/intern/cycles/test/integrator_tile_test.cpp
new file mode 100644
index 00000000000..5bb57b48c3c
--- /dev/null
+++ b/intern/cycles/test/integrator_tile_test.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+
+#include "integrator/tile.h"
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+TEST(tile_calculate_best_size, Basic)
+{
+ /* Make sure CPU-like case is handled properly. */
+ EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1), TileSize(1, 1, 1));
+ EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1), TileSize(1, 1, 1));
+
+ /* Enough path states to fit an entire image with all samples. */
+ EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1920 * 1080),
+ TileSize(1920, 1080, 1));
+ EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1920 * 1080 * 100),
+ TileSize(1920, 1080, 100));
+}
+
+TEST(tile_calculate_best_size, Extreme)
+{
+ EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 262144, 131072), TileSize(1, 1, 512));
+ EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 1048576, 131072), TileSize(1, 1, 1024));
+ EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 10485760, 131072), TileSize(1, 1, 4096));
+
+ EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 8192 * 8192 * 2, 1024),
+ TileSize(1, 1, 1024));
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/test/render_graph_finalize_test.cpp b/intern/cycles/test/render_graph_finalize_test.cpp
index da9b29314a7..19c211fe5f7 100644
--- a/intern/cycles/test/render_graph_finalize_test.cpp
+++ b/intern/cycles/test/render_graph_finalize_test.cpp
@@ -181,7 +181,7 @@ class RenderGraph : public testing::Test {
util_logging_start();
util_logging_verbosity_set(1);
- device_cpu = Device::create(device_info, stats, profiler, true);
+ device_cpu = Device::create(device_info, stats, profiler);
scene = new Scene(scene_params, device_cpu);
}
diff --git a/intern/cycles/test/util_math_test.cpp b/intern/cycles/test/util_math_test.cpp
new file mode 100644
index 00000000000..b6ce3ef0cf3
--- /dev/null
+++ b/intern/cycles/test/util_math_test.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "testing/testing.h"
+
+#include "util/util_math.h"
+
+CCL_NAMESPACE_BEGIN
+
+TEST(math, next_power_of_two)
+{
+ EXPECT_EQ(next_power_of_two(0), 1);
+ EXPECT_EQ(next_power_of_two(1), 2);
+ EXPECT_EQ(next_power_of_two(2), 4);
+ EXPECT_EQ(next_power_of_two(3), 4);
+ EXPECT_EQ(next_power_of_two(4), 8);
+}
+
+TEST(math, prev_power_of_two)
+{
+ EXPECT_EQ(prev_power_of_two(0), 0);
+
+ EXPECT_EQ(prev_power_of_two(1), 1);
+ EXPECT_EQ(prev_power_of_two(2), 1);
+
+ EXPECT_EQ(prev_power_of_two(3), 2);
+ EXPECT_EQ(prev_power_of_two(4), 2);
+
+ EXPECT_EQ(prev_power_of_two(5), 4);
+ EXPECT_EQ(prev_power_of_two(6), 4);
+ EXPECT_EQ(prev_power_of_two(7), 4);
+ EXPECT_EQ(prev_power_of_two(8), 4);
+}
+
+TEST(math, reverse_integer_bits)
+{
+ EXPECT_EQ(reverse_integer_bits(0xFFFFFFFF), 0xFFFFFFFF);
+ EXPECT_EQ(reverse_integer_bits(0x00000000), 0x00000000);
+ EXPECT_EQ(reverse_integer_bits(0x1), 0x80000000);
+ EXPECT_EQ(reverse_integer_bits(0x80000000), 0x1);
+ EXPECT_EQ(reverse_integer_bits(0xFFFF0000), 0x0000FFFF);
+ EXPECT_EQ(reverse_integer_bits(0x0000FFFF), 0xFFFF0000);
+ EXPECT_EQ(reverse_integer_bits(0x00FF0000), 0x0000FF00);
+ EXPECT_EQ(reverse_integer_bits(0x0000FF00), 0x00FF0000);
+ EXPECT_EQ(reverse_integer_bits(0xAAAAAAAA), 0x55555555);
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/test/util_string_test.cpp b/intern/cycles/test/util_string_test.cpp
index 97f8daa65de..c9022d1b132 100644
--- a/intern/cycles/test/util_string_test.cpp
+++ b/intern/cycles/test/util_string_test.cpp
@@ -281,4 +281,40 @@ TEST(util_string_remove_trademark, r_space_middle)
EXPECT_EQ(str, "foo bar baz");
}
+/* ******** Tests for string_startswith() ******** */
+
+TEST(string_startswith, basic)
+{
+ EXPECT_TRUE(string_startswith("", ""));
+
+ EXPECT_FALSE(string_startswith("", "World"));
+ EXPECT_TRUE(string_startswith("Hello", ""));
+
+ EXPECT_FALSE(string_startswith("Hello", "World"));
+
+ EXPECT_TRUE(string_startswith("Hello", "Hello"));
+ EXPECT_TRUE(string_startswith("Hello", "He"));
+ EXPECT_TRUE(string_startswith("Hello", "H"));
+
+ EXPECT_FALSE(string_startswith("Hello", "e"));
+ EXPECT_FALSE(string_startswith("Hello", "HelloWorld"));
+}
+
+TEST(string_endswith, basic)
+{
+ EXPECT_TRUE(string_endswith("", ""));
+
+ EXPECT_FALSE(string_endswith("", "World"));
+ EXPECT_TRUE(string_endswith("Hello", ""));
+
+ EXPECT_FALSE(string_endswith("Hello", "World"));
+
+ EXPECT_TRUE(string_endswith("Hello", "Hello"));
+ EXPECT_TRUE(string_endswith("Hello", "lo"));
+ EXPECT_TRUE(string_endswith("Hello", "o"));
+
+ EXPECT_FALSE(string_endswith("Hello", "e"));
+ EXPECT_FALSE(string_endswith("Hello", "WorldHello"));
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h
index 13d177d2b25..de17efafcf2 100644
--- a/intern/cycles/util/util_atomic.h
+++ b/intern/cycles/util/util_atomic.h
@@ -34,56 +34,6 @@
#else /* __KERNEL_GPU__ */
-# ifdef __KERNEL_OPENCL__
-
-/* Float atomics implementation credits:
- * http://suhorukov.blogspot.in/2011/12/opencl-11-atomic-operations-on-floating.html
- */
-ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *source,
- const float operand)
-{
- union {
- unsigned int int_value;
- float float_value;
- } new_value;
- union {
- unsigned int int_value;
- float float_value;
- } prev_value;
- do {
- prev_value.float_value = *source;
- new_value.float_value = prev_value.float_value + operand;
- } while (atomic_cmpxchg((volatile ccl_global unsigned int *)source,
- prev_value.int_value,
- new_value.int_value) != prev_value.int_value);
- return new_value.float_value;
-}
-
-ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
- const float old_val,
- const float new_val)
-{
- union {
- unsigned int int_value;
- float float_value;
- } new_value, prev_value, result;
- prev_value.float_value = old_val;
- new_value.float_value = new_val;
- result.int_value = atomic_cmpxchg(
- (volatile ccl_global unsigned int *)dest, prev_value.int_value, new_value.int_value);
- return result.float_value;
-}
-
-# define atomic_fetch_and_add_uint32(p, x) atomic_add((p), (x))
-# define atomic_fetch_and_inc_uint32(p) atomic_inc((p))
-# define atomic_fetch_and_dec_uint32(p) atomic_dec((p))
-# define atomic_fetch_and_or_uint32(p, x) atomic_or((p), (x))
-
-# define CCL_LOCAL_MEM_FENCE CLK_LOCAL_MEM_FENCE
-# define ccl_barrier(flags) barrier(flags)
-
-# endif /* __KERNEL_OPENCL__ */
-
# ifdef __KERNEL_CUDA__
# define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x))
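
The OpenCL float atomic removed above is the textbook compare-and-swap loop: read the current value, compute the updated one, and retry until no other thread raced in between. The same technique expressed against standard C++ atomics rather than any Cycles API (a sketch, not code from this patch):

#include <atomic>

static inline float cas_add_and_fetch_float(std::atomic<float> &value, const float operand)
{
  float prev = value.load(std::memory_order_relaxed);
  float next;
  do {
    next = prev + operand;
    /* On failure compare_exchange_weak reloads `prev`, so the next iteration
     * retries against the value another thread just wrote. */
  } while (!value.compare_exchange_weak(prev, next, std::memory_order_relaxed));
  return next;
}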
diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp
index 74ecefa1917..1d598725c84 100644
--- a/intern/cycles/util/util_debug.cpp
+++ b/intern/cycles/util/util_debug.cpp
@@ -26,13 +26,7 @@
CCL_NAMESPACE_BEGIN
DebugFlags::CPU::CPU()
- : avx2(true),
- avx(true),
- sse41(true),
- sse3(true),
- sse2(true),
- bvh_layout(BVH_LAYOUT_AUTO),
- split_kernel(false)
+ : avx2(true), avx(true), sse41(true), sse3(true), sse2(true), bvh_layout(BVH_LAYOUT_AUTO)
{
reset();
}
@@ -58,11 +52,9 @@ void DebugFlags::CPU::reset()
#undef CHECK_CPU_FLAGS
bvh_layout = BVH_LAYOUT_AUTO;
-
- split_kernel = false;
}
-DebugFlags::CUDA::CUDA() : adaptive_compile(false), split_kernel(false)
+DebugFlags::CUDA::CUDA() : adaptive_compile(false)
{
reset();
}
@@ -71,8 +63,6 @@ void DebugFlags::CUDA::reset()
{
if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
-
- split_kernel = false;
}
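
The surviving reset() keeps the existing convention for opt-in debug behaviour: the mere presence of an environment variable flips a boolean at reset time. Reduced to its essentials (CYCLES_CUDA_ADAPTIVE_COMPILE is the variable the code above actually checks):

#include <cstdlib>

static inline bool env_flag_set(const char *name)
{
  return std::getenv(name) != nullptr; /* presence alone enables the flag */
}

/* e.g. adaptive_compile = env_flag_set("CYCLES_CUDA_ADAPTIVE_COMPILE"); */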
DebugFlags::OptiX::OptiX()
@@ -82,42 +72,7 @@ DebugFlags::OptiX::OptiX()
void DebugFlags::OptiX::reset()
{
- cuda_streams = 1;
- curves_api = false;
-}
-
-DebugFlags::OpenCL::OpenCL() : device_type(DebugFlags::OpenCL::DEVICE_ALL), debug(false)
-{
- reset();
-}
-
-void DebugFlags::OpenCL::reset()
-{
- /* Initialize device type from environment variables. */
- device_type = DebugFlags::OpenCL::DEVICE_ALL;
- char *device = getenv("CYCLES_OPENCL_TEST");
- if (device) {
- if (strcmp(device, "NONE") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_NONE;
- }
- else if (strcmp(device, "ALL") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_ALL;
- }
- else if (strcmp(device, "DEFAULT") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_DEFAULT;
- }
- else if (strcmp(device, "CPU") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_CPU;
- }
- else if (strcmp(device, "GPU") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_GPU;
- }
- else if (strcmp(device, "ACCELERATOR") == 0) {
- device_type = DebugFlags::OpenCL::DEVICE_ACCELERATOR;
- }
- }
- /* Initialize other flags from environment variables. */
- debug = (getenv("CYCLES_OPENCL_DEBUG") != NULL);
+ use_debug = false;
}
DebugFlags::DebugFlags() : viewport_static_bvh(false), running_inside_blender(false)
@@ -131,7 +86,6 @@ void DebugFlags::reset()
cpu.reset();
cuda.reset();
optix.reset();
- opencl.reset();
}
std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags)
@@ -142,40 +96,13 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags)
<< " SSE4.1 : " << string_from_bool(debug_flags.cpu.sse41) << "\n"
<< " SSE3 : " << string_from_bool(debug_flags.cpu.sse3) << "\n"
<< " SSE2 : " << string_from_bool(debug_flags.cpu.sse2) << "\n"
- << " BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n"
- << " Split : " << string_from_bool(debug_flags.cpu.split_kernel) << "\n";
+ << " BVH layout : " << bvh_layout_name(debug_flags.cpu.bvh_layout) << "\n";
os << "CUDA flags:\n"
<< " Adaptive Compile : " << string_from_bool(debug_flags.cuda.adaptive_compile) << "\n";
os << "OptiX flags:\n"
- << " CUDA streams : " << debug_flags.optix.cuda_streams << "\n";
-
- const char *opencl_device_type;
- switch (debug_flags.opencl.device_type) {
- case DebugFlags::OpenCL::DEVICE_NONE:
- opencl_device_type = "NONE";
- break;
- case DebugFlags::OpenCL::DEVICE_ALL:
- opencl_device_type = "ALL";
- break;
- case DebugFlags::OpenCL::DEVICE_DEFAULT:
- opencl_device_type = "DEFAULT";
- break;
- case DebugFlags::OpenCL::DEVICE_CPU:
- opencl_device_type = "CPU";
- break;
- case DebugFlags::OpenCL::DEVICE_GPU:
- opencl_device_type = "GPU";
- break;
- case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
- opencl_device_type = "ACCELERATOR";
- break;
- }
- os << "OpenCL flags:\n"
- << " Device type : " << opencl_device_type << "\n"
- << " Debug : " << string_from_bool(debug_flags.opencl.debug) << "\n"
- << " Memory limit : " << string_human_readable_size(debug_flags.opencl.mem_limit) << "\n";
+ << " Debug : " << string_from_bool(debug_flags.optix.use_debug) << "\n";
return os;
}
diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h
index f7e53f90f74..99e2723180c 100644
--- a/intern/cycles/util/util_debug.h
+++ b/intern/cycles/util/util_debug.h
@@ -79,9 +79,6 @@ class DebugFlags {
* CPUs and GPUs can be selected here instead.
*/
BVHLayout bvh_layout;
-
- /* Whether split kernel is used */
- bool split_kernel;
};
/* Descriptor of CUDA feature-set to be used. */
@@ -94,9 +91,6 @@ class DebugFlags {
/* Whether adaptive feature based runtime compile is enabled or not.
* Requires the CUDA Toolkit and only works on Linux atm. */
bool adaptive_compile;
-
- /* Whether split kernel is used */
- bool split_kernel;
};
/* Descriptor of OptiX feature-set to be used. */
@@ -106,61 +100,9 @@ class DebugFlags {
/* Reset flags to their defaults. */
void reset();
- /* Number of CUDA streams to launch kernels concurrently from. */
- int cuda_streams;
-
- /* Use OptiX curves API for hair instead of custom implementation. */
- bool curves_api;
- };
-
- /* Descriptor of OpenCL feature-set to be used. */
- struct OpenCL {
- OpenCL();
-
- /* Reset flags to their defaults. */
- void reset();
-
- /* Available device types.
- * Only gives a hint which devices to let user to choose from, does not
- * try to use any sort of optimal device or so.
- */
- enum DeviceType {
- /* None of OpenCL devices will be used. */
- DEVICE_NONE,
- /* All OpenCL devices will be used. */
- DEVICE_ALL,
- /* Default system OpenCL device will be used. */
- DEVICE_DEFAULT,
- /* Host processor will be used. */
- DEVICE_CPU,
- /* GPU devices will be used. */
- DEVICE_GPU,
- /* Dedicated OpenCL accelerator device will be used. */
- DEVICE_ACCELERATOR,
- };
-
- /* Available kernel types. */
- enum KernelType {
- /* Do automated guess which kernel to use, based on the officially
- * supported GPUs and such.
- */
- KERNEL_DEFAULT,
- /* Force mega kernel to be used. */
- KERNEL_MEGA,
- /* Force split kernel to be used. */
- KERNEL_SPLIT,
- };
-
- /* Requested device type. */
- DeviceType device_type;
-
- /* Use debug version of the kernel. */
- bool debug;
-
- /* TODO(mai): Currently this is only for OpenCL, but we should have it implemented for all
- * devices. */
- /* Artificial memory limit in bytes (0 if disabled). */
- size_t mem_limit;
+ /* Load the OptiX module with debug capabilities. This lowers the logging verbosity
+ * level, enables validations, and lowers the optimization level. */
+ bool use_debug;
};
/* Get instance of debug flags registry. */
@@ -182,9 +124,6 @@ class DebugFlags {
/* Requested OptiX flags. */
OptiX optix;
- /* Requested OpenCL flags. */
- OpenCL opencl;
-
private:
DebugFlags();
diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h
index 0a239a944a5..9b1698d461a 100644
--- a/intern/cycles/util/util_defines.h
+++ b/intern/cycles/util/util_defines.h
@@ -43,9 +43,9 @@
# define ccl_local_param
# define ccl_private
# define ccl_restrict __restrict
-# define ccl_ref &
# define ccl_optional_struct_init
# define ccl_loop_no_unroll
+# define ccl_attr_maybe_unused [[maybe_unused]]
# define __KERNEL_WITH_SSE_ALIGN__
# if defined(_WIN32) && !defined(FREE_WINDOWS)
@@ -62,7 +62,6 @@
# define ccl_may_alias
# define ccl_always_inline __forceinline
# define ccl_never_inline __declspec(noinline)
-# define ccl_maybe_unused
# else /* _WIN32 && !FREE_WINDOWS */
# define ccl_device_inline static inline __attribute__((always_inline))
# define ccl_device_forceinline static inline __attribute__((always_inline))
@@ -74,7 +73,6 @@
# define ccl_may_alias __attribute__((__may_alias__))
# define ccl_always_inline __attribute__((always_inline))
# define ccl_never_inline __attribute__((noinline))
-# define ccl_maybe_unused __attribute__((used))
# endif /* _WIN32 && !FREE_WINDOWS */
/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h
index a8d4ee75e20..d9edfec5da3 100644
--- a/intern/cycles/util/util_half.h
+++ b/intern/cycles/util/util_half.h
@@ -28,14 +28,8 @@ CCL_NAMESPACE_BEGIN
/* Half Floats */
-#ifdef __KERNEL_OPENCL__
-
-# define float4_store_half(h, f, scale) vstore_half4(f *(scale), 0, h);
-
-#else
-
/* CUDA has its own half data type, no need to define it here. */
-# ifndef __KERNEL_CUDA__
+#ifndef __KERNEL_CUDA__
/* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
* unsigned shorts. */
class half {
@@ -59,27 +53,27 @@ class half {
private:
unsigned short v;
};
-# endif
+#endif
struct half4 {
half x, y, z, w;
};
-# ifdef __KERNEL_CUDA__
+#ifdef __KERNEL_CUDA__
-ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
+ccl_device_inline void float4_store_half(half *h, float4 f)
{
- h[0] = __float2half(f.x * scale);
- h[1] = __float2half(f.y * scale);
- h[2] = __float2half(f.z * scale);
- h[3] = __float2half(f.w * scale);
+ h[0] = __float2half(f.x);
+ h[1] = __float2half(f.y);
+ h[2] = __float2half(f.z);
+ h[3] = __float2half(f.w);
}
-# else
+#else
-ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
+ccl_device_inline void float4_store_half(half *h, float4 f)
{
-# ifndef __KERNEL_SSE2__
+# ifndef __KERNEL_SSE2__
for (int i = 0; i < 4; i++) {
/* optimized float to half for pixels:
* assumes no negative, no nan, no inf, and sets denormal to 0 */
@@ -87,8 +81,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
uint i;
float f;
} in;
- float fscale = f[i] * scale;
- in.f = (fscale > 0.0f) ? ((fscale < 65504.0f) ? fscale : 65504.0f) : 0.0f;
+ in.f = (f[i] > 0.0f) ? ((f[i] < 65504.0f) ? f[i] : 65504.0f) : 0.0f;
int x = in.i;
int absolute = x & 0x7FFFFFFF;
@@ -98,23 +91,22 @@ ccl_device_inline void float4_store_half(half *h, float4 f, float scale)
h[i] = (rshift & 0x7FFF);
}
-# else
+# else
/* same as above with SSE */
- ssef fscale = load4f(f) * scale;
- ssef x = min(max(fscale, 0.0f), 65504.0f);
+ ssef x = min(max(load4f(f), 0.0f), 65504.0f);
-# ifdef __KERNEL_AVX2__
+# ifdef __KERNEL_AVX2__
ssei rpack = _mm_cvtps_ph(x, 0);
-# else
+# else
ssei absolute = cast(x) & 0x7FFFFFFF;
ssei Z = absolute + 0xC8000000;
ssei result = andnot(absolute < 0x38800000, Z);
ssei rshift = (result >> 13) & 0x7FFF;
ssei rpack = _mm_packs_epi32(rshift, rshift);
-# endif
+# endif
_mm_storel_pi((__m64 *)h, _mm_castsi128_ps(rpack));
-# endif
+# endif
}
ccl_device_inline float half_to_float(half h)
@@ -160,8 +152,6 @@ ccl_device_inline half float_to_half(float f)
return (value_bits | sign_bit);
}
-# endif
-
#endif
CCL_NAMESPACE_END
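
The scalar conversion above is a compact bit trick: float32 stores an exponent biased by 127 with 23 mantissa bits, while half uses a bias of 15 with 10 mantissa bits. Adding 0xC8000000 is the same (modulo 2^32) as subtracting (127 - 15) << 23, which re-biases the exponent in place; shifting right by 13 then discards the extra mantissa bits, and inputs whose bits fall below 0x38800000 (values under 2^-14, which would become half denormals) are flushed to zero. A standalone sketch under the same assumptions the kernel comment states (non-negative, finite, already clamped to 65504):

#include <cstdint>
#include <cstring>

static inline uint16_t float_to_half_bits_sketch(const float f)
{
  uint32_t x;
  std::memcpy(&x, &f, sizeof(x)); /* type-pun through memcpy */
  const uint32_t absolute = x & 0x7FFFFFFFu;
  const uint32_t rebiased = absolute + 0xC8000000u; /* exponent bias 127 -> 15 */
  const uint32_t result = (absolute < 0x38800000u) ? 0 : rebiased; /* flush denormals */
  return (uint16_t)((result >> 13) & 0x7FFF); /* drop the 13 extra mantissa bits */
}

/* float_to_half_bits_sketch(1.0f) == 0x3C00, the half-precision encoding of 1.0. */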
diff --git a/intern/cycles/util/util_logging.h b/intern/cycles/util/util_logging.h
index c161299acd0..35c2d436d09 100644
--- a/intern/cycles/util/util_logging.h
+++ b/intern/cycles/util/util_logging.h
@@ -49,6 +49,7 @@ class LogMessageVoidify {
# define LOG(severity) LOG_SUPPRESS()
# define VLOG(severity) LOG_SUPPRESS()
# define VLOG_IF(severity, condition) LOG_SUPPRESS()
+# define VLOG_IS_ON(severity) false
# define CHECK(expression) LOG_SUPPRESS()
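
VLOG_IS_ON is the usual glog guard for skipping expensive log-message construction entirely; stubbing it out to false keeps such call sites compiling (and dead-code-eliminated) when logging support is disabled. A typical, purely illustrative call site:

if (VLOG_IS_ON(3)) {
  /* Only build the expensive report when verbose logging is enabled;
   * build_debug_report() is a hypothetical helper, not part of Cycles. */
  VLOG(3) << build_debug_report();
}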
diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h
index c5996ebfcb6..6d728dde679 100644
--- a/intern/cycles/util/util_math.h
+++ b/intern/cycles/util/util_math.h
@@ -26,11 +26,9 @@
# include <cmath>
#endif
-#ifndef __KERNEL_OPENCL__
-# include <float.h>
-# include <math.h>
-# include <stdio.h>
-#endif /* __KERNEL_OPENCL__ */
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
#include "util/util_types.h"
@@ -86,7 +84,6 @@ CCL_NAMESPACE_BEGIN
/* Scalar */
#ifdef _WIN32
-# ifndef __KERNEL_OPENCL__
ccl_device_inline float fmaxf(float a, float b)
{
return (a > b) ? a : b;
@@ -96,8 +93,7 @@ ccl_device_inline float fminf(float a, float b)
{
return (a < b) ? a : b;
}
-# endif /* !__KERNEL_OPENCL__ */
-#endif /* _WIN32 */
+#endif /* _WIN32 */
#ifndef __KERNEL_GPU__
using std::isfinite;
@@ -119,6 +115,11 @@ ccl_device_inline int min(int a, int b)
return (a < b) ? a : b;
}
+ccl_device_inline uint min(uint a, uint b)
+{
+ return (a < b) ? a : b;
+}
+
ccl_device_inline float max(float a, float b)
{
return (a > b) ? a : b;
@@ -166,7 +167,6 @@ ccl_device_inline float max4(float a, float b, float c, float d)
return max(max(a, b), max(c, d));
}
-#ifndef __KERNEL_OPENCL__
/* Int/Float conversion */
ccl_device_inline int as_int(uint i)
@@ -241,24 +241,23 @@ ccl_device_inline float __uint_as_float(uint i)
ccl_device_inline int4 __float4_as_int4(float4 f)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(f.m128));
-# else
+#else
return make_int4(
__float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z), __float_as_int(f.w));
-# endif
+#endif
}
ccl_device_inline float4 __int4_as_float4(int4 i)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_castsi128_ps(i.m128));
-# else
+#else
return make_float4(
__int_as_float(i.x), __int_as_float(i.y), __int_as_float(i.z), __int_as_float(i.w));
-# endif
+#endif
}
-#endif /* __KERNEL_OPENCL__ */
/* Versions of functions which are safe for fast math. */
ccl_device_inline bool isnan_safe(float f)
@@ -279,7 +278,6 @@ ccl_device_inline float ensure_finite(float v)
return isfinite_safe(v) ? v : 0.0f;
}
-#ifndef __KERNEL_OPENCL__
ccl_device_inline int clamp(int a, int mn, int mx)
{
return min(max(a, mn), mx);
@@ -309,8 +307,6 @@ ccl_device_inline float smoothstep(float edge0, float edge1, float x)
return result;
}
-#endif /* __KERNEL_OPENCL__ */
-
#ifndef __KERNEL_CUDA__
ccl_device_inline float saturate(float a)
{
@@ -451,7 +447,6 @@ CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN
-#ifndef __KERNEL_OPENCL__
/* Interpolation */
template<class A, class B> A lerp(const A &a, const A &b, const B &t)
@@ -459,15 +454,9 @@ template<class A, class B> A lerp(const A &a, const A &b, const B &t)
return (A)(a * ((B)1 - t) + b * t);
}
-#endif /* __KERNEL_OPENCL__ */
-
/* Triangle */
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float triangle_area(const float3 &v1, const float3 &v2, const float3 &v3)
-#else
-ccl_device_inline float triangle_area(const float3 v1, const float3 v2, const float3 v3)
-#endif
{
return len(cross(v3 - v2, v1 - v2)) * 0.5f;
}
@@ -665,11 +654,7 @@ ccl_device_inline float pow22(float a)
ccl_device_inline float beta(float x, float y)
{
-#ifndef __KERNEL_OPENCL__
return expf(lgammaf(x) + lgammaf(y) - lgammaf(x + y));
-#else
- return expf(lgamma(x) + lgamma(y) - lgamma(x + y));
-#endif
}
ccl_device_inline float xor_signmask(float x, int y)
@@ -686,8 +671,6 @@ ccl_device_inline uint count_leading_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
return __clz(x);
-#elif defined(__KERNEL_OPENCL__)
- return clz(x);
#else
assert(x != 0);
# ifdef _MSC_VER
@@ -704,8 +687,6 @@ ccl_device_inline uint count_trailing_zeros(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
return (__ffs(x) - 1);
-#elif defined(__KERNEL_OPENCL__)
- return (31 - count_leading_zeros(x & -x));
#else
assert(x != 0);
# ifdef _MSC_VER
@@ -722,8 +703,6 @@ ccl_device_inline uint find_first_set(uint x)
{
#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__)
return __ffs(x);
-#elif defined(__KERNEL_OPENCL__)
- return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0;
#else
# ifdef _MSC_VER
return (x != 0) ? (32 - count_leading_zeros(x & (-x))) : 0;
@@ -797,6 +776,52 @@ ccl_device_inline float precise_angle(float3 a, float3 b)
return 2.0f * atan2f(len(a - b), len(a + b));
}
+/* Return the smallest power of two which is strictly greater than the given value. */
+ccl_device_inline uint next_power_of_two(uint x)
+{
+ return x == 0 ? 1 : 1 << (32 - count_leading_zeros(x));
+}
+
+/* Return the largest power of two which is strictly lower than the given value
+ * (values below 2 are returned unchanged). */
+ccl_device_inline uint prev_power_of_two(uint x)
+{
+ return x < 2 ? x : 1 << (31 - count_leading_zeros(x - 1));
+}
+
+#ifndef __has_builtin
+# define __has_builtin(v) 0
+#endif
+
+/* Reverses the bits of a 32 bit integer. */
+ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
+{
+ /* Use a native instruction if it exists. */
+#if defined(__arm__) || defined(__aarch64__)
+ __asm__("rbit %w0, %w1" : "=r"(x) : "r"(x));
+ return x;
+#elif defined(__KERNEL_CUDA__)
+ return __brev(x);
+#elif __has_builtin(__builtin_bitreverse32)
+ return __builtin_bitreverse32(x);
+#else
+ /* Swap adjacent bits. */
+ x = ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
+ /* Swap bit pairs. */
+ x = ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
+ /* Swap nibbles. */
+ x = ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
+ /* Swap bytes. CPUs typically have a fast instruction for this. */
+# ifdef _MSC_VER
+ return _byteswap_ulong(x);
+# elif defined(__INTEL_COMPILER)
+ return (uint32_t)_bswap((int)x);
+# else
+ /* Assuming gcc or clang. */
+ return __builtin_bswap32(x);
+# endif
+#endif
+}
+
CCL_NAMESPACE_END
#endif /* __UTIL_MATH_H__ */
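
Both power-of-two helpers lean on count_leading_zeros, which the header maps to a per-platform intrinsic. Written directly against the GCC/Clang builtin, the technique and the expectations from util_math_test.cpp look like this (a self-contained sketch, not the Cycles code itself):

#include <cassert>
#include <cstdint>

static inline uint32_t next_pow2(const uint32_t x)
{
  return x == 0 ? 1 : 1u << (32 - __builtin_clz(x)); /* strictly greater than x */
}

static inline uint32_t prev_pow2(const uint32_t x)
{
  return x < 2 ? x : 1u << (31 - __builtin_clz(x - 1)); /* strictly lower than x */
}

int main()
{
  assert(next_pow2(3) == 4 && next_pow2(4) == 8);
  assert(prev_pow2(5) == 4 && prev_pow2(8) == 4);
  return 0;
}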
diff --git a/intern/cycles/util/util_math_float2.h b/intern/cycles/util/util_math_float2.h
index 17f6f3c9382..70b80c33544 100644
--- a/intern/cycles/util/util_math_float2.h
+++ b/intern/cycles/util/util_math_float2.h
@@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float2 operator-(const float2 &a);
ccl_device_inline float2 operator*(const float2 &a, const float2 &b);
ccl_device_inline float2 operator*(const float2 &a, float f);
@@ -64,7 +63,6 @@ ccl_device_inline float2 fabs(const float2 &a);
ccl_device_inline float2 as_float2(const float4 &a);
ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
ccl_device_inline float2 floor(const float2 &a);
-#endif /* !__KERNEL_OPENCL__ */
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);
@@ -82,7 +80,6 @@ ccl_device_inline float2 one_float2()
return make_float2(1.0f, 1.0f);
}
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float2 operator-(const float2 &a)
{
return make_float2(-a.x, -a.y);
@@ -262,8 +259,6 @@ ccl_device_inline float2 floor(const float2 &a)
return make_float2(floorf(a.x), floorf(a.y));
}
-#endif /* !__KERNEL_OPENCL__ */
-
ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
{
return (b != 0.0f) ? a / b : zero_float2();
diff --git a/intern/cycles/util/util_math_float3.h b/intern/cycles/util/util_math_float3.h
index 9673c043189..30a1b4c3f77 100644
--- a/intern/cycles/util/util_math_float3.h
+++ b/intern/cycles/util/util_math_float3.h
@@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float3 operator-(const float3 &a);
ccl_device_inline float3 operator*(const float3 &a, const float3 &b);
ccl_device_inline float3 operator*(const float3 &a, const float f);
@@ -63,7 +62,6 @@ ccl_device_inline float3 rcp(const float3 &a);
ccl_device_inline float3 sqrt(const float3 &a);
ccl_device_inline float3 floor(const float3 &a);
ccl_device_inline float3 ceil(const float3 &a);
-#endif /* !__KERNEL_OPENCL__ */
ccl_device_inline float min3(float3 a);
ccl_device_inline float max3(float3 a);
@@ -105,50 +103,49 @@ ccl_device_inline float3 one_float3()
return make_float3(1.0f, 1.0f, 1.0f);
}
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float3 operator-(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
-# else
+#else
return make_float3(-a.x, -a.y, -a.z);
-# endif
+#endif
}
ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, b.m128));
-# else
+#else
return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
-# endif
+#endif
}
ccl_device_inline float3 operator*(const float3 &a, const float f)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
-# else
+#else
return make_float3(a.x * f, a.y * f, a.z * f);
-# endif
+#endif
}
ccl_device_inline float3 operator*(const float f, const float3 &a)
{
-# if defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE__)
return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
-# else
+#else
return make_float3(a.x * f, a.y * f, a.z * f);
-# endif
+#endif
}
ccl_device_inline float3 operator/(const float f, const float3 &a)
{
-# if defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
-# else
+#else
return make_float3(f / a.x, f / a.y, f / a.z);
-# endif
+#endif
}
ccl_device_inline float3 operator/(const float3 &a, const float f)
@@ -159,11 +156,11 @@ ccl_device_inline float3 operator/(const float3 &a, const float f)
ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
{
-# if defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE__)
return float3(_mm_div_ps(a.m128, b.m128));
-# else
+#else
return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
-# endif
+#endif
}
ccl_device_inline float3 operator+(const float3 &a, const float f)
@@ -173,11 +170,11 @@ ccl_device_inline float3 operator+(const float3 &a, const float f)
ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_add_ps(a.m128, b.m128));
-# else
+#else
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-# endif
+#endif
}
ccl_device_inline float3 operator-(const float3 &a, const float f)
@@ -187,11 +184,11 @@ ccl_device_inline float3 operator-(const float3 &a, const float f)
ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_sub_ps(a.m128, b.m128));
-# else
+#else
return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-# endif
+#endif
}
ccl_device_inline float3 operator+=(float3 &a, const float3 &b)
@@ -227,11 +224,11 @@ ccl_device_inline float3 operator/=(float3 &a, float f)
ccl_device_inline bool operator==(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
-# else
+#else
return (a.x == b.x && a.y == b.y && a.z == b.z);
-# endif
+#endif
}
ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
@@ -246,20 +243,20 @@ ccl_device_inline float distance(const float3 &a, const float3 &b)
ccl_device_inline float dot(const float3 &a, const float3 &b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
-# else
+#else
return a.x * b.x + a.y * b.y + a.z * b.z;
-# endif
+#endif
}
ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
-# else
+#else
return a.x * b.x + a.y * b.y;
-# endif
+#endif
}
ccl_device_inline float3 cross(const float3 &a, const float3 &b)
@@ -270,30 +267,30 @@ ccl_device_inline float3 cross(const float3 &a, const float3 &b)
ccl_device_inline float3 normalize(const float3 &a)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
__m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
return float3(_mm_div_ps(a.m128, norm));
-# else
+#else
return a / len(a);
-# endif
+#endif
}
ccl_device_inline float3 min(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_min_ps(a.m128, b.m128));
-# else
+#else
return make_float3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-# endif
+#endif
}
ccl_device_inline float3 max(const float3 &a, const float3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_max_ps(a.m128, b.m128));
-# else
+#else
return make_float3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-# endif
+#endif
}
ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx)
@@ -303,43 +300,43 @@ ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &
ccl_device_inline float3 fabs(const float3 &a)
{
-# ifdef __KERNEL_SSE__
-# ifdef __KERNEL_NEON__
+#ifdef __KERNEL_SSE__
+# ifdef __KERNEL_NEON__
return float3(vabsq_f32(a.m128));
-# else
+# else
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
return float3(_mm_and_ps(a.m128, mask));
-# endif
-# else
- return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
# endif
+#else
+ return make_float3(fabsf(a.x), fabsf(a.y), fabsf(a.z));
+#endif
}
ccl_device_inline float3 sqrt(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_sqrt_ps(a));
-# else
+#else
return make_float3(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z));
-# endif
+#endif
}
ccl_device_inline float3 floor(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_floor_ps(a));
-# else
+#else
return make_float3(floorf(a.x), floorf(a.y), floorf(a.z));
-# endif
+#endif
}
ccl_device_inline float3 ceil(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float3(_mm_ceil_ps(a));
-# else
+#else
return make_float3(ceilf(a.x), ceilf(a.y), ceilf(a.z));
-# endif
+#endif
}
ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
@@ -349,14 +346,13 @@ ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
ccl_device_inline float3 rcp(const float3 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
return float3(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-# else
+#else
return make_float3(1.0f / a.x, 1.0f / a.y, 1.0f / a.z);
-# endif
+#endif
}
-#endif /* !__KERNEL_OPENCL__ */
ccl_device_inline float min3(float3 a)
{
@@ -483,11 +479,7 @@ ccl_device_inline float average(const float3 a)
ccl_device_inline bool isequal_float3(const float3 a, const float3 b)
{
-#ifdef __KERNEL_OPENCL__
- return all(a == b);
-#else
return a == b;
-#endif
}
ccl_device_inline float3 pow3(float3 v, float e)
diff --git a/intern/cycles/util/util_math_float4.h b/intern/cycles/util/util_math_float4.h
index 0ba2bafa2f0..19af5c8c638 100644
--- a/intern/cycles/util/util_math_float4.h
+++ b/intern/cycles/util/util_math_float4.h
@@ -27,7 +27,6 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float4 operator-(const float4 &a);
ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
ccl_device_inline float4 operator*(const float4 &a, float f);
@@ -66,7 +65,6 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
ccl_device_inline float4 fabs(const float4 &a);
ccl_device_inline float4 floor(const float4 &a);
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
-#endif /* !__KERNEL_OPENCL__*/
ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b);
@@ -112,33 +110,32 @@ ccl_device_inline float4 one_float4()
return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
}
-#ifndef __KERNEL_OPENCL__
ccl_device_inline float4 operator-(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
__m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
return float4(_mm_xor_ps(a.m128, mask));
-# else
+#else
return make_float4(-a.x, -a.y, -a.z, -a.w);
-# endif
+#endif
}
ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_mul_ps(a.m128, b.m128));
-# else
+#else
return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
-# endif
+#endif
}
ccl_device_inline float4 operator*(const float4 &a, float f)
{
-# if defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE__)
return a * make_float4(f);
-# else
+#else
return make_float4(a.x * f, a.y * f, a.z * f, a.w * f);
-# endif
+#endif
}
ccl_device_inline float4 operator*(float f, const float4 &a)
@@ -153,11 +150,11 @@ ccl_device_inline float4 operator/(const float4 &a, float f)
ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_div_ps(a.m128, b.m128));
-# else
+#else
return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
-# endif
+#endif
}
ccl_device_inline float4 operator+(const float4 &a, const float f)
@@ -167,11 +164,11 @@ ccl_device_inline float4 operator+(const float4 &a, const float f)
ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_add_ps(a.m128, b.m128));
-# else
+#else
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
-# endif
+#endif
}
ccl_device_inline float4 operator-(const float4 &a, const float f)
@@ -181,11 +178,11 @@ ccl_device_inline float4 operator-(const float4 &a, const float f)
ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_sub_ps(a.m128, b.m128));
-# else
+#else
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
-# endif
+#endif
}
ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
@@ -215,38 +212,38 @@ ccl_device_inline float4 operator/=(float4 &a, float f)
ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
-# else
+#else
return make_int4(a.x < b.x, a.y < b.y, a.z < b.z, a.w < b.w);
-# endif
+#endif
}
ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
-# else
+#else
return make_int4(a.x >= b.x, a.y >= b.y, a.z >= b.z, a.w >= b.w);
-# endif
+#endif
}
ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
-# else
+#else
return make_int4(a.x <= b.x, a.y <= b.y, a.z <= b.z, a.w <= b.w);
-# endif
+#endif
}
ccl_device_inline bool operator==(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
-# else
+#else
return (a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w);
-# endif
+#endif
}
ccl_device_inline float distance(const float4 &a, const float4 &b)
@@ -256,16 +253,16 @@ ccl_device_inline float distance(const float4 &a, const float4 &b)
ccl_device_inline float dot(const float4 &a, const float4 &b)
{
-# if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
__m128 t = vmulq_f32(a, b);
return vaddvq_f32(t);
-# else
- return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-# endif
# else
- return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+ return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
# endif
+#else
+ return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
+#endif
}
ccl_device_inline float len_squared(const float4 &a)
@@ -275,21 +272,21 @@ ccl_device_inline float len_squared(const float4 &a)
ccl_device_inline float4 rcp(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
/* Don't use _mm_rcp_ps due to poor precision. */
return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
-# else
+#else
return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
-# endif
+#endif
}
ccl_device_inline float4 sqrt(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_sqrt_ps(a.m128));
-# else
+#else
return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
-# endif
+#endif
}
ccl_device_inline float4 sqr(const float4 &a)
@@ -299,39 +296,39 @@ ccl_device_inline float4 sqr(const float4 &a)
ccl_device_inline float4 cross(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
(shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
-# else
+#else
return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
-# endif
+#endif
}
ccl_device_inline bool is_zero(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return a == make_float4(0.0f);
-# else
+#else
return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
-# endif
+#endif
}
ccl_device_inline float4 reduce_add(const float4 &a)
{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
+#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
return float4(vdupq_n_f32(vaddvq_f32(a)));
-# elif defined(__KERNEL_SSE3__)
+# elif defined(__KERNEL_SSE3__)
float4 h(_mm_hadd_ps(a.m128, a.m128));
return float4(_mm_hadd_ps(h.m128, h.m128));
-# else
+# else
float4 h(shuffle<1, 0, 3, 2>(a) + a);
return shuffle<2, 3, 0, 1>(h) + h;
-# endif
-# else
+# endif
+#else
float sum = (a.x + a.y) + (a.z + a.w);
return make_float4(sum, sum, sum, sum);
-# endif
+#endif
}
ccl_device_inline float average(const float4 &a)
@@ -357,20 +354,20 @@ ccl_device_inline float4 safe_normalize(const float4 &a)
ccl_device_inline float4 min(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_min_ps(a.m128, b.m128));
-# else
+#else
return make_float4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
-# endif
+#endif
}
ccl_device_inline float4 max(const float4 &a, const float4 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_max_ps(a.m128, b.m128));
-# else
+#else
return make_float4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
-# endif
+#endif
}
ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
@@ -380,24 +377,24 @@ ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &
ccl_device_inline float4 fabs(const float4 &a)
{
-# if defined(__KERNEL_SSE__)
-# if defined(__KERNEL_NEON__)
+#if defined(__KERNEL_SSE__)
+# if defined(__KERNEL_NEON__)
return float4(vabsq_f32(a));
-# else
- return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
-# endif
# else
- return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+ return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
# endif
+#else
+ return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+#endif
}
ccl_device_inline float4 floor(const float4 &a)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return float4(_mm_floor_ps(a));
-# else
+#else
return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
-# endif
+#endif
}
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
@@ -405,8 +402,6 @@ ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
return a + t * (b - a);
}
-#endif /* !__KERNEL_OPENCL__*/
-
#ifdef __KERNEL_SSE__
template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
__forceinline const float4 shuffle(const float4 &b)
diff --git a/intern/cycles/util/util_math_int2.h b/intern/cycles/util/util_math_int2.h
index 0295cd51f7e..5782b878801 100644
--- a/intern/cycles/util/util_math_int2.h
+++ b/intern/cycles/util/util_math_int2.h
@@ -27,20 +27,17 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline bool operator==(const int2 a, const int2 b);
ccl_device_inline int2 operator+(const int2 &a, const int2 &b);
ccl_device_inline int2 operator+=(int2 &a, const int2 &b);
ccl_device_inline int2 operator-(const int2 &a, const int2 &b);
ccl_device_inline int2 operator*(const int2 &a, const int2 &b);
ccl_device_inline int2 operator/(const int2 &a, const int2 &b);
-#endif /* !__KERNEL_OPENCL__ */
/*******************************************************************************
* Definition.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline bool operator==(const int2 a, const int2 b)
{
return (a.x == b.x && a.y == b.y);
@@ -70,7 +67,6 @@ ccl_device_inline int2 operator/(const int2 &a, const int2 &b)
{
return make_int2(a.x / b.x, a.y / b.y);
}
-#endif /* !__KERNEL_OPENCL__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_math_int3.h b/intern/cycles/util/util_math_int3.h
index d92ed895dc2..e0dfae7c015 100644
--- a/intern/cycles/util/util_math_int3.h
+++ b/intern/cycles/util/util_math_int3.h
@@ -27,52 +27,49 @@ CCL_NAMESPACE_BEGIN
* Declaration.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline int3 min(int3 a, int3 b);
ccl_device_inline int3 max(int3 a, int3 b);
ccl_device_inline int3 clamp(const int3 &a, int mn, int mx);
ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx);
-#endif /* !__KERNEL_OPENCL__ */
/*******************************************************************************
* Definition.
*/
-#ifndef __KERNEL_OPENCL__
ccl_device_inline int3 min(int3 a, int3 b)
{
-# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
return int3(_mm_min_epi32(a.m128, b.m128));
-# else
+#else
return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
-# endif
+#endif
}
ccl_device_inline int3 max(int3 a, int3 b)
{
-# if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
+#if defined(__KERNEL_SSE__) && defined(__KERNEL_SSE41__)
return int3(_mm_max_epi32(a.m128, b.m128));
-# else
+#else
return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
-# endif
+#endif
}
ccl_device_inline int3 clamp(const int3 &a, int mn, int mx)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return min(max(a, make_int3(mn)), make_int3(mx));
-# else
+#else
return make_int3(clamp(a.x, mn, mx), clamp(a.y, mn, mx), clamp(a.z, mn, mx));
-# endif
+#endif
}
ccl_device_inline int3 clamp(const int3 &a, int3 &mn, int mx)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return min(max(a, mn), make_int3(mx));
-# else
+#else
return make_int3(clamp(a.x, mn.x, mx), clamp(a.y, mn.y, mx), clamp(a.z, mn.z, mx));
-# endif
+#endif
}
ccl_device_inline bool operator==(const int3 &a, const int3 &b)
@@ -92,22 +89,21 @@ ccl_device_inline bool operator<(const int3 &a, const int3 &b)
ccl_device_inline int3 operator+(const int3 &a, const int3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int3(_mm_add_epi32(a.m128, b.m128));
-# else
+#else
return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
-# endif
+#endif
}
ccl_device_inline int3 operator-(const int3 &a, const int3 &b)
{
-# ifdef __KERNEL_SSE__
+#ifdef __KERNEL_SSE__
return int3(_mm_sub_epi32(a.m128, b.m128));
-# else
+#else
return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
-# endif
+#endif
}
-#endif /* !__KERNEL_OPENCL__ */
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_path.cpp b/intern/cycles/util/util_path.cpp
index 8905c8bc7f0..c78f4615013 100644
--- a/intern/cycles/util/util_path.cpp
+++ b/intern/cycles/util/util_path.cpp
@@ -66,6 +66,7 @@ typedef struct stat path_stat_t;
static string cached_path = "";
static string cached_user_path = "";
+static string cached_temp_path = "";
static string cached_xdg_cache_path = "";
namespace {
@@ -335,10 +336,11 @@ static string path_xdg_cache_get()
}
#endif
-void path_init(const string &path, const string &user_path)
+void path_init(const string &path, const string &user_path, const string &temp_path)
{
cached_path = path;
cached_user_path = user_path;
+ cached_temp_path = temp_path;
#ifdef _MSC_VER
// workaround for https://svn.boost.org/trac/boost/ticket/6320
@@ -382,6 +384,15 @@ string path_cache_get(const string &sub)
#endif
}
+string path_temp_get(const string &sub)
+{
+ if (cached_temp_path == "") {
+ cached_temp_path = Filesystem::temp_directory_path();
+ }
+
+ return path_join(cached_temp_path, sub);
+}
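
The temp path is cached from path_init() and resolved lazily: when the host application never supplied one, OIIO's Filesystem::temp_directory_path() fills it in on first use. A hypothetical call site (the file name is illustrative, not from this patch):

const string dump_path = path_temp_get("cycles_debug_dump.exr");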
+
#if defined(__linux__) || defined(__APPLE__)
string path_xdg_home_get(const string &sub = "");
#endif
@@ -739,177 +750,6 @@ bool path_remove(const string &path)
return remove(path.c_str()) == 0;
}
-struct SourceReplaceState {
- typedef map<string, string> ProcessedMapping;
- /* Base director for all relative include headers. */
- string base;
- /* Result of processed files. */
- ProcessedMapping processed_files;
- /* Set of files which are considered "precompiled" and which are replaced
- * with and empty string on a subsequent occurrence in include statement.
- */
- set<string> precompiled_headers;
-};
-
-static string path_source_replace_includes_recursive(const string &source,
- const string &source_filepath,
- SourceReplaceState *state);
-
-static string line_directive(const SourceReplaceState &state, const string &path, const int line)
-{
- string unescaped_path = path;
- /* First we make path relative. */
- if (string_startswith(unescaped_path, state.base.c_str())) {
- const string base_file = path_filename(state.base);
- const size_t base_len = state.base.length();
- unescaped_path = base_file +
- unescaped_path.substr(base_len, unescaped_path.length() - base_len);
- }
- /* Second, we replace all unsafe characters. */
- const size_t length = unescaped_path.length();
- string escaped_path = "";
- for (size_t i = 0; i < length; ++i) {
- const char ch = unescaped_path[i];
- if (strchr("\"\'\?\\", ch) != NULL) {
- escaped_path += "\\";
- }
- escaped_path += ch;
- }
- /* TODO(sergey): Check whether using std::to_string combined with several
- * concatenation operations is any faster.
- */
- return string_printf("#line %d \"%s\"", line, escaped_path.c_str());
-}
-
-static string path_source_handle_preprocessor(const string &preprocessor_line,
- const string &source_filepath,
- const size_t line_number,
- SourceReplaceState *state)
-{
- string result = preprocessor_line;
- string token = string_strip(preprocessor_line.substr(1, preprocessor_line.size() - 1));
- if (string_startswith(token, "include")) {
- token = string_strip(token.substr(7, token.size() - 7));
- if (token[0] == '"') {
- const size_t n_start = 1;
- const size_t n_end = token.find("\"", n_start);
- const string filename = token.substr(n_start, n_end - n_start);
- const bool is_precompiled = string_endswith(token, "// PRECOMPILED");
- string filepath = path_join(state->base, filename);
- if (!path_exists(filepath)) {
- filepath = path_join(path_dirname(source_filepath), filename);
- }
- if (is_precompiled) {
- state->precompiled_headers.insert(filepath);
- }
- string text;
- if (path_read_text(filepath, text)) {
- text = path_source_replace_includes_recursive(text, filepath, state);
- /* Use line directives for better error messages. */
- result = line_directive(*state, filepath, 1) + "\n" + text + "\n" +
- line_directive(*state, source_filepath, line_number + 1);
- }
- }
- }
- return result;
-}
-
-/* Our own little c preprocessor that replaces #includes with the file
- * contents, to work around issue of OpenCL drivers not supporting
- * include paths with spaces in them.
- */
-static string path_source_replace_includes_recursive(const string &source,
- const string &source_filepath,
- SourceReplaceState *state)
-{
- /* Try to re-use processed file without spending time on replacing all
- * include directives again.
- */
- SourceReplaceState::ProcessedMapping::iterator replaced_file = state->processed_files.find(
- source_filepath);
- if (replaced_file != state->processed_files.end()) {
- if (state->precompiled_headers.find(source_filepath) != state->precompiled_headers.end()) {
- return "";
- }
- return replaced_file->second;
- }
- /* Perform full file processing. */
- string result = "";
- const size_t source_length = source.length();
- size_t index = 0;
- /* Information about where we are in the source. */
- size_t line_number = 0, column_number = 1;
- /* Currently gathered non-preprocessor token.
- * Store as start/length rather than token itself to avoid overhead of
- * memory re-allocations on each character concatenation.
- */
- size_t token_start = 0, token_length = 0;
- /* Denotes whether we're inside of preprocessor line, together with
- * preprocessor line itself.
- *
- * TODO(sergey): Investigate whether using token start/end position
- * gives measurable speedup.
- */
- bool inside_preprocessor = false;
- string preprocessor_line = "";
- /* Actual loop over the whole source. */
- while (index < source_length) {
- const char ch = source[index];
- if (ch == '\n') {
- if (inside_preprocessor) {
- result += path_source_handle_preprocessor(
- preprocessor_line, source_filepath, line_number, state);
- /* Start gathering net part of the token. */
- token_start = index;
- token_length = 0;
- }
- inside_preprocessor = false;
- preprocessor_line = "";
- column_number = 0;
- ++line_number;
- }
- else if (ch == '#' && column_number == 1 && !inside_preprocessor) {
- /* Append all possible non-preprocessor token to the result. */
- if (token_length != 0) {
- result.append(source, token_start, token_length);
- token_start = index;
- token_length = 0;
- }
- inside_preprocessor = true;
- }
- if (inside_preprocessor) {
- preprocessor_line += ch;
- }
- else {
- ++token_length;
- }
- ++index;
- ++column_number;
- }
- /* Append possible tokens which happened before special events handled
- * above.
- */
- if (token_length != 0) {
- result.append(source, token_start, token_length);
- }
- if (inside_preprocessor) {
- result += path_source_handle_preprocessor(
- preprocessor_line, source_filepath, line_number, state);
- }
- /* Store result for further reuse. */
- state->processed_files[source_filepath] = result;
- return result;
-}
-
-string path_source_replace_includes(const string &source,
- const string &path,
- const string &source_filename)
-{
- SourceReplaceState state;
- state.base = path;
- return path_source_replace_includes_recursive(source, path_join(path, source_filename), &state);
-}
-
FILE *path_fopen(const string &path, const string &mode)
{
#ifdef _WIN32
diff --git a/intern/cycles/util/util_path.h b/intern/cycles/util/util_path.h
index 7a83c2135a4..f899bc2e01c 100644
--- a/intern/cycles/util/util_path.h
+++ b/intern/cycles/util/util_path.h
@@ -32,9 +32,10 @@
CCL_NAMESPACE_BEGIN
/* program paths */
-void path_init(const string &path = "", const string &user_path = "");
+void path_init(const string &path = "", const string &user_path = "", const string &tmp_path = "");
string path_get(const string &sub = "");
string path_user_get(const string &sub = "");
+string path_temp_get(const string &sub = "");
string path_cache_get(const string &sub = "");
/* path string manipulation */
@@ -65,11 +66,6 @@ bool path_read_text(const string &path, string &text);
/* File manipulation. */
bool path_remove(const string &path);
-/* source code utility */
-string path_source_replace_includes(const string &source,
- const string &path,
- const string &source_filename = "");
-
/* cache utility */
void path_cache_clear_except(const string &name, const set<string> &except);
diff --git a/intern/cycles/util/util_profiling.cpp b/intern/cycles/util/util_profiling.cpp
index 073b09f719f..5343f076e22 100644
--- a/intern/cycles/util/util_profiling.cpp
+++ b/intern/cycles/util/util_profiling.cpp
@@ -48,13 +48,7 @@ void Profiler::run()
}
if (cur_shader >= 0 && cur_shader < shader_samples.size()) {
- /* Only consider the active shader during events whose runtime significantly depends on it.
- */
- if (((cur_event >= PROFILING_SHADER_EVAL) && (cur_event <= PROFILING_SUBSURFACE)) ||
- ((cur_event >= PROFILING_CLOSURE_EVAL) &&
- (cur_event <= PROFILING_CLOSURE_VOLUME_SAMPLE))) {
- shader_samples[cur_shader]++;
- }
+ shader_samples[cur_shader]++;
}
if (cur_object >= 0 && cur_object < object_samples.size()) {
diff --git a/intern/cycles/util/util_profiling.h b/intern/cycles/util/util_profiling.h
index ceec08ed894..96bb682c50e 100644
--- a/intern/cycles/util/util_profiling.h
+++ b/intern/cycles/util/util_profiling.h
@@ -28,38 +28,30 @@ CCL_NAMESPACE_BEGIN
enum ProfilingEvent : uint32_t {
PROFILING_UNKNOWN,
PROFILING_RAY_SETUP,
- PROFILING_PATH_INTEGRATE,
- PROFILING_SCENE_INTERSECT,
- PROFILING_INDIRECT_EMISSION,
- PROFILING_VOLUME,
- PROFILING_SHADER_SETUP,
- PROFILING_SHADER_EVAL,
- PROFILING_SHADER_APPLY,
- PROFILING_AO,
- PROFILING_SUBSURFACE,
- PROFILING_CONNECT_LIGHT,
- PROFILING_SURFACE_BOUNCE,
- PROFILING_WRITE_RESULT,
-
- PROFILING_INTERSECT,
- PROFILING_INTERSECT_LOCAL,
- PROFILING_INTERSECT_SHADOW_ALL,
- PROFILING_INTERSECT_VOLUME,
- PROFILING_INTERSECT_VOLUME_ALL,
-
- PROFILING_CLOSURE_EVAL,
- PROFILING_CLOSURE_SAMPLE,
- PROFILING_CLOSURE_VOLUME_EVAL,
- PROFILING_CLOSURE_VOLUME_SAMPLE,
-
- PROFILING_DENOISING,
- PROFILING_DENOISING_CONSTRUCT_TRANSFORM,
- PROFILING_DENOISING_RECONSTRUCT,
- PROFILING_DENOISING_DIVIDE_SHADOW,
- PROFILING_DENOISING_NON_LOCAL_MEANS,
- PROFILING_DENOISING_COMBINE_HALVES,
- PROFILING_DENOISING_GET_FEATURE,
- PROFILING_DENOISING_DETECT_OUTLIERS,
+
+ PROFILING_INTERSECT_CLOSEST,
+ PROFILING_INTERSECT_SUBSURFACE,
+ PROFILING_INTERSECT_SHADOW,
+ PROFILING_INTERSECT_VOLUME_STACK,
+
+ PROFILING_SHADE_SURFACE_SETUP,
+ PROFILING_SHADE_SURFACE_EVAL,
+ PROFILING_SHADE_SURFACE_DIRECT_LIGHT,
+ PROFILING_SHADE_SURFACE_INDIRECT_LIGHT,
+ PROFILING_SHADE_SURFACE_AO,
+ PROFILING_SHADE_SURFACE_PASSES,
+
+ PROFILING_SHADE_VOLUME_SETUP,
+ PROFILING_SHADE_VOLUME_INTEGRATE,
+ PROFILING_SHADE_VOLUME_DIRECT_LIGHT,
+ PROFILING_SHADE_VOLUME_INDIRECT_LIGHT,
+
+ PROFILING_SHADE_SHADOW_SETUP,
+ PROFILING_SHADE_SHADOW_SURFACE,
+ PROFILING_SHADE_SHADOW_VOLUME,
+
+ PROFILING_SHADE_LIGHT_SETUP,
+ PROFILING_SHADE_LIGHT_EVAL,
PROFILING_NUM_EVENTS,
};
@@ -136,37 +128,51 @@ class ProfilingHelper {
state->event = event;
}
+ ~ProfilingHelper()
+ {
+ state->event = previous_event;
+ }
+
inline void set_event(ProfilingEvent event)
{
state->event = event;
}
- inline void set_shader(int shader)
+ protected:
+ ProfilingState *state;
+ uint32_t previous_event;
+};
+
+class ProfilingWithShaderHelper : public ProfilingHelper {
+ public:
+ ProfilingWithShaderHelper(ProfilingState *state, ProfilingEvent event)
+ : ProfilingHelper(state, event)
{
- state->shader = shader;
- if (state->active) {
- assert(shader < state->shader_hits.size());
- state->shader_hits[shader]++;
- }
}
- inline void set_object(int object)
+ ~ProfilingWithShaderHelper()
{
- state->object = object;
- if (state->active) {
- assert(object < state->object_hits.size());
- state->object_hits[object]++;
- }
+ state->object = -1;
+ state->shader = -1;
}
- ~ProfilingHelper()
+ inline void set_shader(int object, int shader)
{
- state->event = previous_event;
+ if (state->active) {
+ state->shader = shader;
+ state->object = object;
+
+ if (shader >= 0) {
+ assert(shader < state->shader_hits.size());
+ state->shader_hits[shader]++;
+ }
+
+ if (object >= 0) {
+ assert(object < state->object_hits.size());
+ state->object_hits[object]++;
+ }
+ }
}
-
- private:
- ProfilingState *state;
- uint32_t previous_event;
};
CCL_NAMESPACE_END
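
Both helpers are now strictly scope-based: the base class restores the previous event in its destructor, and ProfilingWithShaderHelper additionally clears the object/shader association when the scope ends, so call sites can no longer leak attribution state. A usage sketch with hypothetical identifiers:

void shade_surface(ProfilingState *state, const int object_id, const int shader_id)
{
  ProfilingWithShaderHelper profiling(state, PROFILING_SHADE_SURFACE_EVAL);
  profiling.set_shader(object_id, shader_id);
  /* ... shading work; profiler samples taken here are attributed to the
   * event, object and shader set above ... */
} /* Destructor restores the previous event and resets object/shader to -1. */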
diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h
index 26534a29dfe..dca8d3d0ab5 100644
--- a/intern/cycles/util/util_progress.h
+++ b/intern/cycles/util/util_progress.h
@@ -46,7 +46,6 @@ class Progress {
substatus = "";
sync_status = "";
sync_substatus = "";
- kernel_status = "";
update_cb = function_null;
cancel = false;
cancel_message = "";
@@ -87,7 +86,6 @@ class Progress {
substatus = "";
sync_status = "";
sync_substatus = "";
- kernel_status = "";
cancel = false;
cancel_message = "";
error = false;
@@ -316,24 +314,6 @@ class Progress {
}
}
- /* kernel status */
-
- void set_kernel_status(const string &kernel_status_)
- {
- {
- thread_scoped_lock lock(progress_mutex);
- kernel_status = kernel_status_;
- }
-
- set_update();
- }
-
- void get_kernel_status(string &kernel_status_)
- {
- thread_scoped_lock lock(progress_mutex);
- kernel_status_ = kernel_status;
- }
-
/* callback */
void set_update()
@@ -378,8 +358,6 @@ class Progress {
string sync_status;
string sync_substatus;
- string kernel_status;
-
volatile bool cancel;
string cancel_message;
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index 8e8caa98a1b..b4a153c329f 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -61,14 +61,14 @@ static struct TrueTy {
{
return true;
}
-} True ccl_maybe_unused;
+} True ccl_attr_maybe_unused;
static struct FalseTy {
__forceinline operator bool() const
{
return false;
}
-} False ccl_maybe_unused;
+} False ccl_attr_maybe_unused;
static struct ZeroTy {
__forceinline operator float() const
@@ -79,7 +79,7 @@ static struct ZeroTy {
{
return 0;
}
-} zero ccl_maybe_unused;
+} zero ccl_attr_maybe_unused;
static struct OneTy {
__forceinline operator float() const
@@ -90,7 +90,7 @@ static struct OneTy {
{
return 1;
}
-} one ccl_maybe_unused;
+} one ccl_attr_maybe_unused;
static struct NegInfTy {
__forceinline operator float() const
@@ -101,7 +101,7 @@ static struct NegInfTy {
{
return std::numeric_limits<int>::min();
}
-} neg_inf ccl_maybe_unused;
+} neg_inf ccl_attr_maybe_unused;
static struct PosInfTy {
__forceinline operator float() const
@@ -112,10 +112,10 @@ static struct PosInfTy {
{
return std::numeric_limits<int>::max();
}
-} inf ccl_maybe_unused, pos_inf ccl_maybe_unused;
+} inf ccl_attr_maybe_unused, pos_inf ccl_attr_maybe_unused;
static struct StepTy {
-} step ccl_maybe_unused;
+} step ccl_attr_maybe_unused;
#endif
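
ccl_attr_maybe_unused now maps to the standard C++17 [[maybe_unused]] attribute on every compiler, replacing the old per-compiler ccl_maybe_unused spellings removed from util_defines.h. The attribute silences unused-variable warnings for tag constants like the ones above, e.g.:

[[maybe_unused]] static const int example_tag = 0; /* illustrative; no -Wunused-variable */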
diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h
index d809f2e06d7..7df52d462b7 100644
--- a/intern/cycles/util/util_static_assert.h
+++ b/intern/cycles/util/util_static_assert.h
@@ -24,9 +24,9 @@
CCL_NAMESPACE_BEGIN
-#if defined(__KERNEL_OPENCL__) || defined(CYCLES_CUBIN_CC)
+#if defined(CYCLES_CUBIN_CC)
# define static_assert(statement, message)
-#endif /* __KERNEL_OPENCL__ */
+#endif
#define static_assert_align(st, align) \
static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned") // NOLINT
diff --git a/intern/cycles/util/util_string.cpp b/intern/cycles/util/util_string.cpp
index 4dfebf14923..9c0b2ca50bb 100644
--- a/intern/cycles/util/util_string.cpp
+++ b/intern/cycles/util/util_string.cpp
@@ -17,6 +17,9 @@
#include <stdarg.h>
#include <stdio.h>
+#include <algorithm>
+#include <cctype>
+
#include "util/util_foreach.h"
#include "util/util_string.h"
#include "util/util_windows.h"
@@ -107,24 +110,26 @@ void string_split(vector<string> &tokens,
}
}
-bool string_startswith(const string &s, const char *start)
+bool string_startswith(const string_view s, const string_view start)
{
- size_t len = strlen(start);
+ const size_t len = start.size();
- if (len > s.size())
- return 0;
- else
- return strncmp(s.c_str(), start, len) == 0;
+ if (len > s.size()) {
+ return false;
+ }
+
+ return strncmp(s.c_str(), start.data(), len) == 0;
}
-bool string_endswith(const string &s, const string &end)
+bool string_endswith(const string_view s, const string_view end)
{
- size_t len = end.length();
+ const size_t len = end.size();
- if (len > s.size())
- return 0;
- else
- return s.compare(s.length() - len, len, end) == 0;
+ if (len > s.size()) {
+ return false;
+ }
+
+ return strncmp(s.c_str() + s.size() - len, end.data(), len) == 0;
}
string string_strip(const string &s)
@@ -172,6 +177,13 @@ string to_string(const char *str)
return string(str);
}
+string string_to_lower(const string &s)
+{
+ string r = s;
+  /* Pass `unsigned char` to avoid undefined behavior for negative `char` values. */
+  std::transform(r.begin(), r.end(), r.begin(), [](unsigned char c) { return char(std::tolower(c)); });
+ return r;
+}
+
/* Wide char strings helpers for Windows. */
#ifdef _WIN32
diff --git a/intern/cycles/util/util_string.h b/intern/cycles/util/util_string.h
index f2272819b2f..55462cfd8b8 100644
--- a/intern/cycles/util/util_string.h
+++ b/intern/cycles/util/util_string.h
@@ -21,6 +21,11 @@
#include <string.h>
#include <string>
+/* Use the string view implementation from OIIO.
+ * Ideally we would switch to `std::string_view`, but that first requires getting rid of
+ * `using namespace OIIO`, as it causes symbol collisions. */
+#include <OpenImageIO/string_view.h>
+
#include "util/util_vector.h"
CCL_NAMESPACE_BEGIN
@@ -31,6 +36,8 @@ using std::string;
using std::stringstream;
using std::to_string;
+using OIIO::string_view;
+
#ifdef __GNUC__
# define PRINTF_ATTRIBUTE __attribute__((format(printf, 1, 2)))
#else
@@ -45,12 +52,13 @@ void string_split(vector<string> &tokens,
const string &separators = "\t ",
bool skip_empty_tokens = true);
void string_replace(string &haystack, const string &needle, const string &other);
-bool string_startswith(const string &s, const char *start);
-bool string_endswith(const string &s, const string &end);
+bool string_startswith(string_view s, string_view start);
+bool string_endswith(string_view s, string_view end);
string string_strip(const string &s);
string string_remove_trademark(const string &s);
string string_from_bool(const bool var);
string to_string(const char *str);
+string string_to_lower(const string &s);
/* Wide char strings are only used on Windows to deal with non-ASCII
* characters in file names and such. No reason to use such strings
diff --git a/intern/cycles/util/util_system.cpp b/intern/cycles/util/util_system.cpp
index b010881058b..be8c2fb505a 100644
--- a/intern/cycles/util/util_system.cpp
+++ b/intern/cycles/util/util_system.cpp
@@ -403,4 +403,13 @@ size_t system_physical_ram()
#endif
}
+uint64_t system_self_process_id()
+{
+#ifdef _WIN32
+ return GetCurrentProcessId();
+#else
+ return getpid();
+#endif
+}
+
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_system.h b/intern/cycles/util/util_system.h
index c4db8b74339..a1797e6ca44 100644
--- a/intern/cycles/util/util_system.h
+++ b/intern/cycles/util/util_system.h
@@ -65,6 +65,9 @@ size_t system_physical_ram();
/* Start a new process of the current application with the given arguments. */
bool system_call_self(const vector<string> &args);
+/* Get identifier of the currently running process. */
+uint64_t system_self_process_id();
+
CCL_NAMESPACE_END
#endif /* __UTIL_SYSTEM_H__ */
diff --git a/intern/cycles/util/util_tbb.h b/intern/cycles/util/util_tbb.h
index 73e0f92d19c..8f84377ac8c 100644
--- a/intern/cycles/util/util_tbb.h
+++ b/intern/cycles/util/util_tbb.h
@@ -23,6 +23,7 @@
#include <tbb/enumerable_thread_specific.h>
#include <tbb/parallel_for.h>
+#include <tbb/parallel_for_each.h>
#include <tbb/task_arena.h>
#include <tbb/task_group.h>
diff --git a/intern/cycles/util/util_texture.h b/intern/cycles/util/util_texture.h
index 71bf9c65911..4de66bf5f46 100644
--- a/intern/cycles/util/util_texture.h
+++ b/intern/cycles/util/util_texture.h
@@ -85,8 +85,6 @@ typedef struct TextureInfo {
uint64_t data;
/* Data Type */
uint data_type;
- /* Buffer number for OpenCL. */
- uint cl_buffer;
/* Interpolation and extension type. */
uint interpolation, extension;
/* Dimensions. */
diff --git a/intern/cycles/util/util_transform.h b/intern/cycles/util/util_transform.h
index f79eac4cbcf..e9cd3b0b483 100644
--- a/intern/cycles/util/util_transform.h
+++ b/intern/cycles/util/util_transform.h
@@ -498,36 +498,12 @@ Transform transform_from_viewplane(BoundBox2D &viewplane);
#endif
-/* TODO(sergey): This is only for until we've got OpenCL 2.0
- * on all devices we consider supported. It'll be replaced with
- * generic address space.
- */
+/* TODO: This can be removed once we know that no devices require explicit
+ * address space qualifiers for this case. */
-#ifdef __KERNEL_OPENCL__
-
-# define OPENCL_TRANSFORM_ADDRSPACE_GLUE(a, b) a##b
-# define OPENCL_TRANSFORM_ADDRSPACE_DECLARE(function) \
- ccl_device_inline float3 OPENCL_TRANSFORM_ADDRSPACE_GLUE(function, _addrspace)( \
- ccl_addr_space const Transform *t, const float3 a) \
- { \
- Transform private_tfm = *t; \
- return function(&private_tfm, a); \
- }
-
-OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_point)
-OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction)
-OPENCL_TRANSFORM_ADDRSPACE_DECLARE(transform_direction_transposed)
-
-# undef OPENCL_TRANSFORM_ADDRSPACE_DECLARE
-# undef OPENCL_TRANSFORM_ADDRSPACE_GLUE
-# define transform_point_auto transform_point_addrspace
-# define transform_direction_auto transform_direction_addrspace
-# define transform_direction_transposed_auto transform_direction_transposed_addrspace
-#else
-# define transform_point_auto transform_point
-# define transform_direction_auto transform_direction
-# define transform_direction_transposed_auto transform_direction_transposed
-#endif
+#define transform_point_auto transform_point
+#define transform_direction_auto transform_direction
+#define transform_direction_transposed_auto transform_direction_transposed
CCL_NAMESPACE_END
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 87358877e3c..442c32b3a3d 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -17,9 +17,7 @@
#ifndef __UTIL_TYPES_H__
#define __UTIL_TYPES_H__
-#ifndef __KERNEL_OPENCL__
-# include <stdlib.h>
-#endif
+#include <stdlib.h>
/* Standard Integer Types */
@@ -44,18 +42,12 @@ CCL_NAMESPACE_BEGIN
/* Shorter Unsigned Names */
-#ifndef __KERNEL_OPENCL__
typedef unsigned char uchar;
typedef unsigned int uint;
typedef unsigned short ushort;
-#endif
/* Fixed Bits Types */
-#ifdef __KERNEL_OPENCL__
-typedef unsigned long uint64_t;
-#endif
-
#ifndef __KERNEL_GPU__
/* Generic Memory Pointer */
diff --git a/intern/cycles/util/util_unique_ptr.h b/intern/cycles/util/util_unique_ptr.h
index 3aaaf083eff..3181eafd43d 100644
--- a/intern/cycles/util/util_unique_ptr.h
+++ b/intern/cycles/util/util_unique_ptr.h
@@ -21,6 +21,7 @@
CCL_NAMESPACE_BEGIN
+using std::make_unique;
using std::unique_ptr;
CCL_NAMESPACE_END
diff --git a/release/scripts/modules/bpy/utils/__init__.py b/release/scripts/modules/bpy/utils/__init__.py
index afa04a18ef6..3f0248970c6 100644
--- a/release/scripts/modules/bpy/utils/__init__.py
+++ b/release/scripts/modules/bpy/utils/__init__.py
@@ -858,6 +858,7 @@ def register_tool(tool_cls, *, after=None, separator=False, group=False):
"description": getattr(tool_cls, "bl_description", tool_cls.__doc__),
"icon": getattr(tool_cls, "bl_icon", None),
"cursor": getattr(tool_cls, "bl_cursor", None),
+ "options": getattr(tool_cls, "bl_options", None),
"widget": getattr(tool_cls, "bl_widget", None),
"widget_properties": getattr(tool_cls, "bl_widget_properties", None),
"keymap": getattr(tool_cls, "bl_keymap", None),
diff --git a/release/scripts/modules/rna_manual_reference.py b/release/scripts/modules/rna_manual_reference.py
index 0e3cb7e3cab..40f59307bec 100644
--- a/release/scripts/modules/rna_manual_reference.py
+++ b/release/scripts/modules/rna_manual_reference.py
@@ -209,7 +209,6 @@ url_manual_mapping = (
("bpy.types.toolsettings.use_proportional_connected*", "editors/3dview/controls/proportional_editing.html#bpy-types-toolsettings-use-proportional-connected"),
("bpy.types.toolsettings.use_proportional_projected*", "editors/3dview/controls/proportional_editing.html#bpy-types-toolsettings-use-proportional-projected"),
("bpy.types.view3doverlay.vertex_paint_mode_opacity*", "editors/3dview/display/overlays.html#bpy-types-view3doverlay-vertex-paint-mode-opacity"),
- ("bpy.types.viewlayer.use_pass_cryptomatte_accurate*", "render/layers/passes.html#bpy-types-viewlayer-use-pass-cryptomatte-accurate"),
("bpy.types.viewlayer.use_pass_cryptomatte_material*", "render/layers/passes.html#bpy-types-viewlayer-use-pass-cryptomatte-material"),
("bpy.ops.gpencil.vertex_color_brightness_contrast*", "grease_pencil/modes/vertex_paint/editing.html#bpy-ops-gpencil-vertex-color-brightness-contrast"),
("bpy.ops.view3d.edit_mesh_extrude_individual_move*", "modeling/meshes/editing/face/extrude_faces.html#bpy-ops-view3d-edit-mesh-extrude-individual-move"),
@@ -573,7 +572,6 @@ url_manual_mapping = (
("bpy.types.rendersettings.film_transparent*", "render/cycles/render_settings/film.html#bpy-types-rendersettings-film-transparent"),
("bpy.types.rendersettings.simplify_volumes*", "render/cycles/render_settings/simplify.html#bpy-types-rendersettings-simplify-volumes"),
("bpy.types.rendersettings.use_render_cache*", "render/output/properties/output.html#bpy-types-rendersettings-use-render-cache"),
- ("bpy.types.rendersettings.use_save_buffers*", "render/cycles/render_settings/performance.html#bpy-types-rendersettings-use-save-buffers"),
("bpy.types.rendersettings.use_single_layer*", "render/layers/view_layer.html#bpy-types-rendersettings-use-single-layer"),
("bpy.types.sceneeevee.use_taa_reprojection*", "render/eevee/render_settings/sampling.html#bpy-types-sceneeevee-use-taa-reprojection"),
("bpy.types.sequenceeditor.use_overlay_lock*", "video_editing/preview/sidebar.html#bpy-types-sequenceeditor-use-overlay-lock"),
diff --git a/release/scripts/presets/cycles/sampling/Final.py b/release/scripts/presets/cycles/sampling/Final.py
index f1222d927c1..f3626c4b778 100644
--- a/release/scripts/presets/cycles/sampling/Final.py
+++ b/release/scripts/presets/cycles/sampling/Final.py
@@ -1,18 +1,12 @@
import bpy
cycles = bpy.context.scene.cycles
-# Path Trace
-cycles.samples = 512
-cycles.preview_samples = 128
-
-# Branched Path Trace
-cycles.aa_samples = 128
-cycles.preview_aa_samples = 32
-
-cycles.diffuse_samples = 4
-cycles.glossy_samples = 4
-cycles.transmission_samples = 4
-cycles.ao_samples = 1
-cycles.mesh_light_samples = 4
-cycles.subsurface_samples = 4
-cycles.volume_samples = 4
+cycles.use_adaptive_sampling = True
+cycles.adaptive_threshold = 0.01
+cycles.samples = 4096
+cycles.adaptive_min_samples = 0
+cycles.time_limit = 0.0
+cycles.use_denoising = True
+cycles.denoiser = 'OPENIMAGEDENOISE'
+cycles.denoising_input_passes = 'RGB_ALBEDO_NORMAL'
+cycles.denoising_prefilter = 'ACCURATE'
diff --git a/release/scripts/presets/cycles/sampling/Preview.py b/release/scripts/presets/cycles/sampling/Preview.py
index c16449e2c8f..66aa9339063 100644
--- a/release/scripts/presets/cycles/sampling/Preview.py
+++ b/release/scripts/presets/cycles/sampling/Preview.py
@@ -1,18 +1,12 @@
import bpy
cycles = bpy.context.scene.cycles
-# Path Trace
-cycles.samples = 128
-cycles.preview_samples = 32
-
-# Branched Path Trace
-cycles.aa_samples = 32
-cycles.preview_aa_samples = 4
-
-cycles.diffuse_samples = 4
-cycles.glossy_samples = 4
-cycles.transmission_samples = 4
-cycles.ao_samples = 1
-cycles.mesh_light_samples = 4
-cycles.subsurface_samples = 4
-cycles.volume_samples = 4
+cycles.use_adaptive_sampling = True
+cycles.adaptive_threshold = 0.1
+cycles.samples = 1024
+cycles.adaptive_min_samples = 0
+cycles.time_limit = 0.0
+cycles.use_denoising = True
+cycles.denoiser = 'OPENIMAGEDENOISE'
+cycles.denoising_input_passes = 'RGB_ALBEDO_NORMAL'
+cycles.denoising_prefilter = 'ACCURATE'
diff --git a/release/scripts/presets/cycles/viewport_sampling/Final.py b/release/scripts/presets/cycles/viewport_sampling/Final.py
new file mode 100644
index 00000000000..b2cb6bfe90a
--- /dev/null
+++ b/release/scripts/presets/cycles/viewport_sampling/Final.py
@@ -0,0 +1,11 @@
+import bpy
+cycles = bpy.context.scene.cycles
+
+cycles.use_preview_adaptive_sampling = True
+cycles.preview_adaptive_threshold = 0.01
+cycles.preview_samples = 4096
+cycles.preview_adaptive_min_samples = 0
+cycles.use_preview_denoising = True
+cycles.preview_denoiser = 'OPENIMAGEDENOISE'
+cycles.preview_denoising_input_passes = 'RGB_ALBEDO_NORMAL'
+cycles.preview_denoising_prefilter = 'ACCURATE'
diff --git a/release/scripts/presets/cycles/viewport_sampling/Preview.py b/release/scripts/presets/cycles/viewport_sampling/Preview.py
new file mode 100644
index 00000000000..f8319b70d4a
--- /dev/null
+++ b/release/scripts/presets/cycles/viewport_sampling/Preview.py
@@ -0,0 +1,11 @@
+import bpy
+cycles = bpy.context.scene.cycles
+
+cycles.use_preview_adaptive_sampling = True
+cycles.preview_adaptive_threshold = 0.1
+cycles.preview_samples = 1024
+cycles.preview_adaptive_min_samples = 0
+cycles.use_preview_denoising = False
+cycles.preview_denoiser = 'AUTO'
+cycles.preview_denoising_input_passes = 'RGB_ALBEDO'
+cycles.preview_denoising_prefilter = 'FAST'
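The two new viewport preset files, like the updated render presets above, are plain scripts that assign to `bpy.context.scene.cycles` when executed. A sketch of applying one from Python, assuming the stock `script.execute_preset` operator; the `menu_idname` is an assumption, since the panel that registers these presets is not part of this diff.

import os
import bpy

# preset_paths() resolves preset sub-directories across user/system scripts.
preset_dir = bpy.utils.preset_paths("cycles/viewport_sampling")[0]
bpy.ops.script.execute_preset(
    filepath=os.path.join(preset_dir, "Preview.py"),
    menu_idname="CYCLES_PT_viewport_sampling_presets",  # assumed panel id
)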
diff --git a/release/scripts/presets/keyconfig/Blender.py b/release/scripts/presets/keyconfig/Blender.py
index eb66c961472..15cc6097979 100644
--- a/release/scripts/presets/keyconfig/Blender.py
+++ b/release/scripts/presets/keyconfig/Blender.py
@@ -54,6 +54,28 @@ class Prefs(bpy.types.KeyConfigPreferences):
default='PLAY',
update=update_fn,
)
+ use_key_activate_tools: BoolProperty(
+ name="Keys Activate Tools",
+ description=(
+ "Key shortcuts such as G, R, and S activate the tool instead of running it immediately"
+ ),
+ default=False,
+ update=update_fn,
+ )
+
+ rmb_action: EnumProperty(
+ name="Right Mouse Select Action",
+ items=(
+ ('TWEAK', "Select & Tweak",
+ "Right mouse always tweaks"),
+ ('FALLBACK_TOOL', "Selection Tool",
+ "Right mouse uses the selection tool"),
+ ),
+ description=(
+ "Default action for the right mouse button"
+ ),
+ update=update_fn,
+ )
use_alt_click_leader: BoolProperty(
name="Alt Click Tool Prompt",
description=(
@@ -63,6 +85,14 @@ class Prefs(bpy.types.KeyConfigPreferences):
default=False,
update=update_fn,
)
+ use_alt_tool: BoolProperty(
+ name="Alt Tool Access",
+ description=(
+ "Hold Alt to use the active tool when the gizmo would normally be required"
+ ),
+ default=False,
+ update=update_fn,
+ )
use_select_all_toggle: BoolProperty(
name="Select All Toggles",
description=(
@@ -179,13 +209,19 @@ class Prefs(bpy.types.KeyConfigPreferences):
if is_select_left:
col.row().prop(self, "gizmo_action", text="Activate Gizmo Event", expand=True)
+ else:
+ col.row().prop(self, "rmb_action", text="Right Mouse Select Action", expand=True)
# Checkboxes sub-layout.
col = layout.column()
sub = col.column(align=True)
row = sub.row()
- row.prop(self, "use_select_all_toggle")
row.prop(self, "use_alt_click_leader")
+ if is_select_left:
+ row.prop(self, "use_alt_tool")
+ row = sub.row()
+ row.prop(self, "use_select_all_toggle")
+ row.prop(self, "use_key_activate_tools", text="Key Activates Tools")
# 3DView settings.
col = layout.column()
@@ -222,6 +258,7 @@ def load():
prefs.inputs.mouse_emulate_3_button_modifier == 'ALT'
),
spacebar_action=kc_prefs.spacebar_action,
+ use_key_activate_tools=kc_prefs.use_key_activate_tools,
v3d_tilde_action=kc_prefs.v3d_tilde_action,
use_v3d_mmb_pan=(kc_prefs.v3d_mmb_action == 'PAN'),
v3d_alt_mmb_drag_action=kc_prefs.v3d_alt_mmb_drag_action,
@@ -232,6 +269,8 @@ def load():
kc_prefs.select_mouse == 'LEFT' and
kc_prefs.gizmo_action == 'DRAG'
),
+ use_fallback_tool=(True if (kc_prefs.select_mouse == 'LEFT') else (kc_prefs.rmb_action == 'FALLBACK_TOOL')),
+ use_alt_tool=(kc_prefs.use_alt_tool and kc_prefs.select_mouse == 'LEFT'),
use_alt_click_leader=kc_prefs.use_alt_click_leader,
use_pie_click_drag=kc_prefs.use_pie_click_drag,
),
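For reference, the `use_fallback_tool` argument above is derived from both the select-mouse side and the new `rmb_action` preference. A self-contained restatement of that expression with its four cases; nothing here adds behavior beyond the hunk above.

def resolve_use_fallback_tool(select_mouse, rmb_action):
    # Mirrors: True if select_mouse == 'LEFT' else (rmb_action == 'FALLBACK_TOOL')
    return True if select_mouse == 'LEFT' else (rmb_action == 'FALLBACK_TOOL')

assert resolve_use_fallback_tool('LEFT', 'TWEAK') is True
assert resolve_use_fallback_tool('LEFT', 'FALLBACK_TOOL') is True
assert resolve_use_fallback_tool('RIGHT', 'TWEAK') is False
assert resolve_use_fallback_tool('RIGHT', 'FALLBACK_TOOL') is True
# `use_alt_tool` is likewise gated: it only takes effect with left-click select.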
diff --git a/release/scripts/presets/keyconfig/keymap_data/blender_default.py b/release/scripts/presets/keyconfig/keymap_data/blender_default.py
index 847fc757f48..5ecbe7715e3 100644
--- a/release/scripts/presets/keyconfig/keymap_data/blender_default.py
+++ b/release/scripts/presets/keyconfig/keymap_data/blender_default.py
@@ -46,6 +46,8 @@ class Params:
"use_select_all_toggle",
# Activate gizmo on drag (for gizmos that support it).
"use_gizmo_drag",
+ # Use the fallback tool instead of tweak for RMB select.
+ "use_fallback_tool",
# Use pie menu for tab by default (swap 'Tab/Ctrl-Tab').
"use_v3d_tab_menu",
# Use extended pie menu for shading.
@@ -54,11 +56,24 @@ class Params:
"use_v3d_mmb_pan",
# Alt click to access tools.
"use_alt_click_leader",
+ # Transform keys G/S/R activate tools instead of immediately transforming.
+ "use_key_activate_tools",
+ # Optionally use a modifier to access tools.
+ "tool_modifier",
# Experimental option.
"use_pie_click_drag",
"v3d_tilde_action",
# Alt-MMB drag axis switching: 'RELATIVE' or 'ABSOLUTE'.
"v3d_alt_mmb_drag_action",
+
+ # Convenience variables:
+ # (derived from other settings).
+ #
+    # This case needs to be checked often; it is a convenience for:
+    # `params.use_fallback_tool if params.select_mouse == 'RIGHT' else False`.
+ "use_fallback_tool_rmb",
+ # Convenience for: `'CLICK' if params.use_fallback_tool_rmb else params.select_mouse_value`.
+ "select_mouse_value_fallback",
)
def __init__(
@@ -70,11 +85,14 @@ class Params:
# User preferences.
spacebar_action='TOOL',
+ use_key_activate_tools=False,
use_select_all_toggle=False,
use_gizmo_drag=True,
+ use_fallback_tool=False,
use_v3d_tab_menu=False,
use_v3d_shade_ex_pie=False,
use_v3d_mmb_pan=False,
+ use_alt_tool=False,
use_alt_click_leader=False,
use_pie_click_drag=False,
v3d_tilde_action='VIEW',
@@ -96,6 +114,10 @@ class Params:
self.context_menu_event = {"type": 'W', "value": 'PRESS'}
self.cursor_set_event = {"type": 'LEFTMOUSE', "value": 'CLICK'}
self.cursor_tweak_event = None
+ self.use_fallback_tool = use_fallback_tool
+ self.use_fallback_tool_rmb = use_fallback_tool
+ self.select_mouse_value_fallback = 'CLICK' if self.use_fallback_tool_rmb else self.select_mouse_value
+ self.tool_modifier = {}
else:
# Left mouse select uses Click event for selection. This is a little
# less immediate, but is needed to distinguish between click and tweak
@@ -115,11 +137,21 @@ class Params:
self.cursor_set_event = {"type": 'RIGHTMOUSE', "value": 'PRESS', "shift": True}
self.cursor_tweak_event = {"type": 'EVT_TWEAK_R', "value": 'ANY', "shift": True}
+ self.use_fallback_tool = True
+ self.use_fallback_tool_rmb = False
+ self.select_mouse_value_fallback = self.select_mouse_value
+
+ if use_alt_tool:
+ # Allow `Alt` to be pressed or not.
+ self.tool_modifier = {"alt": -1}
+ else:
+ self.tool_modifier = {}
self.use_mouse_emulate_3_button = use_mouse_emulate_3_button
# User preferences
self.spacebar_action = spacebar_action
+ self.use_key_activate_tools = use_key_activate_tools
self.use_gizmo_drag = use_gizmo_drag
self.use_select_all_toggle = use_select_all_toggle
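A worked expansion of the convenience variables assigned in the two branches above, with illustrative preference values; this restates the hunk, it does not add behavior.

# Right-click select: RMB can host the fallback tool, which forces 'CLICK'
# so that click and tweak can be told apart.
use_fallback_tool = True       # e.g. rmb_action == 'FALLBACK_TOOL'
select_mouse_value = 'PRESS'
use_fallback_tool_rmb = use_fallback_tool
select_mouse_value_fallback = 'CLICK' if use_fallback_tool_rmb else select_mouse_value
assert select_mouse_value_fallback == 'CLICK'

# Left-click select: the fallback tool is always enabled, but it is not on
# RMB, so the select value passes through unchanged.
use_fallback_tool_rmb = False
select_mouse_value = 'CLICK'
select_mouse_value_fallback = select_mouse_value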
@@ -148,6 +180,15 @@ NUMBERS_0 = ('ZERO', 'ONE', 'TWO', 'THREE', 'FOUR', 'FIVE', 'SIX', 'SEVEN', 'EIG
# ------------------------------------------------------------------------------
+# Generic Utilities
+
+def _fallback_id(text, fallback):
+ if fallback:
+ return text + " (fallback)"
+ return text
+
+
+# ------------------------------------------------------------------------------
# Keymap Item Wrappers
def op_menu(menu, kmi_args):
@@ -170,6 +211,16 @@ def op_tool_cycle(tool, kmi_args):
return ("wm.tool_set_by_id", kmi_args, {"properties": [("name", tool), ("cycle", True)]})
+# Utility to select between an operator and a tool,
+# without having to duplicate key map item arguments.
+def op_tool_optional(op_args, tool_pair, params):
+ if params.use_key_activate_tools:
+ kmi_args = op_args[1]
+ op_tool_fn, tool_id = tool_pair
+ return op_tool_fn(tool_id, kmi_args)
+ return op_args
+
+
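Concretely, `op_tool_optional` returns the plain operator binding unless `params.use_key_activate_tools` is set, in which case it rewrites the item into a tool activation with the same key arguments. A self-contained check of the enabled path, reusing the two wrappers defined above; the `_Params` stub is an illustrative assumption.

def op_tool_cycle(tool, kmi_args):
    return ("wm.tool_set_by_id", kmi_args, {"properties": [("name", tool), ("cycle", True)]})

def op_tool_optional(op_args, tool_pair, params):
    if params.use_key_activate_tools:
        kmi_args = op_args[1]
        op_tool_fn, tool_id = tool_pair
        return op_tool_fn(tool_id, kmi_args)
    return op_args

class _Params:
    use_key_activate_tools = True  # stand-in for the keymap preferences object

binding = op_tool_optional(
    ("transform.translate", {"type": 'G', "value": 'PRESS'}, None),
    (op_tool_cycle, "builtin.move"), _Params)

# With the preference enabled, G activates the Move tool instead of transforming:
assert binding == ("wm.tool_set_by_id", {"type": 'G', "value": 'PRESS'},
                   {"properties": [("name", "builtin.move"), ("cycle", True)]})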
# ------------------------------------------------------------------------------
# Keymap Templates
@@ -230,6 +281,14 @@ def _template_items_select_actions(params, operator):
]
+def _template_items_hide_reveal_actions(op_hide, op_reveal):
+ return [
+ (op_reveal, {"type": 'H', "value": 'PRESS', "alt": True}, None),
+ (op_hide, {"type": 'H', "value": 'PRESS'}, {"properties": [("unselected", False)]}),
+ (op_hide, {"type": 'H', "value": 'PRESS', "shift": True}, {"properties": [("unselected", True)]}),
+ ]
+
+
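For reference, each use of this template expands to the three items it replaces; e.g. `_template_items_hide_reveal_actions("uv.hide", "uv.reveal")` yields:

[
    ("uv.reveal", {"type": 'H', "value": 'PRESS', "alt": True}, None),
    ("uv.hide", {"type": 'H', "value": 'PRESS'}, {"properties": [("unselected", False)]}),
    ("uv.hide", {"type": 'H', "value": 'PRESS', "shift": True}, {"properties": [("unselected", True)]}),
]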
def _template_items_object_subdivision_set():
return [
("object.subdivision_set",
@@ -764,10 +823,14 @@ def km_property_editor(_params):
("object.modifier_copy", {"type": 'D', "value": 'PRESS', "shift": True}, None),
("object.modifier_apply", {"type": 'A', "value": 'PRESS', "ctrl": True}, {"properties": [("report", True)]}),
# Grease pencil modifier panels
- ("object.gpencil_modifier_remove", {"type": 'X', "value": 'PRESS'}, {"properties": [("report", True)]}),
- ("object.gpencil_modifier_remove", {"type": 'DEL', "value": 'PRESS'}, {"properties": [("report", True)]}),
- ("object.gpencil_modifier_copy", {"type": 'D', "value": 'PRESS', "shift": True}, None),
- ("object.gpencil_modifier_apply", {"type": 'A', "value": 'PRESS', "ctrl": True}, {"properties": [("report", True)]}),
+ ("object.gpencil_modifier_remove",
+ {"type": 'X', "value": 'PRESS'}, {"properties": [("report", True)]}),
+ ("object.gpencil_modifier_remove",
+ {"type": 'DEL', "value": 'PRESS'}, {"properties": [("report", True)]}),
+ ("object.gpencil_modifier_copy",
+ {"type": 'D', "value": 'PRESS', "shift": True}, None),
+ ("object.gpencil_modifier_apply",
+ {"type": 'A', "value": 'PRESS', "ctrl": True}, {"properties": [("report", True)]}),
# ShaderFX panels
("object.shaderfx_remove", {"type": 'X', "value": 'PRESS'}, {"properties": [("report", True)]}),
("object.shaderfx_remove", {"type": 'DEL', "value": 'PRESS'}, {"properties": [("report", True)]}),
@@ -882,27 +945,38 @@ def km_uv_editor(params):
items.extend([
# Selection modes.
*_template_items_uv_select_mode(params),
+ *_template_uv_select(
+ type=params.select_mouse,
+ value=('CLICK' if params.use_fallback_tool_rmb else params.select_mouse_value),
+ legacy=params.legacy,
+ ),
("uv.mark_seam", {"type": 'E', "value": 'PRESS', "ctrl": True}, None),
- ("uv.select", {"type": params.select_mouse, "value": params.select_mouse_value},
- {"properties": [("deselect_all", not params.legacy)]}),
- ("uv.select", {"type": params.select_mouse, "value": params.select_mouse_value, "shift": True},
- {"properties": [("extend", True)]}),
- ("uv.select_loop", {"type": params.select_mouse, "value": params.select_mouse_value, "alt": True}, None),
- ("uv.select_loop", {"type": params.select_mouse, "value": params.select_mouse_value, "shift": True, "alt": True},
+ ("uv.select_loop",
+ {"type": params.select_mouse, "value": params.select_mouse_value, "alt": True}, None),
+ ("uv.select_loop",
+ {"type": params.select_mouse, "value": params.select_mouse_value, "shift": True, "alt": True},
{"properties": [("extend", True)]}),
- ("uv.select_edge_ring", {"type": params.select_mouse, "value": params.select_mouse_value, "ctrl": True, "alt": True}, None),
- ("uv.select_edge_ring", {"type": params.select_mouse, "value": params.select_mouse_value, "ctrl": True, "shift": True, "alt": True},
+ ("uv.select_edge_ring",
+ {"type": params.select_mouse, "value": params.select_mouse_value, "ctrl": True, "alt": True}, None),
+ ("uv.select_edge_ring",
+ {"type": params.select_mouse, "value": params.select_mouse_value, "ctrl": True, "shift": True, "alt": True},
{"properties": [("extend", True)]}),
- ("uv.shortest_path_pick", {"type": params.select_mouse, "value": params.select_mouse_value, "ctrl": True},
+ ("uv.shortest_path_pick",
+ {"type": params.select_mouse, "value": params.select_mouse_value_fallback, "ctrl": True},
{"properties": [("use_fill", False)]}),
- ("uv.shortest_path_pick", {"type": params.select_mouse, "value": params.select_mouse_value, "ctrl": True, "shift": True},
+ ("uv.shortest_path_pick",
+ {"type": params.select_mouse, "value": params.select_mouse_value_fallback, "ctrl": True, "shift": True},
{"properties": [("use_fill", True)]}),
("uv.select_split", {"type": 'Y', "value": 'PRESS'}, None),
- ("uv.select_box", {"type": 'B', "value": 'PRESS'},
- {"properties": [("pinned", False)]}),
+ op_tool_optional(
+ ("uv.select_box", {"type": 'B', "value": 'PRESS'},
+ {"properties": [("pinned", False)]}),
+ (op_tool, "builtin.select_box"), params),
("uv.select_box", {"type": 'B', "value": 'PRESS', "ctrl": True},
{"properties": [("pinned", True)]}),
- ("uv.select_circle", {"type": 'C', "value": 'PRESS'}, None),
+ op_tool_optional(
+ ("uv.select_circle", {"type": 'C', "value": 'PRESS'}, None),
+ (op_tool, "builtin.select_circle"), params),
("uv.select_lasso", {"type": params.action_tweak, "value": 'ANY', "ctrl": True},
{"properties": [("mode", 'ADD')]}),
("uv.select_lasso", {"type": params.action_tweak, "value": 'ANY', "shift": True, "ctrl": True},
@@ -915,6 +989,7 @@ def km_uv_editor(params):
("uv.select_more", {"type": 'NUMPAD_PLUS', "value": 'PRESS', "ctrl": True, "repeat": True}, None),
("uv.select_less", {"type": 'NUMPAD_MINUS', "value": 'PRESS', "ctrl": True, "repeat": True}, None),
*_template_items_select_actions(params, "uv.select_all"),
+ *_template_items_hide_reveal_actions("uv.hide", "uv.reveal"),
("uv.select_pinned", {"type": 'P', "value": 'PRESS', "shift": True}, None),
op_menu("IMAGE_MT_uvs_merge", {"type": 'M', "value": 'PRESS'}),
op_menu("IMAGE_MT_uvs_split", {"type": 'M', "value": 'PRESS', "alt": True}),
@@ -926,11 +1001,6 @@ def km_uv_editor(params):
("uv.pin", {"type": 'P', "value": 'PRESS', "alt": True},
{"properties": [("clear", True)]}),
op_menu("IMAGE_MT_uvs_unwrap", {"type": 'U', "value": 'PRESS'}),
- ("uv.hide", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("uv.hide", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
- ("uv.reveal", {"type": 'H', "value": 'PRESS', "alt": True}, None),
(
op_menu_pie("IMAGE_MT_uvs_snap_pie", {"type": 'S', "value": 'PRESS', "shift": True})
if not params.legacy else
@@ -939,10 +1009,16 @@ def km_uv_editor(params):
op_menu("IMAGE_MT_uvs_select_mode", {"type": 'TAB', "value": 'PRESS', "ctrl": True}),
*_template_items_proportional_editing(
params, connected=False, toggle_data_path='tool_settings.use_proportional_edit'),
- ("transform.translate", {"type": 'G', "value": 'PRESS'}, None),
("transform.translate", {"type": params.select_tweak, "value": 'ANY'}, None),
- ("transform.rotate", {"type": 'R', "value": 'PRESS'}, None),
- ("transform.resize", {"type": 'S', "value": 'PRESS'}, None),
+ op_tool_optional(
+ ("transform.translate", {"type": 'G', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.move"), params),
+ op_tool_optional(
+ ("transform.rotate", {"type": 'R', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.rotate"), params),
+ op_tool_optional(
+ ("transform.resize", {"type": 'S', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.scale"), params),
("transform.shear", {"type": 'S', "value": 'PRESS', "shift": True, "ctrl": True, "alt": True}, None),
("transform.mirror", {"type": 'M', "value": 'PRESS', "ctrl": True}, None),
("wm.context_toggle", {"type": 'TAB', "value": 'PRESS', "shift": True},
@@ -1196,26 +1272,21 @@ def km_view3d(params):
("view3d.view_axis", {"type": 'NDOF_BUTTON_TOP', "value": 'PRESS', "shift": True},
{"properties": [("type", 'TOP'), ("align_active", True)]}),
# Selection.
- *((
- "view3d.select",
- {"type": params.select_mouse, "value": params.select_mouse_value, **{m: True for m in mods}},
- {"properties": [(c, True) for c in props]},
- ) for props, mods in (
- (("deselect_all",) if not params.legacy else (), ()),
- (("toggle",), ("shift",)),
- (("center", "object"), ("ctrl",)),
- (("enumerate",), ("alt",)),
- (("toggle", "center"), ("shift", "ctrl")),
- (("center", "enumerate"), ("ctrl", "alt")),
- (("toggle", "enumerate"), ("shift", "alt")),
- (("toggle", "center", "enumerate"), ("shift", "ctrl", "alt")),
- )),
- ("view3d.select_box", {"type": 'B', "value": 'PRESS'}, None),
+ *_template_view3d_select(
+ type=params.select_mouse,
+ value=params.select_mouse_value_fallback,
+ legacy=params.legacy,
+ ),
+ op_tool_optional(
+ ("view3d.select_box", {"type": 'B', "value": 'PRESS'}, None),
+ (op_tool, "builtin.select_box"), params),
("view3d.select_lasso", {"type": params.action_tweak, "value": 'ANY', "ctrl": True},
{"properties": [("mode", 'ADD')]}),
("view3d.select_lasso", {"type": params.action_tweak, "value": 'ANY', "shift": True, "ctrl": True},
{"properties": [("mode", 'SUB')]}),
- ("view3d.select_circle", {"type": 'C', "value": 'PRESS'}, None),
+ op_tool_optional(
+ ("view3d.select_circle", {"type": 'C', "value": 'PRESS'}, None),
+ (op_tool, "builtin.select_circle"), params),
# Borders.
("view3d.clip_border", {"type": 'B', "value": 'PRESS', "alt": True}, None),
("view3d.zoom_border", {"type": 'B', "value": 'PRESS', "shift": True}, None),
@@ -1228,23 +1299,37 @@ def km_view3d(params):
("view3d.copybuffer", {"type": 'C', "value": 'PRESS', "ctrl": True}, None),
("view3d.pastebuffer", {"type": 'V', "value": 'PRESS', "ctrl": True}, None),
# Transform.
- ("transform.translate", {"type": 'G', "value": 'PRESS'}, None),
("transform.translate", {"type": params.select_tweak, "value": 'ANY'}, None),
- ("transform.rotate", {"type": 'R', "value": 'PRESS'}, None),
- ("transform.resize", {"type": 'S', "value": 'PRESS'}, None),
+ op_tool_optional(
+ ("transform.translate", {"type": 'G', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.move"), params),
+ op_tool_optional(
+ ("transform.rotate", {"type": 'R', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.rotate"), params),
+ op_tool_optional(
+ ("transform.resize", {"type": 'S', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.scale"), params),
+ op_tool_optional(
+ ("transform.tosphere", {"type": 'S', "value": 'PRESS', "shift": True, "alt": True}, None),
+ (op_tool_cycle, "builtin.to_sphere"), params),
+ op_tool_optional(
+ ("transform.shear", {"type": 'S', "value": 'PRESS', "shift": True, "ctrl": True, "alt": True}, None),
+ (op_tool_cycle, "builtin.shear"), params),
("transform.bend", {"type": 'W', "value": 'PRESS', "shift": True}, None),
- ("transform.tosphere", {"type": 'S', "value": 'PRESS', "shift": True, "alt": True}, None),
- ("transform.shear", {"type": 'S', "value": 'PRESS', "shift": True, "ctrl": True, "alt": True}, None),
("transform.mirror", {"type": 'M', "value": 'PRESS', "ctrl": True}, None),
("object.transform_axis_target", {"type": 'T', "value": 'PRESS', "shift": True}, None),
("transform.skin_resize", {"type": 'A', "value": 'PRESS', "ctrl": True}, None),
# Snapping.
("wm.context_toggle", {"type": 'TAB', "value": 'PRESS', "shift": True},
{"properties": [("data_path", 'tool_settings.use_snap')]}),
- op_panel("VIEW3D_PT_snapping", {"type": 'TAB', "value": 'PRESS', "shift": True, "ctrl": True}, [("keep_open", True)]),
+ op_panel(
+ "VIEW3D_PT_snapping",
+ {"type": 'TAB', "value": 'PRESS', "shift": True, "ctrl": True},
+ [("keep_open", True)],
+ ),
(
op_menu_pie("VIEW3D_MT_snap_pie", {"type": 'S', "value": 'PRESS', "shift": True})
- if not params.legacy else
+ if not params.legacy else
op_menu("VIEW3D_MT_snap", {"type": 'S', "value": 'PRESS', "shift": True})
),
])
@@ -1321,11 +1406,17 @@ def km_view3d(params):
{"properties": [("data_path", 'tool_settings.transform_pivot_point'), ("value", 'ACTIVE_ELEMENT')]}),
# Old shading.
("wm.context_toggle_enum", {"type": 'Z', "value": 'PRESS'},
- {"properties": [("data_path", 'space_data.shading.type'), ("value_1", 'WIREFRAME'), ("value_2", 'SOLID')]}),
+ {"properties": [
+ ("data_path", 'space_data.shading.type'), ("value_1", 'WIREFRAME'), ("value_2", 'SOLID'),
+ ]}),
("wm.context_toggle_enum", {"type": 'Z', "value": 'PRESS', "shift": True},
- {"properties": [("data_path", 'space_data.shading.type'), ("value_1", 'RENDERED'), ("value_2", 'SOLID')]}),
+ {"properties": [
+ ("data_path", 'space_data.shading.type'), ("value_1", 'RENDERED'), ("value_2", 'SOLID'),
+ ]}),
("wm.context_toggle_enum", {"type": 'Z', "value": 'PRESS', "alt": True},
- {"properties": [("data_path", 'space_data.shading.type'), ("value_1", 'MATERIAL'), ("value_2", 'SOLID')]}),
+ {"properties": [
+ ("data_path", 'space_data.shading.type'), ("value_1", 'MATERIAL'), ("value_2", 'SOLID'),
+ ]}),
])
if params.select_mouse == 'LEFTMOUSE' and not params.legacy:
@@ -1380,17 +1471,14 @@ def km_mask_editing(params):
{"properties": [("mode", 'SUB')]}),
("mask.select_more", {"type": 'NUMPAD_PLUS', "value": 'PRESS', "ctrl": True, "repeat": True}, None),
("mask.select_less", {"type": 'NUMPAD_MINUS', "value": 'PRESS', "ctrl": True, "repeat": True}, None),
- ("mask.hide_view_clear", {"type": 'H', "value": 'PRESS', "alt": True}, None),
- ("mask.hide_view_set", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("mask.hide_view_set", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
+ *_template_items_hide_reveal_actions("mask.hide_view_set", "mask.hide_view_clear"),
("clip.select", {"type": params.select_mouse, "value": 'PRESS', "ctrl": True}, None),
("mask.cyclic_toggle", {"type": 'C', "value": 'PRESS', "alt": True}, None),
("mask.slide_point", {"type": 'LEFTMOUSE', "value": 'PRESS'}, None),
("mask.slide_spline_curvature", {"type": 'LEFTMOUSE', "value": 'PRESS'}, None),
("mask.handle_type_set", {"type": 'V', "value": 'PRESS'}, None),
- ("mask.normals_make_consistent", {"type": 'N', "value": 'PRESS', "ctrl" if params.legacy else "shift": True}, None),
+ ("mask.normals_make_consistent",
+ {"type": 'N', "value": 'PRESS', "ctrl" if params.legacy else "shift": True}, None),
("mask.parent_set", {"type": 'P', "value": 'PRESS', "ctrl": True}, None),
("mask.parent_clear", {"type": 'P', "value": 'PRESS', "alt": True}, None),
("mask.shape_key_insert", {"type": 'I', "value": 'PRESS'}, None),
@@ -1501,11 +1589,7 @@ def km_graph_editor_generic(_params):
),
("graph.extrapolation_type", {"type": 'E', "value": 'PRESS', "shift": True}, None),
("anim.channels_find", {"type": 'F', "value": 'PRESS', "ctrl": True}, None),
- ("graph.hide", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("graph.hide", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
- ("graph.reveal", {"type": 'H', "value": 'PRESS', "alt": True}, None),
+ *_template_items_hide_reveal_actions("graph.hide", "graph.reveal"),
("wm.context_set_enum", {"type": 'TAB', "value": 'PRESS', "ctrl": True},
{"properties": [("data_path", 'area.type'), ("value", 'DOPESHEET_EDITOR')]}),
])
@@ -1832,14 +1916,22 @@ def km_node_editor(params):
{"properties": [("mode", 'ADD')]}),
("node.select_lasso", {"type": 'EVT_TWEAK_L', "value": 'ANY', "shift": True, "ctrl": True, "alt": True},
{"properties": [("mode", 'SUB')]}),
- ("node.select_circle", {"type": 'C', "value": 'PRESS'}, None),
+ op_tool_optional(
+ ("node.select_box", {"type": 'B', "value": 'PRESS'},
+ {"properties": [("tweak", False)]}),
+ (op_tool, "builtin.select_box"), params),
+ op_tool_optional(
+ ("node.select_circle", {"type": 'C', "value": 'PRESS'}, None),
+ (op_tool, "builtin.select_circle"), params),
("node.link", {"type": 'EVT_TWEAK_L', "value": 'ANY'},
{"properties": [("detach", False)]}),
("node.link", {"type": 'EVT_TWEAK_L', "value": 'ANY', "ctrl": True},
{"properties": [("detach", True)]}),
("node.resize", {"type": 'EVT_TWEAK_L', "value": 'ANY'}, None),
- ("node.add_reroute", {"type": 'EVT_TWEAK_L' if params.legacy else 'EVT_TWEAK_R', "value": 'ANY', "shift": True}, None),
- ("node.links_cut", {"type": 'EVT_TWEAK_L' if params.legacy else 'EVT_TWEAK_R', "value": 'ANY', "ctrl": True}, None),
+ ("node.add_reroute",
+ {"type": 'EVT_TWEAK_L' if params.legacy else 'EVT_TWEAK_R', "value": 'ANY', "shift": True}, None),
+ ("node.links_cut",
+ {"type": 'EVT_TWEAK_L' if params.legacy else 'EVT_TWEAK_R', "value": 'ANY', "ctrl": True}, None),
("node.links_mute", {"type": 'EVT_TWEAK_R', "value": 'ANY', "ctrl": True, "alt": True}, None),
("node.select_link_viewer", {"type": 'LEFTMOUSE', "value": 'PRESS', "shift": True, "ctrl": True}, None),
("node.backimage_move", {"type": 'MIDDLEMOUSE', "value": 'PRESS', "alt": True}, None),
@@ -1866,8 +1958,6 @@ def km_node_editor(params):
("node.view_all", {"type": 'HOME', "value": 'PRESS'}, None),
("node.view_all", {"type": 'NDOF_BUTTON_FIT', "value": 'PRESS'}, None),
("node.view_selected", {"type": 'NUMPAD_PERIOD', "value": 'PRESS'}, None),
- ("node.select_box", {"type": 'B', "value": 'PRESS'},
- {"properties": [("tweak", False)]}),
("node.delete", {"type": 'X', "value": 'PRESS'}, None),
("node.delete", {"type": 'DEL', "value": 'PRESS'}, None),
("node.delete_reconnect", {"type": 'X', "value": 'PRESS', "ctrl": True}, None),
@@ -1896,9 +1986,15 @@ def km_node_editor(params):
("node.clipboard_paste", {"type": 'V', "value": 'PRESS', "ctrl": True}, None),
("node.viewer_border", {"type": 'B', "value": 'PRESS', "ctrl": True}, None),
("node.clear_viewer_border", {"type": 'B', "value": 'PRESS', "ctrl": True, "alt": True}, None),
- ("node.translate_attach", {"type": 'G', "value": 'PRESS'}, {"properties": [("TRANSFORM_OT_translate", [("view2d_edge_pan", True)])]}),
- ("node.translate_attach", {"type": 'EVT_TWEAK_L', "value": 'ANY'}, {"properties": [("TRANSFORM_OT_translate", [("view2d_edge_pan", True)])]}),
- ("node.translate_attach", {"type": params.select_tweak, "value": 'ANY'}, {"properties": [("TRANSFORM_OT_translate", [("view2d_edge_pan", True)])]}),
+ ("node.translate_attach",
+ {"type": 'G', "value": 'PRESS'},
+ {"properties": [("TRANSFORM_OT_translate", [("view2d_edge_pan", True)])]}),
+ ("node.translate_attach",
+ {"type": 'EVT_TWEAK_L', "value": 'ANY'},
+ {"properties": [("TRANSFORM_OT_translate", [("view2d_edge_pan", True)])]}),
+ ("node.translate_attach",
+ {"type": params.select_tweak, "value": 'ANY'},
+ {"properties": [("TRANSFORM_OT_translate", [("view2d_edge_pan", True)])]}),
("transform.translate", {"type": 'G', "value": 'PRESS'}, {"properties": [("view2d_edge_pan", True)]}),
("transform.translate", {"type": 'EVT_TWEAK_L', "value": 'ANY'},
{"properties": [("release_confirm", True), ("view2d_edge_pan", True)]}),
@@ -1906,9 +2002,15 @@ def km_node_editor(params):
{"properties": [("release_confirm", True), ("view2d_edge_pan", True)]}),
("transform.rotate", {"type": 'R', "value": 'PRESS'}, None),
("transform.resize", {"type": 'S', "value": 'PRESS'}, None),
- ("node.move_detach_links", {"type": 'D', "value": 'PRESS', "alt": True}, {"properties": [("TRANSFORM_OT_translate", [("view2d_edge_pan", True)])]}),
- ("node.move_detach_links_release", {"type": params.action_tweak, "value": 'ANY', "alt": True}, {"properties": [("NODE_OT_translate_attach", [("TRANSFORM_OT_translate", [("view2d_edge_pan", True)])])]}),
- ("node.move_detach_links", {"type": params.select_tweak, "value": 'ANY', "alt": True}, {"properties": [("TRANSFORM_OT_translate", [("view2d_edge_pan", True)])]}),
+ ("node.move_detach_links",
+ {"type": 'D', "value": 'PRESS', "alt": True},
+ {"properties": [("TRANSFORM_OT_translate", [("view2d_edge_pan", True)])]}),
+ ("node.move_detach_links_release",
+ {"type": params.action_tweak, "value": 'ANY', "alt": True},
+ {"properties": [("NODE_OT_translate_attach", [("TRANSFORM_OT_translate", [("view2d_edge_pan", True)])])]}),
+ ("node.move_detach_links",
+ {"type": params.select_tweak, "value": 'ANY', "alt": True},
+ {"properties": [("TRANSFORM_OT_translate", [("view2d_edge_pan", True)])]}),
("wm.context_toggle", {"type": 'TAB', "value": 'PRESS', "shift": True},
{"properties": [("data_path", 'tool_settings.use_snap')]}),
("wm.context_menu_enum", {"type": 'TAB', "value": 'PRESS', "shift": True, "ctrl": True},
@@ -1958,7 +2060,7 @@ def km_file_browser(params):
toolbar_key={"type": 'T', "value": 'PRESS'},
),
("wm.context_toggle", {"type": 'N', "value": 'PRESS'},
- {"properties": [("data_path", 'space_data.show_region_tool_props')]}),
+ {"properties": [("data_path", 'space_data.show_region_tool_props')]}),
("file.parent", {"type": 'UP_ARROW', "value": 'PRESS', "alt": True}, None),
("file.previous", {"type": 'LEFT_ARROW', "value": 'PRESS', "alt": True}, None),
("file.next", {"type": 'RIGHT_ARROW', "value": 'PRESS', "alt": True}, None),
@@ -1991,7 +2093,10 @@ def km_file_browser(params):
# Select file under cursor before spawning the context menu.
("file.select", {"type": 'RIGHTMOUSE', "value": 'PRESS'},
- {"properties": [("open", False), ("only_activate_if_selected", params.select_mouse == 'LEFTMOUSE'), ("pass_through", True)]}),
+ {"properties": [
+ ("open", False),
+ ("only_activate_if_selected", params.select_mouse == 'LEFTMOUSE'), ("pass_through", True),
+ ]}),
*_template_items_context_menu("FILEBROWSER_MT_context_menu", params.context_menu_event),
*_template_items_context_menu("ASSETBROWSER_MT_context_menu", params.context_menu_event),
])
@@ -2117,17 +2222,23 @@ def km_dopesheet(params):
)
items.extend([
- ("action.clickselect", {"type": params.select_mouse, "value": 'PRESS'},
+ ("action.clickselect",
+ {"type": params.select_mouse, "value": 'PRESS'},
{"properties": [("deselect_all", not params.legacy)]}),
- ("action.clickselect", {"type": params.select_mouse, "value": 'PRESS', "alt": True},
+ ("action.clickselect",
+ {"type": params.select_mouse, "value": 'PRESS', "alt": True},
{"properties": [("column", True)]}),
- ("action.clickselect", {"type": params.select_mouse, "value": 'PRESS', "shift": True},
+ ("action.clickselect",
+ {"type": params.select_mouse, "value": 'PRESS', "shift": True},
{"properties": [("extend", True)]}),
- ("action.clickselect", {"type": params.select_mouse, "value": 'PRESS', "shift": True, "alt": True},
+ ("action.clickselect",
+ {"type": params.select_mouse, "value": 'PRESS', "shift": True, "alt": True},
{"properties": [("extend", True), ("column", True)]}),
- ("action.clickselect", {"type": params.select_mouse, "value": 'PRESS', "ctrl": True, "alt": True},
+ ("action.clickselect",
+ {"type": params.select_mouse, "value": 'PRESS', "ctrl": True, "alt": True},
{"properties": [("channel", True)]}),
- ("action.clickselect", {"type": params.select_mouse, "value": 'PRESS', "shift": True, "ctrl": True, "alt": True},
+ ("action.clickselect",
+ {"type": params.select_mouse, "value": 'PRESS', "shift": True, "ctrl": True, "alt": True},
{"properties": [("extend", True), ("channel", True)]}),
("action.select_leftright",
{"type": params.select_mouse, "value": 'PRESS' if params.legacy else 'CLICK', "ctrl": True},
@@ -2524,6 +2635,9 @@ def km_sequencercommon(params):
("wm.context_toggle_enum", {"type": 'TAB', "value": 'PRESS', "ctrl": True},
{"properties": [("data_path", 'space_data.view_type'), ("value_1", 'SEQUENCER'), ("value_2", 'PREVIEW')]}),
("sequencer.refresh_all", {"type": 'R', "value": 'PRESS', "ctrl": True}, None),
+ ("sequencer.select", {"type": params.select_mouse, "value": 'PRESS'}, None),
+ ("sequencer.select", {"type": params.select_mouse, "value": 'PRESS', "shift": True},
+ {"properties": [("extend", True)]}),
])
if params.select_mouse == 'LEFTMOUSE' and not params.legacy:
@@ -2606,9 +2720,6 @@ def km_sequencer(params):
for i in range(10)
)
),
- ("sequencer.select", {"type": params.select_mouse, "value": 'PRESS'}, None),
- ("sequencer.select", {"type": params.select_mouse, "value": 'PRESS', "shift": True},
- {"properties": [("extend", True)]}),
("sequencer.select", {"type": params.select_mouse, "value": 'PRESS', "alt": True},
{"properties": [("linked_handle", True)]}),
("sequencer.select", {"type": params.select_mouse, "value": 'PRESS', "shift": True, "alt": True},
@@ -2685,6 +2796,21 @@ def km_sequencerpreview(params):
("sequencer.view_zoom_ratio", {"type": 'NUMPAD_8', "value": 'PRESS'},
{"properties": [("ratio", 0.125)]}),
("sequencer.sample", {"type": params.action_mouse, "value": 'PRESS'}, None),
+ op_tool_optional(
+ ("transform.translate", {"type": 'G', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.move"), params),
+ op_tool_optional(
+ ("transform.rotate", {"type": 'R', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.rotate"), params),
+ op_tool_optional(
+ ("transform.resize", {"type": 'S', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.scale"), params),
+ ("sequencer.strip_transform_clear", {"type": 'G', "alt": True, "value": 'PRESS'},
+ {"properties": [("property", 'POSITION')]}),
+ ("sequencer.strip_transform_clear", {"type": 'S', "alt": True, "value": 'PRESS'},
+ {"properties": [("property", 'SCALE')]}),
+ ("sequencer.strip_transform_clear", {"type": 'R', "alt": True, "value": 'PRESS'},
+ {"properties": [("property", 'ROTATION')]}),
])
return keymap
@@ -2867,11 +2993,7 @@ def km_clip_editor(params):
{"properties": [("action", 'LOCK')]}),
("clip.lock_tracks", {"type": 'L', "value": 'PRESS', "alt": True},
{"properties": [("action", 'UNLOCK')]}),
- ("clip.hide_tracks", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("clip.hide_tracks", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
- ("clip.hide_tracks_clear", {"type": 'H', "value": 'PRESS', "alt": True}, None),
+ *_template_items_hide_reveal_actions("clip.hide_tracks", "clip.hide_tracks_clear"),
("clip.slide_plane_marker", {"type": 'LEFTMOUSE', "value": 'CLICK_DRAG'}, None),
("clip.keyframe_insert", {"type": 'I', "value": 'PRESS'}, None),
("clip.keyframe_delete", {"type": 'I', "value": 'PRESS', "alt": True}, None),
@@ -2987,6 +3109,7 @@ def km_clip_dopesheet_editor(_params):
return keymap
+
def km_spreadsheet_generic(_params):
items = []
keymap = (
@@ -3175,7 +3298,7 @@ def km_animation_channels(params):
# Modes
-def km_grease_pencil(_params):
+def km_grease_pencil(params):
items = []
keymap = (
"Grease Pencil",
@@ -3183,34 +3306,48 @@ def km_grease_pencil(_params):
{"items": items},
)
- items.extend([
- # Draw
- ("gpencil.annotate", {"type": 'LEFTMOUSE', "value": 'PRESS', "key_modifier": 'D'},
- {"properties": [("mode", 'DRAW'), ("wait_for_input", False)]}),
- ("gpencil.annotate", {"type": 'LEFTMOUSE', "value": 'PRESS', "key_modifier": 'D', "shift": True},
- {"properties": [("mode", 'DRAW'), ("wait_for_input", False)]}),
- # Draw - straight lines
- ("gpencil.annotate", {"type": 'LEFTMOUSE', "value": 'PRESS', "alt": True, "key_modifier": 'D'},
- {"properties": [("mode", 'DRAW_STRAIGHT'), ("wait_for_input", False)]}),
- # Draw - poly lines
- ("gpencil.annotate", {"type": 'LEFTMOUSE', "value": 'PRESS', "shift": True, "alt": True, "key_modifier": 'D'},
- {"properties": [("mode", 'DRAW_POLY'), ("wait_for_input", False)]}),
- # Erase
- ("gpencil.annotate", {"type": 'RIGHTMOUSE', "value": 'PRESS', "key_modifier": 'D'},
- {"properties": [("mode", 'ERASER'), ("wait_for_input", False)]}),
- ])
+ if params.use_key_activate_tools:
+ items.extend([
+ op_tool_cycle("builtin.annotate", {"type": 'D', "value": 'PRESS'}),
+ ])
+ else:
+ items.extend([
+ # Draw
+ ("gpencil.annotate",
+ {"type": 'LEFTMOUSE', "value": 'PRESS', "key_modifier": 'D'},
+ {"properties": [("mode", 'DRAW'), ("wait_for_input", False)]}),
+ ("gpencil.annotate",
+ {"type": 'LEFTMOUSE', "value": 'PRESS', "key_modifier": 'D', "shift": True},
+ {"properties": [("mode", 'DRAW'), ("wait_for_input", False)]}),
+ # Draw - straight lines
+ ("gpencil.annotate",
+ {"type": 'LEFTMOUSE', "value": 'PRESS', "alt": True, "key_modifier": 'D'},
+ {"properties": [("mode", 'DRAW_STRAIGHT'), ("wait_for_input", False)]}),
+ # Draw - poly lines
+ ("gpencil.annotate",
+ {"type": 'LEFTMOUSE', "value": 'PRESS', "shift": True, "alt": True, "key_modifier": 'D'},
+ {"properties": [("mode", 'DRAW_POLY'), ("wait_for_input", False)]}),
+ # Erase
+ ("gpencil.annotate",
+ {"type": 'RIGHTMOUSE', "value": 'PRESS', "key_modifier": 'D'},
+ {"properties": [("mode", 'ERASER'), ("wait_for_input", False)]}),
+ ])
return keymap
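With `use_key_activate_tools` enabled, the five D-key annotate bindings above collapse into a single tool activation. Given `op_tool_cycle` as defined earlier in this file, the one item added is:

# op_tool_cycle("builtin.annotate", {"type": 'D', "value": 'PRESS'}) evaluates to:
("wm.tool_set_by_id",
 {"type": 'D', "value": 'PRESS'},
 {"properties": [("name", "builtin.annotate"), ("cycle", True)]})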
-def _grease_pencil_selection(params):
+def _grease_pencil_selection(params, use_select_mouse=True):
return [
# Select all
*_template_items_select_actions(params, "gpencil.select_all"),
# Circle select
- ("gpencil.select_circle", {"type": 'C', "value": 'PRESS'}, None),
+ op_tool_optional(
+ ("gpencil.select_circle", {"type": 'C', "value": 'PRESS'}, None),
+ (op_tool, "builtin.select_circle"), params),
# Box select
- ("gpencil.select_box", {"type": 'B', "value": 'PRESS'}, None),
+ op_tool_optional(
+ ("gpencil.select_box", {"type": 'B', "value": 'PRESS'}, None),
+ (op_tool, "builtin.select_box"), params),
# Lasso select
("gpencil.select_lasso", {"type": params.action_tweak, "value": 'ANY', "ctrl": True},
{"properties": [("mode", 'ADD')]}),
@@ -3221,17 +3358,18 @@ def _grease_pencil_selection(params):
# There probably isn't too much harm adding this for other editors too
# as part of standard GP editing keymap. This hotkey combo doesn't seem
# to see much use under standard scenarios.
- ("gpencil.select_lasso", {"type": params.action_tweak, "value": 'ANY', "ctrl": True, "alt": True},
+ ("gpencil.select_lasso",
+ {"type": params.action_tweak, "value": 'ANY', "ctrl": True, "alt": True},
{"properties": [("mode", 'ADD')]}),
- ("gpencil.select_lasso", {"type": params.action_tweak, "value": 'ANY', "shift": True, "ctrl": True, "alt": True},
+ ("gpencil.select_lasso",
+ {"type": params.action_tweak, "value": 'ANY', "shift": True, "ctrl": True, "alt": True},
{"properties": [("mode", 'SUB')]}),
- ("gpencil.select", {"type": params.select_mouse, "value": params.select_mouse_value, "shift": True},
- {"properties": [("extend", True), ("toggle", True)]}),
- # Whole stroke select
- ("gpencil.select", {"type": params.select_mouse, "value": params.select_mouse_value, "alt": True},
- {"properties": [("entire_strokes", True)]}),
- ("gpencil.select", {"type": params.select_mouse, "value": params.select_mouse_value, "shift": True, "alt": True},
- {"properties": [("extend", True), ("entire_strokes", True)]}),
+ *_template_view3d_gpencil_select(
+ type=params.select_mouse,
+ value=params.select_mouse_value_fallback,
+ legacy=params.legacy,
+ use_select_mouse=use_select_mouse,
+ ),
# Select linked
("gpencil.select_linked", {"type": 'L', "value": 'PRESS'}, None),
("gpencil.select_linked", {"type": 'L', "value": 'PRESS', "ctrl": True}, None),
@@ -3264,17 +3402,18 @@ def km_grease_pencil_stroke_edit_mode(params):
items.extend([
# Interpolation
- ("gpencil.interpolate", {"type": 'E', "value": 'PRESS', "ctrl": True}, None),
+ op_tool_optional(
+ ("gpencil.interpolate", {"type": 'E', "value": 'PRESS', "ctrl": True}, None),
+ (op_tool_cycle, "builtin.interpolate"), params),
("gpencil.interpolate_sequence", {"type": 'E', "value": 'PRESS', "shift": True, "ctrl": True}, None),
- # Normal select
- ("gpencil.select", {"type": params.select_mouse, "value": params.select_mouse_value},
- {"properties": [("deselect_all", not params.legacy)]}),
# Selection
*_grease_pencil_selection(params),
# Duplicate and move selected points
("gpencil.duplicate_move", {"type": 'D', "value": 'PRESS', "shift": True}, None),
# Extrude and move selected points
- ("gpencil.extrude_move", {"type": 'E', "value": 'PRESS'}, None),
+ op_tool_optional(
+ ("gpencil.extrude_move", {"type": 'E', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.extrude"), params),
# Delete
op_menu("VIEW3D_MT_edit_gpencil_delete", {"type": 'X', "value": 'PRESS'}),
op_menu("VIEW3D_MT_edit_gpencil_delete", {"type": 'DEL', "value": 'PRESS'}),
@@ -3304,11 +3443,7 @@ def km_grease_pencil_stroke_edit_mode(params):
op_menu("GPENCIL_MT_snap", {"type": 'S', "value": 'PRESS', "shift": True})
),
# Show/hide
- ("gpencil.reveal", {"type": 'H', "value": 'PRESS', "alt": True}, None),
- ("gpencil.hide", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("gpencil.hide", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
+ *_template_items_hide_reveal_actions("gpencil.hide", "gpencil.reveal"),
("gpencil.selection_opacity_toggle", {"type": 'H', "value": 'PRESS', "ctrl": True}, None),
# Display
*_grease_pencil_display(),
@@ -3319,16 +3454,30 @@ def km_grease_pencil_stroke_edit_mode(params):
# Merge Layer
("gpencil.layer_merge", {"type": 'M', "value": 'PRESS', "shift": True, "ctrl": True}, None),
# Transform tools
- ("transform.translate", {"type": 'G', "value": 'PRESS'}, None),
("transform.translate", {"type": params.select_tweak, "value": 'ANY'}, None),
- ("transform.rotate", {"type": 'R', "value": 'PRESS'}, None),
- ("transform.resize", {"type": 'S', "value": 'PRESS'}, None),
+ op_tool_optional(
+ ("transform.translate", {"type": 'G', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.move"), params),
+ op_tool_optional(
+ ("transform.rotate", {"type": 'R', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.rotate"), params),
+ op_tool_optional(
+ ("transform.resize", {"type": 'S', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.scale"), params),
+ op_tool_optional(
+ ("transform.tosphere", {"type": 'S', "value": 'PRESS', "shift": True, "alt": True}, None),
+ (op_tool_cycle, "builtin.to_sphere"), params),
+ op_tool_optional(
+ ("transform.shear", {"type": 'S', "value": 'PRESS', "shift": True, "ctrl": True, "alt": True}, None),
+ (op_tool_cycle, "builtin.shear"), params),
("transform.mirror", {"type": 'M', "value": 'PRESS', "ctrl": True}, None),
- ("transform.bend", {"type": 'W', "value": 'PRESS', "shift": True}, None),
- ("transform.tosphere", {"type": 'S', "value": 'PRESS', "shift": True, "alt": True}, None),
- ("transform.shear", {"type": 'S', "value": 'PRESS', "shift": True, "ctrl": True, "alt": True}, None),
- ("transform.transform", {"type": 'S', "value": 'PRESS', "alt": True},
- {"properties": [("mode", 'GPENCIL_SHRINKFATTEN')]}),
+ op_tool_optional(
+ ("transform.bend", {"type": 'W', "value": 'PRESS', "shift": True}, None),
+ (op_tool_cycle, "builtin.bend"), params),
+ op_tool_optional(
+ ("transform.transform", {"type": 'S', "value": 'PRESS', "alt": True},
+ {"properties": [("mode", 'GPENCIL_SHRINKFATTEN')]}),
+ (op_tool_cycle, "builtin.radius"), params),
("transform.transform", {"type": 'F', "value": 'PRESS', "shift": True},
{"properties": [("mode", 'GPENCIL_OPACITY')]}),
# Proportional editing.
@@ -3364,6 +3513,7 @@ def km_grease_pencil_stroke_edit_mode(params):
return keymap
+
def km_grease_pencil_stroke_curve_edit_mode(_params):
items = []
keymap = (
@@ -3379,6 +3529,7 @@ def km_grease_pencil_stroke_curve_edit_mode(_params):
return keymap
+
def km_grease_pencil_stroke_paint_mode(params):
items = []
keymap = (
@@ -3406,14 +3557,12 @@ def km_grease_pencil_stroke_paint_mode(params):
("gpencil.active_frames_delete_all", {"type": 'X', "value": 'PRESS', "shift": True}, None),
("gpencil.active_frames_delete_all", {"type": 'DEL', "value": 'PRESS', "shift": True}, None),
# Interpolation
- ("gpencil.interpolate", {"type": 'E', "value": 'PRESS', "ctrl": True}, None),
+ op_tool_optional(
+ ("gpencil.interpolate", {"type": 'E', "value": 'PRESS', "ctrl": True}, None),
+ (op_tool_cycle, "builtin.interpolate"), params),
("gpencil.interpolate_sequence", {"type": 'E', "value": 'PRESS', "shift": True, "ctrl": True}, None),
# Show/hide
- ("gpencil.reveal", {"type": 'H', "value": 'PRESS', "alt": True}, None),
- ("gpencil.hide", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("gpencil.hide", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
+ *_template_items_hide_reveal_actions("gpencil.hide", "gpencil.reveal"),
# Active layer
op_menu("GPENCIL_MT_layer_active", {"type": 'Y', "value": 'PRESS'}),
# Merge Layer
@@ -3522,10 +3671,21 @@ def km_grease_pencil_stroke_paint_fill(_params):
{"properties": [("on_back", False)]}),
# If the alternate key is pressed, the brush is used for drawing areas.
("gpencil.draw", {"type": 'LEFTMOUSE', "value": 'PRESS', "shift": True},
- {"properties": [("mode", 'DRAW'), ("wait_for_input", False), ("disable_straight", True), ("disable_stabilizer", True)]}),
+ {"properties": [
+ ("mode", 'DRAW'),
+ ("wait_for_input", False),
+ ("disable_straight", True),
+ ("disable_stabilizer", True),
+ ]}),
# If the alternative key is pressed, the brush is used for drawing lines
("gpencil.draw", {"type": 'LEFTMOUSE', "value": 'PRESS', "alt": True},
- {"properties": [("mode", 'DRAW'), ("wait_for_input", False), ("disable_straight", True), ("disable_stabilizer", True), ("disable_fill", True)]}),
+ {"properties": [
+ ("mode", 'DRAW'),
+ ("wait_for_input", False),
+ ("disable_straight", True),
+ ("disable_stabilizer", True),
+ ("disable_fill", True),
+ ]}),
])
return keymap
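# Editor's note: the removed H / Shift-H / Alt-H items above are collapsed into
# `_template_items_hide_reveal_actions(op_hide, op_reveal)`, defined outside this
# diff. Judging from the lines it replaces, it presumably expands to:
#
#   def _template_items_hide_reveal_actions(op_hide, op_reveal):
#       return [
#           (op_hide, {"type": 'H', "value": 'PRESS'},
#            {"properties": [("unselected", False)]}),
#           (op_hide, {"type": 'H', "value": 'PRESS', "shift": True},
#            {"properties": [("unselected", True)]}),
#           (op_reveal, {"type": 'H', "value": 'PRESS', "alt": True}, None),
#       ]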
@@ -3560,7 +3720,7 @@ def km_grease_pencil_stroke_sculpt_mode(params):
items.extend([
# Selection
- *_grease_pencil_selection(params),
+ *_grease_pencil_selection(params, use_select_mouse=False),
# Brush strength
("wm.radial_control", {"type": 'F', "value": 'PRESS', "shift": True},
@@ -3846,10 +4006,12 @@ def km_grease_pencil_stroke_vertex_mode(params):
items.extend([
# Selection
- *_grease_pencil_selection(params),
+ *_grease_pencil_selection(params, use_select_mouse=False),
# Brush strength
("wm.radial_control", {"type": 'F', "value": 'PRESS', "shift": True},
- {"properties": [("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.gpencil_settings.pen_strength')]}),
+ {"properties": [
+ ("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.gpencil_settings.pen_strength'),
+ ]}),
# Brush size
("wm.radial_control", {"type": 'F', "value": 'PRESS'},
{"properties": [("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.size')]}),
@@ -3895,7 +4057,9 @@ def km_grease_pencil_stroke_vertex_draw(_params):
{"properties": [("wait_for_input", False)]}),
# Brush strength
("wm.radial_control", {"type": 'F', "value": 'PRESS', "shift": True},
- {"properties": [("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.gpencil_settings.pen_strength')]}),
+ {"properties": [
+ ("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.gpencil_settings.pen_strength'),
+ ]}),
# Brush size
("wm.radial_control", {"type": 'F', "value": 'PRESS'},
{"properties": [("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.size')]}),
@@ -3918,7 +4082,9 @@ def km_grease_pencil_stroke_vertex_blur(_params):
{"properties": [("wait_for_input", False)]}),
# Brush strength
("wm.radial_control", {"type": 'F', "value": 'PRESS', "shift": True},
- {"properties": [("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.gpencil_settings.pen_strength')]}),
+ {"properties": [
+ ("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.gpencil_settings.pen_strength'),
+ ]}),
# Brush size
("wm.radial_control", {"type": 'F', "value": 'PRESS'},
{"properties": [("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.size')]}),
@@ -3943,7 +4109,9 @@ def km_grease_pencil_stroke_vertex_average(_params):
{"properties": [("wait_for_input", False)]}),
# Brush strength
("wm.radial_control", {"type": 'F', "value": 'PRESS', "shift": True},
- {"properties": [("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.gpencil_settings.pen_strength')]}),
+ {"properties": [
+ ("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.gpencil_settings.pen_strength')],
+ }),
# Brush size
("wm.radial_control", {"type": 'F', "value": 'PRESS'},
{"properties": [("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.size')]}),
@@ -3966,7 +4134,9 @@ def km_grease_pencil_stroke_vertex_smear(_params):
{"properties": [("wait_for_input", False)]}),
# Brush strength
("wm.radial_control", {"type": 'F', "value": 'PRESS', "shift": True},
- {"properties": [("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.gpencil_settings.pen_strength')]}),
+ {"properties": [
+ ("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.gpencil_settings.pen_strength'),
+ ]}),
# Brush size
("wm.radial_control", {"type": 'F', "value": 'PRESS'},
{"properties": [("data_path_primary", 'tool_settings.gpencil_vertex_paint.brush.size')]}),
@@ -4005,11 +4175,7 @@ def km_face_mask(params):
items.extend([
*_template_items_select_actions(params, "paint.face_select_all"),
- ("paint.face_select_hide", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("paint.face_select_hide", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
- ("paint.face_select_reveal", {"type": 'H', "value": 'PRESS', "alt": True}, None),
+ *_template_items_hide_reveal_actions("paint.face_select_hide", "paint.face_select_reveal"),
("paint.face_select_linked", {"type": 'L', "value": 'PRESS', "ctrl": True}, None),
("paint.face_select_linked_pick", {"type": 'L', "value": 'PRESS'},
{"properties": [("deselect", False)]}),
@@ -4051,11 +4217,7 @@ def km_pose(params):
items.extend([
("object.parent_set", {"type": 'P', "value": 'PRESS', "ctrl": True}, None),
- ("pose.hide", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("pose.hide", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
- ("pose.reveal", {"type": 'H', "value": 'PRESS', "alt": True}, None),
+ *_template_items_hide_reveal_actions("pose.hide", "pose.reveal"),
op_menu("VIEW3D_MT_pose_apply", {"type": 'A', "value": 'PRESS', "ctrl": True}),
("pose.rot_clear", {"type": 'R', "value": 'PRESS', "alt": True}, None),
("pose.loc_clear", {"type": 'G', "value": 'PRESS', "alt": True}, None),
@@ -4103,6 +4265,7 @@ def km_pose(params):
("pose.push", {"type": 'E', "value": 'PRESS', "ctrl": True}, None),
("pose.relax", {"type": 'E', "value": 'PRESS', "alt": True}, None),
("pose.breakdown", {"type": 'E', "value": 'PRESS', "shift": True}, None),
+ ("pose.blend_to_neighbour", {"type": 'E', "value": 'PRESS', "shift": True, "alt": True}, None),
op_menu("VIEW3D_MT_pose_propagate", {"type": 'P', "value": 'PRESS', "alt": True}),
*(
(("object.hide_collection",
@@ -4170,17 +4333,15 @@ def km_object_mode(params):
("anim.keying_set_active_set", {"type": 'I', "value": 'PRESS', "shift": True, "ctrl": True, "alt": True}, None),
("collection.create", {"type": 'G', "value": 'PRESS', "ctrl": True}, None),
("collection.objects_remove", {"type": 'G', "value": 'PRESS', "ctrl": True, "alt": True}, None),
- ("collection.objects_remove_all", {"type": 'G', "value": 'PRESS', "shift": True, "ctrl": True, "alt": True}, None),
- ("collection.objects_add_active", {"type": 'G', "value": 'PRESS', "shift": True, "ctrl": True}, None),
+ ("collection.objects_remove_all",
+ {"type": 'G', "value": 'PRESS', "shift": True, "ctrl": True, "alt": True}, None),
+ ("collection.objects_add_active",
+ {"type": 'G', "value": 'PRESS', "shift": True, "ctrl": True}, None),
("collection.objects_remove_active", {"type": 'G', "value": 'PRESS', "shift": True, "alt": True}, None),
*_template_items_object_subdivision_set(),
("object.move_to_collection", {"type": 'M', "value": 'PRESS'}, None),
("object.link_to_collection", {"type": 'M', "value": 'PRESS', "shift": True}, None),
- ("object.hide_view_clear", {"type": 'H', "value": 'PRESS', "alt": True}, None),
- ("object.hide_view_set", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("object.hide_view_set", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
+ *_template_items_hide_reveal_actions("object.hide_view_set", "object.hide_view_clear"),
("object.hide_collection", {"type": 'H', "value": 'PRESS', "ctrl": True}, None),
*(
(("object.hide_collection",
@@ -4267,10 +4428,13 @@ def km_curve(params):
{"properties": [("deselect", False)]}),
("curve.select_linked_pick", {"type": 'L', "value": 'PRESS', "shift": True},
{"properties": [("deselect", True)]}),
- ("curve.shortest_path_pick", {"type": params.select_mouse, "value": params.select_mouse_value, "ctrl": True}, None),
+ ("curve.shortest_path_pick",
+ {"type": params.select_mouse, "value": params.select_mouse_value_fallback, "ctrl": True}, None),
("curve.separate", {"type": 'P', "value": 'PRESS'}, None),
("curve.split", {"type": 'Y', "value": 'PRESS'}, None),
- ("curve.extrude_move", {"type": 'E', "value": 'PRESS'}, None),
+ op_tool_optional(
+ ("curve.extrude_move", {"type": 'E', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.extrude"), params),
("curve.duplicate_move", {"type": 'D', "value": 'PRESS', "shift": True}, None),
("curve.make_segment", {"type": 'F', "value": 'PRESS'}, None),
("curve.cyclic_toggle", {"type": 'C', "value": 'PRESS', "alt": True}, None),
@@ -4279,15 +4443,14 @@ def km_curve(params):
("curve.dissolve_verts", {"type": 'X', "value": 'PRESS', "ctrl": True}, None),
("curve.dissolve_verts", {"type": 'DEL', "value": 'PRESS', "ctrl": True}, None),
("curve.tilt_clear", {"type": 'T', "value": 'PRESS', "alt": True}, None),
- ("transform.tilt", {"type": 'T', "value": 'PRESS', "ctrl": True}, None),
+ op_tool_optional(
+ ("transform.tilt", {"type": 'T', "value": 'PRESS', "ctrl": True}, None),
+ (op_tool_cycle, "builtin.tilt"), params),
("transform.transform", {"type": 'S', "value": 'PRESS', "alt": True},
{"properties": [("mode", 'CURVE_SHRINKFATTEN')]}),
- ("curve.reveal", {"type": 'H', "value": 'PRESS', "alt": True}, None),
- ("curve.hide", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("curve.hide", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
- ("curve.normals_make_consistent", {"type": 'N', "value": 'PRESS', "ctrl" if params.legacy else "shift": True}, None),
+ *_template_items_hide_reveal_actions("curve.hide", "curve.reveal"),
+ ("curve.normals_make_consistent",
+ {"type": 'N', "value": 'PRESS', "ctrl" if params.legacy else "shift": True}, None),
("object.vertex_parent_set", {"type": 'P', "value": 'PRESS', "ctrl": True}, None),
op_menu("VIEW3D_MT_hook", {"type": 'H', "value": 'PRESS', "ctrl": True}),
*_template_items_proportional_editing(
@@ -4328,9 +4491,11 @@ def _template_paint_radial_control(paint, rotation=False, secondary_rotation=Fal
items.extend([
("wm.radial_control", {"type": 'F', "value": 'PRESS'},
- radial_control_properties(paint, 'size', 'use_unified_size', secondary_rotation=secondary_rotation, color=color, zoom=zoom)),
+ radial_control_properties(
+ paint, 'size', 'use_unified_size', secondary_rotation=secondary_rotation, color=color, zoom=zoom)),
("wm.radial_control", {"type": 'F', "value": 'PRESS', "shift": True},
- radial_control_properties(paint, 'strength', 'use_unified_strength', secondary_rotation=secondary_rotation, color=color)),
+ radial_control_properties(
+ paint, 'strength', 'use_unified_strength', secondary_rotation=secondary_rotation, color=color)),
])
if rotation:
@@ -4342,12 +4507,87 @@ def _template_paint_radial_control(paint, rotation=False, secondary_rotation=Fal
if secondary_rotation:
items.extend([
("wm.radial_control", {"type": 'F', "value": 'PRESS', "ctrl": True, "alt": True},
- radial_control_properties(paint, 'mask_texture_slot.angle', None, secondary_rotation=secondary_rotation, color=color)),
+ radial_control_properties(
+ paint, 'mask_texture_slot.angle', None, secondary_rotation=secondary_rotation, color=color)),
])
return items
+def _template_view3d_select(*, type, value, legacy):
+ return [(
+ "view3d.select",
+ {"type": type, "value": value, **{m: True for m in mods}},
+ {"properties": [(c, True) for c in props]},
+ ) for props, mods in (
+ (("deselect_all",) if not legacy else (), ()),
+ (("toggle",), ("shift",)),
+ (("center", "object"), ("ctrl",)),
+ (("enumerate",), ("alt",)),
+ (("toggle", "center"), ("shift", "ctrl")),
+ (("center", "enumerate"), ("ctrl", "alt")),
+ (("toggle", "enumerate"), ("shift", "alt")),
+ (("toggle", "center", "enumerate"), ("shift", "ctrl", "alt")),
+ )]
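# Editor's note: for illustration, with type='LEFTMOUSE', value='CLICK' and
# legacy=False the comprehension above yields items such as:
#
#   ("view3d.select", {"type": 'LEFTMOUSE', "value": 'CLICK'},
#    {"properties": [("deselect_all", True)]}),
#   ("view3d.select", {"type": 'LEFTMOUSE', "value": 'CLICK', "shift": True},
#    {"properties": [("toggle", True)]}),
#   ("view3d.select", {"type": 'LEFTMOUSE', "value": 'CLICK', "ctrl": True},
#    {"properties": [("center", True), ("object", True)]}),
#
# ...and so on through the remaining shift/ctrl/alt combinations listed.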
+
+
+def _template_view3d_select_for_fallback(params, fallback):
+ if (not fallback) and params.use_fallback_tool_rmb:
+ # Needed so we have immediate select+tweak when the default select tool is active.
+ return _template_view3d_select(
+ type=params.select_mouse,
+ value=params.select_mouse_value,
+ legacy=params.legacy,
+ )
+ return []
+
+
+def _template_view3d_gpencil_select(*, type, value, legacy, use_select_mouse=True):
+ return [
+ *([] if not use_select_mouse else [
+ ("gpencil.select", {"type": type, "value": value},
+ {"properties": [("deselect_all", not legacy)]})]),
+ ("gpencil.select", {"type": type, "value": value, "shift": True},
+ {"properties": [("extend", True), ("toggle", True)]}),
+ # Whole stroke select
+ ("gpencil.select", {"type": type, "value": value, "alt": True},
+ {"properties": [("entire_strokes", True)]}),
+ ("gpencil.select", {"type": type, "value": value, "shift": True, "alt": True},
+ {"properties": [("extend", True), ("entire_strokes", True)]}),
+ ]
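# Editor's note: with `use_select_mouse=False` the unmodified click item is
# omitted, leaving only the shift/alt whole-stroke variants; presumably this
# keeps the plain select button free for the active brush or tool. Illustration:
#
#   _template_view3d_gpencil_select(
#       type='LEFTMOUSE', value='PRESS', legacy=False, use_select_mouse=False)
#   # -> only the "shift", "alt" and "shift+alt" gpencil.select items.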
+
+
+def _template_view3d_gpencil_select_for_fallback(params, fallback):
+ if (not fallback) and params.use_fallback_tool_rmb:
+ # Needed so we have immediate select+tweak when the default select tool is active.
+ return _template_view3d_gpencil_select(
+ type=params.select_mouse,
+ value=params.select_mouse_value,
+ legacy=params.legacy,
+ )
+ return []
+
+
+def _template_uv_select(*, type, value, legacy):
+ return [
+ ("uv.select", {"type": type, "value": value},
+ {"properties": [("deselect_all", not legacy)]}),
+ ("uv.select", {"type": type, "value": value, "shift": True},
+ {"properties": [("extend", True)]}),
+ ]
+
+
+def _template_uv_select_for_fallback(params, fallback):
+ if (not fallback) and params.use_fallback_tool_rmb:
+ # Needed so we have immediate select+tweak when the default select tool is active.
+ return _template_uv_select(
+ type=params.select_mouse,
+ value=params.select_mouse_value,
+ legacy=params.legacy,
+ )
+ return []
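# Editor's note: the three `*_for_fallback` helpers above share one pattern:
# when building the regular (non-fallback) tool keymap and right-click select
# is configured as the fallback tool, the plain select items are appended so
# select+tweak works immediately. A generic sketch of the shared shape
# (`template_fn` is a hypothetical parameter; the real helpers are specialized):
#
#   def _template_select_for_fallback(params, fallback, template_fn):
#       if (not fallback) and params.use_fallback_tool_rmb:
#           return template_fn(
#               type=params.select_mouse,
#               value=params.select_mouse_value,
#               legacy=params.legacy)
#       return []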
+
+
def km_image_paint(params):
items = []
keymap = (
@@ -4521,16 +4761,27 @@ def km_sculpt(params):
("sculpt.expand", {"type": 'A', "value": 'PRESS', "shift": True, "alt": True},
{"properties": [("target", "MASK"), ("falloff_type", "NORMALS"), ("invert", False)]}),
("sculpt.expand", {"type": 'W', "value": 'PRESS', "shift": True},
- {"properties": [("target", "FACE_SETS"), ("falloff_type", "GEODESIC"), ("invert", False), ("use_modify_active", False)]}),
+ {"properties": [
+ ("target", "FACE_SETS"),
+ ("falloff_type", "GEODESIC"),
+ ("invert", False),
+ ("use_modify_active", False)]}),
("sculpt.expand", {"type": 'W', "value": 'PRESS', "shift": True, "alt": True},
- {"properties": [("target", "FACE_SETS"), ("falloff_type", "BOUNDARY_FACE_SET"),("invert", False), ("use_modify_active", True)]}),
+ {"properties": [
+ ("target", "FACE_SETS"),
+ ("falloff_type", "BOUNDARY_FACE_SET"),
+ ("invert", False),
+ ("use_modify_active", True),
+ ]}),
# Partial Visibility Show/hide
+ # Match keys from `_template_items_hide_reveal_actions`; it can't be used here because the arguments aren't compatible.
("sculpt.face_set_change_visibility", {"type": 'H', "value": 'PRESS'},
{"properties": [("mode", 'TOGGLE')]}),
("sculpt.face_set_change_visibility", {"type": 'H', "value": 'PRESS', "shift": True},
{"properties": [("mode", 'HIDE_ACTIVE')]}),
("sculpt.face_set_change_visibility", {"type": 'H', "value": 'PRESS', "alt": True},
{"properties": [("mode", 'SHOW_ALL')]}),
+
("sculpt.face_set_edit", {"type": 'W', "value": 'PRESS', "ctrl": True},
{"properties": [("mode", 'GROW')]}),
("sculpt.face_set_edit", {"type": 'W', "value": 'PRESS', "ctrl": True, "alt": True},
@@ -4610,7 +4861,7 @@ def km_sculpt(params):
{"properties": [("data_path", 'tool_settings.sculpt.brush.use_smooth_stroke')]}),
op_menu("VIEW3D_MT_angle_control", {"type": 'R', "value": 'PRESS'}),
op_menu_pie("VIEW3D_MT_sculpt_mask_edit_pie", {"type": 'A', "value": 'PRESS'}),
- op_menu_pie("VIEW3D_MT_sculpt_automasking_pie", {"type": 'A', "alt": True,"value": 'PRESS'}),
+ op_menu_pie("VIEW3D_MT_sculpt_automasking_pie", {"type": 'A', "alt": True, "value": 'PRESS'}),
op_menu_pie("VIEW3D_MT_sculpt_face_sets_edit_pie", {"type": 'W', "value": 'PRESS'}),
*_template_items_context_panel("VIEW3D_PT_sculpt_context_menu", params.context_menu_event),
])
@@ -4632,34 +4883,50 @@ def km_mesh(params):
items.extend([
# Tools.
- ("mesh.loopcut_slide", {"type": 'R', "value": 'PRESS', "ctrl": True},
- {"properties": [("TRANSFORM_OT_edge_slide", [("release_confirm", False)],)]}),
- ("mesh.offset_edge_loops_slide", {"type": 'R', "value": 'PRESS', "shift": True, "ctrl": True},
- {"properties": [("TRANSFORM_OT_edge_slide", [("release_confirm", False)],)]}),
- ("mesh.inset", {"type": 'I', "value": 'PRESS'}, None),
- ("mesh.bevel", {"type": 'B', "value": 'PRESS', "ctrl": True},
- {"properties": [("affect", 'EDGES')]}),
+ op_tool_optional(
+ ("mesh.loopcut_slide", {"type": 'R', "value": 'PRESS', "ctrl": True},
+ {"properties": [("TRANSFORM_OT_edge_slide", [("release_confirm", False)],)]}),
+ (op_tool_cycle, "builtin.loop_cut"), params),
+ op_tool_optional(
+ ("mesh.offset_edge_loops_slide", {"type": 'R', "value": 'PRESS', "shift": True, "ctrl": True},
+ {"properties": [("TRANSFORM_OT_edge_slide", [("release_confirm", False)],)]}),
+ (op_tool_cycle, "builtin.offset_edge_loop_cut"), params),
+ op_tool_optional(
+ ("mesh.inset", {"type": 'I', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.inset_faces"), params),
+ op_tool_optional(
+ ("mesh.bevel", {"type": 'B', "value": 'PRESS', "ctrl": True},
+ {"properties": [("affect", 'EDGES')]}),
+ (op_tool_cycle, "builtin.bevel"), params),
("mesh.bevel", {"type": 'B', "value": 'PRESS', "shift": True, "ctrl": True},
{"properties": [("affect", 'VERTICES')]}),
# Selection modes.
*_template_items_editmode_mesh_select_mode(params),
# Loop Select with alt. Double click in case MMB emulation is on (below).
- ("mesh.loop_select", {"type": params.select_mouse, "value": params.select_mouse_value, "alt": True}, None),
- ("mesh.loop_select", {"type": params.select_mouse, "value": params.select_mouse_value, "shift": True, "alt": True},
+ ("mesh.loop_select",
+ {"type": params.select_mouse, "value": params.select_mouse_value, "alt": True}, None),
+ ("mesh.loop_select",
+ {"type": params.select_mouse, "value": params.select_mouse_value, "shift": True, "alt": True},
{"properties": [("toggle", True)]}),
# Selection
- ("mesh.edgering_select", {"type": params.select_mouse, "value": params.select_mouse_value, "ctrl": True, "alt": True}, None),
- ("mesh.edgering_select", {"type": params.select_mouse, "value": params.select_mouse_value, "shift": True, "ctrl": True, "alt": True},
+ ("mesh.edgering_select",
+ {"type": params.select_mouse, "value": params.select_mouse_value, "ctrl": True, "alt": True}, None),
+ ("mesh.edgering_select",
+ {"type": params.select_mouse, "value": params.select_mouse_value, "shift": True, "ctrl": True, "alt": True},
{"properties": [("toggle", True)]}),
- ("mesh.shortest_path_pick", {"type": params.select_mouse, "value": params.select_mouse_value, "ctrl": True},
+ ("mesh.shortest_path_pick",
+ {"type": params.select_mouse, "value": params.select_mouse_value_fallback, "ctrl": True},
{"properties": [("use_fill", False)]}),
- ("mesh.shortest_path_pick", {"type": params.select_mouse, "value": params.select_mouse_value, "shift": True, "ctrl": True},
+ ("mesh.shortest_path_pick",
+ {"type": params.select_mouse, "value": params.select_mouse_value_fallback, "shift": True, "ctrl": True},
{"properties": [("use_fill", True)]}),
*_template_items_select_actions(params, "mesh.select_all"),
("mesh.select_more", {"type": 'NUMPAD_PLUS', "value": 'PRESS', "ctrl": True, "repeat": True}, None),
("mesh.select_less", {"type": 'NUMPAD_MINUS', "value": 'PRESS', "ctrl": True, "repeat": True}, None),
- ("mesh.select_next_item", {"type": 'NUMPAD_PLUS', "value": 'PRESS', "shift": True, "ctrl": True, "repeat": True}, None),
- ("mesh.select_prev_item", {"type": 'NUMPAD_MINUS', "value": 'PRESS', "shift": True, "ctrl": True, "repeat": True}, None),
+ ("mesh.select_next_item",
+ {"type": 'NUMPAD_PLUS', "value": 'PRESS', "shift": True, "ctrl": True, "repeat": True}, None),
+ ("mesh.select_prev_item",
+ {"type": 'NUMPAD_MINUS', "value": 'PRESS', "shift": True, "ctrl": True, "repeat": True}, None),
("mesh.select_linked", {"type": 'L', "value": 'PRESS', "ctrl": True}, None),
("mesh.select_linked_pick", {"type": 'L', "value": 'PRESS'},
{"properties": [("deselect", False)]}),
@@ -4668,17 +4935,15 @@ def km_mesh(params):
("mesh.select_mirror", {"type": 'M', "value": 'PRESS', "shift": True, "ctrl": True}, None),
op_menu("VIEW3D_MT_edit_mesh_select_similar", {"type": 'G', "value": 'PRESS', "shift": True}),
# Hide/reveal.
- ("mesh.hide", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("mesh.hide", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
- ("mesh.reveal", {"type": 'H', "value": 'PRESS', "alt": True}, None),
+ *_template_items_hide_reveal_actions("mesh.hide", "mesh.reveal"),
# Tools.
("mesh.normals_make_consistent", {"type": 'N', "value": 'PRESS', "ctrl" if params.legacy else "shift": True},
{"properties": [("inside", False)]}),
("mesh.normals_make_consistent", {"type": 'N', "value": 'PRESS', "shift": True, "ctrl": True},
{"properties": [("inside", True)]}),
- ("view3d.edit_mesh_extrude_move_normal", {"type": 'E', "value": 'PRESS'}, None),
+ op_tool_optional(
+ ("view3d.edit_mesh_extrude_move_normal", {"type": 'E', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.extrude_region"), params),
op_menu("VIEW3D_MT_edit_mesh_extrude", {"type": 'E', "value": 'PRESS', "alt": True}),
("transform.edge_crease", {"type": 'E', "value": 'PRESS', "shift": True}, None),
("mesh.fill", {"type": 'F', "value": 'PRESS', "alt": True}, None),
@@ -4687,8 +4952,11 @@ def km_mesh(params):
("mesh.quads_convert_to_tris", {"type": 'T', "value": 'PRESS', "shift": True, "ctrl": True},
{"properties": [("quad_method", 'FIXED'), ("ngon_method", 'CLIP')]}),
("mesh.tris_convert_to_quads", {"type": 'J', "value": 'PRESS', "alt": True}, None),
- ("mesh.rip_move", {"type": 'V', "value": 'PRESS'},
- {"properties": [("MESH_OT_rip", [("use_fill", False)],)]}),
+ op_tool_optional(
+ ("mesh.rip_move", {"type": 'V', "value": 'PRESS'},
+ {"properties": [("MESH_OT_rip", [("use_fill", False)],)]}),
+ (op_tool_cycle, "builtin.rip_region"), params),
+ # No tool is available for this.
("mesh.rip_move", {"type": 'V', "value": 'PRESS', "alt": True},
{"properties": [("MESH_OT_rip", [("use_fill", True)],)]}),
("mesh.rip_edge_move", {"type": 'D', "value": 'PRESS', "alt": True}, None),
@@ -4702,7 +4970,9 @@ def km_mesh(params):
("mesh.split", {"type": 'Y', "value": 'PRESS'}, None),
("mesh.vert_connect_path", {"type": 'J', "value": 'PRESS'}, None),
("mesh.point_normals", {"type": 'L', "value": 'PRESS', "alt": True}, None),
- ("transform.vert_slide", {"type": 'V', "value": 'PRESS', "shift": True}, None),
+ op_tool_optional(
+ ("transform.vert_slide", {"type": 'V', "value": 'PRESS', "shift": True}, None),
+ (op_tool_cycle, "builtin.vertex_slide"), params),
("mesh.dupli_extrude_cursor", {"type": params.action_mouse, "value": 'CLICK', "ctrl": True},
{"properties": [("rotate_source", True)]}),
("mesh.dupli_extrude_cursor", {"type": params.action_mouse, "value": 'CLICK', "shift": True, "ctrl": True},
@@ -4711,8 +4981,10 @@ def km_mesh(params):
op_menu("VIEW3D_MT_edit_mesh_delete", {"type": 'DEL', "value": 'PRESS'}),
("mesh.dissolve_mode", {"type": 'X', "value": 'PRESS', "ctrl": True}, None),
("mesh.dissolve_mode", {"type": 'DEL', "value": 'PRESS', "ctrl": True}, None),
- ("mesh.knife_tool", {"type": 'K', "value": 'PRESS'},
- {"properties": [("use_occlude_geometry", True), ("only_selected", False)]}),
+ op_tool_optional(
+ ("mesh.knife_tool", {"type": 'K', "value": 'PRESS'},
+ {"properties": [("use_occlude_geometry", True), ("only_selected", False)]}),
+ (op_tool_cycle, "builtin.knife"), params),
("mesh.knife_tool", {"type": 'K', "value": 'PRESS', "shift": True},
{"properties": [("use_occlude_geometry", False), ("only_selected", True)]}),
("object.vertex_parent_set", {"type": 'P', "value": 'PRESS', "ctrl": True}, None),
@@ -4737,16 +5009,20 @@ def km_mesh(params):
{"properties": [("extend", True)]}),
("mesh.loop_select", {"type": params.select_mouse, "value": 'DOUBLE_CLICK', "alt": True},
{"properties": [("deselect", True)]}),
- ("mesh.edgering_select", {"type": params.select_mouse, "value": 'DOUBLE_CLICK', "ctrl": True}, None),
- ("mesh.edgering_select", {"type": params.select_mouse, "value": 'DOUBLE_CLICK', "shift": True, "ctrl": True},
+ ("mesh.edgering_select",
+ {"type": params.select_mouse, "value": 'DOUBLE_CLICK', "ctrl": True}, None),
+ ("mesh.edgering_select",
+ {"type": params.select_mouse, "value": 'DOUBLE_CLICK', "shift": True, "ctrl": True},
{"properties": [("toggle", True)]}),
])
if params.legacy:
items.extend([
("mesh.poke", {"type": 'P', "value": 'PRESS', "alt": True}, None),
- ("mesh.select_non_manifold", {"type": 'M', "value": 'PRESS', "shift": True, "ctrl": True, "alt": True}, None),
- ("mesh.faces_select_linked_flat", {"type": 'F', "value": 'PRESS', "shift": True, "ctrl": True, "alt": True}, None),
+ ("mesh.select_non_manifold",
+ {"type": 'M', "value": 'PRESS', "shift": True, "ctrl": True, "alt": True}, None),
+ ("mesh.faces_select_linked_flat",
+ {"type": 'F', "value": 'PRESS', "shift": True, "ctrl": True, "alt": True}, None),
("mesh.spin", {"type": 'R', "value": 'PRESS', "alt": True}, None),
("mesh.beautify_fill", {"type": 'F', "value": 'PRESS', "shift": True, "alt": True}, None),
*_template_items_object_subdivision_set(),
@@ -4766,11 +5042,7 @@ def km_armature(params):
items.extend([
# Hide/reveal.
- ("armature.hide", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("armature.hide", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
- ("armature.reveal", {"type": 'H', "value": 'PRESS', "alt": True}, None),
+ *_template_items_hide_reveal_actions("armature.hide", "armature.reveal"),
# Align & roll.
("armature.align", {"type": 'A', "value": 'PRESS', "ctrl": True, "alt": True}, None),
("armature.calculate_roll", {"type": 'N', "value": 'PRESS', "ctrl" if params.legacy else "shift": True}, None),
@@ -4801,14 +5073,17 @@ def km_armature(params):
("armature.select_linked_pick", {"type": 'L', "value": 'PRESS', "shift": True},
{"properties": [("deselect", True)]}),
("armature.select_linked", {"type": 'L', "value": 'PRESS', "ctrl": True}, None),
- ("armature.shortest_path_pick", {"type": params.select_mouse, "value": params.select_mouse_value, "ctrl": True}, None),
+ ("armature.shortest_path_pick",
+ {"type": params.select_mouse, "value": params.select_mouse_value_fallback, "ctrl": True}, None),
# Editing.
op_menu("VIEW3D_MT_edit_armature_delete", {"type": 'X', "value": 'PRESS'}),
op_menu("VIEW3D_MT_edit_armature_delete", {"type": 'DEL', "value": 'PRESS'}),
("armature.duplicate_move", {"type": 'D', "value": 'PRESS', "shift": True}, None),
("armature.dissolve", {"type": 'X', "value": 'PRESS', "ctrl": True}, None),
("armature.dissolve", {"type": 'DEL', "value": 'PRESS', "ctrl": True}, None),
- ("armature.extrude_move", {"type": 'E', "value": 'PRESS'}, None),
+ op_tool_optional(
+ ("armature.extrude_move", {"type": 'E', "value": 'PRESS'}, None),
+ (op_tool_cycle, "builtin.extrude"), params),
("armature.extrude_forked", {"type": 'E', "value": 'PRESS', "shift": True}, None),
("armature.click_extrude", {"type": params.action_mouse, "value": 'CLICK', "ctrl": True}, None),
("armature.fill", {"type": 'F', "value": 'PRESS'}, None),
@@ -4823,11 +5098,17 @@ def km_armature(params):
("armature.armature_layers", {"type": 'M', "value": 'PRESS', "shift": True}, None),
("armature.bone_layers", {"type": 'M', "value": 'PRESS'}, None),
# Special transforms.
- ("transform.bbone_resize", {"type": 'S', "value": 'PRESS', "ctrl": True, "alt": True}, None),
- ("transform.transform", {"type": 'S', "value": 'PRESS', "alt": True},
- {"properties": [("mode", 'BONE_ENVELOPE')]}),
- ("transform.transform", {"type": 'R', "value": 'PRESS', "ctrl": True},
- {"properties": [("mode", 'BONE_ROLL')]}),
+ op_tool_optional(
+ ("transform.bbone_resize", {"type": 'S', "value": 'PRESS', "ctrl": True, "alt": True}, None),
+ (op_tool_cycle, "builtin.bone_size"), params),
+ op_tool_optional(
+ ("transform.transform", {"type": 'S', "value": 'PRESS', "alt": True},
+ {"properties": [("mode", 'BONE_ENVELOPE')]}),
+ (op_tool_cycle, "builtin.bone_envelope"), params),
+ op_tool_optional(
+ ("transform.transform", {"type": 'R', "value": 'PRESS', "ctrl": True},
+ {"properties": [("mode", 'BONE_ROLL')]}),
+ (op_tool_cycle, "builtin.roll"), params),
# Menus.
*_template_items_context_menu("VIEW3D_MT_armature_context_menu", params.context_menu_event),
])
@@ -4846,11 +5127,7 @@ def km_metaball(params):
items.extend([
("object.metaball_add", {"type": 'A', "value": 'PRESS', "shift": True}, None),
- ("mball.reveal_metaelems", {"type": 'H', "value": 'PRESS', "alt": True}, None),
- ("mball.hide_metaelems", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("mball.hide_metaelems", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
+ *_template_items_hide_reveal_actions("mball.hide_metaelems", "mball.reveal_metaelems"),
("mball.delete_metaelems", {"type": 'X', "value": 'PRESS'}, None),
("mball.delete_metaelems", {"type": 'DEL', "value": 'PRESS'}, None),
("mball.duplicate_move", {"type": 'D', "value": 'PRESS', "shift": True}, None),
@@ -4908,11 +5185,7 @@ def km_particle(params):
("particle.select_linked", {"type": 'L', "value": 'PRESS', "ctrl": True}, None),
("particle.delete", {"type": 'X', "value": 'PRESS'}, None),
("particle.delete", {"type": 'DEL', "value": 'PRESS'}, None),
- ("particle.reveal", {"type": 'H', "value": 'PRESS', "alt": True}, None),
- ("particle.hide", {"type": 'H', "value": 'PRESS'},
- {"properties": [("unselected", False)]}),
- ("particle.hide", {"type": 'H', "value": 'PRESS', "shift": True},
- {"properties": [("unselected", True)]}),
+ *_template_items_hide_reveal_actions("particle.hide", "particle.reveal"),
("particle.brush_edit", {"type": 'LEFTMOUSE', "value": 'PRESS'}, None),
("particle.brush_edit", {"type": 'LEFTMOUSE', "value": 'PRESS', "shift": True}, None),
("wm.radial_control", {"type": 'F', "value": 'PRESS'},
@@ -5621,6 +5894,7 @@ def km_paint_stroke_modal(_params):
return keymap
+
def km_sculpt_expand_modal(_params):
items = []
keymap = (
@@ -5828,38 +6102,60 @@ def km_image_editor_tool_uv_cursor(params):
)
-def km_image_editor_tool_uv_select(params):
+def km_image_editor_tool_uv_select(params, *, fallback):
return (
- "Image Editor Tool: Uv, Tweak",
+ _fallback_id("Image Editor Tool: Uv, Tweak", fallback),
{"space_type": 'IMAGE_EDITOR', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select(params, "uv.select", "uv.cursor_set", extend="extend")},
+ {"items": [
+ *([] if fallback else _template_items_tool_select(params, "uv.select", "uv.cursor_set", extend="extend")),
+ *([] if (not params.use_fallback_tool_rmb) else _template_uv_select(
+ type=params.select_mouse, value=params.select_mouse_value, legacy=params.legacy)),
+ ]},
)
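# Editor's note: `_fallback_id` is defined outside this diff. From its use it
# presumably derives a distinct keymap name for the fallback variant, e.g.:
#
#   def _fallback_id(text, fallback):
#       # Assumed suffix; only the "two names from one" behavior is certain.
#       return text + " (fallback)" if fallback else text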
-def km_image_editor_tool_uv_select_box(params):
+def km_image_editor_tool_uv_select_box(params, *, fallback):
return (
- "Image Editor Tool: Uv, Select Box",
+ _fallback_id("Image Editor Tool: Uv, Select Box", fallback),
{"space_type": 'IMAGE_EDITOR', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select_actions_simple("uv.select_box", type=params.tool_tweak, value='ANY')},
+ {"items": [
+ *([] if (fallback and not params.use_fallback_tool) else _template_items_tool_select_actions_simple(
+ "uv.select_box",
+ type=params.select_tweak if fallback else params.tool_tweak,
+ value='ANY')),
+ *_template_uv_select_for_fallback(params, fallback),
+ ]},
)
-def km_image_editor_tool_uv_select_circle(params):
+def km_image_editor_tool_uv_select_circle(params, *, fallback):
return (
- "Image Editor Tool: Uv, Select Circle",
+ _fallback_id("Image Editor Tool: Uv, Select Circle", fallback),
{"space_type": 'IMAGE_EDITOR', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select_actions_simple(
- "uv.select_circle", type=params.tool_mouse, value='PRESS',
- properties=[("wait_for_input", False)],
- )},
+ {"items": [
+ *([] if (fallback and not params.use_fallback_tool) else _template_items_tool_select_actions_simple(
+ "uv.select_circle",
+ type=params.select_tweak if fallback else params.tool_mouse,
+ value='ANY' if fallback else 'PRESS',
+ properties=[("wait_for_input", False)])),
+ # No selection fallback since this operates on press.
+ ]},
)
-def km_image_editor_tool_uv_select_lasso(params):
+def km_image_editor_tool_uv_select_lasso(params, *, fallback):
return (
- "Image Editor Tool: Uv, Select Lasso",
+ _fallback_id("Image Editor Tool: Uv, Select Lasso", fallback),
{"space_type": 'IMAGE_EDITOR', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select_actions_simple("uv.select_lasso", type=params.tool_tweak, value='ANY')},
+ {"items": [
+ *([] if (fallback and not params.use_fallback_tool) else _template_items_tool_select_actions_simple(
+ "uv.select_lasso",
+ type=params.select_tweak if fallback else params.tool_tweak,
+ value='ANY')),
+ *_template_uv_select_for_fallback(params, fallback),
+ ]},
)
@@ -5868,7 +6164,7 @@ def km_image_editor_tool_uv_rip_region(params):
"Image Editor Tool: Uv, Rip Region",
{"space_type": 'IMAGE_EDITOR', "region_type": 'WINDOW'},
{"items": [
- ("uv.rip_move", {"type": params.tool_tweak, "value": 'ANY'},
+ ("uv.rip_move", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("TRANSFORM_OT_translate", [("release_confirm", True)])]}),
]},
)
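# Editor's note: `**params.tool_modifier` (used here and in the 3D-view tool
# keymaps below) splices extra modifier keys into the event definition; its
# contents are defined outside this diff, presumably empty by default, e.g.:
#
#   # Hypothetical values, shown only to illustrate the dict unpacking:
#   params.tool_modifier = {}              # tool drags on a plain tweak
#   params.tool_modifier = {"alt": True}   # tool drags only while Alt is held
#
# so {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier}
# resolves to the same event dict with those modifier flags merged in.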
@@ -5898,7 +6194,7 @@ def km_image_editor_tool_uv_move(params):
"Image Editor Tool: Uv, Move",
{"space_type": 'IMAGE_EDITOR', "region_type": 'WINDOW'},
{"items": [
- ("transform.translate", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.translate", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -5909,7 +6205,7 @@ def km_image_editor_tool_uv_rotate(params):
"Image Editor Tool: Uv, Rotate",
{"space_type": 'IMAGE_EDITOR', "region_type": 'WINDOW'},
{"items": [
- ("transform.rotate", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.rotate", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -5920,53 +6216,59 @@ def km_image_editor_tool_uv_scale(params):
"Image Editor Tool: Uv, Scale",
{"space_type": 'IMAGE_EDITOR', "region_type": 'WINDOW'},
{"items": [
- ("transform.resize", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.resize", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
-def km_node_editor_tool_select(params):
+def km_node_editor_tool_select(params, *, fallback):
return (
- "Node Tool: Tweak",
+ _fallback_id("Node Tool: Tweak", fallback),
{"space_type": 'NODE_EDITOR', "region_type": 'WINDOW'},
{"items": [
- ("node.select", {"type": params.select_mouse, "value": 'PRESS'},
- {"properties": [("deselect_all", not params.legacy)]}),
+ *([] if fallback else [
+ ("node.select", {"type": params.select_mouse, "value": 'PRESS'},
+ {"properties": [("deselect_all", not params.legacy)]}),
+ ]),
]},
)
-def km_node_editor_tool_select_box(params):
+def km_node_editor_tool_select_box(params, *, fallback):
return (
- "Node Tool: Select Box",
+ _fallback_id("Node Tool: Select Box", fallback),
{"space_type": 'NODE_EDITOR', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select_actions_simple(
- "node.select_box", type=params.tool_tweak, value='ANY',
- properties=[("tweak", True)],
- )},
+ {"items": [
+ *([] if (fallback and not params.use_fallback_tool) else _template_items_tool_select_actions_simple(
+ "node.select_box", type=params.tool_tweak, value='ANY',
+ properties=[("tweak", True)],
+ )),
+ ]},
)
-def km_node_editor_tool_select_lasso(params):
+def km_node_editor_tool_select_lasso(params, *, fallback):
return (
- "Node Tool: Select Lasso",
+ _fallback_id("Node Tool: Select Lasso", fallback),
{"space_type": 'NODE_EDITOR', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select_actions_simple(
- "node.select_lasso", type=params.tool_mouse, value='PRESS',
- properties=[("tweak", True)],
- )},
+ {"items": [
+ *([] if (fallback and not params.use_fallback_tool) else _template_items_tool_select_actions_simple(
+ "node.select_lasso", type=params.tool_mouse, value='PRESS',
+ properties=[("tweak", True)]))
+ ]},
)
-def km_node_editor_tool_select_circle(params):
+def km_node_editor_tool_select_circle(params, *, fallback):
return (
- "Node Tool: Select Circle",
+ _fallback_id("Node Tool: Select Circle", fallback),
{"space_type": 'NODE_EDITOR', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select_actions_simple(
- "node.select_circle", type=params.tool_mouse, value='PRESS',
- properties=[("wait_for_input", False)],
- )},
+ {"items": [
+ *([] if (fallback and not params.use_fallback_tool) else _template_items_tool_select_actions_simple(
+ "node.select_circle", type=params.tool_mouse, value='PRESS',
+ properties=[("wait_for_input", False)])),
+ ]},
)
@@ -5992,38 +6294,61 @@ def km_3d_view_tool_cursor(params):
)
-def km_3d_view_tool_select(params):
+def km_3d_view_tool_select(params, *, fallback):
return (
- "3D View Tool: Tweak",
+ _fallback_id("3D View Tool: Tweak", fallback),
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select(params, "view3d.select", "view3d.cursor3d", extend="toggle")},
+ {"items": [
+ *([] if fallback else _template_items_tool_select(
+ params, "view3d.select", "view3d.cursor3d", extend="toggle")),
+ *([] if (not params.use_fallback_tool_rmb) else _template_view3d_select(
+ type=params.select_mouse, value=params.select_mouse_value, legacy=params.legacy)),
+ ]},
)
-def km_3d_view_tool_select_box(params):
+def km_3d_view_tool_select_box(params, *, fallback):
return (
- "3D View Tool: Select Box",
+ _fallback_id("3D View Tool: Select Box", fallback),
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select_actions("view3d.select_box", type=params.tool_tweak, value='ANY')},
+ {"items": [
+ *([] if (fallback and not params.use_fallback_tool) else _template_items_tool_select_actions(
+ "view3d.select_box",
+ type=params.select_tweak if fallback else params.tool_tweak,
+ value='ANY')),
+ *_template_view3d_select_for_fallback(params, fallback),
+ ]},
)
-def km_3d_view_tool_select_circle(params):
+def km_3d_view_tool_select_circle(params, *, fallback):
return (
- "3D View Tool: Select Circle",
+ _fallback_id("3D View Tool: Select Circle", fallback),
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select_actions_simple(
- "view3d.select_circle", type=params.tool_mouse, value='PRESS',
- properties=[("wait_for_input", False)],
- )},
+ {"items": [
+ *([] if (fallback and not params.use_fallback_tool) else _template_items_tool_select_actions_simple(
+ "view3d.select_circle",
+ # Why should circle select be used on tweak?
+ # So that RMB or Shift-RMB can still set an element as active.
+ type=params.select_tweak if fallback else params.tool_mouse,
+ value='ANY' if fallback else 'PRESS',
+ properties=[("wait_for_input", False)])),
+ # No selection fallback since this operates on press.
+ ]},
)
-def km_3d_view_tool_select_lasso(params):
+def km_3d_view_tool_select_lasso(params, *, fallback):
return (
- "3D View Tool: Select Lasso",
+ _fallback_id("3D View Tool: Select Lasso", fallback),
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select_actions("view3d.select_lasso", type=params.tool_tweak, value='ANY')},
+ {"items": [
+ *([] if (fallback and not params.use_fallback_tool) else _template_items_tool_select_actions(
+ "view3d.select_lasso",
+ type=params.select_tweak if fallback else params.tool_tweak,
+ value='ANY')),
+ *_template_view3d_select_for_fallback(params, fallback),
+ ]}
)
@@ -6032,7 +6357,8 @@ def km_3d_view_tool_transform(params):
"3D View Tool: Transform",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.from_gizmo", {"type": params.tool_tweak, "value": 'ANY'}, None),
+ ("transform.from_gizmo",
+ {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier}, None),
]},
)
@@ -6042,7 +6368,8 @@ def km_3d_view_tool_move(params):
"3D View Tool: Move",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.translate", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.translate",
+ {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -6053,7 +6380,8 @@ def km_3d_view_tool_rotate(params):
"3D View Tool: Rotate",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.rotate", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.rotate",
+ {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -6064,7 +6392,8 @@ def km_3d_view_tool_scale(params):
"3D View Tool: Scale",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.resize", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.resize",
+ {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -6076,15 +6405,15 @@ def km_3d_view_tool_shear(params):
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
("transform.shear",
- {"type": params.tool_tweak, "value": 'NORTH'},
+ {"type": params.tool_tweak, "value": 'NORTH', **params.tool_modifier},
{"properties": [("release_confirm", True), ("orient_axis_ortho", 'Y')]}),
("transform.shear",
- {"type": params.tool_tweak, "value": 'SOUTH'},
+ {"type": params.tool_tweak, "value": 'SOUTH', **params.tool_modifier},
{"properties": [("release_confirm", True), ("orient_axis_ortho", 'Y')]}),
# Use as fallback to catch diagonals too.
("transform.shear",
- {"type": params.tool_tweak, "value": 'ANY'},
+ {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True), ("orient_axis_ortho", 'X')]}),
]},
)
@@ -6107,7 +6436,7 @@ def km_3d_view_tool_pose_breakdowner(params):
"3D View Tool: Pose, Breakdowner",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("pose.breakdown", {"type": params.tool_tweak, "value": 'ANY'}, None),
+ ("pose.breakdown", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier}, None),
]},
)
@@ -6117,7 +6446,8 @@ def km_3d_view_tool_pose_push(params):
"3D View Tool: Pose, Push",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("pose.push", {"type": params.tool_tweak, "value": 'ANY'}, None),
+ ("pose.push",
+ {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier}, None),
]},
)
@@ -6127,7 +6457,8 @@ def km_3d_view_tool_pose_relax(params):
"3D View Tool: Pose, Relax",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("pose.relax", {"type": params.tool_tweak, "value": 'ANY'}, None),
+ ("pose.relax",
+ {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier}, None),
]},
)
@@ -6137,7 +6468,8 @@ def km_3d_view_tool_edit_armature_roll(params):
"3D View Tool: Edit Armature, Roll",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.transform", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.transform",
+ {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True), ("mode", 'BONE_ROLL')]}),
]},
)
@@ -6148,7 +6480,7 @@ def km_3d_view_tool_edit_armature_bone_size(params):
"3D View Tool: Edit Armature, Bone Size",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.transform", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.transform", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True), ("mode", 'BONE_ENVELOPE')]}),
]},
)
@@ -6160,7 +6492,7 @@ def km_3d_view_tool_edit_armature_bone_envelope(params):
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.bbone_resize", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.bbone_resize", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -6171,7 +6503,7 @@ def km_3d_view_tool_edit_armature_extrude(params):
"3D View Tool: Edit Armature, Extrude",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("armature.extrude_move", {"type": params.tool_tweak, "value": 'ANY'},
+ ("armature.extrude_move", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("TRANSFORM_OT_translate", [("release_confirm", True)])]}),
]},
)
@@ -6182,7 +6514,7 @@ def km_3d_view_tool_edit_armature_extrude_to_cursor(params):
"3D View Tool: Edit Armature, Extrude to Cursor",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("armature.click_extrude", {"type": params.tool_mouse, "value": 'PRESS'}, None),
+ ("armature.click_extrude", {"type": params.tool_mouse, "value": 'PRESS', **params.tool_modifier}, None),
]},
)
@@ -6203,7 +6535,7 @@ def km_3d_view_tool_edit_mesh_extrude_region(params):
"3D View Tool: Edit Mesh, Extrude Region",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("mesh.extrude_context_move", {"type": params.tool_tweak, "value": 'ANY'},
+ ("mesh.extrude_context_move", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("TRANSFORM_OT_translate", [("release_confirm", True)])]}),
]},
)
@@ -6214,7 +6546,7 @@ def km_3d_view_tool_edit_mesh_extrude_manifold(params):
"3D View Tool: Edit Mesh, Extrude Manifold",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("mesh.extrude_manifold", {"type": params.tool_tweak, "value": 'ANY'},
+ ("mesh.extrude_manifold", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [
("MESH_OT_extrude_region", [("use_dissolve_ortho_edges", True)]),
("TRANSFORM_OT_translate", [
@@ -6233,7 +6565,7 @@ def km_3d_view_tool_edit_mesh_extrude_along_normals(params):
"3D View Tool: Edit Mesh, Extrude Along Normals",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("mesh.extrude_region_shrink_fatten", {"type": params.tool_tweak, "value": 'ANY'},
+ ("mesh.extrude_region_shrink_fatten", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("TRANSFORM_OT_shrink_fatten", [("release_confirm", True)])]}),
]},
)
@@ -6244,7 +6576,7 @@ def km_3d_view_tool_edit_mesh_extrude_individual(params):
"3D View Tool: Edit Mesh, Extrude Individual",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("mesh.extrude_faces_move", {"type": params.tool_tweak, "value": 'ANY'},
+ ("mesh.extrude_faces_move", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("TRANSFORM_OT_shrink_fatten", [("release_confirm", True)])]}),
]},
)
@@ -6255,6 +6587,7 @@ def km_3d_view_tool_edit_mesh_extrude_to_cursor(params):
"3D View Tool: Edit Mesh, Extrude to Cursor",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
+ # No need for `tool_modifier` since this takes all input.
("mesh.dupli_extrude_cursor", {"type": params.tool_mouse, "value": 'PRESS'}, None),
]},
)
@@ -6265,7 +6598,7 @@ def km_3d_view_tool_edit_mesh_inset_faces(params):
"3D View Tool: Edit Mesh, Inset Faces",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("mesh.inset", {"type": params.tool_tweak, "value": 'ANY'},
+ ("mesh.inset", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -6276,7 +6609,7 @@ def km_3d_view_tool_edit_mesh_bevel(params):
"3D View Tool: Edit Mesh, Bevel",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("mesh.bevel", {"type": params.tool_tweak, "value": 'ANY'},
+ ("mesh.bevel", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -6287,6 +6620,7 @@ def km_3d_view_tool_edit_mesh_loop_cut(params):
"3D View Tool: Edit Mesh, Loop Cut",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
+ # No need for `tool_modifier` since this takes all input.
("mesh.loopcut_slide", {"type": params.tool_mouse, "value": 'PRESS'},
{"properties": [("TRANSFORM_OT_edge_slide", [("release_confirm", True)])]}),
]},
@@ -6298,6 +6632,7 @@ def km_3d_view_tool_edit_mesh_offset_edge_loop_cut(params):
"3D View Tool: Edit Mesh, Offset Edge Loop Cut",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
+ # No need for `tool_modifier` since this takes all input.
("mesh.offset_edge_loops_slide", {"type": params.tool_mouse, "value": 'PRESS'}, None),
]},
)
@@ -6308,6 +6643,7 @@ def km_3d_view_tool_edit_mesh_knife(params):
"3D View Tool: Edit Mesh, Knife",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
+ # No need for `tool_modifier` since this takes all input.
("mesh.knife_tool", {"type": params.tool_mouse, "value": 'PRESS'},
{"properties": [("wait_for_input", False)]}),
]},
@@ -6319,6 +6655,7 @@ def km_3d_view_tool_edit_mesh_bisect(params):
"3D View Tool: Edit Mesh, Bisect",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
+ # No need for `tool_modifier` since this takes all input.
("mesh.bisect", {"type": params.tool_tweak, "value": 'ANY'}, None),
]},
)
@@ -6329,6 +6666,7 @@ def km_3d_view_tool_edit_mesh_poly_build(params):
"3D View Tool: Edit Mesh, Poly Build",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
+ # No need for `tool_modifier` since this takes all input.
("mesh.polybuild_extrude_at_cursor_move", {"type": params.tool_mouse, "value": 'PRESS'},
{"properties": [("TRANSFORM_OT_translate", [("release_confirm", True)])]}),
("mesh.polybuild_face_at_cursor_move", {"type": params.tool_mouse, "value": 'PRESS', "ctrl": True},
@@ -6343,7 +6681,7 @@ def km_3d_view_tool_edit_mesh_spin(params):
"3D View Tool: Edit Mesh, Spin",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("mesh.spin", {"type": params.tool_tweak, "value": 'ANY'}, None),
+ ("mesh.spin", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier}, None),
]},
)
@@ -6353,7 +6691,7 @@ def km_3d_view_tool_edit_mesh_spin_duplicate(params):
"3D View Tool: Edit Mesh, Spin Duplicates",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("mesh.spin", {"type": params.tool_tweak, "value": 'ANY'},
+ ("mesh.spin", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("dupli", True)]}),
]},
)
@@ -6364,7 +6702,7 @@ def km_3d_view_tool_edit_mesh_smooth(params):
"3D View Tool: Edit Mesh, Smooth",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("mesh.vertices_smooth", {"type": params.tool_tweak, "value": 'ANY'},
+ ("mesh.vertices_smooth", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("wait_for_input", False)]}),
]},
)
@@ -6375,7 +6713,7 @@ def km_3d_view_tool_edit_mesh_randomize(params):
"3D View Tool: Edit Mesh, Randomize",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.vertex_random", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.vertex_random", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("wait_for_input", False)]}),
]},
)
@@ -6386,7 +6724,7 @@ def km_3d_view_tool_edit_mesh_edge_slide(params):
"3D View Tool: Edit Mesh, Edge Slide",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.edge_slide", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.edge_slide", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -6397,7 +6735,7 @@ def km_3d_view_tool_edit_mesh_vertex_slide(params):
"3D View Tool: Edit Mesh, Vertex Slide",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.vert_slide", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.vert_slide", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -6408,7 +6746,7 @@ def km_3d_view_tool_edit_mesh_shrink_fatten(params):
"3D View Tool: Edit Mesh, Shrink/Fatten",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.shrink_fatten", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.shrink_fatten", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -6419,7 +6757,7 @@ def km_3d_view_tool_edit_mesh_push_pull(params):
"3D View Tool: Edit Mesh, Push/Pull",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.push_pull", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.push_pull", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -6430,7 +6768,7 @@ def km_3d_view_tool_edit_mesh_to_sphere(params):
"3D View Tool: Edit Mesh, To Sphere",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.tosphere", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.tosphere", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -6441,7 +6779,7 @@ def km_3d_view_tool_edit_mesh_rip_region(params):
"3D View Tool: Edit Mesh, Rip Region",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("mesh.rip_move", {"type": params.tool_tweak, "value": 'ANY'},
+ ("mesh.rip_move", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("TRANSFORM_OT_translate", [("release_confirm", True)])]}),
]},
)
@@ -6452,7 +6790,7 @@ def km_3d_view_tool_edit_mesh_rip_edge(params):
"3D View Tool: Edit Mesh, Rip Edge",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("mesh.rip_edge_move", {"type": params.tool_tweak, "value": 'ANY'},
+ ("mesh.rip_edge_move", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("TRANSFORM_OT_translate", [("release_confirm", True)])]}),
]},
)
@@ -6463,6 +6801,7 @@ def km_3d_view_tool_edit_curve_draw(params):
"3D View Tool: Edit Curve, Draw",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
+ # No need for `tool_modifier` since this takes all input.
("curve.draw", {"type": params.tool_mouse, "value": 'PRESS'},
{"properties": [("wait_for_input", False)]}),
]},
@@ -6474,7 +6813,7 @@ def km_3d_view_tool_edit_curve_tilt(params):
"3D View Tool: Edit Curve, Tilt",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.tilt", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.tilt", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("release_confirm", True)]}),
]},
)
@@ -6485,7 +6824,7 @@ def km_3d_view_tool_edit_curve_radius(params):
"3D View Tool: Edit Curve, Radius",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.transform", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.transform", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("mode", 'CURVE_SHRINKFATTEN'), ("release_confirm", True)]}),
]},
)
@@ -6496,7 +6835,7 @@ def km_3d_view_tool_edit_curve_randomize(params):
"3D View Tool: Edit Curve, Randomize",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("transform.vertex_random", {"type": params.tool_tweak, "value": 'ANY'},
+ ("transform.vertex_random", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("wait_for_input", False)]}),
]},
)
@@ -6507,7 +6846,7 @@ def km_3d_view_tool_edit_curve_extrude(params):
"3D View Tool: Edit Curve, Extrude",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("curve.extrude_move", {"type": params.tool_tweak, "value": 'ANY'},
+ ("curve.extrude_move", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier},
{"properties": [("TRANSFORM_OT_translate", [("release_confirm", True)])]}),
]},
)
@@ -6518,6 +6857,7 @@ def km_3d_view_tool_edit_curve_extrude_to_cursor(params):
"3D View Tool: Edit Curve, Extrude to Cursor",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
+ # No need for `tool_modifier` since this takes all input.
("curve.vertex_add", {"type": params.tool_mouse, "value": 'PRESS'}, None),
]},
)
@@ -6832,12 +7172,16 @@ def km_3d_view_tool_paint_gpencil_eyedropper(params):
"3D View Tool: Paint Gpencil, Eyedropper",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("ui.eyedropper_gpencil_color", {"type": params.tool_mouse, "value": 'PRESS'}, None),
- ("ui.eyedropper_gpencil_color", {"type": params.tool_mouse, "value": 'PRESS', "shift": True}, None),
- ("ui.eyedropper_gpencil_color", {"type": params.tool_mouse, "value": 'PRESS', "shift": True, "ctrl": True}, None),
+ ("ui.eyedropper_gpencil_color",
+ {"type": params.tool_mouse, "value": 'PRESS'}, None),
+ ("ui.eyedropper_gpencil_color",
+ {"type": params.tool_mouse, "value": 'PRESS', "shift": True}, None),
+ ("ui.eyedropper_gpencil_color",
+ {"type": params.tool_mouse, "value": 'PRESS', "shift": True, "ctrl": True}, None),
]},
)
+
def km_3d_view_tool_paint_gpencil_interpolate(params):
return (
"3D View Tool: Paint Gpencil, Interpolate",
@@ -6848,38 +7192,60 @@ def km_3d_view_tool_paint_gpencil_interpolate(params):
]},
)
-def km_3d_view_tool_edit_gpencil_select(params):
+
+def km_3d_view_tool_edit_gpencil_select(params, *, fallback):
return (
- "3D View Tool: Edit Gpencil, Tweak",
+ _fallback_id("3D View Tool: Edit Gpencil, Tweak", fallback),
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select(params, "gpencil.select", "view3d.cursor3d", extend="toggle")},
+ {"items": [
+ *([] if fallback else _template_items_tool_select(
+ params, "gpencil.select", "view3d.cursor3d", extend="toggle")),
+ *([] if (not params.use_fallback_tool_rmb) else _template_view3d_gpencil_select(
+ type=params.select_mouse, value=params.select_mouse_value, legacy=params.legacy)),
+ ]},
)
-def km_3d_view_tool_edit_gpencil_select_box(params):
+def km_3d_view_tool_edit_gpencil_select_box(params, *, fallback):
return (
- "3D View Tool: Edit Gpencil, Select Box",
+ _fallback_id("3D View Tool: Edit Gpencil, Select Box", fallback),
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select_actions("gpencil.select_box", type=params.tool_tweak, value='ANY')},
+ {"items": [
+ *([] if (fallback and not params.use_fallback_tool) else _template_items_tool_select_actions(
+ "gpencil.select_box", type=params.select_tweak if fallback else params.tool_tweak, value='ANY')),
+ *_template_view3d_gpencil_select_for_fallback(params, fallback),
+ ]},
)
-def km_3d_view_tool_edit_gpencil_select_circle(params):
+def km_3d_view_tool_edit_gpencil_select_circle(params, *, fallback):
return (
- "3D View Tool: Edit Gpencil, Select Circle",
+ _fallback_id("3D View Tool: Edit Gpencil, Select Circle", fallback),
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select_actions_simple(
- "gpencil.select_circle", type=params.tool_mouse, value='PRESS',
- properties=[("wait_for_input", False)],
- )},
+ {"items": [
+ *([] if (fallback and not params.use_fallback_tool) else _template_items_tool_select_actions_simple(
+ "gpencil.select_circle",
+ # Why should circle select be used on tweak?
+ # So that RMB or Shift-RMB is still able to set an element as active.
+ type=params.select_tweak if fallback else params.tool_mouse,
+ value='ANY' if fallback else 'PRESS',
+ properties=[("wait_for_input", False)])),
+ # No selection fallback since this operates on press.
+ ]},
)
-def km_3d_view_tool_edit_gpencil_select_lasso(params):
+def km_3d_view_tool_edit_gpencil_select_lasso(params, *, fallback):
return (
- "3D View Tool: Edit Gpencil, Select Lasso",
+ _fallback_id("3D View Tool: Edit Gpencil, Select Lasso", fallback),
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
- {"items": _template_items_tool_select_actions("gpencil.select_lasso", type=params.tool_tweak, value='ANY')},
+ {"items": [
+ *([] if (fallback and not params.use_fallback_tool) else _template_items_tool_select_actions(
+ "gpencil.select_lasso",
+ type=params.select_tweak if fallback else params.tool_tweak,
+ value='ANY')),
+ *_template_view3d_gpencil_select_for_fallback(params, fallback),
+ ]}
)
@@ -6888,7 +7254,7 @@ def km_3d_view_tool_edit_gpencil_extrude(params):
"3D View Tool: Edit Gpencil, Extrude",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
- ("gpencil.extrude_move", {"type": params.tool_tweak, "value": 'ANY'}, None),
+ ("gpencil.extrude_move", {"type": params.tool_tweak, "value": 'ANY', **params.tool_modifier}, None),
]},
)
@@ -6898,6 +7264,7 @@ def km_3d_view_tool_edit_gpencil_radius(params):
"3D View Tool: Edit Gpencil, Radius",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
+ # No need for `tool_modifier` since this takes all input.
("transform.transform", {"type": params.tool_tweak, "value": 'ANY'},
{"properties": [("mode", 'GPENCIL_SHRINKFATTEN'), ("release_confirm", True)]}),
]},
@@ -6909,6 +7276,7 @@ def km_3d_view_tool_edit_gpencil_bend(params):
"3D View Tool: Edit Gpencil, Bend",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
+ # No need for `tool_modifier` since this takes all input.
("transform.bend", {"type": params.tool_tweak, "value": 'ANY'},
{"properties": [("release_confirm", True)]}),
]},
@@ -6920,6 +7288,7 @@ def km_3d_view_tool_edit_gpencil_shear(params):
"3D View Tool: Edit Gpencil, Shear",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
+ # No need for `tool_modifier` since this takes all input.
("transform.shear", {"type": params.tool_tweak, "value": 'ANY'},
{"properties": [("release_confirm", True)]}),
]},
@@ -6931,6 +7300,7 @@ def km_3d_view_tool_edit_gpencil_to_sphere(params):
"3D View Tool: Edit Gpencil, To Sphere",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
+ # No need for `tool_modifier` since this takes all input.
("transform.tosphere", {"type": params.tool_tweak, "value": 'ANY'},
{"properties": [("release_confirm", True)]}),
]},
@@ -6942,6 +7312,7 @@ def km_3d_view_tool_edit_gpencil_transform_fill(params):
"3D View Tool: Edit Gpencil, Transform Fill",
{"space_type": 'VIEW_3D', "region_type": 'WINDOW'},
{"items": [
+ # No need for `tool_modifier` since this takes all input.
("gpencil.transform_fill", {"type": params.tool_tweak, "value": 'ANY'},
{"properties": [("release_confirm", True)]}),
]},
@@ -6994,9 +7365,10 @@ def km_3d_view_tool_sculpt_gpencil_select_lasso(params):
)
-def km_sequencer_editor_tool_select(params):
+def km_sequencer_editor_tool_select(params, *, fallback):
return (
- "Sequencer Tool: Select",
+ # TODO: fall-back tool support.
+ _fallback_id("Sequencer Tool: Select", fallback),
{"space_type": 'SEQUENCE_EDITOR', "region_type": 'WINDOW'},
{"items": [
("sequencer.select", {"type": params.select_mouse, "value": 'PRESS'}, None),
@@ -7005,9 +7377,10 @@ def km_sequencer_editor_tool_select(params):
)
-def km_sequencer_editor_tool_select_box(params):
+def km_sequencer_editor_tool_select_box(params, *, fallback):
return (
- "Sequencer Tool: Select Box",
+ # TODO: fall-back tool support.
+ _fallback_id("Sequencer Tool: Select Box", fallback),
{"space_type": 'SEQUENCE_EDITOR', "region_type": 'WINDOW'},
{"items": [
*_template_items_tool_select_actions_simple(
@@ -7037,7 +7410,45 @@ def km_sequencer_editor_tool_blade(_params):
{"space_type": 'SEQUENCE_EDITOR', "region_type": 'WINDOW'},
{"items": [
("sequencer.split", {"type": 'LEFTMOUSE', "value": 'PRESS'},
- {"properties": [("type", 'SOFT'), ("side", 'NO_CHANGE'), ("use_cursor_position", True), ("ignore_selection", True)]}),
+ {"properties": [
+ ("type", 'SOFT'),
+ ("side", 'NO_CHANGE'),
+ ("use_cursor_position", True),
+ ("ignore_selection", True),
+ ]}),
+ ]},
+ )
+
+
+def km_sequencer_editor_tool_move(params):
+ return (
+ "Sequencer Tool: Move",
+ {"space_type": 'SEQUENCE_EDITOR', "region_type": 'WINDOW'},
+ {"items": [
+ ("transform.translate", {"type": params.tool_tweak, "value": 'ANY'},
+ {"properties": [("release_confirm", True)]}),
+ ]},
+ )
+
+
+def km_sequencer_editor_tool_rotate(params):
+ return (
+ "Sequencer Tool: Rotate",
+ {"space_type": 'SEQUENCE_EDITOR', "region_type": 'WINDOW'},
+ {"items": [
+ ("transform.rotate", {"type": params.tool_tweak, "value": 'ANY'},
+ {"properties": [("release_confirm", True)]}),
+ ]},
+ )
+
+
+def km_sequencer_editor_tool_scale(params):
+ return (
+ "Sequencer Tool: Scale",
+ {"space_type": 'SEQUENCE_EDITOR', "region_type": 'WINDOW'},
+ {"items": [
+ ("transform.resize", {"type": params.tool_tweak, "value": 'ANY'},
+ {"properties": [("release_confirm", True)]}),
]},
)
@@ -7187,25 +7598,25 @@ def generate_keymaps(params=None):
km_image_editor_tool_generic_sample(params),
km_image_editor_tool_uv_cursor(params),
- km_image_editor_tool_uv_select(params),
- km_image_editor_tool_uv_select_box(params),
- km_image_editor_tool_uv_select_circle(params),
- km_image_editor_tool_uv_select_lasso(params),
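+ # Each selection tool keymap is now generated twice: the regular
+ # variant and its "(fallback)" variant.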
+ *(km_image_editor_tool_uv_select(params, fallback=fallback) for fallback in (False, True)),
+ *(km_image_editor_tool_uv_select_box(params, fallback=fallback) for fallback in (False, True)),
+ *(km_image_editor_tool_uv_select_circle(params, fallback=fallback) for fallback in (False, True)),
+ *(km_image_editor_tool_uv_select_lasso(params, fallback=fallback) for fallback in (False, True)),
km_image_editor_tool_uv_rip_region(params),
km_image_editor_tool_uv_sculpt_stroke(params),
km_image_editor_tool_uv_move(params),
km_image_editor_tool_uv_rotate(params),
km_image_editor_tool_uv_scale(params),
- km_node_editor_tool_select(params),
- km_node_editor_tool_select_box(params),
- km_node_editor_tool_select_lasso(params),
- km_node_editor_tool_select_circle(params),
+ *(km_node_editor_tool_select(params, fallback=fallback) for fallback in (False, True)),
+ *(km_node_editor_tool_select_box(params, fallback=fallback) for fallback in (False, True)),
+ *(km_node_editor_tool_select_lasso(params, fallback=fallback) for fallback in (False, True)),
+ *(km_node_editor_tool_select_circle(params, fallback=fallback) for fallback in (False, True)),
km_node_editor_tool_links_cut(params),
km_3d_view_tool_cursor(params),
- km_3d_view_tool_select(params),
- km_3d_view_tool_select_box(params),
- km_3d_view_tool_select_circle(params),
- km_3d_view_tool_select_lasso(params),
+ *(km_3d_view_tool_select(params, fallback=fallback) for fallback in (False, True)),
+ *(km_3d_view_tool_select_box(params, fallback=fallback) for fallback in (False, True)),
+ *(km_3d_view_tool_select_circle(params, fallback=fallback) for fallback in (False, True)),
+ *(km_3d_view_tool_select_lasso(params, fallback=fallback) for fallback in (False, True)),
km_3d_view_tool_transform(params),
km_3d_view_tool_move(params),
km_3d_view_tool_rotate(params),
@@ -7276,10 +7687,10 @@ def generate_keymaps(params=None):
km_3d_view_tool_paint_gpencil_cutter(params),
km_3d_view_tool_paint_gpencil_eyedropper(params),
km_3d_view_tool_paint_gpencil_interpolate(params),
- km_3d_view_tool_edit_gpencil_select(params),
- km_3d_view_tool_edit_gpencil_select_box(params),
- km_3d_view_tool_edit_gpencil_select_circle(params),
- km_3d_view_tool_edit_gpencil_select_lasso(params),
+ *(km_3d_view_tool_edit_gpencil_select(params, fallback=fallback) for fallback in (False, True)),
+ *(km_3d_view_tool_edit_gpencil_select_box(params, fallback=fallback) for fallback in (False, True)),
+ *(km_3d_view_tool_edit_gpencil_select_circle(params, fallback=fallback) for fallback in (False, True)),
+ *(km_3d_view_tool_edit_gpencil_select_lasso(params, fallback=fallback) for fallback in (False, True)),
km_3d_view_tool_edit_gpencil_extrude(params),
km_3d_view_tool_edit_gpencil_radius(params),
km_3d_view_tool_edit_gpencil_bend(params),
@@ -7291,10 +7702,13 @@ def generate_keymaps(params=None):
km_3d_view_tool_sculpt_gpencil_select_box(params),
km_3d_view_tool_sculpt_gpencil_select_circle(params),
km_3d_view_tool_sculpt_gpencil_select_lasso(params),
- km_sequencer_editor_tool_select(params),
- km_sequencer_editor_tool_select_box(params),
+ *(km_sequencer_editor_tool_select(params, fallback=fallback) for fallback in (False, True)),
+ *(km_sequencer_editor_tool_select_box(params, fallback=fallback) for fallback in (False, True)),
km_sequencer_editor_tool_blade(params),
km_sequencer_editor_tool_generic_sample(params),
+ km_sequencer_editor_tool_scale(params),
+ km_sequencer_editor_tool_rotate(params),
+ km_sequencer_editor_tool_move(params),
]
# ------------------------------------------------------------------------------
diff --git a/release/scripts/startup/bl_operators/userpref.py b/release/scripts/startup/bl_operators/userpref.py
index 623bf583a74..67a02f6e1f4 100644
--- a/release/scripts/startup/bl_operators/userpref.py
+++ b/release/scripts/startup/bl_operators/userpref.py
@@ -268,7 +268,7 @@ class PREFERENCES_OT_keyconfig_export(Operator):
)
filepath: StringProperty(
subtype='FILE_PATH',
- default="keymap.py",
+ default="",
)
filter_folder: BoolProperty(
name="Filter folders",
@@ -307,7 +307,13 @@ class PREFERENCES_OT_keyconfig_export(Operator):
return {'FINISHED'}
def invoke(self, context, _event):
+ import os
wm = context.window_manager
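+ # Default the export path to "<active keyconfig name>.py" in the
+ # user's home directory when no path was given.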
+ if not self.filepath:
+ self.filepath = os.path.join(
+ os.path.expanduser("~"),
+ bpy.path.display_name_to_filepath(wm.keyconfigs.active.name) + ".py",
+ )
wm.fileselect_add(self)
return {'RUNNING_MODAL'}
diff --git a/release/scripts/startup/bl_operators/wm.py b/release/scripts/startup/bl_operators/wm.py
index 6af86e75b8a..a386df5c428 100644
--- a/release/scripts/startup/bl_operators/wm.py
+++ b/release/scripts/startup/bl_operators/wm.py
@@ -1556,11 +1556,18 @@ class WM_OT_properties_edit(Operator):
self.max != self.soft_max
)
self.default = str(rna_data["default"])
- if prop_type == str and not is_array and not value_failed: # String arrays do not support UI data.
+ self.description = rna_data.get("description", "")
+ elif prop_type == str and not is_array and not value_failed: # String arrays do not support UI data.
ui_data = item.id_properties_ui(prop)
rna_data = ui_data.as_dict()
self.subtype = rna_data["subtype"]
self.default = str(rna_data["default"])
+ self.description = rna_data.get("description", "")
+ else:
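+ # No UI data to load for other property types; fall back to
+ # neutral limits and an empty description.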
+ self.min = self.soft_min = 0
+ self.max = self.soft_max = 1
+ self.use_soft_limits = False
+ self.description = ""
self._init_subtype(prop_type, is_array, self.subtype)
@@ -1611,7 +1618,7 @@ class WM_OT_properties_edit(Operator):
layout.prop(self, "property")
layout.prop(self, "value")
- value = self.get_value_eval()
+ value, value_failed = self.get_value_eval()
proptype, is_array = rna_idprop_value_item_type(value)
row = layout.row()
diff --git a/release/scripts/startup/bl_ui/properties_view_layer.py b/release/scripts/startup/bl_ui/properties_view_layer.py
index ad7d6008238..6b130d7353d 100644
--- a/release/scripts/startup/bl_ui/properties_view_layer.py
+++ b/release/scripts/startup/bl_ui/properties_view_layer.py
@@ -192,8 +192,6 @@ class ViewLayerCryptomattePanel(ViewLayerButtonsPanel, Panel):
view_layer.use_pass_cryptomatte_material,
view_layer.use_pass_cryptomatte_asset))
col.prop(view_layer, "pass_cryptomatte_depth", text="Levels")
- col.prop(view_layer, "use_pass_cryptomatte_accurate",
- text="Accurate Mode")
class VIEWLAYER_PT_layer_passes_cryptomatte(ViewLayerCryptomattePanel, Panel):
diff --git a/release/scripts/startup/bl_ui/space_sequencer.py b/release/scripts/startup/bl_ui/space_sequencer.py
index 88cf8db686c..543164f25fc 100644
--- a/release/scripts/startup/bl_ui/space_sequencer.py
+++ b/release/scripts/startup/bl_ui/space_sequencer.py
@@ -141,9 +141,14 @@ class SEQUENCER_HT_header(Header):
layout.separator_spacer()
+ tool_settings = context.tool_settings
+ sequencer_tool_settings = tool_settings.sequencer_tool_settings
+
+ if st.view_type == 'PREVIEW':
+ layout.prop(sequencer_tool_settings, "pivot_point", text="", icon_only=True)
+ layout.separator_spacer()
+
if st.view_type in {'SEQUENCER', 'SEQUENCER_PREVIEW'}:
- tool_settings = context.tool_settings
- sequencer_tool_settings = tool_settings.sequencer_tool_settings
row = layout.row(align=True)
row.prop(sequencer_tool_settings, "overlap_mode", text="")
row = layout.row(align=True)
@@ -209,6 +214,7 @@ class SEQUENCER_PT_preview_overlay(Panel):
layout = self.layout
layout.active = st.show_strip_overlay
+ layout.prop(overlay_settings, "show_image_outline")
layout.prop(ed, "show_overlay", text="Frame Overlay")
layout.prop(overlay_settings, "show_safe_areas", text="Safe Areas")
layout.prop(overlay_settings, "show_metadata", text="Metadata")
@@ -241,6 +247,7 @@ class SEQUENCER_PT_sequencer_overlay(Panel):
layout.prop(overlay_settings, "show_strip_offset", text="Offsets")
layout.prop(overlay_settings, "show_fcurves", text="F-Curves")
+ layout.prop(overlay_settings, "show_thumbnails", text="Thumbnails")
layout.prop(overlay_settings, "show_grid", text="Grid")
layout.separator()
@@ -1756,6 +1763,9 @@ class SEQUENCER_PT_adjust_transform(SequencerButtonsPanel, Panel):
col = layout.column(align=True)
col.prop(strip.transform, "rotation", text="Rotation")
+ col = layout.column(align=True)
+ col.prop(strip.transform, "origin")
+
row = layout.row(heading="Mirror")
sub = row.row(align=True)
sub.prop(strip, "use_flip_x", text="X", toggle=True)
diff --git a/release/scripts/startup/bl_ui/space_toolsystem_common.py b/release/scripts/startup/bl_ui/space_toolsystem_common.py
index 28549098e51..98e29d3baba 100644
--- a/release/scripts/startup/bl_ui/space_toolsystem_common.py
+++ b/release/scripts/startup/bl_ui/space_toolsystem_common.py
@@ -118,6 +118,8 @@ ToolDef = namedtuple(
"draw_settings",
# Optional draw cursor.
"draw_cursor",
+ # Various options, see: `bpy.types.WorkSpaceTool.setup` options argument.
+ "options",
)
)
del namedtuple
@@ -133,6 +135,7 @@ def from_dict(kw_args):
"description": None,
"icon": None,
"cursor": None,
+ "options": None,
"widget": None,
"widget_properties": None,
"keymap": None,
@@ -536,6 +539,9 @@ class ToolSelectPanelHelper:
visited.add(km_name)
yield (km_name, cls.bl_space_type, 'WINDOW', [])
+ # Callable types don't use fall-backs.
+ if isinstance(km_name, str):
+ yield (km_name + " (fallback)", cls.bl_space_type, 'WINDOW', [])
# -------------------------------------------------------------------------
# Layout Generators
@@ -988,16 +994,22 @@ def _activate_by_item(context, space_type, item, index, *, as_fallback=False):
gizmo_group = item.widget or ""
+ idname_fallback = (item_fallback and item_fallback.idname) or ""
+ keymap_fallback = (item_fallback and item_fallback.keymap and item_fallback.keymap[0]) or ""
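+ # Match the " (fallback)" suffix used when generating the fallback keymaps.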
+ if keymap_fallback:
+ keymap_fallback = keymap_fallback + " (fallback)"
+
tool.setup(
idname=item.idname,
keymap=item.keymap[0] if item.keymap is not None else "",
cursor=item.cursor or 'DEFAULT',
+ options=item.options or set(),
gizmo_group=gizmo_group,
data_block=item.data_block or "",
operator=item.operator or "",
index=index,
- idname_fallback=(item_fallback and item_fallback.idname) or "",
- keymap_fallback=(item_fallback and item_fallback.keymap and item_fallback.keymap[0]) or "",
+ idname_fallback=idname_fallback,
+ keymap_fallback=keymap_fallback,
)
if (
diff --git a/release/scripts/startup/bl_ui/space_toolsystem_toolbar.py b/release/scripts/startup/bl_ui/space_toolsystem_toolbar.py
index 83d94235c2e..a4a51cb9910 100644
--- a/release/scripts/startup/bl_ui/space_toolsystem_toolbar.py
+++ b/release/scripts/startup/bl_ui/space_toolsystem_toolbar.py
@@ -105,6 +105,7 @@ class _defs_view3d_generic:
icon="ops.generic.cursor",
keymap="3D View Tool: Cursor",
draw_settings=draw_settings,
+ options={'KEYMAP_FALLBACK'},
)
@ToolDef.from_fn
@@ -143,6 +144,7 @@ class _defs_view3d_generic:
icon="ops.view3d.ruler",
widget="VIEW3D_GGT_ruler",
keymap="3D View Tool: Measure",
+ options={'KEYMAP_FALLBACK'},
)
@@ -237,6 +239,7 @@ class _defs_annotate:
cursor='PAINT_BRUSH',
keymap="Generic Tool: Annotate",
draw_settings=draw_settings,
+ options={'KEYMAP_FALLBACK'},
)
@ToolDef.from_fn.with_args(draw_settings=draw_settings_common)
@@ -248,6 +251,7 @@ class _defs_annotate:
cursor='PAINT_BRUSH',
keymap="Generic Tool: Annotate Line",
draw_settings=draw_settings,
+ options={'KEYMAP_FALLBACK'},
)
@ToolDef.from_fn.with_args(draw_settings=draw_settings_common)
@@ -259,6 +263,7 @@ class _defs_annotate:
cursor='PAINT_BRUSH',
keymap="Generic Tool: Annotate Polygon",
draw_settings=draw_settings,
+ options={'KEYMAP_FALLBACK'},
)
@ToolDef.from_fn
@@ -274,6 +279,7 @@ class _defs_annotate:
cursor='ERASER',
keymap="Generic Tool: Annotate Eraser",
draw_settings=draw_settings,
+ options={'KEYMAP_FALLBACK'},
)
@@ -543,6 +549,7 @@ class _defs_view3d_add:
widget="VIEW3D_GGT_placement",
keymap="3D View Tool: Object, Add Primitive",
draw_settings=draw_settings,
+ options={'KEYMAP_FALLBACK'},
)
@ToolDef.from_fn
@@ -569,6 +576,7 @@ class _defs_view3d_add:
widget="VIEW3D_GGT_placement",
keymap="3D View Tool: Object, Add Primitive",
draw_settings=draw_settings,
+ options={'KEYMAP_FALLBACK'},
)
@ToolDef.from_fn
@@ -594,6 +602,7 @@ class _defs_view3d_add:
widget="VIEW3D_GGT_placement",
keymap="3D View Tool: Object, Add Primitive",
draw_settings=draw_settings,
+ options={'KEYMAP_FALLBACK'},
)
@ToolDef.from_fn
@@ -619,6 +628,7 @@ class _defs_view3d_add:
widget="VIEW3D_GGT_placement",
keymap="3D View Tool: Object, Add Primitive",
draw_settings=draw_settings,
+ options={'KEYMAP_FALLBACK'},
)
@ToolDef.from_fn
@@ -643,6 +653,7 @@ class _defs_view3d_add:
widget="VIEW3D_GGT_placement",
keymap="3D View Tool: Object, Add Primitive",
draw_settings=draw_settings,
+ options={'KEYMAP_FALLBACK'},
)
@@ -1107,6 +1118,7 @@ class _defs_edit_mesh:
widget=None,
keymap=(),
draw_settings=draw_settings,
+ options={'KEYMAP_FALLBACK'},
)
@ToolDef.from_fn
@@ -1713,6 +1725,7 @@ class _defs_image_generic:
),
icon="ops.generic.cursor",
keymap=(),
+ options={'KEYMAP_FALLBACK'},
)
# Currently a placeholder so we can switch away from the annotation tool.
@@ -1864,6 +1877,7 @@ class _defs_image_uv_edit:
# TODO: generic operator (UV version of `VIEW3D_GGT_tool_generic_handle_free`).
widget=None,
keymap=(),
+ options={'KEYMAP_FALLBACK'},
)
@@ -1897,6 +1911,7 @@ class _defs_image_uv_sculpt:
operator="sculpt.uv_sculpt_stroke",
keymap="Image Editor Tool: Uv, Sculpt Stroke",
draw_cursor=draw_cursor,
+ options={'KEYMAP_FALLBACK'},
),
)
@@ -2457,6 +2472,39 @@ class _defs_sequencer_generic:
keymap="Sequencer Tool: Sample",
)
+ @ToolDef.from_fn
+ def translate():
+ return dict(
+ idname="builtin.move",
+ label="Move",
+ icon="ops.transform.translate",
+ widget="SEQUENCER_GGT_gizmo2d_translate",
+ operator="transform.translate",
+ keymap="Sequencer Tool: Move",
+ )
+
+ @ToolDef.from_fn
+ def rotate():
+ return dict(
+ idname="builtin.rotate",
+ label="Rotate",
+ icon="ops.transform.rotate",
+ widget="SEQUENCER_GGT_gizmo2d_rotate",
+ operator="transform.rotate",
+ keymap="Sequencer Tool: Rotate",
+ )
+
+ @ToolDef.from_fn
+ def scale():
+ return dict(
+ idname="builtin.scale",
+ label="Scale",
+ icon="ops.transform.resize",
+ widget="SEQUENCER_GGT_gizmo2d_resize",
+ operator="transform.resize",
+ keymap="Sequencer Tool: Scale",
+ )
+
class _defs_sequencer_select:
@ToolDef.from_fn
@@ -3048,6 +3096,10 @@ class SEQUENCER_PT_tools_active(ToolSelectPanelHelper, Panel):
None: [
],
'PREVIEW': [
+ *_tools_select,
+ _defs_sequencer_generic.translate,
+ _defs_sequencer_generic.rotate,
+ _defs_sequencer_generic.scale,
_defs_sequencer_generic.sample,
*_tools_annotate,
],
@@ -3057,6 +3109,9 @@ class SEQUENCER_PT_tools_active(ToolSelectPanelHelper, Panel):
],
'SEQUENCER_PREVIEW': [
*_tools_select,
+ _defs_sequencer_generic.translate,
+ _defs_sequencer_generic.rotate,
+ _defs_sequencer_generic.scale,
_defs_sequencer_generic.blade,
_defs_sequencer_generic.sample,
*_tools_annotate,
diff --git a/release/scripts/startup/bl_ui/space_view3d.py b/release/scripts/startup/bl_ui/space_view3d.py
index a332295715c..3879f7de250 100644
--- a/release/scripts/startup/bl_ui/space_view3d.py
+++ b/release/scripts/startup/bl_ui/space_view3d.py
@@ -3453,6 +3453,7 @@ class VIEW3D_MT_pose_slide(Menu):
layout.operator("pose.push")
layout.operator("pose.relax")
layout.operator("pose.breakdown")
+ layout.operator("pose.blend_to_neighbour")
class VIEW3D_MT_pose_propagate(Menu):
@@ -3605,6 +3606,7 @@ class VIEW3D_MT_pose_context_menu(Menu):
layout.operator("pose.push")
layout.operator("pose.relax")
layout.operator("pose.breakdown")
+ layout.operator("pose.blend_to_neighbour")
layout.separator()
diff --git a/release/scripts/startup/nodeitems_builtins.py b/release/scripts/startup/nodeitems_builtins.py
index aea9cbc5c62..77ffb609dd2 100644
--- a/release/scripts/startup/nodeitems_builtins.py
+++ b/release/scripts/startup/nodeitems_builtins.py
@@ -503,6 +503,7 @@ geometry_node_categories = [
NodeItem("GeometryNodeAttributeRemove", poll=geometry_nodes_fields_legacy_poll),
NodeItem("GeometryNodeAttributeCapture", poll=geometry_nodes_fields_poll),
+ NodeItem("GeometryNodeAttributeStatistic", poll=geometry_nodes_fields_poll),
]),
GeometryNodeCategory("GEO_COLOR", "Color", items=[
NodeItem("ShaderNodeMixRGB"),
@@ -526,6 +527,9 @@ geometry_node_categories = [
NodeItem("GeometryNodeCurveFill"),
NodeItem("GeometryNodeCurveTrim"),
NodeItem("GeometryNodeCurveLength"),
+ NodeItem("GeometryNodeCurveParameter", poll=geometry_nodes_fields_poll),
+ NodeItem("GeometryNodeInputTangent", poll=geometry_nodes_fields_poll),
+ NodeItem("GeometryNodeCurveSample", poll=geometry_nodes_fields_poll),
]),
GeometryNodeCategory("GEO_PRIMITIVES_CURVE", "Curve Primitives", items=[
NodeItem("GeometryNodeCurvePrimitiveLine"),
@@ -595,6 +599,12 @@ geometry_node_categories = [
NodeItem("GeometryNodeLegacyRotatePoints", poll=geometry_nodes_fields_legacy_poll),
NodeItem("GeometryNodeLegacyAlignRotationToVector", poll=geometry_nodes_fields_legacy_poll),
]),
+ GeometryNodeCategory("GEO_TEXT", "Text", items=[
+ NodeItem("FunctionNodeStringLength"),
+ NodeItem("FunctionNodeStringSubstring"),
+ NodeItem("FunctionNodeValueToString"),
+ NodeItem("GeometryNodeStringJoin"),
+ ]),
GeometryNodeCategory("GEO_UTILITIES", "Utilities", items=[
NodeItem("ShaderNodeMapRange"),
NodeItem("ShaderNodeClamp"),
diff --git a/source/blender/blenkernel/BKE_blender_version.h b/source/blender/blenkernel/BKE_blender_version.h
index 63d6b9121d2..1f25106404a 100644
--- a/source/blender/blenkernel/BKE_blender_version.h
+++ b/source/blender/blenkernel/BKE_blender_version.h
@@ -39,7 +39,7 @@ extern "C" {
/* Blender file format version. */
#define BLENDER_FILE_VERSION BLENDER_VERSION
-#define BLENDER_FILE_SUBVERSION 23
+#define BLENDER_FILE_SUBVERSION 25
/* Minimum Blender version that supports reading file written with the current
* version. Older Blender versions will test this and show a warning if the file
diff --git a/source/blender/render/intern/initrender.h b/source/blender/blenkernel/BKE_curve_to_mesh.hh
index f5ac352752f..cc1ef08908d 100644
--- a/source/blender/render/intern/initrender.h
+++ b/source/blender/blenkernel/BKE_curve_to_mesh.hh
@@ -12,27 +12,20 @@
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- * The Original Code is Copyright (C) 2001-2002 by NaN Holding BV.
- * All rights reserved.
- */
-
-/** \file
- * \ingroup render
*/
#pragma once
-#ifdef __cplusplus
-extern "C" {
-#endif
+struct Mesh;
+struct CurveEval;
+
+/** \file
+ * \ingroup geo
+ */
-/* Functions */
+namespace blender::bke {
-void RE_parts_init(Render *re);
-void RE_parts_free(Render *re);
-void RE_parts_clamp(Render *re);
+Mesh *curve_to_mesh_sweep(const CurveEval &curve, const CurveEval &profile);
+Mesh *curve_to_wire_mesh(const CurveEval &curve);
-#ifdef __cplusplus
-}
-#endif
+} // namespace blender::bke
diff --git a/source/blender/blenkernel/BKE_geometry_set.hh b/source/blender/blenkernel/BKE_geometry_set.hh
index 98f5de43f84..317d513dae6 100644
--- a/source/blender/blenkernel/BKE_geometry_set.hh
+++ b/source/blender/blenkernel/BKE_geometry_set.hh
@@ -580,6 +580,9 @@ class InstancesComponent : public GeometryComponent {
blender::Span<InstanceReference> references() const;
+ void ensure_geometry_instances();
+ GeometrySet &geometry_set_from_reference(const int reference_index);
+
blender::Span<int> instance_reference_handles() const;
blender::MutableSpan<int> instance_reference_handles();
blender::MutableSpan<blender::float4x4> instance_transforms();
@@ -588,6 +591,7 @@ class InstancesComponent : public GeometryComponent {
blender::Span<int> instance_ids() const;
int instances_amount() const;
+ int references_amount() const;
blender::Span<int> almost_unique_ids() const;
diff --git a/source/blender/blenkernel/BKE_gpencil_geom.h b/source/blender/blenkernel/BKE_gpencil_geom.h
index d472fd6f02b..a9cd553a8fe 100644
--- a/source/blender/blenkernel/BKE_gpencil_geom.h
+++ b/source/blender/blenkernel/BKE_gpencil_geom.h
@@ -114,7 +114,12 @@ void BKE_gpencil_dissolve_points(struct bGPdata *gpd,
bool BKE_gpencil_stroke_stretch(struct bGPDstroke *gps,
const float dist,
const float overshoot_fac,
- const short mode);
+ const short mode,
+ const bool follow_curvature,
+ const int extra_point_count,
+ const float segment_influence,
+ const float max_angle,
+ const bool invert_curvature);
bool BKE_gpencil_stroke_trim_points(struct bGPDstroke *gps,
const int index_from,
const int index_to);
diff --git a/source/blender/blenkernel/BKE_node.h b/source/blender/blenkernel/BKE_node.h
index 8e82ab6d6be..42e2cda8de3 100644
--- a/source/blender/blenkernel/BKE_node.h
+++ b/source/blender/blenkernel/BKE_node.h
@@ -1494,6 +1494,11 @@ int ntreeTexExecTree(struct bNodeTree *ntree,
#define GEO_NODE_MATERIAL_SELECTION 1081
#define GEO_NODE_MATERIAL_ASSIGN 1082
#define GEO_NODE_REALIZE_INSTANCES 1083
+#define GEO_NODE_ATTRIBUTE_STATISTIC 1084
+#define GEO_NODE_CURVE_SAMPLE 1085
+#define GEO_NODE_INPUT_TANGENT 1086
+#define GEO_NODE_STRING_JOIN 1087
+#define GEO_NODE_CURVE_PARAMETER 1088
/** \} */
@@ -1507,6 +1512,9 @@ int ntreeTexExecTree(struct bNodeTree *ntree,
#define FN_NODE_INPUT_VECTOR 1207
#define FN_NODE_INPUT_STRING 1208
#define FN_NODE_FLOAT_TO_INT 1209
+#define FN_NODE_VALUE_TO_STRING 1210
+#define FN_NODE_STRING_LENGTH 1211
+#define FN_NODE_STRING_SUBSTRING 1212
/** \} */
diff --git a/source/blender/blenkernel/BKE_spline.hh b/source/blender/blenkernel/BKE_spline.hh
index 0fbf39a52fa..541ff19c1cd 100644
--- a/source/blender/blenkernel/BKE_spline.hh
+++ b/source/blender/blenkernel/BKE_spline.hh
@@ -565,6 +565,7 @@ struct CurveEval {
blender::Array<int> control_point_offsets() const;
blender::Array<int> evaluated_point_offsets() const;
+ blender::Array<float> accumulated_spline_lengths() const;
void assert_valid_point_attributes() const;
};
diff --git a/source/blender/blenkernel/CMakeLists.txt b/source/blender/blenkernel/CMakeLists.txt
index 0b082bf1c5a..de7864ef36a 100644
--- a/source/blender/blenkernel/CMakeLists.txt
+++ b/source/blender/blenkernel/CMakeLists.txt
@@ -116,6 +116,7 @@ set(SRC
intern/curve_decimate.c
intern/curve_deform.c
intern/curve_eval.cc
+ intern/curve_to_mesh_convert.cc
intern/curveprofile.c
intern/customdata.c
intern/customdata_file.c
@@ -332,6 +333,7 @@ set(SRC
BKE_cryptomatte.h
BKE_cryptomatte.hh
BKE_curve.h
+ BKE_curve_to_mesh.hh
BKE_curveprofile.h
BKE_customdata.h
BKE_customdata_file.h
diff --git a/source/blender/blenkernel/intern/curve_eval.cc b/source/blender/blenkernel/intern/curve_eval.cc
index ea84766943d..8eec7f5dfab 100644
--- a/source/blender/blenkernel/intern/curve_eval.cc
+++ b/source/blender/blenkernel/intern/curve_eval.cc
@@ -143,6 +143,23 @@ blender::Array<int> CurveEval::evaluated_point_offsets() const
return offsets;
}
+/**
+ * Return the accumulated length at the start of every spline in the curve.
+ *
+ * \note The result is one longer than the spline count; the last element is the total length.
+ */
+blender::Array<float> CurveEval::accumulated_spline_lengths() const
+{
+ Array<float> spline_lengths(splines_.size() + 1);
+ float spline_length = 0.0f;
+ for (const int i : splines_.index_range()) {
+ spline_lengths[i] = spline_length;
+ spline_length += splines_[i]->length();
+ }
+ spline_lengths.last() = spline_length;
+ return spline_lengths;
+}
+
static BezierSpline::HandleType handle_type_from_dna_bezt(const eBezTriple_Handle dna_handle_type)
{
switch (dna_handle_type) {
diff --git a/source/blender/blenkernel/intern/curve_to_mesh_convert.cc b/source/blender/blenkernel/intern/curve_to_mesh_convert.cc
new file mode 100644
index 00000000000..5f2f945192c
--- /dev/null
+++ b/source/blender/blenkernel/intern/curve_to_mesh_convert.cc
@@ -0,0 +1,739 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "BLI_array.hh"
+#include "BLI_set.hh"
+#include "BLI_task.hh"
+
+#include "DNA_mesh_types.h"
+#include "DNA_meshdata_types.h"
+
+#include "BKE_attribute_access.hh"
+#include "BKE_attribute_math.hh"
+#include "BKE_geometry_set.hh"
+#include "BKE_material.h"
+#include "BKE_mesh.h"
+#include "BKE_spline.hh"
+
+#include "BKE_curve_to_mesh.hh"
+
+using blender::fn::GMutableSpan;
+using blender::fn::GSpan;
+using blender::fn::GVArray_Typed;
+using blender::fn::GVArrayPtr;
+
+namespace blender::bke {
+
+/** Information about the creation of one curve spline and profile spline combination. */
+struct ResultInfo {
+ const Spline &spline;
+ const Spline &profile;
+ int vert_offset;
+ int edge_offset;
+ int loop_offset;
+ int poly_offset;
+ int spline_vert_len;
+ int spline_edge_len;
+ int profile_vert_len;
+ int profile_edge_len;
+};
+
+static void vert_extrude_to_mesh_data(const Spline &spline,
+ const float3 profile_vert,
+ MutableSpan<MVert> r_verts,
+ MutableSpan<MEdge> r_edges,
+ const int vert_offset,
+ const int edge_offset)
+{
+ Span<float3> positions = spline.evaluated_positions();
+
+ for (const int i : IndexRange(positions.size() - 1)) {
+ MEdge &edge = r_edges[edge_offset + i];
+ edge.v1 = vert_offset + i;
+ edge.v2 = vert_offset + i + 1;
+ edge.flag = ME_LOOSEEDGE;
+ }
+
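+ /* Add the closing edge between the last and first point of a cyclic spline. */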
+ if (spline.is_cyclic() && spline.evaluated_edges_size() > 1) {
+ MEdge &edge = r_edges[edge_offset + spline.evaluated_edges_size() - 1];
+ edge.v1 = vert_offset;
+ edge.v2 = vert_offset + positions.size() - 1;
+ edge.flag = ME_LOOSEEDGE;
+ }
+
+ for (const int i : positions.index_range()) {
+ MVert &vert = r_verts[vert_offset + i];
+ copy_v3_v3(vert.co, positions[i] + profile_vert);
+ }
+}
+
+static void mark_edges_sharp(MutableSpan<MEdge> edges)
+{
+ for (MEdge &edge : edges) {
+ edge.flag |= ME_SHARP;
+ }
+}
+
+static void spline_extrude_to_mesh_data(const ResultInfo &info,
+ MutableSpan<MVert> r_verts,
+ MutableSpan<MEdge> r_edges,
+ MutableSpan<MLoop> r_loops,
+ MutableSpan<MPoly> r_polys)
+{
+ const Spline &spline = info.spline;
+ const Spline &profile = info.profile;
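+ /* A single-vertex profile degenerates to a wire extrusion along the spline. */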
+ if (info.profile_vert_len == 1) {
+ vert_extrude_to_mesh_data(spline,
+ profile.evaluated_positions()[0],
+ r_verts,
+ r_edges,
+ info.vert_offset,
+ info.edge_offset);
+ return;
+ }
+
+ /* Add the edges running along the length of the curve, starting at each profile vertex. */
+ const int spline_edges_start = info.edge_offset;
+ for (const int i_profile : IndexRange(info.profile_vert_len)) {
+ const int profile_edge_offset = spline_edges_start + i_profile * info.spline_edge_len;
+ for (const int i_ring : IndexRange(info.spline_edge_len)) {
+ const int i_next_ring = (i_ring == info.spline_vert_len - 1) ? 0 : i_ring + 1;
+
+ const int ring_vert_offset = info.vert_offset + info.profile_vert_len * i_ring;
+ const int next_ring_vert_offset = info.vert_offset + info.profile_vert_len * i_next_ring;
+
+ MEdge &edge = r_edges[profile_edge_offset + i_ring];
+ edge.v1 = ring_vert_offset + i_profile;
+ edge.v2 = next_ring_vert_offset + i_profile;
+ edge.flag = ME_EDGEDRAW | ME_EDGERENDER;
+ }
+ }
+
+ /* Add the edges running along each profile ring. */
+ const int profile_edges_start = spline_edges_start +
+ info.profile_vert_len * info.spline_edge_len;
+ for (const int i_ring : IndexRange(info.spline_vert_len)) {
+ const int ring_vert_offset = info.vert_offset + info.profile_vert_len * i_ring;
+
+ const int ring_edge_offset = profile_edges_start + i_ring * info.profile_edge_len;
+ for (const int i_profile : IndexRange(info.profile_edge_len)) {
+ const int i_next_profile = (i_profile == info.profile_vert_len - 1) ? 0 : i_profile + 1;
+
+ MEdge &edge = r_edges[ring_edge_offset + i_profile];
+ edge.v1 = ring_vert_offset + i_profile;
+ edge.v2 = ring_vert_offset + i_next_profile;
+ edge.flag = ME_EDGEDRAW | ME_EDGERENDER;
+ }
+ }
+
+ /* Calculate poly and corner indices. */
+ for (const int i_ring : IndexRange(info.spline_edge_len)) {
+ const int i_next_ring = (i_ring == info.spline_vert_len - 1) ? 0 : i_ring + 1;
+
+ const int ring_vert_offset = info.vert_offset + info.profile_vert_len * i_ring;
+ const int next_ring_vert_offset = info.vert_offset + info.profile_vert_len * i_next_ring;
+
+ const int ring_edge_start = profile_edges_start + info.profile_edge_len * i_ring;
+ const int next_ring_edge_offset = profile_edges_start + info.profile_edge_len * i_next_ring;
+
+ const int ring_poly_offset = info.poly_offset + i_ring * info.profile_edge_len;
+ const int ring_loop_offset = info.loop_offset + i_ring * info.profile_edge_len * 4;
+
+ for (const int i_profile : IndexRange(info.profile_edge_len)) {
+ const int ring_segment_loop_offset = ring_loop_offset + i_profile * 4;
+ const int i_next_profile = (i_profile == info.profile_vert_len - 1) ? 0 : i_profile + 1;
+
+ const int spline_edge_start = spline_edges_start + info.spline_edge_len * i_profile;
+ const int next_spline_edge_start = spline_edges_start +
+ info.spline_edge_len * i_next_profile;
+
+ MPoly &poly = r_polys[ring_poly_offset + i_profile];
+ poly.loopstart = ring_segment_loop_offset;
+ poly.totloop = 4;
+ poly.flag = ME_SMOOTH;
+
+ MLoop &loop_a = r_loops[ring_segment_loop_offset];
+ loop_a.v = ring_vert_offset + i_profile;
+ loop_a.e = ring_edge_start + i_profile;
+ MLoop &loop_b = r_loops[ring_segment_loop_offset + 1];
+ loop_b.v = ring_vert_offset + i_next_profile;
+ loop_b.e = next_spline_edge_start + i_ring;
+ MLoop &loop_c = r_loops[ring_segment_loop_offset + 2];
+ loop_c.v = next_ring_vert_offset + i_next_profile;
+ loop_c.e = next_ring_edge_offset + i_profile;
+ MLoop &loop_d = r_loops[ring_segment_loop_offset + 3];
+ loop_d.v = next_ring_vert_offset + i_profile;
+ loop_d.e = spline_edge_start + i_ring;
+ }
+ }
+
+ /* Calculate the positions of each profile ring along the spline. */
+ Span<float3> positions = spline.evaluated_positions();
+ Span<float3> tangents = spline.evaluated_tangents();
+ Span<float3> normals = spline.evaluated_normals();
+ Span<float3> profile_positions = profile.evaluated_positions();
+
+ GVArray_Typed<float> radii = spline.interpolate_to_evaluated(spline.radii());
+ for (const int i_ring : IndexRange(info.spline_vert_len)) {
+ float4x4 point_matrix = float4x4::from_normalized_axis_data(
+ positions[i_ring], normals[i_ring], tangents[i_ring]);
+ point_matrix.apply_scale(radii[i_ring]);
+
+ const int ring_vert_start = info.vert_offset + i_ring * info.profile_vert_len;
+ for (const int i_profile : IndexRange(info.profile_vert_len)) {
+ MVert &vert = r_verts[ring_vert_start + i_profile];
+ copy_v3_v3(vert.co, point_matrix * profile_positions[i_profile]);
+ }
+ }
+
+ /* Mark edge loops coming from sharp (vector) control points as sharp. */
+ if (profile.type() == Spline::Type::Bezier) {
+ const BezierSpline &bezier_spline = static_cast<const BezierSpline &>(profile);
+ Span<int> control_point_offsets = bezier_spline.control_point_offsets();
+ for (const int i : IndexRange(bezier_spline.size())) {
+ if (bezier_spline.point_is_sharp(i)) {
+ mark_edges_sharp(
+ r_edges.slice(spline_edges_start + info.spline_edge_len * control_point_offsets[i],
+ info.spline_edge_len));
+ }
+ }
+ }
+}
+
+static inline int spline_extrude_vert_size(const Spline &curve, const Spline &profile)
+{
+ return curve.evaluated_points_size() * profile.evaluated_points_size();
+}
+
+static inline int spline_extrude_edge_size(const Spline &curve, const Spline &profile)
+{
+ /* Add the ring edges, with one ring for every curve vertex, and the edge loops
+ * that run along the length of the curve, starting on the first profile. */
+ return curve.evaluated_points_size() * profile.evaluated_edges_size() +
+ curve.evaluated_edges_size() * profile.evaluated_points_size();
+}
+
+static inline int spline_extrude_loop_size(const Spline &curve, const Spline &profile)
+{
+ return curve.evaluated_edges_size() * profile.evaluated_edges_size() * 4;
+}
+
+static inline int spline_extrude_poly_size(const Spline &curve, const Spline &profile)
+{
+ return curve.evaluated_edges_size() * profile.evaluated_edges_size();
+}
+
+struct ResultOffsets {
+ Array<int> vert;
+ Array<int> edge;
+ Array<int> loop;
+ Array<int> poly;
+};
+static ResultOffsets calculate_result_offsets(Span<SplinePtr> profiles, Span<SplinePtr> curves)
+{
+ const int total = profiles.size() * curves.size();
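+ /* One entry per curve/profile combination plus a final entry holding the total,
+ * so each mesh domain can be sliced with `offsets[i + 1] - offsets[i]`. */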
+ Array<int> vert(total + 1);
+ Array<int> edge(total + 1);
+ Array<int> loop(total + 1);
+ Array<int> poly(total + 1);
+
+ int mesh_index = 0;
+ int vert_offset = 0;
+ int edge_offset = 0;
+ int loop_offset = 0;
+ int poly_offset = 0;
+ for (const int i_spline : curves.index_range()) {
+ for (const int i_profile : profiles.index_range()) {
+ vert[mesh_index] = vert_offset;
+ edge[mesh_index] = edge_offset;
+ loop[mesh_index] = loop_offset;
+ poly[mesh_index] = poly_offset;
+ vert_offset += spline_extrude_vert_size(*curves[i_spline], *profiles[i_profile]);
+ edge_offset += spline_extrude_edge_size(*curves[i_spline], *profiles[i_profile]);
+ loop_offset += spline_extrude_loop_size(*curves[i_spline], *profiles[i_profile]);
+ poly_offset += spline_extrude_poly_size(*curves[i_spline], *profiles[i_profile]);
+ mesh_index++;
+ }
+ }
+ vert.last() = vert_offset;
+ edge.last() = edge_offset;
+ loop.last() = loop_offset;
+ poly.last() = poly_offset;
+
+ return {std::move(vert), std::move(edge), std::move(loop), std::move(poly)};
+}
+
+static AttributeDomain get_result_attribute_domain(const MeshComponent &component,
+ const AttributeIDRef &attribute_id)
+{
+ /* Only use a different domain if the attribute is builtin and must exist on only one domain. */
+ if (!component.attribute_is_builtin(attribute_id)) {
+ return ATTR_DOMAIN_POINT;
+ }
+
+ std::optional<AttributeMetaData> meta_data = component.attribute_get_meta_data(attribute_id);
+ if (!meta_data) {
+ /* This function has to return something in this case, but it shouldn't be used,
+ * so return an output that will assert later if the code attempts to handle it. */
+ return ATTR_DOMAIN_AUTO;
+ }
+
+ return meta_data->domain;
+}
+
+/**
+ * The data stored in the attribute and its domain from #OutputAttribute, to avoid calling
+ * `as_span()` for every single profile and curve spline combination, and for readability.
+ */
+struct ResultAttributeData {
+ GMutableSpan data;
+ AttributeDomain domain;
+};
+
+static std::optional<ResultAttributeData> create_attribute_and_get_span(
+ MeshComponent &component,
+ const AttributeIDRef &attribute_id,
+ AttributeMetaData meta_data,
+ Vector<OutputAttribute> &r_attributes)
+{
+ const AttributeDomain domain = get_result_attribute_domain(component, attribute_id);
+ OutputAttribute attribute = component.attribute_try_get_for_output_only(
+ attribute_id, domain, meta_data.data_type);
+ if (!attribute) {
+ return std::nullopt;
+ }
+
+ GMutableSpan span = attribute.as_span();
+ r_attributes.append(std::move(attribute));
+ return std::make_optional<ResultAttributeData>({span, domain});
+}
+
+/**
+ * Store the references to the attribute data from the curve and profile inputs. Here we rely on
+ * the invariants of the storage of curve attributes, that the order will be consistent between
+ * splines, and all splines will have the same attributes.
+ */
+struct ResultAttributes {
+ /**
+ * Result attributes on the mesh corresponding to each attribute on the curve input, in the same
+ * order. The data is optional only in case the attribute does not exist on the mesh for some
+ * reason, like "shade_smooth" when the result has no faces.
+ */
+ Vector<std::optional<ResultAttributeData>> curve_point_attributes;
+ Vector<std::optional<ResultAttributeData>> curve_spline_attributes;
+
+ /**
+ * Result attributes corresponding to the attributes on the profile input, in the same order.
+ * The attributes are optional in case the attribute names correspond to names used by the curve
+ * input, in which case the curve input attributes take precedence.
+ */
+ Vector<std::optional<ResultAttributeData>> profile_point_attributes;
+ Vector<std::optional<ResultAttributeData>> profile_spline_attributes;
+
+ /**
+ * Because some builtin attributes are not stored contiguously, and the curve inputs might have
+ * attributes with those names, it's necessary to keep OutputAttributes around to give access to
+ * the result data in a contiguous array.
+ */
+ Vector<OutputAttribute> attributes;
+};
+static ResultAttributes create_result_attributes(const CurveEval &curve,
+ const CurveEval &profile,
+ Mesh &mesh)
+{
+ MeshComponent mesh_component;
+ mesh_component.replace(&mesh, GeometryOwnershipType::Editable);
+ Set<AttributeIDRef> curve_attributes;
+
+ /* In order to prefer attributes on the main curve input when there are name collisions, first
+ * check the attributes on the curve, then add attributes on the profile that are not also on the
+ * main curve input. */
+ ResultAttributes result;
+ curve.splines().first()->attributes.foreach_attribute(
+ [&](const AttributeIDRef &id, const AttributeMetaData &meta_data) {
+ curve_attributes.add_new(id);
+ result.curve_point_attributes.append(
+ create_attribute_and_get_span(mesh_component, id, meta_data, result.attributes));
+ return true;
+ },
+ ATTR_DOMAIN_POINT);
+ curve.attributes.foreach_attribute(
+ [&](const AttributeIDRef &id, const AttributeMetaData &meta_data) {
+ curve_attributes.add_new(id);
+ result.curve_spline_attributes.append(
+ create_attribute_and_get_span(mesh_component, id, meta_data, result.attributes));
+ return true;
+ },
+ ATTR_DOMAIN_CURVE);
+ profile.splines().first()->attributes.foreach_attribute(
+ [&](const AttributeIDRef &id, const AttributeMetaData &meta_data) {
+ if (curve_attributes.contains(id)) {
+ result.profile_point_attributes.append({});
+ }
+ else {
+ result.profile_point_attributes.append(
+ create_attribute_and_get_span(mesh_component, id, meta_data, result.attributes));
+ }
+ return true;
+ },
+ ATTR_DOMAIN_POINT);
+ profile.attributes.foreach_attribute(
+ [&](const AttributeIDRef &id, const AttributeMetaData &meta_data) {
+ if (curve_attributes.contains(id)) {
+ result.profile_spline_attributes.append({});
+ }
+ else {
+ result.profile_spline_attributes.append(
+ create_attribute_and_get_span(mesh_component, id, meta_data, result.attributes));
+ }
+ return true;
+ },
+ ATTR_DOMAIN_CURVE);
+
+ return result;
+}
+
+template<typename T>
+static void copy_curve_point_data_to_mesh_verts(const Span<T> src,
+ const ResultInfo &info,
+ MutableSpan<T> dst)
+{
+ for (const int i_ring : IndexRange(info.spline_vert_len)) {
+ const int ring_vert_start = info.vert_offset + i_ring * info.profile_vert_len;
+ dst.slice(ring_vert_start, info.profile_vert_len).fill(src[i_ring]);
+ }
+}
+
+template<typename T>
+static void copy_curve_point_data_to_mesh_edges(const Span<T> src,
+ const ResultInfo &info,
+ MutableSpan<T> dst)
+{
+ const int edges_start = info.edge_offset + info.profile_vert_len * info.spline_edge_len;
+ for (const int i_ring : IndexRange(info.spline_vert_len)) {
+ const int ring_edge_start = edges_start + info.profile_edge_len * i_ring;
+ dst.slice(ring_edge_start, info.profile_edge_len).fill(src[i_ring]);
+ }
+}
+
+template<typename T>
+static void copy_curve_point_data_to_mesh_faces(const Span<T> src,
+ const ResultInfo &info,
+ MutableSpan<T> dst)
+{
+ for (const int i_ring : IndexRange(info.spline_edge_len)) {
+ const int ring_face_start = info.poly_offset + info.profile_edge_len * i_ring;
+ dst.slice(ring_face_start, info.profile_edge_len).fill(src[i_ring]);
+ }
+}
+
+static void copy_curve_point_attribute_to_mesh(const GSpan src,
+ const ResultInfo &info,
+ ResultAttributeData &dst)
+{
+ GVArrayPtr interpolated_gvarray = info.spline.interpolate_to_evaluated(src);
+ GSpan interpolated = interpolated_gvarray->get_internal_span();
+
+ attribute_math::convert_to_static_type(src.type(), [&](auto dummy) {
+ using T = decltype(dummy);
+ switch (dst.domain) {
+ case ATTR_DOMAIN_POINT:
+ copy_curve_point_data_to_mesh_verts(interpolated.typed<T>(), info, dst.data.typed<T>());
+ break;
+ case ATTR_DOMAIN_EDGE:
+ copy_curve_point_data_to_mesh_edges(interpolated.typed<T>(), info, dst.data.typed<T>());
+ break;
+ case ATTR_DOMAIN_FACE:
+ copy_curve_point_data_to_mesh_faces(interpolated.typed<T>(), info, dst.data.typed<T>());
+ break;
+ case ATTR_DOMAIN_CORNER:
+ /* Unsupported for now, since there are no builtin attributes to convert into. */
+ break;
+ default:
+ BLI_assert_unreachable();
+ break;
+ }
+ });
+}
+
+template<typename T>
+static void copy_profile_point_data_to_mesh_verts(const Span<T> src,
+ const ResultInfo &info,
+ MutableSpan<T> dst)
+{
+ for (const int i_ring : IndexRange(info.spline_vert_len)) {
+ const int profile_vert_start = info.vert_offset + i_ring * info.profile_vert_len;
+ for (const int i_profile : IndexRange(info.profile_vert_len)) {
+ dst[profile_vert_start + i_profile] = src[i_profile];
+ }
+ }
+}
+
+template<typename T>
+static void copy_profile_point_data_to_mesh_edges(const Span<T> src,
+ const ResultInfo &info,
+ MutableSpan<T> dst)
+{
+ for (const int i_profile : IndexRange(info.profile_vert_len)) {
+ const int profile_edge_offset = info.edge_offset + i_profile * info.spline_edge_len;
+ dst.slice(profile_edge_offset, info.spline_edge_len).fill(src[i_profile]);
+ }
+}
+
+template<typename T>
+static void copy_profile_point_data_to_mesh_faces(const Span<T> src,
+ const ResultInfo &info,
+ MutableSpan<T> dst)
+{
+ for (const int i_ring : IndexRange(info.spline_edge_len)) {
+ const int profile_face_start = info.poly_offset + i_ring * info.profile_edge_len;
+ for (const int i_profile : IndexRange(info.profile_edge_len)) {
+ dst[profile_face_start + i_profile] = src[i_profile];
+ }
+ }
+}
+
+static void copy_profile_point_attribute_to_mesh(const GSpan src,
+ const ResultInfo &info,
+ ResultAttributeData &dst)
+{
+ GVArrayPtr interpolated_gvarray = info.profile.interpolate_to_evaluated(src);
+ GSpan interpolated = interpolated_gvarray->get_internal_span();
+
+ attribute_math::convert_to_static_type(src.type(), [&](auto dummy) {
+ using T = decltype(dummy);
+ switch (dst.domain) {
+ case ATTR_DOMAIN_POINT:
+ copy_profile_point_data_to_mesh_verts(interpolated.typed<T>(), info, dst.data.typed<T>());
+ break;
+ case ATTR_DOMAIN_EDGE:
+ copy_profile_point_data_to_mesh_edges(interpolated.typed<T>(), info, dst.data.typed<T>());
+ break;
+ case ATTR_DOMAIN_FACE:
+ copy_profile_point_data_to_mesh_faces(interpolated.typed<T>(), info, dst.data.typed<T>());
+ break;
+ case ATTR_DOMAIN_CORNER:
+ /* Unsupported for now, since there are no builtin attributes to convert into. */
+ break;
+ default:
+ BLI_assert_unreachable();
+ break;
+ }
+ });
+}
+
+static void copy_point_domain_attributes_to_mesh(const ResultInfo &info,
+ ResultAttributes &attributes)
+{
+ if (!attributes.curve_point_attributes.is_empty()) {
+ int i = 0;
+ info.spline.attributes.foreach_attribute(
+ [&](const AttributeIDRef &id, const AttributeMetaData &UNUSED(meta_data)) {
+ if (attributes.curve_point_attributes[i]) {
+ copy_curve_point_attribute_to_mesh(*info.spline.attributes.get_for_read(id),
+ info,
+ *attributes.curve_point_attributes[i]);
+ }
+ i++;
+ return true;
+ },
+ ATTR_DOMAIN_POINT);
+ }
+ if (!attributes.profile_point_attributes.is_empty()) {
+ int i = 0;
+ info.profile.attributes.foreach_attribute(
+ [&](const AttributeIDRef &id, const AttributeMetaData &UNUSED(meta_data)) {
+ if (attributes.profile_point_attributes[i]) {
+ copy_profile_point_attribute_to_mesh(*info.profile.attributes.get_for_read(id),
+ info,
+ *attributes.profile_point_attributes[i]);
+ }
+ i++;
+ return true;
+ },
+ ATTR_DOMAIN_POINT);
+ }
+}
+
+template<typename T>
+static void copy_spline_data_to_mesh(Span<T> src, Span<int> offsets, MutableSpan<T> dst)
+{
+ for (const int i : IndexRange(src.size())) {
+ dst.slice(offsets[i], offsets[i + 1] - offsets[i]).fill(src[i]);
+ }
+}
+
+/**
+ * Since the offsets for each combination of curve and profile spline are stored for every mesh
+ * domain, and this just needs to fill the chunks corresponding to each combination, we can use
+ * the same function for all mesh domains.
+ */
+static void copy_spline_attribute_to_mesh(const GSpan src,
+ const ResultOffsets &offsets,
+ ResultAttributeData &dst_attribute)
+{
+ attribute_math::convert_to_static_type(src.type(), [&](auto dummy) {
+ using T = decltype(dummy);
+ switch (dst_attribute.domain) {
+ case ATTR_DOMAIN_POINT:
+ copy_spline_data_to_mesh(src.typed<T>(), offsets.vert, dst_attribute.data.typed<T>());
+ break;
+ case ATTR_DOMAIN_EDGE:
+ copy_spline_data_to_mesh(src.typed<T>(), offsets.edge, dst_attribute.data.typed<T>());
+ break;
+ case ATTR_DOMAIN_FACE:
+ copy_spline_data_to_mesh(src.typed<T>(), offsets.poly, dst_attribute.data.typed<T>());
+ break;
+ case ATTR_DOMAIN_CORNER:
+ copy_spline_data_to_mesh(src.typed<T>(), offsets.loop, dst_attribute.data.typed<T>());
+ break;
+ default:
+ BLI_assert_unreachable();
+ break;
+ }
+ });
+}
+
+static void copy_spline_domain_attributes_to_mesh(const CurveEval &curve,
+ const CurveEval &profile,
+ const ResultOffsets &offsets,
+ ResultAttributes &attributes)
+{
+ if (!attributes.curve_spline_attributes.is_empty()) {
+ int i = 0;
+ curve.attributes.foreach_attribute(
+ [&](const AttributeIDRef &id, const AttributeMetaData &UNUSED(meta_data)) {
+ if (attributes.curve_spline_attributes[i]) {
+ copy_spline_attribute_to_mesh(*curve.attributes.get_for_read(id),
+ offsets,
+ *attributes.curve_spline_attributes[i]);
+ }
+ i++;
+ return true;
+ },
+ ATTR_DOMAIN_CURVE);
+ }
+ if (!attributes.profile_spline_attributes.is_empty()) {
+ int i = 0;
+ profile.attributes.foreach_attribute(
+ [&](const AttributeIDRef &id, const AttributeMetaData &UNUSED(meta_data)) {
+ if (attributes.profile_spline_attributes[i]) {
+ copy_spline_attribute_to_mesh(*profile.attributes.get_for_read(id),
+ offsets,
+ *attributes.profile_spline_attributes[i]);
+ }
+ i++;
+ return true;
+ },
+ ATTR_DOMAIN_CURVE);
+ }
+}
+
+/**
+ * Extrude all splines in the profile curve along the path of every spline in the curve input.
+ * Transfer curve attributes to the mesh.
+ *
+ * \note Normal calculation is by far the slowest part of calculations relating to the result mesh.
+ * Although it would be a sensible decision to use the better topology information available while
+ * generating the mesh to also generate the normals, that work may be wasted if the output mesh is
+ * changed anyway in a way that affects the normals. So currently this code uses the safer /
+ * simpler solution of deferring normal calculation to the rest of Blender.
+ */
+Mesh *curve_to_mesh_sweep(const CurveEval &curve, const CurveEval &profile)
+{
+ Span<SplinePtr> profiles = profile.splines();
+ Span<SplinePtr> curves = curve.splines();
+
+ const ResultOffsets offsets = calculate_result_offsets(profiles, curves);
+ if (offsets.vert.last() == 0) {
+ return nullptr;
+ }
+
+ Mesh *mesh = BKE_mesh_new_nomain(
+ offsets.vert.last(), offsets.edge.last(), 0, offsets.loop.last(), offsets.poly.last());
+ BKE_id_material_eval_ensure_default_slot(&mesh->id);
+ mesh->flag |= ME_AUTOSMOOTH;
+ mesh->smoothresh = DEG2RADF(180.0f);
+ BKE_mesh_normals_tag_dirty(mesh);
+
+ ResultAttributes attributes = create_result_attributes(curve, profile, *mesh);
+
+ threading::parallel_for(curves.index_range(), 128, [&](IndexRange curves_range) {
+ for (const int i_spline : curves_range) {
+ const Spline &spline = *curves[i_spline];
+ if (spline.evaluated_points_size() == 0) {
+ continue;
+ }
+ const int spline_start_index = i_spline * profiles.size();
+ threading::parallel_for(profiles.index_range(), 128, [&](IndexRange profiles_range) {
+ for (const int i_profile : profiles_range) {
+ const Spline &profile = *profiles[i_profile];
+ const int i_mesh = spline_start_index + i_profile;
+ ResultInfo info{
+ spline,
+ profile,
+ offsets.vert[i_mesh],
+ offsets.edge[i_mesh],
+ offsets.loop[i_mesh],
+ offsets.poly[i_mesh],
+ spline.evaluated_points_size(),
+ spline.evaluated_edges_size(),
+ profile.evaluated_points_size(),
+ profile.evaluated_edges_size(),
+ };
+
+ spline_extrude_to_mesh_data(info,
+ {mesh->mvert, mesh->totvert},
+ {mesh->medge, mesh->totedge},
+ {mesh->mloop, mesh->totloop},
+ {mesh->mpoly, mesh->totpoly});
+
+ copy_point_domain_attributes_to_mesh(info, attributes);
+ }
+ });
+ }
+ });
+
+ copy_spline_domain_attributes_to_mesh(curve, profile, offsets, attributes);
+
+ for (OutputAttribute &output_attribute : attributes.attributes) {
+ output_attribute.save();
+ }
+
+ return mesh;
+}
+
+static CurveEval get_curve_single_vert()
+{
+ CurveEval curve;
+ std::unique_ptr<PolySpline> spline = std::make_unique<PolySpline>();
+ spline->add_point(float3(0), 0, 0.0f);
+ curve.add_spline(std::move(spline));
+
+ return curve;
+}
+
+/**
+ * Create a loose-edge mesh based on the evaluated path of the curve's splines.
+ * Transfer curve attributes to the mesh.
+ */
+Mesh *curve_to_wire_mesh(const CurveEval &curve)
+{
+ static const CurveEval vert_curve = get_curve_single_vert();
+ return curve_to_mesh_sweep(curve, vert_curve);
+}
+
+} // namespace blender::bke
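An illustrative caller for the two entry points above (a sketch only, not part of the patch; `sweep_or_wire` is a hypothetical helper and the optional-profile convention is an assumption for the example):

/* Sketch: extrude `profile` along `path` when a profile is supplied,
 * otherwise output the evaluated path as loose edges. */
static Mesh *sweep_or_wire(const CurveEval &path, const CurveEval *profile)
{
  if (profile != nullptr) {
    return blender::bke::curve_to_mesh_sweep(path, *profile);
  }
  return blender::bke::curve_to_wire_mesh(path);
}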
diff --git a/source/blender/blenkernel/intern/geometry_component_instances.cc b/source/blender/blenkernel/intern/geometry_component_instances.cc
index c4e1fe2f8e9..9479d012cb8 100644
--- a/source/blender/blenkernel/intern/geometry_component_instances.cc
+++ b/source/blender/blenkernel/intern/geometry_component_instances.cc
@@ -24,6 +24,7 @@
#include "DNA_collection_types.h"
#include "BKE_geometry_set.hh"
+#include "BKE_geometry_set_instances.hh"
#include "attribute_access_intern.hh"
@@ -32,6 +33,7 @@ using blender::Map;
using blender::MutableSpan;
using blender::Set;
using blender::Span;
+using blender::VectorSet;
/* -------------------------------------------------------------------- */
/** \name Geometry Component Implementation
@@ -120,6 +122,52 @@ blender::Span<int> InstancesComponent::instance_ids() const
}
/**
+ * If references have a collection or object type, convert them into geometry instances. This
+ * will join geometry components from nested instances if necessary. After that, the geometry
+ * sets can be edited.
+ */
+void InstancesComponent::ensure_geometry_instances()
+{
+ VectorSet<InstanceReference> new_references;
+ new_references.reserve(references_.size());
+ for (const InstanceReference &reference : references_) {
+ if (reference.type() == InstanceReference::Type::Object) {
+ GeometrySet geometry_set;
+ InstancesComponent &instances = geometry_set.get_component_for_write<InstancesComponent>();
+ const int handle = instances.add_reference(reference.object());
+ instances.add_instance(handle, float4x4::identity());
+ new_references.add_new(geometry_set);
+ }
+ else if (reference.type() == InstanceReference::Type::Collection) {
+ GeometrySet geometry_set;
+ InstancesComponent &instances = geometry_set.get_component_for_write<InstancesComponent>();
+ const int handle = instances.add_reference(reference.collection());
+ instances.add_instance(handle, float4x4::identity());
+ new_references.add_new(geometry_set);
+ }
+ else {
+ new_references.add_new(reference);
+ }
+ }
+ references_ = std::move(new_references);
+}
+
+/**
+ * With write access to the instances component, the data in the instanced geometry sets can be
+ * changed. This is a method on the component rather than on each reference, so that `const`
+ * correctness is preserved for the references themselves.
+ */
+GeometrySet &InstancesComponent::geometry_set_from_reference(const int reference_index)
+{
+ /* If this assert fails, it means #ensure_geometry_instances must be called first. */
+ BLI_assert(references_[reference_index].type() == InstanceReference::Type::GeometrySet);
+
+ /* The const cast is okay because the instance's hash in the set
+ * is not changed by adjusting the data inside the geometry set. */
+ return const_cast<GeometrySet &>(references_[reference_index].geometry_set());
+}
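A sketch of the call order documented above (not part of the patch; `edit_instanced_geometry` is a hypothetical caller):

/* Sketch: references must be converted to geometry instances before their
 * geometry sets can be edited in place. */
static void edit_instanced_geometry(InstancesComponent &instances)
{
  instances.ensure_geometry_instances();
  for (int i = 0; i < instances.references_amount(); i++) {
    /* Safe now: object and collection references were wrapped in geometry sets. */
    GeometrySet &geometry_set = instances.geometry_set_from_reference(i);
    UNUSED_VARS(geometry_set); /* Mutate `geometry_set` here. */
  }
}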
+
+/**
* Returns a handle for the given reference.
* If the reference exists already, the handle of the existing reference is returned.
* Otherwise a new handle is added.
@@ -139,6 +187,11 @@ int InstancesComponent::instances_amount() const
return instance_transforms_.size();
}
+int InstancesComponent::references_amount() const
+{
+ return references_.size();
+}
+
bool InstancesComponent::is_empty() const
{
return this->instance_reference_handles_.size() == 0;
diff --git a/source/blender/blenkernel/intern/gpencil_geom.cc b/source/blender/blenkernel/intern/gpencil_geom.cc
index 8ff026231f5..976b26a1f3a 100644
--- a/source/blender/blenkernel/intern/gpencil_geom.cc
+++ b/source/blender/blenkernel/intern/gpencil_geom.cc
@@ -541,64 +541,241 @@ bool BKE_gpencil_stroke_sample(bGPdata *gpd, bGPDstroke *gps, const float dist,
}
/**
+ * Add extra stroke points before and after the original tip points.
+ * \param gps: Target stroke.
+ * \param count_before: Number of extra points to add before the start of the stroke.
+ * \param count_after: Number of extra points to add after the end of the stroke.
+ */
+static bool BKE_gpencil_stroke_extra_points(bGPDstroke *gps,
+ const int count_before,
+ const int count_after)
+{
+ bGPDspoint *pts = gps->points;
+
+ BLI_assert(count_before >= 0);
+ BLI_assert(count_after >= 0);
+ if (!count_before && !count_after) {
+ return false;
+ }
+
+ const int new_count = count_before + count_after + gps->totpoints;
+
+ bGPDspoint *new_pts = (bGPDspoint *)MEM_mallocN(sizeof(bGPDspoint) * new_count, __func__);
+
+ for (int i = 0; i < count_before; i++) {
+ memcpy(&new_pts[i], &pts[0], sizeof(bGPDspoint));
+ }
+ memcpy(&new_pts[count_before], pts, sizeof(bGPDspoint) * gps->totpoints);
+ for (int i = new_count - count_after; i < new_count; i++) {
+ memcpy(&new_pts[i], &pts[gps->totpoints - 1], sizeof(bGPDspoint));
+ }
+
+ if (gps->dvert) {
+ MDeformVert *new_dv = (MDeformVert *)MEM_mallocN(sizeof(MDeformVert) * new_count, __func__);
+
+ for (int i = 0; i < new_count; i++) {
+ MDeformVert *dv = &gps->dvert[CLAMPIS(i - count_before, 0, gps->totpoints - 1)];
+ int inew = i;
+ new_dv[inew].flag = dv->flag;
+ new_dv[inew].totweight = dv->totweight;
+ new_dv[inew].dw = (MDeformWeight *)MEM_mallocN(sizeof(MDeformWeight) * dv->totweight,
+ __func__);
+ memcpy(new_dv[inew].dw, dv->dw, sizeof(MDeformWeight) * dv->totweight);
+ }
+ BKE_gpencil_free_stroke_weights(gps);
+ MEM_freeN(gps->dvert);
+ gps->dvert = new_dv;
+ }
+
+ MEM_freeN(gps->points);
+ gps->points = new_pts;
+ gps->totpoints = new_count;
+
+ return true;
+}
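A concrete example of the padding behavior (editorial, not part of the patch):

/* For a stroke with points [A, B, C], calling
 * BKE_gpencil_stroke_extra_points(gps, 2, 1) reallocates the point array to
 * [A, A, A, B, C, C]: the tip points are duplicated so the stretch code
 * below has slots to write the generated extension into. */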
+
+/**
* Backbone stretch similar to Freestyle.
* \param gps: Stroke to sample.
- * \param dist: Distance of one segment.
- * \param overshoot_fac: How exact is the follow curve algorithm.
+ * \param dist: Length of the added section.
+ * \param overshoot_fac: Relative length of the curve used to determine the extension.
 * \param mode: Affects the start, the end, or both extremes (0->Both, 1->Start, 2->End).
+ * \param follow_curvature: True to approximate the curvature of the stroke over the overshoot
+ * section and extend along that arc.
+ * \param extra_point_count: When follow_curvature is true, the number of points generated for
+ * each extended end.
+ * \param segment_influence: How strongly a segment's length weights its angle in the curvature
+ * estimate (an exponent applied to the segment length).
+ * \param max_angle: Corners sharper than this angle (in radians) are ignored for the estimate.
+ * \param invert_curvature: Bend the extensions to the opposite side.
*/
bool BKE_gpencil_stroke_stretch(bGPDstroke *gps,
const float dist,
const float overshoot_fac,
- const short mode)
+ const short mode,
+ const bool follow_curvature,
+ const int extra_point_count,
+ const float segment_influence,
+ const float max_angle,
+ const bool invert_curvature)
{
#define BOTH 0
#define START 1
#define END 2
- bGPDspoint *pt = gps->points, *last_pt, *second_last, *next_pt;
- int i;
- float threshold = (overshoot_fac == 0 ? 0.001f : overshoot_fac);
+ const bool do_start = ELEM(mode, BOTH, START);
+ const bool do_end = ELEM(mode, BOTH, END);
+ float used_percent_length = overshoot_fac;
+ CLAMP(used_percent_length, 1e-4f, 1.0f);
+ if (!isfinite(used_percent_length)) {
+ /* #used_percent_length must always be finite, otherwise a segfault occurs.
+ * Since this function should never segfault, set #used_percent_length to a safe fallback. */
+ /* NOTE: This fallback is used if gps->totpoints == 2, see MOD_gpencillength.c */
+ used_percent_length = 0.1f;
+ }
- if (gps->totpoints < 2 || dist < FLT_EPSILON) {
+ if (gps->totpoints <= 1 || dist < FLT_EPSILON || extra_point_count <= 0) {
return false;
}
- last_pt = &pt[gps->totpoints - 1];
- second_last = &pt[gps->totpoints - 2];
- next_pt = &pt[1];
-
- if (mode == BOTH || mode == START) {
- float len1 = 0.0f;
- i = 1;
- while (len1 < threshold && gps->totpoints > i) {
- next_pt = &pt[i];
- len1 = len_v3v3(&next_pt->x, &pt->x);
- i++;
+ /* NOTE: When the stroke is effectively a straight line (or curvature is disabled), skip the
+ * curvature computation entirely. */
+ if (!follow_curvature || gps->totpoints <= 2) {
+ /* Extend along a straight line. */
+ /* NOTE: #overshoot_point_param cannot be zero. */
+ float overshoot_point_param = used_percent_length * (gps->totpoints - 1);
+ float result[3];
+
+ if (do_start) {
+ int index1 = floor(overshoot_point_param);
+ int index2 = ceil(overshoot_point_param);
+ interp_v3_v3v3(result,
+ &gps->points[index1].x,
+ &gps->points[index2].x,
+ fmodf(overshoot_point_param, 1.0f));
+ sub_v3_v3(result, &gps->points[0].x);
+ if (UNLIKELY(is_zero_v3(result))) {
+ sub_v3_v3v3(result, &gps->points[1].x, &gps->points[0].x);
+ }
+ madd_v3_v3fl(&gps->points[0].x, result, -dist / len_v3(result));
+ }
+
+ if (do_end) {
+ int index1 = gps->totpoints - 1 - floor(overshoot_point_param);
+ int index2 = gps->totpoints - 1 - ceil(overshoot_point_param);
+ interp_v3_v3v3(result,
+ &gps->points[index1].x,
+ &gps->points[index2].x,
+ fmodf(overshoot_point_param, 1.0f));
+ sub_v3_v3(result, &gps->points[gps->totpoints - 1].x);
+ if (UNLIKELY(is_zero_v3(result))) {
+ sub_v3_v3v3(
+ result, &gps->points[gps->totpoints - 2].x, &gps->points[gps->totpoints - 1].x);
+ }
+ madd_v3_v3fl(&gps->points[gps->totpoints - 1].x, result, -dist / len_v3(result));
}
- float extend1 = (len1 + dist) / len1;
- float result1[3];
-
- interp_v3_v3v3(result1, &next_pt->x, &pt->x, extend1);
- copy_v3_v3(&pt->x, result1);
+ return true;
}
- if (mode == BOTH || mode == END) {
- float len2 = 0.0f;
- i = 2;
- while (len2 < threshold && gps->totpoints >= i) {
- second_last = &pt[gps->totpoints - i];
- len2 = len_v3v3(&last_pt->x, &second_last->x);
- i++;
+ /* Curvature calculation. */
+
+ /* First allocate the new stroke size. */
+ const int first_old_index = do_start ? extra_point_count : 0;
+ const int last_old_index = gps->totpoints - 1 + first_old_index;
+ const int orig_totpoints = gps->totpoints;
+ BKE_gpencil_stroke_extra_points(gps, first_old_index, do_end ? extra_point_count : 0);
+
+ /* The fractional amount of points to query when calculating the average curvature of the
+ * strokes. */
+ const float overshoot_parameter = used_percent_length * (orig_totpoints - 2);
+ int overshoot_pointcount = ceil(overshoot_parameter);
+ CLAMP(overshoot_pointcount, 1, orig_totpoints - 2);
+
+ /* Do for both sides without code duplication. */
+ float no[3], vec1[3], vec2[3], total_angle[3];
+ for (int k = 0; k < 2; k++) {
+ if ((k == 0 && !do_start) || (k == 1 && !do_end)) {
+ continue;
}
- float extend2 = (len2 + dist) / len2;
- float result2[3];
- interp_v3_v3v3(result2, &second_last->x, &last_pt->x, extend2);
+ /* Values for k == 0 and k == 1 respectively: the stroke's start or end. */
+ const int start_i = k == 0 ? first_old_index : last_old_index;
+ const int dir_i = 1 - k * 2; /* 1 or -1. */
- copy_v3_v3(&last_pt->x, result2);
- }
+ sub_v3_v3v3(vec1, &gps->points[start_i + dir_i].x, &gps->points[start_i].x);
+ zero_v3(total_angle);
+ float segment_length = normalize_v3(vec1);
+ float overshoot_length = 0.0f;
+
+ /* Accumulate rotation angle and length. */
+ int j = 0;
+ for (int i = start_i; j < overshoot_pointcount; i += dir_i, j++) {
+ /* Don't fully add last segment to get continuity in overshoot_fac. */
+ float fac = fmin(overshoot_parameter - j, 1.0f);
+
+ /* Read segments. */
+ copy_v3_v3(vec2, vec1);
+ sub_v3_v3v3(vec1, &gps->points[i + dir_i * 2].x, &gps->points[i + dir_i].x);
+ const float len = normalize_v3(vec1);
+ float angle = angle_normalized_v3v3(vec1, vec2) * fac;
+
+ /* Add half of both adjacent legs of the current angle. */
+ const float added_len = (segment_length + len) * 0.5f * fac;
+ overshoot_length += added_len;
+ segment_length = len;
+
+ if (angle > max_angle) {
+ continue;
+ }
+ if (angle > M_PI * 0.995f) {
+ continue;
+ }
+
+ angle *= powf(added_len, segment_influence);
+
+ cross_v3_v3v3(no, vec1, vec2);
+ normalize_v3_length(no, angle);
+ add_v3_v3(total_angle, no);
+ }
+ if (UNLIKELY(overshoot_length == 0.0f)) {
+ /* Don't do a proper extension if the used points are all in the same position. */
+ continue;
+ }
+
+ sub_v3_v3v3(vec1, &gps->points[start_i].x, &gps->points[start_i + dir_i].x);
+ /* In general curvature = 1/radius. For the case without the
+ * weights introduced by #segment_influence, the calculation is
+ * curvature = delta angle/delta arclength = len_v3(total_angle) / overshoot_length */
+ float curvature = normalize_v3(total_angle) / overshoot_length;
+ /* Compensate for the weights powf(added_len, segment_influence). */
+ curvature /= powf(overshoot_length / fminf(overshoot_parameter, (float)j), segment_influence);
+ if (invert_curvature) {
+ curvature = -curvature;
+ }
+ const float angle_step = curvature * dist / extra_point_count;
+ float step_length = dist / extra_point_count;
+ if (fabsf(angle_step) > FLT_EPSILON) {
+ /* Make a direct step length from the assigned arc step length. */
+ step_length *= sin(angle_step * 0.5f) / (angle_step * 0.5f);
+ }
+ else {
+ zero_v3(total_angle);
+ }
+ const float prev_length = normalize_v3_length(vec1, step_length);
+
+ /* Build rotation matrix here to get best performance. */
+ float rot[3][3];
+ float q[4];
+ axis_angle_to_quat(q, total_angle, angle_step);
+ quat_to_mat3(rot, q);
+
+ /* Rotate the starting direction to account for change in edge lengths. */
+ axis_angle_to_quat(q,
+ total_angle,
+ fmaxf(0.0f, 1.0f - fabs(segment_influence)) *
+ (curvature * prev_length - angle_step) / 2.0f);
+ mul_qt_v3(q, vec1);
+
+ /* Now iteratively accumulate the segments with a rotating added direction. */
+ for (int i = start_i - dir_i, j = 0; j < extra_point_count; i -= dir_i, j++) {
+ mul_v3_m3v3(vec1, rot, vec1);
+ add_v3_v3v3(&gps->points[i].x, vec1, &gps->points[i + dir_i].x);
+ }
+ }
return true;
}
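An illustrative call with the extended signature (a sketch with arbitrary example values, not part of the patch; `gps` is assumed to be a valid stroke):

/* Sketch: extend both ends by 0.1 units, following curvature estimated from
 * 10% of the stroke, generating 8 extra points per side. */
BKE_gpencil_stroke_stretch(gps,
                           0.1f,        /* dist */
                           0.1f,        /* overshoot_fac */
                           0,           /* mode: BOTH */
                           true,        /* follow_curvature */
                           8,           /* extra_point_count */
                           1.0f,        /* segment_influence */
                           M_PI * 0.5f, /* max_angle */
                           false);      /* invert_curvature */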
@@ -749,6 +926,7 @@ bool BKE_gpencil_stroke_shrink(bGPDstroke *gps, const float dist, const short mo
second_last = &pt[gps->totpoints - 2];
+ float len;
float len1, cut_len1;
float len2, cut_len2;
len1 = len2 = cut_len1 = cut_len2 = 0.0f;
@@ -759,11 +937,13 @@ bool BKE_gpencil_stroke_shrink(bGPDstroke *gps, const float dist, const short mo
i = 0;
index_end = gps->totpoints - 1;
while (len1 < dist && gps->totpoints > i + 1) {
- len1 += len_v3v3(&pt[i].x, &pt[i + 1].x);
+ len = len_v3v3(&pt[i].x, &pt[i + 1].x);
+ len1 += len;
cut_len1 = len1 - dist;
i++;
}
index_start = i - 1;
+ interp_v3_v3v3(&pt[index_start].x, &pt[index_start + 1].x, &pt[index_start].x, cut_len1 / len);
}
if (mode == END) {
@@ -771,18 +951,20 @@ bool BKE_gpencil_stroke_shrink(bGPDstroke *gps, const float dist, const short mo
i = 2;
while (len2 < dist && gps->totpoints >= i) {
second_last = &pt[gps->totpoints - i];
- len2 += len_v3v3(&second_last[1].x, &second_last->x);
+ len = len_v3v3(&second_last[1].x, &second_last->x);
+ len2 += len;
cut_len2 = len2 - dist;
i++;
}
index_end = gps->totpoints - i + 2;
+ interp_v3_v3v3(&pt[index_end].x, &pt[index_end - 1].x, &pt[index_end].x, cut_len2 / len);
}
if (index_end <= index_start) {
index_start = index_end = 0; /* empty stroke */
}
- if ((index_end == index_start + 1) && (cut_len1 + cut_len2 < dist)) {
+ if ((index_end == index_start + 1) && (cut_len1 + cut_len2 < 0)) {
index_start = index_end = 0; /* no length left to cut */
}
diff --git a/source/blender/blenkernel/intern/layer.c b/source/blender/blenkernel/intern/layer.c
index b489675cd74..434a2296d95 100644
--- a/source/blender/blenkernel/intern/layer.c
+++ b/source/blender/blenkernel/intern/layer.c
@@ -183,7 +183,6 @@ static ViewLayer *view_layer_add(const char *name)
view_layer->passflag = SCE_PASS_COMBINED;
view_layer->pass_alpha_threshold = 0.5f;
view_layer->cryptomatte_levels = 6;
- view_layer->cryptomatte_flag = VIEW_LAYER_CRYPTOMATTE_ACCURATE;
BKE_freestyle_config_init(&view_layer->freestyle_config);
return view_layer;
diff --git a/source/blender/blenkernel/intern/lib_override.c b/source/blender/blenkernel/intern/lib_override.c
index 3fead8b0f39..c60a9104144 100644
--- a/source/blender/blenkernel/intern/lib_override.c
+++ b/source/blender/blenkernel/intern/lib_override.c
@@ -865,7 +865,9 @@ static void lib_override_library_create_post_process(Main *bmain,
Object *ob_ref = (Object *)id_ref;
LISTBASE_FOREACH (Collection *, collection, &bmain->collections) {
if (BKE_collection_has_object(collection, ob_ref) &&
- BKE_view_layer_has_collection(view_layer, collection) &&
+ (view_layer != NULL ?
+ BKE_view_layer_has_collection(view_layer, collection) :
+ BKE_collection_has_collection(scene->master_collection, collection)) &&
!ID_IS_LINKED(collection) && !ID_IS_OVERRIDE_LIBRARY(collection)) {
default_instantiating_collection = collection;
}
@@ -897,6 +899,8 @@ static void lib_override_library_create_post_process(Main *bmain,
* \note It will override all IDs tagged with \a LIB_TAG_DOIT, and it does not clear that tag at
* its beginning, so caller code can add extra data-blocks to be overridden as well.
*
+ * \param view_layer: the active view layer to search instantiated collections in, can be NULL (in
+ * which case \a scene's master collection children hierarchy is used instead).
* \param id_root: The root ID to create an override from.
* \param id_reference: Some reference ID used to do some post-processing after overrides have been
* created, may be NULL. Typically, the Empty object instantiating the linked collection we
@@ -960,6 +964,8 @@ bool BKE_lib_override_library_template_create(struct ID *id)
* \note This is a thin wrapper around \a BKE_lib_override_library_create, only extra work is to
* actually convert the proxy itself into an override first.
*
+ * \param view_layer: the active view layer to search instantiated collections in, can be NULL (in
+ * which case \a scene's master collection children hierarchy is used instead).
* \return true if override was successfully created.
*/
bool BKE_lib_override_library_proxy_convert(Main *bmain,
@@ -1002,6 +1008,8 @@ bool BKE_lib_override_library_proxy_convert(Main *bmain,
* data, from an existing override hierarchy.
*
* \param id_root: The root liboverride ID to resync from.
+ * \param view_layer: the active view layer to search instantiated collections in, can be NULL (in
+ * which case \a scene's master collection children hierarchy is used instead).
* \return true if override was successfully resynced.
*/
bool BKE_lib_override_library_resync(Main *bmain,
@@ -1723,6 +1731,9 @@ static int lib_override_libraries_index_define(Main *bmain)
*
* Then it will handle the resync of necessary IDs (through calls to
* #BKE_lib_override_library_resync).
+ *
+ * \param view_layer: the active view layer to search instantiated collections in, can be NULL (in
+ * which case \a scene's master collection children hierarchy is used instead).
*/
void BKE_lib_override_library_main_resync(Main *bmain,
Scene *scene,
diff --git a/source/blender/blenkernel/intern/mesh_convert.cc b/source/blender/blenkernel/intern/mesh_convert.cc
index 07dc6db05aa..467f7d4543e 100644
--- a/source/blender/blenkernel/intern/mesh_convert.cc
+++ b/source/blender/blenkernel/intern/mesh_convert.cc
@@ -1118,7 +1118,7 @@ static Mesh *mesh_new_from_mball_object(Object *object)
* balls and all evaluated child meta balls (since polygonization is only stored in the mother
* ball).
*
- * We create empty mesh so scripters don't run into None objects. */
+ * Create an empty mesh so script authors don't run into None objects. */
if (!DEG_is_evaluated_object(object) || object->runtime.curve_cache == nullptr ||
BLI_listbase_is_empty(&object->runtime.curve_cache->disp)) {
return (Mesh *)BKE_id_new_nomain(ID_ME, ((ID *)object->data)->name + 2);
diff --git a/source/blender/blenkernel/intern/node.cc b/source/blender/blenkernel/intern/node.cc
index 3a76cbf6f84..2d0239740f8 100644
--- a/source/blender/blenkernel/intern/node.cc
+++ b/source/blender/blenkernel/intern/node.cc
@@ -5175,6 +5175,7 @@ static void registerGeometryNodes()
register_node_type_geo_attribute_randomize();
register_node_type_geo_attribute_remove();
register_node_type_geo_attribute_separate_xyz();
+ register_node_type_geo_attribute_statistic();
register_node_type_geo_attribute_transfer();
register_node_type_geo_attribute_vector_math();
register_node_type_geo_attribute_vector_rotate();
@@ -5182,9 +5183,11 @@ static void registerGeometryNodes()
register_node_type_geo_bounding_box();
register_node_type_geo_collection_info();
register_node_type_geo_convex_hull();
+ register_node_type_geo_curve_sample();
register_node_type_geo_curve_endpoints();
register_node_type_geo_curve_fill();
register_node_type_geo_curve_length();
+ register_node_type_geo_curve_parameter();
register_node_type_geo_curve_primitive_bezier_segment();
register_node_type_geo_curve_primitive_circle();
register_node_type_geo_curve_primitive_line();
@@ -5206,6 +5209,7 @@ static void registerGeometryNodes()
register_node_type_geo_input_material();
register_node_type_geo_input_normal();
register_node_type_geo_input_position();
+ register_node_type_geo_input_tangent();
register_node_type_geo_is_viewport();
register_node_type_geo_join_geometry();
register_node_type_geo_material_assign();
@@ -5232,6 +5236,7 @@ static void registerGeometryNodes()
register_node_type_geo_realize_instances();
register_node_type_geo_sample_texture();
register_node_type_geo_select_by_handle_type();
+ register_node_type_geo_string_join();
register_node_type_geo_material_selection();
register_node_type_geo_separate_components();
register_node_type_geo_set_position();
@@ -5251,6 +5256,9 @@ static void registerFunctionNodes()
register_node_type_fn_input_string();
register_node_type_fn_input_vector();
register_node_type_fn_random_float();
+ register_node_type_fn_string_length();
+ register_node_type_fn_string_substring();
+ register_node_type_fn_value_to_string();
}
void BKE_node_system_init(void)
diff --git a/source/blender/blenkernel/intern/screen.c b/source/blender/blenkernel/intern/screen.c
index 73e25a22225..4c38536b662 100644
--- a/source/blender/blenkernel/intern/screen.c
+++ b/source/blender/blenkernel/intern/screen.c
@@ -1679,6 +1679,8 @@ static void direct_link_area(BlendDataReader *reader, ScrArea *area)
sseq->scopes.sep_waveform_ibuf = NULL;
sseq->scopes.vector_ibuf = NULL;
sseq->scopes.histogram_ibuf = NULL;
+ memset(&sseq->runtime, 0x0, sizeof(sseq->runtime));
+
}
else if (sl->spacetype == SPACE_PROPERTIES) {
SpaceProperties *sbuts = (SpaceProperties *)sl;
diff --git a/source/blender/blenkernel/intern/spline_bezier.cc b/source/blender/blenkernel/intern/spline_bezier.cc
index 79d2137ee84..b36d7a21669 100644
--- a/source/blender/blenkernel/intern/spline_bezier.cc
+++ b/source/blender/blenkernel/intern/spline_bezier.cc
@@ -214,6 +214,11 @@ void BezierSpline::ensure_auto_handles() const
return;
}
+ if (this->size() == 1) {
+ auto_handles_dirty_ = false;
+ return;
+ }
+
for (const int i : IndexRange(this->size())) {
if (ELEM(HandleType::Auto, handle_types_left_[i], handle_types_right_[i])) {
const float3 prev_diff = positions_[i] - previous_position(positions_, is_cyclic_, i);
diff --git a/source/blender/blenlib/BLI_array.hh b/source/blender/blenlib/BLI_array.hh
index fc8fc615feb..352bf379d4d 100644
--- a/source/blender/blenlib/BLI_array.hh
+++ b/source/blender/blenlib/BLI_array.hh
@@ -277,6 +277,21 @@ class Array {
}
/**
+ * Return a reference to the first element in the array.
+ * This invokes undefined behavior when the array is empty.
+ */
+ const T &first() const
+ {
+ BLI_assert(size_ > 0);
+ return *data_;
+ }
+ T &first()
+ {
+ BLI_assert(size_ > 0);
+ return *data_;
+ }
+
+ /**
* Return a reference to the last element in the array.
* This invokes undefined behavior when the array is empty.
*/
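A small usage sketch for the new accessor, alongside the existing `last()` (not part of the patch):

blender::Array<int> values = {1, 2, 3};
const int head = values.first(); /* 1 */
const int tail = values.last();  /* 3 */
/* Both assert in debug builds when called on an empty array. */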
diff --git a/source/blender/blenlib/BLI_uuid.h b/source/blender/blenlib/BLI_uuid.h
index 1ce294ed723..9b85f8e65bc 100644
--- a/source/blender/blenlib/BLI_uuid.h
+++ b/source/blender/blenlib/BLI_uuid.h
@@ -35,24 +35,25 @@ extern "C" {
/**
* UUID generator for random (version 4) UUIDs. See RFC4122 section 4.4.
* This function is not thread-safe. */
-UUID BLI_uuid_generate_random(void);
+bUUID BLI_uuid_generate_random(void);
/**
* Return the UUID nil value, consisting of all-zero fields.
*/
-UUID BLI_uuid_nil(void);
+bUUID BLI_uuid_nil(void);
/** Return true only if this is the nil UUID. */
-bool BLI_uuid_is_nil(UUID uuid);
+bool BLI_uuid_is_nil(bUUID uuid);
/** Compare two UUIDs, return true only if they are equal. */
-bool BLI_uuid_equal(UUID uuid1, UUID uuid2);
+bool BLI_uuid_equal(bUUID uuid1, bUUID uuid2);
/**
* Format UUID as string.
* The buffer must be at least 37 bytes (36 bytes for the UUID + terminating 0).
+ * Use `UUID_STRING_LEN` from DNA_uuid_types.h if you want to use a constant for this.
*/
-void BLI_uuid_format(char *buffer, UUID uuid) ATTR_NONNULL();
+void BLI_uuid_format(char *buffer, bUUID uuid) ATTR_NONNULL();
/**
* Parse a string as UUID.
@@ -62,7 +63,7 @@ void BLI_uuid_format(char *buffer, UUID uuid) ATTR_NONNULL();
* Return true if the string could be parsed, and false otherwise. In the latter case, the UUID may
* have been partially updated.
*/
-bool BLI_uuid_parse_string(UUID *uuid, const char *buffer) ATTR_NONNULL();
+bool BLI_uuid_parse_string(bUUID *uuid, const char *buffer) ATTR_NONNULL();
#ifdef __cplusplus
}
@@ -70,6 +71,6 @@ bool BLI_uuid_parse_string(UUID *uuid, const char *buffer) ATTR_NONNULL();
# include <ostream>
/** Output the UUID as formatted ASCII string, see #BLI_uuid_format(). */
-std::ostream &operator<<(std::ostream &stream, UUID uuid);
+std::ostream &operator<<(std::ostream &stream, bUUID uuid);
#endif
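A round-trip sketch for the renamed API (not part of the patch; `uuid_roundtrip_example` is a hypothetical function):

#include "BLI_assert.h"
#include "BLI_uuid.h"

static void uuid_roundtrip_example(void)
{
  const bUUID uuid = BLI_uuid_generate_random();

  char formatted[37]; /* 36 characters + terminating NUL, per the doc above. */
  BLI_uuid_format(formatted, uuid);

  bUUID parsed;
  if (BLI_uuid_parse_string(&parsed, formatted)) {
    BLI_assert(BLI_uuid_equal(uuid, parsed));
  }
}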
diff --git a/source/blender/blenlib/intern/uuid.cc b/source/blender/blenlib/intern/uuid.cc
index f5edb356acc..ae34bcb3d32 100644
--- a/source/blender/blenlib/intern/uuid.cc
+++ b/source/blender/blenlib/intern/uuid.cc
@@ -27,9 +27,9 @@
#include <string>
/* Ensure the UUID struct doesn't have any padding, to be compatible with memcmp(). */
-static_assert(sizeof(UUID) == 16, "expect UUIDs to be 128 bit exactly");
+static_assert(sizeof(bUUID) == 16, "expect UUIDs to be 128 bit exactly");
-UUID BLI_uuid_generate_random()
+bUUID BLI_uuid_generate_random()
{
static std::mt19937_64 rng = []() {
std::mt19937_64 rng;
@@ -57,7 +57,7 @@ UUID BLI_uuid_generate_random()
return rng;
}();
- UUID uuid;
+ bUUID uuid;
/* RFC4122 suggests setting certain bits to a fixed value, and then randomizing the remaining
* bits. The opposite is easier to implement, though, so that's what's done here. */
@@ -78,23 +78,23 @@ UUID BLI_uuid_generate_random()
return uuid;
}
-UUID BLI_uuid_nil(void)
+bUUID BLI_uuid_nil(void)
{
- const UUID nil = {0, 0, 0, 0, 0, 0};
+ const bUUID nil = {0, 0, 0, 0, 0, 0};
return nil;
}
-bool BLI_uuid_is_nil(UUID uuid)
+bool BLI_uuid_is_nil(bUUID uuid)
{
return BLI_uuid_equal(BLI_uuid_nil(), uuid);
}
-bool BLI_uuid_equal(const UUID uuid1, const UUID uuid2)
+bool BLI_uuid_equal(const bUUID uuid1, const bUUID uuid2)
{
return std::memcmp(&uuid1, &uuid2, sizeof(uuid1)) == 0;
}
-void BLI_uuid_format(char *buffer, const UUID uuid)
+void BLI_uuid_format(char *buffer, const bUUID uuid)
{
std::sprintf(buffer,
"%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
@@ -111,7 +111,7 @@ void BLI_uuid_format(char *buffer, const UUID uuid)
uuid.node[5]);
}
-bool BLI_uuid_parse_string(UUID *uuid, const char *buffer)
+bool BLI_uuid_parse_string(bUUID *uuid, const char *buffer)
{
const int num_fields_parsed = std::sscanf(
buffer,
@@ -130,7 +130,7 @@ bool BLI_uuid_parse_string(UUID *uuid, const char *buffer)
return num_fields_parsed == 11;
}
-std::ostream &operator<<(std::ostream &stream, UUID uuid)
+std::ostream &operator<<(std::ostream &stream, bUUID uuid)
{
std::string buffer(36, '\0');
BLI_uuid_format(buffer.data(), uuid);
diff --git a/source/blender/blenlib/tests/BLI_uuid_test.cc b/source/blender/blenlib/tests/BLI_uuid_test.cc
index 31c69002c1c..731489c6c9e 100644
--- a/source/blender/blenlib/tests/BLI_uuid_test.cc
+++ b/source/blender/blenlib/tests/BLI_uuid_test.cc
@@ -20,7 +20,7 @@
TEST(BLI_uuid, generate_random)
{
- const UUID uuid = BLI_uuid_generate_random();
+ const bUUID uuid = BLI_uuid_generate_random();
// The 4 MSbits represent the "version" of the UUID.
const uint16_t version = uuid.time_hi_and_version >> 12;
@@ -33,11 +33,11 @@ TEST(BLI_uuid, generate_random)
TEST(BLI_uuid, generate_many_random)
{
- const UUID first_uuid = BLI_uuid_generate_random();
+ const bUUID first_uuid = BLI_uuid_generate_random();
/* Generate lots of UUIDs to get some indication that the randomness is okay. */
for (int i = 0; i < 1000000; ++i) {
- const UUID uuid = BLI_uuid_generate_random();
+ const bUUID uuid = BLI_uuid_generate_random();
EXPECT_FALSE(BLI_uuid_equal(first_uuid, uuid));
// Check that the non-random bits are set according to RFC4122.
@@ -50,8 +50,8 @@ TEST(BLI_uuid, generate_many_random)
TEST(BLI_uuid, nil_value)
{
- const UUID nil_uuid = BLI_uuid_nil();
- const UUID zeroes_uuid = {0, 0, 0, 0, 0, 0};
+ const bUUID nil_uuid = BLI_uuid_nil();
+ const bUUID zeroes_uuid = {0, 0, 0, 0, 0, 0};
EXPECT_TRUE(BLI_uuid_equal(nil_uuid, zeroes_uuid));
EXPECT_TRUE(BLI_uuid_is_nil(nil_uuid));
@@ -63,8 +63,8 @@ TEST(BLI_uuid, nil_value)
TEST(BLI_uuid, equality)
{
- const UUID uuid1 = BLI_uuid_generate_random();
- const UUID uuid2 = BLI_uuid_generate_random();
+ const bUUID uuid1 = BLI_uuid_generate_random();
+ const bUUID uuid2 = BLI_uuid_generate_random();
EXPECT_TRUE(BLI_uuid_equal(uuid1, uuid1));
EXPECT_FALSE(BLI_uuid_equal(uuid1, uuid2));
@@ -72,7 +72,7 @@ TEST(BLI_uuid, equality)
TEST(BLI_uuid, string_formatting)
{
- UUID uuid;
+ bUUID uuid;
std::string buffer(36, '\0');
memset(&uuid, 0, sizeof(uuid));
@@ -91,12 +91,12 @@ TEST(BLI_uuid, string_formatting)
EXPECT_EQ("00000001-0002-0003-0405-060000000007", buffer);
/* Somewhat more complex bit patterns. This is a version 1 UUID generated from Python. */
- const UUID uuid1 = {3540651616, 5282, 4588, 139, 153, {0xf7, 0x73, 0x69, 0x44, 0xdb, 0x8b}};
+ const bUUID uuid1 = {3540651616, 5282, 4588, 139, 153, {0xf7, 0x73, 0x69, 0x44, 0xdb, 0x8b}};
BLI_uuid_format(buffer.data(), uuid1);
EXPECT_EQ("d30a0e60-14a2-11ec-8b99-f7736944db8b", buffer);
/* Namespace UUID, example listed in RFC4122. */
- const UUID namespace_dns = {
+ const bUUID namespace_dns = {
0x6ba7b810, 0x9dad, 0x11d1, 0x80, 0xb4, {0x00, 0xc0, 0x4f, 0xd4, 0x30, 0xc8}};
BLI_uuid_format(buffer.data(), namespace_dns);
EXPECT_EQ("6ba7b810-9dad-11d1-80b4-00c04fd430c8", buffer);
@@ -104,7 +104,7 @@ TEST(BLI_uuid, string_formatting)
TEST(BLI_uuid, string_parsing_ok)
{
- UUID uuid;
+ bUUID uuid;
std::string buffer(36, '\0');
const bool parsed_ok = BLI_uuid_parse_string(&uuid, "d30a0e60-14a2-11ec-8b99-f7736944db8b");
@@ -115,7 +115,7 @@ TEST(BLI_uuid, string_parsing_ok)
TEST(BLI_uuid, string_parsing_capitalisation)
{
- UUID uuid;
+ bUUID uuid;
std::string buffer(36, '\0');
/* RFC4122 demands acceptance of upper-case hex digits. */
@@ -129,7 +129,7 @@ TEST(BLI_uuid, string_parsing_capitalisation)
TEST(BLI_uuid, string_parsing_fail)
{
- UUID uuid;
+ bUUID uuid;
std::string buffer(36, '\0');
const bool parsed_ok = BLI_uuid_parse_string(&uuid, "d30a0e60!14a2-11ec-8b99-f7736944db8b");
@@ -139,7 +139,7 @@ TEST(BLI_uuid, string_parsing_fail)
TEST(BLI_uuid, stream_operator)
{
std::stringstream ss;
- const UUID uuid = {3540651616, 5282, 4588, 139, 153, {0xf7, 0x73, 0x69, 0x44, 0xdb, 0x8b}};
+ const bUUID uuid = {3540651616, 5282, 4588, 139, 153, {0xf7, 0x73, 0x69, 0x44, 0xdb, 0x8b}};
ss << uuid;
EXPECT_EQ(ss.str(), "d30a0e60-14a2-11ec-8b99-f7736944db8b");
}
diff --git a/source/blender/blenloader/intern/versioning_270.c b/source/blender/blenloader/intern/versioning_270.c
index fa15e541e43..54d1efab7dd 100644
--- a/source/blender/blenloader/intern/versioning_270.c
+++ b/source/blender/blenloader/intern/versioning_270.c
@@ -651,13 +651,6 @@ void blo_do_versions_270(FileData *fd, Library *UNUSED(lib), Main *bmain)
mat->line_col[3] = mat->alpha;
}
}
-
- if (!DNA_struct_elem_find(fd->filesdna, "RenderData", "int", "preview_start_resolution")) {
- Scene *scene;
- for (scene = bmain->scenes.first; scene; scene = scene->id.next) {
- scene->r.preview_start_resolution = 64;
- }
- }
}
if (!MAIN_VERSION_ATLEAST(bmain, 271, 3)) {
@@ -698,15 +691,6 @@ void blo_do_versions_270(FileData *fd, Library *UNUSED(lib), Main *bmain)
}
}
- if (!MAIN_VERSION_ATLEAST(bmain, 272, 0)) {
- if (!DNA_struct_elem_find(fd->filesdna, "RenderData", "int", "preview_start_resolution")) {
- Scene *scene;
- for (scene = bmain->scenes.first; scene; scene = scene->id.next) {
- scene->r.preview_start_resolution = 64;
- }
- }
- }
-
if (!MAIN_VERSION_ATLEAST(bmain, 272, 1)) {
Brush *br;
for (br = bmain->brushes.first; br; br = br->id.next) {
diff --git a/source/blender/blenloader/intern/versioning_280.c b/source/blender/blenloader/intern/versioning_280.c
index f667361d166..69b67460a5d 100644
--- a/source/blender/blenloader/intern/versioning_280.c
+++ b/source/blender/blenloader/intern/versioning_280.c
@@ -1774,7 +1774,7 @@ static void do_versions_seq_set_cache_defaults(Editing *ed)
static bool seq_update_flags_cb(Sequence *seq, void *UNUSED(user_data))
{
- seq->flag &= ~(SEQ_FLAG_UNUSED_6 | SEQ_FLAG_UNUSED_18 | SEQ_FLAG_UNUSED_19 | SEQ_FLAG_UNUSED_21);
+ seq->flag &= ~((1 << 6) | (1 << 18) | (1 << 19) | (1 << 21));
if (seq->type == SEQ_TYPE_SPEED) {
SpeedControlVars *s = (SpeedControlVars *)seq->effectdata;
s->flags &= ~(SEQ_SPEED_UNUSED_1);
@@ -3718,7 +3718,7 @@ void blo_do_versions_280(FileData *fd, Library *UNUSED(lib), Main *bmain)
STRNCPY(node->idname, "ShaderNodeOutputLight");
}
if (node->type == SH_NODE_BSDF_PRINCIPLED && node->custom2 == 0) {
- node->custom2 = SHD_SUBSURFACE_BURLEY;
+ node->custom2 = SHD_SUBSURFACE_DIFFUSION;
}
}
}
diff --git a/source/blender/blenloader/intern/versioning_290.c b/source/blender/blenloader/intern/versioning_290.c
index bafba486c88..be8c4b735be 100644
--- a/source/blender/blenloader/intern/versioning_290.c
+++ b/source/blender/blenloader/intern/versioning_290.c
@@ -1461,7 +1461,6 @@ void blo_do_versions_290(FileData *fd, Library *UNUSED(lib), Main *bmain)
LISTBASE_FOREACH (Scene *, scene, &bmain->scenes) {
LISTBASE_FOREACH (ViewLayer *, view_layer, &scene->view_layers) {
view_layer->cryptomatte_levels = 6;
- view_layer->cryptomatte_flag = VIEW_LAYER_CRYPTOMATTE_ACCURATE;
}
}
}
diff --git a/source/blender/blenloader/intern/versioning_300.c b/source/blender/blenloader/intern/versioning_300.c
index 30e7c9bde4c..58265bca238 100644
--- a/source/blender/blenloader/intern/versioning_300.c
+++ b/source/blender/blenloader/intern/versioning_300.c
@@ -64,6 +64,7 @@
#include "MEM_guardedalloc.h"
#include "readfile.h"
+#include "SEQ_iterator.h"
#include "SEQ_sequencer.h"
#include "RNA_access.h"
@@ -109,7 +110,8 @@ static void version_idproperty_move_data_int(IDPropertyUIDataInt *ui_data,
if (default_value != NULL) {
if (default_value->type == IDP_ARRAY) {
if (default_value->subtype == IDP_INT) {
- ui_data->default_array = MEM_dupallocN(IDP_Array(default_value));
+ ui_data->default_array = MEM_malloc_arrayN(default_value->len, sizeof(int), __func__);
+ memcpy(ui_data->default_array, IDP_Array(default_value), sizeof(int) * default_value->len);
ui_data->default_array_len = default_value->len;
}
}
@@ -151,9 +153,18 @@ static void version_idproperty_move_data_float(IDPropertyUIDataFloat *ui_data,
IDProperty *default_value = IDP_GetPropertyFromGroup(prop_ui_data, "default");
if (default_value != NULL) {
if (default_value->type == IDP_ARRAY) {
- if (ELEM(default_value->subtype, IDP_FLOAT, IDP_DOUBLE)) {
- ui_data->default_array = MEM_dupallocN(IDP_Array(default_value));
- ui_data->default_array_len = default_value->len;
+ const int size = default_value->len;
+ ui_data->default_array_len = size;
+ if (default_value->subtype == IDP_FLOAT) {
+ ui_data->default_array = MEM_malloc_arrayN(size, sizeof(double), __func__);
+ const float *old_default_array = IDP_Array(default_value);
+ for (int i = 0; i < ui_data->default_array_len; i++) {
+ ui_data->default_array[i] = (double)old_default_array[i];
+ }
+ }
+ else if (default_value->subtype == IDP_DOUBLE) {
+ ui_data->default_array = MEM_malloc_arrayN(size, sizeof(double), __func__);
+ memcpy(ui_data->default_array, IDP_Array(default_value), sizeof(double) * size);
}
}
else if (ELEM(default_value->type, IDP_DOUBLE, IDP_FLOAT)) {
@@ -774,6 +785,69 @@ static void version_geometry_nodes_change_legacy_names(bNodeTree *ntree)
}
}
}
+static bool seq_transform_origin_set(Sequence *seq, void *UNUSED(user_data))
+{
+ StripTransform *transform = seq->strip->transform;
+ if (seq->strip->transform != NULL) {
+ transform->origin[0] = transform->origin[1] = 0.5f;
+ }
+ return true;
+}
+
+static void do_version_subsurface_methods(bNode *node)
+{
+ if (node->type == SH_NODE_SUBSURFACE_SCATTERING) {
+ if (node->custom1 != SHD_SUBSURFACE_RANDOM_WALK) {
+ node->custom1 = SHD_SUBSURFACE_RANDOM_WALK_FIXED_RADIUS;
+ }
+ }
+ else if (node->type == SH_NODE_BSDF_PRINCIPLED) {
+ if (node->custom2 != SHD_SUBSURFACE_RANDOM_WALK) {
+ node->custom2 = SHD_SUBSURFACE_RANDOM_WALK_FIXED_RADIUS;
+ }
+ }
+}
+
+static void version_geometry_nodes_add_attribute_input_settings(NodesModifierData *nmd)
+{
+ /* Before versioning the properties, make sure this versioning hasn't run already. */
+ LISTBASE_FOREACH (const IDProperty *, property, &nmd->settings.properties->data.group) {
+ if (strstr(property->name, "_use_attribute") || strstr(property->name, "_attribute_name")) {
+ return;
+ }
+ }
+
+ LISTBASE_FOREACH_MUTABLE (IDProperty *, property, &nmd->settings.properties->data.group) {
+ if (!ELEM(property->type, IDP_FLOAT, IDP_INT, IDP_ARRAY)) {
+ continue;
+ }
+
+ if (strstr(property->name, "_use_attribute") || strstr(property->name, "_attribute_name")) {
+ continue;
+ }
+
+ char use_attribute_prop_name[MAX_IDPROP_NAME];
+ BLI_snprintf(use_attribute_prop_name,
+ sizeof(use_attribute_prop_name),
+ "%s%s",
+ property->name,
+ "_use_attribute");
+
+ IDPropertyTemplate idprop = {0};
+ IDProperty *use_attribute_prop = IDP_New(IDP_INT, &idprop, use_attribute_prop_name);
+ IDP_AddToGroup(nmd->settings.properties, use_attribute_prop);
+
+ char attribute_name_prop_name[MAX_IDPROP_NAME];
+ BLI_snprintf(attribute_name_prop_name,
+ sizeof(attribute_name_prop_name),
+ "%s%s",
+ property->name,
+ "_attribute_name");
+
+ IDProperty *attribute_prop = IDP_New(IDP_STRING, &idprop, attribute_name_prop_name);
+ IDP_AddToGroup(nmd->settings.properties, attribute_prop);
+ }
+}
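A concrete example of the resulting property layout (editorial, not part of the patch; the input name `Input_2` is an arbitrary example):

/* Before versioning: "Input_2" (e.g. IDP_FLOAT)
 * After versioning:  "Input_2"                 (unchanged)
 *                    "Input_2_use_attribute"   (IDP_INT, defaults to 0)
 *                    "Input_2_attribute_name"  (IDP_STRING, defaults to "") */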
/* NOLINTNEXTLINE: readability-function-size */
void blo_do_versions_300(FileData *fd, Library *UNUSED(lib), Main *bmain)
@@ -1290,6 +1364,62 @@ void blo_do_versions_300(FileData *fd, Library *UNUSED(lib), Main *bmain)
}
}
+ if (!MAIN_VERSION_ATLEAST(bmain, 300, 24)) {
+ LISTBASE_FOREACH (Scene *, scene, &bmain->scenes) {
+ SequencerToolSettings *sequencer_tool_settings = SEQ_tool_settings_ensure(scene);
+ sequencer_tool_settings->pivot_point = V3D_AROUND_CENTER_MEDIAN;
+
+ if (scene->ed != NULL) {
+ SEQ_for_each_callback(&scene->ed->seqbase, seq_transform_origin_set, NULL);
+ }
+ }
+ LISTBASE_FOREACH (bScreen *, screen, &bmain->screens) {
+ LISTBASE_FOREACH (ScrArea *, area, &screen->areabase) {
+ LISTBASE_FOREACH (SpaceLink *, sl, &area->spacedata) {
+ if (sl->spacetype == SPACE_SEQ) {
+ SpaceSeq *sseq = (SpaceSeq *)sl;
+ sseq->preview_overlay.flag |= SEQ_PREVIEW_SHOW_OUTLINE_SELECTED;
+ }
+ }
+ }
+ }
+
+ LISTBASE_FOREACH (bScreen *, screen, &bmain->screens) {
+ LISTBASE_FOREACH (ScrArea *, area, &screen->areabase) {
+ LISTBASE_FOREACH (SpaceLink *, sl, &area->spacedata) {
+ if (sl->spacetype == SPACE_SEQ) {
+ ListBase *regionbase = (sl == area->spacedata.first) ? &area->regionbase :
+ &sl->regionbase;
+ LISTBASE_FOREACH (ARegion *, region, regionbase) {
+ if (region->regiontype == RGN_TYPE_WINDOW) {
+ region->v2d.min[1] = 4.0f;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (!MAIN_VERSION_ATLEAST(bmain, 300, 25)) {
+ FOREACH_NODETREE_BEGIN (bmain, ntree, id) {
+ if (ntree->type == NTREE_SHADER) {
+ LISTBASE_FOREACH (bNode *, node, &ntree->nodes) {
+ do_version_subsurface_methods(node);
+ }
+ }
+ }
+ FOREACH_NODETREE_END;
+
+ enum {
+ R_EXR_TILE_FILE = (1 << 10),
+ R_FULL_SAMPLE = (1 << 15),
+ };
+ LISTBASE_FOREACH (Scene *, scene, &bmain->scenes) {
+ scene->r.scemode &= ~(R_EXR_TILE_FILE | R_FULL_SAMPLE);
+ }
+ }
+
/**
* Versioning code until next subversion bump goes here.
*
@@ -1301,5 +1431,12 @@ void blo_do_versions_300(FileData *fd, Library *UNUSED(lib), Main *bmain)
*/
{
/* Keep this block, even when empty. */
+ LISTBASE_FOREACH (Object *, ob, &bmain->objects) {
+ LISTBASE_FOREACH (ModifierData *, md, &ob->modifiers) {
+ if (md->type == eModifierType_Nodes) {
+ version_geometry_nodes_add_attribute_input_settings((NodesModifierData *)md);
+ }
+ }
+ }
}
}
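An editorial note on the bottom-block convention above (not part of the patch; the subversion number is hypothetical): at the next subversion bump, code in the unguarded bottom block is moved into a version check, e.g.:

if (!MAIN_VERSION_ATLEAST(bmain, 300, 26)) {
  /* The NodesModifierData loop above would move here. */
}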
diff --git a/source/blender/blenloader/intern/versioning_cycles.c b/source/blender/blenloader/intern/versioning_cycles.c
index 90e6b43f02e..da57f27af4e 100644
--- a/source/blender/blenloader/intern/versioning_cycles.c
+++ b/source/blender/blenloader/intern/versioning_cycles.c
@@ -182,8 +182,8 @@ static void displacement_principled_nodes(bNode *node)
}
}
else if (node->type == SH_NODE_BSDF_PRINCIPLED) {
- if (node->custom2 != SHD_SUBSURFACE_RANDOM_WALK) {
- node->custom2 = SHD_SUBSURFACE_BURLEY;
+ if (node->custom2 != SHD_SUBSURFACE_RANDOM_WALK_FIXED_RADIUS) {
+ node->custom2 = SHD_SUBSURFACE_DIFFUSION;
}
}
}
@@ -1373,6 +1373,11 @@ void blo_do_versions_cycles(FileData *UNUSED(fd), Library *UNUSED(lib), Main *bm
void do_versions_after_linking_cycles(Main *bmain)
{
+ const int DENOISER_AUTO = 0;
+ const int DENOISER_NLM = 1;
+ const int DENOISER_OPTIX = 2;
+ const int DENOISER_OPENIMAGEDENOISE = 4;
+
if (!MAIN_VERSION_ATLEAST(bmain, 280, 66)) {
/* Shader node tree changes. After lib linking so we have all the typeinfo
* pointers and updated sockets and we can use the high level node API to
@@ -1578,10 +1583,6 @@ void do_versions_after_linking_cycles(Main *bmain)
}
if (cscene) {
- const int DENOISER_AUTO = 0;
- const int DENOISER_NLM = 1;
- const int DENOISER_OPTIX = 2;
-
/* Enable denoiser if it was enabled for one view layer before. */
cycles_property_int_set(cscene, "denoiser", (use_optix) ? DENOISER_OPTIX : DENOISER_NLM);
cycles_property_boolean_set(cscene, "use_denoising", use_denoising);
@@ -1637,4 +1638,17 @@ void do_versions_after_linking_cycles(Main *bmain)
object->visibility_flag |= flag;
}
}
+
+ if (!MAIN_VERSION_ATLEAST(bmain, 300, 25)) {
+ /* Removal of NLM denoiser. */
+ for (Scene *scene = bmain->scenes.first; scene; scene = scene->id.next) {
+ IDProperty *cscene = cycles_properties_from_ID(&scene->id);
+
+ if (cscene) {
+ if (cycles_property_int(cscene, "denoiser", DENOISER_NLM) == DENOISER_NLM) {
+ cycles_property_int_set(cscene, "denoiser", DENOISER_OPENIMAGEDENOISE);
+ }
+ }
+ }
+ }
}
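A summary of the versioning effect above (editorial, not part of the patch):

/* Effect of the NLM removal on a scene's Cycles properties:
 *   "denoiser" == DENOISER_NLM  ->  DENOISER_OPENIMAGEDENOISE
 *   any other value             ->  unchanged */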
diff --git a/source/blender/blenloader/intern/versioning_defaults.c b/source/blender/blenloader/intern/versioning_defaults.c
index 074cae669af..152ef79a38f 100644
--- a/source/blender/blenloader/intern/versioning_defaults.c
+++ b/source/blender/blenloader/intern/versioning_defaults.c
@@ -54,6 +54,7 @@
#include "BKE_curveprofile.h"
#include "BKE_customdata.h"
#include "BKE_gpencil.h"
+#include "BKE_idprop.h"
#include "BKE_layer.h"
#include "BKE_lib_id.h"
#include "BKE_main.h"
@@ -160,6 +161,7 @@ static void blo_update_defaults_screen(bScreen *screen,
seq->render_size = SEQ_RENDER_SIZE_PROXY_100;
seq->timeline_overlay.flag |= SEQ_TIMELINE_SHOW_STRIP_SOURCE | SEQ_TIMELINE_SHOW_STRIP_NAME |
SEQ_TIMELINE_SHOW_STRIP_DURATION | SEQ_TIMELINE_SHOW_GRID;
+ seq->preview_overlay.flag |= SEQ_PREVIEW_SHOW_OUTLINE_SELECTED;
}
else if (area->spacetype == SPACE_TEXT) {
/* Show syntax and line numbers in Script workspace text editor. */
@@ -355,6 +357,12 @@ static void blo_update_defaults_scene(Main *bmain, Scene *scene)
if (ts->custom_bevel_profile_preset == NULL) {
ts->custom_bevel_profile_preset = BKE_curveprofile_add(PROF_PRESET_LINE);
}
+
+ /* Clear ID properties so Cycles gets defaults. */
+ IDProperty *idprop = IDP_GetProperties(&scene->id, false);
+ if (idprop) {
+ IDP_ClearProperty(idprop);
+ }
}
/**
@@ -581,6 +589,10 @@ void BLO_update_defaults_startup_blend(Main *bmain, const char *app_template)
bNodeSocket *roughness_socket = nodeFindSocket(node, SOCK_IN, "Roughness");
bNodeSocketValueFloat *roughness_data = roughness_socket->default_value;
roughness_data->value = 0.4f;
+ node->custom2 = SHD_SUBSURFACE_RANDOM_WALK;
+ }
+ else if (node->type == SH_NODE_SUBSURFACE_SCATTERING) {
+ node->custom1 = SHD_SUBSURFACE_RANDOM_WALK;
}
}
}
diff --git a/source/blender/compositor/nodes/COM_IDMaskNode.cc b/source/blender/compositor/nodes/COM_IDMaskNode.cc
index b51e79f2dea..761cb8b98cf 100644
--- a/source/blender/compositor/nodes/COM_IDMaskNode.cc
+++ b/source/blender/compositor/nodes/COM_IDMaskNode.cc
@@ -28,7 +28,7 @@ IDMaskNode::IDMaskNode(bNode *editorNode) : Node(editorNode)
/* pass */
}
void IDMaskNode::convertToOperations(NodeConverter &converter,
- const CompositorContext &context) const
+ const CompositorContext & /*context*/) const
{
bNode *bnode = this->getbNode();
@@ -38,7 +38,7 @@ void IDMaskNode::convertToOperations(NodeConverter &converter,
converter.addOperation(operation);
converter.mapInputSocket(getInputSocket(0), operation->getInputSocket(0));
- if (bnode->custom2 == 0 || context.getRenderData()->scemode & R_FULL_SAMPLE) {
+ if (bnode->custom2 == 0) {
converter.mapOutputSocket(getOutputSocket(0), operation->getOutputSocket(0));
}
else {
diff --git a/source/blender/compositor/nodes/COM_ZCombineNode.cc b/source/blender/compositor/nodes/COM_ZCombineNode.cc
index ddf66740578..e29748dc317 100644
--- a/source/blender/compositor/nodes/COM_ZCombineNode.cc
+++ b/source/blender/compositor/nodes/COM_ZCombineNode.cc
@@ -31,9 +31,9 @@
namespace blender::compositor {
void ZCombineNode::convertToOperations(NodeConverter &converter,
- const CompositorContext &context) const
+ const CompositorContext & /*context*/) const
{
- if ((context.getRenderData()->scemode & R_FULL_SAMPLE) || this->getbNode()->custom2) {
+ if (this->getbNode()->custom2) {
ZCombineOperation *operation = nullptr;
if (this->getbNode()->custom1) {
operation = new ZCombineAlphaOperation();
diff --git a/source/blender/draw/DRW_engine.h b/source/blender/draw/DRW_engine.h
index a125a13eaf9..5e7b812c37b 100644
--- a/source/blender/draw/DRW_engine.h
+++ b/source/blender/draw/DRW_engine.h
@@ -176,6 +176,9 @@ void DRW_deferred_shader_remove(struct GPUMaterial *mat);
struct DrawDataList *DRW_drawdatalist_from_id(struct ID *id);
void DRW_drawdata_free(struct ID *id);
+bool DRW_opengl_context_release(void);
+void DRW_opengl_context_activate(bool drw_state);
+
#ifdef __cplusplus
}
#endif
diff --git a/source/blender/draw/engines/eevee/eevee_cryptomatte.c b/source/blender/draw/engines/eevee/eevee_cryptomatte.c
index 76a1b561972..49780abc6f4 100644
--- a/source/blender/draw/engines/eevee/eevee_cryptomatte.c
+++ b/source/blender/draw/engines/eevee/eevee_cryptomatte.c
@@ -139,8 +139,6 @@ void EEVEE_cryptomatte_renderpasses_init(EEVEE_Data *vedata)
g_data->cryptomatte_session = session;
g_data->render_passes |= EEVEE_RENDER_PASS_CRYPTOMATTE | EEVEE_RENDER_PASS_VOLUME_LIGHT;
- g_data->cryptomatte_accurate_mode = (view_layer->cryptomatte_flag &
- VIEW_LAYER_CRYPTOMATTE_ACCURATE) != 0;
}
}
@@ -405,7 +403,6 @@ void EEVEE_cryptomatte_output_accumulate(EEVEE_ViewLayerData *UNUSED(sldata), EE
{
EEVEE_FramebufferList *fbl = vedata->fbl;
EEVEE_StorageList *stl = vedata->stl;
- EEVEE_PrivateData *g_data = stl->g_data;
EEVEE_EffectsInfo *effects = stl->effects;
EEVEE_PassList *psl = vedata->psl;
const DRWContextState *draw_ctx = DRW_context_state_get();
@@ -413,10 +410,9 @@ void EEVEE_cryptomatte_output_accumulate(EEVEE_ViewLayerData *UNUSED(sldata), EE
const int cryptomatte_levels = view_layer->cryptomatte_levels;
const int current_sample = effects->taa_current_sample;
- /* In accurate mode all render samples are evaluated. In inaccurate mode this is limited to the
- * number of cryptomatte levels. This will reduce the overhead of downloading the GPU buffer and
- * integrating it into the accum buffer. */
- if (g_data->cryptomatte_accurate_mode || current_sample < cryptomatte_levels) {
+ /* Render samples used by cryptomatte are limited to the number of cryptomatte levels. This will
+ * reduce the overhead of downloading the GPU buffer and integrating it into the accum buffer. */
+ if (current_sample < cryptomatte_levels) {
static float clear_color[4] = {0.0};
GPU_framebuffer_bind(fbl->cryptomatte_fb);
GPU_framebuffer_clear_color(fbl->cryptomatte_fb, clear_color);
diff --git a/source/blender/draw/engines/eevee/eevee_effects.c b/source/blender/draw/engines/eevee/eevee_effects.c
index 3a38edecec6..d5960ea57d5 100644
--- a/source/blender/draw/engines/eevee/eevee_effects.c
+++ b/source/blender/draw/engines/eevee/eevee_effects.c
@@ -38,7 +38,8 @@ static struct {
struct GPUTexture *color_src;
int depth_src_layer;
- float cube_texel_size;
+ /* The size can be a vec3, but only two components are used in the shader. */
+ float texel_size[2];
} e_data = {NULL}; /* Engine data */
#define SETUP_BUFFER(tex, fb, fb_color) \
@@ -259,6 +260,7 @@ void EEVEE_effects_cache_init(EEVEE_ViewLayerData *sldata, EEVEE_Data *vedata)
DRW_PASS_CREATE(psl->color_downsample_ps, DRW_STATE_WRITE_COLOR);
grp = DRW_shgroup_create(EEVEE_shaders_effect_downsample_sh_get(), psl->color_downsample_ps);
DRW_shgroup_uniform_texture_ex(grp, "source", txl->filtered_radiance, GPU_SAMPLER_FILTER);
+ DRW_shgroup_uniform_vec2(grp, "texelSize", e_data.texel_size, 1);
DRW_shgroup_call_procedural_triangles(grp, NULL, 1);
}
@@ -267,7 +269,7 @@ void EEVEE_effects_cache_init(EEVEE_ViewLayerData *sldata, EEVEE_Data *vedata)
grp = DRW_shgroup_create(EEVEE_shaders_effect_downsample_cube_sh_get(),
psl->color_downsample_cube_ps);
DRW_shgroup_uniform_texture_ref(grp, "source", &e_data.color_src);
- DRW_shgroup_uniform_float(grp, "texelSize", &e_data.cube_texel_size, 1);
+ DRW_shgroup_uniform_float(grp, "texelSize", e_data.texel_size, 1);
DRW_shgroup_uniform_int_copy(grp, "Layer", 0);
DRW_shgroup_call_instances(grp, NULL, quad, 6);
}
@@ -277,6 +279,7 @@ void EEVEE_effects_cache_init(EEVEE_ViewLayerData *sldata, EEVEE_Data *vedata)
DRW_PASS_CREATE(psl->maxz_downlevel_ps, downsample_write);
grp = DRW_shgroup_create(EEVEE_shaders_effect_maxz_downlevel_sh_get(), psl->maxz_downlevel_ps);
DRW_shgroup_uniform_texture_ref_ex(grp, "depthBuffer", &txl->maxzbuffer, GPU_SAMPLER_DEFAULT);
+ DRW_shgroup_uniform_vec2(grp, "texelSize", e_data.texel_size, 1);
DRW_shgroup_call(grp, quad, NULL);
/* Copy depth buffer to top level of HiZ */
@@ -345,16 +348,22 @@ static void min_downsample_cb(void *vedata, int UNUSED(level))
}
#endif
-static void max_downsample_cb(void *vedata, int UNUSED(level))
+static void max_downsample_cb(void *vedata, int level)
{
EEVEE_PassList *psl = ((EEVEE_Data *)vedata)->psl;
+ EEVEE_TextureList *txl = ((EEVEE_Data *)vedata)->txl;
+ int texture_size[3];
+ GPU_texture_get_mipmap_size(txl->maxzbuffer, level - 1, texture_size);
+ e_data.texel_size[0] = 1.0f / texture_size[0];
+ e_data.texel_size[1] = 1.0f / texture_size[1];
DRW_draw_pass(psl->maxz_downlevel_ps);
}
static void simple_downsample_cube_cb(void *vedata, int level)
{
EEVEE_PassList *psl = ((EEVEE_Data *)vedata)->psl;
- e_data.cube_texel_size = (float)(1 << level) / (float)GPU_texture_width(e_data.color_src);
+ e_data.texel_size[0] = (float)(1 << level) / (float)GPU_texture_width(e_data.color_src);
+ e_data.texel_size[1] = e_data.texel_size[0];
DRW_draw_pass(psl->color_downsample_cube_ps);
}
@@ -390,9 +399,14 @@ void EEVEE_create_minmax_buffer(EEVEE_Data *vedata, GPUTexture *depth_src, int l
}
}
-static void downsample_radiance_cb(void *vedata, int UNUSED(level))
+static void downsample_radiance_cb(void *vedata, int level)
{
EEVEE_PassList *psl = ((EEVEE_Data *)vedata)->psl;
+ EEVEE_TextureList *txl = ((EEVEE_Data *)vedata)->txl;
+ int texture_size[3];
+ GPU_texture_get_mipmap_size(txl->filtered_radiance, level - 1, texture_size);
+ e_data.texel_size[0] = 1.0f / texture_size[0];
+ e_data.texel_size[1] = 1.0f / texture_size[1];
DRW_draw_pass(psl->color_downsample_ps);
}
diff --git a/source/blender/draw/engines/eevee/eevee_engine.c b/source/blender/draw/engines/eevee/eevee_engine.c
index 6a66e8b1a58..f8e1cc9c923 100644
--- a/source/blender/draw/engines/eevee/eevee_engine.c
+++ b/source/blender/draw/engines/eevee/eevee_engine.c
@@ -648,6 +648,8 @@ RenderEngineType DRW_engine_viewport_eevee_type = {
NULL,
NULL,
NULL,
+ NULL,
+ NULL,
&EEVEE_render_update_passes,
&draw_engine_eevee_type,
{NULL, NULL, NULL},
diff --git a/source/blender/draw/engines/eevee/eevee_private.h b/source/blender/draw/engines/eevee/eevee_private.h
index f51b4fa0127..eae5d161cc3 100644
--- a/source/blender/draw/engines/eevee/eevee_private.h
+++ b/source/blender/draw/engines/eevee/eevee_private.h
@@ -1042,7 +1042,6 @@ typedef struct EEVEE_PrivateData {
int aov_hash;
int num_aovs_used;
struct CryptomatteSession *cryptomatte_session;
- bool cryptomatte_accurate_mode;
EEVEE_CryptomatteSample *cryptomatte_accum_buffer;
float *cryptomatte_download_buffer;
diff --git a/source/blender/draw/engines/eevee/shaders/ambient_occlusion_lib.glsl b/source/blender/draw/engines/eevee/shaders/ambient_occlusion_lib.glsl
index d4e3b879426..93641443cac 100644
--- a/source/blender/draw/engines/eevee/shaders/ambient_occlusion_lib.glsl
+++ b/source/blender/draw/engines/eevee/shaders/ambient_occlusion_lib.glsl
@@ -379,7 +379,7 @@ float specular_occlusion(
/* Visibility to cone angle (eq. 18). */
float vis_angle = fast_acos(sqrt(1 - visibility));
/* Roughness to cone angle (eq. 26). */
- float spec_angle = max(0.001, fast_acos(cone_cosine(roughness)));
+ float spec_angle = max(0.00990998744964599609375, fast_acos(cone_cosine(roughness)));
/* Angle between cone axes. */
float cone_cone_dist = fast_acos(saturate(dot(visibility_dir, specular_dir)));
float cone_nor_dist = fast_acos(saturate(dot(N, specular_dir)));
diff --git a/source/blender/draw/engines/eevee/shaders/effect_downsample_frag.glsl b/source/blender/draw/engines/eevee/shaders/effect_downsample_frag.glsl
index d1cb25af82f..9fc258da185 100644
--- a/source/blender/draw/engines/eevee/shaders/effect_downsample_frag.glsl
+++ b/source/blender/draw/engines/eevee/shaders/effect_downsample_frag.glsl
@@ -9,14 +9,16 @@
uniform sampler2D source;
uniform float fireflyFactor;
+#ifndef COPY_SRC
+uniform vec2 texelSize;
+#endif
+
out vec4 FragColor;
void main()
{
- vec2 texel_size = 1.0 / vec2(textureSize(source, 0));
- vec2 uvs = gl_FragCoord.xy * texel_size;
-
#ifdef COPY_SRC
+ vec2 uvs = gl_FragCoord.xy / vec2(textureSize(source, 0));
FragColor = textureLod(source, uvs, 0.0);
FragColor = safe_color(FragColor);
@@ -25,7 +27,10 @@ void main()
FragColor *= 1.0 - max(0.0, luma - fireflyFactor) / luma;
#else
- vec4 ofs = texel_size.xyxy * vec4(0.75, 0.75, -0.75, -0.75);
+ /* NOTE(@fclem): textureSize() does not work the same on all implementations
+ * when changing the min and max texture levels. Use uniform instead (see T87801). */
+ vec2 uvs = gl_FragCoord.xy * texelSize;
+ vec4 ofs = texelSize.xyxy * vec4(0.75, 0.75, -0.75, -0.75);
uvs *= 2.0;
FragColor = textureLod(source, uvs + ofs.xy, 0.0);
diff --git a/source/blender/draw/engines/eevee/shaders/effect_minmaxz_frag.glsl b/source/blender/draw/engines/eevee/shaders/effect_minmaxz_frag.glsl
index ccb65d2e5a6..8ef39a55921 100644
--- a/source/blender/draw/engines/eevee/shaders/effect_minmaxz_frag.glsl
+++ b/source/blender/draw/engines/eevee/shaders/effect_minmaxz_frag.glsl
@@ -14,6 +14,10 @@ uniform int depthLayer;
uniform sampler2D depthBuffer;
#endif
+#ifndef COPY_DEPTH
+uniform vec2 texelSize;
+#endif
+
#ifdef LAYERED
# define sampleLowerMip(t) texture(depthBuffer, vec3(t, depthLayer)).r
# define gatherLowerMip(t) textureGather(depthBuffer, vec3(t, depthLayer))
@@ -41,23 +45,24 @@ out vec4 fragColor;
void main()
{
vec2 texel = gl_FragCoord.xy;
- vec2 texel_size = 1.0 / vec2(textureSize(depthBuffer, 0).xy);
#ifdef COPY_DEPTH
- vec2 uv = texel * texel_size;
+ vec2 uv = texel / vec2(textureSize(depthBuffer, 0).xy);
float val = sampleLowerMip(uv);
#else
- vec2 uv = texel * 2.0 * texel_size;
+ /* NOTE(@fclem): textureSize() does not work the same on all implementations
+ * when changing the min and max texture levels. Use uniform instead (see T87801). */
+ vec2 uv = texel * 2.0 * texelSize;
vec4 samp;
# ifdef GPU_ARB_texture_gather
samp = gatherLowerMip(uv);
# else
- samp.x = sampleLowerMip(uv + vec2(-0.5, -0.5) * texel_size);
- samp.y = sampleLowerMip(uv + vec2(-0.5, 0.5) * texel_size);
- samp.z = sampleLowerMip(uv + vec2(0.5, -0.5) * texel_size);
- samp.w = sampleLowerMip(uv + vec2(0.5, 0.5) * texel_size);
+ samp.x = sampleLowerMip(uv + vec2(-0.5, -0.5) * texelSize);
+ samp.y = sampleLowerMip(uv + vec2(-0.5, 0.5) * texelSize);
+ samp.z = sampleLowerMip(uv + vec2(0.5, -0.5) * texelSize);
+ samp.w = sampleLowerMip(uv + vec2(0.5, 0.5) * texelSize);
# endif
float val = minmax4(samp.x, samp.y, samp.z, samp.w);
diff --git a/source/blender/draw/engines/external/external_engine.c b/source/blender/draw/engines/external/external_engine.c
index 89ee3f1b293..cc548a53a8e 100644
--- a/source/blender/draw/engines/external/external_engine.c
+++ b/source/blender/draw/engines/external/external_engine.c
@@ -32,13 +32,19 @@
#include "BKE_object.h"
#include "BKE_particle.h"
+#include "ED_image.h"
#include "ED_screen.h"
+#include "GPU_batch.h"
+#include "GPU_debug.h"
#include "GPU_matrix.h"
#include "GPU_shader.h"
#include "GPU_state.h"
#include "GPU_viewport.h"
+#include "RE_engine.h"
+#include "RE_pipeline.h"
+
#include "external_engine.h" /* own include */
/* Shaders */
@@ -137,6 +143,22 @@ static void external_engine_init(void *vedata)
}
}
+/* Add a shading group call which will take care of writing to the depth buffer, so that the
+ * alpha-under overlay happens for the render buffer. */
+static void external_cache_image_add(DRWShadingGroup *grp)
+{
+ float obmat[4][4];
+ unit_m4(obmat);
+ scale_m4_fl(obmat, 0.5f);
+
+ /* NOTE: Use the same Z-depth value as in the regular image drawing engine. */
+ translate_m4(obmat, 1.0f, 1.0f, 0.75f);
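+ /* Assuming #DRW_cache_quad_get returns a [-1..1] quad, the scale and translation above map it
+ * onto the image editor's [0..1] space (illustrative note, not verified for all quad caches). */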
+
+ GPUBatch *geom = DRW_cache_quad_get();
+
+ DRW_shgroup_call_obmat(grp, geom, obmat);
+}
+
static void external_cache_init(void *vedata)
{
EXTERNAL_PassList *psl = ((EXTERNAL_Data *)vedata)->psl;
@@ -162,14 +184,33 @@ static void external_cache_init(void *vedata)
stl->g_data->depth_shgrp = DRW_shgroup_create(e_data.depth_sh, psl->depth_pass);
}
- /* Do not draw depth pass when overlays are turned off. */
- stl->g_data->need_depth = (v3d->flag2 & V3D_HIDE_OVERLAYS) == 0;
+ if (v3d != NULL) {
+ /* Do not draw depth pass when overlays are turned off. */
+ stl->g_data->need_depth = (v3d->flag2 & V3D_HIDE_OVERLAYS) == 0;
+ }
+ else if (draw_ctx->space_data != NULL) {
+ const eSpace_Type space_type = draw_ctx->space_data->spacetype;
+ if (space_type == SPACE_IMAGE) {
+ external_cache_image_add(stl->g_data->depth_shgrp);
+
+ stl->g_data->need_depth = true;
+ stl->g_data->update_depth = true;
+ }
+ }
}
static void external_cache_populate(void *vedata, Object *ob)
{
+ const DRWContextState *draw_ctx = DRW_context_state_get();
EXTERNAL_StorageList *stl = ((EXTERNAL_Data *)vedata)->stl;
+ if (draw_ctx->space_data != NULL) {
+ const eSpace_Type space_type = draw_ctx->space_data->spacetype;
+ if (space_type == SPACE_IMAGE) {
+ return;
+ }
+ }
+
if (!(DRW_object_is_renderable(ob) &&
DRW_object_visibility_in_active_context(ob) & OB_VISIBLE_SELF)) {
return;
@@ -210,13 +251,11 @@ static void external_cache_finish(void *UNUSED(vedata))
{
}
-static void external_draw_scene_do(void *vedata)
+static void external_draw_scene_do_v3d(void *vedata)
{
const DRWContextState *draw_ctx = DRW_context_state_get();
- Scene *scene = draw_ctx->scene;
RegionView3D *rv3d = draw_ctx->rv3d;
ARegion *region = draw_ctx->region;
- const RenderEngineType *type;
DRW_state_reset_ex(DRW_STATE_DEFAULT & ~DRW_STATE_DEPTH_LESS_EQUAL);
@@ -229,8 +268,6 @@ static void external_draw_scene_do(void *vedata)
}
RenderEngine *engine = RE_engine_create(engine_type);
- engine->tile_x = scene->r.tilex;
- engine->tile_y = scene->r.tiley;
engine_type->view_update(engine, draw_ctx->evil_C, draw_ctx->depsgraph);
rv3d->render_engine = engine;
}
@@ -241,7 +278,7 @@ static void external_draw_scene_do(void *vedata)
ED_region_pixelspace(region);
/* Render result draw. */
- type = rv3d->render_engine->type;
+ const RenderEngineType *type = rv3d->render_engine->type;
type->view_draw(rv3d->render_engine, draw_ctx->evil_C, draw_ctx->depsgraph);
GPU_bgl_end();
@@ -259,6 +296,116 @@ static void external_draw_scene_do(void *vedata)
}
}
+/* Configure current matrix stack so that the external engine can use the same drawing code for
+ * both viewport and image editor drawing.
+ *
+ * The engine draws its result in pixel space and applies a render offset. For the image editor we
+ * need to switch from normalized space to pixel space, and "un-apply" the offset. */
+static void external_image_space_matrix_set(const RenderEngine *engine)
+{
+ BLI_assert(engine != NULL);
+
+ const DRWContextState *draw_ctx = DRW_context_state_get();
+ const DRWView *view = DRW_view_get_active();
+ struct SpaceImage *space_image = (struct SpaceImage *)draw_ctx->space_data;
+
+ /* Apply current view as transformation matrix.
+ * This will configure drawing for normalized space with current zoom and pan applied. */
+
+ float view_matrix[4][4];
+ DRW_view_viewmat_get(view, view_matrix, false);
+
+ float projection_matrix[4][4];
+ DRW_view_winmat_get(view, projection_matrix, false);
+
+ GPU_matrix_projection_set(projection_matrix);
+ GPU_matrix_set(view_matrix);
+
+ /* Switch from normalized space to pixel space. */
+ {
+ int width, height;
+ ED_space_image_get_size(space_image, &width, &height);
+
+ const float width_inv = width ? 1.0f / width : 0.0f;
+ const float height_inv = height ? 1.0f / height : 0.0f;
+ GPU_matrix_scale_2f(width_inv, height_inv);
+ }
+
+ /* Un-apply render offset. */
+ {
+ Render *render = engine->re;
+ rctf view_rect;
+ rcti render_rect;
+ RE_GetViewPlane(render, &view_rect, &render_rect);
+
+ GPU_matrix_translate_2f(-render_rect.xmin, -render_rect.ymin);
+ }
+}
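+
+/* Illustrative sketch only, assuming a hypothetical 1920x1080 image with render offset (ox, oy):
+ * a pixel-space point from the engine is transformed by
+ *   projection * view * scale(1/1920, 1/1080) * translate(-ox, -oy)
+ * i.e. the render offset is removed first, then pixels are normalized, then the editor's pan and
+ * zoom are applied. */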
+
+static void external_draw_scene_do_image(void *UNUSED(vedata))
+{
+ const DRWContextState *draw_ctx = DRW_context_state_get();
+ Scene *scene = draw_ctx->scene;
+ Render *re = RE_GetSceneRender(scene);
+ RenderEngine *engine = RE_engine_get(re);
+
+ /* These were already tested before enabling the drawing engine. */
+ BLI_assert(re != NULL);
+ BLI_assert(engine != NULL);
+
+ const DefaultFramebufferList *dfbl = DRW_viewport_framebuffer_list_get();
+
+ /* Clear the depth buffer to the value used by the background overlay so that the overlay does
+ * not happen outside of the drawn image.
+ *
+ * NOTE: The external engine only draws color. The depth is taken care of by the depth pass,
+ * which initializes the depth to the values expected by the background overlay. */
+ GPU_framebuffer_clear_depth(dfbl->default_fb, 1.0f);
+
+ GPU_matrix_push_projection();
+ GPU_matrix_push();
+
+ external_image_space_matrix_set(engine);
+
+ GPU_debug_group_begin("External Engine");
+
+ const RenderEngineType *engine_type = engine->type;
+ BLI_assert(engine_type != NULL);
+ BLI_assert(engine_type->draw != NULL);
+
+ engine_type->draw(engine, draw_ctx->evil_C, draw_ctx->depsgraph);
+
+ GPU_debug_group_end();
+
+ GPU_matrix_pop();
+ GPU_matrix_pop_projection();
+
+ DRW_state_reset();
+ GPU_bgl_end();
+
+ RE_engine_draw_release(re);
+}
+
+static void external_draw_scene_do(void *vedata)
+{
+ const DRWContextState *draw_ctx = DRW_context_state_get();
+
+ if (draw_ctx->v3d != NULL) {
+ external_draw_scene_do_v3d(vedata);
+ return;
+ }
+
+ if (draw_ctx->space_data == NULL) {
+ return;
+ }
+
+ const eSpace_Type space_type = draw_ctx->space_data->spacetype;
+ if (space_type == SPACE_IMAGE) {
+ external_draw_scene_do_image(vedata);
+ return;
+ }
+}
+
static void external_draw_scene(void *vedata)
{
const DRWContextState *draw_ctx = DRW_context_state_get();
@@ -297,7 +444,7 @@ static void external_engine_free(void)
static const DrawEngineDataSize external_data_size = DRW_VIEWPORT_DATA_SIZE(EXTERNAL_Data);
-static DrawEngineType draw_engine_external_type = {
+DrawEngineType draw_engine_external_type = {
NULL,
NULL,
N_("External"),
@@ -330,8 +477,45 @@ RenderEngineType DRW_engine_viewport_external_type = {
NULL,
NULL,
NULL,
+ NULL,
+ NULL,
&draw_engine_external_type,
{NULL, NULL, NULL},
};
+bool DRW_engine_external_acquire_for_image_editor(void)
+{
+ const DRWContextState *draw_ctx = DRW_context_state_get();
+ const SpaceLink *space_data = draw_ctx->space_data;
+ Scene *scene = draw_ctx->scene;
+
+ if (space_data == NULL) {
+ return false;
+ }
+
+ const eSpace_Type space_type = draw_ctx->space_data->spacetype;
+ if (space_type != SPACE_IMAGE) {
+ return false;
+ }
+
+ struct SpaceImage *space_image = (struct SpaceImage *)space_data;
+ const Image *image = ED_space_image(space_image);
+ if (image == NULL || image->type != IMA_TYPE_R_RESULT) {
+ return false;
+ }
+
+ if (image->render_slot != image->last_render_slot) {
+ return false;
+ }
+
+ /* Render is allocated on the main thread, so it is safe to access it from here. */
+ Render *re = RE_GetSceneRender(scene);
+
+ if (re == NULL) {
+ return false;
+ }
+
+ return RE_engine_draw_acquire(re);
+}
+
#undef EXTERNAL_ENGINE
diff --git a/source/blender/draw/engines/external/external_engine.h b/source/blender/draw/engines/external/external_engine.h
index c645fb99e0e..14ec4e2d3c5 100644
--- a/source/blender/draw/engines/external/external_engine.h
+++ b/source/blender/draw/engines/external/external_engine.h
@@ -22,4 +22,12 @@
#pragma once
+extern DrawEngineType draw_engine_external_type;
extern RenderEngineType DRW_engine_viewport_external_type;
+
+/* Check whether an external engine is to be used to draw the content of an image editor.
+ * If drawing is possible, the render engine is "acquired" so that it is not freed until
+ * drawing is finished.
+ *
+ * NOTE: Released by the draw engine when it is done drawing. */
+bool DRW_engine_external_acquire_for_image_editor(void);
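+
+/* Minimal usage sketch, mirroring drw_engine_enable_image_editor() in this patch:
+ *
+ *   if (DRW_engine_external_acquire_for_image_editor()) {
+ *     use_drw_engine(&draw_engine_external_type);
+ *   }
+ *   else {
+ *     use_drw_engine(&draw_engine_image_type);
+ *   }
+ */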
diff --git a/source/blender/draw/engines/select/select_engine.c b/source/blender/draw/engines/select/select_engine.c
index 96ab8a28e09..20edd78597b 100644
--- a/source/blender/draw/engines/select/select_engine.c
+++ b/source/blender/draw/engines/select/select_engine.c
@@ -388,6 +388,8 @@ RenderEngineType DRW_engine_viewport_select_type = {
NULL,
NULL,
NULL,
+ NULL,
+ NULL,
&draw_engine_select_type,
{NULL, NULL, NULL},
};
diff --git a/source/blender/draw/engines/workbench/workbench_engine.c b/source/blender/draw/engines/workbench/workbench_engine.c
index f09c019ef8d..635aa7cef25 100644
--- a/source/blender/draw/engines/workbench/workbench_engine.c
+++ b/source/blender/draw/engines/workbench/workbench_engine.c
@@ -651,6 +651,8 @@ RenderEngineType DRW_engine_viewport_workbench_type = {
NULL,
NULL,
NULL,
+ NULL,
+ NULL,
&workbench_render_update_passes,
&draw_engine_workbench,
{NULL, NULL, NULL},
diff --git a/source/blender/draw/intern/DRW_render.h b/source/blender/draw/intern/DRW_render.h
index 660a4adaf51..fb8b8536897 100644
--- a/source/blender/draw/intern/DRW_render.h
+++ b/source/blender/draw/intern/DRW_render.h
@@ -623,6 +623,7 @@ const DRWView *DRW_view_default_get(void);
void DRW_view_default_set(DRWView *view);
void DRW_view_reset(void);
void DRW_view_set_active(DRWView *view);
+const DRWView *DRW_view_get_active(void);
void DRW_view_clip_planes_set(DRWView *view, float (*planes)[4], int plane_len);
void DRW_view_camtexco_set(DRWView *view, float texco[4]);
diff --git a/source/blender/draw/intern/draw_manager.c b/source/blender/draw/intern/draw_manager.c
index 47adc0acc60..e65fdce5f2e 100644
--- a/source/blender/draw/intern/draw_manager.c
+++ b/source/blender/draw/intern/draw_manager.c
@@ -1197,6 +1197,18 @@ static void drw_engines_enable_basic(void)
use_drw_engine(&draw_engine_basic_type);
}
+static void drw_engine_enable_image_editor(void)
+{
+ if (DRW_engine_external_acquire_for_image_editor()) {
+ use_drw_engine(&draw_engine_external_type);
+ }
+ else {
+ use_drw_engine(&draw_engine_image_type);
+ }
+
+ use_drw_engine(&draw_engine_overlay_type);
+}
+
static void drw_engines_enable_editors(void)
{
SpaceLink *space_data = DST.draw_ctx.space_data;
@@ -1205,8 +1217,7 @@ static void drw_engines_enable_editors(void)
}
if (space_data->spacetype == SPACE_IMAGE) {
- use_drw_engine(&draw_engine_image_type);
- use_drw_engine(&draw_engine_overlay_type);
+ drw_engine_enable_image_editor();
}
else if (space_data->spacetype == SPACE_NODE) {
/* Only enable when drawing the space image backdrop. */
@@ -3188,3 +3199,66 @@ void DRW_draw_state_init_gtests(eGPUShaderConfig sh_cfg)
#endif
/** \} */
+
+/* -------------------------------------------------------------------- */
+/** \name Draw manager context release/activation
+ *
+ * These functions are used in cases when OpenGL context creation is needed during drawing.
+ * This happens, for example, when an external engine needs to create its own OpenGL context
+ * during engine initialization.
+ *
+ * Example of context creation:
+ *
+ * const bool drw_state = DRW_opengl_context_release();
+ * gl_context = WM_opengl_context_create();
+ * DRW_opengl_context_activate(drw_state);
+ *
+ * Example of context destruction:
+ *
+ * const bool drw_state = DRW_opengl_context_release();
+ * WM_opengl_context_activate(gl_context);
+ * WM_opengl_context_dispose(gl_context);
+ * DRW_opengl_context_activate(drw_state);
+ *
+ *
+ * NOTE: Will only perform context modification when on the main thread. This way these functions
+ * can be used in an engine without checking whether it is the draw manager which manages the
+ * OpenGL context on the current thread. The downside is that if the engine performs OpenGL
+ * context creation from a non-main thread, that thread must never have an OpenGL context bound
+ * by Blender.
+ *
+ * \{ */
+
+bool DRW_opengl_context_release(void)
+{
+ if (!BLI_thread_is_main()) {
+ return false;
+ }
+
+ if (GPU_context_active_get() != DST.gpu_context) {
+ /* Context release is requested from outside of the draw manager's main draw loop. Indicate
+ * this to `DRW_opengl_context_activate()` so that it restores the window's drawable. */
+ return false;
+ }
+
+ GPU_context_active_set(NULL);
+ WM_opengl_context_release(DST.gl_context);
+
+ return true;
+}
+
+void DRW_opengl_context_activate(bool drw_state)
+{
+ if (!BLI_thread_is_main()) {
+ return;
+ }
+
+ if (drw_state) {
+ WM_opengl_context_activate(DST.gl_context);
+ GPU_context_active_set(DST.gpu_context);
+ }
+ else {
+ wm_window_reset_drawable();
+ }
+}
+
+/** \} */
diff --git a/source/blender/draw/intern/draw_manager_exec.c b/source/blender/draw/intern/draw_manager_exec.c
index 22356a3c57b..aa01ca7a262 100644
--- a/source/blender/draw/intern/draw_manager_exec.c
+++ b/source/blender/draw/intern/draw_manager_exec.c
@@ -367,6 +367,11 @@ void DRW_view_set_active(DRWView *view)
DST.view_active = (view) ? view : DST.view_default;
}
+const DRWView *DRW_view_get_active(void)
+{
+ return DST.view_active;
+}
+
/* Return True if the given BoundSphere intersect the current view frustum */
static bool draw_culling_sphere_test(const BoundSphere *frustum_bsphere,
const float (*frustum_planes)[4],
diff --git a/source/blender/editors/animation/anim_deps.c b/source/blender/editors/animation/anim_deps.c
index 97679723d84..088de80bb65 100644
--- a/source/blender/editors/animation/anim_deps.c
+++ b/source/blender/editors/animation/anim_deps.c
@@ -217,8 +217,6 @@ static void animchan_sync_fcurve_scene(bAnimListElem *ale)
/* Check if this strip is selected. */
Editing *ed = SEQ_editing_get(scene);
seq = SEQ_get_sequence_by_name(ed->seqbasep, seq_name, false);
- MEM_freeN(seq_name);
-
if (seq == NULL) {
return;
}
diff --git a/source/blender/editors/animation/anim_ops.c b/source/blender/editors/animation/anim_ops.c
index 450d7cd100e..b4ea33920b2 100644
--- a/source/blender/editors/animation/anim_ops.c
+++ b/source/blender/editors/animation/anim_ops.c
@@ -241,6 +241,11 @@ static bool use_sequencer_snapping(bContext *C)
/* Modal Operator init */
static int change_frame_invoke(bContext *C, wmOperator *op, const wmEvent *event)
{
+ ARegion *region = CTX_wm_region(C);
+ if (CTX_wm_space_seq(C) != NULL && region->regiontype == RGN_TYPE_PREVIEW) {
+ return OPERATOR_CANCELLED;
+ }
+
/* Change to frame that mouse is over before adding modal handler,
* as user could click on a single frame (jump to frame) as well as
* click-dragging over a range (modal scrubbing).
diff --git a/source/blender/editors/armature/armature_intern.h b/source/blender/editors/armature/armature_intern.h
index f9950d27e97..696355324e6 100644
--- a/source/blender/editors/armature/armature_intern.h
+++ b/source/blender/editors/armature/armature_intern.h
@@ -216,6 +216,7 @@ void POSE_OT_relax(struct wmOperatorType *ot);
void POSE_OT_push_rest(struct wmOperatorType *ot);
void POSE_OT_relax_rest(struct wmOperatorType *ot);
void POSE_OT_breakdown(struct wmOperatorType *ot);
+void POSE_OT_blend_to_neighbours(struct wmOperatorType *ot);
void POSE_OT_propagate(struct wmOperatorType *ot);
diff --git a/source/blender/editors/armature/armature_ops.c b/source/blender/editors/armature/armature_ops.c
index fbd89106de5..a1070a8823a 100644
--- a/source/blender/editors/armature/armature_ops.c
+++ b/source/blender/editors/armature/armature_ops.c
@@ -150,6 +150,7 @@ void ED_operatortypes_armature(void)
WM_operatortype_append(POSE_OT_push_rest);
WM_operatortype_append(POSE_OT_relax_rest);
WM_operatortype_append(POSE_OT_breakdown);
+ WM_operatortype_append(POSE_OT_blend_to_neighbours);
}
void ED_operatormacros_armature(void)
diff --git a/source/blender/editors/armature/pose_slide.c b/source/blender/editors/armature/pose_slide.c
index f23376867af..b273d3aac76 100644
--- a/source/blender/editors/armature/pose_slide.c
+++ b/source/blender/editors/armature/pose_slide.c
@@ -117,6 +117,7 @@ typedef enum ePoseSlide_Modes {
POSESLIDE_BREAKDOWN,
POSESLIDE_PUSH_REST,
POSESLIDE_RELAX_REST,
+ POSESLIDE_BLEND,
} ePoseSlide_Modes;
/** Transforms/Channels to Affect. */
@@ -423,6 +424,25 @@ static void pose_slide_apply_val(tPoseSlideOp *pso, FCurve *fcu, Object *ob, flo
(*val) = ((sVal * w2) + (eVal * w1));
break;
}
+ case POSESLIDE_BLEND: /* Blend the current pose with the previous (<50%) or next key (>50%). */
+ {
+ /* FCurve value on current frame. */
+ const float cVal = evaluate_fcurve(fcu, cframe);
+ const float factor = ED_slider_factor_get(pso->slider);
+ /* Convert factor to absolute 0-1 range. */
+ const float blend_factor = fabs((factor - 0.5f) * 2);
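+ /* Worked example (illustrative values): factor 0.0 gives blend_factor 1.0 toward the
+ * previous key, factor 0.5 gives 0.0 (current pose unchanged), and factor 1.0 gives 1.0
+ * toward the next key. */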
+
+ if (factor < 0.5) {
+ /* Blend to previous key. */
+ (*val) = (cVal * (1 - blend_factor)) + (sVal * blend_factor);
+ }
+ else {
+ /* Blend to next key. */
+ (*val) = (cVal * (1 - blend_factor)) + (eVal * blend_factor);
+ }
+
+ break;
+ }
/* Those are handled in pose_slide_rest_pose_apply. */
case POSESLIDE_PUSH_REST:
case POSESLIDE_RELAX_REST: {
@@ -614,8 +634,7 @@ static void pose_slide_apply_quat(tPoseSlideOp *pso, tPChanFCurveLink *pfl)
interp_qt_qtqt(quat_final, quat_prev, quat_next, ED_slider_factor_get(pso->slider));
}
- else {
- /* POSESLIDE_PUSH and POSESLIDE_RELAX. */
+ else if (pso->mode == POSESLIDE_PUSH || pso->mode == POSESLIDE_RELAX) {
float quat_breakdown[4];
float quat_curr[4];
@@ -638,6 +657,32 @@ static void pose_slide_apply_quat(tPoseSlideOp *pso, tPChanFCurveLink *pfl)
interp_qt_qtqt(quat_final, quat_curr, quat_breakdown, ED_slider_factor_get(pso->slider));
}
}
+ else if (pso->mode == POSESLIDE_BLEND) {
+ float quat_blend[4];
+ float quat_curr[4];
+
+ copy_qt_qt(quat_curr, pchan->quat);
+
+ if (ED_slider_factor_get(pso->slider) < 0.5) {
+ quat_blend[0] = evaluate_fcurve(fcu_w, prevFrameF);
+ quat_blend[1] = evaluate_fcurve(fcu_x, prevFrameF);
+ quat_blend[2] = evaluate_fcurve(fcu_y, prevFrameF);
+ quat_blend[3] = evaluate_fcurve(fcu_z, prevFrameF);
+ }
+ else {
+ quat_blend[0] = evaluate_fcurve(fcu_w, nextFrameF);
+ quat_blend[1] = evaluate_fcurve(fcu_x, nextFrameF);
+ quat_blend[2] = evaluate_fcurve(fcu_y, nextFrameF);
+ quat_blend[3] = evaluate_fcurve(fcu_z, nextFrameF);
+ }
+
+ normalize_qt(quat_blend);
+ normalize_qt(quat_curr);
+
+ const float blend_factor = fabs((ED_slider_factor_get(pso->slider) - 0.5f) * 2);
+
+ interp_qt_qtqt(quat_final, quat_curr, quat_blend, blend_factor);
+ }
/* Apply final to the pose bone, keeping compatible for similar keyframe positions. */
quat_to_compatible_quat(pchan->quat, quat_final, pchan->quat);
@@ -868,6 +913,9 @@ static void pose_slide_draw_status(bContext *C, tPoseSlideOp *pso)
case POSESLIDE_BREAKDOWN:
strcpy(mode_str, TIP_("Breakdown"));
break;
+ case POSESLIDE_BLEND:
+ strcpy(mode_str, TIP_("Blend To Neighbour"));
+ break;
default:
/* Unknown. */
@@ -1660,6 +1708,56 @@ void POSE_OT_breakdown(wmOperatorType *ot)
pose_slide_opdef_properties(ot);
}
+/* ........................ */
+static int pose_slide_blend_to_neighbours_invoke(bContext *C, wmOperator *op, const wmEvent *event)
+{
+ /* Initialize data. */
+ if (pose_slide_init(C, op, POSESLIDE_BLEND) == 0) {
+ pose_slide_exit(C, op);
+ return OPERATOR_CANCELLED;
+ }
+
+ /* Do common setup work. */
+ return pose_slide_invoke_common(C, op, event);
+}
+
+static int pose_slide_blend_to_neighbours_exec(bContext *C, wmOperator *op)
+{
+ tPoseSlideOp *pso;
+
+ /* Initialize data (from RNA-props). */
+ if (pose_slide_init(C, op, POSESLIDE_BLEND) == 0) {
+ pose_slide_exit(C, op);
+ return OPERATOR_CANCELLED;
+ }
+
+ pso = op->customdata;
+
+ /* Do common exec work. */
+ return pose_slide_exec_common(C, op, pso);
+}
+
+void POSE_OT_blend_to_neighbours(wmOperatorType *ot)
+{
+ /* Identifiers. */
+ ot->name = "Blend To Neighbour";
+ ot->idname = "POSE_OT_blend_to_neighbour";
+ ot->description = "Blend from current position to previous or next keyframe";
+
+ /* Callbacks. */
+ ot->exec = pose_slide_blend_to_neighbours_exec;
+ ot->invoke = pose_slide_blend_to_neighbours_invoke;
+ ot->modal = pose_slide_modal;
+ ot->cancel = pose_slide_cancel;
+ ot->poll = ED_operator_posemode;
+
+ /* Flags. */
+ ot->flag = OPTYPE_REGISTER | OPTYPE_UNDO | OPTYPE_BLOCKING | OPTYPE_GRAB_CURSOR_X;
+
+ /* Properties. */
+ pose_slide_opdef_properties(ot);
+}
+
/* **************************************************** */
/* B) Pose Propagate */
diff --git a/source/blender/editors/gpencil/gpencil_interpolate.c b/source/blender/editors/gpencil/gpencil_interpolate.c
index fdd9f44605e..d7cab85abad 100644
--- a/source/blender/editors/gpencil/gpencil_interpolate.c
+++ b/source/blender/editors/gpencil/gpencil_interpolate.c
@@ -316,6 +316,9 @@ static void gpencil_stroke_pair_table(bContext *C,
if (ELEM(NULL, gps_from, gps_to)) {
continue;
}
+ if ((gps_from->totpoints == 0) || (gps_to->totpoints == 0)) {
+ continue;
+ }
/* Insert the pair entry in the hash table and the list of strokes to keep order. */
BLI_addtail(&tgpil->selected_strokes, BLI_genericNodeN(gps_from));
BLI_ghash_insert(tgpil->pair_strokes, gps_from, gps_to);
@@ -1333,6 +1336,9 @@ static int gpencil_interpolate_seq_exec(bContext *C, wmOperator *op)
if (ELEM(NULL, gps_from, gps_to)) {
continue;
}
+ if ((gps_from->totpoints == 0) || (gps_to->totpoints == 0)) {
+ continue;
+ }
/* if destination stroke is smaller, resize new_stroke to size of gps_to stroke */
if (gps_from->totpoints > gps_to->totpoints) {
diff --git a/source/blender/editors/interface/interface_eyedropper.c b/source/blender/editors/interface/interface_eyedropper.c
index 2e7b0ce532c..58a9f362488 100644
--- a/source/blender/editors/interface/interface_eyedropper.c
+++ b/source/blender/editors/interface/interface_eyedropper.c
@@ -24,6 +24,8 @@
#include "DNA_screen_types.h"
#include "DNA_space_types.h"
+#include "BLI_math_color.h"
+
#include "BKE_context.h"
#include "BKE_screen.h"
@@ -107,8 +109,13 @@ static void eyedropper_draw_cursor_text_ex(const int x, const int y, const char
{
const uiFontStyle *fstyle = UI_FSTYLE_WIDGET;
- const float col_fg[4] = {1.0f, 1.0f, 1.0f, 1.0f};
- const float col_bg[4] = {0.0f, 0.0f, 0.0f, 0.2f};
+ /* Use the theme settings from tooltips. */
+ const bTheme *btheme = UI_GetTheme();
+ const uiWidgetColors *wcol = &btheme->tui.wcol_tooltip;
+
+ float col_fg[4], col_bg[4];
+ rgba_uchar_to_float(col_fg, wcol->text);
+ rgba_uchar_to_float(col_bg, wcol->inner);
UI_fontstyle_draw_simple_backdrop(fstyle, x, y + U.widget_unit, name, col_fg, col_bg);
}
diff --git a/source/blender/editors/interface/interface_style.c b/source/blender/editors/interface/interface_style.c
index 804156ba48c..6b1ff92a855 100644
--- a/source/blender/editors/interface/interface_style.c
+++ b/source/blender/editors/interface/interface_style.c
@@ -312,11 +312,8 @@ void UI_fontstyle_draw_simple_backdrop(const uiFontStyle *fs,
const float decent = BLF_descender(fs->uifont_id);
const float margin = height / 4.0f;
- /* backdrop */
- const float color[4] = {col_bg[0], col_bg[1], col_bg[2], 0.5f};
-
UI_draw_roundbox_corner_set(UI_CNR_ALL);
- UI_draw_roundbox_aa(
+ UI_draw_roundbox_4fv(
&(const rctf){
.xmin = x - margin,
.xmax = x + width + margin,
@@ -325,7 +322,7 @@ void UI_fontstyle_draw_simple_backdrop(const uiFontStyle *fs,
},
true,
margin,
- color);
+ col_bg);
}
BLF_position(fs->uifont_id, x, y, 0.0f);
diff --git a/source/blender/editors/interface/interface_template_search_menu.c b/source/blender/editors/interface/interface_template_search_menu.c
index 672f1b64943..3a5d65475f7 100644
--- a/source/blender/editors/interface/interface_template_search_menu.c
+++ b/source/blender/editors/interface/interface_template_search_menu.c
@@ -350,24 +350,28 @@ static void menu_types_add_from_keymap_items(bContext *C,
if (handler_base->poll == NULL || handler_base->poll(region, win->eventstate)) {
wmEventHandler_Keymap *handler = (wmEventHandler_Keymap *)handler_base;
- wmKeyMap *keymap = WM_event_get_keymap_from_handler(wm, handler);
- if (keymap && WM_keymap_poll(C, keymap)) {
- LISTBASE_FOREACH (wmKeyMapItem *, kmi, &keymap->items) {
- if (kmi->flag & KMI_INACTIVE) {
- continue;
- }
- if (STR_ELEM(kmi->idname, "WM_OT_call_menu", "WM_OT_call_menu_pie")) {
- char menu_idname[MAX_NAME];
- RNA_string_get(kmi->ptr, "name", menu_idname);
- MenuType *mt = WM_menutype_find(menu_idname, false);
-
- if (mt && BLI_gset_add(menu_tagged, mt)) {
- /* Unlikely, but possible this will be included twice. */
- BLI_linklist_prepend(menuid_stack_p, mt);
-
- void **kmi_p;
- if (!BLI_ghash_ensure_p(menu_to_kmi, mt, &kmi_p)) {
- *kmi_p = kmi;
+ wmEventHandler_KeymapResult km_result;
+ WM_event_get_keymaps_from_handler(wm, handler, &km_result);
+ for (int km_index = 0; km_index < km_result.keymaps_len; km_index++) {
+ wmKeyMap *keymap = km_result.keymaps[km_index];
+ if (keymap && WM_keymap_poll(C, keymap)) {
+ LISTBASE_FOREACH (wmKeyMapItem *, kmi, &keymap->items) {
+ if (kmi->flag & KMI_INACTIVE) {
+ continue;
+ }
+ if (STR_ELEM(kmi->idname, "WM_OT_call_menu", "WM_OT_call_menu_pie")) {
+ char menu_idname[MAX_NAME];
+ RNA_string_get(kmi->ptr, "name", menu_idname);
+ MenuType *mt = WM_menutype_find(menu_idname, false);
+
+ if (mt && BLI_gset_add(menu_tagged, mt)) {
+ /* Unlikely, but possible this will be included twice. */
+ BLI_linklist_prepend(menuid_stack_p, mt);
+
+ void **kmi_p;
+ if (!BLI_ghash_ensure_p(menu_to_kmi, mt, &kmi_p)) {
+ *kmi_p = kmi;
+ }
}
}
}
diff --git a/source/blender/editors/interface/interface_templates.c b/source/blender/editors/interface/interface_templates.c
index 0c9eb20af19..320371ad9ea 100644
--- a/source/blender/editors/interface/interface_templates.c
+++ b/source/blender/editors/interface/interface_templates.c
@@ -5823,6 +5823,11 @@ void uiTemplateRunningJobs(uiLayout *layout, bContext *C)
icon = ICON_SEQUENCE;
break;
}
+ if (WM_jobs_test(wm, scene, WM_JOB_TYPE_SEQ_DRAW_THUMBNAIL)) {
+ handle_event = B_STOPSEQ;
+ icon = ICON_SEQUENCE;
+ break;
+ }
if (WM_jobs_test(wm, scene, WM_JOB_TYPE_CLIP_BUILD_PROXY)) {
handle_event = B_STOPCLIP;
icon = ICON_TRACKER;
diff --git a/source/blender/editors/interface/view2d_ops.c b/source/blender/editors/interface/view2d_ops.c
index 1fd1b6c984d..4ef4c3dbc6d 100644
--- a/source/blender/editors/interface/view2d_ops.c
+++ b/source/blender/editors/interface/view2d_ops.c
@@ -147,6 +147,8 @@ static void view_pan_init(bContext *C, wmOperator *op)
const float winy = (float)(BLI_rcti_size_y(&vpd->region->winrct) + 1);
vpd->facx = (BLI_rctf_size_x(&vpd->v2d->cur)) / winx;
vpd->facy = (BLI_rctf_size_y(&vpd->v2d->cur)) / winy;
+
+ vpd->v2d->flag |= V2D_IS_NAVIGATING;
}
/* apply transform to view (i.e. adjust 'cur' rect) */
@@ -190,6 +192,8 @@ static void view_pan_apply(bContext *C, wmOperator *op)
/* Cleanup temp custom-data. */
static void view_pan_exit(wmOperator *op)
{
+ v2dViewPanData *vpd = op->customdata;
+ vpd->v2d->flag &= ~V2D_IS_NAVIGATING;
MEM_SAFE_FREE(op->customdata);
}
@@ -358,6 +362,7 @@ static int view_edge_pan_modal(bContext *C, wmOperator *op, const wmEvent *event
View2DEdgePanData *vpd = op->customdata;
if (event->val == KM_RELEASE || event->type == EVT_ESCKEY) {
+ vpd->v2d->flag &= ~V2D_IS_NAVIGATING;
MEM_SAFE_FREE(op->customdata);
return (OPERATOR_FINISHED | OPERATOR_PASS_THROUGH);
}
@@ -371,6 +376,8 @@ static int view_edge_pan_modal(bContext *C, wmOperator *op, const wmEvent *event
static void view_edge_pan_cancel(bContext *UNUSED(C), wmOperator *op)
{
+ v2dViewPanData *vpd = op->customdata;
+ vpd->v2d->flag &= ~V2D_IS_NAVIGATING;
MEM_SAFE_FREE(op->customdata);
}
@@ -680,6 +687,8 @@ static void view_zoomdrag_init(bContext *C, wmOperator *op)
vzd->v2d = &vzd->region->v2d;
/* False by default. Interactive callbacks (ie invoke()) can set it to true. */
vzd->zoom_to_mouse_pos = false;
+
+ vzd->v2d->flag |= V2D_IS_NAVIGATING;
}
/* apply transform to view (i.e. adjust 'cur' rect) */
@@ -809,7 +818,8 @@ static void view_zoomstep_apply(bContext *C, wmOperator *op)
static void view_zoomstep_exit(wmOperator *op)
{
UI_view2d_zoom_cache_reset();
-
+ v2dViewZoomData *vzd = op->customdata;
+ vzd->v2d->flag &= ~V2D_IS_NAVIGATING;
MEM_SAFE_FREE(op->customdata);
}
@@ -1041,6 +1051,7 @@ static void view_zoomdrag_exit(bContext *C, wmOperator *op)
if (op->customdata) {
v2dViewZoomData *vzd = op->customdata;
+ vzd->v2d->flag &= ~V2D_IS_NAVIGATING;
if (vzd->timer) {
WM_event_remove_timer(CTX_wm_manager(C), CTX_wm_window(C), vzd->timer);
@@ -1911,6 +1922,8 @@ static void scroller_activate_init(bContext *C,
vsm->scrollbar_orig = ((scrollers.vert_max + scrollers.vert_min) / 2) + region->winrct.ymin;
}
+ vsm->v2d->flag |= V2D_IS_NAVIGATING;
+
ED_region_tag_redraw_no_rebuild(region);
}
@@ -1921,6 +1934,7 @@ static void scroller_activate_exit(bContext *C, wmOperator *op)
v2dScrollerMove *vsm = op->customdata;
vsm->v2d->scroll_ui &= ~(V2D_SCROLL_H_ACTIVE | V2D_SCROLL_V_ACTIVE);
+ vsm->v2d->flag &= ~V2D_IS_NAVIGATING;
MEM_freeN(op->customdata);
op->customdata = NULL;
diff --git a/source/blender/editors/mesh/editmesh_extrude_spin_gizmo.c b/source/blender/editors/mesh/editmesh_extrude_spin_gizmo.c
index ae37d6c8deb..5faafa77bba 100644
--- a/source/blender/editors/mesh/editmesh_extrude_spin_gizmo.c
+++ b/source/blender/editors/mesh/editmesh_extrude_spin_gizmo.c
@@ -461,7 +461,7 @@ void MESH_GGT_spin(struct wmGizmoGroupType *gzgt)
gzgt->name = "Mesh Spin Init";
gzgt->idname = "MESH_GGT_spin";
- gzgt->flag = WM_GIZMOGROUPTYPE_3D;
+ gzgt->flag = WM_GIZMOGROUPTYPE_TOOL_FALLBACK_KEYMAP | WM_GIZMOGROUPTYPE_3D;
gzgt->gzmap_params.spaceid = SPACE_VIEW3D;
gzgt->gzmap_params.regionid = RGN_TYPE_WINDOW;
@@ -1063,7 +1063,7 @@ void MESH_GGT_spin_redo(struct wmGizmoGroupType *gzgt)
gzgt->name = "Mesh Spin Redo";
gzgt->idname = "MESH_GGT_spin_redo";
- gzgt->flag = WM_GIZMOGROUPTYPE_3D;
+ gzgt->flag = WM_GIZMOGROUPTYPE_TOOL_FALLBACK_KEYMAP | WM_GIZMOGROUPTYPE_3D;
gzgt->gzmap_params.spaceid = SPACE_VIEW3D;
gzgt->gzmap_params.regionid = RGN_TYPE_WINDOW;
diff --git a/source/blender/editors/object/object_bake_api.c b/source/blender/editors/object/object_bake_api.c
index 0a2df655395..26f5b21a311 100644
--- a/source/blender/editors/object/object_bake_api.c
+++ b/source/blender/editors/object/object_bake_api.c
@@ -412,6 +412,7 @@ static bool is_noncolor_pass(eScenePassType pass_type)
{
return ELEM(pass_type,
SCE_PASS_Z,
+ SCE_PASS_POSITION,
SCE_PASS_NORMAL,
SCE_PASS_VECTOR,
SCE_PASS_INDEXOB,
@@ -554,19 +555,10 @@ static bool bake_pass_filter_check(eScenePassType pass_type,
return true;
}
- if ((pass_filter & R_BAKE_PASS_FILTER_AO) != 0) {
- BKE_report(
- reports,
- RPT_ERROR,
- "Combined bake pass Ambient Occlusion contribution requires an enabled light pass "
- "(bake the Ambient Occlusion pass type instead)");
- }
- else {
- BKE_report(reports,
- RPT_ERROR,
- "Combined bake pass requires Emit, or a light pass with "
- "Direct or Indirect contributions enabled");
- }
+ BKE_report(reports,
+ RPT_ERROR,
+ "Combined bake pass requires Emit, or a light pass with "
+ "Direct or Indirect contributions enabled");
return false;
}
diff --git a/source/blender/editors/render/render_preview.c b/source/blender/editors/render/render_preview.c
index 95351de45f0..81aecfdf788 100644
--- a/source/blender/editors/render/render_preview.c
+++ b/source/blender/editors/render/render_preview.c
@@ -479,15 +479,6 @@ static Scene *preview_prepare_scene(
BKE_color_managed_view_settings_free(&sce->view_settings);
BKE_color_managed_view_settings_copy(&sce->view_settings, &scene->view_settings);
- /* prevent overhead for small renders and icons (32) */
- if (id && sp->sizex < 40) {
- sce->r.tilex = sce->r.tiley = 64;
- }
- else {
- sce->r.tilex = sce->r.xsch / 4;
- sce->r.tiley = sce->r.ysch / 4;
- }
-
if ((id && sp->pr_method == PR_ICON_RENDER) && id_type != ID_WO) {
sce->r.alphamode = R_ALPHAPREMUL;
}
diff --git a/source/blender/editors/screen/area.c b/source/blender/editors/screen/area.c
index 9546035375c..c71e68df2fd 100644
--- a/source/blender/editors/screen/area.c
+++ b/source/blender/editors/screen/area.c
@@ -1735,10 +1735,14 @@ static void ed_default_handlers(
WM_event_add_keymap_handler(handlers, keymap);
}
if (flag & ED_KEYMAP_TOOL) {
- WM_event_add_keymap_handler_dynamic(
- &region->handlers, WM_event_get_keymap_from_toolsystem_fallback, area);
- WM_event_add_keymap_handler_dynamic(
- &region->handlers, WM_event_get_keymap_from_toolsystem, area);
+ if (flag & ED_KEYMAP_GIZMO) {
+ WM_event_add_keymap_handler_dynamic(
+ &region->handlers, WM_event_get_keymap_from_toolsystem_fallback, area);
+ }
+ else {
+ WM_event_add_keymap_handler_dynamic(
+ &region->handlers, WM_event_get_keymap_from_toolsystem, area);
+ }
}
if (flag & ED_KEYMAP_FRAMES) {
/* frame changing/jumping (for all spaces) */
diff --git a/source/blender/editors/space_file/filesel.c b/source/blender/editors/space_file/filesel.c
index 11b06d2b414..f7bdb4326a5 100644
--- a/source/blender/editors/space_file/filesel.c
+++ b/source/blender/editors/space_file/filesel.c
@@ -1271,7 +1271,7 @@ void file_params_rename_end(wmWindowManager *wm,
/* Ensure smooth-scroll timer is active, even if not needed, because that way rename state is
* handled properly. */
file_params_invoke_rename_postscroll(wm, win, sfile);
- /* Also always activate the rename file, even if renaming was cancelled. */
+ /* Also always activate the rename file, even if renaming was canceled. */
file_params_renamefile_activate(sfile, params);
}
diff --git a/source/blender/editors/space_node/node_draw.cc b/source/blender/editors/space_node/node_draw.cc
index aa241071425..10a3285be8b 100644
--- a/source/blender/editors/space_node/node_draw.cc
+++ b/source/blender/editors/space_node/node_draw.cc
@@ -1446,6 +1446,8 @@ static int node_error_type_to_icon(const geo_log::NodeWarningType type)
return ICON_ERROR;
case geo_log::NodeWarningType::Info:
return ICON_INFO;
+ case geo_log::NodeWarningType::Legacy:
+ return ICON_ERROR;
}
BLI_assert(false);
@@ -1456,6 +1458,8 @@ static uint8_t node_error_type_priority(const geo_log::NodeWarningType type)
{
switch (type) {
case geo_log::NodeWarningType::Error:
+ return 4;
+ case geo_log::NodeWarningType::Legacy:
return 3;
case geo_log::NodeWarningType::Warning:
return 2;
diff --git a/source/blender/editors/space_node/node_intern.h b/source/blender/editors/space_node/node_intern.h
index d35fd729131..f069038cc09 100644
--- a/source/blender/editors/space_node/node_intern.h
+++ b/source/blender/editors/space_node/node_intern.h
@@ -175,6 +175,7 @@ int space_node_view_flag(struct bContext *C,
void NODE_OT_view_all(struct wmOperatorType *ot);
void NODE_OT_view_selected(struct wmOperatorType *ot);
+void NODE_OT_geometry_node_view_legacy(struct wmOperatorType *ot);
void NODE_OT_backimage_move(struct wmOperatorType *ot);
void NODE_OT_backimage_zoom(struct wmOperatorType *ot);
diff --git a/source/blender/editors/space_node/node_ops.c b/source/blender/editors/space_node/node_ops.c
index 610c2889e7a..df4f63af20b 100644
--- a/source/blender/editors/space_node/node_ops.c
+++ b/source/blender/editors/space_node/node_ops.c
@@ -51,6 +51,7 @@ void node_operatortypes(void)
WM_operatortype_append(NODE_OT_view_all);
WM_operatortype_append(NODE_OT_view_selected);
+ WM_operatortype_append(NODE_OT_geometry_node_view_legacy);
WM_operatortype_append(NODE_OT_mute_toggle);
WM_operatortype_append(NODE_OT_hide_toggle);
diff --git a/source/blender/editors/space_node/node_view.cc b/source/blender/editors/space_node/node_view.cc
index f0db0539c4f..762b4b36a39 100644
--- a/source/blender/editors/space_node/node_view.cc
+++ b/source/blender/editors/space_node/node_view.cc
@@ -23,8 +23,10 @@
#include "DNA_node_types.h"
+#include "BLI_listbase.h"
#include "BLI_math.h"
#include "BLI_rect.h"
+#include "BLI_string_ref.hh"
#include "BLI_utildefines.h"
#include "BKE_context.h"
@@ -54,6 +56,8 @@
#include "node_intern.h" /* own include */
+using blender::StringRef;
+
/* -------------------------------------------------------------------- */
/** \name View All Operator
* \{ */
@@ -700,3 +704,89 @@ void NODE_OT_backimage_sample(wmOperatorType *ot)
}
/** \} */
+
+/* -------------------------------------------------------------------- */
+/** \name View Geometry Nodes Legacy Operator
+ *
+ * This operator should be removed when the 2.93 legacy nodes are removed.
+ * \{ */
+
+static int space_node_view_geometry_nodes_legacy(bContext *C, SpaceNode *snode, wmOperator *op)
+{
+ ARegion *region = CTX_wm_region(C);
+
+ /* Only use the node editor's active node tree. Otherwise this will be too complicated. */
+ bNodeTree *node_tree = snode->nodetree;
+ if (node_tree == nullptr || node_tree->type != NTREE_GEOMETRY) {
+ return OPERATOR_CANCELLED;
+ }
+
+ bool found_legacy_node = false;
+ LISTBASE_FOREACH_BACKWARD (bNode *, node, &node_tree->nodes) {
+ StringRef idname{node->idname};
+ if (idname.find("Legacy") == StringRef::not_found) {
+ node->flag &= ~NODE_SELECT;
+ }
+ else {
+ found_legacy_node = true;
+ node->flag |= NODE_SELECT;
+ }
+ }
+
+ if (!found_legacy_node) {
+ WM_report(RPT_INFO, "Legacy node not found, it may be inside a nested node group");
+ }
+
+ const int smooth_viewtx = WM_operator_smooth_viewtx_get(op);
+ if (space_node_view_flag(C, snode, region, NODE_SELECT, smooth_viewtx)) {
+ return OPERATOR_FINISHED;
+ }
+ return OPERATOR_CANCELLED;
+}
+
+static int geometry_node_view_legacy_exec(bContext *C, wmOperator *op)
+{
+ /* Allow running this operator directly in a specific node editor. */
+ if (SpaceNode *snode = CTX_wm_space_node(C)) {
+ return space_node_view_geometry_nodes_legacy(C, snode, op);
+ }
+
+ /* Since the operator is meant to be called from a button in the modifier panel, the node tree
+ * must be found from the screen, using the largest node editor if there is more than one. */
+ if (ScrArea *area = BKE_screen_find_big_area(CTX_wm_screen(C), SPACE_NODE, 0)) {
+ if (SpaceNode *snode = static_cast<SpaceNode *>(area->spacedata.first)) {
+ ScrArea *old_area = CTX_wm_area(C);
+ ARegion *old_region = CTX_wm_region(C);
+
+ /* Override the context since it is used by the View2D panning code. */
+ CTX_wm_area_set(C, area);
+ CTX_wm_region_set(C, static_cast<ARegion *>(area->regionbase.last));
+ const int result = space_node_view_geometry_nodes_legacy(C, snode, op);
+ CTX_wm_area_set(C, old_area);
+ CTX_wm_region_set(C, old_region);
+ return result;
+ }
+ }
+
+ return OPERATOR_CANCELLED;
+}
+
+static bool geometry_node_view_legacy_poll(bContext *C)
+{
+ /* Allow direct execution in a node editor, but also affecting any visible node editor. */
+ return ED_operator_node_active(C) || BKE_screen_find_big_area(CTX_wm_screen(C), SPACE_NODE, 0);
+}
+
+void NODE_OT_geometry_node_view_legacy(wmOperatorType *ot)
+{
+ ot->name = "View Deprecated Geometry Nodes";
+ ot->idname = "NODE_OT_geometry_node_view_legacy";
+ ot->description = "Select and view legacy geometry nodes in the node editor";
+
+ ot->exec = geometry_node_view_legacy_exec;
+ ot->poll = geometry_node_view_legacy_poll;
+
+ ot->flag = OPTYPE_INTERNAL;
+}
+
+/** \} */
diff --git a/source/blender/editors/space_outliner/outliner_draw.c b/source/blender/editors/space_outliner/outliner_draw.c
index c06a1010168..7cdfb553da5 100644
--- a/source/blender/editors/space_outliner/outliner_draw.c
+++ b/source/blender/editors/space_outliner/outliner_draw.c
@@ -2358,7 +2358,10 @@ TreeElementIcon tree_element_get_icon(TreeStoreElem *tselem, TreeElement *te)
case eGpencilModifierType_Texture:
data.icon = ICON_TEXTURE;
break;
- case eGpencilModifierType_Weight:
+ case eGpencilModifierType_WeightProximity:
+ data.icon = ICON_MOD_VERTEX_WEIGHT;
+ break;
+ case eGpencilModifierType_WeightAngle:
data.icon = ICON_MOD_VERTEX_WEIGHT;
break;
diff --git a/source/blender/editors/space_outliner/outliner_tree.c b/source/blender/editors/space_outliner/outliner_tree.c
index c5ec656080a..5427ae31ac3 100644
--- a/source/blender/editors/space_outliner/outliner_tree.c
+++ b/source/blender/editors/space_outliner/outliner_tree.c
@@ -1864,6 +1864,15 @@ static void outliner_filter_tree(SpaceOutliner *space_outliner, ViewLayer *view_
space_outliner, view_layer, &space_outliner->tree, search_string, exclude_filter);
}
+static void outliner_clear_newid_from_main(Main *bmain)
+{
+ ID *id_iter;
+ FOREACH_MAIN_ID_BEGIN (bmain, id_iter) {
+ id_iter->newid = NULL;
+ }
+ FOREACH_MAIN_ID_END;
+}
+
/* ======================================================= */
/* Main Tree Building API */
@@ -1926,5 +1935,7 @@ void outliner_build_tree(Main *mainvar,
outliner_filter_tree(space_outliner, view_layer);
outliner_restore_scrolling_position(space_outliner, region, &focus);
- BKE_main_id_newptr_and_tag_clear(mainvar);
+ /* The `ID.newid` pointer is abused while building the tree, DO NOT call
+ * #BKE_main_id_newptr_and_tag_clear as it expects valid IDs in this pointer, not random
+ * unknown data. */
+ outliner_clear_newid_from_main(mainvar);
}
diff --git a/source/blender/editors/space_sequencer/sequencer_draw.c b/source/blender/editors/space_sequencer/sequencer_draw.c
index 5b39feacfe3..53f1c35776c 100644
--- a/source/blender/editors/space_sequencer/sequencer_draw.c
+++ b/source/blender/editors/space_sequencer/sequencer_draw.c
@@ -25,6 +25,7 @@
#include <string.h>
#include "BLI_blenlib.h"
+#include "BLI_ghash.h"
#include "BLI_math.h"
#include "BLI_string_utils.h"
#include "BLI_threads.h"
@@ -44,6 +45,7 @@
#include "BKE_context.h"
#include "BKE_fcurve.h"
#include "BKE_global.h"
+#include "BKE_main.h"
#include "BKE_scene.h"
#include "BKE_sound.h"
@@ -71,6 +73,7 @@
#include "BIF_glutil.h"
#include "SEQ_effects.h"
+#include "SEQ_iterator.h"
#include "SEQ_prefetch.h"
#include "SEQ_proxy.h"
#include "SEQ_relations.h"
@@ -1282,6 +1285,526 @@ static void draw_seq_fcurve_overlay(
}
}
+typedef struct ThumbnailDrawJob {
+ SeqRenderData context;
+ GHash *sequences_ghash;
+ Scene *scene;
+ rctf *view_area;
+ float pixelx;
+ float pixely;
+} ThumbnailDrawJob;
+
+typedef struct ThumbDataItem {
+ Sequence *seq_dupli;
+ Scene *scene;
+} ThumbDataItem;
+
+static void thumbnail_hash_data_free(void *val)
+{
+ ThumbDataItem *item = val;
+ SEQ_sequence_free(item->scene, item->seq_dupli, 0);
+ MEM_freeN(val);
+}
+
+static void thumbnail_freejob(void *data)
+{
+ ThumbnailDrawJob *tj = data;
+ BLI_ghash_free(tj->sequences_ghash, NULL, thumbnail_hash_data_free);
+ MEM_freeN(tj->view_area);
+ MEM_freeN(tj);
+}
+
+static void thumbnail_endjob(void *data)
+{
+ ThumbnailDrawJob *tj = data;
+ WM_main_add_notifier(NC_SCENE | ND_SEQUENCER, tj->scene);
+}
+
+static bool check_seq_need_thumbnails(Sequence *seq, rctf *view_area)
+{
+ if (seq->type != SEQ_TYPE_MOVIE && seq->type != SEQ_TYPE_IMAGE) {
+ return false;
+ }
+ if (min_ii(seq->startdisp, seq->start) > view_area->xmax) {
+ return false;
+ }
+ if (max_ii(seq->enddisp, seq->start + seq->len) < view_area->xmin) {
+ return false;
+ }
+ if (seq->machine + 1.0f < view_area->ymin) {
+ return false;
+ }
+ if (seq->machine > view_area->ymax) {
+ return false;
+ }
+
+ return true;
+}
+
+static void seq_get_thumb_image_dimensions(Sequence *seq,
+ float pixelx,
+ float pixely,
+ float *r_thumb_width,
+ float *r_thumb_height,
+ float *r_image_width,
+ float *r_image_height)
+{
+ float image_width = seq->strip->stripdata->orig_width;
+ float image_height = seq->strip->stripdata->orig_height;
+
+ /* Fit the dimensions so that the larger of x and y is SEQ_RENDER_THUMB_SIZE (256). */
+ float aspect_ratio = (float)image_width / image_height;
+ if (image_width > image_height) {
+ image_width = SEQ_RENDER_THUMB_SIZE;
+ image_height = round_fl_to_int(image_width / aspect_ratio);
+ }
+ else {
+ image_height = SEQ_RENDER_THUMB_SIZE;
+ image_width = round_fl_to_int(image_height * aspect_ratio);
+ }
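+
+ /* E.g. (illustrative): a 1920x1080 source becomes 256x144 and a 1080x1920 source becomes
+ * 144x256, with SEQ_RENDER_THUMB_SIZE == 256. */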
+
+ /* Calculate thumb dimensions. */
+ float thumb_height = (SEQ_STRIP_OFSTOP - SEQ_STRIP_OFSBOTTOM) - (20 * U.dpi_fac * pixely);
+ aspect_ratio = ((float)image_width) / image_height;
+ float thumb_h_px = thumb_height / pixely;
+ float thumb_width = aspect_ratio * thumb_h_px * pixelx;
+
+ if (r_thumb_height == NULL) {
+ *r_thumb_width = thumb_width;
+ return;
+ }
+
+ *r_thumb_height = thumb_height;
+ *r_image_width = image_width;
+ *r_image_height = image_height;
+ *r_thumb_width = thumb_width;
+}
+
+static float seq_thumbnail_get_start_frame(Sequence *seq, float frame_step, rctf *view_area)
+{
+ if (seq->start > view_area->xmin && seq->start < view_area->xmax) {
+ return seq->start;
+ }
+
+ /* Both drawing and caching check whether the strip is in the view area before calling this
+ * function, so assume that at least part of the strip is in view. */
+
+ int no_invisible_thumbs = (view_area->xmin - seq->start) / frame_step;
+ return ((no_invisible_thumbs - 1) * frame_step) + seq->start;
+}
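+
+/* Worked example with illustrative numbers: seq->start = 100, frame_step = 25 and
+ * view_area->xmin = 160 give no_invisible_thumbs = 2, so caching starts at frame 125,
+ * one step left of the first visible thumbnail. */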
+
+static void thumbnail_start_job(void *data,
+ short *stop,
+ short *UNUSED(do_update),
+ float *UNUSED(progress))
+{
+ ThumbnailDrawJob *tj = data;
+ float start_frame, frame_step;
+
+ GHashIterator gh_iter;
+ BLI_ghashIterator_init(&gh_iter, tj->sequences_ghash);
+ while (!BLI_ghashIterator_done(&gh_iter) && !*stop) {
+ Sequence *seq_orig = BLI_ghashIterator_getKey(&gh_iter);
+ ThumbDataItem *val = BLI_ghash_lookup(tj->sequences_ghash, seq_orig);
+
+ if (check_seq_need_thumbnails(seq_orig, tj->view_area)) {
+ seq_get_thumb_image_dimensions(
+ val->seq_dupli, tj->pixelx, tj->pixely, &frame_step, NULL, NULL, NULL);
+ start_frame = seq_thumbnail_get_start_frame(seq_orig, frame_step, tj->view_area);
+ SEQ_render_thumbnails(
+ &tj->context, val->seq_dupli, seq_orig, start_frame, frame_step, tj->view_area, stop);
+ SEQ_render_thumbnails_base_set(&tj->context, val->seq_dupli, seq_orig, tj->view_area, stop);
+ }
+ BLI_ghashIterator_step(&gh_iter);
+ }
+}
+
+static SeqRenderData sequencer_thumbnail_context_init(const bContext *C)
+{
+ struct Main *bmain = CTX_data_main(C);
+ struct Depsgraph *depsgraph = CTX_data_depsgraph_pointer(C);
+ Scene *scene = CTX_data_scene(C);
+ SpaceSeq *sseq = CTX_wm_space_seq(C);
+ SeqRenderData context = {0};
+
+ /* Take rectx and recty as 0 since the dimensions are not known here. The context is used to
+ * calculate the cache hash key, but these values are not needed for that, as the other
+ * SeqRenderData variables are unique enough. */
+ SEQ_render_new_render_data(bmain, depsgraph, scene, 0, 0, sseq->render_size, false, &context);
+ context.view_id = BKE_scene_multiview_view_id_get(&scene->r, STEREO_LEFT_NAME);
+ context.use_proxies = false;
+
+ return context;
+}
+
+static GHash *sequencer_thumbnail_ghash_init(const bContext *C, View2D *v2d, Editing *ed)
+{
+ Scene *scene = CTX_data_scene(C);
+
+ /* Set the data for thumbnail caching job. */
+ GHash *thumb_data_hash = BLI_ghash_ptr_new("seq_duplicates_and_origs");
+
+ LISTBASE_FOREACH (Sequence *, seq, ed->seqbasep) {
+ ThumbDataItem *val_need_update = BLI_ghash_lookup(thumb_data_hash, seq);
+ if (val_need_update == NULL && check_seq_need_thumbnails(seq, &v2d->cur)) {
+ ThumbDataItem *val = MEM_callocN(sizeof(ThumbDataItem), "Thumbnail Hash Values");
+ val->seq_dupli = SEQ_sequence_dupli_recursive(scene, scene, NULL, seq, 0);
+ val->scene = scene;
+ BLI_ghash_insert(thumb_data_hash, seq, val);
+ }
+ else {
+ if (val_need_update != NULL) {
+ val_need_update->seq_dupli->start = seq->start;
+ val_need_update->seq_dupli->startdisp = seq->startdisp;
+ }
+ }
+ }
+
+ return thumb_data_hash;
+}
+
+static void sequencer_thumbnail_init_job(const bContext *C, View2D *v2d, Editing *ed)
+{
+ wmJob *wm_job;
+ ThumbnailDrawJob *tj = NULL;
+ ScrArea *area = CTX_wm_area(C);
+ wm_job = WM_jobs_get(CTX_wm_manager(C),
+ CTX_wm_window(C),
+ CTX_data_scene(C),
+ "Draw Thumbnails",
+ 0,
+ WM_JOB_TYPE_SEQ_DRAW_THUMBNAIL);
+
+ /* Get the thumbnail job if it exists. */
+ tj = WM_jobs_customdata_get(wm_job);
+ if (!tj) {
+ tj = MEM_callocN(sizeof(ThumbnailDrawJob), "Thumbnail cache job");
+
+ /* Duplicate the value of v2d->cur so the job keeps its own copy, separate from the view. */
+ rctf *view_area = MEM_callocN(sizeof(struct rctf), "viewport area");
+ view_area->xmax = v2d->cur.xmax;
+ view_area->xmin = v2d->cur.xmin;
+ view_area->ymax = v2d->cur.ymax;
+ view_area->ymin = v2d->cur.ymin;
+
+ tj->scene = CTX_data_scene(C);
+ tj->view_area = view_area;
+ tj->context = sequencer_thumbnail_context_init(C);
+ tj->sequences_ghash = sequencer_thumbnail_ghash_init(C, v2d, ed);
+ tj->pixelx = BLI_rctf_size_x(&v2d->cur) / BLI_rcti_size_x(&v2d->mask);
+ tj->pixely = BLI_rctf_size_y(&v2d->cur) / BLI_rcti_size_y(&v2d->mask);
+ WM_jobs_customdata_set(wm_job, tj, thumbnail_freejob);
+ WM_jobs_timer(wm_job, 0.1, NC_SCENE | ND_SEQUENCER, NC_SCENE | ND_SEQUENCER);
+ WM_jobs_callbacks(wm_job, thumbnail_start_job, NULL, NULL, thumbnail_endjob);
+ }
+
+ if (!WM_jobs_is_running(wm_job)) {
+ G.is_break = false;
+ WM_jobs_start(CTX_wm_manager(C), wm_job);
+ }
+ else {
+ WM_event_add_notifier(C, NC_SCENE | ND_SEQUENCER, NULL);
+ }
+
+ ED_area_tag_redraw(area);
+}
+
+static bool sequencer_thumbnail_v2d_is_navigating(const bContext *C)
+{
+ ARegion *region = CTX_wm_region(C);
+ View2D *v2d = &region->v2d;
+ return (v2d->flag & V2D_IS_NAVIGATING) != 0;
+}
+
+static void sequencer_thumbnail_start_job_if_necessary(const bContext *C,
+ Editing *ed,
+ View2D *v2d,
+ bool thumbnail_is_missing)
+{
+ SpaceSeq *sseq = CTX_wm_space_seq(C);
+
+ if (sequencer_thumbnail_v2d_is_navigating(C)) {
+ WM_event_add_notifier(C, NC_SCENE | ND_SEQUENCER, NULL);
+ return;
+ }
+
+ /* `thumbnail_is_missing` should be set to true if a strip is missing a thumbnail image, and
+ * false for the regular call over all strips. */
+ if (v2d->cur.xmax != sseq->runtime.last_thumbnail_area.xmax ||
+ v2d->cur.ymax != sseq->runtime.last_thumbnail_area.ymax || thumbnail_is_missing) {
+
+ /* Stop the job first as view has changed. Pointless to continue old job. */
+ if (v2d->cur.xmax != sseq->runtime.last_thumbnail_area.xmax ||
+ v2d->cur.ymax != sseq->runtime.last_thumbnail_area.ymax) {
+ WM_jobs_stop(CTX_wm_manager(C), NULL, thumbnail_start_job);
+ }
+
+ sequencer_thumbnail_init_job(C, v2d, ed);
+ sseq->runtime.last_thumbnail_area = v2d->cur;
+ }
+}
+
+void last_displayed_thumbnails_list_free(void *val)
+{
+ BLI_gset_free(val, NULL);
+}
+
+static GSet *last_displayed_thumbnails_list_ensure(const bContext *C, Sequence *seq)
+{
+ SpaceSeq *sseq = CTX_wm_space_seq(C);
+ if (sseq->runtime.last_displayed_thumbnails == NULL) {
+ sseq->runtime.last_displayed_thumbnails = BLI_ghash_ptr_new(__func__);
+ }
+
+ GSet *displayed_thumbnails = BLI_ghash_lookup(sseq->runtime.last_displayed_thumbnails, seq);
+ if (displayed_thumbnails == NULL) {
+ displayed_thumbnails = BLI_gset_int_new(__func__);
+ BLI_ghash_insert(sseq->runtime.last_displayed_thumbnails, seq, displayed_thumbnails);
+ }
+
+ return displayed_thumbnails;
+}
+
+static void last_displayed_thumbnails_list_cleanup(GSet *previously_displayed,
+ float range_start,
+ float range_end)
+{
+ GSetIterator gset_iter;
+ BLI_gsetIterator_init(&gset_iter, previously_displayed);
+ while (!BLI_gsetIterator_done(&gset_iter)) {
+ int frame = POINTER_AS_INT(BLI_gsetIterator_getKey(&gset_iter));
+ BLI_gsetIterator_step(&gset_iter);
+
+ if (frame > range_start && frame < range_end) {
+ BLI_gset_remove(previously_displayed, POINTER_FROM_INT(frame), NULL);
+ }
+ }
+}
+
+static int sequencer_thumbnail_closest_previous_frame_get(int timeline_frame,
+ GSet *previously_displayed)
+{
+ int best_diff = INT_MAX;
+ int best_frame = timeline_frame;
+
+ /* Previously displayed thumbnails. */
+ GSetIterator gset_iter;
+ BLI_gsetIterator_init(&gset_iter, previously_displayed);
+ while (!BLI_gsetIterator_done(&gset_iter)) {
+ int frame = POINTER_AS_INT(BLI_gsetIterator_getKey(&gset_iter));
+ int diff = abs(frame - timeline_frame);
+ if (diff < best_diff) {
+ best_diff = diff;
+ best_frame = frame;
+ }
+ BLI_gsetIterator_step(&gset_iter);
+ }
+ return best_frame;
+}
+
+static int sequencer_thumbnail_closest_guaranteed_frame_get(Sequence *seq, int timeline_frame)
+{
+ if (timeline_frame <= seq->startdisp) {
+ return seq->startdisp;
+ }
+
+ /* Set of "guaranteed" thumbnails. */
+ const int frame_index = timeline_frame - seq->startdisp;
+ const int frame_step = SEQ_render_thumbnails_guaranteed_set_frame_step_get(seq);
+ const int relative_base_frame = round_fl_to_int((frame_index / (float)frame_step)) * frame_step;
+ const int nearest_guaranteed_absolute_frame = relative_base_frame + seq->startdisp;
+ return nearest_guaranteed_absolute_frame;
+}
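+
+/* Worked example with illustrative numbers: seq->startdisp = 100, frame_step = 20 and
+ * timeline_frame = 137 give frame_index = 37, which rounds to the guaranteed thumbnail
+ * frame at 140. */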
+
+static ImBuf *sequencer_thumbnail_closest_from_memory(const SeqRenderData *context,
+ Sequence *seq,
+ int timeline_frame,
+ GSet *previously_displayed,
+ rcti *crop,
+ bool clipped)
+{
+ int frame_previous = sequencer_thumbnail_closest_previous_frame_get(timeline_frame,
+ previously_displayed);
+ ImBuf *ibuf_previous = SEQ_get_thumbnail(context, seq, frame_previous, crop, clipped);
+
+ int frame_guaranteed = sequencer_thumbnail_closest_guaranteed_frame_get(seq, timeline_frame);
+ ImBuf *ibuf_guaranteed = SEQ_get_thumbnail(context, seq, frame_guaranteed, crop, clipped);
+
+ ImBuf *closest_in_memory = NULL;
+
+ if (ibuf_previous && ibuf_guaranteed) {
+ if (abs(frame_previous - timeline_frame) < abs(frame_guaranteed - timeline_frame)) {
+ IMB_freeImBuf(ibuf_guaranteed);
+ closest_in_memory = ibuf_previous;
+ }
+ else {
+ IMB_freeImBuf(ibuf_previous);
+ closest_in_memory = ibuf_guaranteed;
+ }
+ }
+
+ if (ibuf_previous == NULL) {
+ closest_in_memory = ibuf_guaranteed;
+ }
+
+ if (ibuf_guaranteed == NULL) {
+ closest_in_memory = ibuf_previous;
+ }
+
+ return closest_in_memory;
+}
+
+static void draw_seq_strip_thumbnail(View2D *v2d,
+ const bContext *C,
+ Scene *scene,
+ Sequence *seq,
+ float y1,
+ float y2,
+ float pixelx,
+ float pixely)
+{
+ bool clipped = false;
+ float image_height, image_width, thumb_width, thumb_height;
+ rcti crop;
+
+ /* If the strip is too small vertically, skip drawing thumbnails. */
+ if ((y2 - y1) / pixely <= 40 * U.dpi_fac) {
+ return;
+ }
+
+ SeqRenderData context = sequencer_thumbnail_context_init(C);
+
+ if ((seq->flag & SEQ_FLAG_SKIP_THUMBNAILS) != 0) {
+ return;
+ }
+
+ seq_get_thumb_image_dimensions(
+ seq, pixelx, pixely, &thumb_width, &thumb_height, &image_width, &image_height);
+
+ float thumb_y_end = y1 + thumb_height - pixely;
+
+ float cut_off = 0;
+ float upper_thumb_bound = (seq->endstill) ? (seq->start + seq->len) : seq->enddisp;
+ if (seq->type == SEQ_TYPE_IMAGE) {
+ upper_thumb_bound = seq->enddisp;
+ }
+
+ float thumb_x_start = seq_thumbnail_get_start_frame(seq, thumb_width, &v2d->cur);
+ float thumb_x_end;
+
+ while (thumb_x_start + thumb_width < v2d->cur.xmin) {
+ thumb_x_start += thumb_width;
+ }
+
+ /* Ignore thumbs to the left of strip. */
+ while (thumb_x_start + thumb_width < seq->startdisp) {
+ thumb_x_start += thumb_width;
+ }
+
+ GSet *last_displayed_thumbnails = last_displayed_thumbnails_list_ensure(C, seq);
+ /* Clean up the thumbnail list outside of the rendered range; entries are removed one by one
+ * to prevent flickering after zooming. */
+ if (!sequencer_thumbnail_v2d_is_navigating(C)) {
+ last_displayed_thumbnails_list_cleanup(last_displayed_thumbnails, -FLT_MAX, thumb_x_start);
+ }
+
+ /* Start drawing. */
+ while (thumb_x_start < upper_thumb_bound) {
+ thumb_x_end = thumb_x_start + thumb_width;
+ clipped = false;
+
+ /* Checks to make sure that thumbs are loaded only when in view and within the confines of the
+ * strip. Some may not be required, but it is safer to keep them, as the start position here is
+ * the point to start caching from, not drawing. */
+ if (thumb_x_start > v2d->cur.xmax) {
+ break;
+ }
+
+ /* Set the clipping bound to show the left handle moving over thumbs and not shift thumbs. */
+ if (IN_RANGE_INCL(seq->startdisp, thumb_x_start, thumb_x_end)) {
+ cut_off = seq->startdisp - thumb_x_start;
+ clipped = true;
+ }
+
+ /* Clip if full thumbnail cannot be displayed. */
+ if (thumb_x_end > (upper_thumb_bound)) {
+ thumb_x_end = upper_thumb_bound;
+ clipped = true;
+ if (thumb_x_end - thumb_x_start < 1) {
+ break;
+ }
+ }
+
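+ /* Convert the clipped region from view space into source image pixel space for cropping. */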
+ float zoom_x = thumb_width / image_width;
+ float zoom_y = thumb_height / image_height;
+
+ float cropx_min = (cut_off / pixelx) / (zoom_y / pixely);
+ float cropx_max = ((thumb_x_end - thumb_x_start) / pixelx) / (zoom_y / pixely);
+ if (cropx_max == (thumb_x_end - thumb_x_start)) {
+ cropx_max = cropx_max + 1;
+ }
+ BLI_rcti_init(&crop, (int)cropx_min, (int)cropx_max, 0, (int)image_height - 1);
+
+ int timeline_frame = round_fl_to_int(thumb_x_start);
+
+ /* Get the image. */
+ ImBuf *ibuf = SEQ_get_thumbnail(&context, seq, timeline_frame, &crop, clipped);
+
+ if (!ibuf) {
+ sequencer_thumbnail_start_job_if_necessary(C, scene->ed, v2d, true);
+
+ ibuf = sequencer_thumbnail_closest_from_memory(
+ &context, seq, timeline_frame, last_displayed_thumbnails, &crop, clipped);
+ }
+ /* Store recently rendered frames, so they can be reused when zooming. */
+ else if (!sequencer_thumbnail_v2d_is_navigating(C)) {
+ /* Clear images in frame range occupied by new thumbnail. */
+ last_displayed_thumbnails_list_cleanup(
+ last_displayed_thumbnails, thumb_x_start, thumb_x_end);
+ /* Insert new thumbnail frame to list. */
+ BLI_gset_add(last_displayed_thumbnails, POINTER_FROM_INT(timeline_frame));
+ }
+
+ /* If there is still no image, abort. */
+ if (!ibuf) {
+ break;
+ }
+
+ /* Transparency on overlap. */
+ if (seq->flag & SEQ_OVERLAP) {
+ GPU_blend(GPU_BLEND_ALPHA);
+ if (ibuf->rect) {
+ unsigned char *buf = (unsigned char *)ibuf->rect;
+ for (int pixel = ibuf->x * ibuf->y; pixel--; buf += 4) {
+ buf[3] = OVERLAP_ALPHA;
+ }
+ }
+ else if (ibuf->rect_float) {
+ float *buf = (float *)ibuf->rect_float;
+ for (int pixel = ibuf->x * ibuf->y; pixel--; buf += ibuf->channels) {
+ buf[3] = (OVERLAP_ALPHA / 255.0f);
+ }
+ }
+ }
+
+ ED_draw_imbuf_ctx_clipping(C,
+ ibuf,
+ thumb_x_start + cut_off,
+ y1,
+ true,
+ thumb_x_start + cut_off,
+ y1,
+ thumb_x_end,
+ thumb_y_end,
+ zoom_x,
+ zoom_y);
+ IMB_freeImBuf(ibuf);
+ GPU_blend(GPU_BLEND_NONE);
+ cut_off = 0;
+ thumb_x_start += thumb_width;
+ }
+ last_displayed_thumbnails_list_cleanup(last_displayed_thumbnails, thumb_x_start, FLT_MAX);
+}
+
/* Draw visible strips. Bounds checks have already been made. */
static void draw_seq_strip(const bContext *C,
SpaceSeq *sseq,
@@ -1356,6 +1879,12 @@ static void draw_seq_strip(const bContext *C,
}
if ((sseq->flag & SEQ_SHOW_OVERLAY) &&
+ (sseq->timeline_overlay.flag & SEQ_TIMELINE_SHOW_THUMBNAILS) &&
+ (seq->type == SEQ_TYPE_MOVIE || seq->type == SEQ_TYPE_IMAGE)) {
+ draw_seq_strip_thumbnail(v2d, C, scene, seq, y1, y2, pixelx, pixely);
+ }
+
+ if ((sseq->flag & SEQ_SHOW_OVERLAY) &&
(sseq->timeline_overlay.flag & SEQ_TIMELINE_SHOW_FCURVES)) {
draw_seq_fcurve_overlay(scene, v2d, seq, x1, y1, x2, y2, pixelx);
}
@@ -2056,6 +2585,64 @@ static int sequencer_draw_get_transform_preview_frame(Scene *scene)
return preview_frame;
}
+static void seq_draw_image_origin_and_outline(const bContext *C, Sequence *seq)
+{
+ SpaceSeq *sseq = CTX_wm_space_seq(C);
+ if ((seq->flag & SELECT) == 0) {
+ return;
+ }
+ if (ED_screen_animation_no_scrub(CTX_wm_manager(C))) {
+ return;
+ }
+ if ((sseq->flag & SEQ_SHOW_OVERLAY) == 0 ||
+ (sseq->preview_overlay.flag & SEQ_PREVIEW_SHOW_OUTLINE_SELECTED) == 0) {
+ return;
+ }
+ if (ELEM(sseq->mainb, SEQ_DRAW_IMG_WAVEFORM, SEQ_DRAW_IMG_VECTORSCOPE, SEQ_DRAW_IMG_HISTOGRAM)) {
+ return;
+ }
+
+ float origin[2];
+ SEQ_image_transform_origin_offset_pixelspace_get(CTX_data_scene(C), seq, origin);
+
+ /* Origin. */
+ GPUVertFormat *format = immVertexFormat();
+ uint pos = GPU_vertformat_attr_add(format, "pos", GPU_COMP_F32, 2, GPU_FETCH_FLOAT);
+ immBindBuiltinProgram(GPU_SHADER_2D_POINT_UNIFORM_SIZE_UNIFORM_COLOR_OUTLINE_AA);
+ immUniform1f("outlineWidth", 1.5f);
+ immUniformColor3f(1.0f, 1.0f, 1.0f);
+ immUniform4f("outlineColor", 0.0f, 0.0f, 0.0f, 1.0f);
+ immUniform1f("size", 15.0f * U.pixelsize);
+ immBegin(GPU_PRIM_POINTS, 1);
+ immVertex2f(pos, origin[0], origin[1]);
+ immEnd();
+ immUnbindProgram();
+
+ /* Outline. */
+ float seq_image_quad[4][2];
+ SEQ_image_transform_final_quad_get(CTX_data_scene(C), seq, seq_image_quad);
+
+ GPU_line_smooth(true);
+ GPU_blend(GPU_BLEND_ALPHA);
+ GPU_line_width(2);
+ immBindBuiltinProgram(GPU_SHADER_2D_UNIFORM_COLOR);
+
+ float col[3];
+ UI_GetThemeColor3fv(TH_SEQ_SELECTED, col);
+ immUniformColor3fv(col);
+ immUniform1f("lineWidth", U.pixelsize);
+ immBegin(GPU_PRIM_LINE_LOOP, 4);
+ immVertex2f(pos, seq_image_quad[0][0], seq_image_quad[0][1]);
+ immVertex2f(pos, seq_image_quad[1][0], seq_image_quad[1][1]);
+ immVertex2f(pos, seq_image_quad[2][0], seq_image_quad[2][1]);
+ immVertex2f(pos, seq_image_quad[3][0], seq_image_quad[3][1]);
+ immEnd();
+ immUnbindProgram();
+ GPU_line_width(1);
+ GPU_blend(GPU_BLEND_NONE);
+ GPU_line_smooth(false);
+}
+
void sequencer_draw_preview(const bContext *C,
Scene *scene,
ARegion *region,
@@ -2132,9 +2719,17 @@ void sequencer_draw_preview(const bContext *C,
sequencer_draw_borders_overlay(sseq, v2d, scene);
}
+ SeqCollection *collection = SEQ_query_rendered_strips(&scene->ed->seqbase, timeline_frame, 0);
+ Sequence *seq;
+ SEQ_ITERATOR_FOREACH (seq, collection) {
+ seq_draw_image_origin_and_outline(C, seq);
+ }
+ SEQ_collection_free(collection);
+
if (draw_gpencil && show_imbuf && (sseq->flag & SEQ_SHOW_OVERLAY)) {
sequencer_draw_gpencil_overlay(C);
}
+
#if 0
sequencer_draw_maskedit(C, scene, region, sseq);
#endif
diff --git a/source/blender/editors/space_sequencer/sequencer_edit.c b/source/blender/editors/space_sequencer/sequencer_edit.c
index b95b7fa0620..9f21fc0676c 100644
--- a/source/blender/editors/space_sequencer/sequencer_edit.c
+++ b/source/blender/editors/space_sequencer/sequencer_edit.c
@@ -579,7 +579,6 @@ static int sequencer_slip_invoke(bContext *C, wmOperator *op, const wmEvent *eve
static bool sequencer_slip_recursively(Scene *scene, SlipData *data, int offset)
{
/* Only some data types are supported for now. */
- Editing *ed = SEQ_editing_get(scene);
bool changed = false;
/* Iterate in reverse so meta-strips are iterated after their children. */
@@ -633,7 +632,10 @@ static bool sequencer_slip_recursively(Scene *scene, SlipData *data, int offset)
}
}
if (changed) {
- SEQ_relations_free_imbuf(scene, &ed->seqbase, false);
+ for (int i = data->num_seq - 1; i >= 0; i--) {
+ Sequence *seq = data->seq_array[i];
+ SEQ_relations_invalidate_cache_preprocessed(scene, seq);
+ }
}
return changed;
}
diff --git a/source/blender/editors/space_sequencer/sequencer_intern.h b/source/blender/editors/space_sequencer/sequencer_intern.h
index 767ac76efe6..5b5c381509f 100644
--- a/source/blender/editors/space_sequencer/sequencer_intern.h
+++ b/source/blender/editors/space_sequencer/sequencer_intern.h
@@ -67,6 +67,7 @@ struct ImBuf *sequencer_ibuf_get(struct Main *bmain,
int timeline_frame,
int frame_ofs,
const char *viewname);
+void last_displayed_thumbnails_list_free(void *val);
/* sequencer_edit.c */
struct View2D;
diff --git a/source/blender/editors/space_sequencer/sequencer_select.c b/source/blender/editors/space_sequencer/sequencer_select.c
index 80d3e2cbdaa..aa6599a7c53 100644
--- a/source/blender/editors/space_sequencer/sequencer_select.c
+++ b/source/blender/editors/space_sequencer/sequencer_select.c
@@ -44,6 +44,7 @@
#include "SEQ_sequencer.h"
#include "SEQ_time.h"
#include "SEQ_transform.h"
+#include "SEQ_utils.h"
/* For menu, popup, icons, etc. */
@@ -385,6 +386,20 @@ void recurs_sel_seq(Sequence *seq_meta)
}
}
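+/* True if the point lies inside the strip's transformed image quad. */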
+static bool seq_point_image_isect(const Scene *scene, const Sequence *seq, float point[2])
+{
+ float seq_image_quad[4][2];
+ SEQ_image_transform_final_quad_get(scene, seq, seq_image_quad);
+ return isect_point_quad_v2(
+ point, seq_image_quad[0], seq_image_quad[1], seq_image_quad[2], seq_image_quad[3]);
+}
+
+static void sequencer_select_do_updates(bContext *C, Scene *scene)
+{
+ ED_outliner_select_sync_from_sequence_tag(C);
+ WM_event_add_notifier(C, NC_SCENE | ND_SEQUENCER | NA_SELECTED, scene);
+}
+
/** \} */
/* -------------------------------------------------------------------- */
@@ -523,12 +538,6 @@ static void sequencer_select_set_active(Scene *scene, Sequence *seq)
recurs_sel_seq(seq);
}
-static void sequencer_select_do_updates(bContext *C, Scene *scene)
-{
- ED_outliner_select_sync_from_sequence_tag(C);
- WM_event_add_notifier(C, NC_SCENE | ND_SEQUENCER | NA_SELECTED, scene);
-}
-
static void sequencer_select_side_of_frame(const bContext *C,
const View2D *v2d,
const int mval[2],
@@ -626,6 +635,45 @@ static void sequencer_select_linked_handle(const bContext *C,
}
}
+/* Check if the click happened on an image that belongs to a strip. If multiple strips are
+ * found, they are cycled through in stacking order. */
+static Sequence *seq_select_seq_from_preview(const bContext *C, const int mval[2])
+{
+ Scene *scene = CTX_data_scene(C);
+ Editing *ed = SEQ_editing_get(scene);
+ ListBase *seqbase = SEQ_active_seqbase_get(ed);
+ SpaceSeq *sseq = CTX_wm_space_seq(C);
+ View2D *v2d = UI_view2d_fromcontext(C);
+
+ float mouseco_view[2];
+ UI_view2d_region_to_view(v2d, mval[0], mval[1], &mouseco_view[0], &mouseco_view[1]);
+
+ SeqCollection *strips = SEQ_query_rendered_strips(seqbase, scene->r.cfra, sseq->chanshown);
+ ListBase strips_ordered = {NULL};
+ Sequence *seq;
+ SEQ_ITERATOR_FOREACH (seq, strips) {
+ if (seq_point_image_isect(scene, seq, mouseco_view)) {
+ BLI_remlink(seqbase, seq);
+ BLI_addtail(&strips_ordered, seq);
+ }
+ }
+ SEQ_collection_free(strips);
+ SEQ_sort(&strips_ordered);
+
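+ /* If the active strip is among the hits, select the strip after it, so repeated clicks
+ * cycle through overlapping images. */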
+ Sequence *seq_active = SEQ_select_active_get(scene);
+ Sequence *seq_select = strips_ordered.first;
+ LISTBASE_FOREACH (Sequence *, seq_iter, &strips_ordered) {
+ if (seq_iter == seq_active && seq_iter->next != NULL) {
+ seq_select = seq_iter->next;
+ break;
+ }
+ }
+
+ BLI_movelisttolist(seqbase, &strips_ordered);
+
+ return seq_select;
+}
+
static bool element_already_selected(const Sequence *seq, const int handle_clicked)
{
const bool handle_already_selected = ((handle_clicked == SEQ_SIDE_LEFT) &&
@@ -680,8 +728,15 @@ static int sequencer_select_exec(bContext *C, wmOperator *op)
mval[0] = RNA_int_get(op->ptr, "mouse_x");
mval[1] = RNA_int_get(op->ptr, "mouse_y");
- int handle_clicked;
- Sequence *seq = find_nearest_seq(scene, v2d, &handle_clicked, mval);
+ ARegion *region = CTX_wm_region(C);
+ int handle_clicked = SEQ_SIDE_NONE;
+ Sequence *seq = NULL;
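+ /* In the preview region, select the strip whose image lies under the cursor; in the
+ * timeline, select the nearest strip. */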
+ if (region->regiontype == RGN_TYPE_PREVIEW) {
+ seq = seq_select_seq_from_preview(C, mval);
+ }
+ else {
+ seq = find_nearest_seq(scene, v2d, &handle_clicked, mval);
+ }
/* NOTE: `side_of_frame` and `linked_time` functionality is designed to be shared on one keymap,
* therefore both properties can be true at the same time. */
@@ -1311,6 +1366,47 @@ void SEQUENCER_OT_select_side(wmOperatorType *ot)
/** \name Box Select Operator
* \{ */
+static bool seq_box_select_rect_image_isect(const Scene *scene, const Sequence *seq, rctf *rect)
+{
+ float seq_image_quad[4][2];
+ SEQ_image_transform_final_quad_get(scene, seq, seq_image_quad);
+ float rect_quad[4][2] = {{rect->xmax, rect->ymax},
+ {rect->xmax, rect->ymin},
+ {rect->xmin, rect->ymin},
+ {rect->xmin, rect->ymax}};
+
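+ /* The quads intersect if any corner of one lies inside the other. Crossings that contain no
+ * corner are missed, which is acceptable for box select. */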
+ return seq_point_image_isect(scene, seq, rect_quad[0]) ||
+ seq_point_image_isect(scene, seq, rect_quad[1]) ||
+ seq_point_image_isect(scene, seq, rect_quad[2]) ||
+ seq_point_image_isect(scene, seq, rect_quad[3]) ||
+ isect_point_quad_v2(
+ seq_image_quad[0], rect_quad[0], rect_quad[1], rect_quad[2], rect_quad[3]) ||
+ isect_point_quad_v2(
+ seq_image_quad[1], rect_quad[0], rect_quad[1], rect_quad[2], rect_quad[3]) ||
+ isect_point_quad_v2(
+ seq_image_quad[2], rect_quad[0], rect_quad[1], rect_quad[2], rect_quad[3]) ||
+ isect_point_quad_v2(
+ seq_image_quad[3], rect_quad[0], rect_quad[1], rect_quad[2], rect_quad[3]);
+}
+
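+/* Box select in the preview region: select every rendered strip whose image intersects the
+ * rectangle. */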
+static void seq_box_select_seq_from_preview(const bContext *C, rctf *rect)
+{
+ Scene *scene = CTX_data_scene(C);
+ Editing *ed = SEQ_editing_get(scene);
+ ListBase *seqbase = SEQ_active_seqbase_get(ed);
+ SpaceSeq *sseq = CTX_wm_space_seq(C);
+
+ SeqCollection *strips = SEQ_query_rendered_strips(seqbase, scene->r.cfra, sseq->chanshown);
+ Sequence *seq;
+ SEQ_ITERATOR_FOREACH (seq, strips) {
+ if (seq_box_select_rect_image_isect(scene, seq, rect)) {
+ seq->flag |= SELECT;
+ }
+ }
+
+ SEQ_collection_free(strips);
+}
+
static int sequencer_box_select_exec(bContext *C, wmOperator *op)
{
Scene *scene = CTX_data_scene(C);
@@ -1333,6 +1429,13 @@ static int sequencer_box_select_exec(bContext *C, wmOperator *op)
WM_operator_properties_border_to_rctf(op, &rectf);
UI_view2d_region_to_view_rctf(v2d, &rectf, &rectf);
+ ARegion *region = CTX_wm_region(C);
+ if (region->regiontype == RGN_TYPE_PREVIEW) {
+ seq_box_select_seq_from_preview(C, &rectf);
+ sequencer_select_do_updates(C, scene);
+ return OPERATOR_FINISHED;
+ }
+
LISTBASE_FOREACH (Sequence *, seq, ed->seqbasep) {
rctf rq;
seq_rectf(seq, &rq);
@@ -1378,9 +1481,7 @@ static int sequencer_box_select_exec(bContext *C, wmOperator *op)
}
}
- ED_outliner_select_sync_from_sequence_tag(C);
-
- WM_event_add_notifier(C, NC_SCENE | ND_SEQUENCER | NA_SELECTED, scene);
+ sequencer_select_do_updates(C, scene);
return OPERATOR_FINISHED;
}
diff --git a/source/blender/editors/space_sequencer/space_sequencer.c b/source/blender/editors/space_sequencer/space_sequencer.c
index 0d09f2564e8..99b75f82922 100644
--- a/source/blender/editors/space_sequencer/space_sequencer.c
+++ b/source/blender/editors/space_sequencer/space_sequencer.c
@@ -32,6 +32,7 @@
#include "MEM_guardedalloc.h"
#include "BLI_blenlib.h"
+#include "BLI_ghash.h"
#include "BLI_utildefines.h"
#include "BKE_context.h"
@@ -42,6 +43,7 @@
#include "ED_screen.h"
#include "ED_space_api.h"
+#include "ED_transform.h"
#include "ED_view3d.h"
#include "ED_view3d_offscreen.h" /* Only for sequencer view3d drawing callback. */
@@ -98,10 +100,14 @@ static SpaceLink *sequencer_create(const ScrArea *UNUSED(area), const Scene *sce
sseq->chanshown = 0;
sseq->view = SEQ_VIEW_SEQUENCE;
sseq->mainb = SEQ_DRAW_IMG_IMBUF;
- sseq->flag = SEQ_PREVIEW_SHOW_GPENCIL | SEQ_USE_ALPHA | SEQ_SHOW_MARKERS |
- SEQ_TIMELINE_SHOW_FCURVES | SEQ_ZOOM_TO_FIT | SEQ_SHOW_OVERLAY |
- SEQ_TIMELINE_SHOW_STRIP_NAME | SEQ_TIMELINE_SHOW_STRIP_SOURCE |
- SEQ_TIMELINE_SHOW_STRIP_DURATION | SEQ_TIMELINE_SHOW_GRID;
+ sseq->flag = SEQ_USE_ALPHA | SEQ_SHOW_MARKERS | SEQ_ZOOM_TO_FIT | SEQ_SHOW_OVERLAY;
+ sseq->preview_overlay.flag = SEQ_PREVIEW_SHOW_GPENCIL | SEQ_PREVIEW_SHOW_OUTLINE_SELECTED;
+ sseq->timeline_overlay.flag = SEQ_TIMELINE_SHOW_STRIP_NAME | SEQ_TIMELINE_SHOW_STRIP_SOURCE |
+ SEQ_TIMELINE_SHOW_STRIP_DURATION | SEQ_TIMELINE_SHOW_GRID |
+ SEQ_TIMELINE_SHOW_FCURVES;
+
+ BLI_rctf_init(&sseq->runtime.last_thumbnail_area, 0.0f, 0.0f, 0.0f, 0.0f);
+ sseq->runtime.last_displayed_thumbnails = NULL;
/* Tool header. */
region = MEM_callocN(sizeof(ARegion), "tool header for sequencer");
@@ -172,7 +178,7 @@ static SpaceLink *sequencer_create(const ScrArea *UNUSED(area), const Scene *sce
region->v2d.cur = region->v2d.tot;
region->v2d.min[0] = 10.0f;
- region->v2d.min[1] = 0.5f;
+ region->v2d.min[1] = 4.0f;
region->v2d.max[0] = MAXFRAMEF;
region->v2d.max[1] = MAXSEQ;
@@ -186,6 +192,8 @@ static SpaceLink *sequencer_create(const ScrArea *UNUSED(area), const Scene *sce
region->v2d.keeptot = 0;
region->v2d.align = V2D_ALIGN_NO_NEG_Y;
return (SpaceLink *)sseq;
}
@@ -216,6 +224,12 @@ static void sequencer_free(SpaceLink *sl)
if (scopes->histogram_ibuf) {
IMB_freeImBuf(scopes->histogram_ibuf);
}
+
+ if (sseq->runtime.last_displayed_thumbnails) {
+ BLI_ghash_free(
+ sseq->runtime.last_displayed_thumbnails, NULL, last_displayed_thumbnails_list_free);
+ sseq->runtime.last_displayed_thumbnails = NULL;
+ }
}
/* Spacetype init callback. */
@@ -330,6 +344,7 @@ static SpaceLink *sequencer_duplicate(SpaceLink *sl)
/* XXX sseq->gpd = gpencil_data_duplicate(sseq->gpd, false); */
memset(&sseqn->scopes, 0, sizeof(sseqn->scopes));
+ memset(&sseqn->runtime, 0, sizeof(sseqn->runtime));
return (SpaceLink *)sseqn;
}
@@ -481,11 +496,72 @@ static void SEQUENCER_GGT_navigate(wmGizmoGroupType *gzgt)
VIEW2D_GGT_navigate_impl(gzgt, "SEQUENCER_GGT_navigate");
}
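+/* The gizmo group types below reuse the generic 2D transform gizmo callbacks, mapped onto the
+ * sequencer preview region. */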
+static void SEQUENCER_GGT_gizmo2d(wmGizmoGroupType *gzgt)
+{
+ gzgt->name = "Sequencer Transform Gizmo";
+ gzgt->idname = "SEQUENCER_GGT_gizmo2d";
+
+ gzgt->flag |= (WM_GIZMOGROUPTYPE_TOOL_FALLBACK_KEYMAP |
+ WM_GIZMOGROUPTYPE_DELAY_REFRESH_FOR_TWEAK);
+
+ gzgt->gzmap_params.spaceid = SPACE_SEQ;
+ gzgt->gzmap_params.regionid = RGN_TYPE_PREVIEW;
+
+ ED_widgetgroup_gizmo2d_xform_callbacks_set(gzgt);
+}
+
+static void SEQUENCER_GGT_gizmo2d_translate(wmGizmoGroupType *gzgt)
+{
+ gzgt->name = "Sequencer Translate Gizmo";
+ gzgt->idname = "SEQUENCER_GGT_gizmo2d_translate";
+
+ gzgt->flag |= (WM_GIZMOGROUPTYPE_TOOL_FALLBACK_KEYMAP |
+ WM_GIZMOGROUPTYPE_DELAY_REFRESH_FOR_TWEAK);
+
+ gzgt->gzmap_params.spaceid = SPACE_SEQ;
+ gzgt->gzmap_params.regionid = RGN_TYPE_PREVIEW;
+
+ ED_widgetgroup_gizmo2d_xform_no_cage_callbacks_set(gzgt);
+}
+
+static void SEQUENCER_GGT_gizmo2d_resize(wmGizmoGroupType *gzgt)
+{
+ gzgt->name = "Sequencer Transform Gizmo Resize";
+ gzgt->idname = "SEQUENCER_GGT_gizmo2d_resize";
+
+ gzgt->flag |= (WM_GIZMOGROUPTYPE_TOOL_FALLBACK_KEYMAP |
+ WM_GIZMOGROUPTYPE_DELAY_REFRESH_FOR_TWEAK);
+
+ gzgt->gzmap_params.spaceid = SPACE_SEQ;
+ gzgt->gzmap_params.regionid = RGN_TYPE_PREVIEW;
+
+ ED_widgetgroup_gizmo2d_resize_callbacks_set(gzgt);
+}
+
+static void SEQUENCER_GGT_gizmo2d_rotate(wmGizmoGroupType *gzgt)
+{
+ gzgt->name = "Sequencer Transform Gizmo Resize";
+ gzgt->idname = "SEQUENCER_GGT_gizmo2d_rotate";
+
+ gzgt->flag |= (WM_GIZMOGROUPTYPE_TOOL_FALLBACK_KEYMAP |
+ WM_GIZMOGROUPTYPE_DELAY_REFRESH_FOR_TWEAK);
+
+ gzgt->gzmap_params.spaceid = SPACE_SEQ;
+ gzgt->gzmap_params.regionid = RGN_TYPE_PREVIEW;
+
+ ED_widgetgroup_gizmo2d_rotate_callbacks_set(gzgt);
+}
+
static void sequencer_gizmos(void)
{
wmGizmoMapType *gzmap_type = WM_gizmomaptype_ensure(
&(const struct wmGizmoMapType_Params){SPACE_SEQ, RGN_TYPE_PREVIEW});
+ WM_gizmogrouptype_append(SEQUENCER_GGT_gizmo2d);
+ WM_gizmogrouptype_append(SEQUENCER_GGT_gizmo2d_translate);
+ WM_gizmogrouptype_append(SEQUENCER_GGT_gizmo2d_resize);
+ WM_gizmogrouptype_append(SEQUENCER_GGT_gizmo2d_rotate);
+
WM_gizmogrouptype_append_and_link(gzmap_type, SEQUENCER_GGT_navigate);
}
@@ -742,6 +818,8 @@ static void sequencer_preview_region_listener(const wmRegionListenerParams *para
ARegion *region = params->region;
wmNotifier *wmn = params->notifier;
+ WM_gizmomap_tag_refresh(region->gizmo_map);
+
/* Context changes. */
switch (wmn->category) {
case NC_GPENCIL:
diff --git a/source/blender/editors/space_view3d/view3d_gizmo_preselect.c b/source/blender/editors/space_view3d/view3d_gizmo_preselect.c
index 441182d7a5f..918ecb14752 100644
--- a/source/blender/editors/space_view3d/view3d_gizmo_preselect.c
+++ b/source/blender/editors/space_view3d/view3d_gizmo_preselect.c
@@ -58,7 +58,7 @@ void VIEW3D_GGT_mesh_preselect_elem(wmGizmoGroupType *gzgt)
gzgt->name = "Mesh Preselect Element";
gzgt->idname = "VIEW3D_GGT_mesh_preselect_elem";
- gzgt->flag = WM_GIZMOGROUPTYPE_3D;
+ gzgt->flag = WM_GIZMOGROUPTYPE_TOOL_FALLBACK_KEYMAP | WM_GIZMOGROUPTYPE_3D;
gzgt->gzmap_params.spaceid = SPACE_VIEW3D;
gzgt->gzmap_params.regionid = RGN_TYPE_WINDOW;
@@ -95,7 +95,7 @@ void VIEW3D_GGT_mesh_preselect_edgering(wmGizmoGroupType *gzgt)
gzgt->name = "Mesh Preselect Edge Ring";
gzgt->idname = "VIEW3D_GGT_mesh_preselect_edgering";
- gzgt->flag = WM_GIZMOGROUPTYPE_3D;
+ gzgt->flag = WM_GIZMOGROUPTYPE_TOOL_FALLBACK_KEYMAP | WM_GIZMOGROUPTYPE_3D;
gzgt->gzmap_params.spaceid = SPACE_VIEW3D;
gzgt->gzmap_params.regionid = RGN_TYPE_WINDOW;
diff --git a/source/blender/editors/space_view3d/view3d_select.c b/source/blender/editors/space_view3d/view3d_select.c
index 3f572bf9d5a..39aed131ea1 100644
--- a/source/blender/editors/space_view3d/view3d_select.c
+++ b/source/blender/editors/space_view3d/view3d_select.c
@@ -2813,7 +2813,9 @@ static int view3d_select_invoke(bContext *C, wmOperator *op, const wmEvent *even
{
RNA_int_set_array(op->ptr, "location", event->mval);
- return view3d_select_exec(C, op);
+ const int retval = view3d_select_exec(C, op);
+
+ return WM_operator_flag_only_pass_through_on_press(retval, event);
}
void VIEW3D_OT_select(wmOperatorType *ot)
diff --git a/source/blender/editors/transform/CMakeLists.txt b/source/blender/editors/transform/CMakeLists.txt
index e9efed3cd61..64a720322c1 100644
--- a/source/blender/editors/transform/CMakeLists.txt
+++ b/source/blender/editors/transform/CMakeLists.txt
@@ -60,6 +60,7 @@ set(SRC
transform_convert_particle.c
transform_convert_sculpt.c
transform_convert_sequencer.c
+ transform_convert_sequencer_image.c
transform_convert_tracking.c
transform_draw_cursors.c
transform_generics.c
diff --git a/source/blender/editors/transform/transform.c b/source/blender/editors/transform/transform.c
index 58491f8c2d3..e58e524e341 100644
--- a/source/blender/editors/transform/transform.c
+++ b/source/blender/editors/transform/transform.c
@@ -1703,11 +1703,13 @@ bool initTransform(bContext *C, TransInfo *t, wmOperator *op, const wmEvent *eve
t->draw_handle_cursor = WM_paint_cursor_activate(
SPACE_TYPE_ANY, RGN_TYPE_ANY, transform_draw_cursor_poll, transform_draw_cursor_draw, t);
}
- else if (t->spacetype == SPACE_SEQ) {
- t->draw_handle_view = ED_region_draw_cb_activate(
- t->region->type, drawTransformView, t, REGION_DRAW_POST_VIEW);
- }
- else if (ELEM(t->spacetype, SPACE_IMAGE, SPACE_CLIP, SPACE_NODE, SPACE_GRAPH, SPACE_ACTION)) {
+ else if (ELEM(t->spacetype,
+ SPACE_IMAGE,
+ SPACE_CLIP,
+ SPACE_NODE,
+ SPACE_GRAPH,
+ SPACE_ACTION,
+ SPACE_SEQ)) {
t->draw_handle_view = ED_region_draw_cb_activate(
t->region->type, drawTransformView, t, REGION_DRAW_POST_VIEW);
t->draw_handle_cursor = WM_paint_cursor_activate(
diff --git a/source/blender/editors/transform/transform.h b/source/blender/editors/transform/transform.h
index d1a1937cef1..7f4e533ccd7 100644
--- a/source/blender/editors/transform/transform.h
+++ b/source/blender/editors/transform/transform.h
@@ -87,15 +87,16 @@ typedef enum {
CTX_PAINT_CURVE = (1 << 7),
CTX_POSE_BONE = (1 << 8),
CTX_TEXTURE_SPACE = (1 << 9),
+ CTX_SEQUENCER_IMAGE = (1 << 10),
- CTX_NO_PET = (1 << 10),
- CTX_AUTOCONFIRM = (1 << 11),
+ CTX_NO_PET = (1 << 11),
+ CTX_AUTOCONFIRM = (1 << 12),
/** When transforming objects, adjust the object data so it stays in the same place. */
- CTX_OBMODE_XFORM_OBDATA = (1 << 12),
+ CTX_OBMODE_XFORM_OBDATA = (1 << 13),
/** Transform object parents without moving their children. */
- CTX_OBMODE_XFORM_SKIP_CHILDREN = (1 << 13),
+ CTX_OBMODE_XFORM_SKIP_CHILDREN = (1 << 14),
/** Enable edge scrolling in 2D views */
- CTX_VIEW2D_EDGE_PAN = (1 << 14),
+ CTX_VIEW2D_EDGE_PAN = (1 << 15),
} eTContext;
/** #TransInfo.flag */
@@ -240,6 +241,7 @@ typedef enum {
TC_PARTICLE_VERTS,
TC_SCULPT,
TC_SEQ_DATA,
+ TC_SEQ_IMAGE_DATA,
TC_TRACKING_DATA,
} eTConvertType;
diff --git a/source/blender/editors/transform/transform_convert.c b/source/blender/editors/transform/transform_convert.c
index d756e2c90a6..557fa79e7ac 100644
--- a/source/blender/editors/transform/transform_convert.c
+++ b/source/blender/editors/transform/transform_convert.c
@@ -955,6 +955,7 @@ void special_aftertrans_update(bContext *C, TransInfo *t)
case TC_OBJECT_TEXSPACE:
case TC_PAINT_CURVE_VERTS:
case TC_PARTICLE_VERTS:
+ case TC_SEQ_IMAGE_DATA:
case TC_NONE:
default:
break;
@@ -1042,6 +1043,7 @@ static void init_proportional_edit(TransInfo *t)
case TC_PAINT_CURVE_VERTS:
case TC_SCULPT:
case TC_SEQ_DATA:
+ case TC_SEQ_IMAGE_DATA:
case TC_TRACKING_DATA:
case TC_NONE:
default:
@@ -1120,6 +1122,7 @@ static void init_TransDataContainers(TransInfo *t,
case TC_PARTICLE_VERTS:
case TC_SCULPT:
case TC_SEQ_DATA:
+ case TC_SEQ_IMAGE_DATA:
case TC_TRACKING_DATA:
case TC_NONE:
default:
@@ -1204,6 +1207,7 @@ static eTFlag flags_from_data_type(eTConvertType data_type)
case TC_NODE_DATA:
case TC_PAINT_CURVE_VERTS:
case TC_SEQ_DATA:
+ case TC_SEQ_IMAGE_DATA:
case TC_TRACKING_DATA:
return T_POINTS | T_2D_EDIT;
case TC_ARMATURE_VERTS:
@@ -1282,7 +1286,12 @@ static eTConvertType convert_type_get(const TransInfo *t, Object **r_obj_armatur
convert_type = TC_NLA_DATA;
}
else if (t->spacetype == SPACE_SEQ) {
- convert_type = TC_SEQ_DATA;
+ if (t->options & CTX_SEQUENCER_IMAGE) {
+ convert_type = TC_SEQ_IMAGE_DATA;
+ }
+ else {
+ convert_type = TC_SEQ_DATA;
+ }
}
else if (t->spacetype == SPACE_GRAPH) {
convert_type = TC_GRAPH_EDIT_DATA;
@@ -1470,6 +1479,10 @@ void createTransData(bContext *C, TransInfo *t)
t->num.flag |= NUM_NO_FRACTION; /* sequencer has no use for floating point transform. */
createTransSeqData(t);
break;
+ case TC_SEQ_IMAGE_DATA:
+ t->obedit_type = -1;
+ createTransSeqImageData(t);
+ break;
case TC_TRACKING_DATA:
createTransTrackingData(C, t);
break;
@@ -1746,6 +1759,9 @@ void recalcData(TransInfo *t)
case TC_SEQ_DATA:
recalcData_sequencer(t);
break;
+ case TC_SEQ_IMAGE_DATA:
+ recalcData_sequencer_image(t);
+ break;
case TC_TRACKING_DATA:
recalcData_tracking(t);
break;
diff --git a/source/blender/editors/transform/transform_convert.h b/source/blender/editors/transform/transform_convert.h
index 9cb0400cad9..66d84bca2d2 100644
--- a/source/blender/editors/transform/transform_convert.h
+++ b/source/blender/editors/transform/transform_convert.h
@@ -218,6 +218,10 @@ void createTransSeqData(TransInfo *t);
void recalcData_sequencer(TransInfo *t);
void special_aftertrans_update__sequencer(bContext *C, TransInfo *t);
+/* transform_convert_sequencer_image.c */
+void createTransSeqImageData(TransInfo *t);
+void recalcData_sequencer_image(TransInfo *t);
+
/* transform_convert_tracking.c */
void createTransTrackingData(bContext *C, TransInfo *t);
void recalcData_tracking(TransInfo *t);
diff --git a/source/blender/editors/transform/transform_convert_sequencer_image.c b/source/blender/editors/transform/transform_convert_sequencer_image.c
new file mode 100644
index 00000000000..465f8b9a694
--- /dev/null
+++ b/source/blender/editors/transform/transform_convert_sequencer_image.c
@@ -0,0 +1,195 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2021 Blender Foundation.
+ * All rights reserved.
+ */
+
+/** \file
+ * \ingroup edtransform
+ */
+
+#include "MEM_guardedalloc.h"
+
+#include "DNA_space_types.h"
+
+#include "BLI_listbase.h"
+#include "BLI_math.h"
+
+#include "BKE_context.h"
+#include "BKE_report.h"
+
+#include "SEQ_iterator.h"
+#include "SEQ_relations.h"
+#include "SEQ_sequencer.h"
+#include "SEQ_time.h"
+#include "SEQ_transform.h"
+#include "SEQ_utils.h"
+
+#include "UI_view2d.h"
+
+#include "transform.h"
+#include "transform_convert.h"
+
+/** Used for sequencer transform. */
+typedef struct TransDataSeq {
+ struct Sequence *seq;
+ float orig_origin_position[2];
+ float orig_translation[2];
+ float orig_scale[2];
+ float orig_rotation;
+} TransDataSeq;
+
+static TransData *SeqToTransData(const Scene *scene,
+ Sequence *seq,
+ TransData *td,
+ TransData2D *td2d,
+ TransDataSeq *tdseq,
+ int vert_index)
+{
+ const StripTransform *transform = seq->strip->transform;
+ float origin[2];
+ SEQ_image_transform_origin_offset_pixelspace_get(scene, seq, origin);
+ float vertex[2] = {origin[0], origin[1]};
+
+ /* Add control vertex, so rotation and scale can be calculated. */
+ if (vert_index == 1) {
+ vertex[0] += 1.0f;
+ }
+ else if (vert_index == 2) {
+ vertex[1] += 1.0f;
+ }
+
+ td2d->loc[0] = vertex[0];
+ td2d->loc[1] = vertex[1];
+ td2d->loc2d = NULL;
+ td->loc = td2d->loc;
+ copy_v3_v3(td->iloc, td->loc);
+
+ td->center[0] = origin[0];
+ td->center[1] = origin[1];
+
+ memset(td->axismtx, 0, sizeof(td->axismtx));
+ td->axismtx[2][2] = 1.0f;
+ unit_m3(td->mtx);
+ unit_m3(td->smtx);
+
+ tdseq->seq = seq;
+ copy_v2_v2(tdseq->orig_origin_position, origin);
+ tdseq->orig_translation[0] = transform->xofs;
+ tdseq->orig_translation[1] = transform->yofs;
+ tdseq->orig_scale[0] = transform->scale_x;
+ tdseq->orig_scale[1] = transform->scale_y;
+ tdseq->orig_rotation = transform->rotation;
+
+ td->extra = (void *)tdseq;
+ td->ext = NULL;
+ td->flag |= TD_SELECTED;
+ td->dist = 0.0;
+
+ return td;
+}
+
+static void freeSeqData(TransInfo *UNUSED(t),
TransDataContainer *tc,
TransCustomData *UNUSED(custom_data))
+{
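+ /* All #TransDataSeq entries were allocated as a single block in #createTransSeqImageData, so
+ * the first TransData's extra pointer is the start of that block. */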
+ TransData *td = (TransData *)tc->data;
+ MEM_freeN(td->extra);
+}
+
+void createTransSeqImageData(TransInfo *t)
+{
+ Editing *ed = SEQ_editing_get(t->scene);
+ if (ed == NULL) {
+ return;
+ }
+
+ ListBase *seqbase = SEQ_active_seqbase_get(ed);
+ SeqCollection *strips = SEQ_query_rendered_strips(seqbase, t->scene->r.cfra, 0);
+ SEQ_filter_selected_strips(strips);
+
+ const int count = SEQ_collection_len(strips);
+ if (count == 0) {
+ SEQ_collection_free(strips);
+ return;
+ }
+
+ TransDataContainer *tc = TRANS_DATA_CONTAINER_FIRST_SINGLE(t);
+ tc->custom.type.free_cb = freeSeqData;
+
+ tc->data_len = count * 3; /* 3 vertices per sequence are needed. */
+ TransData *td = tc->data = MEM_callocN(tc->data_len * sizeof(TransData), "TransSeq TransData");
+ TransData2D *td2d = tc->data_2d = MEM_callocN(tc->data_len * sizeof(TransData2D),
+ "TransSeq TransData2D");
+ TransDataSeq *tdseq = MEM_callocN(tc->data_len * sizeof(TransDataSeq), "TransSeq TransDataSeq");
+
+ Sequence *seq;
+ SEQ_ITERATOR_FOREACH (seq, strips) {
+ /* One `Sequence` needs 3 `TransData` entries - center point placed in image origin, then 2
+ * points offset by 1 in X and Y direction respectively, so rotation and scale can be
+ * calculated from these points. */
+ SeqToTransData(t->scene, seq, td++, td2d++, tdseq++, 0);
+ SeqToTransData(t->scene, seq, td++, td2d++, tdseq++, 1);
+ SeqToTransData(t->scene, seq, td++, td2d++, tdseq++, 2);
+ }
+
+ SEQ_collection_free(strips);
+}
+
+void recalcData_sequencer_image(TransInfo *t)
+{
+ TransDataContainer *tc = TRANS_DATA_CONTAINER_FIRST_SINGLE(t);
+ TransData *td = NULL;
+ TransData2D *td2d = NULL;
+ int i;
+
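+ /* Each strip consumes three consecutive TransData entries: the origin, then the X and Y
+ * control points (see #createTransSeqImageData). */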
+ for (i = 0, td = tc->data, td2d = tc->data_2d; i < tc->data_len; i++, td++, td2d++) {
+ /* Origin. */
+ float loc[2];
+ copy_v2_v2(loc, td2d->loc);
+ i++, td++, td2d++;
+
+ /* X and Y control points used to read scale and rotation. */
+ float handle_x[2];
+ copy_v2_v2(handle_x, td2d->loc);
+ sub_v2_v2(handle_x, loc);
+ i++, td++, td2d++;
+ float handle_y[2];
+ copy_v2_v2(handle_y, td2d->loc);
+ sub_v2_v2(handle_y, loc);
+
+ TransDataSeq *tdseq = td->extra;
+ Sequence *seq = tdseq->seq;
+ StripTransform *transform = seq->strip->transform;
+ float mirror[2];
+ SEQ_image_transform_mirror_factor_get(seq, mirror);
+
+ /* Calculate translation. */
+ float translation[2];
+ copy_v2_v2(translation, tdseq->orig_origin_position);
+ sub_v2_v2(translation, loc);
+ mul_v2_v2(translation, mirror);
+ transform->xofs = tdseq->orig_translation[0] - translation[0];
+ transform->yofs = tdseq->orig_translation[1] - translation[1];
+
+ /* Scale. */
+ transform->scale_x = tdseq->orig_scale[0] * fabs(len_v2(handle_x));
+ transform->scale_y = tdseq->orig_scale[1] * fabs(len_v2(handle_y));
+
+ /* Rotation. Negative scaling can flip the sign of the angle, so normalize it into the
+ * [0, 360) degree range. */
+ if (t->mode == TFM_ROTATION) {
+ float rotation = angle_signed_v2v2(handle_x, (float[]){1, 0}) * mirror[0] * mirror[1];
+ transform->rotation = tdseq->orig_rotation + rotation;
+ transform->rotation += DEG2RAD(360.0);
+ transform->rotation = fmod(transform->rotation, DEG2RAD(360.0));
+ }
+ SEQ_relations_invalidate_cache_preprocessed(t->scene, seq);
+ }
+}
diff --git a/source/blender/editors/transform/transform_draw_cursors.c b/source/blender/editors/transform/transform_draw_cursors.c
index ead8eae0997..af1f3cb72a4 100644
--- a/source/blender/editors/transform/transform_draw_cursors.c
+++ b/source/blender/editors/transform/transform_draw_cursors.c
@@ -95,7 +95,7 @@ static void drawArrow(const uint pos_id, const enum eArrowDirection dir)
bool transform_draw_cursor_poll(bContext *C)
{
ARegion *region = CTX_wm_region(C);
- return (region && region->regiontype == RGN_TYPE_WINDOW) ? 1 : 0;
+ return region && ELEM(region->regiontype, RGN_TYPE_WINDOW, RGN_TYPE_PREVIEW);
}
/**
diff --git a/source/blender/editors/transform/transform_generics.c b/source/blender/editors/transform/transform_generics.c
index c493b9bd102..fa323f0c1f7 100644
--- a/source/blender/editors/transform/transform_generics.c
+++ b/source/blender/editors/transform/transform_generics.c
@@ -59,6 +59,8 @@
#include "UI_resources.h"
#include "UI_view2d.h"
+#include "SEQ_sequencer.h"
+
#include "transform.h"
#include "transform_convert.h"
#include "transform_mode.h"
@@ -335,6 +337,11 @@ void initTransInfo(bContext *C, TransInfo *t, wmOperator *op, const wmEvent *eve
t->options |= CTX_MASK;
}
}
+ else if (t->spacetype == SPACE_SEQ && region->regiontype == RGN_TYPE_PREVIEW) {
+ t->view = &region->v2d;
+ t->around = SEQ_tool_settings_pivot_point_get(t->scene);
+ t->options |= CTX_SEQUENCER_IMAGE;
+ }
else {
if (region) {
/* XXX: For now, get View2D from the active region. */
diff --git a/source/blender/editors/transform/transform_gizmo_2d.c b/source/blender/editors/transform/transform_gizmo_2d.c
index 0b677e2560b..0d66db0d7e1 100644
--- a/source/blender/editors/transform/transform_gizmo_2d.c
+++ b/source/blender/editors/transform/transform_gizmo_2d.c
@@ -49,6 +49,11 @@
#include "ED_screen.h"
#include "ED_uvedit.h"
+#include "SEQ_iterator.h"
+#include "SEQ_sequencer.h"
+#include "SEQ_time.h"
+#include "SEQ_transform.h"
+
#include "transform.h" /* own include */
/* -------------------------------------------------------------------- */
@@ -234,17 +239,66 @@ static bool gizmo2d_calc_bounds(const bContext *C, float *r_center, float *r_min
return changed;
}
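+/* Return the active strip's image rotation (corrected for mirroring), so gizmo axes can be
+ * aligned with the image. Returns 0 outside the sequencer or when the active strip is not a
+ * rendered, selected strip. */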
+static float gizmo2d_calc_rotation(const bContext *C)
+{
+ ScrArea *area = CTX_wm_area(C);
+ if (area->spacetype != SPACE_SEQ) {
+ return 0.0f;
+ }
+
+ Scene *scene = CTX_data_scene(C);
+ Editing *ed = SEQ_editing_get(scene);
+ ListBase *seqbase = SEQ_active_seqbase_get(ed);
+ SeqCollection *strips = SEQ_query_rendered_strips(seqbase, scene->r.cfra, 0);
+ SEQ_filter_selected_strips(strips);
+
+ Sequence *seq;
+ SEQ_ITERATOR_FOREACH (seq, strips) {
+ if (seq == ed->act_seq) {
+ StripTransform *transform = seq->strip->transform;
+ float mirror[2];
+ SEQ_image_transform_mirror_factor_get(seq, mirror);
+ SEQ_collection_free(strips);
+ return transform->rotation * mirror[0] * mirror[1];
+ }
+ }
+
+ SEQ_collection_free(strips);
+ return 0.0f;
+}
+
static bool gizmo2d_calc_center(const bContext *C, float r_center[2])
{
ScrArea *area = CTX_wm_area(C);
+ Scene *scene = CTX_data_scene(C);
bool has_select = false;
zero_v2(r_center);
if (area->spacetype == SPACE_IMAGE) {
SpaceImage *sima = area->spacedata.first;
- Scene *scene = CTX_data_scene(C);
ViewLayer *view_layer = CTX_data_view_layer(C);
ED_uvedit_center_from_pivot_ex(sima, scene, view_layer, r_center, sima->around, &has_select);
}
+ else if (area->spacetype == SPACE_SEQ) {
+ ListBase *seqbase = SEQ_active_seqbase_get(SEQ_editing_get(scene));
+ SeqCollection *strips = SEQ_query_rendered_strips(seqbase, scene->r.cfra, 0);
+ SEQ_filter_selected_strips(strips);
+
+ if (SEQ_collection_len(strips) <= 0) {
+ SEQ_collection_free(strips);
+ return false;
+ }
+
+ has_select = true;
+ Sequence *seq;
+ SEQ_ITERATOR_FOREACH (seq, strips) {
+ float origin[2];
+ SEQ_image_transform_origin_offset_pixelspace_get(scene, seq, origin);
+ add_v2_v2(r_center, origin);
+ }
+ mul_v2_fl(r_center, 1.0f / SEQ_collection_len(strips));
+
+ SEQ_collection_free(strips);
+ }
return has_select;
}
@@ -338,7 +392,7 @@ static void gizmo2d_xform_setup(const bContext *UNUSED(C), wmGizmoGroup *gzgroup
}
}
- RNA_boolean_set(ptr, "release_confirm", 1);
+ RNA_boolean_set(ptr, "release_confirm", true);
}
{
@@ -539,6 +593,7 @@ void ED_widgetgroup_gizmo2d_xform_no_cage_callbacks_set(wmGizmoGroupType *gzgt)
typedef struct GizmoGroup_Resize2D {
wmGizmo *gizmo_xy[3];
float origin[2];
+ float rotation;
} GizmoGroup_Resize2D;
static GizmoGroup_Resize2D *gizmogroup2d_resize_init(wmGizmoGroup *gzgroup)
@@ -571,6 +626,7 @@ static void gizmo2d_resize_refresh(const bContext *C, wmGizmoGroup *gzgroup)
ggd->gizmo_xy[i]->flag &= ~WM_GIZMO_HIDDEN;
}
copy_v2_v2(ggd->origin, origin);
+ ggd->rotation = gizmo2d_calc_rotation(C);
}
}
@@ -595,6 +651,13 @@ static void gizmo2d_resize_draw_prepare(const bContext *C, wmGizmoGroup *gzgroup
for (int i = 0; i < ARRAY_SIZE(ggd->gizmo_xy); i++) {
wmGizmo *gz = ggd->gizmo_xy[i];
WM_gizmo_set_matrix_location(gz, origin);
+
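+ /* Rotate only the X and Y axis arrows, so they match the image rotation. */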
+ if (i < 2) {
+ float axis[3] = {0.0f}, rotated_axis[3];
+ axis[i] = 1.0f;
+ rotate_v3_v3v3fl(rotated_axis, axis, (float[3]){0, 0, 1}, ggd->rotation);
+ WM_gizmo_set_matrix_rotation_from_z_axis(gz, rotated_axis);
+ }
}
}
@@ -617,10 +680,6 @@ static void gizmo2d_resize_setup(const bContext *UNUSED(C), wmGizmoGroup *gzgrou
/* set up widget data */
RNA_float_set(gz->ptr, "length", 1.0f);
- float axis[3] = {0.0f};
- axis[i] = 1.0f;
- WM_gizmo_set_matrix_rotation_from_z_axis(gz, axis);
-
RNA_enum_set(gz->ptr, "draw_style", ED_GIZMO_ARROW_STYLE_BOX);
WM_gizmo_set_line_width(gz, GIZMO_AXIS_LINE_WIDTH);
diff --git a/source/blender/editors/transform/transform_mode.c b/source/blender/editors/transform/transform_mode.c
index b9fb8a86752..b14d499cb66 100644
--- a/source/blender/editors/transform/transform_mode.c
+++ b/source/blender/editors/transform/transform_mode.c
@@ -75,7 +75,7 @@ bool transdata_check_local_center(const TransInfo *t, short around)
/* implicit: (t->flag & T_EDIT) */
(ELEM(t->obedit_type, OB_MESH, OB_CURVE, OB_MBALL, OB_ARMATURE, OB_GPENCIL)) ||
(t->spacetype == SPACE_GRAPH) ||
- (t->options & (CTX_MOVIECLIP | CTX_MASK | CTX_PAINT_CURVE))));
+ (t->options & (CTX_MOVIECLIP | CTX_MASK | CTX_PAINT_CURVE | CTX_SEQUENCER_IMAGE))));
}
/* Informs if the mode can be switched during modal. */
diff --git a/source/blender/editors/transform/transform_snap_sequencer.c b/source/blender/editors/transform/transform_snap_sequencer.c
index e82a00bcc77..2acdf5cfd9c 100644
--- a/source/blender/editors/transform/transform_snap_sequencer.c
+++ b/source/blender/editors/transform/transform_snap_sequencer.c
@@ -254,6 +254,10 @@ static int seq_snap_threshold_get_frame_distance(const TransInfo *t)
TransSeqSnapData *transform_snap_sequencer_data_alloc(const TransInfo *t)
{
+ if (t->data_type == TC_SEQ_IMAGE_DATA) {
+ return NULL;
+ }
+
TransSeqSnapData *snap_data = MEM_callocN(sizeof(TransSeqSnapData), __func__);
ListBase *seqbase = SEQ_active_seqbase_get(SEQ_editing_get(t->scene));
diff --git a/source/blender/editors/uvedit/uvedit_select.c b/source/blender/editors/uvedit/uvedit_select.c
index c0ccf1b7095..86390882bed 100644
--- a/source/blender/editors/uvedit/uvedit_select.c
+++ b/source/blender/editors/uvedit/uvedit_select.c
@@ -2122,7 +2122,9 @@ static int uv_select_invoke(bContext *C, wmOperator *op, const wmEvent *event)
UI_view2d_region_to_view(&region->v2d, event->mval[0], event->mval[1], &co[0], &co[1]);
RNA_float_set_array(op->ptr, "location", co);
- return uv_select_exec(C, op);
+ const int retval = uv_select_exec(C, op);
+
+ return WM_operator_flag_only_pass_through_on_press(retval, event);
}
void UV_OT_select(wmOperatorType *ot)
@@ -2281,7 +2283,9 @@ static int uv_select_loop_invoke(bContext *C, wmOperator *op, const wmEvent *eve
UI_view2d_region_to_view(&region->v2d, event->mval[0], event->mval[1], &co[0], &co[1]);
RNA_float_set_array(op->ptr, "location", co);
- return uv_select_loop_exec(C, op);
+ const int retval = uv_select_loop_exec(C, op);
+
+ return WM_operator_flag_only_pass_through_on_press(retval, event);
}
void UV_OT_select_loop(wmOperatorType *ot)
@@ -2341,7 +2345,9 @@ static int uv_select_edge_ring_invoke(bContext *C, wmOperator *op, const wmEvent
UI_view2d_region_to_view(&region->v2d, event->mval[0], event->mval[1], &co[0], &co[1]);
RNA_float_set_array(op->ptr, "location", co);
- return uv_select_edge_ring_exec(C, op);
+ const int retval = uv_select_edge_ring_exec(C, op);
+
+ return WM_operator_flag_only_pass_through_on_press(retval, event);
}
void UV_OT_select_edge_ring(wmOperatorType *ot)
diff --git a/source/blender/freestyle/intern/blender_interface/BlenderStrokeRenderer.cpp b/source/blender/freestyle/intern/blender_interface/BlenderStrokeRenderer.cpp
index 937a10f26b1..0a82c237256 100644
--- a/source/blender/freestyle/intern/blender_interface/BlenderStrokeRenderer.cpp
+++ b/source/blender/freestyle/intern/blender_interface/BlenderStrokeRenderer.cpp
@@ -94,17 +94,15 @@ BlenderStrokeRenderer::BlenderStrokeRenderer(Render *re, int render_count)
freestyle_scene = BKE_scene_add(freestyle_bmain, name);
freestyle_scene->r.cfra = old_scene->r.cfra;
freestyle_scene->r.mode = old_scene->r.mode & ~(R_EDGE_FRS | R_BORDER);
- freestyle_scene->r.xsch = re->rectx; // old_scene->r.xsch
- freestyle_scene->r.ysch = re->recty; // old_scene->r.ysch
- freestyle_scene->r.xasp = 1.0f; // old_scene->r.xasp;
- freestyle_scene->r.yasp = 1.0f; // old_scene->r.yasp;
- freestyle_scene->r.tilex = old_scene->r.tilex;
- freestyle_scene->r.tiley = old_scene->r.tiley;
+ freestyle_scene->r.xsch = re->rectx; // old_scene->r.xsch
+ freestyle_scene->r.ysch = re->recty; // old_scene->r.ysch
+ freestyle_scene->r.xasp = 1.0f; // old_scene->r.xasp;
+ freestyle_scene->r.yasp = 1.0f; // old_scene->r.yasp;
freestyle_scene->r.size = 100; // old_scene->r.size
freestyle_scene->r.color_mgt_flag = 0; // old_scene->r.color_mgt_flag;
freestyle_scene->r.scemode = (old_scene->r.scemode &
~(R_SINGLE_LAYER | R_NO_FRAME_UPDATE | R_MULTIVIEW)) &
- (re->r.scemode | ~R_FULL_SAMPLE);
+ (re->r.scemode);
freestyle_scene->r.flag = old_scene->r.flag;
freestyle_scene->r.threads = old_scene->r.threads;
freestyle_scene->r.border.xmin = old_scene->r.border.xmin;
diff --git a/source/blender/functions/FN_cpp_type.hh b/source/blender/functions/FN_cpp_type.hh
index 7277bf99c12..643b2fc1f28 100644
--- a/source/blender/functions/FN_cpp_type.hh
+++ b/source/blender/functions/FN_cpp_type.hh
@@ -96,6 +96,7 @@ class CPPType : NonCopyable, NonMovable {
int64_t size_ = 0;
int64_t alignment_ = 0;
uintptr_t alignment_mask_ = 0;
+ bool is_trivial_ = false;
bool is_trivially_destructible_ = false;
bool has_special_member_functions_ = false;
@@ -340,7 +341,6 @@ class CPPType : NonCopyable, NonMovable {
*/
void copy_assign(const void *src, void *dst) const
{
- BLI_assert(src != dst);
BLI_assert(this->pointer_can_point_to_instance(src));
BLI_assert(this->pointer_can_point_to_instance(dst));
@@ -371,7 +371,7 @@ class CPPType : NonCopyable, NonMovable {
*/
void copy_construct(const void *src, void *dst) const
{
- BLI_assert(src != dst);
+ BLI_assert(src != dst || is_trivial_);
BLI_assert(this->pointer_can_point_to_instance(src));
BLI_assert(this->pointer_can_point_to_instance(dst));
@@ -402,7 +402,6 @@ class CPPType : NonCopyable, NonMovable {
*/
void move_assign(void *src, void *dst) const
{
- BLI_assert(src != dst);
BLI_assert(this->pointer_can_point_to_instance(src));
BLI_assert(this->pointer_can_point_to_instance(dst));
@@ -433,7 +432,7 @@ class CPPType : NonCopyable, NonMovable {
*/
void move_construct(void *src, void *dst) const
{
- BLI_assert(src != dst);
+ BLI_assert(src != dst || is_trivial_);
BLI_assert(this->pointer_can_point_to_instance(src));
BLI_assert(this->pointer_can_point_to_instance(dst));
@@ -464,7 +463,7 @@ class CPPType : NonCopyable, NonMovable {
*/
void relocate_assign(void *src, void *dst) const
{
- BLI_assert(src != dst);
+ BLI_assert(src != dst || is_trivial_);
BLI_assert(this->pointer_can_point_to_instance(src));
BLI_assert(this->pointer_can_point_to_instance(dst));
@@ -495,7 +494,7 @@ class CPPType : NonCopyable, NonMovable {
*/
void relocate_construct(void *src, void *dst) const
{
- BLI_assert(src != dst);
+ BLI_assert(src != dst || is_trivial_);
BLI_assert(this->pointer_can_point_to_instance(src));
BLI_assert(this->pointer_can_point_to_instance(dst));
diff --git a/source/blender/functions/FN_cpp_type_make.hh b/source/blender/functions/FN_cpp_type_make.hh
index 088f6b469f4..74dbcabf81a 100644
--- a/source/blender/functions/FN_cpp_type_make.hh
+++ b/source/blender/functions/FN_cpp_type_make.hh
@@ -195,6 +195,7 @@ CPPType::CPPType(CPPTypeParam<T, Flags> /* unused */, StringRef debug_name)
debug_name_ = debug_name;
size_ = (int64_t)sizeof(T);
alignment_ = (int64_t)alignof(T);
+ is_trivial_ = std::is_trivial_v<T>;
is_trivially_destructible_ = std::is_trivially_destructible_v<T>;
if constexpr (std::is_default_constructible_v<T>) {
default_construct_ = default_construct_cb<T>;
diff --git a/source/blender/functions/intern/multi_function_procedure.cc b/source/blender/functions/intern/multi_function_procedure.cc
index fa95e8de71e..986c5dff0c4 100644
--- a/source/blender/functions/intern/multi_function_procedure.cc
+++ b/source/blender/functions/intern/multi_function_procedure.cc
@@ -419,6 +419,10 @@ bool MFProcedure::validate_initialization() const
const MultiFunction &fn = *instruction->fn_;
for (const int param_index : fn.param_indices()) {
const MFParamType param_type = fn.param_type(param_index);
+ /* If the parameter was an unneeded output, it could be null. */
+ if (!instruction->params_[param_index]) {
+ continue;
+ }
const MFVariable &variable = *instruction->params_[param_index];
const InitState state = this->find_initialization_state_before_instruction(*instruction,
variable);
diff --git a/source/blender/gpencil_modifiers/CMakeLists.txt b/source/blender/gpencil_modifiers/CMakeLists.txt
index adf68e534bb..eb1f61b1862 100644
--- a/source/blender/gpencil_modifiers/CMakeLists.txt
+++ b/source/blender/gpencil_modifiers/CMakeLists.txt
@@ -69,7 +69,8 @@ set(SRC
intern/MOD_gpencilthick.c
intern/MOD_gpenciltime.c
intern/MOD_gpenciltint.c
- intern/MOD_gpencilweight.c
+ intern/MOD_gpencilweight_proximity.c
+ intern/MOD_gpencilweight_angle.c
MOD_gpencil_lineart.h
MOD_gpencil_modifiertypes.h
diff --git a/source/blender/gpencil_modifiers/MOD_gpencil_modifiertypes.h b/source/blender/gpencil_modifiers/MOD_gpencil_modifiertypes.h
index 043186155b7..d9285f44a37 100644
--- a/source/blender/gpencil_modifiers/MOD_gpencil_modifiertypes.h
+++ b/source/blender/gpencil_modifiers/MOD_gpencil_modifiertypes.h
@@ -44,7 +44,8 @@ extern GpencilModifierTypeInfo modifierType_Gpencil_Armature;
extern GpencilModifierTypeInfo modifierType_Gpencil_Time;
extern GpencilModifierTypeInfo modifierType_Gpencil_Multiply;
extern GpencilModifierTypeInfo modifierType_Gpencil_Texture;
-extern GpencilModifierTypeInfo modifierType_Gpencil_Weight;
+extern GpencilModifierTypeInfo modifierType_Gpencil_WeightProximity;
+extern GpencilModifierTypeInfo modifierType_Gpencil_WeightAngle;
extern GpencilModifierTypeInfo modifierType_Gpencil_Lineart;
extern GpencilModifierTypeInfo modifierType_Gpencil_Dash;
diff --git a/source/blender/gpencil_modifiers/intern/MOD_gpencil_util.c b/source/blender/gpencil_modifiers/intern/MOD_gpencil_util.c
index 5eb1eeab780..df78ac8110e 100644
--- a/source/blender/gpencil_modifiers/intern/MOD_gpencil_util.c
+++ b/source/blender/gpencil_modifiers/intern/MOD_gpencil_util.c
@@ -63,7 +63,8 @@ void gpencil_modifier_type_init(GpencilModifierTypeInfo *types[])
INIT_GP_TYPE(Time);
INIT_GP_TYPE(Multiply);
INIT_GP_TYPE(Texture);
- INIT_GP_TYPE(Weight);
+ INIT_GP_TYPE(WeightAngle);
+ INIT_GP_TYPE(WeightProximity);
INIT_GP_TYPE(Lineart);
INIT_GP_TYPE(Dash);
#undef INIT_GP_TYPE
diff --git a/source/blender/gpencil_modifiers/intern/MOD_gpencillength.c b/source/blender/gpencil_modifiers/intern/MOD_gpencillength.c
index 6aa0e6c152e..80b60547e92 100644
--- a/source/blender/gpencil_modifiers/intern/MOD_gpencillength.c
+++ b/source/blender/gpencil_modifiers/intern/MOD_gpencillength.c
@@ -72,9 +72,14 @@ static void copyData(const GpencilModifierData *md, GpencilModifierData *target)
}
static bool gpencil_modify_stroke(bGPDstroke *gps,
- float length,
+ const float length,
const float overshoot_fac,
- const short len_mode)
+ const short len_mode,
+ const bool use_curvature,
+ const int extra_point_count,
+ const float segment_influence,
+ const float max_angle,
+ const bool invert_curvature)
{
bool changed = false;
if (length == 0.0f) {
@@ -82,10 +87,18 @@ static bool gpencil_modify_stroke(bGPDstroke *gps,
}
if (length > 0.0f) {
- BKE_gpencil_stroke_stretch(gps, length, overshoot_fac, len_mode);
+ changed = BKE_gpencil_stroke_stretch(gps,
+ length,
+ overshoot_fac,
+ len_mode,
+ use_curvature,
+ extra_point_count,
+ segment_influence,
+ max_angle,
+ invert_curvature);
}
else {
- changed |= BKE_gpencil_stroke_shrink(gps, fabs(length), len_mode);
+ changed = BKE_gpencil_stroke_shrink(gps, fabs(length), len_mode);
}
return changed;
@@ -96,12 +109,51 @@ static void applyLength(LengthGpencilModifierData *lmd, bGPdata *gpd, bGPDstroke
bool changed = false;
const float len = (lmd->mode == GP_LENGTH_ABSOLUTE) ? 1.0f :
BKE_gpencil_stroke_length(gps, true);
+ const int totpoints = gps->totpoints;
if (len < FLT_EPSILON) {
return;
}
- changed |= gpencil_modify_stroke(gps, len * lmd->start_fac, lmd->overshoot_fac, 1);
- changed |= gpencil_modify_stroke(gps, len * lmd->end_fac, lmd->overshoot_fac, 2);
+ /* Always do the stretching first since it might depend on points which could be deleted by the
+ * shrink. */
+ float first_fac = lmd->start_fac;
+ int first_mode = 1;
+ float second_fac = lmd->end_fac;
+ int second_mode = 2;
+ if (first_fac < 0) {
+ SWAP(float, first_fac, second_fac);
+ SWAP(int, first_mode, second_mode);
+ }
+
+ const int first_extra_point_count = ceil(first_fac * lmd->point_density);
+ const int second_extra_point_count = ceil(second_fac * lmd->point_density);
+
+ changed |= gpencil_modify_stroke(gps,
+ len * first_fac,
+ lmd->overshoot_fac,
+ first_mode,
+ lmd->flag & GP_LENGTH_USE_CURVATURE,
+ first_extra_point_count,
+ lmd->segment_influence,
+ lmd->max_angle,
+ lmd->flag & GP_LENGTH_INVERT_CURVATURE);
+ /* HACK: The second #overshoot_fac needs to be adjusted because the two stretches are done in
+ * separate calls and the stroke length can differ between them. The adjustment must be stable
+ * when `ceil(overshoot_fac*(gps->totpoints - 2))` is used in stretch and must never produce a
+ * result higher than `totpoints - 2`. */
+ const float second_overshoot_fac = lmd->overshoot_fac * (totpoints - 2) /
+ ((float)gps->totpoints - 2) *
+ (1.0f - 0.1f / (totpoints - 1.0f));
+ changed |= gpencil_modify_stroke(gps,
+ len * second_fac,
+ second_overshoot_fac,
+ second_mode,
+ lmd->flag & GP_LENGTH_USE_CURVATURE,
+ second_extra_point_count,
+ lmd->segment_influence,
+ lmd->max_angle,
+ lmd->flag & GP_LENGTH_INVERT_CURVATURE);
if (changed) {
BKE_gpencil_stroke_geometry_update(gpd, gps);
@@ -117,20 +169,25 @@ static void deformStroke(GpencilModifierData *md,
{
bGPdata *gpd = ob->data;
LengthGpencilModifierData *lmd = (LengthGpencilModifierData *)md;
- if (is_stroke_affected_by_modifier(ob,
- lmd->layername,
- lmd->material,
- lmd->pass_index,
- lmd->layer_pass,
- 1,
- gpl,
- gps,
- lmd->flag & GP_LENGTH_INVERT_LAYER,
- lmd->flag & GP_LENGTH_INVERT_PASS,
- lmd->flag & GP_LENGTH_INVERT_LAYERPASS,
- lmd->flag & GP_LENGTH_INVERT_MATERIAL)) {
- applyLength(lmd, gpd, gps);
+ if (!is_stroke_affected_by_modifier(ob,
+ lmd->layername,
+ lmd->material,
+ lmd->pass_index,
+ lmd->layer_pass,
+ 1,
+ gpl,
+ gps,
+ lmd->flag & GP_LENGTH_INVERT_LAYER,
+ lmd->flag & GP_LENGTH_INVERT_PASS,
+ lmd->flag & GP_LENGTH_INVERT_LAYERPASS,
+ lmd->flag & GP_LENGTH_INVERT_MATERIAL)) {
+ return;
}
+ if ((gps->flag & GP_STROKE_CYCLIC) != 0) {
+ /* Don't affect cyclic strokes as they have no start/end. */
+ return;
+ }
+ applyLength(lmd, gpd, gps);
}
static void bakeModifier(Main *UNUSED(bmain),
@@ -168,10 +225,16 @@ static void panel_draw(const bContext *UNUSED(C), Panel *panel)
uiLayout *col = uiLayoutColumn(layout, true);
- uiItemR(col, ptr, "start_factor", 0, IFACE_("Start"), ICON_NONE);
- uiItemR(col, ptr, "end_factor", 0, IFACE_("End"), ICON_NONE);
+ if (RNA_enum_get(ptr, "mode") == GP_LENGTH_RELATIVE) {
+ uiItemR(col, ptr, "start_factor", 0, IFACE_("Start"), ICON_NONE);
+ uiItemR(col, ptr, "end_factor", 0, IFACE_("End"), ICON_NONE);
+ }
+ else {
+ uiItemR(col, ptr, "start_length", 0, IFACE_("Start"), ICON_NONE);
+ uiItemR(col, ptr, "end_length", 0, IFACE_("End"), ICON_NONE);
+ }
- uiItemR(layout, ptr, "overshoot_factor", UI_ITEM_R_SLIDER, IFACE_("Overshoot"), ICON_NONE);
+ uiItemR(layout, ptr, "overshoot_factor", UI_ITEM_R_SLIDER, IFACE_("Used Length"), ICON_NONE);
gpencil_modifier_panel_end(layout, ptr);
}
@@ -181,11 +244,40 @@ static void mask_panel_draw(const bContext *UNUSED(C), Panel *panel)
gpencil_modifier_masking_panel_draw(panel, true, false);
}
+static void curvature_header_draw(const bContext *UNUSED(C), Panel *panel)
+{
+ uiLayout *layout = panel->layout;
+
+ PointerRNA *ptr = gpencil_modifier_panel_get_property_pointers(panel, NULL);
+
+ uiItemR(layout, ptr, "use_curvature", 0, IFACE_("Curvature"), ICON_NONE);
+}
+
+static void curvature_panel_draw(const bContext *UNUSED(C), Panel *panel)
+{
+ uiLayout *layout = panel->layout;
+
+ PointerRNA *ptr = gpencil_modifier_panel_get_property_pointers(panel, NULL);
+
+ uiLayoutSetPropSep(layout, true);
+
+ uiLayout *col = uiLayoutColumn(layout, false);
+
+ uiLayoutSetActive(col, RNA_boolean_get(ptr, "use_curvature"));
+
+ uiItemR(col, ptr, "point_density", 0, NULL, ICON_NONE);
+ uiItemR(col, ptr, "segment_influence", 0, NULL, ICON_NONE);
+ uiItemR(col, ptr, "max_angle", 0, NULL, ICON_NONE);
+ uiItemR(col, ptr, "invert_curvature", 0, IFACE_("Invert"), ICON_NONE);
+}
+
static void panelRegister(ARegionType *region_type)
{
PanelType *panel_type = gpencil_modifier_panel_register(
region_type, eGpencilModifierType_Length, panel_draw);
gpencil_modifier_subpanel_register(
+ region_type, "curvature", "", curvature_header_draw, curvature_panel_draw, panel_type);
+ gpencil_modifier_subpanel_register(
region_type, "mask", "Influence", NULL, mask_panel_draw, panel_type);
}
diff --git a/source/blender/gpencil_modifiers/intern/MOD_gpencilweight_angle.c b/source/blender/gpencil_modifiers/intern/MOD_gpencilweight_angle.c
new file mode 100644
index 00000000000..2c0f3d2d8ad
--- /dev/null
+++ b/source/blender/gpencil_modifiers/intern/MOD_gpencilweight_angle.c
@@ -0,0 +1,260 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * The Original Code is Copyright (C) 2021, Blender Foundation
+ * This is a new part of Blender
+ */
+
+/** \file
+ * \ingroup modifiers
+ */
+
+#include <stdio.h>
+
+#include "BLI_listbase.h"
+#include "BLI_math.h"
+#include "BLI_utildefines.h"
+
+#include "DNA_defaults.h"
+#include "DNA_gpencil_modifier_types.h"
+#include "DNA_gpencil_types.h"
+#include "DNA_meshdata_types.h"
+#include "DNA_object_types.h"
+#include "DNA_screen_types.h"
+
+#include "BKE_colortools.h"
+#include "BKE_context.h"
+#include "BKE_deform.h"
+#include "BKE_gpencil.h"
+#include "BKE_gpencil_modifier.h"
+#include "BKE_lib_query.h"
+#include "BKE_modifier.h"
+#include "BKE_screen.h"
+
+#include "DEG_depsgraph.h"
+#include "DEG_depsgraph_build.h"
+#include "DEG_depsgraph_query.h"
+
+#include "UI_interface.h"
+#include "UI_resources.h"
+
+#include "RNA_access.h"
+
+#include "MOD_gpencil_modifiertypes.h"
+#include "MOD_gpencil_ui_common.h"
+#include "MOD_gpencil_util.h"
+
+static void initData(GpencilModifierData *md)
+{
+ WeightAngleGpencilModifierData *gpmd = (WeightAngleGpencilModifierData *)md;
+
+ BLI_assert(MEMCMP_STRUCT_AFTER_IS_ZERO(gpmd, modifier));
+
+ MEMCPY_STRUCT_AFTER(gpmd, DNA_struct_default_get(WeightAngleGpencilModifierData), modifier);
+}
+
+static void copyData(const GpencilModifierData *md, GpencilModifierData *target)
+{
+ BKE_gpencil_modifier_copydata_generic(md, target);
+}
+
+/* Generate vertex weights from the angle of stroke segments. */
+static void deformStroke(GpencilModifierData *md,
+ Depsgraph *UNUSED(depsgraph),
+ Object *ob,
+ bGPDlayer *gpl,
+ bGPDframe *UNUSED(gpf),
+ bGPDstroke *gps)
+{
+ WeightAngleGpencilModifierData *mmd = (WeightAngleGpencilModifierData *)md;
+ const int def_nr = BKE_object_defgroup_name_index(ob, mmd->vgname);
+
+ if (!is_stroke_affected_by_modifier(ob,
+ mmd->layername,
+ mmd->material,
+ mmd->pass_index,
+ mmd->layer_pass,
+ 1,
+ gpl,
+ gps,
+ mmd->flag & GP_WEIGHT_INVERT_LAYER,
+ mmd->flag & GP_WEIGHT_INVERT_PASS,
+ mmd->flag & GP_WEIGHT_INVERT_LAYERPASS,
+ mmd->flag & GP_WEIGHT_INVERT_MATERIAL)) {
+ return;
+ }
+
+ const int target_def_nr = BKE_object_defgroup_name_index(ob, mmd->target_vgname);
+
+ if (target_def_nr == -1) {
+ return;
+ }
+
+ /* Use default Z up. */
+ float vec_axis[3] = {0.0f, 0.0f, 1.0f};
+ float axis[3] = {0.0f, 0.0f, 0.0f};
+ axis[mmd->axis] = 1.0f;
+ float vec_ref[3];
+ /* Apply modifier rotation (subtract 90 degrees for the Y axis due to the Z-Up vector). */
+ float rot_angle = mmd->angle - ((mmd->axis == 1) ? M_PI_2 : 0.0f);
+ rotate_normalized_v3_v3v3fl(vec_ref, vec_axis, axis, rot_angle);
+
+ /* Apply the rotation of the object. */
+ if (mmd->space == GP_SPACE_LOCAL) {
+ mul_mat3_m4_v3(ob->obmat, vec_ref);
+ }
+
+ /* Ensure there is a vertex group. */
+ BKE_gpencil_dvert_ensure(gps);
+
+ float weight_pt = 1.0f;
+ for (int i = 0; i < gps->totpoints; i++) {
+ MDeformVert *dvert = gps->dvert != NULL ? &gps->dvert[i] : NULL;
+ /* Verify point is part of vertex group. */
+ float weight = get_modifier_point_weight(
+ dvert, (mmd->flag & GP_WEIGHT_INVERT_VGROUP) != 0, def_nr);
+ if (weight < 0.0f) {
+ continue;
+ }
+
+ /* Special case for single points. */
+ if (gps->totpoints == 1) {
+ weight_pt = 1.0f;
+ break;
+ }
+
+ bGPDspoint *pt1 = (i > 0) ? &gps->points[i] : &gps->points[i + 1];
+ bGPDspoint *pt2 = (i > 0) ? &gps->points[i - 1] : &gps->points[i];
+ float fpt1[3], fpt2[3];
+ mul_v3_m4v3(fpt1, ob->obmat, &pt1->x);
+ mul_v3_m4v3(fpt2, ob->obmat, &pt2->x);
+
+ float vec[3];
+ sub_v3_v3v3(vec, fpt1, fpt2);
+ float angle = angle_on_axis_v3v3_v3(vec_ref, vec, axis);
+ /* Use sin to get a value between 0 and 1. */
+ weight_pt = 1.0f - sin(angle);
+
+ /* Invert weight if required. */
+ if (mmd->flag & GP_WEIGHT_INVERT_OUTPUT) {
+ weight_pt = 1.0f - weight_pt;
+ }
+ /* Assign weight. */
+ dvert = gps->dvert != NULL ? &gps->dvert[i] : NULL;
+ if (dvert != NULL) {
+ MDeformWeight *dw = BKE_defvert_ensure_index(dvert, target_def_nr);
+ if (dw) {
+ dw->weight = (mmd->flag & GP_WEIGHT_MULTIPLY_DATA) ? dw->weight * weight_pt : weight_pt;
+ CLAMP(dw->weight, mmd->min_weight, 1.0f);
+ }
+ }
+ }
+}
+
+static void bakeModifier(struct Main *UNUSED(bmain),
+ Depsgraph *depsgraph,
+ GpencilModifierData *md,
+ Object *ob)
+{
+ bGPdata *gpd = ob->data;
+
+ LISTBASE_FOREACH (bGPDlayer *, gpl, &gpd->layers) {
+ LISTBASE_FOREACH (bGPDframe *, gpf, &gpl->frames) {
+ LISTBASE_FOREACH (bGPDstroke *, gps, &gpf->strokes) {
+ deformStroke(md, depsgraph, ob, gpl, gpf, gps);
+ }
+ }
+ }
+}
+
+static void foreachIDLink(GpencilModifierData *md, Object *ob, IDWalkFunc walk, void *userData)
+{
+ WeightAngleGpencilModifierData *mmd = (WeightAngleGpencilModifierData *)md;
+
+ walk(userData, ob, (ID **)&mmd->material, IDWALK_CB_USER);
+}
+
+static bool isDisabled(GpencilModifierData *md, int UNUSED(userRenderParams))
+{
+ WeightAngleGpencilModifierData *mmd = (WeightAngleGpencilModifierData *)md;
+
+ return (mmd->target_vgname[0] == '\0');
+}
+
+static void panel_draw(const bContext *UNUSED(C), Panel *panel)
+{
+ uiLayout *row, *sub;
+ uiLayout *layout = panel->layout;
+
+ PointerRNA ob_ptr;
+ PointerRNA *ptr = gpencil_modifier_panel_get_property_pointers(panel, &ob_ptr);
+
+ uiLayoutSetPropSep(layout, true);
+ row = uiLayoutRow(layout, true);
+ uiItemPointerR(row, ptr, "target_vertex_group", &ob_ptr, "vertex_groups", NULL, ICON_NONE);
+ sub = uiLayoutRow(row, true);
+ bool has_output = RNA_string_length(ptr, "target_vertex_group") != 0;
+ uiLayoutSetPropDecorate(sub, false);
+ uiLayoutSetActive(sub, has_output);
+ uiItemR(sub, ptr, "use_invert_output", 0, "", ICON_ARROW_LEFTRIGHT);
+
+ uiItemR(layout, ptr, "angle", 0, NULL, ICON_NONE);
+ uiItemR(layout, ptr, "axis", 0, NULL, ICON_NONE);
+ uiItemR(layout, ptr, "space", 0, NULL, ICON_NONE);
+
+ uiItemR(layout, ptr, "minimum_weight", 0, NULL, ICON_NONE);
+ uiItemR(layout, ptr, "use_multiply", 0, NULL, ICON_NONE);
+
+ gpencil_modifier_panel_end(layout, ptr);
+}
+
+static void mask_panel_draw(const bContext *UNUSED(C), Panel *panel)
+{
+ gpencil_modifier_masking_panel_draw(panel, true, true);
+}
+
+static void panelRegister(ARegionType *region_type)
+{
+ PanelType *panel_type = gpencil_modifier_panel_register(
+ region_type, eGpencilModifierType_WeightAngle, panel_draw);
+
+ gpencil_modifier_subpanel_register(
+ region_type, "mask", "Influence", NULL, mask_panel_draw, panel_type);
+}
+
+GpencilModifierTypeInfo modifierType_Gpencil_WeightAngle = {
+ /* name */ "Vertex Weight Angle",
+ /* structName */ "WeightAngleGpencilModifierData",
+ /* structSize */ sizeof(WeightAngleGpencilModifierData),
+ /* type */ eGpencilModifierTypeType_Gpencil,
+ /* flags */ 0,
+
+ /* copyData */ copyData,
+
+ /* deformStroke */ deformStroke,
+ /* generateStrokes */ NULL,
+ /* bakeModifier */ bakeModifier,
+ /* remapTime */ NULL,
+
+ /* initData */ initData,
+ /* freeData */ NULL,
+ /* isDisabled */ isDisabled,
+ /* updateDepsgraph */ NULL,
+ /* dependsOnTime */ NULL,
+ /* foreachIDLink */ foreachIDLink,
+ /* foreachTexLink */ NULL,
+ /* panelRegister */ panelRegister,
+};
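Note: the angle-to-weight mapping in deformStroke above reduces to a few lines of arithmetic. A standalone sketch of just that core, with the BLI/BKE plumbing stripped away (the helper name is illustrative, not code from the patch):

#include <math.h>
#include <stdbool.h>

/* Convert the angle between a stroke segment and the reference vector
 * (angle_on_axis_v3v3_v3 in the real code) into a vertex weight. */
static float angle_to_weight(float angle, bool invert_output, float min_weight)
{
  /* 1 - sin(): segments parallel to the reference axis (0 or 180 degrees)
   * get weight 1, perpendicular segments (90 degrees) get weight 0. */
  float w = 1.0f - sinf(angle);
  if (invert_output) {
    w = 1.0f - w;
  }
  /* Mirrors CLAMP(dw->weight, mmd->min_weight, 1.0f). */
  return fmaxf(min_weight, fminf(w, 1.0f));
}

For example, a segment at 45 degrees to the reference axis gets 1 - sin(45°) ≈ 0.29 before clamping.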
diff --git a/source/blender/gpencil_modifiers/intern/MOD_gpencilweight.c b/source/blender/gpencil_modifiers/intern/MOD_gpencilweight_proximity.c
index 686023a36d4..0885828a3a0 100644
--- a/source/blender/gpencil_modifiers/intern/MOD_gpencilweight.c
+++ b/source/blender/gpencil_modifiers/intern/MOD_gpencilweight_proximity.c
@@ -58,11 +58,11 @@
static void initData(GpencilModifierData *md)
{
- WeightGpencilModifierData *gpmd = (WeightGpencilModifierData *)md;
+ WeightProxGpencilModifierData *gpmd = (WeightProxGpencilModifierData *)md;
BLI_assert(MEMCMP_STRUCT_AFTER_IS_ZERO(gpmd, modifier));
- MEMCPY_STRUCT_AFTER(gpmd, DNA_struct_default_get(WeightGpencilModifierData), modifier);
+ MEMCPY_STRUCT_AFTER(gpmd, DNA_struct_default_get(WeightProxGpencilModifierData), modifier);
}
static void copyData(const GpencilModifierData *md, GpencilModifierData *target)
@@ -72,7 +72,7 @@ static void copyData(const GpencilModifierData *md, GpencilModifierData *target)
/* Calc distance between point and target object. */
static float calc_point_weight_by_distance(Object *ob,
- WeightGpencilModifierData *mmd,
+ WeightProxGpencilModifierData *mmd,
const float dist_max,
const float dist_min,
bGPDspoint *pt)
@@ -103,9 +103,8 @@ static void deformStroke(GpencilModifierData *md,
bGPDframe *UNUSED(gpf),
bGPDstroke *gps)
{
- WeightGpencilModifierData *mmd = (WeightGpencilModifierData *)md;
+ WeightProxGpencilModifierData *mmd = (WeightProxGpencilModifierData *)md;
const int def_nr = BKE_object_defgroup_name_index(ob, mmd->vgname);
- const eWeightGpencilModifierMode mode = mmd->mode;
if (!is_stroke_affected_by_modifier(ob,
mmd->layername,
@@ -130,20 +129,6 @@ static void deformStroke(GpencilModifierData *md,
return;
}
- /* Use default Z up. */
- float vec_axis[3] = {0.0f, 0.0f, 1.0f};
- float axis[3] = {0.0f, 0.0f, 0.0f};
- axis[mmd->axis] = 1.0f;
- float vec_ref[3];
- /* Apply modifier rotation (sub 90 degrees for Y axis due Z-Up vector). */
- float rot_angle = mmd->angle - ((mmd->axis == 1) ? M_PI_2 : 0.0f);
- rotate_normalized_v3_v3v3fl(vec_ref, vec_axis, axis, rot_angle);
-
- /* Apply the rotation of the object. */
- if (mmd->space == GP_SPACE_LOCAL) {
- mul_mat3_m4_v3(ob->obmat, vec_ref);
- }
-
/* Ensure there is a vertex group. */
BKE_gpencil_dvert_ensure(gps);
@@ -157,36 +142,9 @@ static void deformStroke(GpencilModifierData *md,
continue;
}
- switch (mode) {
- case GP_WEIGHT_MODE_DISTANCE: {
- if (mmd->object) {
- bGPDspoint *pt = &gps->points[i];
- weight_pt = calc_point_weight_by_distance(ob, mmd, dist_max, dist_min, pt);
- }
- break;
- }
- case GP_WEIGHT_MODE_ANGLE: {
- /* Special case for single points. */
- if (gps->totpoints == 1) {
- weight_pt = 1.0f;
- break;
- }
-
- bGPDspoint *pt1 = (i > 0) ? &gps->points[i] : &gps->points[i + 1];
- bGPDspoint *pt2 = (i > 0) ? &gps->points[i - 1] : &gps->points[i];
- float fpt1[3], fpt2[3];
- mul_v3_m4v3(fpt1, ob->obmat, &pt1->x);
- mul_v3_m4v3(fpt2, ob->obmat, &pt2->x);
-
- float vec[3];
- sub_v3_v3v3(vec, fpt1, fpt2);
- float angle = angle_on_axis_v3v3_v3(vec_ref, vec, axis);
- /* Use sin to get a value between 0 and 1. */
- weight_pt = 1.0f - sin(angle);
- break;
- }
- default:
- break;
+ if (mmd->object) {
+ bGPDspoint *pt = &gps->points[i];
+ weight_pt = calc_point_weight_by_distance(ob, mmd, dist_max, dist_min, pt);
}
/* Invert weight if required. */
@@ -198,7 +156,7 @@ static void deformStroke(GpencilModifierData *md,
if (dvert != NULL) {
MDeformWeight *dw = BKE_defvert_ensure_index(dvert, target_def_nr);
if (dw) {
- dw->weight = (mmd->flag & GP_WEIGHT_BLEND_DATA) ? dw->weight * weight_pt : weight_pt;
+ dw->weight = (mmd->flag & GP_WEIGHT_MULTIPLY_DATA) ? dw->weight * weight_pt : weight_pt;
CLAMP(dw->weight, mmd->min_weight, 1.0f);
}
}
@@ -223,7 +181,7 @@ static void bakeModifier(struct Main *UNUSED(bmain),
static void foreachIDLink(GpencilModifierData *md, Object *ob, IDWalkFunc walk, void *userData)
{
- WeightGpencilModifierData *mmd = (WeightGpencilModifierData *)md;
+ WeightProxGpencilModifierData *mmd = (WeightProxGpencilModifierData *)md;
walk(userData, ob, (ID **)&mmd->material, IDWALK_CB_USER);
walk(userData, ob, (ID **)&mmd->object, IDWALK_CB_NOP);
@@ -233,7 +191,7 @@ static void updateDepsgraph(GpencilModifierData *md,
const ModifierUpdateDepsgraphContext *ctx,
const int UNUSED(mode))
{
- WeightGpencilModifierData *mmd = (WeightGpencilModifierData *)md;
+ WeightProxGpencilModifierData *mmd = (WeightProxGpencilModifierData *)md;
if (mmd->object != NULL) {
DEG_add_object_relation(
ctx->node, mmd->object, DEG_OB_COMP_TRANSFORM, "GPencil Weight Modifier");
@@ -244,54 +202,36 @@ static void updateDepsgraph(GpencilModifierData *md,
static bool isDisabled(GpencilModifierData *md, int UNUSED(userRenderParams))
{
- WeightGpencilModifierData *mmd = (WeightGpencilModifierData *)md;
-
- return (mmd->target_vgname[0] == '\0');
-}
-
-static void distance_panel_draw(const bContext *UNUSED(C), Panel *panel)
-{
- PointerRNA *ptr = gpencil_modifier_panel_get_property_pointers(panel, NULL);
-
- uiLayout *layout = panel->layout;
- uiLayoutSetPropSep(layout, true);
+ WeightProxGpencilModifierData *mmd = (WeightProxGpencilModifierData *)md;
- uiItemR(layout, ptr, "object", 0, NULL, ICON_CUBE);
- uiLayout *sub = uiLayoutColumn(layout, true);
- uiItemR(sub, ptr, "distance_start", 0, NULL, ICON_NONE);
- uiItemR(sub, ptr, "distance_end", 0, "End", ICON_NONE);
+ return ((mmd->target_vgname[0] == '\0') || (mmd->object == NULL));
}
-static void panel_draw(const bContext *C, Panel *panel)
+static void panel_draw(const bContext *UNUSED(C), Panel *panel)
{
+ uiLayout *row, *sub;
uiLayout *layout = panel->layout;
PointerRNA ob_ptr;
PointerRNA *ptr = gpencil_modifier_panel_get_property_pointers(panel, &ob_ptr);
uiLayoutSetPropSep(layout, true);
- uiItemR(layout, ptr, "mode", 0, NULL, ICON_NONE);
+ row = uiLayoutRow(layout, true);
+ uiItemPointerR(row, ptr, "target_vertex_group", &ob_ptr, "vertex_groups", NULL, ICON_NONE);
+ sub = uiLayoutRow(row, true);
+ bool has_output = RNA_string_length(ptr, "target_vertex_group") != 0;
+ uiLayoutSetPropDecorate(sub, false);
+ uiLayoutSetActive(sub, has_output);
+ uiItemR(sub, ptr, "use_invert_output", 0, "", ICON_ARROW_LEFTRIGHT);
- const eWeightGpencilModifierMode mode = RNA_enum_get(ptr, "mode");
+ uiItemR(layout, ptr, "object", 0, NULL, ICON_NONE);
- uiItemPointerR(layout, ptr, "target_vertex_group", &ob_ptr, "vertex_groups", NULL, ICON_NONE);
+ sub = uiLayoutColumn(layout, true);
+ uiItemR(sub, ptr, "distance_start", 0, NULL, ICON_NONE);
+ uiItemR(sub, ptr, "distance_end", 0, NULL, ICON_NONE);
uiItemR(layout, ptr, "minimum_weight", 0, NULL, ICON_NONE);
- uiItemR(layout, ptr, "use_invert_output", 0, NULL, ICON_NONE);
- uiItemR(layout, ptr, "use_blend", 0, NULL, ICON_NONE);
-
- switch (mode) {
- case GP_WEIGHT_MODE_DISTANCE:
- distance_panel_draw(C, panel);
- break;
- case GP_WEIGHT_MODE_ANGLE:
- uiItemR(layout, ptr, "angle", 0, NULL, ICON_NONE);
- uiItemR(layout, ptr, "axis", 0, NULL, ICON_NONE);
- uiItemR(layout, ptr, "space", 0, NULL, ICON_NONE);
- break;
- default:
- break;
- }
+ uiItemR(layout, ptr, "use_multiply", 0, NULL, ICON_NONE);
gpencil_modifier_panel_end(layout, ptr);
}
@@ -304,16 +244,16 @@ static void mask_panel_draw(const bContext *UNUSED(C), Panel *panel)
static void panelRegister(ARegionType *region_type)
{
PanelType *panel_type = gpencil_modifier_panel_register(
- region_type, eGpencilModifierType_Weight, panel_draw);
+ region_type, eGpencilModifierType_WeightProximity, panel_draw);
gpencil_modifier_subpanel_register(
region_type, "mask", "Influence", NULL, mask_panel_draw, panel_type);
}
-GpencilModifierTypeInfo modifierType_Gpencil_Weight = {
- /* name */ "Vertex Weight",
- /* structName */ "WeightGpencilModifierData",
- /* structSize */ sizeof(WeightGpencilModifierData),
+GpencilModifierTypeInfo modifierType_Gpencil_WeightProximity = {
+ /* name */ "Vertex Weight Proximity",
+ /* structName */ "WeightProxGpencilModifierData",
+ /* structSize */ sizeof(WeightProxGpencilModifierData),
/* type */ eGpencilModifierTypeType_Gpencil,
/* flags */ 0,
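Note: the proximity variant delegates the per-point falloff to calc_point_weight_by_distance, whose body is untouched by this rename and therefore not shown in the hunks above. A hypothetical sketch of that style of distance remap, for orientation only — the ramp direction and clamping here are assumptions, not code from the patch:

#include <math.h>

/* Hypothetical: full weight at or inside dist_min, zero at or beyond
 * dist_max, with a linear ramp in between. */
static float distance_to_weight(float dist, float dist_min, float dist_max)
{
  if (dist_max <= dist_min) {
    return (dist <= dist_min) ? 1.0f : 0.0f;
  }
  const float t = fmaxf(0.0f, fminf((dist - dist_min) / (dist_max - dist_min), 1.0f));
  return 1.0f - t;
}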
diff --git a/source/blender/gpu/GPU_material.h b/source/blender/gpu/GPU_material.h
index 312da491a36..e64521768f9 100644
--- a/source/blender/gpu/GPU_material.h
+++ b/source/blender/gpu/GPU_material.h
@@ -175,10 +175,7 @@ GPUNodeLink *GPU_uniformbuf_link_out(struct GPUMaterial *mat,
void GPU_material_output_link(GPUMaterial *material, GPUNodeLink *link);
void GPU_material_add_output_link_aov(GPUMaterial *material, GPUNodeLink *link, int hash);
-void GPU_material_sss_profile_create(GPUMaterial *material,
- float radii[3],
- const short *falloff_type,
- const float *sharpness);
+void GPU_material_sss_profile_create(GPUMaterial *material, float radii[3]);
struct GPUUniformBuf *GPU_material_sss_profile_get(GPUMaterial *material,
int sample_len,
struct GPUTexture **tex_profile);
diff --git a/source/blender/gpu/intern/gpu_material.c b/source/blender/gpu/intern/gpu_material.c
index 56e72fbeca9..6872a08e854 100644
--- a/source/blender/gpu/intern/gpu_material.c
+++ b/source/blender/gpu/intern/gpu_material.c
@@ -96,8 +96,6 @@ struct GPUMaterial {
float sss_enabled;
float sss_radii[3];
int sss_samples;
- short int sss_falloff;
- float sss_sharpness;
bool sss_dirty;
GPUTexture *coba_tex; /* 1D Texture array containing all color bands. */
@@ -266,18 +264,6 @@ static void sss_calculate_offsets(GPUSssKernelData *kd, int count, float exponen
}
}
-#define GAUSS_TRUNCATE 12.46f
-static float gaussian_profile(float r, float radius)
-{
- const float v = radius * radius * (0.25f * 0.25f);
- const float Rm = sqrtf(v * GAUSS_TRUNCATE);
-
- if (r >= Rm) {
- return 0.0f;
- }
- return expf(-r * r / (2.0f * v)) / (2.0f * M_PI * v);
-}
-
#define BURLEY_TRUNCATE 16.0f
#define BURLEY_TRUNCATE_CDF 0.9963790093708328f // cdf(BURLEY_TRUNCATE)
static float burley_profile(float r, float d)
@@ -287,45 +273,15 @@ static float burley_profile(float r, float d)
return (exp_r_d + exp_r_3_d) / (4.0f * d);
}
-static float cubic_profile(float r, float radius, float sharpness)
-{
- float Rm = radius * (1.0f + sharpness);
-
- if (r >= Rm) {
- return 0.0f;
- }
- /* custom variation with extra sharpness, to match the previous code */
- const float y = 1.0f / (1.0f + sharpness);
- float Rmy, ry, ryinv;
-
- Rmy = powf(Rm, y);
- ry = powf(r, y);
- ryinv = (r > 0.0f) ? powf(r, y - 1.0f) : 0.0f;
-
- const float Rmy5 = (Rmy * Rmy) * (Rmy * Rmy) * Rmy;
- const float f = Rmy - ry;
- const float num = f * (f * f) * (y * ryinv);
-
- return (10.0f * num) / (Rmy5 * M_PI);
-}
-
-static float eval_profile(float r, short falloff_type, float sharpness, float param)
+static float eval_profile(float r, float param)
{
r = fabsf(r);
-
- if (ELEM(falloff_type, SHD_SUBSURFACE_BURLEY, SHD_SUBSURFACE_RANDOM_WALK)) {
- return burley_profile(r, param) / BURLEY_TRUNCATE_CDF;
- }
- if (falloff_type == SHD_SUBSURFACE_CUBIC) {
- return cubic_profile(r, param, sharpness);
- }
-
- return gaussian_profile(r, param);
+ return burley_profile(r, param) / BURLEY_TRUNCATE_CDF;
}
/* Resolution for each sample of the precomputed kernel profile */
#define INTEGRAL_RESOLUTION 32
-static float eval_integral(float x0, float x1, short falloff_type, float sharpness, float param)
+static float eval_integral(float x0, float x1, float param)
{
const float range = x1 - x0;
const float step = range / INTEGRAL_RESOLUTION;
@@ -333,7 +289,7 @@ static float eval_integral(float x0, float x1, short falloff_type, float sharpne
for (int i = 0; i < INTEGRAL_RESOLUTION; i++) {
float x = x0 + range * ((float)i + 0.5f) / (float)INTEGRAL_RESOLUTION;
- float y = eval_profile(x, falloff_type, sharpness, param);
+ float y = eval_profile(x, param);
integral += y * step;
}
@@ -341,8 +297,7 @@ static float eval_integral(float x0, float x1, short falloff_type, float sharpne
}
#undef INTEGRAL_RESOLUTION
-static void compute_sss_kernel(
- GPUSssKernelData *kd, const float radii[3], int sample_len, int falloff_type, float sharpness)
+static void compute_sss_kernel(GPUSssKernelData *kd, const float radii[3], int sample_len)
{
float rad[3];
/* Minimum radius */
@@ -353,27 +308,15 @@ static void compute_sss_kernel(
/* Christensen-Burley fitting */
float l[3], d[3];
- if (ELEM(falloff_type, SHD_SUBSURFACE_BURLEY, SHD_SUBSURFACE_RANDOM_WALK)) {
- mul_v3_v3fl(l, rad, 0.25f * M_1_PI);
- const float A = 1.0f;
- const float s = 1.9f - A + 3.5f * (A - 0.8f) * (A - 0.8f);
- /* XXX 0.6f Out of nowhere to match cycles! Empirical! Can be tweak better. */
- mul_v3_v3fl(d, l, 0.6f / s);
- mul_v3_v3fl(rad, d, BURLEY_TRUNCATE);
- kd->max_radius = MAX3(rad[0], rad[1], rad[2]);
-
- copy_v3_v3(kd->param, d);
- }
- else if (falloff_type == SHD_SUBSURFACE_CUBIC) {
- copy_v3_v3(kd->param, rad);
- mul_v3_fl(rad, 1.0f + sharpness);
- kd->max_radius = MAX3(rad[0], rad[1], rad[2]);
- }
- else {
- kd->max_radius = MAX3(rad[0], rad[1], rad[2]);
+ mul_v3_v3fl(l, rad, 0.25f * M_1_PI);
+ const float A = 1.0f;
+ const float s = 1.9f - A + 3.5f * (A - 0.8f) * (A - 0.8f);
+ /* XXX 0.6f out of nowhere to match Cycles! Empirical! Could be tuned better. */
+ mul_v3_v3fl(d, l, 0.6f / s);
+ mul_v3_v3fl(rad, d, BURLEY_TRUNCATE);
+ kd->max_radius = MAX3(rad[0], rad[1], rad[2]);
- copy_v3_v3(kd->param, rad);
- }
+ copy_v3_v3(kd->param, d);
/* Compute samples locations on the 1d kernel [-1..1] */
sss_calculate_offsets(kd, sample_len, SSS_EXPONENT);
@@ -403,9 +346,9 @@ static void compute_sss_kernel(
x0 *= kd->max_radius;
x1 *= kd->max_radius;
- kd->kernel[i][0] = eval_integral(x0, x1, falloff_type, sharpness, kd->param[0]);
- kd->kernel[i][1] = eval_integral(x0, x1, falloff_type, sharpness, kd->param[1]);
- kd->kernel[i][2] = eval_integral(x0, x1, falloff_type, sharpness, kd->param[2]);
+ kd->kernel[i][0] = eval_integral(x0, x1, kd->param[0]);
+ kd->kernel[i][1] = eval_integral(x0, x1, kd->param[1]);
+ kd->kernel[i][2] = eval_integral(x0, x1, kd->param[2]);
sum[0] += kd->kernel[i][0];
sum[1] += kd->kernel[i][1];
@@ -439,8 +382,6 @@ static void compute_sss_kernel(
#define INTEGRAL_RESOLUTION 512
static void compute_sss_translucence_kernel(const GPUSssKernelData *kd,
int resolution,
- short falloff_type,
- float sharpness,
float **output)
{
float(*texels)[4];
@@ -463,9 +404,9 @@ static void compute_sss_translucence_kernel(const GPUSssKernelData *kd,
float dist = hypotf(r + r_step * 0.5f, d);
float profile[3];
- profile[0] = eval_profile(dist, falloff_type, sharpness, kd->param[0]);
- profile[1] = eval_profile(dist, falloff_type, sharpness, kd->param[1]);
- profile[2] = eval_profile(dist, falloff_type, sharpness, kd->param[2]);
+ profile[0] = eval_profile(dist, kd->param[0]);
+ profile[1] = eval_profile(dist, kd->param[1]);
+ profile[2] = eval_profile(dist, kd->param[2]);
/* Since the profile and configuration are radially symmetrical we
* can just evaluate it once and weight it accordingly */
@@ -499,14 +440,9 @@ static void compute_sss_translucence_kernel(const GPUSssKernelData *kd,
}
#undef INTEGRAL_RESOLUTION
-void GPU_material_sss_profile_create(GPUMaterial *material,
- float radii[3],
- const short *falloff_type,
- const float *sharpness)
+void GPU_material_sss_profile_create(GPUMaterial *material, float radii[3])
{
copy_v3_v3(material->sss_radii, radii);
- material->sss_falloff = (falloff_type) ? *falloff_type : 0.0;
- material->sss_sharpness = (sharpness) ? *sharpness : 0.0;
material->sss_dirty = true;
material->sss_enabled = true;
@@ -527,20 +463,14 @@ struct GPUUniformBuf *GPU_material_sss_profile_get(GPUMaterial *material,
if (material->sss_dirty || (material->sss_samples != sample_len)) {
GPUSssKernelData kd;
- float sharpness = material->sss_sharpness;
-
- /* XXX Black magic but it seems to fit. Maybe because we integrate -1..1 */
- sharpness *= 0.5f;
-
- compute_sss_kernel(&kd, material->sss_radii, sample_len, material->sss_falloff, sharpness);
+ compute_sss_kernel(&kd, material->sss_radii, sample_len);
/* Update / Create UBO */
GPU_uniformbuf_update(material->sss_profile, &kd);
/* Update / Create Tex */
float *translucence_profile;
- compute_sss_translucence_kernel(
- &kd, 64, material->sss_falloff, sharpness, &translucence_profile);
+ compute_sss_translucence_kernel(&kd, 64, &translucence_profile);
if (material->sss_tex_profile != NULL) {
GPU_texture_free(material->sss_tex_profile);
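Note: with the Cubic and Gaussian falloffs deleted, the screen-space SSS kernel is always the Christensen-Burley fit. A standalone sketch of what the simplified eval_profile/eval_integral pair computes — the two exponential terms are assumptions matching the standard Burley approximation, since burley_profile's locals sit in unchanged context not shown here:

#include <math.h>

#define EXAMPLE_BURLEY_TRUNCATE_CDF 0.9963790093708328f

/* Burley diffusion profile, normalized by its truncated CDF as in
 * eval_profile above. */
static float example_profile(float r, float d)
{
  r = fabsf(r);
  const float exp_r_3_d = expf(-r / (3.0f * d));
  const float exp_r_d = exp_r_3_d * exp_r_3_d * exp_r_3_d; /* == expf(-r / d) */
  return ((exp_r_d + exp_r_3_d) / (4.0f * d)) / EXAMPLE_BURLEY_TRUNCATE_CDF;
}

/* 32-step midpoint rule over [x0, x1], the same shape as eval_integral. */
static float example_integral(float x0, float x1, float d)
{
  const int resolution = 32;
  const float step = (x1 - x0) / (float)resolution;
  float integral = 0.0f;
  for (int i = 0; i < resolution; i++) {
    const float x = x0 + (x1 - x0) * ((float)i + 0.5f) / (float)resolution;
    integral += example_profile(x, d) * step;
  }
  return integral;
}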
diff --git a/source/blender/gpu/intern/gpu_material_library.h b/source/blender/gpu/intern/gpu_material_library.h
index 782d89d6f2a..d3b12d3a2b7 100644
--- a/source/blender/gpu/intern/gpu_material_library.h
+++ b/source/blender/gpu/intern/gpu_material_library.h
@@ -27,7 +27,7 @@
#include "GPU_material.h"
#define MAX_FUNCTION_NAME 64
-#define MAX_PARAMETER 32
+#define MAX_PARAMETER 36
struct GSet;
diff --git a/source/blender/gpu/shaders/material/gpu_shader_material_principled.glsl b/source/blender/gpu/shaders/material/gpu_shader_material_principled.glsl
index d77259638fd..bba84c2be52 100644
--- a/source/blender/gpu/shaders/material/gpu_shader_material_principled.glsl
+++ b/source/blender/gpu/shaders/material/gpu_shader_material_principled.glsl
@@ -19,6 +19,8 @@ void node_bsdf_principled(vec4 base_color,
float subsurface,
vec3 subsurface_radius,
vec4 subsurface_color,
+ float subsurface_ior,
+ float subsurface_anisotropy,
float metallic,
float specular,
float specular_tint,
@@ -201,6 +203,6 @@ void node_bsdf_principled(vec4 base_color,
#else
/* clang-format off */
/* Stub principled because it is not compatible with volumetrics. */
-# define node_bsdf_principled(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, bb, cc, dd, result) (result = CLOSURE_DEFAULT)
+# define node_bsdf_principled(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, aa, bb, cc, dd, ee, ff, result) (result = CLOSURE_DEFAULT)
/* clang-format on */
#endif
diff --git a/source/blender/gpu/shaders/material/gpu_shader_material_subsurface_scattering.glsl b/source/blender/gpu/shaders/material/gpu_shader_material_subsurface_scattering.glsl
index 5129bf71903..d0c159cdf37 100644
--- a/source/blender/gpu/shaders/material/gpu_shader_material_subsurface_scattering.glsl
+++ b/source/blender/gpu/shaders/material/gpu_shader_material_subsurface_scattering.glsl
@@ -5,8 +5,8 @@ CLOSURE_EVAL_FUNCTION_DECLARE_1(node_subsurface_scattering, Diffuse)
void node_subsurface_scattering(vec4 color,
float scale,
vec3 radius,
- float sharpen,
- float texture_blur,
+ float ior,
+ float anisotropy,
vec3 N,
float sss_id,
out Closure result)
@@ -20,15 +20,7 @@ void node_subsurface_scattering(vec4 color,
result = CLOSURE_DEFAULT;
- /* Not perfect for texture_blur values between 0.0 and 1.0.
- * Interpolate between separated color and color applied on irradiance. */
- float one_minus_texture_blur = 1.0 - texture_blur;
- vec3 sss_albedo = color.rgb * one_minus_texture_blur + texture_blur;
- vec3 radiance_tint = color.rgb * texture_blur + one_minus_texture_blur;
- /* Consider output radiance as irradiance. */
- out_Diffuse_0.radiance *= radiance_tint;
-
- closure_load_sss_data(scale, out_Diffuse_0.radiance, sss_albedo, int(sss_id), result);
+ closure_load_sss_data(scale, out_Diffuse_0.radiance, color.rgb, int(sss_id), result);
/* TODO(fclem) Try to not use this. */
closure_load_ssr_data(vec3(0.0), 0.0, in_Diffuse_0.N, -1.0, result);
diff --git a/source/blender/makesdna/DNA_gpencil_modifier_defaults.h b/source/blender/makesdna/DNA_gpencil_modifier_defaults.h
index 450527c7443..2a3c6f4e3db 100644
--- a/source/blender/makesdna/DNA_gpencil_modifier_defaults.h
+++ b/source/blender/makesdna/DNA_gpencil_modifier_defaults.h
@@ -283,7 +283,7 @@
.colorband = NULL, \
}
-#define _DNA_DEFAULT_WeightGpencilModifierData \
+#define _DNA_DEFAULT_WeightProxGpencilModifierData \
{ \
.target_vgname = "", \
.material = NULL, \
@@ -291,12 +291,23 @@
.vgname = "", \
.pass_index = 0, \
.flag = 0, \
- .axis = 1, \
.layer_pass = 0, \
.dist_start = 0.0f, \
.dist_end = 20.0f, \
}
+#define _DNA_DEFAULT_WeightAngleGpencilModifierData \
+ { \
+ .target_vgname = "", \
+ .material = NULL, \
+ .layername = "", \
+ .vgname = "", \
+ .pass_index = 0, \
+ .flag = 0, \
+ .axis = 1, \
+ .layer_pass = 0, \
+ }
+
#define _DNA_DEFAULT_LineartGpencilModifierData \
{ \
.edge_types = LRT_EDGE_FLAG_ALL_TYPE, \
@@ -314,9 +325,13 @@
{ \
.start_fac = 0.1f,\
.end_fac = 0.1f,\
- .overshoot_fac = 0.01f,\
+ .overshoot_fac = 0.1f,\
.pass_index = 0,\
.material = NULL,\
+ .flag = GP_LENGTH_USE_CURVATURE,\
+ .point_density = 30.0f,\
+ .segment_influence = 0.0f,\
+ .max_angle = DEG2RAD(170.0f),\
}
#define _DNA_DEFAULT_DashGpencilModifierData \
diff --git a/source/blender/makesdna/DNA_gpencil_modifier_types.h b/source/blender/makesdna/DNA_gpencil_modifier_types.h
index d3429329ef6..8d967a38808 100644
--- a/source/blender/makesdna/DNA_gpencil_modifier_types.h
+++ b/source/blender/makesdna/DNA_gpencil_modifier_types.h
@@ -55,8 +55,9 @@ typedef enum GpencilModifierType {
eGpencilModifierType_Texture = 18,
eGpencilModifierType_Lineart = 19,
eGpencilModifierType_Length = 20,
- eGpencilModifierType_Weight = 21,
+ eGpencilModifierType_WeightProximity = 21,
eGpencilModifierType_Dash = 22,
+ eGpencilModifierType_WeightAngle = 23,
/* Keep last. */
NUM_GREASEPENCIL_MODIFIER_TYPES,
} GpencilModifierType;
@@ -493,7 +494,10 @@ typedef struct LengthGpencilModifierData {
float overshoot_fac;
/** Modifier mode. */
int mode;
- char _pad[4];
+ /* Curvature parameters. */
+ float point_density;
+ float segment_influence;
+ float max_angle;
} LengthGpencilModifierData;
typedef enum eLengthGpencil_Flag {
@@ -501,6 +505,8 @@ typedef enum eLengthGpencil_Flag {
GP_LENGTH_INVERT_PASS = (1 << 1),
GP_LENGTH_INVERT_LAYERPASS = (1 << 2),
GP_LENGTH_INVERT_MATERIAL = (1 << 3),
+ GP_LENGTH_USE_CURVATURE = (1 << 4),
+ GP_LENGTH_INVERT_CURVATURE = (1 << 5),
} eLengthGpencil_Flag;
typedef enum eLengthGpencil_Type {
@@ -891,7 +897,7 @@ typedef enum eTextureGpencil_Mode {
STROKE_AND_FILL = 2,
} eTextureGpencil_Mode;
-typedef struct WeightGpencilModifierData {
+typedef struct WeightProxGpencilModifierData {
GpencilModifierData modifier;
/** Target vertexgroup name, MAX_VGROUP_NAME. */
char target_vgname[64];
@@ -909,22 +915,39 @@ typedef struct WeightGpencilModifierData {
float min_weight;
/** Custom index for passes. */
int layer_pass;
- /** Calculation Mode. */
- short mode;
- /** Axis. */
- short axis;
- /** Angle */
- float angle;
/** Start/end distances. */
float dist_start;
float dist_end;
- /** Space (Local/World). */
- short space;
- char _pad[6];
/** Reference object */
struct Object *object;
-} WeightGpencilModifierData;
+} WeightProxGpencilModifierData;
+
+typedef struct WeightAngleGpencilModifierData {
+ GpencilModifierData modifier;
+ /** Target vertexgroup name, MAX_VGROUP_NAME. */
+ char target_vgname[64];
+ /** Material for filtering. */
+ struct Material *material;
+ /** Layer name. */
+ char layername[64];
+ /** Optional vertexgroup filter name, MAX_VGROUP_NAME. */
+ char vgname[64];
+ /** Custom index for passes. */
+ int pass_index;
+ /** Flags. */
+ int flag;
+ /** Minimum valid weight (clamp value). */
+ float min_weight;
+ /** Custom index for passes. */
+ int layer_pass;
+ /** Axis. */
+ short axis;
+ /** Space (Local/World). */
+ short space;
+ /** Angle */
+ float angle;
+} WeightAngleGpencilModifierData;
typedef enum eWeightGpencil_Flag {
GP_WEIGHT_INVERT_LAYER = (1 << 0),
@@ -932,15 +955,10 @@ typedef enum eWeightGpencil_Flag {
GP_WEIGHT_INVERT_VGROUP = (1 << 2),
GP_WEIGHT_INVERT_LAYERPASS = (1 << 3),
GP_WEIGHT_INVERT_MATERIAL = (1 << 4),
- GP_WEIGHT_BLEND_DATA = (1 << 5),
+ GP_WEIGHT_MULTIPLY_DATA = (1 << 5),
GP_WEIGHT_INVERT_OUTPUT = (1 << 6),
} eWeightGpencil_Flag;
-typedef enum eWeightGpencilModifierMode {
- GP_WEIGHT_MODE_DISTANCE = 0,
- GP_WEIGHT_MODE_ANGLE = 1,
-} eWeightGpencilModifierMode;
-
typedef enum eGpencilModifierSpace {
GP_SPACE_LOCAL = 0,
GP_SPACE_WORLD = 1,
diff --git a/source/blender/makesdna/DNA_layer_types.h b/source/blender/makesdna/DNA_layer_types.h
index 63e4597150c..520f989452c 100644
--- a/source/blender/makesdna/DNA_layer_types.h
+++ b/source/blender/makesdna/DNA_layer_types.h
@@ -68,7 +68,7 @@ typedef enum eViewLayerCryptomatteFlags {
VIEW_LAYER_CRYPTOMATTE_OBJECT = (1 << 0),
VIEW_LAYER_CRYPTOMATTE_MATERIAL = (1 << 1),
VIEW_LAYER_CRYPTOMATTE_ASSET = (1 << 2),
- VIEW_LAYER_CRYPTOMATTE_ACCURATE = (1 << 3),
+ /* VIEW_LAYER_CRYPTOMATTE_ACCURATE = (1 << 3), */ /* DEPRECATED */
} eViewLayerCryptomatteFlags;
#define VIEW_LAYER_CRYPTOMATTE_ALL \
(VIEW_LAYER_CRYPTOMATTE_OBJECT | VIEW_LAYER_CRYPTOMATTE_MATERIAL | VIEW_LAYER_CRYPTOMATTE_ASSET)
diff --git a/source/blender/makesdna/DNA_node_types.h b/source/blender/makesdna/DNA_node_types.h
index f4c88333528..cf159a1e28d 100644
--- a/source/blender/makesdna/DNA_node_types.h
+++ b/source/blender/makesdna/DNA_node_types.h
@@ -1032,6 +1032,11 @@ typedef struct NodeShaderTexPointDensity {
char _pad2[4];
} NodeShaderTexPointDensity;
+typedef struct NodeShaderPrincipled {
+ char use_subsurface_auto_radius;
+ char _pad[3];
+} NodeShaderPrincipled;
+
/* TEX_output */
typedef struct TexNodeOutput {
char name[64];
@@ -1452,6 +1457,11 @@ typedef struct NodeGeometryCurveToPoints {
uint8_t mode;
} NodeGeometryCurveToPoints;
+typedef struct NodeGeometryCurveSample {
+ /* GeometryNodeCurveSampleMode. */
+ uint8_t mode;
+} NodeGeometryCurveSample;
+
typedef struct NodeGeometryAttributeTransfer {
/* AttributeDomain. */
int8_t domain;
@@ -1798,11 +1808,12 @@ enum {
enum {
#ifdef DNA_DEPRECATED_ALLOW
SHD_SUBSURFACE_COMPATIBLE = 0, /* Deprecated */
-#endif
SHD_SUBSURFACE_CUBIC = 1,
SHD_SUBSURFACE_GAUSSIAN = 2,
- SHD_SUBSURFACE_BURLEY = 3,
- SHD_SUBSURFACE_RANDOM_WALK = 4,
+#endif
+ SHD_SUBSURFACE_DIFFUSION = 3,
+ SHD_SUBSURFACE_RANDOM_WALK_FIXED_RADIUS = 4,
+ SHD_SUBSURFACE_RANDOM_WALK = 5,
};
/* blur node */
diff --git a/source/blender/makesdna/DNA_scene_defaults.h b/source/blender/makesdna/DNA_scene_defaults.h
index 61707964191..9ecf94ebd6e 100644
--- a/source/blender/makesdna/DNA_scene_defaults.h
+++ b/source/blender/makesdna/DNA_scene_defaults.h
@@ -135,8 +135,6 @@
.border.xmax = 1.0f, \
.border.ymax = 1.0f, \
\
- .preview_start_resolution = 64, \
- \
.line_thickness_mode = R_LINE_THICKNESS_ABSOLUTE, \
.unit_line_thickness = 1.0f, \
\
diff --git a/source/blender/makesdna/DNA_scene_types.h b/source/blender/makesdna/DNA_scene_types.h
index 7800e7f9efe..b28c3ac2b85 100644
--- a/source/blender/makesdna/DNA_scene_types.h
+++ b/source/blender/makesdna/DNA_scene_types.h
@@ -261,7 +261,7 @@ typedef enum eScenePassType {
SCE_PASS_UNUSED_3 = (1 << 4), /* SPEC */
SCE_PASS_SHADOW = (1 << 5),
SCE_PASS_AO = (1 << 6),
- SCE_PASS_UNUSED_4 = (1 << 7), /* REFLECT */
+ SCE_PASS_POSITION = (1 << 7),
SCE_PASS_NORMAL = (1 << 8),
SCE_PASS_VECTOR = (1 << 9),
SCE_PASS_UNUSED_5 = (1 << 10), /* REFRACT */
@@ -293,6 +293,7 @@ typedef enum eScenePassType {
#define RE_PASSNAME_COMBINED "Combined"
#define RE_PASSNAME_Z "Depth"
#define RE_PASSNAME_VECTOR "Vector"
+#define RE_PASSNAME_POSITION "Position"
#define RE_PASSNAME_NORMAL "Normal"
#define RE_PASSNAME_UV "UV"
#define RE_PASSNAME_EMIT "Emit"
@@ -592,7 +593,7 @@ typedef enum eBakeSaveMode {
/** #BakeData.pass_filter */
typedef enum eBakePassFilter {
R_BAKE_PASS_FILTER_NONE = 0,
- R_BAKE_PASS_FILTER_AO = (1 << 0),
+ R_BAKE_PASS_FILTER_UNUSED = (1 << 0),
R_BAKE_PASS_FILTER_EMIT = (1 << 1),
R_BAKE_PASS_FILTER_DIFFUSE = (1 << 2),
R_BAKE_PASS_FILTER_GLOSSY = (1 << 3),
@@ -653,7 +654,8 @@ typedef struct RenderData {
/**
* render tile dimensions
*/
- int tilex, tiley;
+ int tilex DNA_DEPRECATED;
+ int tiley DNA_DEPRECATED;
short planes DNA_DEPRECATED;
short imtype DNA_DEPRECATED;
@@ -764,13 +766,10 @@ typedef struct RenderData {
/* Cycles baking */
struct BakeData bake;
- int preview_start_resolution;
+ int _pad8;
short preview_pixel_size;
- /* Type of the debug pass to use.
- * Only used when built with debug passes support.
- */
- short debug_pass_type;
+ short _pad4;
/* MultiView */
/** SceneRenderView. */
@@ -1344,6 +1343,7 @@ typedef struct SequencerToolSettings {
/** When there are many snap points, 0-1 range corresponds to resolution from boundbox to all
* possible snap points. */
int snap_distance;
+ int pivot_point;
} SequencerToolSettings;
typedef enum eSeqOverlapMode {
@@ -1886,12 +1886,12 @@ enum {
#define R_COMP_CROP (1 << 7)
#define R_SCEMODE_UNUSED_8 (1 << 8) /* cleared */
#define R_SINGLE_LAYER (1 << 9)
-#define R_EXR_TILE_FILE (1 << 10)
+#define R_SCEMODE_UNUSED_10 (1 << 10) /* cleared */
#define R_SCEMODE_UNUSED_11 (1 << 11) /* cleared */
#define R_NO_IMAGE_LOAD (1 << 12)
#define R_SCEMODE_UNUSED_13 (1 << 13) /* cleared */
#define R_NO_FRAME_UPDATE (1 << 14)
-#define R_FULL_SAMPLE (1 << 15)
+#define R_SCEMODE_UNUSED_15 (1 << 15) /* cleared */
#define R_SCEMODE_UNUSED_16 (1 << 16) /* cleared */
#define R_SCEMODE_UNUSED_17 (1 << 17) /* cleared */
#define R_TEXNODE_PREVIEW (1 << 18)
diff --git a/source/blender/makesdna/DNA_sequence_types.h b/source/blender/makesdna/DNA_sequence_types.h
index 03c38eb71a0..25330acd486 100644
--- a/source/blender/makesdna/DNA_sequence_types.h
+++ b/source/blender/makesdna/DNA_sequence_types.h
@@ -74,6 +74,8 @@ typedef struct StripTransform {
float scale_x;
float scale_y;
float rotation;
+ /** 0-1 range, use SEQ_image_transform_origin_offset_pixelspace_get to convert to pixel space. */
+ float origin[2];
} StripTransform;
typedef struct StripColorBalance {
@@ -516,7 +518,7 @@ enum {
SEQ_OVERLAP = (1 << 3),
SEQ_FILTERY = (1 << 4),
SEQ_MUTE = (1 << 5),
- SEQ_FLAG_UNUSED_6 = (1 << 6), /* cleared */
+ SEQ_FLAG_SKIP_THUMBNAILS = (1 << 6),
SEQ_REVERSE_FRAMES = (1 << 7),
SEQ_IPO_FRAME_LOCKED = (1 << 8),
SEQ_EFFECT_NOT_LOADED = (1 << 9),
@@ -722,6 +724,7 @@ enum {
SEQ_CACHE_PREFETCH_ENABLE = (1 << 10),
SEQ_CACHE_DISK_CACHE_ENABLE = (1 << 11),
+ SEQ_CACHE_STORE_THUMBNAIL = (1 << 12),
};
#ifdef __cplusplus
diff --git a/source/blender/makesdna/DNA_space_types.h b/source/blender/makesdna/DNA_space_types.h
index 6505816256c..e849039fa93 100644
--- a/source/blender/makesdna/DNA_space_types.h
+++ b/source/blender/makesdna/DNA_space_types.h
@@ -583,6 +583,7 @@ typedef struct SequencerPreviewOverlay {
/* SequencerPreviewOverlay.flag */
typedef enum eSpaceSeq_SequencerPreviewOverlay_Flag {
+ SEQ_PREVIEW_SHOW_OUTLINE_SELECTED = (1 << 2),
SEQ_PREVIEW_SHOW_SAFE_MARGINS = (1 << 3),
SEQ_PREVIEW_SHOW_GPENCIL = (1 << 4),
SEQ_PREVIEW_SHOW_SAFE_CENTER = (1 << 9),
@@ -597,6 +598,7 @@ typedef struct SequencerTimelineOverlay {
/* SequencerTimelineOverlay.flag */
typedef enum eSpaceSeq_SequencerTimelineOverlay_Flag {
SEQ_TIMELINE_SHOW_STRIP_OFFSETS = (1 << 1),
+ SEQ_TIMELINE_SHOW_THUMBNAILS = (1 << 2),
SEQ_TIMELINE_SHOW_FCURVES = (1 << 5),
SEQ_TIMELINE_ALL_WAVEFORMS = (1 << 7), /* draw all waveforms */
SEQ_TIMELINE_NO_WAVEFORMS = (1 << 8), /* draw no waveforms */
@@ -606,6 +608,13 @@ typedef enum eSpaceSeq_SequencerTimelineOverlay_Flag {
SEQ_TIMELINE_SHOW_GRID = (1 << 18),
} eSpaceSeq_SequencerTimelineOverlay_Flag;
+typedef struct SpaceSeqRuntime {
+ /** Required for Thumbnail job start condition. */
+ struct rctf last_thumbnail_area;
+ /** Stores lists of most recently displayed thumbnails. */
+ struct GHash *last_displayed_thumbnails;
+} SpaceSeqRuntime;
+
/* Sequencer */
typedef struct SpaceSeq {
SpaceLink *next, *prev;
@@ -649,6 +658,7 @@ typedef struct SpaceSeq {
char multiview_eye;
char _pad2[7];
+ SpaceSeqRuntime runtime;
} SpaceSeq;
/* SpaceSeq.mainb */
@@ -685,6 +695,7 @@ typedef enum eSpaceSeq_Flag {
SPACE_SEQ_FLAG_UNUSED_15 = (1 << 15),
SPACE_SEQ_FLAG_UNUSED_16 = (1 << 16),
SEQ_USE_PROXIES = (1 << 17),
+ SEQ_SHOW_GRID = (1 << 18),
} eSpaceSeq_Flag;
/* SpaceSeq.view */
diff --git a/source/blender/makesdna/DNA_uuid_types.h b/source/blender/makesdna/DNA_uuid_types.h
index 30c8beaa628..fa0a78f074b 100644
--- a/source/blender/makesdna/DNA_uuid_types.h
+++ b/source/blender/makesdna/DNA_uuid_types.h
@@ -28,15 +28,17 @@ extern "C" {
/**
* \brief Universally Unique Identifier according to RFC4122.
+ *
+ * Cannot be named simply `UUID`, because Windows already defines that type.
*/
-typedef struct UUID {
+typedef struct bUUID {
uint32_t time_low;
uint16_t time_mid;
uint16_t time_hi_and_version;
uint8_t clock_seq_hi_and_reserved;
uint8_t clock_seq_low;
uint8_t node[6];
-} UUID;
+} bUUID;
#ifdef __cplusplus
}
diff --git a/source/blender/makesdna/DNA_view2d_types.h b/source/blender/makesdna/DNA_view2d_types.h
index c385ac04bd3..f8166305fd9 100644
--- a/source/blender/makesdna/DNA_view2d_types.h
+++ b/source/blender/makesdna/DNA_view2d_types.h
@@ -132,6 +132,8 @@ enum {
V2D_PIXELOFS_X = (1 << 2),
/* apply pixel offsets on y-axis when setting view matrices */
V2D_PIXELOFS_Y = (1 << 3),
+ /* zoom, pan or similar action is in progress */
+ V2D_IS_NAVIGATING = (1 << 9),
/* view settings need to be set still... */
V2D_IS_INIT = (1 << 10),
};
diff --git a/source/blender/makesdna/DNA_workspace_types.h b/source/blender/makesdna/DNA_workspace_types.h
index e0294d3534c..a0856588a58 100644
--- a/source/blender/makesdna/DNA_workspace_types.h
+++ b/source/blender/makesdna/DNA_workspace_types.h
@@ -29,6 +29,15 @@
extern "C" {
#endif
+/** #bToolRef_Runtime.flag */
+enum {
+ /**
+ * This tool should use the fallback key-map.
+ * Typically gizmos handle this but some tools (such as the knife tool) don't use a gizmo.
+ */
+ TOOLREF_FLAG_FALLBACK_KEYMAP = (1 << 0),
+};
+
#
#
typedef struct bToolRef_Runtime {
@@ -47,6 +56,8 @@ typedef struct bToolRef_Runtime {
/** Index when a tool is a member of a group. */
int index;
+ /** Options: `TOOLREF_FLAG_*`. */
+ int flag;
} bToolRef_Runtime;
/* Stored per mode. */
diff --git a/source/blender/makesdna/intern/dna_defaults.c b/source/blender/makesdna/intern/dna_defaults.c
index 4cb8610f6ac..2dbbb35c3ca 100644
--- a/source/blender/makesdna/intern/dna_defaults.c
+++ b/source/blender/makesdna/intern/dna_defaults.c
@@ -318,7 +318,8 @@ SDNA_DEFAULT_DECL_STRUCT(TextureGpencilModifierData);
SDNA_DEFAULT_DECL_STRUCT(ThickGpencilModifierData);
SDNA_DEFAULT_DECL_STRUCT(TimeGpencilModifierData);
SDNA_DEFAULT_DECL_STRUCT(TintGpencilModifierData);
-SDNA_DEFAULT_DECL_STRUCT(WeightGpencilModifierData);
+SDNA_DEFAULT_DECL_STRUCT(WeightProxGpencilModifierData);
+SDNA_DEFAULT_DECL_STRUCT(WeightAngleGpencilModifierData);
SDNA_DEFAULT_DECL_STRUCT(LineartGpencilModifierData);
SDNA_DEFAULT_DECL_STRUCT(LengthGpencilModifierData);
SDNA_DEFAULT_DECL_STRUCT(DashGpencilModifierData);
@@ -548,7 +549,8 @@ const void *DNA_default_table[SDNA_TYPE_MAX] = {
SDNA_DEFAULT_DECL(ThickGpencilModifierData),
SDNA_DEFAULT_DECL(TimeGpencilModifierData),
SDNA_DEFAULT_DECL(TintGpencilModifierData),
- SDNA_DEFAULT_DECL(WeightGpencilModifierData),
+ SDNA_DEFAULT_DECL(WeightAngleGpencilModifierData),
+ SDNA_DEFAULT_DECL(WeightProxGpencilModifierData),
SDNA_DEFAULT_DECL(LineartGpencilModifierData),
SDNA_DEFAULT_DECL(LengthGpencilModifierData),
SDNA_DEFAULT_DECL(DashGpencilModifierData),
diff --git a/source/blender/makesrna/intern/rna_gpencil_modifier.c b/source/blender/makesrna/intern/rna_gpencil_modifier.c
index 4fa33424994..a2d5b134056 100644
--- a/source/blender/makesrna/intern/rna_gpencil_modifier.c
+++ b/source/blender/makesrna/intern/rna_gpencil_modifier.c
@@ -58,6 +58,17 @@
#include "WM_types.h"
const EnumPropertyItem rna_enum_object_greasepencil_modifier_type_items[] = {
+ {0, "", 0, N_("Modify"), ""},
+ {eGpencilModifierType_WeightAngle,
+ "GP_WEIGHT_ANGLE",
+ ICON_MOD_VERTEX_WEIGHT,
+ "Vertex Weight Angle",
+ "Generate Vertex Weights base on stroke angle"},
+ {eGpencilModifierType_WeightProximity,
+ "GP_WEIGHT_PROXIMITY",
+ ICON_MOD_VERTEX_WEIGHT,
+ "Vertex Weight Proximity",
+ "Generate Vertex Weights base on distance to object"},
{0, "", 0, N_("Generate"), ""},
{eGpencilModifierType_Array,
"GP_ARRAY",
@@ -74,6 +85,11 @@ const EnumPropertyItem rna_enum_object_greasepencil_modifier_type_items[] = {
ICON_MOD_DASH,
"Dot Dash",
"Generate dot-dash styled strokes"},
+ {eGpencilModifierType_Length,
+ "GP_LENGTH",
+ ICON_MOD_LENGTH,
+ "Length",
+ "Extend or shrink strokes"},
{eGpencilModifierType_Lineart,
"GP_LINEART",
ICON_MOD_LINEART,
@@ -99,11 +115,6 @@ const EnumPropertyItem rna_enum_object_greasepencil_modifier_type_items[] = {
ICON_MOD_SUBSURF,
"Subdivide",
"Subdivide stroke adding more control points"},
- {eGpencilModifierType_Weight,
- "GP_WEIGHT",
- ICON_MOD_VERTEX_WEIGHT,
- "Vertex Weight",
- "Generate Vertex Weights"},
{0, "", 0, N_("Deform"), ""},
{eGpencilModifierType_Armature,
"GP_ARMATURE",
@@ -120,11 +131,6 @@ const EnumPropertyItem rna_enum_object_greasepencil_modifier_type_items[] = {
ICON_MOD_LATTICE,
"Lattice",
"Deform strokes using lattice"},
- {eGpencilModifierType_Length,
- "GP_LENGTH",
- ICON_MOD_LENGTH,
- "Length",
- "Extend or shrink strokes"},
{eGpencilModifierType_Noise, "GP_NOISE", ICON_MOD_NOISE, "Noise", "Add noise to strokes"},
{eGpencilModifierType_Offset,
"GP_OFFSET",
@@ -244,8 +250,10 @@ static StructRNA *rna_GpencilModifier_refine(struct PointerRNA *ptr)
return &RNA_TintGpencilModifier;
case eGpencilModifierType_Time:
return &RNA_TimeGpencilModifier;
- case eGpencilModifierType_Weight:
- return &RNA_WeightGpencilModifier;
+ case eGpencilModifierType_WeightProximity:
+ return &RNA_WeightProxGpencilModifier;
+ case eGpencilModifierType_WeightAngle:
+ return &RNA_WeightAngleGpencilModifier;
case eGpencilModifierType_Color:
return &RNA_ColorGpencilModifier;
case eGpencilModifierType_Array:
@@ -346,8 +354,10 @@ RNA_GP_MOD_VGROUP_NAME_SET(Offset, vgname);
RNA_GP_MOD_VGROUP_NAME_SET(Armature, vgname);
RNA_GP_MOD_VGROUP_NAME_SET(Texture, vgname);
RNA_GP_MOD_VGROUP_NAME_SET(Tint, vgname);
-RNA_GP_MOD_VGROUP_NAME_SET(Weight, target_vgname);
-RNA_GP_MOD_VGROUP_NAME_SET(Weight, vgname);
+RNA_GP_MOD_VGROUP_NAME_SET(WeightProx, target_vgname);
+RNA_GP_MOD_VGROUP_NAME_SET(WeightProx, vgname);
+RNA_GP_MOD_VGROUP_NAME_SET(WeightAngle, target_vgname);
+RNA_GP_MOD_VGROUP_NAME_SET(WeightAngle, vgname);
RNA_GP_MOD_VGROUP_NAME_SET(Lineart, vgname);
# undef RNA_GP_MOD_VGROUP_NAME_SET
@@ -380,7 +390,7 @@ static void greasepencil_modifier_object_set(Object *self,
RNA_GP_MOD_OBJECT_SET(Armature, object, OB_ARMATURE);
RNA_GP_MOD_OBJECT_SET(Lattice, object, OB_LATTICE);
RNA_GP_MOD_OBJECT_SET(Mirror, object, OB_EMPTY);
-RNA_GP_MOD_OBJECT_SET(Weight, object, OB_EMPTY);
+RNA_GP_MOD_OBJECT_SET(WeightProx, object, OB_EMPTY);
# undef RNA_GP_MOD_OBJECT_SET
@@ -554,11 +564,21 @@ static void rna_ThickGpencilModifier_material_set(PointerRNA *ptr,
rna_GpencilModifier_material_set(ptr, value, ma_target, reports);
}
-static void rna_WeightGpencilModifier_material_set(PointerRNA *ptr,
- PointerRNA value,
- struct ReportList *reports)
+static void rna_WeightProxGpencilModifier_material_set(PointerRNA *ptr,
+ PointerRNA value,
+ struct ReportList *reports)
{
- WeightGpencilModifierData *tmd = (WeightGpencilModifierData *)ptr->data;
+ WeightProxGpencilModifierData *tmd = (WeightProxGpencilModifierData *)ptr->data;
+ Material **ma_target = &tmd->material;
+
+ rna_GpencilModifier_material_set(ptr, value, ma_target, reports);
+}
+
+static void rna_WeightAngleGpencilModifier_material_set(PointerRNA *ptr,
+ PointerRNA value,
+ struct ReportList *reports)
+{
+ WeightAngleGpencilModifierData *tmd = (WeightAngleGpencilModifierData *)ptr->data;
Material **ma_target = &tmd->material;
rna_GpencilModifier_material_set(ptr, value, ma_target, reports);
@@ -2783,24 +2803,129 @@ static void rna_def_modifier_gpenciltexture(BlenderRNA *brna)
RNA_define_lib_overridable(false);
}
-static void rna_def_modifier_gpencilweight(BlenderRNA *brna)
+static void rna_def_modifier_gpencilweight_proximity(BlenderRNA *brna)
+{
+ StructRNA *srna;
+ PropertyRNA *prop;
+
+ srna = RNA_def_struct(brna, "WeightProxGpencilModifier", "GpencilModifier");
+ RNA_def_struct_ui_text(srna, "Weight Modifier Proximity", "Calculate Vertex Weight dynamically");
+ RNA_def_struct_sdna(srna, "WeightProxGpencilModifierData");
+ RNA_def_struct_ui_icon(srna, ICON_MOD_VERTEX_WEIGHT);
+
+ RNA_define_lib_overridable(true);
+
+ prop = RNA_def_property(srna, "target_vertex_group", PROP_STRING, PROP_NONE);
+ RNA_def_property_string_sdna(prop, NULL, "target_vgname");
+ RNA_def_property_ui_text(prop, "Vertex Group", "Output Vertex group");
+ RNA_def_property_string_funcs(
+ prop, NULL, NULL, "rna_WeightProxGpencilModifier_target_vgname_set");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "use_multiply", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "flag", GP_WEIGHT_MULTIPLY_DATA);
+ RNA_def_property_ui_text(
+ prop,
+ "Multiply Weights",
+ "Multiply the calculated weights with the existing values in the vertex group");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "use_invert_output", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "flag", GP_WEIGHT_INVERT_OUTPUT);
+ RNA_def_property_ui_text(prop, "Invert", "Invert output weight values");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "layer", PROP_STRING, PROP_NONE);
+ RNA_def_property_string_sdna(prop, NULL, "layername");
+ RNA_def_property_ui_text(prop, "Layer", "Layer name");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "material", PROP_POINTER, PROP_NONE);
+ RNA_def_property_flag(prop, PROP_EDITABLE);
+ RNA_def_property_pointer_funcs(prop,
+ NULL,
+ "rna_WeightProxGpencilModifier_material_set",
+ NULL,
+ "rna_GpencilModifier_material_poll");
+ RNA_def_property_ui_text(prop, "Material", "Material used for filtering effect");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "vertex_group", PROP_STRING, PROP_NONE);
+ RNA_def_property_string_sdna(prop, NULL, "vgname");
+ RNA_def_property_ui_text(prop, "Vertex Group", "Vertex group name for modulating the deform");
+ RNA_def_property_string_funcs(prop, NULL, NULL, "rna_WeightProxGpencilModifier_vgname_set");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ /* Distance reference object */
+ prop = RNA_def_property(srna, "object", PROP_POINTER, PROP_NONE);
+ RNA_def_property_ui_text(prop, "Target Object", "Object used as distance reference");
+ RNA_def_property_pointer_funcs(
+ prop, NULL, "rna_WeightProxGpencilModifier_object_set", NULL, NULL);
+ RNA_def_property_flag(prop, PROP_EDITABLE | PROP_ID_SELF_CHECK);
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_dependency_update");
+
+ prop = RNA_def_property(srna, "distance_start", PROP_FLOAT, PROP_NONE);
+ RNA_def_property_float_sdna(prop, NULL, "dist_start");
+ RNA_def_property_ui_range(prop, 0, 1000.0, 1.0, 2);
+ RNA_def_property_ui_text(prop, "Lowest", "Start value for distance calculation");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "minimum_weight", PROP_FLOAT, PROP_FACTOR);
+ RNA_def_property_float_sdna(prop, NULL, "min_weight");
+ RNA_def_property_ui_text(prop, "Minimum", "Minimum value for vertex weight");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "distance_end", PROP_FLOAT, PROP_NONE);
+ RNA_def_property_float_sdna(prop, NULL, "dist_end");
+ RNA_def_property_ui_range(prop, 0, 1000.0, 1.0, 2);
+ RNA_def_property_ui_text(prop, "Highest", "Max value for distance calculation");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "pass_index", PROP_INT, PROP_NONE);
+ RNA_def_property_int_sdna(prop, NULL, "pass_index");
+ RNA_def_property_range(prop, 0, 100);
+ RNA_def_property_ui_text(prop, "Pass", "Pass index");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "invert_layers", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "flag", GP_WEIGHT_INVERT_LAYER);
+ RNA_def_property_ui_text(prop, "Inverse Layers", "Inverse filter");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "invert_materials", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "flag", GP_WEIGHT_INVERT_MATERIAL);
+ RNA_def_property_ui_text(prop, "Inverse Materials", "Inverse filter");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "invert_material_pass", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "flag", GP_WEIGHT_INVERT_PASS);
+ RNA_def_property_ui_text(prop, "Inverse Pass", "Inverse filter");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "invert_vertex", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "flag", GP_WEIGHT_INVERT_VGROUP);
+ RNA_def_property_ui_text(prop, "Inverse VertexGroup", "Inverse filter");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "layer_pass", PROP_INT, PROP_NONE);
+ RNA_def_property_int_sdna(prop, NULL, "layer_pass");
+ RNA_def_property_range(prop, 0, 100);
+ RNA_def_property_ui_text(prop, "Pass", "Layer pass index");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "invert_layer_pass", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "flag", GP_WEIGHT_INVERT_LAYERPASS);
+ RNA_def_property_ui_text(prop, "Inverse Pass", "Inverse filter");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ RNA_define_lib_overridable(false);
+}
+
+static void rna_def_modifier_gpencilweight_angle(BlenderRNA *brna)
{
StructRNA *srna;
PropertyRNA *prop;
- static const EnumPropertyItem mode_items[] = {
- {GP_WEIGHT_MODE_DISTANCE,
- "DISTANCE",
- 0,
- "Distance",
- "Calculate weights depending on the distance to the target object"},
- {GP_WEIGHT_MODE_ANGLE,
- "ANGLE",
- 0,
- "Angle",
- "Calculate weights depending on the stroke orientation"},
- {0, NULL, 0, NULL, NULL},
- };
static const EnumPropertyItem axis_items[] = {
{0, "X", 0, "X", ""},
{1, "Y", 0, "Y", ""},
@@ -2814,34 +2939,31 @@ static void rna_def_modifier_gpencilweight(BlenderRNA *brna)
{0, NULL, 0, NULL, NULL},
};
- srna = RNA_def_struct(brna, "WeightGpencilModifier", "GpencilModifier");
- RNA_def_struct_ui_text(srna, "Weight Modifier", "Calculate Vertex Weight dynamically");
- RNA_def_struct_sdna(srna, "WeightGpencilModifierData");
+ srna = RNA_def_struct(brna, "WeightAngleGpencilModifier", "GpencilModifier");
+ RNA_def_struct_ui_text(srna, "Weight Modifier Angle", "Calculate Vertex Weight dynamically");
+ RNA_def_struct_sdna(srna, "WeightAngleGpencilModifierData");
RNA_def_struct_ui_icon(srna, ICON_MOD_VERTEX_WEIGHT);
RNA_define_lib_overridable(true);
- prop = RNA_def_property(srna, "mode", PROP_ENUM, PROP_NONE);
- RNA_def_property_enum_sdna(prop, NULL, "mode");
- RNA_def_property_enum_items(prop, mode_items);
- RNA_def_property_ui_text(prop, "Mode", "");
- RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
-
prop = RNA_def_property(srna, "target_vertex_group", PROP_STRING, PROP_NONE);
RNA_def_property_string_sdna(prop, NULL, "target_vgname");
- RNA_def_property_ui_text(prop, "Output", "Output Vertex group");
- RNA_def_property_string_funcs(prop, NULL, NULL, "rna_WeightGpencilModifier_target_vgname_set");
+ RNA_def_property_ui_text(prop, "Vertex Group", "Output Vertex group");
+ RNA_def_property_string_funcs(
+ prop, NULL, NULL, "rna_WeightAngleGpencilModifier_target_vgname_set");
RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
- prop = RNA_def_property(srna, "use_blend", PROP_BOOLEAN, PROP_NONE);
- RNA_def_property_boolean_sdna(prop, NULL, "flag", GP_WEIGHT_BLEND_DATA);
+ prop = RNA_def_property(srna, "use_multiply", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "flag", GP_WEIGHT_MULTIPLY_DATA);
RNA_def_property_ui_text(
- prop, "Blend", "Blend results with existing weights in output weight group");
+ prop,
+ "Multiply Weights",
+ "Multiply the calculated weights with the existing values in the vertex group");
RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
prop = RNA_def_property(srna, "use_invert_output", PROP_BOOLEAN, PROP_NONE);
RNA_def_property_boolean_sdna(prop, NULL, "flag", GP_WEIGHT_INVERT_OUTPUT);
- RNA_def_property_ui_text(prop, "Invert", "Invert weight values");
+ RNA_def_property_ui_text(prop, "Invert", "Invert output weight values");
RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
prop = RNA_def_property(srna, "angle", PROP_FLOAT, PROP_ANGLE);
@@ -2871,7 +2993,7 @@ static void rna_def_modifier_gpencilweight(BlenderRNA *brna)
RNA_def_property_flag(prop, PROP_EDITABLE);
RNA_def_property_pointer_funcs(prop,
NULL,
- "rna_WeightGpencilModifier_material_set",
+ "rna_WeightAngleGpencilModifier_material_set",
NULL,
"rna_GpencilModifier_material_poll");
RNA_def_property_ui_text(prop, "Material", "Material used for filtering effect");
@@ -2880,20 +3002,7 @@ static void rna_def_modifier_gpencilweight(BlenderRNA *brna)
prop = RNA_def_property(srna, "vertex_group", PROP_STRING, PROP_NONE);
RNA_def_property_string_sdna(prop, NULL, "vgname");
RNA_def_property_ui_text(prop, "Vertex Group", "Vertex group name for modulating the deform");
- RNA_def_property_string_funcs(prop, NULL, NULL, "rna_WeightGpencilModifier_vgname_set");
- RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
-
- /* Distance reference object */
- prop = RNA_def_property(srna, "object", PROP_POINTER, PROP_NONE);
- RNA_def_property_ui_text(prop, "Object", "Object used as distance reference");
- RNA_def_property_pointer_funcs(prop, NULL, "rna_WeightGpencilModifier_object_set", NULL, NULL);
- RNA_def_property_flag(prop, PROP_EDITABLE | PROP_ID_SELF_CHECK);
- RNA_def_property_update(prop, 0, "rna_GpencilModifier_dependency_update");
-
- prop = RNA_def_property(srna, "distance_start", PROP_FLOAT, PROP_NONE);
- RNA_def_property_float_sdna(prop, NULL, "dist_start");
- RNA_def_property_ui_range(prop, 0, 1000.0, 1.0, 2);
- RNA_def_property_ui_text(prop, "Distance Start", "Start value for distance calculation");
+ RNA_def_property_string_funcs(prop, NULL, NULL, "rna_WeightAngleGpencilModifier_vgname_set");
RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
prop = RNA_def_property(srna, "minimum_weight", PROP_FLOAT, PROP_FACTOR);
@@ -2901,12 +3010,6 @@ static void rna_def_modifier_gpencilweight(BlenderRNA *brna)
RNA_def_property_ui_text(prop, "Minimum", "Minimum value for vertex weight");
RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
- prop = RNA_def_property(srna, "distance_end", PROP_FLOAT, PROP_NONE);
- RNA_def_property_float_sdna(prop, NULL, "dist_end");
- RNA_def_property_ui_range(prop, 0, 1000.0, 1.0, 2);
- RNA_def_property_ui_text(prop, "Distance End", "End value for distance calculation");
- RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
-
prop = RNA_def_property(srna, "pass_index", PROP_INT, PROP_NONE);
RNA_def_property_int_sdna(prop, NULL, "pass_index");
RNA_def_property_range(prop, 0, 100);
@@ -3278,14 +3381,29 @@ static void rna_def_modifier_gpencillength(BlenderRNA *brna)
prop = RNA_def_property(srna, "start_factor", PROP_FLOAT, PROP_NONE);
RNA_def_property_float_sdna(prop, NULL, "start_fac");
- RNA_def_property_ui_range(prop, -10.0f, 10.0f, 0.1, 1);
- RNA_def_property_ui_text(prop, "Start Factor", "Length difference for each segment");
+ RNA_def_property_ui_range(prop, -10.0f, 10.0f, 0.1, 2);
+ RNA_def_property_ui_text(
+ prop, "Start Factor", "Added length to the start of each stroke relative to its length");
RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
prop = RNA_def_property(srna, "end_factor", PROP_FLOAT, PROP_NONE);
RNA_def_property_float_sdna(prop, NULL, "end_fac");
- RNA_def_property_ui_range(prop, -10.0f, 10.0f, 0.1, 1);
- RNA_def_property_ui_text(prop, "End Factor", "Length difference for each segment");
+ RNA_def_property_ui_range(prop, -10.0f, 10.0f, 0.1, 2);
+ RNA_def_property_ui_text(
+ prop, "End Factor", "Added length to the end of each stroke relative to its length");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "start_length", PROP_FLOAT, PROP_DISTANCE);
+ RNA_def_property_float_sdna(prop, NULL, "start_fac");
+ RNA_def_property_ui_range(prop, -100.0f, 100.0f, 0.1f, 3);
+ RNA_def_property_ui_text(
+ prop, "Start Factor", "Absolute added length to the start of each stroke");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "end_length", PROP_FLOAT, PROP_DISTANCE);
+ RNA_def_property_float_sdna(prop, NULL, "end_fac");
+ RNA_def_property_ui_range(prop, -100.0f, 100.0f, 0.1f, 3);
+ RNA_def_property_ui_text(prop, "End Factor", "Absolute added length to the end of each stroke");
RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
prop = RNA_def_property(srna, "overshoot_factor", PROP_FLOAT, PROP_FACTOR);
@@ -3293,8 +3411,8 @@ static void rna_def_modifier_gpencillength(BlenderRNA *brna)
RNA_def_property_range(prop, 0.0f, 1.0f);
RNA_def_property_ui_text(
prop,
- "Overshoot Factor",
- "Defines how precise must follow the stroke trajectory for the overshoot extremes");
+ "Used Length",
+ "Defines what portion of the stroke is used for the calculation of the extension");
RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
prop = RNA_def_property(srna, "mode", PROP_ENUM, PROP_NONE);
@@ -3303,6 +3421,44 @@ static void rna_def_modifier_gpencillength(BlenderRNA *brna)
RNA_def_property_ui_text(prop, "Mode", "Mode to define length");
RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+ prop = RNA_def_property(srna, "use_curvature", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "flag", GP_LENGTH_USE_CURVATURE);
+ RNA_def_property_ui_text(prop, "Use Curvature", "Follow the curvature of the stroke");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "invert_curvature", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "flag", GP_LENGTH_INVERT_CURVATURE);
+ RNA_def_property_ui_text(
+ prop, "Invert Curvature", "Invert the curvature of the stroke's extension");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "point_density", PROP_FLOAT, PROP_NONE);
+ RNA_def_property_range(prop, 0.1f, 1000.0f);
+ RNA_def_property_ui_range(prop, 0.1f, 1000.0f, 1.0f, 1);
+ RNA_def_property_ui_scale_type(prop, PROP_SCALE_CUBIC);
+ RNA_def_property_ui_text(
+ prop, "Point Density", "Multiplied by Start/End for the total added point count");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "segment_influence", PROP_FLOAT, PROP_FACTOR);
+ RNA_def_property_range(prop, -2.0f, 3.0f);
+ RNA_def_property_ui_range(prop, 0.0f, 1.0f, 0.1f, 2);
+ RNA_def_property_ui_text(prop,
+ "Segment Influence",
+ "Factor to determine how much the length of the individual segments "
+ "should influence the final computed curvature. Higher factors makes "
+ "small segments influence the overall curvature less");
+ RNA_def_property_update(prop, 0, "rna_GpencilModifier_update");
+
+ prop = RNA_def_property(srna, "max_angle", PROP_FLOAT, PROP_ANGLE);
+ RNA_def_property_ui_text(prop,
+ "Filter Angle",
+ "Ignore points on the stroke that deviate from their neighbors by more "
+ "than this angle when determining the extrapolation shape");
+ RNA_def_property_range(prop, 0.0f, DEG2RAD(180.0f));
+ RNA_def_property_ui_range(prop, 0.0f, DEG2RAD(179.5f), 10.0f, 1);
+ RNA_def_property_update(prop, NC_SCENE, "rna_GpencilModifier_update");
+
prop = RNA_def_property(srna, "layer", PROP_STRING, PROP_NONE);
RNA_def_property_string_sdna(prop, NULL, "layername");
RNA_def_property_ui_text(prop, "Layer", "Layer name");
@@ -3552,7 +3708,8 @@ void RNA_def_greasepencil_modifier(BlenderRNA *brna)
rna_def_modifier_gpencilarmature(brna);
rna_def_modifier_gpencilmultiply(brna);
rna_def_modifier_gpenciltexture(brna);
- rna_def_modifier_gpencilweight(brna);
+ rna_def_modifier_gpencilweight_angle(brna);
+ rna_def_modifier_gpencilweight_proximity(brna);
rna_def_modifier_gpencillineart(brna);
rna_def_modifier_gpencillength(brna);
rna_def_modifier_gpencildash(brna);
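The Length modifier properties added above surface in Python through the regular grease-pencil modifier RNA. A minimal sketch of driving them from bpy (assuming the modifier's type identifier is 'GP_LENGTH' and the active object is a Grease Pencil object; neither is shown in this diff):

    import bpy

    obj = bpy.context.object  # assumed: a Grease Pencil object
    mod = obj.grease_pencil_modifiers.new(name="Length", type='GP_LENGTH')

    # Properties registered in rna_def_modifier_gpencillength() above.
    mod.start_factor = 0.2       # extension relative to stroke length
    mod.end_factor = 0.2
    mod.use_curvature = True     # follow the stroke's curvature
    mod.point_density = 30.0     # multiplied by start/end for added points
    mod.segment_influence = 0.5  # how much segment length affects curvature

Note that "overshoot_factor" keeps its Python identifier even though its label changed to "Used Length".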
diff --git a/source/blender/makesrna/intern/rna_nodetree.c b/source/blender/makesrna/intern/rna_nodetree.c
index 76e37dbcdbc..ec53f35df4c 100644
--- a/source/blender/makesrna/intern/rna_nodetree.c
+++ b/source/blender/makesrna/intern/rna_nodetree.c
@@ -2168,6 +2168,17 @@ static const EnumPropertyItem *rna_GeometryNodeAttributeFill_type_itemf(bContext
return itemf_function_check(rna_enum_attribute_type_items, attribute_fill_type_supported);
}
+static bool attribute_statistic_type_supported(const EnumPropertyItem *item)
+{
+ return ELEM(item->value, CD_PROP_FLOAT, CD_PROP_FLOAT3);
+}
+static const EnumPropertyItem *rna_GeometryNodeAttributeStatistic_type_itemf(
+ bContext *UNUSED(C), PointerRNA *UNUSED(ptr), PropertyRNA *UNUSED(prop), bool *r_free)
+{
+ *r_free = true;
+ return itemf_function_check(rna_enum_attribute_type_items, attribute_statistic_type_supported);
+}
+
/**
* This bit of ugly code makes sure the float / attribute option shows up instead of
* vector / attribute if the node uses an operation that uses a float for input B or C.
@@ -4654,16 +4665,18 @@ static const EnumPropertyItem node_principled_distribution_items[] = {
};
static const EnumPropertyItem node_subsurface_method_items[] = {
- {SHD_SUBSURFACE_BURLEY,
- "BURLEY",
+ {SHD_SUBSURFACE_RANDOM_WALK_FIXED_RADIUS,
+ "RANDOM_WALK_FIXED_RADIUS",
0,
- "Christensen-Burley",
- "Approximation to physically based volume scattering"},
+ "Random Walk (Fixed Radius)",
+ "Volumetric approximation to physically based volume scattering, using the scattering radius "
+ "as specified"},
{SHD_SUBSURFACE_RANDOM_WALK,
"RANDOM_WALK",
0,
"Random Walk",
- "Volumetric approximation to physically based volume scattering"},
+ "Volumetric approximation to physically based volume scattering, with scattering radius "
+ "automatically adjusted to match color textures"},
{0, NULL, 0, NULL, NULL}};
/* -- Common nodes ---------------------------------------------------------- */
@@ -6133,35 +6146,12 @@ static void def_sh_ambient_occlusion(StructRNA *srna)
static void def_sh_subsurface(StructRNA *srna)
{
- static const EnumPropertyItem prop_subsurface_falloff_items[] = {
- {SHD_SUBSURFACE_CUBIC, "CUBIC", 0, "Cubic", "Simple cubic falloff function"},
- {SHD_SUBSURFACE_GAUSSIAN,
- "GAUSSIAN",
- 0,
- "Gaussian",
- "Normal distribution, multiple can be combined to fit more complex profiles"},
- {SHD_SUBSURFACE_BURLEY,
- "BURLEY",
- 0,
- "Christensen-Burley",
- "Approximation to physically based volume scattering"},
- {SHD_SUBSURFACE_RANDOM_WALK,
- "RANDOM_WALK",
- 0,
- "Random Walk",
- "Volumetric approximation to physically based volume scattering"},
- {0, NULL, 0, NULL, NULL},
- };
-
PropertyRNA *prop;
prop = RNA_def_property(srna, "falloff", PROP_ENUM, PROP_NONE);
RNA_def_property_enum_sdna(prop, NULL, "custom1");
- RNA_def_property_enum_items(prop, prop_subsurface_falloff_items);
- RNA_def_property_ui_text(prop,
- "Falloff",
- "Function to determine how much light nearby points contribute based "
- "on their distance to the shading point");
+ RNA_def_property_enum_items(prop, node_subsurface_method_items);
+ RNA_def_property_ui_text(prop, "Method", "Method for rendering subsurface scattering");
RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_ShaderNode_socket_update");
}
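With the falloff enum now reusing node_subsurface_method_items, scripts that set the removed 'BURLEY', 'CUBIC' or 'GAUSSIAN' identifiers on a Subsurface Scattering node need updating. A minimal sketch:

    import bpy

    mat = bpy.data.materials.new("SSS")
    mat.use_nodes = True
    node = mat.node_tree.nodes.new("ShaderNodeSubsurfaceScattering")

    # The RNA property is still named "falloff"; only its items changed.
    node.falloff = 'RANDOM_WALK_FIXED_RADIUS'  # or 'RANDOM_WALK'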
@@ -9077,6 +9067,30 @@ static void def_geo_curve_primitive_bezier_segment(StructRNA *srna)
RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_socket_update");
}
+static void def_geo_curve_sample(StructRNA *srna)
+{
+ static EnumPropertyItem mode_items[] = {
+ {GEO_NODE_CURVE_SAMPLE_FACTOR,
+ "FACTOR",
+ 0,
+ "Factor",
+ "Find sample positions on the curve using a factor of its total length"},
+ {GEO_NODE_CURVE_SAMPLE_LENGTH,
+ "LENGTH",
+ 0,
+ "Length",
+ "Find sample positions on the curve using a distance from its beginning"},
+ {0, NULL, 0, NULL, NULL},
+ };
+
+ RNA_def_struct_sdna_from(srna, "NodeGeometryCurveSample", "storage");
+
+ PropertyRNA *prop = RNA_def_property(srna, "mode", PROP_ENUM, PROP_NONE);
+ RNA_def_property_enum_items(prop, mode_items);
+ RNA_def_property_ui_text(prop, "Mode", "Method for sampling input");
+ RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_socket_update");
+}
+
static void def_geo_triangulate(StructRNA *srna)
{
PropertyRNA *prop;
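A sketch of using the new node from Python (the bl_idname "GeometryNodeCurveSample" follows the DefNode naming convention but is defined outside this hunk):

    import bpy

    tree = bpy.data.node_groups.new("Sample", "GeometryNodeTree")
    node = tree.nodes.new("GeometryNodeCurveSample")

    # Enum items declared in def_geo_curve_sample() above.
    node.mode = 'LENGTH'    # sample at a distance from the curve start
    # node.mode = 'FACTOR'  # or at a factor of the total length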
@@ -9219,6 +9233,29 @@ static void def_geo_attribute_convert(StructRNA *srna)
RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update");
}
+static void def_geo_attribute_statistic(StructRNA *srna)
+{
+ PropertyRNA *prop;
+
+ prop = RNA_def_property(srna, "data_type", PROP_ENUM, PROP_NONE);
+ RNA_def_property_enum_sdna(prop, NULL, "custom1");
+ RNA_def_property_enum_items(prop, rna_enum_attribute_type_items);
+ RNA_def_property_enum_funcs(prop, NULL, NULL, "rna_GeometryNodeAttributeStatistic_type_itemf");
+ RNA_def_property_enum_default(prop, CD_PROP_FLOAT);
+ RNA_def_property_ui_text(
+ prop,
+ "Data Type",
+ "The data type the attribute is converted to before calculating the results");
+ RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_GeometryNode_socket_update");
+
+ prop = RNA_def_property(srna, "domain", PROP_ENUM, PROP_NONE);
+ RNA_def_property_enum_sdna(prop, NULL, "custom2");
+ RNA_def_property_enum_items(prop, rna_enum_attribute_domain_items);
+ RNA_def_property_enum_default(prop, ATTR_DOMAIN_POINT);
+ RNA_def_property_ui_text(prop, "Domain", "Which domain to read the data from");
+ RNA_def_property_update(prop, NC_NODE | NA_EDITED, "rna_Node_update");
+}
+
static void def_geo_attribute_math(StructRNA *srna)
{
PropertyRNA *prop;
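A sketch of the statistic node from Python; the itemf callback above restricts data_type to the float and vector items (bl_idname assumed from the DefNode convention):

    import bpy

    tree = bpy.data.node_groups.new("Stats", "GeometryNodeTree")
    node = tree.nodes.new("GeometryNodeAttributeStatistic")

    node.data_type = 'FLOAT_VECTOR'  # CD_PROP_FLOAT3; 'FLOAT' for CD_PROP_FLOAT
    node.domain = 'POINT'            # domain the attribute is read from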
diff --git a/source/blender/makesrna/intern/rna_render.c b/source/blender/makesrna/intern/rna_render.c
index 4400d198b4a..fcb46904e8d 100644
--- a/source/blender/makesrna/intern/rna_render.c
+++ b/source/blender/makesrna/intern/rna_render.c
@@ -52,6 +52,7 @@ const EnumPropertyItem rna_enum_render_pass_type_items[] = {
{SCE_PASS_Z, "Z", 0, "Z", ""},
{SCE_PASS_SHADOW, "SHADOW", 0, "Shadow", ""},
{SCE_PASS_AO, "AO", 0, "Ambient Occlusion", ""},
+ {SCE_PASS_POSITION, "POSITION", 0, "Position", ""},
{SCE_PASS_NORMAL, "NORMAL", 0, "Normal", ""},
{SCE_PASS_VECTOR, "VECTOR", 0, "Vector", ""},
{SCE_PASS_INDEXOB, "OBJECT_INDEX", 0, "Object Index", ""},
@@ -79,6 +80,7 @@ const EnumPropertyItem rna_enum_bake_pass_type_items[] = {
{SCE_PASS_COMBINED, "COMBINED", 0, "Combined", ""},
{SCE_PASS_AO, "AO", 0, "Ambient Occlusion", ""},
{SCE_PASS_SHADOW, "SHADOW", 0, "Shadow", ""},
+ {SCE_PASS_POSITION, "POSITION", 0, "Position", ""},
{SCE_PASS_NORMAL, "NORMAL", 0, "Normal", ""},
{SCE_PASS_UV, "UV", 0, "UV", ""},
{SCE_PASS_ROUGHNESS, "ROUGHNESS", 0, "Roughness", ""},
@@ -177,6 +179,40 @@ static void engine_render(RenderEngine *engine, Depsgraph *depsgraph)
RNA_parameter_list_free(&list);
}
+static void engine_render_frame_finish(RenderEngine *engine)
+{
+ extern FunctionRNA rna_RenderEngine_render_frame_finish_func;
+ PointerRNA ptr;
+ ParameterList list;
+ FunctionRNA *func;
+
+ RNA_pointer_create(NULL, engine->type->rna_ext.srna, engine, &ptr);
+ func = &rna_RenderEngine_render_frame_finish_func;
+
+ RNA_parameter_list_create(&list, &ptr, func);
+ engine->type->rna_ext.call(NULL, &ptr, func, &list);
+
+ RNA_parameter_list_free(&list);
+}
+
+static void engine_draw(RenderEngine *engine, const struct bContext *context, Depsgraph *depsgraph)
+{
+ extern FunctionRNA rna_RenderEngine_draw_func;
+ PointerRNA ptr;
+ ParameterList list;
+ FunctionRNA *func;
+
+ RNA_pointer_create(NULL, engine->type->rna_ext.srna, engine, &ptr);
+ func = &rna_RenderEngine_draw_func;
+
+ RNA_parameter_list_create(&list, &ptr, func);
+ RNA_parameter_set_lookup(&list, "context", &context);
+ RNA_parameter_set_lookup(&list, "depsgraph", &depsgraph);
+ engine->type->rna_ext.call(NULL, &ptr, func, &list);
+
+ RNA_parameter_list_free(&list);
+}
+
static void engine_bake(RenderEngine *engine,
struct Depsgraph *depsgraph,
struct Object *object,
@@ -315,7 +351,7 @@ static StructRNA *rna_RenderEngine_register(Main *bmain,
RenderEngineType *et, dummyet = {NULL};
RenderEngine dummyengine = {NULL};
PointerRNA dummyptr;
- int have_function[8];
+ int have_function[9];
/* setup dummy engine & engine type to store static properties in */
dummyengine.type = &dummyet;
@@ -358,11 +394,13 @@ static StructRNA *rna_RenderEngine_register(Main *bmain,
et->update = (have_function[0]) ? engine_update : NULL;
et->render = (have_function[1]) ? engine_render : NULL;
- et->bake = (have_function[2]) ? engine_bake : NULL;
- et->view_update = (have_function[3]) ? engine_view_update : NULL;
- et->view_draw = (have_function[4]) ? engine_view_draw : NULL;
- et->update_script_node = (have_function[5]) ? engine_update_script_node : NULL;
- et->update_render_passes = (have_function[6]) ? engine_update_render_passes : NULL;
+ et->render_frame_finish = (have_function[2]) ? engine_render_frame_finish : NULL;
+ et->draw = (have_function[3]) ? engine_draw : NULL;
+ et->bake = (have_function[4]) ? engine_bake : NULL;
+ et->view_update = (have_function[5]) ? engine_view_update : NULL;
+ et->view_draw = (have_function[6]) ? engine_view_draw : NULL;
+ et->update_script_node = (have_function[7]) ? engine_update_script_node : NULL;
+ et->update_render_passes = (have_function[8]) ? engine_update_render_passes : NULL;
RE_engines_register(et);
@@ -519,6 +557,19 @@ static void rna_def_render_engine(BlenderRNA *brna)
parm = RNA_def_pointer(func, "depsgraph", "Depsgraph", "", "");
RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ func = RNA_def_function(srna, "render_frame_finish", NULL);
+ RNA_def_function_ui_description(
+ func, "Perform finishing operations after all view layers in a frame were rendered");
+ RNA_def_function_flag(func, FUNC_REGISTER_OPTIONAL | FUNC_ALLOW_WRITE);
+
+ func = RNA_def_function(srna, "draw", NULL);
+ RNA_def_function_ui_description(func, "Draw render image");
+ RNA_def_function_flag(func, FUNC_REGISTER_OPTIONAL);
+ parm = RNA_def_pointer(func, "context", "Context", "", "");
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_pointer(func, "depsgraph", "Depsgraph", "", "");
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+
func = RNA_def_function(srna, "bake", NULL);
RNA_def_function_ui_description(func, "Bake passes");
RNA_def_function_flag(func, FUNC_REGISTER_OPTIONAL | FUNC_ALLOW_WRITE);
@@ -641,6 +692,14 @@ static void rna_def_render_engine(BlenderRNA *brna)
parm = RNA_def_boolean(func, "do_break", 0, "Break", "");
RNA_def_function_return(func, parm);
+ func = RNA_def_function(srna, "pass_by_index_get", "RE_engine_pass_by_index_get");
+ parm = RNA_def_string(func, "layer", NULL, 0, "Layer", "Name of render layer to get pass for");
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_int(func, "index", 0, 0, INT_MAX, "Index", "Index of pass to get", 0, INT_MAX);
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_pointer(func, "render_pass", "RenderPass", "Index", "Index of pass to get");
+ RNA_def_function_return(func, parm);
+
func = RNA_def_function(srna, "active_view_get", "RE_engine_active_view_get");
parm = RNA_def_string(func, "view", NULL, 0, "View", "Single view active");
RNA_def_function_return(func, parm);
@@ -761,6 +820,22 @@ static void rna_def_render_engine(BlenderRNA *brna)
func = RNA_def_function(srna, "free_blender_memory", "RE_engine_free_blender_memory");
RNA_def_function_ui_description(func, "Free Blender side memory of render engine");
+ func = RNA_def_function(srna, "tile_highlight_set", "RE_engine_tile_highlight_set");
+ RNA_def_function_ui_description(func, "Set highlighted state of the given tile");
+ parm = RNA_def_int(func, "x", 0, 0, INT_MAX, "X", "", 0, INT_MAX);
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_int(func, "y", 0, 0, INT_MAX, "Y", "", 0, INT_MAX);
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_int(func, "width", 0, 0, INT_MAX, "Width", "", 0, INT_MAX);
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_int(func, "height", 0, 0, INT_MAX, "Height", "", 0, INT_MAX);
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+ parm = RNA_def_boolean(func, "highlight", 0, "Highlight", "");
+ RNA_def_parameter_flags(parm, 0, PARM_REQUIRED);
+
+ func = RNA_def_function(srna, "tile_highlight_clear_all", "RE_engine_tile_highlight_clear_all");
+ RNA_def_function_ui_description(func, "Clear highlight from all tiles");
+
RNA_define_verify_sdna(0);
prop = RNA_def_property(srna, "is_animation", PROP_BOOLEAN, PROP_NONE);
@@ -777,11 +852,6 @@ static void rna_def_render_engine(BlenderRNA *brna)
RNA_def_property_boolean_sdna(prop, NULL, "layer_override", 1);
RNA_def_property_array(prop, 20);
- prop = RNA_def_property(srna, "tile_x", PROP_INT, PROP_UNSIGNED);
- RNA_def_property_int_sdna(prop, NULL, "tile_x");
- prop = RNA_def_property(srna, "tile_y", PROP_INT, PROP_UNSIGNED);
- RNA_def_property_int_sdna(prop, NULL, "tile_y");
-
prop = RNA_def_property(srna, "resolution_x", PROP_INT, PROP_PIXEL);
RNA_def_property_int_sdna(prop, NULL, "resolution_x");
RNA_def_property_clear_flag(prop, PROP_EDITABLE);
@@ -880,12 +950,6 @@ static void rna_def_render_engine(BlenderRNA *brna)
"Don't expose Cycles and Eevee shading nodes in the node editor user "
"interface, so own nodes can be used instead");
- prop = RNA_def_property(srna, "bl_use_save_buffers", PROP_BOOLEAN, PROP_NONE);
- RNA_def_property_boolean_sdna(prop, NULL, "type->flag", RE_USE_SAVE_BUFFERS);
- RNA_def_property_flag(prop, PROP_REGISTER_OPTIONAL);
- RNA_def_property_ui_text(
- prop, "Use Save Buffers", "Support render to an on disk buffer during rendering");
-
prop = RNA_def_property(srna, "bl_use_spherical_stereo", PROP_BOOLEAN, PROP_NONE);
RNA_def_property_boolean_sdna(prop, NULL, "type->flag", RE_USE_SPHERICAL_STEREO);
RNA_def_property_flag(prop, PROP_REGISTER_OPTIONAL);
diff --git a/source/blender/makesrna/intern/rna_scene.c b/source/blender/makesrna/intern/rna_scene.c
index badaaa14aa4..e45d39a1ddc 100644
--- a/source/blender/makesrna/intern/rna_scene.c
+++ b/source/blender/makesrna/intern/rna_scene.c
@@ -532,7 +532,6 @@ const EnumPropertyItem rna_enum_stereo3d_interlace_type_items[] = {
const EnumPropertyItem rna_enum_bake_pass_filter_type_items[] = {
{R_BAKE_PASS_FILTER_NONE, "NONE", 0, "None", ""},
- {R_BAKE_PASS_FILTER_AO, "AO", 0, "Ambient Occlusion", ""},
{R_BAKE_PASS_FILTER_EMIT, "EMIT", 0, "Emit", ""},
{R_BAKE_PASS_FILTER_DIRECT, "DIRECT", 0, "Direct", ""},
{R_BAKE_PASS_FILTER_INDIRECT, "INDIRECT", 0, "Indirect", ""},
@@ -3525,6 +3524,16 @@ static void rna_def_sequencer_tool_settings(BlenderRNA *brna)
{0, NULL, 0, NULL, NULL},
};
+ static const EnumPropertyItem pivot_points[] = {
+ {V3D_AROUND_CENTER_MEDIAN, "MEDIAN", ICON_PIVOT_MEDIAN, "Median Point", ""},
+ {V3D_AROUND_LOCAL_ORIGINS,
+ "INDIVIDUAL_ORIGINS",
+ ICON_PIVOT_INDIVIDUAL,
+ "Individual Origins",
+ "Pivot around each selected island's own median point"},
+ {0, NULL, 0, NULL, NULL},
+ };
srna = RNA_def_struct(brna, "SequencerToolSettings", NULL);
RNA_def_struct_path_func(srna, "rna_SequencerToolSettings_path");
RNA_def_struct_ui_text(srna, "Sequencer Tool Settings", "");
@@ -3568,6 +3577,10 @@ static void rna_def_sequencer_tool_settings(BlenderRNA *brna)
prop = RNA_def_property(srna, "overlap_mode", PROP_ENUM, PROP_NONE);
RNA_def_property_enum_items(prop, scale_overlap_modes);
RNA_def_property_ui_text(prop, "Overlap Mode", "How to resolve overlap after transformation");
+
+ prop = RNA_def_property(srna, "pivot_point", PROP_ENUM, PROP_NONE);
+ RNA_def_property_enum_items(prop, pivot_points);
+ RNA_def_property_ui_text(prop, "Pivot Point", "Rotation or scaling pivot point");
}
static void rna_def_unified_paint_settings(BlenderRNA *brna)
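From Python the new pivot option is then set like this (assuming the settings struct is reachable as scene.tool_settings.sequencer_tool_settings, which is defined outside this hunk):

    import bpy

    seq_tool = bpy.context.scene.tool_settings.sequencer_tool_settings
    seq_tool.pivot_point = 'INDIVIDUAL_ORIGINS'  # or 'MEDIAN'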
@@ -4137,13 +4150,6 @@ void rna_def_view_layer_common(BlenderRNA *brna, StructRNA *srna, const bool sce
prop, "Cryptomatte Levels", "Sets how many unique objects can be distinguished per pixel");
RNA_def_property_ui_range(prop, 2.0, 16.0, 2.0, 0.0);
RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, "rna_ViewLayer_pass_update");
-
- prop = RNA_def_property(srna, "use_pass_cryptomatte_accurate", PROP_BOOLEAN, PROP_NONE);
- RNA_def_property_boolean_sdna(prop, NULL, "cryptomatte_flag", VIEW_LAYER_CRYPTOMATTE_ACCURATE);
- RNA_def_property_boolean_default(prop, true);
- RNA_def_property_ui_text(
- prop, "Cryptomatte Accurate", "Generate a more accurate cryptomatte pass");
- RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, "rna_ViewLayer_pass_update");
}
prop = RNA_def_property(srna, "use_solid", PROP_BOOLEAN, PROP_NONE);
@@ -4237,6 +4243,16 @@ void rna_def_view_layer_common(BlenderRNA *brna, StructRNA *srna, const bool sce
RNA_def_property_clear_flag(prop, PROP_EDITABLE);
}
+ prop = RNA_def_property(srna, "use_pass_position", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "passflag", SCE_PASS_POSITION);
+ RNA_def_property_ui_text(prop, "Position", "Deliver position pass");
+ if (scene) {
+ RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, "rna_ViewLayer_pass_update");
+ }
+ else {
+ RNA_def_property_clear_flag(prop, PROP_EDITABLE);
+ }
+
prop = RNA_def_property(srna, "use_pass_normal", PROP_BOOLEAN, PROP_NONE);
RNA_def_property_boolean_sdna(prop, NULL, "passflag", SCE_PASS_NORMAL);
RNA_def_property_ui_text(prop, "Normal", "Deliver normal pass");
@@ -5108,10 +5124,6 @@ static void rna_def_bake_data(BlenderRNA *brna)
RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
/* custom passes flags */
- prop = RNA_def_property(srna, "use_pass_ambient_occlusion", PROP_BOOLEAN, PROP_NONE);
- RNA_def_property_boolean_sdna(prop, NULL, "pass_filter", R_BAKE_PASS_FILTER_AO);
- RNA_def_property_ui_text(prop, "Ambient Occlusion", "Add ambient occlusion contribution");
-
prop = RNA_def_property(srna, "use_pass_emit", PROP_BOOLEAN, PROP_NONE);
RNA_def_property_boolean_sdna(prop, NULL, "pass_filter", R_BAKE_PASS_FILTER_EMIT);
RNA_def_property_ui_text(prop, "Emit", "Add emission contribution");
@@ -5920,29 +5932,6 @@ static void rna_def_scene_render_data(BlenderRNA *brna)
RNA_def_property_ui_text(prop, "Resolution %", "Percentage scale for render resolution");
RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, "rna_SceneSequencer_update");
- prop = RNA_def_property(srna, "tile_x", PROP_INT, PROP_PIXEL);
- RNA_def_property_int_sdna(prop, NULL, "tilex");
- RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
- RNA_def_property_range(prop, 8, 65536);
- RNA_def_property_ui_text(prop, "Tile X", "Horizontal tile size to use while rendering");
- RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
-
- prop = RNA_def_property(srna, "tile_y", PROP_INT, PROP_PIXEL);
- RNA_def_property_int_sdna(prop, NULL, "tiley");
- RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
- RNA_def_property_range(prop, 8, 65536);
- RNA_def_property_ui_text(prop, "Tile Y", "Vertical tile size to use while rendering");
- RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
-
- prop = RNA_def_property(srna, "preview_start_resolution", PROP_INT, PROP_NONE);
- RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
- RNA_def_property_range(prop, 8, 16384);
- RNA_def_property_ui_text(prop,
- "Start Resolution",
- "Resolution to start rendering preview at, "
- "progressively increasing it to the full viewport size");
- RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
-
prop = RNA_def_property(srna, "preview_pixel_size", PROP_ENUM, PROP_NONE);
RNA_def_property_enum_sdna(prop, NULL, "preview_pixel_size");
RNA_def_property_enum_items(prop, pixel_size_items);
@@ -6199,24 +6188,6 @@ static void rna_def_scene_render_data(BlenderRNA *brna)
RNA_def_property_clear_flag(prop, PROP_EDITABLE);
RNA_def_property_ui_text(prop, "Movie Format", "When true the format is a movie");
- prop = RNA_def_property(srna, "use_save_buffers", PROP_BOOLEAN, PROP_NONE);
- RNA_def_property_boolean_sdna(prop, NULL, "scemode", R_EXR_TILE_FILE);
- RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
- RNA_def_property_ui_text(
- prop,
- "Save Buffers",
- "Save tiles for all RenderLayers and SceneNodes to files in the temp directory "
- "(saves memory, required for Full Sample)");
- RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
-
- prop = RNA_def_property(srna, "use_full_sample", PROP_BOOLEAN, PROP_NONE);
- RNA_def_property_boolean_sdna(prop, NULL, "scemode", R_FULL_SAMPLE);
- RNA_def_property_ui_text(prop,
- "Full Sample",
- "Save for every anti-aliasing sample the entire RenderLayer results "
- "(this solves anti-aliasing issues with compositing)");
- RNA_def_property_update(prop, NC_SCENE | ND_RENDER_OPTIONS, NULL);
-
prop = RNA_def_property(srna, "use_lock_interface", PROP_BOOLEAN, PROP_NONE);
RNA_def_property_boolean_sdna(prop, NULL, "use_lock_interface", 1);
RNA_def_property_clear_flag(prop, PROP_ANIMATABLE);
diff --git a/source/blender/makesrna/intern/rna_sequencer.c b/source/blender/makesrna/intern/rna_sequencer.c
index cd87e4d10c1..b713ffb68b4 100644
--- a/source/blender/makesrna/intern/rna_sequencer.c
+++ b/source/blender/makesrna/intern/rna_sequencer.c
@@ -1442,6 +1442,12 @@ static void rna_def_strip_transform(BlenderRNA *brna)
RNA_def_property_ui_text(prop, "Rotation", "Rotate around image center");
RNA_def_property_update(prop, NC_SCENE | ND_SEQUENCER, "rna_SequenceTransform_update");
+ prop = RNA_def_property(srna, "origin", PROP_FLOAT, PROP_NONE);
+ RNA_def_property_float_sdna(prop, NULL, "origin");
+ RNA_def_property_ui_text(prop, "Origin", "Origin of image for transformation");
+ RNA_def_property_ui_range(prop, 0, 1, 1, 3);
+ RNA_def_property_update(prop, NC_SCENE | ND_SEQUENCER, "rna_SequenceTransform_update");
+
RNA_def_struct_path_func(srna, "rna_SequenceTransform_path");
}
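A sketch of the new origin property from Python; the DNA field behind it is presumably a two-element float, with the 0..1 soft range above placing (0.5, 0.5) at the image center:

    import bpy

    strip = bpy.context.scene.sequence_editor.active_strip
    strip.transform.origin = (0.5, 0.5)  # pivot transforms around the center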
diff --git a/source/blender/makesrna/intern/rna_space.c b/source/blender/makesrna/intern/rna_space.c
index 8c331bd1911..a05cef7a1cd 100644
--- a/source/blender/makesrna/intern/rna_space.c
+++ b/source/blender/makesrna/intern/rna_space.c
@@ -5371,6 +5371,11 @@ static void rna_def_space_sequencer_preview_overlay(BlenderRNA *brna)
RNA_def_property_boolean_sdna(prop, NULL, "flag", SEQ_PREVIEW_SHOW_GPENCIL);
RNA_def_property_ui_text(prop, "Show Annotation", "Show annotations for this view");
RNA_def_property_update(prop, NC_SPACE | ND_SPACE_SEQUENCER, NULL);
+
+ prop = RNA_def_property(srna, "show_image_outline", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "flag", SEQ_PREVIEW_SHOW_OUTLINE_SELECTED);
+ RNA_def_property_ui_text(prop, "Image Outline", "");
+ RNA_def_property_update(prop, NC_SPACE | ND_SPACE_SEQUENCER, NULL);
}
static void rna_def_space_sequencer_timeline_overlay(BlenderRNA *brna)
@@ -5439,6 +5444,11 @@ static void rna_def_space_sequencer_timeline_overlay(BlenderRNA *brna)
RNA_def_property_boolean_sdna(prop, NULL, "flag", SEQ_TIMELINE_SHOW_STRIP_OFFSETS);
RNA_def_property_ui_text(prop, "Show Offsets", "Display strip in/out offsets");
RNA_def_property_update(prop, NC_SPACE | ND_SPACE_SEQUENCER, NULL);
+
+ prop = RNA_def_property(srna, "show_thumbnails", PROP_BOOLEAN, PROP_NONE);
+ RNA_def_property_boolean_sdna(prop, NULL, "flag", SEQ_TIMELINE_SHOW_THUMBNAILS);
+ RNA_def_property_ui_text(prop, "Show Thumbnails", "Show strip thumbnails");
+ RNA_def_property_update(prop, NC_SPACE | ND_SPACE_SEQUENCER, NULL);
}
static void rna_def_space_sequencer(BlenderRNA *brna)
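Both overlay toggles are reachable from a sequencer space; a sketch (the preview_overlay and timeline_overlay accessors are assumed to be defined elsewhere in this file):

    import bpy

    for area in bpy.context.screen.areas:
        if area.type == 'SEQUENCE_EDITOR':
            space = area.spaces.active
            space.preview_overlay.show_image_outline = True
            space.timeline_overlay.show_thumbnails = True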
diff --git a/source/blender/makesrna/intern/rna_workspace_api.c b/source/blender/makesrna/intern/rna_workspace_api.c
index a2bb89dd5ee..15230f1198b 100644
--- a/source/blender/makesrna/intern/rna_workspace_api.c
+++ b/source/blender/makesrna/intern/rna_workspace_api.c
@@ -29,6 +29,7 @@
#include "DNA_object_types.h"
#include "DNA_windowmanager_types.h"
+#include "DNA_workspace_types.h"
#include "RNA_enum_types.h" /* own include */
@@ -51,6 +52,7 @@ static void rna_WorkSpaceTool_setup(ID *id,
const char *data_block,
const char *op_idname,
int index,
+ int options,
const char *idname_fallback,
const char *keymap_fallback)
{
@@ -62,6 +64,7 @@ static void rna_WorkSpaceTool_setup(ID *id,
STRNCPY(tref_rt.data_block, data_block);
STRNCPY(tref_rt.op, op_idname);
tref_rt.index = index;
+ tref_rt.flag = options;
/* While it's logical to assign both these values from setup,
* it's useful to store this in DNA for re-use; exceptional case: write to the 'tref'. */
@@ -131,6 +134,11 @@ void RNA_api_workspace_tool(StructRNA *srna)
PropertyRNA *parm;
FunctionRNA *func;
+ static EnumPropertyItem options_items[] = {
+ {TOOLREF_FLAG_FALLBACK_KEYMAP, "KEYMAP_FALLBACK", 0, "Fallback", ""},
+ {0, NULL, 0, NULL, NULL},
+ };
+
func = RNA_def_function(srna, "setup", "rna_WorkSpaceTool_setup");
RNA_def_function_flag(func, FUNC_USE_SELF_ID | FUNC_USE_CONTEXT);
RNA_def_function_ui_description(func, "Set the tool settings");
@@ -146,6 +154,7 @@ void RNA_api_workspace_tool(StructRNA *srna)
RNA_def_string(func, "data_block", NULL, MAX_NAME, "Data Block", "");
RNA_def_string(func, "operator", NULL, MAX_NAME, "Operator", "");
RNA_def_int(func, "index", 0, INT_MIN, INT_MAX, "Index", "", INT_MIN, INT_MAX);
+ RNA_def_enum_flag(func, "options", options_items, 0, "Tool Options", "");
RNA_def_string(func, "idname_fallback", NULL, MAX_NAME, "Fallback Identifier", "");
RNA_def_string(func, "keymap_fallback", NULL, KMAP_MAX_NAME, "Fallback Key Map", "");
diff --git a/source/blender/modifiers/intern/MOD_nodes.cc b/source/blender/modifiers/intern/MOD_nodes.cc
index 6b976b016e1..8c02c83d479 100644
--- a/source/blender/modifiers/intern/MOD_nodes.cc
+++ b/source/blender/modifiers/intern/MOD_nodes.cc
@@ -68,6 +68,8 @@
#include "UI_interface.h"
#include "UI_resources.h"
+#include "BLT_translation.h"
+
#include "WM_types.h"
#include "RNA_access.h"
@@ -1090,17 +1092,29 @@ static void panel_draw(const bContext *C, Panel *panel)
}
/* Draw node warnings. */
+ bool has_legacy_node = false;
if (nmd->runtime_eval_log != nullptr) {
const geo_log::ModifierLog &log = *static_cast<geo_log::ModifierLog *>(nmd->runtime_eval_log);
- log.foreach_node_log([layout](const geo_log::NodeLog &node_log) {
+ log.foreach_node_log([&](const geo_log::NodeLog &node_log) {
for (const geo_log::NodeWarning &warning : node_log.warnings()) {
- if (warning.type != geo_log::NodeWarningType::Info) {
+ if (warning.type == geo_log::NodeWarningType::Legacy) {
+ has_legacy_node = true;
+ }
+ else if (warning.type != geo_log::NodeWarningType::Info) {
uiItemL(layout, warning.message.c_str(), ICON_ERROR);
}
}
});
}
+ if (USER_EXPERIMENTAL_TEST(&U, use_geometry_nodes_fields) && has_legacy_node) {
+ uiLayout *row = uiLayoutRow(layout, false);
+ uiItemL(row, IFACE_("Node tree has legacy node"), ICON_ERROR);
+ uiLayout *sub = uiLayoutRow(row, false);
+ uiLayoutSetAlignment(sub, UI_LAYOUT_ALIGN_RIGHT);
+ uiItemO(sub, "", ICON_VIEWZOOM, "NODE_OT_geometry_node_view_legacy");
+ }
+
modifier_panel_end(layout, ptr);
}
diff --git a/source/blender/modifiers/intern/MOD_nodes_evaluator.cc b/source/blender/modifiers/intern/MOD_nodes_evaluator.cc
index 56de0f87ed8..e50c07ce6f2 100644
--- a/source/blender/modifiers/intern/MOD_nodes_evaluator.cc
+++ b/source/blender/modifiers/intern/MOD_nodes_evaluator.cc
@@ -26,6 +26,8 @@
#include "FN_generic_value_map.hh"
#include "FN_multi_function.hh"
+#include "BLT_translation.h"
+
#include "BLI_enumerable_thread_specific.hh"
#include "BLI_stack.hh"
#include "BLI_task.h"
@@ -868,6 +870,12 @@ class GeometryNodesEvaluator {
NodeParamsProvider params_provider{*this, node, node_state};
GeoNodeExecParams params{params_provider};
+ if (USER_EXPERIMENTAL_TEST(&U, use_geometry_nodes_fields)) {
+ if (node->idname().find("Legacy") != StringRef::not_found) {
+ params.error_message_add(geo_log::NodeWarningType::Legacy,
+ TIP_("Legacy node will be removed before Blender 4.0"));
+ }
+ }
bnode.typeinfo->geometry_node_execute(params);
}
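The evaluator tags a node as legacy purely by substring-matching "Legacy" in its idname. The same audit is easy from Python; a sketch:

    import bpy

    for tree in bpy.data.node_groups:
        if tree.bl_idname != "GeometryNodeTree":
            continue
        for node in tree.nodes:
            # Mirrors the StringRef::find("Legacy") test above.
            if "Legacy" in node.bl_idname:
                print(f"{tree.name}: legacy node {node.name} ({node.bl_idname})")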
diff --git a/source/blender/nodes/CMakeLists.txt b/source/blender/nodes/CMakeLists.txt
index b0fc55fab0c..a8795649ede 100644
--- a/source/blender/nodes/CMakeLists.txt
+++ b/source/blender/nodes/CMakeLists.txt
@@ -139,6 +139,9 @@ set(SRC
function/nodes/node_fn_input_string.cc
function/nodes/node_fn_input_vector.cc
function/nodes/node_fn_random_float.cc
+ function/nodes/node_fn_string_length.cc
+ function/nodes/node_fn_string_substring.cc
+ function/nodes/node_fn_value_to_string.cc
function/node_function_util.cc
geometry/nodes/legacy/node_geo_material_assign.cc
@@ -161,6 +164,7 @@ set(SRC
geometry/nodes/node_geo_attribute_remove.cc
geometry/nodes/node_geo_attribute_sample_texture.cc
geometry/nodes/node_geo_attribute_separate_xyz.cc
+ geometry/nodes/node_geo_attribute_statistic.cc
geometry/nodes/node_geo_attribute_transfer.cc
geometry/nodes/node_geo_attribute_vector_math.cc
geometry/nodes/node_geo_attribute_vector_rotate.cc
@@ -169,9 +173,11 @@ set(SRC
geometry/nodes/node_geo_collection_info.cc
geometry/nodes/node_geo_common.cc
geometry/nodes/node_geo_convex_hull.cc
+ geometry/nodes/node_geo_curve_sample.cc
geometry/nodes/node_geo_curve_endpoints.cc
geometry/nodes/node_geo_curve_fill.cc
geometry/nodes/node_geo_curve_length.cc
+ geometry/nodes/node_geo_curve_parameter.cc
geometry/nodes/node_geo_curve_primitive_bezier_segment.cc
geometry/nodes/node_geo_curve_primitive_circle.cc
geometry/nodes/node_geo_curve_primitive_line.cc
@@ -193,6 +199,7 @@ set(SRC
geometry/nodes/node_geo_input_material.cc
geometry/nodes/node_geo_input_normal.cc
geometry/nodes/node_geo_input_position.cc
+ geometry/nodes/node_geo_input_tangent.cc
geometry/nodes/node_geo_input_index.cc
geometry/nodes/node_geo_is_viewport.cc
geometry/nodes/node_geo_join_geometry.cc
@@ -221,6 +228,7 @@ set(SRC
geometry/nodes/node_geo_realize_instances.cc
geometry/nodes/node_geo_separate_components.cc
geometry/nodes/node_geo_set_position.cc
+ geometry/nodes/node_geo_string_join.cc
geometry/nodes/node_geo_subdivision_surface.cc
geometry/nodes/node_geo_switch.cc
geometry/nodes/node_geo_transform.cc
diff --git a/source/blender/nodes/NOD_function.h b/source/blender/nodes/NOD_function.h
index 29f1a465491..a67458418f2 100644
--- a/source/blender/nodes/NOD_function.h
+++ b/source/blender/nodes/NOD_function.h
@@ -26,6 +26,9 @@ void register_node_type_fn_float_to_int(void);
void register_node_type_fn_input_string(void);
void register_node_type_fn_input_vector(void);
void register_node_type_fn_random_float(void);
+void register_node_type_fn_string_length(void);
+void register_node_type_fn_string_substring(void);
+void register_node_type_fn_value_to_string(void);
#ifdef __cplusplus
}
diff --git a/source/blender/nodes/NOD_geometry.h b/source/blender/nodes/NOD_geometry.h
index 0d31ae2143a..24f60263d8a 100644
--- a/source/blender/nodes/NOD_geometry.h
+++ b/source/blender/nodes/NOD_geometry.h
@@ -48,6 +48,7 @@ void register_node_type_geo_attribute_proximity(void);
void register_node_type_geo_attribute_randomize(void);
void register_node_type_geo_attribute_remove(void);
void register_node_type_geo_attribute_separate_xyz(void);
+void register_node_type_geo_attribute_statistic(void);
void register_node_type_geo_attribute_transfer(void);
void register_node_type_geo_attribute_vector_math(void);
void register_node_type_geo_attribute_vector_rotate(void);
@@ -58,6 +59,8 @@ void register_node_type_geo_convex_hull(void);
void register_node_type_geo_curve_endpoints(void);
void register_node_type_geo_curve_fill(void);
void register_node_type_geo_curve_length(void);
+void register_node_type_geo_curve_parameter(void);
+void register_node_type_geo_curve_sample(void);
void register_node_type_geo_curve_primitive_bezier_segment(void);
void register_node_type_geo_curve_primitive_circle(void);
void register_node_type_geo_curve_primitive_line(void);
@@ -79,6 +82,7 @@ void register_node_type_geo_input_index(void);
void register_node_type_geo_input_material(void);
void register_node_type_geo_input_normal(void);
void register_node_type_geo_input_position(void);
+void register_node_type_geo_input_tangent(void);
void register_node_type_geo_is_viewport(void);
void register_node_type_geo_join_geometry(void);
void register_node_type_geo_material_assign(void);
@@ -108,6 +112,7 @@ void register_node_type_geo_sample_texture(void);
void register_node_type_geo_select_by_handle_type(void);
void register_node_type_geo_separate_components(void);
void register_node_type_geo_set_position(void);
+void register_node_type_geo_string_join(void);
void register_node_type_geo_subdivision_surface(void);
void register_node_type_geo_switch(void);
void register_node_type_geo_transform(void);
diff --git a/source/blender/nodes/NOD_geometry_nodes_eval_log.hh b/source/blender/nodes/NOD_geometry_nodes_eval_log.hh
index 00d97b24646..ff8e137e341 100644
--- a/source/blender/nodes/NOD_geometry_nodes_eval_log.hh
+++ b/source/blender/nodes/NOD_geometry_nodes_eval_log.hh
@@ -131,6 +131,7 @@ enum class NodeWarningType {
Error,
Warning,
Info,
+ Legacy,
};
struct NodeWarning {
diff --git a/source/blender/nodes/NOD_static_types.h b/source/blender/nodes/NOD_static_types.h
index b2f1fa5e83a..8fb18e839a7 100644
--- a/source/blender/nodes/NOD_static_types.h
+++ b/source/blender/nodes/NOD_static_types.h
@@ -267,7 +267,10 @@ DefNode(FunctionNode, FN_NODE_FLOAT_COMPARE, def_float_compare, "FLOAT_COMPARE",
DefNode(FunctionNode, FN_NODE_FLOAT_TO_INT, def_float_to_int, "FLOAT_TO_INT", FloatToInt, "Float to Integer", "")
DefNode(FunctionNode, FN_NODE_INPUT_STRING, def_fn_input_string, "INPUT_STRING", InputString, "String", "")
DefNode(FunctionNode, FN_NODE_INPUT_VECTOR, def_fn_input_vector, "INPUT_VECTOR", InputVector, "Vector", "")
-DefNode(FunctionNode, FN_NODE_RANDOM_FLOAT, 0, "RANDOM_FLOAT", RandomFloat, "Random Float", "")
+DefNode(FunctionNode, FN_NODE_RANDOM_FLOAT, 0, "RANDOM_FLOAT", RandomFloat, "Random Float", "")
+DefNode(FunctionNode, FN_NODE_VALUE_TO_STRING, 0, "VALUE_TO_STRING", ValueToString, "Value to String", "")
+DefNode(FunctionNode, FN_NODE_STRING_LENGTH, 0, "STRING_LENGTH", StringLength, "String Length", "")
+DefNode(FunctionNode, FN_NODE_STRING_SUBSTRING, 0, "STRING_SUBSTRING", StringSubstring, "String Substring", "")
DefNode(GeometryNode, GEO_NODE_LECAGY_ATTRIBUTE_CLAMP, def_geo_attribute_clamp, "LEGACY_ATTRIBUTE_CLAMP", LegacyAttributeClamp, "Attribute Clamp", "")
DefNode(GeometryNode, GEO_NODE_LEGACY_ALIGN_ROTATION_TO_VECTOR, def_geo_align_rotation_to_vector, "LEGACY_ALIGN_ROTATION_TO_VECTOR", LegacyAlignRotationToVector, "Align Rotation to Vector", "")
@@ -307,13 +310,16 @@ DefNode(GeometryNode, GEO_NODE_LEGACY_SELECT_BY_MATERIAL, 0, "LEGACY_SELECT_BY_M
DefNode(GeometryNode, GEO_NODE_ATTRIBUTE_CAPTURE, def_geo_attribute_capture, "ATTRIBUTE_CAPTURE", AttributeCapture, "Attribute Capture", "")
DefNode(GeometryNode, GEO_NODE_ATTRIBUTE_REMOVE, 0, "ATTRIBUTE_REMOVE", AttributeRemove, "Attribute Remove", "")
DefNode(GeometryNode, GEO_NODE_ATTRIBUTE_VECTOR_ROTATE, def_geo_attribute_vector_rotate, "LEGACY_ATTRIBUTE_VECTOR_ROTATE", LegacyAttributeVectorRotate, "Attribute Vector Rotate", "")
+DefNode(GeometryNode, GEO_NODE_ATTRIBUTE_STATISTIC, def_geo_attribute_statistic, "ATTRIBUTE_STATISTIC", AttributeStatistic, "Attribute Statistic", "")
DefNode(GeometryNode, GEO_NODE_BOOLEAN, def_geo_boolean, "BOOLEAN", Boolean, "Boolean", "")
DefNode(GeometryNode, GEO_NODE_BOUNDING_BOX, 0, "BOUNDING_BOX", BoundBox, "Bounding Box", "")
DefNode(GeometryNode, GEO_NODE_COLLECTION_INFO, def_geo_collection_info, "COLLECTION_INFO", CollectionInfo, "Collection Info", "")
DefNode(GeometryNode, GEO_NODE_CONVEX_HULL, 0, "CONVEX_HULL", ConvexHull, "Convex Hull", "")
+DefNode(GeometryNode, GEO_NODE_CURVE_SAMPLE, def_geo_curve_sample, "CURVE_SAMPLE", CurveSample, "Curve Sample", "")
DefNode(GeometryNode, GEO_NODE_CURVE_ENDPOINTS, 0, "CURVE_ENDPOINTS", CurveEndpoints, "Curve Endpoints", "")
DefNode(GeometryNode, GEO_NODE_CURVE_FILL, def_geo_curve_fill, "CURVE_FILL", CurveFill, "Curve Fill", "")
DefNode(GeometryNode, GEO_NODE_CURVE_LENGTH, 0, "CURVE_LENGTH", CurveLength, "Curve Length", "")
+DefNode(GeometryNode, GEO_NODE_CURVE_PARAMETER, 0, "CURVE_PARAMETER", CurveParameter, "Curve Parameter", "")
DefNode(GeometryNode, GEO_NODE_CURVE_PRIMITIVE_BEZIER_SEGMENT, def_geo_curve_primitive_bezier_segment, "CURVE_PRIMITIVE_BEZIER_SEGMENT", CurvePrimitiveBezierSegment, "Bezier Segment", "")
DefNode(GeometryNode, GEO_NODE_CURVE_PRIMITIVE_CIRCLE, def_geo_curve_primitive_circle, "CURVE_PRIMITIVE_CIRCLE", CurvePrimitiveCircle, "Curve Circle", "")
DefNode(GeometryNode, GEO_NODE_CURVE_PRIMITIVE_LINE, def_geo_curve_primitive_line, "CURVE_PRIMITIVE_LINE", CurvePrimitiveLine, "Curve Line", "")
@@ -330,6 +336,7 @@ DefNode(GeometryNode, GEO_NODE_INPUT_INDEX, 0, "INDEX", InputIndex, "Index", "")
DefNode(GeometryNode, GEO_NODE_INPUT_MATERIAL, def_geo_input_material, "INPUT_MATERIAL", InputMaterial, "Material", "")
DefNode(GeometryNode, GEO_NODE_INPUT_NORMAL, 0, "INPUT_NORMAL", InputNormal, "Normal", "")
DefNode(GeometryNode, GEO_NODE_INPUT_POSITION, 0, "POSITION", InputPosition, "Position", "")
+DefNode(GeometryNode, GEO_NODE_INPUT_TANGENT, 0, "INPUT_TANGENT", InputTangent, "Curve Tangent", "")
DefNode(GeometryNode, GEO_NODE_IS_VIEWPORT, 0, "IS_VIEWPORT", IsViewport, "Is Viewport", "")
DefNode(GeometryNode, GEO_NODE_JOIN_GEOMETRY, 0, "JOIN_GEOMETRY", JoinGeometry, "Join Geometry", "")
DefNode(GeometryNode, GEO_NODE_MATERIAL_ASSIGN, 0, "MATERIAL_ASSIGN", MaterialAssign, "Material Assign", "")
@@ -348,6 +355,7 @@ DefNode(GeometryNode, GEO_NODE_OBJECT_INFO, def_geo_object_info, "OBJECT_INFO",
DefNode(GeometryNode, GEO_NODE_REALIZE_INSTANCES, 0, "REALIZE_INSTANCES", RealizeInstances, "Realize Instances", "")
DefNode(GeometryNode, GEO_NODE_SEPARATE_COMPONENTS, 0, "SEPARATE_COMPONENTS", SeparateComponents, "Separate Components", "")
DefNode(GeometryNode, GEO_NODE_SET_POSITION, 0, "SET_POSITION", SetPosition, "Set Position", "")
+DefNode(GeometryNode, GEO_NODE_STRING_JOIN, 0, "STRING_JOIN", StringJoin, "String Join", "")
DefNode(GeometryNode, GEO_NODE_SUBDIVISION_SURFACE, def_geo_subdivision_surface, "SUBDIVISION_SURFACE", SubdivisionSurface, "Subdivision Surface", "")
DefNode(GeometryNode, GEO_NODE_SWITCH, def_geo_switch, "SWITCH", Switch, "Switch", "")
DefNode(GeometryNode, GEO_NODE_TRANSFORM, 0, "TRANSFORM", Transform, "Transform", "")
diff --git a/source/blender/nodes/composite/nodes/node_composite_image.c b/source/blender/nodes/composite/nodes/node_composite_image.c
index 243300b0a44..a56dfea9dbf 100644
--- a/source/blender/nodes/composite/nodes/node_composite_image.c
+++ b/source/blender/nodes/composite/nodes/node_composite_image.c
@@ -45,7 +45,7 @@ static bNodeSocketTemplate cmp_node_rlayers_out[] = {
{SOCK_VECTOR, N_(RE_PASSNAME_NORMAL), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
{SOCK_VECTOR, N_(RE_PASSNAME_UV), 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
{SOCK_VECTOR, N_(RE_PASSNAME_VECTOR), 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
- {SOCK_RGBA, N_(RE_PASSNAME_DEPRECATED), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
+ {SOCK_VECTOR, N_(RE_PASSNAME_POSITION), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
{SOCK_RGBA, N_(RE_PASSNAME_DEPRECATED), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
{SOCK_RGBA, N_(RE_PASSNAME_DEPRECATED), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
{SOCK_RGBA, N_(RE_PASSNAME_SHADOW), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
@@ -72,7 +72,7 @@ static bNodeSocketTemplate cmp_node_rlayers_out[] = {
{SOCK_RGBA, N_(RE_PASSNAME_SUBSURFACE_COLOR), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f},
{-1, ""},
};
-#define MAX_LEGACY_SOCKET_INDEX 30
+#define NUM_LEGACY_SOCKETS (ARRAY_SIZE(cmp_node_rlayers_out) - 1)
static void cmp_node_image_add_pass_output(bNodeTree *ntree,
bNode *node,
@@ -382,7 +382,7 @@ static void cmp_node_image_verify_outputs(bNodeTree *ntree, bNode *node, bool rl
break;
}
}
- if (!link && (!rlayer || sock_index > MAX_LEGACY_SOCKET_INDEX)) {
+ if (!link && (!rlayer || sock_index >= NUM_LEGACY_SOCKETS)) {
MEM_freeN(sock->storage);
nodeRemoveSocket(ntree, node, sock);
}
@@ -468,43 +468,12 @@ void node_cmp_rlayers_outputs(bNodeTree *ntree, bNode *node)
const char *node_cmp_rlayers_sock_to_pass(int sock_index)
{
- const char *sock_to_passname[] = {
- RE_PASSNAME_COMBINED,
- RE_PASSNAME_COMBINED,
- RE_PASSNAME_Z,
- RE_PASSNAME_NORMAL,
- RE_PASSNAME_UV,
- RE_PASSNAME_VECTOR,
- RE_PASSNAME_DEPRECATED,
- RE_PASSNAME_DEPRECATED,
- RE_PASSNAME_DEPRECATED,
- RE_PASSNAME_SHADOW,
- RE_PASSNAME_AO,
- RE_PASSNAME_DEPRECATED,
- RE_PASSNAME_DEPRECATED,
- RE_PASSNAME_DEPRECATED,
- RE_PASSNAME_INDEXOB,
- RE_PASSNAME_INDEXMA,
- RE_PASSNAME_MIST,
- RE_PASSNAME_EMIT,
- RE_PASSNAME_ENVIRONMENT,
- RE_PASSNAME_DIFFUSE_DIRECT,
- RE_PASSNAME_DIFFUSE_INDIRECT,
- RE_PASSNAME_DIFFUSE_COLOR,
- RE_PASSNAME_GLOSSY_DIRECT,
- RE_PASSNAME_GLOSSY_INDIRECT,
- RE_PASSNAME_GLOSSY_COLOR,
- RE_PASSNAME_TRANSM_DIRECT,
- RE_PASSNAME_TRANSM_INDIRECT,
- RE_PASSNAME_TRANSM_COLOR,
- RE_PASSNAME_SUBSURFACE_DIRECT,
- RE_PASSNAME_SUBSURFACE_INDIRECT,
- RE_PASSNAME_SUBSURFACE_COLOR,
- };
- if (sock_index > MAX_LEGACY_SOCKET_INDEX) {
+ if (sock_index >= NUM_LEGACY_SOCKETS) {
return NULL;
}
- return sock_to_passname[sock_index];
+ const char *name = cmp_node_rlayers_out[sock_index].name;
+ /* Exception for alpha, which is derived from Combined. */
+ return (STREQ(name, "Alpha")) ? RE_PASSNAME_COMBINED : name;
}
static void node_composit_init_rlayers(const bContext *C, PointerRNA *ptr)
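The refactor replaces the hand-maintained sock_to_passname table with a lookup into the socket template array itself. Reference behavior in rough Python terms:

    # Sketch of node_cmp_rlayers_sock_to_pass() after the change; the last
    # template entry is the {-1, ""} terminator, hence the len() - 1.
    def sock_to_pass(templates, sock_index):
        if sock_index >= len(templates) - 1:
            return None
        name = templates[sock_index].name
        # Alpha has no pass of its own; it is derived from Combined.
        return "Combined" if name == "Alpha" else name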
diff --git a/source/blender/nodes/function/nodes/node_fn_string_length.cc b/source/blender/nodes/function/nodes/node_fn_string_length.cc
new file mode 100644
index 00000000000..a0f85dfd2bf
--- /dev/null
+++ b/source/blender/nodes/function/nodes/node_fn_string_length.cc
@@ -0,0 +1,49 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "BLI_string_utf8.h"
+
+#include <iomanip>
+
+#include "node_function_util.hh"
+
+namespace blender::nodes {
+
+static void fn_node_string_length_declare(NodeDeclarationBuilder &b)
+{
+ b.add_input<decl::String>("String");
+ b.add_output<decl::Int>("Length");
+}
+
+} // namespace blender::nodes
+
+static void fn_node_string_length_build_multi_function(
+ blender::nodes::NodeMultiFunctionBuilder &builder)
+{
+ static blender::fn::CustomMF_SI_SO<std::string, int> str_len_fn{
+ "String Length", [](const std::string &a) { return BLI_strlen_utf8(a.c_str()); }};
+ builder.set_matching_fn(&str_len_fn);
+}
+
+void register_node_type_fn_string_length()
+{
+ static bNodeType ntype;
+
+ fn_node_type_base(&ntype, FN_NODE_STRING_LENGTH, "String Length", NODE_CLASS_CONVERTER, 0);
+ ntype.declare = blender::nodes::fn_node_string_length_declare;
+ ntype.build_multi_function = fn_node_string_length_build_multi_function;
+ nodeRegisterType(&ntype);
+}
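BLI_strlen_utf8() counts code points, so the node's output matches Python's len() on text rather than a byte count. A quick reference check:

    s = "héllo"
    print(len(s))                  # 5 code points, what the node outputs
    print(len(s.encode("utf-8")))  # 6 bytes, what a plain strlen() would report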
diff --git a/source/blender/nodes/function/nodes/node_fn_string_substring.cc b/source/blender/nodes/function/nodes/node_fn_string_substring.cc
new file mode 100644
index 00000000000..55a01093ae9
--- /dev/null
+++ b/source/blender/nodes/function/nodes/node_fn_string_substring.cc
@@ -0,0 +1,54 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "BLI_string_utf8.h"
+
+#include "node_function_util.hh"
+
+namespace blender::nodes {
+
+static void fn_node_string_substring_declare(NodeDeclarationBuilder &b)
+{
+ b.add_input<decl::String>("String");
+ b.add_input<decl::Int>("Position");
+ b.add_input<decl::Int>("Length").min(0);
+ b.add_output<decl::String>("String");
+}
+
+} // namespace blender::nodes
+
+static void fn_node_string_substring_build_multi_function(
+ blender::nodes::NodeMultiFunctionBuilder &builder)
+{
+ static blender::fn::CustomMF_SI_SI_SI_SO<std::string, int, int, std::string> substring_fn{
+ "Substring", [](const std::string &str, int a, int b) {
+ const int len = BLI_strlen_utf8(str.c_str());
+ const int start = BLI_str_utf8_offset_from_index(str.c_str(), std::clamp(a, 0, len));
+ const int end = BLI_str_utf8_offset_from_index(str.c_str(), std::clamp(a + b, 0, len));
+ return str.substr(start, std::max<int>(end - start, 0));
+ }};
+ builder.set_matching_fn(&substring_fn);
+}
+
+void register_node_type_fn_string_substring()
+{
+ static bNodeType ntype;
+
+ fn_node_type_base(&ntype, FN_NODE_STRING_SUBSTRING, "String Substring", NODE_CLASS_CONVERTER, 0);
+ ntype.declare = blender::nodes::fn_node_string_substring_declare;
+ ntype.build_multi_function = fn_node_string_substring_build_multi_function;
+ nodeRegisterType(&ntype);
+}
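The lambda clamps both ends of the range over code points, so out-of-range inputs shrink the result instead of erroring. Reference behavior in Python (note that a negative position still consumes part of the length, since the end is computed from the unclamped position):

    def substring(s: str, position: int, length: int) -> str:
        n = len(s)
        start = min(max(position, 0), n)
        end = min(max(position + length, 0), n)
        return s[start:end] if end > start else ""

    assert substring("blender", 1, 3) == "len"
    assert substring("blender", 5, 99) == "er"  # length clamped to the end
    assert substring("blender", -2, 3) == "b"   # start clamped to 0, end stays at 1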
diff --git a/source/blender/nodes/function/nodes/node_fn_value_to_string.cc b/source/blender/nodes/function/nodes/node_fn_value_to_string.cc
new file mode 100644
index 00000000000..c1e6373cb6d
--- /dev/null
+++ b/source/blender/nodes/function/nodes/node_fn_value_to_string.cc
@@ -0,0 +1,51 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "node_function_util.hh"
+#include <iomanip>
+
+namespace blender::nodes {
+
+static void fn_node_value_to_string_declare(NodeDeclarationBuilder &b)
+{
+ b.add_input<decl::Float>("Value");
+ b.add_input<decl::Int>("Decimals").min(0);
+ b.add_output<decl::String>("String");
+}
+
+} // namespace blender::nodes
+
+static void fn_node_value_to_string_build_multi_function(
+ blender::nodes::NodeMultiFunctionBuilder &builder)
+{
+ static blender::fn::CustomMF_SI_SI_SO<float, int, std::string> to_str_fn{
+ "Value To String", [](float a, int b) {
+ std::stringstream stream;
+ stream << std::fixed << std::setprecision(std::max(0, b)) << a;
+ return stream.str();
+ }};
+ builder.set_matching_fn(&to_str_fn);
+}
+
+void register_node_type_fn_value_to_string()
+{
+ static bNodeType ntype;
+
+ fn_node_type_base(&ntype, FN_NODE_VALUE_TO_STRING, "Value to String", NODE_CLASS_CONVERTER, 0);
+ ntype.declare = blender::nodes::fn_node_value_to_string_declare;
+ ntype.build_multi_function = fn_node_value_to_string_build_multi_function;
+ nodeRegisterType(&ntype);
+}
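The stream formatting is plain fixed-point with a clamped precision; the Python equivalent is a one-liner:

    def value_to_string(value: float, decimals: int) -> str:
        # std::fixed + setprecision(max(0, decimals)) from the lambda above.
        return f"{value:.{max(0, decimals)}f}"

    assert value_to_string(3.14159, 2) == "3.14"
    assert value_to_string(3.14159, -1) == "3"  # negative decimals clamp to 0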
diff --git a/source/blender/nodes/geometry/nodes/node_geo_attribute_statistic.cc b/source/blender/nodes/geometry/nodes/node_geo_attribute_statistic.cc
new file mode 100644
index 00000000000..5001034518c
--- /dev/null
+++ b/source/blender/nodes/geometry/nodes/node_geo_attribute_statistic.cc
@@ -0,0 +1,378 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <algorithm>
+#include <numeric>
+
+#include "UI_interface.h"
+#include "UI_resources.h"
+
+#include "BLI_math_base_safe.h"
+
+#include "node_geometry_util.hh"
+
+namespace blender::nodes {
+
+static void geo_node_attribute_statistic_declare(NodeDeclarationBuilder &b)
+{
+ b.add_input<decl::Geometry>("Geometry");
+ b.add_input<decl::Float>("Attribute").hide_value();
+ b.add_input<decl::Vector>("Attribute", "Attribute_001").hide_value();
+
+ b.add_output<decl::Float>("Mean");
+ b.add_output<decl::Float>("Median");
+ b.add_output<decl::Float>("Sum");
+ b.add_output<decl::Float>("Min");
+ b.add_output<decl::Float>("Max");
+ b.add_output<decl::Float>("Range");
+ b.add_output<decl::Float>("Standard Deviation");
+ b.add_output<decl::Float>("Variance");
+
+ b.add_output<decl::Vector>("Mean", "Mean_001");
+ b.add_output<decl::Vector>("Median", "Median_001");
+ b.add_output<decl::Vector>("Sum", "Sum_001");
+ b.add_output<decl::Vector>("Min", "Min_001");
+ b.add_output<decl::Vector>("Max", "Max_001");
+ b.add_output<decl::Vector>("Range", "Range_001");
+ b.add_output<decl::Vector>("Standard Deviation", "Standard Deviation_001");
+ b.add_output<decl::Vector>("Variance", "Variance_001");
+}
+
+static void geo_node_attribute_statistic_layout(uiLayout *layout,
+ bContext *UNUSED(C),
+ PointerRNA *ptr)
+{
+ uiItemR(layout, ptr, "data_type", 0, "", ICON_NONE);
+ uiItemR(layout, ptr, "domain", 0, "", ICON_NONE);
+}
+
+static void geo_node_attribute_statistic_init(bNodeTree *UNUSED(tree), bNode *node)
+{
+ node->custom1 = CD_PROP_FLOAT;
+ node->custom2 = ATTR_DOMAIN_POINT;
+}
+
+static void geo_node_attribute_statistic_update(bNodeTree *UNUSED(ntree), bNode *node)
+{
+ bNodeSocket *socket_geo = (bNodeSocket *)node->inputs.first;
+ bNodeSocket *socket_float_attr = socket_geo->next;
+ bNodeSocket *socket_float3_attr = socket_float_attr->next;
+
+ bNodeSocket *socket_float_mean = (bNodeSocket *)node->outputs.first;
+ bNodeSocket *socket_float_median = socket_float_mean->next;
+ bNodeSocket *socket_float_sum = socket_float_median->next;
+ bNodeSocket *socket_float_min = socket_float_sum->next;
+ bNodeSocket *socket_float_max = socket_float_min->next;
+ bNodeSocket *socket_float_range = socket_float_max->next;
+ bNodeSocket *socket_float_std = socket_float_range->next;
+ bNodeSocket *socket_float_variance = socket_float_std->next;
+
+ bNodeSocket *socket_vector_mean = socket_float_variance->next;
+ bNodeSocket *socket_vector_median = socket_vector_mean->next;
+ bNodeSocket *socket_vector_sum = socket_vector_median->next;
+ bNodeSocket *socket_vector_min = socket_vector_sum->next;
+ bNodeSocket *socket_vector_max = socket_vector_min->next;
+ bNodeSocket *socket_vector_range = socket_vector_max->next;
+ bNodeSocket *socket_vector_std = socket_vector_range->next;
+ bNodeSocket *socket_vector_variance = socket_vector_std->next;
+
+ const CustomDataType data_type = static_cast<CustomDataType>(node->custom1);
+
+ nodeSetSocketAvailability(socket_float_attr, data_type == CD_PROP_FLOAT);
+ nodeSetSocketAvailability(socket_float_mean, data_type == CD_PROP_FLOAT);
+ nodeSetSocketAvailability(socket_float_median, data_type == CD_PROP_FLOAT);
+ nodeSetSocketAvailability(socket_float_sum, data_type == CD_PROP_FLOAT);
+ nodeSetSocketAvailability(socket_float_min, data_type == CD_PROP_FLOAT);
+ nodeSetSocketAvailability(socket_float_max, data_type == CD_PROP_FLOAT);
+ nodeSetSocketAvailability(socket_float_range, data_type == CD_PROP_FLOAT);
+ nodeSetSocketAvailability(socket_float_std, data_type == CD_PROP_FLOAT);
+ nodeSetSocketAvailability(socket_float_variance, data_type == CD_PROP_FLOAT);
+
+ nodeSetSocketAvailability(socket_float3_attr, data_type == CD_PROP_FLOAT3);
+ nodeSetSocketAvailability(socket_vector_mean, data_type == CD_PROP_FLOAT3);
+ nodeSetSocketAvailability(socket_vector_median, data_type == CD_PROP_FLOAT3);
+ nodeSetSocketAvailability(socket_vector_sum, data_type == CD_PROP_FLOAT3);
+ nodeSetSocketAvailability(socket_vector_min, data_type == CD_PROP_FLOAT3);
+ nodeSetSocketAvailability(socket_vector_max, data_type == CD_PROP_FLOAT3);
+ nodeSetSocketAvailability(socket_vector_range, data_type == CD_PROP_FLOAT3);
+ nodeSetSocketAvailability(socket_vector_std, data_type == CD_PROP_FLOAT3);
+ nodeSetSocketAvailability(socket_vector_variance, data_type == CD_PROP_FLOAT3);
+}
+
+template<typename T> static T compute_sum(const Span<T> data)
+{
+ return std::accumulate(data.begin(), data.end(), T());
+}
+
+static float compute_variance(const Span<float> data, const float mean)
+{
+ if (data.size() <= 1) {
+ return 0.0f;
+ }
+
+ float sum_of_squared_differences = std::accumulate(
+ data.begin(), data.end(), 0.0f, [mean](float accumulator, float value) {
+ float difference = mean - value;
+ return accumulator + difference * difference;
+ });
+
+ return sum_of_squared_differences / (data.size() - 1);
+}
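+
+/* Illustrative check (hypothetical values, not part of the node): for the span
+ * {1, 2, 3} the mean is 2, the squared differences sum to 1 + 0 + 1 = 2, and
+ * dividing by (n - 1) = 2 gives a sample variance of 1.0f. The (n - 1) divisor
+ * is Bessel's correction for estimating variance from a sample. */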
+
+static float median_of_sorted_span(const Span<float> data)
+{
+ if (data.is_empty()) {
+ return 0.0f;
+ }
+
+ const float median = data[data.size() / 2];
+
+ /* For spans of even length, the median is the average of the middle two elements. */
+ if (data.size() % 2 == 0) {
+ return (median + data[data.size() / 2 - 1]) * 0.5f;
+ }
+ return median;
+}
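+
+/* Illustrative check (hypothetical values): for the sorted span {1, 2, 3, 4}
+ * the length is even, so the result is (data[2] + data[1]) * 0.5f = 2.5f; for
+ * {1, 2, 3} the median is simply data[1] = 2.0f. */
+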
+static void set_empty(CustomDataType data_type, GeoNodeExecParams &params)
+{
+ if (data_type == CD_PROP_FLOAT) {
+ params.set_output("Mean", 0.0f);
+ params.set_output("Median", 0.0f);
+ params.set_output("Sum", 0.0f);
+ params.set_output("Min", 0.0f);
+ params.set_output("Max", 0.0f);
+ params.set_output("Range", 0.0f);
+ params.set_output("Standard Deviation", 0.0f);
+ params.set_output("Variance", 0.0f);
+ }
+ else if (data_type == CD_PROP_FLOAT3) {
+ params.set_output("Mean_001", float3{0.0f, 0.0f, 0.0f});
+ params.set_output("Median_001", float3{0.0f, 0.0f, 0.0f});
+ params.set_output("Sum_001", float3{0.0f, 0.0f, 0.0f});
+ params.set_output("Min_001", float3{0.0f, 0.0f, 0.0f});
+ params.set_output("Max_001", float3{0.0f, 0.0f, 0.0f});
+ params.set_output("Range_001", float3{0.0f, 0.0f, 0.0f});
+ params.set_output("Standard Deviation_001", float3{0.0f, 0.0f, 0.0f});
+ params.set_output("Variance_001", float3{0.0f, 0.0f, 0.0f});
+ }
+}
+
+static void geo_node_attribute_statistic_exec(GeoNodeExecParams params)
+{
+ GeometrySet geometry_set = params.get_input<GeometrySet>("Geometry");
+
+ const bNode &node = params.node();
+ const CustomDataType data_type = static_cast<CustomDataType>(node.custom1);
+ const AttributeDomain domain = static_cast<AttributeDomain>(node.custom2);
+
+ int64_t total_size = 0;
+ Vector<const GeometryComponent *> components = geometry_set.get_components_for_read();
+
+ for (const GeometryComponent *component : components) {
+ if (component->attribute_domain_supported(domain)) {
+ total_size += component->attribute_domain_size(domain);
+ }
+ }
+ if (total_size == 0) {
+ set_empty(data_type, params);
+ return;
+ }
+
+ switch (data_type) {
+ case CD_PROP_FLOAT: {
+ const Field<float> input_field = params.get_input<Field<float>>("Attribute");
+ Array<float> data = Array<float>(total_size);
+ int offset = 0;
+ for (const GeometryComponent *component : components) {
+ if (component->attribute_domain_supported(domain)) {
+ GeometryComponentFieldContext field_context{*component, domain};
+ const int domain_size = component->attribute_domain_size(domain);
+ fn::FieldEvaluator data_evaluator{field_context, domain_size};
+ MutableSpan<float> component_result = data.as_mutable_span().slice(offset, domain_size);
+ data_evaluator.add_with_destination(input_field, component_result);
+ data_evaluator.evaluate();
+ offset += domain_size;
+ }
+ }
+
+ float mean = 0.0f;
+ float median = 0.0f;
+ float sum = 0.0f;
+ float min = 0.0f;
+ float max = 0.0f;
+ float range = 0.0f;
+ float standard_deviation = 0.0f;
+ float variance = 0.0f;
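+
+ /* Only compute the aggregates whose outputs are actually connected; the flags
+ * below let unused statistics be skipped entirely. */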
+ const bool sort_required = params.output_is_required("Min") ||
+ params.output_is_required("Max") ||
+ params.output_is_required("Range") ||
+ params.output_is_required("Median");
+ const bool sum_required = params.output_is_required("Sum") ||
+ params.output_is_required("Mean");
+ const bool variance_required = params.output_is_required("Standard Deviation") ||
+ params.output_is_required("Variance");
+
+ if (total_size != 0) {
+ if (sort_required) {
+ std::sort(data.begin(), data.end());
+ median = median_of_sorted_span(data);
+
+ min = data.first();
+ max = data.last();
+ range = max - min;
+ }
+ if (sum_required || variance_required) {
+ sum = compute_sum<float>(data);
+ mean = sum / total_size;
+
+ if (variance_required) {
+ variance = compute_variance(data, mean);
+ standard_deviation = std::sqrt(variance);
+ }
+ }
+ }
+
+ if (sum_required) {
+ params.set_output("Sum", sum);
+ params.set_output("Mean", mean);
+ }
+ if (sort_required) {
+ params.set_output("Min", min);
+ params.set_output("Max", max);
+ params.set_output("Range", range);
+ params.set_output("Median", median);
+ }
+ if (variance_required) {
+ params.set_output("Standard Deviation", standard_deviation);
+ params.set_output("Variance", variance);
+ }
+ break;
+ }
+ case CD_PROP_FLOAT3: {
+ const Field<float3> input_field = params.get_input<Field<float3>>("Attribute_001");
+
+ Array<float3> data = Array<float3>(total_size);
+ int offset = 0;
+ for (const GeometryComponent *component : components) {
+ if (component->attribute_domain_supported(domain)) {
+ GeometryComponentFieldContext field_context{*component, domain};
+ const int domain_size = component->attribute_domain_size(domain);
+ fn::FieldEvaluator data_evaluator{field_context, domain_size};
+ MutableSpan<float3> component_result = data.as_mutable_span().slice(offset, domain_size);
+ data_evaluator.add_with_destination(input_field, component_result);
+ data_evaluator.evaluate();
+ offset += domain_size;
+ }
+ }
+
+ float3 median{0};
+ float3 min{0};
+ float3 max{0};
+ float3 range{0};
+ float3 sum{0};
+ float3 mean{0};
+ float3 variance{0};
+ float3 standard_deviation{0};
+ const bool sort_required = params.output_is_required("Min_001") ||
+ params.output_is_required("Max_001") ||
+ params.output_is_required("Range_001") ||
+ params.output_is_required("Median_001");
+ const bool sum_required = params.output_is_required("Sum_001") ||
+ params.output_is_required("Mean_001");
+ const bool variance_required = params.output_is_required("Standard Deviation_001") ||
+ params.output_is_required("Variance_001");
+
+ Array<float> data_x;
+ Array<float> data_y;
+ Array<float> data_z;
+ if (sort_required || variance_required) {
+ data_x.reinitialize(total_size);
+ data_y.reinitialize(total_size);
+ data_z.reinitialize(total_size);
+ for (const int i : data.index_range()) {
+ data_x[i] = data[i].x;
+ data_y[i] = data[i].y;
+ data_z[i] = data[i].z;
+ }
+ }
+
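+ /* Note: these vector statistics are computed per component, so e.g. the median
+ * combines the independent X, Y, and Z medians and need not equal any single
+ * input vector. */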
+ if (total_size != 0) {
+ if (sort_required) {
+ std::sort(data_x.begin(), data_x.end());
+ std::sort(data_y.begin(), data_y.end());
+ std::sort(data_z.begin(), data_z.end());
+
+ const float x_median = median_of_sorted_span(data_x);
+ const float y_median = median_of_sorted_span(data_y);
+ const float z_median = median_of_sorted_span(data_z);
+ median = float3(x_median, y_median, z_median);
+
+ min = float3(data_x.first(), data_y.first(), data_z.first());
+ max = float3(data_x.last(), data_y.last(), data_z.last());
+ range = max - min;
+ }
+ if (sum_required || variance_required) {
+ sum = compute_sum(data.as_span());
+ mean = sum / total_size;
+
+ if (variance_required) {
+ const float x_variance = compute_variance(data_x, mean.x);
+ const float y_variance = compute_variance(data_y, mean.y);
+ const float z_variance = compute_variance(data_z, mean.z);
+ variance = float3(x_variance, y_variance, z_variance);
+ standard_deviation = float3(
+ std::sqrt(variance.x), std::sqrt(variance.y), std::sqrt(variance.z));
+ }
+ }
+ }
+
+ if (sum_required) {
+ params.set_output("Sum_001", sum);
+ params.set_output("Mean_001", mean);
+ }
+ if (sort_required) {
+ params.set_output("Min_001", min);
+ params.set_output("Max_001", max);
+ params.set_output("Range_001", range);
+ params.set_output("Median_001", median);
+ }
+ if (variance_required) {
+ params.set_output("Standard Deviation_001", standard_deviation);
+ params.set_output("Variance_001", variance);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+} // namespace blender::nodes
+
+void register_node_type_geo_attribute_statistic()
+{
+ static bNodeType ntype;
+
+ geo_node_type_base(
+ &ntype, GEO_NODE_ATTRIBUTE_STATISTIC, "Attribute Statistic", NODE_CLASS_ATTRIBUTE, 0);
+
+ ntype.declare = blender::nodes::geo_node_attribute_statistic_declare;
+ node_type_init(&ntype, blender::nodes::geo_node_attribute_statistic_init);
+ node_type_update(&ntype, blender::nodes::geo_node_attribute_statistic_update);
+ ntype.geometry_node_execute = blender::nodes::geo_node_attribute_statistic_exec;
+ ntype.draw_buttons = blender::nodes::geo_node_attribute_statistic_layout;
+ nodeRegisterType(&ntype);
+}
diff --git a/source/blender/nodes/geometry/nodes/node_geo_curve_fill.cc b/source/blender/nodes/geometry/nodes/node_geo_curve_fill.cc
index d8f40b0a0df..8de2975f9b0 100644
--- a/source/blender/nodes/geometry/nodes/node_geo_curve_fill.cc
+++ b/source/blender/nodes/geometry/nodes/node_geo_curve_fill.cc
@@ -124,37 +124,55 @@ static Mesh *cdt_to_mesh(const blender::meshintersect::CDT_result<double> &resul
return mesh;
}
-static Mesh *curve_fill_calculate(GeoNodeExecParams &params, const CurveComponent &component)
+static void curve_fill_calculate(GeometrySet &geometry_set, const GeometryNodeCurveFillMode mode)
{
- const CurveEval &curve = *component.get_for_read();
- if (curve.splines().size() == 0) {
- return nullptr;
+ if (!geometry_set.has_curve()) {
+ return;
}
- const NodeGeometryCurveFill &storage = *(const NodeGeometryCurveFill *)params.node().storage;
- const GeometryNodeCurveFillMode mode = (GeometryNodeCurveFillMode)storage.mode;
+ const CurveEval &curve = *geometry_set.get_curve_for_read();
+ if (curve.splines().is_empty()) {
+ geometry_set.replace_curve(nullptr);
+ return;
+ }
const CDT_output_type output_type = (mode == GEO_NODE_CURVE_FILL_MODE_NGONS) ?
CDT_CONSTRAINTS_VALID_BMESH_WITH_HOLES :
CDT_INSIDE_WITH_HOLES;
const blender::meshintersect::CDT_result<double> results = do_cdt(curve, output_type);
- return cdt_to_mesh(results);
+ Mesh *mesh = cdt_to_mesh(results);
+
+ geometry_set.replace_mesh(mesh);
+ geometry_set.replace_curve(nullptr);
}
static void geo_node_curve_fill_exec(GeoNodeExecParams params)
{
GeometrySet geometry_set = params.extract_input<GeometrySet>("Curve");
- geometry_set = bke::geometry_set_realize_instances(geometry_set);
- if (!geometry_set.has_curve()) {
- params.set_output("Mesh", GeometrySet());
+ const NodeGeometryCurveFill &storage = *(const NodeGeometryCurveFill *)params.node().storage;
+ const GeometryNodeCurveFillMode mode = (GeometryNodeCurveFillMode)storage.mode;
+
+ if (geometry_set.has_instances()) {
+ InstancesComponent &instances = geometry_set.get_component_for_write<InstancesComponent>();
+ instances.ensure_geometry_instances();
+
+ threading::parallel_for(IndexRange(instances.references_amount()), 16, [&](IndexRange range) {
+ for (int i : range) {
+ GeometrySet &geometry_set = instances.geometry_set_from_reference(i);
+ geometry_set = bke::geometry_set_realize_instances(geometry_set);
+ curve_fill_calculate(geometry_set, mode);
+ }
+ });
+
+ params.set_output("Mesh", std::move(geometry_set));
return;
}
- Mesh *mesh = curve_fill_calculate(params,
- *geometry_set.get_component_for_read<CurveComponent>());
- params.set_output("Mesh", GeometrySet::create_with_mesh(mesh));
+ curve_fill_calculate(geometry_set, mode);
+
+ params.set_output("Mesh", std::move(geometry_set));
}
} // namespace blender::nodes
diff --git a/source/blender/nodes/geometry/nodes/node_geo_curve_parameter.cc b/source/blender/nodes/geometry/nodes/node_geo_curve_parameter.cc
new file mode 100644
index 00000000000..2cde198e679
--- /dev/null
+++ b/source/blender/nodes/geometry/nodes/node_geo_curve_parameter.cc
@@ -0,0 +1,206 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "BLI_task.hh"
+
+#include "BKE_spline.hh"
+
+#include "node_geometry_util.hh"
+
+namespace blender::nodes {
+
+static void geo_node_curve_parameter_declare(NodeDeclarationBuilder &b)
+{
+ b.add_output<decl::Float>("Factor");
+}
+
+/**
+ * A basic interpolation from the point domain to the spline domain would be useless, since the
+ * average parameter for each spline would just be 0.5, or close to it. Instead, the parameter for
+ * each spline is the portion of the total length at the start of the spline.
+ */
+static Array<float> curve_parameter_spline_domain(const CurveEval &curve, const IndexMask mask)
+{
+ Span<SplinePtr> splines = curve.splines();
+ float length = 0.0f;
+ Array<float> parameters(splines.size());
+ for (const int i : splines.index_range()) {
+ parameters[i] = length;
+ length += splines[i]->length();
+ }
+ const float total_length_inverse = length == 0.0f ? 0.0f : 1.0f / length;
+ mask.foreach_index([&](const int64_t i) { parameters[i] *= total_length_inverse; });
+
+ return parameters;
+}
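+
+/* Worked example (hypothetical curve): for two splines of lengths 2 and 3, the
+ * accumulated start lengths are {0, 2} and the total is 5, so with all splines
+ * selected the spline parameters are {0.0f, 0.4f}. */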
+
+/**
+ * The parameter at each control point is the factor at the corresponding evaluated point.
+ */
+static void calculate_bezier_parameters(const BezierSpline &spline, MutableSpan<float> parameters)
+{
+ Span<int> offsets = spline.control_point_offsets();
+ Span<float> lengths = spline.evaluated_lengths();
+ const float total_length = spline.length();
+ const float total_length_inverse = total_length == 0.0f ? 0.0f : 1.0f / total_length;
+
+ for (const int i : IndexRange(1, spline.size() - 1)) {
+ parameters[i] = lengths[offsets[i] - 1] * total_length_inverse;
+ }
+}
+
+/**
+ * The parameters for poly splines are simply the evaluated lengths divided by the total length.
+ */
+static void calculate_poly_parameters(const PolySpline &spline, MutableSpan<float> parameters)
+{
+ Span<float> lengths = spline.evaluated_lengths();
+ const float total_length = spline.length();
+ const float total_length_inverse = total_length == 0.0f ? 0.0f : 1.0f / total_length;
+
+ for (const int i : IndexRange(1, spline.size() - 1)) {
+ parameters[i] = lengths[i - 1] * total_length_inverse;
+ }
+}
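+
+/* Worked example (hypothetical spline): with accumulated evaluated lengths
+ * {1, 3, 4} and a total length of 4, the parameters become
+ * {0.0f, 0.25f, 0.75f, 1.0f} (the first entry is zeroed by the caller). */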
+
+/**
+ * Since NURBS control points do not necessarily coincide with the evaluated curve's path, and
+ * each control point doesn't correspond well to a specific evaluated point, the parameter at
+ * each point is not well defined. So instead, treat the control points as if they were a poly
+ * spline.
+ */
+static void calculate_nurbs_parameters(const NURBSpline &spline, MutableSpan<float> parameters)
+{
+ Span<float3> positions = spline.positions();
+
+ float length = 0.0f;
+ for (const int i : IndexRange(positions.size() - 1)) {
+ parameters[i] = length;
+ length += float3::distance(positions[i], positions[i + 1]);
+ }
+ /* The loop above never writes the final element, so assign it before
+ * normalizing to avoid reading uninitialized memory. */
+ parameters.last() = length;
+
+ const float total_length_inverse = length == 0.0f ? 0.0f : 1.0f / length;
+ for (float &parameter : parameters) {
+ parameter *= total_length_inverse;
+ }
+}
+
+static Array<float> curve_parameter_point_domain(const CurveEval &curve)
+{
+ Span<SplinePtr> splines = curve.splines();
+ Array<int> offsets = curve.control_point_offsets();
+ const int total_size = offsets.last();
+ Array<float> parameters(total_size);
+
+ threading::parallel_for(splines.index_range(), 128, [&](IndexRange range) {
+ for (const int i : range) {
+ const Spline &spline = *splines[i];
+ MutableSpan spline_factors{parameters.as_mutable_span().slice(offsets[i], spline.size())};
+ spline_factors.first() = 0.0f;
+ switch (splines[i]->type()) {
+ case Spline::Type::Bezier: {
+ calculate_bezier_parameters(static_cast<const BezierSpline &>(spline), spline_factors);
+ break;
+ }
+ case Spline::Type::Poly: {
+ calculate_poly_parameters(static_cast<const PolySpline &>(spline), spline_factors);
+ break;
+ }
+ case Spline::Type::NURBS: {
+ calculate_nurbs_parameters(static_cast<const NURBSpline &>(spline), spline_factors);
+ break;
+ }
+ }
+ }
+ });
+ return parameters;
+}
+
+static const GVArray *construct_curve_parameter_gvarray(const CurveEval &curve,
+ const IndexMask mask,
+ const AttributeDomain domain,
+ ResourceScope &scope)
+{
+ if (domain == ATTR_DOMAIN_POINT) {
+ Array<float> parameters = curve_parameter_point_domain(curve);
+ return &scope.construct<fn::GVArray_For_ArrayContainer<Array<float>>>(std::move(parameters));
+ }
+
+ if (domain == ATTR_DOMAIN_CURVE) {
+ Array<float> parameters = curve_parameter_spline_domain(curve, mask);
+ return &scope.construct<fn::GVArray_For_ArrayContainer<Array<float>>>(std::move(parameters));
+ }
+
+ return nullptr;
+}
+
+class CurveParameterFieldInput final : public fn::FieldInput {
+ public:
+ CurveParameterFieldInput() : fn::FieldInput(CPPType::get<float>(), "Curve Parameter")
+ {
+ }
+
+ const GVArray *get_varray_for_context(const fn::FieldContext &context,
+ IndexMask mask,
+ ResourceScope &scope) const final
+ {
+ if (const GeometryComponentFieldContext *geometry_context =
+ dynamic_cast<const GeometryComponentFieldContext *>(&context)) {
+
+ const GeometryComponent &component = geometry_context->geometry_component();
+ const AttributeDomain domain = geometry_context->domain();
+
+ if (component.type() == GEO_COMPONENT_TYPE_CURVE) {
+ const CurveComponent &curve_component = static_cast<const CurveComponent &>(component);
+ const CurveEval *curve = curve_component.get_for_read();
+ if (curve) {
+ return construct_curve_parameter_gvarray(*curve, mask, domain, scope);
+ }
+ }
+ }
+ return nullptr;
+ }
+
+ uint64_t hash() const override
+ {
+ /* Some random constant hash. */
+ return 29837456298;
+ }
+
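+ /* Any two instances of this field input are interchangeable, so they compare
+ * equal and the field evaluator can deduplicate them. */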
+ bool is_equal_to(const fn::FieldNode &other) const override
+ {
+ return dynamic_cast<const CurveParameterFieldInput *>(&other) != nullptr;
+ }
+};
+
+static void geo_node_curve_parameter_exec(GeoNodeExecParams params)
+{
+ Field<float> parameter_field{std::make_shared<CurveParameterFieldInput>()};
+ params.set_output("Factor", std::move(parameter_field));
+}
+
+} // namespace blender::nodes
+
+void register_node_type_geo_curve_parameter()
+{
+ static bNodeType ntype;
+
+ geo_node_type_base(&ntype, GEO_NODE_CURVE_PARAMETER, "Curve Parameter", NODE_CLASS_INPUT, 0);
+ ntype.geometry_node_execute = blender::nodes::geo_node_curve_parameter_exec;
+ ntype.declare = blender::nodes::geo_node_curve_parameter_declare;
+ nodeRegisterType(&ntype);
+}
diff --git a/source/blender/nodes/geometry/nodes/node_geo_curve_sample.cc b/source/blender/nodes/geometry/nodes/node_geo_curve_sample.cc
new file mode 100644
index 00000000000..ac0cd510ffa
--- /dev/null
+++ b/source/blender/nodes/geometry/nodes/node_geo_curve_sample.cc
@@ -0,0 +1,288 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "BLI_task.hh"
+
+#include "BKE_spline.hh"
+
+#include "UI_interface.h"
+#include "UI_resources.h"
+
+#include "node_geometry_util.hh"
+
+namespace blender::nodes {
+
+static void geo_node_curve_sample_declare(NodeDeclarationBuilder &b)
+{
+ b.add_input<decl::Geometry>("Curve");
+ b.add_input<decl::Float>("Factor").min(0.0f).max(1.0f).subtype(PROP_FACTOR);
+ b.add_input<decl::Float>("Length").min(0.0f).subtype(PROP_DISTANCE);
+
+ b.add_output<decl::Vector>("Position");
+ b.add_output<decl::Vector>("Tangent");
+ b.add_output<decl::Vector>("Normal");
+}
+
+static void geo_node_curve_sample_layout(uiLayout *layout, bContext *UNUSED(C), PointerRNA *ptr)
+{
+ uiItemR(layout, ptr, "mode", UI_ITEM_R_EXPAND, nullptr, ICON_NONE);
+}
+
+static void geo_node_curve_sample_type_init(bNodeTree *UNUSED(tree), bNode *node)
+{
+ NodeGeometryCurveSample *data = (NodeGeometryCurveSample *)MEM_callocN(
+ sizeof(NodeGeometryCurveSample), __func__);
+ data->mode = GEO_NODE_CURVE_SAMPLE_LENGTH;
+ node->storage = data;
+}
+
+static void geo_node_curve_sample_update(bNodeTree *UNUSED(ntree), bNode *node)
+{
+ const NodeGeometryCurveSample &node_storage = *(NodeGeometryCurveSample *)node->storage;
+ const GeometryNodeCurveSampleMode mode = (GeometryNodeCurveSampleMode)node_storage.mode;
+
+ bNodeSocket *factor = ((bNodeSocket *)node->inputs.first)->next;
+ bNodeSocket *length = factor->next;
+
+ nodeSetSocketAvailability(factor, mode == GEO_NODE_CURVE_SAMPLE_FACTOR);
+ nodeSetSocketAvailability(length, mode == GEO_NODE_CURVE_SAMPLE_LENGTH);
+}
+
+template<typename T> static T sample_with_lookup(const Spline::LookupResult lookup, Span<T> data)
+{
+ return attribute_math::mix2(
+ lookup.factor, data[lookup.evaluated_index], data[lookup.next_evaluated_index]);
+}
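+
+/* Illustrative check (hypothetical values): a lookup with factor 0.25 between
+ * evaluated values 1.0f and 3.0f mixes to 0.75f * 1.0f + 0.25f * 3.0f = 1.5f. */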
+
+class SampleCurveFunction : public fn::MultiFunction {
+ private:
+ /**
+ * The function holds a geometry set instead of a curve or a curve component in order to
+ * maintain a reference to the geometry while the field tree is being built, so that the
+ * curve is not freed before the function can execute.
+ */
+ GeometrySet geometry_set_;
+ /**
+ * To support factor inputs, the node adds another field operation before this one to multiply by
+ * the curve's total length. Since that must calculate the spline lengths anyway, store them to
+ * reuse the calculation.
+ */
+ Array<float> spline_lengths_;
+ /** The last element of #spline_lengths_ (the curve's total length), extracted for convenience. */
+ const float total_length_;
+
+ public:
+ SampleCurveFunction(GeometrySet geometry_set, Array<float> spline_lengths)
+ : geometry_set_(std::move(geometry_set)),
+ spline_lengths_(std::move(spline_lengths)),
+ total_length_(spline_lengths_.last())
+ {
+ static fn::MFSignature signature = create_signature();
+ this->set_signature(&signature);
+ }
+
+ static fn::MFSignature create_signature()
+ {
+ blender::fn::MFSignatureBuilder signature{"Curve Sample"};
+ signature.single_input<float>("Length");
+ signature.single_output<float3>("Position");
+ signature.single_output<float3>("Tangent");
+ signature.single_output<float3>("Normal");
+ return signature.build();
+ }
+
+ void call(IndexMask mask, fn::MFParams params, fn::MFContext UNUSED(context)) const override
+ {
+ MutableSpan<float3> sampled_positions = params.uninitialized_single_output_if_required<float3>(
+ 1, "Position");
+ MutableSpan<float3> sampled_tangents = params.uninitialized_single_output_if_required<float3>(
+ 2, "Tangent");
+ MutableSpan<float3> sampled_normals = params.uninitialized_single_output_if_required<float3>(
+ 3, "Normal");
+
+ auto return_default = [&]() {
+ if (!sampled_positions.is_empty()) {
+ sampled_positions.fill_indices(mask, {0, 0, 0});
+ }
+ if (!sampled_tangents.is_empty()) {
+ sampled_tangents.fill_indices(mask, {0, 0, 0});
+ }
+ if (!sampled_normals.is_empty()) {
+ sampled_normals.fill_indices(mask, {0, 0, 0});
+ }
+ };
+
+ if (!geometry_set_.has_curve()) {
+ return return_default();
+ }
+
+ const CurveComponent *curve_component = geometry_set_.get_component_for_read<CurveComponent>();
+ const CurveEval *curve = curve_component->get_for_read();
+ Span<SplinePtr> splines = curve->splines();
+ if (splines.is_empty()) {
+ return return_default();
+ }
+
+ const VArray<float> &lengths_varray = params.readonly_single_input<float>(0, "Length");
+ const VArray_Span lengths{lengths_varray};
+#ifdef DEBUG
+ for (const float length : lengths) {
+ /* Lengths must be within the range [0, total length] of the curve. This is
+ * ensured in #get_length_input_field, which adds another multi-function
+ * before this one to clamp the lengths. */
+ BLI_assert(length >= 0.0f && length <= total_length_);
+ }
+#endif
+
+ Array<int> spline_indices(mask.min_array_size());
+ for (const int i : mask) {
+ const float *offset = std::lower_bound(
+ spline_lengths_.begin(), spline_lengths_.end(), lengths[i]);
+ const int index = offset - spline_lengths_.data() - 1;
+ spline_indices[i] = std::max(index, 0);
+ }
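+
+ /* Example (hypothetical lengths): with accumulated lengths {0, 2, 5}, a sample
+ * length of 3 makes std::lower_bound() point at 5 (index 2), so spline 1 is
+ * chosen; a length of exactly 0 yields -1, which is clamped back to spline 0. */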
+
+ /* Storing lookups in an array is unnecessary but will simplify custom attribute transfer. */
+ Array<Spline::LookupResult> lookups(mask.min_array_size());
+ for (const int i : mask) {
+ const float length_in_spline = lengths[i] - spline_lengths_[spline_indices[i]];
+ lookups[i] = splines[spline_indices[i]]->lookup_evaluated_length(length_in_spline);
+ }
+
+ if (!sampled_positions.is_empty()) {
+ for (const int i : mask) {
+ const Spline::LookupResult &lookup = lookups[i];
+ const Span<float3> evaluated_positions = splines[spline_indices[i]]->evaluated_positions();
+ sampled_positions[i] = sample_with_lookup(lookup, evaluated_positions);
+ }
+ }
+
+ if (!sampled_tangents.is_empty()) {
+ for (const int i : mask) {
+ const Spline::LookupResult &lookup = lookups[i];
+ const Span<float3> evaluated_tangents = splines[spline_indices[i]]->evaluated_tangents();
+ sampled_tangents[i] = sample_with_lookup(lookup, evaluated_tangents).normalized();
+ }
+ }
+
+ if (!sampled_normals.is_empty()) {
+ for (const int i : mask) {
+ const Spline::LookupResult &lookup = lookups[i];
+ const Span<float3> evaluated_normals = splines[spline_indices[i]]->evaluated_normals();
+ sampled_normals[i] = sample_with_lookup(lookup, evaluated_normals).normalized();
+ }
+ }
+ }
+};
+
+/**
+ * Pre-process the lengths or factors used for the sampling, turning factors into lengths, and
+ * clamping between zero and the total length of the curve. Do this as a separate operation in the
+ * field tree to make the sampling simpler, and to let the evaluator optimize better.
+ *
+ * \todo Use a mutable single input instead when they are supported.
+ */
+static Field<float> get_length_input_field(const GeoNodeExecParams &params,
+ const float curve_total_length)
+{
+ const NodeGeometryCurveSample &node_storage = *(NodeGeometryCurveSample *)params.node().storage;
+ const GeometryNodeCurveSampleMode mode = (GeometryNodeCurveSampleMode)node_storage.mode;
+
+ if (mode == GEO_NODE_CURVE_SAMPLE_LENGTH) {
+ /* Just make sure the length is within the bounds of the curve's total length. */
+ Field<float> length_field = params.get_input<Field<float>>("Length");
+ auto clamp_fn = std::make_unique<fn::CustomMF_SI_SO<float, float>>(
+ __func__, [curve_total_length](float length) {
+ return std::clamp(length, 0.0f, curve_total_length);
+ });
+ auto clamp_op = std::make_shared<FieldOperation>(
+ FieldOperation(std::move(clamp_fn), {std::move(length_field)}));
+
+ return Field<float>(std::move(clamp_op), 0);
+ }
+
+ /* Convert the factor to a length and clamp it to the bounds of the curve. */
+ Field<float> factor_field = params.get_input<Field<float>>("Factor");
+ auto clamp_fn = std::make_unique<fn::CustomMF_SI_SO<float, float>>(
+ __func__, [curve_total_length](float factor) {
+ const float length = factor * curve_total_length;
+ return std::clamp(length, 0.0f, curve_total_length);
+ });
+ auto process_op = std::make_shared<FieldOperation>(
+ FieldOperation(std::move(clamp_fn), {std::move(factor_field)}));
+
+ return Field<float>(std::move(process_op), 0);
+}
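+
+/* For instance (hypothetical values): on a curve of total length 10.0f, a
+ * Factor input of 0.5f becomes a length of 5.0f, and a Length input of 12.0f is
+ * clamped to 10.0f before it ever reaches SampleCurveFunction. */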
+
+static void geo_node_curve_sample_exec(GeoNodeExecParams params)
+{
+ GeometrySet geometry_set = params.extract_input<GeometrySet>("Curve");
+
+ auto return_default = [&]() {
+ params.set_output("Position", fn::make_constant_field<float3>({0.0f, 0.0f, 0.0f}));
+ params.set_output("Tangent", fn::make_constant_field<float3>({0.0f, 0.0f, 0.0f}));
+ params.set_output("Normal", fn::make_constant_field<float3>({0.0f, 0.0f, 0.0f}));
+ };
+
+ const CurveComponent *component = geometry_set.get_component_for_read<CurveComponent>();
+ if (component == nullptr) {
+ return return_default();
+ }
+
+ const CurveEval *curve = component->get_for_read();
+ if (curve == nullptr) {
+ return return_default();
+ }
+
+ if (curve->splines().is_empty()) {
+ return return_default();
+ }
+
+ Array<float> spline_lengths = curve->accumulated_spline_lengths();
+ const float total_length = spline_lengths.last();
+ if (total_length == 0.0f) {
+ return return_default();
+ }
+
+ Field<float> length_field = get_length_input_field(params, total_length);
+
+ auto sample_fn = std::make_unique<SampleCurveFunction>(std::move(geometry_set),
+ std::move(spline_lengths));
+ auto sample_op = std::make_shared<FieldOperation>(
+ FieldOperation(std::move(sample_fn), {length_field}));
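+
+ /* All three outputs share the single field operation; each output field merely
+ * reads a different output index of SampleCurveFunction. */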
+
+ params.set_output("Position", Field<float3>(sample_op, 0));
+ params.set_output("Tangent", Field<float3>(sample_op, 1));
+ params.set_output("Normal", Field<float3>(sample_op, 2));
+}
+
+} // namespace blender::nodes
+
+void register_node_type_geo_curve_sample()
+{
+ static bNodeType ntype;
+
+ geo_node_type_base(&ntype, GEO_NODE_CURVE_SAMPLE, "Curve Sample", NODE_CLASS_GEOMETRY, 0);
+ ntype.geometry_node_execute = blender::nodes::geo_node_curve_sample_exec;
+ ntype.declare = blender::nodes::geo_node_curve_sample_declare;
+ node_type_init(&ntype, blender::nodes::geo_node_curve_sample_type_init);
+ node_type_update(&ntype, blender::nodes::geo_node_curve_sample_update);
+ node_type_storage(
+ &ntype, "NodeGeometryCurveSample", node_free_standard_storage, node_copy_standard_storage);
+ ntype.draw_buttons = blender::nodes::geo_node_curve_sample_layout;
+
+ nodeRegisterType(&ntype);
+}
diff --git a/source/blender/nodes/geometry/nodes/node_geo_curve_to_mesh.cc b/source/blender/nodes/geometry/nodes/node_geo_curve_to_mesh.cc
index b8bdb3d71d6..89ba635ff4b 100644
--- a/source/blender/nodes/geometry/nodes/node_geo_curve_to_mesh.cc
+++ b/source/blender/nodes/geometry/nodes/node_geo_curve_to_mesh.cc
@@ -14,17 +14,10 @@
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
-#include "BLI_array.hh"
-#include "BLI_float4x4.hh"
-#include "BLI_task.hh"
-
-#include "DNA_mesh_types.h"
-#include "DNA_meshdata_types.h"
-
-#include "BKE_material.h"
-#include "BKE_mesh.h"
#include "BKE_spline.hh"
+#include "BKE_curve_to_mesh.hh"
+
#include "UI_interface.h"
#include "UI_resources.h"
@@ -39,692 +32,6 @@ static void geo_node_curve_to_mesh_declare(NodeDeclarationBuilder &b)
b.add_output<decl::Geometry>("Mesh");
}
-/** Information about the creation of one curve spline and profile spline combination. */
-struct ResultInfo {
- const Spline &spline;
- const Spline &profile;
- int vert_offset;
- int edge_offset;
- int loop_offset;
- int poly_offset;
- int spline_vert_len;
- int spline_edge_len;
- int profile_vert_len;
- int profile_edge_len;
-};
-
-static void vert_extrude_to_mesh_data(const Spline &spline,
- const float3 profile_vert,
- MutableSpan<MVert> r_verts,
- MutableSpan<MEdge> r_edges,
- const int vert_offset,
- const int edge_offset)
-{
- Span<float3> positions = spline.evaluated_positions();
-
- for (const int i : IndexRange(positions.size() - 1)) {
- MEdge &edge = r_edges[edge_offset + i];
- edge.v1 = vert_offset + i;
- edge.v2 = vert_offset + i + 1;
- edge.flag = ME_LOOSEEDGE;
- }
-
- if (spline.is_cyclic() && spline.evaluated_edges_size() > 1) {
- MEdge &edge = r_edges[edge_offset + spline.evaluated_edges_size() - 1];
- edge.v1 = vert_offset;
- edge.v2 = vert_offset + positions.size() - 1;
- edge.flag = ME_LOOSEEDGE;
- }
-
- for (const int i : positions.index_range()) {
- MVert &vert = r_verts[vert_offset + i];
- copy_v3_v3(vert.co, positions[i] + profile_vert);
- }
-}
-
-static void mark_edges_sharp(MutableSpan<MEdge> edges)
-{
- for (MEdge &edge : edges) {
- edge.flag |= ME_SHARP;
- }
-}
-
-static void spline_extrude_to_mesh_data(const ResultInfo &info,
- MutableSpan<MVert> r_verts,
- MutableSpan<MEdge> r_edges,
- MutableSpan<MLoop> r_loops,
- MutableSpan<MPoly> r_polys)
-{
- const Spline &spline = info.spline;
- const Spline &profile = info.profile;
- if (info.profile_vert_len == 1) {
- vert_extrude_to_mesh_data(spline,
- profile.evaluated_positions()[0],
- r_verts,
- r_edges,
- info.vert_offset,
- info.edge_offset);
- return;
- }
-
- /* Add the edges running along the length of the curve, starting at each profile vertex. */
- const int spline_edges_start = info.edge_offset;
- for (const int i_profile : IndexRange(info.profile_vert_len)) {
- const int profile_edge_offset = spline_edges_start + i_profile * info.spline_edge_len;
- for (const int i_ring : IndexRange(info.spline_edge_len)) {
- const int i_next_ring = (i_ring == info.spline_vert_len - 1) ? 0 : i_ring + 1;
-
- const int ring_vert_offset = info.vert_offset + info.profile_vert_len * i_ring;
- const int next_ring_vert_offset = info.vert_offset + info.profile_vert_len * i_next_ring;
-
- MEdge &edge = r_edges[profile_edge_offset + i_ring];
- edge.v1 = ring_vert_offset + i_profile;
- edge.v2 = next_ring_vert_offset + i_profile;
- edge.flag = ME_EDGEDRAW | ME_EDGERENDER;
- }
- }
-
- /* Add the edges running along each profile ring. */
- const int profile_edges_start = spline_edges_start +
- info.profile_vert_len * info.spline_edge_len;
- for (const int i_ring : IndexRange(info.spline_vert_len)) {
- const int ring_vert_offset = info.vert_offset + info.profile_vert_len * i_ring;
-
- const int ring_edge_offset = profile_edges_start + i_ring * info.profile_edge_len;
- for (const int i_profile : IndexRange(info.profile_edge_len)) {
- const int i_next_profile = (i_profile == info.profile_vert_len - 1) ? 0 : i_profile + 1;
-
- MEdge &edge = r_edges[ring_edge_offset + i_profile];
- edge.v1 = ring_vert_offset + i_profile;
- edge.v2 = ring_vert_offset + i_next_profile;
- edge.flag = ME_EDGEDRAW | ME_EDGERENDER;
- }
- }
-
- /* Calculate poly and corner indices. */
- for (const int i_ring : IndexRange(info.spline_edge_len)) {
- const int i_next_ring = (i_ring == info.spline_vert_len - 1) ? 0 : i_ring + 1;
-
- const int ring_vert_offset = info.vert_offset + info.profile_vert_len * i_ring;
- const int next_ring_vert_offset = info.vert_offset + info.profile_vert_len * i_next_ring;
-
- const int ring_edge_start = profile_edges_start + info.profile_edge_len * i_ring;
- const int next_ring_edge_offset = profile_edges_start + info.profile_edge_len * i_next_ring;
-
- const int ring_poly_offset = info.poly_offset + i_ring * info.profile_edge_len;
- const int ring_loop_offset = info.loop_offset + i_ring * info.profile_edge_len * 4;
-
- for (const int i_profile : IndexRange(info.profile_edge_len)) {
- const int ring_segment_loop_offset = ring_loop_offset + i_profile * 4;
- const int i_next_profile = (i_profile == info.profile_vert_len - 1) ? 0 : i_profile + 1;
-
- const int spline_edge_start = spline_edges_start + info.spline_edge_len * i_profile;
- const int next_spline_edge_start = spline_edges_start +
- info.spline_edge_len * i_next_profile;
-
- MPoly &poly = r_polys[ring_poly_offset + i_profile];
- poly.loopstart = ring_segment_loop_offset;
- poly.totloop = 4;
- poly.flag = ME_SMOOTH;
-
- MLoop &loop_a = r_loops[ring_segment_loop_offset];
- loop_a.v = ring_vert_offset + i_profile;
- loop_a.e = ring_edge_start + i_profile;
- MLoop &loop_b = r_loops[ring_segment_loop_offset + 1];
- loop_b.v = ring_vert_offset + i_next_profile;
- loop_b.e = next_spline_edge_start + i_ring;
- MLoop &loop_c = r_loops[ring_segment_loop_offset + 2];
- loop_c.v = next_ring_vert_offset + i_next_profile;
- loop_c.e = next_ring_edge_offset + i_profile;
- MLoop &loop_d = r_loops[ring_segment_loop_offset + 3];
- loop_d.v = next_ring_vert_offset + i_profile;
- loop_d.e = spline_edge_start + i_ring;
- }
- }
-
- /* Calculate the positions of each profile ring profile along the spline. */
- Span<float3> positions = spline.evaluated_positions();
- Span<float3> tangents = spline.evaluated_tangents();
- Span<float3> normals = spline.evaluated_normals();
- Span<float3> profile_positions = profile.evaluated_positions();
-
- GVArray_Typed<float> radii = spline.interpolate_to_evaluated(spline.radii());
- for (const int i_ring : IndexRange(info.spline_vert_len)) {
- float4x4 point_matrix = float4x4::from_normalized_axis_data(
- positions[i_ring], normals[i_ring], tangents[i_ring]);
- point_matrix.apply_scale(radii[i_ring]);
-
- const int ring_vert_start = info.vert_offset + i_ring * info.profile_vert_len;
- for (const int i_profile : IndexRange(info.profile_vert_len)) {
- MVert &vert = r_verts[ring_vert_start + i_profile];
- copy_v3_v3(vert.co, point_matrix * profile_positions[i_profile]);
- }
- }
-
- /* Mark edge loops from sharp vector control points sharp. */
- if (profile.type() == Spline::Type::Bezier) {
- const BezierSpline &bezier_spline = static_cast<const BezierSpline &>(profile);
- Span<int> control_point_offsets = bezier_spline.control_point_offsets();
- for (const int i : IndexRange(bezier_spline.size())) {
- if (bezier_spline.point_is_sharp(i)) {
- mark_edges_sharp(
- r_edges.slice(spline_edges_start + info.spline_edge_len * control_point_offsets[i],
- info.spline_edge_len));
- }
- }
- }
-}
-
-static inline int spline_extrude_vert_size(const Spline &curve, const Spline &profile)
-{
- return curve.evaluated_points_size() * profile.evaluated_points_size();
-}
-
-static inline int spline_extrude_edge_size(const Spline &curve, const Spline &profile)
-{
- /* Add the ring edges, with one ring for every curve vertex, and the edge loops
- * that run along the length of the curve, starting on the first profile. */
- return curve.evaluated_points_size() * profile.evaluated_edges_size() +
- curve.evaluated_edges_size() * profile.evaluated_points_size();
-}
-
-static inline int spline_extrude_loop_size(const Spline &curve, const Spline &profile)
-{
- return curve.evaluated_edges_size() * profile.evaluated_edges_size() * 4;
-}
-
-static inline int spline_extrude_poly_size(const Spline &curve, const Spline &profile)
-{
- return curve.evaluated_edges_size() * profile.evaluated_edges_size();
-}
-
-struct ResultOffsets {
- Array<int> vert;
- Array<int> edge;
- Array<int> loop;
- Array<int> poly;
-};
-static ResultOffsets calculate_result_offsets(Span<SplinePtr> profiles, Span<SplinePtr> curves)
-{
- const int total = profiles.size() * curves.size();
- Array<int> vert(total + 1);
- Array<int> edge(total + 1);
- Array<int> loop(total + 1);
- Array<int> poly(total + 1);
-
- int mesh_index = 0;
- int vert_offset = 0;
- int edge_offset = 0;
- int loop_offset = 0;
- int poly_offset = 0;
- for (const int i_spline : curves.index_range()) {
- for (const int i_profile : profiles.index_range()) {
- vert[mesh_index] = vert_offset;
- edge[mesh_index] = edge_offset;
- loop[mesh_index] = loop_offset;
- poly[mesh_index] = poly_offset;
- vert_offset += spline_extrude_vert_size(*curves[i_spline], *profiles[i_profile]);
- edge_offset += spline_extrude_edge_size(*curves[i_spline], *profiles[i_profile]);
- loop_offset += spline_extrude_loop_size(*curves[i_spline], *profiles[i_profile]);
- poly_offset += spline_extrude_poly_size(*curves[i_spline], *profiles[i_profile]);
- mesh_index++;
- }
- }
- vert.last() = vert_offset;
- edge.last() = edge_offset;
- loop.last() = loop_offset;
- poly.last() = poly_offset;
-
- return {std::move(vert), std::move(edge), std::move(loop), std::move(poly)};
-}
-
-static AttributeDomain get_result_attribute_domain(const MeshComponent &component,
- const AttributeIDRef &attribute_id)
-{
- /* Only use a different domain if it is builtin and must only exist on one domain. */
- if (!component.attribute_is_builtin(attribute_id)) {
- return ATTR_DOMAIN_POINT;
- }
-
- std::optional<AttributeMetaData> meta_data = component.attribute_get_meta_data(attribute_id);
- if (!meta_data) {
- /* This function has to return something in this case, but it shouldn't be used,
- * so return an output that will assert later if the code attempts to handle it. */
- return ATTR_DOMAIN_AUTO;
- }
-
- return meta_data->domain;
-}
-
-/**
- * The data stored in the attribute and its domain from #OutputAttribute, to avoid calling
- * `as_span()` for every single profile and curve spline combination, and for readability.
- */
-struct ResultAttributeData {
- GMutableSpan data;
- AttributeDomain domain;
-};
-
-static std::optional<ResultAttributeData> create_attribute_and_get_span(
- MeshComponent &component,
- const AttributeIDRef &attribute_id,
- AttributeMetaData meta_data,
- Vector<OutputAttribute> &r_attributes)
-{
- const AttributeDomain domain = get_result_attribute_domain(component, attribute_id);
- OutputAttribute attribute = component.attribute_try_get_for_output_only(
- attribute_id, domain, meta_data.data_type);
- if (!attribute) {
- return std::nullopt;
- }
-
- GMutableSpan span = attribute.as_span();
- r_attributes.append(std::move(attribute));
- return std::make_optional<ResultAttributeData>({span, domain});
-}
-
-/**
- * Store the references to the attribute data from the curve and profile inputs. Here we rely on
- * the invariants of the storage of curve attributes, that the order will be consistent between
- * splines, and all splines will have the same attributes.
- */
-struct ResultAttributes {
- /**
- * Result attributes on the mesh corresponding to each attribute on the curve input, in the same
- * order. The data is optional only in case the attribute does not exist on the mesh for some
- * reason, like "shade_smooth" when the result has no faces.
- */
- Vector<std::optional<ResultAttributeData>> curve_point_attributes;
- Vector<std::optional<ResultAttributeData>> curve_spline_attributes;
-
- /**
- * Result attributes corresponding the attributes on the profile input, in the same order. The
- * attributes are optional in case the attribute names correspond to a names used by the curve
- * input, in which case the curve input attributes take precedence.
- */
- Vector<std::optional<ResultAttributeData>> profile_point_attributes;
- Vector<std::optional<ResultAttributeData>> profile_spline_attributes;
-
- /**
- * Because some builtin attributes are not stored contiguously, and the curve inputs might have
- * attributes with those names, it's necessary to keep OutputAttributes around to give access to
- * the result data in a contiguous array.
- */
- Vector<OutputAttribute> attributes;
-};
-static ResultAttributes create_result_attributes(const CurveEval &curve,
- const CurveEval &profile,
- Mesh &mesh)
-{
- MeshComponent mesh_component;
- mesh_component.replace(&mesh, GeometryOwnershipType::Editable);
- Set<AttributeIDRef> curve_attributes;
-
- /* In order to prefer attributes on the main curve input when there are name collisions, first
- * check the attributes on the curve, then add attributes on the profile that are not also on the
- * main curve input. */
- ResultAttributes result;
- curve.splines().first()->attributes.foreach_attribute(
- [&](const AttributeIDRef &id, const AttributeMetaData &meta_data) {
- curve_attributes.add_new(id);
- result.curve_point_attributes.append(
- create_attribute_and_get_span(mesh_component, id, meta_data, result.attributes));
- return true;
- },
- ATTR_DOMAIN_POINT);
- curve.attributes.foreach_attribute(
- [&](const AttributeIDRef &id, const AttributeMetaData &meta_data) {
- curve_attributes.add_new(id);
- result.curve_spline_attributes.append(
- create_attribute_and_get_span(mesh_component, id, meta_data, result.attributes));
- return true;
- },
- ATTR_DOMAIN_CURVE);
- profile.splines().first()->attributes.foreach_attribute(
- [&](const AttributeIDRef &id, const AttributeMetaData &meta_data) {
- if (curve_attributes.contains(id)) {
- result.profile_point_attributes.append({});
- }
- else {
- result.profile_point_attributes.append(
- create_attribute_and_get_span(mesh_component, id, meta_data, result.attributes));
- }
- return true;
- },
- ATTR_DOMAIN_POINT);
- profile.attributes.foreach_attribute(
- [&](const AttributeIDRef &id, const AttributeMetaData &meta_data) {
- if (curve_attributes.contains(id)) {
- result.profile_spline_attributes.append({});
- }
- else {
- result.profile_spline_attributes.append(
- create_attribute_and_get_span(mesh_component, id, meta_data, result.attributes));
- }
- return true;
- },
- ATTR_DOMAIN_CURVE);
-
- return result;
-}
-
-template<typename T>
-static void copy_curve_point_data_to_mesh_verts(const Span<T> src,
- const ResultInfo &info,
- MutableSpan<T> dst)
-{
- for (const int i_ring : IndexRange(info.spline_vert_len)) {
- const int ring_vert_start = info.vert_offset + i_ring * info.profile_vert_len;
- dst.slice(ring_vert_start, info.profile_vert_len).fill(src[i_ring]);
- }
-}
-
-template<typename T>
-static void copy_curve_point_data_to_mesh_edges(const Span<T> src,
- const ResultInfo &info,
- MutableSpan<T> dst)
-{
- const int edges_start = info.edge_offset + info.profile_vert_len * info.spline_edge_len;
- for (const int i_ring : IndexRange(info.spline_vert_len)) {
- const int ring_edge_start = edges_start + info.profile_edge_len * i_ring;
- dst.slice(ring_edge_start, info.profile_edge_len).fill(src[i_ring]);
- }
-}
-
-template<typename T>
-static void copy_curve_point_data_to_mesh_faces(const Span<T> src,
- const ResultInfo &info,
- MutableSpan<T> dst)
-{
- for (const int i_ring : IndexRange(info.spline_edge_len)) {
- const int ring_face_start = info.poly_offset + info.profile_edge_len * i_ring;
- dst.slice(ring_face_start, info.profile_edge_len).fill(src[i_ring]);
- }
-}
-
-static void copy_curve_point_attribute_to_mesh(const GSpan src,
- const ResultInfo &info,
- ResultAttributeData &dst)
-{
- GVArrayPtr interpolated_gvarray = info.spline.interpolate_to_evaluated(src);
- GSpan interpolated = interpolated_gvarray->get_internal_span();
-
- attribute_math::convert_to_static_type(src.type(), [&](auto dummy) {
- using T = decltype(dummy);
- switch (dst.domain) {
- case ATTR_DOMAIN_POINT:
- copy_curve_point_data_to_mesh_verts(interpolated.typed<T>(), info, dst.data.typed<T>());
- break;
- case ATTR_DOMAIN_EDGE:
- copy_curve_point_data_to_mesh_edges(interpolated.typed<T>(), info, dst.data.typed<T>());
- break;
- case ATTR_DOMAIN_FACE:
- copy_curve_point_data_to_mesh_faces(interpolated.typed<T>(), info, dst.data.typed<T>());
- break;
- case ATTR_DOMAIN_CORNER:
- /* Unsupported for now, since there are no builtin attributes to convert into. */
- break;
- default:
- BLI_assert_unreachable();
- break;
- }
- });
-}
-
-template<typename T>
-static void copy_profile_point_data_to_mesh_verts(const Span<T> src,
- const ResultInfo &info,
- MutableSpan<T> dst)
-{
- for (const int i_ring : IndexRange(info.spline_vert_len)) {
- const int profile_vert_start = info.vert_offset + i_ring * info.profile_vert_len;
- for (const int i_profile : IndexRange(info.profile_vert_len)) {
- dst[profile_vert_start + i_profile] = src[i_profile];
- }
- }
-}
-
-template<typename T>
-static void copy_profile_point_data_to_mesh_edges(const Span<T> src,
- const ResultInfo &info,
- MutableSpan<T> dst)
-{
- for (const int i_profile : IndexRange(info.profile_vert_len)) {
- const int profile_edge_offset = info.edge_offset + i_profile * info.spline_edge_len;
- dst.slice(profile_edge_offset, info.spline_edge_len).fill(src[i_profile]);
- }
-}
-
-template<typename T>
-static void copy_profile_point_data_to_mesh_faces(const Span<T> src,
- const ResultInfo &info,
- MutableSpan<T> dst)
-{
- for (const int i_ring : IndexRange(info.spline_edge_len)) {
- const int profile_face_start = info.poly_offset + i_ring * info.profile_edge_len;
- for (const int i_profile : IndexRange(info.profile_edge_len)) {
- dst[profile_face_start + i_profile] = src[i_profile];
- }
- }
-}
-
-static void copy_profile_point_attribute_to_mesh(const GSpan src,
- const ResultInfo &info,
- ResultAttributeData &dst)
-{
- GVArrayPtr interpolated_gvarray = info.profile.interpolate_to_evaluated(src);
- GSpan interpolated = interpolated_gvarray->get_internal_span();
-
- attribute_math::convert_to_static_type(src.type(), [&](auto dummy) {
- using T = decltype(dummy);
- switch (dst.domain) {
- case ATTR_DOMAIN_POINT:
- copy_profile_point_data_to_mesh_verts(interpolated.typed<T>(), info, dst.data.typed<T>());
- break;
- case ATTR_DOMAIN_EDGE:
- copy_profile_point_data_to_mesh_edges(interpolated.typed<T>(), info, dst.data.typed<T>());
- break;
- case ATTR_DOMAIN_FACE:
- copy_profile_point_data_to_mesh_faces(interpolated.typed<T>(), info, dst.data.typed<T>());
- break;
- case ATTR_DOMAIN_CORNER:
- /* Unsupported for now, since there are no builtin attributes to convert into. */
- break;
- default:
- BLI_assert_unreachable();
- break;
- }
- });
-}
-
-static void copy_point_domain_attributes_to_mesh(const ResultInfo &info,
- ResultAttributes &attributes)
-{
- if (!attributes.curve_point_attributes.is_empty()) {
- int i = 0;
- info.spline.attributes.foreach_attribute(
- [&](const AttributeIDRef &id, const AttributeMetaData &UNUSED(meta_data)) {
- if (attributes.curve_point_attributes[i]) {
- copy_curve_point_attribute_to_mesh(*info.spline.attributes.get_for_read(id),
- info,
- *attributes.curve_point_attributes[i]);
- }
- i++;
- return true;
- },
- ATTR_DOMAIN_POINT);
- }
- if (!attributes.profile_point_attributes.is_empty()) {
- int i = 0;
- info.profile.attributes.foreach_attribute(
- [&](const AttributeIDRef &id, const AttributeMetaData &UNUSED(meta_data)) {
- if (attributes.profile_point_attributes[i]) {
- copy_profile_point_attribute_to_mesh(*info.profile.attributes.get_for_read(id),
- info,
- *attributes.profile_point_attributes[i]);
- }
- i++;
- return true;
- },
- ATTR_DOMAIN_POINT);
- }
-}
-
-template<typename T>
-static void copy_spline_data_to_mesh(Span<T> src, Span<int> offsets, MutableSpan<T> dst)
-{
- for (const int i : IndexRange(src.size())) {
- dst.slice(offsets[i], offsets[i + 1] - offsets[i]).fill(src[i]);
- }
-}
-
-/**
- * Since the offsets for each combination of curve and profile spline are stored for every mesh
- * domain, and this just needs to fill the chunks corresponding to each combination, we can use
- * the same function for all mesh domains.
- */
-static void copy_spline_attribute_to_mesh(const GSpan src,
- const ResultOffsets &offsets,
- ResultAttributeData &dst_attribute)
-{
- attribute_math::convert_to_static_type(src.type(), [&](auto dummy) {
- using T = decltype(dummy);
- switch (dst_attribute.domain) {
- case ATTR_DOMAIN_POINT:
- copy_spline_data_to_mesh(src.typed<T>(), offsets.vert, dst_attribute.data.typed<T>());
- break;
- case ATTR_DOMAIN_EDGE:
- copy_spline_data_to_mesh(src.typed<T>(), offsets.edge, dst_attribute.data.typed<T>());
- break;
- case ATTR_DOMAIN_FACE:
- copy_spline_data_to_mesh(src.typed<T>(), offsets.poly, dst_attribute.data.typed<T>());
- break;
- case ATTR_DOMAIN_CORNER:
- copy_spline_data_to_mesh(src.typed<T>(), offsets.loop, dst_attribute.data.typed<T>());
- break;
- default:
- BLI_assert_unreachable();
- break;
- }
- });
-}
-
-static void copy_spline_domain_attributes_to_mesh(const CurveEval &curve,
- const CurveEval &profile,
- const ResultOffsets &offsets,
- ResultAttributes &attributes)
-{
- if (!attributes.curve_spline_attributes.is_empty()) {
- int i = 0;
- curve.attributes.foreach_attribute(
- [&](const AttributeIDRef &id, const AttributeMetaData &UNUSED(meta_data)) {
- if (attributes.curve_spline_attributes[i]) {
- copy_spline_attribute_to_mesh(*curve.attributes.get_for_read(id),
- offsets,
- *attributes.curve_spline_attributes[i]);
- }
- i++;
- return true;
- },
- ATTR_DOMAIN_CURVE);
- }
- if (!attributes.profile_spline_attributes.is_empty()) {
- int i = 0;
- profile.attributes.foreach_attribute(
- [&](const AttributeIDRef &id, const AttributeMetaData &UNUSED(meta_data)) {
- if (attributes.profile_spline_attributes[i]) {
- copy_spline_attribute_to_mesh(*profile.attributes.get_for_read(id),
- offsets,
- *attributes.profile_spline_attributes[i]);
- }
- i++;
- return true;
- },
- ATTR_DOMAIN_CURVE);
- }
-}
-
-/**
- * \note Normal calculation is by far the slowest part of calculations relating to the result mesh.
- * Although it would be a sensible decision to use the better topology information available while
- * generating the mesh to also generate the normals, that work may wasted if the output mesh is
- * changed anyway in a way that affects the normals. So currently this code uses the safer /
- * simpler solution of deferring normal calculation to the rest of Blender.
- */
-static Mesh *curve_to_mesh_calculate(const CurveEval &curve, const CurveEval &profile)
-{
- Span<SplinePtr> profiles = profile.splines();
- Span<SplinePtr> curves = curve.splines();
-
- const ResultOffsets offsets = calculate_result_offsets(profiles, curves);
- if (offsets.vert.last() == 0) {
- return nullptr;
- }
-
- Mesh *mesh = BKE_mesh_new_nomain(
- offsets.vert.last(), offsets.edge.last(), 0, offsets.loop.last(), offsets.poly.last());
- BKE_id_material_eval_ensure_default_slot(&mesh->id);
- mesh->flag |= ME_AUTOSMOOTH;
- mesh->smoothresh = DEG2RADF(180.0f);
- BKE_mesh_normals_tag_dirty(mesh);
-
- ResultAttributes attributes = create_result_attributes(curve, profile, *mesh);
-
- threading::parallel_for(curves.index_range(), 128, [&](IndexRange curves_range) {
- for (const int i_spline : curves_range) {
- const Spline &spline = *curves[i_spline];
- if (spline.evaluated_points_size() == 0) {
- continue;
- }
- const int spline_start_index = i_spline * profiles.size();
- threading::parallel_for(profiles.index_range(), 128, [&](IndexRange profiles_range) {
- for (const int i_profile : profiles_range) {
- const Spline &profile = *profiles[i_profile];
- const int i_mesh = spline_start_index + i_profile;
- ResultInfo info{
- spline,
- profile,
- offsets.vert[i_mesh],
- offsets.edge[i_mesh],
- offsets.loop[i_mesh],
- offsets.poly[i_mesh],
- spline.evaluated_points_size(),
- spline.evaluated_edges_size(),
- profile.evaluated_points_size(),
- profile.evaluated_edges_size(),
- };
-
- spline_extrude_to_mesh_data(info,
- {mesh->mvert, mesh->totvert},
- {mesh->medge, mesh->totedge},
- {mesh->mloop, mesh->totloop},
- {mesh->mpoly, mesh->totpoly});
-
- copy_point_domain_attributes_to_mesh(info, attributes);
- }
- });
- }
- });
-
- copy_spline_domain_attributes_to_mesh(curve, profile, offsets, attributes);
-
- for (OutputAttribute &output_attribute : attributes.attributes) {
- output_attribute.save();
- }
-
- return mesh;
-}
-
-static CurveEval get_curve_single_vert()
-{
- CurveEval curve;
- std::unique_ptr<PolySpline> spline = std::make_unique<PolySpline>();
- spline->add_point(float3(0), 0, 0.0f);
- curve.add_spline(std::move(spline));
-
- return curve;
-}
-
static void geo_node_curve_to_mesh_exec(GeoNodeExecParams params)
{
GeometrySet curve_set = params.extract_input<GeometrySet>("Curve");
@@ -750,11 +57,14 @@ static void geo_node_curve_to_mesh_exec(GeoNodeExecParams params)
const CurveEval *profile_curve = profile_set.get_curve_for_read();
- static const CurveEval vert_curve = get_curve_single_vert();
-
- Mesh *mesh = curve_to_mesh_calculate(*curve_set.get_curve_for_read(),
- (profile_curve == nullptr) ? vert_curve : *profile_curve);
- params.set_output("Mesh", GeometrySet::create_with_mesh(mesh));
+ if (profile_curve == nullptr) {
+ Mesh *mesh = bke::curve_to_wire_mesh(*curve_set.get_curve_for_read());
+ params.set_output("Mesh", GeometrySet::create_with_mesh(mesh));
+ }
+ else {
+ Mesh *mesh = bke::curve_to_mesh_sweep(*curve_set.get_curve_for_read(), *profile_curve);
+ params.set_output("Mesh", GeometrySet::create_with_mesh(mesh));
+ }
}
} // namespace blender::nodes
diff --git a/source/blender/nodes/geometry/nodes/node_geo_input_normal.cc b/source/blender/nodes/geometry/nodes/node_geo_input_normal.cc
index 07818f2a3ad..f92086acdf0 100644
--- a/source/blender/nodes/geometry/nodes/node_geo_input_normal.cc
+++ b/source/blender/nodes/geometry/nodes/node_geo_input_normal.cc
@@ -14,10 +14,13 @@
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
+#include "BLI_task.hh"
+
#include "DNA_mesh_types.h"
#include "DNA_meshdata_types.h"
#include "BKE_mesh.h"
+#include "BKE_spline.hh"
#include "node_geometry_util.hh"
@@ -147,6 +150,95 @@ static const GVArray *construct_mesh_normals_gvarray(const MeshComponent &mesh_c
}
}
+static void calculate_bezier_normals(const BezierSpline &spline, MutableSpan<float3> normals)
+{
+ Span<int> offsets = spline.control_point_offsets();
+ Span<float3> evaluated_normals = spline.evaluated_normals();
+ for (const int i : IndexRange(spline.size())) {
+ normals[i] = evaluated_normals[offsets[i]];
+ }
+}
+
+static void calculate_poly_normals(const PolySpline &spline, MutableSpan<float3> normals)
+{
+ normals.copy_from(spline.evaluated_normals());
+}
+
+/**
+ * Because NURBS control points are not necessarily on the path, the normal at the control points
+ * is not well defined, so create a temporary poly spline to find the normals. This requires extra
+ * copying currently, but may be more efficient in the future if attributes have some form of CoW.
+ */
+static void calculate_nurbs_normals(const NURBSpline &spline, MutableSpan<float3> normals)
+{
+ PolySpline poly_spline;
+ poly_spline.resize(spline.size());
+ poly_spline.positions().copy_from(spline.positions());
+ normals.copy_from(poly_spline.evaluated_normals());
+}
+
+static Array<float3> curve_normal_point_domain(const CurveEval &curve)
+{
+ Span<SplinePtr> splines = curve.splines();
+ Array<int> offsets = curve.control_point_offsets();
+ const int total_size = offsets.last();
+ Array<float3> normals(total_size);
+
+ threading::parallel_for(splines.index_range(), 128, [&](IndexRange range) {
+ for (const int i : range) {
+ const Spline &spline = *splines[i];
+ MutableSpan spline_normals{normals.as_mutable_span().slice(offsets[i], spline.size())};
+ switch (splines[i]->type()) {
+ case Spline::Type::Bezier:
+ calculate_bezier_normals(static_cast<const BezierSpline &>(spline), spline_normals);
+ break;
+ case Spline::Type::Poly:
+ calculate_poly_normals(static_cast<const PolySpline &>(spline), spline_normals);
+ break;
+ case Spline::Type::NURBS:
+ calculate_nurbs_normals(static_cast<const NURBSpline &>(spline), spline_normals);
+ break;
+ }
+ }
+ });
+ return normals;
+}
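As a reading aid: `control_point_offsets()` is a prefix sum over the spline sizes, so each spline owns one contiguous slice of the flattened point-domain array. A hypothetical example:

/* Hypothetical example: spline sizes {4, 2, 3} give offsets {0, 4, 6, 9}, so
 * spline 0 owns normals[0..4), spline 1 owns normals[4..6), spline 2 owns
 * normals[6..9), and offsets.last() == 9 is the total point count. */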
+
+static const GVArray *construct_curve_normal_gvarray(const CurveComponent &component,
+ const AttributeDomain domain,
+ ResourceScope &scope)
+{
+ const CurveEval *curve = component.get_for_read();
+ if (curve == nullptr) {
+ return nullptr;
+ }
+
+ if (domain == ATTR_DOMAIN_POINT) {
+ const Span<SplinePtr> splines = curve->splines();
+
+ /* Use a reference to evaluated normals if possible to avoid an allocation and a copy.
+ * This is only possible when the curve contains a single poly spline. */
+ if (splines.size() == 1 && splines.first()->type() == Spline::Type::Poly) {
+ const PolySpline &spline = static_cast<PolySpline &>(*splines.first());
+ return &scope.construct<fn::GVArray_For_Span<float3>>(spline.evaluated_normals());
+ }
+
+ Array<float3> normals = curve_normal_point_domain(*curve);
+ return &scope.construct<fn::GVArray_For_ArrayContainer<Array<float3>>>(std::move(normals));
+ }
+
+ if (domain == ATTR_DOMAIN_CURVE) {
+ Array<float3> point_normals = curve_normal_point_domain(*curve);
+ GVArrayPtr gvarray = std::make_unique<fn::GVArray_For_ArrayContainer<Array<float3>>>(
+ std::move(point_normals));
+ GVArrayPtr spline_normals = component.attribute_try_adapt_domain(
+ std::move(gvarray), ATTR_DOMAIN_POINT, ATTR_DOMAIN_CURVE);
+ return scope.add_value(std::move(spline_normals)).get();
+ }
+
+ return nullptr;
+}
+
class NormalFieldInput final : public fn::FieldInput {
public:
NormalFieldInput() : fn::FieldInput(CPPType::get<float3>(), "Normal")
@@ -173,8 +265,8 @@ class NormalFieldInput final : public fn::FieldInput {
return construct_mesh_normals_gvarray(mesh_component, *mesh, mask, domain, scope);
}
if (component.type() == GEO_COMPONENT_TYPE_CURVE) {
- /* TODO: Add curve normals support. */
- return nullptr;
+ const CurveComponent &curve_component = static_cast<const CurveComponent &>(component);
+ return construct_curve_normal_gvarray(curve_component, domain, scope);
}
}
return nullptr;
diff --git a/source/blender/nodes/geometry/nodes/node_geo_input_tangent.cc b/source/blender/nodes/geometry/nodes/node_geo_input_tangent.cc
new file mode 100644
index 00000000000..68788709f1e
--- /dev/null
+++ b/source/blender/nodes/geometry/nodes/node_geo_input_tangent.cc
@@ -0,0 +1,174 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "BLI_task.hh"
+
+#include "BKE_spline.hh"
+
+#include "node_geometry_util.hh"
+
+namespace blender::nodes {
+
+static void geo_node_input_tangent_declare(NodeDeclarationBuilder &b)
+{
+ b.add_output<decl::Vector>("Tangent");
+}
+
+static void calculate_bezier_tangents(const BezierSpline &spline, MutableSpan<float3> tangents)
+{
+ Span<int> offsets = spline.control_point_offsets();
+ Span<float3> evaluated_tangents = spline.evaluated_tangents();
+ for (const int i : IndexRange(spline.size())) {
+ tangents[i] = evaluated_tangents[offsets[i]];
+ }
+}
+
+static void calculate_poly_tangents(const PolySpline &spline, MutableSpan<float3> tangents)
+{
+ tangents.copy_from(spline.evaluated_tangents());
+}
+
+/**
+ * Because NURBS control points are not necessarily on the path, the tangent at the control points
+ * is not well defined, so create a temporary poly spline to find the tangents. This requires extra
+ * copying currently, but may be more efficient in the future if attributes have some form of CoW.
+ */
+static void calculate_nurbs_tangents(const NURBSpline &spline, MutableSpan<float3> tangents)
+{
+ PolySpline poly_spline;
+ poly_spline.resize(spline.size());
+ poly_spline.positions().copy_from(spline.positions());
+ tangents.copy_from(poly_spline.evaluated_tangents());
+}
+
+static Array<float3> curve_tangent_point_domain(const CurveEval &curve)
+{
+ Span<SplinePtr> splines = curve.splines();
+ Array<int> offsets = curve.control_point_offsets();
+ const int total_size = offsets.last();
+ Array<float3> tangents(total_size);
+
+ threading::parallel_for(splines.index_range(), 128, [&](IndexRange range) {
+ for (const int i : range) {
+ const Spline &spline = *splines[i];
+ MutableSpan spline_tangents{tangents.as_mutable_span().slice(offsets[i], spline.size())};
+ switch (splines[i]->type()) {
+ case Spline::Type::Bezier: {
+ calculate_bezier_tangents(static_cast<const BezierSpline &>(spline), spline_tangents);
+ break;
+ }
+ case Spline::Type::Poly: {
+ calculate_poly_tangents(static_cast<const PolySpline &>(spline), spline_tangents);
+ break;
+ }
+ case Spline::Type::NURBS: {
+ calculate_nurbs_tangents(static_cast<const NURBSpline &>(spline), spline_tangents);
+ break;
+ }
+ }
+ }
+ });
+ return tangents;
+}
+
+static const GVArray *construct_curve_tangent_gvarray(const CurveComponent &component,
+ const AttributeDomain domain,
+ ResourceScope &scope)
+{
+ const CurveEval *curve = component.get_for_read();
+ if (curve == nullptr) {
+ return nullptr;
+ }
+
+ if (domain == ATTR_DOMAIN_POINT) {
+ const Span<SplinePtr> splines = curve->splines();
+
+ /* Use a reference to evaluated tangents if possible to avoid an allocation and a copy.
+ * This is only possible when the curve contains a single poly spline. */
+ if (splines.size() == 1 && splines.first()->type() == Spline::Type::Poly) {
+ const PolySpline &spline = static_cast<PolySpline &>(*splines.first());
+ return &scope.construct<fn::GVArray_For_Span<float3>>(spline.evaluated_tangents());
+ }
+
+ Array<float3> tangents = curve_tangent_point_domain(*curve);
+ return &scope.construct<fn::GVArray_For_ArrayContainer<Array<float3>>>(std::move(tangents));
+ }
+
+ if (domain == ATTR_DOMAIN_CURVE) {
+ Array<float3> point_tangents = curve_tangent_point_domain(*curve);
+ GVArrayPtr gvarray = std::make_unique<fn::GVArray_For_ArrayContainer<Array<float3>>>(
+ std::move(point_tangents));
+ GVArrayPtr spline_tangents = component.attribute_try_adapt_domain(
+ std::move(gvarray), ATTR_DOMAIN_POINT, ATTR_DOMAIN_CURVE);
+ return scope.add_value(std::move(spline_tangents)).get();
+ }
+
+ return nullptr;
+}
+
+class TangentFieldInput final : public fn::FieldInput {
+ public:
+ TangentFieldInput() : fn::FieldInput(CPPType::get<float3>(), "Tangent")
+ {
+ }
+
+ const GVArray *get_varray_for_context(const fn::FieldContext &context,
+ IndexMask UNUSED(mask),
+ ResourceScope &scope) const final
+ {
+ if (const GeometryComponentFieldContext *geometry_context =
+ dynamic_cast<const GeometryComponentFieldContext *>(&context)) {
+
+ const GeometryComponent &component = geometry_context->geometry_component();
+ const AttributeDomain domain = geometry_context->domain();
+
+ if (component.type() == GEO_COMPONENT_TYPE_CURVE) {
+ const CurveComponent &curve_component = static_cast<const CurveComponent &>(component);
+ return construct_curve_tangent_gvarray(curve_component, domain, scope);
+ }
+ }
+ return nullptr;
+ }
+
+ uint64_t hash() const override
+ {
+ /* Some random constant hash. */
+ return 91827364589;
+ }
+
+ bool is_equal_to(const fn::FieldNode &other) const override
+ {
+ return dynamic_cast<const TangentFieldInput *>(&other) != nullptr;
+ }
+};
+
+static void geo_node_input_tangent_exec(GeoNodeExecParams params)
+{
+ Field<float3> tangent_field{std::make_shared<TangentFieldInput>()};
+ params.set_output("Tangent", std::move(tangent_field));
+}
+
+} // namespace blender::nodes
+
+void register_node_type_geo_input_tangent()
+{
+ static bNodeType ntype;
+
+ geo_node_type_base(&ntype, GEO_NODE_INPUT_TANGENT, "Curve Tangent", NODE_CLASS_INPUT, 0);
+ ntype.geometry_node_execute = blender::nodes::geo_node_input_tangent_exec;
+ ntype.declare = blender::nodes::geo_node_input_tangent_declare;
+ nodeRegisterType(&ntype);
+}
diff --git a/source/blender/nodes/geometry/nodes/node_geo_string_join.cc b/source/blender/nodes/geometry/nodes/node_geo_string_join.cc
new file mode 100644
index 00000000000..1e4a4d1f68b
--- /dev/null
+++ b/source/blender/nodes/geometry/nodes/node_geo_string_join.cc
@@ -0,0 +1,53 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "node_geometry_util.hh"
+
+namespace blender::nodes {
+
+static void geo_node_string_join_declare(NodeDeclarationBuilder &b)
+{
+ b.add_input<decl::String>("Delimiter");
+ b.add_input<decl::String>("Strings").multi_input().hide_value();
+ b.add_output<decl::String>("String");
+}
+
+static void geo_node_string_join_exec(GeoNodeExecParams params)
+{
+ Vector<std::string> strings = params.extract_multi_input<std::string>("Strings");
+ const std::string delim = params.extract_input<std::string>("Delimiter");
+
+ std::string output;
+ for (const int i : strings.index_range()) {
+ output += strings[i];
+ if (i < (strings.size() - 1)) {
+ output += delim;
+ }
+ }
+ params.set_output("String", std::move(output));
+}
+
+} // namespace blender::nodes
+
+void register_node_type_geo_string_join()
+{
+ static bNodeType ntype;
+
+ geo_node_type_base(&ntype, GEO_NODE_STRING_JOIN, "String Join", NODE_CLASS_CONVERTER, 0);
+ ntype.geometry_node_execute = blender::nodes::geo_node_string_join_exec;
+ ntype.declare = blender::nodes::geo_node_string_join_declare;
+ nodeRegisterType(&ntype);
+}
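For clarity, the delimiter is inserted between consecutive strings only, never after the last one. A self-contained sketch of the same join logic in plain C++, with hypothetical input values:

#include <iostream>
#include <string>
#include <vector>

int main()
{
  /* Hypothetical inputs standing in for the node's multi-input socket. */
  const std::vector<std::string> strings = {"Suzanne", "Cube", "Plane"};
  const std::string delim = ", ";

  std::string output;
  for (size_t i = 0; i < strings.size(); i++) {
    output += strings[i];
    if (i + 1 < strings.size()) {
      output += delim; /* Between elements only, never after the last one. */
    }
  }
  std::cout << output << "\n"; /* Prints: Suzanne, Cube, Plane */
  return 0;
}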
diff --git a/source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c b/source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c
index f601f3e9fd0..06f4d1f1b79 100644
--- a/source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c
+++ b/source/blender/nodes/shader/nodes/node_shader_bsdf_principled.c
@@ -35,6 +35,8 @@ static bNodeSocketTemplate sh_node_bsdf_principled_in[] = {
PROP_NONE,
SOCK_COMPACT},
{SOCK_RGBA, N_("Subsurface Color"), 0.8f, 0.8f, 0.8f, 1.0f, 0.0f, 1.0f},
+ {SOCK_FLOAT, N_("Subsurface IOR"), 1.4f, 0.0f, 0.0f, 0.0f, 1.01f, 3.8f, PROP_FACTOR},
+ {SOCK_FLOAT, N_("Subsurface Anisotropy"), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
{SOCK_FLOAT, N_("Metallic"), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
{SOCK_FLOAT, N_("Specular"), 0.5f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
{SOCK_FLOAT, N_("Specular Tint"), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
@@ -74,7 +76,7 @@ static bNodeSocketTemplate sh_node_bsdf_principled_out[] = {
static void node_shader_init_principled(bNodeTree *UNUSED(ntree), bNode *node)
{
node->custom1 = SHD_GLOSSY_GGX;
- node->custom2 = SHD_SUBSURFACE_BURLEY;
+ node->custom2 = SHD_SUBSURFACE_RANDOM_WALK;
}
#define socket_not_zero(sock) (in[sock].link || (clamp_f(in[sock].vec[0], 0.0f, 1.0f) > 1e-5f))
@@ -90,41 +92,40 @@ static int node_shader_gpu_bsdf_principled(GPUMaterial *mat,
GPUNodeLink *sss_scale;
/* Normals */
- if (!in[20].link) {
- GPU_link(mat, "world_normals_get", &in[20].link);
+ if (!in[22].link) {
+ GPU_link(mat, "world_normals_get", &in[22].link);
}
/* Clearcoat Normals */
- if (!in[21].link) {
- GPU_link(mat, "world_normals_get", &in[21].link);
+ if (!in[23].link) {
+ GPU_link(mat, "world_normals_get", &in[23].link);
}
#if 0 /* Not used at the moment. */
/* Tangents */
- if (!in[22].link) {
+ if (!in[24].link) {
GPUNodeLink *orco = GPU_attribute(CD_ORCO, "");
- GPU_link(mat, "tangent_orco_z", orco, &in[22].link);
+ GPU_link(mat, "tangent_orco_z", orco, &in[24].link);
GPU_link(mat,
"node_tangent",
GPU_builtin(GPU_WORLD_NORMAL),
- in[22].link,
+ in[24].link,
GPU_builtin(GPU_OBJECT_MATRIX),
- &in[22].link);
+ &in[24].link);
}
#endif
- bool use_diffuse = socket_not_one(4) && socket_not_one(15);
+ bool use_diffuse = socket_not_one(6) && socket_not_one(17);
bool use_subsurf = socket_not_zero(1) && use_diffuse && node->sss_id > 0;
- bool use_refract = socket_not_one(4) && socket_not_zero(15);
- bool use_clear = socket_not_zero(12);
+ bool use_refract = socket_not_one(6) && socket_not_zero(17);
+ bool use_clear = socket_not_zero(14);
/* SSS Profile */
if (use_subsurf) {
- static short profile = SHD_SUBSURFACE_BURLEY;
bNodeSocket *socket = BLI_findlink(&node->original->inputs, 2);
bNodeSocketValueRGBA *socket_data = socket->default_value;
/* For some reason it seems that the socket value is in ARGB format. */
- GPU_material_sss_profile_create(mat, &socket_data->value[1], &profile, NULL);
+ GPU_material_sss_profile_create(mat, &socket_data->value[1]);
}
if (in[2].link) {
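Editorial note on the renumbering in this hunk: the two sockets added above ("Subsurface IOR" and "Subsurface Anisotropy") are inserted right after "Subsurface Color", shifting every later input index by two. The inferred mapping, hedged since the names are read off the socket template order:

/* Old index -> new index (names inferred from the template array above):
 *   Metallic          4 -> 6     Clearcoat         12 -> 14
 *   Transmission     15 -> 17    Normal            20 -> 22
 *   Clearcoat Normal 21 -> 23    Tangent           22 -> 24 */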
diff --git a/source/blender/nodes/shader/nodes/node_shader_subsurface_scattering.c b/source/blender/nodes/shader/nodes/node_shader_subsurface_scattering.c
index 4b91bcbd11c..e917858e0f2 100644
--- a/source/blender/nodes/shader/nodes/node_shader_subsurface_scattering.c
+++ b/source/blender/nodes/shader/nodes/node_shader_subsurface_scattering.c
@@ -25,8 +25,8 @@ static bNodeSocketTemplate sh_node_subsurface_scattering_in[] = {
{SOCK_RGBA, N_("Color"), 0.8f, 0.8f, 0.8f, 1.0f, 0.0f, 1.0f},
{SOCK_FLOAT, N_("Scale"), 1.0, 0.0f, 0.0f, 0.0f, 0.0f, 1000.0f},
{SOCK_VECTOR, N_("Radius"), 1.0f, 0.2f, 0.1f, 0.0f, 0.0f, 100.0f, PROP_NONE, SOCK_COMPACT},
- {SOCK_FLOAT, N_("Sharpness"), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
- {SOCK_FLOAT, N_("Texture Blur"), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
+ {SOCK_FLOAT, N_("IOR"), 1.4f, 0.0f, 0.0f, 0.0f, 1.01f, 3.8f, PROP_FACTOR},
+ {SOCK_FLOAT, N_("Anisotropy"), 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, PROP_FACTOR},
{SOCK_VECTOR, N_("Normal"), 0.0f, 0.0f, 0.0f, 1.0f, -1.0f, 1.0f, PROP_NONE, SOCK_HIDE_VALUE},
{-1, ""},
};
@@ -38,7 +38,8 @@ static bNodeSocketTemplate sh_node_subsurface_scattering_out[] = {
static void node_shader_init_subsurface_scattering(bNodeTree *UNUSED(ntree), bNode *node)
{
- node->custom1 = SHD_SUBSURFACE_BURLEY;
+ node->custom1 = SHD_SUBSURFACE_RANDOM_WALK;
+ node->custom2 = true;
}
static int node_shader_gpu_subsurface_scattering(GPUMaterial *mat,
@@ -54,11 +55,8 @@ static int node_shader_gpu_subsurface_scattering(GPUMaterial *mat,
if (node->sss_id > 0) {
bNodeSocket *socket = BLI_findlink(&node->original->inputs, 2);
bNodeSocketValueRGBA *socket_data = socket->default_value;
- bNodeSocket *socket_sharp = BLI_findlink(&node->original->inputs, 3);
- bNodeSocketValueFloat *socket_data_sharp = socket_sharp->default_value;
/* For some reason it seems that the socket value is in ARGB format. */
- GPU_material_sss_profile_create(
- mat, &socket_data->value[1], &node->original->custom1, &socket_data_sharp->value);
+ GPU_material_sss_profile_create(mat, &socket_data->value[1]);
/* sss_id is 0 only if the node is not connected to any output.
* In this case flagging the material would trigger a bug (see T68736). */
@@ -69,23 +67,6 @@ static int node_shader_gpu_subsurface_scattering(GPUMaterial *mat,
mat, node, "node_subsurface_scattering", in, out, GPU_constant(&node->sss_id));
}
-static void node_shader_update_subsurface_scattering(bNodeTree *UNUSED(ntree), bNode *node)
-{
- bNodeSocket *sock;
- int falloff = node->custom1;
-
- for (sock = node->inputs.first; sock; sock = sock->next) {
- if (STREQ(sock->name, "Sharpness")) {
- if (falloff == SHD_SUBSURFACE_CUBIC) {
- sock->flag &= ~SOCK_UNAVAIL;
- }
- else {
- sock->flag |= SOCK_UNAVAIL;
- }
- }
- }
-}
-
/* node type definition */
void register_node_type_sh_subsurface_scattering(void)
{
@@ -99,7 +80,6 @@ void register_node_type_sh_subsurface_scattering(void)
node_type_init(&ntype, node_shader_init_subsurface_scattering);
node_type_storage(&ntype, "", NULL, NULL);
node_type_gpu(&ntype, node_shader_gpu_subsurface_scattering);
- node_type_update(&ntype, node_shader_update_subsurface_scattering);
nodeRegisterType(&ntype);
}
diff --git a/source/blender/render/CMakeLists.txt b/source/blender/render/CMakeLists.txt
index 0046474d064..494415a4077 100644
--- a/source/blender/render/CMakeLists.txt
+++ b/source/blender/render/CMakeLists.txt
@@ -59,7 +59,6 @@ set(SRC
RE_pipeline.h
RE_texture.h
- intern/initrender.h
intern/pipeline.h
intern/render_result.h
intern/render_types.h
diff --git a/source/blender/render/RE_engine.h b/source/blender/render/RE_engine.h
index dfc0d5d0e9f..2a3a5964262 100644
--- a/source/blender/render/RE_engine.h
+++ b/source/blender/render/RE_engine.h
@@ -40,6 +40,7 @@ struct RenderData;
struct RenderEngine;
struct RenderEngineType;
struct RenderLayer;
+struct RenderPass;
struct RenderResult;
struct ReportList;
struct Scene;
@@ -59,7 +60,7 @@ extern "C" {
#define RE_USE_PREVIEW 4
#define RE_USE_POSTPROCESS 8
#define RE_USE_EEVEE_VIEWPORT 16
-#define RE_USE_SAVE_BUFFERS 32
+/* #define RE_USE_SAVE_BUFFERS_DEPRECATED 32 */
#define RE_USE_SHADING_NODES_CUSTOM 64
#define RE_USE_SPHERICAL_STEREO 128
#define RE_USE_STEREO_VIEWPORT 256
@@ -75,6 +76,7 @@ extern "C" {
#define RE_ENGINE_DO_UPDATE 8
#define RE_ENGINE_RENDERING 16
#define RE_ENGINE_HIGHLIGHT_TILES 32
+#define RE_ENGINE_CAN_DRAW 64
extern ListBase R_engines;
@@ -87,7 +89,20 @@ typedef struct RenderEngineType {
int flag;
void (*update)(struct RenderEngine *engine, struct Main *bmain, struct Depsgraph *depsgraph);
+
void (*render)(struct RenderEngine *engine, struct Depsgraph *depsgraph);
+
+ /* Offline rendering is finished - no more view layers will be rendered.
+ *
+ * All pending data is to be communicated from the engine back to Blender, ideally in the most
+ * memory-efficient manner (the engine might free its database before making Blender allocate
+ * the full-frame render result). */
+ void (*render_frame_finish)(struct RenderEngine *engine);
+
+ void (*draw)(struct RenderEngine *engine,
+ const struct bContext *context,
+ struct Depsgraph *depsgraph);
+
void (*bake)(struct RenderEngine *engine,
struct Depsgraph *depsgraph,
struct Object *object,
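As a reading aid, a hedged sketch of how an engine type might populate the new callbacks; the my_engine_* functions are hypothetical, only the field names and signatures come from this header:

/* Hypothetical engine functions; only the RenderEngineType fields and callback
 * signatures come from this header. */
static void my_engine_render(struct RenderEngine *engine, struct Depsgraph *depsgraph)
{
  /* Called for each view layer to be rendered. */
}

static void my_engine_render_frame_finish(struct RenderEngine *engine)
{
  /* All view layers are done: flush pending passes back to Blender, then free
   * the engine database before the full-frame render result is allocated. */
}

static void my_engine_draw(struct RenderEngine *engine,
                           const struct bContext *context,
                           struct Depsgraph *depsgraph)
{
  /* Draw the in-progress result; only runs between RE_engine_draw_acquire()
   * and RE_engine_draw_release(). */
}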
@@ -132,9 +147,6 @@ typedef struct RenderEngine {
struct Object *camera_override;
unsigned int layer_override;
- int tile_x;
- int tile_y;
-
struct Render *re;
ListBase fullresult;
char text[512]; /* IMA_MAX_RENDER_TEXT */
@@ -189,6 +201,10 @@ void RE_engine_end_result(RenderEngine *engine,
bool merge_results);
struct RenderResult *RE_engine_get_result(struct RenderEngine *engine);
+struct RenderPass *RE_engine_pass_by_index_get(struct RenderEngine *engine,
+ const char *layer_name,
+ int index);
+
const char *RE_engine_active_view_get(RenderEngine *engine);
void RE_engine_active_view_set(RenderEngine *engine, const char *viewname);
float RE_engine_get_camera_shift_x(RenderEngine *engine,
@@ -228,6 +244,24 @@ void RE_engine_register_pass(struct RenderEngine *engine,
bool RE_engine_use_persistent_data(struct RenderEngine *engine);
+struct RenderEngine *RE_engine_get(const struct Render *re);
+
+/* Acquire render engine for drawing via its `draw()` callback.
+ *
+ * If drawing is not possible, false is returned. Otherwise the engine is "acquired" so that it
+ * cannot be freed by the render pipeline.
+ *
+ * Drawing is possible if the engine has a `draw()` callback and is currently inside its
+ * `render()` callback. */
+bool RE_engine_draw_acquire(struct Render *re);
+void RE_engine_draw_release(struct Render *re);
+
+/* NOTE: Only used for Cycles' BlenderGPUDisplay integration with the draw manager; a subject
+ * for reconsideration. Do not use this functionality. */
+bool RE_engine_has_render_context(struct RenderEngine *engine);
+void RE_engine_render_context_enable(struct RenderEngine *engine);
+void RE_engine_render_context_disable(struct RenderEngine *engine);
+
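A minimal usage sketch of the acquire/release pair, assuming the caller already holds a Render handle and draw context (variable names hypothetical):

/* `re`, `C` and `depsgraph` are assumed to be provided by the caller. */
if (RE_engine_draw_acquire(re)) {
  struct RenderEngine *engine = RE_engine_get(re);
  /* The engine cannot be freed here and is inside its render() callback. */
  engine->type->draw(engine, C, depsgraph);
  RE_engine_draw_release(re);
}
else {
  /* No draw() callback, or the engine is not rendering: fall back to drawing
   * the RenderResult pixel buffers. */
}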
/* Engine Types */
void RE_engines_init(void);
@@ -252,6 +286,10 @@ void RE_bake_engine_set_engine_parameters(struct Render *re,
void RE_engine_free_blender_memory(struct RenderEngine *engine);
+void RE_engine_tile_highlight_set(
+ struct RenderEngine *engine, int x, int y, int width, int height, bool highlight);
+void RE_engine_tile_highlight_clear_all(struct RenderEngine *engine);
+
#ifdef __cplusplus
}
#endif
diff --git a/source/blender/render/RE_pipeline.h b/source/blender/render/RE_pipeline.h
index cd839385bfb..3237772dd80 100644
--- a/source/blender/render/RE_pipeline.h
+++ b/source/blender/render/RE_pipeline.h
@@ -141,9 +141,6 @@ typedef struct RenderResult {
volatile rcti renrect;
volatile RenderLayer *renlay;
- /* optional saved endresult on disk */
- int do_exr_tile;
-
/* for render results in Image, verify validity for sequences */
int framenr;
diff --git a/source/blender/render/intern/bake.c b/source/blender/render/intern/bake.c
index 76839651b5d..0f893ce8cd5 100644
--- a/source/blender/render/intern/bake.c
+++ b/source/blender/render/intern/bake.c
@@ -774,18 +774,6 @@ void RE_bake_pixels_populate(Mesh *me,
/* ******************** NORMALS ************************ */
-/**
- * convert a normalized normal to the -1.0 1.0 range
- * the input is expected to be POS_X, POS_Y, POS_Z
- */
-static void normal_uncompress(float out[3], const float in[3])
-{
- int i;
- for (i = 0; i < 3; i++) {
- out[i] = 2.0f * in[i] - 1.0f;
- }
-}
-
static void normal_compress(float out[3],
const float in[3],
const eBakeNormalSwizzle normal_swizzle[3])
@@ -934,7 +922,7 @@ void RE_bake_normal_world_to_tangent(const BakePixel pixel_array[],
copy_v3_v3(tsm[2], normal);
/* texture values */
- normal_uncompress(nor, &result[offset]);
+ copy_v3_v3(nor, &result[offset]);
/* converts from world space to local space */
mul_transposed_mat3_m4_v3(mat, nor);
@@ -976,7 +964,7 @@ void RE_bake_normal_world_to_object(const BakePixel pixel_array[],
}
offset = i * depth;
- normal_uncompress(nor, &result[offset]);
+ copy_v3_v3(nor, &result[offset]);
/* rotates only without translation */
mul_mat3_m4_v3(iobmat, nor);
@@ -1004,7 +992,7 @@ void RE_bake_normal_world_to_world(const BakePixel pixel_array[],
}
offset = i * depth;
- normal_uncompress(nor, &result[offset]);
+ copy_v3_v3(nor, &result[offset]);
/* save back the values */
normal_compress(&result[offset], nor, normal_swizzle);
@@ -1053,6 +1041,7 @@ int RE_pass_depth(const eScenePassType pass_type)
}
case SCE_PASS_COMBINED:
case SCE_PASS_SHADOW:
+ case SCE_PASS_POSITION:
case SCE_PASS_NORMAL:
case SCE_PASS_VECTOR:
case SCE_PASS_INDEXOB: /* XXX double check */
diff --git a/source/blender/render/intern/engine.c b/source/blender/render/intern/engine.c
index 5728b784714..389b821ca35 100644
--- a/source/blender/render/intern/engine.c
+++ b/source/blender/render/intern/engine.c
@@ -62,7 +62,6 @@
#include "DRW_engine.h"
-#include "initrender.h"
#include "pipeline.h"
#include "render_result.h"
#include "render_types.h"
@@ -283,14 +282,6 @@ static void render_result_to_bake(RenderEngine *engine, RenderResult *rr)
/* Render Results */
-static RenderPart *get_part_from_result(Render *re, RenderResult *result)
-{
- rcti key = result->tilerect;
- BLI_rcti_translate(&key, re->disprect.xmin, re->disprect.ymin);
-
- return BLI_ghash_lookup(re->parts, &key);
-}
-
static HighlightedTile highlighted_tile_from_result_get(Render *re, RenderResult *result)
{
HighlightedTile tile;
@@ -300,6 +291,37 @@ static HighlightedTile highlighted_tile_from_result_get(Render *re, RenderResult
return tile;
}
+static void engine_tile_highlight_set(RenderEngine *engine,
+ const HighlightedTile *tile,
+ bool highlight)
+{
+ if ((engine->flag & RE_ENGINE_HIGHLIGHT_TILES) == 0) {
+ return;
+ }
+
+ Render *re = engine->re;
+
+ BLI_mutex_lock(&re->highlighted_tiles_mutex);
+
+ if (re->highlighted_tiles == NULL) {
+ re->highlighted_tiles = BLI_gset_new(
+ BLI_ghashutil_inthash_v4_p, BLI_ghashutil_inthash_v4_cmp, "highlighted tiles");
+ }
+
+ if (highlight) {
+ HighlightedTile **tile_in_set;
+ if (!BLI_gset_ensure_p_ex(re->highlighted_tiles, tile, (void ***)&tile_in_set)) {
+ *tile_in_set = MEM_mallocN(sizeof(HighlightedTile), __func__);
+ **tile_in_set = *tile;
+ }
+ }
+ else {
+ BLI_gset_remove(re->highlighted_tiles, tile, MEM_freeN);
+ }
+
+ BLI_mutex_unlock(&re->highlighted_tiles_mutex);
+}
+
RenderResult *RE_engine_begin_result(
RenderEngine *engine, int x, int y, int w, int h, const char *layername, const char *viewname)
{
@@ -332,7 +354,7 @@ RenderResult *RE_engine_begin_result(
disprect.ymin = y;
disprect.ymax = y + h;
- result = render_result_new(re, &disprect, RR_USE_MEM, layername, viewname);
+ result = render_result_new(re, &disprect, layername, viewname);
/* TODO: make this thread safe. */
@@ -341,25 +363,12 @@ RenderResult *RE_engine_begin_result(
render_result_clone_passes(re, result, viewname);
render_result_passes_allocated_ensure(result);
- RenderPart *pa;
-
- /* Copy EXR tile settings, so pipeline knows whether this is a result
- * for Save Buffers enabled rendering.
- */
- result->do_exr_tile = re->result->do_exr_tile;
-
BLI_addtail(&engine->fullresult, result);
result->tilerect.xmin += re->disprect.xmin;
result->tilerect.xmax += re->disprect.xmin;
result->tilerect.ymin += re->disprect.ymin;
result->tilerect.ymax += re->disprect.ymin;
-
- pa = get_part_from_result(re, result);
-
- if (pa) {
- pa->status = PART_STATUS_IN_PROGRESS;
- }
}
return result;
@@ -426,53 +435,14 @@ void RE_engine_end_result(
re_ensure_passes_allocated_thread_safe(re);
- /* merge. on break, don't merge in result for preview renders, looks nicer */
- if (!highlight) {
- /* for exr tile render, detect tiles that are done */
- RenderPart *pa = get_part_from_result(re, result);
-
- if (pa) {
- pa->status = (!cancel && merge_results) ? PART_STATUS_MERGED : PART_STATUS_RENDERED;
- }
- else if (re->result->do_exr_tile) {
- /* If written result does not match any tile and we are using save
- * buffers, we are going to get OpenEXR save errors. */
- fprintf(stderr, "RenderEngine.end_result: dimensions do not match any OpenEXR tile.\n");
- }
- }
-
if (re->engine && (re->engine->flag & RE_ENGINE_HIGHLIGHT_TILES)) {
- BLI_mutex_lock(&re->highlighted_tiles_mutex);
-
- if (re->highlighted_tiles == NULL) {
- re->highlighted_tiles = BLI_gset_new(
- BLI_ghashutil_inthash_v4_p, BLI_ghashutil_inthash_v4_cmp, "highlighted tiles");
- }
+ const HighlightedTile tile = highlighted_tile_from_result_get(re, result);
- HighlightedTile tile = highlighted_tile_from_result_get(re, result);
- if (highlight) {
- void **tile_in_set;
- if (!BLI_gset_ensure_p_ex(re->highlighted_tiles, &tile, &tile_in_set)) {
- *tile_in_set = MEM_mallocN(sizeof(HighlightedTile), __func__);
- memcpy(*tile_in_set, &tile, sizeof(tile));
- }
- BLI_gset_add(re->highlighted_tiles, &tile);
- }
- else {
- BLI_gset_remove(re->highlighted_tiles, &tile, MEM_freeN);
- }
-
- BLI_mutex_unlock(&re->highlighted_tiles_mutex);
+ engine_tile_highlight_set(engine, &tile, highlight);
}
if (!cancel || merge_results) {
- if (re->result->do_exr_tile) {
- if (!cancel && merge_results) {
- render_result_exr_file_merge(re->result, result, re->viewname);
- render_result_merge(re->result, result);
- }
- }
- else if (!(re->test_break(re->tbh) && (re->r.scemode & R_BUTS_PREVIEW))) {
+ if (!(re->test_break(re->tbh) && (re->r.scemode & R_BUTS_PREVIEW))) {
render_result_merge(re->result, result);
}
@@ -582,6 +552,27 @@ void RE_engine_set_error_message(RenderEngine *engine, const char *msg)
}
}
+RenderPass *RE_engine_pass_by_index_get(RenderEngine *engine, const char *layer_name, int index)
+{
+ Render *re = engine->re;
+ if (re == NULL) {
+ return NULL;
+ }
+
+ RenderPass *pass = NULL;
+
+ RenderResult *rr = RE_AcquireResultRead(re);
+ if (rr != NULL) {
+ const RenderLayer *layer = RE_GetRenderLayer(rr, layer_name);
+ if (layer != NULL) {
+ pass = BLI_findlink(&layer->passes, index);
+ }
+ }
+ RE_ReleaseResult(re);
+
+ return pass;
+}
+
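A short usage sketch: the function returns NULL once the index runs past the last pass, so callers can probe indices sequentially (the layer name is hypothetical):

/* "ViewLayer" is a hypothetical layer name. */
for (int i = 0;; i++) {
  struct RenderPass *pass = RE_engine_pass_by_index_get(engine, "ViewLayer", i);
  if (pass == NULL) {
    break; /* Index is past the last pass, or the layer does not exist. */
  }
  /* Inspect pass->name, pass->channels, ... */
}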
const char *RE_engine_active_view_get(RenderEngine *engine)
{
Render *re = engine->re;
@@ -837,12 +828,6 @@ bool RE_bake_engine(Render *re,
engine->resolution_x = re->winx;
engine->resolution_y = re->winy;
- BLI_rw_mutex_lock(&re->partsmutex, THREAD_LOCK_WRITE);
- RE_parts_init(re);
- engine->tile_x = re->r.tilex;
- engine->tile_y = re->r.tiley;
- BLI_rw_mutex_unlock(&re->partsmutex);
-
if (type->bake) {
engine->depsgraph = depsgraph;
@@ -870,21 +855,13 @@ bool RE_bake_engine(Render *re,
engine->depsgraph = NULL;
}
- engine->tile_x = 0;
- engine->tile_y = 0;
engine->flag &= ~RE_ENGINE_RENDERING;
- /* Free depsgraph outside of parts mutex lock, since this locks OpenGL context
- * while the UI drawing might also lock the OpenGL context and parts mutex. */
engine_depsgraph_free(engine);
- BLI_rw_mutex_lock(&re->partsmutex, THREAD_LOCK_WRITE);
RE_engine_free(engine);
re->engine = NULL;
- RE_parts_free(re);
- BLI_rw_mutex_unlock(&re->partsmutex);
-
if (BKE_reports_contain(re->reports, RPT_ERROR)) {
G.is_break = true;
}
@@ -928,15 +905,23 @@ static void engine_render_view_layer(Render *re,
DRW_render_context_enable(engine->re);
}
+ BLI_mutex_lock(&engine->re->engine_draw_mutex);
+ re->engine->flag |= RE_ENGINE_CAN_DRAW;
+ BLI_mutex_unlock(&engine->re->engine_draw_mutex);
+
engine->type->render(engine, engine->depsgraph);
+ BLI_mutex_lock(&engine->re->engine_draw_mutex);
+ re->engine->flag &= ~RE_ENGINE_CAN_DRAW;
+ BLI_mutex_unlock(&engine->re->engine_draw_mutex);
+
if (use_gpu_context) {
DRW_render_context_disable(engine->re);
}
}
/* Optionally composite grease pencil over render result. */
- if (engine->has_grease_pencil && use_grease_pencil && !re->result->do_exr_tile) {
+ if (engine->has_grease_pencil && use_grease_pencil) {
/* NOTE: External engine might have been requested to free its
* dependency graph, which is only allowed if there is no grease
* pencil (pipeline is taking care of that). */
@@ -981,16 +966,11 @@ bool RE_engine_render(Render *re, bool do_all)
/* create render result */
BLI_rw_mutex_lock(&re->resultmutex, THREAD_LOCK_WRITE);
if (re->result == NULL || !(re->r.scemode & R_BUTS_PREVIEW)) {
- int savebuffers = RR_USE_MEM;
-
if (re->result) {
render_result_free(re->result);
}
- if ((type->flag & RE_USE_SAVE_BUFFERS) && (re->r.scemode & R_EXR_TILE_FILE)) {
- savebuffers = RR_USE_EXR;
- }
- re->result = render_result_new(re, &re->disprect, savebuffers, RR_ALL_LAYERS, RR_ALL_VIEWS);
+ re->result = render_result_new(re, &re->disprect, RR_ALL_LAYERS, RR_ALL_VIEWS);
}
BLI_rw_mutex_unlock(&re->resultmutex);
@@ -1035,32 +1015,15 @@ bool RE_engine_render(Render *re, bool do_all)
engine->resolution_x = re->winx;
engine->resolution_y = re->winy;
- BLI_rw_mutex_lock(&re->partsmutex, THREAD_LOCK_WRITE);
- RE_parts_init(re);
- engine->tile_x = re->partx;
- engine->tile_y = re->party;
- BLI_rw_mutex_unlock(&re->partsmutex);
-
- if (re->result->do_exr_tile) {
- render_result_exr_file_begin(re, engine);
- }
-
/* Clear UI drawing locks. */
if (re->draw_lock) {
re->draw_lock(re->dlh, false);
}
- /* Render view layers. */
- bool delay_grease_pencil = false;
-
if (type->render) {
FOREACH_VIEW_LAYER_TO_RENDER_BEGIN (re, view_layer_iter) {
engine_render_view_layer(re, engine, view_layer_iter, true, true);
- /* With save buffers there is no render buffer in memory for compositing, delay
- * grease pencil in that case. */
- delay_grease_pencil = engine->has_grease_pencil && re->result->do_exr_tile;
-
if (RE_engine_test_break(engine)) {
break;
}
@@ -1068,42 +1031,18 @@ bool RE_engine_render(Render *re, bool do_all)
FOREACH_VIEW_LAYER_TO_RENDER_END;
}
+ if (type->render_frame_finish) {
+ type->render_frame_finish(engine);
+ }
+
/* Clear tile data */
- engine->tile_x = 0;
- engine->tile_y = 0;
engine->flag &= ~RE_ENGINE_RENDERING;
render_result_free_list(&engine->fullresult, engine->fullresult.first);
- BLI_rw_mutex_lock(&re->partsmutex, THREAD_LOCK_WRITE);
-
- /* For save buffers, read back from disk. */
- if (re->result->do_exr_tile) {
- render_result_exr_file_end(re, engine);
- }
-
- /* Perform delayed grease pencil rendering. */
- if (delay_grease_pencil) {
- BLI_rw_mutex_unlock(&re->partsmutex);
-
- FOREACH_VIEW_LAYER_TO_RENDER_BEGIN (re, view_layer_iter) {
- engine_render_view_layer(re, engine, view_layer_iter, false, true);
- if (RE_engine_test_break(engine)) {
- break;
- }
- }
- FOREACH_VIEW_LAYER_TO_RENDER_END;
-
- BLI_rw_mutex_lock(&re->partsmutex, THREAD_LOCK_WRITE);
- }
-
/* re->engine becomes zero if user changed active render engine during render */
if (!engine_keep_depsgraph(engine) || !re->engine) {
- /* Free depsgraph outside of parts mutex lock, since this locks OpenGL context
- * while the UI drawing might also lock the OpenGL context and parts mutex. */
- BLI_rw_mutex_unlock(&re->partsmutex);
engine_depsgraph_free(engine);
- BLI_rw_mutex_lock(&re->partsmutex, THREAD_LOCK_WRITE);
RE_engine_free(engine);
re->engine = NULL;
@@ -1115,9 +1054,6 @@ bool RE_engine_render(Render *re, bool do_all)
BLI_rw_mutex_unlock(&re->resultmutex);
}
- RE_parts_free(re);
- BLI_rw_mutex_unlock(&re->partsmutex);
-
if (BKE_reports_contain(re->reports, RPT_ERROR)) {
G.is_break = true;
}
@@ -1179,3 +1115,81 @@ void RE_engine_free_blender_memory(RenderEngine *engine)
}
engine_depsgraph_free(engine);
}
+
+struct RenderEngine *RE_engine_get(const Render *re)
+{
+ return re->engine;
+}
+
+bool RE_engine_draw_acquire(Render *re)
+{
+ BLI_mutex_lock(&re->engine_draw_mutex);
+
+ RenderEngine *engine = re->engine;
+
+ if (engine == NULL || engine->type->draw == NULL || (engine->flag & RE_ENGINE_CAN_DRAW) == 0) {
+ BLI_mutex_unlock(&re->engine_draw_mutex);
+ return false;
+ }
+
+ return true;
+}
+
+void RE_engine_draw_release(Render *re)
+{
+ BLI_mutex_unlock(&re->engine_draw_mutex);
+}
+
+void RE_engine_tile_highlight_set(
+ RenderEngine *engine, int x, int y, int width, int height, bool highlight)
+{
+ HighlightedTile tile;
+ BLI_rcti_init(&tile.rect, x, x + width, y, y + height);
+
+ engine_tile_highlight_set(engine, &tile, highlight);
+}
+
+void RE_engine_tile_highlight_clear_all(RenderEngine *engine)
+{
+ if ((engine->flag & RE_ENGINE_HIGHLIGHT_TILES) == 0) {
+ return;
+ }
+
+ Render *re = engine->re;
+
+ BLI_mutex_lock(&re->highlighted_tiles_mutex);
+
+ if (re->highlighted_tiles != NULL) {
+ BLI_gset_clear(re->highlighted_tiles, MEM_freeN);
+ }
+
+ BLI_mutex_unlock(&re->highlighted_tiles_mutex);
+}
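With the RenderPart bookkeeping removed, tile highlighting is now driven explicitly by the engine. A minimal sketch, assuming the tile coordinates come from the engine's own scheduler:

/* tile_x/tile_y/tile_width/tile_height are assumed to come from the engine. */
RE_engine_tile_highlight_set(engine, tile_x, tile_y, tile_width, tile_height, true);
/* ... render the tile ... */
RE_engine_tile_highlight_set(engine, tile_x, tile_y, tile_width, tile_height, false);

/* Drop any leftover highlights, e.g. once the whole frame finishes. */
RE_engine_tile_highlight_clear_all(engine);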
+
+/* -------------------------------------------------------------------- */
+/** \name OpenGL context manipulation.
+ *
+ * NOTE: Only used for Cycles' BlenderGPUDisplay integration with the draw manager; a subject
+ * for reconsideration. Do not use this functionality.
+ * \{ */
+
+bool RE_engine_has_render_context(RenderEngine *engine)
+{
+ if (engine->re == NULL) {
+ return false;
+ }
+
+ return RE_gl_context_get(engine->re) != NULL;
+}
+
+void RE_engine_render_context_enable(RenderEngine *engine)
+{
+ DRW_render_context_enable(engine->re);
+}
+
+void RE_engine_render_context_disable(RenderEngine *engine)
+{
+ DRW_render_context_disable(engine->re);
+}
+
+/** \} */
diff --git a/source/blender/render/intern/initrender.c b/source/blender/render/intern/initrender.c
index 3148625c866..2370d8e893b 100644
--- a/source/blender/render/intern/initrender.c
+++ b/source/blender/render/intern/initrender.c
@@ -43,9 +43,6 @@
#include "pipeline.h"
#include "render_types.h"
-/* Own includes */
-#include "initrender.h"
-
/* ****************** MASKS and LUTS **************** */
static float filt_quadratic(float x)
@@ -244,91 +241,3 @@ void RE_GetViewPlane(Render *re, rctf *r_viewplane, rcti *r_disprect)
BLI_rcti_init(r_disprect, 0, 0, 0, 0);
}
}
-
-/* ~~~~~~~~~~~~~~~~ part (tile) calculus ~~~~~~~~~~~~~~~~~~~~~~ */
-
-void RE_parts_free(Render *re)
-{
- if (re->parts) {
- BLI_ghash_free(re->parts, NULL, MEM_freeN);
- re->parts = NULL;
- }
-}
-
-void RE_parts_clamp(Render *re)
-{
- /* part size */
- re->partx = max_ii(1, min_ii(re->r.tilex, re->rectx));
- re->party = max_ii(1, min_ii(re->r.tiley, re->recty));
-}
-
-void RE_parts_init(Render *re)
-{
- int nr, xd, yd, partx, party, xparts, yparts;
- int xminb, xmaxb, yminb, ymaxb;
-
- RE_parts_free(re);
-
- re->parts = BLI_ghash_new(
- BLI_ghashutil_inthash_v4_p, BLI_ghashutil_inthash_v4_cmp, "render parts");
-
- /* Just for readable code. */
- xminb = re->disprect.xmin;
- yminb = re->disprect.ymin;
- xmaxb = re->disprect.xmax;
- ymaxb = re->disprect.ymax;
-
- RE_parts_clamp(re);
-
- partx = re->partx;
- party = re->party;
- /* part count */
- xparts = (re->rectx + partx - 1) / partx;
- yparts = (re->recty + party - 1) / party;
-
- for (nr = 0; nr < xparts * yparts; nr++) {
- rcti disprect;
- int rectx, recty;
-
- xd = (nr % xparts);
- yd = (nr - xd) / xparts;
-
- disprect.xmin = xminb + xd * partx;
- disprect.ymin = yminb + yd * party;
-
- /* ensure we cover the entire picture, so last parts go to end */
- if (xd < xparts - 1) {
- disprect.xmax = disprect.xmin + partx;
- if (disprect.xmax > xmaxb) {
- disprect.xmax = xmaxb;
- }
- }
- else {
- disprect.xmax = xmaxb;
- }
-
- if (yd < yparts - 1) {
- disprect.ymax = disprect.ymin + party;
- if (disprect.ymax > ymaxb) {
- disprect.ymax = ymaxb;
- }
- }
- else {
- disprect.ymax = ymaxb;
- }
-
- rectx = BLI_rcti_size_x(&disprect);
- recty = BLI_rcti_size_y(&disprect);
-
- /* so, now can we add this part? */
- if (rectx > 0 && recty > 0) {
- RenderPart *pa = MEM_callocN(sizeof(RenderPart), "new part");
-
- pa->disprect = disprect;
- pa->rectx = rectx;
- pa->recty = recty;
-
- BLI_ghash_insert(re->parts, &pa->disprect, pa);
- }
- }
-}
diff --git a/source/blender/render/intern/pipeline.c b/source/blender/render/intern/pipeline.c
index 5418f4035b1..72ff920561d 100644
--- a/source/blender/render/intern/pipeline.c
+++ b/source/blender/render/intern/pipeline.c
@@ -102,7 +102,6 @@
#include "DEG_depsgraph.h"
/* internal */
-#include "initrender.h"
#include "pipeline.h"
#include "render_result.h"
#include "render_types.h"
@@ -568,7 +567,7 @@ Render *RE_NewRender(const char *name)
BLI_addtail(&RenderGlobal.renderlist, re);
BLI_strncpy(re->name, name, RE_MAXNAME);
BLI_rw_mutex_init(&re->resultmutex);
- BLI_rw_mutex_init(&re->partsmutex);
+ BLI_mutex_init(&re->engine_draw_mutex);
BLI_mutex_init(&re->highlighted_tiles_mutex);
}
@@ -633,7 +632,7 @@ void RE_FreeRender(Render *re)
}
BLI_rw_mutex_end(&re->resultmutex);
- BLI_rw_mutex_end(&re->partsmutex);
+ BLI_mutex_end(&re->engine_draw_mutex);
BLI_mutex_end(&re->highlighted_tiles_mutex);
BLI_freelistN(&re->view_layers);
@@ -722,26 +721,6 @@ void RE_FreePersistentData(const Scene *scene)
/* ********* initialize state ******** */
-/* clear full sample and tile flags if needed */
-static int check_mode_full_sample(RenderData *rd)
-{
- int scemode = rd->scemode;
-
- /* not supported by any current renderer */
- scemode &= ~R_FULL_SAMPLE;
-
-#ifdef WITH_OPENEXR
- if (scemode & R_FULL_SAMPLE) {
- scemode |= R_EXR_TILE_FILE; /* enable automatic */
- }
-#else
- /* can't do this without openexr support */
- scemode &= ~(R_EXR_TILE_FILE | R_FULL_SAMPLE);
-#endif
-
- return scemode;
-}
-
static void re_init_resolution(Render *re, Render *source, int winx, int winy, rcti *disprect)
{
re->winx = winx;
@@ -839,8 +818,6 @@ void RE_InitState(Render *re,
return;
}
- re->r.scemode = check_mode_full_sample(&re->r);
-
if (single_layer) {
int index = BLI_findindex(render_layers, single_layer);
if (index != -1) {
@@ -890,9 +867,6 @@ void RE_InitState(Render *re,
render_result_view_new(re->result, "");
}
- /* ensure renderdatabase can use part settings correct */
- RE_parts_clamp(re);
-
BLI_rw_mutex_unlock(&re->resultmutex);
RE_init_threadcount(re);
@@ -1040,7 +1014,7 @@ static void render_result_uncrop(Render *re)
/* Weak: it changes disprect from border. */
render_result_disprect_to_full_resolution(re);
- rres = render_result_new(re, &re->disprect, RR_USE_MEM, RR_ALL_LAYERS, RR_ALL_VIEWS);
+ rres = render_result_new(re, &re->disprect, RR_ALL_LAYERS, RR_ALL_VIEWS);
render_result_passes_allocated_ensure(rres);
rres->stamp_data = BKE_stamp_data_copy(re->result->stamp_data);
@@ -1227,7 +1201,7 @@ static void do_render_compositor(Render *re)
if ((re->r.mode & R_CROP) == 0) {
render_result_disprect_to_full_resolution(re);
}
- re->result = render_result_new(re, &re->disprect, RR_USE_MEM, RR_ALL_LAYERS, RR_ALL_VIEWS);
+ re->result = render_result_new(re, &re->disprect, RR_ALL_LAYERS, RR_ALL_VIEWS);
BLI_rw_mutex_unlock(&re->resultmutex);
@@ -1647,7 +1621,7 @@ bool RE_is_rendering_allowed(Scene *scene,
Object *camera_override,
ReportList *reports)
{
- int scemode = check_mode_full_sample(&scene->r);
+ const int scemode = scene->r.scemode;
if (scene->r.mode & R_BORDER) {
if (scene->r.border.xmax <= scene->r.border.xmin ||
@@ -1657,17 +1631,6 @@ bool RE_is_rendering_allowed(Scene *scene,
}
}
- if (scemode & (R_EXR_TILE_FILE | R_FULL_SAMPLE)) {
- char str[FILE_MAX];
-
- render_result_exr_file_path(scene, "", 0, str);
-
- if (!BLI_file_is_writable(str)) {
- BKE_report(reports, RPT_ERROR, "Cannot save render buffers, check the temp default path");
- return 0;
- }
- }
-
if (RE_seq_render_active(scene, &scene->r)) {
/* Sequencer */
if (scene->r.mode & R_BORDER) {
@@ -1686,13 +1649,6 @@ bool RE_is_rendering_allowed(Scene *scene,
BKE_report(reports, RPT_ERROR, "No render output node in scene");
return 0;
}
-
- if (scemode & R_FULL_SAMPLE) {
- if (compositor_needs_render(scene, 0) == 0) {
- BKE_report(reports, RPT_ERROR, "Full sample AA not supported without 3D rendering");
- return 0;
- }
- }
}
else {
/* Regular Render */
@@ -1710,14 +1666,6 @@ bool RE_is_rendering_allowed(Scene *scene,
return 1;
}
-static void validate_render_settings(Render *re)
-{
- if (RE_engine_is_external(re)) {
- /* not supported yet */
- re->r.scemode &= ~R_FULL_SAMPLE;
- }
-}
-
static void update_physics_cache(Render *re,
Scene *scene,
ViewLayer *view_layer,
@@ -1820,8 +1768,6 @@ static int render_init_from_main(Render *re,
/* initstate makes new result, have to send changed tags around */
ntreeCompositTagRender(re->scene);
- validate_render_settings(re);
-
re->display_init(re->dih, re->result);
re->display_clear(re->dch, re->result);
diff --git a/source/blender/render/intern/render_result.c b/source/blender/render/intern/render_result.c
index 6cb6aabe885..c308147fc5b 100644
--- a/source/blender/render/intern/render_result.c
+++ b/source/blender/render/intern/render_result.c
@@ -260,8 +260,10 @@ RenderPass *render_layer_add_pass(RenderResult *rr,
/* will read info from Render *re to define layers */
/* called in threads */
/* re->winx,winy is coordinate space of entire image, partrct the part within */
-RenderResult *render_result_new(
- Render *re, rcti *partrct, int savebuffers, const char *layername, const char *viewname)
+RenderResult *render_result_new(Render *re,
+ rcti *partrct,
+ const char *layername,
+ const char *viewname)
{
RenderResult *rr;
RenderLayer *rl;
@@ -287,10 +289,6 @@ RenderResult *render_result_new(
rr->tilerect.ymin = partrct->ymin - re->disprect.ymin;
rr->tilerect.ymax = partrct->ymax - re->disprect.ymin;
- if (savebuffers) {
- rr->do_exr_tile = true;
- }
-
rr->passes_allocated = false;
render_result_views_new(rr, &re->r);
@@ -314,10 +312,6 @@ RenderResult *render_result_new(
rl->rectx = rectx;
rl->recty = recty;
- if (rr->do_exr_tile) {
- rl->exrhandle = IMB_exr_get_handle();
- }
-
for (rv = rr->views.first; rv; rv = rv->next) {
const char *view = rv->name;
@@ -327,10 +321,6 @@ RenderResult *render_result_new(
}
}
- if (rr->do_exr_tile) {
- IMB_exr_add_view(rl->exrhandle, view);
- }
-
#define RENDER_LAYER_ADD_PASS_SAFE(rr, rl, channels, name, viewname, chan_id) \
do { \
if (render_layer_add_pass(rr, rl, channels, name, viewname, chan_id) == NULL) { \
@@ -351,6 +341,9 @@ RenderResult *render_result_new(
if (view_layer->passflag & SCE_PASS_NORMAL) {
RENDER_LAYER_ADD_PASS_SAFE(rr, rl, 3, RE_PASSNAME_NORMAL, view, "XYZ");
}
+ if (view_layer->passflag & SCE_PASS_POSITION) {
+ RENDER_LAYER_ADD_PASS_SAFE(rr, rl, 3, RE_PASSNAME_POSITION, view, "XYZ");
+ }
if (view_layer->passflag & SCE_PASS_UV) {
RENDER_LAYER_ADD_PASS_SAFE(rr, rl, 3, RE_PASSNAME_UV, view, "UVA");
}
@@ -424,11 +417,6 @@ RenderResult *render_result_new(
rl->rectx = rectx;
rl->recty = recty;
- /* duplicate code... */
- if (rr->do_exr_tile) {
- rl->exrhandle = IMB_exr_get_handle();
- }
-
for (rv = rr->views.first; rv; rv = rv->next) {
const char *view = rv->name;
@@ -438,10 +426,6 @@ RenderResult *render_result_new(
}
}
- if (rr->do_exr_tile) {
- IMB_exr_add_view(rl->exrhandle, view);
- }
-
/* a renderlayer should always have a Combined pass */
render_layer_add_pass(rr, rl, 4, RE_PASSNAME_COMBINED, view, "RGBA");
}
@@ -1089,227 +1073,6 @@ void render_result_single_layer_end(Render *re)
re->pushedresult = NULL;
}
-/************************* EXR Tile File Rendering ***************************/
-
-static void save_render_result_tile(RenderResult *rr, RenderResult *rrpart, const char *viewname)
-{
- RenderLayer *rlp, *rl;
- RenderPass *rpassp;
- int partx, party;
-
- BLI_thread_lock(LOCK_IMAGE);
-
- for (rlp = rrpart->layers.first; rlp; rlp = rlp->next) {
- rl = RE_GetRenderLayer(rr, rlp->name);
-
- /* should never happen but prevents crash if it does */
- BLI_assert(rl);
- if (UNLIKELY(rl == NULL)) {
- continue;
- }
-
- /* passes are allocated in sync */
- for (rpassp = rlp->passes.first; rpassp; rpassp = rpassp->next) {
- const int xstride = rpassp->channels;
- int a;
- char fullname[EXR_PASS_MAXNAME];
-
- for (a = 0; a < xstride; a++) {
- set_pass_full_name(fullname, rpassp->name, a, viewname, rpassp->chan_id);
-
- IMB_exr_set_channel(rl->exrhandle,
- rlp->name,
- fullname,
- xstride,
- xstride * rrpart->rectx,
- rpassp->rect + a);
- }
- }
- }
-
- party = rrpart->tilerect.ymin;
- partx = rrpart->tilerect.xmin;
-
- for (rlp = rrpart->layers.first; rlp; rlp = rlp->next) {
- rl = RE_GetRenderLayer(rr, rlp->name);
-
- /* should never happen but prevents crash if it does */
- BLI_assert(rl);
- if (UNLIKELY(rl == NULL)) {
- continue;
- }
-
- IMB_exrtile_write_channels(rl->exrhandle, partx, party, 0, viewname, false);
- }
-
- BLI_thread_unlock(LOCK_IMAGE);
-}
-
-void render_result_save_empty_result_tiles(Render *re)
-{
- RenderResult *rr;
- RenderLayer *rl;
-
- for (rr = re->result; rr; rr = rr->next) {
- for (rl = rr->layers.first; rl; rl = rl->next) {
- GHashIterator pa_iter;
- GHASH_ITER (pa_iter, re->parts) {
- RenderPart *pa = BLI_ghashIterator_getValue(&pa_iter);
- if (pa->status != PART_STATUS_MERGED) {
- int party = pa->disprect.ymin - re->disprect.ymin;
- int partx = pa->disprect.xmin - re->disprect.xmin;
- IMB_exrtile_write_channels(rl->exrhandle, partx, party, 0, re->viewname, true);
- }
- }
- }
- }
-}
-
-/* Compute list of passes needed by render engine. */
-static void templates_register_pass_cb(void *userdata,
- Scene *UNUSED(scene),
- ViewLayer *UNUSED(view_layer),
- const char *name,
- int channels,
- const char *chan_id,
- eNodeSocketDatatype UNUSED(type))
-{
- ListBase *templates = userdata;
- RenderPass *pass = MEM_callocN(sizeof(RenderPass), "RenderPassTemplate");
-
- pass->channels = channels;
- BLI_strncpy(pass->name, name, sizeof(pass->name));
- BLI_strncpy(pass->chan_id, chan_id, sizeof(pass->chan_id));
-
- BLI_addtail(templates, pass);
-}
-
-static void render_result_get_pass_templates(RenderEngine *engine,
- Render *re,
- RenderLayer *rl,
- ListBase *templates)
-{
- BLI_listbase_clear(templates);
-
- if (engine && engine->type->update_render_passes) {
- ViewLayer *view_layer = BLI_findstring(&re->view_layers, rl->name, offsetof(ViewLayer, name));
- if (view_layer) {
- RE_engine_update_render_passes(
- engine, re->scene, view_layer, templates_register_pass_cb, templates);
- }
- }
-}
-
-/* begin write of exr tile file */
-void render_result_exr_file_begin(Render *re, RenderEngine *engine)
-{
- char str[FILE_MAX];
-
- for (RenderResult *rr = re->result; rr; rr = rr->next) {
- LISTBASE_FOREACH (RenderLayer *, rl, &rr->layers) {
- /* Get passes needed by engine. Normally we would wait for the
- * engine to create them, but for EXR file we need to know in
- * advance. */
- ListBase templates;
- render_result_get_pass_templates(engine, re, rl, &templates);
-
- /* Create render passes requested by engine. Only this part is
- * mutex locked to avoid deadlock with Python GIL. */
- BLI_rw_mutex_lock(&re->resultmutex, THREAD_LOCK_WRITE);
- LISTBASE_FOREACH (RenderPass *, pass, &templates) {
- RE_create_render_pass(
- re->result, pass->name, pass->channels, pass->chan_id, rl->name, NULL);
- }
- BLI_rw_mutex_unlock(&re->resultmutex);
-
- BLI_freelistN(&templates);
-
- /* Open EXR file for writing. */
- render_result_exr_file_path(re->scene, rl->name, rr->sample_nr, str);
- printf("write exr tmp file, %dx%d, %s\n", rr->rectx, rr->recty, str);
- IMB_exrtile_begin_write(rl->exrhandle, str, 0, rr->rectx, rr->recty, re->partx, re->party);
- }
- }
-}
-
-/* end write of exr tile file, read back first sample */
-void render_result_exr_file_end(Render *re, RenderEngine *engine)
-{
- /* Preserve stamp data. */
- struct StampData *stamp_data = re->result->stamp_data;
- re->result->stamp_data = NULL;
-
- /* Close EXR files. */
- for (RenderResult *rr = re->result; rr; rr = rr->next) {
- LISTBASE_FOREACH (RenderLayer *, rl, &rr->layers) {
- IMB_exr_close(rl->exrhandle);
- rl->exrhandle = NULL;
- }
-
- rr->do_exr_tile = false;
- }
-
- /* Create new render result in memory instead of on disk. */
- BLI_rw_mutex_lock(&re->resultmutex, THREAD_LOCK_WRITE);
- render_result_free_list(&re->fullresult, re->result);
- re->result = render_result_new(re, &re->disprect, RR_USE_MEM, RR_ALL_LAYERS, RR_ALL_VIEWS);
- re->result->stamp_data = stamp_data;
- render_result_passes_allocated_ensure(re->result);
- BLI_rw_mutex_unlock(&re->resultmutex);
-
- LISTBASE_FOREACH (RenderLayer *, rl, &re->result->layers) {
- /* Get passes needed by engine. */
- ListBase templates;
- render_result_get_pass_templates(engine, re, rl, &templates);
-
- /* Create render passes requested by engine. Only this part is
- * mutex locked to avoid deadlock with Python GIL. */
- BLI_rw_mutex_lock(&re->resultmutex, THREAD_LOCK_WRITE);
- LISTBASE_FOREACH (RenderPass *, pass, &templates) {
- RE_create_render_pass(re->result, pass->name, pass->channels, pass->chan_id, rl->name, NULL);
- }
-
- BLI_freelistN(&templates);
-
- /* Render passes contents from file. */
- char str[FILE_MAXFILE + MAX_ID_NAME + MAX_ID_NAME + 100] = "";
- render_result_exr_file_path(re->scene, rl->name, 0, str);
- printf("read exr tmp file: %s\n", str);
-
- if (!render_result_exr_file_read_path(re->result, rl, str)) {
- printf("cannot read: %s\n", str);
- }
- BLI_rw_mutex_unlock(&re->resultmutex);
- }
-}
-
-/* save part into exr file */
-void render_result_exr_file_merge(RenderResult *rr, RenderResult *rrpart, const char *viewname)
-{
- for (; rr && rrpart; rr = rr->next, rrpart = rrpart->next) {
- save_render_result_tile(rr, rrpart, viewname);
- }
-}
-
-/* path to temporary exr file */
-void render_result_exr_file_path(Scene *scene, const char *layname, int sample, char *filepath)
-{
- char name[FILE_MAXFILE + MAX_ID_NAME + MAX_ID_NAME + 100];
- const char *fi = BLI_path_basename(BKE_main_blendfile_path_from_global());
-
- if (sample == 0) {
- BLI_snprintf(name, sizeof(name), "%s_%s_%s.exr", fi, scene->id.name + 2, layname);
- }
- else {
- BLI_snprintf(name, sizeof(name), "%s_%s_%s%d.exr", fi, scene->id.name + 2, layname, sample);
- }
-
- /* Make name safe for paths, see T43275. */
- BLI_filename_make_safe(name);
-
- BLI_join_dirfile(filepath, FILE_MAX, BKE_tempdir_session(), name);
-}
-
/* called for reading temp files, and for external engines */
int render_result_exr_file_read_path(RenderResult *rr,
RenderLayer *rl_single,
@@ -1416,7 +1179,7 @@ bool render_result_exr_file_cache_read(Render *re)
char *root = U.render_cachedir;
RE_FreeRenderResult(re->result);
- re->result = render_result_new(re, &re->disprect, RR_USE_MEM, RR_ALL_LAYERS, RR_ALL_VIEWS);
+ re->result = render_result_new(re, &re->disprect, RR_ALL_LAYERS, RR_ALL_VIEWS);
/* First try cache. */
render_result_exr_file_cache_path(re->scene, root, str);
diff --git a/source/blender/render/intern/render_result.h b/source/blender/render/intern/render_result.h
index 1fc64a4ea97..4145bb3b8ab 100644
--- a/source/blender/render/intern/render_result.h
+++ b/source/blender/render/intern/render_result.h
@@ -25,9 +25,6 @@
#define PASS_VECTOR_MAX 10000.0f
-#define RR_USE_MEM 0
-#define RR_USE_EXR 1
-
#define RR_ALL_LAYERS NULL
#define RR_ALL_VIEWS NULL
@@ -51,7 +48,6 @@ extern "C" {
struct RenderResult *render_result_new(struct Render *re,
struct rcti *partrct,
- int savebuffers,
const char *layername,
const char *viewname);
@@ -81,12 +77,6 @@ void render_result_free_list(struct ListBase *lb, struct RenderResult *rr);
void render_result_single_layer_begin(struct Render *re);
void render_result_single_layer_end(struct Render *re);
-/* EXR Tile File Render */
-
-void render_result_save_empty_result_tiles(struct Render *re);
-void render_result_exr_file_begin(struct Render *re, struct RenderEngine *engine);
-void render_result_exr_file_end(struct Render *re, struct RenderEngine *engine);
-
/* render pass wrapper for gpencil */
struct RenderPass *render_layer_add_pass(struct RenderResult *rr,
struct RenderLayer *rl,
@@ -95,14 +85,6 @@ struct RenderPass *render_layer_add_pass(struct RenderResult *rr,
const char *viewname,
const char *chan_id);
-void render_result_exr_file_merge(struct RenderResult *rr,
- struct RenderResult *rrpart,
- const char *viewname);
-
-void render_result_exr_file_path(struct Scene *scene,
- const char *layname,
- int sample,
- char *filepath);
int render_result_exr_file_read_path(struct RenderResult *rr,
struct RenderLayer *rl_single,
const char *filepath);
diff --git a/source/blender/render/intern/render_types.h b/source/blender/render/intern/render_types.h
index d2d2b499495..ca4f72350e1 100644
--- a/source/blender/render/intern/render_types.h
+++ b/source/blender/render/intern/render_types.h
@@ -47,30 +47,10 @@ struct ReportList;
extern "C" {
#endif
-/* this is handed over to threaded hiding/passes/shading engine */
-typedef struct RenderPart {
- struct RenderPart *next, *prev;
-
- RenderResult *result; /* result of part rendering */
- ListBase fullresult; /* optional full sample buffers */
-
- rcti disprect; /* part coordinates within total picture */
- int rectx, recty; /* the size */
- int nr; /* nr is partnr */
- short status;
-} RenderPart;
-
typedef struct HighlightedTile {
rcti rect;
} HighlightedTile;
-enum {
- /* PART_STATUS_NONE = 0, */ /* UNUSED */
- PART_STATUS_IN_PROGRESS = 1,
- PART_STATUS_RENDERED = 2,
- PART_STATUS_MERGED = 3,
-};
-
/* controls state of render, everything that's read-only during render stage */
struct Render {
struct Render *next, *prev;
@@ -91,6 +71,9 @@ struct Render {
* to not conflict with writes, so no lock used for that */
ThreadRWMutex resultmutex;
+ /* Guard for drawing render result using engine's `draw()` callback. */
+ ThreadMutex engine_draw_mutex;
+
/** Window size, display rect, viewplane.
* \note Buffer width and height with percentage applied
* without border & crop. convert to long before multiplying together to avoid overflow. */
@@ -101,10 +84,6 @@ struct Render {
/* final picture width and height (within disprect) */
int rectx, recty;
- /* real maximum size of parts after correction for minimum
- * partx*xparts can be larger than rectx, in that case last part is smaller */
- int partx, party;
-
/* Camera transform, only used by Freestyle. */
float winmat[4][4];
@@ -120,9 +99,6 @@ struct Render {
int active_view_layer;
struct Object *camera_override;
- ThreadRWMutex partsmutex;
- struct GHash *parts;
-
ThreadMutex highlighted_tiles_mutex;
struct GSet *highlighted_tiles;
diff --git a/source/blender/sequencer/SEQ_iterator.h b/source/blender/sequencer/SEQ_iterator.h
index 4f7d603fd6a..d2a47a13db3 100644
--- a/source/blender/sequencer/SEQ_iterator.h
+++ b/source/blender/sequencer/SEQ_iterator.h
@@ -94,11 +94,15 @@ SeqCollection *SEQ_query_by_reference(struct Sequence *seq_reference,
SeqCollection *collection));
SeqCollection *SEQ_query_selected_strips(struct ListBase *seqbase);
SeqCollection *SEQ_query_unselected_strips(struct ListBase *seqbase);
-SeqCollection *SEQ_query_all_strips(struct ListBase *seqbase);
-SeqCollection *SEQ_query_all_strips_recursive(struct ListBase *seqbase);
+SeqCollection *SEQ_query_all_strips(ListBase *seqbase);
+SeqCollection *SEQ_query_all_strips_recursive(ListBase *seqbase);
+SeqCollection *SEQ_query_rendered_strips(ListBase *seqbase,
+ const int timeline_frame,
+ const int displayed_channel);
void SEQ_query_strip_effect_chain(struct Sequence *seq_reference,
struct ListBase *seqbase,
SeqCollection *collection);
+void SEQ_filter_selected_strips(SeqCollection *collection);
#ifdef __cplusplus
}
diff --git a/source/blender/sequencer/SEQ_render.h b/source/blender/sequencer/SEQ_render.h
index c138daf1318..e99dc6d344f 100644
--- a/source/blender/sequencer/SEQ_render.h
+++ b/source/blender/sequencer/SEQ_render.h
@@ -27,6 +27,8 @@
extern "C" {
#endif
+#define SEQ_RENDER_THUMB_SIZE 256
+
struct ListBase;
struct Main;
struct Scene;
@@ -67,6 +69,25 @@ struct ImBuf *SEQ_render_give_ibuf(const SeqRenderData *context,
struct ImBuf *SEQ_render_give_ibuf_direct(const SeqRenderData *context,
float timeline_frame,
struct Sequence *seq);
+void SEQ_render_thumbnails(const struct SeqRenderData *context,
+ struct Sequence *seq,
+ struct Sequence *seq_orig,
+ float start_frame,
+ float frame_step,
+ rctf *view_area,
+ const short *stop);
+struct ImBuf *SEQ_get_thumbnail(const struct SeqRenderData *context,
+ struct Sequence *seq,
+ float timeline_frame,
+ rcti *crop,
+ bool clipped);
+int SEQ_render_thumbnails_guaranteed_set_frame_step_get(const struct Sequence *seq);
+void SEQ_render_thumbnails_base_set(const struct SeqRenderData *context,
+ struct Sequence *seq,
+ struct Sequence *seq_orig,
+ rctf *view_area,
+ const short *stop);
+
void SEQ_render_init_colorspace(struct Sequence *seq);
void SEQ_render_new_render_data(struct Main *bmain,
struct Depsgraph *depsgraph,
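
A plausible call order for the new thumbnail declarations, sketched under assumptions (a background job owning `context`, `view_area`, a short `stop` flag, and the frame-range variables): render the guaranteed base set first so zooming always has images, then fill in the visible range.

/* Sketch only: all variables are assumed to be set up by the owning job. */
SEQ_render_thumbnails_base_set(&context, seq, seq_orig, &view_area, &stop);
if (!stop) {
  SEQ_render_thumbnails(&context, seq, seq_orig, start_frame, frame_step, &view_area, &stop);
}
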
diff --git a/source/blender/sequencer/SEQ_sequencer.h b/source/blender/sequencer/SEQ_sequencer.h
index d7800d208a4..7e733817630 100644
--- a/source/blender/sequencer/SEQ_sequencer.h
+++ b/source/blender/sequencer/SEQ_sequencer.h
@@ -64,6 +64,7 @@ short SEQ_tool_settings_snap_flag_get(struct Scene *scene);
short SEQ_tool_settings_snap_mode_get(struct Scene *scene);
int SEQ_tool_settings_snap_distance_get(struct Scene *scene);
eSeqOverlapMode SEQ_tool_settings_overlap_mode_get(struct Scene *scene);
+int SEQ_tool_settings_pivot_point_get(struct Scene *scene);
struct SequencerToolSettings *SEQ_tool_settings_copy(struct SequencerToolSettings *tool_settings);
struct Editing *SEQ_editing_get(const struct Scene *scene);
struct Editing *SEQ_editing_ensure(struct Scene *scene);
diff --git a/source/blender/sequencer/SEQ_transform.h b/source/blender/sequencer/SEQ_transform.h
index 1977835f627..328efb9424a 100644
--- a/source/blender/sequencer/SEQ_transform.h
+++ b/source/blender/sequencer/SEQ_transform.h
@@ -61,6 +61,15 @@ void SEQ_transform_offset_after_frame(struct Scene *scene,
const int delta,
const int timeline_frame);
+/* Image transformation. */
+void SEQ_image_transform_mirror_factor_get(const struct Sequence *seq, float r_mirror[2]);
+void SEQ_image_transform_origin_offset_pixelspace_get(const struct Scene *scene,
+ const struct Sequence *seq,
+ float r_origin[2]);
+void SEQ_image_transform_final_quad_get(const struct Scene *scene,
+ const struct Sequence *seq,
+ float r_quad[4][2]);
+
#ifdef __cplusplus
}
#endif
diff --git a/source/blender/sequencer/SEQ_utils.h b/source/blender/sequencer/SEQ_utils.h
index 09de7bc67e9..d30a1b2d7ae 100644
--- a/source/blender/sequencer/SEQ_utils.h
+++ b/source/blender/sequencer/SEQ_utils.h
@@ -34,6 +34,7 @@ struct Mask;
struct Scene;
struct Sequence;
struct StripElem;
+struct SeqRenderData;
void SEQ_sort(struct ListBase *seqbase);
void SEQ_sequence_base_unique_name_recursive(struct Scene *scene,
@@ -57,7 +58,6 @@ void SEQ_set_scale_to_fit(const struct Sequence *seq,
const int preview_height,
const eSeqImageFitMethod fit_method);
void SEQ_ensure_unique_name(struct Sequence *seq, struct Scene *scene);
-
#ifdef __cplusplus
}
#endif
diff --git a/source/blender/sequencer/intern/image_cache.c b/source/blender/sequencer/intern/image_cache.c
index 86bd840ce31..86c198075e9 100644
--- a/source/blender/sequencer/intern/image_cache.c
+++ b/source/blender/sequencer/intern/image_cache.c
@@ -104,6 +104,7 @@
#define DCACHE_IMAGES_PER_FILE 100
#define DCACHE_CURRENT_VERSION 2
#define COLORSPACE_NAME_MAX 64 /* XXX: defined in imb intern */
+#define THUMB_CACHE_LIMIT 5000
typedef struct DiskCacheHeaderEntry {
unsigned char encoding;
@@ -148,6 +149,7 @@ typedef struct SeqCache {
struct BLI_mempool *items_pool;
struct SeqCacheKey *last_key;
SeqDiskCache *disk_cache;
+ int thumbnail_count;
} SeqCache;
typedef struct SeqCacheItem {
@@ -776,7 +778,7 @@ static float seq_cache_timeline_frame_to_frame_index(Sequence *seq, float timeli
/* With raw images, map timeline_frame to strip input media frame range. This means that static
* images or extended frame range of movies will only generate one cache entry. No special
* treatment in converting frame index to timeline_frame is needed. */
- if (type == SEQ_CACHE_STORE_RAW) {
+ if (type == SEQ_CACHE_STORE_RAW || type == SEQ_CACHE_STORE_THUMBNAIL) {
return seq_give_frame_index(seq, timeline_frame);
}
@@ -875,7 +877,7 @@ static void seq_cache_put_ex(Scene *scene, SeqCacheKey *key, ImBuf *ibuf)
if (BLI_ghash_reinsert(cache->hash, key, item, seq_cache_keyfree, seq_cache_valfree)) {
IMB_refImBuf(ibuf);
- if (!key->is_temp_cache) {
+ if (!key->is_temp_cache || key->type != SEQ_CACHE_STORE_THUMBNAIL) {
cache->last_key = key;
}
}
@@ -1161,6 +1163,7 @@ static void seq_cache_create(Main *bmain, Scene *scene)
cache->hash = BLI_ghash_new(seq_cache_hashhash, seq_cache_hashcmp, "SeqCache hash");
cache->last_key = NULL;
cache->bmain = bmain;
+ cache->thumbnail_count = 0;
BLI_mutex_init(&cache->iterator_mutex);
scene->ed->cache = cache;
@@ -1217,7 +1220,7 @@ void seq_cache_free_temp_cache(Scene *scene, short id, int timeline_frame)
SeqCacheKey *key = BLI_ghashIterator_getKey(&gh_iter);
BLI_ghashIterator_step(&gh_iter);
- if (key->is_temp_cache && key->task_id == id) {
+ if (key->is_temp_cache && key->task_id == id && key->type != SEQ_CACHE_STORE_THUMBNAIL) {
/* Use frame_index here to avoid freeing raw images if they are used for multiple frames. */
float frame_index = seq_cache_timeline_frame_to_frame_index(
key->seq, timeline_frame, key->type);
@@ -1278,6 +1281,7 @@ void SEQ_cache_cleanup(Scene *scene)
BLI_ghash_remove(cache->hash, key, seq_cache_keyfree, seq_cache_valfree);
}
cache->last_key = NULL;
+ cache->thumbnail_count = 0;
seq_cache_unlock(scene);
}
@@ -1345,6 +1349,46 @@ void seq_cache_cleanup_sequence(Scene *scene,
seq_cache_unlock(scene);
}
+void seq_cache_thumbnail_cleanup(Scene *scene, rctf *view_area_safe)
+{
+ /* Add offsets to the left and right end to keep some frames in cache. */
+ view_area_safe->xmax += 200;
+ view_area_safe->xmin -= 200;
+ view_area_safe->ymin -= 1;
+ view_area_safe->ymax += 1;
+
+ SeqCache *cache = seq_cache_get_from_scene(scene);
+ if (!cache) {
+ return;
+ }
+
+ GHashIterator gh_iter;
+ BLI_ghashIterator_init(&gh_iter, cache->hash);
+ while (!BLI_ghashIterator_done(&gh_iter)) {
+ SeqCacheKey *key = BLI_ghashIterator_getKey(&gh_iter);
+ BLI_ghashIterator_step(&gh_iter);
+
+ const int frame_index = key->timeline_frame - key->seq->startdisp;
+ const int frame_step = SEQ_render_thumbnails_guaranteed_set_frame_step_get(key->seq);
+ const int relative_base_frame = round_fl_to_int((frame_index / (float)frame_step)) *
+ frame_step;
+ const int nearest_guaranteed_absolute_frame = relative_base_frame + key->seq->startdisp;
+
+ if (nearest_guaranteed_absolute_frame == key->timeline_frame) {
+ continue;
+ }
+
+ if ((key->type & SEQ_CACHE_STORE_THUMBNAIL) &&
+ (key->timeline_frame > view_area_safe->xmax ||
+ key->timeline_frame < view_area_safe->xmin || key->seq->machine > view_area_safe->ymax ||
+ key->seq->machine < view_area_safe->ymin)) {
+ BLI_ghash_remove(cache->hash, key, seq_cache_keyfree, seq_cache_valfree);
+ cache->thumbnail_count--;
+ }
+ }
+ cache->last_key = NULL;
+}
+
struct ImBuf *seq_cache_get(const SeqRenderData *context,
Sequence *seq,
float timeline_frame,
@@ -1436,6 +1480,37 @@ bool seq_cache_put_if_possible(
return false;
}
+void seq_cache_thumbnail_put(
+ const SeqRenderData *context, Sequence *seq, float timeline_frame, ImBuf *i, rctf *view_area)
+{
+ Scene *scene = context->scene;
+
+ if (!scene->ed->cache) {
+ seq_cache_create(context->bmain, scene);
+ }
+
+ seq_cache_lock(scene);
+ SeqCache *cache = seq_cache_get_from_scene(scene);
+ SeqCacheKey *key = seq_cache_allocate_key(
+ cache, context, seq, timeline_frame, SEQ_CACHE_STORE_THUMBNAIL);
+
+ /* Prevent reinserting, it breaks cache key linking. */
+ if (BLI_ghash_haskey(cache->hash, key)) {
+ seq_cache_unlock(scene);
+ return;
+ }
+
+ /* Limit the cache to THUMB_CACHE_LIMIT (5000) stored thumbnails. */
+ if (cache->thumbnail_count >= THUMB_CACHE_LIMIT) {
+ rctf view_area_safe = *view_area;
+ seq_cache_thumbnail_cleanup(scene, &view_area_safe);
+ }
+
+ seq_cache_put_ex(scene, key, i);
+ cache->thumbnail_count++;
+ seq_cache_unlock(scene);
+}
+
void seq_cache_put(
const SeqRenderData *context, Sequence *seq, float timeline_frame, int type, ImBuf *i)
{
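
Condensed, the eviction rule in seq_cache_thumbnail_cleanup() reads: keep any thumbnail that sits on the guaranteed frame grid, and evict the rest once they leave the padded view rectangle. A standalone restatement of that predicate (with an added zero-step guard for strips too short to have a base set; not a separate API):

static bool thumb_is_evictable(const SeqCacheKey *key, const rctf *view_area_safe)
{
  const int step = SEQ_render_thumbnails_guaranteed_set_frame_step_get(key->seq);
  if (step != 0) {
    /* Frames on the guaranteed grid are never evicted. */
    const int frame_index = key->timeline_frame - key->seq->startdisp;
    const int nearest = round_fl_to_int(frame_index / (float)step) * step + key->seq->startdisp;
    if (nearest == key->timeline_frame) {
      return false;
    }
  }
  /* Evict thumbnails that fell outside the padded view rectangle. */
  return (key->type & SEQ_CACHE_STORE_THUMBNAIL) &&
         (key->timeline_frame > view_area_safe->xmax ||
          key->timeline_frame < view_area_safe->xmin ||
          key->seq->machine > view_area_safe->ymax ||
          key->seq->machine < view_area_safe->ymin);
}
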
diff --git a/source/blender/sequencer/intern/image_cache.h b/source/blender/sequencer/intern/image_cache.h
index 63c559caee9..60031311985 100644
--- a/source/blender/sequencer/intern/image_cache.h
+++ b/source/blender/sequencer/intern/image_cache.h
@@ -46,6 +46,11 @@ void seq_cache_put(const struct SeqRenderData *context,
float timeline_frame,
int type,
struct ImBuf *i);
+void seq_cache_thumbnail_put(const struct SeqRenderData *context,
+ struct Sequence *seq,
+ float timeline_frame,
+ struct ImBuf *i,
+ rctf *view_area);
bool seq_cache_put_if_possible(const struct SeqRenderData *context,
struct Sequence *seq,
float timeline_frame,
@@ -60,6 +65,7 @@ void seq_cache_cleanup_sequence(struct Scene *scene,
struct Sequence *seq_changed,
int invalidate_types,
bool force_seq_changed_range);
+void seq_cache_thumbnail_cleanup(Scene *scene, rctf *view_area);
bool seq_cache_is_full(void);
#ifdef __cplusplus
diff --git a/source/blender/sequencer/intern/iterator.c b/source/blender/sequencer/intern/iterator.c
index 58f68205f51..2429405350b 100644
--- a/source/blender/sequencer/intern/iterator.c
+++ b/source/blender/sequencer/intern/iterator.c
@@ -37,6 +37,8 @@
#include "BKE_scene.h"
#include "SEQ_iterator.h"
+#include "SEQ_time.h"
+#include "render.h"
/* -------------------------------------------------------------------- */
/** \name Iterator API
@@ -340,6 +342,114 @@ SeqCollection *SEQ_query_selected_strips(ListBase *seqbase)
return collection;
}
+static SeqCollection *query_strips_at_frame(ListBase *seqbase, const int timeline_frame)
+{
+ SeqCollection *collection = SEQ_collection_create(__func__);
+
+ LISTBASE_FOREACH (Sequence *, seq, seqbase) {
+ if (SEQ_time_strip_intersects_frame(seq, timeline_frame)) {
+ SEQ_collection_append_strip(seq, collection);
+ }
+ }
+ return collection;
+}
+
+static void collection_filter_channel_up_to_incl(SeqCollection *collection, const int channel)
+{
+ Sequence *seq;
+ SEQ_ITERATOR_FOREACH (seq, collection) {
+ if (seq->machine <= channel) {
+ continue;
+ }
+ SEQ_collection_remove_strip(seq, collection);
+ }
+}
+
+static bool seq_is_effect_of(const Sequence *seq_effect, const Sequence *possibly_input)
+{
+ if (seq_effect->seq1 == possibly_input || seq_effect->seq2 == possibly_input ||
+ seq_effect->seq3 == possibly_input) {
+ return true;
+ }
+ return false;
+}
+
+/* Check if seq must be rendered. In some cases this depends on the whole stack, not only on seq
+ * itself. The order in which these conditions are applied is important. */
+static bool must_render_strip(const Sequence *seq, SeqCollection *strips_at_timeline_frame)
+{
+ bool seq_have_effect_in_stack = false;
+ Sequence *seq_iter;
+ SEQ_ITERATOR_FOREACH (seq_iter, strips_at_timeline_frame) {
+ /* Strips below another strip with replace blending are not rendered. */
+ if (seq_iter->blend_mode == SEQ_BLEND_REPLACE && seq->machine < seq_iter->machine) {
+ return false;
+ }
+
+ if ((seq_iter->type & SEQ_TYPE_EFFECT) != 0 && seq_is_effect_of(seq_iter, seq)) {
+ /* Strips on the same channel as their effect, or above it, are rendered. */
+ if (seq->machine >= seq_iter->machine) {
+ return true;
+ }
+ /* Mark that this strip has an effect in the stack that is above it. */
+ seq_have_effect_in_stack = true;
+ }
+ }
+
+ /* All effects are rendered (with respect to conditions above). */
+ if ((seq->type & SEQ_TYPE_EFFECT) != 0) {
+ return true;
+ }
+
+ /* If the strip has effects in the stack and all of them are above it, it is not rendered. */
+ if (seq_have_effect_in_stack) {
+ return false;
+ }
+
+ return true;
+}
+
+/* Remove strips we don't want to render from collection. */
+static void collection_filter_rendered_strips(SeqCollection *collection)
+{
+ Sequence *seq;
+
+ /* Remove sound strips and muted strips from the collection, because these are not rendered.
+ * #must_render_strip() then doesn't have to check for them anymore. */
+ SEQ_ITERATOR_FOREACH (seq, collection) {
+ if (seq->type == SEQ_TYPE_SOUND_RAM || (seq->flag & SEQ_MUTE) != 0) {
+ SEQ_collection_remove_strip(seq, collection);
+ }
+ }
+
+ SEQ_ITERATOR_FOREACH (seq, collection) {
+ if (must_render_strip(seq, collection)) {
+ continue;
+ }
+ SEQ_collection_remove_strip(seq, collection);
+ }
+}
+
+/**
+ * Query strips that are rendered at \a timeline_frame when \a displayed_channel is viewed.
+ *
+ * \param seqbase: ListBase in which strips are queried
+ * \param timeline_frame: viewed frame
+ * \param displayed_channel: viewed channel. When set to 0, no channel filter is applied.
+ * \return strip collection
+ */
+SeqCollection *SEQ_query_rendered_strips(ListBase *seqbase,
+ const int timeline_frame,
+ const int displayed_channel)
+{
+ SeqCollection *collection = query_strips_at_frame(seqbase, timeline_frame);
+ if (displayed_channel != 0) {
+ collection_filter_channel_up_to_incl(collection, displayed_channel);
+ }
+ collection_filter_rendered_strips(collection);
+ return collection;
+}
+
/**
* Query all unselected strips in seqbase.
*
@@ -396,3 +506,13 @@ void SEQ_query_strip_effect_chain(Sequence *seq_reference,
}
}
}
+
+void SEQ_filter_selected_strips(SeqCollection *collection)
+{
+ Sequence *seq;
+ SEQ_ITERATOR_FOREACH (seq, collection) {
+ if ((seq->flag & SELECT) == 0) {
+ SEQ_collection_remove_strip(seq, collection);
+ }
+ }
+}
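
Putting the new query to work: a caller collects the strips rendered at the current frame, optionally narrows them to the selection, iterates, and frees the collection. A sketch; the seqbase lookup via SEQ_active_seqbase_get() and the timeline_frame variable are assumptions:

ListBase *seqbase = SEQ_active_seqbase_get(SEQ_editing_get(scene));
SeqCollection *strips = SEQ_query_rendered_strips(seqbase, timeline_frame, 0);
SEQ_filter_selected_strips(strips); /* Keep only selected strips, if desired. */

Sequence *seq;
SEQ_ITERATOR_FOREACH (seq, strips) {
  /* ... operate on each strip that would be rendered at timeline_frame ... */
}
SEQ_collection_free(strips);
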
diff --git a/source/blender/sequencer/intern/render.c b/source/blender/sequencer/intern/render.c
index 6c4502a3608..2578a6d4223 100644
--- a/source/blender/sequencer/intern/render.c
+++ b/source/blender/sequencer/intern/render.c
@@ -72,6 +72,7 @@
#include "SEQ_render.h"
#include "SEQ_sequencer.h"
#include "SEQ_time.h"
+#include "SEQ_transform.h"
#include "SEQ_utils.h"
#include "effects.h"
@@ -262,94 +263,6 @@ StripElem *SEQ_render_give_stripelem(Sequence *seq, int timeline_frame)
return se;
}
-static bool seq_is_effect_of(const Sequence *seq_effect, const Sequence *possibly_input)
-{
- if (seq_effect->seq1 == possibly_input || seq_effect->seq2 == possibly_input ||
- seq_effect->seq3 == possibly_input) {
- return true;
- }
- return false;
-}
-
-/* Check if seq must be rendered. This depends on whole stack in some cases, not only seq itself.
- * Order of applying these conditions is important. */
-static bool must_render_strip(const Sequence *seq, SeqCollection *strips_at_timeline_frame)
-{
- bool seq_have_effect_in_stack = false;
- Sequence *seq_iter;
- SEQ_ITERATOR_FOREACH (seq_iter, strips_at_timeline_frame) {
- /* Strips is below another strip with replace blending are not rendered. */
- if (seq_iter->blend_mode == SEQ_BLEND_REPLACE && seq->machine < seq_iter->machine) {
- return false;
- }
-
- if ((seq_iter->type & SEQ_TYPE_EFFECT) != 0 && seq_is_effect_of(seq_iter, seq)) {
- /* Strips in same channel or higher than its effect are rendered. */
- if (seq->machine >= seq_iter->machine) {
- return true;
- }
- /* Mark that this strip has effect in stack, that is above the strip. */
- seq_have_effect_in_stack = true;
- }
- }
-
- /* All effects are rendered (with respect to conditions above). */
- if ((seq->type & SEQ_TYPE_EFFECT) != 0) {
- return true;
- }
-
- /* If strip has effects in stack, and all effects are above this strip, it is not rendered. */
- if (seq_have_effect_in_stack) {
- return false;
- }
-
- return true;
-}
-
-static SeqCollection *query_strips_at_frame(ListBase *seqbase, const int timeline_frame)
-{
- SeqCollection *collection = SEQ_collection_create(__func__);
-
- LISTBASE_FOREACH (Sequence *, seq, seqbase) {
- if (SEQ_time_strip_intersects_frame(seq, timeline_frame)) {
- SEQ_collection_append_strip(seq, collection);
- }
- }
- return collection;
-}
-
-static void collection_filter_channel_up_to_incl(SeqCollection *collection, const int channel)
-{
- Sequence *seq;
- SEQ_ITERATOR_FOREACH (seq, collection) {
- if (seq->machine <= channel) {
- continue;
- }
- SEQ_collection_remove_strip(seq, collection);
- }
-}
-
-/* Remove strips we don't want to render from collection. */
-static void collection_filter_rendered_strips(SeqCollection *collection)
-{
- Sequence *seq;
-
- /* Remove sound strips and muted strips from collection, because these are not rendered.
- * Function #must_render_strip() don't have to check for these strips anymore. */
- SEQ_ITERATOR_FOREACH (seq, collection) {
- if (seq->type == SEQ_TYPE_SOUND_RAM || (seq->flag & SEQ_MUTE) != 0) {
- SEQ_collection_remove_strip(seq, collection);
- }
- }
-
- SEQ_ITERATOR_FOREACH (seq, collection) {
- if (must_render_strip(seq, collection)) {
- continue;
- }
- SEQ_collection_remove_strip(seq, collection);
- }
-}
-
static int seq_channel_cmp_fn(const void *a, const void *b)
{
return (*(Sequence **)a)->machine - (*(Sequence **)b)->machine;
@@ -360,13 +273,7 @@ int seq_get_shown_sequences(ListBase *seqbase,
const int chanshown,
Sequence **r_seq_arr)
{
- SeqCollection *collection = query_strips_at_frame(seqbase, timeline_frame);
-
- if (chanshown != 0) {
- collection_filter_channel_up_to_incl(collection, chanshown);
- }
- collection_filter_rendered_strips(collection);
-
+ SeqCollection *collection = SEQ_query_rendered_strips(seqbase, timeline_frame, chanshown);
const int strip_count = BLI_gset_len(collection->set);
if (strip_count > MAXSEQ) {
@@ -504,7 +411,7 @@ static void sequencer_image_crop_transform_matrix(const Sequence *seq,
const float image_center_offs_y = (out->y - in->y) / 2;
const float translate_x = transform->xofs * preview_scale_factor + image_center_offs_x;
const float translate_y = transform->yofs * preview_scale_factor + image_center_offs_y;
- const float pivot[2] = {in->x / 2, in->y / 2};
+ const float pivot[2] = {in->x * transform->origin[0], in->y * transform->origin[1]};
loc_rot_size_to_mat3(r_transform_matrix,
(const float[]){translate_x, translate_y},
transform->rotation,
@@ -527,6 +434,31 @@ static void sequencer_image_crop_init(const Sequence *seq,
BLI_rctf_init(r_crop, left, in->x - right, bottom, in->y - top);
}
+static void sequencer_thumbnail_transform(ImBuf *in, ImBuf *out)
+{
+ float image_scale_factor = (float)out->x / in->x;
+ float transform_matrix[3][3];
+
+ /* Keep the same location and rotation, but scale the image to the thumbnail size limit. */
+ const float scale_x = 1 * image_scale_factor;
+ const float scale_y = 1 * image_scale_factor;
+ const float image_center_offs_x = (out->x - in->x) / 2;
+ const float image_center_offs_y = (out->y - in->y) / 2;
+ const float pivot[2] = {in->x / 2, in->y / 2};
+ loc_rot_size_to_mat3(transform_matrix,
+ (const float[]){image_center_offs_x, image_center_offs_y},
+ 0,
+ (const float[]){scale_x, scale_y});
+ transform_pivot_set_m3(transform_matrix, pivot);
+ invert_m3(transform_matrix);
+
+ /* No crop. */
+ rctf source_crop;
+ BLI_rctf_init(&source_crop, 0, in->x, 0, in->y);
+
+ IMB_transform(in, out, transform_matrix, &source_crop, IMB_FILTER_NEAREST);
+}
+
static void sequencer_preprocess_transform_crop(
ImBuf *in, ImBuf *out, const SeqRenderData *context, Sequence *seq, const bool is_proxy_image)
{
@@ -1989,7 +1921,164 @@ ImBuf *SEQ_render_give_ibuf_direct(const SeqRenderData *context,
seq_render_state_init(&state);
ImBuf *ibuf = seq_render_strip(context, &state, seq, timeline_frame);
-
return ibuf;
}
+
+/* Get the image directly from the source and scale it to thumbnail size. */
+static ImBuf *seq_get_uncached_thumbnail(const SeqRenderData *context,
+ SeqRenderState *state,
+ Sequence *seq,
+ float timeline_frame)
+{
+ bool is_proxy_image = false;
+ ImBuf *ibuf = do_render_strip_uncached(context, state, seq, timeline_frame, &is_proxy_image);
+
+ if (ibuf == NULL) {
+ return NULL;
+ }
+
+ float aspect_ratio = (float)ibuf->x / ibuf->y;
+ int rectx, recty;
+ /* Calculate new dimensions: SEQ_RENDER_THUMB_SIZE (256) for the larger of x and y. */
+ if (ibuf->x > ibuf->y) {
+ rectx = SEQ_RENDER_THUMB_SIZE;
+ recty = round_fl_to_int(rectx / aspect_ratio);
+ }
+ else {
+ recty = SEQ_RENDER_THUMB_SIZE;
+ rectx = round_fl_to_int(recty * aspect_ratio);
+ }
+
+ /* Scale ibuf to thumbnail size. */
+ ImBuf *scaled_ibuf = IMB_allocImBuf(rectx, recty, 32, ibuf->rect_float ? IB_rectfloat : IB_rect);
+ sequencer_thumbnail_transform(ibuf, scaled_ibuf);
+ seq_imbuf_assign_spaces(context->scene, scaled_ibuf);
+ IMB_freeImBuf(ibuf);
+
+ return scaled_ibuf;
+}
+
+/* Get a cached thumbnail, optionally clipped to the crop rectangle. */
+ImBuf *SEQ_get_thumbnail(
+ const SeqRenderData *context, Sequence *seq, float timeline_frame, rcti *crop, bool clipped)
+{
+ ImBuf *ibuf = seq_cache_get(context, seq, roundf(timeline_frame), SEQ_CACHE_STORE_THUMBNAIL);
+
+ if (!clipped || ibuf == NULL) {
+ return ibuf;
+ }
+
+ /* Do clipping. */
+ ImBuf *ibuf_cropped = IMB_dupImBuf(ibuf);
+ if (crop->xmin < 0 || crop->ymin < 0) {
+ crop->xmin = 0;
+ crop->ymin = 0;
+ }
+ if (crop->xmax >= ibuf->x || crop->ymax >= ibuf->y) {
+ crop->xmax = ibuf->x - 1;
+ crop->ymax = ibuf->y - 1;
+ }
+ IMB_rect_crop(ibuf_cropped, crop);
+ IMB_freeImBuf(ibuf);
+ return ibuf_cropped;
+}
+
+/* Render a series of thumbnails and store them in the cache. */
+void SEQ_render_thumbnails(const SeqRenderData *context,
+ Sequence *seq,
+ Sequence *seq_orig,
+ float start_frame,
+ float frame_step,
+ rctf *view_area,
+ const short *stop)
+{
+ SeqRenderState state;
+ seq_render_state_init(&state);
+
+ /* Add the hold offset value (seq->anim_startofs) to the start frame. The position of the
+ * image is not affected, but the frame that gets loaded is. */
+ start_frame = start_frame - frame_step;
+ float upper_thumb_bound = (seq->endstill) ? (seq->start + seq->len) : seq->enddisp;
+ upper_thumb_bound = (upper_thumb_bound > view_area->xmax) ? view_area->xmax + frame_step :
+ upper_thumb_bound;
+
+ while ((start_frame < upper_thumb_bound) && !*stop) {
+ ImBuf *ibuf = seq_cache_get(
+ context, seq_orig, round_fl_to_int(start_frame), SEQ_CACHE_STORE_THUMBNAIL);
+ if (ibuf) {
+ IMB_freeImBuf(ibuf);
+ start_frame += frame_step;
+ continue;
+ }
+
+ ibuf = seq_get_uncached_thumbnail(context, &state, seq, round_fl_to_int(start_frame));
+
+ if (ibuf) {
+ seq_cache_thumbnail_put(context, seq_orig, round_fl_to_int(start_frame), ibuf, view_area);
+ IMB_freeImBuf(ibuf);
+ seq_orig->flag &= ~SEQ_FLAG_SKIP_THUMBNAILS;
+ }
+ else {
+ /* Cannot open the source file. */
+ seq_orig->flag |= SEQ_FLAG_SKIP_THUMBNAILS;
+ return;
+ }
+
+ start_frame += frame_step;
+ }
+}
+
+/* Get the frame step for equally spaced thumbnails. These thumbnails should always be present
+ * in memory, so they can be used while zooming. */
+int SEQ_render_thumbnails_guaranteed_set_frame_step_get(const Sequence *seq)
+{
+ const int content_len = (seq->enddisp - seq->startdisp - seq->startstill - seq->endstill);
+
+ /* Arbitrary, but due to performance reasons should be as low as possible. */
+ const int thumbnails_base_set_count = min_ii(content_len / 100, 30);
+ if (thumbnails_base_set_count <= 0) {
+ return 0;
+ }
+ return content_len / thumbnails_base_set_count;
+}
+
+/* Render set of evenly spaced thumbnails that are drawn when zooming. */
+void SEQ_render_thumbnails_base_set(
+ const SeqRenderData *context, Sequence *seq, Sequence *seq_orig, rctf *view_area, const short *stop)
+{
+ SeqRenderState state;
+ seq_render_state_init(&state);
+
+ int timeline_frame = seq->startdisp;
+ const int frame_step = SEQ_render_thumbnails_guaranteed_set_frame_step_get(seq);
+
+ while (timeline_frame < seq->enddisp && !*stop) {
+ ImBuf *ibuf = seq_cache_get(
+ context, seq_orig, roundf(timeline_frame), SEQ_CACHE_STORE_THUMBNAIL);
+ if (ibuf) {
+ IMB_freeImBuf(ibuf);
+
+ if (frame_step == 0) {
+ return;
+ }
+
+ timeline_frame += frame_step;
+ continue;
+ }
+
+ ibuf = seq_get_uncached_thumbnail(context, &state, seq, timeline_frame);
+
+ if (ibuf) {
+ seq_cache_thumbnail_put(context, seq_orig, timeline_frame, ibuf, view_area);
+ IMB_freeImBuf(ibuf);
+ }
+
+ if (frame_step == 0) {
+ return;
+ }
+
+ timeline_frame += frame_step;
+ }
+}
+
/** \} */
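
To make the base-set spacing concrete: a strip with 600 frames of visible content gets min_ii(600 / 100, 30) = 6 guaranteed thumbnails, i.e. a step of 100 frames, while a strip with fewer than 100 content frames yields a step of 0 and the loops above stop after a single thumbnail. In numbers:

/* Illustrative values only. */
const int content_len = 600;                          /* enddisp - startdisp - stills. */
const int base_count = min_ii(content_len / 100, 30); /* = 6 */
const int frame_step = content_len / base_count;      /* = 100 */
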
diff --git a/source/blender/sequencer/intern/sequencer.c b/source/blender/sequencer/intern/sequencer.c
index bf5942090c9..382bd51aae1 100644
--- a/source/blender/sequencer/intern/sequencer.c
+++ b/source/blender/sequencer/intern/sequencer.c
@@ -79,6 +79,8 @@ static Strip *seq_strip_alloc(int type)
strip->transform = MEM_callocN(sizeof(struct StripTransform), "StripTransform");
strip->transform->scale_x = 1;
strip->transform->scale_y = 1;
+ strip->transform->origin[0] = 0.5f;
+ strip->transform->origin[1] = 0.5f;
strip->crop = MEM_callocN(sizeof(struct StripCrop), "StripCrop");
}
@@ -321,6 +323,7 @@ SequencerToolSettings *SEQ_tool_settings_init(void)
SEQ_SNAP_TO_STRIP_HOLD;
tool_settings->snap_distance = 15;
tool_settings->overlap_mode = SEQ_OVERLAP_SHUFFLE;
+ tool_settings->pivot_point = V3D_AROUND_LOCAL_ORIGINS;
return tool_settings;
}
@@ -377,6 +380,12 @@ eSeqOverlapMode SEQ_tool_settings_overlap_mode_get(Scene *scene)
return tool_settings->overlap_mode;
}
+int SEQ_tool_settings_pivot_point_get(Scene *scene)
+{
+ const SequencerToolSettings *tool_settings = SEQ_tool_settings_ensure(scene);
+ return tool_settings->pivot_point;
+}
+
/**
* Get seqbase that is being viewed currently. This can be main seqbase or meta strip seqbase
*
@@ -948,6 +957,8 @@ static bool seq_read_lib_cb(Sequence *seq, void *user_data)
BLI_listbase_clear(&seq->anims);
SEQ_modifier_blend_read_lib(reader, sce, &seq->modifiers);
+
+ seq->flag &= ~SEQ_FLAG_SKIP_THUMBNAILS;
return true;
}
diff --git a/source/blender/sequencer/intern/strip_transform.c b/source/blender/sequencer/intern/strip_transform.c
index 3a5f93a72b0..d5ff455c694 100644
--- a/source/blender/sequencer/intern/strip_transform.c
+++ b/source/blender/sequencer/intern/strip_transform.c
@@ -421,3 +421,101 @@ void SEQ_transform_offset_after_frame(Scene *scene,
}
}
}
+
+void SEQ_image_transform_mirror_factor_get(const Sequence *seq, float r_mirror[2])
+{
+ r_mirror[0] = 1.0f;
+ r_mirror[1] = 1.0f;
+
+ if ((seq->flag & SEQ_FLIPX) != 0) {
+ r_mirror[0] = -1.0f;
+ }
+ if ((seq->flag & SEQ_FLIPY) != 0) {
+ r_mirror[1] = -1.0f;
+ }
+}
+
+/**
+ * Get the strip transform origin's offset from the image center.
+ * Note: axis mirroring is applied to the returned offset.
+ *
+ * \param scene: Scene in which strips are located
+ * \param seq: Sequence to calculate image transform origin
+ * \param r_origin: return value
+ */
+void SEQ_image_transform_origin_offset_pixelspace_get(const Scene *scene,
+ const Sequence *seq,
+ float r_origin[2])
+{
+ float image_size[2];
+ StripElem *strip_elem = seq->strip->stripdata;
+ if (strip_elem == NULL) {
+ image_size[0] = scene->r.xsch;
+ image_size[1] = scene->r.ysch;
+ }
+ else {
+ image_size[0] = strip_elem->orig_width;
+ image_size[1] = strip_elem->orig_height;
+ }
+
+ const StripTransform *transform = seq->strip->transform;
+ r_origin[0] = (image_size[0] * transform->origin[0]) - (image_size[0] * 0.5f) + transform->xofs;
+ r_origin[1] = (image_size[1] * transform->origin[1]) - (image_size[1] * 0.5f) + transform->yofs;
+
+ float mirror[2];
+ SEQ_image_transform_mirror_factor_get(seq, mirror);
+ mul_v2_v2(r_origin, mirror);
+}
+
+/**
+ * Get the four corner points of the strip image, with transform and crop applied.
+ *
+ * \param scene: Scene in which strips are located
+ * \param seq: Sequence to calculate the image transform for
+ * \param r_quad: return value
+ */
+
+void SEQ_image_transform_final_quad_get(const Scene *scene,
+ const Sequence *seq,
+ float r_quad[4][2])
+{
+ StripTransform *transform = seq->strip->transform;
+ StripCrop *crop = seq->strip->crop;
+
+ int image_size[2] = {scene->r.xsch, scene->r.ysch};
+ if (ELEM(seq->type, SEQ_TYPE_MOVIE, SEQ_TYPE_IMAGE)) {
+ image_size[0] = seq->strip->stripdata->orig_width;
+ image_size[1] = seq->strip->stripdata->orig_height;
+ }
+
+ float transform_matrix[3][3];
+ loc_rot_size_to_mat3(transform_matrix,
+ (const float[]){transform->xofs, transform->yofs},
+ transform->rotation,
+ (const float[]){transform->scale_x, transform->scale_y});
+ const float origin[2] = {image_size[0] * transform->origin[0],
+ image_size[1] * transform->origin[1]};
+ const float pivot[2] = {origin[0] - (image_size[0] / 2), origin[1] - (image_size[1] / 2)};
+ transform_pivot_set_m3(transform_matrix, pivot);
+
+ r_quad[0][0] = (image_size[0] / 2) - crop->right;
+ r_quad[0][1] = (image_size[1] / 2) - crop->top;
+ r_quad[1][0] = (image_size[0] / 2) - crop->right;
+ r_quad[1][1] = (-image_size[1] / 2) + crop->bottom;
+ r_quad[2][0] = (-image_size[0] / 2) + crop->left;
+ r_quad[2][1] = (-image_size[1] / 2) + crop->bottom;
+ r_quad[3][0] = (-image_size[0] / 2) + crop->left;
+ r_quad[3][1] = (image_size[1] / 2) - crop->top;
+
+ mul_m3_v2(transform_matrix, r_quad[0]);
+ mul_m3_v2(transform_matrix, r_quad[1]);
+ mul_m3_v2(transform_matrix, r_quad[2]);
+ mul_m3_v2(transform_matrix, r_quad[3]);
+
+ float mirror[2];
+ SEQ_image_transform_mirror_factor_get(seq, mirror);
+ mul_v2_v2(r_quad[0], mirror);
+ mul_v2_v2(r_quad[1], mirror);
+ mul_v2_v2(r_quad[2], mirror);
+ mul_v2_v2(r_quad[3], mirror);
+}
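
How a caller might consume the three helpers above, e.g. to draw a strip's outline and pivot in the preview (a sketch; the drawing itself is left as a comment):

float quad[4][2], origin[2], mirror[2];
SEQ_image_transform_final_quad_get(scene, seq, quad);
SEQ_image_transform_origin_offset_pixelspace_get(scene, seq, origin);
SEQ_image_transform_mirror_factor_get(seq, mirror);
/* quad[0..3] now holds the transformed, cropped, mirrored corners in pixel
 * space around the image center; origin is the pivot point to mark. */
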
diff --git a/source/blender/sequencer/intern/utils.c b/source/blender/sequencer/intern/utils.c
index 1d3e7e4a223..8421aab5217 100644
--- a/source/blender/sequencer/intern/utils.c
+++ b/source/blender/sequencer/intern/utils.c
@@ -42,6 +42,7 @@
#include "SEQ_edit.h"
#include "SEQ_iterator.h"
#include "SEQ_relations.h"
+#include "SEQ_render.h"
#include "SEQ_select.h"
#include "SEQ_sequencer.h"
#include "SEQ_time.h"
diff --git a/source/blender/windowmanager/WM_api.h b/source/blender/windowmanager/WM_api.h
index 189a231616e..6794b1f4091 100644
--- a/source/blender/windowmanager/WM_api.h
+++ b/source/blender/windowmanager/WM_api.h
@@ -262,14 +262,21 @@ struct wmEventHandler_Keymap *WM_event_add_keymap_handler_priority(ListBase *han
wmKeyMap *keymap,
int priority);
-typedef struct wmKeyMap *(wmEventHandler_KeymapDynamicFn)(wmWindowManager *wm,
- struct wmEventHandler_Keymap *handler)
- ATTR_WARN_UNUSED_RESULT;
-
-struct wmKeyMap *WM_event_get_keymap_from_toolsystem_fallback(
- struct wmWindowManager *wm, struct wmEventHandler_Keymap *handler);
-struct wmKeyMap *WM_event_get_keymap_from_toolsystem(struct wmWindowManager *wm,
- struct wmEventHandler_Keymap *handler);
+typedef struct wmEventHandler_KeymapResult {
+ wmKeyMap *keymaps[3];
+ int keymaps_len;
+} wmEventHandler_KeymapResult;
+
+typedef void(wmEventHandler_KeymapDynamicFn)(wmWindowManager *wm,
+ struct wmEventHandler_Keymap *handler,
+ struct wmEventHandler_KeymapResult *km_result);
+
+void WM_event_get_keymap_from_toolsystem_fallback(struct wmWindowManager *wm,
+ struct wmEventHandler_Keymap *handler,
+ wmEventHandler_KeymapResult *km_result);
+void WM_event_get_keymap_from_toolsystem(struct wmWindowManager *wm,
+ struct wmEventHandler_Keymap *handler,
+ wmEventHandler_KeymapResult *km_result);
struct wmEventHandler_Keymap *WM_event_add_keymap_handler_dynamic(
ListBase *handlers, wmEventHandler_KeymapDynamicFn *keymap_fn, void *user_data);
@@ -281,8 +288,9 @@ void WM_event_set_keymap_handler_post_callback(struct wmEventHandler_Keymap *han
wmKeyMapItem *kmi,
void *user_data),
void *user_data);
-wmKeyMap *WM_event_get_keymap_from_handler(wmWindowManager *wm,
- struct wmEventHandler_Keymap *handler);
+void WM_event_get_keymaps_from_handler(wmWindowManager *wm,
+ struct wmEventHandler_Keymap *handler,
+ struct wmEventHandler_KeymapResult *km_result);
wmKeyMapItem *WM_event_match_keymap_item(struct bContext *C,
wmKeyMap *keymap,
@@ -707,6 +715,8 @@ void WM_event_fileselect_event(struct wmWindowManager *wm, void *ophandle, int e
void WM_operator_region_active_win_set(struct bContext *C);
+int WM_operator_flag_only_pass_through_on_press(int retval, const struct wmEvent *event);
+
/* drag and drop */
struct wmDrag *WM_event_start_drag(
struct bContext *C, int icon, int type, void *poin, double value, unsigned int flags);
@@ -787,6 +797,7 @@ enum {
WM_JOB_TYPE_QUADRIFLOW_REMESH,
WM_JOB_TYPE_TRACE_IMAGE,
WM_JOB_TYPE_LINEART,
+ WM_JOB_TYPE_SEQ_DRAW_THUMBNAIL,
/* add as needed, bake, seq proxy build
* if having hard coded values is a problem */
};
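
Under the new contract a dynamic keymap callback fills a result struct instead of returning a single map. A hypothetical callback; the keymap name and space/region values are illustrative, and WM_keymap_list_find_spaceid_or_empty() is the same lookup used elsewhere in this patch:

static void my_keymap_dynamic_fn(wmWindowManager *wm,
                                 struct wmEventHandler_Keymap *UNUSED(handler),
                                 struct wmEventHandler_KeymapResult *km_result)
{
  memset(km_result, 0x0, sizeof(*km_result));
  wmKeyMap *km = WM_keymap_list_find_spaceid_or_empty(
      &wm->userconf->keymaps, "My Tool Keymap", SPACE_VIEW3D, RGN_TYPE_WINDOW);
  if (km != NULL) {
    km_result->keymaps[km_result->keymaps_len++] = km;
  }
}
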
diff --git a/source/blender/windowmanager/gizmo/intern/wm_gizmo_group.c b/source/blender/windowmanager/gizmo/intern/wm_gizmo_group.c
index 213a3c2e342..22bdf65a169 100644
--- a/source/blender/windowmanager/gizmo/intern/wm_gizmo_group.c
+++ b/source/blender/windowmanager/gizmo/intern/wm_gizmo_group.c
@@ -265,6 +265,8 @@ void WM_gizmogroup_ensure_init(const bContext *C, wmGizmoGroup *gzgroup)
{
/* prepare for first draw */
if (UNLIKELY((gzgroup->init_flag & WM_GIZMOGROUP_INIT_SETUP) == 0)) {
+
+ gzgroup->use_fallback_keymap = true;
gzgroup->type->setup(C, gzgroup);
/* Not ideal, initialize keymap here, needed for RNA runtime generated gizmos. */
diff --git a/source/blender/windowmanager/intern/wm_dragdrop.c b/source/blender/windowmanager/intern/wm_dragdrop.c
index 76bb93b681c..6585349c83c 100644
--- a/source/blender/windowmanager/intern/wm_dragdrop.c
+++ b/source/blender/windowmanager/intern/wm_dragdrop.c
@@ -34,6 +34,7 @@
#include "BLT_translation.h"
#include "BLI_blenlib.h"
+#include "BLI_math_color.h"
#include "BIF_glutil.h"
@@ -50,6 +51,7 @@
#include "UI_interface.h"
#include "UI_interface_icons.h"
+#include "UI_resources.h"
#include "RNA_access.h"
@@ -426,7 +428,7 @@ ID *WM_drag_get_local_ID_or_import_from_asset(const wmDrag *drag, int idcode)
}
/**
- * \brief Free asset ID imported for cancelled drop.
+ * \brief Free asset ID imported for canceled drop.
*
* If the asset was imported (linked/appended) using #WM_drag_get_local_ID_or_import_from_asset()
* (typically via a #wmDropBox.copy() callback), we want the ID to be removed again if the drop
@@ -463,8 +465,14 @@ void WM_drag_free_imported_drag_ID(struct Main *bmain, wmDrag *drag, wmDropBox *
static void wm_drop_operator_draw(const char *name, int x, int y)
{
const uiFontStyle *fstyle = UI_FSTYLE_WIDGET;
- const float col_fg[4] = {1.0f, 1.0f, 1.0f, 1.0f};
- const float col_bg[4] = {0.0f, 0.0f, 0.0f, 0.2f};
+
+ /* Use the theme settings from tooltips. */
+ const bTheme *btheme = UI_GetTheme();
+ const uiWidgetColors *wcol = &btheme->tui.wcol_tooltip;
+
+ float col_fg[4], col_bg[4];
+ rgba_uchar_to_float(col_fg, wcol->text);
+ rgba_uchar_to_float(col_bg, wcol->inner);
UI_fontstyle_draw_simple_backdrop(fstyle, x, y, name, col_fg, col_bg);
}
diff --git a/source/blender/windowmanager/intern/wm_event_system.c b/source/blender/windowmanager/intern/wm_event_system.c
index ae09786356a..14fcc1d69cc 100644
--- a/source/blender/windowmanager/intern/wm_event_system.c
+++ b/source/blender/windowmanager/intern/wm_event_system.c
@@ -2989,9 +2989,18 @@ static int wm_handlers_do_intern(bContext *C, wmEvent *event, ListBase *handlers
/* Handle all types here. */
if (handler_base->type == WM_HANDLER_TYPE_KEYMAP) {
wmEventHandler_Keymap *handler = (wmEventHandler_Keymap *)handler_base;
- wmKeyMap *keymap = WM_event_get_keymap_from_handler(wm, handler);
- action |= wm_handlers_do_keymap_with_keymap_handler(
- C, event, handlers, handler, keymap, do_debug_handler);
+ wmEventHandler_KeymapResult km_result;
+ WM_event_get_keymaps_from_handler(wm, handler, &km_result);
+ int action_iter = WM_HANDLER_CONTINUE;
+ for (int km_index = 0; km_index < km_result.keymaps_len; km_index++) {
+ wmKeyMap *keymap = km_result.keymaps[km_index];
+ action_iter |= wm_handlers_do_keymap_with_keymap_handler(
+ C, event, handlers, handler, keymap, do_debug_handler);
+ if (action_iter & WM_HANDLER_BREAK) {
+ break;
+ }
+ }
+ action |= action_iter;
/* Clear the tool-tip whenever a key binding is handled; without this, tool-tips
* are kept when a modal operator starts (annoying but otherwise harmless). */
@@ -3905,17 +3914,34 @@ wmEventHandler_Keymap *WM_event_add_keymap_handler(ListBase *handlers, wmKeyMap
*
* Follow #wmEventHandler_KeymapDynamicFn signature.
*/
-wmKeyMap *WM_event_get_keymap_from_toolsystem_fallback(wmWindowManager *wm,
- wmEventHandler_Keymap *handler)
+void WM_event_get_keymap_from_toolsystem_fallback(wmWindowManager *wm,
+ wmEventHandler_Keymap *handler,
+ wmEventHandler_KeymapResult *km_result)
{
+ memset(km_result, 0x0, sizeof(*km_result));
+
+ const char *keymap_id_list[ARRAY_SIZE(km_result->keymaps)];
+ int keymap_id_list_len = 0;
+
ScrArea *area = handler->dynamic.user_data;
handler->keymap_tool = NULL;
bToolRef_Runtime *tref_rt = area->runtime.tool ? area->runtime.tool->runtime : NULL;
- if (tref_rt && tref_rt->keymap_fallback[0]) {
- const char *keymap_id = NULL;
+ if (tref_rt && tref_rt->keymap[0]) {
+ keymap_id_list[keymap_id_list_len++] = tref_rt->keymap;
+ }
+
+ bool is_gizmo_visible = false;
+ bool is_gizmo_highlight = false;
+
+ if (tref_rt && tref_rt->keymap_fallback[0]) {
+ bool add_keymap = false;
/* Support for the gizmo owning the tool keymap. */
- if (tref_rt->gizmo_group[0] != '\0' && tref_rt->keymap_fallback[0] != '\0') {
+
+ if (tref_rt->flag & TOOLREF_FLAG_FALLBACK_KEYMAP) {
+ add_keymap = true;
+ }
+ if (tref_rt->gizmo_group[0] != '\0') {
wmGizmoMap *gzmap = NULL;
wmGizmoGroup *gzgroup = NULL;
LISTBASE_FOREACH (ARegion *, region, &area->regionbase) {
@@ -3931,32 +3957,49 @@ wmKeyMap *WM_event_get_keymap_from_toolsystem_fallback(wmWindowManager *wm,
if (gzgroup->type->flag & WM_GIZMOGROUPTYPE_TOOL_FALLBACK_KEYMAP) {
/* If all are hidden, don't override. */
if (gzgroup->use_fallback_keymap) {
+ is_gizmo_visible = true;
wmGizmo *highlight = wm_gizmomap_highlight_get(gzmap);
- if (highlight == NULL) {
- keymap_id = tref_rt->keymap_fallback;
+ if (highlight) {
+ is_gizmo_highlight = true;
}
+ add_keymap = true;
}
}
}
}
+ if (add_keymap) {
+ keymap_id_list[keymap_id_list_len++] = tref_rt->keymap_fallback;
+ }
+ }
- if (keymap_id && keymap_id[0]) {
- wmKeyMap *km = WM_keymap_list_find_spaceid_or_empty(
- &wm->userconf->keymaps, keymap_id, area->spacetype, RGN_TYPE_WINDOW);
- /* We shouldn't use keymaps from unrelated spaces. */
- if (km != NULL) {
- handler->keymap_tool = area->runtime.tool;
- return km;
- }
- printf(
- "Keymap: '%s' not found for tool '%s'\n", tref_rt->keymap, area->runtime.tool->idname);
+ if (is_gizmo_visible && !is_gizmo_highlight) {
+ if (keymap_id_list_len == 2) {
+ SWAP(const char *, keymap_id_list[0], keymap_id_list[1]);
}
}
- return NULL;
+
+ for (int i = 0; i < keymap_id_list_len; i++) {
+ const char *keymap_id = keymap_id_list[i];
+ BLI_assert(keymap_id && keymap_id[0]);
+
+ wmKeyMap *km = WM_keymap_list_find_spaceid_or_empty(
+ &wm->userconf->keymaps, keymap_id, area->spacetype, RGN_TYPE_WINDOW);
+ /* We shouldn't use keymaps from unrelated spaces. */
+ if (km == NULL) {
+ printf("Keymap: '%s' not found for tool '%s'\n", keymap_id, area->runtime.tool->idname);
+ continue;
+ }
+ handler->keymap_tool = area->runtime.tool;
+ km_result->keymaps[km_result->keymaps_len++] = km;
+ }
}
-wmKeyMap *WM_event_get_keymap_from_toolsystem(wmWindowManager *wm, wmEventHandler_Keymap *handler)
+void WM_event_get_keymap_from_toolsystem(wmWindowManager *wm,
+ wmEventHandler_Keymap *handler,
+ wmEventHandler_KeymapResult *km_result)
{
+ memset(km_result, 0x0, sizeof(*km_result));
+
ScrArea *area = handler->dynamic.user_data;
handler->keymap_tool = NULL;
bToolRef_Runtime *tref_rt = area->runtime.tool ? area->runtime.tool->runtime : NULL;
@@ -3968,13 +4011,14 @@ wmKeyMap *WM_event_get_keymap_from_toolsystem(wmWindowManager *wm, wmEventHandle
/* We shouldn't use keymaps from unrelated spaces. */
if (km != NULL) {
handler->keymap_tool = area->runtime.tool;
- return km;
+ km_result->keymaps[km_result->keymaps_len++] = km;
+ }
+ else {
+ printf(
+ "Keymap: '%s' not found for tool '%s'\n", tref_rt->keymap, area->runtime.tool->idname);
}
- printf(
- "Keymap: '%s' not found for tool '%s'\n", tref_rt->keymap, area->runtime.tool->idname);
}
}
- return NULL;
}
struct wmEventHandler_Keymap *WM_event_add_keymap_handler_dynamic(
@@ -5088,18 +5132,22 @@ void WM_set_locked_interface(wmWindowManager *wm, bool lock)
/** \name Event / Keymap Matching API
* \{ */
-wmKeyMap *WM_event_get_keymap_from_handler(wmWindowManager *wm, wmEventHandler_Keymap *handler)
+void WM_event_get_keymaps_from_handler(wmWindowManager *wm,
+ wmEventHandler_Keymap *handler,
+ wmEventHandler_KeymapResult *km_result)
{
- wmKeyMap *keymap;
if (handler->dynamic.keymap_fn != NULL) {
- keymap = handler->dynamic.keymap_fn(wm, handler);
+ handler->dynamic.keymap_fn(wm, handler, km_result);
BLI_assert(handler->keymap == NULL);
}
else {
- keymap = WM_keymap_active(wm, handler->keymap);
+ memset(km_result, 0x0, sizeof(*km_result));
+ wmKeyMap *keymap = WM_keymap_active(wm, handler->keymap);
BLI_assert(keymap != NULL);
+ if (keymap != NULL) {
+ km_result->keymaps[km_result->keymaps_len++] = keymap;
+ }
}
- return keymap;
}
wmKeyMapItem *WM_event_match_keymap_item(bContext *C, wmKeyMap *keymap, const wmEvent *event)
@@ -5128,11 +5176,15 @@ wmKeyMapItem *WM_event_match_keymap_item_from_handlers(bContext *C,
else if (handler_base->poll == NULL || handler_base->poll(CTX_wm_region(C), event)) {
if (handler_base->type == WM_HANDLER_TYPE_KEYMAP) {
wmEventHandler_Keymap *handler = (wmEventHandler_Keymap *)handler_base;
- wmKeyMap *keymap = WM_event_get_keymap_from_handler(wm, handler);
- if (keymap && WM_keymap_poll(C, keymap)) {
- wmKeyMapItem *kmi = WM_event_match_keymap_item(C, keymap, event);
- if (kmi != NULL) {
- return kmi;
+ wmEventHandler_KeymapResult km_result;
+ WM_event_get_keymaps_from_handler(wm, handler, &km_result);
+ for (int km_index = 0; km_index < km_result.keymaps_len; km_index++) {
+ wmKeyMap *keymap = km_result.keymaps[km_index];
+ if (WM_keymap_poll(C, keymap)) {
+ wmKeyMapItem *kmi = WM_event_match_keymap_item(C, keymap, event);
+ if (kmi != NULL) {
+ return kmi;
+ }
}
}
}
diff --git a/source/blender/windowmanager/intern/wm_jobs.c b/source/blender/windowmanager/intern/wm_jobs.c
index 6494c337c10..2604105896d 100644
--- a/source/blender/windowmanager/intern/wm_jobs.c
+++ b/source/blender/windowmanager/intern/wm_jobs.c
@@ -230,7 +230,7 @@ bool WM_jobs_test(const wmWindowManager *wm, const void *owner, int job_type)
LISTBASE_FOREACH (wmJob *, wm_job, &wm->jobs) {
if (wm_job->owner == owner) {
if (ELEM(job_type, WM_JOB_TYPE_ANY, wm_job->job_type)) {
- if (wm_job->running || wm_job->suspended) {
+ if ((wm_job->flag & WM_JOB_PROGRESS) && (wm_job->running || wm_job->suspended)) {
return true;
}
}
diff --git a/source/blender/windowmanager/intern/wm_keymap.c b/source/blender/windowmanager/intern/wm_keymap.c
index f955abaed53..e5aedfc7f47 100644
--- a/source/blender/windowmanager/intern/wm_keymap.c
+++ b/source/blender/windowmanager/intern/wm_keymap.c
@@ -462,7 +462,9 @@ bool WM_keymap_poll(bContext *C, wmKeyMap *keymap)
/* Empty key-maps may be missing, or there may be a typo in the name.
* Warn early to avoid losing time investigating each case.
* When developing a customized Blender, though, you may want empty keymaps. */
- if (!U.app_template[0]) {
+ if (!U.app_template[0] &&
+ /* Fallback key-maps may be intentionally empty, don't flood the output. */
+ !BLI_str_endswith(keymap->idname, " (fallback)")) {
CLOG_WARN(WM_LOG_KEYMAPS, "empty keymap '%s'", keymap->idname);
}
}
@@ -1402,15 +1404,19 @@ static wmKeyMapItem *wm_keymap_item_find_handlers(const bContext *C,
LISTBASE_FOREACH (wmEventHandler *, handler_base, handlers) {
if (handler_base->type == WM_HANDLER_TYPE_KEYMAP) {
wmEventHandler_Keymap *handler = (wmEventHandler_Keymap *)handler_base;
- wmKeyMap *keymap = WM_event_get_keymap_from_handler(wm, handler);
- if (keymap && WM_keymap_poll((bContext *)C, keymap)) {
- wmKeyMapItem *kmi = wm_keymap_item_find_in_keymap(
- keymap, opname, properties, is_strict, params);
- if (kmi != NULL) {
- if (r_keymap) {
- *r_keymap = keymap;
+ wmEventHandler_KeymapResult km_result;
+ WM_event_get_keymaps_from_handler(wm, handler, &km_result);
+ for (int km_index = 0; km_index < km_result.keymaps_len; km_index++) {
+ wmKeyMap *keymap = km_result.keymaps[km_index];
+ if (WM_keymap_poll((bContext *)C, keymap)) {
+ wmKeyMapItem *kmi = wm_keymap_item_find_in_keymap(
+ keymap, opname, properties, is_strict, params);
+ if (kmi != NULL) {
+ if (r_keymap) {
+ *r_keymap = keymap;
+ }
+ return kmi;
}
- return kmi;
}
}
}
diff --git a/source/blender/windowmanager/intern/wm_operator_utils.c b/source/blender/windowmanager/intern/wm_operator_utils.c
index 81b597f7484..85a0a28de79 100644
--- a/source/blender/windowmanager/intern/wm_operator_utils.c
+++ b/source/blender/windowmanager/intern/wm_operator_utils.c
@@ -41,6 +41,24 @@
#include "ED_screen.h"
/* -------------------------------------------------------------------- */
+/** \name Generic Utilities
+ * \{ */
+
+/**
+ * Only finish + pass through for press events (allowing press-tweak).
+ */
+int WM_operator_flag_only_pass_through_on_press(int retval, const struct wmEvent *event)
+{
+ if ((event->val != KM_PRESS) &&
+ ((retval & OPERATOR_PASS_THROUGH) && (retval & OPERATOR_FINISHED))) {
+ retval &= ~OPERATOR_PASS_THROUGH;
+ }
+ return retval;
+}
+
+/** \} */
+
+/* -------------------------------------------------------------------- */
/** \name Value Interaction Helper
*
* Possible additions (add as needed).
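
A hypothetical call site for the new helper: an operator that finishes and normally passes the event through wraps its return value so the pass-through only survives on press events (enabling press-tweak):

static int my_op_invoke(bContext *UNUSED(C), wmOperator *UNUSED(op), const wmEvent *event)
{
  /* Hypothetical operator logic that finished and wants click-through on press. */
  int retval = OPERATOR_FINISHED | OPERATOR_PASS_THROUGH;
  /* Strips OPERATOR_PASS_THROUGH unless this is a press event. */
  return WM_operator_flag_only_pass_through_on_press(retval, event);
}
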
diff --git a/source/blender/windowmanager/intern/wm_toolsystem.c b/source/blender/windowmanager/intern/wm_toolsystem.c
index 5eaf026191f..0c24520d565 100644
--- a/source/blender/windowmanager/intern/wm_toolsystem.c
+++ b/source/blender/windowmanager/intern/wm_toolsystem.c
@@ -326,7 +326,10 @@ void WM_toolsystem_ref_set_from_runtime(struct bContext *C,
bool use_fallback_keymap = false;
if (tref->idname_fallback[0] || tref->runtime->keymap_fallback[0]) {
- if (tref_rt->gizmo_group[0]) {
+ if (tref_rt->flag & TOOLREF_FLAG_FALLBACK_KEYMAP) {
+ use_fallback_keymap = true;
+ }
+ else if (tref_rt->gizmo_group[0]) {
wmGizmoGroupType *gzgt = WM_gizmogrouptype_find(tref_rt->gizmo_group, false);
if (gzgt) {
if (gzgt->flag & WM_GIZMOGROUPTYPE_TOOL_FALLBACK_KEYMAP) {
diff --git a/source/blender/windowmanager/intern/wm_window.c b/source/blender/windowmanager/intern/wm_window.c
index 887aed7ffc7..8baf4a0e013 100644
--- a/source/blender/windowmanager/intern/wm_window.c
+++ b/source/blender/windowmanager/intern/wm_window.c
@@ -2426,10 +2426,15 @@ void wm_window_IME_end(wmWindow *win)
void *WM_opengl_context_create(void)
{
- /* On Windows there is a problem creating contexts that share lists
- * from one context that is current in another thread.
- * So we should call this function only on the main thread.
- */
+ /* On Windows there is a problem creating contexts that share resources (almost any object,
+ * including legacy display lists, but also textures) with a context which is current in another
+ * thread. This is a documented behavior of both `::wglCreateContextAttribsARB()` and
+ * `::wglShareLists()`.
+ *
+ * Other platforms might successfully share resources from a context which is active somewhere
+ * else, but to keep our code behaving the same on all platforms we expect contexts to only be
+ * created from the main thread. */
+
BLI_assert(BLI_thread_is_main());
BLI_assert(GPU_framebuffer_active_get() == GPU_framebuffer_back_get());
diff --git a/tests/performance/tests/cycles.py b/tests/performance/tests/cycles.py
index bac6b8a7ceb..e702fa445d2 100644
--- a/tests/performance/tests/cycles.py
+++ b/tests/performance/tests/cycles.py
@@ -17,6 +17,16 @@ def _run(args):
scene.render.image_settings.file_format = 'PNG'
scene.cycles.device = 'CPU' if device_type == 'CPU' else 'GPU'
+ if scene.cycles.use_adaptive_sampling:
+ # Render the sample count specified in the file; there is no
+ # other way to measure adaptive sampling performance reliably.
+ scene.cycles.time_limit = 0.0
+ else:
+ # Render for a fixed amount of time so the test adapts to the
+ # machine and devices.
+ scene.cycles.samples = 16384
+ scene.cycles.time_limit = 10.0
+
if scene.cycles.device == 'GPU':
# Enable specified GPU in preferences.
prefs = bpy.context.preferences
@@ -62,12 +72,14 @@ class CyclesTest(api.Test):
'device_index': device_index,
'render_filepath': str(env.log_file.parent / (env.log_file.stem + '.png'))}
- _, lines = env.run_in_blender(_run, args, ['--debug-cycles', '--verbose', '1', self.filepath])
+ _, lines = env.run_in_blender(_run, args, ['--debug-cycles', '--verbose', '2', self.filepath])
# Parse render time from output
prefix_time = "Render time (without synchronization): "
prefix_memory = "Peak: "
+ prefix_time_per_sample = "Average time per sample: "
time = None
+ time_per_sample = None
memory = None
for line in lines:
line = line.strip()
@@ -75,12 +87,20 @@ class CyclesTest(api.Test):
if offset != -1:
time = line[offset + len(prefix_time):]
time = float(time)
+ offset = line.find(prefix_time_per_sample)
+ if offset != -1:
+ time_per_sample = line[offset + len(prefix_time_per_sample):]
+ time_per_sample = time_per_sample.split()[0]
+ time_per_sample = float(time_per_sample)
offset = line.find(prefix_memory)
if offset != -1:
memory = line[offset + len(prefix_memory):]
memory = memory.split()[0].replace(',', '')
memory = float(memory)
+ if time_per_sample:
+ time = time_per_sample
+
if not (time and memory):
raise Exception("Error parsing render time output")
@@ -88,5 +108,5 @@ class CyclesTest(api.Test):
def generate(env):
- filepaths = env.find_blend_files('cycles-x/*')
+ filepaths = env.find_blend_files('cycles/*')
return [CyclesTest(filepath) for filepath in filepaths]
diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt
index a1b94abc317..75f00c3c5cc 100644
--- a/tests/python/CMakeLists.txt
+++ b/tests/python/CMakeLists.txt
@@ -637,7 +637,6 @@ if(WITH_CYCLES OR WITH_OPENGL_RENDER_TESTS)
set(render_tests
bsdf
denoise
- denoise_animation
displacement
hair
image_colorspace