Cycles: Added Cryptomatte output.

This allows for extra output passes that encode automatic object and material masks for the entire scene. It is an implementation of the Cryptomatte standard as introduced by Psyop. A good future extension would be to add a manifest to the export and to do plenty of testing to ensure that it is fully compatible with other renderers and compositing programs that use Cryptomatte.

Internally, it adds the ability for Cycles to have several passes of the same type that are distinguished by their name.

Differential Revision: https://developer.blender.org/D3538
author: Stefan Werner <stefan.werner@tangent-animation.com> 2018-10-28 12:37:41 +0300
committer: Stefan Werner <stefan.werner@tangent-animation.com> 2018-10-28 12:37:41 +0300
commit: e58c6cf0c678849cf9c348a8df5e0ec24a6abd4d (patch)
tree: 7c6dc887ad93e87ad56c3d0c627265a99c7f1e7d /intern/cycles/kernel
parent: c0b3e3daebd36a483e659d32e6517f2fb9b0e277 (diff)
10 files changed, 233 insertions, 16 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index b48ed649a8c..08efede36df 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -96,6 +96,7 @@ set(SRC_HEADERS
 	kernel_emission.h
 	kernel_film.h
 	kernel_globals.h
+	kernel_id_passes.h
 	kernel_jitter.h
 	kernel_light.h
 	kernel_math.h
diff --git a/intern/cycles/kernel/geom/geom_object.h b/intern/cycles/kernel/geom/geom_object.h
index cfe17e63627..0eb8ce2cf8b 100644
--- a/intern/cycles/kernel/geom/geom_object.h
+++ b/intern/cycles/kernel/geom/geom_object.h
@@ -304,6 +304,24 @@ ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 	return kernel_tex_fetch(__shaders, (sd->shader & SHADER_MASK)).pass_id;
 }
 
+/* Cryptomatte ID */
+
+ccl_device_inline float object_cryptomatte_id(KernelGlobals *kg, int object)
+{
+	if(object == OBJECT_NONE)
+		return 0.0f;
+
+	return kernel_tex_fetch(__objects, object).cryptomatte_object;
+}
+
+ccl_device_inline float object_cryptomatte_asset_id(KernelGlobals *kg, int object)
+{
+	if(object == OBJECT_NONE)
+		return 0;
+
+	return kernel_tex_fetch(__objects, object).cryptomatte_asset;
+}
+
 /* Particle data from which object was instanced */
 
 ccl_device_inline uint particle_index(KernelGlobals *kg, int particle)
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index 74cfacb5bc1..37402f42863 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -21,6 +21,7 @@
 
 #ifdef __KERNEL_CPU__
 #  include "util/util_vector.h"
+#  include "util/util_map.h"
 #endif
 
 #ifdef __KERNEL_OPENCL__
@@ -42,6 +43,8 @@ struct OSLThreadData;
 struct OSLShadingSystem;
 #  endif
 
+typedef unordered_map<float, float> CoverageMap;
+
 struct Intersection;
 struct VolumeStep;
 
@@ -68,6 +71,11 @@ typedef struct KernelGlobals {
 	VolumeStep *decoupled_volume_steps[2];
 	int decoupled_volume_steps_index;
 
+	/* A buffer for storing per-pixel coverage for Cryptomatte. */
+	CoverageMap *coverage_object;
+	CoverageMap *coverage_material;
+	CoverageMap *coverage_asset;
+
 	/* split kernel */
 	SplitData split_data;
 	SplitParams split_param_data;
diff --git a/intern/cycles/kernel/kernel_id_passes.h b/intern/cycles/kernel/kernel_id_passes.h
new file mode 100644
index 00000000000..486c61d2ae5
--- /dev/null
+++ b/intern/cycles/kernel/kernel_id_passes.h
@@ -0,0 +1,94 @@
+/*
+* Copyright 2018 Blender Foundation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer, int num_slots, float id, float weight)
+{
+	kernel_assert(id != ID_NONE);
+	if(weight == 0.0f) {
+		return;
+	}
+	
+	for(int slot = 0; slot < num_slots; slot++) {
+		ccl_global float2 *id_buffer = (ccl_global float2*)buffer;
+#ifdef __ATOMIC_PASS_WRITE__
+		/* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
+		if(id_buffer[slot].x == ID_NONE) {
+			/* Use an atomic to claim this slot.
+			* If a different thread got here first, try again from this slot on. */
+			float old_id = atomic_compare_and_swap_float(buffer+slot*2, ID_NONE, id);
+			if(old_id != ID_NONE && old_id != id) {
+				continue;
+			}
+			atomic_add_and_fetch_float(buffer+slot*2+1, weight);
+			break;
+		}
+		/* If there already is a slot for that ID, add the weight.
+		 * If no slot was found, add it to the last. */
+		else if(id_buffer[slot].x == id || slot == num_slots - 1) {
+			atomic_add_and_fetch_float(buffer+slot*2+1, weight);
+			break;
+		}
+#else /* __ATOMIC_PASS_WRITE__ */
+		/* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
+		if(id_buffer[slot].x == ID_NONE) {
+			id_buffer[slot].x = id;
+			id_buffer[slot].y = weight;
+			break;
+		}
+		/* If there already is a slot for that ID, add the weight.
+		* If no slot was found, add it to the last. */
+		else if(id_buffer[slot].x == id || slot == num_slots - 1) {
+			id_buffer[slot].y += weight;
+			break;
+		}
+#endif /* __ATOMIC_PASS_WRITE__ */
+	}
+}
+
+ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_slots)
+{
+	ccl_global float2 *id_buffer = (ccl_global float2*)buffer;
+	for(int slot = 1; slot < num_slots; ++slot) {
+		if(id_buffer[slot].x == ID_NONE) {
+			return;
+		}
+		/* Since we're dealing with a tiny number of elements, insertion sort should be fine. */
+		int i = slot;
+		while(i > 0 && id_buffer[i].y > id_buffer[i - 1].y) {
+			float2 swap = id_buffer[i];
+			id_buffer[i] = id_buffer[i - 1];
+			id_buffer[i - 1] = swap;
+			--i;
+		}
+	}
+}
+
+#ifdef __KERNEL_GPU__
+/* post-sorting for Cryptomatte */
+ccl_device void kernel_cryptomatte_post(KernelGlobals *kg, ccl_global float *buffer, uint sample, int x, int y, int offset, int stride)
+{
+	if(sample - 1 == kernel_data.integrator.aa_samples) {
+		int index = offset + x + y * stride;
+		int pass_stride = kernel_data.film.pass_stride;
+		ccl_global float *cryptomatte_buffer = buffer + index * pass_stride + kernel_data.film.pass_cryptomatte;
+		kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
+	}
+}
+#endif
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 458aa6c2a97..e256a1819ed 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -14,12 +14,14 @@
  * limitations under the License.
  */
 
-CCL_NAMESPACE_BEGIN
-
 #if defined(__SPLIT_KERNEL__) || defined(__KERNEL_CUDA__)
 #define __ATOMIC_PASS_WRITE__
 #endif
 
+#include "kernel/kernel_id_passes.h"
+
+CCL_NAMESPACE_BEGIN
+
 ccl_device_inline void kernel_write_pass_float(ccl_global float *buffer, float value)
 {
 	ccl_global float *buf = buffer;
@@ -189,6 +191,23 @@ ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
 }
 #endif /* __KERNEL_DEBUG__ */
 
+#ifdef __KERNEL_CPU__
+#define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) kernel_write_id_pass_cpu(buffer, depth * 2, id, matte_weight, kg->coverage_##name)
+ccl_device_inline size_t kernel_write_id_pass_cpu(float *buffer, size_t depth, float id, float matte_weight, CoverageMap *map)
+{
+	if(map) {
+		(*map)[id] += matte_weight;
+		return 0;
+	}
+#else /* __KERNEL_CPU__ */
+#define WRITE_ID_SLOT(buffer, depth, id, matte_weight, name) kernel_write_id_slots_gpu(buffer, depth * 2, id, matte_weight) 
+ccl_device_inline size_t kernel_write_id_slots_gpu(ccl_global float *buffer, size_t depth, float id, float matte_weight)
+{
+#endif /* __KERNEL_CPU__ */
+	kernel_write_id_slots(buffer, depth, id, matte_weight);
+	return depth * 2;
+}
+
 ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
 	ShaderData *sd, ccl_addr_space PathState *state, float3 throughput)
 {
@@ -242,6 +261,26 @@ ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
 		}
 	}
 
+	if(kernel_data.film.cryptomatte_passes) {
+		const float matte_weight = average(throughput) * (1.0f - average(shader_bsdf_transparency(kg, sd)));
+		if(matte_weight > 0.0f) {
+			ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
+			if(kernel_data.film.cryptomatte_passes & CRYPT_OBJECT) {
+				float id = object_cryptomatte_id(kg, sd->object);
+				cryptomatte_buffer += WRITE_ID_SLOT(cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, object);
+			}
+			if(kernel_data.film.cryptomatte_passes & CRYPT_MATERIAL) {
+				float id = shader_cryptomatte_id(kg, sd->shader);
+				cryptomatte_buffer += WRITE_ID_SLOT(cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, material);
+			}
+			if(kernel_data.film.cryptomatte_passes & CRYPT_ASSET) {
+				float id = object_cryptomatte_asset_id(kg, sd->object);
+				cryptomatte_buffer += WRITE_ID_SLOT(cryptomatte_buffer, kernel_data.film.cryptomatte_depth, id, matte_weight, asset);
+			}
+		}
+	}
+
+
 	if(light_flag & PASSMASK_COMPONENT(DIFFUSE))
 		L->color_diffuse += shader_bsdf_diffuse(kg, sd)*throughput;
 	if(light_flag & PASSMASK_COMPONENT(GLOSSY))
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index e834b701f96..af883aa715b 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -1276,4 +1276,9 @@ ccl_device bool shader_transparent_shadow(KernelGlobals *kg, Intersection *isect
 }
 #endif  /* __TRANSPARENT_SHADOWS__ */
 
+ccl_device float shader_cryptomatte_id(KernelGlobals *kg, int shader)
+{
+	return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).cryptomatte_id;
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index e93100a6442..f46b06f87f9 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -53,6 +53,7 @@ CCL_NAMESPACE_BEGIN
 #define OBJECT_NONE				(~0)
 #define PRIM_NONE				(~0)
 #define LAMP_NONE				(~0)
+#define ID_NONE					(0.0f)
 
 #define VOLUME_STACK_SIZE		32
 
@@ -415,6 +416,7 @@ typedef enum PassType {
 	PASS_RAY_BOUNCES,
 #endif
 	PASS_RENDER_TIME,
+	PASS_CRYPTOMATTE,
 	PASS_CATEGORY_MAIN_END = 31,
 
 	PASS_MIST = 32,
@@ -443,6 +445,14 @@ typedef enum PassType {
 
 #define PASS_ANY (~0)
 
+typedef enum CryptomatteType {
+	CRYPT_NONE = 0,
+	CRYPT_OBJECT = (1 << 0),
+	CRYPT_MATERIAL = (1 << 1),
+	CRYPT_ASSET = (1 << 2),
+	CRYPT_ACCURATE = (1 << 3),
+} CryptomatteType;
+
 typedef enum DenoisingPassOffsets {
 	DENOISING_PASS_NORMAL             = 0,
 	DENOISING_PASS_NORMAL_VAR         = 3,
@@ -1260,17 +1270,20 @@ typedef struct KernelFilm {
 	int pass_shadow;
 	float pass_shadow_scale;
 	int filter_table_offset;
+	int cryptomatte_passes;
+	int cryptomatte_depth;
+	int pass_cryptomatte;
 
 	int pass_mist;
 	float mist_start;
 	float mist_inv_depth;
 	float mist_falloff;
-
+	
 	int pass_denoising_data;
 	int pass_denoising_clean;
 	int denoising_flags;
 
-	int pad1, pad2, pad3;
+	int pad1, pad2;
 
 	/* XYZ to rendering color space transform. float4 instead of float3 to
 	 * ensure consistent padding/alignment across devices. */
@@ -1460,7 +1473,11 @@ typedef struct KernelObject {
 	uint patch_map_offset;
 	uint attribute_map_offset;
 	uint motion_offset;
-	uint pad;
+	uint pad1;
+
+	float cryptomatte_object;
+	float cryptomatte_asset;
+	float pad2, pad3;
 } KernelObject;
 static_assert_align(KernelObject, 16);
 
@@ -1540,7 +1557,7 @@ static_assert_align(KernelParticle, 16);
 
 typedef struct KernelShader {
 	float constant_emission[3];
-	float pad1;
+	float cryptomatte_id;
 	int flags;
 	int pass_id;
 	int pad2, pad3;
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 8a180a509e8..af311027f78 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -40,14 +40,21 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
 {
 	int work_index = ccl_global_id(0);
-
-	if(work_index < total_work_size) {
-		uint x, y, sample;
+	bool thread_is_active = work_index < total_work_size;
+	uint x, y, sample;
+	KernelGlobals kg;
+	if(thread_is_active) {
 		get_work_pixel(tile, work_index, &x, &y, &sample);
 
-		KernelGlobals kg;
 		kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
 	}
+
+	if(kernel_data.film.cryptomatte_passes) {
+		__syncthreads();
+		if(thread_is_active) {
+			kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
+		}
+	}
 }
 
 #ifdef __BRANCHED_PATH__
@@ -56,14 +63,21 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
 kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
 {
 	int work_index = ccl_global_id(0);
-
-	if(work_index < total_work_size) {
-		uint x, y, sample;
+	bool thread_is_active = work_index < total_work_size;
+	uint x, y, sample;
+	KernelGlobals kg;
+	if(thread_is_active) {
 		get_work_pixel(tile, work_index, &x, &y, &sample);
 
-		KernelGlobals kg;
 		kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
 	}
+	
+	if(kernel_data.film.cryptomatte_passes) {
+		__syncthreads();
+		if(thread_is_active) {
+			kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
+		}
+	}
 }
 #endif
 
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 63128d0aecf..de1f5088629 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -66,9 +66,17 @@ __kernel void kernel_ocl_path_trace(
 
 	int x = sx + ccl_global_id(0);
 	int y = sy + ccl_global_id(1);
-
-	if(x < sx + sw && y < sy + sh)
+	bool thread_is_active = x < sx + sw && y < sy + sh;
+	if(thread_is_active) {
 		kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
+	}
+	if(kernel_data.film.cryptomatte_passes) {
+		/* Make sure no thread is writing to the buffers. */
+		ccl_barrier(CCL_LOCAL_MEM_FENCE);
+		if(thread_is_active) {
+			kernel_cryptomatte_post(kg, buffer, sample, x, y, offset, stride);
+		}
+	}
 }
 
 #else  /* __COMPILE_ONLY_MEGAKERNEL__ */
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 180c0b57077..18eec6372f1 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -80,8 +80,10 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 	PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 	ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
 	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
+	bool ray_was_updated = false;
 
 	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+		ray_was_updated = true;
 		uint sample = state->sample;
 		uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
 		ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
@@ -92,6 +94,17 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
 		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
 	}
 
+	if(kernel_data.film.cryptomatte_passes) {
+		/* Make sure no thread is writing to the buffers. */
+		ccl_barrier(CCL_LOCAL_MEM_FENCE);
+		if(ray_was_updated && state->sample - 1 == kernel_data.integrator.aa_samples) {
+			uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+			ccl_global float *buffer = kernel_split_params.tile.buffer + buffer_offset;
+			ccl_global float *cryptomatte_buffer = buffer + kernel_data.film.pass_cryptomatte;
+			kernel_sort_id_slots(cryptomatte_buffer, 2 * kernel_data.film.cryptomatte_depth);
+		}
+	}
+
 	if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
 		/* We have completed current work; So get next work */
 		ccl_global uint *work_pools = kernel_split_params.work_pools;
author	Stefan Werner <stefan.werner@tangent-animation.com>	2018-10-28 12:37:41 +0300
committer	Stefan Werner <stefan.werner@tangent-animation.com>	2018-10-28 12:37:41 +0300
commit	e58c6cf0c678849cf9c348a8df5e0ec24a6abd4d (patch)
tree	7c6dc887ad93e87ad56c3d0c627265a99c7f1e7d /intern/cycles/kernel
parent	c0b3e3daebd36a483e659d32e6517f2fb9b0e277 (diff)