9 files changed, 65 insertions, 43 deletions
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
index b62aa9663ec..e036b53b810 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h
@@ -95,6 +95,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          int dy,
                                                          float *difference_image,
                                                          float *image,
+                                                         float *temp_image,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int* rect,
diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
index 26777fdabb2..4c758711481 100644
--- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
@@ -191,6 +191,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          int dy,
                                                          float *difference_image,
                                                          float *image,
+                                                         float *temp_image,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int *rect,
@@ -200,7 +201,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
 #ifdef KERNEL_STUB
 	STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
 #else
-	kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f);
+	kernel_filter_nlm_update_output(dx, dy, difference_image, image, temp_image, out_image, accum_image, load_int4(rect), stride, f);
 #endif
 }
 
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
index b77b7350d86..ae4fd85780d 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
@@ -26,7 +26,7 @@ template<typename T> struct TextureInterpolator  {
 		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
 		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
 		u[3] = (1.0f / 6.0f) * t * t * t; \
-	} (void)0
+	} (void) 0
 
 	static ccl_always_inline float4 read(float4 r)
 	{
@@ -540,4 +540,4 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, int id, float x,
 
 CCL_NAMESPACE_END
 
-#endif // __KERNEL_CPU_IMAGE_H__
+#endif  // __KERNEL_CPU_IMAGE_H__
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 5ec1655ab05..759b7e4c20d 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -97,7 +97,7 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
 	{
 		kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
 	}
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 }
 
 /* Film */
@@ -120,7 +120,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
 	                            x, y,
 	                            offset,
 	                            stride);
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 }
 
 void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -141,7 +141,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 	                                  x, y,
 	                                  offset,
 	                                  stride);
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 }
 
 /* Shader Evaluate */
@@ -176,7 +176,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 	else {
 		kernel_background_evaluate(kg, input, output, i);
 	}
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 }
 
 #else  /* __SPLIT_KERNEL__ */
@@ -208,7 +208,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 		ccl_local type locals; \
 		kernel_##name(kg, &locals); \
 	}
-#endif /* KERNEL_STUB */
+#endif  /* KERNEL_STUB */
 
 DEFINE_SPLIT_KERNEL_FUNCTION(path_init)
 DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu
index 0561c40e6b1..b856cbde45c 100644
--- a/intern/cycles/kernel/kernels/cuda/filter.cu
+++ b/intern/cycles/kernel/kernels/cuda/filter.cu
@@ -140,7 +140,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                        int w,
                                        int h,
                                        int stride,
-                                       int shift_stride,
+                                       int pass_stride,
                                        int r,
                                        int channel_offset,
                                        float a,
@@ -148,7 +148,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
 		                                  weight_image,
 		                                  variance_image,
@@ -165,13 +165,13 @@ kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
                             int w,
                             int h,
                             int stride,
-                            int shift_stride,
+                            int pass_stride,
                             int r,
                             int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_blur(co.x, co.y,
 		                       difference_image + ofs,
 		                       out_image + ofs,
@@ -186,13 +186,13 @@ kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
                                    int w,
                                    int h,
                                    int stride,
-                                   int shift_stride,
+                                   int pass_stride,
                                    int r,
                                    int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_calc_weight(co.x, co.y,
 		                              difference_image + ofs,
 		                              out_image + ofs,
@@ -209,13 +209,13 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
                                      int w,
                                      int h,
                                      int stride,
-                                     int shift_stride,
+                                     int pass_stride,
                                      int r,
                                      int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
 		                                difference_image + ofs,
 		                                image,
@@ -252,14 +252,13 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
                                          int w,
                                          int h,
                                          int stride,
-                                         int shift_stride,
+                                         int pass_stride,
                                          int r,
-                                         int f,
-                                         int pass_stride)
+                                         int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+	if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
 		kernel_filter_nlm_construct_gramian(co.x, co.y,
 		                                    co.z, co.w,
 		                                    difference_image + ofs,
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 8a180a509e8..af311027f78 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -40,14 +40,21 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_path_trace(WorkTile *tile, uint total_work_size)
 {
 	int work_index = ccl_global_id(0);
-
-	if(work_index < total_work_size) {
-		uint x, y, sample;
+	bool thread_is_active = work_index < total_work_size;
+	uint x, y, sample;
+	KernelGlobals kg;
+	if(thread_is_active) {
 		get_work_pixel(tile, work_index, &x, &y, &sample);
 
-		KernelGlobals kg;
 		kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
 	}
+
+	if(kernel_data.film.cryptomatte_passes) {
+		__syncthreads();
+		if(thread_is_active) {
+			kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
+		}
+	}
 }
 
 #ifdef __BRANCHED_PATH__
@@ -56,14 +63,21 @@ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
 kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size)
 {
 	int work_index = ccl_global_id(0);
-
-	if(work_index < total_work_size) {
-		uint x, y, sample;
+	bool thread_is_active = work_index < total_work_size;
+	uint x, y, sample;
+	KernelGlobals kg;
+	if(thread_is_active) {
 		get_work_pixel(tile, work_index, &x, &y, &sample);
 
-		KernelGlobals kg;
 		kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
 	}
+	
+	if(kernel_data.film.cryptomatte_passes) {
+		__syncthreads();
+		if(thread_is_active) {
+			kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride);
+		}
+	}
 }
 #endif
 
diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl
index 3c75754fb39..a550f97f4eb 100644
--- a/intern/cycles/kernel/kernels/opencl/filter.cl
+++ b/intern/cycles/kernel/kernels/opencl/filter.cl
@@ -132,7 +132,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
                                                     int w,
                                                     int h,
                                                     int stride,
-                                                    int shift_stride,
+                                                    int pass_stride,
                                                     int r,
                                                     int channel_offset,
                                                     float a,
@@ -140,7 +140,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
 		                                  weight_image,
 		                                  variance_image,
@@ -155,13 +155,13 @@ __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict di
                                          int w,
                                          int h,
                                          int stride,
-                                         int shift_stride,
+                                         int pass_stride,
                                          int r,
                                          int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_blur(co.x, co.y,
 		                       difference_image + ofs,
 		                       out_image + ofs,
@@ -174,13 +174,13 @@ __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_rest
                                                 int w,
                                                 int h,
                                                 int stride,
-                                                int shift_stride,
+                                                int pass_stride,
                                                 int r,
                                                 int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_calc_weight(co.x, co.y,
 		                              difference_image + ofs,
 		                              out_image + ofs,
@@ -195,13 +195,13 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re
                                                   int w,
                                                   int h,
                                                   int stride,
-                                                  int shift_stride,
+                                                  int pass_stride,
                                                   int r,
                                                   int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+	if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
 		kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
 		                                difference_image + ofs,
 		                                image,
@@ -234,14 +234,13 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
                                                       int w,
                                                       int h,
                                                       int stride,
-                                                      int shift_stride,
+                                                      int pass_stride,
                                                       int r,
-                                                      int f,
-                                                      int pass_stride)
+                                                      int f)
 {
 	int4 co, rect;
 	int ofs;
-	if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+	if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
 		kernel_filter_nlm_construct_gramian(co.x, co.y,
 		                                    co.z, co.w,
 		                                    difference_image + ofs,
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 63128d0aecf..de1f5088629 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -66,9 +66,17 @@ __kernel void kernel_ocl_path_trace(
 
 	int x = sx + ccl_global_id(0);
 	int y = sy + ccl_global_id(1);
-
-	if(x < sx + sw && y < sy + sh)
+	bool thread_is_active = x < sx + sw && y < sy + sh;
+	if(thread_is_active) {
 		kernel_path_trace(kg, buffer, sample, x, y, offset, stride);
+	}
+	if(kernel_data.film.cryptomatte_passes) {
+		/* Make sure no thread is writing to the buffers. */
+		ccl_barrier(CCL_LOCAL_MEM_FENCE);
+		if(thread_is_active) {
+			kernel_cryptomatte_post(kg, buffer, sample, x, y, offset, stride);
+		}
+	}
 }
 
 #else  /* __COMPILE_ONLY_MEGAKERNEL__ */
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
index dd9d683e030..79af831c2fb 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
+++ b/intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
@@ -142,7 +142,7 @@ ccl_device_inline float svm_image_texture_frac(float x, int *ix)
 		u[1] =  ((      0.5f * t - 1.0f) * t       ) * t + (2.0f/3.0f); \
 		u[2] =  ((     -0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f/6.0f); \
 		u[3] = (1.0f / 6.0f) * t * t * t; \
-	} (void)0
+	} (void) 0
 
 ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
 {