13 files changed, 91 insertions, 24 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index f06963c146e..832f4d1c1fd 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -343,7 +343,7 @@ public:
 #ifdef WITH_OSL
 		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
-		void(*shader_kernel)(KernelGlobals*, uint4*, float4*, int, int, int, int);
+		void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int);
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 		if(system_cpu_support_avx2())
@@ -374,8 +374,14 @@ public:
 
 		for(int sample = 0; sample < task.num_samples; sample++) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-				shader_kernel(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					task.shader_eval_type, x, task.offset, sample);
+				shader_kernel(&kg,
+				              (uint4*)task.shader_input,
+				              (float4*)task.shader_output,
+				              (float*)task.shader_output_luma,
+				              task.shader_eval_type,
+				              x,
+				              task.offset,
+				              sample);
 
 			if(task.get_cancel() || task_pool.canceled())
 				break;
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index d9d6fd77ecb..5c9ca3454c6 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -726,6 +726,7 @@ public:
 		CUfunction cuShader;
 		CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
 		CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
+		CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma);
 
 		/* get kernel function */
 		if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
@@ -747,13 +748,18 @@ public:
 				int shader_w = min(shader_chunk_size, end - shader_x);
 
 				/* pass in parameters */
-				void *args[] = {&d_input,
-								 &d_output,
-								 &task.shader_eval_type,
-								 &shader_x,
-								 &shader_w,
-								 &offset,
-								 &sample};
+				void *args[8];
+				int arg = 0;
+				args[arg++] = &d_input;
+				args[arg++] = &d_output;
+				if(task.shader_eval_type < SHADER_EVAL_BAKE) {
+					args[arg++] = &d_output_luma;
+				}
+				args[arg++] = &task.shader_eval_type;
+				args[arg++] = &shader_x;
+				args[arg++] = &shader_w;
+				args[arg++] = &offset;
+				args[arg++] = &sample;
 
 				/* launch kernel */
 				int threads_per_block;
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 8fb841b2b0d..069305e8a29 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -316,6 +316,7 @@ public:
 				if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half];
 				if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input];
 				if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output];
+				if(task.shader_output_luma) subtask.shader_output_luma = sub.ptr_map[task.shader_output_luma];
 
 				sub.device->task_add(subtask);
 			}
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index afa35224aba..23faa61e8e5 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -648,6 +648,9 @@ protected:
 			if(task.shader_output)
 				task.shader_output = device_ptr_from_client_pointer(task.shader_output);
 
+			if(task.shader_output)luma)
+				task.shader_output_luma = device_ptr_from_client_pointer(task.shader_output_luma);
+
 
 			task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
 			task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
index 2e751f6697f..60ecc1d0a86 100644
--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -132,7 +132,7 @@ public:
 		archive & type & task.x & task.y & task.w & task.h;
 		archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
 		archive & task.offset & task.stride;
-		archive & task.shader_input & task.shader_output & task.shader_eval_type;
+		archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
 		archive & task.shader_x & task.shader_w;
 		archive & task.need_finish_queue;
 	}
@@ -291,7 +291,7 @@ public:
 		*archive & type & task.x & task.y & task.w & task.h;
 		*archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
 		*archive & task.offset & task.stride;
-		*archive & task.shader_input & task.shader_output & task.shader_eval_type;
+		*archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
 		*archive & task.shader_x & task.shader_w;
 		*archive & task.need_finish_queue;
 
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index e0c602461ed..a1743f53831 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -1304,6 +1304,7 @@ public:
 		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
 		cl_mem d_input = CL_MEM_PTR(task.shader_input);
 		cl_mem d_output = CL_MEM_PTR(task.shader_output);
+		cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma);
 		cl_int d_shader_eval_type = task.shader_eval_type;
 		cl_int d_shader_x = task.shader_x;
 		cl_int d_shader_w = task.shader_w;
@@ -1330,6 +1331,12 @@ public:
 				                d_input,
 				                d_output);
 
+		if(task.shader_eval_type < SHADER_EVAL_BAKE) {
+			start_arg_index += kernel_set_args(kernel,
+			                                   start_arg_index,
+			                                   d_output_luma);
+		}
+
 #define KERNEL_TEX(type, ttype, name) \
 		set_kernel_arg_mem(kernel, &start_arg_index, #name);
 #include "kernel_textures.h"
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index d527540f300..0cae118a692 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
 DeviceTask::DeviceTask(Type type_)
 : type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0),
   sample(0), num_samples(1),
-  shader_input(0), shader_output(0),
+  shader_input(0), shader_output(0), shader_output_luma(0),
   shader_eval_type(0), shader_x(0), shader_w(0)
 {
 	last_update_time = time_dt();
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 834ea60988a..7654508d4a5 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -46,7 +46,7 @@ public:
 	int offset, stride;
 
 	device_ptr shader_input;
-	device_ptr shader_output;
+	device_ptr shader_output, shader_output_luma;
 	int shader_eval_type;
 	int shader_x, shader_w;
 
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 715c11c7ea0..b54afbd21b8 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -453,7 +453,13 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
 }
 
-ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
+ccl_device void kernel_shader_evaluate(KernelGlobals *kg,
+                                       ccl_global uint4 *input,
+                                       ccl_global float4 *output,
+                                       ccl_global float *output_luma,
+                                       ShaderEvalType type,
+                                       int i,
+                                       int sample)
 {
 	ShaderData sd;
 	uint4 in = input[i];
@@ -500,10 +506,22 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu
 	}
 	
 	/* write output */
-	if(sample == 0)
-		output[i] = make_float4(out.x, out.y, out.z, 0.0f);
-	else
-		output[i] += make_float4(out.x, out.y, out.z, 0.0f);
+	if(sample == 0) {
+		if(output != NULL) {
+			output[i] = make_float4(out.x, out.y, out.z, 0.0f);
+		}
+		if(output_luma != NULL) {
+			output_luma[i] = average(out);
+		}
+	}
+	else {
+		if(output != NULL) {
+			output[i] += make_float4(out.x, out.y, out.z, 0.0f);
+		}
+		if(output_luma != NULL) {
+			output_luma[i] += average(out);
+		}
+	}
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 2560c6d8dee..1ce1e41272b 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -42,6 +42,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        uint4 *input,
                                        float4 *output,
+                                       float *output_luma,
                                        int type,
                                        int i,
                                        int offset,
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 693285ec3a8..0249610b381 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -99,12 +99,14 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        uint4 *input,
                                        float4 *output,
+                                       float *output_luma,
                                        int type,
                                        int i,
                                        int offset,
                                        int sample)
 {
 	if(type >= SHADER_EVAL_BAKE) {
+		kernel_assert(output_luma == NULL);
 		kernel_bake_evaluate(kg,
 		                     input,
 		                     output,
@@ -117,6 +119,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 		kernel_shader_evaluate(kg,
 		                       input,
 		                       output,
+		                       output_luma,
 		                       (ShaderEvalType)type,
 		                       i,
 		                       sample);
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 3929b676f07..e094612de01 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -159,12 +159,26 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
+kernel_cuda_shader(uint4 *input,
+                   float4 *output,
+                   float *output_luma,
+                   int type,
+                   int sx,
+                   int sw,
+                   int offset,
+                   int sample)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
-	if(x < sx + sw)
-		kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
+	if(x < sx + sw) {
+		kernel_shader_evaluate(NULL,
+		                       input,
+		                       output,
+		                       output_luma,
+		                       (ShaderEvalType)type, 
+		                       x,
+		                       sample);
+	}
 }
 
 extern "C" __global__ void
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 57db6fd9098..4c9f7ba1d7c 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -61,6 +61,7 @@ __kernel void kernel_ocl_shader(
 	ccl_constant KernelData *data,
 	ccl_global uint4 *input,
 	ccl_global float4 *output,
+	ccl_global float *output_luma,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
@@ -78,8 +79,15 @@ __kernel void kernel_ocl_shader(
 
 	int x = sx + get_global_id(0);
 
-	if(x < sx + sw)
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+	if(x < sx + sw) {
+		kernel_shader_evaluate(kg,
+		                       input,
+		                       output,
+		                       output_luma,
+		                       (ShaderEvalType)type,
+		                       x,
+		                       sample);
+	}
 }
 
 __kernel void kernel_ocl_bake(