Cycles: Optionally output luminance from the shader evaluation kernel

This makes it possible to move some parts of evaluation from the host to the device and hopefully reduce memory usage by avoiding having a full RGBA buffer on the host. Reviewers: juicyfruit, lukasstockner97, brecht Reviewed By: lukasstockner97, brecht Differential Revision: https://developer.blender.org/D1702
author: Sergey Sharybin <sergey.vfx@gmail.com> 2015-12-30 17:04:01 +0300
committer: Sergey Sharybin <sergey.vfx@gmail.com> 2015-12-30 17:04:04 +0300
commit: 3918c8b9a52ae9dcdb0488df92d7d3ca615be8c7 (patch)
tree: 3740e477610ab3ed020d505cd98308d29f663f5b /intern
parent: c8a551bf13edf711b93ea89cd3fcd244e4206cee (diff)
13 files changed, 91 insertions, 24 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index f06963c146e..832f4d1c1fd 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -343,7 +343,7 @@ public:
 #ifdef WITH_OSL
 		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
-		void(*shader_kernel)(KernelGlobals*, uint4*, float4*, int, int, int, int);
+		void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int);
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 		if(system_cpu_support_avx2())
@@ -374,8 +374,14 @@ public:
 
 		for(int sample = 0; sample < task.num_samples; sample++) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-				shader_kernel(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					task.shader_eval_type, x, task.offset, sample);
+				shader_kernel(&kg,
+				              (uint4*)task.shader_input,
+				              (float4*)task.shader_output,
+				              (float*)task.shader_output_luma,
+				              task.shader_eval_type,
+				              x,
+				              task.offset,
+				              sample);
 
 			if(task.get_cancel() || task_pool.canceled())
 				break;
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index d9d6fd77ecb..5c9ca3454c6 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -726,6 +726,7 @@ public:
 		CUfunction cuShader;
 		CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
 		CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
+		CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma);
 
 		/* get kernel function */
 		if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
@@ -747,13 +748,18 @@ public:
 				int shader_w = min(shader_chunk_size, end - shader_x);
 
 				/* pass in parameters */
-				void *args[] = {&d_input,
-								 &d_output,
-								 &task.shader_eval_type,
-								 &shader_x,
-								 &shader_w,
-								 &offset,
-								 &sample};
+				void *args[8];
+				int arg = 0;
+				args[arg++] = &d_input;
+				args[arg++] = &d_output;
+				if(task.shader_eval_type < SHADER_EVAL_BAKE) {
+					args[arg++] = &d_output_luma;
+				}
+				args[arg++] = &task.shader_eval_type;
+				args[arg++] = &shader_x;
+				args[arg++] = &shader_w;
+				args[arg++] = &offset;
+				args[arg++] = &sample;
 
 				/* launch kernel */
 				int threads_per_block;
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 8fb841b2b0d..069305e8a29 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -316,6 +316,7 @@ public:
 				if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half];
 				if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input];
 				if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output];
+				if(task.shader_output_luma) subtask.shader_output_luma = sub.ptr_map[task.shader_output_luma];
 
 				sub.device->task_add(subtask);
 			}
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index afa35224aba..23faa61e8e5 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -648,6 +648,9 @@ protected:
 			if(task.shader_output)
 				task.shader_output = device_ptr_from_client_pointer(task.shader_output);
 
+			if(task.shader_output_luma)
+				task.shader_output_luma = device_ptr_from_client_pointer(task.shader_output_luma);
+
 
 			task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
 			task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
index 2e751f6697f..60ecc1d0a86 100644
--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -132,7 +132,7 @@ public:
 		archive & type & task.x & task.y & task.w & task.h;
 		archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
 		archive & task.offset & task.stride;
-		archive & task.shader_input & task.shader_output & task.shader_eval_type;
+		archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
 		archive & task.shader_x & task.shader_w;
 		archive & task.need_finish_queue;
 	}
@@ -291,7 +291,7 @@ public:
 		*archive & type & task.x & task.y & task.w & task.h;
 		*archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
 		*archive & task.offset & task.stride;
-		*archive & task.shader_input & task.shader_output & task.shader_eval_type;
+		*archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
 		*archive & task.shader_x & task.shader_w;
 		*archive & task.need_finish_queue;
 
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index e0c602461ed..a1743f53831 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -1304,6 +1304,7 @@ public:
 		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
 		cl_mem d_input = CL_MEM_PTR(task.shader_input);
 		cl_mem d_output = CL_MEM_PTR(task.shader_output);
+		cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma);
 		cl_int d_shader_eval_type = task.shader_eval_type;
 		cl_int d_shader_x = task.shader_x;
 		cl_int d_shader_w = task.shader_w;
@@ -1330,6 +1331,12 @@ public:
 				                d_input,
 				                d_output);
 
+		if(task.shader_eval_type < SHADER_EVAL_BAKE) {
+			start_arg_index += kernel_set_args(kernel,
+			                                   start_arg_index,
+			                                   d_output_luma);
+		}
+
 #define KERNEL_TEX(type, ttype, name) \
 		set_kernel_arg_mem(kernel, &start_arg_index, #name);
 #include "kernel_textures.h"
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index d527540f300..0cae118a692 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
 DeviceTask::DeviceTask(Type type_)
 : type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0),
   sample(0), num_samples(1),
-  shader_input(0), shader_output(0),
+  shader_input(0), shader_output(0), shader_output_luma(0),
   shader_eval_type(0), shader_x(0), shader_w(0)
 {
 	last_update_time = time_dt();
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 834ea60988a..7654508d4a5 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -46,7 +46,7 @@ public:
 	int offset, stride;
 
 	device_ptr shader_input;
-	device_ptr shader_output;
+	device_ptr shader_output, shader_output_luma;
 	int shader_eval_type;
 	int shader_x, shader_w;
 
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 715c11c7ea0..b54afbd21b8 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -453,7 +453,13 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
 }
 
-ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
+ccl_device void kernel_shader_evaluate(KernelGlobals *kg,
+                                       ccl_global uint4 *input,
+                                       ccl_global float4 *output,
+                                       ccl_global float *output_luma,
+                                       ShaderEvalType type,
+                                       int i,
+                                       int sample)
 {
 	ShaderData sd;
 	uint4 in = input[i];
@@ -500,10 +506,22 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu
 	}
 	
 	/* write output */
-	if(sample == 0)
-		output[i] = make_float4(out.x, out.y, out.z, 0.0f);
-	else
-		output[i] += make_float4(out.x, out.y, out.z, 0.0f);
+	if(sample == 0) {
+		if(output != NULL) {
+			output[i] = make_float4(out.x, out.y, out.z, 0.0f);
+		}
+		if(output_luma != NULL) {
+			output_luma[i] = average(out);
+		}
+	}
+	else {
+		if(output != NULL) {
+			output[i] += make_float4(out.x, out.y, out.z, 0.0f);
+		}
+		if(output_luma != NULL) {
+			output_luma[i] += average(out);
+		}
+	}
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 2560c6d8dee..1ce1e41272b 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -42,6 +42,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        uint4 *input,
                                        float4 *output,
+                                       float *output_luma,
                                        int type,
                                        int i,
                                        int offset,
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 693285ec3a8..0249610b381 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -99,12 +99,14 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        uint4 *input,
                                        float4 *output,
+                                       float *output_luma,
                                        int type,
                                        int i,
                                        int offset,
                                        int sample)
 {
 	if(type >= SHADER_EVAL_BAKE) {
+		kernel_assert(output_luma == NULL);
 		kernel_bake_evaluate(kg,
 		                     input,
 		                     output,
@@ -117,6 +119,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
 		kernel_shader_evaluate(kg,
 		                       input,
 		                       output,
+		                       output_luma,
 		                       (ShaderEvalType)type,
 		                       i,
 		                       sample);
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 3929b676f07..e094612de01 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -159,12 +159,26 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
+kernel_cuda_shader(uint4 *input,
+                   float4 *output,
+                   float *output_luma,
+                   int type,
+                   int sx,
+                   int sw,
+                   int offset,
+                   int sample)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
-	if(x < sx + sw)
-		kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
+	if(x < sx + sw) {
+		kernel_shader_evaluate(NULL,
+		                       input,
+		                       output,
+		                       output_luma,
+		                       (ShaderEvalType)type, 
+		                       x,
+		                       sample);
+	}
 }
 
 extern "C" __global__ void
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 57db6fd9098..4c9f7ba1d7c 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -61,6 +61,7 @@ __kernel void kernel_ocl_shader(
 	ccl_constant KernelData *data,
 	ccl_global uint4 *input,
 	ccl_global float4 *output,
+	ccl_global float *output_luma,
 
 #define KERNEL_TEX(type, ttype, name) \
 	ccl_global type *name,
@@ -78,8 +79,15 @@ __kernel void kernel_ocl_shader(
 
 	int x = sx + get_global_id(0);
 
-	if(x < sx + sw)
-		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+	if(x < sx + sw) {
+		kernel_shader_evaluate(kg,
+		                       input,
+		                       output,
+		                       output_luma,
+		                       (ShaderEvalType)type,
+		                       x,
+		                       sample);
+	}
 }
 
 __kernel void kernel_ocl_bake(
author	Sergey Sharybin <sergey.vfx@gmail.com>	2015-12-30 17:04:01 +0300
committer	Sergey Sharybin <sergey.vfx@gmail.com>	2015-12-30 17:04:04 +0300
commit	3918c8b9a52ae9dcdb0488df92d7d3ca615be8c7 (patch)
tree	3740e477610ab3ed020d505cd98308d29f663f5b /intern
parent	c8a551bf13edf711b93ea89cd3fcd244e4206cee (diff)