diff options
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 12 | ||||
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 20 | ||||
-rw-r--r-- | intern/cycles/device/device_multi.cpp | 1 | ||||
-rw-r--r-- | intern/cycles/device/device_network.cpp | 3 | ||||
-rw-r--r-- | intern/cycles/device/device_network.h | 4 | ||||
-rw-r--r-- | intern/cycles/device/device_opencl.cpp | 7 | ||||
-rw-r--r-- | intern/cycles/device/device_task.cpp | 2 | ||||
-rw-r--r-- | intern/cycles/device/device_task.h | 2 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_bake.h | 28 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cpu/kernel_cpu.h | 1 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h | 3 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/kernel.cu | 20 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/opencl/kernel.cl | 12 |
13 files changed, 91 insertions, 24 deletions
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index f06963c146e..832f4d1c1fd 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -343,7 +343,7 @@ public: #ifdef WITH_OSL OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif - void(*shader_kernel)(KernelGlobals*, uint4*, float4*, int, int, int, int); + void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int); #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 if(system_cpu_support_avx2()) @@ -374,8 +374,14 @@ public: for(int sample = 0; sample < task.num_samples; sample++) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) - shader_kernel(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, - task.shader_eval_type, x, task.offset, sample); + shader_kernel(&kg, + (uint4*)task.shader_input, + (float4*)task.shader_output, + (float*)task.shader_output_luma, + task.shader_eval_type, + x, + task.offset, + sample); if(task.get_cancel() || task_pool.canceled()) break; diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index d9d6fd77ecb..5c9ca3454c6 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -726,6 +726,7 @@ public: CUfunction cuShader; CUdeviceptr d_input = cuda_device_ptr(task.shader_input); CUdeviceptr d_output = cuda_device_ptr(task.shader_output); + CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma); /* get kernel function */ if(task.shader_eval_type >= SHADER_EVAL_BAKE) { @@ -747,13 +748,18 @@ public: int shader_w = min(shader_chunk_size, end - shader_x); /* pass in parameters */ - void *args[] = {&d_input, - &d_output, - &task.shader_eval_type, - &shader_x, - &shader_w, - &offset, - &sample}; + void *args[8]; + int arg = 0; + args[arg++] = &d_input; + args[arg++] = &d_output; + if(task.shader_eval_type < SHADER_EVAL_BAKE) { + args[arg++] = &d_output_luma; + } + args[arg++] = &task.shader_eval_type; + args[arg++] = &shader_x; + args[arg++] = &shader_w; + args[arg++] = &offset; + args[arg++] = &sample; /* launch kernel */ int threads_per_block; diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 8fb841b2b0d..069305e8a29 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -316,6 +316,7 @@ public: if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half]; if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input]; if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output]; + if(task.shader_output_luma) subtask.shader_output_luma = sub.ptr_map[task.shader_output_luma]; sub.device->task_add(subtask); } diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index afa35224aba..23faa61e8e5 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -648,6 +648,9 @@ protected: if(task.shader_output) task.shader_output = device_ptr_from_client_pointer(task.shader_output); + if(task.shader_output)luma) + task.shader_output_luma = device_ptr_from_client_pointer(task.shader_output_luma); + task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2); task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1); diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h index 2e751f6697f..60ecc1d0a86 100644 --- a/intern/cycles/device/device_network.h +++ b/intern/cycles/device/device_network.h @@ -132,7 +132,7 @@ public: archive & type & task.x & task.y & task.w & task.h; archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples; archive & task.offset & task.stride; - archive & task.shader_input & task.shader_output & task.shader_eval_type; + archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type; archive & task.shader_x & task.shader_w; archive & task.need_finish_queue; } @@ -291,7 +291,7 @@ public: *archive & type & task.x & task.y & task.w & task.h; *archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples; *archive & task.offset & task.stride; - *archive & task.shader_input & task.shader_output & task.shader_eval_type; + *archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type; *archive & task.shader_x & task.shader_w; *archive & task.need_finish_queue; diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index e0c602461ed..a1743f53831 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -1304,6 +1304,7 @@ public: cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); cl_mem d_input = CL_MEM_PTR(task.shader_input); cl_mem d_output = CL_MEM_PTR(task.shader_output); + cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma); cl_int d_shader_eval_type = task.shader_eval_type; cl_int d_shader_x = task.shader_x; cl_int d_shader_w = task.shader_w; @@ -1330,6 +1331,12 @@ public: d_input, d_output); + if(task.shader_eval_type < SHADER_EVAL_BAKE) { + start_arg_index += kernel_set_args(kernel, + start_arg_index, + d_output_luma); + } + #define KERNEL_TEX(type, ttype, name) \ set_kernel_arg_mem(kernel, &start_arg_index, #name); #include "kernel_textures.h" diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index d527540f300..0cae118a692 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN DeviceTask::DeviceTask(Type type_) : type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0), sample(0), num_samples(1), - shader_input(0), shader_output(0), + shader_input(0), shader_output(0), shader_output_luma(0), shader_eval_type(0), shader_x(0), shader_w(0) { last_update_time = time_dt(); diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 834ea60988a..7654508d4a5 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -46,7 +46,7 @@ public: int offset, stride; device_ptr shader_input; - device_ptr shader_output; + device_ptr shader_output, shader_output_luma; int shader_eval_type; int shader_x, shader_w; diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index 715c11c7ea0..b54afbd21b8 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -453,7 +453,13 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac; } -ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample) +ccl_device void kernel_shader_evaluate(KernelGlobals *kg, + ccl_global uint4 *input, + ccl_global float4 *output, + ccl_global float *output_luma, + ShaderEvalType type, + int i, + int sample) { ShaderData sd; uint4 in = input[i]; @@ -500,10 +506,22 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu } /* write output */ - if(sample == 0) - output[i] = make_float4(out.x, out.y, out.z, 0.0f); - else - output[i] += make_float4(out.x, out.y, out.z, 0.0f); + if(sample == 0) { + if(output != NULL) { + output[i] = make_float4(out.x, out.y, out.z, 0.0f); + } + if(output_luma != NULL) { + output_luma[i] = average(out); + } + } + else { + if(output != NULL) { + output[i] += make_float4(out.x, out.y, out.z, 0.0f); + } + if(output_luma != NULL) { + output_luma[i] += average(out); + } + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 2560c6d8dee..1ce1e41272b 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -42,6 +42,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, uint4 *input, float4 *output, + float *output_luma, int type, int i, int offset, diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index 693285ec3a8..0249610b381 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -99,12 +99,14 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg, void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, uint4 *input, float4 *output, + float *output_luma, int type, int i, int offset, int sample) { if(type >= SHADER_EVAL_BAKE) { + kernel_assert(output_luma == NULL); kernel_bake_evaluate(kg, input, output, @@ -117,6 +119,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg, kernel_shader_evaluate(kg, input, output, + output_luma, (ShaderEvalType)type, i, sample); diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu index 3929b676f07..e094612de01 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel.cu @@ -159,12 +159,26 @@ kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scal extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample) +kernel_cuda_shader(uint4 *input, + float4 *output, + float *output_luma, + int type, + int sx, + int sw, + int offset, + int sample) { int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - if(x < sx + sw) - kernel_shader_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample); + if(x < sx + sw) { + kernel_shader_evaluate(NULL, + input, + output, + output_luma, + (ShaderEvalType)type, + x, + sample); + } } extern "C" __global__ void diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl index 57db6fd9098..4c9f7ba1d7c 100644 --- a/intern/cycles/kernel/kernels/opencl/kernel.cl +++ b/intern/cycles/kernel/kernels/opencl/kernel.cl @@ -61,6 +61,7 @@ __kernel void kernel_ocl_shader( ccl_constant KernelData *data, ccl_global uint4 *input, ccl_global float4 *output, + ccl_global float *output_luma, #define KERNEL_TEX(type, ttype, name) \ ccl_global type *name, @@ -78,8 +79,15 @@ __kernel void kernel_ocl_shader( int x = sx + get_global_id(0); - if(x < sx + sw) - kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample); + if(x < sx + sw) { + kernel_shader_evaluate(kg, + input, + output, + output_luma, + (ShaderEvalType)type, + x, + sample); + } } __kernel void kernel_ocl_bake( |