diff options
| author    | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2014-06-06 16:40:09 +0400 |
|-----------|-----------------------------------------------|---------------------------|
| committer | Brecht Van Lommel <brechtvanlommel@gmail.com> | 2014-06-06 17:39:04 +0400 |
| commit | e4e58d46128dc7fe4fb9b881d73b38173f00f5c3 (patch)                |
| tree   | cc38ac39838bec84d28de396374ba022139a8aa2 /intern/cycles/device |
| parent | 553264ff8e20484d0b91bb468f56aa1b7144f7aa (diff)                 |
Fix T40370: cycles CUDA baking timeout with high number of AA samples.
Now baking does one AA sample at a time, just like final render. There is
also some code for shader antialiasing that solves T40369 but it is disabled
for now because there may be unpredictable side effects.
Diffstat (limited to 'intern/cycles/device')
| -rw-r--r-- | intern/cycles/device/device_cpu.cpp    | 15 |
| -rw-r--r-- | intern/cycles/device/device_cuda.cpp   | 51 |
| -rw-r--r-- | intern/cycles/device/device_opencl.cpp | 21 |
3 files changed, 51 insertions(+), 36 deletions(-)
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index b0739dd20b4..71bf2d23d6e 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -393,7 +393,8 @@ public: #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX if(system_cpu_support_avx()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { - kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + for(int sample = 0; sample < task.num_samples; sample++) + kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample); if(task.get_cancel() || task_pool.canceled()) break; @@ -404,7 +405,8 @@ public: #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 if(system_cpu_support_sse41()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { - kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + for(int sample = 0; sample < task.num_samples; sample++) + kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample); if(task.get_cancel() || task_pool.canceled()) break; @@ -415,7 +417,8 @@ public: #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 if(system_cpu_support_sse3()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { - kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + for(int sample = 0; sample < task.num_samples; sample++) + kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample); if(task.get_cancel() || task_pool.canceled()) break; @@ -426,7 +429,8 @@ public: #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 if(system_cpu_support_sse2()) { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { - kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, 
(float4*)task.shader_output, task.shader_eval_type, x); + for(int sample = 0; sample < task.num_samples; sample++) + kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample); if(task.get_cancel() || task_pool.canceled()) break; @@ -436,7 +440,8 @@ public: #endif { for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) { - kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x); + for(int sample = 0; sample < task.num_samples; sample++) + kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample); if(task.get_cancel() || task_pool.canceled()) break; diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 0429bfc6e97..0aa09ac5383 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -764,40 +764,45 @@ public: int shader_w = min(shader_chunk_size, end - shader_x); - /* pass in parameters */ - int offset = 0; + for(int sample = 0; sample < task.num_samples; sample++) { + /* pass in parameters */ + int offset = 0; - cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input))); - offset += sizeof(d_input); + cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input))); + offset += sizeof(d_input); - cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output))); - offset += sizeof(d_output); + cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output))); + offset += sizeof(d_output); - int shader_eval_type = task.shader_eval_type; - offset = align_up(offset, __alignof(shader_eval_type)); + int shader_eval_type = task.shader_eval_type; + offset = align_up(offset, __alignof(shader_eval_type)); - cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type)); - offset += sizeof(task.shader_eval_type); + cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type)); + offset += 
sizeof(task.shader_eval_type); - cuda_assert(cuParamSeti(cuShader, offset, shader_x)); - offset += sizeof(shader_x); + cuda_assert(cuParamSeti(cuShader, offset, shader_x)); + offset += sizeof(shader_x); - cuda_assert(cuParamSeti(cuShader, offset, shader_w)); - offset += sizeof(shader_w); + cuda_assert(cuParamSeti(cuShader, offset, shader_w)); + offset += sizeof(shader_w); - cuda_assert(cuParamSetSize(cuShader, offset)); + cuda_assert(cuParamSeti(cuShader, offset, sample)); + offset += sizeof(sample); - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); + cuda_assert(cuParamSetSize(cuShader, offset)); - int xblocks = (shader_w + threads_per_block - 1)/threads_per_block; + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); - cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1)); - cuda_assert(cuLaunchGrid(cuShader, xblocks, 1)); + int xblocks = (shader_w + threads_per_block - 1)/threads_per_block; - cuda_assert(cuCtxSynchronize()); + cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1)); + cuda_assert(cuLaunchGrid(cuShader, xblocks, 1)); + + cuda_assert(cuCtxSynchronize()); + } } cuda_pop_context(); diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index f841daba124..abfe445414a 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -1067,19 +1067,24 @@ public: else kernel = ckShaderKernel; - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data)); - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input)); - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), 
(void*)&d_output)); + for(int sample = 0; sample < task.num_samples; sample++) { + cl_int d_sample = task.sample; + + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output)); #define KERNEL_TEX(type, ttype, name) \ - set_kernel_arg_mem(kernel, &narg, #name); + set_kernel_arg_mem(kernel, &narg, #name); #include "kernel_textures.h" - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type)); - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x)); - opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w)); + opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample)); - enqueue_kernel(kernel, task.shader_w, 1); + enqueue_kernel(kernel, task.shader_w, 1); + } } void thread_run(DeviceTask *task) |