Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorSergey Sharybin <sergey.vfx@gmail.com>2016-05-31 18:47:54 +0300
committerSergey Sharybin <sergey.vfx@gmail.com>2016-05-31 18:48:42 +0300
commitd2bb0e660bdec00164a9fc72d145e308fb723d16 (patch)
tree118db6c99469065ccd804af55f25ba486a3875a5 /intern
parentad1c3bef8bb0b405d22bfed8007629276cb0d68f (diff)
Fix T46207: Slow OpenCL GPU bake and blown out baking Cycles render
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/device/device_opencl.cpp81
-rw-r--r--intern/cycles/kernel/kernel_bake.h8
-rw-r--r--intern/cycles/render/bake.cpp2
3 files changed, 50 insertions, 41 deletions
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index c7dcf7602df..afe21c49730 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -1224,18 +1224,28 @@ public:
CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
clGetDeviceInfo(cdDevice,
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
-
- /* try to divide evenly over 2 dimensions */
+
+ /* Try to divide evenly over 2 dimensions. */
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
- /* some implementations have max size 1 on 2nd dimension */
+ /* Some implementations have max size 1 on 2nd dimension. */
if(local_size[1] > max_work_items[1]) {
local_size[0] = workgroup_size/max_work_items[1];
local_size[1] = max_work_items[1];
}
- size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};
+ size_t global_size[2] = {global_size_round_up(local_size[0], w),
+ global_size_round_up(local_size[1], h)};
+
+ /* Vertical size of 1 is coming from bake/shade kernels where we should
+ * not round anything up because otherwise we'll either be doing too
+ * much work per pixel (if we don't check global ID on Y axis) or will
+ * be checking for global ID to always have Y of 0.
+ */
+ if (h == 1) {
+ global_size[h] = 1;
+ }
/* run kernel */
opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
@@ -1320,48 +1330,49 @@ public:
else
kernel = ckShaderKernel;
- for(int sample = 0; sample < task.num_samples; sample++) {
-
- if(task.get_cancel())
- break;
-
- cl_int d_sample = sample;
-
- cl_uint start_arg_index =
- kernel_set_args(kernel,
- 0,
- d_data,
- d_input,
- d_output);
+ cl_uint start_arg_index =
+ kernel_set_args(kernel,
+ 0,
+ d_data,
+ d_input,
+ d_output);
- if(task.shader_eval_type < SHADER_EVAL_BAKE) {
- start_arg_index += kernel_set_args(kernel,
- start_arg_index,
- d_output_luma);
- }
+ if(task.shader_eval_type < SHADER_EVAL_BAKE) {
+ start_arg_index += kernel_set_args(kernel,
+ start_arg_index,
+ d_output_luma);
+ }
#define KERNEL_TEX(type, ttype, name) \
- set_kernel_arg_mem(kernel, &start_arg_index, #name);
+ set_kernel_arg_mem(kernel, &start_arg_index, #name);
#include "kernel_textures.h"
#undef KERNEL_TEX
+ start_arg_index += kernel_set_args(kernel,
+ start_arg_index,
+ d_shader_eval_type);
+ if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
start_arg_index += kernel_set_args(kernel,
start_arg_index,
- d_shader_eval_type);
- if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
- start_arg_index += kernel_set_args(kernel,
- start_arg_index,
- d_shader_filter);
- }
- start_arg_index += kernel_set_args(kernel,
- start_arg_index,
- d_shader_x,
- d_shader_w,
- d_offset,
- d_sample);
+ d_shader_filter);
+ }
+ start_arg_index += kernel_set_args(kernel,
+ start_arg_index,
+ d_shader_x,
+ d_shader_w,
+ d_offset);
+
+ for(int sample = 0; sample < task.num_samples; sample++) {
+
+ if(task.get_cancel())
+ break;
+
+ kernel_set_args(kernel, start_arg_index, sample);
enqueue_kernel(kernel, task.shader_w, 1);
+ clFinish(cqCommandQueue);
+
task.update_progress(NULL);
}
}
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 3966a06fe33..8d05befe1d4 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -482,12 +482,10 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
}
/* write output */
- float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
+ const float output_fac = is_aa_pass(type)? 1.0f/num_samples: 1.0f;
+ const float4 scaled_result = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
- if(sample == 0)
- output[i] = make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
- else
- output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
+ output[i] = (sample == 0)? scaled_result: output[i] + scaled_result;
}
#endif /* __BAKING__ */
diff --git a/intern/cycles/render/bake.cpp b/intern/cycles/render/bake.cpp
index 5bf5e5113ef..13310a61761 100644
--- a/intern/cycles/render/bake.cpp
+++ b/intern/cycles/render/bake.cpp
@@ -177,7 +177,7 @@ bool BakeManager::bake(Device *device, DeviceScene *dscene, Scene *scene, Progre
device->mem_alloc(d_input, MEM_READ_ONLY);
device->mem_copy_to(d_input);
- device->mem_alloc(d_output, MEM_WRITE_ONLY);
+ device->mem_alloc(d_output, MEM_READ_WRITE);
DeviceTask task(DeviceTask::SHADER);
task.shader_input = d_input.device_pointer;