git.blender.org/blender.git

Diffstat (limited to 'intern/cycles/device')
-rw-r--r--  intern/cycles/device/CMakeLists.txt              92
-rw-r--r--  intern/cycles/device/device.cpp                 921
-rw-r--r--  intern/cycles/device/device.h                   744
-rw-r--r--  intern/cycles/device/device_cpu.cpp            2094
-rw-r--r--  intern/cycles/device/device_cuda.cpp           5137
-rw-r--r--  intern/cycles/device/device_denoising.cpp       537
-rw-r--r--  intern/cycles/device/device_denoising.h         320
-rw-r--r--  intern/cycles/device/device_intern.h             27
-rw-r--r--  intern/cycles/device/device_memory.cpp          113
-rw-r--r--  intern/cycles/device/device_memory.h            678
-rw-r--r--  intern/cycles/device/device_multi.cpp           769
-rw-r--r--  intern/cycles/device/device_network.cpp        1453
-rw-r--r--  intern/cycles/device/device_network.h           843
-rw-r--r--  intern/cycles/device/device_opencl.cpp          373
-rw-r--r--  intern/cycles/device/device_split_kernel.cpp    517
-rw-r--r--  intern/cycles/device/device_split_kernel.h      185
-rw-r--r--  intern/cycles/device/device_task.cpp            175
-rw-r--r--  intern/cycles/device/device_task.h              155
-rw-r--r--  intern/cycles/device/opencl/memory_manager.cpp  361
-rw-r--r--  intern/cycles/device/opencl/memory_manager.h     97
-rw-r--r--  intern/cycles/device/opencl/opencl.h           1222
-rw-r--r--  intern/cycles/device/opencl/opencl_split.cpp   3506
-rw-r--r--  intern/cycles/device/opencl/opencl_util.cpp    1948
23 files changed, 11227 insertions, 11040 deletions
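
Everything below appears to be a formatting-only migration to clang-format: tab indentation becomes two-space indentation, keywords such as if/while/foreach gain a space before the opening parenthesis, long parameter lists are wrapped one argument per line, and one-line inline bodies are expanded onto separate lines. No behavioral change is visible in these hunks, consistent with the near-equal insertion and deletion counts above. A minimal before/after illustration of the style change (a hypothetical snippet, not taken from the patch):

/* Old Cycles style: tab indent, no space after the keyword. */
void old_style(bool background)
{
	if(background) {
		return;
	}
}

/* New clang-format style: two-space indent, space after the keyword. */
void new_style(bool background)
{
  if (background) {
    return;
  }
}
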
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index d804a07bcab..75f4a72bee3 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -1,61 +1,61 @@
set(INC
- ..
- ../../glew-mx
+ ..
+ ../../glew-mx
)
set(INC_SYS
- ${GLEW_INCLUDE_DIR}
- ../../../extern/clew/include
+ ${GLEW_INCLUDE_DIR}
+ ../../../extern/clew/include
)
if(WITH_CUDA_DYNLOAD)
- list(APPEND INC
- ../../../extern/cuew/include
- )
- add_definitions(-DWITH_CUDA_DYNLOAD)
+ list(APPEND INC
+ ../../../extern/cuew/include
+ )
+ add_definitions(-DWITH_CUDA_DYNLOAD)
else()
- list(APPEND INC_SYS
- ${CUDA_TOOLKIT_INCLUDE}
- )
- add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}")
+ list(APPEND INC_SYS
+ ${CUDA_TOOLKIT_INCLUDE}
+ )
+ add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}")
endif()
set(SRC
- device.cpp
- device_cpu.cpp
- device_cuda.cpp
- device_denoising.cpp
- device_memory.cpp
- device_multi.cpp
- device_opencl.cpp
- device_split_kernel.cpp
- device_task.cpp
+ device.cpp
+ device_cpu.cpp
+ device_cuda.cpp
+ device_denoising.cpp
+ device_memory.cpp
+ device_multi.cpp
+ device_opencl.cpp
+ device_split_kernel.cpp
+ device_task.cpp
)
set(SRC_OPENCL
- opencl/opencl.h
- opencl/memory_manager.h
+ opencl/opencl.h
+ opencl/memory_manager.h
- opencl/opencl_split.cpp
- opencl/opencl_util.cpp
- opencl/memory_manager.cpp
+ opencl/opencl_split.cpp
+ opencl/opencl_util.cpp
+ opencl/memory_manager.cpp
)
if(WITH_CYCLES_NETWORK)
- list(APPEND SRC
- device_network.cpp
- )
+ list(APPEND SRC
+ device_network.cpp
+ )
endif()
set(SRC_HEADERS
- device.h
- device_denoising.h
- device_memory.h
- device_intern.h
- device_network.h
- device_split_kernel.h
- device_task.h
+ device.h
+ device_denoising.h
+ device_memory.h
+ device_intern.h
+ device_network.h
+ device_split_kernel.h
+ device_task.h
)
set(LIB
@@ -63,27 +63,27 @@ set(LIB
)
if(WITH_CUDA_DYNLOAD)
- list(APPEND LIB
- extern_cuew
- )
+ list(APPEND LIB
+ extern_cuew
+ )
else()
- list(APPEND LIB
- ${CUDA_CUDA_LIBRARY}
- )
+ list(APPEND LIB
+ ${CUDA_CUDA_LIBRARY}
+ )
endif()
add_definitions(${GL_DEFINITIONS})
if(WITH_CYCLES_NETWORK)
- add_definitions(-DWITH_NETWORK)
+ add_definitions(-DWITH_NETWORK)
endif()
if(WITH_CYCLES_DEVICE_OPENCL)
- add_definitions(-DWITH_OPENCL)
+ add_definitions(-DWITH_OPENCL)
endif()
if(WITH_CYCLES_DEVICE_CUDA)
- add_definitions(-DWITH_CUDA)
+ add_definitions(-DWITH_CUDA)
endif()
if(WITH_CYCLES_DEVICE_MULTI)
- add_definitions(-DWITH_MULTI)
+ add_definitions(-DWITH_MULTI)
endif()
include_directories(${INC})
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index e74637472ef..16a68e8b855 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -44,572 +44,577 @@ uint Device::devices_initialized_mask = 0;
/* Device Requested Features */
-std::ostream& operator <<(std::ostream &os,
- const DeviceRequestedFeatures& requested_features)
+std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features)
{
- os << "Experimental features: "
- << (requested_features.experimental ? "On" : "Off") << std::endl;
- os << "Max nodes group: " << requested_features.max_nodes_group << std::endl;
- /* TODO(sergey): Decode bitflag into list of names. */
- os << "Nodes features: " << requested_features.nodes_features << std::endl;
- os << "Use Hair: "
- << string_from_bool(requested_features.use_hair) << std::endl;
- os << "Use Object Motion: "
- << string_from_bool(requested_features.use_object_motion) << std::endl;
- os << "Use Camera Motion: "
- << string_from_bool(requested_features.use_camera_motion) << std::endl;
- os << "Use Baking: "
- << string_from_bool(requested_features.use_baking) << std::endl;
- os << "Use Subsurface: "
- << string_from_bool(requested_features.use_subsurface) << std::endl;
- os << "Use Volume: "
- << string_from_bool(requested_features.use_volume) << std::endl;
- os << "Use Branched Integrator: "
- << string_from_bool(requested_features.use_integrator_branched) << std::endl;
- os << "Use Patch Evaluation: "
- << string_from_bool(requested_features.use_patch_evaluation) << std::endl;
- os << "Use Transparent Shadows: "
- << string_from_bool(requested_features.use_transparent) << std::endl;
- os << "Use Principled BSDF: "
- << string_from_bool(requested_features.use_principled) << std::endl;
- os << "Use Denoising: "
- << string_from_bool(requested_features.use_denoising) << std::endl;
- os << "Use Displacement: "
- << string_from_bool(requested_features.use_true_displacement) << std::endl;
- os << "Use Background Light: "
- << string_from_bool(requested_features.use_background_light) << std::endl;
- return os;
+ os << "Experimental features: " << (requested_features.experimental ? "On" : "Off") << std::endl;
+ os << "Max nodes group: " << requested_features.max_nodes_group << std::endl;
+ /* TODO(sergey): Decode bitflag into list of names. */
+ os << "Nodes features: " << requested_features.nodes_features << std::endl;
+ os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl;
+ os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion)
+ << std::endl;
+ os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion)
+ << std::endl;
+ os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl;
+ os << "Use Subsurface: " << string_from_bool(requested_features.use_subsurface) << std::endl;
+ os << "Use Volume: " << string_from_bool(requested_features.use_volume) << std::endl;
+ os << "Use Branched Integrator: " << string_from_bool(requested_features.use_integrator_branched)
+ << std::endl;
+ os << "Use Patch Evaluation: " << string_from_bool(requested_features.use_patch_evaluation)
+ << std::endl;
+ os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent)
+ << std::endl;
+ os << "Use Principled BSDF: " << string_from_bool(requested_features.use_principled)
+ << std::endl;
+ os << "Use Denoising: " << string_from_bool(requested_features.use_denoising) << std::endl;
+ os << "Use Displacement: " << string_from_bool(requested_features.use_true_displacement)
+ << std::endl;
+ os << "Use Background Light: " << string_from_bool(requested_features.use_background_light)
+ << std::endl;
+ return os;
}
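
The operator above turns the feature struct into one human-readable "Name: On/Off" line per flag, which is handy for logging the configuration a kernel was built for. A self-contained sketch of the same stream-operator pattern, using a hypothetical two-field stand-in for DeviceRequestedFeatures (the real class is declared in device.h later in this patch):

#include <iostream>

/* MiniFeatures is a made-up stand-in for DeviceRequestedFeatures. */
struct MiniFeatures {
  bool use_hair = false;
  bool use_volume = true;
};

static const char *string_from_bool(bool b)
{
  return b ? "On" : "Off";
}

std::ostream &operator<<(std::ostream &os, const MiniFeatures &f)
{
  os << "Use Hair: " << string_from_bool(f.use_hair) << std::endl;
  os << "Use Volume: " << string_from_bool(f.use_volume) << std::endl;
  return os;
}

int main()
{
  MiniFeatures f;
  std::cout << f;  /* prints each feature flag on its own line */
  return 0;
}
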
/* Device */
Device::~Device()
{
- if(!background) {
- if(vertex_buffer != 0) {
- glDeleteBuffers(1, &vertex_buffer);
- }
- if(fallback_shader_program != 0) {
- glDeleteProgram(fallback_shader_program);
- }
- }
+ if (!background) {
+ if (vertex_buffer != 0) {
+ glDeleteBuffers(1, &vertex_buffer);
+ }
+ if (fallback_shader_program != 0) {
+ glDeleteProgram(fallback_shader_program);
+ }
+ }
}
/* TODO move shaders to standalone .glsl file. */
const char *FALLBACK_VERTEX_SHADER =
-"#version 330\n"
-"uniform vec2 fullscreen;\n"
-"in vec2 texCoord;\n"
-"in vec2 pos;\n"
-"out vec2 texCoord_interp;\n"
-"\n"
-"vec2 normalize_coordinates()\n"
-"{\n"
-" return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n"
-"}\n"
-"\n"
-"void main()\n"
-"{\n"
-" gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n"
-" texCoord_interp = texCoord;\n"
-"}\n\0";
+ "#version 330\n"
+ "uniform vec2 fullscreen;\n"
+ "in vec2 texCoord;\n"
+ "in vec2 pos;\n"
+ "out vec2 texCoord_interp;\n"
+ "\n"
+ "vec2 normalize_coordinates()\n"
+ "{\n"
+ " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n"
+ "}\n"
+ "\n"
+ "void main()\n"
+ "{\n"
+ " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n"
+ " texCoord_interp = texCoord;\n"
+ "}\n\0";
const char *FALLBACK_FRAGMENT_SHADER =
-"#version 330\n"
-"uniform sampler2D image_texture;\n"
-"in vec2 texCoord_interp;\n"
-"out vec4 fragColor;\n"
-"\n"
-"void main()\n"
-"{\n"
-" fragColor = texture(image_texture, texCoord_interp);\n"
-"}\n\0";
+ "#version 330\n"
+ "uniform sampler2D image_texture;\n"
+ "in vec2 texCoord_interp;\n"
+ "out vec4 fragColor;\n"
+ "\n"
+ "void main()\n"
+ "{\n"
+ " fragColor = texture(image_texture, texCoord_interp);\n"
+ "}\n\0";
static void shader_print_errors(const char *task, const char *log, const char *code)
{
- LOG(ERROR) << "Shader: " << task << " error:";
- LOG(ERROR) << "===== shader string ====";
-
- stringstream stream(code);
- string partial;
-
- int line = 1;
- while(getline(stream, partial, '\n')) {
- if(line < 10) {
- LOG(ERROR) << " " << line << " " << partial;
- }
- else {
- LOG(ERROR) << line << " " << partial;
- }
- line++;
- }
- LOG(ERROR) << log;
+ LOG(ERROR) << "Shader: " << task << " error:";
+ LOG(ERROR) << "===== shader string ====";
+
+ stringstream stream(code);
+ string partial;
+
+ int line = 1;
+ while (getline(stream, partial, '\n')) {
+ if (line < 10) {
+ LOG(ERROR) << " " << line << " " << partial;
+ }
+ else {
+ LOG(ERROR) << line << " " << partial;
+ }
+ line++;
+ }
+ LOG(ERROR) << log;
}
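
shader_print_errors dumps the failing source with 1-based line numbers, padding single-digit numbers with a leading space so they align with two-digit lines. A self-contained sketch of that numbering loop, with printf standing in for the LOG(ERROR) macro (which Cycles gets from its logging wrapper):

#include <cstdio>
#include <sstream>
#include <string>

static void print_numbered(const char *code)
{
  std::stringstream stream(code);
  std::string partial;
  int line = 1;
  while (std::getline(stream, partial, '\n')) {
    /* Pad single-digit line numbers so columns align, as the code above does. */
    std::printf(line < 10 ? " %d %s\n" : "%d %s\n", line, partial.c_str());
    line++;
  }
}

int main()
{
  print_numbered("void main()\n{\n}\n");
  return 0;
}
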
static int bind_fallback_shader(void)
{
- GLint status;
- GLchar log[5000];
- GLsizei length = 0;
- GLuint program = 0;
+ GLint status;
+ GLchar log[5000];
+ GLsizei length = 0;
+ GLuint program = 0;
- struct Shader {
- const char *source;
- GLenum type;
- } shaders[2] = {
- {FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER},
- {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}
- };
+ struct Shader {
+ const char *source;
+ GLenum type;
+ } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER},
+ {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}};
- program = glCreateProgram();
+ program = glCreateProgram();
- for(int i = 0; i < 2; i++) {
- GLuint shader = glCreateShader(shaders[i].type);
+ for (int i = 0; i < 2; i++) {
+ GLuint shader = glCreateShader(shaders[i].type);
- string source_str = shaders[i].source;
- const char *c_str = source_str.c_str();
+ string source_str = shaders[i].source;
+ const char *c_str = source_str.c_str();
- glShaderSource(shader, 1, &c_str, NULL);
- glCompileShader(shader);
+ glShaderSource(shader, 1, &c_str, NULL);
+ glCompileShader(shader);
- glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
+ glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
- if(!status) {
- glGetShaderInfoLog(shader, sizeof(log), &length, log);
- shader_print_errors("compile", log, c_str);
- return 0;
- }
+ if (!status) {
+ glGetShaderInfoLog(shader, sizeof(log), &length, log);
+ shader_print_errors("compile", log, c_str);
+ return 0;
+ }
- glAttachShader(program, shader);
- }
+ glAttachShader(program, shader);
+ }
- /* Link output. */
- glBindFragDataLocation(program, 0, "fragColor");
+ /* Link output. */
+ glBindFragDataLocation(program, 0, "fragColor");
- /* Link and error check. */
- glLinkProgram(program);
+ /* Link and error check. */
+ glLinkProgram(program);
- glGetProgramiv(program, GL_LINK_STATUS, &status);
- if(!status) {
- glGetShaderInfoLog(program, sizeof(log), &length, log);
- shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER);
- shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER);
- return 0;
- }
+ glGetProgramiv(program, GL_LINK_STATUS, &status);
+ if (!status) {
+ glGetShaderInfoLog(program, sizeof(log), &length, log);
+ shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER);
+ shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER);
+ return 0;
+ }
- return program;
+ return program;
}
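
The function follows the standard GL pattern: compile each stage, check GL_COMPILE_STATUS, attach, then link and check GL_LINK_STATUS. One pre-existing detail worth noting: the link-failure branch calls glGetShaderInfoLog on the program object, where glGetProgramInfoLog is the matching query for programs, so the link log is likely never retrieved; a formatting-only commit carries that over unchanged. A minimal sketch of the per-stage half of the pattern, assuming an active OpenGL context and loaded entry points:

#include <GL/glew.h>  /* or whichever GL loader the project uses */

static GLuint compile_stage_or_zero(GLenum type, const char *src)
{
  GLuint shader = glCreateShader(type);
  glShaderSource(shader, 1, &src, NULL);
  glCompileShader(shader);
  GLint status = 0;
  glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
  if (!status) {
    /* Unlike the code above, release the object on failure. */
    glDeleteShader(shader);
    return 0;
  }
  return shader;
}
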
bool Device::bind_fallback_display_space_shader(const float width, const float height)
{
- if(fallback_status == FALLBACK_SHADER_STATUS_ERROR) {
- return false;
- }
-
- if(fallback_status == FALLBACK_SHADER_STATUS_NONE) {
- fallback_shader_program = bind_fallback_shader();
- fallback_status = FALLBACK_SHADER_STATUS_ERROR;
-
- if(fallback_shader_program == 0) {
- return false;
- }
-
- glUseProgram(fallback_shader_program);
- image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture");
- if(image_texture_location < 0) {
- LOG(ERROR) << "Shader doesn't containt the 'image_texture' uniform.";
- return false;
- }
-
- fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen");
- if(fullscreen_location < 0) {
- LOG(ERROR) << "Shader doesn't containt the 'fullscreen' uniform.";
- return false;
- }
-
- fallback_status = FALLBACK_SHADER_STATUS_SUCCESS;
- }
-
- /* Run this every time. */
- glUseProgram(fallback_shader_program);
- glUniform1i(image_texture_location, 0);
- glUniform2f(fullscreen_location, width, height);
- return true;
+ if (fallback_status == FALLBACK_SHADER_STATUS_ERROR) {
+ return false;
+ }
+
+ if (fallback_status == FALLBACK_SHADER_STATUS_NONE) {
+ fallback_shader_program = bind_fallback_shader();
+ fallback_status = FALLBACK_SHADER_STATUS_ERROR;
+
+ if (fallback_shader_program == 0) {
+ return false;
+ }
+
+ glUseProgram(fallback_shader_program);
+ image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture");
+ if (image_texture_location < 0) {
+ LOG(ERROR) << "Shader doesn't containt the 'image_texture' uniform.";
+ return false;
+ }
+
+ fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen");
+ if (fullscreen_location < 0) {
+ LOG(ERROR) << "Shader doesn't containt the 'fullscreen' uniform.";
+ return false;
+ }
+
+ fallback_status = FALLBACK_SHADER_STATUS_SUCCESS;
+ }
+
+ /* Run this every time. */
+ glUseProgram(fallback_shader_program);
+ glUniform1i(image_texture_location, 0);
+ glUniform2f(fullscreen_location, width, height);
+ return true;
}
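
Note the defensive ordering here: fallback_status is set to FALLBACK_SHADER_STATUS_ERROR before any GL work is attempted and only flipped to SUCCESS once every lookup has passed, so a failed initialization is recorded and never retried. (The "doesn't containt" typo in the two error messages predates this patch and, as expected for a formatting-only commit, survives it.) A self-contained sketch of that lazy-init idiom:

#include <cstdio>

enum Status { STATUS_NONE, STATUS_ERROR, STATUS_SUCCESS };

static Status status = STATUS_NONE;

bool ensure_initialized()
{
  if (status == STATUS_ERROR)
    return false;
  if (status == STATUS_NONE) {
    status = STATUS_ERROR; /* pessimistic default, set before trying */
    bool ok = true;        /* stand-in for the real GL setup steps */
    if (!ok)
      return false;
    status = STATUS_SUCCESS; /* only reached when every step passed */
  }
  return true;
}

int main()
{
  std::printf("%d\n", ensure_initialized());
  return 0;
}
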
-void Device::draw_pixels(
- device_memory& rgba, int y,
- int w, int h, int width, int height,
- int dx, int dy, int dw, int dh,
- bool transparent, const DeviceDrawParams &draw_params)
+void Device::draw_pixels(device_memory &rgba,
+ int y,
+ int w,
+ int h,
+ int width,
+ int height,
+ int dx,
+ int dy,
+ int dw,
+ int dh,
+ bool transparent,
+ const DeviceDrawParams &draw_params)
{
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
-
- assert(rgba.type == MEM_PIXELS);
- mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1));
-
- GLuint texid;
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &texid);
- glBindTexture(GL_TEXTURE_2D, texid);
-
- if(rgba.data_type == TYPE_HALF) {
- GLhalf *data_pointer = (GLhalf*)rgba.host_pointer;
- data_pointer += 4 * y * w;
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer);
- }
- else {
- uint8_t *data_pointer = (uint8_t*)rgba.host_pointer;
- data_pointer += 4 * y * w;
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer);
- }
-
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-
- if(transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if(use_fallback_shader) {
- if(!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if(!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if(vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = 1.0f;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = 1.0f;
- vpointer[9] = 1.0f;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = 1.0f;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- if(vertex_buffer) {
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if(vertex_buffer) {
- glBindBuffer(GL_ARRAY_BUFFER, 0);
- }
-
- if(use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- glDeleteVertexArrays(1, &vertex_array_object);
- glBindTexture(GL_TEXTURE_2D, 0);
- glDeleteTextures(1, &texid);
-
- if(transparent) {
- glDisable(GL_BLEND);
- }
+ const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
+
+ assert(rgba.type == MEM_PIXELS);
+ mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1));
+
+ GLuint texid;
+ glActiveTexture(GL_TEXTURE0);
+ glGenTextures(1, &texid);
+ glBindTexture(GL_TEXTURE_2D, texid);
+
+ if (rgba.data_type == TYPE_HALF) {
+ GLhalf *data_pointer = (GLhalf *)rgba.host_pointer;
+ data_pointer += 4 * y * w;
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer);
+ }
+ else {
+ uint8_t *data_pointer = (uint8_t *)rgba.host_pointer;
+ data_pointer += 4 * y * w;
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer);
+ }
+
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+
+ if (transparent) {
+ glEnable(GL_BLEND);
+ glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
+ }
+
+ GLint shader_program;
+ if (use_fallback_shader) {
+ if (!bind_fallback_display_space_shader(dw, dh)) {
+ return;
+ }
+ shader_program = fallback_shader_program;
+ }
+ else {
+ draw_params.bind_display_space_shader_cb();
+ glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
+ }
+
+ if (!vertex_buffer) {
+ glGenBuffers(1, &vertex_buffer);
+ }
+
+ glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
+ /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */
+ glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
+
+ float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
+
+ if (vpointer) {
+ /* texture coordinate - vertex pair */
+ vpointer[0] = 0.0f;
+ vpointer[1] = 0.0f;
+ vpointer[2] = dx;
+ vpointer[3] = dy;
+
+ vpointer[4] = 1.0f;
+ vpointer[5] = 0.0f;
+ vpointer[6] = (float)width + dx;
+ vpointer[7] = dy;
+
+ vpointer[8] = 1.0f;
+ vpointer[9] = 1.0f;
+ vpointer[10] = (float)width + dx;
+ vpointer[11] = (float)height + dy;
+
+ vpointer[12] = 0.0f;
+ vpointer[13] = 1.0f;
+ vpointer[14] = dx;
+ vpointer[15] = (float)height + dy;
+
+ if (vertex_buffer) {
+ glUnmapBuffer(GL_ARRAY_BUFFER);
+ }
+ }
+
+ GLuint vertex_array_object;
+ GLuint position_attribute, texcoord_attribute;
+
+ glGenVertexArrays(1, &vertex_array_object);
+ glBindVertexArray(vertex_array_object);
+
+ texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
+ position_attribute = glGetAttribLocation(shader_program, "pos");
+
+ glEnableVertexAttribArray(texcoord_attribute);
+ glEnableVertexAttribArray(position_attribute);
+
+ glVertexAttribPointer(
+ texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
+ glVertexAttribPointer(position_attribute,
+ 2,
+ GL_FLOAT,
+ GL_FALSE,
+ 4 * sizeof(float),
+ (const GLvoid *)(sizeof(float) * 2));
+
+ glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+ if (vertex_buffer) {
+ glBindBuffer(GL_ARRAY_BUFFER, 0);
+ }
+
+ if (use_fallback_shader) {
+ glUseProgram(0);
+ }
+ else {
+ draw_params.unbind_display_space_shader_cb();
+ }
+
+ glDeleteVertexArrays(1, &vertex_array_object);
+ glBindTexture(GL_TEXTURE_2D, 0);
+ glDeleteTextures(1, &texid);
+
+ if (transparent) {
+ glDisable(GL_BLEND);
+ }
}
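
The 16-float vertex buffer filled above packs four vertices as interleaved (u, v, x, y) pairs, which is why both glVertexAttribPointer calls use a stride of 4 * sizeof(float) and the position attribute starts at a byte offset of 2 * sizeof(float). A struct view of one vertex makes that layout explicit (illustrative only):

struct TexturedVertex {
  float u, v; /* texCoord attribute, byte offset 0 */
  float x, y; /* pos attribute, byte offset 2 * sizeof(float) */
};
static_assert(sizeof(TexturedVertex) == 4 * sizeof(float),
              "matches the stride passed to glVertexAttribPointer above");
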
-Device *Device::create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background)
+Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
{
- Device *device;
+ Device *device;
- switch(info.type) {
- case DEVICE_CPU:
- device = device_cpu_create(info, stats, profiler, background);
- break;
+ switch (info.type) {
+ case DEVICE_CPU:
+ device = device_cpu_create(info, stats, profiler, background);
+ break;
#ifdef WITH_CUDA
- case DEVICE_CUDA:
- if(device_cuda_init())
- device = device_cuda_create(info, stats, profiler, background);
- else
- device = NULL;
- break;
+ case DEVICE_CUDA:
+ if (device_cuda_init())
+ device = device_cuda_create(info, stats, profiler, background);
+ else
+ device = NULL;
+ break;
#endif
#ifdef WITH_MULTI
- case DEVICE_MULTI:
- device = device_multi_create(info, stats, profiler, background);
- break;
+ case DEVICE_MULTI:
+ device = device_multi_create(info, stats, profiler, background);
+ break;
#endif
#ifdef WITH_NETWORK
- case DEVICE_NETWORK:
- device = device_network_create(info, stats, profiler, "127.0.0.1");
- break;
+ case DEVICE_NETWORK:
+ device = device_network_create(info, stats, profiler, "127.0.0.1");
+ break;
#endif
#ifdef WITH_OPENCL
- case DEVICE_OPENCL:
- if(device_opencl_init())
- device = device_opencl_create(info, stats, profiler, background);
- else
- device = NULL;
- break;
+ case DEVICE_OPENCL:
+ if (device_opencl_init())
+ device = device_opencl_create(info, stats, profiler, background);
+ else
+ device = NULL;
+ break;
#endif
- default:
- return NULL;
- }
+ default:
+ return NULL;
+ }
- return device;
+ return device;
}
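
Device::create is a plain factory over the compile-time device backends; it returns NULL both for unknown types and when a runtime (CUDA/OpenCL) fails to initialize, so callers must check the result. A hypothetical call site, assuming the declarations from device.h in this patch and default-constructible Stats and Profiler:

DeviceInfo info = Device::available_devices(DEVICE_MASK_CPU).front();
Stats stats;
Profiler profiler;
Device *device = Device::create(info, stats, profiler, true /* background */);
if (device == NULL) {
  /* Unknown device type, or the CUDA/OpenCL runtime failed to initialize. */
}
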
DeviceType Device::type_from_string(const char *name)
{
- if(strcmp(name, "CPU") == 0)
- return DEVICE_CPU;
- else if(strcmp(name, "CUDA") == 0)
- return DEVICE_CUDA;
- else if(strcmp(name, "OPENCL") == 0)
- return DEVICE_OPENCL;
- else if(strcmp(name, "NETWORK") == 0)
- return DEVICE_NETWORK;
- else if(strcmp(name, "MULTI") == 0)
- return DEVICE_MULTI;
-
- return DEVICE_NONE;
+ if (strcmp(name, "CPU") == 0)
+ return DEVICE_CPU;
+ else if (strcmp(name, "CUDA") == 0)
+ return DEVICE_CUDA;
+ else if (strcmp(name, "OPENCL") == 0)
+ return DEVICE_OPENCL;
+ else if (strcmp(name, "NETWORK") == 0)
+ return DEVICE_NETWORK;
+ else if (strcmp(name, "MULTI") == 0)
+ return DEVICE_MULTI;
+
+ return DEVICE_NONE;
}
string Device::string_from_type(DeviceType type)
{
- if(type == DEVICE_CPU)
- return "CPU";
- else if(type == DEVICE_CUDA)
- return "CUDA";
- else if(type == DEVICE_OPENCL)
- return "OPENCL";
- else if(type == DEVICE_NETWORK)
- return "NETWORK";
- else if(type == DEVICE_MULTI)
- return "MULTI";
-
- return "";
+ if (type == DEVICE_CPU)
+ return "CPU";
+ else if (type == DEVICE_CUDA)
+ return "CUDA";
+ else if (type == DEVICE_OPENCL)
+ return "OPENCL";
+ else if (type == DEVICE_NETWORK)
+ return "NETWORK";
+ else if (type == DEVICE_MULTI)
+ return "MULTI";
+
+ return "";
}
vector<DeviceType> Device::available_types()
{
- vector<DeviceType> types;
- types.push_back(DEVICE_CPU);
+ vector<DeviceType> types;
+ types.push_back(DEVICE_CPU);
#ifdef WITH_CUDA
- types.push_back(DEVICE_CUDA);
+ types.push_back(DEVICE_CUDA);
#endif
#ifdef WITH_OPENCL
- types.push_back(DEVICE_OPENCL);
+ types.push_back(DEVICE_OPENCL);
#endif
#ifdef WITH_NETWORK
- types.push_back(DEVICE_NETWORK);
+ types.push_back(DEVICE_NETWORK);
#endif
- return types;
+ return types;
}
vector<DeviceInfo> Device::available_devices(uint mask)
{
- /* Lazy initialize devices. On some platforms OpenCL or CUDA drivers can
- * be broken and cause crashes when only trying to get device info, so
- * we don't want to do any initialization until the user chooses to. */
- thread_scoped_lock lock(device_mutex);
- vector<DeviceInfo> devices;
+ /* Lazy initialize devices. On some platforms OpenCL or CUDA drivers can
+ * be broken and cause crashes when only trying to get device info, so
+ * we don't want to do any initialization until the user chooses to. */
+ thread_scoped_lock lock(device_mutex);
+ vector<DeviceInfo> devices;
#ifdef WITH_OPENCL
- if(mask & DEVICE_MASK_OPENCL) {
- if(!(devices_initialized_mask & DEVICE_MASK_OPENCL)) {
- if(device_opencl_init()) {
- device_opencl_info(opencl_devices);
- }
- devices_initialized_mask |= DEVICE_MASK_OPENCL;
- }
- foreach(DeviceInfo& info, opencl_devices) {
- devices.push_back(info);
- }
- }
+ if (mask & DEVICE_MASK_OPENCL) {
+ if (!(devices_initialized_mask & DEVICE_MASK_OPENCL)) {
+ if (device_opencl_init()) {
+ device_opencl_info(opencl_devices);
+ }
+ devices_initialized_mask |= DEVICE_MASK_OPENCL;
+ }
+ foreach (DeviceInfo &info, opencl_devices) {
+ devices.push_back(info);
+ }
+ }
#endif
#ifdef WITH_CUDA
- if(mask & DEVICE_MASK_CUDA) {
- if(!(devices_initialized_mask & DEVICE_MASK_CUDA)) {
- if(device_cuda_init()) {
- device_cuda_info(cuda_devices);
- }
- devices_initialized_mask |= DEVICE_MASK_CUDA;
- }
- foreach(DeviceInfo& info, cuda_devices) {
- devices.push_back(info);
- }
- }
+ if (mask & DEVICE_MASK_CUDA) {
+ if (!(devices_initialized_mask & DEVICE_MASK_CUDA)) {
+ if (device_cuda_init()) {
+ device_cuda_info(cuda_devices);
+ }
+ devices_initialized_mask |= DEVICE_MASK_CUDA;
+ }
+ foreach (DeviceInfo &info, cuda_devices) {
+ devices.push_back(info);
+ }
+ }
#endif
- if(mask & DEVICE_MASK_CPU) {
- if(!(devices_initialized_mask & DEVICE_MASK_CPU)) {
- device_cpu_info(cpu_devices);
- devices_initialized_mask |= DEVICE_MASK_CPU;
- }
- foreach(DeviceInfo& info, cpu_devices) {
- devices.push_back(info);
- }
- }
+ if (mask & DEVICE_MASK_CPU) {
+ if (!(devices_initialized_mask & DEVICE_MASK_CPU)) {
+ device_cpu_info(cpu_devices);
+ devices_initialized_mask |= DEVICE_MASK_CPU;
+ }
+ foreach (DeviceInfo &info, cpu_devices) {
+ devices.push_back(info);
+ }
+ }
#ifdef WITH_NETWORK
- if(mask & DEVICE_MASK_NETWORK) {
- if(!(devices_initialized_mask & DEVICE_MASK_NETWORK)) {
- device_network_info(network_devices);
- devices_initialized_mask |= DEVICE_MASK_NETWORK;
- }
- foreach(DeviceInfo& info, network_devices) {
- devices.push_back(info);
- }
- }
+ if (mask & DEVICE_MASK_NETWORK) {
+ if (!(devices_initialized_mask & DEVICE_MASK_NETWORK)) {
+ device_network_info(network_devices);
+ devices_initialized_mask |= DEVICE_MASK_NETWORK;
+ }
+ foreach (DeviceInfo &info, network_devices) {
+ devices.push_back(info);
+ }
+ }
#endif
- return devices;
+ return devices;
}
string Device::device_capabilities(uint mask)
{
- thread_scoped_lock lock(device_mutex);
- string capabilities = "";
+ thread_scoped_lock lock(device_mutex);
+ string capabilities = "";
- if(mask & DEVICE_MASK_CPU) {
- capabilities += "\nCPU device capabilities: ";
- capabilities += device_cpu_capabilities() + "\n";
- }
+ if (mask & DEVICE_MASK_CPU) {
+ capabilities += "\nCPU device capabilities: ";
+ capabilities += device_cpu_capabilities() + "\n";
+ }
#ifdef WITH_OPENCL
- if(mask & DEVICE_MASK_OPENCL) {
- if(device_opencl_init()) {
- capabilities += "\nOpenCL device capabilities:\n";
- capabilities += device_opencl_capabilities();
- }
- }
+ if (mask & DEVICE_MASK_OPENCL) {
+ if (device_opencl_init()) {
+ capabilities += "\nOpenCL device capabilities:\n";
+ capabilities += device_opencl_capabilities();
+ }
+ }
#endif
#ifdef WITH_CUDA
- if(mask & DEVICE_MASK_CUDA) {
- if(device_cuda_init()) {
- capabilities += "\nCUDA device capabilities:\n";
- capabilities += device_cuda_capabilities();
- }
- }
+ if (mask & DEVICE_MASK_CUDA) {
+ if (device_cuda_init()) {
+ capabilities += "\nCUDA device capabilities:\n";
+ capabilities += device_cuda_capabilities();
+ }
+ }
#endif
- return capabilities;
+ return capabilities;
}
-DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int threads, bool background)
+DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
+ int threads,
+ bool background)
{
- assert(subdevices.size() > 0);
-
- if(subdevices.size() == 1) {
- /* No multi device needed. */
- return subdevices.front();
- }
-
- DeviceInfo info;
- info.type = DEVICE_MULTI;
- info.id = "MULTI";
- info.description = "Multi Device";
- info.num = 0;
-
- info.has_half_images = true;
- info.has_volume_decoupled = true;
- info.has_osl = true;
- info.has_profiling = true;
-
- foreach(const DeviceInfo &device, subdevices) {
- /* Ensure CPU device does not slow down GPU. */
- if(device.type == DEVICE_CPU && subdevices.size() > 1) {
- if(background) {
- int orig_cpu_threads = (threads)? threads: system_cpu_thread_count();
- int cpu_threads = max(orig_cpu_threads - (subdevices.size() - 1), 0);
-
- VLOG(1) << "CPU render threads reduced from "
- << orig_cpu_threads << " to " << cpu_threads
- << ", to dedicate to GPU.";
-
- if(cpu_threads >= 1) {
- DeviceInfo cpu_device = device;
- cpu_device.cpu_threads = cpu_threads;
- info.multi_devices.push_back(cpu_device);
- }
- else {
- continue;
- }
- }
- else {
- VLOG(1) << "CPU render threads disabled for interactive render.";
- continue;
- }
- }
- else {
- info.multi_devices.push_back(device);
- }
-
- /* Accumulate device info. */
- info.has_half_images &= device.has_half_images;
- info.has_volume_decoupled &= device.has_volume_decoupled;
- info.has_osl &= device.has_osl;
- info.has_profiling &= device.has_profiling;
- }
-
- return info;
+ assert(subdevices.size() > 0);
+
+ if (subdevices.size() == 1) {
+ /* No multi device needed. */
+ return subdevices.front();
+ }
+
+ DeviceInfo info;
+ info.type = DEVICE_MULTI;
+ info.id = "MULTI";
+ info.description = "Multi Device";
+ info.num = 0;
+
+ info.has_half_images = true;
+ info.has_volume_decoupled = true;
+ info.has_osl = true;
+ info.has_profiling = true;
+
+ foreach (const DeviceInfo &device, subdevices) {
+ /* Ensure CPU device does not slow down GPU. */
+ if (device.type == DEVICE_CPU && subdevices.size() > 1) {
+ if (background) {
+ int orig_cpu_threads = (threads) ? threads : system_cpu_thread_count();
+ int cpu_threads = max(orig_cpu_threads - (subdevices.size() - 1), 0);
+
+ VLOG(1) << "CPU render threads reduced from " << orig_cpu_threads << " to " << cpu_threads
+ << ", to dedicate to GPU.";
+
+ if (cpu_threads >= 1) {
+ DeviceInfo cpu_device = device;
+ cpu_device.cpu_threads = cpu_threads;
+ info.multi_devices.push_back(cpu_device);
+ }
+ else {
+ continue;
+ }
+ }
+ else {
+ VLOG(1) << "CPU render threads disabled for interactive render.";
+ continue;
+ }
+ }
+ else {
+ info.multi_devices.push_back(device);
+ }
+
+ /* Accumulate device info. */
+ info.has_half_images &= device.has_half_images;
+ info.has_volume_decoupled &= device.has_volume_decoupled;
+ info.has_osl &= device.has_osl;
+ info.has_profiling &= device.has_profiling;
+ }
+
+ return info;
}
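
The CPU-thread reduction above dedicates one thread per additional subdevice to feeding the GPUs: with N subdevices the CPU device keeps max(original_threads - (N - 1), 0) render threads, and is dropped from the multi-device entirely if that reaches zero. A worked example of the arithmetic (hypothetical numbers):

#include <algorithm>
#include <cstdio>

int main()
{
  const int orig_cpu_threads = 8;  /* stand-in for system_cpu_thread_count() */
  const size_t num_subdevices = 3; /* e.g. two GPUs plus the CPU device */
  const int cpu_threads = std::max(orig_cpu_threads - (int)(num_subdevices - 1), 0);
  std::printf("CPU render threads: %d\n", cpu_threads); /* prints 6 */
  return 0;
}
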
void Device::tag_update()
{
- free_memory();
+ free_memory();
}
void Device::free_memory()
{
- devices_initialized_mask = 0;
- cuda_devices.free_memory();
- opencl_devices.free_memory();
- cpu_devices.free_memory();
- network_devices.free_memory();
+ devices_initialized_mask = 0;
+ cuda_devices.free_memory();
+ opencl_devices.free_memory();
+ cpu_devices.free_memory();
+ network_devices.free_memory();
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index aa0a8e434d2..15a0ceb4a19 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -40,384 +40,428 @@ class RenderTile;
/* Device Types */
enum DeviceType {
- DEVICE_NONE = 0,
- DEVICE_CPU,
- DEVICE_OPENCL,
- DEVICE_CUDA,
- DEVICE_NETWORK,
- DEVICE_MULTI
+ DEVICE_NONE = 0,
+ DEVICE_CPU,
+ DEVICE_OPENCL,
+ DEVICE_CUDA,
+ DEVICE_NETWORK,
+ DEVICE_MULTI
};
enum DeviceTypeMask {
- DEVICE_MASK_CPU = (1 << DEVICE_CPU),
- DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL),
- DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
- DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK),
- DEVICE_MASK_ALL = ~0
+ DEVICE_MASK_CPU = (1 << DEVICE_CPU),
+ DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL),
+ DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
+ DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK),
+ DEVICE_MASK_ALL = ~0
};
enum DeviceKernelStatus {
- DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL = 0,
- DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE,
- DEVICE_KERNEL_USING_FEATURE_KERNEL,
- DEVICE_KERNEL_FEATURE_KERNEL_INVALID,
- DEVICE_KERNEL_UNKNOWN,
+ DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL = 0,
+ DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE,
+ DEVICE_KERNEL_USING_FEATURE_KERNEL,
+ DEVICE_KERNEL_FEATURE_KERNEL_INVALID,
+ DEVICE_KERNEL_UNKNOWN,
};
#define DEVICE_MASK(type) (DeviceTypeMask)(1 << type)
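
The mask enum mirrors DeviceType bit-for-bit, so DEVICE_MASK(type) and the named DEVICE_MASK_* constants agree. For example, DEVICE_CUDA is 3 in the enum above, so DEVICE_MASK(DEVICE_CUDA) is (1 << 3) == 8 == DEVICE_MASK_CUDA; given the declarations above, that relationship can be checked at compile time:

static_assert(DEVICE_MASK(DEVICE_CUDA) == DEVICE_MASK_CUDA,
              "mask constants stay in sync with DeviceType");
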
class DeviceInfo {
-public:
- DeviceType type;
- string description;
- string id; /* used for user preferences, should stay fixed with changing hardware config */
- int num;
- bool display_device; /* GPU is used as a display device. */
- bool has_half_images; /* Support half-float textures. */
- bool has_volume_decoupled; /* Decoupled volume shading. */
- bool has_osl; /* Support Open Shading Language. */
- bool use_split_kernel; /* Use split or mega kernel. */
- bool has_profiling; /* Supports runtime collection of profiling info. */
- int cpu_threads;
- vector<DeviceInfo> multi_devices;
-
- DeviceInfo()
- {
- type = DEVICE_CPU;
- id = "CPU";
- num = 0;
- cpu_threads = 0;
- display_device = false;
- has_half_images = false;
- has_volume_decoupled = false;
- has_osl = false;
- use_split_kernel = false;
- has_profiling = false;
- }
-
- bool operator==(const DeviceInfo &info) {
- /* Multiple Devices with the same ID would be very bad. */
- assert(id != info.id || (type == info.type && num == info.num && description == info.description));
- return id == info.id;
- }
+ public:
+ DeviceType type;
+ string description;
+ string id; /* used for user preferences, should stay fixed with changing hardware config */
+ int num;
+ bool display_device; /* GPU is used as a display device. */
+ bool has_half_images; /* Support half-float textures. */
+ bool has_volume_decoupled; /* Decoupled volume shading. */
+ bool has_osl; /* Support Open Shading Language. */
+ bool use_split_kernel; /* Use split or mega kernel. */
+ bool has_profiling; /* Supports runtime collection of profiling info. */
+ int cpu_threads;
+ vector<DeviceInfo> multi_devices;
+
+ DeviceInfo()
+ {
+ type = DEVICE_CPU;
+ id = "CPU";
+ num = 0;
+ cpu_threads = 0;
+ display_device = false;
+ has_half_images = false;
+ has_volume_decoupled = false;
+ has_osl = false;
+ use_split_kernel = false;
+ has_profiling = false;
+ }
+
+ bool operator==(const DeviceInfo &info)
+ {
+ /* Multiple Devices with the same ID would be very bad. */
+ assert(id != info.id ||
+ (type == info.type && num == info.num && description == info.description));
+ return id == info.id;
+ }
};
class DeviceRequestedFeatures {
-public:
- /* Use experimental feature set. */
- bool experimental;
-
- /* Selective nodes compilation. */
-
- /* Identifier of a node group up to which all the nodes needs to be
- * compiled in. Nodes from higher group indices will be ignores.
- */
- int max_nodes_group;
-
- /* Features bitfield indicating which features from the requested group
- * will be compiled in. Nodes which corresponds to features which are not
- * in this bitfield will be ignored even if they're in the requested group.
- */
- int nodes_features;
-
- /* BVH/sampling kernel features. */
- bool use_hair;
- bool use_object_motion;
- bool use_camera_motion;
-
- /* Denotes whether baking functionality is needed. */
- bool use_baking;
-
- /* Use subsurface scattering materials. */
- bool use_subsurface;
-
- /* Use volume materials. */
- bool use_volume;
-
- /* Use branched integrator. */
- bool use_integrator_branched;
-
- /* Use OpenSubdiv patch evaluation */
- bool use_patch_evaluation;
-
- /* Use Transparent shadows */
- bool use_transparent;
-
- /* Use various shadow tricks, such as shadow catcher. */
- bool use_shadow_tricks;
-
- /* Per-uber shader usage flags. */
- bool use_principled;
-
- /* Denoising features. */
- bool use_denoising;
-
- /* Use raytracing in shaders. */
- bool use_shader_raytrace;
-
- /* Use true displacement */
- bool use_true_displacement;
-
- /* Use background lights */
- bool use_background_light;
-
- DeviceRequestedFeatures()
- {
- /* TODO(sergey): Find more meaningful defaults. */
- experimental = false;
- max_nodes_group = 0;
- nodes_features = 0;
- use_hair = false;
- use_object_motion = false;
- use_camera_motion = false;
- use_baking = false;
- use_subsurface = false;
- use_volume = false;
- use_integrator_branched = false;
- use_patch_evaluation = false;
- use_transparent = false;
- use_shadow_tricks = false;
- use_principled = false;
- use_denoising = false;
- use_shader_raytrace = false;
- use_true_displacement = false;
- use_background_light = false;
- }
-
- bool modified(const DeviceRequestedFeatures& requested_features)
- {
- return !(experimental == requested_features.experimental &&
- max_nodes_group == requested_features.max_nodes_group &&
- nodes_features == requested_features.nodes_features &&
- use_hair == requested_features.use_hair &&
- use_object_motion == requested_features.use_object_motion &&
- use_camera_motion == requested_features.use_camera_motion &&
- use_baking == requested_features.use_baking &&
- use_subsurface == requested_features.use_subsurface &&
- use_volume == requested_features.use_volume &&
- use_integrator_branched == requested_features.use_integrator_branched &&
- use_patch_evaluation == requested_features.use_patch_evaluation &&
- use_transparent == requested_features.use_transparent &&
- use_shadow_tricks == requested_features.use_shadow_tricks &&
- use_principled == requested_features.use_principled &&
- use_denoising == requested_features.use_denoising &&
- use_shader_raytrace == requested_features.use_shader_raytrace &&
- use_true_displacement == requested_features.use_true_displacement &&
- use_background_light == requested_features.use_background_light);
- }
-
- /* Convert the requested features structure to a build options,
- * which could then be passed to compilers.
- */
- string get_build_options() const
- {
- string build_options = "";
- if(experimental) {
- build_options += "-D__KERNEL_EXPERIMENTAL__ ";
- }
- build_options += "-D__NODES_MAX_GROUP__=" +
- string_printf("%d", max_nodes_group);
- build_options += " -D__NODES_FEATURES__=" +
- string_printf("%d", nodes_features);
- if(!use_hair) {
- build_options += " -D__NO_HAIR__";
- }
- if(!use_object_motion) {
- build_options += " -D__NO_OBJECT_MOTION__";
- }
- if(!use_camera_motion) {
- build_options += " -D__NO_CAMERA_MOTION__";
- }
- if(!use_baking) {
- build_options += " -D__NO_BAKING__";
- }
- if(!use_volume) {
- build_options += " -D__NO_VOLUME__";
- }
- if(!use_subsurface) {
- build_options += " -D__NO_SUBSURFACE__";
- }
- if(!use_integrator_branched) {
- build_options += " -D__NO_BRANCHED_PATH__";
- }
- if(!use_patch_evaluation) {
- build_options += " -D__NO_PATCH_EVAL__";
- }
- if(!use_transparent && !use_volume) {
- build_options += " -D__NO_TRANSPARENT__";
- }
- if(!use_shadow_tricks) {
- build_options += " -D__NO_SHADOW_TRICKS__";
- }
- if(!use_principled) {
- build_options += " -D__NO_PRINCIPLED__";
- }
- if(!use_denoising) {
- build_options += " -D__NO_DENOISING__";
- }
- if(!use_shader_raytrace) {
- build_options += " -D__NO_SHADER_RAYTRACE__";
- }
- return build_options;
- }
+ public:
+ /* Use experimental feature set. */
+ bool experimental;
+
+ /* Selective nodes compilation. */
+
+ /* Identifier of a node group up to which all the nodes needs to be
+ * compiled in. Nodes from higher group indices will be ignores.
+ */
+ int max_nodes_group;
+
+ /* Features bitfield indicating which features from the requested group
+ * will be compiled in. Nodes which corresponds to features which are not
+ * in this bitfield will be ignored even if they're in the requested group.
+ */
+ int nodes_features;
+
+ /* BVH/sampling kernel features. */
+ bool use_hair;
+ bool use_object_motion;
+ bool use_camera_motion;
+
+ /* Denotes whether baking functionality is needed. */
+ bool use_baking;
+
+ /* Use subsurface scattering materials. */
+ bool use_subsurface;
+
+ /* Use volume materials. */
+ bool use_volume;
+
+ /* Use branched integrator. */
+ bool use_integrator_branched;
+
+ /* Use OpenSubdiv patch evaluation */
+ bool use_patch_evaluation;
+
+ /* Use Transparent shadows */
+ bool use_transparent;
+
+ /* Use various shadow tricks, such as shadow catcher. */
+ bool use_shadow_tricks;
+
+ /* Per-uber shader usage flags. */
+ bool use_principled;
+
+ /* Denoising features. */
+ bool use_denoising;
+
+ /* Use raytracing in shaders. */
+ bool use_shader_raytrace;
+
+ /* Use true displacement */
+ bool use_true_displacement;
+
+ /* Use background lights */
+ bool use_background_light;
+
+ DeviceRequestedFeatures()
+ {
+ /* TODO(sergey): Find more meaningful defaults. */
+ experimental = false;
+ max_nodes_group = 0;
+ nodes_features = 0;
+ use_hair = false;
+ use_object_motion = false;
+ use_camera_motion = false;
+ use_baking = false;
+ use_subsurface = false;
+ use_volume = false;
+ use_integrator_branched = false;
+ use_patch_evaluation = false;
+ use_transparent = false;
+ use_shadow_tricks = false;
+ use_principled = false;
+ use_denoising = false;
+ use_shader_raytrace = false;
+ use_true_displacement = false;
+ use_background_light = false;
+ }
+
+ bool modified(const DeviceRequestedFeatures &requested_features)
+ {
+ return !(experimental == requested_features.experimental &&
+ max_nodes_group == requested_features.max_nodes_group &&
+ nodes_features == requested_features.nodes_features &&
+ use_hair == requested_features.use_hair &&
+ use_object_motion == requested_features.use_object_motion &&
+ use_camera_motion == requested_features.use_camera_motion &&
+ use_baking == requested_features.use_baking &&
+ use_subsurface == requested_features.use_subsurface &&
+ use_volume == requested_features.use_volume &&
+ use_integrator_branched == requested_features.use_integrator_branched &&
+ use_patch_evaluation == requested_features.use_patch_evaluation &&
+ use_transparent == requested_features.use_transparent &&
+ use_shadow_tricks == requested_features.use_shadow_tricks &&
+ use_principled == requested_features.use_principled &&
+ use_denoising == requested_features.use_denoising &&
+ use_shader_raytrace == requested_features.use_shader_raytrace &&
+ use_true_displacement == requested_features.use_true_displacement &&
+ use_background_light == requested_features.use_background_light);
+ }
+
+ /* Convert the requested features structure to a build options,
+ * which could then be passed to compilers.
+ */
+ string get_build_options() const
+ {
+ string build_options = "";
+ if (experimental) {
+ build_options += "-D__KERNEL_EXPERIMENTAL__ ";
+ }
+ build_options += "-D__NODES_MAX_GROUP__=" + string_printf("%d", max_nodes_group);
+ build_options += " -D__NODES_FEATURES__=" + string_printf("%d", nodes_features);
+ if (!use_hair) {
+ build_options += " -D__NO_HAIR__";
+ }
+ if (!use_object_motion) {
+ build_options += " -D__NO_OBJECT_MOTION__";
+ }
+ if (!use_camera_motion) {
+ build_options += " -D__NO_CAMERA_MOTION__";
+ }
+ if (!use_baking) {
+ build_options += " -D__NO_BAKING__";
+ }
+ if (!use_volume) {
+ build_options += " -D__NO_VOLUME__";
+ }
+ if (!use_subsurface) {
+ build_options += " -D__NO_SUBSURFACE__";
+ }
+ if (!use_integrator_branched) {
+ build_options += " -D__NO_BRANCHED_PATH__";
+ }
+ if (!use_patch_evaluation) {
+ build_options += " -D__NO_PATCH_EVAL__";
+ }
+ if (!use_transparent && !use_volume) {
+ build_options += " -D__NO_TRANSPARENT__";
+ }
+ if (!use_shadow_tricks) {
+ build_options += " -D__NO_SHADOW_TRICKS__";
+ }
+ if (!use_principled) {
+ build_options += " -D__NO_PRINCIPLED__";
+ }
+ if (!use_denoising) {
+ build_options += " -D__NO_DENOISING__";
+ }
+ if (!use_shader_raytrace) {
+ build_options += " -D__NO_SHADER_RAYTRACE__";
+ }
+ return build_options;
+ }
};
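
get_build_options() encodes features negatively: each disabled feature adds a -D__NO_*__ define so the kernel compilers can strip the corresponding code paths (note the interaction where transparent shadows are kept whenever volumes are in use). A hypothetical usage sketch, assuming the device.h from this patch is on the include path:

DeviceRequestedFeatures features;
features.max_nodes_group = 2;
string build_options = features.get_build_options();
/* With all other flags at their defaults, build_options begins:
 *   -D__NODES_MAX_GROUP__=2 -D__NODES_FEATURES__=0 -D__NO_HAIR__ ... */
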
-std::ostream& operator <<(std::ostream &os,
- const DeviceRequestedFeatures& requested_features);
+std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features);
/* Device */
struct DeviceDrawParams {
- function<void()> bind_display_space_shader_cb;
- function<void()> unbind_display_space_shader_cb;
+ function<void()> bind_display_space_shader_cb;
+ function<void()> unbind_display_space_shader_cb;
};
class Device {
- friend class device_sub_ptr;
-protected:
- enum {
- FALLBACK_SHADER_STATUS_NONE = 0,
- FALLBACK_SHADER_STATUS_ERROR,
- FALLBACK_SHADER_STATUS_SUCCESS,
- };
-
- Device(DeviceInfo& info_, Stats &stats_, Profiler &profiler_, bool background) : background(background),
- vertex_buffer(0),
- fallback_status(FALLBACK_SHADER_STATUS_NONE), fallback_shader_program(0),
- info(info_), stats(stats_), profiler(profiler_) {}
-
- bool background;
- string error_msg;
-
- /* used for real time display */
- unsigned int vertex_buffer;
- int fallback_status, fallback_shader_program;
- int image_texture_location, fullscreen_location;
-
- bool bind_fallback_display_space_shader(const float width, const float height);
-
- virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/)
- {
- /* Only required for devices that implement denoising. */
- assert(false);
- return (device_ptr) 0;
- }
- virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {};
-
-public:
- virtual ~Device();
-
- /* info */
- DeviceInfo info;
- virtual const string& error_message() { return error_msg; }
- bool have_error() { return !error_message().empty(); }
- virtual void set_error(const string& error)
- {
- if(!have_error()) {
- error_msg = error;
- }
- fprintf(stderr, "%s\n", error.c_str());
- fflush(stderr);
- }
- virtual bool show_samples() const { return false; }
- virtual BVHLayoutMask get_bvh_layout_mask() const = 0;
-
- /* statistics */
- Stats &stats;
- Profiler &profiler;
-
- /* memory alignment */
- virtual int mem_sub_ptr_alignment() { return MIN_ALIGNMENT_CPU_DATA_TYPES; }
-
- /* constant memory */
- virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
-
- /* open shading language, only for CPU device */
- virtual void *osl_memory() { return NULL; }
-
- /* load/compile kernels, must be called before adding tasks */
- virtual bool load_kernels(
- const DeviceRequestedFeatures& /*requested_features*/)
- { return true; }
-
- /* Wait for device to become available to upload data and receive tasks
- * This method is used by the OpenCL device to load the
- * optimized kernels or when not (yet) available load the
- * generic kernels (only during foreground rendering) */
- virtual bool wait_for_availability(
- const DeviceRequestedFeatures& /*requested_features*/)
- { return true; }
- /* Check if there are 'better' kernels available to be used
- * We can switch over to these kernels
- * This method is used to determine if we can switch the preview kernels
- * to regular kernels */
- virtual DeviceKernelStatus get_active_kernel_switch_state()
- { return DEVICE_KERNEL_USING_FEATURE_KERNEL; }
-
- /* tasks */
- virtual int get_split_task_count(DeviceTask& task) = 0;
- virtual void task_add(DeviceTask& task) = 0;
- virtual void task_wait() = 0;
- virtual void task_cancel() = 0;
-
- /* opengl drawing */
- virtual void draw_pixels(device_memory& mem, int y,
- int w, int h, int width, int height,
- int dx, int dy, int dw, int dh,
- bool transparent, const DeviceDrawParams &draw_params);
+ friend class device_sub_ptr;
+
+ protected:
+ enum {
+ FALLBACK_SHADER_STATUS_NONE = 0,
+ FALLBACK_SHADER_STATUS_ERROR,
+ FALLBACK_SHADER_STATUS_SUCCESS,
+ };
+
+ Device(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background)
+ : background(background),
+ vertex_buffer(0),
+ fallback_status(FALLBACK_SHADER_STATUS_NONE),
+ fallback_shader_program(0),
+ info(info_),
+ stats(stats_),
+ profiler(profiler_)
+ {
+ }
+
+ bool background;
+ string error_msg;
+
+ /* used for real time display */
+ unsigned int vertex_buffer;
+ int fallback_status, fallback_shader_program;
+ int image_texture_location, fullscreen_location;
+
+ bool bind_fallback_display_space_shader(const float width, const float height);
+
+ virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/)
+ {
+ /* Only required for devices that implement denoising. */
+ assert(false);
+ return (device_ptr)0;
+ }
+ virtual void mem_free_sub_ptr(device_ptr /*ptr*/){};
+
+ public:
+ virtual ~Device();
+
+ /* info */
+ DeviceInfo info;
+ virtual const string &error_message()
+ {
+ return error_msg;
+ }
+ bool have_error()
+ {
+ return !error_message().empty();
+ }
+ virtual void set_error(const string &error)
+ {
+ if (!have_error()) {
+ error_msg = error;
+ }
+ fprintf(stderr, "%s\n", error.c_str());
+ fflush(stderr);
+ }
+ virtual bool show_samples() const
+ {
+ return false;
+ }
+ virtual BVHLayoutMask get_bvh_layout_mask() const = 0;
+
+ /* statistics */
+ Stats &stats;
+ Profiler &profiler;
+
+ /* memory alignment */
+ virtual int mem_sub_ptr_alignment()
+ {
+ return MIN_ALIGNMENT_CPU_DATA_TYPES;
+ }
+
+ /* constant memory */
+ virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
+
+ /* open shading language, only for CPU device */
+ virtual void *osl_memory()
+ {
+ return NULL;
+ }
+
+ /* load/compile kernels, must be called before adding tasks */
+ virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/)
+ {
+ return true;
+ }
+
+ /* Wait for device to become available to upload data and receive tasks
+ * This method is used by the OpenCL device to load the
+ * optimized kernels or when not (yet) available load the
+ * generic kernels (only during foreground rendering) */
+ virtual bool wait_for_availability(const DeviceRequestedFeatures & /*requested_features*/)
+ {
+ return true;
+ }
+ /* Check if there are 'better' kernels available to be used
+ * We can switch over to these kernels
+ * This method is used to determine if we can switch the preview kernels
+ * to regular kernels */
+ virtual DeviceKernelStatus get_active_kernel_switch_state()
+ {
+ return DEVICE_KERNEL_USING_FEATURE_KERNEL;
+ }
+
+ /* tasks */
+ virtual int get_split_task_count(DeviceTask &task) = 0;
+ virtual void task_add(DeviceTask &task) = 0;
+ virtual void task_wait() = 0;
+ virtual void task_cancel() = 0;
+
+ /* opengl drawing */
+ virtual void draw_pixels(device_memory &mem,
+ int y,
+ int w,
+ int h,
+ int width,
+ int height,
+ int dx,
+ int dy,
+ int dw,
+ int dh,
+ bool transparent,
+ const DeviceDrawParams &draw_params);
#ifdef WITH_NETWORK
- /* networking */
- void server_run();
+ /* networking */
+ void server_run();
#endif
- /* multi device */
- virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {}
- virtual int device_number(Device * /*sub_device*/) { return 0; }
- virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
- virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
-
- /* static */
- static Device *create(DeviceInfo& info, Stats &stats, Profiler& profiler, bool background = true);
-
- static DeviceType type_from_string(const char *name);
- static string string_from_type(DeviceType type);
- static vector<DeviceType> available_types();
- static vector<DeviceInfo> available_devices(uint device_type_mask = DEVICE_MASK_ALL);
- static string device_capabilities(uint device_type_mask = DEVICE_MASK_ALL);
- static DeviceInfo get_multi_device(const vector<DeviceInfo>& subdevices,
- int threads,
- bool background);
-
- /* Tag devices lists for update. */
- static void tag_update();
-
- static void free_memory();
-
-protected:
- /* Memory allocation, only accessed through device_memory. */
- friend class MultiDevice;
- friend class DeviceServer;
- friend class device_memory;
-
- virtual void mem_alloc(device_memory& mem) = 0;
- virtual void mem_copy_to(device_memory& mem) = 0;
- virtual void mem_copy_from(device_memory& mem,
- int y, int w, int h, int elem) = 0;
- virtual void mem_zero(device_memory& mem) = 0;
- virtual void mem_free(device_memory& mem) = 0;
-
-private:
- /* Indicted whether device types and devices lists were initialized. */
- static bool need_types_update, need_devices_update;
- static thread_mutex device_mutex;
- static vector<DeviceInfo> cuda_devices;
- static vector<DeviceInfo> opencl_devices;
- static vector<DeviceInfo> cpu_devices;
- static vector<DeviceInfo> network_devices;
- static uint devices_initialized_mask;
+ /* multi device */
+ virtual void map_tile(Device * /*sub_device*/, RenderTile & /*tile*/)
+ {
+ }
+ virtual int device_number(Device * /*sub_device*/)
+ {
+ return 0;
+ }
+ virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/)
+ {
+ }
+ virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/)
+ {
+ }
+
+ /* static */
+ static Device *create(DeviceInfo &info,
+ Stats &stats,
+ Profiler &profiler,
+ bool background = true);
+
+ static DeviceType type_from_string(const char *name);
+ static string string_from_type(DeviceType type);
+ static vector<DeviceType> available_types();
+ static vector<DeviceInfo> available_devices(uint device_type_mask = DEVICE_MASK_ALL);
+ static string device_capabilities(uint device_type_mask = DEVICE_MASK_ALL);
+ static DeviceInfo get_multi_device(const vector<DeviceInfo> &subdevices,
+ int threads,
+ bool background);
+
+ /* Tag devices lists for update. */
+ static void tag_update();
+
+ static void free_memory();
+
+ protected:
+ /* Memory allocation, only accessed through device_memory. */
+ friend class MultiDevice;
+ friend class DeviceServer;
+ friend class device_memory;
+
+ virtual void mem_alloc(device_memory &mem) = 0;
+ virtual void mem_copy_to(device_memory &mem) = 0;
+ virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) = 0;
+ virtual void mem_zero(device_memory &mem) = 0;
+ virtual void mem_free(device_memory &mem) = 0;
+
+ private:
+ /* Indicted whether device types and devices lists were initialized. */
+ static bool need_types_update, need_devices_update;
+ static thread_mutex device_mutex;
+ static vector<DeviceInfo> cuda_devices;
+ static vector<DeviceInfo> opencl_devices;
+ static vector<DeviceInfo> cpu_devices;
+ static vector<DeviceInfo> network_devices;
+ static uint devices_initialized_mask;
};
CCL_NAMESPACE_END
-#endif /* __DEVICE_H__ */
+#endif /* __DEVICE_H__ */
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 73f1fc02b08..837a8186064 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -61,1087 +61,1183 @@ class CPUDevice;
/* Has to be outside of the class to be shared across template instantiations. */
static const char *logged_architecture = "";
-template<typename F>
-class KernelFunctions {
-public:
- KernelFunctions()
- {
- kernel = (F)NULL;
- }
-
- KernelFunctions(F kernel_default,
- F kernel_sse2,
- F kernel_sse3,
- F kernel_sse41,
- F kernel_avx,
- F kernel_avx2)
- {
- const char *architecture_name = "default";
- kernel = kernel_default;
-
- /* Silence potential warnings about unused variables
- * when compiling without some architectures. */
- (void) kernel_sse2;
- (void) kernel_sse3;
- (void) kernel_sse41;
- (void) kernel_avx;
- (void) kernel_avx2;
+template<typename F> class KernelFunctions {
+ public:
+ KernelFunctions()
+ {
+ kernel = (F)NULL;
+ }
+
+ KernelFunctions(
+ F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2)
+ {
+ const char *architecture_name = "default";
+ kernel = kernel_default;
+
+ /* Silence potential warnings about unused variables
+ * when compiling without some architectures. */
+ (void)kernel_sse2;
+ (void)kernel_sse3;
+ (void)kernel_sse41;
+ (void)kernel_avx;
+ (void)kernel_avx2;
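+
+    /* Pick the most capable kernel variant this CPU supports at run time;
+     * each check below falls through to the next lower SIMD level. */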
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
- if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
- architecture_name = "AVX2";
- kernel = kernel_avx2;
- }
- else
+ if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
+ architecture_name = "AVX2";
+ kernel = kernel_avx2;
+ }
+ else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
- if(DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
- architecture_name = "AVX";
- kernel = kernel_avx;
- }
- else
+ if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) {
+ architecture_name = "AVX";
+ kernel = kernel_avx;
+ }
+ else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
- if(DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
- architecture_name = "SSE4.1";
- kernel = kernel_sse41;
- }
- else
+ if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) {
+ architecture_name = "SSE4.1";
+ kernel = kernel_sse41;
+ }
+ else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
- if(DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
- architecture_name = "SSE3";
- kernel = kernel_sse3;
- }
- else
+ if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) {
+ architecture_name = "SSE3";
+ kernel = kernel_sse3;
+ }
+ else
#endif
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
- if(DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
- architecture_name = "SSE2";
- kernel = kernel_sse2;
- }
+ if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
+ architecture_name = "SSE2";
+ kernel = kernel_sse2;
+ }
#endif
- if(strcmp(architecture_name, logged_architecture) != 0) {
- VLOG(1) << "Will be using " << architecture_name << " kernels.";
- logged_architecture = architecture_name;
- }
- }
-
- inline F operator()() const {
- assert(kernel);
- return kernel;
- }
-protected:
- F kernel;
+ if (strcmp(architecture_name, logged_architecture) != 0) {
+ VLOG(1) << "Will be using " << architecture_name << " kernels.";
+ logged_architecture = architecture_name;
+ }
+ }
+
+ inline F operator()() const
+ {
+ assert(kernel);
+ return kernel;
+ }
+
+ protected:
+ F kernel;
};
class CPUSplitKernel : public DeviceSplitKernel {
- CPUDevice *device;
-public:
- explicit CPUSplitKernel(CPUDevice *device);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
- RenderTile& rtile,
- int num_global_elements,
- device_memory& kernel_globals,
- device_memory& kernel_data_,
- device_memory& split_data,
- device_memory& ray_state,
- device_memory& queue_index,
- device_memory& use_queues_flag,
- device_memory& work_pool_wgs);
-
- virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
- const DeviceRequestedFeatures&);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
- virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
+ CPUDevice *device;
+
+ public:
+ explicit CPUSplitKernel(CPUDevice *device);
+
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
+ RenderTile &rtile,
+ int num_global_elements,
+ device_memory &kernel_globals,
+ device_memory &kernel_data_,
+ device_memory &split_data,
+ device_memory &ray_state,
+ device_memory &queue_index,
+ device_memory &use_queues_flag,
+ device_memory &work_pool_wgs);
+
+ virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
+ const DeviceRequestedFeatures &);
+ virtual int2 split_kernel_local_size();
+ virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
+ virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
};
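+/* CPU rendering device; all "device" memory is plain host memory, so most
+ * of the memory operations below are bookkeeping only. */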
-class CPUDevice : public Device
-{
-public:
- TaskPool task_pool;
- KernelGlobals kernel_globals;
+class CPUDevice : public Device {
+ public:
+ TaskPool task_pool;
+ KernelGlobals kernel_globals;
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
#ifdef WITH_OSL
- OSLGlobals osl_globals;
+ OSLGlobals osl_globals;
#endif
- bool use_split_kernel;
-
- DeviceRequestedFeatures requested_features;
-
- KernelFunctions<void(*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel;
- KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel;
- KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
- KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> shader_kernel;
-
- KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
- KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, float, int*, int, int)> filter_get_feature_kernel;
- KernelFunctions<void(*)(int, int, int, int*, float*, float*, int, int*)> filter_write_feature_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
-
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, int, float, float)> filter_nlm_calc_difference_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
- KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)> filter_nlm_update_output_kernel;
- KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
-
- KernelFunctions<void(*)(float*, TileInfo*, int, int, int, float*, int*, int*, int, int, bool, int, float)> filter_construct_transform_kernel;
- KernelFunctions<void(*)(int, int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel;
- KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
-
- KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
- int, int, int, int, int, int, int, int, ccl_global int*, int,
- ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel;
- unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels;
+ bool use_split_kernel;
+
+ DeviceRequestedFeatures requested_features;
+
+ KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel;
+ KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
+ convert_to_half_float_kernel;
+ KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
+ convert_to_byte_kernel;
+ KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>
+ shader_kernel;
+
+ KernelFunctions<void (*)(
+ int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)>
+ filter_divide_shadow_kernel;
+ KernelFunctions<void (*)(
+ int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)>
+ filter_get_feature_kernel;
+ KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)>
+ filter_write_feature_kernel;
+ KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
+ filter_detect_outliers_kernel;
+ KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
+ filter_combine_halves_kernel;
+
+ KernelFunctions<void (*)(
+ int, int, float *, float *, float *, float *, int *, int, int, int, float, float)>
+ filter_nlm_calc_difference_kernel;
+ KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel;
+ KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel;
+ KernelFunctions<void (*)(
+ int, int, float *, float *, float *, float *, float *, int *, int, int, int)>
+ filter_nlm_update_output_kernel;
+ KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel;
+
+ KernelFunctions<void (*)(
+ float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)>
+ filter_construct_transform_kernel;
+ KernelFunctions<void (*)(int,
+ int,
+ int,
+ float *,
+ float *,
+ float *,
+ int *,
+ float *,
+ float3 *,
+ int *,
+ int *,
+ int,
+ int,
+ int,
+ int,
+ bool)>
+ filter_nlm_construct_gramian_kernel;
+ KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)>
+ filter_finalize_kernel;
+
+ KernelFunctions<void (*)(KernelGlobals *,
+ ccl_constant KernelData *,
+ ccl_global void *,
+ int,
+ ccl_global char *,
+ int,
+ int,
+ int,
+ int,
+ int,
+ int,
+ int,
+ int,
+ ccl_global int *,
+ int,
+ ccl_global char *,
+ ccl_global unsigned int *,
+ unsigned int,
+ ccl_global float *)>
+ data_init_kernel;
+ unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels;
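+
+/* Expands to one kernel entry point per compiled SIMD level, in the argument
+ * order expected by the KernelFunctions constructor. */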
#define KERNEL_FUNCTIONS(name) \
- KERNEL_NAME_EVAL(cpu, name), \
- KERNEL_NAME_EVAL(cpu_sse2, name), \
- KERNEL_NAME_EVAL(cpu_sse3, name), \
- KERNEL_NAME_EVAL(cpu_sse41, name), \
- KERNEL_NAME_EVAL(cpu_avx, name), \
- KERNEL_NAME_EVAL(cpu_avx2, name)
-
- CPUDevice(DeviceInfo& info_, Stats &stats_, Profiler &profiler_, bool background_)
- : Device(info_, stats_, profiler_, background_),
- texture_info(this, "__texture_info", MEM_TEXTURE),
-#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
- REGISTER_KERNEL(path_trace),
- REGISTER_KERNEL(convert_to_half_float),
- REGISTER_KERNEL(convert_to_byte),
- REGISTER_KERNEL(shader),
- REGISTER_KERNEL(filter_divide_shadow),
- REGISTER_KERNEL(filter_get_feature),
- REGISTER_KERNEL(filter_write_feature),
- REGISTER_KERNEL(filter_detect_outliers),
- REGISTER_KERNEL(filter_combine_halves),
- REGISTER_KERNEL(filter_nlm_calc_difference),
- REGISTER_KERNEL(filter_nlm_blur),
- REGISTER_KERNEL(filter_nlm_calc_weight),
- REGISTER_KERNEL(filter_nlm_update_output),
- REGISTER_KERNEL(filter_nlm_normalize),
- REGISTER_KERNEL(filter_construct_transform),
- REGISTER_KERNEL(filter_nlm_construct_gramian),
- REGISTER_KERNEL(filter_finalize),
- REGISTER_KERNEL(data_init)
+ KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
+ KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
+ KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
+
+ CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
+ : Device(info_, stats_, profiler_, background_),
+ texture_info(this, "__texture_info", MEM_TEXTURE),
+#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name))
+ REGISTER_KERNEL(path_trace),
+ REGISTER_KERNEL(convert_to_half_float),
+ REGISTER_KERNEL(convert_to_byte),
+ REGISTER_KERNEL(shader),
+ REGISTER_KERNEL(filter_divide_shadow),
+ REGISTER_KERNEL(filter_get_feature),
+ REGISTER_KERNEL(filter_write_feature),
+ REGISTER_KERNEL(filter_detect_outliers),
+ REGISTER_KERNEL(filter_combine_halves),
+ REGISTER_KERNEL(filter_nlm_calc_difference),
+ REGISTER_KERNEL(filter_nlm_blur),
+ REGISTER_KERNEL(filter_nlm_calc_weight),
+ REGISTER_KERNEL(filter_nlm_update_output),
+ REGISTER_KERNEL(filter_nlm_normalize),
+ REGISTER_KERNEL(filter_construct_transform),
+ REGISTER_KERNEL(filter_nlm_construct_gramian),
+ REGISTER_KERNEL(filter_finalize),
+ REGISTER_KERNEL(data_init)
#undef REGISTER_KERNEL
- {
- if(info.cpu_threads == 0) {
- info.cpu_threads = TaskScheduler::num_threads();
- }
+ {
+ if (info.cpu_threads == 0) {
+ info.cpu_threads = TaskScheduler::num_threads();
+ }
#ifdef WITH_OSL
- kernel_globals.osl = &osl_globals;
+ kernel_globals.osl = &osl_globals;
#endif
- use_split_kernel = DebugFlags().cpu.split_kernel;
- if(use_split_kernel) {
- VLOG(1) << "Will be using split kernel.";
- }
- need_texture_info = false;
-
-#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
- REGISTER_SPLIT_KERNEL(path_init);
- REGISTER_SPLIT_KERNEL(scene_intersect);
- REGISTER_SPLIT_KERNEL(lamp_emission);
- REGISTER_SPLIT_KERNEL(do_volume);
- REGISTER_SPLIT_KERNEL(queue_enqueue);
- REGISTER_SPLIT_KERNEL(indirect_background);
- REGISTER_SPLIT_KERNEL(shader_setup);
- REGISTER_SPLIT_KERNEL(shader_sort);
- REGISTER_SPLIT_KERNEL(shader_eval);
- REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
- REGISTER_SPLIT_KERNEL(subsurface_scatter);
- REGISTER_SPLIT_KERNEL(direct_lighting);
- REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
- REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
- REGISTER_SPLIT_KERNEL(enqueue_inactive);
- REGISTER_SPLIT_KERNEL(next_iteration_setup);
- REGISTER_SPLIT_KERNEL(indirect_subsurface);
- REGISTER_SPLIT_KERNEL(buffer_update);
+ use_split_kernel = DebugFlags().cpu.split_kernel;
+ if (use_split_kernel) {
+ VLOG(1) << "Will be using split kernel.";
+ }
+ need_texture_info = false;
+
+#define REGISTER_SPLIT_KERNEL(name) \
+ split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \
+ KERNEL_FUNCTIONS(name))
+ REGISTER_SPLIT_KERNEL(path_init);
+ REGISTER_SPLIT_KERNEL(scene_intersect);
+ REGISTER_SPLIT_KERNEL(lamp_emission);
+ REGISTER_SPLIT_KERNEL(do_volume);
+ REGISTER_SPLIT_KERNEL(queue_enqueue);
+ REGISTER_SPLIT_KERNEL(indirect_background);
+ REGISTER_SPLIT_KERNEL(shader_setup);
+ REGISTER_SPLIT_KERNEL(shader_sort);
+ REGISTER_SPLIT_KERNEL(shader_eval);
+ REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
+ REGISTER_SPLIT_KERNEL(subsurface_scatter);
+ REGISTER_SPLIT_KERNEL(direct_lighting);
+ REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
+ REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
+ REGISTER_SPLIT_KERNEL(enqueue_inactive);
+ REGISTER_SPLIT_KERNEL(next_iteration_setup);
+ REGISTER_SPLIT_KERNEL(indirect_subsurface);
+ REGISTER_SPLIT_KERNEL(buffer_update);
#undef REGISTER_SPLIT_KERNEL
#undef KERNEL_FUNCTIONS
- }
-
- ~CPUDevice()
- {
- task_pool.stop();
- texture_info.free();
- }
-
- virtual bool show_samples() const
- {
- return (info.cpu_threads == 1);
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const {
- BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
- if(DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
- bvh_layout_mask |= BVH_LAYOUT_BVH4;
- }
- if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
- bvh_layout_mask |= BVH_LAYOUT_BVH8;
- }
+ }
+
+ ~CPUDevice()
+ {
+ task_pool.stop();
+ texture_info.free();
+ }
+
+ virtual bool show_samples() const
+ {
+ return (info.cpu_threads == 1);
+ }
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const
+ {
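+    /* BVH2 always works on the CPU; the wider 4-way and 8-way layouts need
+     * SSE2 and AVX2 traversal kernels respectively. */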
+ BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
+ if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) {
+ bvh_layout_mask |= BVH_LAYOUT_BVH4;
+ }
+ if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) {
+ bvh_layout_mask |= BVH_LAYOUT_BVH8;
+ }
#ifdef WITH_EMBREE
- bvh_layout_mask |= BVH_LAYOUT_EMBREE;
-#endif /* WITH_EMBREE */
- return bvh_layout_mask;
- }
-
- void load_texture_info()
- {
- if(need_texture_info) {
- texture_info.copy_to_device();
- need_texture_info = false;
- }
- }
-
- void mem_alloc(device_memory& mem)
- {
- if(mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else {
- if(mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- if(mem.type == MEM_DEVICE_ONLY) {
- assert(!mem.host_pointer);
- size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
- void *data = util_aligned_malloc(mem.memory_size(), alignment);
- mem.device_pointer = (device_ptr)data;
- }
- else {
- mem.device_pointer = (device_ptr)mem.host_pointer;
- }
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
- }
- }
-
- void mem_copy_to(device_memory& mem)
- {
- if(mem.type == MEM_TEXTURE) {
- tex_free(mem);
- tex_alloc(mem);
- }
- else if(mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else {
- if(!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- /* copy is no-op */
- }
- }
-
- void mem_copy_from(device_memory& /*mem*/,
- int /*y*/, int /*w*/, int /*h*/,
- int /*elem*/)
- {
- /* no-op */
- }
-
- void mem_zero(device_memory& mem)
- {
- if(!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if(mem.device_pointer) {
- memset((void*)mem.device_pointer, 0, mem.memory_size());
- }
- }
-
- void mem_free(device_memory& mem)
- {
- if(mem.type == MEM_TEXTURE) {
- tex_free(mem);
- }
- else if(mem.device_pointer) {
- if(mem.type == MEM_DEVICE_ONLY) {
- util_aligned_free((void*)mem.device_pointer);
- }
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/)
- {
- return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
- }
-
- void const_copy_to(const char *name, void *host, size_t size)
- {
- kernel_const_copy(&kernel_globals, name, host, size);
- }
-
- void tex_alloc(device_memory& mem)
- {
- VLOG(1) << "Texture allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- if(mem.interpolation == INTERPOLATION_NONE) {
- /* Data texture. */
- kernel_tex_copy(&kernel_globals,
- mem.name,
- mem.host_pointer,
- mem.data_size);
- }
- else {
- /* Image Texture. */
- int flat_slot = 0;
- if(string_startswith(mem.name, "__tex_image")) {
- int pos = string(mem.name).rfind("_");
- flat_slot = atoi(mem.name + pos + 1);
- }
- else {
- assert(0);
- }
-
- if(flat_slot >= texture_info.size()) {
- /* Allocate some slots in advance, to reduce amount
- * of re-allocations. */
- texture_info.resize(flat_slot + 128);
- }
-
- TextureInfo& info = texture_info[flat_slot];
- info.data = (uint64_t)mem.host_pointer;
- info.cl_buffer = 0;
- info.interpolation = mem.interpolation;
- info.extension = mem.extension;
- info.width = mem.data_width;
- info.height = mem.data_height;
- info.depth = mem.data_depth;
-
- need_texture_info = true;
- }
-
- mem.device_pointer = (device_ptr)mem.host_pointer;
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
- }
-
- void tex_free(device_memory& mem)
- {
- if(mem.device_pointer) {
- mem.device_pointer = 0;
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- need_texture_info = true;
- }
- }
-
- void *osl_memory()
- {
+ bvh_layout_mask |= BVH_LAYOUT_EMBREE;
+#endif /* WITH_EMBREE */
+ return bvh_layout_mask;
+ }
+
+ void load_texture_info()
+ {
+ if (need_texture_info) {
+ texture_info.copy_to_device();
+ need_texture_info = false;
+ }
+ }
+
+ void mem_alloc(device_memory &mem)
+ {
+ if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else {
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
+ if (mem.type == MEM_DEVICE_ONLY) {
+ assert(!mem.host_pointer);
+ size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
+ void *data = util_aligned_malloc(mem.memory_size(), alignment);
+ mem.device_pointer = (device_ptr)data;
+ }
+ else {
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ }
+
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+ }
+ }
+
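+  /* For the CPU device, host and device memory are one and the same, so
+   * copies only handle lazy allocation and texture (re)upload. */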
+ void mem_copy_to(device_memory &mem)
+ {
+ if (mem.type == MEM_TEXTURE) {
+ tex_free(mem);
+ tex_alloc(mem);
+ }
+ else if (mem.type == MEM_PIXELS) {
+ assert(!"mem_copy_to not supported for pixels.");
+ }
+ else {
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+      /* Copy is a no-op. */
+ }
+ }
+
+ void mem_copy_from(device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/)
+ {
+ /* no-op */
+ }
+
+ void mem_zero(device_memory &mem)
+ {
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ if (mem.device_pointer) {
+ memset((void *)mem.device_pointer, 0, mem.memory_size());
+ }
+ }
+
+ void mem_free(device_memory &mem)
+ {
+ if (mem.type == MEM_TEXTURE) {
+ tex_free(mem);
+ }
+ else if (mem.device_pointer) {
+ if (mem.type == MEM_DEVICE_ONLY) {
+ util_aligned_free((void *)mem.device_pointer);
+ }
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+ }
+
+ virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+ {
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+ }
+
+ void const_copy_to(const char *name, void *host, size_t size)
+ {
+ kernel_const_copy(&kernel_globals, name, host, size);
+ }
+
+ void tex_alloc(device_memory &mem)
+ {
+ VLOG(1) << "Texture allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ if (mem.interpolation == INTERPOLATION_NONE) {
+ /* Data texture. */
+ kernel_tex_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
+ }
+ else {
+      /* Image texture. */
+ int flat_slot = 0;
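+      /* The image slot index is encoded as the trailing "_<number>" suffix
+       * of the texture name. */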
+ if (string_startswith(mem.name, "__tex_image")) {
+ int pos = string(mem.name).rfind("_");
+ flat_slot = atoi(mem.name + pos + 1);
+ }
+ else {
+ assert(0);
+ }
+
+ if (flat_slot >= texture_info.size()) {
+        /* Allocate some slots in advance, to reduce the number
+         * of re-allocations. */
+ texture_info.resize(flat_slot + 128);
+ }
+
+ TextureInfo &info = texture_info[flat_slot];
+ info.data = (uint64_t)mem.host_pointer;
+ info.cl_buffer = 0;
+ info.interpolation = mem.interpolation;
+ info.extension = mem.extension;
+ info.width = mem.data_width;
+ info.height = mem.data_height;
+ info.depth = mem.data_depth;
+
+ need_texture_info = true;
+ }
+
+ mem.device_pointer = (device_ptr)mem.host_pointer;
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+ }
+
+ void tex_free(device_memory &mem)
+ {
+ if (mem.device_pointer) {
+ mem.device_pointer = 0;
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ need_texture_info = true;
+ }
+ }
+
+ void *osl_memory()
+ {
#ifdef WITH_OSL
- return &osl_globals;
+ return &osl_globals;
#else
- return NULL;
+ return NULL;
#endif
- }
-
- void thread_run(DeviceTask *task)
- {
- if(task->type == DeviceTask::RENDER) {
- thread_render(*task);
- }
- else if(task->type == DeviceTask::FILM_CONVERT)
- thread_film_convert(*task);
- else if(task->type == DeviceTask::SHADER)
- thread_shader(*task);
- }
-
- class CPUDeviceTask : public DeviceTask {
- public:
- CPUDeviceTask(CPUDevice *device, DeviceTask& task)
- : DeviceTask(task)
- {
- run = function_bind(&CPUDevice::thread_run, device, this);
- }
- };
-
- bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS);
-
- int4 rect = task->rect;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int w = align_up(rect.z-rect.x, 4);
- int h = rect.w-rect.y;
- int stride = task->buffer.stride;
- int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
-
- float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer;
- float *blurDifference = temporary_mem;
- float *difference = temporary_mem + task->buffer.pass_stride;
- float *weightAccum = temporary_mem + 2*task->buffer.pass_stride;
-
- memset(weightAccum, 0, sizeof(float)*w*h);
- memset((float*) out_ptr, 0, sizeof(float)*w*h);
-
- for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
- int dy = i / (2*r+1) - r;
- int dx = i % (2*r+1) - r;
-
- int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
- filter_nlm_calc_difference_kernel()(dx, dy,
- (float*) guide_ptr,
- (float*) variance_ptr,
- NULL,
- difference,
- local_rect,
- w, channel_offset,
- 0, a, k_2);
-
- filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
- filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
- filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
-
- filter_nlm_update_output_kernel()(dx, dy,
- blurDifference,
- (float*) image_ptr,
- difference,
- (float*) out_ptr,
- weightAccum,
- local_rect,
- channel_offset,
- stride, f);
- }
-
- int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
- filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);
-
- return true;
- }
-
- bool denoising_construct_transform(DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM);
-
- for(int y = 0; y < task->filter_area.w; y++) {
- for(int x = 0; x < task->filter_area.z; x++) {
- filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
- task->tile_info,
- x + task->filter_area.x,
- y + task->filter_area.y,
- y*task->filter_area.z + x,
- (float*) task->storage.transform.device_pointer,
- (int*) task->storage.rank.device_pointer,
- &task->rect.x,
- task->buffer.pass_stride,
- task->buffer.frame_stride,
- task->buffer.use_time,
- task->radius,
- task->pca_threshold);
- }
- }
- return true;
- }
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
-
- float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer;
- float *difference = temporary_mem;
- float *blurDifference = temporary_mem + task->buffer.pass_stride;
-
- int r = task->radius;
- int frame_offset = frame * task->buffer.frame_stride;
- for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
- int dy = i / (2*r+1) - r;
- int dx = i % (2*r+1) - r;
-
- int local_rect[4] = {max(0, -dx), max(0, -dy),
- task->reconstruction_state.source_w - max(0, dx),
- task->reconstruction_state.source_h - max(0, dy)};
- filter_nlm_calc_difference_kernel()(dx, dy,
- (float*) color_ptr,
- (float*) color_variance_ptr,
- (float*) scale_ptr,
- difference,
- local_rect,
- task->buffer.stride,
- task->buffer.pass_stride,
- frame_offset,
- 1.0f,
- task->nlm_k_2);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
- filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
- filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
- filter_nlm_construct_gramian_kernel()(dx, dy,
- task->tile_info->frames[frame],
- blurDifference,
- (float*) task->buffer.mem.device_pointer,
- (float*) task->storage.transform.device_pointer,
- (int*) task->storage.rank.device_pointer,
- (float*) task->storage.XtWX.device_pointer,
- (float3*) task->storage.XtWY.device_pointer,
- local_rect,
- &task->reconstruction_state.filter_window.x,
- task->buffer.stride,
- 4,
- task->buffer.pass_stride,
- frame_offset,
- task->buffer.use_time);
- }
-
- return true;
- }
-
- bool denoising_solve(device_ptr output_ptr,
- DenoisingTask *task)
- {
- for(int y = 0; y < task->filter_area.w; y++) {
- for(int x = 0; x < task->filter_area.z; x++) {
- filter_finalize_kernel()(x,
- y,
- y*task->filter_area.z + x,
- (float*) output_ptr,
- (int*) task->storage.rank.device_pointer,
- (float*) task->storage.XtWX.device_pointer,
- (float3*) task->storage.XtWY.device_pointer,
- &task->reconstruction_state.buffer_params.x,
- task->render_buffer.samples);
- }
- }
- return true;
- }
-
- bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
- device_ptr mean_ptr, device_ptr variance_ptr,
- int r, int4 rect, DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES);
-
- for(int y = rect.y; y < rect.w; y++) {
- for(int x = rect.x; x < rect.z; x++) {
- filter_combine_halves_kernel()(x, y,
- (float*) mean_ptr,
- (float*) variance_ptr,
- (float*) a_ptr,
- (float*) b_ptr,
- &rect.x,
- r);
- }
- }
- return true;
- }
-
- bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
- device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr, DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW);
-
- for(int y = task->rect.y; y < task->rect.w; y++) {
- for(int x = task->rect.x; x < task->rect.z; x++) {
- filter_divide_shadow_kernel()(task->render_buffer.samples,
- task->tile_info,
- x, y,
- (float*) a_ptr,
- (float*) b_ptr,
- (float*) sample_variance_ptr,
- (float*) sv_variance_ptr,
- (float*) buffer_variance_ptr,
- &task->rect.x,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- }
- }
- return true;
- }
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
-
- for(int y = task->rect.y; y < task->rect.w; y++) {
- for(int x = task->rect.x; x < task->rect.z; x++) {
- filter_get_feature_kernel()(task->render_buffer.samples,
- task->tile_info,
- mean_offset,
- variance_offset,
- x, y,
- (float*) mean_ptr,
- (float*) variance_ptr,
- scale,
- &task->rect.x,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- }
- }
- return true;
- }
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
- {
- for(int y = 0; y < task->filter_area.w; y++) {
- for(int x = 0; x < task->filter_area.z; x++) {
- filter_write_feature_kernel()(task->render_buffer.samples,
- x + task->filter_area.x,
- y + task->filter_area.y,
- &task->reconstruction_state.buffer_params.x,
- (float*) from_ptr,
- (float*) buffer_ptr,
- out_offset,
- &task->rect.x);
- }
- }
- return true;
- }
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
- {
- ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS);
-
- for(int y = task->rect.y; y < task->rect.w; y++) {
- for(int x = task->rect.x; x < task->rect.z; x++) {
- filter_detect_outliers_kernel()(x, y,
- (float*) image_ptr,
- (float*) variance_ptr,
- (float*) depth_ptr,
- (float*) output_ptr,
- &task->rect.x,
- task->buffer.pass_stride);
- }
- }
- return true;
- }
-
- void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
- {
- const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
-
- scoped_timer timer(&tile.buffers->render_time);
-
- Coverage coverage(kg, tile);
- if(use_coverage) {
- coverage.init_path_trace();
- }
-
- float *render_buffer = (float*)tile.buffer;
- int start_sample = tile.start_sample;
- int end_sample = tile.start_sample + tile.num_samples;
-
- /* Needed for Embree. */
- SIMD_SET_FLUSH_TO_ZERO;
-
- for(int sample = start_sample; sample < end_sample; sample++) {
- if(task.get_cancel() || task_pool.canceled()) {
- if(task.need_finish_queue == false)
- break;
- }
-
- for(int y = tile.y; y < tile.y + tile.h; y++) {
- for(int x = tile.x; x < tile.x + tile.w; x++) {
- if(use_coverage) {
- coverage.init_pixel(x, y);
- }
- path_trace_kernel()(kg, render_buffer,
- sample, x, y, tile.offset, tile.stride);
- }
- }
-
- tile.sample = sample + 1;
-
- task.update_progress(&tile, tile.w*tile.h);
- }
- if(use_coverage) {
- coverage.finalize();
- }
- }
-
- void denoise(DenoisingTask& denoising, RenderTile &tile)
- {
- ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
-
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(&CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
- denoising.render_buffer.samples = tile.sample;
- denoising.buffer.gpu_temporary_mem = false;
-
- denoising.run_denoising(&tile);
- }
-
- void thread_render(DeviceTask& task)
- {
- if(task_pool.canceled()) {
- if(task.need_finish_queue == false)
- return;
- }
-
- /* allocate buffer for kernel globals */
- device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
- kgbuffer.alloc_to_device(1);
-
- KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
-
- profiler.add_state(&kg->profiler);
-
- CPUSplitKernel *split_kernel = NULL;
- if(use_split_kernel) {
- split_kernel = new CPUSplitKernel(this);
- if(!split_kernel->load_kernels(requested_features)) {
- thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
- kgbuffer.free();
- delete split_kernel;
- return;
- }
- }
-
- RenderTile tile;
- DenoisingTask denoising(this, task);
- denoising.profiler = &kg->profiler;
-
- while(task.acquire_tile(this, tile)) {
- if(tile.task == RenderTile::PATH_TRACE) {
- if(use_split_kernel) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(&task, tile, kgbuffer, void_buffer);
- }
- else {
- path_trace(task, tile, kg);
- }
- }
- else if(tile.task == RenderTile::DENOISE) {
- denoise(denoising, tile);
- task.update_progress(&tile, tile.w*tile.h);
- }
-
- task.release_tile(tile);
-
- if(task_pool.canceled()) {
- if(task.need_finish_queue == false)
- break;
- }
- }
-
- profiler.remove_state(&kg->profiler);
-
- thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
- kg->~KernelGlobals();
- kgbuffer.free();
- delete split_kernel;
- }
-
- void thread_film_convert(DeviceTask& task)
- {
- float sample_scale = 1.0f/(task.sample + 1);
-
- if(task.rgba_half) {
- for(int y = task.y; y < task.y + task.h; y++)
- for(int x = task.x; x < task.x + task.w; x++)
- convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
- sample_scale, x, y, task.offset, task.stride);
- }
- else {
- for(int y = task.y; y < task.y + task.h; y++)
- for(int x = task.x; x < task.x + task.w; x++)
- convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
- sample_scale, x, y, task.offset, task.stride);
-
- }
- }
-
- void thread_shader(DeviceTask& task)
- {
- KernelGlobals kg = kernel_globals;
+ }
+
+ void thread_run(DeviceTask *task)
+ {
+ if (task->type == DeviceTask::RENDER) {
+ thread_render(*task);
+ }
+ else if (task->type == DeviceTask::FILM_CONVERT)
+ thread_film_convert(*task);
+ else if (task->type == DeviceTask::SHADER)
+ thread_shader(*task);
+ }
+
+ class CPUDeviceTask : public DeviceTask {
+ public:
+ CPUDeviceTask(CPUDevice *device, DeviceTask &task) : DeviceTask(task)
+ {
+ run = function_bind(&CPUDevice::thread_run, device, this);
+ }
+ };
+
+ bool denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task)
+ {
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS);
+
+ int4 rect = task->rect;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ int w = align_up(rect.z - rect.x, 4);
+ int h = rect.w - rect.y;
+ int stride = task->buffer.stride;
+ int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
+
+ float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
+ float *blurDifference = temporary_mem;
+ float *difference = temporary_mem + task->buffer.pass_stride;
+ float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride;
+
+ memset(weightAccum, 0, sizeof(float) * w * h);
+ memset((float *)out_ptr, 0, sizeof(float) * w * h);
+
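+    /* Visit every offset (dx, dy) in the (2*r + 1) x (2*r + 1) search
+     * window; i is the flattened window index. */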
+ for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
+ int dy = i / (2 * r + 1) - r;
+ int dx = i % (2 * r + 1) - r;
+
+ int local_rect[4] = {
+ max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)};
+ filter_nlm_calc_difference_kernel()(dx,
+ dy,
+ (float *)guide_ptr,
+ (float *)variance_ptr,
+ NULL,
+ difference,
+ local_rect,
+ w,
+ channel_offset,
+ 0,
+ a,
+ k_2);
+
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
+ filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
+
+ filter_nlm_update_output_kernel()(dx,
+ dy,
+ blurDifference,
+ (float *)image_ptr,
+ difference,
+ (float *)out_ptr,
+ weightAccum,
+ local_rect,
+ channel_offset,
+ stride,
+ f);
+ }
+
+ int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y};
+ filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w);
+
+ return true;
+ }
+
+ bool denoising_construct_transform(DenoisingTask *task)
+ {
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM);
+
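+    /* Build one feature-space transform per pixel of the filter area. */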
+ for (int y = 0; y < task->filter_area.w; y++) {
+ for (int x = 0; x < task->filter_area.z; x++) {
+ filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer,
+ task->tile_info,
+ x + task->filter_area.x,
+ y + task->filter_area.y,
+ y * task->filter_area.z + x,
+ (float *)task->storage.transform.device_pointer,
+ (int *)task->storage.rank.device_pointer,
+ &task->rect.x,
+ task->buffer.pass_stride,
+ task->buffer.frame_stride,
+ task->buffer.use_time,
+ task->radius,
+ task->pca_threshold);
+ }
+ }
+ return true;
+ }
+
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ int frame,
+ DenoisingTask *task)
+ {
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
+
+ float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
+ float *difference = temporary_mem;
+ float *blurDifference = temporary_mem + task->buffer.pass_stride;
+
+ int r = task->radius;
+ int frame_offset = frame * task->buffer.frame_stride;
+ for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
+ int dy = i / (2 * r + 1) - r;
+ int dx = i % (2 * r + 1) - r;
+
+ int local_rect[4] = {max(0, -dx),
+ max(0, -dy),
+ task->reconstruction_state.source_w - max(0, dx),
+ task->reconstruction_state.source_h - max(0, dy)};
+ filter_nlm_calc_difference_kernel()(dx,
+ dy,
+ (float *)color_ptr,
+ (float *)color_variance_ptr,
+ (float *)scale_ptr,
+ difference,
+ local_rect,
+ task->buffer.stride,
+ task->buffer.pass_stride,
+ frame_offset,
+ 1.0f,
+ task->nlm_k_2);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
+ filter_nlm_calc_weight_kernel()(
+ blurDifference, difference, local_rect, task->buffer.stride, 4);
+ filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
+ filter_nlm_construct_gramian_kernel()(dx,
+ dy,
+ task->tile_info->frames[frame],
+ blurDifference,
+ (float *)task->buffer.mem.device_pointer,
+ (float *)task->storage.transform.device_pointer,
+ (int *)task->storage.rank.device_pointer,
+ (float *)task->storage.XtWX.device_pointer,
+ (float3 *)task->storage.XtWY.device_pointer,
+ local_rect,
+ &task->reconstruction_state.filter_window.x,
+ task->buffer.stride,
+ 4,
+ task->buffer.pass_stride,
+ frame_offset,
+ task->buffer.use_time);
+ }
+
+ return true;
+ }
+
+ bool denoising_solve(device_ptr output_ptr, DenoisingTask *task)
+ {
+ for (int y = 0; y < task->filter_area.w; y++) {
+ for (int x = 0; x < task->filter_area.z; x++) {
+ filter_finalize_kernel()(x,
+ y,
+ y * task->filter_area.z + x,
+ (float *)output_ptr,
+ (int *)task->storage.rank.device_pointer,
+ (float *)task->storage.XtWX.device_pointer,
+ (float3 *)task->storage.XtWY.device_pointer,
+ &task->reconstruction_state.buffer_params.x,
+ task->render_buffer.samples);
+ }
+ }
+ return true;
+ }
+
+ bool denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r,
+ int4 rect,
+ DenoisingTask *task)
+ {
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES);
+
+ for (int y = rect.y; y < rect.w; y++) {
+ for (int x = rect.x; x < rect.z; x++) {
+ filter_combine_halves_kernel()(x,
+ y,
+ (float *)mean_ptr,
+ (float *)variance_ptr,
+ (float *)a_ptr,
+ (float *)b_ptr,
+ &rect.x,
+ r);
+ }
+ }
+ return true;
+ }
+
+ bool denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task)
+ {
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW);
+
+ for (int y = task->rect.y; y < task->rect.w; y++) {
+ for (int x = task->rect.x; x < task->rect.z; x++) {
+ filter_divide_shadow_kernel()(task->render_buffer.samples,
+ task->tile_info,
+ x,
+ y,
+ (float *)a_ptr,
+ (float *)b_ptr,
+ (float *)sample_variance_ptr,
+ (float *)sv_variance_ptr,
+ (float *)buffer_variance_ptr,
+ &task->rect.x,
+ task->render_buffer.pass_stride,
+ task->render_buffer.offset);
+ }
+ }
+ return true;
+ }
+
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ float scale,
+ DenoisingTask *task)
+ {
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
+
+ for (int y = task->rect.y; y < task->rect.w; y++) {
+ for (int x = task->rect.x; x < task->rect.z; x++) {
+ filter_get_feature_kernel()(task->render_buffer.samples,
+ task->tile_info,
+ mean_offset,
+ variance_offset,
+ x,
+ y,
+ (float *)mean_ptr,
+ (float *)variance_ptr,
+ scale,
+ &task->rect.x,
+ task->render_buffer.pass_stride,
+ task->render_buffer.offset);
+ }
+ }
+ return true;
+ }
+
+ bool denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task)
+ {
+ for (int y = 0; y < task->filter_area.w; y++) {
+ for (int x = 0; x < task->filter_area.z; x++) {
+ filter_write_feature_kernel()(task->render_buffer.samples,
+ x + task->filter_area.x,
+ y + task->filter_area.y,
+ &task->reconstruction_state.buffer_params.x,
+ (float *)from_ptr,
+ (float *)buffer_ptr,
+ out_offset,
+ &task->rect.x);
+ }
+ }
+ return true;
+ }
+
+ bool denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS);
+
+ for (int y = task->rect.y; y < task->rect.w; y++) {
+ for (int x = task->rect.x; x < task->rect.z; x++) {
+ filter_detect_outliers_kernel()(x,
+ y,
+ (float *)image_ptr,
+ (float *)variance_ptr,
+ (float *)depth_ptr,
+ (float *)output_ptr,
+ &task->rect.x,
+ task->buffer.pass_stride);
+ }
+ }
+ return true;
+ }
+
+ void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
+ {
+ const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
+
+ scoped_timer timer(&tile.buffers->render_time);
+
+ Coverage coverage(kg, tile);
+ if (use_coverage) {
+ coverage.init_path_trace();
+ }
+
+ float *render_buffer = (float *)tile.buffer;
+ int start_sample = tile.start_sample;
+ int end_sample = tile.start_sample + tile.num_samples;
+
+ /* Needed for Embree. */
+ SIMD_SET_FLUSH_TO_ZERO;
+
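+    /* Render one sample over the whole tile at a time, so progress updates
+     * and cancellation can happen between samples. */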
+ for (int sample = start_sample; sample < end_sample; sample++) {
+ if (task.get_cancel() || task_pool.canceled()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+
+ for (int y = tile.y; y < tile.y + tile.h; y++) {
+ for (int x = tile.x; x < tile.x + tile.w; x++) {
+ if (use_coverage) {
+ coverage.init_pixel(x, y);
+ }
+ path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
+ }
+ }
+
+ tile.sample = sample + 1;
+
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+ if (use_coverage) {
+ coverage.finalize();
+ }
+ }
+
+ void denoise(DenoisingTask &denoising, RenderTile &tile)
+ {
+ ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
+
+ tile.sample = tile.start_sample + tile.num_samples;
+
+ denoising.functions.construct_transform = function_bind(
+ &CPUDevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.accumulate = function_bind(
+ &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
+ denoising.functions.divide_shadow = function_bind(
+ &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(
+ &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(
+ &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(
+ &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(
+ &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
+ denoising.functions.detect_outliers = function_bind(
+ &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+
+ denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
+ denoising.render_buffer.samples = tile.sample;
+ denoising.buffer.gpu_temporary_mem = false;
+
+ denoising.run_denoising(&tile);
+ }
+
+ void thread_render(DeviceTask &task)
+ {
+ if (task_pool.canceled()) {
+ if (task.need_finish_queue == false)
+ return;
+ }
+
+ /* allocate buffer for kernel globals */
+ device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
+ kgbuffer.alloc_to_device(1);
+
+ KernelGlobals *kg = new ((void *)kgbuffer.device_pointer)
+ KernelGlobals(thread_kernel_globals_init());
+
+ profiler.add_state(&kg->profiler);
+
+ CPUSplitKernel *split_kernel = NULL;
+ if (use_split_kernel) {
+ split_kernel = new CPUSplitKernel(this);
+ if (!split_kernel->load_kernels(requested_features)) {
+ thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
+ kgbuffer.free();
+ delete split_kernel;
+ return;
+ }
+ }
+
+ RenderTile tile;
+ DenoisingTask denoising(this, task);
+ denoising.profiler = &kg->profiler;
+
+ while (task.acquire_tile(this, tile)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ if (use_split_kernel) {
+ device_only_memory<uchar> void_buffer(this, "void_buffer");
+ split_kernel->path_trace(&task, tile, kgbuffer, void_buffer);
+ }
+ else {
+ path_trace(task, tile, kg);
+ }
+ }
+ else if (tile.task == RenderTile::DENOISE) {
+ denoise(denoising, tile);
+ task.update_progress(&tile, tile.w * tile.h);
+ }
+
+ task.release_tile(tile);
+
+ if (task_pool.canceled()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+
+ profiler.remove_state(&kg->profiler);
+
+ thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
+ kg->~KernelGlobals();
+ kgbuffer.free();
+ delete split_kernel;
+ }
+
+ void thread_film_convert(DeviceTask &task)
+ {
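+    /* Accumulated pixel values are scaled down by the sample count to get
+     * the displayable average. */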
+ float sample_scale = 1.0f / (task.sample + 1);
+
+ if (task.rgba_half) {
+ for (int y = task.y; y < task.y + task.h; y++)
+ for (int x = task.x; x < task.x + task.w; x++)
+ convert_to_half_float_kernel()(&kernel_globals,
+ (uchar4 *)task.rgba_half,
+ (float *)task.buffer,
+ sample_scale,
+ x,
+ y,
+ task.offset,
+ task.stride);
+ }
+ else {
+ for (int y = task.y; y < task.y + task.h; y++)
+ for (int x = task.x; x < task.x + task.w; x++)
+ convert_to_byte_kernel()(&kernel_globals,
+ (uchar4 *)task.rgba_byte,
+ (float *)task.buffer,
+ sample_scale,
+ x,
+ y,
+ task.offset,
+ task.stride);
+ }
+ }
+
+ void thread_shader(DeviceTask &task)
+ {
+ KernelGlobals kg = kernel_globals;
#ifdef WITH_OSL
- OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
+ OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
- for(int sample = 0; sample < task.num_samples; sample++) {
- for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
- shader_kernel()(&kg,
- (uint4*)task.shader_input,
- (float4*)task.shader_output,
- task.shader_eval_type,
- task.shader_filter,
- x,
- task.offset,
- sample);
-
- if(task.get_cancel() || task_pool.canceled())
- break;
-
- task.update_progress(NULL);
-
- }
+ for (int sample = 0; sample < task.num_samples; sample++) {
+ for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
+ shader_kernel()(&kg,
+ (uint4 *)task.shader_input,
+ (float4 *)task.shader_output,
+ task.shader_eval_type,
+ task.shader_filter,
+ x,
+ task.offset,
+ sample);
+
+ if (task.get_cancel() || task_pool.canceled())
+ break;
+
+ task.update_progress(NULL);
+ }
#ifdef WITH_OSL
- OSLShader::thread_free(&kg);
+ OSLShader::thread_free(&kg);
#endif
- }
-
- int get_split_task_count(DeviceTask& task)
- {
- if(task.type == DeviceTask::SHADER)
- return task.get_subtask_count(info.cpu_threads, 256);
- else
- return task.get_subtask_count(info.cpu_threads);
- }
-
- void task_add(DeviceTask& task)
- {
- /* Load texture info. */
- load_texture_info();
-
- /* split task into smaller ones */
- list<DeviceTask> tasks;
-
- if(task.type == DeviceTask::SHADER)
- task.split(tasks, info.cpu_threads, 256);
- else
- task.split(tasks, info.cpu_threads);
-
- foreach(DeviceTask& task, tasks)
- task_pool.push(new CPUDeviceTask(this, task));
- }
-
- void task_wait()
- {
- task_pool.wait_work();
- }
-
- void task_cancel()
- {
- task_pool.cancel();
- }
-
-protected:
- inline KernelGlobals thread_kernel_globals_init()
- {
- KernelGlobals kg = kernel_globals;
- kg.transparent_shadow_intersections = NULL;
- const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
- sizeof(*kg.decoupled_volume_steps);
- for(int i = 0; i < decoupled_count; ++i) {
- kg.decoupled_volume_steps[i] = NULL;
- }
- kg.decoupled_volume_steps_index = 0;
- kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL;
+ }
+
+ int get_split_task_count(DeviceTask &task)
+ {
+ if (task.type == DeviceTask::SHADER)
+ return task.get_subtask_count(info.cpu_threads, 256);
+ else
+ return task.get_subtask_count(info.cpu_threads);
+ }
+
+ void task_add(DeviceTask &task)
+ {
+ /* Load texture info. */
+ load_texture_info();
+
+ /* split task into smaller ones */
+ list<DeviceTask> tasks;
+
+ if (task.type == DeviceTask::SHADER)
+ task.split(tasks, info.cpu_threads, 256);
+ else
+ task.split(tasks, info.cpu_threads);
+
+ foreach (DeviceTask &task, tasks)
+ task_pool.push(new CPUDeviceTask(this, task));
+ }
+
+ void task_wait()
+ {
+ task_pool.wait_work();
+ }
+
+ void task_cancel()
+ {
+ task_pool.cancel();
+ }
+
+ protected:
+ inline KernelGlobals thread_kernel_globals_init()
+ {
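+    /* Give each worker thread its own copy of the globals, with all
+     * per-thread scratch state cleared out. */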
+ KernelGlobals kg = kernel_globals;
+ kg.transparent_shadow_intersections = NULL;
+ const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
+ sizeof(*kg.decoupled_volume_steps);
+ for (int i = 0; i < decoupled_count; ++i) {
+ kg.decoupled_volume_steps[i] = NULL;
+ }
+ kg.decoupled_volume_steps_index = 0;
+ kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL;
#ifdef WITH_OSL
- OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
+ OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
- return kg;
- }
-
- inline void thread_kernel_globals_free(KernelGlobals *kg)
- {
- if(kg == NULL) {
- return;
- }
-
- if(kg->transparent_shadow_intersections != NULL) {
- free(kg->transparent_shadow_intersections);
- }
- const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
- sizeof(*kg->decoupled_volume_steps);
- for(int i = 0; i < decoupled_count; ++i) {
- if(kg->decoupled_volume_steps[i] != NULL) {
- free(kg->decoupled_volume_steps[i]);
- }
- }
+ return kg;
+ }
+
+ inline void thread_kernel_globals_free(KernelGlobals *kg)
+ {
+ if (kg == NULL) {
+ return;
+ }
+
+ if (kg->transparent_shadow_intersections != NULL) {
+ free(kg->transparent_shadow_intersections);
+ }
+ const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
+ sizeof(*kg->decoupled_volume_steps);
+ for (int i = 0; i < decoupled_count; ++i) {
+ if (kg->decoupled_volume_steps[i] != NULL) {
+ free(kg->decoupled_volume_steps[i]);
+ }
+ }
#ifdef WITH_OSL
- OSLShader::thread_free(kg);
+ OSLShader::thread_free(kg);
#endif
- }
+ }
- virtual bool load_kernels(const DeviceRequestedFeatures& requested_features_) {
- requested_features = requested_features_;
+ virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_)
+ {
+ requested_features = requested_features_;
- return true;
- }
+ return true;
+ }
};
/* split kernel */
class CPUSplitKernelFunction : public SplitKernelFunction {
-public:
- CPUDevice* device;
- void (*func)(KernelGlobals *kg, KernelData *data);
-
- CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {}
- ~CPUSplitKernelFunction() {}
-
- virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data)
- {
- if(!func) {
- return false;
- }
-
- KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
- kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
- for(int y = 0; y < dim.global_size[1]; y++) {
- for(int x = 0; x < dim.global_size[0]; x++) {
- kg->global_id = make_int2(x, y);
-
- func(kg, (KernelData*)data.device_pointer);
- }
- }
-
- return true;
- }
+ public:
+ CPUDevice *device;
+ void (*func)(KernelGlobals *kg, KernelData *data);
+
+ CPUSplitKernelFunction(CPUDevice *device) : device(device), func(NULL)
+ {
+ }
+ ~CPUSplitKernelFunction()
+ {
+ }
+
+ virtual bool enqueue(const KernelDimensions &dim,
+ device_memory &kernel_globals,
+ device_memory &data)
+ {
+ if (!func) {
+ return false;
+ }
+
+ KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
+ kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+
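+    /* Emulate a GPU dispatch by visiting every work-item id serially. */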
+ for (int y = 0; y < dim.global_size[1]; y++) {
+ for (int x = 0; x < dim.global_size[0]; x++) {
+ kg->global_id = make_int2(x, y);
+
+ func(kg, (KernelData *)data.device_pointer);
+ }
+ }
+
+ return true;
+ }
};
CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device)
{
}
-bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
- RenderTile& rtile,
+bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
+ RenderTile &rtile,
int num_global_elements,
- device_memory& kernel_globals,
- device_memory& data,
- device_memory& split_data,
- device_memory& ray_state,
- device_memory& queue_index,
- device_memory& use_queues_flags,
- device_memory& work_pool_wgs)
+ device_memory &kernel_globals,
+ device_memory &data,
+ device_memory &split_data,
+ device_memory &ray_state,
+ device_memory &queue_index,
+ device_memory &use_queues_flags,
+ device_memory &work_pool_wgs)
{
- KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
- kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
-
- for(int y = 0; y < dim.global_size[1]; y++) {
- for(int x = 0; x < dim.global_size[0]; x++) {
- kg->global_id = make_int2(x, y);
-
- device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer,
- (KernelData*)data.device_pointer,
- (void*)split_data.device_pointer,
- num_global_elements,
- (char*)ray_state.device_pointer,
- rtile.start_sample,
- rtile.start_sample + rtile.num_samples,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- (int*)queue_index.device_pointer,
- dim.global_size[0] * dim.global_size[1],
- (char*)use_queues_flags.device_pointer,
- (uint*)work_pool_wgs.device_pointer,
- rtile.num_samples,
- (float*)rtile.buffer);
- }
- }
-
- return true;
+ KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
+ kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
+
+ for (int y = 0; y < dim.global_size[1]; y++) {
+ for (int x = 0; x < dim.global_size[0]; x++) {
+ kg->global_id = make_int2(x, y);
+
+ device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer,
+ (KernelData *)data.device_pointer,
+ (void *)split_data.device_pointer,
+ num_global_elements,
+ (char *)ray_state.device_pointer,
+ rtile.start_sample,
+ rtile.start_sample + rtile.num_samples,
+ rtile.x,
+ rtile.y,
+ rtile.w,
+ rtile.h,
+ rtile.offset,
+ rtile.stride,
+ (int *)queue_index.device_pointer,
+ dim.global_size[0] * dim.global_size[1],
+ (char *)use_queues_flags.device_pointer,
+ (uint *)work_pool_wgs.device_pointer,
+ rtile.num_samples,
+ (float *)rtile.buffer);
+ }
+ }
+
+ return true;
}
-SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(const string& kernel_name,
- const DeviceRequestedFeatures&)
+SplitKernelFunction *CPUSplitKernel::get_split_kernel_function(const string &kernel_name,
+ const DeviceRequestedFeatures &)
{
- CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
+ CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
- kernel->func = device->split_kernels[kernel_name]();
- if(!kernel->func) {
- delete kernel;
- return NULL;
- }
+ kernel->func = device->split_kernels[kernel_name]();
+ if (!kernel->func) {
+ delete kernel;
+ return NULL;
+ }
- return kernel;
+ return kernel;
}
int2 CPUSplitKernel::split_kernel_local_size()
{
- return make_int2(1, 1);
+ return make_int2(1, 1);
}
-int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) {
- return make_int2(1, 1);
+int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/,
+ device_memory & /*data*/,
+ DeviceTask * /*task*/)
+{
+ return make_int2(1, 1);
}
-uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) {
- KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
+uint64_t CPUSplitKernel::state_buffer_size(device_memory &kernel_globals,
+ device_memory & /*data*/,
+ size_t num_threads)
+{
+ KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
- return split_data_buffer_size(kg, num_threads);
+ return split_data_buffer_size(kg, num_threads);
}
-Device *device_cpu_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background)
+Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
{
- return new CPUDevice(info, stats, profiler, background);
+ return new CPUDevice(info, stats, profiler, background);
}
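An illustrative call sequence for this factory entry point (in the real code, Device::create() in device.cpp dispatches here for DEVICE_CPU; the local variables are hypothetical):

  vector<DeviceInfo> devices;
  device_cpu_info(devices); /* prepends the single CPU entry, see below */
  Stats stats;
  Profiler profiler;
  Device *cpu = device_cpu_create(devices[0], stats, profiler, /*background*/ true);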
-void device_cpu_info(vector<DeviceInfo>& devices)
+void device_cpu_info(vector<DeviceInfo> &devices)
{
- DeviceInfo info;
-
- info.type = DEVICE_CPU;
- info.description = system_cpu_brand_string();
- info.id = "CPU";
- info.num = 0;
- info.has_volume_decoupled = true;
- info.has_osl = true;
- info.has_half_images = true;
- info.has_profiling = true;
-
- devices.insert(devices.begin(), info);
+ DeviceInfo info;
+
+ info.type = DEVICE_CPU;
+ info.description = system_cpu_brand_string();
+ info.id = "CPU";
+ info.num = 0;
+ info.has_volume_decoupled = true;
+ info.has_osl = true;
+ info.has_half_images = true;
+ info.has_profiling = true;
+
+ devices.insert(devices.begin(), info);
}
string device_cpu_capabilities()
{
- string capabilities = "";
- capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
- capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
- capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
- capabilities += system_cpu_support_avx() ? "AVX " : "";
- capabilities += system_cpu_support_avx2() ? "AVX2" : "";
- if(capabilities[capabilities.size() - 1] == ' ')
- capabilities.resize(capabilities.size() - 1);
- return capabilities;
+ string capabilities = "";
+ capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
+ capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
+ capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
+ capabilities += system_cpu_support_avx() ? "AVX " : "";
+ capabilities += system_cpu_support_avx2() ? "AVX2" : "";
+  if (!capabilities.empty() && capabilities[capabilities.size() - 1] == ' ')
+ capabilities.resize(capabilities.size() - 1);
+ return capabilities;
}
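On a processor that reports every probed feature the returned string is "SSE2 SSE3 SSE41 AVX AVX2"; the final trim only applies when AVX2 is absent and the string therefore ends in a space:

  /* Hypothetical results of device_cpu_capabilities():
   *   all features present: "SSE2 SSE3 SSE41 AVX AVX2"
   *   no AVX2:              "SSE2 SSE3 SSE41 AVX"  (trailing space trimmed)
   *   SSE2 only:            "SSE2" */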
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 3aa6bce155e..68bc3bd4045 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -62,2144 +62,2242 @@ namespace {
const char *cuewErrorString(CUresult result)
{
- /* We can only give error code here without major code duplication, that
- * should be enough since dynamic loading is only being disabled by folks
- * who knows what they're doing anyway.
- *
- * NOTE: Avoid call from several threads.
- */
- static string error;
- error = string_printf("%d", result);
- return error.c_str();
+  /* We can only give the error code here without major code duplication;
+   * that should be enough, since dynamic loading is only disabled by folks
+   * who know what they're doing anyway.
+ *
+ * NOTE: Avoid call from several threads.
+ */
+ static string error;
+ error = string_printf("%d", result);
+ return error.c_str();
}
const char *cuewCompilerPath()
{
- return CYCLES_CUDA_NVCC_EXECUTABLE;
+ return CYCLES_CUDA_NVCC_EXECUTABLE;
}
int cuewCompilerVersion()
{
- return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
+ return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
}
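A worked example of the version arithmetic, assuming the cuda.h encoding CUDA_VERSION = 1000 * major + 10 * minor (10010 for CUDA 10.1):

  /* (10010 / 100) + (10010 % 100 / 10) == 100 + 1 == 101.
   * compile_check_compiler() below splits this back into
   * major = 101 / 10 == 10 and minor = 101 % 10 == 1. */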
-} /* namespace */
-#endif /* WITH_CUDA_DYNLOAD */
+} /* namespace */
+#endif /* WITH_CUDA_DYNLOAD */
class CUDADevice;
class CUDASplitKernel : public DeviceSplitKernel {
- CUDADevice *device;
-public:
- explicit CUDASplitKernel(CUDADevice *device);
-
- virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
- RenderTile& rtile,
- int num_global_elements,
- device_memory& kernel_globals,
- device_memory& kernel_data_,
- device_memory& split_data,
- device_memory& ray_state,
- device_memory& queue_index,
- device_memory& use_queues_flag,
- device_memory& work_pool_wgs);
-
- virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
- const DeviceRequestedFeatures&);
- virtual int2 split_kernel_local_size();
- virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+ CUDADevice *device;
+
+ public:
+ explicit CUDASplitKernel(CUDADevice *device);
+
+ virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
+
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
+ RenderTile &rtile,
+ int num_global_elements,
+ device_memory &kernel_globals,
+ device_memory &kernel_data_,
+ device_memory &split_data,
+ device_memory &ray_state,
+ device_memory &queue_index,
+ device_memory &use_queues_flag,
+ device_memory &work_pool_wgs);
+
+ virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
+ const DeviceRequestedFeatures &);
+ virtual int2 split_kernel_local_size();
+ virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
};
/* Utility to push/pop CUDA context. */
class CUDAContextScope {
-public:
- CUDAContextScope(CUDADevice *device);
- ~CUDAContextScope();
+ public:
+ CUDAContextScope(CUDADevice *device);
+ ~CUDAContextScope();
-private:
- CUDADevice *device;
+ private:
+ CUDADevice *device;
};
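The scope object balances the context stack RAII-style, so every early return in the methods below leaves the CUDA context popped. A minimal usage sketch:

  static void example(CUDADevice *device)
  {
    CUDAContextScope scope(device); /* pushes device->cuContext */
    /* ... driver API calls that need the context current ... */
  } /* destructor pops the context, even on early return */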
-class CUDADevice : public Device
-{
-public:
- DedicatedTaskPool task_pool;
- CUdevice cuDevice;
- CUcontext cuContext;
- CUmodule cuModule, cuFilterModule;
- size_t device_texture_headroom;
- size_t device_working_headroom;
- bool move_texture_to_host;
- size_t map_host_used;
- size_t map_host_limit;
- int can_map_host;
- int cuDevId;
- int cuDevArchitecture;
- bool first_error;
- CUDASplitKernel *split_kernel;
-
- struct CUDAMem {
- CUDAMem()
- : texobject(0), array(0), map_host_pointer(0), free_map_host(false) {}
-
- CUtexObject texobject;
- CUarray array;
- void *map_host_pointer;
- bool free_map_host;
- };
- typedef map<device_memory*, CUDAMem> CUDAMemMap;
- CUDAMemMap cuda_mem_map;
-
- struct PixelMem {
- GLuint cuPBO;
- CUgraphicsResource cuPBOresource;
- GLuint cuTexId;
- int w, h;
- };
- map<device_ptr, PixelMem> pixel_mem_map;
-
- /* Bindless Textures */
- device_vector<TextureInfo> texture_info;
- bool need_texture_info;
-
- CUdeviceptr cuda_device_ptr(device_ptr mem)
- {
- return (CUdeviceptr)mem;
- }
-
- static bool have_precompiled_kernels()
- {
- string cubins_path = path_get("lib");
- return path_exists(cubins_path);
- }
-
- virtual bool show_samples() const
- {
- /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
- return true;
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const {
- return BVH_LAYOUT_BVH2;
- }
-
-/*#ifdef NDEBUG
+class CUDADevice : public Device {
+ public:
+ DedicatedTaskPool task_pool;
+ CUdevice cuDevice;
+ CUcontext cuContext;
+ CUmodule cuModule, cuFilterModule;
+ size_t device_texture_headroom;
+ size_t device_working_headroom;
+ bool move_texture_to_host;
+ size_t map_host_used;
+ size_t map_host_limit;
+ int can_map_host;
+ int cuDevId;
+ int cuDevArchitecture;
+ bool first_error;
+ CUDASplitKernel *split_kernel;
+
+ struct CUDAMem {
+ CUDAMem() : texobject(0), array(0), map_host_pointer(0), free_map_host(false)
+ {
+ }
+
+ CUtexObject texobject;
+ CUarray array;
+ void *map_host_pointer;
+ bool free_map_host;
+ };
+ typedef map<device_memory *, CUDAMem> CUDAMemMap;
+ CUDAMemMap cuda_mem_map;
+
+ struct PixelMem {
+ GLuint cuPBO;
+ CUgraphicsResource cuPBOresource;
+ GLuint cuTexId;
+ int w, h;
+ };
+ map<device_ptr, PixelMem> pixel_mem_map;
+
+ /* Bindless Textures */
+ device_vector<TextureInfo> texture_info;
+ bool need_texture_info;
+
+ CUdeviceptr cuda_device_ptr(device_ptr mem)
+ {
+ return (CUdeviceptr)mem;
+ }
+
+ static bool have_precompiled_kernels()
+ {
+ string cubins_path = path_get("lib");
+ return path_exists(cubins_path);
+ }
+
+ virtual bool show_samples() const
+ {
+ /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
+ return true;
+ }
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const
+ {
+ return BVH_LAYOUT_BVH2;
+ }
+
+ /*#ifdef NDEBUG
#define cuda_abort()
#else
#define cuda_abort() abort()
#endif*/
- void cuda_error_documentation()
- {
- if(first_error) {
- fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
- fprintf(stderr, "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n");
- first_error = false;
- }
- }
+ void cuda_error_documentation()
+ {
+ if (first_error) {
+ fprintf(stderr,
+ "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
+ fprintf(stderr,
+ "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n");
+ first_error = false;
+ }
+ }
#define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
- \
- if(result != CUDA_SUCCESS) { \
- string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
- if(error_msg == "") \
- error_msg = message; \
- fprintf(stderr, "%s\n", message.c_str()); \
- /*cuda_abort();*/ \
- cuda_error_documentation(); \
- } \
- } (void) 0
-
- bool cuda_error_(CUresult result, const string& stmt)
- {
- if(result == CUDA_SUCCESS)
- return false;
-
- string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result));
- if(error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
- cuda_error_documentation();
- return true;
- }
+ { \
+ CUresult result = stmt; \
+\
+ if (result != CUDA_SUCCESS) { \
+ string message = string_printf( \
+ "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
+ if (error_msg == "") \
+ error_msg = message; \
+ fprintf(stderr, "%s\n", message.c_str()); \
+ /*cuda_abort();*/ \
+ cuda_error_documentation(); \
+ } \
+ } \
+ (void)0
+
+ bool cuda_error_(CUresult result, const string &stmt)
+ {
+ if (result == CUDA_SUCCESS)
+ return false;
+
+ string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result));
+ if (error_msg == "")
+ error_msg = message;
+ fprintf(stderr, "%s\n", message.c_str());
+ cuda_error_documentation();
+ return true;
+ }
#define cuda_error(stmt) cuda_error_(stmt, #stmt)
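cuda_assert() logs a failure and continues, while cuda_error() also returns whether the call failed so callers can bail out. Both idioms as they appear in this file:

  /* Sketch of the two error-handling idioms defined above. */
  if (cuda_error(cuInit(0)))
    return;                        /* failure stored in error_msg, caller aborts */
  cuda_assert(cuCtxSynchronize()); /* failure logged, execution continues */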
- void cuda_error_message(const string& message)
- {
- if(error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
- cuda_error_documentation();
- }
-
- CUDADevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_),
- texture_info(this, "__texture_info", MEM_TEXTURE)
- {
- first_error = true;
- background = background_;
-
- cuDevId = info.num;
- cuDevice = 0;
- cuContext = 0;
-
- cuModule = 0;
- cuFilterModule = 0;
-
- split_kernel = NULL;
-
- need_texture_info = false;
-
- device_texture_headroom = 0;
- device_working_headroom = 0;
- move_texture_to_host = false;
- map_host_limit = 0;
- map_host_used = 0;
- can_map_host = 0;
-
- /* Intialize CUDA. */
- if(cuda_error(cuInit(0)))
- return;
-
- /* Setup device and context. */
- if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
- return;
-
- /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
- * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
- * so we can predict which memory to map to host. */
- cuda_assert(cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
-
- unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
- if(can_map_host) {
- ctx_flags |= CU_CTX_MAP_HOST;
- init_host_memory();
- }
-
- /* Create context. */
- CUresult result;
-
- if(background) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- }
- else {
- result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
-
- if(result != CUDA_SUCCESS) {
- result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
- background = true;
- }
- }
-
- if(cuda_error_(result, "cuCtxCreate"))
- return;
-
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
- cuDevArchitecture = major*100 + minor*10;
-
- /* Pop context set by cuCtxCreate. */
- cuCtxPopCurrent(NULL);
- }
-
- ~CUDADevice()
- {
- task_pool.stop();
-
- delete split_kernel;
-
- texture_info.free();
-
- cuda_assert(cuCtxDestroy(cuContext));
- }
-
- bool support_device(const DeviceRequestedFeatures& /*requested_features*/)
- {
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* We only support sm_30 and above */
- if(major < 3) {
- cuda_error_message(string_printf("CUDA device supported only with compute capability 3.0 or up, found %d.%d.", major, minor));
- return false;
- }
-
- return true;
- }
-
- bool use_adaptive_compilation()
- {
- return DebugFlags().cuda.adaptive_compile;
- }
-
- bool use_split_kernel()
- {
- return DebugFlags().cuda.split_kernel;
- }
-
- /* Common NVCC flags which stays the same regardless of shading model,
- * kernel sources md5 and only depends on compiler or compilation settings.
- */
- string compile_kernel_get_common_cflags(
- const DeviceRequestedFeatures& requested_features,
- bool filter=false, bool split=false)
- {
- const int machine = system_cpu_bits();
- const string source_path = path_get("source");
- const string include_path = source_path;
- string cflags = string_printf("-m%d "
- "--ptxas-options=\"-v\" "
- "--use_fast_math "
- "-DNVCC "
- "-I\"%s\"",
- machine,
- include_path.c_str());
- if(!filter && use_adaptive_compilation()) {
- cflags += " " + requested_features.get_build_options();
- }
- const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
- if(extra_cflags) {
- cflags += string(" ") + string(extra_cflags);
- }
+ void cuda_error_message(const string &message)
+ {
+ if (error_msg == "")
+ error_msg = message;
+ fprintf(stderr, "%s\n", message.c_str());
+ cuda_error_documentation();
+ }
+
+ CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
+ : Device(info, stats, profiler, background_),
+ texture_info(this, "__texture_info", MEM_TEXTURE)
+ {
+ first_error = true;
+ background = background_;
+
+ cuDevId = info.num;
+ cuDevice = 0;
+ cuContext = 0;
+
+ cuModule = 0;
+ cuFilterModule = 0;
+
+ split_kernel = NULL;
+
+ need_texture_info = false;
+
+ device_texture_headroom = 0;
+ device_working_headroom = 0;
+ move_texture_to_host = false;
+ map_host_limit = 0;
+ map_host_used = 0;
+ can_map_host = 0;
+
+    /* Initialize CUDA. */
+ if (cuda_error(cuInit(0)))
+ return;
+
+ /* Setup device and context. */
+ if (cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
+ return;
+
+ /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+ * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+ * so we can predict which memory to map to host. */
+ cuda_assert(
+ cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+ unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+ if (can_map_host) {
+ ctx_flags |= CU_CTX_MAP_HOST;
+ init_host_memory();
+ }
+
+ /* Create context. */
+ CUresult result;
+
+ if (background) {
+ result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+ }
+ else {
+ result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
+
+ if (result != CUDA_SUCCESS) {
+ result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+ background = true;
+ }
+ }
+
+ if (cuda_error_(result, "cuCtxCreate"))
+ return;
+
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+ cuDevArchitecture = major * 100 + minor * 10;
+
+ /* Pop context set by cuCtxCreate. */
+ cuCtxPopCurrent(NULL);
+ }
+
+ ~CUDADevice()
+ {
+ task_pool.stop();
+
+ delete split_kernel;
+
+ texture_info.free();
+
+ cuda_assert(cuCtxDestroy(cuContext));
+ }
+
+ bool support_device(const DeviceRequestedFeatures & /*requested_features*/)
+ {
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* We only support sm_30 and above */
+ if (major < 3) {
+ cuda_error_message(string_printf(
+ "CUDA device supported only with compute capability 3.0 or up, found %d.%d.",
+ major,
+ minor));
+ return false;
+ }
+
+ return true;
+ }
+
+ bool use_adaptive_compilation()
+ {
+ return DebugFlags().cuda.adaptive_compile;
+ }
+
+ bool use_split_kernel()
+ {
+ return DebugFlags().cuda.split_kernel;
+ }
+
+  /* Common NVCC flags which stay the same regardless of shading model or
+   * kernel sources md5, and depend only on compiler or compilation settings.
+ */
+ string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
+ bool filter = false,
+ bool split = false)
+ {
+ const int machine = system_cpu_bits();
+ const string source_path = path_get("source");
+ const string include_path = source_path;
+ string cflags = string_printf(
+ "-m%d "
+ "--ptxas-options=\"-v\" "
+ "--use_fast_math "
+ "-DNVCC "
+ "-I\"%s\"",
+ machine,
+ include_path.c_str());
+ if (!filter && use_adaptive_compilation()) {
+ cflags += " " + requested_features.get_build_options();
+ }
+ const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
+ if (extra_cflags) {
+ cflags += string(" ") + string(extra_cflags);
+ }
#ifdef WITH_CYCLES_DEBUG
- cflags += " -D__KERNEL_DEBUG__";
+ cflags += " -D__KERNEL_DEBUG__";
#endif
- if(split) {
- cflags += " -D__SPLIT__";
- }
-
- return cflags;
- }
-
- bool compile_check_compiler() {
- const char *nvcc = cuewCompilerPath();
- if(nvcc == NULL) {
- cuda_error_message("CUDA nvcc compiler not found. "
- "Install CUDA toolkit in default location.");
- return false;
- }
- const int cuda_version = cuewCompilerVersion();
- VLOG(1) << "Found nvcc " << nvcc
- << ", CUDA version " << cuda_version
- << ".";
- const int major = cuda_version / 10, minor = cuda_version % 10;
- if(cuda_version == 0) {
- cuda_error_message("CUDA nvcc compiler version could not be parsed.");
- return false;
- }
- if(cuda_version < 80) {
- printf("Unsupported CUDA version %d.%d detected, "
- "you need CUDA 8.0 or newer.\n",
- major, minor);
- return false;
- }
- else if(cuda_version != 101) {
- printf("CUDA version %d.%d detected, build may succeed but only "
- "CUDA 10.1 is officially supported.\n",
- major, minor);
- }
- return true;
- }
-
- string compile_kernel(const DeviceRequestedFeatures& requested_features,
- bool filter=false, bool split=false)
- {
- const char *name, *source;
- if(filter) {
- name = "filter";
- source = "filter.cu";
- }
- else if(split) {
- name = "kernel_split";
- source = "kernel_split.cu";
- }
- else {
- name = "kernel";
- source = "kernel.cu";
- }
- /* Compute cubin name. */
- int major, minor;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
- cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
- /* Attempt to use kernel provided with Blender. */
- if(!use_adaptive_compilation()) {
- const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
- name, major, minor));
- VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
- if(path_exists(cubin)) {
- VLOG(1) << "Using precompiled kernel.";
- return cubin;
- }
- }
-
- const string common_cflags =
- compile_kernel_get_common_cflags(requested_features, filter, split);
-
- /* Try to use locally compiled kernel. */
- const string source_path = path_get("source");
- const string kernel_md5 = path_files_md5_hash(source_path);
-
- /* We include cflags into md5 so changing cuda toolkit or changing other
- * compiler command line arguments makes sure cubin gets re-built.
- */
- const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
-
- const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
- name, major, minor,
- cubin_md5.c_str());
- const string cubin = path_cache_get(path_join("kernels", cubin_file));
- VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
- if(path_exists(cubin)) {
- VLOG(1) << "Using locally compiled kernel.";
- return cubin;
- }
+ if (split) {
+ cflags += " -D__SPLIT__";
+ }
+
+ return cflags;
+ }
+
+ bool compile_check_compiler()
+ {
+ const char *nvcc = cuewCompilerPath();
+ if (nvcc == NULL) {
+ cuda_error_message(
+ "CUDA nvcc compiler not found. "
+ "Install CUDA toolkit in default location.");
+ return false;
+ }
+ const int cuda_version = cuewCompilerVersion();
+ VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << cuda_version << ".";
+ const int major = cuda_version / 10, minor = cuda_version % 10;
+ if (cuda_version == 0) {
+ cuda_error_message("CUDA nvcc compiler version could not be parsed.");
+ return false;
+ }
+ if (cuda_version < 80) {
+ printf(
+ "Unsupported CUDA version %d.%d detected, "
+ "you need CUDA 8.0 or newer.\n",
+ major,
+ minor);
+ return false;
+ }
+ else if (cuda_version != 101) {
+ printf(
+ "CUDA version %d.%d detected, build may succeed but only "
+ "CUDA 10.1 is officially supported.\n",
+ major,
+ minor);
+ }
+ return true;
+ }
+
+ string compile_kernel(const DeviceRequestedFeatures &requested_features,
+ bool filter = false,
+ bool split = false)
+ {
+ const char *name, *source;
+ if (filter) {
+ name = "filter";
+ source = "filter.cu";
+ }
+ else if (split) {
+ name = "kernel_split";
+ source = "kernel_split.cu";
+ }
+ else {
+ name = "kernel";
+ source = "kernel.cu";
+ }
+ /* Compute cubin name. */
+ int major, minor;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+ cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+ /* Attempt to use kernel provided with Blender. */
+ if (!use_adaptive_compilation()) {
+ const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return cubin;
+ }
+ }
+
+ const string common_cflags = compile_kernel_get_common_cflags(
+ requested_features, filter, split);
+
+ /* Try to use locally compiled kernel. */
+ const string source_path = path_get("source");
+ const string kernel_md5 = path_files_md5_hash(source_path);
+
+    /* We include cflags in the md5, so changing the CUDA toolkit or other
+     * compiler command line arguments makes sure the cubin gets re-built.
+ */
+ const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
+
+ const string cubin_file = string_printf(
+ "cycles_%s_sm%d%d_%s.cubin", name, major, minor, cubin_md5.c_str());
+ const string cubin = path_cache_get(path_join("kernels", cubin_file));
+ VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
+ if (path_exists(cubin)) {
+ VLOG(1) << "Using locally compiled kernel.";
+ return cubin;
+ }
#ifdef _WIN32
- if(have_precompiled_kernels()) {
- if(major < 3) {
- cuda_error_message(string_printf(
- "CUDA device requires compute capability 3.0 or up, "
- "found %d.%d. Your GPU is not supported.",
- major, minor));
- }
- else {
- cuda_error_message(string_printf(
- "CUDA binary kernel for this graphics card compute "
- "capability (%d.%d) not found.",
- major, minor));
- }
- return "";
- }
+ if (have_precompiled_kernels()) {
+ if (major < 3) {
+ cuda_error_message(
+ string_printf("CUDA device requires compute capability 3.0 or up, "
+ "found %d.%d. Your GPU is not supported.",
+ major,
+ minor));
+ }
+ else {
+ cuda_error_message(
+ string_printf("CUDA binary kernel for this graphics card compute "
+ "capability (%d.%d) not found.",
+ major,
+ minor));
+ }
+ return "";
+ }
#endif
- /* Compile. */
- if(!compile_check_compiler()) {
- return "";
- }
- const char *nvcc = cuewCompilerPath();
- const string kernel = path_join(
- path_join(source_path, "kernel"),
- path_join("kernels",
- path_join("cuda", source)));
- double starttime = time_dt();
- printf("Compiling CUDA kernel ...\n");
-
- path_create_directories(cubin);
-
- string command = string_printf("\"%s\" "
- "-arch=sm_%d%d "
- "--cubin \"%s\" "
- "-o \"%s\" "
- "%s ",
- nvcc,
- major, minor,
- kernel.c_str(),
- cubin.c_str(),
- common_cflags.c_str());
-
- printf("%s\n", command.c_str());
-
- if(system(command.c_str()) == -1) {
- cuda_error_message("Failed to execute compilation command, "
- "see console for details.");
- return "";
- }
-
- /* Verify if compilation succeeded */
- if(!path_exists(cubin)) {
- cuda_error_message("CUDA kernel compilation failed, "
- "see console for details.");
- return "";
- }
-
- printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
-
- return cubin;
- }
-
- bool load_kernels(const DeviceRequestedFeatures& requested_features)
- {
- /* TODO(sergey): Support kernels re-load for CUDA devices.
- *
- * Currently re-loading kernel will invalidate memory pointers,
- * causing problems in cuCtxSynchronize.
- */
- if(cuFilterModule && cuModule) {
- VLOG(1) << "Skipping kernel reload, not currently supported.";
- return true;
- }
-
- /* check if cuda init succeeded */
- if(cuContext == 0)
- return false;
-
- /* check if GPU is supported */
- if(!support_device(requested_features))
- return false;
-
- /* get kernel */
- string cubin = compile_kernel(requested_features, false, use_split_kernel());
- if(cubin == "")
- return false;
-
- string filter_cubin = compile_kernel(requested_features, true, false);
- if(filter_cubin == "")
- return false;
-
- /* open module */
- CUDAContextScope scope(this);
-
- string cubin_data;
- CUresult result;
-
- if(path_read_text(cubin, cubin_data))
- result = cuModuleLoadData(&cuModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if(cuda_error_(result, "cuModuleLoad"))
- cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
-
- if(path_read_text(filter_cubin, cubin_data))
- result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
- else
- result = CUDA_ERROR_FILE_NOT_FOUND;
-
- if(cuda_error_(result, "cuModuleLoad"))
- cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
-
- if(result == CUDA_SUCCESS) {
- reserve_local_memory(requested_features);
- }
-
- return (result == CUDA_SUCCESS);
- }
-
- void reserve_local_memory(const DeviceRequestedFeatures& requested_features)
- {
- if(use_split_kernel()) {
- /* Split kernel mostly uses global memory and adaptive compilation,
- * difficult to predict how much is needed currently. */
- return;
- }
-
- /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
- * needed for kernel launches, so that we can reliably figure out when
- * to allocate scene data in mapped host memory. */
- CUDAContextScope scope(this);
-
- size_t total = 0, free_before = 0, free_after = 0;
- cuMemGetInfo(&free_before, &total);
-
- /* Get kernel function. */
- CUfunction cuPathTrace;
-
- if(requested_features.use_integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
-
- int min_blocks, num_threads_per_block;
- cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
-
- /* Launch kernel, using just 1 block appears sufficient to reserve
- * memory for all multiprocessors. It would be good to do this in
- * parallel for the multi GPU case still to make it faster. */
- CUdeviceptr d_work_tiles = 0;
- uint total_work_size = 0;
-
- void *args[] = {&d_work_tiles,
- &total_work_size};
-
- cuda_assert(cuLaunchKernel(cuPathTrace,
- 1, 1, 1,
- num_threads_per_block, 1, 1,
- 0, 0, args, 0));
-
- cuda_assert(cuCtxSynchronize());
-
- cuMemGetInfo(&free_after, &total);
- VLOG(1) << "Local memory reserved "
- << string_human_readable_number(free_before - free_after) << " bytes. ("
- << string_human_readable_size(free_before - free_after) << ")";
+ /* Compile. */
+ if (!compile_check_compiler()) {
+ return "";
+ }
+ const char *nvcc = cuewCompilerPath();
+ const string kernel = path_join(path_join(source_path, "kernel"),
+ path_join("kernels", path_join("cuda", source)));
+ double starttime = time_dt();
+ printf("Compiling CUDA kernel ...\n");
+
+ path_create_directories(cubin);
+
+ string command = string_printf(
+ "\"%s\" "
+ "-arch=sm_%d%d "
+ "--cubin \"%s\" "
+ "-o \"%s\" "
+ "%s ",
+ nvcc,
+ major,
+ minor,
+ kernel.c_str(),
+ cubin.c_str(),
+ common_cflags.c_str());
+
+ printf("%s\n", command.c_str());
+
+ if (system(command.c_str()) == -1) {
+ cuda_error_message(
+ "Failed to execute compilation command, "
+ "see console for details.");
+ return "";
+ }
+
+    /* Verify that compilation succeeded. */
+ if (!path_exists(cubin)) {
+ cuda_error_message(
+ "CUDA kernel compilation failed, "
+ "see console for details.");
+ return "";
+ }
+
+ printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+ return cubin;
+ }
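For illustration, with an sm_61 device and default flags the command assembled above comes out roughly as follows (paths and the md5 suffix are hypothetical):

  "nvcc" -arch=sm_61 --cubin "<source>/kernel/kernels/cuda/kernel.cu" \
      -o "<cache>/kernels/cycles_kernel_sm61_<md5>.cubin" \
      -m64 --ptxas-options="-v" --use_fast_math -DNVCC -I"<source>"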
+
+ bool load_kernels(const DeviceRequestedFeatures &requested_features)
+ {
+ /* TODO(sergey): Support kernels re-load for CUDA devices.
+ *
+     * Currently re-loading the kernel will invalidate memory pointers,
+ * causing problems in cuCtxSynchronize.
+ */
+ if (cuFilterModule && cuModule) {
+ VLOG(1) << "Skipping kernel reload, not currently supported.";
+ return true;
+ }
+
+ /* check if cuda init succeeded */
+ if (cuContext == 0)
+ return false;
+
+ /* check if GPU is supported */
+ if (!support_device(requested_features))
+ return false;
+
+ /* get kernel */
+ string cubin = compile_kernel(requested_features, false, use_split_kernel());
+ if (cubin == "")
+ return false;
+
+ string filter_cubin = compile_kernel(requested_features, true, false);
+ if (filter_cubin == "")
+ return false;
+
+ /* open module */
+ CUDAContextScope scope(this);
+
+ string cubin_data;
+ CUresult result;
+
+ if (path_read_text(cubin, cubin_data))
+ result = cuModuleLoadData(&cuModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if (cuda_error_(result, "cuModuleLoad"))
+ cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
+
+ if (path_read_text(filter_cubin, cubin_data))
+ result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+ else
+ result = CUDA_ERROR_FILE_NOT_FOUND;
+
+ if (cuda_error_(result, "cuModuleLoad"))
+ cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
+
+ if (result == CUDA_SUCCESS) {
+ reserve_local_memory(requested_features);
+ }
+
+ return (result == CUDA_SUCCESS);
+ }
+
+ void reserve_local_memory(const DeviceRequestedFeatures &requested_features)
+ {
+ if (use_split_kernel()) {
+      /* The split kernel mostly uses global memory and adaptive compilation,
+       * so it is difficult to predict how much is needed currently. */
+ return;
+ }
+
+ /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
+ * needed for kernel launches, so that we can reliably figure out when
+ * to allocate scene data in mapped host memory. */
+ CUDAContextScope scope(this);
+
+ size_t total = 0, free_before = 0, free_after = 0;
+ cuMemGetInfo(&free_before, &total);
+
+ /* Get kernel function. */
+ CUfunction cuPathTrace;
+
+ if (requested_features.use_integrator_branched) {
+ cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+ }
+
+ cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+
+ int min_blocks, num_threads_per_block;
+ cuda_assert(cuOccupancyMaxPotentialBlockSize(
+ &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+
+ /* Launch kernel, using just 1 block appears sufficient to reserve
+ * memory for all multiprocessors. It would be good to do this in
+ * parallel for the multi GPU case still to make it faster. */
+ CUdeviceptr d_work_tiles = 0;
+ uint total_work_size = 0;
+
+ void *args[] = {&d_work_tiles, &total_work_size};
+
+ cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ cuMemGetInfo(&free_after, &total);
+ VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+ << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
#if 0
- /* For testing mapped host memory, fill up device memory. */
- const size_t keep_mb = 1024;
-
- while(free_after > keep_mb * 1024 * 1024LL) {
- CUdeviceptr tmp;
- cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
- cuMemGetInfo(&free_after, &total);
- }
+ /* For testing mapped host memory, fill up device memory. */
+ const size_t keep_mb = 1024;
+
+ while(free_after > keep_mb * 1024 * 1024LL) {
+ CUdeviceptr tmp;
+ cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+ cuMemGetInfo(&free_after, &total);
+ }
#endif
- }
-
- void init_host_memory()
- {
- /* Limit amount of host mapped memory, because allocating too much can
- * cause system instability. Leave at least half or 4 GB of system
- * memory free, whichever is smaller. */
- size_t default_limit = 4 * 1024 * 1024 * 1024LL;
- size_t system_ram = system_physical_ram();
-
- if(system_ram > 0) {
- if(system_ram / 2 > default_limit) {
- map_host_limit = system_ram - default_limit;
- }
- else {
- map_host_limit = system_ram / 2;
- }
- }
- else {
- VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
- map_host_limit = 0;
- }
-
- /* Amount of device memory to keep is free after texture memory
- * and working memory allocations respectively. We set the working
- * memory limit headroom lower so that some space is left after all
- * texture memory allocations. */
- device_working_headroom = 32 * 1024 * 1024LL; // 32MB
- device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
-
- VLOG(1) << "Mapped host memory limit set to "
- << string_human_readable_number(map_host_limit) << " bytes. ("
- << string_human_readable_size(map_host_limit) << ")";
- }
-
- void load_texture_info()
- {
- if(need_texture_info) {
- texture_info.copy_to_device();
- need_texture_info = false;
- }
- }
-
- void move_textures_to_host(size_t size, bool for_texture)
- {
- /* Signal to reallocate textures in host memory only. */
- move_texture_to_host = true;
-
- while(size > 0) {
- /* Find suitable memory allocation to move. */
- device_memory *max_mem = NULL;
- size_t max_size = 0;
- bool max_is_image = false;
-
- foreach(CUDAMemMap::value_type& pair, cuda_mem_map) {
- device_memory& mem = *pair.first;
- CUDAMem *cmem = &pair.second;
-
- bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- /* Can't move this type of memory. */
- if(!is_texture || cmem->array) {
- continue;
- }
-
- /* Already in host memory. */
- if(cmem->map_host_pointer) {
- continue;
- }
-
- /* For other textures, only move image textures. */
- if(for_texture && !is_image) {
- continue;
- }
-
- /* Try to move largest allocation, prefer moving images. */
- if(is_image > max_is_image ||
- (is_image == max_is_image && mem.device_size > max_size)) {
- max_is_image = is_image;
- max_size = mem.device_size;
- max_mem = &mem;
- }
- }
-
- /* Move to host memory. This part is mutex protected since
- * multiple CUDA devices could be moving the memory. The
- * first one will do it, and the rest will adopt the pointer. */
- if(max_mem) {
- VLOG(1) << "Move memory from device to host: " << max_mem->name;
-
- static thread_mutex move_mutex;
- thread_scoped_lock lock(move_mutex);
-
- /* Preserve the original device pointer, in case of multi device
- * we can't change it because the pointer mapping would break. */
- device_ptr prev_pointer = max_mem->device_pointer;
- size_t prev_size = max_mem->device_size;
-
- tex_free(*max_mem);
- tex_alloc(*max_mem);
- size = (max_size >= size)? 0: size - max_size;
-
- max_mem->device_pointer = prev_pointer;
- max_mem->device_size = prev_size;
- }
- else {
- break;
- }
- }
-
- /* Update texture info array with new pointers. */
- load_texture_info();
-
- move_texture_to_host = false;
- }
-
- CUDAMem *generic_alloc(device_memory& mem, size_t pitch_padding = 0)
- {
- CUDAContextScope scope(this);
-
- CUdeviceptr device_pointer = 0;
- size_t size = mem.memory_size() + pitch_padding;
-
- CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
- const char *status = "";
-
- /* First try allocating in device memory, respecting headroom. We make
- * an exception for texture info. It is small and frequently accessed,
- * so treat it as working memory.
- *
- * If there is not enough room for working memory, we will try to move
- * textures to host memory, assuming the performance impact would have
- * been worse for working memory. */
- bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
- bool is_image = is_texture && (mem.data_height > 1);
-
- size_t headroom = (is_texture)? device_texture_headroom:
- device_working_headroom;
-
- size_t total = 0, free = 0;
- cuMemGetInfo(&free, &total);
-
- /* Move textures to host memory if needed. */
- if(!move_texture_to_host && !is_image && (size + headroom) >= free) {
- move_textures_to_host(size + headroom - free, is_texture);
- cuMemGetInfo(&free, &total);
- }
-
- /* Allocate in device memory. */
- if(!move_texture_to_host && (size + headroom) < free) {
- mem_alloc_result = cuMemAlloc(&device_pointer, size);
- if(mem_alloc_result == CUDA_SUCCESS) {
- status = " in device memory";
- }
- }
-
- /* Fall back to mapped host memory if needed and possible. */
- void *map_host_pointer = 0;
- bool free_map_host = false;
-
- if(mem_alloc_result != CUDA_SUCCESS && can_map_host &&
- map_host_used + size < map_host_limit) {
- if(mem.shared_pointer) {
- /* Another device already allocated host memory. */
- mem_alloc_result = CUDA_SUCCESS;
- map_host_pointer = mem.shared_pointer;
- }
- else {
- /* Allocate host memory ourselves. */
- mem_alloc_result = cuMemHostAlloc(&map_host_pointer, size,
- CU_MEMHOSTALLOC_DEVICEMAP |
- CU_MEMHOSTALLOC_WRITECOMBINED);
- mem.shared_pointer = map_host_pointer;
- free_map_host = true;
- }
-
- if(mem_alloc_result == CUDA_SUCCESS) {
- cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0));
- map_host_used += size;
- status = " in host memory";
-
- /* Replace host pointer with our host allocation. Only works if
- * CUDA memory layout is the same and has no pitch padding. Also
- * does not work if we move textures to host during a render,
- * since other devices might be using the memory. */
- if(!move_texture_to_host && pitch_padding == 0 &&
- mem.host_pointer && mem.host_pointer != mem.shared_pointer) {
- memcpy(mem.shared_pointer, mem.host_pointer, size);
- mem.host_free();
- mem.host_pointer = mem.shared_pointer;
- }
- }
- else {
- status = " failed, out of host memory";
- }
- }
- else if(mem_alloc_result != CUDA_SUCCESS) {
- status = " failed, out of device and host memory";
- }
-
- if(mem_alloc_result != CUDA_SUCCESS) {
- cuda_assert(mem_alloc_result);
- }
-
- if(mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")"
- << status;
- }
-
- mem.device_pointer = (device_ptr)device_pointer;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- if(!mem.device_pointer) {
- return NULL;
- }
-
- /* Insert into map of allocations. */
- CUDAMem *cmem = &cuda_mem_map[&mem];
- cmem->map_host_pointer = map_host_pointer;
- cmem->free_map_host = free_map_host;
- return cmem;
- }
-
- void generic_copy_to(device_memory& mem)
- {
- if(mem.host_pointer && mem.device_pointer) {
- CUDAContextScope scope(this);
-
- if(mem.host_pointer != mem.shared_pointer) {
- cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer),
- mem.host_pointer,
- mem.memory_size()));
- }
- }
- }
-
- void generic_free(device_memory& mem)
- {
- if(mem.device_pointer) {
- CUDAContextScope scope(this);
- const CUDAMem& cmem = cuda_mem_map[&mem];
-
- if(cmem.map_host_pointer) {
- /* Free host memory. */
- if(cmem.free_map_host) {
- cuMemFreeHost(cmem.map_host_pointer);
- if(mem.host_pointer == mem.shared_pointer) {
- mem.host_pointer = 0;
- }
- mem.shared_pointer = 0;
- }
-
- map_host_used -= mem.device_size;
- }
- else {
- /* Free device memory. */
- cuMemFree(mem.device_pointer);
- }
-
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- }
-
- void mem_alloc(device_memory& mem)
- {
- if(mem.type == MEM_PIXELS && !background) {
- pixels_alloc(mem);
- }
- else if(mem.type == MEM_TEXTURE) {
- assert(!"mem_alloc not supported for textures.");
- }
- else {
- generic_alloc(mem);
- }
- }
-
- void mem_copy_to(device_memory& mem)
- {
- if(mem.type == MEM_PIXELS) {
- assert(!"mem_copy_to not supported for pixels.");
- }
- else if(mem.type == MEM_TEXTURE) {
- tex_free(mem);
- tex_alloc(mem);
- }
- else {
- if(!mem.device_pointer) {
- generic_alloc(mem);
- }
-
- generic_copy_to(mem);
- }
- }
-
- void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
- {
- if(mem.type == MEM_PIXELS && !background) {
- pixels_copy_from(mem, y, w, h);
- }
- else if(mem.type == MEM_TEXTURE) {
- assert(!"mem_copy_from not supported for textures.");
- }
- else {
- CUDAContextScope scope(this);
- size_t offset = elem*y*w;
- size_t size = elem*w*h;
-
- if(mem.host_pointer && mem.device_pointer) {
- cuda_assert(cuMemcpyDtoH((uchar*)mem.host_pointer + offset,
- (CUdeviceptr)(mem.device_pointer + offset), size));
- }
- else if(mem.host_pointer) {
- memset((char*)mem.host_pointer + offset, 0, size);
- }
- }
- }
-
- void mem_zero(device_memory& mem)
- {
- if(!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if(mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
-
- if(mem.device_pointer &&
- (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) {
- CUDAContextScope scope(this);
- cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
- }
- }
-
- void mem_free(device_memory& mem)
- {
- if(mem.type == MEM_PIXELS && !background) {
- pixels_free(mem);
- }
- else if(mem.type == MEM_TEXTURE) {
- tex_free(mem);
- }
- else {
- generic_free(mem);
- }
- }
-
- virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/)
- {
- return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
- }
-
- void const_copy_to(const char *name, void *host, size_t size)
- {
- CUDAContextScope scope(this);
- CUdeviceptr mem;
- size_t bytes;
-
- cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
- //assert(bytes == size);
- cuda_assert(cuMemcpyHtoD(mem, host, size));
- }
-
- void tex_alloc(device_memory& mem)
- {
- CUDAContextScope scope(this);
-
- /* General variables for both architectures */
- string bind_name = mem.name;
- size_t dsize = datatype_size(mem.data_type);
- size_t size = mem.memory_size();
-
- CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
- switch(mem.extension) {
- case EXTENSION_REPEAT:
- address_mode = CU_TR_ADDRESS_MODE_WRAP;
- break;
- case EXTENSION_EXTEND:
- address_mode = CU_TR_ADDRESS_MODE_CLAMP;
- break;
- case EXTENSION_CLIP:
- address_mode = CU_TR_ADDRESS_MODE_BORDER;
- break;
- default:
- assert(0);
- break;
- }
-
- CUfilter_mode filter_mode;
- if(mem.interpolation == INTERPOLATION_CLOSEST) {
- filter_mode = CU_TR_FILTER_MODE_POINT;
- }
- else {
- filter_mode = CU_TR_FILTER_MODE_LINEAR;
- }
-
- /* Data Storage */
- if(mem.interpolation == INTERPOLATION_NONE) {
- generic_alloc(mem);
- generic_copy_to(mem);
-
- CUdeviceptr cumem;
- size_t cubytes;
-
- cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
-
- if(cubytes == 8) {
- /* 64 bit device pointer */
- uint64_t ptr = mem.device_pointer;
- cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
- }
- else {
- /* 32 bit device pointer */
- uint32_t ptr = (uint32_t)mem.device_pointer;
- cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
- }
- return;
- }
-
- /* Image Texture Storage */
- CUarray_format_enum format;
- switch(mem.data_type) {
- case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
- case TYPE_UINT16: format = CU_AD_FORMAT_UNSIGNED_INT16; break;
- case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
- case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
- case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
- case TYPE_HALF: format = CU_AD_FORMAT_HALF; break;
- default: assert(0); return;
- }
-
- CUDAMem *cmem = NULL;
- CUarray array_3d = NULL;
- size_t src_pitch = mem.data_width * dsize * mem.data_elements;
- size_t dst_pitch = src_pitch;
-
- if(mem.data_depth > 1) {
- /* 3D texture using array, there is no API for linear memory. */
- CUDA_ARRAY3D_DESCRIPTOR desc;
-
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Depth = mem.data_depth;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
- desc.Flags = 0;
-
- VLOG(1) << "Array 3D allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- cuda_assert(cuArray3DCreate(&array_3d, &desc));
-
- if(!array_3d) {
- return;
- }
-
- CUDA_MEMCPY3D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = array_3d;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
- param.Depth = mem.data_depth;
-
- cuda_assert(cuMemcpy3D(&param));
-
- mem.device_pointer = (device_ptr)array_3d;
- mem.device_size = size;
- stats.mem_alloc(size);
-
- cmem = &cuda_mem_map[&mem];
- cmem->texobject = 0;
- cmem->array = array_3d;
- }
- else if(mem.data_height > 0) {
- /* 2D texture, using pitch aligned linear memory. */
- int alignment = 0;
- cuda_assert(cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
- dst_pitch = align_up(src_pitch, alignment);
- size_t dst_size = dst_pitch * mem.data_height;
-
- cmem = generic_alloc(mem, dst_size - mem.memory_size());
- if(!cmem) {
- return;
- }
-
- CUDA_MEMCPY2D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
- param.dstDevice = mem.device_pointer;
- param.dstPitch = dst_pitch;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
-
- cuda_assert(cuMemcpy2DUnaligned(&param));
- }
- else {
- /* 1D texture, using linear memory. */
- cmem = generic_alloc(mem);
- if(!cmem) {
- return;
- }
-
- cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
- }
-
- /* Kepler+, bindless textures. */
- int flat_slot = 0;
- if(string_startswith(mem.name, "__tex_image")) {
- int pos = string(mem.name).rfind("_");
- flat_slot = atoi(mem.name + pos + 1);
- }
- else {
- assert(0);
- }
-
- CUDA_RESOURCE_DESC resDesc;
- memset(&resDesc, 0, sizeof(resDesc));
-
- if(array_3d) {
- resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- resDesc.res.array.hArray = array_3d;
- resDesc.flags = 0;
- }
- else if(mem.data_height > 0) {
- resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
- resDesc.res.pitch2D.devPtr = mem.device_pointer;
- resDesc.res.pitch2D.format = format;
- resDesc.res.pitch2D.numChannels = mem.data_elements;
- resDesc.res.pitch2D.height = mem.data_height;
- resDesc.res.pitch2D.width = mem.data_width;
- resDesc.res.pitch2D.pitchInBytes = dst_pitch;
- }
- else {
- resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
- resDesc.res.linear.devPtr = mem.device_pointer;
- resDesc.res.linear.format = format;
- resDesc.res.linear.numChannels = mem.data_elements;
- resDesc.res.linear.sizeInBytes = mem.device_size;
- }
-
- CUDA_TEXTURE_DESC texDesc;
- memset(&texDesc, 0, sizeof(texDesc));
- texDesc.addressMode[0] = address_mode;
- texDesc.addressMode[1] = address_mode;
- texDesc.addressMode[2] = address_mode;
- texDesc.filterMode = filter_mode;
- texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
- cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
-
- /* Resize once */
- if(flat_slot >= texture_info.size()) {
- /* Allocate some slots in advance, to reduce amount
- * of re-allocations. */
- texture_info.resize(flat_slot + 128);
- }
-
- /* Set Mapping and tag that we need to (re-)upload to device */
- TextureInfo& info = texture_info[flat_slot];
- info.data = (uint64_t)cmem->texobject;
- info.cl_buffer = 0;
- info.interpolation = mem.interpolation;
- info.extension = mem.extension;
- info.width = mem.data_width;
- info.height = mem.data_height;
- info.depth = mem.data_depth;
- need_texture_info = true;
- }
-
- void tex_free(device_memory& mem)
- {
- if(mem.device_pointer) {
- CUDAContextScope scope(this);
- const CUDAMem& cmem = cuda_mem_map[&mem];
-
- if(cmem.texobject) {
- /* Free bindless texture. */
- cuTexObjectDestroy(cmem.texobject);
- }
-
- if(cmem.array) {
- /* Free array. */
- cuArrayDestroy(cmem.array);
- stats.mem_free(mem.device_size);
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- cuda_mem_map.erase(cuda_mem_map.find(&mem));
- }
- else {
- generic_free(mem);
- }
- }
- }
-
-#define CUDA_GET_BLOCKSIZE(func, w, h) \
- int threads_per_block; \
- cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int threads = (int)sqrt((float)threads_per_block); \
- int xblocks = ((w) + threads - 1)/threads; \
- int yblocks = ((h) + threads - 1)/threads;
-
-#define CUDA_LAUNCH_KERNEL(func, args) \
- cuda_assert(cuLaunchKernel(func, \
- xblocks, yblocks, 1, \
- threads, threads, 1, \
- 0, 0, args, 0));
+ }
+
+ void init_host_memory()
+ {
+ /* Limit amount of host mapped memory, because allocating too much can
+ * cause system instability. Leave at least half or 4 GB of system
+ * memory free, whichever is smaller. */
+ size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+ size_t system_ram = system_physical_ram();
+
+ if (system_ram > 0) {
+ if (system_ram / 2 > default_limit) {
+ map_host_limit = system_ram - default_limit;
+ }
+ else {
+ map_host_limit = system_ram / 2;
+ }
+ }
+ else {
+ VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+ map_host_limit = 0;
+ }
+
+    /* Amount of device memory to keep free after texture memory
+ * and working memory allocations respectively. We set the working
+ * memory limit headroom lower so that some space is left after all
+ * texture memory allocations. */
+ device_working_headroom = 32 * 1024 * 1024LL; // 32MB
+ device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
+
+ VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+ << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+ }
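A worked example of the limit computation (RAM sizes hypothetical):

  /* 16 GB RAM: 16 / 2 = 8 GB > 4 GB  ->  map_host_limit = 16 - 4 = 12 GB
   *  6 GB RAM:  6 / 2 = 3 GB < 4 GB  ->  map_host_limit = 3 GB */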
+
+ void load_texture_info()
+ {
+ if (need_texture_info) {
+ texture_info.copy_to_device();
+ need_texture_info = false;
+ }
+ }
+
+ void move_textures_to_host(size_t size, bool for_texture)
+ {
+ /* Signal to reallocate textures in host memory only. */
+ move_texture_to_host = true;
+
+ while (size > 0) {
+ /* Find suitable memory allocation to move. */
+ device_memory *max_mem = NULL;
+ size_t max_size = 0;
+ bool max_is_image = false;
+
+ foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
+ device_memory &mem = *pair.first;
+ CUDAMem *cmem = &pair.second;
+
+ bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ /* Can't move this type of memory. */
+ if (!is_texture || cmem->array) {
+ continue;
+ }
+
+ /* Already in host memory. */
+ if (cmem->map_host_pointer) {
+ continue;
+ }
+
+ /* For other textures, only move image textures. */
+ if (for_texture && !is_image) {
+ continue;
+ }
+
+ /* Try to move largest allocation, prefer moving images. */
+ if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+ max_is_image = is_image;
+ max_size = mem.device_size;
+ max_mem = &mem;
+ }
+ }
+
+ /* Move to host memory. This part is mutex protected since
+ * multiple CUDA devices could be moving the memory. The
+ * first one will do it, and the rest will adopt the pointer. */
+ if (max_mem) {
+ VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+ static thread_mutex move_mutex;
+ thread_scoped_lock lock(move_mutex);
+
+ /* Preserve the original device pointer, in case of multi device
+ * we can't change it because the pointer mapping would break. */
+ device_ptr prev_pointer = max_mem->device_pointer;
+ size_t prev_size = max_mem->device_size;
+
+ tex_free(*max_mem);
+ tex_alloc(*max_mem);
+ size = (max_size >= size) ? 0 : size - max_size;
+
+ max_mem->device_pointer = prev_pointer;
+ max_mem->device_size = prev_size;
+ }
+ else {
+ break;
+ }
+ }
+
+ /* Update texture info array with new pointers. */
+ load_texture_info();
+
+ move_texture_to_host = false;
+ }
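The selection inside the loop above prefers image textures over other textures, and within each class the largest allocation; for example (sizes hypothetical):

  /* Candidates: 8 MB 1D lookup table, 512 MB image, 1 GB image.
   * The 1 GB image is evicted first; the remaining size to free is
   * then reduced by 1 GB before the next iteration. */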
+
+ CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0)
+ {
+ CUDAContextScope scope(this);
+
+ CUdeviceptr device_pointer = 0;
+ size_t size = mem.memory_size() + pitch_padding;
+
+ CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+ const char *status = "";
+
+ /* First try allocating in device memory, respecting headroom. We make
+ * an exception for texture info. It is small and frequently accessed,
+ * so treat it as working memory.
+ *
+ * If there is not enough room for working memory, we will try to move
+ * textures to host memory, assuming the performance impact would have
+ * been worse for working memory. */
+ bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+ size_t total = 0, free = 0;
+ cuMemGetInfo(&free, &total);
+
+ /* Move textures to host memory if needed. */
+ if (!move_texture_to_host && !is_image && (size + headroom) >= free) {
+ move_textures_to_host(size + headroom - free, is_texture);
+ cuMemGetInfo(&free, &total);
+ }
+
+ /* Allocate in device memory. */
+ if (!move_texture_to_host && (size + headroom) < free) {
+ mem_alloc_result = cuMemAlloc(&device_pointer, size);
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ status = " in device memory";
+ }
+ }
+
+ /* Fall back to mapped host memory if needed and possible. */
+ void *map_host_pointer = 0;
+ bool free_map_host = false;
+
+ if (mem_alloc_result != CUDA_SUCCESS && can_map_host &&
+ map_host_used + size < map_host_limit) {
+ if (mem.shared_pointer) {
+ /* Another device already allocated host memory. */
+ mem_alloc_result = CUDA_SUCCESS;
+ map_host_pointer = mem.shared_pointer;
+ }
+ else {
+ /* Allocate host memory ourselves. */
+ mem_alloc_result = cuMemHostAlloc(
+ &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+ mem.shared_pointer = map_host_pointer;
+ free_map_host = true;
+ }
+
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0));
+ map_host_used += size;
+ status = " in host memory";
+
+ /* Replace host pointer with our host allocation. Only works if
+ * CUDA memory layout is the same and has no pitch padding. Also
+ * does not work if we move textures to host during a render,
+ * since other devices might be using the memory. */
+ if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+ mem.host_pointer != mem.shared_pointer) {
+ memcpy(mem.shared_pointer, mem.host_pointer, size);
+ mem.host_free();
+ mem.host_pointer = mem.shared_pointer;
+ }
+ }
+ else {
+ status = " failed, out of host memory";
+ }
+ }
+ else if (mem_alloc_result != CUDA_SUCCESS) {
+ status = " failed, out of device and host memory";
+ }
+
+ if (mem_alloc_result != CUDA_SUCCESS) {
+ cuda_assert(mem_alloc_result);
+ }
+
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")" << status;
+ }
+
+ mem.device_pointer = (device_ptr)device_pointer;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ if (!mem.device_pointer) {
+ return NULL;
+ }
+
+ /* Insert into map of allocations. */
+ CUDAMem *cmem = &cuda_mem_map[&mem];
+ cmem->map_host_pointer = map_host_pointer;
+ cmem->free_map_host = free_map_host;
+ return cmem;
+ }
+
+ void generic_copy_to(device_memory &mem)
+ {
+ if (mem.host_pointer && mem.device_pointer) {
+ CUDAContextScope scope(this);
+
+ if (mem.host_pointer != mem.shared_pointer) {
+ cuda_assert(cuMemcpyHtoD(
+ cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size()));
+ }
+ }
+ }
+
+ void generic_free(device_memory &mem)
+ {
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ if (cmem.map_host_pointer) {
+ /* Free host memory. */
+ if (cmem.free_map_host) {
+ cuMemFreeHost(cmem.map_host_pointer);
+ if (mem.host_pointer == mem.shared_pointer) {
+ mem.host_pointer = 0;
+ }
+ mem.shared_pointer = 0;
+ }
+
+ map_host_used -= mem.device_size;
+ }
+ else {
+ /* Free device memory. */
+ cuMemFree(mem.device_pointer);
+ }
+
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ }
+
+ void mem_alloc(device_memory &mem)
+ {
+ if (mem.type == MEM_PIXELS && !background) {
+ pixels_alloc(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else {
+ generic_alloc(mem);
+ }
+ }
+
+ void mem_copy_to(device_memory &mem)
+ {
+ if (mem.type == MEM_PIXELS) {
+ assert(!"mem_copy_to not supported for pixels.");
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free(mem);
+ tex_alloc(mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ generic_alloc(mem);
+ }
+
+ generic_copy_to(mem);
+ }
+ }
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+ {
+ if (mem.type == MEM_PIXELS && !background) {
+ pixels_copy_from(mem, y, w, h);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_copy_from not supported for textures.");
+ }
+ else {
+ CUDAContextScope scope(this);
+ size_t offset = elem * y * w;
+ size_t size = elem * w * h;
+
+ if (mem.host_pointer && mem.device_pointer) {
+ cuda_assert(cuMemcpyDtoH(
+ (uchar *)mem.host_pointer + offset, (CUdeviceptr)(mem.device_pointer + offset), size));
+ }
+ else if (mem.host_pointer) {
+ memset((char *)mem.host_pointer + offset, 0, size);
+ }
+ }
+ }
+
+ void mem_zero(device_memory &mem)
+ {
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ if (mem.host_pointer) {
+ memset(mem.host_pointer, 0, mem.memory_size());
+ }
+
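+    /* Skip the device memset when the host and device share one mapped
+     * allocation; the host memset above already cleared the device-visible
+     * memory in that case. */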
+ if (mem.device_pointer && (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) {
+ CUDAContextScope scope(this);
+ cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
+ }
+ }
+
+ void mem_free(device_memory &mem)
+ {
+ if (mem.type == MEM_PIXELS && !background) {
+ pixels_free(mem);
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free(mem);
+ }
+ else {
+ generic_free(mem);
+ }
+ }
+
+ virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+ {
+ return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+ }
+
+ void const_copy_to(const char *name, void *host, size_t size)
+ {
+ CUDAContextScope scope(this);
+ CUdeviceptr mem;
+ size_t bytes;
+
+ cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
+ //assert(bytes == size);
+ cuda_assert(cuMemcpyHtoD(mem, host, size));
+ }
+
+ void tex_alloc(device_memory &mem)
+ {
+ CUDAContextScope scope(this);
+
+    /* General variables used by all texture types. */
+ string bind_name = mem.name;
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ switch (mem.extension) {
+ case EXTENSION_REPEAT:
+ address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ break;
+ case EXTENSION_EXTEND:
+ address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+ break;
+ case EXTENSION_CLIP:
+ address_mode = CU_TR_ADDRESS_MODE_BORDER;
+ break;
+ default:
+ assert(0);
+ break;
+ }
+
+ CUfilter_mode filter_mode;
+ if (mem.interpolation == INTERPOLATION_CLOSEST) {
+ filter_mode = CU_TR_FILTER_MODE_POINT;
+ }
+ else {
+ filter_mode = CU_TR_FILTER_MODE_LINEAR;
+ }
+
+ /* Data Storage */
+ if (mem.interpolation == INTERPOLATION_NONE) {
+ generic_alloc(mem);
+ generic_copy_to(mem);
+
+ CUdeviceptr cumem;
+ size_t cubytes;
+
+ cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
+
+ if (cubytes == 8) {
+ /* 64 bit device pointer */
+ uint64_t ptr = mem.device_pointer;
+ cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes));
+ }
+ else {
+ /* 32 bit device pointer */
+ uint32_t ptr = (uint32_t)mem.device_pointer;
+ cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes));
+ }
+ return;
+ }
+
+ /* Image Texture Storage */
+ CUarray_format_enum format;
+ switch (mem.data_type) {
+ case TYPE_UCHAR:
+ format = CU_AD_FORMAT_UNSIGNED_INT8;
+ break;
+ case TYPE_UINT16:
+ format = CU_AD_FORMAT_UNSIGNED_INT16;
+ break;
+ case TYPE_UINT:
+ format = CU_AD_FORMAT_UNSIGNED_INT32;
+ break;
+ case TYPE_INT:
+ format = CU_AD_FORMAT_SIGNED_INT32;
+ break;
+ case TYPE_FLOAT:
+ format = CU_AD_FORMAT_FLOAT;
+ break;
+ case TYPE_HALF:
+ format = CU_AD_FORMAT_HALF;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ CUDAMem *cmem = NULL;
+ CUarray array_3d = NULL;
+ size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+ size_t dst_pitch = src_pitch;
+
+ if (mem.data_depth > 1) {
+      /* 3D texture using array; there is no API for 3D linear memory. */
+ CUDA_ARRAY3D_DESCRIPTOR desc;
+
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Depth = mem.data_depth;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+ desc.Flags = 0;
+
+ VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ cuda_assert(cuArray3DCreate(&array_3d, &desc));
+
+ if (!array_3d) {
+ return;
+ }
+
+ CUDA_MEMCPY3D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = array_3d;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+ param.Depth = mem.data_depth;
+
+ cuda_assert(cuMemcpy3D(&param));
+
+ mem.device_pointer = (device_ptr)array_3d;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+      /* 2D texture, using pitch-aligned linear memory. */
+ int alignment = 0;
+ cuda_assert(
+ cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+ dst_pitch = align_up(src_pitch, alignment);
+ size_t dst_size = dst_pitch * mem.data_height;
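+      /* Illustrative arithmetic (hypothetical numbers, not taken from a real
+       * device): a 1001-pixel-wide float4 texture has src_pitch =
+       * 1001 * 4 * 4 = 16016 bytes; with a pitch alignment of 32 this rounds
+       * up to dst_pitch = 16032, i.e. 16 padding bytes per row. The total
+       * padding is passed to generic_alloc() below as dst_size - memory_size(). */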
+
+ cmem = generic_alloc(mem, dst_size - mem.memory_size());
+ if (!cmem) {
+ return;
+ }
+
+ CUDA_MEMCPY2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ param.dstDevice = mem.device_pointer;
+ param.dstPitch = dst_pitch;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ cuda_assert(cuMemcpy2DUnaligned(&param));
+ }
+ else {
+ /* 1D texture, using linear memory. */
+ cmem = generic_alloc(mem);
+ if (!cmem) {
+ return;
+ }
+
+ cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+ }
+
+ /* Kepler+, bindless textures. */
+ int flat_slot = 0;
+ if (string_startswith(mem.name, "__tex_image")) {
+ int pos = string(mem.name).rfind("_");
+ flat_slot = atoi(mem.name + pos + 1);
+ }
+ else {
+ assert(0);
+ }
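+    /* E.g. a (hypothetical) texture named "__tex_image_float4_012" parses to
+     * flat_slot = 12, the slot index into the texture_info array below. */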
+
+ CUDA_RESOURCE_DESC resDesc;
+ memset(&resDesc, 0, sizeof(resDesc));
+
+ if (array_3d) {
+ resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ resDesc.res.array.hArray = array_3d;
+ resDesc.flags = 0;
+ }
+ else if (mem.data_height > 0) {
+ resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+ resDesc.res.pitch2D.devPtr = mem.device_pointer;
+ resDesc.res.pitch2D.format = format;
+ resDesc.res.pitch2D.numChannels = mem.data_elements;
+ resDesc.res.pitch2D.height = mem.data_height;
+ resDesc.res.pitch2D.width = mem.data_width;
+ resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+ }
+ else {
+ resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+ resDesc.res.linear.devPtr = mem.device_pointer;
+ resDesc.res.linear.format = format;
+ resDesc.res.linear.numChannels = mem.data_elements;
+ resDesc.res.linear.sizeInBytes = mem.device_size;
+ }
+
+ CUDA_TEXTURE_DESC texDesc;
+ memset(&texDesc, 0, sizeof(texDesc));
+ texDesc.addressMode[0] = address_mode;
+ texDesc.addressMode[1] = address_mode;
+ texDesc.addressMode[2] = address_mode;
+ texDesc.filterMode = filter_mode;
+ texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+ cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+    /* Resize the texture info array if needed. */
+    if (flat_slot >= texture_info.size()) {
+      /* Allocate some slots in advance, to reduce the number
+       * of re-allocations. */
+      texture_info.resize(flat_slot + 128);
+    }
+
+    /* Set mapping and tag that we need to (re-)upload to the device. */
+ TextureInfo &info = texture_info[flat_slot];
+ info.data = (uint64_t)cmem->texobject;
+ info.cl_buffer = 0;
+ info.interpolation = mem.interpolation;
+ info.extension = mem.extension;
+ info.width = mem.data_width;
+ info.height = mem.data_height;
+ info.depth = mem.data_depth;
+ need_texture_info = true;
+ }
+
+ void tex_free(device_memory &mem)
+ {
+ if (mem.device_pointer) {
+ CUDAContextScope scope(this);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ if (cmem.texobject) {
+ /* Free bindless texture. */
+ cuTexObjectDestroy(cmem.texobject);
+ }
+
+ if (cmem.array) {
+ /* Free array. */
+ cuArrayDestroy(cmem.array);
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else {
+ generic_free(mem);
+ }
+ }
+ }
+
+#define CUDA_GET_BLOCKSIZE(func, w, h) \
+ int threads_per_block; \
+ cuda_assert( \
+ cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int threads = (int)sqrt((float)threads_per_block); \
+ int xblocks = ((w) + threads - 1) / threads; \
+ int yblocks = ((h) + threads - 1) / threads;
+
+#define CUDA_LAUNCH_KERNEL(func, args) \
+ cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
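+/* Illustrative arithmetic (hypothetical numbers): with threads_per_block =
+ * 1024, threads = (int)sqrt(1024) = 32, so a 1920 x 1080 launch uses
+ * xblocks = (1920 + 31) / 32 = 60 and yblocks = (1080 + 31) / 32 = 34,
+ * i.e. a 60 x 34 grid of 32 x 32 thread blocks. */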
 /* Similar to the above, but for 1-dimensional blocks. */
-#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
- int threads_per_block; \
- cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
- int xblocks = ((w) + threads_per_block - 1)/threads_per_block; \
- int yblocks = h;
-
-#define CUDA_LAUNCH_KERNEL_1D(func, args) \
- cuda_assert(cuLaunchKernel(func, \
- xblocks, yblocks, 1, \
- threads_per_block, 1, 1, \
- 0, 0, args, 0));
-
- bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
- DenoisingTask *task)
- {
- if(have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2*r+1)*(2*r+1);
- int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
- int frame_offset = 0;
-
- if(have_error())
- return false;
-
- CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
- CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
- CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts;
- CUdeviceptr scale_ptr = 0;
-
- cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride));
- cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride));
-
- {
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
- cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
-
- void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
- }
-
- {
- CUfunction cuNLMNormalize;
- cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
- cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
- void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
- CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
- CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
- cuda_assert(cuCtxSynchronize());
- }
-
- return !have_error();
- }
-
- bool denoising_construct_transform(DenoisingTask *task)
- {
- if(have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterConstructTransform;
- cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
- CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
- task->storage.w,
- task->storage.h);
-
- void *args[] = {&task->buffer.mem.device_pointer,
- &task->tile_info_mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->filter_area,
- &task->rect,
- &task->radius,
- &task->pca_threshold,
- &task->buffer.pass_stride,
- &task->buffer.frame_stride,
- &task->buffer.use_time};
- CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task)
- {
- if(have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- int r = task->radius;
- int f = 4;
- float a = 1.0f;
- float k_2 = task->nlm_k_2;
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2*r+1)*(2*r+1);
-
- if(have_error())
- return false;
-
- CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
- CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
-
- CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
- cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
- cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
- cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
- cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
- CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
- task->reconstruction_state.source_w * task->reconstruction_state.source_h,
- num_shifts);
-
- void *calc_difference_args[] = {&color_ptr,
- &color_variance_ptr,
- &scale_ptr,
- &difference,
- &w, &h,
- &stride, &pass_stride,
- &r, &pass_stride,
- &frame_offset,
- &a, &k_2};
- void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
- void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
- void *construct_gramian_args[] = {&t,
- &blurDifference,
- &task->buffer.mem.device_pointer,
- &task->storage.transform.device_pointer,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->reconstruction_state.filter_window,
- &w, &h, &stride,
- &pass_stride, &r,
- &f,
- &frame_offset,
- &task->buffer.use_time};
-
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
- CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_solve(device_ptr output_ptr,
- DenoisingTask *task)
- {
- CUfunction cuFinalize;
- cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
- cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
- void *finalize_args[] = {&output_ptr,
- &task->storage.rank.device_pointer,
- &task->storage.XtWX.device_pointer,
- &task->storage.XtWY.device_pointer,
- &task->filter_area,
- &task->reconstruction_state.buffer_params.x,
- &task->render_buffer.samples};
- CUDA_GET_BLOCKSIZE(cuFinalize,
- task->reconstruction_state.source_w,
- task->reconstruction_state.source_h);
- CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
- device_ptr mean_ptr, device_ptr variance_ptr,
- int r, int4 rect, DenoisingTask *task)
- {
- if(have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterCombineHalves;
- cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
- task->rect.z-task->rect.x,
- task->rect.w-task->rect.y);
-
- void *args[] = {&mean_ptr,
- &variance_ptr,
- &a_ptr,
- &b_ptr,
- &rect,
- &r};
- CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
- device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr, DenoisingTask *task)
- {
- if(have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDivideShadow;
- cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
- task->rect.z-task->rect.x,
- task->rect.w-task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &a_ptr,
- &b_ptr,
- &sample_variance_ptr,
- &sv_variance_ptr,
- &buffer_variance_ptr,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task)
- {
- if(have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterGetFeature;
- cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
- task->rect.z-task->rect.x,
- task->rect.w-task->rect.y);
-
- void *args[] = {&task->render_buffer.samples,
- &task->tile_info_mem.device_pointer,
- &mean_offset,
- &variance_offset,
- &mean_ptr,
- &variance_ptr,
- &scale,
- &task->rect,
- &task->render_buffer.pass_stride,
- &task->render_buffer.offset};
- CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_write_feature(int out_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task)
- {
- if(have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterWriteFeature;
- cuda_assert(cuModuleGetFunction(&cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterWriteFeature,
- task->filter_area.z,
- task->filter_area.w);
-
- void *args[] = {&task->render_buffer.samples,
- &task->reconstruction_state.buffer_params,
- &task->filter_area,
- &from_ptr,
- &buffer_ptr,
- &out_offset,
- &task->rect};
- CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task)
- {
- if(have_error())
- return false;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilterDetectOutliers;
- cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
- cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
- CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers,
- task->rect.z-task->rect.x,
- task->rect.w-task->rect.y);
-
- void *args[] = {&image_ptr,
- &variance_ptr,
- &depth_ptr,
- &output_ptr,
- &task->rect,
- &task->buffer.pass_stride};
-
- CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
- cuda_assert(cuCtxSynchronize());
-
- return !have_error();
- }
-
- void denoise(RenderTile &rtile, DenoisingTask& denoising)
- {
- denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(&CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(&rtile);
- }
-
- void path_trace(DeviceTask& task, RenderTile& rtile, device_vector<WorkTile>& work_tiles)
- {
- scoped_timer timer(&rtile.buffers->render_time);
-
- if(have_error())
- return;
-
- CUDAContextScope scope(this);
- CUfunction cuPathTrace;
-
- /* Get kernel function. */
- if(task.integrator_branched) {
- cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
- }
-
- if(have_error()) {
- return;
- }
-
- cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
-
- /* Allocate work tile. */
- work_tiles.alloc(1);
-
- WorkTile *wtile = work_tiles.data();
- wtile->x = rtile.x;
- wtile->y = rtile.y;
- wtile->w = rtile.w;
- wtile->h = rtile.h;
- wtile->offset = rtile.offset;
- wtile->stride = rtile.stride;
- wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
-
- /* Prepare work size. More step samples render faster, but for now we
- * remain conservative for GPUs connected to a display to avoid driver
- * timeouts and display freezing. */
- int min_blocks, num_threads_per_block;
- cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
- if(!info.display_device) {
- min_blocks *= 8;
- }
-
- uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
- /* Render all samples. */
- int start_sample = rtile.start_sample;
- int end_sample = rtile.start_sample + rtile.num_samples;
-
- for(int sample = start_sample; sample < end_sample; sample += step_samples) {
- /* Setup and copy work tile to device. */
- wtile->start_sample = sample;
- wtile->num_samples = min(step_samples, end_sample - sample);
- work_tiles.copy_to_device();
-
- CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
- uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
- uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
- /* Launch kernel. */
- void *args[] = {&d_work_tiles,
- &total_work_size};
-
- cuda_assert(cuLaunchKernel(cuPathTrace,
- num_blocks, 1, 1,
- num_threads_per_block, 1, 1,
- 0, 0, args, 0));
-
- cuda_assert(cuCtxSynchronize());
-
- /* Update progress. */
- rtile.sample = sample + wtile->num_samples;
- task.update_progress(&rtile, rtile.w*rtile.h*wtile->num_samples);
-
- if(task.get_cancel()) {
- if(task.need_finish_queue == false)
- break;
- }
- }
- }
-
- void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
- {
- if(have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuFilmConvert;
- CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half);
- CUdeviceptr d_buffer = cuda_device_ptr(buffer);
-
- /* get kernel function */
- if(rgba_half) {
- cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
- }
-
-
- float sample_scale = 1.0f/(task.sample + 1);
-
- /* pass in parameters */
- void *args[] = {&d_rgba,
- &d_buffer,
- &sample_scale,
- &task.x,
- &task.y,
- &task.w,
- &task.h,
- &task.offset,
- &task.stride};
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
-
- int xthreads = (int)sqrt(threads_per_block);
- int ythreads = (int)sqrt(threads_per_block);
- int xblocks = (task.w + xthreads - 1)/xthreads;
- int yblocks = (task.h + ythreads - 1)/ythreads;
-
- cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(cuFilmConvert,
- xblocks , yblocks, 1, /* blocks */
- xthreads, ythreads, 1, /* threads */
- 0, 0, args, 0));
-
- unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
-
- cuda_assert(cuCtxSynchronize());
- }
-
- void shader(DeviceTask& task)
- {
- if(have_error())
- return;
-
- CUDAContextScope scope(this);
-
- CUfunction cuShader;
- CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
- CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
-
- /* get kernel function */
- if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
- }
- else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
- }
- else {
- cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
- }
-
- /* do tasks in smaller chunks, so we can cancel it */
- const int shader_chunk_size = 65536;
- const int start = task.shader_x;
- const int end = task.shader_x + task.shader_w;
- int offset = task.offset;
-
- bool canceled = false;
- for(int sample = 0; sample < task.num_samples && !canceled; sample++) {
- for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
- int shader_w = min(shader_chunk_size, end - shader_x);
-
- /* pass in parameters */
- void *args[8];
- int arg = 0;
- args[arg++] = &d_input;
- args[arg++] = &d_output;
- args[arg++] = &task.shader_eval_type;
- if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
- args[arg++] = &task.shader_filter;
- }
- args[arg++] = &shader_x;
- args[arg++] = &shader_w;
- args[arg++] = &offset;
- args[arg++] = &sample;
-
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
-
- int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuLaunchKernel(cuShader,
- xblocks , 1, 1, /* blocks */
- threads_per_block, 1, 1, /* threads */
- 0, 0, args, 0));
-
- cuda_assert(cuCtxSynchronize());
-
- if(task.get_cancel()) {
- canceled = true;
- break;
- }
- }
-
- task.update_progress(NULL);
- }
- }
-
- CUdeviceptr map_pixels(device_ptr mem)
- {
- if(!background) {
- PixelMem pmem = pixel_mem_map[mem];
- CUdeviceptr buffer;
-
- size_t bytes;
- cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
- cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
-
- return buffer;
- }
-
- return cuda_device_ptr(mem);
- }
-
- void unmap_pixels(device_ptr mem)
- {
- if(!background) {
- PixelMem pmem = pixel_mem_map[mem];
-
- cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
- }
- }
-
- void pixels_alloc(device_memory& mem)
- {
- PixelMem pmem;
-
- pmem.w = mem.data_width;
- pmem.h = mem.data_height;
-
- CUDAContextScope scope(this);
-
- glGenBuffers(1, &pmem.cuPBO);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- if(mem.data_type == TYPE_HALF)
- glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW);
- else
- glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- glActiveTexture(GL_TEXTURE0);
- glGenTextures(1, &pmem.cuTexId);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if(mem.data_type == TYPE_HALF)
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
- else
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
- glBindTexture(GL_TEXTURE_2D, 0);
-
- CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
-
- if(result == CUDA_SUCCESS) {
- mem.device_pointer = pmem.cuTexId;
- pixel_mem_map[mem.device_pointer] = pmem;
-
- mem.device_size = mem.memory_size();
- stats.mem_alloc(mem.device_size);
-
- return;
- }
- else {
- /* failed to register buffer, fallback to no interop */
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- background = true;
- }
- }
-
- void pixels_copy_from(device_memory& mem, int y, int w, int h)
- {
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
- size_t offset = sizeof(uchar)*4*y*w;
- memcpy((uchar*)mem.host_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
- glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
- }
-
- void pixels_free(device_memory& mem)
- {
- if(mem.device_pointer) {
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
- CUDAContextScope scope(this);
-
- cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
- glDeleteBuffers(1, &pmem.cuPBO);
- glDeleteTextures(1, &pmem.cuTexId);
-
- pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
- mem.device_pointer = 0;
-
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
-
- void draw_pixels(
- device_memory& mem, int y,
- int w, int h, int width, int height,
- int dx, int dy, int dw, int dh, bool transparent,
- const DeviceDrawParams &draw_params)
- {
- assert(mem.type == MEM_PIXELS);
-
- if(!background) {
- const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
- PixelMem pmem = pixel_mem_map[mem.device_pointer];
- float *vpointer;
-
- CUDAContextScope scope(this);
-
- /* for multi devices, this assumes the inefficient method that we allocate
- * all pixels on the device even though we only render to a subset */
- size_t offset = 4*y*w;
-
- if(mem.data_type == TYPE_HALF)
- offset *= sizeof(GLhalf);
- else
- offset *= sizeof(uint8_t);
-
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
- glActiveTexture(GL_TEXTURE0);
- glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
- if(mem.data_type == TYPE_HALF) {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void*)offset);
- }
- else {
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
- }
- glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
- if(transparent) {
- glEnable(GL_BLEND);
- glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
- }
-
- GLint shader_program;
- if(use_fallback_shader) {
- if(!bind_fallback_display_space_shader(dw, dh)) {
- return;
- }
- shader_program = fallback_shader_program;
- }
- else {
- draw_params.bind_display_space_shader_cb();
- glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
- }
-
- if(!vertex_buffer) {
- glGenBuffers(1, &vertex_buffer);
- }
-
- glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
- /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */
- glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
- vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
- if(vpointer) {
- /* texture coordinate - vertex pair */
- vpointer[0] = 0.0f;
- vpointer[1] = 0.0f;
- vpointer[2] = dx;
- vpointer[3] = dy;
-
- vpointer[4] = (float)w/(float)pmem.w;
- vpointer[5] = 0.0f;
- vpointer[6] = (float)width + dx;
- vpointer[7] = dy;
-
- vpointer[8] = (float)w/(float)pmem.w;
- vpointer[9] = (float)h/(float)pmem.h;
- vpointer[10] = (float)width + dx;
- vpointer[11] = (float)height + dy;
-
- vpointer[12] = 0.0f;
- vpointer[13] = (float)h/(float)pmem.h;
- vpointer[14] = dx;
- vpointer[15] = (float)height + dy;
-
- glUnmapBuffer(GL_ARRAY_BUFFER);
- }
-
- GLuint vertex_array_object;
- GLuint position_attribute, texcoord_attribute;
-
- glGenVertexArrays(1, &vertex_array_object);
- glBindVertexArray(vertex_array_object);
-
- texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
- position_attribute = glGetAttribLocation(shader_program, "pos");
-
- glEnableVertexAttribArray(texcoord_attribute);
- glEnableVertexAttribArray(position_attribute);
-
- glVertexAttribPointer(texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
- glVertexAttribPointer(position_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)(sizeof(float) * 2));
-
- glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
- if(use_fallback_shader) {
- glUseProgram(0);
- }
- else {
- draw_params.unbind_display_space_shader_cb();
- }
-
- if(transparent) {
- glDisable(GL_BLEND);
- }
-
- glBindTexture(GL_TEXTURE_2D, 0);
-
- return;
- }
-
- Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
- }
-
- void thread_run(DeviceTask *task)
- {
- CUDAContextScope scope(this);
-
- if(task->type == DeviceTask::RENDER) {
- DeviceRequestedFeatures requested_features;
- if(use_split_kernel()) {
- if(split_kernel == NULL) {
- split_kernel = new CUDASplitKernel(this);
- split_kernel->load_kernels(requested_features);
- }
- }
-
- device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
- /* keep rendering tiles until done */
- RenderTile tile;
- DenoisingTask denoising(this, *task);
-
- while(task->acquire_tile(this, tile)) {
- if(tile.task == RenderTile::PATH_TRACE) {
- if(use_split_kernel()) {
- device_only_memory<uchar> void_buffer(this, "void_buffer");
- split_kernel->path_trace(task, tile, void_buffer, void_buffer);
- }
- else {
- path_trace(*task, tile, work_tiles);
- }
- }
- else if(tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
-
- denoise(tile, denoising);
-
- task->update_progress(&tile, tile.w*tile.h);
- }
-
- task->release_tile(tile);
-
- if(task->get_cancel()) {
- if(task->need_finish_queue == false)
- break;
- }
- }
-
- work_tiles.free();
- }
- else if(task->type == DeviceTask::SHADER) {
- shader(*task);
-
- cuda_assert(cuCtxSynchronize());
- }
- }
-
- class CUDADeviceTask : public DeviceTask {
- public:
- CUDADeviceTask(CUDADevice *device, DeviceTask& task)
- : DeviceTask(task)
- {
- run = function_bind(&CUDADevice::thread_run, device, this);
- }
- };
-
- int get_split_task_count(DeviceTask& /*task*/)
- {
- return 1;
- }
-
- void task_add(DeviceTask& task)
- {
- CUDAContextScope scope(this);
-
- /* Load texture info. */
- load_texture_info();
-
- /* Synchronize all memory copies before executing task. */
- cuda_assert(cuCtxSynchronize());
-
- if(task.type == DeviceTask::FILM_CONVERT) {
- /* must be done in main thread due to opengl access */
- film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
- }
- else {
- task_pool.push(new CUDADeviceTask(this, task));
- }
- }
-
- void task_wait()
- {
- task_pool.wait();
- }
-
- void task_cancel()
- {
- task_pool.cancel();
- }
-
- friend class CUDASplitKernelFunction;
- friend class CUDASplitKernel;
- friend class CUDAContextScope;
+#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
+ int threads_per_block; \
+ cuda_assert( \
+ cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+ int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
+ int yblocks = h;
+
+#define CUDA_LAUNCH_KERNEL_1D(func, args) \
+ cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
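+/* Illustrative arithmetic (hypothetical numbers): with threads_per_block =
+ * 1024 and w = 512 * 512 = 262144 work items, xblocks =
+ * (262144 + 1023) / 1024 = 256, while yblocks = h carries the per-item
+ * shift count, e.g. num_shifts = 81 for radius r = 4. */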
+
+ bool denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task)
+ {
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ int stride = task->buffer.stride;
+ int w = task->buffer.width;
+ int h = task->buffer.h;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ int pass_stride = task->buffer.pass_stride;
+ int num_shifts = (2 * r + 1) * (2 * r + 1);
+ int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
+ int frame_offset = 0;
+
+ if (have_error())
+ return false;
+
+ CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
+ CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
+ CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
+ CUdeviceptr scale_ptr = 0;
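+    /* Note: the temporary buffer is a single allocation carved into slices;
+     * blurDifference and weightAccum start one and two slices of
+     * sizeof(float) * pass_stride * num_shifts bytes after difference. */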
+
+ cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
+ cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
+
+ {
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+
+ CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
+
+ void *calc_difference_args[] = {&guide_ptr,
+ &variance_ptr,
+ &scale_ptr,
+ &difference,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &r,
+ &channel_offset,
+ &frame_offset,
+ &a,
+ &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *calc_weight_args[] = {
+ &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *update_output_args[] = {&blurDifference,
+ &image_ptr,
+ &out_ptr,
+ &weightAccum,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &channel_offset,
+ &r,
+ &f};
+
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
+ }
+
+ {
+ CUfunction cuNLMNormalize;
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+ void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
+ CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
+ CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+ cuda_assert(cuCtxSynchronize());
+ }
+
+ return !have_error();
+ }
+
+ bool denoising_construct_transform(DenoisingTask *task)
+ {
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterConstructTransform;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+ CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
+
+ void *args[] = {&task->buffer.mem.device_pointer,
+ &task->tile_info_mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->filter_area,
+ &task->rect,
+ &task->radius,
+ &task->pca_threshold,
+ &task->buffer.pass_stride,
+ &task->buffer.frame_stride,
+ &task->buffer.use_time};
+ CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ int frame,
+ DenoisingTask *task)
+ {
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ int r = task->radius;
+ int f = 4;
+ float a = 1.0f;
+ float k_2 = task->nlm_k_2;
+
+ int w = task->reconstruction_state.source_w;
+ int h = task->reconstruction_state.source_h;
+ int stride = task->buffer.stride;
+ int frame_offset = frame * task->buffer.frame_stride;
+ int t = task->tile_info->frames[frame];
+
+ int pass_stride = task->buffer.pass_stride;
+ int num_shifts = (2 * r + 1) * (2 * r + 1);
+
+ if (have_error())
+ return false;
+
+ CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
+ CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
+
+ CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+ cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+ cuda_assert(cuModuleGetFunction(
+ &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+ CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+ task->reconstruction_state.source_w *
+ task->reconstruction_state.source_h,
+ num_shifts);
+
+ void *calc_difference_args[] = {&color_ptr,
+ &color_variance_ptr,
+ &scale_ptr,
+ &difference,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &r,
+ &pass_stride,
+ &frame_offset,
+ &a,
+ &k_2};
+ void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *calc_weight_args[] = {
+ &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+ void *construct_gramian_args[] = {&t,
+ &blurDifference,
+ &task->buffer.mem.device_pointer,
+ &task->storage.transform.device_pointer,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->reconstruction_state.filter_window,
+ &w,
+ &h,
+ &stride,
+ &pass_stride,
+ &r,
+ &f,
+ &frame_offset,
+ &task->buffer.use_time};
+
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+ CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ bool denoising_solve(device_ptr output_ptr, DenoisingTask *task)
+ {
+ CUfunction cuFinalize;
+ cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+ cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+ void *finalize_args[] = {&output_ptr,
+ &task->storage.rank.device_pointer,
+ &task->storage.XtWX.device_pointer,
+ &task->storage.XtWY.device_pointer,
+ &task->filter_area,
+ &task->reconstruction_state.buffer_params.x,
+ &task->render_buffer.samples};
+ CUDA_GET_BLOCKSIZE(
+ cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
+ CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ bool denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r,
+ int4 rect,
+ DenoisingTask *task)
+ {
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterCombineHalves;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
+ CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ bool denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task)
+ {
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterDivideShadow;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->tile_info_mem.device_pointer,
+ &a_ptr,
+ &b_ptr,
+ &sample_variance_ptr,
+ &sv_variance_ptr,
+ &buffer_variance_ptr,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.offset};
+ CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ float scale,
+ DenoisingTask *task)
+ {
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterGetFeature;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->tile_info_mem.device_pointer,
+ &mean_offset,
+ &variance_offset,
+ &mean_ptr,
+ &variance_ptr,
+ &scale,
+ &task->rect,
+ &task->render_buffer.pass_stride,
+ &task->render_buffer.offset};
+ CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ bool denoising_write_feature(int out_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task)
+ {
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterWriteFeature;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
+
+ void *args[] = {&task->render_buffer.samples,
+ &task->reconstruction_state.buffer_params,
+ &task->filter_area,
+ &from_ptr,
+ &buffer_ptr,
+ &out_offset,
+ &task->rect};
+ CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ bool denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task)
+ {
+ if (have_error())
+ return false;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilterDetectOutliers;
+ cuda_assert(cuModuleGetFunction(
+ &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
+ cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
+ CUDA_GET_BLOCKSIZE(
+ cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ void *args[] = {&image_ptr,
+ &variance_ptr,
+ &depth_ptr,
+ &output_ptr,
+ &task->rect,
+ &task->buffer.pass_stride};
+
+ CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
+ cuda_assert(cuCtxSynchronize());
+
+ return !have_error();
+ }
+
+ void denoise(RenderTile &rtile, DenoisingTask &denoising)
+ {
+ denoising.functions.construct_transform = function_bind(
+ &CUDADevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.accumulate = function_bind(
+ &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
+ denoising.functions.divide_shadow = function_bind(
+ &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(
+ &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(
+ &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(
+ &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(
+ &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
+ denoising.functions.detect_outliers = function_bind(
+ &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample;
+ denoising.buffer.gpu_temporary_mem = true;
+
+ denoising.run_denoising(&rtile);
+ }
+
+ void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
+ {
+ scoped_timer timer(&rtile.buffers->render_time);
+
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+ CUfunction cuPathTrace;
+
+ /* Get kernel function. */
+ if (task.integrator_branched) {
+ cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+ }
+
+ if (have_error()) {
+ return;
+ }
+
+ cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+
+ /* Allocate work tile. */
+ work_tiles.alloc(1);
+
+ WorkTile *wtile = work_tiles.data();
+ wtile->x = rtile.x;
+ wtile->y = rtile.y;
+ wtile->w = rtile.w;
+ wtile->h = rtile.h;
+ wtile->offset = rtile.offset;
+ wtile->stride = rtile.stride;
+ wtile->buffer = (float *)cuda_device_ptr(rtile.buffer);
+
+ /* Prepare work size. More step samples render faster, but for now we
+ * remain conservative for GPUs connected to a display to avoid driver
+ * timeouts and display freezing. */
+ int min_blocks, num_threads_per_block;
+ cuda_assert(cuOccupancyMaxPotentialBlockSize(
+ &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+ if (!info.display_device) {
+ min_blocks *= 8;
+ }
+
+ uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
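+    /* Illustrative arithmetic (hypothetical numbers): with min_blocks = 160
+     * after the 8x scaling, num_threads_per_block = 256 and a 128 x 128 tile,
+     * step_samples = divide_up(160 * 256, 16384) = 3, so each kernel launch
+     * renders three samples of the tile. */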
+
+ /* Render all samples. */
+ int start_sample = rtile.start_sample;
+ int end_sample = rtile.start_sample + rtile.num_samples;
+
+ for (int sample = start_sample; sample < end_sample; sample += step_samples) {
+ /* Setup and copy work tile to device. */
+ wtile->start_sample = sample;
+ wtile->num_samples = min(step_samples, end_sample - sample);
+ work_tiles.copy_to_device();
+
+ CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
+ uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+ uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+ /* Launch kernel. */
+ void *args[] = {&d_work_tiles, &total_work_size};
+
+ cuda_assert(cuLaunchKernel(
+ cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ /* Update progress. */
+ rtile.sample = sample + wtile->num_samples;
+ task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+
+ if (task.get_cancel()) {
+ if (task.need_finish_queue == false)
+ break;
+ }
+ }
+ }
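+
+  /* Illustrative arithmetic for the step-sample heuristic above (hypothetical
+   * numbers, not tied to any specific GPU): if the occupancy query returns
+   * min_blocks = 40 and num_threads_per_block = 256 on a non-display device,
+   * min_blocks becomes 320, so a 64x64 tile (4096 pixels) renders
+   * step_samples = divide_up(320 * 256, 4096) = 20 samples per launch. */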
+
+ void film_convert(DeviceTask &task,
+ device_ptr buffer,
+ device_ptr rgba_byte,
+ device_ptr rgba_half)
+ {
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuFilmConvert;
+ CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
+ CUdeviceptr d_buffer = cuda_device_ptr(buffer);
+
+ /* get kernel function */
+ if (rgba_half) {
+ cuda_assert(
+ cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
+ }
+
+ float sample_scale = 1.0f / (task.sample + 1);
+
+ /* pass in parameters */
+ void *args[] = {&d_rgba,
+ &d_buffer,
+ &sample_scale,
+ &task.x,
+ &task.y,
+ &task.w,
+ &task.h,
+ &task.offset,
+ &task.stride};
+
+ /* launch kernel */
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(
+ &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
+
+ int xthreads = (int)sqrt(threads_per_block);
+ int ythreads = (int)sqrt(threads_per_block);
+ int xblocks = (task.w + xthreads - 1) / xthreads;
+ int yblocks = (task.h + ythreads - 1) / ythreads;
+
+ cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_assert(cuLaunchKernel(cuFilmConvert,
+ xblocks,
+ yblocks,
+ 1, /* blocks */
+ xthreads,
+ ythreads,
+ 1, /* threads */
+ 0,
+ 0,
+ args,
+ 0));
+
+ unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
+
+ cuda_assert(cuCtxSynchronize());
+ }
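+
+  /* Worked example for the block decomposition above (hypothetical values):
+   * with CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1024 the grid uses
+   * xthreads = ythreads = (int)sqrt(1024) = 32, so a 1920x1080 task launches
+   * xblocks = 60 by yblocks = 34 blocks of 32x32 threads. */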
+
+ void shader(DeviceTask &task)
+ {
+ if (have_error())
+ return;
+
+ CUDAContextScope scope(this);
+
+ CUfunction cuShader;
+ CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
+ CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
+
+ /* get kernel function */
+ if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
+ cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
+ }
+ else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+ cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
+ }
+ else {
+ cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
+ }
+
+    /* Do tasks in smaller chunks, so we can cancel them. */
+ const int shader_chunk_size = 65536;
+ const int start = task.shader_x;
+ const int end = task.shader_x + task.shader_w;
+ int offset = task.offset;
+
+ bool canceled = false;
+ for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
+ for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
+ int shader_w = min(shader_chunk_size, end - shader_x);
+
+ /* pass in parameters */
+ void *args[8];
+ int arg = 0;
+ args[arg++] = &d_input;
+ args[arg++] = &d_output;
+ args[arg++] = &task.shader_eval_type;
+ if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
+ args[arg++] = &task.shader_filter;
+ }
+ args[arg++] = &shader_x;
+ args[arg++] = &shader_w;
+ args[arg++] = &offset;
+ args[arg++] = &sample;
+
+ /* launch kernel */
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(
+ &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
+
+ int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
+
+ cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuLaunchKernel(cuShader,
+ xblocks,
+ 1,
+ 1, /* blocks */
+ threads_per_block,
+ 1,
+ 1, /* threads */
+ 0,
+ 0,
+ args,
+ 0));
+
+ cuda_assert(cuCtxSynchronize());
+
+ if (task.get_cancel()) {
+ canceled = true;
+ break;
+ }
+ }
+
+ task.update_progress(NULL);
+ }
+ }
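+
+  /* Illustrative chunking (hypothetical values): with task.shader_w = 200000
+   * and shader_chunk_size = 65536, each sample runs four launches of widths
+   * 65536, 65536, 65536 and 3392, so cancellation is checked at least once
+   * per 65536 shader evaluations. */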
+
+ CUdeviceptr map_pixels(device_ptr mem)
+ {
+ if (!background) {
+ PixelMem pmem = pixel_mem_map[mem];
+ CUdeviceptr buffer;
+
+ size_t bytes;
+ cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
+ cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
+
+ return buffer;
+ }
+
+ return cuda_device_ptr(mem);
+ }
+
+ void unmap_pixels(device_ptr mem)
+ {
+ if (!background) {
+ PixelMem pmem = pixel_mem_map[mem];
+
+ cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
+ }
+ }
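+
+  /* Typical interop usage sketch (illustrative only, mirroring film_convert;
+   * `rgba` stands for any pixel-memory device_ptr):
+   *
+   *   CUdeviceptr d_rgba = map_pixels(rgba);
+   *   ... launch a kernel that writes to d_rgba ...
+   *   unmap_pixels(rgba);
+   *
+   * In background mode map_pixels() simply returns the CUDA device pointer
+   * and unmap_pixels() is a no-op. */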
+
+ void pixels_alloc(device_memory &mem)
+ {
+ PixelMem pmem;
+
+ pmem.w = mem.data_width;
+ pmem.h = mem.data_height;
+
+ CUDAContextScope scope(this);
+
+ glGenBuffers(1, &pmem.cuPBO);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+ if (mem.data_type == TYPE_HALF)
+ glBufferData(
+ GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
+ else
+ glBufferData(
+ GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+ glActiveTexture(GL_TEXTURE0);
+ glGenTextures(1, &pmem.cuTexId);
+ glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+ if (mem.data_type == TYPE_HALF)
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
+ else
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ CUresult result = cuGraphicsGLRegisterBuffer(
+ &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+
+ if (result == CUDA_SUCCESS) {
+ mem.device_pointer = pmem.cuTexId;
+ pixel_mem_map[mem.device_pointer] = pmem;
+
+ mem.device_size = mem.memory_size();
+ stats.mem_alloc(mem.device_size);
+
+ return;
+ }
+ else {
+      /* Failed to register the buffer; fall back to no interop. */
+ glDeleteBuffers(1, &pmem.cuPBO);
+ glDeleteTextures(1, &pmem.cuTexId);
+
+ background = true;
+ }
+ }
+
+ void pixels_copy_from(device_memory &mem, int y, int w, int h)
+ {
+ PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+ CUDAContextScope scope(this);
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+ uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
+ size_t offset = sizeof(uchar) * 4 * y * w;
+ memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
+ glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+ }
+
+ void pixels_free(device_memory &mem)
+ {
+ if (mem.device_pointer) {
+ PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+ CUDAContextScope scope(this);
+
+ cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
+ glDeleteBuffers(1, &pmem.cuPBO);
+ glDeleteTextures(1, &pmem.cuTexId);
+
+ pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
+ mem.device_pointer = 0;
+
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+ }
+
+ void draw_pixels(device_memory &mem,
+ int y,
+ int w,
+ int h,
+ int width,
+ int height,
+ int dx,
+ int dy,
+ int dw,
+ int dh,
+ bool transparent,
+ const DeviceDrawParams &draw_params)
+ {
+ assert(mem.type == MEM_PIXELS);
+
+ if (!background) {
+ const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
+ PixelMem pmem = pixel_mem_map[mem.device_pointer];
+ float *vpointer;
+
+ CUDAContextScope scope(this);
+
+      /* For multi-device rendering, this assumes the inefficient method of
+       * allocating all pixels on the device even though we only render to a subset. */
+ size_t offset = 4 * y * w;
+
+ if (mem.data_type == TYPE_HALF)
+ offset *= sizeof(GLhalf);
+ else
+ offset *= sizeof(uint8_t);
+
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+ glActiveTexture(GL_TEXTURE0);
+ glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+ if (mem.data_type == TYPE_HALF) {
+ glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
+ }
+ else {
+ glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
+ }
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+ if (transparent) {
+ glEnable(GL_BLEND);
+ glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
+ }
+
+ GLint shader_program;
+ if (use_fallback_shader) {
+ if (!bind_fallback_display_space_shader(dw, dh)) {
+ return;
+ }
+ shader_program = fallback_shader_program;
+ }
+ else {
+ draw_params.bind_display_space_shader_cb();
+ glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
+ }
+
+ if (!vertex_buffer) {
+ glGenBuffers(1, &vertex_buffer);
+ }
+
+ glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
+      /* Invalidate old contents; avoids stalling if the buffer is still
+       * waiting in the queue to be rendered. */
+ glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
+
+ vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
+
+ if (vpointer) {
+ /* texture coordinate - vertex pair */
+ vpointer[0] = 0.0f;
+ vpointer[1] = 0.0f;
+ vpointer[2] = dx;
+ vpointer[3] = dy;
+
+ vpointer[4] = (float)w / (float)pmem.w;
+ vpointer[5] = 0.0f;
+ vpointer[6] = (float)width + dx;
+ vpointer[7] = dy;
+
+ vpointer[8] = (float)w / (float)pmem.w;
+ vpointer[9] = (float)h / (float)pmem.h;
+ vpointer[10] = (float)width + dx;
+ vpointer[11] = (float)height + dy;
+
+ vpointer[12] = 0.0f;
+ vpointer[13] = (float)h / (float)pmem.h;
+ vpointer[14] = dx;
+ vpointer[15] = (float)height + dy;
+
+ glUnmapBuffer(GL_ARRAY_BUFFER);
+ }
+
+ GLuint vertex_array_object;
+ GLuint position_attribute, texcoord_attribute;
+
+ glGenVertexArrays(1, &vertex_array_object);
+ glBindVertexArray(vertex_array_object);
+
+ texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
+ position_attribute = glGetAttribLocation(shader_program, "pos");
+
+ glEnableVertexAttribArray(texcoord_attribute);
+ glEnableVertexAttribArray(position_attribute);
+
+ glVertexAttribPointer(
+ texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
+ glVertexAttribPointer(position_attribute,
+ 2,
+ GL_FLOAT,
+ GL_FALSE,
+ 4 * sizeof(float),
+ (const GLvoid *)(sizeof(float) * 2));
+
+ glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+ if (use_fallback_shader) {
+ glUseProgram(0);
+ }
+ else {
+ draw_params.unbind_display_space_shader_cb();
+ }
+
+ if (transparent) {
+ glDisable(GL_BLEND);
+ }
+
+ glBindTexture(GL_TEXTURE_2D, 0);
+
+ return;
+ }
+
+ Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
+ }
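+
+  /* The 16-float vertex buffer filled above holds four interleaved
+   * (texcoord.u, texcoord.v, position.x, position.y) vertices, which is why
+   * both glVertexAttribPointer calls use a stride of 4 * sizeof(float) with
+   * offsets 0 and 2 * sizeof(float). */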
+
+ void thread_run(DeviceTask *task)
+ {
+ CUDAContextScope scope(this);
+
+ if (task->type == DeviceTask::RENDER) {
+ DeviceRequestedFeatures requested_features;
+ if (use_split_kernel()) {
+ if (split_kernel == NULL) {
+ split_kernel = new CUDASplitKernel(this);
+ split_kernel->load_kernels(requested_features);
+ }
+ }
+
+ device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
+ /* keep rendering tiles until done */
+ RenderTile tile;
+ DenoisingTask denoising(this, *task);
+
+ while (task->acquire_tile(this, tile)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ if (use_split_kernel()) {
+ device_only_memory<uchar> void_buffer(this, "void_buffer");
+ split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+ }
+ else {
+ path_trace(*task, tile, work_tiles);
+ }
+ }
+ else if (tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
+
+ denoise(tile, denoising);
+
+ task->update_progress(&tile, tile.w * tile.h);
+ }
+
+ task->release_tile(tile);
+
+ if (task->get_cancel()) {
+ if (task->need_finish_queue == false)
+ break;
+ }
+ }
+
+ work_tiles.free();
+ }
+ else if (task->type == DeviceTask::SHADER) {
+ shader(*task);
+
+ cuda_assert(cuCtxSynchronize());
+ }
+ }
+
+ class CUDADeviceTask : public DeviceTask {
+ public:
+ CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task)
+ {
+ run = function_bind(&CUDADevice::thread_run, device, this);
+ }
+ };
+
+ int get_split_task_count(DeviceTask & /*task*/)
+ {
+ return 1;
+ }
+
+ void task_add(DeviceTask &task)
+ {
+ CUDAContextScope scope(this);
+
+ /* Load texture info. */
+ load_texture_info();
+
+ /* Synchronize all memory copies before executing task. */
+ cuda_assert(cuCtxSynchronize());
+
+ if (task.type == DeviceTask::FILM_CONVERT) {
+      /* Must be done in the main thread due to OpenGL access. */
+ film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
+ }
+ else {
+ task_pool.push(new CUDADeviceTask(this, task));
+ }
+ }
+
+ void task_wait()
+ {
+ task_pool.wait();
+ }
+
+ void task_cancel()
+ {
+ task_pool.cancel();
+ }
+
+ friend class CUDASplitKernelFunction;
+ friend class CUDASplitKernel;
+ friend class CUDAContextScope;
};
/* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
@@ -2207,496 +2305,501 @@ public:
*/
#undef cuda_assert
#define cuda_assert(stmt) \
- { \
- CUresult result = stmt; \
- \
- if(result != CUDA_SUCCESS) { \
- string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
- if(device->error_msg == "") \
- device->error_msg = message; \
- fprintf(stderr, "%s\n", message.c_str()); \
- /*cuda_abort();*/ \
- device->cuda_error_documentation(); \
- } \
- } (void) 0
-
+ { \
+ CUresult result = stmt; \
+\
+ if (result != CUDA_SUCCESS) { \
+ string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+ if (device->error_msg == "") \
+ device->error_msg = message; \
+ fprintf(stderr, "%s\n", message.c_str()); \
+ /*cuda_abort();*/ \
+ device->cuda_error_documentation(); \
+ } \
+ } \
+ (void)0
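+
+/* The trailing `(void)0` forces call sites to terminate the macro with a
+ * semicolon, so `cuda_assert(stmt);` parses like an ordinary statement; the
+ * GET_ATTR macro in device_cuda_capabilities() below uses the same idiom. */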
/* CUDA context scope. */
-CUDAContextScope::CUDAContextScope(CUDADevice *device)
-: device(device)
+CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
{
- cuda_assert(cuCtxPushCurrent(device->cuContext));
+ cuda_assert(cuCtxPushCurrent(device->cuContext));
}
CUDAContextScope::~CUDAContextScope()
{
- cuda_assert(cuCtxPopCurrent(NULL));
+ cuda_assert(cuCtxPopCurrent(NULL));
}
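/* Illustrative RAII usage (assumes a valid CUDADevice *device):
 *
 *   {
 *     CUDAContextScope scope(device);   // pushes device->cuContext
 *     cuda_assert(cuCtxSynchronize());  // runs with that context current
 *   }                                   // destructor pops the context again
 */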
/* split kernel */
-class CUDASplitKernelFunction : public SplitKernelFunction{
- CUDADevice* device;
- CUfunction func;
-public:
- CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
- {
- return enqueue(dim, NULL);
- }
-
- /* enqueue the kernel, returns false if there is an error */
- bool enqueue(const KernelDimensions &dim, void *args[])
- {
- if(device->have_error())
- return false;
-
- CUDAContextScope scope(device);
-
- /* we ignore dim.local_size for now, as this is faster */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
-
- int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block;
-
- cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
-
- cuda_assert(cuLaunchKernel(func,
- xblocks, 1, 1, /* blocks */
- threads_per_block, 1, 1, /* threads */
- 0, 0, args, 0));
-
- return !device->have_error();
- }
+class CUDASplitKernelFunction : public SplitKernelFunction {
+ CUDADevice *device;
+ CUfunction func;
+
+ public:
+ CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
+ {
+ }
+
+  /* Enqueue the kernel; returns false if there is an error. */
+ bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
+ {
+ return enqueue(dim, NULL);
+ }
+
+  /* Enqueue the kernel; returns false if there is an error. */
+ bool enqueue(const KernelDimensions &dim, void *args[])
+ {
+ if (device->have_error())
+ return false;
+
+ CUDAContextScope scope(device);
+
+ /* we ignore dim.local_size for now, as this is faster */
+ int threads_per_block;
+ cuda_assert(
+ cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+
+ int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
+ threads_per_block;
+
+ cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+
+ cuda_assert(cuLaunchKernel(func,
+ xblocks,
+ 1,
+ 1, /* blocks */
+ threads_per_block,
+ 1,
+ 1, /* threads */
+ 0,
+ 0,
+ args,
+ 0));
+
+ return !device->have_error();
+ }
};
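/* Illustrative launch arithmetic for enqueue() above (hypothetical values):
 * a global size of 512x512 yields 262144 work items, which with
 * threads_per_block = 256 flattens into xblocks = 1024 one-dimensional
 * blocks, since dim.local_size is deliberately ignored. */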
CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
{
}
-uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
+ device_memory & /*data*/,
+ size_t num_threads)
{
- CUDAContextScope scope(device);
+ CUDAContextScope scope(device);
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
+ device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
+ size_buffer.alloc(1);
+ size_buffer.zero_to_device();
- uint threads = num_threads;
- CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+ uint threads = num_threads;
+ CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
- struct args_t {
- uint* num_threads;
- CUdeviceptr* size;
- };
+ struct args_t {
+ uint *num_threads;
+ CUdeviceptr *size;
+ };
- args_t args = {
- &threads,
- &d_size
- };
+ args_t args = {&threads, &d_size};
- CUfunction state_buffer_size;
- cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+ CUfunction state_buffer_size;
+ cuda_assert(
+ cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
- cuda_assert(cuLaunchKernel(state_buffer_size,
- 1, 1, 1,
- 1, 1, 1,
- 0, 0, (void**)&args, 0));
+ cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
+ size_buffer.copy_from_device(0, 1, 1);
+ size_t size = size_buffer[0];
+ size_buffer.free();
- return size;
+ return size;
}
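/* Device-side counterpart sketch (hypothetical; the actual kernel lives in
 * the CUDA split-kernel sources): a single-thread launch writes the state
 * size for num_threads paths into the queried buffer, e.g.
 *
 *   extern "C" __global__ void kernel_cuda_state_buffer_size(uint num_threads,
 *                                                            uint64_t *size)
 *   {
 *     *size = split_data_buffer_size(NULL, num_threads);
 *   }
 */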
-bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
- RenderTile& rtile,
- int num_global_elements,
- device_memory& /*kernel_globals*/,
- device_memory& /*kernel_data*/,
- device_memory& split_data,
- device_memory& ray_state,
- device_memory& queue_index,
- device_memory& use_queues_flag,
- device_memory& work_pool_wgs)
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
+ RenderTile &rtile,
+ int num_global_elements,
+ device_memory & /*kernel_globals*/,
+ device_memory & /*kernel_data*/,
+ device_memory &split_data,
+ device_memory &ray_state,
+ device_memory &queue_index,
+ device_memory &use_queues_flag,
+ device_memory &work_pool_wgs)
{
- CUDAContextScope scope(device);
-
- CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
- CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
- CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
- CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
- CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
-
- CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
-
- int end_sample = rtile.start_sample + rtile.num_samples;
- int queue_size = dim.global_size[0] * dim.global_size[1];
-
- struct args_t {
- CUdeviceptr* split_data_buffer;
- int* num_elements;
- CUdeviceptr* ray_state;
- int* start_sample;
- int* end_sample;
- int* sx;
- int* sy;
- int* sw;
- int* sh;
- int* offset;
- int* stride;
- CUdeviceptr* queue_index;
- int* queuesize;
- CUdeviceptr* use_queues_flag;
- CUdeviceptr* work_pool_wgs;
- int* num_samples;
- CUdeviceptr* buffer;
- };
-
- args_t args = {
- &d_split_data,
- &num_global_elements,
- &d_ray_state,
- &rtile.start_sample,
- &end_sample,
- &rtile.x,
- &rtile.y,
- &rtile.w,
- &rtile.h,
- &rtile.offset,
- &rtile.stride,
- &d_queue_index,
- &queue_size,
- &d_use_queues_flag,
- &d_work_pool_wgs,
- &rtile.num_samples,
- &d_buffer
- };
-
- CUfunction data_init;
- cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
- if(device->have_error()) {
- return false;
- }
-
- CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
-
- return !device->have_error();
+ CUDAContextScope scope(device);
+
+ CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
+ CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
+ CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
+ CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
+ CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
+
+ CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
+
+ int end_sample = rtile.start_sample + rtile.num_samples;
+ int queue_size = dim.global_size[0] * dim.global_size[1];
+
+ struct args_t {
+ CUdeviceptr *split_data_buffer;
+ int *num_elements;
+ CUdeviceptr *ray_state;
+ int *start_sample;
+ int *end_sample;
+ int *sx;
+ int *sy;
+ int *sw;
+ int *sh;
+ int *offset;
+ int *stride;
+ CUdeviceptr *queue_index;
+ int *queuesize;
+ CUdeviceptr *use_queues_flag;
+ CUdeviceptr *work_pool_wgs;
+ int *num_samples;
+ CUdeviceptr *buffer;
+ };
+
+ args_t args = {&d_split_data,
+ &num_global_elements,
+ &d_ray_state,
+ &rtile.start_sample,
+ &end_sample,
+ &rtile.x,
+ &rtile.y,
+ &rtile.w,
+ &rtile.h,
+ &rtile.offset,
+ &rtile.stride,
+ &d_queue_index,
+ &queue_size,
+ &d_use_queues_flag,
+ &d_work_pool_wgs,
+ &rtile.num_samples,
+ &d_buffer};
+
+ CUfunction data_init;
+ cuda_assert(
+ cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+ if (device->have_error()) {
+ return false;
+ }
+
+ CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
+
+ return !device->have_error();
}
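/* Note on the args_t pattern above: cuLaunchKernel() expects kernelParams as
 * an array of pointers, one per kernel parameter. Since args_t is a plain
 * struct whose members are all pointers, `(void **)&args` is layout-compatible
 * with that array, so the struct doubles as a named, ordered parameter list. */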
-SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name,
- const DeviceRequestedFeatures&)
+SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
+ const DeviceRequestedFeatures &)
{
- CUDAContextScope scope(device);
- CUfunction func;
-
- cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
- if(device->have_error()) {
- device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
- return NULL;
- }
-
- return new CUDASplitKernelFunction(device, func);
+ CUDAContextScope scope(device);
+ CUfunction func;
+
+ cuda_assert(
+ cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
+ if (device->have_error()) {
+ device->cuda_error_message(
+ string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
+ return NULL;
+ }
+
+ return new CUDASplitKernelFunction(device, func);
}
int2 CUDASplitKernel::split_kernel_local_size()
{
- return make_int2(32, 1);
+ return make_int2(32, 1);
}
-int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
+int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
+ device_memory &data,
+ DeviceTask * /*task*/)
{
- CUDAContextScope scope(device);
- size_t free;
- size_t total;
+ CUDAContextScope scope(device);
+ size_t free;
+ size_t total;
- cuda_assert(cuMemGetInfo(&free, &total));
+ cuda_assert(cuMemGetInfo(&free, &total));
- VLOG(1) << "Maximum device allocation size: "
- << string_human_readable_number(free) << " bytes. ("
- << string_human_readable_size(free) << ").";
+ VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
+ << " bytes. (" << string_human_readable_size(free) << ").";
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
- size_t side = round_down((int)sqrt(num_elements), 32);
- int2 global_size = make_int2(side, round_down(num_elements / side, 16));
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
+ size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
+ size_t side = round_down((int)sqrt(num_elements), 32);
+ int2 global_size = make_int2(side, round_down(num_elements / side, 16));
+ VLOG(1) << "Global size: " << global_size << ".";
+ return global_size;
}
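/* Worked example for the sizing above (hypothetical numbers): with 4 GiB
 * free, half is budgeted for split-kernel state; if that budget allows
 * num_elements = 1000000 paths, then side = round_down(sqrt(1000000), 32) =
 * 992 and the global size becomes 992 x round_down(1000000 / 992, 16) =
 * 992 x 1008. */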
bool device_cuda_init()
{
#ifdef WITH_CUDA_DYNLOAD
- static bool initialized = false;
- static bool result = false;
-
- if(initialized)
- return result;
-
- initialized = true;
- int cuew_result = cuewInit(CUEW_INIT_CUDA);
- if(cuew_result == CUEW_SUCCESS) {
- VLOG(1) << "CUEW initialization succeeded";
- if(CUDADevice::have_precompiled_kernels()) {
- VLOG(1) << "Found precompiled kernels";
- result = true;
- }
-#ifndef _WIN32
- else if(cuewCompilerPath() != NULL) {
- VLOG(1) << "Found CUDA compiler " << cuewCompilerPath();
- result = true;
- }
- else {
- VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found,"
- << " unable to use CUDA";
- }
-#endif
- }
- else {
- VLOG(1) << "CUEW initialization failed: "
- << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED)
- ? "Error setting up atexit() handler"
- : "Error opening the library");
- }
-
- return result;
+ static bool initialized = false;
+ static bool result = false;
+
+ if (initialized)
+ return result;
+
+ initialized = true;
+ int cuew_result = cuewInit(CUEW_INIT_CUDA);
+ if (cuew_result == CUEW_SUCCESS) {
+ VLOG(1) << "CUEW initialization succeeded";
+ if (CUDADevice::have_precompiled_kernels()) {
+ VLOG(1) << "Found precompiled kernels";
+ result = true;
+ }
+# ifndef _WIN32
+ else if (cuewCompilerPath() != NULL) {
+ VLOG(1) << "Found CUDA compiler " << cuewCompilerPath();
+ result = true;
+ }
+ else {
+ VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found,"
+ << " unable to use CUDA";
+ }
+# endif
+ }
+ else {
+ VLOG(1) << "CUEW initialization failed: "
+ << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
+ "Error opening the library");
+ }
+
+ return result;
#else /* WITH_CUDA_DYNLOAD */
- return true;
-#endif /* WITH_CUDA_DYNLOAD */
+ return true;
+#endif /* WITH_CUDA_DYNLOAD */
}
-Device *device_cuda_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background)
+Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
{
- return new CUDADevice(info, stats, profiler, background);
+ return new CUDADevice(info, stats, profiler, background);
}
static CUresult device_cuda_safe_init()
{
#ifdef _WIN32
- __try {
- return cuInit(0);
- }
- __except(EXCEPTION_EXECUTE_HANDLER) {
- /* Ignore crashes inside the CUDA driver and hope we can
- * survive even with corrupted CUDA installs. */
- fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n");
- }
-
- return CUDA_ERROR_NO_DEVICE;
+ __try {
+ return cuInit(0);
+ }
+ __except (EXCEPTION_EXECUTE_HANDLER) {
+ /* Ignore crashes inside the CUDA driver and hope we can
+ * survive even with corrupted CUDA installs. */
+ fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n");
+ }
+
+ return CUDA_ERROR_NO_DEVICE;
#else
- return cuInit(0);
+ return cuInit(0);
#endif
}
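/* The __try/__except guard above is Windows structured exception handling:
 * it catches hard faults raised inside cuInit() (e.g. by a corrupted driver
 * install) that ordinary C++ try/catch would miss, degrading to "no CUDA
 * device" instead of crashing. Other platforms call cuInit() directly. */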
-void device_cuda_info(vector<DeviceInfo>& devices)
+void device_cuda_info(vector<DeviceInfo> &devices)
{
- CUresult result = device_cuda_safe_init();
- if(result != CUDA_SUCCESS) {
- if(result != CUDA_ERROR_NO_DEVICE)
- fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result));
- return;
- }
-
- int count = 0;
- result = cuDeviceGetCount(&count);
- if(result != CUDA_SUCCESS) {
- fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result));
- return;
- }
-
- vector<DeviceInfo> display_devices;
-
- for(int num = 0; num < count; num++) {
- char name[256];
-
- result = cuDeviceGetName(name, 256, num);
- if(result != CUDA_SUCCESS) {
- fprintf(stderr, "CUDA cuDeviceGetName: %s\n", cuewErrorString(result));
- continue;
- }
-
- int major;
- cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num);
- if(major < 3) {
- VLOG(1) << "Ignoring device \"" << name
- << "\", this graphics card is no longer supported.";
- continue;
- }
-
- DeviceInfo info;
-
- info.type = DEVICE_CUDA;
- info.description = string(name);
- info.num = num;
-
- info.has_half_images = (major >= 3);
- info.has_volume_decoupled = false;
-
- int pci_location[3] = {0, 0, 0};
- cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
- cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num);
- cuDeviceGetAttribute(&pci_location[2], CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, num);
- info.id = string_printf("CUDA_%s_%04x:%02x:%02x",
- name,
- (unsigned int)pci_location[0],
- (unsigned int)pci_location[1],
- (unsigned int)pci_location[2]);
-
- /* If device has a kernel timeout and no compute preemption, we assume
- * it is connected to a display and will freeze the display while doing
- * computations. */
- int timeout_attr = 0, preempt_attr = 0;
- cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num);
- cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num);
-
- if(timeout_attr && !preempt_attr) {
- VLOG(1) << "Device is recognized as display.";
- info.description += " (Display)";
- info.display_device = true;
- display_devices.push_back(info);
- }
- else {
- devices.push_back(info);
- }
- VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\".";
- }
-
- if(!display_devices.empty())
- devices.insert(devices.end(), display_devices.begin(), display_devices.end());
+ CUresult result = device_cuda_safe_init();
+ if (result != CUDA_SUCCESS) {
+ if (result != CUDA_ERROR_NO_DEVICE)
+ fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result));
+ return;
+ }
+
+ int count = 0;
+ result = cuDeviceGetCount(&count);
+ if (result != CUDA_SUCCESS) {
+ fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result));
+ return;
+ }
+
+ vector<DeviceInfo> display_devices;
+
+ for (int num = 0; num < count; num++) {
+ char name[256];
+
+ result = cuDeviceGetName(name, 256, num);
+ if (result != CUDA_SUCCESS) {
+ fprintf(stderr, "CUDA cuDeviceGetName: %s\n", cuewErrorString(result));
+ continue;
+ }
+
+ int major;
+ cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num);
+ if (major < 3) {
+ VLOG(1) << "Ignoring device \"" << name << "\", this graphics card is no longer supported.";
+ continue;
+ }
+
+ DeviceInfo info;
+
+ info.type = DEVICE_CUDA;
+ info.description = string(name);
+ info.num = num;
+
+ info.has_half_images = (major >= 3);
+ info.has_volume_decoupled = false;
+
+ int pci_location[3] = {0, 0, 0};
+ cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
+ cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num);
+ cuDeviceGetAttribute(&pci_location[2], CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, num);
+ info.id = string_printf("CUDA_%s_%04x:%02x:%02x",
+ name,
+ (unsigned int)pci_location[0],
+ (unsigned int)pci_location[1],
+ (unsigned int)pci_location[2]);
+
+ /* If device has a kernel timeout and no compute preemption, we assume
+ * it is connected to a display and will freeze the display while doing
+ * computations. */
+ int timeout_attr = 0, preempt_attr = 0;
+ cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num);
+ cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num);
+
+ if (timeout_attr && !preempt_attr) {
+ VLOG(1) << "Device is recognized as display.";
+ info.description += " (Display)";
+ info.display_device = true;
+ display_devices.push_back(info);
+ }
+ else {
+ devices.push_back(info);
+ }
+ VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\".";
+ }
+
+ if (!display_devices.empty())
+ devices.insert(devices.end(), display_devices.begin(), display_devices.end());
}
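/* Example of the resulting identifier (hypothetical device): a GPU named
 * "GeForce GTX 1080" on PCI domain 0, bus 1, device 0 yields
 * info.id = "CUDA_GeForce GTX 1080_0000:01:00"; if it also reports a kernel
 * timeout without compute preemption, it is appended after all non-display
 * devices. */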
string device_cuda_capabilities()
{
- CUresult result = device_cuda_safe_init();
- if(result != CUDA_SUCCESS) {
- if(result != CUDA_ERROR_NO_DEVICE) {
- return string("Error initializing CUDA: ") + cuewErrorString(result);
- }
- return "No CUDA device found\n";
- }
-
- int count;
- result = cuDeviceGetCount(&count);
- if(result != CUDA_SUCCESS) {
- return string("Error getting devices: ") + cuewErrorString(result);
- }
-
- string capabilities = "";
- for(int num = 0; num < count; num++) {
- char name[256];
- if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) {
- continue;
- }
- capabilities += string("\t") + name + "\n";
- int value;
+ CUresult result = device_cuda_safe_init();
+ if (result != CUDA_SUCCESS) {
+ if (result != CUDA_ERROR_NO_DEVICE) {
+ return string("Error initializing CUDA: ") + cuewErrorString(result);
+ }
+ return "No CUDA device found\n";
+ }
+
+ int count;
+ result = cuDeviceGetCount(&count);
+ if (result != CUDA_SUCCESS) {
+ return string("Error getting devices: ") + cuewErrorString(result);
+ }
+
+ string capabilities = "";
+ for (int num = 0; num < count; num++) {
+ char name[256];
+ if (cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) {
+ continue;
+ }
+ capabilities += string("\t") + name + "\n";
+ int value;
#define GET_ATTR(attr) \
- { \
- if(cuDeviceGetAttribute(&value, \
- CU_DEVICE_ATTRIBUTE_##attr, \
- num) == CUDA_SUCCESS) \
- { \
- capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", \
- value); \
- } \
- } (void) 0
- /* TODO(sergey): Strip all attributes which are not useful for us
- * or does not depend on the driver.
- */
- GET_ATTR(MAX_THREADS_PER_BLOCK);
- GET_ATTR(MAX_BLOCK_DIM_X);
- GET_ATTR(MAX_BLOCK_DIM_Y);
- GET_ATTR(MAX_BLOCK_DIM_Z);
- GET_ATTR(MAX_GRID_DIM_X);
- GET_ATTR(MAX_GRID_DIM_Y);
- GET_ATTR(MAX_GRID_DIM_Z);
- GET_ATTR(MAX_SHARED_MEMORY_PER_BLOCK);
- GET_ATTR(SHARED_MEMORY_PER_BLOCK);
- GET_ATTR(TOTAL_CONSTANT_MEMORY);
- GET_ATTR(WARP_SIZE);
- GET_ATTR(MAX_PITCH);
- GET_ATTR(MAX_REGISTERS_PER_BLOCK);
- GET_ATTR(REGISTERS_PER_BLOCK);
- GET_ATTR(CLOCK_RATE);
- GET_ATTR(TEXTURE_ALIGNMENT);
- GET_ATTR(GPU_OVERLAP);
- GET_ATTR(MULTIPROCESSOR_COUNT);
- GET_ATTR(KERNEL_EXEC_TIMEOUT);
- GET_ATTR(INTEGRATED);
- GET_ATTR(CAN_MAP_HOST_MEMORY);
- GET_ATTR(COMPUTE_MODE);
- GET_ATTR(MAXIMUM_TEXTURE1D_WIDTH);
- GET_ATTR(MAXIMUM_TEXTURE2D_WIDTH);
- GET_ATTR(MAXIMUM_TEXTURE2D_HEIGHT);
- GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH);
- GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT);
- GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH);
- GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_WIDTH);
- GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT);
- GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_LAYERS);
- GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_WIDTH);
- GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT);
- GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES);
- GET_ATTR(SURFACE_ALIGNMENT);
- GET_ATTR(CONCURRENT_KERNELS);
- GET_ATTR(ECC_ENABLED);
- GET_ATTR(TCC_DRIVER);
- GET_ATTR(MEMORY_CLOCK_RATE);
- GET_ATTR(GLOBAL_MEMORY_BUS_WIDTH);
- GET_ATTR(L2_CACHE_SIZE);
- GET_ATTR(MAX_THREADS_PER_MULTIPROCESSOR);
- GET_ATTR(ASYNC_ENGINE_COUNT);
- GET_ATTR(UNIFIED_ADDRESSING);
- GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_WIDTH);
- GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_LAYERS);
- GET_ATTR(CAN_TEX2D_GATHER);
- GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_WIDTH);
- GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_HEIGHT);
- GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE);
- GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE);
- GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE);
- GET_ATTR(TEXTURE_PITCH_ALIGNMENT);
- GET_ATTR(MAXIMUM_TEXTURECUBEMAP_WIDTH);
- GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH);
- GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS);
- GET_ATTR(MAXIMUM_SURFACE1D_WIDTH);
- GET_ATTR(MAXIMUM_SURFACE2D_WIDTH);
- GET_ATTR(MAXIMUM_SURFACE2D_HEIGHT);
- GET_ATTR(MAXIMUM_SURFACE3D_WIDTH);
- GET_ATTR(MAXIMUM_SURFACE3D_HEIGHT);
- GET_ATTR(MAXIMUM_SURFACE3D_DEPTH);
- GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_WIDTH);
- GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_LAYERS);
- GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_WIDTH);
- GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_HEIGHT);
- GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_LAYERS);
- GET_ATTR(MAXIMUM_SURFACECUBEMAP_WIDTH);
- GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH);
- GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS);
- GET_ATTR(MAXIMUM_TEXTURE1D_LINEAR_WIDTH);
- GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_WIDTH);
- GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_HEIGHT);
- GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_PITCH);
- GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH);
- GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT);
- GET_ATTR(COMPUTE_CAPABILITY_MAJOR);
- GET_ATTR(COMPUTE_CAPABILITY_MINOR);
- GET_ATTR(MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH);
- GET_ATTR(STREAM_PRIORITIES_SUPPORTED);
- GET_ATTR(GLOBAL_L1_CACHE_SUPPORTED);
- GET_ATTR(LOCAL_L1_CACHE_SUPPORTED);
- GET_ATTR(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
- GET_ATTR(MAX_REGISTERS_PER_MULTIPROCESSOR);
- GET_ATTR(MANAGED_MEMORY);
- GET_ATTR(MULTI_GPU_BOARD);
- GET_ATTR(MULTI_GPU_BOARD_GROUP_ID);
+ { \
+ if (cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_##attr, num) == CUDA_SUCCESS) { \
+ capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", value); \
+ } \
+ } \
+ (void)0
+ /* TODO(sergey): Strip all attributes which are not useful for us
+     * or do not depend on the driver.
+ */
+ GET_ATTR(MAX_THREADS_PER_BLOCK);
+ GET_ATTR(MAX_BLOCK_DIM_X);
+ GET_ATTR(MAX_BLOCK_DIM_Y);
+ GET_ATTR(MAX_BLOCK_DIM_Z);
+ GET_ATTR(MAX_GRID_DIM_X);
+ GET_ATTR(MAX_GRID_DIM_Y);
+ GET_ATTR(MAX_GRID_DIM_Z);
+ GET_ATTR(MAX_SHARED_MEMORY_PER_BLOCK);
+ GET_ATTR(SHARED_MEMORY_PER_BLOCK);
+ GET_ATTR(TOTAL_CONSTANT_MEMORY);
+ GET_ATTR(WARP_SIZE);
+ GET_ATTR(MAX_PITCH);
+ GET_ATTR(MAX_REGISTERS_PER_BLOCK);
+ GET_ATTR(REGISTERS_PER_BLOCK);
+ GET_ATTR(CLOCK_RATE);
+ GET_ATTR(TEXTURE_ALIGNMENT);
+ GET_ATTR(GPU_OVERLAP);
+ GET_ATTR(MULTIPROCESSOR_COUNT);
+ GET_ATTR(KERNEL_EXEC_TIMEOUT);
+ GET_ATTR(INTEGRATED);
+ GET_ATTR(CAN_MAP_HOST_MEMORY);
+ GET_ATTR(COMPUTE_MODE);
+ GET_ATTR(MAXIMUM_TEXTURE1D_WIDTH);
+ GET_ATTR(MAXIMUM_TEXTURE2D_WIDTH);
+ GET_ATTR(MAXIMUM_TEXTURE2D_HEIGHT);
+ GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH);
+ GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT);
+ GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH);
+ GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_WIDTH);
+ GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT);
+ GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_LAYERS);
+ GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_WIDTH);
+ GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT);
+ GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES);
+ GET_ATTR(SURFACE_ALIGNMENT);
+ GET_ATTR(CONCURRENT_KERNELS);
+ GET_ATTR(ECC_ENABLED);
+ GET_ATTR(TCC_DRIVER);
+ GET_ATTR(MEMORY_CLOCK_RATE);
+ GET_ATTR(GLOBAL_MEMORY_BUS_WIDTH);
+ GET_ATTR(L2_CACHE_SIZE);
+ GET_ATTR(MAX_THREADS_PER_MULTIPROCESSOR);
+ GET_ATTR(ASYNC_ENGINE_COUNT);
+ GET_ATTR(UNIFIED_ADDRESSING);
+ GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_WIDTH);
+ GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_LAYERS);
+ GET_ATTR(CAN_TEX2D_GATHER);
+ GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_WIDTH);
+ GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_HEIGHT);
+ GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE);
+ GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE);
+ GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE);
+ GET_ATTR(TEXTURE_PITCH_ALIGNMENT);
+ GET_ATTR(MAXIMUM_TEXTURECUBEMAP_WIDTH);
+ GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH);
+ GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS);
+ GET_ATTR(MAXIMUM_SURFACE1D_WIDTH);
+ GET_ATTR(MAXIMUM_SURFACE2D_WIDTH);
+ GET_ATTR(MAXIMUM_SURFACE2D_HEIGHT);
+ GET_ATTR(MAXIMUM_SURFACE3D_WIDTH);
+ GET_ATTR(MAXIMUM_SURFACE3D_HEIGHT);
+ GET_ATTR(MAXIMUM_SURFACE3D_DEPTH);
+ GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_WIDTH);
+ GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_LAYERS);
+ GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_WIDTH);
+ GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_HEIGHT);
+ GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_LAYERS);
+ GET_ATTR(MAXIMUM_SURFACECUBEMAP_WIDTH);
+ GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH);
+ GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS);
+ GET_ATTR(MAXIMUM_TEXTURE1D_LINEAR_WIDTH);
+ GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_WIDTH);
+ GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_HEIGHT);
+ GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_PITCH);
+ GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH);
+ GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT);
+ GET_ATTR(COMPUTE_CAPABILITY_MAJOR);
+ GET_ATTR(COMPUTE_CAPABILITY_MINOR);
+ GET_ATTR(MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH);
+ GET_ATTR(STREAM_PRIORITIES_SUPPORTED);
+ GET_ATTR(GLOBAL_L1_CACHE_SUPPORTED);
+ GET_ATTR(LOCAL_L1_CACHE_SUPPORTED);
+ GET_ATTR(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
+ GET_ATTR(MAX_REGISTERS_PER_MULTIPROCESSOR);
+ GET_ATTR(MANAGED_MEMORY);
+ GET_ATTR(MULTI_GPU_BOARD);
+ GET_ATTR(MULTI_GPU_BOARD_GROUP_ID);
#undef GET_ATTR
- capabilities += "\n";
- }
+ capabilities += "\n";
+ }
- return capabilities;
+ return capabilities;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 1bb144ef85a..05a7fb8ae4d 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -21,314 +21,329 @@
CCL_NAMESPACE_BEGIN
DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
-: tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE),
- profiler(NULL),
- storage(device),
- buffer(device),
- device(device)
+ : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE),
+ profiler(NULL),
+ storage(device),
+ buffer(device),
+ device(device)
{
- radius = task.denoising.radius;
- nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength));
- if(task.denoising.relative_pca) {
- pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength));
- }
- else {
- pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength));
- }
-
- render_buffer.frame_stride = task.frame_stride;
- render_buffer.pass_stride = task.pass_stride;
- render_buffer.offset = task.pass_denoising_data;
-
- target_buffer.pass_stride = task.target_pass_stride;
- target_buffer.denoising_clean_offset = task.pass_denoising_clean;
- target_buffer.offset = 0;
-
- functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device);
- functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device);
-
- tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
- tile_info->from_render = task.denoising_from_render? 1 : 0;
-
- tile_info->frames[0] = 0;
- tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES);
- for(int i = 1; i < tile_info->num_frames; i++) {
- tile_info->frames[i] = task.denoising_frames[i-1];
- }
-
- write_passes = task.denoising_write_passes;
- do_filter = task.denoising_do_filter;
+ radius = task.denoising.radius;
+ nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength));
+ if (task.denoising.relative_pca) {
+ pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength));
+ }
+ else {
+ pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength));
+ }
+
+ render_buffer.frame_stride = task.frame_stride;
+ render_buffer.pass_stride = task.pass_stride;
+ render_buffer.offset = task.pass_denoising_data;
+
+ target_buffer.pass_stride = task.target_pass_stride;
+ target_buffer.denoising_clean_offset = task.pass_denoising_clean;
+ target_buffer.offset = 0;
+
+ functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device);
+ functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device);
+
+ tile_info = (TileInfo *)tile_info_mem.alloc(sizeof(TileInfo) / sizeof(int));
+ tile_info->from_render = task.denoising_from_render ? 1 : 0;
+
+ tile_info->frames[0] = 0;
+ tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES);
+ for (int i = 1; i < tile_info->num_frames; i++) {
+ tile_info->frames[i] = task.denoising_frames[i - 1];
+ }
+
+ write_passes = task.denoising_write_passes;
+ do_filter = task.denoising_do_filter;
}
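/* Worked parameter mapping (hypothetical slider values): a denoising strength
 * of 0.5 gives nlm_k_2 = 2^lerp(-5, 3, 0.5) = 2^-1 = 0.5, and a feature
 * strength of 0.5 with relative_pca disabled gives
 * pca_threshold = 10^lerp(-5, 3, 0.5) = 10^-1 = 0.1. */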
DenoisingTask::~DenoisingTask()
{
- storage.XtWX.free();
- storage.XtWY.free();
- storage.transform.free();
- storage.rank.free();
- buffer.mem.free();
- buffer.temporary_mem.free();
- tile_info_mem.free();
+ storage.XtWX.free();
+ storage.XtWY.free();
+ storage.transform.free();
+ storage.rank.free();
+ buffer.mem.free();
+ buffer.temporary_mem.free();
+ tile_info_mem.free();
}
void DenoisingTask::set_render_buffer(RenderTile *rtiles)
{
- for(int i = 0; i < 9; i++) {
- tile_info->offsets[i] = rtiles[i].offset;
- tile_info->strides[i] = rtiles[i].stride;
- tile_info->buffers[i] = rtiles[i].buffer;
- }
- tile_info->x[0] = rtiles[3].x;
- tile_info->x[1] = rtiles[4].x;
- tile_info->x[2] = rtiles[5].x;
- tile_info->x[3] = rtiles[5].x + rtiles[5].w;
- tile_info->y[0] = rtiles[1].y;
- tile_info->y[1] = rtiles[4].y;
- tile_info->y[2] = rtiles[7].y;
- tile_info->y[3] = rtiles[7].y + rtiles[7].h;
-
- target_buffer.offset = rtiles[9].offset;
- target_buffer.stride = rtiles[9].stride;
- target_buffer.ptr = rtiles[9].buffer;
-
- if(write_passes && rtiles[9].buffers) {
- target_buffer.denoising_output_offset = rtiles[9].buffers->params.get_denoising_prefiltered_offset();
- }
- else {
- target_buffer.denoising_output_offset = 0;
- }
-
- tile_info_mem.copy_to_device();
+ for (int i = 0; i < 9; i++) {
+ tile_info->offsets[i] = rtiles[i].offset;
+ tile_info->strides[i] = rtiles[i].stride;
+ tile_info->buffers[i] = rtiles[i].buffer;
+ }
+ tile_info->x[0] = rtiles[3].x;
+ tile_info->x[1] = rtiles[4].x;
+ tile_info->x[2] = rtiles[5].x;
+ tile_info->x[3] = rtiles[5].x + rtiles[5].w;
+ tile_info->y[0] = rtiles[1].y;
+ tile_info->y[1] = rtiles[4].y;
+ tile_info->y[2] = rtiles[7].y;
+ tile_info->y[3] = rtiles[7].y + rtiles[7].h;
+
+ target_buffer.offset = rtiles[9].offset;
+ target_buffer.stride = rtiles[9].stride;
+ target_buffer.ptr = rtiles[9].buffer;
+
+ if (write_passes && rtiles[9].buffers) {
+ target_buffer.denoising_output_offset =
+ rtiles[9].buffers->params.get_denoising_prefiltered_offset();
+ }
+ else {
+ target_buffer.denoising_output_offset = 0;
+ }
+
+ tile_info_mem.copy_to_device();
}
void DenoisingTask::setup_denoising_buffer()
{
- /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
- rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
- rect = rect_expand(rect, radius);
- rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
-
- buffer.use_intensity = write_passes || (tile_info->num_frames > 1);
- buffer.passes = buffer.use_intensity? 15 : 14;
- buffer.width = rect.z - rect.x;
- buffer.stride = align_up(buffer.width, 4);
- buffer.h = rect.w - rect.y;
- int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
- buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
- buffer.frame_stride = buffer.pass_stride * buffer.passes;
- /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
- int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats);
- buffer.mem.alloc_to_device(mem_size, false);
- buffer.use_time = (tile_info->num_frames > 1);
-
- /* CPUs process shifts sequentially while GPUs process them in parallel. */
- int num_layers;
- if(buffer.gpu_temporary_mem) {
- /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
- int max_radius = max(radius, 6);
- int num_shifts = (2*max_radius + 1) * (2*max_radius + 1);
- num_layers = 2*num_shifts + 1;
- }
- else {
- num_layers = 3;
- }
- /* Allocate two layers per shift as well as one for the weight accumulation. */
- buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
+ /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
+ rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
+ rect = rect_expand(rect, radius);
+ rect = rect_clip(rect,
+ make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
+
+ buffer.use_intensity = write_passes || (tile_info->num_frames > 1);
+ buffer.passes = buffer.use_intensity ? 15 : 14;
+ buffer.width = rect.z - rect.x;
+ buffer.stride = align_up(buffer.width, 4);
+ buffer.h = rect.w - rect.y;
+ int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
+ buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
+ buffer.frame_stride = buffer.pass_stride * buffer.passes;
+ /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
+ int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats);
+ buffer.mem.alloc_to_device(mem_size, false);
+ buffer.use_time = (tile_info->num_frames > 1);
+
+ /* CPUs process shifts sequentially while GPUs process them in parallel. */
+ int num_layers;
+ if (buffer.gpu_temporary_mem) {
+ /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
+ int max_radius = max(radius, 6);
+ int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1);
+ num_layers = 2 * num_shifts + 1;
+ }
+ else {
+ num_layers = 3;
+ }
+ /* Allocate two layers per shift as well as one for the weight accumulation. */
+ buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride);
}
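/* Layout example for the buffer above (hypothetical numbers): a 64x64 filter
 * area with radius 8 expands to an 80x80 rect, so width = 80, stride =
 * align_up(80, 4) = 80 and h = 80. With alignment_floats = 16 this gives
 * pass_stride = align_up(80 * 80, 16) = 6400 floats and, for 14 passes,
 * frame_stride = 89600 floats. On a GPU (gpu_temporary_mem set), radius 8
 * gives num_shifts = 17 * 17 = 289 and num_layers = 2 * 289 + 1 = 579
 * temporary layers. */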
void DenoisingTask::prefilter_shadowing()
{
- device_ptr null_ptr = (device_ptr) 0;
-
- device_sub_ptr unfiltered_a (buffer.mem, 0, buffer.pass_stride);
- device_sub_ptr unfiltered_b (buffer.mem, 1*buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr sample_var (buffer.mem, 2*buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr sample_var_var (buffer.mem, 3*buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr buffer_var (buffer.mem, 5*buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr filtered_var (buffer.mem, 6*buffer.pass_stride, buffer.pass_stride);
-
- /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
- functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
-
- /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
- nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
- functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
-
- /* Reuse memory, the previous data isn't needed anymore. */
- device_ptr filtered_a = *buffer_var,
- filtered_b = *sample_var;
- /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
- nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false);
- functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
- functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
-
- device_ptr residual_var = *sample_var_var;
- /* Estimate the residual variance between the two filtered halves. */
- functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
-
- device_ptr final_a = *unfiltered_a,
- final_b = *unfiltered_b;
- /* Use the residual variance for a second filter pass. */
- nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false);
- functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
- functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
-
- /* Combine the two double-filtered halves to a final shadow feature. */
- device_sub_ptr shadow_pass(buffer.mem, 4*buffer.pass_stride, buffer.pass_stride);
- functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
+ device_ptr null_ptr = (device_ptr)0;
+
+ device_sub_ptr unfiltered_a(buffer.mem, 0, buffer.pass_stride);
+ device_sub_ptr unfiltered_b(buffer.mem, 1 * buffer.pass_stride, buffer.pass_stride);
+ device_sub_ptr sample_var(buffer.mem, 2 * buffer.pass_stride, buffer.pass_stride);
+ device_sub_ptr sample_var_var(buffer.mem, 3 * buffer.pass_stride, buffer.pass_stride);
+ device_sub_ptr buffer_var(buffer.mem, 5 * buffer.pass_stride, buffer.pass_stride);
+ device_sub_ptr filtered_var(buffer.mem, 6 * buffer.pass_stride, buffer.pass_stride);
+
+ /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
+ functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
+
+ /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
+ nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
+ functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
+
+ /* Reuse memory, the previous data isn't needed anymore. */
+ device_ptr filtered_a = *buffer_var, filtered_b = *sample_var;
+ /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
+ nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false);
+ functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
+ functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
+
+ device_ptr residual_var = *sample_var_var;
+ /* Estimate the residual variance between the two filtered halves. */
+ functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
+
+ device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b;
+ /* Use the residual variance for a second filter pass. */
+ nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false);
+ functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
+ functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
+
+ /* Combine the two double-filtered halves to a final shadow feature. */
+ device_sub_ptr shadow_pass(buffer.mem, 4 * buffer.pass_stride, buffer.pass_stride);
+ functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
}
void DenoisingTask::prefilter_features()
{
- device_sub_ptr unfiltered (buffer.mem, 8*buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr variance (buffer.mem, 9*buffer.pass_stride, buffer.pass_stride);
-
- int mean_from[] = { 0, 1, 2, 12, 6, 7, 8 };
- int variance_from[] = { 3, 4, 5, 13, 9, 10, 11};
- int pass_to[] = { 1, 2, 3, 0, 5, 6, 7};
- for(int pass = 0; pass < 7; pass++) {
- device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride);
- /* Get the unfiltered pass and its variance from the RenderBuffers. */
- functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance, 1.0f / render_buffer.samples);
- /* Smooth the pass and store the result in the denoising buffers. */
- nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false);
- functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
- }
+ device_sub_ptr unfiltered(buffer.mem, 8 * buffer.pass_stride, buffer.pass_stride);
+ device_sub_ptr variance(buffer.mem, 9 * buffer.pass_stride, buffer.pass_stride);
+
+ int mean_from[] = {0, 1, 2, 12, 6, 7, 8};
+ int variance_from[] = {3, 4, 5, 13, 9, 10, 11};
+ int pass_to[] = {1, 2, 3, 0, 5, 6, 7};
+ for (int pass = 0; pass < 7; pass++) {
+ device_sub_ptr feature_pass(
+ buffer.mem, pass_to[pass] * buffer.pass_stride, buffer.pass_stride);
+ /* Get the unfiltered pass and its variance from the RenderBuffers. */
+ functions.get_feature(mean_from[pass],
+ variance_from[pass],
+ *unfiltered,
+ *variance,
+ 1.0f / render_buffer.samples);
+ /* Smooth the pass and store the result in the denoising buffers. */
+ nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false);
+ functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
+ }
}
void DenoisingTask::prefilter_color()
{
- int mean_from[] = {20, 21, 22};
- int variance_from[] = {23, 24, 25};
- int mean_to[] = { 8, 9, 10};
- int variance_to[] = {11, 12, 13};
- int num_color_passes = 3;
-
- device_only_memory<float> temporary_color(device, "denoising temporary color");
- temporary_color.alloc_to_device(3*buffer.pass_stride, false);
-
- for(int pass = 0; pass < num_color_passes; pass++) {
- device_sub_ptr color_pass(temporary_color, pass*buffer.pass_stride, buffer.pass_stride);
- device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride);
- functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass, 1.0f / render_buffer.samples);
- }
-
- device_sub_ptr depth_pass (buffer.mem, 0, buffer.pass_stride);
- device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
- device_sub_ptr output_pass (buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride);
- functions.detect_outliers(temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
-
- if(buffer.use_intensity) {
- device_sub_ptr intensity_pass(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride);
- nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2*4.0f, true);
- functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass);
- }
+ int mean_from[] = {20, 21, 22};
+ int variance_from[] = {23, 24, 25};
+ int mean_to[] = {8, 9, 10};
+ int variance_to[] = {11, 12, 13};
+ int num_color_passes = 3;
+
+ device_only_memory<float> temporary_color(device, "denoising temporary color");
+ temporary_color.alloc_to_device(3 * buffer.pass_stride, false);
+
+ for (int pass = 0; pass < num_color_passes; pass++) {
+ device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride);
+ device_sub_ptr color_var_pass(
+ buffer.mem, variance_to[pass] * buffer.pass_stride, buffer.pass_stride);
+ functions.get_feature(mean_from[pass],
+ variance_from[pass],
+ *color_pass,
+ *color_var_pass,
+ 1.0f / render_buffer.samples);
+ }
+
+ device_sub_ptr depth_pass(buffer.mem, 0, buffer.pass_stride);
+ device_sub_ptr color_var_pass(
+ buffer.mem, variance_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
+ device_sub_ptr output_pass(buffer.mem, mean_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
+ functions.detect_outliers(
+ temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
+
+ if (buffer.use_intensity) {
+ device_sub_ptr intensity_pass(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride);
+ nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2 * 4.0f, true);
+ functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass);
+ }
}
void DenoisingTask::load_buffer()
{
- device_ptr null_ptr = (device_ptr) 0;
-
- int original_offset = render_buffer.offset;
-
- int num_passes = buffer.use_intensity? 15 : 14;
- for(int i = 0; i < tile_info->num_frames; i++) {
- for(int pass = 0; pass < num_passes; pass++) {
- device_sub_ptr to_pass(buffer.mem, i*buffer.frame_stride + pass*buffer.pass_stride, buffer.pass_stride);
- bool is_variance = (pass >= 11) && (pass <= 13);
- functions.get_feature(pass, -1, *to_pass, null_ptr, is_variance? (1.0f / render_buffer.samples) : 1.0f);
- }
- render_buffer.offset += render_buffer.frame_stride;
- }
-
- render_buffer.offset = original_offset;
+ device_ptr null_ptr = (device_ptr)0;
+
+ int original_offset = render_buffer.offset;
+
+ int num_passes = buffer.use_intensity ? 15 : 14;
+ for (int i = 0; i < tile_info->num_frames; i++) {
+ for (int pass = 0; pass < num_passes; pass++) {
+ device_sub_ptr to_pass(
+ buffer.mem, i * buffer.frame_stride + pass * buffer.pass_stride, buffer.pass_stride);
+ bool is_variance = (pass >= 11) && (pass <= 13);
+ functions.get_feature(
+ pass, -1, *to_pass, null_ptr, is_variance ? (1.0f / render_buffer.samples) : 1.0f);
+ }
+ render_buffer.offset += render_buffer.frame_stride;
+ }
+
+ render_buffer.offset = original_offset;
}
void DenoisingTask::write_buffer()
{
- reconstruction_state.buffer_params = make_int4(target_buffer.offset,
- target_buffer.stride,
- target_buffer.pass_stride,
- target_buffer.denoising_clean_offset);
- int num_passes = buffer.use_intensity? 15 : 14;
- for(int pass = 0; pass < num_passes; pass++) {
- device_sub_ptr from_pass(buffer.mem, pass*buffer.pass_stride, buffer.pass_stride);
- int out_offset = pass + target_buffer.denoising_output_offset;
- functions.write_feature(out_offset, *from_pass, target_buffer.ptr);
- }
+ reconstruction_state.buffer_params = make_int4(target_buffer.offset,
+ target_buffer.stride,
+ target_buffer.pass_stride,
+ target_buffer.denoising_clean_offset);
+ int num_passes = buffer.use_intensity ? 15 : 14;
+ for (int pass = 0; pass < num_passes; pass++) {
+ device_sub_ptr from_pass(buffer.mem, pass * buffer.pass_stride, buffer.pass_stride);
+ int out_offset = pass + target_buffer.denoising_output_offset;
+ functions.write_feature(out_offset, *from_pass, target_buffer.ptr);
+ }
}
void DenoisingTask::construct_transform()
{
- storage.w = filter_area.z;
- storage.h = filter_area.w;
+ storage.w = filter_area.z;
+ storage.h = filter_area.w;
- storage.transform.alloc_to_device(storage.w*storage.h*TRANSFORM_SIZE, false);
- storage.rank.alloc_to_device(storage.w*storage.h, false);
+ storage.transform.alloc_to_device(storage.w * storage.h * TRANSFORM_SIZE, false);
+ storage.rank.alloc_to_device(storage.w * storage.h, false);
- functions.construct_transform();
+ functions.construct_transform();
}
void DenoisingTask::reconstruct()
{
- storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false);
- storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false);
- storage.XtWX.zero_to_device();
- storage.XtWY.zero_to_device();
-
- reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
- int tile_coordinate_offset = filter_area.y*target_buffer.stride + filter_area.x;
- reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset,
- target_buffer.stride,
- target_buffer.pass_stride,
- target_buffer.denoising_clean_offset);
- reconstruction_state.source_w = rect.z-rect.x;
- reconstruction_state.source_h = rect.w-rect.y;
-
- device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride);
- device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride);
- for(int f = 0; f < tile_info->num_frames; f++) {
- device_ptr scale_ptr = 0;
- device_sub_ptr *scale_sub_ptr = NULL;
- if(tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) {
- scale_sub_ptr = new device_sub_ptr(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride);
- scale_ptr = **scale_sub_ptr;
- }
-
- functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f);
- delete scale_sub_ptr;
- }
- functions.solve(target_buffer.ptr);
+ storage.XtWX.alloc_to_device(storage.w * storage.h * XTWX_SIZE, false);
+ storage.XtWY.alloc_to_device(storage.w * storage.h * XTWY_SIZE, false);
+ storage.XtWX.zero_to_device();
+ storage.XtWY.zero_to_device();
+
+ reconstruction_state.filter_window = rect_from_shape(
+ filter_area.x - rect.x, filter_area.y - rect.y, storage.w, storage.h);
+ int tile_coordinate_offset = filter_area.y * target_buffer.stride + filter_area.x;
+ reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset,
+ target_buffer.stride,
+ target_buffer.pass_stride,
+ target_buffer.denoising_clean_offset);
+ reconstruction_state.source_w = rect.z - rect.x;
+ reconstruction_state.source_h = rect.w - rect.y;
+
+ device_sub_ptr color_ptr(buffer.mem, 8 * buffer.pass_stride, 3 * buffer.pass_stride);
+ device_sub_ptr color_var_ptr(buffer.mem, 11 * buffer.pass_stride, 3 * buffer.pass_stride);
+ for (int f = 0; f < tile_info->num_frames; f++) {
+ device_ptr scale_ptr = 0;
+ device_sub_ptr *scale_sub_ptr = NULL;
+ if (tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) {
+ scale_sub_ptr = new device_sub_ptr(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride);
+ scale_ptr = **scale_sub_ptr;
+ }
+
+ functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f);
+ delete scale_sub_ptr;
+ }
+ functions.solve(target_buffer.ptr);
}
void DenoisingTask::run_denoising(RenderTile *tile)
{
- RenderTile rtiles[10];
- rtiles[4] = *tile;
- functions.map_neighbor_tiles(rtiles);
- set_render_buffer(rtiles);
-
- setup_denoising_buffer();
-
- if(tile_info->from_render) {
- prefilter_shadowing();
- prefilter_features();
- prefilter_color();
- }
- else {
- load_buffer();
- }
-
- if(do_filter) {
- construct_transform();
- reconstruct();
- }
-
- if(write_passes) {
- write_buffer();
- }
-
- functions.unmap_neighbor_tiles(rtiles);
+ RenderTile rtiles[10];
+ rtiles[4] = *tile;
+ functions.map_neighbor_tiles(rtiles);
+ set_render_buffer(rtiles);
+
+ setup_denoising_buffer();
+
+ if (tile_info->from_render) {
+ prefilter_shadowing();
+ prefilter_features();
+ prefilter_color();
+ }
+ else {
+ load_buffer();
+ }
+
+ if (do_filter) {
+ construct_transform();
+ reconstruct();
+ }
+
+ if (write_passes) {
+ write_buffer();
+ }
+
+ functions.unmap_neighbor_tiles(rtiles);
}
CCL_NAMESPACE_END
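
The prefilter and buffer functions above all carve per-pass views out of one large device allocation: a pass lives at offset pass_index * pass_stride, so the combined shadow feature sits at 4 * buffer.pass_stride and the optional intensity pass at 14 * buffer.pass_stride. A minimal standalone sketch of that layout pattern follows, using stand-in types (Buffer, PassView) rather than the real Cycles API:

#include <cassert>
#include <cstddef>
#include <vector>

struct Buffer {
  std::vector<float> mem;  /* backing allocation: num_passes * pass_stride floats */
  std::size_t pass_stride; /* pixels per pass (w * h of the filter rect) */
};

/* RAII-style view of one pass inside the big buffer, mirroring device_sub_ptr. */
struct PassView {
  PassView(Buffer &buf, std::size_t offset, std::size_t size)
      : data(buf.mem.data() + offset), size(size)
  {
    assert(offset + size <= buf.mem.size());
  }
  float *data;
  std::size_t size;
};

int main()
{
  const std::size_t pass_stride = 64 * 64; /* hypothetical 64x64 filter rect */
  Buffer buffer = {std::vector<float>(15 * pass_stride), pass_stride};

  /* Pass 14 is the optional intensity feature in the layout above. */
  PassView intensity(buffer, 14 * buffer.pass_stride, buffer.pass_stride);
  intensity.data[0] = 1.0f; /* write through the view */
  return 0;
}
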
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index 5869aa05390..bd1d0193dbd 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -28,165 +28,169 @@
CCL_NAMESPACE_BEGIN
class DenoisingTask {
-public:
- /* Parameters of the denoising algorithm. */
- int radius;
- float nlm_k_2;
- float pca_threshold;
-
- /* Parameters of the RenderBuffers. */
- struct RenderBuffers {
- int offset;
- int pass_stride;
- int frame_stride;
- int samples;
- } render_buffer;
-
- /* Pointer and parameters of the target buffer. */
- struct TargetBuffer {
- int offset;
- int stride;
- int pass_stride;
- int denoising_clean_offset;
- int denoising_output_offset;
- device_ptr ptr;
- } target_buffer;
-
- TileInfo *tile_info;
- device_vector<int> tile_info_mem;
-
- ProfilingState *profiler;
-
- int4 rect;
- int4 filter_area;
-
- bool write_passes;
- bool do_filter;
-
- struct DeviceFunctions {
- function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */
- device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
- device_ptr variance_ptr, /* Contains the variance of the guide image. */
- device_ptr out_ptr /* The filtered output is written into this image. */
- )> non_local_means;
- function<bool(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame
- )> accumulate;
- function<bool(device_ptr output_ptr)> solve;
- function<bool()> construct_transform;
-
- function<bool(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r,
- int4 rect
- )> combine_halves;
- function<bool(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr
- )> divide_shadow;
- function<bool(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale
- )> get_feature;
- function<bool(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr
- )> detect_outliers;
- function<bool(int out_offset,
- device_ptr frop_ptr,
- device_ptr buffer_ptr
- )> write_feature;
- function<void(RenderTile *rtiles)> map_neighbor_tiles;
- function<void(RenderTile *rtiles)> unmap_neighbor_tiles;
- } functions;
-
- /* Stores state of the current Reconstruction operation,
- * which is accessed by the device in order to perform the operation. */
- struct ReconstructionState {
- int4 filter_window;
- int4 buffer_params;
-
- int source_w;
- int source_h;
- } reconstruction_state;
-
- /* Stores state of the current NLM operation,
- * which is accessed by the device in order to perform the operation. */
- struct NLMState {
- int r; /* Search radius of the filter. */
- int f; /* Patch size of the filter. */
- float a; /* Variance compensation factor in the MSE estimation. */
- float k_2; /* Squared value of the k parameter of the filter. */
- bool is_color;
-
- void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) { r = r_; f = f_; a = a_, k_2 = k_2_; is_color = is_color_; }
- } nlm_state;
-
- struct Storage {
- device_only_memory<float> transform;
- device_only_memory<int> rank;
- device_only_memory<float> XtWX;
- device_only_memory<float3> XtWY;
- int w;
- int h;
-
- Storage(Device *device)
- : transform(device, "denoising transform"),
- rank(device, "denoising rank"),
- XtWX(device, "denoising XtWX"),
- XtWY(device, "denoising XtWY")
- {}
- } storage;
-
- DenoisingTask(Device *device, const DeviceTask &task);
- ~DenoisingTask();
-
- void run_denoising(RenderTile *tile);
-
- struct DenoiseBuffers {
- int pass_stride;
- int passes;
- int stride;
- int h;
- int width;
- int frame_stride;
- device_only_memory<float> mem;
- device_only_memory<float> temporary_mem;
- bool use_time;
- bool use_intensity;
-
- bool gpu_temporary_mem;
-
- DenoiseBuffers(Device *device)
- : mem(device, "denoising pixel buffer"),
- temporary_mem(device, "denoising temporary mem")
- {}
- } buffer;
-
-protected:
- Device *device;
-
- void set_render_buffer(RenderTile *rtiles);
- void setup_denoising_buffer();
- void prefilter_shadowing();
- void prefilter_features();
- void prefilter_color();
- void construct_transform();
- void reconstruct();
-
- void load_buffer();
- void write_buffer();
+ public:
+ /* Parameters of the denoising algorithm. */
+ int radius;
+ float nlm_k_2;
+ float pca_threshold;
+
+ /* Parameters of the RenderBuffers. */
+ struct RenderBuffers {
+ int offset;
+ int pass_stride;
+ int frame_stride;
+ int samples;
+ } render_buffer;
+
+ /* Pointer and parameters of the target buffer. */
+ struct TargetBuffer {
+ int offset;
+ int stride;
+ int pass_stride;
+ int denoising_clean_offset;
+ int denoising_output_offset;
+ device_ptr ptr;
+ } target_buffer;
+
+ TileInfo *tile_info;
+ device_vector<int> tile_info_mem;
+
+ ProfilingState *profiler;
+
+ int4 rect;
+ int4 filter_area;
+
+ bool write_passes;
+ bool do_filter;
+
+ struct DeviceFunctions {
+ function<bool(
+ device_ptr image_ptr, /* Contains the values that are smoothed. */
+ device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
+ device_ptr variance_ptr, /* Contains the variance of the guide image. */
+ device_ptr out_ptr /* The filtered output is written into this image. */
+ )>
+ non_local_means;
+ function<bool(
+ device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)>
+ accumulate;
+ function<bool(device_ptr output_ptr)> solve;
+ function<bool()> construct_transform;
+
+ function<bool(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r,
+ int4 rect)>
+ combine_halves;
+ function<bool(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr)>
+ divide_shadow;
+ function<bool(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ float scale)>
+ get_feature;
+ function<bool(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr)>
+ detect_outliers;
+ function<bool(int out_offset, device_ptr from_ptr, device_ptr buffer_ptr)> write_feature;
+ function<void(RenderTile *rtiles)> map_neighbor_tiles;
+ function<void(RenderTile *rtiles)> unmap_neighbor_tiles;
+ } functions;
+
+ /* Stores state of the current Reconstruction operation,
+ * which is accessed by the device in order to perform the operation. */
+ struct ReconstructionState {
+ int4 filter_window;
+ int4 buffer_params;
+
+ int source_w;
+ int source_h;
+ } reconstruction_state;
+
+ /* Stores state of the current NLM operation,
+ * which is accessed by the device in order to perform the operation. */
+ struct NLMState {
+ int r; /* Search radius of the filter. */
+ int f; /* Patch size of the filter. */
+ float a; /* Variance compensation factor in the MSE estimation. */
+ float k_2; /* Squared value of the k parameter of the filter. */
+ bool is_color;
+
+ void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_)
+ {
+ r = r_;
+ f = f_;
+ a = a_;
+ k_2 = k_2_;
+ is_color = is_color_;
+ }
+ } nlm_state;
+
+ struct Storage {
+ device_only_memory<float> transform;
+ device_only_memory<int> rank;
+ device_only_memory<float> XtWX;
+ device_only_memory<float3> XtWY;
+ int w;
+ int h;
+
+ Storage(Device *device)
+ : transform(device, "denoising transform"),
+ rank(device, "denoising rank"),
+ XtWX(device, "denoising XtWX"),
+ XtWY(device, "denoising XtWY")
+ {
+ }
+ } storage;
+
+ DenoisingTask(Device *device, const DeviceTask &task);
+ ~DenoisingTask();
+
+ void run_denoising(RenderTile *tile);
+
+ struct DenoiseBuffers {
+ int pass_stride;
+ int passes;
+ int stride;
+ int h;
+ int width;
+ int frame_stride;
+ device_only_memory<float> mem;
+ device_only_memory<float> temporary_mem;
+ bool use_time;
+ bool use_intensity;
+
+ bool gpu_temporary_mem;
+
+ DenoiseBuffers(Device *device)
+ : mem(device, "denoising pixel buffer"), temporary_mem(device, "denoising temporary mem")
+ {
+ }
+ } buffer;
+
+ protected:
+ Device *device;
+
+ void set_render_buffer(RenderTile *rtiles);
+ void setup_denoising_buffer();
+ void prefilter_shadowing();
+ void prefilter_features();
+ void prefilter_color();
+ void construct_transform();
+ void reconstruct();
+
+ void load_buffer();
+ void write_buffer();
};
CCL_NAMESPACE_END
-#endif /* __DEVICE_DENOISING_H__ */
+#endif /* __DEVICE_DENOISING_H__ */
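
The DeviceFunctions struct above is a table of function<...> callbacks; each backend (CPU, CUDA, OpenCL) fills it with closures that launch its own kernels, which is how a single DenoisingTask runs unchanged on any device. A hedged sketch of that wiring with std::function and illustrative names (not the actual backend code):

#include <cstdio>
#include <functional>

typedef unsigned long long device_ptr; /* stand-in for the opaque device handle */

struct DeviceFunctions {
  std::function<bool(device_ptr image, device_ptr guide, device_ptr variance, device_ptr out)>
      non_local_means;
  std::function<bool(device_ptr output)> solve;
};

int main()
{
  DeviceFunctions functions;

  /* A CPU-style backend binds plain host code; a GPU backend would enqueue
   * kernels here instead. The bool return reports success, as in the header. */
  functions.non_local_means =
      [](device_ptr image, device_ptr guide, device_ptr variance, device_ptr out) {
        std::printf("nlm: image=%llu guide=%llu var=%llu out=%llu\n", image, guide, variance, out);
        return true;
      };
  functions.solve = [](device_ptr output) { return output != 0; };

  bool ok = functions.non_local_means(1, 2, 3, 4) && functions.solve(5);
  return ok ? 0 : 1;
}
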
diff --git a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h
index 94df1e009eb..c393a3f9cda 100644
--- a/intern/cycles/device/device_intern.h
+++ b/intern/cycles/device/device_intern.h
@@ -21,19 +21,22 @@ CCL_NAMESPACE_BEGIN
class Device;
-Device *device_cpu_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background);
+Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
bool device_opencl_init();
-Device *device_opencl_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background);
-bool device_opencl_compile_kernel(const vector<string>& parameters);
+Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
+bool device_opencl_compile_kernel(const vector<string> &parameters);
bool device_cuda_init();
-Device *device_cuda_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background);
-Device *device_network_create(DeviceInfo& info, Stats &stats, Profiler &profiler, const char *address);
-Device *device_multi_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background);
-
-void device_cpu_info(vector<DeviceInfo>& devices);
-void device_opencl_info(vector<DeviceInfo>& devices);
-void device_cuda_info(vector<DeviceInfo>& devices);
-void device_network_info(vector<DeviceInfo>& devices);
+Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
+Device *device_network_create(DeviceInfo &info,
+ Stats &stats,
+ Profiler &profiler,
+ const char *address);
+Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
+
+void device_cpu_info(vector<DeviceInfo> &devices);
+void device_opencl_info(vector<DeviceInfo> &devices);
+void device_cuda_info(vector<DeviceInfo> &devices);
+void device_network_info(vector<DeviceInfo> &devices);
string device_cpu_capabilities();
string device_opencl_capabilities();
@@ -41,4 +44,4 @@ string device_cuda_capabilities();
CCL_NAMESPACE_END
-#endif /* __DEVICE_INTERN_H__ */
+#endif /* __DEVICE_INTERN_H__ */
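
These per-backend declarations are consumed by a single factory that dispatches on the requested device type. A minimal sketch of that dispatch shape, with simplified signatures and illustrative stand-ins for device_cpu_create() and device_cuda_create():

#include <cstdio>

enum DeviceType { DEVICE_CPU, DEVICE_CUDA };

struct Device {
  virtual ~Device() {}
};
struct CPUDevice : Device {};
struct CUDADevice : Device {};

/* Hypothetical stand-ins for the per-backend create functions above. */
static Device *create_cpu() { return new CPUDevice(); }
static Device *create_cuda() { return new CUDADevice(); }

static Device *device_create(DeviceType type)
{
  switch (type) {
    case DEVICE_CPU:
      return create_cpu();
    case DEVICE_CUDA:
      return create_cuda();
  }
  return 0;
}

int main()
{
  Device *device = device_create(DEVICE_CPU);
  std::printf("created device at %p\n", (void *)device);
  delete device;
  return 0;
}
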
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index a8d29896553..859535307f4 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -22,21 +22,21 @@ CCL_NAMESPACE_BEGIN
/* Device Memory */
device_memory::device_memory(Device *device, const char *name, MemoryType type)
-: data_type(device_type_traits<uchar>::data_type),
- data_elements(device_type_traits<uchar>::num_elements),
- data_size(0),
- device_size(0),
- data_width(0),
- data_height(0),
- data_depth(0),
- type(type),
- name(name),
- interpolation(INTERPOLATION_NONE),
- extension(EXTENSION_REPEAT),
- device(device),
- device_pointer(0),
- host_pointer(0),
- shared_pointer(0)
+ : data_type(device_type_traits<uchar>::data_type),
+ data_elements(device_type_traits<uchar>::num_elements),
+ data_size(0),
+ device_size(0),
+ data_width(0),
+ data_height(0),
+ data_depth(0),
+ type(type),
+ name(name),
+ interpolation(INTERPOLATION_NONE),
+ extension(EXTENSION_REPEAT),
+ device(device),
+ device_pointer(0),
+ host_pointer(0),
+ shared_pointer(0)
{
}
@@ -46,95 +46,94 @@ device_memory::~device_memory()
void *device_memory::host_alloc(size_t size)
{
- if(!size) {
- return 0;
- }
+ if (!size) {
+ return 0;
+ }
- void *ptr = util_aligned_malloc(size, MIN_ALIGNMENT_CPU_DATA_TYPES);
+ void *ptr = util_aligned_malloc(size, MIN_ALIGNMENT_CPU_DATA_TYPES);
- if(ptr) {
- util_guarded_mem_alloc(size);
- }
- else {
- throw std::bad_alloc();
- }
+ if (ptr) {
+ util_guarded_mem_alloc(size);
+ }
+ else {
+ throw std::bad_alloc();
+ }
- return ptr;
+ return ptr;
}
void device_memory::host_free()
{
- if(host_pointer) {
- util_guarded_mem_free(memory_size());
- util_aligned_free((void*)host_pointer);
- host_pointer = 0;
- }
+ if (host_pointer) {
+ util_guarded_mem_free(memory_size());
+ util_aligned_free((void *)host_pointer);
+ host_pointer = 0;
+ }
}
void device_memory::device_alloc()
{
- assert(!device_pointer && type != MEM_TEXTURE);
- device->mem_alloc(*this);
+ assert(!device_pointer && type != MEM_TEXTURE);
+ device->mem_alloc(*this);
}
void device_memory::device_free()
{
- if(device_pointer) {
- device->mem_free(*this);
- }
+ if (device_pointer) {
+ device->mem_free(*this);
+ }
}
void device_memory::device_copy_to()
{
- if(host_pointer) {
- device->mem_copy_to(*this);
- }
+ if (host_pointer) {
+ device->mem_copy_to(*this);
+ }
}
void device_memory::device_copy_from(int y, int w, int h, int elem)
{
- assert(type != MEM_TEXTURE && type != MEM_READ_ONLY);
- device->mem_copy_from(*this, y, w, h, elem);
+ assert(type != MEM_TEXTURE && type != MEM_READ_ONLY);
+ device->mem_copy_from(*this, y, w, h, elem);
}
void device_memory::device_zero()
{
- if(data_size) {
- device->mem_zero(*this);
- }
+ if (data_size) {
+ device->mem_zero(*this);
+ }
}
void device_memory::swap_device(Device *new_device,
size_t new_device_size,
device_ptr new_device_ptr)
{
- original_device = device;
- original_device_size = device_size;
- original_device_ptr = device_pointer;
+ original_device = device;
+ original_device_size = device_size;
+ original_device_ptr = device_pointer;
- device = new_device;
- device_size = new_device_size;
- device_pointer = new_device_ptr;
+ device = new_device;
+ device_size = new_device_size;
+ device_pointer = new_device_ptr;
}
void device_memory::restore_device()
{
- device = original_device;
- device_size = original_device_size;
- device_pointer = original_device_ptr;
+ device = original_device;
+ device_size = original_device_size;
+ device_pointer = original_device_ptr;
}
/* Device Sub Ptr */
-device_sub_ptr::device_sub_ptr(device_memory& mem, int offset, int size)
-: device(mem.device)
+device_sub_ptr::device_sub_ptr(device_memory &mem, int offset, int size) : device(mem.device)
{
- ptr = device->mem_alloc_sub_ptr(mem, offset, size);
+ ptr = device->mem_alloc_sub_ptr(mem, offset, size);
}
device_sub_ptr::~device_sub_ptr()
{
- device->mem_free_sub_ptr(ptr);
+ device->mem_free_sub_ptr(ptr);
}
CCL_NAMESPACE_END
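
swap_device()/restore_device() above temporarily rebind a buffer to another device by stashing the original (device, size, pointer) triple; the multi-device code later in this diff relies on this when denoising neighbor tiles. A standalone sketch of the pattern with stand-in types:

#include <cstddef>
#include <cstdio>

struct Device {
  const char *name;
};

struct Memory {
  Device *device;
  std::size_t device_size;
  std::size_t device_pointer;

  void swap_device(Device *new_device, std::size_t new_size, std::size_t new_ptr)
  {
    /* Stash the original binding... */
    original_device = device;
    original_device_size = device_size;
    original_device_ptr = device_pointer;
    /* ...then point at the temporary one. */
    device = new_device;
    device_size = new_size;
    device_pointer = new_ptr;
  }

  void restore_device()
  {
    device = original_device;
    device_size = original_device_size;
    device_pointer = original_device_ptr;
  }

  Device *original_device;
  std::size_t original_device_size;
  std::size_t original_device_ptr;
};

int main()
{
  Device cpu = {"CPU"}, gpu = {"GPU"};
  Memory mem = {&cpu, 1024, 0xA0};
  mem.swap_device(&gpu, 1024, 0xB0); /* e.g. for the duration of denoising */
  std::printf("now on %s\n", mem.device->name);
  mem.restore_device();
  std::printf("back on %s\n", mem.device->name);
  return 0;
}
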
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index e43834bdc8d..f50184efba7 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -31,152 +31,155 @@ CCL_NAMESPACE_BEGIN
class Device;
-enum MemoryType {
- MEM_READ_ONLY,
- MEM_READ_WRITE,
- MEM_DEVICE_ONLY,
- MEM_TEXTURE,
- MEM_PIXELS
-};
+enum MemoryType { MEM_READ_ONLY, MEM_READ_WRITE, MEM_DEVICE_ONLY, MEM_TEXTURE, MEM_PIXELS };
/* Supported Data Types */
enum DataType {
- TYPE_UNKNOWN,
- TYPE_UCHAR,
- TYPE_UINT16,
- TYPE_UINT,
- TYPE_INT,
- TYPE_FLOAT,
- TYPE_HALF,
- TYPE_UINT64,
+ TYPE_UNKNOWN,
+ TYPE_UCHAR,
+ TYPE_UINT16,
+ TYPE_UINT,
+ TYPE_INT,
+ TYPE_FLOAT,
+ TYPE_HALF,
+ TYPE_UINT64,
};
static inline size_t datatype_size(DataType datatype)
{
- switch(datatype) {
- case TYPE_UNKNOWN: return 1;
- case TYPE_UCHAR: return sizeof(uchar);
- case TYPE_FLOAT: return sizeof(float);
- case TYPE_UINT: return sizeof(uint);
- case TYPE_UINT16: return sizeof(uint16_t);
- case TYPE_INT: return sizeof(int);
- case TYPE_HALF: return sizeof(half);
- case TYPE_UINT64: return sizeof(uint64_t);
- default: return 0;
- }
+ switch (datatype) {
+ case TYPE_UNKNOWN:
+ return 1;
+ case TYPE_UCHAR:
+ return sizeof(uchar);
+ case TYPE_FLOAT:
+ return sizeof(float);
+ case TYPE_UINT:
+ return sizeof(uint);
+ case TYPE_UINT16:
+ return sizeof(uint16_t);
+ case TYPE_INT:
+ return sizeof(int);
+ case TYPE_HALF:
+ return sizeof(half);
+ case TYPE_UINT64:
+ return sizeof(uint64_t);
+ default:
+ return 0;
+ }
}
/* Traits for data types */
template<typename T> struct device_type_traits {
- static const DataType data_type = TYPE_UNKNOWN;
- static const int num_elements = sizeof(T);
+ static const DataType data_type = TYPE_UNKNOWN;
+ static const int num_elements = sizeof(T);
};
template<> struct device_type_traits<uchar> {
- static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 1;
+ static const DataType data_type = TYPE_UCHAR;
+ static const int num_elements = 1;
};
template<> struct device_type_traits<uchar2> {
- static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 2;
+ static const DataType data_type = TYPE_UCHAR;
+ static const int num_elements = 2;
};
template<> struct device_type_traits<uchar3> {
- static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 3;
+ static const DataType data_type = TYPE_UCHAR;
+ static const int num_elements = 3;
};
template<> struct device_type_traits<uchar4> {
- static const DataType data_type = TYPE_UCHAR;
- static const int num_elements = 4;
+ static const DataType data_type = TYPE_UCHAR;
+ static const int num_elements = 4;
};
template<> struct device_type_traits<uint> {
- static const DataType data_type = TYPE_UINT;
- static const int num_elements = 1;
+ static const DataType data_type = TYPE_UINT;
+ static const int num_elements = 1;
};
template<> struct device_type_traits<uint2> {
- static const DataType data_type = TYPE_UINT;
- static const int num_elements = 2;
+ static const DataType data_type = TYPE_UINT;
+ static const int num_elements = 2;
};
template<> struct device_type_traits<uint3> {
- static const DataType data_type = TYPE_UINT;
- static const int num_elements = 3;
+ static const DataType data_type = TYPE_UINT;
+ static const int num_elements = 3;
};
template<> struct device_type_traits<uint4> {
- static const DataType data_type = TYPE_UINT;
- static const int num_elements = 4;
+ static const DataType data_type = TYPE_UINT;
+ static const int num_elements = 4;
};
template<> struct device_type_traits<int> {
- static const DataType data_type = TYPE_INT;
- static const int num_elements = 1;
+ static const DataType data_type = TYPE_INT;
+ static const int num_elements = 1;
};
template<> struct device_type_traits<int2> {
- static const DataType data_type = TYPE_INT;
- static const int num_elements = 2;
+ static const DataType data_type = TYPE_INT;
+ static const int num_elements = 2;
};
template<> struct device_type_traits<int3> {
- static const DataType data_type = TYPE_INT;
- static const int num_elements = 3;
+ static const DataType data_type = TYPE_INT;
+ static const int num_elements = 3;
};
template<> struct device_type_traits<int4> {
- static const DataType data_type = TYPE_INT;
- static const int num_elements = 4;
+ static const DataType data_type = TYPE_INT;
+ static const int num_elements = 4;
};
template<> struct device_type_traits<float> {
- static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 1;
+ static const DataType data_type = TYPE_FLOAT;
+ static const int num_elements = 1;
};
template<> struct device_type_traits<float2> {
- static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 2;
+ static const DataType data_type = TYPE_FLOAT;
+ static const int num_elements = 2;
};
template<> struct device_type_traits<float3> {
- static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 4;
+ static const DataType data_type = TYPE_FLOAT;
+ static const int num_elements = 4;
};
template<> struct device_type_traits<float4> {
- static const DataType data_type = TYPE_FLOAT;
- static const int num_elements = 4;
+ static const DataType data_type = TYPE_FLOAT;
+ static const int num_elements = 4;
};
template<> struct device_type_traits<half> {
- static const DataType data_type = TYPE_HALF;
- static const int num_elements = 1;
+ static const DataType data_type = TYPE_HALF;
+ static const int num_elements = 1;
};
template<> struct device_type_traits<ushort4> {
- static const DataType data_type = TYPE_UINT16;
- static const int num_elements = 4;
+ static const DataType data_type = TYPE_UINT16;
+ static const int num_elements = 4;
};
template<> struct device_type_traits<uint16_t> {
- static const DataType data_type = TYPE_UINT16;
- static const int num_elements = 1;
+ static const DataType data_type = TYPE_UINT16;
+ static const int num_elements = 1;
};
template<> struct device_type_traits<half4> {
- static const DataType data_type = TYPE_HALF;
- static const int num_elements = 4;
+ static const DataType data_type = TYPE_HALF;
+ static const int num_elements = 4;
};
template<> struct device_type_traits<uint64_t> {
- static const DataType data_type = TYPE_UINT64;
- static const int num_elements = 1;
+ static const DataType data_type = TYPE_UINT64;
+ static const int num_elements = 1;
};
/* Device Memory
@@ -184,64 +187,67 @@ template<> struct device_type_traits<uint64_t> {
* Base class for all device memory. This should not be allocated directly,
* instead the appropriate subclass can be used. */
-class device_memory
-{
-public:
- size_t memory_size() { return data_size*data_elements*datatype_size(data_type); }
- size_t memory_elements_size(int elements) {
- return elements*data_elements*datatype_size(data_type);
- }
-
- /* Data information. */
- DataType data_type;
- int data_elements;
- size_t data_size;
- size_t device_size;
- size_t data_width;
- size_t data_height;
- size_t data_depth;
- MemoryType type;
- const char *name;
- InterpolationType interpolation;
- ExtensionType extension;
-
- /* Pointers. */
- Device *device;
- device_ptr device_pointer;
- void *host_pointer;
- void *shared_pointer;
-
- virtual ~device_memory();
-
- void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr);
- void restore_device();
-
-protected:
- friend class CUDADevice;
-
- /* Only create through subclasses. */
- device_memory(Device *device, const char *name, MemoryType type);
-
- /* No copying allowed. */
- device_memory(const device_memory&);
- device_memory& operator = (const device_memory&);
-
- /* Host allocation on the device. All host_pointer memory should be
- * allocated with these functions, for devices that support using
- * the same pointer for host and device. */
- void *host_alloc(size_t size);
- void host_free();
-
- /* Device memory allocation and copying. */
- void device_alloc();
- void device_free();
- void device_copy_to();
- void device_copy_from(int y, int w, int h, int elem);
- void device_zero();
-
- device_ptr original_device_ptr;
- size_t original_device_size;
- Device *original_device;
+class device_memory {
+ public:
+ size_t memory_size()
+ {
+ return data_size * data_elements * datatype_size(data_type);
+ }
+ size_t memory_elements_size(int elements)
+ {
+ return elements * data_elements * datatype_size(data_type);
+ }
+
+ /* Data information. */
+ DataType data_type;
+ int data_elements;
+ size_t data_size;
+ size_t device_size;
+ size_t data_width;
+ size_t data_height;
+ size_t data_depth;
+ MemoryType type;
+ const char *name;
+ InterpolationType interpolation;
+ ExtensionType extension;
+
+ /* Pointers. */
+ Device *device;
+ device_ptr device_pointer;
+ void *host_pointer;
+ void *shared_pointer;
+
+ virtual ~device_memory();
+
+ void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr);
+ void restore_device();
+
+ protected:
+ friend class CUDADevice;
+
+ /* Only create through subclasses. */
+ device_memory(Device *device, const char *name, MemoryType type);
+
+ /* No copying allowed. */
+ device_memory(const device_memory &);
+ device_memory &operator=(const device_memory &);
+
+ /* Host allocation on the device. All host_pointer memory should be
+ * allocated with these functions, for devices that support using
+ * the same pointer for host and device. */
+ void *host_alloc(size_t size);
+ void host_free();
+
+ /* Device memory allocation and copying. */
+ void device_alloc();
+ void device_free();
+ void device_copy_to();
+ void device_copy_from(int y, int w, int h, int elem);
+ void device_zero();
+
+ device_ptr original_device_ptr;
+ size_t original_device_size;
+ Device *original_device;
};
/* Device Only Memory
@@ -249,51 +255,49 @@ protected:
* Working memory only needed by the device, with no corresponding allocation
* on the host. Only used internally in the device implementations. */
-template<typename T>
-class device_only_memory : public device_memory
-{
-public:
- device_only_memory(Device *device, const char *name)
- : device_memory(device, name, MEM_DEVICE_ONLY)
- {
- data_type = device_type_traits<T>::data_type;
- data_elements = max(device_type_traits<T>::num_elements, 1);
- }
-
- virtual ~device_only_memory()
- {
- free();
- }
-
- void alloc_to_device(size_t num, bool shrink_to_fit = true)
- {
- size_t new_size = num;
- bool reallocate;
-
- if(shrink_to_fit) {
- reallocate = (data_size != new_size);
- }
- else {
- reallocate = (data_size < new_size);
- }
-
- if(reallocate) {
- device_free();
- data_size = new_size;
- device_alloc();
- }
- }
-
- void free()
- {
- device_free();
- data_size = 0;
- }
-
- void zero_to_device()
- {
- device_zero();
- }
+template<typename T> class device_only_memory : public device_memory {
+ public:
+ device_only_memory(Device *device, const char *name)
+ : device_memory(device, name, MEM_DEVICE_ONLY)
+ {
+ data_type = device_type_traits<T>::data_type;
+ data_elements = max(device_type_traits<T>::num_elements, 1);
+ }
+
+ virtual ~device_only_memory()
+ {
+ free();
+ }
+
+ void alloc_to_device(size_t num, bool shrink_to_fit = true)
+ {
+ size_t new_size = num;
+ bool reallocate;
+
+ if (shrink_to_fit) {
+ reallocate = (data_size != new_size);
+ }
+ else {
+ reallocate = (data_size < new_size);
+ }
+
+ if (reallocate) {
+ device_free();
+ data_size = new_size;
+ device_alloc();
+ }
+ }
+
+ void free()
+ {
+ device_free();
+ data_size = 0;
+ }
+
+ void zero_to_device()
+ {
+ device_zero();
+ }
};
/* Device Vector
@@ -307,135 +311,134 @@ public:
* automatically attached to kernel globals, using the provided name
* matching an entry in kernel_textures.h. */
-template<typename T> class device_vector : public device_memory
-{
-public:
- device_vector(Device *device, const char *name, MemoryType type)
- : device_memory(device, name, type)
- {
- data_type = device_type_traits<T>::data_type;
- data_elements = device_type_traits<T>::num_elements;
-
- assert(data_elements > 0);
- }
-
- virtual ~device_vector()
- {
- free();
- }
-
- /* Host memory allocation. */
- T *alloc(size_t width, size_t height = 0, size_t depth = 0)
- {
- size_t new_size = size(width, height, depth);
-
- if(new_size != data_size) {
- device_free();
- host_free();
- host_pointer = host_alloc(sizeof(T)*new_size);
- assert(device_pointer == 0);
- }
-
- data_size = new_size;
- data_width = width;
- data_height = height;
- data_depth = depth;
-
- return data();
- }
-
- /* Host memory resize. Only use this if the original data needs to be
- * preserved, it is faster to call alloc() if it can be discarded. */
- T *resize(size_t width, size_t height = 0, size_t depth = 0)
- {
- size_t new_size = size(width, height, depth);
-
- if(new_size != data_size) {
- void *new_ptr = host_alloc(sizeof(T)*new_size);
-
- if(new_size && data_size) {
- size_t min_size = ((new_size < data_size)? new_size: data_size);
- memcpy((T*)new_ptr, (T*)host_pointer, sizeof(T)*min_size);
- }
-
- device_free();
- host_free();
- host_pointer = new_ptr;
- assert(device_pointer == 0);
- }
-
- data_size = new_size;
- data_width = width;
- data_height = height;
- data_depth = depth;
-
- return data();
- }
-
- /* Take over data from an existing array. */
- void steal_data(array<T>& from)
- {
- device_free();
- host_free();
-
- data_size = from.size();
- data_width = 0;
- data_height = 0;
- data_depth = 0;
- host_pointer = from.steal_pointer();
- assert(device_pointer == 0);
- }
-
- /* Free device and host memory. */
- void free()
- {
- device_free();
- host_free();
-
- data_size = 0;
- data_width = 0;
- data_height = 0;
- data_depth = 0;
- host_pointer = 0;
- assert(device_pointer == 0);
- }
-
- size_t size()
- {
- return data_size;
- }
-
- T* data()
- {
- return (T*)host_pointer;
- }
-
- T& operator[](size_t i)
- {
- assert(i < data_size);
- return data()[i];
- }
-
- void copy_to_device()
- {
- device_copy_to();
- }
-
- void copy_from_device(int y, int w, int h)
- {
- device_copy_from(y, w, h, sizeof(T));
- }
-
- void zero_to_device()
- {
- device_zero();
- }
-
-protected:
- size_t size(size_t width, size_t height, size_t depth)
- {
- return width * ((height == 0)? 1: height) * ((depth == 0)? 1: depth);
- }
+template<typename T> class device_vector : public device_memory {
+ public:
+ device_vector(Device *device, const char *name, MemoryType type)
+ : device_memory(device, name, type)
+ {
+ data_type = device_type_traits<T>::data_type;
+ data_elements = device_type_traits<T>::num_elements;
+
+ assert(data_elements > 0);
+ }
+
+ virtual ~device_vector()
+ {
+ free();
+ }
+
+ /* Host memory allocation. */
+ T *alloc(size_t width, size_t height = 0, size_t depth = 0)
+ {
+ size_t new_size = size(width, height, depth);
+
+ if (new_size != data_size) {
+ device_free();
+ host_free();
+ host_pointer = host_alloc(sizeof(T) * new_size);
+ assert(device_pointer == 0);
+ }
+
+ data_size = new_size;
+ data_width = width;
+ data_height = height;
+ data_depth = depth;
+
+ return data();
+ }
+
+ /* Host memory resize. Only use this if the original data needs to be
+ * preserved, it is faster to call alloc() if it can be discarded. */
+ T *resize(size_t width, size_t height = 0, size_t depth = 0)
+ {
+ size_t new_size = size(width, height, depth);
+
+ if (new_size != data_size) {
+ void *new_ptr = host_alloc(sizeof(T) * new_size);
+
+ if (new_size && data_size) {
+ size_t min_size = ((new_size < data_size) ? new_size : data_size);
+ memcpy((T *)new_ptr, (T *)host_pointer, sizeof(T) * min_size);
+ }
+
+ device_free();
+ host_free();
+ host_pointer = new_ptr;
+ assert(device_pointer == 0);
+ }
+
+ data_size = new_size;
+ data_width = width;
+ data_height = height;
+ data_depth = depth;
+
+ return data();
+ }
+
+ /* Take over data from an existing array. */
+ void steal_data(array<T> &from)
+ {
+ device_free();
+ host_free();
+
+ data_size = from.size();
+ data_width = 0;
+ data_height = 0;
+ data_depth = 0;
+ host_pointer = from.steal_pointer();
+ assert(device_pointer == 0);
+ }
+
+ /* Free device and host memory. */
+ void free()
+ {
+ device_free();
+ host_free();
+
+ data_size = 0;
+ data_width = 0;
+ data_height = 0;
+ data_depth = 0;
+ host_pointer = 0;
+ assert(device_pointer == 0);
+ }
+
+ size_t size()
+ {
+ return data_size;
+ }
+
+ T *data()
+ {
+ return (T *)host_pointer;
+ }
+
+ T &operator[](size_t i)
+ {
+ assert(i < data_size);
+ return data()[i];
+ }
+
+ void copy_to_device()
+ {
+ device_copy_to();
+ }
+
+ void copy_from_device(int y, int w, int h)
+ {
+ device_copy_from(y, w, h, sizeof(T));
+ }
+
+ void zero_to_device()
+ {
+ device_zero();
+ }
+
+ protected:
+ size_t size(size_t width, size_t height, size_t depth)
+ {
+ return width * ((height == 0) ? 1 : height) * ((depth == 0) ? 1 : depth);
+ }
};
/* Pixel Memory
@@ -443,28 +446,26 @@ protected:
* Device memory to efficiently draw as pixels to the screen in interactive
* rendering. Only copying pixels from the device is supported, not copying to. */
-template<typename T> class device_pixels : public device_vector<T>
-{
-public:
- device_pixels(Device *device, const char *name)
- : device_vector<T>(device, name, MEM_PIXELS)
- {
- }
-
- void alloc_to_device(size_t width, size_t height, size_t depth = 0)
- {
- device_vector<T>::alloc(width, height, depth);
-
- if(!device_memory::device_pointer) {
- device_memory::device_alloc();
- }
- }
-
- T *copy_from_device(int y, int w, int h)
- {
- device_memory::device_copy_from(y, w, h, sizeof(T));
- return device_vector<T>::data();
- }
+template<typename T> class device_pixels : public device_vector<T> {
+ public:
+ device_pixels(Device *device, const char *name) : device_vector<T>(device, name, MEM_PIXELS)
+ {
+ }
+
+ void alloc_to_device(size_t width, size_t height, size_t depth = 0)
+ {
+ device_vector<T>::alloc(width, height, depth);
+
+ if (!device_memory::device_pointer) {
+ device_memory::device_alloc();
+ }
+ }
+
+ T *copy_from_device(int y, int w, int h)
+ {
+ device_memory::device_copy_from(y, w, h, sizeof(T));
+ return device_vector<T>::data();
+ }
};
/* Device Sub Memory
@@ -476,25 +477,24 @@ public:
* Note: some devices require offset and size of the sub_ptr to be properly
 * aligned to device->mem_address_alignment(). */
-class device_sub_ptr
-{
-public:
- device_sub_ptr(device_memory& mem, int offset, int size);
- ~device_sub_ptr();
+class device_sub_ptr {
+ public:
+ device_sub_ptr(device_memory &mem, int offset, int size);
+ ~device_sub_ptr();
- device_ptr operator*() const
- {
- return ptr;
- }
+ device_ptr operator*() const
+ {
+ return ptr;
+ }
-protected:
- /* No copying. */
- device_sub_ptr& operator = (const device_sub_ptr&);
+ protected:
+ /* No copying. */
+ device_sub_ptr &operator=(const device_sub_ptr &);
- Device *device;
- device_ptr ptr;
+ Device *device;
+ device_ptr ptr;
};
CCL_NAMESPACE_END
-#endif /* __DEVICE_MEMORY_H__ */
+#endif /* __DEVICE_MEMORY_H__ */
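
One detail worth noting in the traits above: device_type_traits<float3> reports num_elements = 4, not 3, because Cycles' float3 is padded to 16 bytes, so device buffers of float3 and float4 share the same stride. A tiny sketch of the traits pattern with stand-in types:

#include <cstdio>

enum DataType { TYPE_FLOAT };

struct float3 {
  float x, y, z, pad; /* padded to 16 bytes, like Cycles' float3 */
};

template<typename T> struct device_type_traits;

template<> struct device_type_traits<float> {
  static const DataType data_type = TYPE_FLOAT;
  static const int num_elements = 1;
};

template<> struct device_type_traits<float3> {
  static const DataType data_type = TYPE_FLOAT;
  static const int num_elements = 4; /* matches the padded layout */
};

int main()
{
  std::printf("float: %d element(s), float3: %d element(s)\n",
              device_type_traits<float>::num_elements,
              device_type_traits<float3>::num_elements);
  return 0;
}
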
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index bdb7c87fa57..4a40e106115 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -31,391 +31,406 @@
CCL_NAMESPACE_BEGIN
-class MultiDevice : public Device
-{
-public:
- struct SubDevice {
- explicit SubDevice(Device *device_)
- : device(device_) {}
-
- Device *device;
- map<device_ptr, device_ptr> ptr_map;
- };
-
- list<SubDevice> devices;
- device_ptr unique_key;
-
- MultiDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_)
- : Device(info, stats, profiler, background_), unique_key(1)
- {
- foreach(DeviceInfo& subinfo, info.multi_devices) {
- Device *device = Device::create(subinfo, sub_stats_, profiler, background);
-
- /* Always add CPU devices at the back since GPU devices can change
- * host memory pointers, which CPU uses as device pointer. */
- if(subinfo.type == DEVICE_CPU) {
- devices.push_back(SubDevice(device));
- }
- else {
- devices.push_front(SubDevice(device));
- }
- }
+class MultiDevice : public Device {
+ public:
+ struct SubDevice {
+ explicit SubDevice(Device *device_) : device(device_)
+ {
+ }
+
+ Device *device;
+ map<device_ptr, device_ptr> ptr_map;
+ };
+
+ list<SubDevice> devices;
+ device_ptr unique_key;
+
+ MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
+ : Device(info, stats, profiler, background_), unique_key(1)
+ {
+ foreach (DeviceInfo &subinfo, info.multi_devices) {
+ Device *device = Device::create(subinfo, sub_stats_, profiler, background);
+
+ /* Always add CPU devices at the back since GPU devices can change
+ * host memory pointers, which CPU uses as device pointer. */
+ if (subinfo.type == DEVICE_CPU) {
+ devices.push_back(SubDevice(device));
+ }
+ else {
+ devices.push_front(SubDevice(device));
+ }
+ }
#ifdef WITH_NETWORK
- /* try to add network devices */
- ServerDiscovery discovery(true);
- time_sleep(1.0);
+ /* try to add network devices */
+ ServerDiscovery discovery(true);
+ time_sleep(1.0);
- vector<string> servers = discovery.get_server_list();
+ vector<string> servers = discovery.get_server_list();
- foreach(string& server, servers) {
- Device *device = device_network_create(info, stats, profiler, server.c_str());
- if(device)
- devices.push_back(SubDevice(device));
- }
+ foreach (string &server, servers) {
+ Device *device = device_network_create(info, stats, profiler, server.c_str());
+ if (device)
+ devices.push_back(SubDevice(device));
+ }
#endif
- }
-
- ~MultiDevice()
- {
- foreach(SubDevice& sub, devices)
- delete sub.device;
- }
-
- const string& error_message()
- {
- foreach(SubDevice& sub, devices) {
- if(sub.device->error_message() != "") {
- if(error_msg == "")
- error_msg = sub.device->error_message();
- break;
- }
- }
-
- return error_msg;
- }
-
- virtual bool show_samples() const
- {
- if(devices.size() > 1) {
- return false;
- }
- return devices.front().device->show_samples();
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const {
- BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
- foreach(const SubDevice& sub_device, devices) {
- bvh_layout_mask &= sub_device.device->get_bvh_layout_mask();
- }
- return bvh_layout_mask;
- }
-
- bool load_kernels(const DeviceRequestedFeatures& requested_features)
- {
- foreach(SubDevice& sub, devices)
- if(!sub.device->load_kernels(requested_features))
- return false;
-
- return true;
- }
-
- bool wait_for_availability(const DeviceRequestedFeatures& requested_features)
- {
- foreach(SubDevice& sub, devices)
- if(!sub.device->wait_for_availability(requested_features))
- return false;
-
- return true;
- }
-
- DeviceKernelStatus get_active_kernel_switch_state()
- {
- DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL;
-
- foreach(SubDevice& sub, devices) {
- DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state();
- switch (subresult) {
- case DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL:
- result = subresult;
- break;
-
- case DEVICE_KERNEL_FEATURE_KERNEL_INVALID:
- case DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE:
- return subresult;
-
- case DEVICE_KERNEL_USING_FEATURE_KERNEL:
- case DEVICE_KERNEL_UNKNOWN:
- break;
- }
- }
- return result;
- }
-
- void mem_alloc(device_memory& mem)
- {
- device_ptr key = unique_key++;
-
- foreach(SubDevice& sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = 0;
- mem.device_size = 0;
-
- sub.device->mem_alloc(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size);
- }
-
- void mem_copy_to(device_memory& mem)
- {
- device_ptr existing_key = mem.device_pointer;
- device_ptr key = (existing_key)? existing_key: unique_key++;
- size_t existing_size = mem.device_size;
-
- foreach(SubDevice& sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key)? sub.ptr_map[existing_key]: 0;
- mem.device_size = existing_size;
-
- sub.device->mem_copy_to(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size - existing_size);
- }
-
- void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
- {
- device_ptr key = mem.device_pointer;
- int i = 0, sub_h = h/devices.size();
-
- foreach(SubDevice& sub, devices) {
- int sy = y + i*sub_h;
- int sh = (i == (int)devices.size() - 1)? h - sub_h*i: sub_h;
-
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
-
- sub.device->mem_copy_from(mem, sy, w, sh, elem);
- i++;
- }
-
- mem.device = this;
- mem.device_pointer = key;
- }
-
- void mem_zero(device_memory& mem)
- {
- device_ptr existing_key = mem.device_pointer;
- device_ptr key = (existing_key)? existing_key: unique_key++;
- size_t existing_size = mem.device_size;
-
- foreach(SubDevice& sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key)? sub.ptr_map[existing_key]: 0;
- mem.device_size = existing_size;
-
- sub.device->mem_zero(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
-
- mem.device = this;
- mem.device_pointer = key;
- stats.mem_alloc(mem.device_size - existing_size);
- }
-
- void mem_free(device_memory& mem)
- {
- device_ptr key = mem.device_pointer;
- size_t existing_size = mem.device_size;
-
- foreach(SubDevice& sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
- mem.device_size = existing_size;
-
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(key));
- }
-
- mem.device = this;
- mem.device_pointer = 0;
- mem.device_size = 0;
- stats.mem_free(existing_size);
- }
-
- void const_copy_to(const char *name, void *host, size_t size)
- {
- foreach(SubDevice& sub, devices)
- sub.device->const_copy_to(name, host, size);
- }
-
- void draw_pixels(
- device_memory& rgba, int y,
- int w, int h, int width, int height,
- int dx, int dy, int dw, int dh,
- bool transparent, const DeviceDrawParams &draw_params)
- {
- device_ptr key = rgba.device_pointer;
- int i = 0, sub_h = h/devices.size();
- int sub_height = height/devices.size();
-
- foreach(SubDevice& sub, devices) {
- int sy = y + i*sub_h;
- int sh = (i == (int)devices.size() - 1)? h - sub_h*i: sub_h;
- int sheight = (i == (int)devices.size() - 1)? height - sub_height*i: sub_height;
- int sdy = dy + i*sub_height;
- /* adjust math for w/width */
-
- rgba.device_pointer = sub.ptr_map[key];
- sub.device->draw_pixels(rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params);
- i++;
- }
-
- rgba.device_pointer = key;
- }
-
- void map_tile(Device *sub_device, RenderTile& tile)
- {
- foreach(SubDevice& sub, devices) {
- if(sub.device == sub_device) {
- if(tile.buffer) tile.buffer = sub.ptr_map[tile.buffer];
- }
- }
- }
-
- int device_number(Device *sub_device)
- {
- int i = 0;
-
- foreach(SubDevice& sub, devices) {
- if(sub.device == sub_device)
- return i;
- i++;
- }
-
- return -1;
- }
-
- void map_neighbor_tiles(Device *sub_device, RenderTile *tiles)
- {
- for(int i = 0; i < 9; i++) {
- if(!tiles[i].buffers) {
- continue;
- }
-
- /* If the tile was rendered on another device, copy its memory to
- * to the current device now, for the duration of the denoising task.
- * Note that this temporarily modifies the RenderBuffers and calls
- * the device, so this function is not thread safe. */
- device_vector<float> &mem = tiles[i].buffers->buffer;
- if(mem.device != sub_device) {
- /* Only copy from device to host once. This is faster, but
- * also required for the case where a CPU thread is denoising
- * a tile rendered on the GPU. In that case we have to avoid
- * overwriting the buffer being denoised by the CPU thread. */
- if(!tiles[i].buffers->map_neighbor_copied) {
- tiles[i].buffers->map_neighbor_copied = true;
- mem.copy_from_device(0, mem.data_size, 1);
- }
-
- mem.swap_device(sub_device, 0, 0);
-
- mem.copy_to_device();
- tiles[i].buffer = mem.device_pointer;
- tiles[i].device_size = mem.device_size;
-
- mem.restore_device();
- }
- }
- }
-
- void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles)
- {
- /* Copy denoised result back to the host. */
- device_vector<float> &mem = tiles[9].buffers->buffer;
- mem.swap_device(sub_device, tiles[9].device_size, tiles[9].buffer);
- mem.copy_from_device(0, mem.data_size, 1);
- mem.restore_device();
- /* Copy denoised result to the original device. */
- mem.copy_to_device();
-
- for(int i = 0; i < 9; i++) {
- if(!tiles[i].buffers) {
- continue;
- }
-
- device_vector<float> &mem = tiles[i].buffers->buffer;
- if(mem.device != sub_device) {
- mem.swap_device(sub_device, tiles[i].device_size, tiles[i].buffer);
- sub_device->mem_free(mem);
- mem.restore_device();
- }
- }
- }
-
- int get_split_task_count(DeviceTask& task)
- {
- int total_tasks = 0;
- list<DeviceTask> tasks;
- task.split(tasks, devices.size());
- foreach(SubDevice& sub, devices) {
- if(!tasks.empty()) {
- DeviceTask subtask = tasks.front();
- tasks.pop_front();
-
- total_tasks += sub.device->get_split_task_count(subtask);
- }
- }
- return total_tasks;
- }
-
- void task_add(DeviceTask& task)
- {
- list<DeviceTask> tasks;
- task.split(tasks, devices.size());
-
- foreach(SubDevice& sub, devices) {
- if(!tasks.empty()) {
- DeviceTask subtask = tasks.front();
- tasks.pop_front();
-
- if(task.buffer) subtask.buffer = sub.ptr_map[task.buffer];
- if(task.rgba_byte) subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
- if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half];
- if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input];
- if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output];
-
- sub.device->task_add(subtask);
- }
- }
- }
-
- void task_wait()
- {
- foreach(SubDevice& sub, devices)
- sub.device->task_wait();
- }
-
- void task_cancel()
- {
- foreach(SubDevice& sub, devices)
- sub.device->task_cancel();
- }
-
-protected:
- Stats sub_stats_;
+ }
+
+ ~MultiDevice()
+ {
+ foreach (SubDevice &sub, devices)
+ delete sub.device;
+ }
+
+ const string &error_message()
+ {
+ foreach (SubDevice &sub, devices) {
+ if (sub.device->error_message() != "") {
+ if (error_msg == "")
+ error_msg = sub.device->error_message();
+ break;
+ }
+ }
+
+ return error_msg;
+ }
+
+ virtual bool show_samples() const
+ {
+ if (devices.size() > 1) {
+ return false;
+ }
+ return devices.front().device->show_samples();
+ }
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const
+ {
+ BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
+ foreach (const SubDevice &sub_device, devices) {
+ bvh_layout_mask &= sub_device.device->get_bvh_layout_mask();
+ }
+ return bvh_layout_mask;
+ }
+
+ bool load_kernels(const DeviceRequestedFeatures &requested_features)
+ {
+ foreach (SubDevice &sub, devices)
+ if (!sub.device->load_kernels(requested_features))
+ return false;
+
+ return true;
+ }
+
+ bool wait_for_availability(const DeviceRequestedFeatures &requested_features)
+ {
+ foreach (SubDevice &sub, devices)
+ if (!sub.device->wait_for_availability(requested_features))
+ return false;
+
+ return true;
+ }
+
+ DeviceKernelStatus get_active_kernel_switch_state()
+ {
+ DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL;
+
+ foreach (SubDevice &sub, devices) {
+ DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state();
+ switch (subresult) {
+ case DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL:
+ result = subresult;
+ break;
+
+ case DEVICE_KERNEL_FEATURE_KERNEL_INVALID:
+ case DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE:
+ return subresult;
+
+ case DEVICE_KERNEL_USING_FEATURE_KERNEL:
+ case DEVICE_KERNEL_UNKNOWN:
+ break;
+ }
+ }
+ return result;
+ }
+
+ void mem_alloc(device_memory &mem)
+ {
+ device_ptr key = unique_key++;
+
+ foreach (SubDevice &sub, devices) {
+ mem.device = sub.device;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ sub.device->mem_alloc(mem);
+ sub.ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size);
+ }
+
+ void mem_copy_to(device_memory &mem)
+ {
+ device_ptr existing_key = mem.device_pointer;
+ device_ptr key = (existing_key) ? existing_key : unique_key++;
+ size_t existing_size = mem.device_size;
+
+ foreach (SubDevice &sub, devices) {
+ mem.device = sub.device;
+ mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ sub.device->mem_copy_to(mem);
+ sub.ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size - existing_size);
+ }
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+ {
+ device_ptr key = mem.device_pointer;
+ int i = 0, sub_h = h / devices.size();
+
+ foreach (SubDevice &sub, devices) {
+ int sy = y + i * sub_h;
+ int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
+
+ mem.device = sub.device;
+ mem.device_pointer = sub.ptr_map[key];
+
+ sub.device->mem_copy_from(mem, sy, w, sh, elem);
+ i++;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ }
+
+ void mem_zero(device_memory &mem)
+ {
+ device_ptr existing_key = mem.device_pointer;
+ device_ptr key = (existing_key) ? existing_key : unique_key++;
+ size_t existing_size = mem.device_size;
+
+ foreach (SubDevice &sub, devices) {
+ mem.device = sub.device;
+ mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ sub.device->mem_zero(mem);
+ sub.ptr_map[key] = mem.device_pointer;
+ }
+
+ mem.device = this;
+ mem.device_pointer = key;
+ stats.mem_alloc(mem.device_size - existing_size);
+ }
+
+ void mem_free(device_memory &mem)
+ {
+ device_ptr key = mem.device_pointer;
+ size_t existing_size = mem.device_size;
+
+ foreach (SubDevice &sub, devices) {
+ mem.device = sub.device;
+ mem.device_pointer = sub.ptr_map[key];
+ mem.device_size = existing_size;
+
+ sub.device->mem_free(mem);
+ sub.ptr_map.erase(sub.ptr_map.find(key));
+ }
+
+ mem.device = this;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+ stats.mem_free(existing_size);
+ }
+
+ void const_copy_to(const char *name, void *host, size_t size)
+ {
+ foreach (SubDevice &sub, devices)
+ sub.device->const_copy_to(name, host, size);
+ }
+
+ void draw_pixels(device_memory &rgba,
+ int y,
+ int w,
+ int h,
+ int width,
+ int height,
+ int dx,
+ int dy,
+ int dw,
+ int dh,
+ bool transparent,
+ const DeviceDrawParams &draw_params)
+ {
+ device_ptr key = rgba.device_pointer;
+ int i = 0, sub_h = h / devices.size();
+ int sub_height = height / devices.size();
+
+ foreach (SubDevice &sub, devices) {
+ int sy = y + i * sub_h;
+ int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
+ int sheight = (i == (int)devices.size() - 1) ? height - sub_height * i : sub_height;
+ int sdy = dy + i * sub_height;
+ /* adjust math for w/width */
+
+ rgba.device_pointer = sub.ptr_map[key];
+ sub.device->draw_pixels(
+ rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params);
+ i++;
+ }
+
+ rgba.device_pointer = key;
+ }
+
+ void map_tile(Device *sub_device, RenderTile &tile)
+ {
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device) {
+ if (tile.buffer)
+ tile.buffer = sub.ptr_map[tile.buffer];
+ }
+ }
+ }
+
+ int device_number(Device *sub_device)
+ {
+ int i = 0;
+
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device)
+ return i;
+ i++;
+ }
+
+ return -1;
+ }
+
+ void map_neighbor_tiles(Device *sub_device, RenderTile *tiles)
+ {
+ for (int i = 0; i < 9; i++) {
+ if (!tiles[i].buffers) {
+ continue;
+ }
+
+ /* If the tile was rendered on another device, copy its memory to
+ * the current device now, for the duration of the denoising task.
+ * Note that this temporarily modifies the RenderBuffers and calls
+ * the device, so this function is not thread safe. */
+ device_vector<float> &mem = tiles[i].buffers->buffer;
+ if (mem.device != sub_device) {
+ /* Only copy from device to host once. This is faster, but
+ * also required for the case where a CPU thread is denoising
+ * a tile rendered on the GPU. In that case we have to avoid
+ * overwriting the buffer being denoised by the CPU thread. */
+ if (!tiles[i].buffers->map_neighbor_copied) {
+ tiles[i].buffers->map_neighbor_copied = true;
+ mem.copy_from_device(0, mem.data_size, 1);
+ }
+
+ mem.swap_device(sub_device, 0, 0);
+
+ mem.copy_to_device();
+ tiles[i].buffer = mem.device_pointer;
+ tiles[i].device_size = mem.device_size;
+
+ mem.restore_device();
+ }
+ }
+ }
+
+ void unmap_neighbor_tiles(Device *sub_device, RenderTile *tiles)
+ {
+ /* Copy denoised result back to the host. */
+ device_vector<float> &mem = tiles[9].buffers->buffer;
+ mem.swap_device(sub_device, tiles[9].device_size, tiles[9].buffer);
+ mem.copy_from_device(0, mem.data_size, 1);
+ mem.restore_device();
+ /* Copy denoised result to the original device. */
+ mem.copy_to_device();
+
+ for (int i = 0; i < 9; i++) {
+ if (!tiles[i].buffers) {
+ continue;
+ }
+
+ device_vector<float> &mem = tiles[i].buffers->buffer;
+ if (mem.device != sub_device) {
+ mem.swap_device(sub_device, tiles[i].device_size, tiles[i].buffer);
+ sub_device->mem_free(mem);
+ mem.restore_device();
+ }
+ }
+ }
+
+ int get_split_task_count(DeviceTask &task)
+ {
+ int total_tasks = 0;
+ list<DeviceTask> tasks;
+ task.split(tasks, devices.size());
+ foreach (SubDevice &sub, devices) {
+ if (!tasks.empty()) {
+ DeviceTask subtask = tasks.front();
+ tasks.pop_front();
+
+ total_tasks += sub.device->get_split_task_count(subtask);
+ }
+ }
+ return total_tasks;
+ }
+
+ void task_add(DeviceTask &task)
+ {
+ list<DeviceTask> tasks;
+ task.split(tasks, devices.size());
+
+ foreach (SubDevice &sub, devices) {
+ if (!tasks.empty()) {
+ DeviceTask subtask = tasks.front();
+ tasks.pop_front();
+
+ if (task.buffer)
+ subtask.buffer = sub.ptr_map[task.buffer];
+ if (task.rgba_byte)
+ subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
+ if (task.rgba_half)
+ subtask.rgba_half = sub.ptr_map[task.rgba_half];
+ if (task.shader_input)
+ subtask.shader_input = sub.ptr_map[task.shader_input];
+ if (task.shader_output)
+ subtask.shader_output = sub.ptr_map[task.shader_output];
+
+ sub.device->task_add(subtask);
+ }
+ }
+ }
+
+ void task_wait()
+ {
+ foreach (SubDevice &sub, devices)
+ sub.device->task_wait();
+ }
+
+ void task_cancel()
+ {
+ foreach (SubDevice &sub, devices)
+ sub.device->task_cancel();
+ }
+
+ protected:
+ Stats sub_stats_;
};
-Device *device_multi_create(DeviceInfo& info, Stats &stats, Profiler& profiler, bool background)
+Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
{
- return new MultiDevice(info, stats, profiler, background);
+ return new MultiDevice(info, stats, profiler, background);
}
CCL_NAMESPACE_END
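
The MultiDevice code above never hands real device pointers to callers: mem_alloc() allocates once on every sub-device, stores each real pointer in that SubDevice's ptr_map, and returns a fresh opaque key (unique_key++) as the shared handle, which mem_copy_to(), mem_free() and friends translate back per sub-device. Below is a minimal, self-contained sketch of just that scheme; SubDev, MultiDev and DevKey are illustrative names (not Cycles types), and plain malloc stands in for a device backend.

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>
    #include <map>
    #include <vector>

    typedef uint64_t DevKey; /* stands in for Cycles' device_ptr */

    struct SubDev {
      std::map<DevKey, void *> ptr_map; /* opaque key -> real allocation */
    };

    struct MultiDev {
      std::vector<SubDev> subs;
      DevKey unique_key = 1; /* key 0 means "not allocated" */

      /* Allocate one copy per sub-device and return a single shared key. */
      DevKey mem_alloc(size_t size)
      {
        DevKey key = unique_key++;
        for (SubDev &sub : subs)
          sub.ptr_map[key] = std::malloc(size);
        return key;
      }

      /* Free every per-device copy and drop the key from each map. */
      void mem_free(DevKey key)
      {
        for (SubDev &sub : subs) {
          std::free(sub.ptr_map[key]);
          sub.ptr_map.erase(key);
        }
      }
    };

    int main()
    {
      MultiDev dev;
      dev.subs.resize(2);
      DevKey key = dev.mem_alloc(1024);
      /* Each sub-device holds its own copy behind the one shared key. */
      assert(dev.subs[0].ptr_map[key] != dev.subs[1].ptr_map[key]);
      dev.mem_free(key);
      return 0;
    }

The indirection lets the multi-device present one uniform device_ptr to callers while each sub-device keeps its own address space; it is also why map_tile() and task_add() above must translate every buffer through sub.ptr_map before delegating.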
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index 6736480e95a..80334ad8f22 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -33,767 +33,776 @@ typedef map<device_ptr, DataVector> DataMap;
typedef vector<RenderTile> TileList;
/* search a list of tiles and find the one that matches the passed render tile */
-static TileList::iterator tile_list_find(TileList& tile_list, RenderTile& tile)
+static TileList::iterator tile_list_find(TileList &tile_list, RenderTile &tile)
{
- for(TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it)
- if(tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample)
- return it;
- return tile_list.end();
+ for (TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it)
+ if (tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample)
+ return it;
+ return tile_list.end();
}
-class NetworkDevice : public Device
-{
-public:
- boost::asio::io_service io_service;
- tcp::socket socket;
- device_ptr mem_counter;
- DeviceTask the_task; /* todo: handle multiple tasks */
-
- thread_mutex rpc_lock;
-
- virtual bool show_samples() const
- {
- return false;
- }
-
- NetworkDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, const char *address)
- : Device(info, stats, profiler, true), socket(io_service)
- {
- error_func = NetworkError();
- stringstream portstr;
- portstr << SERVER_PORT;
-
- tcp::resolver resolver(io_service);
- tcp::resolver::query query(address, portstr.str());
- tcp::resolver::iterator endpoint_iterator = resolver.resolve(query);
- tcp::resolver::iterator end;
+class NetworkDevice : public Device {
+ public:
+ boost::asio::io_service io_service;
+ tcp::socket socket;
+ device_ptr mem_counter;
+ DeviceTask the_task; /* todo: handle multiple tasks */
+
+ thread_mutex rpc_lock;
+
+ virtual bool show_samples() const
+ {
+ return false;
+ }
+
+ NetworkDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, const char *address)
+ : Device(info, stats, profiler, true), socket(io_service)
+ {
+ error_func = NetworkError();
+ stringstream portstr;
+ portstr << SERVER_PORT;
+
+ tcp::resolver resolver(io_service);
+ tcp::resolver::query query(address, portstr.str());
+ tcp::resolver::iterator endpoint_iterator = resolver.resolve(query);
+ tcp::resolver::iterator end;
+
+ boost::system::error_code error = boost::asio::error::host_not_found;
+ while (error && endpoint_iterator != end) {
+ socket.close();
+ socket.connect(*endpoint_iterator++, error);
+ }
- boost::system::error_code error = boost::asio::error::host_not_found;
- while(error && endpoint_iterator != end)
- {
- socket.close();
- socket.connect(*endpoint_iterator++, error);
- }
-
- if(error)
- error_func.network_error(error.message());
+ if (error)
+ error_func.network_error(error.message());
- mem_counter = 0;
- }
+ mem_counter = 0;
+ }
- ~NetworkDevice()
- {
- RPCSend snd(socket, &error_func, "stop");
- snd.write();
- }
-
- virtual BVHLayoutMask get_bvh_layout_mask() const {
- return BVH_LAYOUT_BVH2;
- }
-
- void mem_alloc(device_memory& mem)
- {
- if(mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- thread_scoped_lock lock(rpc_lock);
+ ~NetworkDevice()
+ {
+ RPCSend snd(socket, &error_func, "stop");
+ snd.write();
+ }
- mem.device_pointer = ++mem_counter;
+ virtual BVHLayoutMask get_bvh_layout_mask() const
+ {
+ return BVH_LAYOUT_BVH2;
+ }
- RPCSend snd(socket, &error_func, "mem_alloc");
- snd.add(mem);
- snd.write();
- }
+ void mem_alloc(device_memory &mem)
+ {
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
- void mem_copy_to(device_memory& mem)
- {
- thread_scoped_lock lock(rpc_lock);
+ thread_scoped_lock lock(rpc_lock);
- RPCSend snd(socket, &error_func, "mem_copy_to");
+ mem.device_pointer = ++mem_counter;
- snd.add(mem);
- snd.write();
- snd.write_buffer(mem.host_pointer, mem.memory_size());
- }
+ RPCSend snd(socket, &error_func, "mem_alloc");
+ snd.add(mem);
+ snd.write();
+ }
- void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
- {
- thread_scoped_lock lock(rpc_lock);
+ void mem_copy_to(device_memory &mem)
+ {
+ thread_scoped_lock lock(rpc_lock);
- size_t data_size = mem.memory_size();
+ RPCSend snd(socket, &error_func, "mem_copy_to");
- RPCSend snd(socket, &error_func, "mem_copy_from");
+ snd.add(mem);
+ snd.write();
+ snd.write_buffer(mem.host_pointer, mem.memory_size());
+ }
- snd.add(mem);
- snd.add(y);
- snd.add(w);
- snd.add(h);
- snd.add(elem);
- snd.write();
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+ {
+ thread_scoped_lock lock(rpc_lock);
- RPCReceive rcv(socket, &error_func);
- rcv.read_buffer(mem.host_pointer, data_size);
- }
+ size_t data_size = mem.memory_size();
- void mem_zero(device_memory& mem)
- {
- thread_scoped_lock lock(rpc_lock);
+ RPCSend snd(socket, &error_func, "mem_copy_from");
- RPCSend snd(socket, &error_func, "mem_zero");
+ snd.add(mem);
+ snd.add(y);
+ snd.add(w);
+ snd.add(h);
+ snd.add(elem);
+ snd.write();
- snd.add(mem);
- snd.write();
- }
+ RPCReceive rcv(socket, &error_func);
+ rcv.read_buffer(mem.host_pointer, data_size);
+ }
- void mem_free(device_memory& mem)
- {
- if(mem.device_pointer) {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "mem_free");
-
- snd.add(mem);
- snd.write();
-
- mem.device_pointer = 0;
- }
- }
+ void mem_zero(device_memory &mem)
+ {
+ thread_scoped_lock lock(rpc_lock);
- void const_copy_to(const char *name, void *host, size_t size)
- {
- thread_scoped_lock lock(rpc_lock);
+ RPCSend snd(socket, &error_func, "mem_zero");
- RPCSend snd(socket, &error_func, "const_copy_to");
+ snd.add(mem);
+ snd.write();
+ }
- string name_string(name);
-
- snd.add(name_string);
- snd.add(size);
- snd.write();
- snd.write_buffer(host, size);
- }
-
- bool load_kernels(const DeviceRequestedFeatures& requested_features)
- {
- if(error_func.have_error())
- return false;
-
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "load_kernels");
- snd.add(requested_features.experimental);
- snd.add(requested_features.max_closure);
- snd.add(requested_features.max_nodes_group);
- snd.add(requested_features.nodes_features);
- snd.write();
-
- bool result;
- RPCReceive rcv(socket, &error_func);
- rcv.read(result);
-
- return result;
- }
-
- void task_add(DeviceTask& task)
- {
- thread_scoped_lock lock(rpc_lock);
-
- the_task = task;
-
- RPCSend snd(socket, &error_func, "task_add");
- snd.add(task);
- snd.write();
- }
-
- void task_wait()
- {
- thread_scoped_lock lock(rpc_lock);
-
- RPCSend snd(socket, &error_func, "task_wait");
- snd.write();
-
- lock.unlock();
-
- TileList the_tiles;
-
- /* todo: run this threaded for connecting to multiple clients */
- for(;;) {
- if(error_func.have_error())
- break;
-
- RenderTile tile;
-
- lock.lock();
- RPCReceive rcv(socket, &error_func);
-
- if(rcv.name == "acquire_tile") {
- lock.unlock();
-
- /* todo: watch out for recursive calls! */
- if(the_task.acquire_tile(this, tile)) { /* write return as bool */
- the_tiles.push_back(tile);
-
- lock.lock();
- RPCSend snd(socket, &error_func, "acquire_tile");
- snd.add(tile);
- snd.write();
- lock.unlock();
- }
- else {
- lock.lock();
- RPCSend snd(socket, &error_func, "acquire_tile_none");
- snd.write();
- lock.unlock();
- }
- }
- else if(rcv.name == "release_tile") {
- rcv.read(tile);
- lock.unlock();
-
- TileList::iterator it = tile_list_find(the_tiles, tile);
- if(it != the_tiles.end()) {
- tile.buffers = it->buffers;
- the_tiles.erase(it);
- }
-
- assert(tile.buffers != NULL);
-
- the_task.release_tile(tile);
-
- lock.lock();
- RPCSend snd(socket, &error_func, "release_tile");
- snd.write();
- lock.unlock();
- }
- else if(rcv.name == "task_wait_done") {
- lock.unlock();
- break;
- }
- else
- lock.unlock();
- }
- }
-
- void task_cancel()
- {
- thread_scoped_lock lock(rpc_lock);
- RPCSend snd(socket, &error_func, "task_cancel");
- snd.write();
- }
-
- int get_split_task_count(DeviceTask&)
- {
- return 1;
- }
-
-private:
- NetworkError error_func;
+ void mem_free(device_memory &mem)
+ {
+ if (mem.device_pointer) {
+ thread_scoped_lock lock(rpc_lock);
+
+ RPCSend snd(socket, &error_func, "mem_free");
+
+ snd.add(mem);
+ snd.write();
+
+ mem.device_pointer = 0;
+ }
+ }
+
+ void const_copy_to(const char *name, void *host, size_t size)
+ {
+ thread_scoped_lock lock(rpc_lock);
+
+ RPCSend snd(socket, &error_func, "const_copy_to");
+
+ string name_string(name);
+
+ snd.add(name_string);
+ snd.add(size);
+ snd.write();
+ snd.write_buffer(host, size);
+ }
+
+ bool load_kernels(const DeviceRequestedFeatures &requested_features)
+ {
+ if (error_func.have_error())
+ return false;
+
+ thread_scoped_lock lock(rpc_lock);
+
+ RPCSend snd(socket, &error_func, "load_kernels");
+ snd.add(requested_features.experimental);
+ snd.add(requested_features.max_closure);
+ snd.add(requested_features.max_nodes_group);
+ snd.add(requested_features.nodes_features);
+ snd.write();
+
+ bool result;
+ RPCReceive rcv(socket, &error_func);
+ rcv.read(result);
+
+ return result;
+ }
+
+ void task_add(DeviceTask &task)
+ {
+ thread_scoped_lock lock(rpc_lock);
+
+ the_task = task;
+
+ RPCSend snd(socket, &error_func, "task_add");
+ snd.add(task);
+ snd.write();
+ }
+
+ void task_wait()
+ {
+ thread_scoped_lock lock(rpc_lock);
+
+ RPCSend snd(socket, &error_func, "task_wait");
+ snd.write();
+
+ lock.unlock();
+
+ TileList the_tiles;
+
+ /* todo: run this threaded for connecting to multiple clients */
+ for (;;) {
+ if (error_func.have_error())
+ break;
+
+ RenderTile tile;
+
+ lock.lock();
+ RPCReceive rcv(socket, &error_func);
+
+ if (rcv.name == "acquire_tile") {
+ lock.unlock();
+
+ /* todo: watch out for recursive calls! */
+ if (the_task.acquire_tile(this, tile)) { /* write return as bool */
+ the_tiles.push_back(tile);
+
+ lock.lock();
+ RPCSend snd(socket, &error_func, "acquire_tile");
+ snd.add(tile);
+ snd.write();
+ lock.unlock();
+ }
+ else {
+ lock.lock();
+ RPCSend snd(socket, &error_func, "acquire_tile_none");
+ snd.write();
+ lock.unlock();
+ }
+ }
+ else if (rcv.name == "release_tile") {
+ rcv.read(tile);
+ lock.unlock();
+
+ TileList::iterator it = tile_list_find(the_tiles, tile);
+ if (it != the_tiles.end()) {
+ tile.buffers = it->buffers;
+ the_tiles.erase(it);
+ }
+
+ assert(tile.buffers != NULL);
+
+ the_task.release_tile(tile);
+
+ lock.lock();
+ RPCSend snd(socket, &error_func, "release_tile");
+ snd.write();
+ lock.unlock();
+ }
+ else if (rcv.name == "task_wait_done") {
+ lock.unlock();
+ break;
+ }
+ else
+ lock.unlock();
+ }
+ }
+
+ void task_cancel()
+ {
+ thread_scoped_lock lock(rpc_lock);
+ RPCSend snd(socket, &error_func, "task_cancel");
+ snd.write();
+ }
+
+ int get_split_task_count(DeviceTask &)
+ {
+ return 1;
+ }
+
+ private:
+ NetworkError error_func;
};
-Device *device_network_create(DeviceInfo& info, Stats &stats, Profiler &profiler, const char *address)
+Device *device_network_create(DeviceInfo &info,
+ Stats &stats,
+ Profiler &profiler,
+ const char *address)
{
- return new NetworkDevice(info, stats, profiler, address);
+ return new NetworkDevice(info, stats, profiler, address);
}
-void device_network_info(vector<DeviceInfo>& devices)
+void device_network_info(vector<DeviceInfo> &devices)
{
- DeviceInfo info;
+ DeviceInfo info;
- info.type = DEVICE_NETWORK;
- info.description = "Network Device";
- info.id = "NETWORK";
- info.num = 0;
+ info.type = DEVICE_NETWORK;
+ info.description = "Network Device";
+ info.id = "NETWORK";
+ info.num = 0;
- /* todo: get this info from device */
- info.has_volume_decoupled = false;
- info.has_osl = false;
+ /* todo: get this info from device */
+ info.has_volume_decoupled = false;
+ info.has_osl = false;
- devices.push_back(info);
+ devices.push_back(info);
}
class DeviceServer {
-public:
- thread_mutex rpc_lock;
-
- void network_error(const string &message) {
- error_func.network_error(message);
- }
-
- bool have_error() { return error_func.have_error(); }
-
- DeviceServer(Device *device_, tcp::socket& socket_)
- : device(device_), socket(socket_), stop(false), blocked_waiting(false)
- {
- error_func = NetworkError();
- }
-
- void listen()
- {
- /* receive remote function calls */
- for(;;) {
- listen_step();
-
- if(stop)
- break;
- }
- }
-
-protected:
- void listen_step()
- {
- thread_scoped_lock lock(rpc_lock);
- RPCReceive rcv(socket, &error_func);
-
- if(rcv.name == "stop")
- stop = true;
- else
- process(rcv, lock);
- }
-
- /* create a memory buffer for a device buffer and insert it into mem_data */
- DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size)
- {
- /* create a new DataVector and insert it into mem_data */
- pair<DataMap::iterator,bool> data_ins = mem_data.insert(
- DataMap::value_type(client_pointer, DataVector()));
-
- /* make sure it was a unique insertion */
- assert(data_ins.second);
-
- /* get a reference to the inserted vector */
- DataVector &data_v = data_ins.first->second;
-
- /* size the vector */
- data_v.resize(data_size);
-
- return data_v;
- }
-
- DataVector &data_vector_find(device_ptr client_pointer)
- {
- DataMap::iterator i = mem_data.find(client_pointer);
- assert(i != mem_data.end());
- return i->second;
- }
-
- /* setup mapping and reverse mapping of client_pointer<->real_pointer */
- void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer)
- {
- pair<PtrMap::iterator,bool> mapins;
-
- /* insert mapping from client pointer to our real device pointer */
- mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer));
- assert(mapins.second);
-
- /* insert reverse mapping from real our device pointer to client pointer */
- mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer));
- assert(mapins.second);
- }
-
- device_ptr device_ptr_from_client_pointer(device_ptr client_pointer)
- {
- PtrMap::iterator i = ptr_map.find(client_pointer);
- assert(i != ptr_map.end());
- return i->second;
- }
-
- device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer)
- {
- PtrMap::iterator i = ptr_map.find(client_pointer);
- assert(i != ptr_map.end());
-
- device_ptr result = i->second;
-
- /* erase the mapping */
- ptr_map.erase(i);
-
- /* erase the reverse mapping */
- PtrMap::iterator irev = ptr_imap.find(result);
- assert(irev != ptr_imap.end());
- ptr_imap.erase(irev);
-
- /* erase the data vector */
- DataMap::iterator idata = mem_data.find(client_pointer);
- assert(idata != mem_data.end());
- mem_data.erase(idata);
-
- return result;
- }
-
- /* note that the lock must be already acquired upon entry.
- * This is necessary because the caller often peeks at
- * the header and delegates control to here when it doesn't
- * specifically handle the current RPC.
- * The lock must be unlocked before returning */
- void process(RPCReceive& rcv, thread_scoped_lock &lock)
- {
- if(rcv.name == "mem_alloc") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- /* Allocate host side data buffer. */
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size)? (void*)&(data_v[0]): 0;
-
- /* Perform the allocation on the actual device. */
- device->mem_alloc(mem);
-
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- else if(rcv.name == "mem_copy_to") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- if(client_pointer) {
- /* Lookup existing host side data buffer. */
- DataVector &data_v = data_vector_find(client_pointer);
- mem.host_pointer = (void*)&data_v[0];
-
- /* Translate the client pointer to a real device pointer. */
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
- }
- else {
- /* Allocate host side data buffer. */
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (data_size)? (void*)&(data_v[0]): 0;
- }
-
- /* Copy data from network into memory buffer. */
- rcv.read_buffer((uint8_t*)mem.host_pointer, data_size);
-
- /* Copy the data from the memory buffer to the device buffer. */
- device->mem_copy_to(mem);
-
- if(!client_pointer) {
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- }
- else if(rcv.name == "mem_copy_from") {
- string name;
- network_device_memory mem(device);
- int y, w, h, elem;
-
- rcv.read(mem, name);
- rcv.read(y);
- rcv.read(w);
- rcv.read(h);
- rcv.read(elem);
-
- device_ptr client_pointer = mem.device_pointer;
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
-
- DataVector &data_v = data_vector_find(client_pointer);
-
- mem.host_pointer = (device_ptr)&(data_v[0]);
-
- device->mem_copy_from(mem, y, w, h, elem);
-
- size_t data_size = mem.memory_size();
-
- RPCSend snd(socket, &error_func, "mem_copy_from");
- snd.write();
- snd.write_buffer((uint8_t*)mem.host_pointer, data_size);
- lock.unlock();
- }
- else if(rcv.name == "mem_zero") {
- string name;
- network_device_memory mem(device);
- rcv.read(mem, name);
- lock.unlock();
-
- size_t data_size = mem.memory_size();
- device_ptr client_pointer = mem.device_pointer;
-
- if(client_pointer) {
- /* Lookup existing host side data buffer. */
- DataVector &data_v = data_vector_find(client_pointer);
- mem.host_pointer = (void*)&data_v[0];
-
- /* Translate the client pointer to a real device pointer. */
- mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
- }
- else {
- /* Allocate host side data buffer. */
- DataVector &data_v = data_vector_insert(client_pointer, data_size);
- mem.host_pointer = (void*)? (device_ptr)&(data_v[0]): 0;
- }
-
- /* Zero memory. */
- device->mem_zero(mem);
-
- if(!client_pointer) {
- /* Store a mapping to/from client_pointer and real device pointer. */
- pointer_mapping_insert(client_pointer, mem.device_pointer);
- }
- }
- else if(rcv.name == "mem_free") {
- string name;
- network_device_memory mem(device);
-
- rcv.read(mem, name);
- lock.unlock();
-
- device_ptr client_pointer = mem.device_pointer;
-
- mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer);
-
- device->mem_free(mem);
- }
- else if(rcv.name == "const_copy_to") {
- string name_string;
- size_t size;
-
- rcv.read(name_string);
- rcv.read(size);
-
- vector<char> host_vector(size);
- rcv.read_buffer(&host_vector[0], size);
- lock.unlock();
-
- device->const_copy_to(name_string.c_str(), &host_vector[0], size);
- }
- else if(rcv.name == "load_kernels") {
- DeviceRequestedFeatures requested_features;
- rcv.read(requested_features.experimental);
- rcv.read(requested_features.max_closure);
- rcv.read(requested_features.max_nodes_group);
- rcv.read(requested_features.nodes_features);
-
- bool result;
- result = device->load_kernels(requested_features);
- RPCSend snd(socket, &error_func, "load_kernels");
- snd.add(result);
- snd.write();
- lock.unlock();
- }
- else if(rcv.name == "task_add") {
- DeviceTask task;
-
- rcv.read(task);
- lock.unlock();
-
- if(task.buffer)
- task.buffer = device_ptr_from_client_pointer(task.buffer);
-
- if(task.rgba_half)
- task.rgba_half = device_ptr_from_client_pointer(task.rgba_half);
-
- if(task.rgba_byte)
- task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte);
-
- if(task.shader_input)
- task.shader_input = device_ptr_from_client_pointer(task.shader_input);
-
- if(task.shader_output)
- task.shader_output = device_ptr_from_client_pointer(task.shader_output);
-
- task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
- task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
- task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample, this);
- task.update_tile_sample = function_bind(&DeviceServer::task_update_tile_sample, this, _1);
- task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this);
-
- device->task_add(task);
- }
- else if(rcv.name == "task_wait") {
- lock.unlock();
-
- blocked_waiting = true;
- device->task_wait();
- blocked_waiting = false;
-
- lock.lock();
- RPCSend snd(socket, &error_func, "task_wait_done");
- snd.write();
- lock.unlock();
- }
- else if(rcv.name == "task_cancel") {
- lock.unlock();
- device->task_cancel();
- }
- else if(rcv.name == "acquire_tile") {
- AcquireEntry entry;
- entry.name = rcv.name;
- rcv.read(entry.tile);
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else if(rcv.name == "acquire_tile_none") {
- AcquireEntry entry;
- entry.name = rcv.name;
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else if(rcv.name == "release_tile") {
- AcquireEntry entry;
- entry.name = rcv.name;
- acquire_queue.push_back(entry);
- lock.unlock();
- }
- else {
- cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n";
- lock.unlock();
- }
- }
-
- bool task_acquire_tile(Device *, RenderTile& tile)
- {
- thread_scoped_lock acquire_lock(acquire_mutex);
-
- bool result = false;
-
- RPCSend snd(socket, &error_func, "acquire_tile");
- snd.write();
-
- do {
- if(blocked_waiting)
- listen_step();
-
- /* todo: avoid busy wait loop */
- thread_scoped_lock lock(rpc_lock);
-
- if(!acquire_queue.empty()) {
- AcquireEntry entry = acquire_queue.front();
- acquire_queue.pop_front();
-
- if(entry.name == "acquire_tile") {
- tile = entry.tile;
-
- if(tile.buffer) tile.buffer = ptr_map[tile.buffer];
-
- result = true;
- break;
- }
- else if(entry.name == "acquire_tile_none") {
- break;
- }
- else {
- cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n";
- }
- }
- } while(acquire_queue.empty() && !stop && !have_error());
-
- return result;
- }
-
- void task_update_progress_sample()
- {
- ; /* skip */
- }
-
- void task_update_tile_sample(RenderTile&)
- {
- ; /* skip */
- }
-
- void task_release_tile(RenderTile& tile)
- {
- thread_scoped_lock acquire_lock(acquire_mutex);
-
- if(tile.buffer) tile.buffer = ptr_imap[tile.buffer];
-
- {
- thread_scoped_lock lock(rpc_lock);
- RPCSend snd(socket, &error_func, "release_tile");
- snd.add(tile);
- snd.write();
- lock.unlock();
- }
-
- do {
- if(blocked_waiting)
- listen_step();
-
- /* todo: avoid busy wait loop */
- thread_scoped_lock lock(rpc_lock);
-
- if(!acquire_queue.empty()) {
- AcquireEntry entry = acquire_queue.front();
- acquire_queue.pop_front();
-
- if(entry.name == "release_tile") {
- lock.unlock();
- break;
- }
- else {
- cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n";
- }
- }
- } while(acquire_queue.empty() && !stop);
- }
-
- bool task_get_cancel()
- {
- return false;
- }
-
- /* properties */
- Device *device;
- tcp::socket& socket;
-
- /* mapping of remote to local pointer */
- PtrMap ptr_map;
- PtrMap ptr_imap;
- DataMap mem_data;
-
- struct AcquireEntry {
- string name;
- RenderTile tile;
- };
-
- thread_mutex acquire_mutex;
- list<AcquireEntry> acquire_queue;
-
- bool stop;
- bool blocked_waiting;
-private:
- NetworkError error_func;
-
- /* todo: free memory and device (osl) on network error */
-
+ public:
+ thread_mutex rpc_lock;
+
+ void network_error(const string &message)
+ {
+ error_func.network_error(message);
+ }
+
+ bool have_error()
+ {
+ return error_func.have_error();
+ }
+
+ DeviceServer(Device *device_, tcp::socket &socket_)
+ : device(device_), socket(socket_), stop(false), blocked_waiting(false)
+ {
+ error_func = NetworkError();
+ }
+
+ void listen()
+ {
+ /* receive remote function calls */
+ for (;;) {
+ listen_step();
+
+ if (stop)
+ break;
+ }
+ }
+
+ protected:
+ void listen_step()
+ {
+ thread_scoped_lock lock(rpc_lock);
+ RPCReceive rcv(socket, &error_func);
+
+ if (rcv.name == "stop")
+ stop = true;
+ else
+ process(rcv, lock);
+ }
+
+ /* create a memory buffer for a device buffer and insert it into mem_data */
+ DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size)
+ {
+ /* create a new DataVector and insert it into mem_data */
+ pair<DataMap::iterator, bool> data_ins = mem_data.insert(
+ DataMap::value_type(client_pointer, DataVector()));
+
+ /* make sure it was a unique insertion */
+ assert(data_ins.second);
+
+ /* get a reference to the inserted vector */
+ DataVector &data_v = data_ins.first->second;
+
+ /* size the vector */
+ data_v.resize(data_size);
+
+ return data_v;
+ }
+
+ DataVector &data_vector_find(device_ptr client_pointer)
+ {
+ DataMap::iterator i = mem_data.find(client_pointer);
+ assert(i != mem_data.end());
+ return i->second;
+ }
+
+ /* setup mapping and reverse mapping of client_pointer<->real_pointer */
+ void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer)
+ {
+ pair<PtrMap::iterator, bool> mapins;
+
+ /* insert mapping from client pointer to our real device pointer */
+ mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer));
+ assert(mapins.second);
+
+ /* insert reverse mapping from our real device pointer to client pointer */
+ mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer));
+ assert(mapins.second);
+ }
+
+ device_ptr device_ptr_from_client_pointer(device_ptr client_pointer)
+ {
+ PtrMap::iterator i = ptr_map.find(client_pointer);
+ assert(i != ptr_map.end());
+ return i->second;
+ }
+
+ device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer)
+ {
+ PtrMap::iterator i = ptr_map.find(client_pointer);
+ assert(i != ptr_map.end());
+
+ device_ptr result = i->second;
+
+ /* erase the mapping */
+ ptr_map.erase(i);
+
+ /* erase the reverse mapping */
+ PtrMap::iterator irev = ptr_imap.find(result);
+ assert(irev != ptr_imap.end());
+ ptr_imap.erase(irev);
+
+ /* erase the data vector */
+ DataMap::iterator idata = mem_data.find(client_pointer);
+ assert(idata != mem_data.end());
+ mem_data.erase(idata);
+
+ return result;
+ }
+
+ /* Note that the lock must already be acquired upon entry.
+ * This is necessary because the caller often peeks at
+ * the header and delegates control here when it doesn't
+ * specifically handle the current RPC.
+ * The lock must be unlocked before returning. */
+ void process(RPCReceive &rcv, thread_scoped_lock &lock)
+ {
+ if (rcv.name == "mem_alloc") {
+ string name;
+ network_device_memory mem(device);
+ rcv.read(mem, name);
+ lock.unlock();
+
+ /* Allocate host side data buffer. */
+ size_t data_size = mem.memory_size();
+ device_ptr client_pointer = mem.device_pointer;
+
+ DataVector &data_v = data_vector_insert(client_pointer, data_size);
+ mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
+
+ /* Perform the allocation on the actual device. */
+ device->mem_alloc(mem);
+
+ /* Store a mapping to/from client_pointer and real device pointer. */
+ pointer_mapping_insert(client_pointer, mem.device_pointer);
+ }
+ else if (rcv.name == "mem_copy_to") {
+ string name;
+ network_device_memory mem(device);
+ rcv.read(mem, name);
+ lock.unlock();
+
+ size_t data_size = mem.memory_size();
+ device_ptr client_pointer = mem.device_pointer;
+
+ if (client_pointer) {
+ /* Lookup existing host side data buffer. */
+ DataVector &data_v = data_vector_find(client_pointer);
+ mem.host_pointer = (void *)&data_v[0];
+
+ /* Translate the client pointer to a real device pointer. */
+ mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
+ }
+ else {
+ /* Allocate host side data buffer. */
+ DataVector &data_v = data_vector_insert(client_pointer, data_size);
+ mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
+ }
+
+ /* Copy data from network into memory buffer. */
+ rcv.read_buffer((uint8_t *)mem.host_pointer, data_size);
+
+ /* Copy the data from the memory buffer to the device buffer. */
+ device->mem_copy_to(mem);
+
+ if (!client_pointer) {
+ /* Store a mapping to/from client_pointer and real device pointer. */
+ pointer_mapping_insert(client_pointer, mem.device_pointer);
+ }
+ }
+ else if (rcv.name == "mem_copy_from") {
+ string name;
+ network_device_memory mem(device);
+ int y, w, h, elem;
+
+ rcv.read(mem, name);
+ rcv.read(y);
+ rcv.read(w);
+ rcv.read(h);
+ rcv.read(elem);
+
+ device_ptr client_pointer = mem.device_pointer;
+ mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
+
+ DataVector &data_v = data_vector_find(client_pointer);
+
+ mem.host_pointer = (void *)&(data_v[0]);
+
+ device->mem_copy_from(mem, y, w, h, elem);
+
+ size_t data_size = mem.memory_size();
+
+ RPCSend snd(socket, &error_func, "mem_copy_from");
+ snd.write();
+ snd.write_buffer((uint8_t *)mem.host_pointer, data_size);
+ lock.unlock();
+ }
+ else if (rcv.name == "mem_zero") {
+ string name;
+ network_device_memory mem(device);
+ rcv.read(mem, name);
+ lock.unlock();
+
+ size_t data_size = mem.memory_size();
+ device_ptr client_pointer = mem.device_pointer;
+
+ if (client_pointer) {
+ /* Lookup existing host side data buffer. */
+ DataVector &data_v = data_vector_find(client_pointer);
+ mem.host_pointer = (void *)&data_v[0];
+
+ /* Translate the client pointer to a real device pointer. */
+ mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
+ }
+ else {
+ /* Allocate host side data buffer. */
+ DataVector &data_v = data_vector_insert(client_pointer, data_size);
+ mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
+ }
+
+ /* Zero memory. */
+ device->mem_zero(mem);
+
+ if (!client_pointer) {
+ /* Store a mapping to/from client_pointer and real device pointer. */
+ pointer_mapping_insert(client_pointer, mem.device_pointer);
+ }
+ }
+ else if (rcv.name == "mem_free") {
+ string name;
+ network_device_memory mem(device);
+
+ rcv.read(mem, name);
+ lock.unlock();
+
+ device_ptr client_pointer = mem.device_pointer;
+
+ mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer);
+
+ device->mem_free(mem);
+ }
+ else if (rcv.name == "const_copy_to") {
+ string name_string;
+ size_t size;
+
+ rcv.read(name_string);
+ rcv.read(size);
+
+ vector<char> host_vector(size);
+ rcv.read_buffer(&host_vector[0], size);
+ lock.unlock();
+
+ device->const_copy_to(name_string.c_str(), &host_vector[0], size);
+ }
+ else if (rcv.name == "load_kernels") {
+ DeviceRequestedFeatures requested_features;
+ rcv.read(requested_features.experimental);
+ rcv.read(requested_features.max_closure);
+ rcv.read(requested_features.max_nodes_group);
+ rcv.read(requested_features.nodes_features);
+
+ bool result;
+ result = device->load_kernels(requested_features);
+ RPCSend snd(socket, &error_func, "load_kernels");
+ snd.add(result);
+ snd.write();
+ lock.unlock();
+ }
+ else if (rcv.name == "task_add") {
+ DeviceTask task;
+
+ rcv.read(task);
+ lock.unlock();
+
+ if (task.buffer)
+ task.buffer = device_ptr_from_client_pointer(task.buffer);
+
+ if (task.rgba_half)
+ task.rgba_half = device_ptr_from_client_pointer(task.rgba_half);
+
+ if (task.rgba_byte)
+ task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte);
+
+ if (task.shader_input)
+ task.shader_input = device_ptr_from_client_pointer(task.shader_input);
+
+ if (task.shader_output)
+ task.shader_output = device_ptr_from_client_pointer(task.shader_output);
+
+ task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
+ task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
+ task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample,
+ this);
+ task.update_tile_sample = function_bind(&DeviceServer::task_update_tile_sample, this, _1);
+ task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this);
+
+ device->task_add(task);
+ }
+ else if (rcv.name == "task_wait") {
+ lock.unlock();
+
+ blocked_waiting = true;
+ device->task_wait();
+ blocked_waiting = false;
+
+ lock.lock();
+ RPCSend snd(socket, &error_func, "task_wait_done");
+ snd.write();
+ lock.unlock();
+ }
+ else if (rcv.name == "task_cancel") {
+ lock.unlock();
+ device->task_cancel();
+ }
+ else if (rcv.name == "acquire_tile") {
+ AcquireEntry entry;
+ entry.name = rcv.name;
+ rcv.read(entry.tile);
+ acquire_queue.push_back(entry);
+ lock.unlock();
+ }
+ else if (rcv.name == "acquire_tile_none") {
+ AcquireEntry entry;
+ entry.name = rcv.name;
+ acquire_queue.push_back(entry);
+ lock.unlock();
+ }
+ else if (rcv.name == "release_tile") {
+ AcquireEntry entry;
+ entry.name = rcv.name;
+ acquire_queue.push_back(entry);
+ lock.unlock();
+ }
+ else {
+ cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n";
+ lock.unlock();
+ }
+ }
+
+ bool task_acquire_tile(Device *, RenderTile &tile)
+ {
+ thread_scoped_lock acquire_lock(acquire_mutex);
+
+ bool result = false;
+
+ RPCSend snd(socket, &error_func, "acquire_tile");
+ snd.write();
+
+ do {
+ if (blocked_waiting)
+ listen_step();
+
+ /* todo: avoid busy wait loop */
+ thread_scoped_lock lock(rpc_lock);
+
+ if (!acquire_queue.empty()) {
+ AcquireEntry entry = acquire_queue.front();
+ acquire_queue.pop_front();
+
+ if (entry.name == "acquire_tile") {
+ tile = entry.tile;
+
+ if (tile.buffer)
+ tile.buffer = ptr_map[tile.buffer];
+
+ result = true;
+ break;
+ }
+ else if (entry.name == "acquire_tile_none") {
+ break;
+ }
+ else {
+ cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n";
+ }
+ }
+ } while (acquire_queue.empty() && !stop && !have_error());
+
+ return result;
+ }
+
+ void task_update_progress_sample()
+ {
+ ; /* skip */
+ }
+
+ void task_update_tile_sample(RenderTile &)
+ {
+ ; /* skip */
+ }
+
+ void task_release_tile(RenderTile &tile)
+ {
+ thread_scoped_lock acquire_lock(acquire_mutex);
+
+ if (tile.buffer)
+ tile.buffer = ptr_imap[tile.buffer];
+
+ {
+ thread_scoped_lock lock(rpc_lock);
+ RPCSend snd(socket, &error_func, "release_tile");
+ snd.add(tile);
+ snd.write();
+ lock.unlock();
+ }
+
+ do {
+ if (blocked_waiting)
+ listen_step();
+
+ /* todo: avoid busy wait loop */
+ thread_scoped_lock lock(rpc_lock);
+
+ if (!acquire_queue.empty()) {
+ AcquireEntry entry = acquire_queue.front();
+ acquire_queue.pop_front();
+
+ if (entry.name == "release_tile") {
+ lock.unlock();
+ break;
+ }
+ else {
+ cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n";
+ }
+ }
+ } while (acquire_queue.empty() && !stop);
+ }
+
+ bool task_get_cancel()
+ {
+ return false;
+ }
+
+ /* properties */
+ Device *device;
+ tcp::socket &socket;
+
+ /* mapping of remote to local pointer */
+ PtrMap ptr_map;
+ PtrMap ptr_imap;
+ DataMap mem_data;
+
+ struct AcquireEntry {
+ string name;
+ RenderTile tile;
+ };
+
+ thread_mutex acquire_mutex;
+ list<AcquireEntry> acquire_queue;
+
+ bool stop;
+ bool blocked_waiting;
+
+ private:
+ NetworkError error_func;
+
+ /* todo: free memory and device (osl) on network error */
};
void Device::server_run()
{
- try {
- /* starts thread that responds to discovery requests */
- ServerDiscovery discovery;
-
- for(;;) {
- /* accept connection */
- boost::asio::io_service io_service;
- tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT));
-
- tcp::socket socket(io_service);
- acceptor.accept(socket);
-
- string remote_address = socket.remote_endpoint().address().to_string();
- printf("Connected to remote client at: %s\n", remote_address.c_str());
-
- DeviceServer server(this, socket);
- server.listen();
-
- printf("Disconnected.\n");
- }
- }
- catch(exception& e) {
- fprintf(stderr, "Network server exception: %s\n", e.what());
- }
+ try {
+ /* starts thread that responds to discovery requests */
+ ServerDiscovery discovery;
+
+ for (;;) {
+ /* accept connection */
+ boost::asio::io_service io_service;
+ tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT));
+
+ tcp::socket socket(io_service);
+ acceptor.accept(socket);
+
+ string remote_address = socket.remote_endpoint().address().to_string();
+ printf("Connected to remote client at: %s\n", remote_address.c_str());
+
+ DeviceServer server(this, socket);
+ server.listen();
+
+ printf("Disconnected.\n");
+ }
+ }
+ catch (exception &e) {
+ fprintf(stderr, "Network server exception: %s\n", e.what());
+ }
}
CCL_NAMESPACE_END
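
RPCSend::write() and the RPCReceive constructor above share a simple wire format: an 8-character, space-padded hexadecimal length header followed by the boost-serialized archive bytes. A minimal sketch of just that framing follows; frame() and unframe() are illustrative helpers (not Cycles functions), and sockets and boost serialization are left out.

    #include <cstddef>
    #include <iomanip>
    #include <iostream>
    #include <sstream>
    #include <string>

    /* Frame a payload the way RPCSend::write() does: 8 hex chars, then data. */
    static std::string frame(const std::string &payload)
    {
      std::ostringstream header;
      header << std::setw(8) << std::hex << payload.size();
      return header.str() + payload;
    }

    /* Decode a frame the way the RPCReceive constructor does, mirroring its
     * error cases (bad header size, undecodable size, size mismatch). */
    static bool unframe(const std::string &wire, std::string &payload)
    {
      if (wire.size() < 8)
        return false; /* "invalid header size" */

      std::istringstream header(wire.substr(0, 8));
      size_t data_size = 0;
      if (!(header >> std::hex >> data_size))
        return false; /* "can't decode data size from header" */

      if (wire.size() - 8 != data_size)
        return false; /* "data size doesn't match header" */

      payload = wire.substr(8);
      return true;
    }

    int main()
    {
      std::string payload;
      if (unframe(frame("mem_alloc"), payload))
        std::cout << "decoded: " << payload << "\n"; /* prints "decoded: mem_alloc" */
      return 0;
    }

In the real code the header and body go out as two separate boost::asio::write() calls, and write_buffer() sends raw bytes with no header at all, which is why both peers must agree on mem.memory_size() for buffer transfers.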
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
index 67626ae177f..5b69b815cc6 100644
--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -19,35 +19,35 @@
#ifdef WITH_NETWORK
-#include <boost/archive/text_iarchive.hpp>
-#include <boost/archive/text_oarchive.hpp>
-#include <boost/archive/binary_iarchive.hpp>
-#include <boost/archive/binary_oarchive.hpp>
-#include <boost/array.hpp>
-#include <boost/asio.hpp>
-#include <boost/bind.hpp>
-#include <boost/serialization/vector.hpp>
-#include <boost/thread.hpp>
-
-#include <iostream>
-#include <sstream>
-#include <deque>
-
-#include "render/buffers.h"
-
-#include "util/util_foreach.h"
-#include "util/util_list.h"
-#include "util/util_map.h"
-#include "util/util_param.h"
-#include "util/util_string.h"
+# include <boost/archive/text_iarchive.hpp>
+# include <boost/archive/text_oarchive.hpp>
+# include <boost/archive/binary_iarchive.hpp>
+# include <boost/archive/binary_oarchive.hpp>
+# include <boost/array.hpp>
+# include <boost/asio.hpp>
+# include <boost/bind.hpp>
+# include <boost/serialization/vector.hpp>
+# include <boost/thread.hpp>
+
+# include <iostream>
+# include <sstream>
+# include <deque>
+
+# include "render/buffers.h"
+
+# include "util/util_foreach.h"
+# include "util/util_list.h"
+# include "util/util_map.h"
+# include "util/util_param.h"
+# include "util/util_string.h"
CCL_NAMESPACE_BEGIN
-using std::cout;
using std::cerr;
+using std::cout;
+using std::exception;
using std::hex;
using std::setw;
-using std::exception;
using boost::asio::ip::tcp;
@@ -56,436 +56,435 @@ static const int DISCOVER_PORT = 5121;
static const string DISCOVER_REQUEST_MSG = "REQUEST_RENDER_SERVER_IP";
static const string DISCOVER_REPLY_MSG = "REPLY_RENDER_SERVER_IP";
-#if 0
+# if 0
typedef boost::archive::text_oarchive o_archive;
typedef boost::archive::text_iarchive i_archive;
-#else
+# else
typedef boost::archive::binary_oarchive o_archive;
typedef boost::archive::binary_iarchive i_archive;
-#endif
+# endif
/* Serialization of device memory */
-class network_device_memory : public device_memory
-{
-public:
- network_device_memory(Device *device)
- : device_memory(device, "", MEM_READ_ONLY)
- {
- }
+class network_device_memory : public device_memory {
+ public:
+ network_device_memory(Device *device) : device_memory(device, "", MEM_READ_ONLY)
+ {
+ }
- ~network_device_memory()
- {
- device_pointer = 0;
- };
+ ~network_device_memory()
+ {
+ device_pointer = 0;
+ }
- vector<char> local_data;
+ vector<char> local_data;
};
/* Common network error function / object for both DeviceNetwork and DeviceServer */
class NetworkError {
-public:
- NetworkError() {
- error = "";
- error_count = 0;
- }
-
- ~NetworkError() {}
-
- void network_error(const string& message) {
- error = message;
- error_count += 1;
- }
-
- bool have_error() {
- return true ? error_count > 0 : false;
- }
-
-private:
- string error;
- int error_count;
+ public:
+ NetworkError()
+ {
+ error = "";
+ error_count = 0;
+ }
+
+ ~NetworkError()
+ {
+ }
+
+ void network_error(const string &message)
+ {
+ error = message;
+ error_count += 1;
+ }
+
+ bool have_error()
+ {
+ return error_count > 0;
+ }
+
+ private:
+ string error;
+ int error_count;
};
-
/* Remote procedure call Send */
class RPCSend {
-public:
- RPCSend(tcp::socket& socket_, NetworkError* e, const string& name_ = "")
- : name(name_), socket(socket_), archive(archive_stream), sent(false)
- {
- archive & name_;
- error_func = e;
- fprintf(stderr, "rpc send %s\n", name.c_str());
- }
-
- ~RPCSend()
- {
- }
-
- void add(const device_memory& mem)
- {
- archive & mem.data_type & mem.data_elements & mem.data_size;
- archive & mem.data_width & mem.data_height & mem.data_depth & mem.device_pointer;
- archive & mem.type & string(mem.name);
- archive & mem.interpolation & mem.extension;
- archive & mem.device_pointer;
- }
-
- template<typename T> void add(const T& data)
- {
- archive & data;
- }
-
- void add(const DeviceTask& task)
- {
- int type = (int)task.type;
- archive & type & task.x & task.y & task.w & task.h;
- archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
- archive & task.offset & task.stride;
- archive & task.shader_input & task.shader_output & task.shader_eval_type;
- archive & task.shader_x & task.shader_w;
- archive & task.need_finish_queue;
- }
-
- void add(const RenderTile& tile)
- {
- archive & tile.x & tile.y & tile.w & tile.h;
- archive & tile.start_sample & tile.num_samples & tile.sample;
- archive & tile.resolution & tile.offset & tile.stride;
- archive & tile.buffer;
- }
-
- void write()
- {
- boost::system::error_code error;
-
- /* get string from stream */
- string archive_str = archive_stream.str();
-
- /* first send fixed size header with size of following data */
- ostringstream header_stream;
- header_stream << setw(8) << hex << archive_str.size();
- string header_str = header_stream.str();
-
- boost::asio::write(socket,
- boost::asio::buffer(header_str),
- boost::asio::transfer_all(), error);
-
- if(error.value())
- error_func->network_error(error.message());
-
- /* then send actual data */
- boost::asio::write(socket,
- boost::asio::buffer(archive_str),
- boost::asio::transfer_all(), error);
-
- if(error.value())
- error_func->network_error(error.message());
-
- sent = true;
- }
-
- void write_buffer(void *buffer, size_t size)
- {
- boost::system::error_code error;
-
- boost::asio::write(socket,
- boost::asio::buffer(buffer, size),
- boost::asio::transfer_all(), error);
-
- if(error.value())
- error_func->network_error(error.message());
- }
-
-protected:
- string name;
- tcp::socket& socket;
- ostringstream archive_stream;
- o_archive archive;
- bool sent;
- NetworkError *error_func;
+ public:
+ RPCSend(tcp::socket &socket_, NetworkError *e, const string &name_ = "")
+ : name(name_), socket(socket_), archive(archive_stream), sent(false)
+ {
+ archive &name_;
+ error_func = e;
+ fprintf(stderr, "rpc send %s\n", name.c_str());
+ }
+
+ ~RPCSend()
+ {
+ }
+
+ void add(const device_memory &mem)
+ {
+ archive &mem.data_type &mem.data_elements &mem.data_size;
+ archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer;
+ archive &mem.type &string(mem.name);
+ archive &mem.interpolation &mem.extension;
+ archive &mem.device_pointer;
+ }
+
+ template<typename T> void add(const T &data)
+ {
+ archive &data;
+ }
+
+ void add(const DeviceTask &task)
+ {
+ int type = (int)task.type;
+ archive &type &task.x &task.y &task.w &task.h;
+ archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples;
+ archive &task.offset &task.stride;
+ archive &task.shader_input &task.shader_output &task.shader_eval_type;
+ archive &task.shader_x &task.shader_w;
+ archive &task.need_finish_queue;
+ }
+
+ void add(const RenderTile &tile)
+ {
+ archive &tile.x &tile.y &tile.w &tile.h;
+ archive &tile.start_sample &tile.num_samples &tile.sample;
+ archive &tile.resolution &tile.offset &tile.stride;
+ archive &tile.buffer;
+ }
+
+ void write()
+ {
+ boost::system::error_code error;
+
+ /* get string from stream */
+ string archive_str = archive_stream.str();
+
+ /* first send fixed size header with size of following data */
+ ostringstream header_stream;
+ header_stream << setw(8) << hex << archive_str.size();
+ string header_str = header_stream.str();
+
+ boost::asio::write(
+ socket, boost::asio::buffer(header_str), boost::asio::transfer_all(), error);
+
+ if (error.value())
+ error_func->network_error(error.message());
+
+ /* then send actual data */
+ boost::asio::write(
+ socket, boost::asio::buffer(archive_str), boost::asio::transfer_all(), error);
+
+ if (error.value())
+ error_func->network_error(error.message());
+
+ sent = true;
+ }
+
+ void write_buffer(void *buffer, size_t size)
+ {
+ boost::system::error_code error;
+
+ boost::asio::write(
+ socket, boost::asio::buffer(buffer, size), boost::asio::transfer_all(), error);
+
+ if (error.value())
+ error_func->network_error(error.message());
+ }
+
+ protected:
+ string name;
+ tcp::socket &socket;
+ ostringstream archive_stream;
+ o_archive archive;
+ bool sent;
+ NetworkError *error_func;
};
/* Remote procedure call Receive */
class RPCReceive {
-public:
- RPCReceive(tcp::socket& socket_, NetworkError* e )
- : socket(socket_), archive_stream(NULL), archive(NULL)
- {
- error_func = e;
- /* read head with fixed size */
- vector<char> header(8);
- boost::system::error_code error;
- size_t len = boost::asio::read(socket, boost::asio::buffer(header), error);
-
- if(error.value()) {
- error_func->network_error(error.message());
- }
-
- /* verify if we got something */
- if(len == header.size()) {
- /* decode header */
- string header_str(&header[0], header.size());
- istringstream header_stream(header_str);
-
- size_t data_size;
-
- if((header_stream >> hex >> data_size)) {
-
- vector<char> data(data_size);
- size_t len = boost::asio::read(socket, boost::asio::buffer(data), error);
-
- if(error.value())
- error_func->network_error(error.message());
-
-
- if(len == data_size) {
- archive_str = (data.size())? string(&data[0], data.size()): string("");
-
- archive_stream = new istringstream(archive_str);
- archive = new i_archive(*archive_stream);
-
- *archive & name;
- fprintf(stderr, "rpc receive %s\n", name.c_str());
- }
- else {
- error_func->network_error("Network receive error: data size doesn't match header");
- }
- }
- else {
- error_func->network_error("Network receive error: can't decode data size from header");
- }
- }
- else {
- error_func->network_error("Network receive error: invalid header size");
- }
- }
-
- ~RPCReceive()
- {
- delete archive;
- delete archive_stream;
- }
-
- void read(network_device_memory& mem, string& name)
- {
- *archive & mem.data_type & mem.data_elements & mem.data_size;
- *archive & mem.data_width & mem.data_height & mem.data_depth & mem.device_pointer;
- *archive & mem.type & name;
- *archive & mem.interpolation & mem.extension;
- *archive & mem.device_pointer;
-
- mem.name = name.c_str();
- mem.host_pointer = 0;
-
- /* Can't transfer OpenGL texture over network. */
- if(mem.type == MEM_PIXELS) {
- mem.type = MEM_READ_WRITE;
- }
- }
-
- template<typename T> void read(T& data)
- {
- *archive & data;
- }
-
- void read_buffer(void *buffer, size_t size)
- {
- boost::system::error_code error;
- size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error);
-
- if(error.value()) {
- error_func->network_error(error.message());
- }
-
- if(len != size)
- cout << "Network receive error: buffer size doesn't match expected size\n";
- }
-
- void read(DeviceTask& task)
- {
- int type;
-
- *archive & type & task.x & task.y & task.w & task.h;
- *archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
- *archive & task.offset & task.stride;
- *archive & task.shader_input & task.shader_output & task.shader_eval_type;
- *archive & task.shader_x & task.shader_w;
- *archive & task.need_finish_queue;
-
- task.type = (DeviceTask::Type)type;
- }
-
- void read(RenderTile& tile)
- {
- *archive & tile.x & tile.y & tile.w & tile.h;
- *archive & tile.start_sample & tile.num_samples & tile.sample;
- *archive & tile.resolution & tile.offset & tile.stride;
- *archive & tile.buffer;
-
- tile.buffers = NULL;
- }
-
- string name;
-
-protected:
- tcp::socket& socket;
- string archive_str;
- istringstream *archive_stream;
- i_archive *archive;
- NetworkError *error_func;
+ public:
+ RPCReceive(tcp::socket &socket_, NetworkError *e)
+ : socket(socket_), archive_stream(NULL), archive(NULL)
+ {
+ error_func = e;
+ /* read header with fixed size */
+ vector<char> header(8);
+ boost::system::error_code error;
+ size_t len = boost::asio::read(socket, boost::asio::buffer(header), error);
+
+ if (error.value()) {
+ error_func->network_error(error.message());
+ }
+
+ /* verify if we got something */
+ if (len == header.size()) {
+ /* decode header */
+ string header_str(&header[0], header.size());
+ istringstream header_stream(header_str);
+
+ size_t data_size;
+
+ if ((header_stream >> hex >> data_size)) {
+
+ vector<char> data(data_size);
+ size_t len = boost::asio::read(socket, boost::asio::buffer(data), error);
+
+ if (error.value())
+ error_func->network_error(error.message());
+
+ if (len == data_size) {
+ archive_str = (data.size()) ? string(&data[0], data.size()) : string("");
+
+ archive_stream = new istringstream(archive_str);
+ archive = new i_archive(*archive_stream);
+
+ *archive &name;
+ fprintf(stderr, "rpc receive %s\n", name.c_str());
+ }
+ else {
+ error_func->network_error("Network receive error: data size doesn't match header");
+ }
+ }
+ else {
+ error_func->network_error("Network receive error: can't decode data size from header");
+ }
+ }
+ else {
+ error_func->network_error("Network receive error: invalid header size");
+ }
+ }
+
+ ~RPCReceive()
+ {
+ delete archive;
+ delete archive_stream;
+ }
+
+ void read(network_device_memory &mem, string &name)
+ {
+ *archive &mem.data_type &mem.data_elements &mem.data_size;
+ *archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer;
+ *archive &mem.type &name;
+ *archive &mem.interpolation &mem.extension;
+ *archive &mem.device_pointer;
+
+ mem.name = name.c_str();
+ mem.host_pointer = 0;
+
+ /* Can't transfer OpenGL texture over network. */
+ if (mem.type == MEM_PIXELS) {
+ mem.type = MEM_READ_WRITE;
+ }
+ }
+
+ template<typename T> void read(T &data)
+ {
+ *archive &data;
+ }
+
+ void read_buffer(void *buffer, size_t size)
+ {
+ boost::system::error_code error;
+ size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error);
+
+ if (error.value()) {
+ error_func->network_error(error.message());
+ }
+
+ if (len != size)
+ cout << "Network receive error: buffer size doesn't match expected size\n";
+ }
+
+ void read(DeviceTask &task)
+ {
+ int type;
+
+ *archive &type &task.x &task.y &task.w &task.h;
+ *archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples;
+ *archive &task.offset &task.stride;
+ *archive &task.shader_input &task.shader_output &task.shader_eval_type;
+ *archive &task.shader_x &task.shader_w;
+ *archive &task.need_finish_queue;
+
+ task.type = (DeviceTask::Type)type;
+ }
+
+ void read(RenderTile &tile)
+ {
+ *archive &tile.x &tile.y &tile.w &tile.h;
+ *archive &tile.start_sample &tile.num_samples &tile.sample;
+ *archive &tile.resolution &tile.offset &tile.stride;
+ *archive &tile.buffer;
+
+ tile.buffers = NULL;
+ }
+
+ string name;
+
+ protected:
+ tcp::socket &socket;
+ string archive_str;
+ istringstream *archive_stream;
+ i_archive *archive;
+ NetworkError *error_func;
};
/* Server auto discovery */
class ServerDiscovery {
-public:
- explicit ServerDiscovery(bool discover = false)
- : listen_socket(io_service), collect_servers(false)
- {
- /* setup listen socket */
- listen_endpoint.address(boost::asio::ip::address_v4::any());
- listen_endpoint.port(DISCOVER_PORT);
-
- listen_socket.open(listen_endpoint.protocol());
-
- boost::asio::socket_base::reuse_address option(true);
- listen_socket.set_option(option);
-
- listen_socket.bind(listen_endpoint);
-
- /* setup receive callback */
- async_receive();
-
- /* start server discovery */
- if(discover) {
- collect_servers = true;
- servers.clear();
-
- broadcast_message(DISCOVER_REQUEST_MSG);
- }
-
- /* start thread */
- work = new boost::asio::io_service::work(io_service);
- thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service));
- }
-
- ~ServerDiscovery()
- {
- io_service.stop();
- thread->join();
- delete thread;
- delete work;
- }
-
- vector<string> get_server_list()
- {
- vector<string> result;
-
- mutex.lock();
- result = vector<string>(servers.begin(), servers.end());
- mutex.unlock();
-
- return result;
- }
-
-private:
- void handle_receive_from(const boost::system::error_code& error, size_t size)
- {
- if(error) {
- cout << "Server discovery receive error: " << error.message() << "\n";
- return;
- }
-
- if(size > 0) {
- string msg = string(receive_buffer, size);
-
- /* handle incoming message */
- if(collect_servers) {
- if(msg == DISCOVER_REPLY_MSG) {
- string address = receive_endpoint.address().to_string();
-
- mutex.lock();
-
- /* add address if it's not already in the list */
- bool found = std::find(servers.begin(), servers.end(),
- address) != servers.end();
-
- if(!found)
- servers.push_back(address);
-
- mutex.unlock();
- }
- }
- else {
- /* reply to request */
- if(msg == DISCOVER_REQUEST_MSG)
- broadcast_message(DISCOVER_REPLY_MSG);
- }
- }
-
- async_receive();
- }
-
- void async_receive()
- {
- listen_socket.async_receive_from(
- boost::asio::buffer(receive_buffer), receive_endpoint,
- boost::bind(&ServerDiscovery::handle_receive_from, this,
- boost::asio::placeholders::error, boost::asio::placeholders::bytes_transferred));
- }
-
- void broadcast_message(const string& msg)
- {
- /* setup broadcast socket */
- boost::asio::ip::udp::socket socket(io_service);
-
- socket.open(boost::asio::ip::udp::v4());
-
- boost::asio::socket_base::broadcast option(true);
- socket.set_option(option);
-
- boost::asio::ip::udp::endpoint broadcast_endpoint(
- boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT);
-
- /* broadcast message */
- socket.send_to(boost::asio::buffer(msg), broadcast_endpoint);
- }
-
- /* network service and socket */
- boost::asio::io_service io_service;
- boost::asio::ip::udp::endpoint listen_endpoint;
- boost::asio::ip::udp::socket listen_socket;
-
- /* threading */
- boost::thread *thread;
- boost::asio::io_service::work *work;
- boost::mutex mutex;
-
- /* buffer and endpoint for receiving messages */
- char receive_buffer[256];
- boost::asio::ip::udp::endpoint receive_endpoint;
-
- // os, version, devices, status, host name, group name, ip as far as fields go
- struct ServerInfo {
- string cycles_version;
- string os;
- int device_count;
- string status;
- string host_name;
- string group_name;
- string host_addr;
- };
-
- /* collection of server addresses in list */
- bool collect_servers;
- vector<string> servers;
+ public:
+ explicit ServerDiscovery(bool discover = false)
+ : listen_socket(io_service), collect_servers(false)
+ {
+ /* setup listen socket */
+ listen_endpoint.address(boost::asio::ip::address_v4::any());
+ listen_endpoint.port(DISCOVER_PORT);
+
+ listen_socket.open(listen_endpoint.protocol());
+
+ boost::asio::socket_base::reuse_address option(true);
+ listen_socket.set_option(option);
+
+ listen_socket.bind(listen_endpoint);
+
+ /* setup receive callback */
+ async_receive();
+
+ /* start server discovery */
+ if (discover) {
+ collect_servers = true;
+ servers.clear();
+
+ broadcast_message(DISCOVER_REQUEST_MSG);
+ }
+
+ /* start thread */
+ work = new boost::asio::io_service::work(io_service);
+ thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service));
+ }
+
+ ~ServerDiscovery()
+ {
+ io_service.stop();
+ thread->join();
+ delete thread;
+ delete work;
+ }
+
+ vector<string> get_server_list()
+ {
+ vector<string> result;
+
+ mutex.lock();
+ result = vector<string>(servers.begin(), servers.end());
+ mutex.unlock();
+
+ return result;
+ }
+
+ private:
+ void handle_receive_from(const boost::system::error_code &error, size_t size)
+ {
+ if (error) {
+ cout << "Server discovery receive error: " << error.message() << "\n";
+ return;
+ }
+
+ if (size > 0) {
+ string msg = string(receive_buffer, size);
+
+ /* handle incoming message */
+ if (collect_servers) {
+ if (msg == DISCOVER_REPLY_MSG) {
+ string address = receive_endpoint.address().to_string();
+
+ mutex.lock();
+
+ /* add address if it's not already in the list */
+ bool found = std::find(servers.begin(), servers.end(), address) != servers.end();
+
+ if (!found)
+ servers.push_back(address);
+
+ mutex.unlock();
+ }
+ }
+ else {
+ /* reply to request */
+ if (msg == DISCOVER_REQUEST_MSG)
+ broadcast_message(DISCOVER_REPLY_MSG);
+ }
+ }
+
+ async_receive();
+ }
+
+ void async_receive()
+ {
+ listen_socket.async_receive_from(boost::asio::buffer(receive_buffer),
+ receive_endpoint,
+ boost::bind(&ServerDiscovery::handle_receive_from,
+ this,
+ boost::asio::placeholders::error,
+ boost::asio::placeholders::bytes_transferred));
+ }
+
+ void broadcast_message(const string &msg)
+ {
+ /* setup broadcast socket */
+ boost::asio::ip::udp::socket socket(io_service);
+
+ socket.open(boost::asio::ip::udp::v4());
+
+ boost::asio::socket_base::broadcast option(true);
+ socket.set_option(option);
+
+ boost::asio::ip::udp::endpoint broadcast_endpoint(
+ boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT);
+
+ /* broadcast message */
+ socket.send_to(boost::asio::buffer(msg), broadcast_endpoint);
+ }
+
+ /* network service and socket */
+ boost::asio::io_service io_service;
+ boost::asio::ip::udp::endpoint listen_endpoint;
+ boost::asio::ip::udp::socket listen_socket;
+
+ /* threading */
+ boost::thread *thread;
+ boost::asio::io_service::work *work;
+ boost::mutex mutex;
+
+ /* buffer and endpoint for receiving messages */
+ char receive_buffer[256];
+ boost::asio::ip::udp::endpoint receive_endpoint;
+
+  // Fields: os, version, device count, status, host name, group name, host address.
+ struct ServerInfo {
+ string cycles_version;
+ string os;
+ int device_count;
+ string status;
+ string host_name;
+ string group_name;
+ string host_addr;
+ };
+
+  /* Collected server addresses. */
+ bool collect_servers;
+ vector<string> servers;
};
CCL_NAMESPACE_END
#endif
-#endif /* __DEVICE_NETWORK_H__ */
+#endif /* __DEVICE_NETWORK_H__ */
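
The discovery handshake implemented by ServerDiscovery above is a plain UDP broadcast: a client broadcasts DISCOVER_REQUEST_MSG to DISCOVER_PORT, each listening server answers with DISCOVER_REPLY_MSG, and the client records the senders' addresses. Below is a minimal synchronous sketch of that round trip using Boost.Asio; the port number and message string are hypothetical stand-ins for the constants defined earlier in this header, and unlike the real class, which receives asynchronously on a worker thread, this version blocks for a single datagram.

#include <boost/asio.hpp>
#include <iostream>
#include <string>

using boost::asio::ip::udp;

int main()
{
  boost::asio::io_service io_service;

  const unsigned short kDiscoverPort = 8339;       /* hypothetical; stands in for DISCOVER_PORT */
  const std::string kRequest = "discover_request"; /* stands in for DISCOVER_REQUEST_MSG */

  /* Listening socket bound to the discovery port, with reuse_address set so
   * several processes on one host can share it, as in the ServerDiscovery
   * constructor. */
  udp::socket listen_socket(io_service);
  udp::endpoint listen_endpoint(boost::asio::ip::address_v4::any(), kDiscoverPort);
  listen_socket.open(listen_endpoint.protocol());
  listen_socket.set_option(boost::asio::socket_base::reuse_address(true));
  listen_socket.bind(listen_endpoint);

  /* Temporary broadcast socket, as in broadcast_message(). */
  udp::socket broadcast_socket(io_service);
  broadcast_socket.open(udp::v4());
  broadcast_socket.set_option(boost::asio::socket_base::broadcast(true));
  udp::endpoint broadcast_endpoint(boost::asio::ip::address_v4::broadcast(), kDiscoverPort);
  broadcast_socket.send_to(boost::asio::buffer(kRequest), broadcast_endpoint);

  /* Block for one datagram; note the first one seen may be our own request,
   * since broadcasts loop back. The real code receives asynchronously and
   * keeps a deduplicated list of replying server addresses. */
  char buffer[256];
  udp::endpoint sender;
  size_t len = listen_socket.receive_from(boost::asio::buffer(buffer), sender);
  std::cout << "received \"" << std::string(buffer, len) << "\" from "
            << sender.address().to_string() << "\n";
  return 0;
}
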
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 4cefaa217f1..99a8d2438d6 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -16,218 +16,211 @@
#ifdef WITH_OPENCL
-#include "device/opencl/opencl.h"
+# include "device/opencl/opencl.h"
-#include "device/device_intern.h"
+# include "device/device_intern.h"
-#include "util/util_foreach.h"
-#include "util/util_logging.h"
-#include "util/util_set.h"
-#include "util/util_string.h"
+# include "util/util_foreach.h"
+# include "util/util_logging.h"
+# include "util/util_set.h"
+# include "util/util_string.h"
CCL_NAMESPACE_BEGIN
-Device *device_opencl_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background)
+Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
{
- return opencl_create_split_device(info, stats, profiler, background);
+ return opencl_create_split_device(info, stats, profiler, background);
}
bool device_opencl_init()
{
- static bool initialized = false;
- static bool result = false;
-
- if(initialized)
- return result;
-
- initialized = true;
-
- if(OpenCLInfo::device_type() != 0) {
- int clew_result = clewInit();
- if(clew_result == CLEW_SUCCESS) {
- VLOG(1) << "CLEW initialization succeeded.";
- result = true;
- }
- else {
- VLOG(1) << "CLEW initialization failed: "
- << ((clew_result == CLEW_ERROR_ATEXIT_FAILED)
- ? "Error setting up atexit() handler"
- : "Error opening the library");
- }
- }
- else {
- VLOG(1) << "Skip initializing CLEW, platform is force disabled.";
- result = false;
- }
-
- return result;
+ static bool initialized = false;
+ static bool result = false;
+
+ if (initialized)
+ return result;
+
+ initialized = true;
+
+ if (OpenCLInfo::device_type() != 0) {
+ int clew_result = clewInit();
+ if (clew_result == CLEW_SUCCESS) {
+ VLOG(1) << "CLEW initialization succeeded.";
+ result = true;
+ }
+ else {
+ VLOG(1) << "CLEW initialization failed: "
+ << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
+ "Error opening the library");
+ }
+ }
+ else {
+ VLOG(1) << "Skip initializing CLEW, platform is force disabled.";
+ result = false;
+ }
+
+ return result;
}
-
static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms)
{
-#ifdef _WIN32
- __try {
- return clGetPlatformIDs(0, NULL, num_platforms);
- }
- __except(EXCEPTION_EXECUTE_HANDLER) {
- /* Ignore crashes inside the OpenCL driver and hope we can
- * survive even with corrupted OpenCL installs. */
- fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n");
- }
-
- *num_platforms = 0;
- return CL_DEVICE_NOT_FOUND;
-#else
- return clGetPlatformIDs(0, NULL, num_platforms);
-#endif
+# ifdef _WIN32
+ __try {
+ return clGetPlatformIDs(0, NULL, num_platforms);
+ }
+ __except (EXCEPTION_EXECUTE_HANDLER) {
+ /* Ignore crashes inside the OpenCL driver and hope we can
+ * survive even with corrupted OpenCL installs. */
+ fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n");
+ }
+
+ *num_platforms = 0;
+ return CL_DEVICE_NOT_FOUND;
+# else
+ return clGetPlatformIDs(0, NULL, num_platforms);
+# endif
}
-void device_opencl_info(vector<DeviceInfo>& devices)
+void device_opencl_info(vector<DeviceInfo> &devices)
{
- cl_uint num_platforms = 0;
- device_opencl_get_num_platforms_safe(&num_platforms);
- if(num_platforms == 0) {
- return;
- }
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- /* Devices are numbered consecutively across platforms. */
- int num_devices = 0;
- set<string> unique_ids;
- foreach(OpenCLPlatformDevice& platform_device, usable_devices) {
- /* Compute unique ID for persistent user preferences. */
- const string& platform_name = platform_device.platform_name;
- const string& device_name = platform_device.device_name;
- string hardware_id = platform_device.hardware_id;
- if(hardware_id == "") {
- hardware_id = string_printf("ID_%d", num_devices);
- }
- string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
-
- /* Hardware ID might not be unique, add device number in that case. */
- if(unique_ids.find(id) != unique_ids.end()) {
- id += string_printf("_ID_%d", num_devices);
- }
- unique_ids.insert(id);
-
- /* Create DeviceInfo. */
- DeviceInfo info;
- info.type = DEVICE_OPENCL;
- info.description = string_remove_trademark(string(device_name));
- info.num = num_devices;
- /* We don't know if it's used for display, but assume it is. */
- info.display_device = true;
- info.use_split_kernel = true;
- info.has_volume_decoupled = false;
- info.id = id;
-
- /* Check OpenCL extensions */
- info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos;
-
- devices.push_back(info);
- num_devices++;
- }
+ cl_uint num_platforms = 0;
+ device_opencl_get_num_platforms_safe(&num_platforms);
+ if (num_platforms == 0) {
+ return;
+ }
+
+ vector<OpenCLPlatformDevice> usable_devices;
+ OpenCLInfo::get_usable_devices(&usable_devices);
+ /* Devices are numbered consecutively across platforms. */
+ int num_devices = 0;
+ set<string> unique_ids;
+ foreach (OpenCLPlatformDevice &platform_device, usable_devices) {
+ /* Compute unique ID for persistent user preferences. */
+ const string &platform_name = platform_device.platform_name;
+ const string &device_name = platform_device.device_name;
+ string hardware_id = platform_device.hardware_id;
+ if (hardware_id == "") {
+ hardware_id = string_printf("ID_%d", num_devices);
+ }
+ string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id;
+
+ /* Hardware ID might not be unique, add device number in that case. */
+ if (unique_ids.find(id) != unique_ids.end()) {
+ id += string_printf("_ID_%d", num_devices);
+ }
+ unique_ids.insert(id);
+
+ /* Create DeviceInfo. */
+ DeviceInfo info;
+ info.type = DEVICE_OPENCL;
+ info.description = string_remove_trademark(string(device_name));
+ info.num = num_devices;
+ /* We don't know if it's used for display, but assume it is. */
+ info.display_device = true;
+ info.use_split_kernel = true;
+ info.has_volume_decoupled = false;
+ info.id = id;
+
+ /* Check OpenCL extensions */
+ info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos;
+
+ devices.push_back(info);
+ num_devices++;
+ }
}
string device_opencl_capabilities()
{
- if(OpenCLInfo::device_type() == 0) {
- return "All OpenCL devices are forced to be OFF";
- }
- string result = "";
- string error_msg = ""; /* Only used by opencl_assert(), but in the future
- * it could also be nicely reported to the console.
- */
- cl_uint num_platforms = 0;
- opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms));
- if(num_platforms == 0) {
- return "No OpenCL platforms found\n";
- }
- result += string_printf("Number of platforms: %u\n", num_platforms);
-
- vector<cl_platform_id> platform_ids;
- platform_ids.resize(num_platforms);
- opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL));
-
- typedef char cl_string[1024];
-
-#define APPEND_INFO(func, id, name, what, type) \
- do { \
- type data; \
- memset(&data, 0, sizeof(data)); \
- opencl_assert(func(id, what, sizeof(data), &data, NULL)); \
- result += string_printf("%s: %s\n", name, to_string(data).c_str()); \
- } while(false)
-#define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \
- do { \
- char data[1024] = "\0"; \
- size_t length = 0; \
- if(func(id, what, sizeof(data), &data, &length) == CL_SUCCESS) { \
- if(length != 0 && data[0] != '\0') { \
- result += string_printf("%s: %s\n", name, data); \
- } \
- } \
- } while(false)
-#define APPEND_PLATFORM_INFO(id, name, what, type) \
- APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type)
-#define APPEND_DEVICE_INFO(id, name, what, type) \
- APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type)
-#define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \
- APPEND_STRING_EXTENSION_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what)
-
- vector<cl_device_id> device_ids;
- for(cl_uint platform = 0; platform < num_platforms; ++platform) {
- cl_platform_id platform_id = platform_ids[platform];
-
- result += string_printf("Platform #%u\n", platform);
-
- APPEND_PLATFORM_INFO(platform_id, "Name", CL_PLATFORM_NAME, cl_string);
- APPEND_PLATFORM_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR, cl_string);
- APPEND_PLATFORM_INFO(platform_id, "Version", CL_PLATFORM_VERSION, cl_string);
- APPEND_PLATFORM_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE, cl_string);
- APPEND_PLATFORM_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS, cl_string);
-
- cl_uint num_devices = 0;
- opencl_assert(clGetDeviceIDs(platform_ids[platform],
- CL_DEVICE_TYPE_ALL,
- 0,
- NULL,
- &num_devices));
- result += string_printf("\tNumber of devices: %u\n", num_devices);
-
- device_ids.resize(num_devices);
- opencl_assert(clGetDeviceIDs(platform_ids[platform],
- CL_DEVICE_TYPE_ALL,
- num_devices,
- &device_ids[0],
- NULL));
- for(cl_uint device = 0; device < num_devices; ++device) {
- cl_device_id device_id = device_ids[device];
-
- result += string_printf("\t\tDevice: #%u\n", device);
-
- APPEND_DEVICE_INFO(device_id, "Name", CL_DEVICE_NAME, cl_string);
- APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD);
- APPEND_DEVICE_INFO(device_id, "Vendor", CL_DEVICE_VENDOR, cl_string);
- APPEND_DEVICE_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION, cl_string);
- APPEND_DEVICE_INFO(device_id, "Profile", CL_DEVICE_PROFILE, cl_string);
- APPEND_DEVICE_INFO(device_id, "Version", CL_DEVICE_VERSION, cl_string);
- APPEND_DEVICE_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS, cl_string);
- APPEND_DEVICE_INFO(device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint);
- APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint);
- APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t);
- }
- }
-
-#undef APPEND_STRING_INFO
-#undef APPEND_PLATFORM_STRING_INFO
-#undef APPEND_DEVICE_STRING_INFO
-
- return result;
+ if (OpenCLInfo::device_type() == 0) {
+ return "All OpenCL devices are forced to be OFF";
+ }
+ string result = "";
+ string error_msg = ""; /* Only used by opencl_assert(), but in the future
+ * it could also be nicely reported to the console.
+ */
+ cl_uint num_platforms = 0;
+ opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms));
+ if (num_platforms == 0) {
+ return "No OpenCL platforms found\n";
+ }
+ result += string_printf("Number of platforms: %u\n", num_platforms);
+
+ vector<cl_platform_id> platform_ids;
+ platform_ids.resize(num_platforms);
+ opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL));
+
+ typedef char cl_string[1024];
+
+# define APPEND_INFO(func, id, name, what, type) \
+ do { \
+ type data; \
+ memset(&data, 0, sizeof(data)); \
+ opencl_assert(func(id, what, sizeof(data), &data, NULL)); \
+ result += string_printf("%s: %s\n", name, to_string(data).c_str()); \
+ } while (false)
+# define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \
+ do { \
+ char data[1024] = "\0"; \
+ size_t length = 0; \
+ if (func(id, what, sizeof(data), &data, &length) == CL_SUCCESS) { \
+ if (length != 0 && data[0] != '\0') { \
+ result += string_printf("%s: %s\n", name, data); \
+ } \
+ } \
+ } while (false)
+# define APPEND_PLATFORM_INFO(id, name, what, type) \
+ APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type)
+# define APPEND_DEVICE_INFO(id, name, what, type) \
+ APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type)
+# define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \
+ APPEND_STRING_EXTENSION_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what)
+
+ vector<cl_device_id> device_ids;
+ for (cl_uint platform = 0; platform < num_platforms; ++platform) {
+ cl_platform_id platform_id = platform_ids[platform];
+
+ result += string_printf("Platform #%u\n", platform);
+
+ APPEND_PLATFORM_INFO(platform_id, "Name", CL_PLATFORM_NAME, cl_string);
+ APPEND_PLATFORM_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR, cl_string);
+ APPEND_PLATFORM_INFO(platform_id, "Version", CL_PLATFORM_VERSION, cl_string);
+ APPEND_PLATFORM_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE, cl_string);
+ APPEND_PLATFORM_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS, cl_string);
+
+ cl_uint num_devices = 0;
+ opencl_assert(
+ clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices));
+ result += string_printf("\tNumber of devices: %u\n", num_devices);
+
+ device_ids.resize(num_devices);
+ opencl_assert(clGetDeviceIDs(
+ platform_ids[platform], CL_DEVICE_TYPE_ALL, num_devices, &device_ids[0], NULL));
+ for (cl_uint device = 0; device < num_devices; ++device) {
+ cl_device_id device_id = device_ids[device];
+
+ result += string_printf("\t\tDevice: #%u\n", device);
+
+ APPEND_DEVICE_INFO(device_id, "Name", CL_DEVICE_NAME, cl_string);
+ APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD);
+ APPEND_DEVICE_INFO(device_id, "Vendor", CL_DEVICE_VENDOR, cl_string);
+ APPEND_DEVICE_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION, cl_string);
+ APPEND_DEVICE_INFO(device_id, "Profile", CL_DEVICE_PROFILE, cl_string);
+ APPEND_DEVICE_INFO(device_id, "Version", CL_DEVICE_VERSION, cl_string);
+ APPEND_DEVICE_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS, cl_string);
+ APPEND_DEVICE_INFO(
+ device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint);
+ APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint);
+ APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t);
+ }
+ }
+
+# undef APPEND_INFO
+# undef APPEND_STRING_EXTENSION_INFO
+# undef APPEND_PLATFORM_INFO
+# undef APPEND_DEVICE_INFO
+# undef APPEND_DEVICE_STRING_EXTENSION_INFO
+
+ return result;
}
CCL_NAMESPACE_END
-#endif /* WITH_OPENCL */
+#endif /* WITH_OPENCL */
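
device_opencl_init() above caches its outcome in two function-local statics so that clewInit() runs at most once per process, no matter how many devices are enumerated afterwards. A minimal sketch of that guarded one-time initialization pattern, with a made-up expensive_init() standing in for the CLEW setup; like the original, it is not thread-safe on its own, so concurrent first callers would need std::call_once or a mutex.

#include <cstdio>

static bool expensive_init()
{
  std::puts("initializing...");
  return true; /* pretend the library loaded */
}

bool init_once()
{
  static bool initialized = false;
  static bool result = false;

  if (initialized)
    return result;

  initialized = true;
  result = expensive_init();
  return result;
}

int main()
{
  init_once(); /* does the work, prints once */
  init_once(); /* returns the cached result */
  return 0;
}
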
diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index ee566e57918..42e597a34d7 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -27,299 +27,304 @@ CCL_NAMESPACE_BEGIN
static const double alpha = 0.1; /* alpha for rolling average */
DeviceSplitKernel::DeviceSplitKernel(Device *device)
-: device(device),
- split_data(device, "split_data"),
- ray_state(device, "ray_state", MEM_READ_WRITE),
- queue_index(device, "queue_index"),
- use_queues_flag(device, "use_queues_flag"),
- work_pool_wgs(device, "work_pool_wgs"),
- kernel_data_initialized(false)
+ : device(device),
+ split_data(device, "split_data"),
+ ray_state(device, "ray_state", MEM_READ_WRITE),
+ queue_index(device, "queue_index"),
+ use_queues_flag(device, "use_queues_flag"),
+ work_pool_wgs(device, "work_pool_wgs"),
+ kernel_data_initialized(false)
{
- avg_time_per_sample = 0.0;
-
- kernel_path_init = NULL;
- kernel_scene_intersect = NULL;
- kernel_lamp_emission = NULL;
- kernel_do_volume = NULL;
- kernel_queue_enqueue = NULL;
- kernel_indirect_background = NULL;
- kernel_shader_setup = NULL;
- kernel_shader_sort = NULL;
- kernel_shader_eval = NULL;
- kernel_holdout_emission_blurring_pathtermination_ao = NULL;
- kernel_subsurface_scatter = NULL;
- kernel_direct_lighting = NULL;
- kernel_shadow_blocked_ao = NULL;
- kernel_shadow_blocked_dl = NULL;
- kernel_enqueue_inactive = NULL;
- kernel_next_iteration_setup = NULL;
- kernel_indirect_subsurface = NULL;
- kernel_buffer_update = NULL;
+ avg_time_per_sample = 0.0;
+
+ kernel_path_init = NULL;
+ kernel_scene_intersect = NULL;
+ kernel_lamp_emission = NULL;
+ kernel_do_volume = NULL;
+ kernel_queue_enqueue = NULL;
+ kernel_indirect_background = NULL;
+ kernel_shader_setup = NULL;
+ kernel_shader_sort = NULL;
+ kernel_shader_eval = NULL;
+ kernel_holdout_emission_blurring_pathtermination_ao = NULL;
+ kernel_subsurface_scatter = NULL;
+ kernel_direct_lighting = NULL;
+ kernel_shadow_blocked_ao = NULL;
+ kernel_shadow_blocked_dl = NULL;
+ kernel_enqueue_inactive = NULL;
+ kernel_next_iteration_setup = NULL;
+ kernel_indirect_subsurface = NULL;
+ kernel_buffer_update = NULL;
}
DeviceSplitKernel::~DeviceSplitKernel()
{
- split_data.free();
- ray_state.free();
- use_queues_flag.free();
- queue_index.free();
- work_pool_wgs.free();
-
- delete kernel_path_init;
- delete kernel_scene_intersect;
- delete kernel_lamp_emission;
- delete kernel_do_volume;
- delete kernel_queue_enqueue;
- delete kernel_indirect_background;
- delete kernel_shader_setup;
- delete kernel_shader_sort;
- delete kernel_shader_eval;
- delete kernel_holdout_emission_blurring_pathtermination_ao;
- delete kernel_subsurface_scatter;
- delete kernel_direct_lighting;
- delete kernel_shadow_blocked_ao;
- delete kernel_shadow_blocked_dl;
- delete kernel_enqueue_inactive;
- delete kernel_next_iteration_setup;
- delete kernel_indirect_subsurface;
- delete kernel_buffer_update;
+ split_data.free();
+ ray_state.free();
+ use_queues_flag.free();
+ queue_index.free();
+ work_pool_wgs.free();
+
+ delete kernel_path_init;
+ delete kernel_scene_intersect;
+ delete kernel_lamp_emission;
+ delete kernel_do_volume;
+ delete kernel_queue_enqueue;
+ delete kernel_indirect_background;
+ delete kernel_shader_setup;
+ delete kernel_shader_sort;
+ delete kernel_shader_eval;
+ delete kernel_holdout_emission_blurring_pathtermination_ao;
+ delete kernel_subsurface_scatter;
+ delete kernel_direct_lighting;
+ delete kernel_shadow_blocked_ao;
+ delete kernel_shadow_blocked_dl;
+ delete kernel_enqueue_inactive;
+ delete kernel_next_iteration_setup;
+ delete kernel_indirect_subsurface;
+ delete kernel_buffer_update;
}
-bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
+bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features)
{
#define LOAD_KERNEL(name) \
- kernel_##name = get_split_kernel_function(#name, requested_features); \
- if(!kernel_##name) { \
- device->set_error(string("Split kernel error: failed to load kernel_") + #name); \
- return false; \
- }
-
- LOAD_KERNEL(path_init);
- LOAD_KERNEL(scene_intersect);
- LOAD_KERNEL(lamp_emission);
- if (requested_features.use_volume) {
- LOAD_KERNEL(do_volume);
- }
- LOAD_KERNEL(queue_enqueue);
- LOAD_KERNEL(indirect_background);
- LOAD_KERNEL(shader_setup);
- LOAD_KERNEL(shader_sort);
- LOAD_KERNEL(shader_eval);
- LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
- LOAD_KERNEL(subsurface_scatter);
- LOAD_KERNEL(direct_lighting);
- LOAD_KERNEL(shadow_blocked_ao);
- LOAD_KERNEL(shadow_blocked_dl);
- LOAD_KERNEL(enqueue_inactive);
- LOAD_KERNEL(next_iteration_setup);
- LOAD_KERNEL(indirect_subsurface);
- LOAD_KERNEL(buffer_update);
+ kernel_##name = get_split_kernel_function(#name, requested_features); \
+ if (!kernel_##name) { \
+ device->set_error(string("Split kernel error: failed to load kernel_") + #name); \
+ return false; \
+ }
+
+ LOAD_KERNEL(path_init);
+ LOAD_KERNEL(scene_intersect);
+ LOAD_KERNEL(lamp_emission);
+ if (requested_features.use_volume) {
+ LOAD_KERNEL(do_volume);
+ }
+ LOAD_KERNEL(queue_enqueue);
+ LOAD_KERNEL(indirect_background);
+ LOAD_KERNEL(shader_setup);
+ LOAD_KERNEL(shader_sort);
+ LOAD_KERNEL(shader_eval);
+ LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
+ LOAD_KERNEL(subsurface_scatter);
+ LOAD_KERNEL(direct_lighting);
+ LOAD_KERNEL(shadow_blocked_ao);
+ LOAD_KERNEL(shadow_blocked_dl);
+ LOAD_KERNEL(enqueue_inactive);
+ LOAD_KERNEL(next_iteration_setup);
+ LOAD_KERNEL(indirect_subsurface);
+ LOAD_KERNEL(buffer_update);
#undef LOAD_KERNEL
- /* Re-initialiaze kernel-dependent data when kernels change. */
- kernel_data_initialized = false;
+  /* Re-initialize kernel-dependent data when kernels change. */
+ kernel_data_initialized = false;
- return true;
+ return true;
}
-size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size)
+size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg,
+ device_memory &data,
+ uint64_t max_buffer_size)
{
- uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
- VLOG(1) << "Split state element size: "
- << string_human_readable_number(size_per_element) << " bytes. ("
- << string_human_readable_size(size_per_element) << ").";
- return max_buffer_size / size_per_element;
+ uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
+ VLOG(1) << "Split state element size: " << string_human_readable_number(size_per_element)
+ << " bytes. (" << string_human_readable_size(size_per_element) << ").";
+ return max_buffer_size / size_per_element;
}
bool DeviceSplitKernel::path_trace(DeviceTask *task,
- RenderTile& tile,
- device_memory& kgbuffer,
- device_memory& kernel_data)
+ RenderTile &tile,
+ device_memory &kgbuffer,
+ device_memory &kernel_data)
{
- if(device->have_error()) {
- return false;
- }
+ if (device->have_error()) {
+ return false;
+ }
- /* Allocate all required global memory once. */
- if(!kernel_data_initialized) {
- kernel_data_initialized = true;
+ /* Allocate all required global memory once. */
+ if (!kernel_data_initialized) {
+ kernel_data_initialized = true;
- /* Set local size */
- int2 lsize = split_kernel_local_size();
- local_size[0] = lsize[0];
- local_size[1] = lsize[1];
+ /* Set local size */
+ int2 lsize = split_kernel_local_size();
+ local_size[0] = lsize[0];
+ local_size[1] = lsize[1];
- /* Set global size */
- int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
+ /* Set global size */
+ int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);
- /* Make sure that set work size is a multiple of local
- * work size dimensions.
- */
- global_size[0] = round_up(gsize[0], local_size[0]);
- global_size[1] = round_up(gsize[1], local_size[1]);
+    /* Make sure the global work size is a multiple of the
+     * local work size dimensions.
+     */
+ global_size[0] = round_up(gsize[0], local_size[0]);
+ global_size[1] = round_up(gsize[1], local_size[1]);
- int num_global_elements = global_size[0] * global_size[1];
- assert(num_global_elements % WORK_POOL_SIZE == 0);
+ int num_global_elements = global_size[0] * global_size[1];
+ assert(num_global_elements % WORK_POOL_SIZE == 0);
- /* Calculate max groups */
+ /* Calculate max groups */
- /* Denotes the maximum work groups possible w.r.t. current requested tile size. */
- unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU : WORK_POOL_SIZE_GPU;
- unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
+ /* Denotes the maximum work groups possible w.r.t. current requested tile size. */
+ unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU :
+ WORK_POOL_SIZE_GPU;
+ unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
- /* Allocate work_pool_wgs memory. */
- work_pool_wgs.alloc_to_device(max_work_groups);
- queue_index.alloc_to_device(NUM_QUEUES);
- use_queues_flag.alloc_to_device(1);
- split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
- ray_state.alloc(num_global_elements);
- }
+ /* Allocate work_pool_wgs memory. */
+ work_pool_wgs.alloc_to_device(max_work_groups);
+ queue_index.alloc_to_device(NUM_QUEUES);
+ use_queues_flag.alloc_to_device(1);
+ split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
+ ray_state.alloc(num_global_elements);
+ }
- /* Number of elements in the global state buffer */
- int num_global_elements = global_size[0] * global_size[1];
+ /* Number of elements in the global state buffer */
+ int num_global_elements = global_size[0] * global_size[1];
#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
- if(device->have_error()) { \
- return false; \
- } \
- if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
- return false; \
- }
-
- tile.sample = tile.start_sample;
-
- /* for exponential increase between tile updates */
- int time_multiplier = 1;
-
- while(tile.sample < tile.start_sample + tile.num_samples) {
- /* to keep track of how long it takes to run a number of samples */
- double start_time = time_dt();
-
- /* initial guess to start rolling average */
- const int initial_num_samples = 1;
- /* approx number of samples per second */
- int samples_per_second = (avg_time_per_sample > 0.0) ?
- int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples;
-
- RenderTile subtile = tile;
- subtile.start_sample = tile.sample;
- subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample);
-
- if(device->have_error()) {
- return false;
- }
-
- /* reset state memory here as global size for data_init
- * kernel might not be large enough to do in kernel
- */
- work_pool_wgs.zero_to_device();
- split_data.zero_to_device();
- ray_state.zero_to_device();
-
- if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
- subtile,
- num_global_elements,
- kgbuffer,
- kernel_data,
- split_data,
- ray_state,
- queue_index,
- use_queues_flag,
- work_pool_wgs))
- {
- return false;
- }
-
- ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
-
- bool activeRaysAvailable = true;
- double cancel_time = DBL_MAX;
-
- while(activeRaysAvailable) {
- /* Do path-iteration in host [Enqueue Path-iteration kernels. */
- for(int PathIter = 0; PathIter < 16; PathIter++) {
- ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
- if (kernel_do_volume) {
- ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
- }
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
- ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
-
- if(task->get_cancel() && cancel_time == DBL_MAX) {
- /* Wait up to twice as many seconds for current samples to finish
- * to avoid artifacts in render result from ending too soon.
- */
- cancel_time = time_dt() + 2.0 * time_multiplier;
- }
-
- if(time_dt() > cancel_time) {
- return true;
- }
- }
-
- /* Decide if we should exit path-iteration in host. */
- ray_state.copy_from_device(0, global_size[0] * global_size[1], 1);
-
- activeRaysAvailable = false;
-
- for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
- if(!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) {
- if(IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) {
- /* Something went wrong, abort to avoid looping endlessly. */
- device->set_error("Split kernel error: invalid ray state");
- return false;
- }
-
- /* Not all rays are RAY_INACTIVE. */
- activeRaysAvailable = true;
- break;
- }
- }
-
- if(time_dt() > cancel_time) {
- return true;
- }
- }
-
- double time_per_sample = ((time_dt()-start_time) / subtile.num_samples);
-
- if(avg_time_per_sample == 0.0) {
- /* start rolling average */
- avg_time_per_sample = time_per_sample;
- }
- else {
- avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample;
- }
+ if (device->have_error()) { \
+ return false; \
+ } \
+ if (!kernel_##name->enqueue( \
+ KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
+ return false; \
+ }
+
+ tile.sample = tile.start_sample;
+
+ /* for exponential increase between tile updates */
+ int time_multiplier = 1;
+
+ while (tile.sample < tile.start_sample + tile.num_samples) {
+ /* to keep track of how long it takes to run a number of samples */
+ double start_time = time_dt();
+
+ /* initial guess to start rolling average */
+ const int initial_num_samples = 1;
+ /* approx number of samples per second */
+ int samples_per_second = (avg_time_per_sample > 0.0) ?
+ int(double(time_multiplier) / avg_time_per_sample) + 1 :
+ initial_num_samples;
+
+ RenderTile subtile = tile;
+ subtile.start_sample = tile.sample;
+ subtile.num_samples = min(samples_per_second,
+ tile.start_sample + tile.num_samples - tile.sample);
+
+ if (device->have_error()) {
+ return false;
+ }
+
+    /* reset state memory here, as the global size of the data_init
+     * kernel might not be large enough to do it in the kernel
+     */
+ work_pool_wgs.zero_to_device();
+ split_data.zero_to_device();
+ ray_state.zero_to_device();
+
+ if (!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
+ subtile,
+ num_global_elements,
+ kgbuffer,
+ kernel_data,
+ split_data,
+ ray_state,
+ queue_index,
+ use_queues_flag,
+ work_pool_wgs)) {
+ return false;
+ }
+
+ ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);
+
+ bool activeRaysAvailable = true;
+ double cancel_time = DBL_MAX;
+
+ while (activeRaysAvailable) {
+      /* Do path-iteration on the host: enqueue the path-iteration kernels. */
+ for (int PathIter = 0; PathIter < 16; PathIter++) {
+ ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
+ if (kernel_do_volume) {
+ ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
+ }
+ ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(
+ holdout_emission_blurring_pathtermination_ao, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+ ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);
+
+ if (task->get_cancel() && cancel_time == DBL_MAX) {
+ /* Wait up to twice as many seconds for current samples to finish
+             * to avoid artifacts in the render result from ending too soon.
+ */
+ cancel_time = time_dt() + 2.0 * time_multiplier;
+ }
+
+ if (time_dt() > cancel_time) {
+ return true;
+ }
+ }
+
+      /* Decide if we should exit path-iteration on the host. */
+ ray_state.copy_from_device(0, global_size[0] * global_size[1], 1);
+
+ activeRaysAvailable = false;
+
+ for (int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
+ if (!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) {
+ if (IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) {
+ /* Something went wrong, abort to avoid looping endlessly. */
+ device->set_error("Split kernel error: invalid ray state");
+ return false;
+ }
+
+ /* Not all rays are RAY_INACTIVE. */
+ activeRaysAvailable = true;
+ break;
+ }
+ }
+
+ if (time_dt() > cancel_time) {
+ return true;
+ }
+ }
+
+ double time_per_sample = ((time_dt() - start_time) / subtile.num_samples);
+
+ if (avg_time_per_sample == 0.0) {
+ /* start rolling average */
+ avg_time_per_sample = time_per_sample;
+ }
+ else {
+ avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample;
+ }
#undef ENQUEUE_SPLIT_KERNEL
- tile.sample += subtile.num_samples;
- task->update_progress(&tile, tile.w*tile.h*subtile.num_samples);
+ tile.sample += subtile.num_samples;
+ task->update_progress(&tile, tile.w * tile.h * subtile.num_samples);
- time_multiplier = min(time_multiplier << 1, 10);
+ time_multiplier = min(time_multiplier << 1, 10);
- if(task->get_cancel()) {
- return true;
- }
- }
+ if (task->get_cancel()) {
+ return true;
+ }
+ }
- return true;
+ return true;
}
CCL_NAMESPACE_END
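
The batching logic in path_trace() above sizes each batch from an exponential moving average of the measured per-sample time (alpha = 0.1) and doubles time_multiplier up to 10, so tile updates start frequent and grow sparser as the estimate stabilizes. A self-contained sketch of that arithmetic, with synthetic timings in place of time_dt() measurements:

#include <algorithm>
#include <cstdio>

int main()
{
  const double alpha = 0.1; /* same smoothing factor as above */
  double avg_time_per_sample = 0.0;
  int time_multiplier = 1;

  const double measured[] = {0.050, 0.048, 0.052, 0.047}; /* synthetic seconds per sample */

  for (double time_per_sample : measured) {
    if (avg_time_per_sample == 0.0)
      avg_time_per_sample = time_per_sample; /* start the rolling average */
    else
      avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample;

    /* Approximate number of samples that fit before the next tile update. */
    int samples_per_second = int(double(time_multiplier) / avg_time_per_sample) + 1;
    std::printf("avg %.4f s/sample -> batch of %d samples\n",
                avg_time_per_sample, samples_per_second);

    /* Exponential increase between tile updates, capped as in path_trace(). */
    time_multiplier = std::min(time_multiplier << 1, 10);
  }
  return 0;
}
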
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 622733b843f..c9fb2ac844f 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -27,106 +27,115 @@ CCL_NAMESPACE_BEGIN
* Since some bytes may be needed for aligning chunks of memory;
* This is the amount of memory that we dedicate for that purpose.
*/
-#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 // 5MB
/* Types used for split kernel */
class KernelDimensions {
-public:
- size_t global_size[2];
- size_t local_size[2];
-
- KernelDimensions(size_t global_size_[2], size_t local_size_[2])
- {
- memcpy(global_size, global_size_, sizeof(global_size));
- memcpy(local_size, local_size_, sizeof(local_size));
- }
+ public:
+ size_t global_size[2];
+ size_t local_size[2];
+
+ KernelDimensions(size_t global_size_[2], size_t local_size_[2])
+ {
+ memcpy(global_size, global_size_, sizeof(global_size));
+ memcpy(local_size, local_size_, sizeof(local_size));
+ }
};
class SplitKernelFunction {
-public:
- virtual ~SplitKernelFunction() {}
+ public:
+ virtual ~SplitKernelFunction()
+ {
+ }
- /* enqueue the kernel, returns false if there is an error */
- virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) = 0;
+ /* enqueue the kernel, returns false if there is an error */
+ virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) = 0;
};
class DeviceSplitKernel {
-private:
- Device *device;
-
- SplitKernelFunction *kernel_path_init;
- SplitKernelFunction *kernel_scene_intersect;
- SplitKernelFunction *kernel_lamp_emission;
- SplitKernelFunction *kernel_do_volume;
- SplitKernelFunction *kernel_queue_enqueue;
- SplitKernelFunction *kernel_indirect_background;
- SplitKernelFunction *kernel_shader_setup;
- SplitKernelFunction *kernel_shader_sort;
- SplitKernelFunction *kernel_shader_eval;
- SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
- SplitKernelFunction *kernel_subsurface_scatter;
- SplitKernelFunction *kernel_direct_lighting;
- SplitKernelFunction *kernel_shadow_blocked_ao;
- SplitKernelFunction *kernel_shadow_blocked_dl;
- SplitKernelFunction *kernel_enqueue_inactive;
- SplitKernelFunction *kernel_next_iteration_setup;
- SplitKernelFunction *kernel_indirect_subsurface;
- SplitKernelFunction *kernel_buffer_update;
-
- /* Global memory variables [porting]; These memory is used for
- * co-operation between different kernels; Data written by one
- * kernel will be available to another kernel via this global
- * memory.
- */
- device_only_memory<uchar> split_data;
- device_vector<uchar> ray_state;
- device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */
-
- /* Flag to make sceneintersect and lampemission kernel use queues. */
- device_only_memory<char> use_queues_flag;
-
- /* Approximate time it takes to complete one sample */
- double avg_time_per_sample;
-
- /* Work pool with respect to each work group. */
- device_only_memory<unsigned int> work_pool_wgs;
-
- /* Cached kernel-dependent data, initialized once. */
- bool kernel_data_initialized;
- size_t local_size[2];
- size_t global_size[2];
-
-public:
- explicit DeviceSplitKernel(Device* device);
- virtual ~DeviceSplitKernel();
-
- bool load_kernels(const DeviceRequestedFeatures& requested_features);
- bool path_trace(DeviceTask *task,
- RenderTile& rtile,
- device_memory& kgbuffer,
- device_memory& kernel_data);
-
- virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0;
- size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size);
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
- RenderTile& rtile,
- int num_global_elements,
- device_memory& kernel_globals,
- device_memory& kernel_data_,
- device_memory& split_data,
- device_memory& ray_state,
- device_memory& queue_index,
- device_memory& use_queues_flag,
- device_memory& work_pool_wgs) = 0;
-
- virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
- const DeviceRequestedFeatures&) = 0;
- virtual int2 split_kernel_local_size() = 0;
- virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0;
+ private:
+ Device *device;
+
+ SplitKernelFunction *kernel_path_init;
+ SplitKernelFunction *kernel_scene_intersect;
+ SplitKernelFunction *kernel_lamp_emission;
+ SplitKernelFunction *kernel_do_volume;
+ SplitKernelFunction *kernel_queue_enqueue;
+ SplitKernelFunction *kernel_indirect_background;
+ SplitKernelFunction *kernel_shader_setup;
+ SplitKernelFunction *kernel_shader_sort;
+ SplitKernelFunction *kernel_shader_eval;
+ SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao;
+ SplitKernelFunction *kernel_subsurface_scatter;
+ SplitKernelFunction *kernel_direct_lighting;
+ SplitKernelFunction *kernel_shadow_blocked_ao;
+ SplitKernelFunction *kernel_shadow_blocked_dl;
+ SplitKernelFunction *kernel_enqueue_inactive;
+ SplitKernelFunction *kernel_next_iteration_setup;
+ SplitKernelFunction *kernel_indirect_subsurface;
+ SplitKernelFunction *kernel_buffer_update;
+
+  /* Global memory variables [porting]: this memory is used for
+   * cooperation between different kernels; data written by one
+   * kernel is available to other kernels via this global
+   * memory.
+   */
+ device_only_memory<uchar> split_data;
+ device_vector<uchar> ray_state;
+ device_only_memory<int>
+ queue_index; /* Array of size num_queues that tracks the size of each queue. */
+
+  /* Flag to make the scene_intersect and lamp_emission kernels use queues. */
+ device_only_memory<char> use_queues_flag;
+
+ /* Approximate time it takes to complete one sample */
+ double avg_time_per_sample;
+
+ /* Work pool with respect to each work group. */
+ device_only_memory<unsigned int> work_pool_wgs;
+
+ /* Cached kernel-dependent data, initialized once. */
+ bool kernel_data_initialized;
+ size_t local_size[2];
+ size_t global_size[2];
+
+ public:
+ explicit DeviceSplitKernel(Device *device);
+ virtual ~DeviceSplitKernel();
+
+ bool load_kernels(const DeviceRequestedFeatures &requested_features);
+ bool path_trace(DeviceTask *task,
+ RenderTile &rtile,
+ device_memory &kgbuffer,
+ device_memory &kernel_data);
+
+ virtual uint64_t state_buffer_size(device_memory &kg,
+ device_memory &data,
+ size_t num_threads) = 0;
+ size_t max_elements_for_max_buffer_size(device_memory &kg,
+ device_memory &data,
+ uint64_t max_buffer_size);
+
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
+ RenderTile &rtile,
+ int num_global_elements,
+ device_memory &kernel_globals,
+ device_memory &kernel_data_,
+ device_memory &split_data,
+ device_memory &ray_state,
+ device_memory &queue_index,
+ device_memory &use_queues_flag,
+ device_memory &work_pool_wgs) = 0;
+
+ virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
+ const DeviceRequestedFeatures &) = 0;
+ virtual int2 split_kernel_local_size() = 0;
+ virtual int2 split_kernel_global_size(device_memory &kg,
+ device_memory &data,
+ DeviceTask *task) = 0;
};
CCL_NAMESPACE_END
-#endif /* __DEVICE_SPLIT_KERNEL_H__ */
+#endif /* __DEVICE_SPLIT_KERNEL_H__ */
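
max_elements_for_max_buffer_size() above derives the per-element cost by sizing the split state for 1024 threads and dividing by 1024, then divides the memory budget by that cost to get a thread count. A sketch of the calculation; the fixed per-thread state size is a made-up stand-in for the device-specific state_buffer_size() override.

#include <cstddef>
#include <cstdint>
#include <cstdio>

static uint64_t state_buffer_size(size_t num_threads)
{
  const uint64_t per_thread_state = 1408; /* hypothetical bytes of split state per thread */
  return per_thread_state * num_threads;
}

int main()
{
  /* Size for 1024 threads, divide back down to get bytes per element. */
  uint64_t size_per_element = state_buffer_size(1024) / 1024;

  uint64_t max_buffer_size = 512ull * 1024 * 1024; /* 512 MB budget */
  std::printf("%llu bytes per element, %llu elements fit\n",
              (unsigned long long)size_per_element,
              (unsigned long long)(max_buffer_size / size_per_element));
  return 0;
}
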
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index 8310863886c..376ad06a734 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -29,100 +29,111 @@ CCL_NAMESPACE_BEGIN
/* Device Task */
DeviceTask::DeviceTask(Type type_)
-: type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0),
- sample(0), num_samples(1),
- shader_input(0), shader_output(0),
- shader_eval_type(0), shader_filter(0), shader_x(0), shader_w(0)
+ : type(type_),
+ x(0),
+ y(0),
+ w(0),
+ h(0),
+ rgba_byte(0),
+ rgba_half(0),
+ buffer(0),
+ sample(0),
+ num_samples(1),
+ shader_input(0),
+ shader_output(0),
+ shader_eval_type(0),
+ shader_filter(0),
+ shader_x(0),
+ shader_w(0)
{
- last_update_time = time_dt();
+ last_update_time = time_dt();
}
int DeviceTask::get_subtask_count(int num, int max_size)
{
- if(max_size != 0) {
- int max_size_num;
-
- if(type == SHADER) {
- max_size_num = (shader_w + max_size - 1)/max_size;
- }
- else {
- max_size = max(1, max_size/w);
- max_size_num = (h + max_size - 1)/max_size;
- }
-
- num = max(max_size_num, num);
- }
-
- if(type == SHADER) {
- num = min(shader_w, num);
- }
- else if(type == RENDER) {
- }
- else {
- num = min(h, num);
- }
-
- return num;
+ if (max_size != 0) {
+ int max_size_num;
+
+ if (type == SHADER) {
+ max_size_num = (shader_w + max_size - 1) / max_size;
+ }
+ else {
+ max_size = max(1, max_size / w);
+ max_size_num = (h + max_size - 1) / max_size;
+ }
+
+ num = max(max_size_num, num);
+ }
+
+ if (type == SHADER) {
+ num = min(shader_w, num);
+ }
+ else if (type == RENDER) {
+ }
+ else {
+ num = min(h, num);
+ }
+
+ return num;
}
-void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
+void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size)
{
- num = get_subtask_count(num, max_size);
-
- if(type == SHADER) {
- for(int i = 0; i < num; i++) {
- int tx = shader_x + (shader_w/num)*i;
- int tw = (i == num-1)? shader_w - i*(shader_w/num): shader_w/num;
-
- DeviceTask task = *this;
-
- task.shader_x = tx;
- task.shader_w = tw;
-
- tasks.push_back(task);
- }
- }
- else if(type == RENDER) {
- for(int i = 0; i < num; i++)
- tasks.push_back(*this);
- }
- else {
- for(int i = 0; i < num; i++) {
- int ty = y + (h/num)*i;
- int th = (i == num-1)? h - i*(h/num): h/num;
-
- DeviceTask task = *this;
-
- task.y = ty;
- task.h = th;
-
- tasks.push_back(task);
- }
- }
+ num = get_subtask_count(num, max_size);
+
+ if (type == SHADER) {
+ for (int i = 0; i < num; i++) {
+ int tx = shader_x + (shader_w / num) * i;
+ int tw = (i == num - 1) ? shader_w - i * (shader_w / num) : shader_w / num;
+
+ DeviceTask task = *this;
+
+ task.shader_x = tx;
+ task.shader_w = tw;
+
+ tasks.push_back(task);
+ }
+ }
+ else if (type == RENDER) {
+ for (int i = 0; i < num; i++)
+ tasks.push_back(*this);
+ }
+ else {
+ for (int i = 0; i < num; i++) {
+ int ty = y + (h / num) * i;
+ int th = (i == num - 1) ? h - i * (h / num) : h / num;
+
+ DeviceTask task = *this;
+
+ task.y = ty;
+ task.h = th;
+
+ tasks.push_back(task);
+ }
+ }
}
void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
{
- if((type != RENDER) &&
- (type != SHADER))
- return;
-
- if(update_progress_sample) {
- if(pixel_samples == -1) {
- pixel_samples = shader_w;
- }
- update_progress_sample(pixel_samples, rtile? rtile->sample : 0);
- }
-
- if(update_tile_sample) {
- double current_time = time_dt();
-
- if(current_time - last_update_time >= 1.0) {
- update_tile_sample(*rtile);
-
- last_update_time = current_time;
- }
- }
+ if ((type != RENDER) && (type != SHADER))
+ return;
+
+ if (update_progress_sample) {
+ if (pixel_samples == -1) {
+ pixel_samples = shader_w;
+ }
+ update_progress_sample(pixel_samples, rtile ? rtile->sample : 0);
+ }
+
+ if (update_tile_sample) {
+ double current_time = time_dt();
+
+ if (current_time - last_update_time >= 1.0) {
+ update_tile_sample(*rtile);
+
+ last_update_time = current_time;
+ }
+ }
}
CCL_NAMESPACE_END
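
DeviceTask::split() above divides a tile into num horizontal strips of h/num rows each and folds the integer-division remainder into the last strip, so every row is covered exactly once. The same arithmetic in isolation:

#include <cstdio>

int main()
{
  const int y = 0, h = 10, num = 3;

  for (int i = 0; i < num; i++) {
    int ty = y + (h / num) * i;
    int th = (i == num - 1) ? h - i * (h / num) : h / num;
    std::printf("subtask %d: rows [%d, %d)\n", i, ty, ty + th);
  }
  /* Prints [0,3), [3,6), [6,10): the last strip absorbs the remainder. */
  return 0;
}
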
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index f1fd4246868..5cc2e5e25db 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -33,87 +33,88 @@ class RenderTile;
class Tile;
class DenoiseParams {
-public:
- /* Pixel radius for neighbouring pixels to take into account. */
- int radius;
- /* Controls neighbor pixel weighting for the denoising filter. */
- float strength;
- /* Preserve more or less detail based on feature passes. */
- float feature_strength;
- /* When removing pixels that don't carry information, use a relative threshold instead of an absolute one. */
- bool relative_pca;
- /* How many frames before and after the current center frame are included. */
- int neighbor_frames;
- /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
- bool clamp_input;
-
- DenoiseParams()
- {
- radius = 8;
- strength = 0.5f;
- feature_strength = 0.5f;
- relative_pca = false;
- neighbor_frames = 2;
- clamp_input = true;
- }
+ public:
+ /* Pixel radius for neighbouring pixels to take into account. */
+ int radius;
+ /* Controls neighbor pixel weighting for the denoising filter. */
+ float strength;
+ /* Preserve more or less detail based on feature passes. */
+ float feature_strength;
+ /* When removing pixels that don't carry information, use a relative threshold instead of an absolute one. */
+ bool relative_pca;
+ /* How many frames before and after the current center frame are included. */
+ int neighbor_frames;
+ /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */
+ bool clamp_input;
+
+ DenoiseParams()
+ {
+ radius = 8;
+ strength = 0.5f;
+ feature_strength = 0.5f;
+ relative_pca = false;
+ neighbor_frames = 2;
+ clamp_input = true;
+ }
};
class DeviceTask : public Task {
-public:
- typedef enum { RENDER, FILM_CONVERT, SHADER } Type;
- Type type;
-
- int x, y, w, h;
- device_ptr rgba_byte;
- device_ptr rgba_half;
- device_ptr buffer;
- int sample;
- int num_samples;
- int offset, stride;
-
- device_ptr shader_input;
- device_ptr shader_output;
- int shader_eval_type;
- int shader_filter;
- int shader_x, shader_w;
-
- int passes_size;
-
- explicit DeviceTask(Type type = RENDER);
-
- int get_subtask_count(int num, int max_size = 0);
- void split(list<DeviceTask>& tasks, int num, int max_size = 0);
-
- void update_progress(RenderTile *rtile, int pixel_samples = -1);
-
- function<bool(Device *device, RenderTile&)> acquire_tile;
- function<void(long, int)> update_progress_sample;
- function<void(RenderTile&)> update_tile_sample;
- function<void(RenderTile&)> release_tile;
- function<bool()> get_cancel;
- function<void(RenderTile*, Device*)> map_neighbor_tiles;
- function<void(RenderTile*, Device*)> unmap_neighbor_tiles;
-
- DenoiseParams denoising;
- bool denoising_from_render;
- vector<int> denoising_frames;
-
- bool denoising_do_filter;
- bool denoising_write_passes;
-
- int pass_stride;
- int frame_stride;
- int target_pass_stride;
- int pass_denoising_data;
- int pass_denoising_clean;
-
- bool need_finish_queue;
- bool integrator_branched;
- int2 requested_tile_size;
-protected:
- double last_update_time;
+ public:
+ typedef enum { RENDER, FILM_CONVERT, SHADER } Type;
+ Type type;
+
+ int x, y, w, h;
+ device_ptr rgba_byte;
+ device_ptr rgba_half;
+ device_ptr buffer;
+ int sample;
+ int num_samples;
+ int offset, stride;
+
+ device_ptr shader_input;
+ device_ptr shader_output;
+ int shader_eval_type;
+ int shader_filter;
+ int shader_x, shader_w;
+
+ int passes_size;
+
+ explicit DeviceTask(Type type = RENDER);
+
+ int get_subtask_count(int num, int max_size = 0);
+ void split(list<DeviceTask> &tasks, int num, int max_size = 0);
+
+ void update_progress(RenderTile *rtile, int pixel_samples = -1);
+
+ function<bool(Device *device, RenderTile &)> acquire_tile;
+ function<void(long, int)> update_progress_sample;
+ function<void(RenderTile &)> update_tile_sample;
+ function<void(RenderTile &)> release_tile;
+ function<bool()> get_cancel;
+ function<void(RenderTile *, Device *)> map_neighbor_tiles;
+ function<void(RenderTile *, Device *)> unmap_neighbor_tiles;
+
+ DenoiseParams denoising;
+ bool denoising_from_render;
+ vector<int> denoising_frames;
+
+ bool denoising_do_filter;
+ bool denoising_write_passes;
+
+ int pass_stride;
+ int frame_stride;
+ int target_pass_stride;
+ int pass_denoising_data;
+ int pass_denoising_clean;
+
+ bool need_finish_queue;
+ bool integrator_branched;
+ int2 requested_tile_size;
+
+ protected:
+ double last_update_time;
};
CCL_NAMESPACE_END
-#endif /* __DEVICE_TASK_H__ */
+#endif /* __DEVICE_TASK_H__ */
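
DeviceTask above injects host-side behaviour (tile acquisition, progress reporting, cancellation) through std::function members rather than virtual methods, so the GUI, command-line renderer, and network server can each plug in their own callbacks without subclassing. A trimmed sketch of that style; MiniTask is a hypothetical stand-in, not the Cycles class.

#include <cstdio>
#include <functional>

struct MiniTask {
  std::function<bool()> get_cancel;
  std::function<void(int)> update_progress_sample;
};

int main()
{
  MiniTask task;

  /* The host wires in whatever behaviour it wants. */
  task.get_cancel = [] { return false; };
  task.update_progress_sample = [](int samples) { std::printf("rendered %d samples\n", samples); };

  if (!task.get_cancel())
    task.update_progress_sample(16);
  return 0;
}
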
diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp
index 9cb105982aa..f85aadce1c2 100644
--- a/intern/cycles/device/opencl/memory_manager.cpp
+++ b/intern/cycles/device/opencl/memory_manager.cpp
@@ -16,241 +16,246 @@
#ifdef WITH_OPENCL
-#include "util/util_foreach.h"
+# include "util/util_foreach.h"
-#include "device/opencl/opencl.h"
-#include "device/opencl/memory_manager.h"
+# include "device/opencl/opencl.h"
+# include "device/opencl/memory_manager.h"
CCL_NAMESPACE_BEGIN
-void MemoryManager::DeviceBuffer::add_allocation(Allocation& allocation)
+void MemoryManager::DeviceBuffer::add_allocation(Allocation &allocation)
{
- allocations.push_back(&allocation);
+ allocations.push_back(&allocation);
}
void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device)
{
- bool need_realloc = false;
-
- /* Calculate total size and remove any freed. */
- size_t total_size = 0;
-
- for(int i = allocations.size()-1; i >= 0; i--) {
- Allocation* allocation = allocations[i];
-
- /* Remove allocations that have been freed. */
- if(!allocation->mem || allocation->mem->memory_size() == 0) {
- allocation->device_buffer = NULL;
- allocation->size = 0;
-
- allocations.erase(allocations.begin()+i);
-
- need_realloc = true;
-
- continue;
- }
-
- /* Get actual size for allocation. */
- size_t alloc_size = align_up(allocation->mem->memory_size(), 16);
-
- if(allocation->size != alloc_size) {
- /* Allocation is either new or resized. */
- allocation->size = alloc_size;
- allocation->needs_copy_to_device = true;
-
- need_realloc = true;
- }
-
- total_size += alloc_size;
- }
-
- if(need_realloc) {
- cl_ulong max_buffer_size;
- clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
-
- if(total_size > max_buffer_size) {
- device->set_error("Scene too complex to fit in available memory.");
- return;
- }
-
- device_only_memory<uchar> *new_buffer =
- new device_only_memory<uchar>(device, "memory manager buffer");
-
- new_buffer->alloc_to_device(total_size);
-
- size_t offset = 0;
-
- foreach(Allocation* allocation, allocations) {
- if(allocation->needs_copy_to_device) {
- /* Copy from host to device. */
- opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
- CL_MEM_PTR(new_buffer->device_pointer),
- CL_FALSE,
- offset,
- allocation->mem->memory_size(),
- allocation->mem->host_pointer,
- 0, NULL, NULL
- ));
-
- allocation->needs_copy_to_device = false;
- }
- else {
- /* Fast copy from memory already on device. */
- opencl_device_assert(device, clEnqueueCopyBuffer(device->cqCommandQueue,
- CL_MEM_PTR(buffer->device_pointer),
- CL_MEM_PTR(new_buffer->device_pointer),
- allocation->desc.offset,
- offset,
- allocation->mem->memory_size(),
- 0, NULL, NULL
- ));
- }
-
- allocation->desc.offset = offset;
- offset += allocation->size;
- }
-
- delete buffer;
-
- buffer = new_buffer;
- }
- else {
- assert(total_size == buffer->data_size);
-
- size_t offset = 0;
-
- foreach(Allocation* allocation, allocations) {
- if(allocation->needs_copy_to_device) {
- /* Copy from host to device. */
- opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue,
- CL_MEM_PTR(buffer->device_pointer),
- CL_FALSE,
- offset,
- allocation->mem->memory_size(),
- allocation->mem->host_pointer,
- 0, NULL, NULL
- ));
-
- allocation->needs_copy_to_device = false;
- }
-
- offset += allocation->size;
- }
- }
-
- /* Not really necessary, but seems to improve responsiveness for some reason. */
- clFinish(device->cqCommandQueue);
+ bool need_realloc = false;
+
+ /* Calculate total size and remove any freed. */
+ size_t total_size = 0;
+
+ for (int i = allocations.size() - 1; i >= 0; i--) {
+ Allocation *allocation = allocations[i];
+
+ /* Remove allocations that have been freed. */
+ if (!allocation->mem || allocation->mem->memory_size() == 0) {
+ allocation->device_buffer = NULL;
+ allocation->size = 0;
+
+ allocations.erase(allocations.begin() + i);
+
+ need_realloc = true;
+
+ continue;
+ }
+
+ /* Get actual size for allocation. */
+ size_t alloc_size = align_up(allocation->mem->memory_size(), 16);
+
+ if (allocation->size != alloc_size) {
+ /* Allocation is either new or resized. */
+ allocation->size = alloc_size;
+ allocation->needs_copy_to_device = true;
+
+ need_realloc = true;
+ }
+
+ total_size += alloc_size;
+ }
+
+ if (need_realloc) {
+ cl_ulong max_buffer_size;
+ clGetDeviceInfo(
+ device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
+
+ if (total_size > max_buffer_size) {
+ device->set_error("Scene too complex to fit in available memory.");
+ return;
+ }
+
+ device_only_memory<uchar> *new_buffer = new device_only_memory<uchar>(device,
+ "memory manager buffer");
+
+ new_buffer->alloc_to_device(total_size);
+
+ size_t offset = 0;
+
+ foreach (Allocation *allocation, allocations) {
+ if (allocation->needs_copy_to_device) {
+ /* Copy from host to device. */
+ opencl_device_assert(device,
+ clEnqueueWriteBuffer(device->cqCommandQueue,
+ CL_MEM_PTR(new_buffer->device_pointer),
+ CL_FALSE,
+ offset,
+ allocation->mem->memory_size(),
+ allocation->mem->host_pointer,
+ 0,
+ NULL,
+ NULL));
+
+ allocation->needs_copy_to_device = false;
+ }
+ else {
+ /* Fast copy from memory already on device. */
+ opencl_device_assert(device,
+ clEnqueueCopyBuffer(device->cqCommandQueue,
+ CL_MEM_PTR(buffer->device_pointer),
+ CL_MEM_PTR(new_buffer->device_pointer),
+ allocation->desc.offset,
+ offset,
+ allocation->mem->memory_size(),
+ 0,
+ NULL,
+ NULL));
+ }
+
+ allocation->desc.offset = offset;
+ offset += allocation->size;
+ }
+
+ delete buffer;
+
+ buffer = new_buffer;
+ }
+ else {
+ assert(total_size == buffer->data_size);
+
+ size_t offset = 0;
+
+ foreach (Allocation *allocation, allocations) {
+ if (allocation->needs_copy_to_device) {
+ /* Copy from host to device. */
+ opencl_device_assert(device,
+ clEnqueueWriteBuffer(device->cqCommandQueue,
+ CL_MEM_PTR(buffer->device_pointer),
+ CL_FALSE,
+ offset,
+ allocation->mem->memory_size(),
+ allocation->mem->host_pointer,
+ 0,
+ NULL,
+ NULL));
+
+ allocation->needs_copy_to_device = false;
+ }
+
+ offset += allocation->size;
+ }
+ }
+
+ /* Not really necessary, but seems to improve responsiveness for some reason. */
+ clFinish(device->cqCommandQueue);
}
void MemoryManager::DeviceBuffer::free(OpenCLDevice *)
{
- buffer->free();
+ buffer->free();
}
-MemoryManager::DeviceBuffer* MemoryManager::smallest_device_buffer()
+MemoryManager::DeviceBuffer *MemoryManager::smallest_device_buffer()
{
- DeviceBuffer* smallest = device_buffers;
+ DeviceBuffer *smallest = device_buffers;
- foreach(DeviceBuffer& device_buffer, device_buffers) {
- if(device_buffer.size < smallest->size) {
- smallest = &device_buffer;
- }
- }
+ foreach (DeviceBuffer &device_buffer, device_buffers) {
+ if (device_buffer.size < smallest->size) {
+ smallest = &device_buffer;
+ }
+ }
- return smallest;
+ return smallest;
}
-MemoryManager::MemoryManager(OpenCLDevice *device)
-: device(device), need_update(false)
+MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false)
{
- foreach(DeviceBuffer& device_buffer, device_buffers) {
- device_buffer.buffer =
- new device_only_memory<uchar>(device, "memory manager buffer");
- }
+ foreach (DeviceBuffer &device_buffer, device_buffers) {
+ device_buffer.buffer = new device_only_memory<uchar>(device, "memory manager buffer");
+ }
}
void MemoryManager::free()
{
- foreach(DeviceBuffer& device_buffer, device_buffers) {
- device_buffer.free(device);
- }
+ foreach (DeviceBuffer &device_buffer, device_buffers) {
+ device_buffer.free(device);
+ }
}
-void MemoryManager::alloc(const char *name, device_memory& mem)
+void MemoryManager::alloc(const char *name, device_memory &mem)
{
- Allocation& allocation = allocations[name];
+ Allocation &allocation = allocations[name];
- allocation.mem = &mem;
- allocation.needs_copy_to_device = true;
+ allocation.mem = &mem;
+ allocation.needs_copy_to_device = true;
- if(!allocation.device_buffer) {
- DeviceBuffer* device_buffer = smallest_device_buffer();
- allocation.device_buffer = device_buffer;
+ if (!allocation.device_buffer) {
+ DeviceBuffer *device_buffer = smallest_device_buffer();
+ allocation.device_buffer = device_buffer;
- allocation.desc.device_buffer = device_buffer - device_buffers;
+ allocation.desc.device_buffer = device_buffer - device_buffers;
- device_buffer->add_allocation(allocation);
+ device_buffer->add_allocation(allocation);
- device_buffer->size += mem.memory_size();
- }
+ device_buffer->size += mem.memory_size();
+ }
- need_update = true;
+ need_update = true;
}
-bool MemoryManager::free(device_memory& mem)
+bool MemoryManager::free(device_memory &mem)
{
- foreach(AllocationsMap::value_type& value, allocations) {
- Allocation& allocation = value.second;
- if(allocation.mem == &mem) {
+ foreach (AllocationsMap::value_type &value, allocations) {
+ Allocation &allocation = value.second;
+ if (allocation.mem == &mem) {
- allocation.device_buffer->size -= mem.memory_size();
+ allocation.device_buffer->size -= mem.memory_size();
- allocation.mem = NULL;
- allocation.needs_copy_to_device = false;
+ allocation.mem = NULL;
+ allocation.needs_copy_to_device = false;
- need_update = true;
- return true;
- }
- }
+ need_update = true;
+ return true;
+ }
+ }
- return false;
+ return false;
}
MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name)
{
- update_device_memory();
+ update_device_memory();
- Allocation& allocation = allocations[name];
- return allocation.desc;
+ Allocation &allocation = allocations[name];
+ return allocation.desc;
}
void MemoryManager::update_device_memory()
{
- if(!need_update) {
- return;
- }
+ if (!need_update) {
+ return;
+ }
- need_update = false;
+ need_update = false;
- foreach(DeviceBuffer& device_buffer, device_buffers) {
- device_buffer.update_device_memory(device);
- }
+ foreach (DeviceBuffer &device_buffer, device_buffers) {
+ device_buffer.update_device_memory(device);
+ }
}
void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
{
- update_device_memory();
-
- foreach(DeviceBuffer& device_buffer, device_buffers) {
- if(device_buffer.buffer->device_pointer) {
- device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer);
- }
- else {
- device->kernel_set_args(kernel, (*narg)++, device->null_mem);
- }
- }
+ update_device_memory();
+
+ foreach (DeviceBuffer &device_buffer, device_buffers) {
+ if (device_buffer.buffer->device_pointer) {
+ device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer);
+ }
+ else {
+ device->kernel_set_args(kernel, (*narg)++, device->null_mem);
+ }
+ }
}
CCL_NAMESPACE_END
-#endif /* WITH_OPENCL */
+#endif /* WITH_OPENCL */
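The reformatted update_device_memory() above is also the heart of the manager: live allocations are packed back to back into one device arena at 16-byte-aligned offsets, and the arena is reallocated only when an allocation was added, freed or resized. A minimal host-side sketch of that packing step, with simplified stand-in types (Alloc and pack_allocations are illustrative, not the Cycles API):

#include <cstddef>
#include <vector>

/* Round `size` up to a multiple of `alignment` (must be a power of two). */
static size_t align_up(size_t size, size_t alignment)
{
  return (size + alignment - 1) & ~(alignment - 1);
}

struct Alloc {
  size_t requested; /* Bytes the caller asked for. */
  size_t offset;    /* Assigned position inside the arena. */
};

/* Assign back-to-back aligned offsets; returns the arena size to allocate. */
static size_t pack_allocations(std::vector<Alloc> &allocs)
{
  size_t offset = 0;
  for (Alloc &a : allocs) {
    a.offset = offset;
    offset += align_up(a.requested, 16);
  }
  return offset;
}

Once every allocation knows its new offset, data already resident on the device can be moved with clEnqueueCopyBuffer from the old arena, while dirty allocations are re-uploaded with clEnqueueWriteBuffer, which is exactly what the two branches in update_device_memory() do.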
diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h
index 8fcc4440369..2fbc97a0756 100644
--- a/intern/cycles/device/opencl/memory_manager.h
+++ b/intern/cycles/device/opencl/memory_manager.h
@@ -29,78 +29,77 @@ CCL_NAMESPACE_BEGIN
class OpenCLDevice;
class MemoryManager {
-public:
- static const int NUM_DEVICE_BUFFERS = 8;
+ public:
+ static const int NUM_DEVICE_BUFFERS = 8;
- struct BufferDescriptor {
- uint device_buffer;
- cl_ulong offset;
- };
+ struct BufferDescriptor {
+ uint device_buffer;
+ cl_ulong offset;
+ };
-private:
- struct DeviceBuffer;
+ private:
+ struct DeviceBuffer;
- struct Allocation {
- device_memory *mem;
+ struct Allocation {
+ device_memory *mem;
- DeviceBuffer *device_buffer;
- size_t size; /* Size of actual allocation, may be larger than requested. */
+ DeviceBuffer *device_buffer;
+ size_t size; /* Size of actual allocation, may be larger than requested. */
- BufferDescriptor desc;
+ BufferDescriptor desc;
- bool needs_copy_to_device;
+ bool needs_copy_to_device;
- Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false)
- {
- }
- };
+ Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false)
+ {
+ }
+ };
- struct DeviceBuffer {
- device_only_memory<uchar> *buffer;
- vector<Allocation*> allocations;
- size_t size; /* Size of all allocations. */
+ struct DeviceBuffer {
+ device_only_memory<uchar> *buffer;
+ vector<Allocation *> allocations;
+ size_t size; /* Size of all allocations. */
- DeviceBuffer()
- : buffer(NULL), size(0)
- {
- }
+ DeviceBuffer() : buffer(NULL), size(0)
+ {
+ }
- ~DeviceBuffer()
- {
- delete buffer;
- buffer = NULL;
- }
+ ~DeviceBuffer()
+ {
+ delete buffer;
+ buffer = NULL;
+ }
- void add_allocation(Allocation& allocation);
+ void add_allocation(Allocation &allocation);
- void update_device_memory(OpenCLDevice *device);
+ void update_device_memory(OpenCLDevice *device);
- void free(OpenCLDevice *device);
- };
+ void free(OpenCLDevice *device);
+ };
- OpenCLDevice *device;
+ OpenCLDevice *device;
- DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS];
+ DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS];
- typedef unordered_map<string, Allocation> AllocationsMap;
- AllocationsMap allocations;
+ typedef unordered_map<string, Allocation> AllocationsMap;
+ AllocationsMap allocations;
- bool need_update;
+ bool need_update;
- DeviceBuffer* smallest_device_buffer();
+ DeviceBuffer *smallest_device_buffer();
-public:
- MemoryManager(OpenCLDevice *device);
+ public:
+ MemoryManager(OpenCLDevice *device);
- void free(); /* Free all memory. */
+ void free(); /* Free all memory. */
- void alloc(const char *name, device_memory& mem);
- bool free(device_memory& mem);
+ void alloc(const char *name, device_memory &mem);
+ bool free(device_memory &mem);
- BufferDescriptor get_descriptor(string name);
+ BufferDescriptor get_descriptor(string name);
- void update_device_memory();
- void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
+ void update_device_memory();
+ void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
};
CCL_NAMESPACE_END
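The header makes the placement policy visible: MemoryManager::alloc() puts each new allocation into whichever of the NUM_DEVICE_BUFFERS arenas currently holds the least data, and records the arena index plus offset in a BufferDescriptor. A small sketch of that policy under simplified stand-in types (Arena and place_allocation are illustrative):

#include <cstddef>

enum { NUM_ARENAS = 8 }; /* Mirrors NUM_DEVICE_BUFFERS. */

struct Arena {
  size_t size; /* Sum of all allocations placed here. */
};

static Arena *smallest_arena(Arena (&arenas)[NUM_ARENAS])
{
  Arena *smallest = &arenas[0];
  for (Arena &arena : arenas) {
    if (arena.size < smallest->size) {
      smallest = &arena;
    }
  }
  return smallest;
}

/* Place `bytes` into the least-loaded arena and return its index, using the
 * same pointer-difference trick the real code stores in the descriptor. */
static int place_allocation(Arena (&arenas)[NUM_ARENAS], size_t bytes)
{
  Arena *arena = smallest_arena(arenas);
  arena->size += bytes;
  return int(arena - arenas);
}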
diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h
index 89761293638..e7bafa0b8a8 100644
--- a/intern/cycles/device/opencl/opencl.h
+++ b/intern/cycles/device/opencl/opencl.h
@@ -16,645 +16,641 @@
#ifdef WITH_OPENCL
-#include "device/device.h"
-#include "device/device_denoising.h"
-#include "device/device_split_kernel.h"
+# include "device/device.h"
+# include "device/device_denoising.h"
+# include "device/device_split_kernel.h"
-#include "util/util_map.h"
-#include "util/util_param.h"
-#include "util/util_string.h"
+# include "util/util_map.h"
+# include "util/util_param.h"
+# include "util/util_string.h"
-#include "clew.h"
+# include "clew.h"
-#include "device/opencl/memory_manager.h"
+# include "device/opencl/memory_manager.h"
CCL_NAMESPACE_BEGIN
/* Disable workarounds, seems to be working fine on latest drivers. */
-#define CYCLES_DISABLE_DRIVER_WORKAROUNDS
+# define CYCLES_DISABLE_DRIVER_WORKAROUNDS
/* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workarounds for testing */
-#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
+# ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS
/* Work around AMD driver hangs by ensuring each command is finished before doing anything else. */
-# undef clEnqueueNDRangeKernel
-# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
+# undef clEnqueueNDRangeKernel
+# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \
+ CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \
+ clFinish(a);
-# undef clEnqueueWriteBuffer
-# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
+# undef clEnqueueWriteBuffer
+# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \
+ CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \
+ clFinish(a);
-# undef clEnqueueReadBuffer
-# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
- CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
- clFinish(a);
-#endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
+# undef clEnqueueReadBuffer
+# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \
+ CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \
+ clFinish(a);
+# endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */
-#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
+# define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
struct OpenCLPlatformDevice {
- OpenCLPlatformDevice(cl_platform_id platform_id,
- const string& platform_name,
- cl_device_id device_id,
- cl_device_type device_type,
- const string& device_name,
- const string& hardware_id,
- const string& device_extensions)
- : platform_id(platform_id),
- platform_name(platform_name),
- device_id(device_id),
- device_type(device_type),
- device_name(device_name),
- hardware_id(hardware_id),
- device_extensions(device_extensions) {}
- cl_platform_id platform_id;
- string platform_name;
- cl_device_id device_id;
- cl_device_type device_type;
- string device_name;
- string hardware_id;
- string device_extensions;
+ OpenCLPlatformDevice(cl_platform_id platform_id,
+ const string &platform_name,
+ cl_device_id device_id,
+ cl_device_type device_type,
+ const string &device_name,
+ const string &hardware_id,
+ const string &device_extensions)
+ : platform_id(platform_id),
+ platform_name(platform_name),
+ device_id(device_id),
+ device_type(device_type),
+ device_name(device_name),
+ hardware_id(hardware_id),
+ device_extensions(device_extensions)
+ {
+ }
+ cl_platform_id platform_id;
+ string platform_name;
+ cl_device_id device_id;
+ cl_device_type device_type;
+ string device_name;
+ string hardware_id;
+ string device_extensions;
};
/* Contains all static OpenCL helper functions. */
-class OpenCLInfo
-{
-public:
- static cl_device_type device_type();
- static bool use_debug();
- static bool device_supported(const string& platform_name,
- const cl_device_id device_id);
- static bool platform_version_check(cl_platform_id platform,
- string *error = NULL);
- static bool device_version_check(cl_device_id device,
- string *error = NULL);
- static string get_hardware_id(const string& platform_name,
- cl_device_id device_id);
- static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
- bool force_all = false);
-
- /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
-
- /* Platform information. */
- static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
- static cl_uint get_num_platforms();
-
- static bool get_platforms(vector<cl_platform_id> *platform_ids,
- cl_int *error = NULL);
- static vector<cl_platform_id> get_platforms();
-
- static bool get_platform_name(cl_platform_id platform_id,
- string *platform_name);
- static string get_platform_name(cl_platform_id platform_id);
-
- static bool get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- cl_uint *num_devices,
- cl_int *error = NULL);
- static cl_uint get_num_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type);
-
- static bool get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type,
- vector<cl_device_id> *device_ids,
- cl_int* error = NULL);
- static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
- cl_device_type device_type);
-
- /* Device information. */
- static bool get_device_name(cl_device_id device_id,
- string *device_name,
- cl_int* error = NULL);
-
- static string get_device_name(cl_device_id device_id);
-
- static bool get_device_extensions(cl_device_id device_id,
- string *device_extensions,
- cl_int* error = NULL);
-
- static string get_device_extensions(cl_device_id device_id);
-
- static bool get_device_type(cl_device_id device_id,
- cl_device_type *device_type,
- cl_int* error = NULL);
- static cl_device_type get_device_type(cl_device_id device_id);
-
- static bool get_driver_version(cl_device_id device_id,
- int *major,
- int *minor,
- cl_int* error = NULL);
-
- static int mem_sub_ptr_alignment(cl_device_id device_id);
-
- /* Get somewhat more readable device name.
- * Main difference is AMD OpenCL here which only gives code name
- * for the regular device name. This will give more sane device
- * name using some extensions.
- */
- static string get_readable_device_name(cl_device_id device_id);
+class OpenCLInfo {
+ public:
+ static cl_device_type device_type();
+ static bool use_debug();
+ static bool device_supported(const string &platform_name, const cl_device_id device_id);
+ static bool platform_version_check(cl_platform_id platform, string *error = NULL);
+ static bool device_version_check(cl_device_id device, string *error = NULL);
+ static string get_hardware_id(const string &platform_name, cl_device_id device_id);
+ static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
+ bool force_all = false);
+
+ /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */
+
+ /* Platform information. */
+ static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL);
+ static cl_uint get_num_platforms();
+
+ static bool get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error = NULL);
+ static vector<cl_platform_id> get_platforms();
+
+ static bool get_platform_name(cl_platform_id platform_id, string *platform_name);
+ static string get_platform_name(cl_platform_id platform_id);
+
+ static bool get_num_platform_devices(cl_platform_id platform_id,
+ cl_device_type device_type,
+ cl_uint *num_devices,
+ cl_int *error = NULL);
+ static cl_uint get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type);
+
+ static bool get_platform_devices(cl_platform_id platform_id,
+ cl_device_type device_type,
+ vector<cl_device_id> *device_ids,
+ cl_int *error = NULL);
+ static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id,
+ cl_device_type device_type);
+
+ /* Device information. */
+ static bool get_device_name(cl_device_id device_id, string *device_name, cl_int *error = NULL);
+
+ static string get_device_name(cl_device_id device_id);
+
+ static bool get_device_extensions(cl_device_id device_id,
+ string *device_extensions,
+ cl_int *error = NULL);
+
+ static string get_device_extensions(cl_device_id device_id);
+
+ static bool get_device_type(cl_device_id device_id,
+ cl_device_type *device_type,
+ cl_int *error = NULL);
+ static cl_device_type get_device_type(cl_device_id device_id);
+
+ static bool get_driver_version(cl_device_id device_id,
+ int *major,
+ int *minor,
+ cl_int *error = NULL);
+
+ static int mem_sub_ptr_alignment(cl_device_id device_id);
+
+  /* Get a somewhat more readable device name.
+   * The main difference is AMD OpenCL, which only reports a code name
+   * as the regular device name; this returns a saner name using
+   * some extensions.
+ */
+ static string get_readable_device_name(cl_device_id device_id);
};
/* Thread safe cache for contexts and programs.
*/
-class OpenCLCache
-{
- struct Slot
- {
- struct ProgramEntry
- {
- ProgramEntry();
- ProgramEntry(const ProgramEntry& rhs);
- ~ProgramEntry();
- cl_program program;
- thread_mutex *mutex;
- };
-
- Slot();
- Slot(const Slot& rhs);
- ~Slot();
-
- thread_mutex *context_mutex;
- cl_context context;
- typedef map<ustring, ProgramEntry> EntryMap;
- EntryMap programs;
-
- };
-
- /* key is combination of platform ID and device ID */
- typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
-
- /* map of Slot objects */
- typedef map<PlatformDevicePair, Slot> CacheMap;
- CacheMap cache;
-
- /* MD5 hash of the kernel source. */
- string kernel_md5;
-
- thread_mutex cache_lock;
- thread_mutex kernel_md5_lock;
-
- /* lazy instantiate */
- static OpenCLCache& global_instance();
-
-public:
-
- enum ProgramName {
- OCL_DEV_BASE_PROGRAM,
- OCL_DEV_MEGAKERNEL_PROGRAM,
- };
-
- /* Lookup context in the cache. If this returns NULL, slot_locker
- * will be holding a lock for the cache. slot_locker should refer to a
- * default constructed thread_scoped_lock. */
- static cl_context get_context(cl_platform_id platform,
- cl_device_id device,
- thread_scoped_lock& slot_locker);
- /* Same as above. */
- static cl_program get_program(cl_platform_id platform,
- cl_device_id device,
- ustring key,
- thread_scoped_lock& slot_locker);
-
- /* Store context in the cache. You MUST have tried to get the item before storing to it. */
- static void store_context(cl_platform_id platform,
- cl_device_id device,
- cl_context context,
- thread_scoped_lock& slot_locker);
- /* Same as above. */
- static void store_program(cl_platform_id platform,
- cl_device_id device,
- cl_program program,
- ustring key,
- thread_scoped_lock& slot_locker);
-
- static string get_kernel_md5();
+class OpenCLCache {
+ struct Slot {
+ struct ProgramEntry {
+ ProgramEntry();
+ ProgramEntry(const ProgramEntry &rhs);
+ ~ProgramEntry();
+ cl_program program;
+ thread_mutex *mutex;
+ };
+
+ Slot();
+ Slot(const Slot &rhs);
+ ~Slot();
+
+ thread_mutex *context_mutex;
+ cl_context context;
+ typedef map<ustring, ProgramEntry> EntryMap;
+ EntryMap programs;
+ };
+
+ /* key is combination of platform ID and device ID */
+ typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
+
+ /* map of Slot objects */
+ typedef map<PlatformDevicePair, Slot> CacheMap;
+ CacheMap cache;
+
+ /* MD5 hash of the kernel source. */
+ string kernel_md5;
+
+ thread_mutex cache_lock;
+ thread_mutex kernel_md5_lock;
+
+ /* lazy instantiate */
+ static OpenCLCache &global_instance();
+
+ public:
+ enum ProgramName {
+ OCL_DEV_BASE_PROGRAM,
+ OCL_DEV_MEGAKERNEL_PROGRAM,
+ };
+
+ /* Lookup context in the cache. If this returns NULL, slot_locker
+ * will be holding a lock for the cache. slot_locker should refer to a
+ * default constructed thread_scoped_lock. */
+ static cl_context get_context(cl_platform_id platform,
+ cl_device_id device,
+ thread_scoped_lock &slot_locker);
+ /* Same as above. */
+ static cl_program get_program(cl_platform_id platform,
+ cl_device_id device,
+ ustring key,
+ thread_scoped_lock &slot_locker);
+
+ /* Store context in the cache. You MUST have tried to get the item before storing to it. */
+ static void store_context(cl_platform_id platform,
+ cl_device_id device,
+ cl_context context,
+ thread_scoped_lock &slot_locker);
+ /* Same as above. */
+ static void store_program(cl_platform_id platform,
+ cl_device_id device,
+ cl_program program,
+ ustring key,
+ thread_scoped_lock &slot_locker);
+
+ static string get_kernel_md5();
};
-#define opencl_device_assert(device, stmt) \
- { \
- cl_int err = stmt; \
- \
- if(err != CL_SUCCESS) { \
- string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
- if((device)->error_message() == "") \
- (device)->set_error(message); \
- fprintf(stderr, "%s\n", message.c_str()); \
- } \
- } (void) 0
-
-#define opencl_assert(stmt) \
- { \
- cl_int err = stmt; \
- \
- if(err != CL_SUCCESS) { \
- string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
- if(error_msg == "") \
- error_msg = message; \
- fprintf(stderr, "%s\n", message.c_str()); \
- } \
- } (void) 0
-
-class OpenCLDevice : public Device
-{
-public:
- DedicatedTaskPool task_pool;
-
- /* Task pool for required kernels (base, AO kernels during foreground rendering) */
- TaskPool load_required_kernel_task_pool;
- /* Task pool for optional kernels (feature kernels during foreground rendering) */
- TaskPool load_kernel_task_pool;
- cl_context cxContext;
- cl_command_queue cqCommandQueue;
- cl_platform_id cpPlatform;
- cl_device_id cdDevice;
- cl_int ciErr;
- int device_num;
- bool use_preview_kernels;
-
- class OpenCLProgram {
- public:
- OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL) {}
- OpenCLProgram(OpenCLDevice *device,
- const string& program_name,
- const string& kernel_name,
- const string& kernel_build_options,
- bool use_stdout = true);
- ~OpenCLProgram();
-
- void add_kernel(ustring name);
-
- /* Try to load the program from device cache or disk */
- bool load();
- /* Compile the kernel (first separate, failback to local) */
- void compile();
- /* Create the OpenCL kernels after loading or compiling */
- void create_kernels();
-
- bool is_loaded() const { return loaded; }
- const string& get_log() const { return log; }
- void report_error();
-
- /* Wait until this kernel is available to be used
- * It will return true when the kernel is available.
- * It will return false when the kernel is not available
- * or could not be loaded. */
- bool wait_for_availability();
-
- cl_kernel operator()();
- cl_kernel operator()(ustring name);
-
- void release();
-
- private:
- bool build_kernel(const string *debug_src);
- /* Build the program by calling the own process.
- * This is required for multithreaded OpenCL compilation, since most Frameworks serialize
- * build calls internally if they come from the same process.
- * If that is not supported, this function just returns false.
- */
- bool compile_separate(const string& clbin);
- /* Build the program by calling OpenCL directly. */
- bool compile_kernel(const string *debug_src);
- /* Loading and saving the program from/to disk. */
- bool load_binary(const string& clbin, const string *debug_src = NULL);
- bool save_binary(const string& clbin);
-
- void add_log(const string& msg, bool is_debug);
- void add_error(const string& msg);
-
- bool loaded;
- bool needs_compiling;
-
- cl_program program;
- OpenCLDevice *device;
-
- /* Used for the OpenCLCache key. */
- string program_name;
-
- string kernel_file, kernel_build_options, device_md5;
-
- bool use_stdout;
- string log, error_msg;
- string compile_output;
-
- map<ustring, cl_kernel> kernels;
- };
-
- /* Container for all types of split programs. */
- class OpenCLSplitPrograms {
- public:
- OpenCLDevice *device;
- OpenCLProgram program_split;
- OpenCLProgram program_lamp_emission;
- OpenCLProgram program_do_volume;
- OpenCLProgram program_indirect_background;
- OpenCLProgram program_shader_eval;
- OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
- OpenCLProgram program_subsurface_scatter;
- OpenCLProgram program_direct_lighting;
- OpenCLProgram program_shadow_blocked_ao;
- OpenCLProgram program_shadow_blocked_dl;
-
- OpenCLSplitPrograms(OpenCLDevice *device);
- ~OpenCLSplitPrograms();
-
- /* Load the kernels and put the created kernels in the given `programs`
- * paramter. */
- void load_kernels(vector<OpenCLProgram*> &programs,
- const DeviceRequestedFeatures& requested_features,
- bool is_preview=false);
- };
-
- DeviceSplitKernel *split_kernel;
-
- OpenCLProgram base_program;
- OpenCLProgram bake_program;
- OpenCLProgram displace_program;
- OpenCLProgram background_program;
- OpenCLProgram denoising_program;
-
- OpenCLSplitPrograms kernel_programs;
- OpenCLSplitPrograms preview_programs;
-
- typedef map<string, device_vector<uchar>*> ConstMemMap;
- typedef map<string, device_ptr> MemMap;
-
- ConstMemMap const_mem_map;
- MemMap mem_map;
- device_ptr null_mem;
-
- bool device_initialized;
- string platform_name;
- string device_name;
-
- bool opencl_error(cl_int err);
- void opencl_error(const string& message);
- void opencl_assert_err(cl_int err, const char* where);
-
- OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background);
- ~OpenCLDevice();
-
- static void CL_CALLBACK context_notify_callback(const char *err_info,
- const void * /*private_info*/, size_t /*cb*/, void *user_data);
-
- bool opencl_version_check();
- OpenCLSplitPrograms* get_split_programs();
-
- string device_md5_hash(string kernel_custom_build_options = "");
- bool load_kernels(const DeviceRequestedFeatures& requested_features);
- void load_required_kernels(const DeviceRequestedFeatures& requested_features);
- void load_preview_kernels();
-
- bool wait_for_availability(const DeviceRequestedFeatures& requested_features);
- DeviceKernelStatus get_active_kernel_switch_state();
-
- /* Get the name of the opencl program for the given kernel */
- const string get_opencl_program_name(const string& kernel_name);
- /* Get the program file name to compile (*.cl) for the given kernel */
- const string get_opencl_program_filename(const string& kernel_name);
- string get_build_options(const DeviceRequestedFeatures& requested_features,
- const string& opencl_program_name,
- bool preview_kernel=false);
- /* Enable the default features to reduce recompilation events */
- void enable_default_features(DeviceRequestedFeatures& features);
-
- void mem_alloc(device_memory& mem);
- void mem_copy_to(device_memory& mem);
- void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
- void mem_zero(device_memory& mem);
- void mem_free(device_memory& mem);
-
- int mem_sub_ptr_alignment();
-
- void const_copy_to(const char *name, void *host, size_t size);
- void tex_alloc(device_memory& mem);
- void tex_free(device_memory& mem);
-
- size_t global_size_round_up(int group_size, int global_size);
- void enqueue_kernel(cl_kernel kernel, size_t w, size_t h,
- bool x_workgroups = false,
- size_t max_workgroup_size = -1);
- void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
- void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
-
- void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
- void shader(DeviceTask& task);
-
- void denoise(RenderTile& tile, DenoisingTask& denoising);
-
- class OpenCLDeviceTask : public DeviceTask {
- public:
- OpenCLDeviceTask(OpenCLDevice *device, DeviceTask& task)
- : DeviceTask(task)
- {
- run = function_bind(&OpenCLDevice::thread_run,
- device,
- this);
- }
- };
-
- int get_split_task_count(DeviceTask& /*task*/)
- {
- return 1;
- }
-
- void task_add(DeviceTask& task)
- {
- task_pool.push(new OpenCLDeviceTask(this, task));
- }
-
- void task_wait()
- {
- task_pool.wait();
- }
-
- void task_cancel()
- {
- task_pool.cancel();
- }
-
- void thread_run(DeviceTask *task);
-
- virtual BVHLayoutMask get_bvh_layout_mask() const {
- return BVH_LAYOUT_BVH2;
- }
-
- virtual bool show_samples() const {
- return true;
- }
-
-
-protected:
- string kernel_build_options(const string *debug_src = NULL);
-
- void mem_zero_kernel(device_ptr ptr, size_t size);
-
- bool denoising_non_local_means(device_ptr image_ptr,
- device_ptr guide_ptr,
- device_ptr variance_ptr,
- device_ptr out_ptr,
- DenoisingTask *task);
- bool denoising_construct_transform(DenoisingTask *task);
- bool denoising_accumulate(device_ptr color_ptr,
- device_ptr color_variance_ptr,
- device_ptr scale_ptr,
- int frame,
- DenoisingTask *task);
- bool denoising_solve(device_ptr output_ptr,
- DenoisingTask *task);
- bool denoising_combine_halves(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- int r, int4 rect,
- DenoisingTask *task);
- bool denoising_divide_shadow(device_ptr a_ptr,
- device_ptr b_ptr,
- device_ptr sample_variance_ptr,
- device_ptr sv_variance_ptr,
- device_ptr buffer_variance_ptr,
- DenoisingTask *task);
- bool denoising_get_feature(int mean_offset,
- int variance_offset,
- device_ptr mean_ptr,
- device_ptr variance_ptr,
- float scale,
- DenoisingTask *task);
- bool denoising_write_feature(int to_offset,
- device_ptr from_ptr,
- device_ptr buffer_ptr,
- DenoisingTask *task);
- bool denoising_detect_outliers(device_ptr image_ptr,
- device_ptr variance_ptr,
- device_ptr depth_ptr,
- device_ptr output_ptr,
- DenoisingTask *task);
-
- device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size);
- void mem_free_sub_ptr(device_ptr ptr);
-
- class ArgumentWrapper {
- public:
- ArgumentWrapper() : size(0), pointer(NULL)
- {
- }
-
- ArgumentWrapper(device_memory& argument) : size(sizeof(void*)),
- pointer((void*)(&argument.device_pointer))
- {
- }
-
- template<typename T>
- ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)),
- pointer((void*)(&argument.device_pointer))
- {
- }
-
- template<typename T>
- ArgumentWrapper(device_only_memory<T>& argument) : size(sizeof(void*)),
- pointer((void*)(&argument.device_pointer))
- {
- }
- template<typename T>
- ArgumentWrapper(T& argument) : size(sizeof(argument)),
- pointer(&argument)
- {
- }
-
- ArgumentWrapper(int argument) : size(sizeof(int)),
- int_value(argument),
- pointer(&int_value)
- {
- }
-
- ArgumentWrapper(float argument) : size(sizeof(float)),
- float_value(argument),
- pointer(&float_value)
- {
- }
-
- size_t size;
- int int_value;
- float float_value;
- void *pointer;
- };
-
- /* TODO(sergey): In the future we can use variadic templates, once
- * C++0x is allowed. Should allow to clean this up a bit.
- */
- int kernel_set_args(cl_kernel kernel,
- int start_argument_index,
- const ArgumentWrapper& arg1 = ArgumentWrapper(),
- const ArgumentWrapper& arg2 = ArgumentWrapper(),
- const ArgumentWrapper& arg3 = ArgumentWrapper(),
- const ArgumentWrapper& arg4 = ArgumentWrapper(),
- const ArgumentWrapper& arg5 = ArgumentWrapper(),
- const ArgumentWrapper& arg6 = ArgumentWrapper(),
- const ArgumentWrapper& arg7 = ArgumentWrapper(),
- const ArgumentWrapper& arg8 = ArgumentWrapper(),
- const ArgumentWrapper& arg9 = ArgumentWrapper(),
- const ArgumentWrapper& arg10 = ArgumentWrapper(),
- const ArgumentWrapper& arg11 = ArgumentWrapper(),
- const ArgumentWrapper& arg12 = ArgumentWrapper(),
- const ArgumentWrapper& arg13 = ArgumentWrapper(),
- const ArgumentWrapper& arg14 = ArgumentWrapper(),
- const ArgumentWrapper& arg15 = ArgumentWrapper(),
- const ArgumentWrapper& arg16 = ArgumentWrapper(),
- const ArgumentWrapper& arg17 = ArgumentWrapper(),
- const ArgumentWrapper& arg18 = ArgumentWrapper(),
- const ArgumentWrapper& arg19 = ArgumentWrapper(),
- const ArgumentWrapper& arg20 = ArgumentWrapper(),
- const ArgumentWrapper& arg21 = ArgumentWrapper(),
- const ArgumentWrapper& arg22 = ArgumentWrapper(),
- const ArgumentWrapper& arg23 = ArgumentWrapper(),
- const ArgumentWrapper& arg24 = ArgumentWrapper(),
- const ArgumentWrapper& arg25 = ArgumentWrapper(),
- const ArgumentWrapper& arg26 = ArgumentWrapper(),
- const ArgumentWrapper& arg27 = ArgumentWrapper(),
- const ArgumentWrapper& arg28 = ArgumentWrapper(),
- const ArgumentWrapper& arg29 = ArgumentWrapper(),
- const ArgumentWrapper& arg30 = ArgumentWrapper(),
- const ArgumentWrapper& arg31 = ArgumentWrapper(),
- const ArgumentWrapper& arg32 = ArgumentWrapper(),
- const ArgumentWrapper& arg33 = ArgumentWrapper());
-
- void release_kernel_safe(cl_kernel kernel);
- void release_mem_object_safe(cl_mem mem);
- void release_program_safe(cl_program program);
-
- /* ** Those guys are for workign around some compiler-specific bugs ** */
-
- cl_program load_cached_kernel(
- ustring key,
- thread_scoped_lock& cache_locker);
-
- void store_cached_kernel(
- cl_program program,
- ustring key,
- thread_scoped_lock& cache_locker);
-
-private:
- MemoryManager memory_manager;
- friend class MemoryManager;
-
- static_assert_align(TextureInfo, 16);
- device_vector<TextureInfo> texture_info;
-
- typedef map<string, device_memory*> TexturesMap;
- TexturesMap textures;
-
- bool textures_need_update;
-
-protected:
- void flush_texture_buffers();
-
- friend class OpenCLSplitKernel;
- friend class OpenCLSplitKernelFunction;
+# define opencl_device_assert(device, stmt) \
+ { \
+ cl_int err = stmt; \
+\
+ if (err != CL_SUCCESS) { \
+ string message = string_printf( \
+ "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
+ if ((device)->error_message() == "") \
+ (device)->set_error(message); \
+ fprintf(stderr, "%s\n", message.c_str()); \
+ } \
+ } \
+ (void)0
+
+# define opencl_assert(stmt) \
+ { \
+ cl_int err = stmt; \
+\
+ if (err != CL_SUCCESS) { \
+ string message = string_printf( \
+ "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
+ if (error_msg == "") \
+ error_msg = message; \
+ fprintf(stderr, "%s\n", message.c_str()); \
+ } \
+ } \
+ (void)0
+
+class OpenCLDevice : public Device {
+ public:
+ DedicatedTaskPool task_pool;
+
+ /* Task pool for required kernels (base, AO kernels during foreground rendering) */
+ TaskPool load_required_kernel_task_pool;
+ /* Task pool for optional kernels (feature kernels during foreground rendering) */
+ TaskPool load_kernel_task_pool;
+ cl_context cxContext;
+ cl_command_queue cqCommandQueue;
+ cl_platform_id cpPlatform;
+ cl_device_id cdDevice;
+ cl_int ciErr;
+ int device_num;
+ bool use_preview_kernels;
+
+ class OpenCLProgram {
+ public:
+ OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL)
+ {
+ }
+ OpenCLProgram(OpenCLDevice *device,
+ const string &program_name,
+ const string &kernel_name,
+ const string &kernel_build_options,
+ bool use_stdout = true);
+ ~OpenCLProgram();
+
+ void add_kernel(ustring name);
+
+ /* Try to load the program from device cache or disk */
+ bool load();
+    /* Compile the kernel (first in a separate process, falling back to a local build) */
+ void compile();
+ /* Create the OpenCL kernels after loading or compiling */
+ void create_kernels();
+
+ bool is_loaded() const
+ {
+ return loaded;
+ }
+ const string &get_log() const
+ {
+ return log;
+ }
+ void report_error();
+
+    /* Wait until this kernel is available to be used.
+     * Returns true when the kernel is available, and false when it
+     * is not available or could not be loaded. */
+ bool wait_for_availability();
+
+ cl_kernel operator()();
+ cl_kernel operator()(ustring name);
+
+ void release();
+
+ private:
+ bool build_kernel(const string *debug_src);
+    /* Build the program by invoking a separate process.
+     * This is required for multithreaded OpenCL compilation, since most frameworks serialize
+ * build calls internally if they come from the same process.
+ * If that is not supported, this function just returns false.
+ */
+ bool compile_separate(const string &clbin);
+ /* Build the program by calling OpenCL directly. */
+ bool compile_kernel(const string *debug_src);
+ /* Loading and saving the program from/to disk. */
+ bool load_binary(const string &clbin, const string *debug_src = NULL);
+ bool save_binary(const string &clbin);
+
+ void add_log(const string &msg, bool is_debug);
+ void add_error(const string &msg);
+
+ bool loaded;
+ bool needs_compiling;
+
+ cl_program program;
+ OpenCLDevice *device;
+
+ /* Used for the OpenCLCache key. */
+ string program_name;
+
+ string kernel_file, kernel_build_options, device_md5;
+
+ bool use_stdout;
+ string log, error_msg;
+ string compile_output;
+
+ map<ustring, cl_kernel> kernels;
+ };
+
+ /* Container for all types of split programs. */
+ class OpenCLSplitPrograms {
+ public:
+ OpenCLDevice *device;
+ OpenCLProgram program_split;
+ OpenCLProgram program_lamp_emission;
+ OpenCLProgram program_do_volume;
+ OpenCLProgram program_indirect_background;
+ OpenCLProgram program_shader_eval;
+ OpenCLProgram program_holdout_emission_blurring_pathtermination_ao;
+ OpenCLProgram program_subsurface_scatter;
+ OpenCLProgram program_direct_lighting;
+ OpenCLProgram program_shadow_blocked_ao;
+ OpenCLProgram program_shadow_blocked_dl;
+
+ OpenCLSplitPrograms(OpenCLDevice *device);
+ ~OpenCLSplitPrograms();
+
+ /* Load the kernels and put the created kernels in the given `programs`
+     * parameter. */
+ void load_kernels(vector<OpenCLProgram *> &programs,
+ const DeviceRequestedFeatures &requested_features,
+ bool is_preview = false);
+ };
+
+ DeviceSplitKernel *split_kernel;
+
+ OpenCLProgram base_program;
+ OpenCLProgram bake_program;
+ OpenCLProgram displace_program;
+ OpenCLProgram background_program;
+ OpenCLProgram denoising_program;
+
+ OpenCLSplitPrograms kernel_programs;
+ OpenCLSplitPrograms preview_programs;
+
+ typedef map<string, device_vector<uchar> *> ConstMemMap;
+ typedef map<string, device_ptr> MemMap;
+
+ ConstMemMap const_mem_map;
+ MemMap mem_map;
+ device_ptr null_mem;
+
+ bool device_initialized;
+ string platform_name;
+ string device_name;
+
+ bool opencl_error(cl_int err);
+ void opencl_error(const string &message);
+ void opencl_assert_err(cl_int err, const char *where);
+
+ OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background);
+ ~OpenCLDevice();
+
+ static void CL_CALLBACK context_notify_callback(const char *err_info,
+ const void * /*private_info*/,
+ size_t /*cb*/,
+ void *user_data);
+
+ bool opencl_version_check();
+ OpenCLSplitPrograms *get_split_programs();
+
+ string device_md5_hash(string kernel_custom_build_options = "");
+ bool load_kernels(const DeviceRequestedFeatures &requested_features);
+ void load_required_kernels(const DeviceRequestedFeatures &requested_features);
+ void load_preview_kernels();
+
+ bool wait_for_availability(const DeviceRequestedFeatures &requested_features);
+ DeviceKernelStatus get_active_kernel_switch_state();
+
+  /* Get the name of the OpenCL program for the given kernel */
+ const string get_opencl_program_name(const string &kernel_name);
+ /* Get the program file name to compile (*.cl) for the given kernel */
+ const string get_opencl_program_filename(const string &kernel_name);
+ string get_build_options(const DeviceRequestedFeatures &requested_features,
+ const string &opencl_program_name,
+ bool preview_kernel = false);
+ /* Enable the default features to reduce recompilation events */
+ void enable_default_features(DeviceRequestedFeatures &features);
+
+ void mem_alloc(device_memory &mem);
+ void mem_copy_to(device_memory &mem);
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem);
+ void mem_zero(device_memory &mem);
+ void mem_free(device_memory &mem);
+
+ int mem_sub_ptr_alignment();
+
+ void const_copy_to(const char *name, void *host, size_t size);
+ void tex_alloc(device_memory &mem);
+ void tex_free(device_memory &mem);
+
+ size_t global_size_round_up(int group_size, int global_size);
+ void enqueue_kernel(cl_kernel kernel,
+ size_t w,
+ size_t h,
+ bool x_workgroups = false,
+ size_t max_workgroup_size = -1);
+ void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
+ void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
+
+ void film_convert(DeviceTask &task,
+ device_ptr buffer,
+ device_ptr rgba_byte,
+ device_ptr rgba_half);
+ void shader(DeviceTask &task);
+
+ void denoise(RenderTile &tile, DenoisingTask &denoising);
+
+ class OpenCLDeviceTask : public DeviceTask {
+ public:
+ OpenCLDeviceTask(OpenCLDevice *device, DeviceTask &task) : DeviceTask(task)
+ {
+ run = function_bind(&OpenCLDevice::thread_run, device, this);
+ }
+ };
+
+ int get_split_task_count(DeviceTask & /*task*/)
+ {
+ return 1;
+ }
+
+ void task_add(DeviceTask &task)
+ {
+ task_pool.push(new OpenCLDeviceTask(this, task));
+ }
+
+ void task_wait()
+ {
+ task_pool.wait();
+ }
+
+ void task_cancel()
+ {
+ task_pool.cancel();
+ }
+
+ void thread_run(DeviceTask *task);
+
+ virtual BVHLayoutMask get_bvh_layout_mask() const
+ {
+ return BVH_LAYOUT_BVH2;
+ }
+
+ virtual bool show_samples() const
+ {
+ return true;
+ }
+
+ protected:
+ string kernel_build_options(const string *debug_src = NULL);
+
+ void mem_zero_kernel(device_ptr ptr, size_t size);
+
+ bool denoising_non_local_means(device_ptr image_ptr,
+ device_ptr guide_ptr,
+ device_ptr variance_ptr,
+ device_ptr out_ptr,
+ DenoisingTask *task);
+ bool denoising_construct_transform(DenoisingTask *task);
+ bool denoising_accumulate(device_ptr color_ptr,
+ device_ptr color_variance_ptr,
+ device_ptr scale_ptr,
+ int frame,
+ DenoisingTask *task);
+ bool denoising_solve(device_ptr output_ptr, DenoisingTask *task);
+ bool denoising_combine_halves(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ int r,
+ int4 rect,
+ DenoisingTask *task);
+ bool denoising_divide_shadow(device_ptr a_ptr,
+ device_ptr b_ptr,
+ device_ptr sample_variance_ptr,
+ device_ptr sv_variance_ptr,
+ device_ptr buffer_variance_ptr,
+ DenoisingTask *task);
+ bool denoising_get_feature(int mean_offset,
+ int variance_offset,
+ device_ptr mean_ptr,
+ device_ptr variance_ptr,
+ float scale,
+ DenoisingTask *task);
+ bool denoising_write_feature(int to_offset,
+ device_ptr from_ptr,
+ device_ptr buffer_ptr,
+ DenoisingTask *task);
+ bool denoising_detect_outliers(device_ptr image_ptr,
+ device_ptr variance_ptr,
+ device_ptr depth_ptr,
+ device_ptr output_ptr,
+ DenoisingTask *task);
+
+ device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int size);
+ void mem_free_sub_ptr(device_ptr ptr);
+
+ class ArgumentWrapper {
+ public:
+ ArgumentWrapper() : size(0), pointer(NULL)
+ {
+ }
+
+ ArgumentWrapper(device_memory &argument)
+ : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
+ {
+ }
+
+ template<typename T>
+ ArgumentWrapper(device_vector<T> &argument)
+ : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
+ {
+ }
+
+ template<typename T>
+ ArgumentWrapper(device_only_memory<T> &argument)
+ : size(sizeof(void *)), pointer((void *)(&argument.device_pointer))
+ {
+ }
+ template<typename T> ArgumentWrapper(T &argument) : size(sizeof(argument)), pointer(&argument)
+ {
+ }
+
+ ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), pointer(&int_value)
+ {
+ }
+
+ ArgumentWrapper(float argument)
+ : size(sizeof(float)), float_value(argument), pointer(&float_value)
+ {
+ }
+
+ size_t size;
+ int int_value;
+ float float_value;
+ void *pointer;
+ };
+
+ /* TODO(sergey): In the future we can use variadic templates, once
+ * C++0x is allowed. Should allow to clean this up a bit.
+ */
+ int kernel_set_args(cl_kernel kernel,
+ int start_argument_index,
+ const ArgumentWrapper &arg1 = ArgumentWrapper(),
+ const ArgumentWrapper &arg2 = ArgumentWrapper(),
+ const ArgumentWrapper &arg3 = ArgumentWrapper(),
+ const ArgumentWrapper &arg4 = ArgumentWrapper(),
+ const ArgumentWrapper &arg5 = ArgumentWrapper(),
+ const ArgumentWrapper &arg6 = ArgumentWrapper(),
+ const ArgumentWrapper &arg7 = ArgumentWrapper(),
+ const ArgumentWrapper &arg8 = ArgumentWrapper(),
+ const ArgumentWrapper &arg9 = ArgumentWrapper(),
+ const ArgumentWrapper &arg10 = ArgumentWrapper(),
+ const ArgumentWrapper &arg11 = ArgumentWrapper(),
+ const ArgumentWrapper &arg12 = ArgumentWrapper(),
+ const ArgumentWrapper &arg13 = ArgumentWrapper(),
+ const ArgumentWrapper &arg14 = ArgumentWrapper(),
+ const ArgumentWrapper &arg15 = ArgumentWrapper(),
+ const ArgumentWrapper &arg16 = ArgumentWrapper(),
+ const ArgumentWrapper &arg17 = ArgumentWrapper(),
+ const ArgumentWrapper &arg18 = ArgumentWrapper(),
+ const ArgumentWrapper &arg19 = ArgumentWrapper(),
+ const ArgumentWrapper &arg20 = ArgumentWrapper(),
+ const ArgumentWrapper &arg21 = ArgumentWrapper(),
+ const ArgumentWrapper &arg22 = ArgumentWrapper(),
+ const ArgumentWrapper &arg23 = ArgumentWrapper(),
+ const ArgumentWrapper &arg24 = ArgumentWrapper(),
+ const ArgumentWrapper &arg25 = ArgumentWrapper(),
+ const ArgumentWrapper &arg26 = ArgumentWrapper(),
+ const ArgumentWrapper &arg27 = ArgumentWrapper(),
+ const ArgumentWrapper &arg28 = ArgumentWrapper(),
+ const ArgumentWrapper &arg29 = ArgumentWrapper(),
+ const ArgumentWrapper &arg30 = ArgumentWrapper(),
+ const ArgumentWrapper &arg31 = ArgumentWrapper(),
+ const ArgumentWrapper &arg32 = ArgumentWrapper(),
+ const ArgumentWrapper &arg33 = ArgumentWrapper());
+
+ void release_kernel_safe(cl_kernel kernel);
+ void release_mem_object_safe(cl_mem mem);
+ void release_program_safe(cl_program program);
+
+  /* ** These helpers are for working around some compiler-specific bugs ** */
+
+ cl_program load_cached_kernel(ustring key, thread_scoped_lock &cache_locker);
+
+ void store_cached_kernel(cl_program program, ustring key, thread_scoped_lock &cache_locker);
+
+ private:
+ MemoryManager memory_manager;
+ friend class MemoryManager;
+
+ static_assert_align(TextureInfo, 16);
+ device_vector<TextureInfo> texture_info;
+
+ typedef map<string, device_memory *> TexturesMap;
+ TexturesMap textures;
+
+ bool textures_need_update;
+
+ protected:
+ void flush_texture_buffers();
+
+ friend class OpenCLSplitKernel;
+ friend class OpenCLSplitKernelFunction;
};
-Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background);
+Device *opencl_create_split_device(DeviceInfo &info,
+ Stats &stats,
+ Profiler &profiler,
+ bool background);
CCL_NAMESPACE_END
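One idiom in opencl.h worth calling out is ArgumentWrapper: kernel_set_args() reduces every argument, scalar or device buffer, to the (size, pointer) pair that clSetKernelArg() consumes, copying by-value scalars into the wrapper so the pointer stays valid for the call. A trimmed sketch of the same idea (Arg is an illustrative stand-in, not the full 33-argument Cycles helper):

#include <cstddef>

struct Arg {
  Arg() : size(0), pointer(NULL) {}
  /* Scalars are copied into the wrapper; `pointer` refers to that copy. */
  Arg(int v) : size(sizeof(int)), int_value(v), pointer(&int_value) {}
  Arg(float v) : size(sizeof(float)), float_value(v), pointer(&float_value) {}
  /* Anything else is passed by address, e.g. a cl_mem handle. */
  template<typename T> Arg(T &v) : size(sizeof(T)), pointer(&v) {}

  size_t size;
  int int_value;
  float float_value;
  const void *pointer;
};

/* Usage, assuming a valid cl_kernel `kernel`:
 *   Arg w(width), h(height);
 *   clSetKernelArg(kernel, 0, w.size, w.pointer);
 *   clSetKernelArg(kernel, 1, h.size, h.pointer);
 */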
diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp
index 489d10b7087..70b1a643044 100644
--- a/intern/cycles/device/opencl/opencl_split.cpp
+++ b/intern/cycles/device/opencl/opencl_split.cpp
@@ -16,273 +16,278 @@
#ifdef WITH_OPENCL
-#include "device/opencl/opencl.h"
+# include "device/opencl/opencl.h"
-#include "kernel/kernel_types.h"
-#include "kernel/split/kernel_split_data_types.h"
+# include "kernel/kernel_types.h"
+# include "kernel/split/kernel_split_data_types.h"
-#include "util/util_algorithm.h"
-#include "util/util_debug.h"
-#include "util/util_foreach.h"
-#include "util/util_logging.h"
-#include "util/util_md5.h"
-#include "util/util_path.h"
-#include "util/util_time.h"
+# include "util/util_algorithm.h"
+# include "util/util_debug.h"
+# include "util/util_foreach.h"
+# include "util/util_logging.h"
+# include "util/util_md5.h"
+# include "util/util_path.h"
+# include "util/util_time.h"
CCL_NAMESPACE_BEGIN
struct texture_slot_t {
- texture_slot_t(const string& name, int slot)
- : name(name),
- slot(slot) {
- }
- string name;
- int slot;
+ texture_slot_t(const string &name, int slot) : name(name), slot(slot)
+ {
+ }
+ string name;
+ int slot;
};
static const string NON_SPLIT_KERNELS =
- "denoising "
- "base "
- "background "
- "displace ";
+ "denoising "
+ "base "
+ "background "
+ "displace ";
static const string SPLIT_BUNDLE_KERNELS =
- "data_init "
- "path_init "
- "state_buffer_size "
- "scene_intersect "
- "queue_enqueue "
- "shader_setup "
- "shader_sort "
- "enqueue_inactive "
- "next_iteration_setup "
- "indirect_subsurface "
- "buffer_update";
-
-const string OpenCLDevice::get_opencl_program_name(const string& kernel_name)
+ "data_init "
+ "path_init "
+ "state_buffer_size "
+ "scene_intersect "
+ "queue_enqueue "
+ "shader_setup "
+ "shader_sort "
+ "enqueue_inactive "
+ "next_iteration_setup "
+ "indirect_subsurface "
+ "buffer_update";
+
+const string OpenCLDevice::get_opencl_program_name(const string &kernel_name)
{
- if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) {
- return kernel_name;
- }
- else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
- return "split_bundle";
- }
- else {
- return "split_" + kernel_name;
- }
+ if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) {
+ return kernel_name;
+ }
+ else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
+ return "split_bundle";
+ }
+ else {
+ return "split_" + kernel_name;
+ }
}
-const string OpenCLDevice::get_opencl_program_filename(const string& kernel_name)
+const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name)
{
- if (kernel_name == "denoising") {
- return "filter.cl";
- }
- else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
- return "kernel_split_bundle.cl";
- }
- else {
- return "kernel_" + kernel_name + ".cl";
- }
+ if (kernel_name == "denoising") {
+ return "filter.cl";
+ }
+ else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
+ return "kernel_split_bundle.cl";
+ }
+ else {
+ return "kernel_" + kernel_name + ".cl";
+ }
}
/* Enable features that we always want to compile to reduce recompilation events */
-void OpenCLDevice::enable_default_features(DeviceRequestedFeatures& features)
+void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features)
{
- features.use_transparent = true;
- features.use_shadow_tricks = true;
- features.use_principled = true;
- features.use_denoising = true;
-
- if (!background)
- {
- features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
- features.nodes_features = NODE_FEATURE_ALL;
- features.use_hair = true;
- features.use_subsurface = true;
- features.use_camera_motion = false;
- features.use_object_motion = false;
- }
+ features.use_transparent = true;
+ features.use_shadow_tricks = true;
+ features.use_principled = true;
+ features.use_denoising = true;
+
+ if (!background) {
+ features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
+ features.nodes_features = NODE_FEATURE_ALL;
+ features.use_hair = true;
+ features.use_subsurface = true;
+ features.use_camera_motion = false;
+ features.use_object_motion = false;
+ }
}
-string OpenCLDevice::get_build_options(const DeviceRequestedFeatures& requested_features, const string& opencl_program_name, bool preview_kernel)
+string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features,
+ const string &opencl_program_name,
+ bool preview_kernel)
{
- /* first check for non-split kernel programs */
- if (opencl_program_name == "base" || opencl_program_name == "denoising") {
- return "";
- }
- else if (opencl_program_name == "bake") {
- /* Note: get_build_options for bake is only requested when baking is enabled.
- * displace and background are always requested.
- * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_denoising = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_hair = true;
- features.use_subsurface = true;
- features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
- features.nodes_features = NODE_FEATURE_ALL;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
- else if (opencl_program_name == "displace") {
- /* As displacement does not use any nodes from the Shading group (eg BSDF).
- * We disable all features that are related to shading. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_denoising = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_baking = false;
- features.use_transparent = false;
- features.use_shadow_tricks = false;
- features.use_subsurface = false;
- features.use_volume = false;
- features.nodes_features &= ~NODE_FEATURE_VOLUME;
- features.use_denoising = false;
- features.use_principled = false;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
- else if (opencl_program_name == "background") {
- /* Background uses Background shading
- * It is save to disable shadow features, subsurface and volumetric. */
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
- features.use_baking = false;
- features.use_object_motion = false;
- features.use_camera_motion = false;
- features.use_transparent = false;
- features.use_shadow_tricks = false;
- features.use_denoising = false;
- /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node.
- * Perhaps we should remove them in UI as it does not make any sense when
- * rendering background. */
- features.nodes_features &= ~NODE_FEATURE_VOLUME;
- features.use_subsurface = false;
- features.use_volume = false;
- features.use_shader_raytrace = false;
- features.use_patch_evaluation = false;
- features.use_integrator_branched = false;
- return features.get_build_options();
- }
-
- string build_options = "-D__SPLIT_KERNEL__ ";
- /* Set compute device build option. */
- cl_device_type device_type;
- OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr);
- assert(this->ciErr == CL_SUCCESS);
- if(device_type == CL_DEVICE_TYPE_GPU) {
- build_options += "-D__COMPUTE_DEVICE_GPU__ ";
- }
-
- DeviceRequestedFeatures nofeatures;
- enable_default_features(nofeatures);
-
- /* Add program specific optimized compile directives */
- if (preview_kernel) {
- DeviceRequestedFeatures preview_features;
- preview_features.use_hair = true;
- build_options += "-D__KERNEL_AO_PREVIEW__ ";
- build_options += preview_features.get_build_options();
- }
- else if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) {
- build_options += nofeatures.get_build_options();
- }
- else {
- DeviceRequestedFeatures features(requested_features);
- enable_default_features(features);
-
- /* Always turn off baking at this point. Baking is only usefull when building the bake kernel.
- * this also makes sure that the kernels that are build during baking can be reused
- * when not doing any baking. */
- features.use_baking = false;
-
- /* Do not vary on shaders when program doesn't do any shading.
- * We have bundled them in a single program. */
- if (opencl_program_name == "split_bundle") {
- features.max_nodes_group = 0;
- features.nodes_features = 0;
- features.use_shader_raytrace = false;
- }
-
- /* No specific settings, just add the regular ones */
- build_options += features.get_build_options();
- }
-
- return build_options;
+ /* first check for non-split kernel programs */
+ if (opencl_program_name == "base" || opencl_program_name == "denoising") {
+ return "";
+ }
+ else if (opencl_program_name == "bake") {
+    /* NOTE: get_build_options for bake is only requested when baking is enabled;
+     * displace and background are always requested.
+     * `__SPLIT_KERNEL__` must not be present in the compile directives for bake. */
+ DeviceRequestedFeatures features(requested_features);
+ enable_default_features(features);
+ features.use_denoising = false;
+ features.use_object_motion = false;
+ features.use_camera_motion = false;
+ features.use_hair = true;
+ features.use_subsurface = true;
+ features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
+ features.nodes_features = NODE_FEATURE_ALL;
+ features.use_integrator_branched = false;
+ return features.get_build_options();
+ }
+ else if (opencl_program_name == "displace") {
+    /* As displacement does not use any nodes from the Shading group (e.g. BSDF),
+     * we disable all features that are related to shading. */
+ DeviceRequestedFeatures features(requested_features);
+ enable_default_features(features);
+ features.use_denoising = false;
+ features.use_object_motion = false;
+ features.use_camera_motion = false;
+ features.use_baking = false;
+ features.use_transparent = false;
+ features.use_shadow_tricks = false;
+ features.use_subsurface = false;
+ features.use_volume = false;
+ features.nodes_features &= ~NODE_FEATURE_VOLUME;
+ features.use_denoising = false;
+ features.use_principled = false;
+ features.use_integrator_branched = false;
+ return features.get_build_options();
+ }
+ else if (opencl_program_name == "background") {
+    /* Background uses Background shading only.
+     * It is safe to disable shadow features, subsurface and volumetrics. */
+ DeviceRequestedFeatures features(requested_features);
+ enable_default_features(features);
+ features.use_baking = false;
+ features.use_object_motion = false;
+ features.use_camera_motion = false;
+ features.use_transparent = false;
+ features.use_shadow_tricks = false;
+ features.use_denoising = false;
+    /* NOTE: it is currently possible to use surface nodes like `Hair Info` and `Bump`.
+     * Perhaps we should remove them from the UI, as they do not make any sense when
+     * rendering the background. */
+ features.nodes_features &= ~NODE_FEATURE_VOLUME;
+ features.use_subsurface = false;
+ features.use_volume = false;
+ features.use_shader_raytrace = false;
+ features.use_patch_evaluation = false;
+ features.use_integrator_branched = false;
+ return features.get_build_options();
+ }
+
+ string build_options = "-D__SPLIT_KERNEL__ ";
+ /* Set compute device build option. */
+ cl_device_type device_type;
+ OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr);
+ assert(this->ciErr == CL_SUCCESS);
+ if (device_type == CL_DEVICE_TYPE_GPU) {
+ build_options += "-D__COMPUTE_DEVICE_GPU__ ";
+ }
+
+ DeviceRequestedFeatures nofeatures;
+ enable_default_features(nofeatures);
+
+ /* Add program specific optimized compile directives */
+ if (preview_kernel) {
+ DeviceRequestedFeatures preview_features;
+ preview_features.use_hair = true;
+ build_options += "-D__KERNEL_AO_PREVIEW__ ";
+ build_options += preview_features.get_build_options();
+ }
+ else if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) {
+ build_options += nofeatures.get_build_options();
+ }
+ else {
+ DeviceRequestedFeatures features(requested_features);
+ enable_default_features(features);
+
+    /* Always turn off baking at this point. Baking is only useful when building the bake kernel.
+     * This also makes sure that the kernels that are built during baking can be reused
+     * when not doing any baking. */
+ features.use_baking = false;
+
+    /* Do not vary on shader features when the program doesn't do any shading.
+     * These kernels are bundled in a single program. */
+ if (opencl_program_name == "split_bundle") {
+ features.max_nodes_group = 0;
+ features.nodes_features = 0;
+ features.use_shader_raytrace = false;
+ }
+
+ /* No specific settings, just add the regular ones */
+ build_options += features.get_build_options();
+ }
+
+ return build_options;
}
OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_)
{
- device = device_;
+ device = device_;
}
OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms()
{
- program_split.release();
- program_lamp_emission.release();
- program_do_volume.release();
- program_indirect_background.release();
- program_shader_eval.release();
- program_holdout_emission_blurring_pathtermination_ao.release();
- program_subsurface_scatter.release();
- program_direct_lighting.release();
- program_shadow_blocked_ao.release();
- program_shadow_blocked_dl.release();
+ program_split.release();
+ program_lamp_emission.release();
+ program_do_volume.release();
+ program_indirect_background.release();
+ program_shader_eval.release();
+ program_holdout_emission_blurring_pathtermination_ao.release();
+ program_subsurface_scatter.release();
+ program_direct_lighting.release();
+ program_shadow_blocked_ao.release();
+ program_shadow_blocked_dl.release();
}
-void OpenCLDevice::OpenCLSplitPrograms::load_kernels(vector<OpenCLProgram*> &programs, const DeviceRequestedFeatures& requested_features, bool is_preview)
+void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
+ vector<OpenCLProgram *> &programs,
+ const DeviceRequestedFeatures &requested_features,
+ bool is_preview)
{
- if (!requested_features.use_baking) {
-#define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) program_split.add_kernel(ustring("path_trace_"#kernel_name));
-#define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \
- const string program_name_##kernel_name = "split_"#kernel_name; \
- program_##kernel_name = \
- OpenCLDevice::OpenCLProgram(device, \
- program_name_##kernel_name, \
- "kernel_"#kernel_name".cl", \
- device->get_build_options(requested_features, program_name_##kernel_name, is_preview)); \
- program_##kernel_name.add_kernel(ustring("path_trace_"#kernel_name)); \
- programs.push_back(&program_##kernel_name);
-
- /* Ordered with most complex kernels first, to reduce overall compile time. */
- ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter);
- if (requested_features.use_volume || is_preview) {
- ADD_SPLIT_KERNEL_PROGRAM(do_volume);
- }
- ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl);
- ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao);
- ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
- ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
- ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
- ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
- ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
-
- /* Quick kernels bundled in a single program to reduce overhead of starting
- * Blender processes. */
- program_split = OpenCLDevice::OpenCLProgram(device,
- "split_bundle" ,
- "kernel_split_bundle.cl",
- device->get_build_options(requested_features, "split_bundle", is_preview));
-
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
- ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
- programs.push_back(&program_split);
-
-#undef ADD_SPLIT_KERNEL_PROGRAM
-#undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM
- }
+ if (!requested_features.use_baking) {
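+    /* Helper macros: ADD_SPLIT_KERNEL_PROGRAM declares a standalone program for one
+     * split kernel, built with kernel-specific options, and queues it for loading;
+     * ADD_SPLIT_KERNEL_BUNDLE_PROGRAM registers a kernel in the shared bundle program. */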
+# define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \
+ program_split.add_kernel(ustring("path_trace_" #kernel_name));
+# define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \
+ const string program_name_##kernel_name = "split_" #kernel_name; \
+ program_##kernel_name = OpenCLDevice::OpenCLProgram( \
+ device, \
+ program_name_##kernel_name, \
+ "kernel_" #kernel_name ".cl", \
+ device->get_build_options(requested_features, program_name_##kernel_name, is_preview)); \
+ program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \
+ programs.push_back(&program_##kernel_name);
+
+ /* Ordered with most complex kernels first, to reduce overall compile time. */
+ ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter);
+ if (requested_features.use_volume || is_preview) {
+ ADD_SPLIT_KERNEL_PROGRAM(do_volume);
+ }
+ ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl);
+ ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao);
+ ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
+ ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
+ ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
+ ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
+ ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
+
+ /* Quick kernels bundled in a single program to reduce overhead of starting
+ * Blender processes. */
+ program_split = OpenCLDevice::OpenCLProgram(
+ device,
+ "split_bundle",
+ "kernel_split_bundle.cl",
+ device->get_build_options(requested_features, "split_bundle", is_preview));
+
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
+ ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
+ programs.push_back(&program_split);
+
+# undef ADD_SPLIT_KERNEL_PROGRAM
+# undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM
+ }
}
namespace {
@@ -291,1126 +296,1108 @@ namespace {
* fetch its size.
*/
typedef struct KernelGlobalsDummy {
- ccl_constant KernelData *data;
- ccl_global char *buffers[8];
+ ccl_constant KernelData *data;
+ ccl_global char *buffers[8];
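+  /* Expands to one TextureInfo member per texture listed in kernel_textures.h. */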
-#define KERNEL_TEX(type, name) \
- TextureInfo name;
+# define KERNEL_TEX(type, name) TextureInfo name;
# include "kernel/kernel_textures.h"
-#undef KERNEL_TEX
- SplitData split_data;
- SplitParams split_param_data;
+# undef KERNEL_TEX
+ SplitData split_data;
+ SplitParams split_param_data;
} KernelGlobalsDummy;
} // namespace
-
struct CachedSplitMemory {
- int id;
- device_memory *split_data;
- device_memory *ray_state;
- device_memory *queue_index;
- device_memory *use_queues_flag;
- device_memory *work_pools;
- device_ptr *buffer;
+ int id;
+ device_memory *split_data;
+ device_memory *ray_state;
+ device_memory *queue_index;
+ device_memory *use_queues_flag;
+ device_memory *work_pools;
+ device_ptr *buffer;
};
class OpenCLSplitKernelFunction : public SplitKernelFunction {
-public:
- OpenCLDevice* device;
- OpenCLDevice::OpenCLProgram program;
- CachedSplitMemory& cached_memory;
- int cached_id;
-
- OpenCLSplitKernelFunction(OpenCLDevice* device, CachedSplitMemory& cached_memory) :
- device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1)
- {
- }
-
- ~OpenCLSplitKernelFunction()
- {
- program.release();
- }
-
- virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
- {
- if(cached_id != cached_memory.id) {
- cl_uint start_arg_index =
- device->kernel_set_args(program(),
- 0,
- kg,
- data,
- *cached_memory.split_data,
- *cached_memory.ray_state);
-
- device->set_kernel_arg_buffers(program(), &start_arg_index);
-
- start_arg_index +=
- device->kernel_set_args(program(),
- start_arg_index,
- *cached_memory.queue_index,
- *cached_memory.use_queues_flag,
- *cached_memory.work_pools,
- *cached_memory.buffer);
-
- cached_id = cached_memory.id;
- }
-
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- program(),
- 2,
- NULL,
- dim.global_size,
- dim.local_size,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- if(device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return false;
- }
-
- return true;
- }
+ public:
+ OpenCLDevice *device;
+ OpenCLDevice::OpenCLProgram program;
+ CachedSplitMemory &cached_memory;
+ int cached_id;
+
+ OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory)
+ : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1)
+ {
+ }
+
+ ~OpenCLSplitKernelFunction()
+ {
+ program.release();
+ }
+
+ virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data)
+ {
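+    /* Re-bind the kernel arguments only when the cached split buffers have been
+     * replaced, which is tracked through the generation id. */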
+ if (cached_id != cached_memory.id) {
+ cl_uint start_arg_index = device->kernel_set_args(
+ program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state);
+
+ device->set_kernel_arg_buffers(program(), &start_arg_index);
+
+ start_arg_index += device->kernel_set_args(program(),
+ start_arg_index,
+ *cached_memory.queue_index,
+ *cached_memory.use_queues_flag,
+ *cached_memory.work_pools,
+ *cached_memory.buffer);
+
+ cached_id = cached_memory.id;
+ }
+
+ device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+ program(),
+ 2,
+ NULL,
+ dim.global_size,
+ dim.local_size,
+ 0,
+ NULL,
+ NULL);
+
+ device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+ if (device->ciErr != CL_SUCCESS) {
+ string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+ clewErrorString(device->ciErr));
+ device->opencl_error(message);
+ return false;
+ }
+
+ return true;
+ }
};
class OpenCLSplitKernel : public DeviceSplitKernel {
- OpenCLDevice *device;
- CachedSplitMemory cached_memory;
-public:
- explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) {
- }
-
- virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
- const DeviceRequestedFeatures& requested_features)
- {
- OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);
-
- const string program_name = device->get_opencl_program_name(kernel_name);
- kernel->program =
- OpenCLDevice::OpenCLProgram(device,
- program_name,
- device->get_opencl_program_filename(kernel_name),
- device->get_build_options(requested_features,
- program_name,
- device->use_preview_kernels));
-
- kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
- kernel->program.load();
-
- if(!kernel->program.is_loaded()) {
- delete kernel;
- return NULL;
- }
-
- return kernel;
- }
-
- virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
- {
- device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
- size_buffer.alloc(1);
- size_buffer.zero_to_device();
-
- uint threads = num_threads;
- OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
- cl_kernel kernel_state_buffer_size = programs->program_split(ustring("path_trace_state_buffer_size"));
- device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer);
-
- size_t global_size = 64;
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- kernel_state_buffer_size,
- 1,
- NULL,
- &global_size,
- NULL,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- size_buffer.copy_from_device(0, 1, 1);
- size_t size = size_buffer[0];
- size_buffer.free();
-
- if(device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return 0;
- }
-
- return size;
- }
-
- virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
- RenderTile& rtile,
- int num_global_elements,
- device_memory& kernel_globals,
- device_memory& kernel_data,
- device_memory& split_data,
- device_memory& ray_state,
- device_memory& queue_index,
- device_memory& use_queues_flag,
- device_memory& work_pool_wgs
- )
- {
- cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
-
- /* Set the range of samples to be processed for every ray in
- * path-regeneration logic.
- */
- cl_int start_sample = rtile.start_sample;
- cl_int end_sample = rtile.start_sample + rtile.num_samples;
-
- OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
- cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init"));
-
- cl_uint start_arg_index =
- device->kernel_set_args(kernel_data_init,
- 0,
- kernel_globals,
- kernel_data,
- split_data,
- num_global_elements,
- ray_state);
-
- device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index);
-
- start_arg_index +=
- device->kernel_set_args(kernel_data_init,
- start_arg_index,
- start_sample,
- end_sample,
- rtile.x,
- rtile.y,
- rtile.w,
- rtile.h,
- rtile.offset,
- rtile.stride,
- queue_index,
- dQueue_size,
- use_queues_flag,
- work_pool_wgs,
- rtile.num_samples,
- rtile.buffer);
-
- /* Enqueue ckPathTraceKernel_data_init kernel. */
- device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
- kernel_data_init,
- 2,
- NULL,
- dim.global_size,
- dim.local_size,
- 0,
- NULL,
- NULL);
-
- device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
-
- if(device->ciErr != CL_SUCCESS) {
- string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
- clewErrorString(device->ciErr));
- device->opencl_error(message);
- return false;
- }
-
- cached_memory.split_data = &split_data;
- cached_memory.ray_state = &ray_state;
- cached_memory.queue_index = &queue_index;
- cached_memory.use_queues_flag = &use_queues_flag;
- cached_memory.work_pools = &work_pool_wgs;
- cached_memory.buffer = &rtile.buffer;
- cached_memory.id++;
-
- return true;
- }
-
- virtual int2 split_kernel_local_size()
- {
- return make_int2(64, 1);
- }
-
- virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
- {
- cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
- /* Use small global size on CPU devices as it seems to be much faster. */
- if(type == CL_DEVICE_TYPE_CPU) {
- VLOG(1) << "Global size: (64, 64).";
- return make_int2(64, 64);
- }
-
- cl_ulong max_buffer_size;
- clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
-
- if(DebugFlags().opencl.mem_limit) {
- max_buffer_size = min(max_buffer_size,
- cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
- }
-
- VLOG(1) << "Maximum device allocation size: "
- << string_human_readable_number(max_buffer_size) << " bytes. ("
- << string_human_readable_size(max_buffer_size) << ").";
-
- /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
- max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l*1024*1024*1024);
-
- size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
- int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements));
- VLOG(1) << "Global size: " << global_size << ".";
- return global_size;
- }
+ OpenCLDevice *device;
+ CachedSplitMemory cached_memory;
+
+ public:
+ explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device)
+ {
+ }
+
+ virtual SplitKernelFunction *get_split_kernel_function(
+ const string &kernel_name, const DeviceRequestedFeatures &requested_features)
+ {
+ OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory);
+
+ const string program_name = device->get_opencl_program_name(kernel_name);
+ kernel->program = OpenCLDevice::OpenCLProgram(
+ device,
+ program_name,
+ device->get_opencl_program_filename(kernel_name),
+ device->get_build_options(requested_features, program_name, device->use_preview_kernels));
+
+ kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
+ kernel->program.load();
+
+ if (!kernel->program.is_loaded()) {
+ delete kernel;
+ return NULL;
+ }
+
+ return kernel;
+ }
+
+ virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads)
+ {
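+    /* Run a small helper kernel on the device to compute the split state size
+     * needed for the requested number of threads, then read the result back. */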
+ device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
+ size_buffer.alloc(1);
+ size_buffer.zero_to_device();
+
+ uint threads = num_threads;
+ OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
+ cl_kernel kernel_state_buffer_size = programs->program_split(
+ ustring("path_trace_state_buffer_size"));
+ device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer);
+
+ size_t global_size = 64;
+ device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+ kernel_state_buffer_size,
+ 1,
+ NULL,
+ &global_size,
+ NULL,
+ 0,
+ NULL,
+ NULL);
+
+ device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+ size_buffer.copy_from_device(0, 1, 1);
+ size_t size = size_buffer[0];
+ size_buffer.free();
+
+ if (device->ciErr != CL_SUCCESS) {
+ string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+ clewErrorString(device->ciErr));
+ device->opencl_error(message);
+ return 0;
+ }
+
+ return size;
+ }
+
+ virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
+ RenderTile &rtile,
+ int num_global_elements,
+ device_memory &kernel_globals,
+ device_memory &kernel_data,
+ device_memory &split_data,
+ device_memory &ray_state,
+ device_memory &queue_index,
+ device_memory &use_queues_flag,
+ device_memory &work_pool_wgs)
+ {
+ cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
+
+ /* Set the range of samples to be processed for every ray in
+ * path-regeneration logic.
+ */
+ cl_int start_sample = rtile.start_sample;
+ cl_int end_sample = rtile.start_sample + rtile.num_samples;
+
+ OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
+ cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init"));
+
+ cl_uint start_arg_index = device->kernel_set_args(kernel_data_init,
+ 0,
+ kernel_globals,
+ kernel_data,
+ split_data,
+ num_global_elements,
+ ray_state);
+
+ device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index);
+
+ start_arg_index += device->kernel_set_args(kernel_data_init,
+ start_arg_index,
+ start_sample,
+ end_sample,
+ rtile.x,
+ rtile.y,
+ rtile.w,
+ rtile.h,
+ rtile.offset,
+ rtile.stride,
+ queue_index,
+ dQueue_size,
+ use_queues_flag,
+ work_pool_wgs,
+ rtile.num_samples,
+ rtile.buffer);
+
+ /* Enqueue ckPathTraceKernel_data_init kernel. */
+ device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
+ kernel_data_init,
+ 2,
+ NULL,
+ dim.global_size,
+ dim.local_size,
+ 0,
+ NULL,
+ NULL);
+
+ device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
+
+ if (device->ciErr != CL_SUCCESS) {
+ string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
+ clewErrorString(device->ciErr));
+ device->opencl_error(message);
+ return false;
+ }
+
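+    /* Cache the buffers so subsequent split kernel launches can re-bind them lazily. */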
+ cached_memory.split_data = &split_data;
+ cached_memory.ray_state = &ray_state;
+ cached_memory.queue_index = &queue_index;
+ cached_memory.use_queues_flag = &use_queues_flag;
+ cached_memory.work_pools = &work_pool_wgs;
+ cached_memory.buffer = &rtile.buffer;
+ cached_memory.id++;
+
+ return true;
+ }
+
+ virtual int2 split_kernel_local_size()
+ {
+ return make_int2(64, 1);
+ }
+
+ virtual int2 split_kernel_global_size(device_memory &kg,
+ device_memory &data,
+ DeviceTask * /*task*/)
+ {
+ cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
+ /* Use small global size on CPU devices as it seems to be much faster. */
+ if (type == CL_DEVICE_TYPE_CPU) {
+ VLOG(1) << "Global size: (64, 64).";
+ return make_int2(64, 64);
+ }
+
+ cl_ulong max_buffer_size;
+ clGetDeviceInfo(
+ device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
+
+ if (DebugFlags().opencl.mem_limit) {
+ max_buffer_size = min(max_buffer_size,
+ cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
+ }
+
+ VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size)
+ << " bytes. (" << string_human_readable_size(max_buffer_size) << ").";
+
+    /* Limit to 2 GB, as we shouldn't need more than that and some devices may support much more. */
+ max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024);
+
+ size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
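+    /* Shape the launch as a roughly square grid, with the width rounded down to a
+     * multiple of 64 to match the local work group width. */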
+ int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64),
+ (int)sqrt(num_elements));
+ VLOG(1) << "Global size: " << global_size << ".";
+ return global_size;
+ }
};
bool OpenCLDevice::opencl_error(cl_int err)
{
- if(err != CL_SUCCESS) {
- string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
- if(error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
- return true;
- }
-
- return false;
+ if (err != CL_SUCCESS) {
+ string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
+ if (error_msg == "")
+ error_msg = message;
+ fprintf(stderr, "%s\n", message.c_str());
+ return true;
+ }
+
+ return false;
}
-void OpenCLDevice::opencl_error(const string& message)
+void OpenCLDevice::opencl_error(const string &message)
{
- if(error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
+ if (error_msg == "")
+ error_msg = message;
+ fprintf(stderr, "%s\n", message.c_str());
}
-void OpenCLDevice::opencl_assert_err(cl_int err, const char* where)
+void OpenCLDevice::opencl_assert_err(cl_int err, const char *where)
{
- if(err != CL_SUCCESS) {
- string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
- if(error_msg == "")
- error_msg = message;
- fprintf(stderr, "%s\n", message.c_str());
-#ifndef NDEBUG
- abort();
-#endif
- }
+ if (err != CL_SUCCESS) {
+ string message = string_printf(
+ "OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
+ if (error_msg == "")
+ error_msg = message;
+ fprintf(stderr, "%s\n", message.c_str());
+# ifndef NDEBUG
+ abort();
+# endif
+ }
}
-OpenCLDevice::OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background)
-: Device(info, stats, profiler, background),
- kernel_programs(this),
- preview_programs(this),
- memory_manager(this),
- texture_info(this, "__texture_info", MEM_TEXTURE)
+OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
+ : Device(info, stats, profiler, background),
+ kernel_programs(this),
+ preview_programs(this),
+ memory_manager(this),
+ texture_info(this, "__texture_info", MEM_TEXTURE)
{
- cpPlatform = NULL;
- cdDevice = NULL;
- cxContext = NULL;
- cqCommandQueue = NULL;
- null_mem = 0;
- device_initialized = false;
- textures_need_update = true;
- use_preview_kernels = !background;
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- if(usable_devices.size() == 0) {
- opencl_error("OpenCL: no devices found.");
- return;
- }
- assert(info.num < usable_devices.size());
- OpenCLPlatformDevice& platform_device = usable_devices[info.num];
- device_num = info.num;
- cpPlatform = platform_device.platform_id;
- cdDevice = platform_device.device_id;
- platform_name = platform_device.platform_name;
- device_name = platform_device.device_name;
- VLOG(2) << "Creating new Cycles device for OpenCL platform "
- << platform_name << ", device "
- << device_name << ".";
-
- {
- /* try to use cached context */
- thread_scoped_lock cache_locker;
- cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
-
- if(cxContext == NULL) {
- /* create context properties array to specify platform */
- const cl_context_properties context_props[] = {
- CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
- 0, 0
- };
-
- /* create context */
- cxContext = clCreateContext(context_props, 1, &cdDevice,
- context_notify_callback, cdDevice, &ciErr);
-
- if(opencl_error(ciErr)) {
- opencl_error("OpenCL: clCreateContext failed");
- return;
- }
-
- /* cache it */
- OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
- }
- }
-
- cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
- if(opencl_error(ciErr)) {
- opencl_error("OpenCL: Error creating command queue");
- return;
- }
-
- null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
- if(opencl_error(ciErr)) {
- opencl_error("OpenCL: Error creating memory buffer for NULL");
- return;
- }
-
- /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */
- texture_info.resize(1);
- memory_manager.alloc("texture_info", texture_info);
-
- device_initialized = true;
-
- split_kernel = new OpenCLSplitKernel(this);
- if (!background) {
- load_preview_kernels();
- }
+ cpPlatform = NULL;
+ cdDevice = NULL;
+ cxContext = NULL;
+ cqCommandQueue = NULL;
+ null_mem = 0;
+ device_initialized = false;
+ textures_need_update = true;
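+  /* Interactive (foreground) sessions start on the fast preview kernels while the
+   * full feature kernels are compiled in the background. */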
+ use_preview_kernels = !background;
+
+ vector<OpenCLPlatformDevice> usable_devices;
+ OpenCLInfo::get_usable_devices(&usable_devices);
+ if (usable_devices.size() == 0) {
+ opencl_error("OpenCL: no devices found.");
+ return;
+ }
+ assert(info.num < usable_devices.size());
+ OpenCLPlatformDevice &platform_device = usable_devices[info.num];
+ device_num = info.num;
+ cpPlatform = platform_device.platform_id;
+ cdDevice = platform_device.device_id;
+ platform_name = platform_device.platform_name;
+ device_name = platform_device.device_name;
+ VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device "
+ << device_name << ".";
+
+ {
+ /* try to use cached context */
+ thread_scoped_lock cache_locker;
+ cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
+
+ if (cxContext == NULL) {
+ /* create context properties array to specify platform */
+ const cl_context_properties context_props[] = {
+ CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0};
+
+ /* create context */
+ cxContext = clCreateContext(
+ context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr);
+
+ if (opencl_error(ciErr)) {
+ opencl_error("OpenCL: clCreateContext failed");
+ return;
+ }
+
+ /* cache it */
+ OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
+ }
+ }
+
+ cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
+ if (opencl_error(ciErr)) {
+ opencl_error("OpenCL: Error creating command queue");
+ return;
+ }
+
+ null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
+ if (opencl_error(ciErr)) {
+ opencl_error("OpenCL: Error creating memory buffer for NULL");
+ return;
+ }
+
+  /* Allocate this right away so that texture_info
+   * is placed at offset 0 in the device memory buffers. */
+ texture_info.resize(1);
+ memory_manager.alloc("texture_info", texture_info);
+
+ device_initialized = true;
+
+ split_kernel = new OpenCLSplitKernel(this);
+ if (!background) {
+ load_preview_kernels();
+ }
}
OpenCLDevice::~OpenCLDevice()
{
- task_pool.stop();
- load_required_kernel_task_pool.stop();
- load_kernel_task_pool.stop();
+ task_pool.stop();
+ load_required_kernel_task_pool.stop();
+ load_kernel_task_pool.stop();
- memory_manager.free();
+ memory_manager.free();
- if(null_mem)
- clReleaseMemObject(CL_MEM_PTR(null_mem));
+ if (null_mem)
+ clReleaseMemObject(CL_MEM_PTR(null_mem));
- ConstMemMap::iterator mt;
- for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
- delete mt->second;
- }
+ ConstMemMap::iterator mt;
+ for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
+ delete mt->second;
+ }
- base_program.release();
- bake_program.release();
- displace_program.release();
- background_program.release();
- denoising_program.release();
+ base_program.release();
+ bake_program.release();
+ displace_program.release();
+ background_program.release();
+ denoising_program.release();
- if(cqCommandQueue)
- clReleaseCommandQueue(cqCommandQueue);
- if(cxContext)
- clReleaseContext(cxContext);
+ if (cqCommandQueue)
+ clReleaseCommandQueue(cqCommandQueue);
+ if (cxContext)
+ clReleaseContext(cxContext);
- delete split_kernel;
+ delete split_kernel;
}
void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info,
- const void * /*private_info*/, size_t /*cb*/, void *user_data)
+ const void * /*private_info*/,
+ size_t /*cb*/,
+ void *user_data)
{
- string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
- fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
+ string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
+ fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
}
bool OpenCLDevice::opencl_version_check()
{
- string error;
- if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
- opencl_error(error);
- return false;
- }
- if(!OpenCLInfo::device_version_check(cdDevice, &error)) {
- opencl_error(error);
- return false;
- }
- return true;
+ string error;
+ if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
+ opencl_error(error);
+ return false;
+ }
+ if (!OpenCLInfo::device_version_check(cdDevice, &error)) {
+ opencl_error(error);
+ return false;
+ }
+ return true;
}
string OpenCLDevice::device_md5_hash(string kernel_custom_build_options)
{
- MD5Hash md5;
- char version[256], driver[256], name[256], vendor[256];
+ MD5Hash md5;
+ char version[256], driver[256], name[256], vendor[256];
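+  /* Hash the platform vendor, device version, name, driver and build options, so
+   * anything cached against this hash is tied to the exact device configuration. */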
- clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
- clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
- clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
- clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
+ clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
+ clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
+ clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
+ clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
- md5.append((uint8_t*)vendor, strlen(vendor));
- md5.append((uint8_t*)version, strlen(version));
- md5.append((uint8_t*)name, strlen(name));
- md5.append((uint8_t*)driver, strlen(driver));
+ md5.append((uint8_t *)vendor, strlen(vendor));
+ md5.append((uint8_t *)version, strlen(version));
+ md5.append((uint8_t *)name, strlen(name));
+ md5.append((uint8_t *)driver, strlen(driver));
- string options = kernel_build_options();
- options += kernel_custom_build_options;
- md5.append((uint8_t*)options.c_str(), options.size());
+ string options = kernel_build_options();
+ options += kernel_custom_build_options;
+ md5.append((uint8_t *)options.c_str(), options.size());
- return md5.get_hex();
+ return md5.get_hex();
}
-bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures& requested_features)
+bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features)
{
- VLOG(2) << "Loading kernels for platform " << platform_name
- << ", device " << device_name << ".";
- /* Verify if device was initialized. */
- if(!device_initialized) {
- fprintf(stderr, "OpenCL: failed to initialize device.\n");
- return false;
- }
-
- /* Verify we have right opencl version. */
- if(!opencl_version_check())
- return false;
-
- load_required_kernels(requested_features);
-
- vector<OpenCLProgram*> programs;
- kernel_programs.load_kernels(programs, requested_features, false);
-
- if (!requested_features.use_baking && requested_features.use_denoising) {
- denoising_program = OpenCLProgram(this, "denoising", "filter.cl", get_build_options(requested_features, "denoising"));
- denoising_program.add_kernel(ustring("filter_divide_shadow"));
- denoising_program.add_kernel(ustring("filter_get_feature"));
- denoising_program.add_kernel(ustring("filter_write_feature"));
- denoising_program.add_kernel(ustring("filter_detect_outliers"));
- denoising_program.add_kernel(ustring("filter_combine_halves"));
- denoising_program.add_kernel(ustring("filter_construct_transform"));
- denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
- denoising_program.add_kernel(ustring("filter_nlm_blur"));
- denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
- denoising_program.add_kernel(ustring("filter_nlm_update_output"));
- denoising_program.add_kernel(ustring("filter_nlm_normalize"));
- denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
- denoising_program.add_kernel(ustring("filter_finalize"));
- programs.push_back(&denoising_program);
- }
-
- load_required_kernel_task_pool.wait_work();
-
- /* Parallel compilation of Cycles kernels, this launches multiple
- * processes to workaround OpenCL frameworks serializing the calls
- * internally within a single process. */
- foreach(OpenCLProgram *program, programs) {
- if (!program->load()) {
- load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
- }
- }
- return true;
+ VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << ".";
+  /* Verify the device was initialized. */
+ if (!device_initialized) {
+ fprintf(stderr, "OpenCL: failed to initialize device.\n");
+ return false;
+ }
+
+  /* Verify we have the right OpenCL version. */
+ if (!opencl_version_check())
+ return false;
+
+ load_required_kernels(requested_features);
+
+ vector<OpenCLProgram *> programs;
+ kernel_programs.load_kernels(programs, requested_features, false);
+
+ if (!requested_features.use_baking && requested_features.use_denoising) {
+ denoising_program = OpenCLProgram(
+ this, "denoising", "filter.cl", get_build_options(requested_features, "denoising"));
+ denoising_program.add_kernel(ustring("filter_divide_shadow"));
+ denoising_program.add_kernel(ustring("filter_get_feature"));
+ denoising_program.add_kernel(ustring("filter_write_feature"));
+ denoising_program.add_kernel(ustring("filter_detect_outliers"));
+ denoising_program.add_kernel(ustring("filter_combine_halves"));
+ denoising_program.add_kernel(ustring("filter_construct_transform"));
+ denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
+ denoising_program.add_kernel(ustring("filter_nlm_blur"));
+ denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
+ denoising_program.add_kernel(ustring("filter_nlm_update_output"));
+ denoising_program.add_kernel(ustring("filter_nlm_normalize"));
+ denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
+ denoising_program.add_kernel(ustring("filter_finalize"));
+ programs.push_back(&denoising_program);
+ }
+
+ load_required_kernel_task_pool.wait_work();
+
+  /* Parallel compilation of Cycles kernels: this launches multiple
+   * processes to work around OpenCL frameworks serializing the calls
+   * internally within a single process. */
+ foreach (OpenCLProgram *program, programs) {
+ if (!program->load()) {
+ load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
+ }
+ }
+ return true;
}
-void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures& requested_features)
+void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features)
{
- vector<OpenCLProgram*> programs;
- base_program = OpenCLProgram(this, "base", "kernel_base.cl", get_build_options(requested_features, "base"));
- base_program.add_kernel(ustring("convert_to_byte"));
- base_program.add_kernel(ustring("convert_to_half_float"));
- base_program.add_kernel(ustring("zero_buffer"));
- programs.push_back(&base_program);
-
- if (requested_features.use_true_displacement) {
- displace_program = OpenCLProgram(this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace"));
- displace_program.add_kernel(ustring("displace"));
- programs.push_back(&displace_program);
- }
-
- if (requested_features.use_background_light) {
- background_program = OpenCLProgram(this, "background", "kernel_background.cl", get_build_options(requested_features, "background"));
- background_program.add_kernel(ustring("background"));
- programs.push_back(&background_program);
- }
-
- if (requested_features.use_baking) {
- bake_program = OpenCLProgram(this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake"));
- bake_program.add_kernel(ustring("bake"));
- programs.push_back(&bake_program);
- }
-
- foreach(OpenCLProgram *program, programs) {
- if (!program->load()) {
- load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
- }
- }
+ vector<OpenCLProgram *> programs;
+ base_program = OpenCLProgram(
+ this, "base", "kernel_base.cl", get_build_options(requested_features, "base"));
+ base_program.add_kernel(ustring("convert_to_byte"));
+ base_program.add_kernel(ustring("convert_to_half_float"));
+ base_program.add_kernel(ustring("zero_buffer"));
+ programs.push_back(&base_program);
+
+ if (requested_features.use_true_displacement) {
+ displace_program = OpenCLProgram(
+ this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace"));
+ displace_program.add_kernel(ustring("displace"));
+ programs.push_back(&displace_program);
+ }
+
+ if (requested_features.use_background_light) {
+ background_program = OpenCLProgram(this,
+ "background",
+ "kernel_background.cl",
+ get_build_options(requested_features, "background"));
+ background_program.add_kernel(ustring("background"));
+ programs.push_back(&background_program);
+ }
+
+ if (requested_features.use_baking) {
+ bake_program = OpenCLProgram(
+ this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake"));
+ bake_program.add_kernel(ustring("bake"));
+ programs.push_back(&bake_program);
+ }
+
+ foreach (OpenCLProgram *program, programs) {
+ if (!program->load()) {
+ load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
+ }
+ }
}
void OpenCLDevice::load_preview_kernels()
{
- DeviceRequestedFeatures no_features;
- vector<OpenCLProgram*> programs;
- preview_programs.load_kernels(programs, no_features, true);
-
- foreach(OpenCLProgram *program, programs) {
- if (!program->load()) {
- load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
- }
- }
+ DeviceRequestedFeatures no_features;
+ vector<OpenCLProgram *> programs;
+ preview_programs.load_kernels(programs, no_features, true);
+
+ foreach (OpenCLProgram *program, programs) {
+ if (!program->load()) {
+ load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
+ }
+ }
}
-bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures& requested_features)
+bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features)
{
- if (background) {
- load_kernel_task_pool.wait_work();
- use_preview_kernels = false;
- }
- else {
- /* We use a device setting to determine to load preview kernels or not
- * Better to check on device level than per kernel as mixing preview and
- * non-preview kernels does not work due to different data types */
- if (use_preview_kernels) {
- use_preview_kernels = !load_kernel_task_pool.finished();
- }
- }
- return split_kernel->load_kernels(requested_features);
+ if (background) {
+ load_kernel_task_pool.wait_work();
+ use_preview_kernels = false;
+ }
+ else {
+    /* We use a device setting to determine whether to load preview kernels.
+     * It is better to check on the device level than per kernel, as mixing preview
+     * and non-preview kernels does not work due to different data types. */
+ if (use_preview_kernels) {
+ use_preview_kernels = !load_kernel_task_pool.finished();
+ }
+ }
+ return split_kernel->load_kernels(requested_features);
}
-OpenCLDevice::OpenCLSplitPrograms* OpenCLDevice::get_split_programs()
+OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs()
{
- return use_preview_kernels?&preview_programs:&kernel_programs;
+ return use_preview_kernels ? &preview_programs : &kernel_programs;
}
DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state()
{
- /* Do not switch kernels for background renderings
- * We do foreground rendering but use the preview kernels
- * Check for the optimized kernels
- *
- * This works also the other way around, where we are using
- * optimized kernels but new ones are being compiled due
- * to other features that are needed */
- if (background) {
- /* The if-statements below would find the same result,
- * But as the `finished` method uses a mutex we added
- * this as an early exit */
- return DEVICE_KERNEL_USING_FEATURE_KERNEL;
- }
-
- bool other_kernels_finished = load_kernel_task_pool.finished();
- if (use_preview_kernels) {
- if (other_kernels_finished) {
- return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE;
- }
- else {
- return DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL;
- }
- }
- else {
- if (other_kernels_finished) {
- return DEVICE_KERNEL_USING_FEATURE_KERNEL;
- }
- else {
- return DEVICE_KERNEL_FEATURE_KERNEL_INVALID;
- }
- }
+  /* Do not switch kernels for background renderings.
+   * For foreground rendering that is still using the preview kernels,
+   * check whether the optimized kernels are available yet.
+   *
+   * This also works the other way around, where we are using the
+   * optimized kernels but new ones are being compiled because
+   * other features are needed. */
+ if (background) {
+    /* The if-statements below would find the same result,
+     * but as the `finished` method uses a mutex we added
+     * this as an early exit. */
+ return DEVICE_KERNEL_USING_FEATURE_KERNEL;
+ }
+
+ bool other_kernels_finished = load_kernel_task_pool.finished();
+ if (use_preview_kernels) {
+ if (other_kernels_finished) {
+ return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE;
+ }
+ else {
+ return DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL;
+ }
+ }
+ else {
+ if (other_kernels_finished) {
+ return DEVICE_KERNEL_USING_FEATURE_KERNEL;
+ }
+ else {
+ return DEVICE_KERNEL_FEATURE_KERNEL_INVALID;
+ }
+ }
}
-void OpenCLDevice::mem_alloc(device_memory& mem)
+void OpenCLDevice::mem_alloc(device_memory &mem)
{
- if(mem.name) {
- VLOG(1) << "Buffer allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
- }
-
- size_t size = mem.memory_size();
-
- /* check there is enough memory available for the allocation */
- cl_ulong max_alloc_size = 0;
- clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
-
- if(DebugFlags().opencl.mem_limit) {
- max_alloc_size = min(max_alloc_size,
- cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
- }
-
- if(size > max_alloc_size) {
- string error = "Scene too complex to fit in available memory.";
- if(mem.name != NULL) {
- error += string_printf(" (allocating buffer %s failed.)", mem.name);
- }
- set_error(error);
-
- return;
- }
-
- cl_mem_flags mem_flag;
- void *mem_ptr = NULL;
-
- if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
- mem_flag = CL_MEM_READ_ONLY;
- else
- mem_flag = CL_MEM_READ_WRITE;
-
- /* Zero-size allocation might be invoked by render, but not really
- * supported by OpenCL. Using NULL as device pointer also doesn't really
- * work for some reason, so for the time being we'll use special case
- * will null_mem buffer.
- */
- if(size != 0) {
- mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
- mem_flag,
- size,
- mem_ptr,
- &ciErr);
- opencl_assert_err(ciErr, "clCreateBuffer");
- }
- else {
- mem.device_pointer = null_mem;
- }
-
- stats.mem_alloc(size);
- mem.device_size = size;
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+ }
+
+ size_t size = mem.memory_size();
+
+ /* check there is enough memory available for the allocation */
+ cl_ulong max_alloc_size = 0;
+ clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
+
+ if (DebugFlags().opencl.mem_limit) {
+ max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
+ }
+
+ if (size > max_alloc_size) {
+ string error = "Scene too complex to fit in available memory.";
+ if (mem.name != NULL) {
+ error += string_printf(" (allocating buffer %s failed.)", mem.name);
+ }
+ set_error(error);
+
+ return;
+ }
+
+ cl_mem_flags mem_flag;
+ void *mem_ptr = NULL;
+
+ if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
+ mem_flag = CL_MEM_READ_ONLY;
+ else
+ mem_flag = CL_MEM_READ_WRITE;
+
+  /* Zero-size allocation might be invoked by render, but is not really
+   * supported by OpenCL. Using NULL as the device pointer also doesn't really
+   * work for some reason, so for the time being we special-case this
+   * with the null_mem buffer.
+   */
+ if (size != 0) {
+ mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
+ opencl_assert_err(ciErr, "clCreateBuffer");
+ }
+ else {
+ mem.device_pointer = null_mem;
+ }
+
+ stats.mem_alloc(size);
+ mem.device_size = size;
}
-void OpenCLDevice::mem_copy_to(device_memory& mem)
+void OpenCLDevice::mem_copy_to(device_memory &mem)
{
- if(mem.type == MEM_TEXTURE) {
- tex_free(mem);
- tex_alloc(mem);
- }
- else {
- if(!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- /* this is blocking */
- size_t size = mem.memory_size();
- if(size != 0) {
- opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- 0,
- size,
- mem.host_pointer,
- 0,
- NULL, NULL));
- }
- }
+ if (mem.type == MEM_TEXTURE) {
+ tex_free(mem);
+ tex_alloc(mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ /* this is blocking */
+ size_t size = mem.memory_size();
+ if (size != 0) {
+ opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
+ CL_MEM_PTR(mem.device_pointer),
+ CL_TRUE,
+ 0,
+ size,
+ mem.host_pointer,
+ 0,
+ NULL,
+ NULL));
+ }
+ }
}
-void OpenCLDevice::mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
+void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
{
- size_t offset = elem*y*w;
- size_t size = elem*w*h;
- assert(size != 0);
- opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- offset,
- size,
- (uchar*)mem.host_pointer + offset,
- 0,
- NULL, NULL));
+ size_t offset = elem * y * w;
+ size_t size = elem * w * h;
+ assert(size != 0);
+ opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
+ CL_MEM_PTR(mem.device_pointer),
+ CL_TRUE,
+ offset,
+ size,
+ (uchar *)mem.host_pointer + offset,
+ 0,
+ NULL,
+ NULL));
}
void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size)
{
- base_program.wait_for_availability();
- cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
-
- size_t global_size[] = {1024, 1024};
- size_t num_threads = global_size[0] * global_size[1];
-
- cl_mem d_buffer = CL_MEM_PTR(mem);
- cl_ulong d_offset = 0;
- cl_ulong d_size = 0;
-
- while(d_offset < size) {
- d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
-
- kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
-
- ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
- ckZeroBuffer,
- 2,
- NULL,
- global_size,
- NULL,
- 0,
- NULL,
- NULL);
- opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
-
- d_offset += d_size;
- }
+ base_program.wait_for_availability();
+ cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
+
+ size_t global_size[] = {1024, 1024};
+ size_t num_threads = global_size[0] * global_size[1];
+
+ cl_mem d_buffer = CL_MEM_PTR(mem);
+ cl_ulong d_offset = 0;
+ cl_ulong d_size = 0;
+
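+  /* Zero the buffer in passes; each kernel launch clears at most
+   * num_threads * sizeof(float4) bytes. */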
+ while (d_offset < size) {
+ d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset);
+
+ kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
+
+ ciErr = clEnqueueNDRangeKernel(
+ cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL);
+ opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
+
+ d_offset += d_size;
+ }
}
-void OpenCLDevice::mem_zero(device_memory& mem)
+void OpenCLDevice::mem_zero(device_memory &mem)
{
- if(!mem.device_pointer) {
- mem_alloc(mem);
- }
-
- if(mem.device_pointer) {
- if(base_program.is_loaded()) {
- mem_zero_kernel(mem.device_pointer, mem.memory_size());
- }
-
- if(mem.host_pointer) {
- memset(mem.host_pointer, 0, mem.memory_size());
- }
-
- if(!base_program.is_loaded()) {
- void* zero = mem.host_pointer;
-
- if(!mem.host_pointer) {
- zero = util_aligned_malloc(mem.memory_size(), 16);
- memset(zero, 0, mem.memory_size());
- }
-
- opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
- CL_MEM_PTR(mem.device_pointer),
- CL_TRUE,
- 0,
- mem.memory_size(),
- zero,
- 0,
- NULL, NULL));
-
- if(!mem.host_pointer) {
- util_aligned_free(zero);
- }
- }
- }
+ if (!mem.device_pointer) {
+ mem_alloc(mem);
+ }
+
+ if (mem.device_pointer) {
+ if (base_program.is_loaded()) {
+ mem_zero_kernel(mem.device_pointer, mem.memory_size());
+ }
+
+ if (mem.host_pointer) {
+ memset(mem.host_pointer, 0, mem.memory_size());
+ }
+
+ if (!base_program.is_loaded()) {
+ void *zero = mem.host_pointer;
+
+ if (!mem.host_pointer) {
+ zero = util_aligned_malloc(mem.memory_size(), 16);
+ memset(zero, 0, mem.memory_size());
+ }
+
+ opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
+ CL_MEM_PTR(mem.device_pointer),
+ CL_TRUE,
+ 0,
+ mem.memory_size(),
+ zero,
+ 0,
+ NULL,
+ NULL));
+
+ if (!mem.host_pointer) {
+ util_aligned_free(zero);
+ }
+ }
+ }
}
-void OpenCLDevice::mem_free(device_memory& mem)
+void OpenCLDevice::mem_free(device_memory &mem)
{
- if(mem.type == MEM_TEXTURE) {
- tex_free(mem);
- }
- else {
- if(mem.device_pointer) {
- if(mem.device_pointer != null_mem) {
- opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
- }
- mem.device_pointer = 0;
-
- stats.mem_free(mem.device_size);
- mem.device_size = 0;
- }
- }
+ if (mem.type == MEM_TEXTURE) {
+ tex_free(mem);
+ }
+ else {
+ if (mem.device_pointer) {
+ if (mem.device_pointer != null_mem) {
+ opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
+ }
+ mem.device_pointer = 0;
+
+ stats.mem_free(mem.device_size);
+ mem.device_size = 0;
+ }
+ }
}
int OpenCLDevice::mem_sub_ptr_alignment()
{
- return OpenCLInfo::mem_sub_ptr_alignment(cdDevice);
+ return OpenCLInfo::mem_sub_ptr_alignment(cdDevice);
}
-device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory& mem, int offset, int size)
+device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size)
{
- cl_mem_flags mem_flag;
- if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
- mem_flag = CL_MEM_READ_ONLY;
- else
- mem_flag = CL_MEM_READ_WRITE;
-
- cl_buffer_region info;
- info.origin = mem.memory_elements_size(offset);
- info.size = mem.memory_elements_size(size);
-
- device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
- mem_flag,
- CL_BUFFER_CREATE_TYPE_REGION,
- &info,
- &ciErr);
- opencl_assert_err(ciErr, "clCreateSubBuffer");
- return sub_buf;
+ cl_mem_flags mem_flag;
+ if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
+ mem_flag = CL_MEM_READ_ONLY;
+ else
+ mem_flag = CL_MEM_READ_WRITE;
+
+ cl_buffer_region info;
+ info.origin = mem.memory_elements_size(offset);
+ info.size = mem.memory_elements_size(size);
+
+ device_ptr sub_buf = (device_ptr)clCreateSubBuffer(
+ CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr);
+ opencl_assert_err(ciErr, "clCreateSubBuffer");
+ return sub_buf;
}
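clCreateSubBuffer only accepts region origins that respect the device's base-address alignment, which is what mem_sub_ptr_alignment above exposes. A hedged sketch of the call in isolation (make_sub_buffer is an illustrative name):

#include <CL/cl.h>

/* Create a read-only view of `parent` starting at `offset_bytes`; the offset
 * is assumed to be a multiple of the device's CL_DEVICE_MEM_BASE_ADDR_ALIGN
 * (which the device reports in bits, not bytes). */
static cl_mem make_sub_buffer(cl_mem parent, size_t offset_bytes, size_t size_bytes, cl_int *err)
{
  cl_buffer_region region;
  region.origin = offset_bytes;
  region.size = size_bytes;
  return clCreateSubBuffer(
      parent, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, err);
}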
void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer)
{
- if(device_pointer && device_pointer != null_mem) {
- opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
- }
+ if (device_pointer && device_pointer != null_mem) {
+ opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
+ }
}
void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size)
{
- ConstMemMap::iterator i = const_mem_map.find(name);
- device_vector<uchar> *data;
-
- if(i == const_mem_map.end()) {
- data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
- data->alloc(size);
- const_mem_map.insert(ConstMemMap::value_type(name, data));
- }
- else {
- data = i->second;
- }
-
- memcpy(data->data(), host, size);
- data->copy_to_device();
+ ConstMemMap::iterator i = const_mem_map.find(name);
+ device_vector<uchar> *data;
+
+ if (i == const_mem_map.end()) {
+ data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
+ data->alloc(size);
+ const_mem_map.insert(ConstMemMap::value_type(name, data));
+ }
+ else {
+ data = i->second;
+ }
+
+ memcpy(data->data(), host, size);
+ data->copy_to_device();
}
-void OpenCLDevice::tex_alloc(device_memory& mem)
+void OpenCLDevice::tex_alloc(device_memory &mem)
{
- VLOG(1) << "Texture allocate: " << mem.name << ", "
- << string_human_readable_number(mem.memory_size()) << " bytes. ("
- << string_human_readable_size(mem.memory_size()) << ")";
-
- memory_manager.alloc(mem.name, mem);
- /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */
- mem.device_pointer = 1;
- textures[mem.name] = &mem;
- textures_need_update = true;
+ VLOG(1) << "Texture allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
+
+ memory_manager.alloc(mem.name, mem);
+ /* Set the pointer to non-null to keep code that inspects its value from thinking it's unallocated. */
+ mem.device_pointer = 1;
+ textures[mem.name] = &mem;
+ textures_need_update = true;
}
-void OpenCLDevice::tex_free(device_memory& mem)
+void OpenCLDevice::tex_free(device_memory &mem)
{
- if(mem.device_pointer) {
- mem.device_pointer = 0;
-
- if(memory_manager.free(mem)) {
- textures_need_update = true;
- }
-
- foreach(TexturesMap::value_type& value, textures) {
- if(value.second == &mem) {
- textures.erase(value.first);
- break;
- }
- }
- }
+ if (mem.device_pointer) {
+ mem.device_pointer = 0;
+
+ if (memory_manager.free(mem)) {
+ textures_need_update = true;
+ }
+
+ foreach (TexturesMap::value_type &value, textures) {
+ if (value.second == &mem) {
+ textures.erase(value.first);
+ break;
+ }
+ }
+ }
}
size_t OpenCLDevice::global_size_round_up(int group_size, int global_size)
{
- int r = global_size % group_size;
- return global_size + ((r == 0)? 0: group_size - r);
+ int r = global_size % group_size;
+ return global_size + ((r == 0) ? 0 : group_size - r);
}
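For example, with group_size = 16: a global size of 37 gives r = 37 % 16 = 5 and rounds up to 37 + (16 - 5) = 48, the next multiple of 16, while an exact multiple such as 32 is returned unchanged.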
-void OpenCLDevice::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
+void OpenCLDevice::enqueue_kernel(
+ cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
{
- size_t workgroup_size, max_work_items[3];
-
- clGetKernelWorkGroupInfo(kernel, cdDevice,
- CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
- clGetDeviceInfo(cdDevice,
- CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
-
- if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
- workgroup_size = max_workgroup_size;
- }
-
- /* Try to divide evenly over 2 dimensions. */
- size_t local_size[2];
- if(x_workgroups) {
- local_size[0] = workgroup_size;
- local_size[1] = 1;
- }
- else {
- size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
- local_size[0] = local_size[1] = sqrt_workgroup_size;
- }
-
- /* Some implementations have max size 1 on 2nd dimension. */
- if(local_size[1] > max_work_items[1]) {
- local_size[0] = workgroup_size/max_work_items[1];
- local_size[1] = max_work_items[1];
- }
-
- size_t global_size[2] = {global_size_round_up(local_size[0], w),
- global_size_round_up(local_size[1], h)};
-
- /* Vertical size of 1 is coming from bake/shade kernels where we should
- * not round anything up because otherwise we'll either be doing too
- * much work per pixel (if we don't check global ID on Y axis) or will
- * be checking for global ID to always have Y of 0.
- */
- if(h == 1) {
- global_size[h] = 1;
- }
-
- /* run kernel */
- opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
- opencl_assert(clFlush(cqCommandQueue));
+ size_t workgroup_size, max_work_items[3];
+
+ clGetKernelWorkGroupInfo(
+ kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
+ clGetDeviceInfo(
+ cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL);
+
+ if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
+ workgroup_size = max_workgroup_size;
+ }
+
+ /* Try to divide evenly over 2 dimensions. */
+ size_t local_size[2];
+ if (x_workgroups) {
+ local_size[0] = workgroup_size;
+ local_size[1] = 1;
+ }
+ else {
+ size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
+ local_size[0] = local_size[1] = sqrt_workgroup_size;
+ }
+
+ /* Some implementations have max size 1 on 2nd dimension. */
+ if (local_size[1] > max_work_items[1]) {
+ local_size[0] = workgroup_size / max_work_items[1];
+ local_size[1] = max_work_items[1];
+ }
+
+ size_t global_size[2] = {global_size_round_up(local_size[0], w),
+ global_size_round_up(local_size[1], h)};
+
+ /* A vertical size of 1 comes from the bake/shade kernels, where we should
+ * not round anything up: otherwise we would either do too much work per
+ * pixel (if the global ID on the Y axis is not checked) or check the
+ * global ID only to find that Y is always 0.
+ */
+ if (h == 1) {
+ global_size[1] = 1;
+ }

+
+ /* run kernel */
+ opencl_assert(
+ clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
+ opencl_assert(clFlush(cqCommandQueue));
}
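To make the sizing concrete: on a device reporting a kernel work-group size of 256 and generous max_work_items, the square split gives local_size = {16, 16}; on an implementation whose second dimension is capped at 1, the fixup turns that into local_size = {256, 1}, and the global size is then rounded up to multiples of whichever local size was chosen.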
void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
{
- cl_mem ptr;
-
- MemMap::iterator i = mem_map.find(name);
- if(i != mem_map.end()) {
- ptr = CL_MEM_PTR(i->second);
- }
- else {
- /* work around NULL not working, even though the spec says otherwise */
- ptr = CL_MEM_PTR(null_mem);
- }
-
- opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
+ cl_mem ptr;
+
+ MemMap::iterator i = mem_map.find(name);
+ if (i != mem_map.end()) {
+ ptr = CL_MEM_PTR(i->second);
+ }
+ else {
+ /* work around NULL not working, even though the spec says otherwise */
+ ptr = CL_MEM_PTR(null_mem);
+ }
+
+ opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr));
}
void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
{
- flush_texture_buffers();
+ flush_texture_buffers();
- memory_manager.set_kernel_arg_buffers(kernel, narg);
+ memory_manager.set_kernel_arg_buffers(kernel, narg);
}
void OpenCLDevice::flush_texture_buffers()
{
- if(!textures_need_update) {
- return;
- }
- textures_need_update = false;
-
- /* Setup slots for textures. */
- int num_slots = 0;
-
- vector<texture_slot_t> texture_slots;
-
-#define KERNEL_TEX(type, name) \
- if(textures.find(#name) != textures.end()) { \
- texture_slots.push_back(texture_slot_t(#name, num_slots)); \
- } \
- num_slots++;
-#include "kernel/kernel_textures.h"
-
- int num_data_slots = num_slots;
-
- foreach(TexturesMap::value_type& tex, textures) {
- string name = tex.first;
-
- if(string_startswith(name, "__tex_image")) {
- int pos = name.rfind("_");
- int id = atoi(name.data() + pos + 1);
- texture_slots.push_back(texture_slot_t(name,
- num_data_slots + id));
- num_slots = max(num_slots, num_data_slots + id + 1);
- }
- }
-
- /* Realloc texture descriptors buffer. */
- memory_manager.free(texture_info);
- texture_info.resize(num_slots);
- memory_manager.alloc("texture_info", texture_info);
-
- /* Fill in descriptors */
- foreach(texture_slot_t& slot, texture_slots) {
- TextureInfo& info = texture_info[slot.slot];
-
- MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
- info.data = desc.offset;
- info.cl_buffer = desc.device_buffer;
-
- if(string_startswith(slot.name, "__tex_image")) {
- device_memory *mem = textures[slot.name];
-
- info.width = mem->data_width;
- info.height = mem->data_height;
- info.depth = mem->data_depth;
-
- info.interpolation = mem->interpolation;
- info.extension = mem->extension;
- }
- }
-
- /* Force write of descriptors. */
- memory_manager.free(texture_info);
- memory_manager.alloc("texture_info", texture_info);
-}
+ if (!textures_need_update) {
+ return;
+ }
+ textures_need_update = false;
+
+ /* Set up slots for textures. */
+ int num_slots = 0;
+
+ vector<texture_slot_t> texture_slots;
+
+# define KERNEL_TEX(type, name) \
+ if (textures.find(#name) != textures.end()) { \
+ texture_slots.push_back(texture_slot_t(#name, num_slots)); \
+ } \
+ num_slots++;
+# include "kernel/kernel_textures.h"
+
+ int num_data_slots = num_slots;
+
+ foreach (TexturesMap::value_type &tex, textures) {
+ string name = tex.first;
+
+ if (string_startswith(name, "__tex_image")) {
+ int pos = name.rfind("_");
+ int id = atoi(name.data() + pos + 1);
+ texture_slots.push_back(texture_slot_t(name, num_data_slots + id));
+ num_slots = max(num_slots, num_data_slots + id + 1);
+ }
+ }
+
+ /* Realloc texture descriptors buffer. */
+ memory_manager.free(texture_info);
+ texture_info.resize(num_slots);
+ memory_manager.alloc("texture_info", texture_info);
+
+ /* Fill in descriptors */
+ foreach (texture_slot_t &slot, texture_slots) {
+ TextureInfo &info = texture_info[slot.slot];
+
+ MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
+ info.data = desc.offset;
+ info.cl_buffer = desc.device_buffer;
+ if (string_startswith(slot.name, "__tex_image")) {
+ device_memory *mem = textures[slot.name];
+
+ info.width = mem->data_width;
+ info.height = mem->data_height;
+ info.depth = mem->data_depth;
+
+ info.interpolation = mem->interpolation;
+ info.extension = mem->extension;
+ }
+ }
+
+ /* Force write of descriptors. */
+ memory_manager.free(texture_info);
+ memory_manager.alloc("texture_info", texture_info);
+}
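Image-texture slots are recovered from the texture name itself: everything after the last underscore is parsed as an integer id and offset past the fixed data slots. A small self-contained sketch of that parse (the sample name is illustrative):

#include <cstdlib>
#include <string>

/* "__tex_image_float4_007" -> 7; assumes the name ends in an integer suffix
 * preceded by an underscore, as the loop above does. */
static int texture_id_from_name(const std::string &name)
{
  size_t pos = name.rfind('_');
  return atoi(name.c_str() + pos + 1);
}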
void OpenCLDevice::thread_run(DeviceTask *task)
{
- flush_texture_buffers();
-
- if(task->type == DeviceTask::FILM_CONVERT) {
- film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
- }
- else if(task->type == DeviceTask::SHADER) {
- shader(*task);
- }
- else if(task->type == DeviceTask::RENDER) {
- RenderTile tile;
- DenoisingTask denoising(this, *task);
-
- /* Allocate buffer for kernel globals */
- device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
- kgbuffer.alloc_to_device(1);
-
- /* Keep rendering tiles until done. */
- while(task->acquire_tile(this, tile)) {
- if(tile.task == RenderTile::PATH_TRACE) {
- assert(tile.task == RenderTile::PATH_TRACE);
- scoped_timer timer(&tile.buffers->render_time);
-
- split_kernel->path_trace(task,
- tile,
- kgbuffer,
- *const_mem_map["__data"]);
-
- /* Complete kernel execution before release tile. */
- /* This helps in multi-device render;
- * The device that reaches the critical-section function
- * release_tile waits (stalling other devices from entering
- * release_tile) for all kernels to complete. If device1 (a
- * slow-render device) reaches release_tile first then it would
- * stall device2 (a fast-render device) from proceeding to render
- * next tile.
- */
- clFinish(cqCommandQueue);
- }
- else if(tile.task == RenderTile::DENOISE) {
- tile.sample = tile.start_sample + tile.num_samples;
- denoise(tile, denoising);
- task->update_progress(&tile, tile.w*tile.h);
- }
-
- task->release_tile(tile);
- }
-
- kgbuffer.free();
- }
+ flush_texture_buffers();
+
+ if (task->type == DeviceTask::FILM_CONVERT) {
+ film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+ }
+ else if (task->type == DeviceTask::SHADER) {
+ shader(*task);
+ }
+ else if (task->type == DeviceTask::RENDER) {
+ RenderTile tile;
+ DenoisingTask denoising(this, *task);
+
+ /* Allocate buffer for kernel globals */
+ device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
+ kgbuffer.alloc_to_device(1);
+
+ /* Keep rendering tiles until done. */
+ while (task->acquire_tile(this, tile)) {
+ if (tile.task == RenderTile::PATH_TRACE) {
+ assert(tile.task == RenderTile::PATH_TRACE);
+ scoped_timer timer(&tile.buffers->render_time);
+
+ split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]);
+
+ /* Complete kernel execution before releasing the tile. */
+ /* This helps in multi-device renders: the device that reaches the
+ * critical-section function release_tile waits (stalling other devices
+ * from entering release_tile) for all kernels to complete. If a slow
+ * device reached release_tile first, it would stall a fast device from
+ * proceeding to render the next tile.
+ */
+ clFinish(cqCommandQueue);
+ }
+ else if (tile.task == RenderTile::DENOISE) {
+ tile.sample = tile.start_sample + tile.num_samples;
+ denoise(tile, denoising);
+ task->update_progress(&tile, tile.w * tile.h);
+ }
+
+ task->release_tile(tile);
+ }
+
+ kgbuffer.free();
+ }
}
-void OpenCLDevice::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
+void OpenCLDevice::film_convert(DeviceTask &task,
+ device_ptr buffer,
+ device_ptr rgba_byte,
+ device_ptr rgba_half)
{
- /* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
- cl_mem d_buffer = CL_MEM_PTR(buffer);
- cl_int d_x = task.x;
- cl_int d_y = task.y;
- cl_int d_w = task.w;
- cl_int d_h = task.h;
- cl_float d_sample_scale = 1.0f/(task.sample + 1);
- cl_int d_offset = task.offset;
- cl_int d_stride = task.stride;
-
-
- cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float"));
-
- cl_uint start_arg_index =
- kernel_set_args(ckFilmConvertKernel,
- 0,
- d_data,
- d_rgba,
- d_buffer);
-
- set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(ckFilmConvertKernel,
- start_arg_index,
- d_sample_scale,
- d_x,
- d_y,
- d_w,
- d_h,
- d_offset,
- d_stride);
-
- enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
+ /* cast arguments to cl types */
+ cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+ cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half);
+ cl_mem d_buffer = CL_MEM_PTR(buffer);
+ cl_int d_x = task.x;
+ cl_int d_y = task.y;
+ cl_int d_w = task.w;
+ cl_int d_h = task.h;
+ cl_float d_sample_scale = 1.0f / (task.sample + 1);
+ cl_int d_offset = task.offset;
+ cl_int d_stride = task.stride;
+
+ cl_kernel ckFilmConvertKernel = (rgba_byte) ? base_program(ustring("convert_to_byte")) :
+ base_program(ustring("convert_to_half_float"));
+
+ cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer);
+
+ set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
+
+ start_arg_index += kernel_set_args(ckFilmConvertKernel,
+ start_arg_index,
+ d_sample_scale,
+ d_x,
+ d_y,
+ d_w,
+ d_h,
+ d_offset,
+ d_stride);
+
+ enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
}
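Since d_sample_scale is 1.0f / (task.sample + 1), a tile at task.sample = 15 (16 samples accumulated, counting from zero) is scaled by 1/16: the convert kernels average the accumulated radiance while mapping it to byte or half-float output.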
bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr,
@@ -1419,123 +1406,119 @@ bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr,
device_ptr out_ptr,
DenoisingTask *task)
{
- int stride = task->buffer.stride;
- int w = task->buffer.width;
- int h = task->buffer.h;
- int r = task->nlm_state.r;
- int f = task->nlm_state.f;
- float a = task->nlm_state.a;
- float k_2 = task->nlm_state.k_2;
-
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2*r+1)*(2*r+1);
- int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
-
- device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
- device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
- device_sub_ptr weightAccum(task->buffer.temporary_mem, 2*pass_stride*num_shifts, pass_stride);
- cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum);
- cl_mem difference_mem = CL_MEM_PTR(*difference);
- cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
-
- cl_mem image_mem = CL_MEM_PTR(image_ptr);
- cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
- cl_mem out_mem = CL_MEM_PTR(out_ptr);
- cl_mem scale_mem = NULL;
-
- mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride);
- mem_zero_kernel(out_ptr, sizeof(float)*pass_stride);
-
- cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
- cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
- cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
- cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
- cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
-
- kernel_set_args(ckNLMCalcDifference, 0,
- guide_mem,
- variance_mem,
- scale_mem,
- difference_mem,
- w, h, stride,
- pass_stride,
- r, channel_offset,
- 0, a, k_2);
- kernel_set_args(ckNLMBlur, 0,
- difference_mem,
- blurDifference_mem,
- w, h, stride,
- pass_stride,
- r, f);
- kernel_set_args(ckNLMCalcWeight, 0,
- blurDifference_mem,
- difference_mem,
- w, h, stride,
- pass_stride,
- r, f);
- kernel_set_args(ckNLMUpdateOutput, 0,
- blurDifference_mem,
- image_mem,
- out_mem,
- weightAccum_mem,
- w, h, stride,
- pass_stride,
- channel_offset,
- r, f);
-
- enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
- enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
- enqueue_kernel(ckNLMUpdateOutput, w*h, num_shifts, true);
-
- kernel_set_args(ckNLMNormalize, 0,
- out_mem, weightAccum_mem, w, h, stride);
- enqueue_kernel(ckNLMNormalize, w, h);
-
- return true;
+ int stride = task->buffer.stride;
+ int w = task->buffer.width;
+ int h = task->buffer.h;
+ int r = task->nlm_state.r;
+ int f = task->nlm_state.f;
+ float a = task->nlm_state.a;
+ float k_2 = task->nlm_state.k_2;
+
+ int pass_stride = task->buffer.pass_stride;
+ int num_shifts = (2 * r + 1) * (2 * r + 1);
+ int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
+
+ device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
+ device_sub_ptr blurDifference(
+ task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
+ device_sub_ptr weightAccum(
+ task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride);
+ cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum);
+ cl_mem difference_mem = CL_MEM_PTR(*difference);
+ cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
+
+ cl_mem image_mem = CL_MEM_PTR(image_ptr);
+ cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+ cl_mem out_mem = CL_MEM_PTR(out_ptr);
+ cl_mem scale_mem = NULL;
+
+ mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride);
+ mem_zero_kernel(out_ptr, sizeof(float) * pass_stride);
+
+ cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
+ cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
+ cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
+ cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
+ cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
+
+ kernel_set_args(ckNLMCalcDifference,
+ 0,
+ guide_mem,
+ variance_mem,
+ scale_mem,
+ difference_mem,
+ w,
+ h,
+ stride,
+ pass_stride,
+ r,
+ channel_offset,
+ 0,
+ a,
+ k_2);
+ kernel_set_args(
+ ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f);
+ kernel_set_args(
+ ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f);
+ kernel_set_args(ckNLMUpdateOutput,
+ 0,
+ blurDifference_mem,
+ image_mem,
+ out_mem,
+ weightAccum_mem,
+ w,
+ h,
+ stride,
+ pass_stride,
+ channel_offset,
+ r,
+ f);
+
+ enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
+ enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
+ enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
+ enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
+ enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true);
+
+ kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride);
+ enqueue_kernel(ckNLMNormalize, w, h);
+
+ return true;
}
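As a worked example of the buffer carving above: with search radius r = 7, num_shifts = (2 * 7 + 1)^2 = 225, so temporary_mem is sliced into a difference region at offset 0 and a blurDifference region at pass_stride * 225, each pass_stride * 225 elements long, followed by a pass_stride-sized weightAccum region starting at 2 * pass_stride * 225.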
bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task)
{
- cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
- cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- char use_time = task->buffer.use_time? 1 : 0;
-
- cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
-
- int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0,
- buffer_mem,
- tile_info_mem);
- cl_mem buffers[9];
- for(int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterConstructTransform,
- arg_ofs,
- buffers[i]);
- }
- kernel_set_args(ckFilterConstructTransform,
- arg_ofs,
- transform_mem,
- rank_mem,
- task->filter_area,
- task->rect,
- task->buffer.pass_stride,
- task->buffer.frame_stride,
- use_time,
- task->radius,
- task->pca_threshold);
-
- enqueue_kernel(ckFilterConstructTransform,
- task->storage.w,
- task->storage.h,
- 256);
-
- return true;
+ cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+ cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+ cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
+
+ char use_time = task->buffer.use_time ? 1 : 0;
+
+ cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
+
+ int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem);
+ cl_mem buffers[9];
+ for (int i = 0; i < 9; i++) {
+ buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
+ arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]);
+ }
+ kernel_set_args(ckFilterConstructTransform,
+ arg_ofs,
+ transform_mem,
+ rank_mem,
+ task->filter_area,
+ task->rect,
+ task->buffer.pass_stride,
+ task->buffer.frame_stride,
+ use_time,
+ task->radius,
+ task->pca_threshold);
+
+ enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256);
+
+ return true;
}
bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr,
@@ -1544,136 +1527,130 @@ bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr,
int frame,
DenoisingTask *task)
{
- cl_mem color_mem = CL_MEM_PTR(color_ptr);
- cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
- cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
-
- cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
- cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
- cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
-
- cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
- cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
- cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
- cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
- int stride = task->buffer.stride;
- int frame_offset = frame * task->buffer.frame_stride;
- int t = task->tile_info->frames[frame];
- char use_time = task->buffer.use_time? 1 : 0;
-
- int r = task->radius;
- int pass_stride = task->buffer.pass_stride;
- int num_shifts = (2*r+1)*(2*r+1);
-
- device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
- device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
- cl_mem difference_mem = CL_MEM_PTR(*difference);
- cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
-
- kernel_set_args(ckNLMCalcDifference, 0,
- color_mem,
- color_variance_mem,
- scale_mem,
- difference_mem,
- w, h, stride,
- pass_stride,
- r,
- pass_stride,
- frame_offset,
- 1.0f, task->nlm_k_2);
- kernel_set_args(ckNLMBlur, 0,
- difference_mem,
- blurDifference_mem,
- w, h, stride,
- pass_stride,
- r, 4);
- kernel_set_args(ckNLMCalcWeight, 0,
- blurDifference_mem,
- difference_mem,
- w, h, stride,
- pass_stride,
- r, 4);
- kernel_set_args(ckNLMConstructGramian, 0,
- t,
- blurDifference_mem,
- buffer_mem,
- transform_mem,
- rank_mem,
- XtWX_mem,
- XtWY_mem,
- task->reconstruction_state.filter_window,
- w, h, stride,
- pass_stride,
- r, 4,
- frame_offset,
- use_time);
-
- enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
- enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true);
- enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
- enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
-
- return true;
+ cl_mem color_mem = CL_MEM_PTR(color_ptr);
+ cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
+ cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
+
+ cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
+ cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+ cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
+ cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
+
+ cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
+ cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
+ cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
+ cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
+
+ int w = task->reconstruction_state.source_w;
+ int h = task->reconstruction_state.source_h;
+ int stride = task->buffer.stride;
+ int frame_offset = frame * task->buffer.frame_stride;
+ int t = task->tile_info->frames[frame];
+ char use_time = task->buffer.use_time ? 1 : 0;
+
+ int r = task->radius;
+ int pass_stride = task->buffer.pass_stride;
+ int num_shifts = (2 * r + 1) * (2 * r + 1);
+
+ device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
+ device_sub_ptr blurDifference(
+ task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
+ cl_mem difference_mem = CL_MEM_PTR(*difference);
+ cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
+
+ kernel_set_args(ckNLMCalcDifference,
+ 0,
+ color_mem,
+ color_variance_mem,
+ scale_mem,
+ difference_mem,
+ w,
+ h,
+ stride,
+ pass_stride,
+ r,
+ pass_stride,
+ frame_offset,
+ 1.0f,
+ task->nlm_k_2);
+ kernel_set_args(
+ ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4);
+ kernel_set_args(
+ ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4);
+ kernel_set_args(ckNLMConstructGramian,
+ 0,
+ t,
+ blurDifference_mem,
+ buffer_mem,
+ transform_mem,
+ rank_mem,
+ XtWX_mem,
+ XtWY_mem,
+ task->reconstruction_state.filter_window,
+ w,
+ h,
+ stride,
+ pass_stride,
+ r,
+ 4,
+ frame_offset,
+ use_time);
+
+ enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
+ enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
+ enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
+ enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
+ enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256);
+
+ return true;
}
-bool OpenCLDevice::denoising_solve(device_ptr output_ptr,
- DenoisingTask *task)
+bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
{
- cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
-
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
- cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
- cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
- cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
-
- int w = task->reconstruction_state.source_w;
- int h = task->reconstruction_state.source_h;
-
- kernel_set_args(ckFinalize, 0,
- output_mem,
- rank_mem,
- XtWX_mem,
- XtWY_mem,
- task->filter_area,
- task->reconstruction_state.buffer_params,
- task->render_buffer.samples);
- enqueue_kernel(ckFinalize, w, h);
-
- return true;
+ cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
+
+ cl_mem output_mem = CL_MEM_PTR(output_ptr);
+ cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
+ cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
+ cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
+
+ int w = task->reconstruction_state.source_w;
+ int h = task->reconstruction_state.source_h;
+
+ kernel_set_args(ckFinalize,
+ 0,
+ output_mem,
+ rank_mem,
+ XtWX_mem,
+ XtWY_mem,
+ task->filter_area,
+ task->reconstruction_state.buffer_params,
+ task->render_buffer.samples);
+ enqueue_kernel(ckFinalize, w, h);
+
+ return true;
}
bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr mean_ptr,
device_ptr variance_ptr,
- int r, int4 rect,
+ int r,
+ int4 rect,
DenoisingTask *task)
{
- cl_mem a_mem = CL_MEM_PTR(a_ptr);
- cl_mem b_mem = CL_MEM_PTR(b_ptr);
- cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
-
- cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
-
- kernel_set_args(ckFilterCombineHalves, 0,
- mean_mem,
- variance_mem,
- a_mem,
- b_mem,
- rect,
- r);
- enqueue_kernel(ckFilterCombineHalves,
- task->rect.z-task->rect.x,
- task->rect.w-task->rect.y);
-
- return true;
+ cl_mem a_mem = CL_MEM_PTR(a_ptr);
+ cl_mem b_mem = CL_MEM_PTR(b_ptr);
+ cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+
+ cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
+
+ kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r);
+ enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ return true;
}
bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr,
@@ -1683,39 +1660,36 @@ bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr,
device_ptr buffer_variance_ptr,
DenoisingTask *task)
{
- cl_mem a_mem = CL_MEM_PTR(a_ptr);
- cl_mem b_mem = CL_MEM_PTR(b_ptr);
- cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
- cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
- cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
-
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
-
- int arg_ofs = kernel_set_args(ckFilterDivideShadow, 0,
- task->render_buffer.samples,
- tile_info_mem);
- cl_mem buffers[9];
- for(int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs,
- buffers[i]);
- }
- kernel_set_args(ckFilterDivideShadow, arg_ofs,
- a_mem,
- b_mem,
- sample_variance_mem,
- sv_variance_mem,
- buffer_variance_mem,
- task->rect,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- enqueue_kernel(ckFilterDivideShadow,
- task->rect.z-task->rect.x,
- task->rect.w-task->rect.y);
-
- return true;
+ cl_mem a_mem = CL_MEM_PTR(a_ptr);
+ cl_mem b_mem = CL_MEM_PTR(b_ptr);
+ cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
+ cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
+ cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
+
+ cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
+
+ cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
+
+ int arg_ofs = kernel_set_args(
+ ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem);
+ cl_mem buffers[9];
+ for (int i = 0; i < 9; i++) {
+ buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
+ arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]);
+ }
+ kernel_set_args(ckFilterDivideShadow,
+ arg_ofs,
+ a_mem,
+ b_mem,
+ sample_variance_mem,
+ sv_variance_mem,
+ buffer_variance_mem,
+ task->rect,
+ task->render_buffer.pass_stride,
+ task->render_buffer.offset);
+ enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ return true;
}
bool OpenCLDevice::denoising_get_feature(int mean_offset,
@@ -1725,36 +1699,32 @@ bool OpenCLDevice::denoising_get_feature(int mean_offset,
float scale,
DenoisingTask *task)
{
- cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
-
- cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
- cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
-
- int arg_ofs = kernel_set_args(ckFilterGetFeature, 0,
- task->render_buffer.samples,
- tile_info_mem);
- cl_mem buffers[9];
- for(int i = 0; i < 9; i++) {
- buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
- arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs,
- buffers[i]);
- }
- kernel_set_args(ckFilterGetFeature, arg_ofs,
- mean_offset,
- variance_offset,
- mean_mem,
- variance_mem,
- scale,
- task->rect,
- task->render_buffer.pass_stride,
- task->render_buffer.offset);
- enqueue_kernel(ckFilterGetFeature,
- task->rect.z-task->rect.x,
- task->rect.w-task->rect.y);
-
- return true;
+ cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+
+ cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
+
+ cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
+
+ int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem);
+ cl_mem buffers[9];
+ for (int i = 0; i < 9; i++) {
+ buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
+ arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]);
+ }
+ kernel_set_args(ckFilterGetFeature,
+ arg_ofs,
+ mean_offset,
+ variance_offset,
+ mean_mem,
+ variance_mem,
+ scale,
+ task->rect,
+ task->render_buffer.pass_stride,
+ task->render_buffer.offset);
+ enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ return true;
}
bool OpenCLDevice::denoising_write_feature(int out_offset,
@@ -1762,24 +1732,23 @@ bool OpenCLDevice::denoising_write_feature(int out_offset,
device_ptr buffer_ptr,
DenoisingTask *task)
{
- cl_mem from_mem = CL_MEM_PTR(from_ptr);
- cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
-
- cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
-
- kernel_set_args(ckFilterWriteFeature, 0,
- task->render_buffer.samples,
- task->reconstruction_state.buffer_params,
- task->filter_area,
- from_mem,
- buffer_mem,
- out_offset,
- task->rect);
- enqueue_kernel(ckFilterWriteFeature,
- task->filter_area.z,
- task->filter_area.w);
-
- return true;
+ cl_mem from_mem = CL_MEM_PTR(from_ptr);
+ cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
+
+ cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
+
+ kernel_set_args(ckFilterWriteFeature,
+ 0,
+ task->render_buffer.samples,
+ task->reconstruction_state.buffer_params,
+ task->filter_area,
+ from_mem,
+ buffer_mem,
+ out_offset,
+ task->rect);
+ enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w);
+
+ return true;
}
bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr,
@@ -1788,155 +1757,155 @@ bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr,
device_ptr output_ptr,
DenoisingTask *task)
{
- cl_mem image_mem = CL_MEM_PTR(image_ptr);
- cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
- cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
- cl_mem output_mem = CL_MEM_PTR(output_ptr);
-
- cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
-
- kernel_set_args(ckFilterDetectOutliers, 0,
- image_mem,
- variance_mem,
- depth_mem,
- output_mem,
- task->rect,
- task->buffer.pass_stride);
- enqueue_kernel(ckFilterDetectOutliers,
- task->rect.z-task->rect.x,
- task->rect.w-task->rect.y);
-
- return true;
+ cl_mem image_mem = CL_MEM_PTR(image_ptr);
+ cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
+ cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
+ cl_mem output_mem = CL_MEM_PTR(output_ptr);
+
+ cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
+
+ kernel_set_args(ckFilterDetectOutliers,
+ 0,
+ image_mem,
+ variance_mem,
+ depth_mem,
+ output_mem,
+ task->rect,
+ task->buffer.pass_stride);
+ enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+ return true;
}
-void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask& denoising)
+void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
{
- denoising.functions.construct_transform = function_bind(&OpenCLDevice::denoising_construct_transform, this, &denoising);
- denoising.functions.accumulate = function_bind(&OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
- denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising);
- denoising.functions.divide_shadow = function_bind(&OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.non_local_means = function_bind(&OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
- denoising.functions.combine_halves = function_bind(&OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
- denoising.functions.get_feature = function_bind(&OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
- denoising.functions.write_feature = function_bind(&OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
- denoising.functions.detect_outliers = function_bind(&OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
- denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
- denoising.render_buffer.samples = rtile.sample;
- denoising.buffer.gpu_temporary_mem = true;
-
- denoising.run_denoising(&rtile);
+ denoising.functions.construct_transform = function_bind(
+ &OpenCLDevice::denoising_construct_transform, this, &denoising);
+ denoising.functions.accumulate = function_bind(
+ &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising);
+ denoising.functions.divide_shadow = function_bind(
+ &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.non_local_means = function_bind(
+ &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+ denoising.functions.combine_halves = function_bind(
+ &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+ denoising.functions.get_feature = function_bind(
+ &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+ denoising.functions.write_feature = function_bind(
+ &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
+ denoising.functions.detect_outliers = function_bind(
+ &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+
+ denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+ denoising.render_buffer.samples = rtile.sample;
+ denoising.buffer.gpu_temporary_mem = true;
+
+ denoising.run_denoising(&rtile);
}
-void OpenCLDevice::shader(DeviceTask& task)
+void OpenCLDevice::shader(DeviceTask &task)
{
- /* cast arguments to cl types */
- cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
- cl_mem d_input = CL_MEM_PTR(task.shader_input);
- cl_mem d_output = CL_MEM_PTR(task.shader_output);
- cl_int d_shader_eval_type = task.shader_eval_type;
- cl_int d_shader_filter = task.shader_filter;
- cl_int d_shader_x = task.shader_x;
- cl_int d_shader_w = task.shader_w;
- cl_int d_offset = task.offset;
-
- OpenCLDevice::OpenCLProgram *program = &background_program;
- if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
- program = &bake_program;
- }
- else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
- program = &displace_program;
- }
- program->wait_for_availability();
- cl_kernel kernel = (*program)();
-
- cl_uint start_arg_index =
- kernel_set_args(kernel,
- 0,
- d_data,
- d_input,
- d_output);
-
- set_kernel_arg_buffers(kernel, &start_arg_index);
-
- start_arg_index += kernel_set_args(kernel,
- start_arg_index,
- d_shader_eval_type);
- if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
- start_arg_index += kernel_set_args(kernel,
- start_arg_index,
- d_shader_filter);
- }
- start_arg_index += kernel_set_args(kernel,
- start_arg_index,
- d_shader_x,
- d_shader_w,
- d_offset);
-
- for(int sample = 0; sample < task.num_samples; sample++) {
-
- if(task.get_cancel())
- break;
-
- kernel_set_args(kernel, start_arg_index, sample);
-
- enqueue_kernel(kernel, task.shader_w, 1);
-
- clFinish(cqCommandQueue);
-
- task.update_progress(NULL);
- }
+ /* cast arguments to cl types */
+ cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+ cl_mem d_input = CL_MEM_PTR(task.shader_input);
+ cl_mem d_output = CL_MEM_PTR(task.shader_output);
+ cl_int d_shader_eval_type = task.shader_eval_type;
+ cl_int d_shader_filter = task.shader_filter;
+ cl_int d_shader_x = task.shader_x;
+ cl_int d_shader_w = task.shader_w;
+ cl_int d_offset = task.offset;
+
+ OpenCLDevice::OpenCLProgram *program = &background_program;
+ if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
+ program = &bake_program;
+ }
+ else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+ program = &displace_program;
+ }
+ program->wait_for_availability();
+ cl_kernel kernel = (*program)();
+
+ cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output);
+
+ set_kernel_arg_buffers(kernel, &start_arg_index);
+
+ start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type);
+ if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
+ start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter);
+ }
+ start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset);
+
+ for (int sample = 0; sample < task.num_samples; sample++) {
+
+ if (task.get_cancel())
+ break;
+
+ kernel_set_args(kernel, start_arg_index, sample);
+
+ enqueue_kernel(kernel, task.shader_w, 1);
+
+ clFinish(cqCommandQueue);
+
+ task.update_progress(NULL);
+ }
}
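Enqueueing one sample at a time with a clFinish after each keeps this loop responsive: task.get_cancel() is re-checked between samples and update_progress gets one tick per sample, at the cost of a host-device synchronization per iteration.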
string OpenCLDevice::kernel_build_options(const string *debug_src)
{
- string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
-
- if(platform_name == "NVIDIA CUDA") {
- build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
- "-cl-nv-maxrregcount=32 "
- "-cl-nv-verbose ";
-
- uint compute_capability_major, compute_capability_minor;
- clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
- sizeof(cl_uint), &compute_capability_major, NULL);
- clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
- sizeof(cl_uint), &compute_capability_minor, NULL);
-
- build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
- compute_capability_major * 100 +
- compute_capability_minor * 10);
- }
-
- else if(platform_name == "Apple")
- build_options += "-D__KERNEL_OPENCL_APPLE__ ";
-
- else if(platform_name == "AMD Accelerated Parallel Processing")
- build_options += "-D__KERNEL_OPENCL_AMD__ ";
-
- else if(platform_name == "Intel(R) OpenCL") {
- build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
-
- /* Options for gdb source level kernel debugging.
- * this segfaults on linux currently.
- */
- if(OpenCLInfo::use_debug() && debug_src)
- build_options += "-g -s \"" + *debug_src + "\" ";
- }
-
- if(info.has_half_images) {
- build_options += "-D__KERNEL_CL_KHR_FP16__ ";
- }
-
- if(OpenCLInfo::use_debug()) {
- build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
- }
-
-#ifdef WITH_CYCLES_DEBUG
- build_options += "-D__KERNEL_DEBUG__ ";
-#endif
-
- return build_options;
+ string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
+
+ if (platform_name == "NVIDIA CUDA") {
+ build_options +=
+ "-D__KERNEL_OPENCL_NVIDIA__ "
+ "-cl-nv-maxrregcount=32 "
+ "-cl-nv-verbose ";
+
+ uint compute_capability_major, compute_capability_minor;
+ clGetDeviceInfo(cdDevice,
+ CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
+ sizeof(cl_uint),
+ &compute_capability_major,
+ NULL);
+ clGetDeviceInfo(cdDevice,
+ CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
+ sizeof(cl_uint),
+ &compute_capability_minor,
+ NULL);
+
+ build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
+ compute_capability_major * 100 + compute_capability_minor * 10);
+ }
+
+ else if (platform_name == "Apple")
+ build_options += "-D__KERNEL_OPENCL_APPLE__ ";
+
+ else if (platform_name == "AMD Accelerated Parallel Processing")
+ build_options += "-D__KERNEL_OPENCL_AMD__ ";
+
+ else if (platform_name == "Intel(R) OpenCL") {
+ build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
+
+ /* Options for GDB source-level kernel debugging.
+ * This currently segfaults on Linux.
+ */
+ if (OpenCLInfo::use_debug() && debug_src)
+ build_options += "-g -s \"" + *debug_src + "\" ";
+ }
+
+ if (info.has_half_images) {
+ build_options += "-D__KERNEL_CL_KHR_FP16__ ";
+ }
+
+ if (OpenCLInfo::use_debug()) {
+ build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
+ }
+
+# ifdef WITH_CYCLES_DEBUG
+ build_options += "-D__KERNEL_DEBUG__ ";
+# endif
+
+ return build_options;
}
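The capability encoding is major * 100 + minor * 10, so a compute-capability 6.1 device (a GTX 1080, for example) is built with -D__COMPUTE_CAPABILITY__=610.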
/* TODO(sergey): In the future we can use variadic templates, once
@@ -1944,137 +1913,130 @@ string OpenCLDevice::kernel_build_options(const string *debug_src)
*/
int OpenCLDevice::kernel_set_args(cl_kernel kernel,
int start_argument_index,
- const ArgumentWrapper& arg1,
- const ArgumentWrapper& arg2,
- const ArgumentWrapper& arg3,
- const ArgumentWrapper& arg4,
- const ArgumentWrapper& arg5,
- const ArgumentWrapper& arg6,
- const ArgumentWrapper& arg7,
- const ArgumentWrapper& arg8,
- const ArgumentWrapper& arg9,
- const ArgumentWrapper& arg10,
- const ArgumentWrapper& arg11,
- const ArgumentWrapper& arg12,
- const ArgumentWrapper& arg13,
- const ArgumentWrapper& arg14,
- const ArgumentWrapper& arg15,
- const ArgumentWrapper& arg16,
- const ArgumentWrapper& arg17,
- const ArgumentWrapper& arg18,
- const ArgumentWrapper& arg19,
- const ArgumentWrapper& arg20,
- const ArgumentWrapper& arg21,
- const ArgumentWrapper& arg22,
- const ArgumentWrapper& arg23,
- const ArgumentWrapper& arg24,
- const ArgumentWrapper& arg25,
- const ArgumentWrapper& arg26,
- const ArgumentWrapper& arg27,
- const ArgumentWrapper& arg28,
- const ArgumentWrapper& arg29,
- const ArgumentWrapper& arg30,
- const ArgumentWrapper& arg31,
- const ArgumentWrapper& arg32,
- const ArgumentWrapper& arg33)
+ const ArgumentWrapper &arg1,
+ const ArgumentWrapper &arg2,
+ const ArgumentWrapper &arg3,
+ const ArgumentWrapper &arg4,
+ const ArgumentWrapper &arg5,
+ const ArgumentWrapper &arg6,
+ const ArgumentWrapper &arg7,
+ const ArgumentWrapper &arg8,
+ const ArgumentWrapper &arg9,
+ const ArgumentWrapper &arg10,
+ const ArgumentWrapper &arg11,
+ const ArgumentWrapper &arg12,
+ const ArgumentWrapper &arg13,
+ const ArgumentWrapper &arg14,
+ const ArgumentWrapper &arg15,
+ const ArgumentWrapper &arg16,
+ const ArgumentWrapper &arg17,
+ const ArgumentWrapper &arg18,
+ const ArgumentWrapper &arg19,
+ const ArgumentWrapper &arg20,
+ const ArgumentWrapper &arg21,
+ const ArgumentWrapper &arg22,
+ const ArgumentWrapper &arg23,
+ const ArgumentWrapper &arg24,
+ const ArgumentWrapper &arg25,
+ const ArgumentWrapper &arg26,
+ const ArgumentWrapper &arg27,
+ const ArgumentWrapper &arg28,
+ const ArgumentWrapper &arg29,
+ const ArgumentWrapper &arg30,
+ const ArgumentWrapper &arg31,
+ const ArgumentWrapper &arg32,
+ const ArgumentWrapper &arg33)
{
- int current_arg_index = 0;
-#define FAKE_VARARG_HANDLE_ARG(arg) \
- do { \
- if(arg.pointer != NULL) { \
- opencl_assert(clSetKernelArg( \
- kernel, \
- start_argument_index + current_arg_index, \
- arg.size, arg.pointer)); \
- ++current_arg_index; \
- } \
- else { \
- return current_arg_index; \
- } \
- } while(false)
- FAKE_VARARG_HANDLE_ARG(arg1);
- FAKE_VARARG_HANDLE_ARG(arg2);
- FAKE_VARARG_HANDLE_ARG(arg3);
- FAKE_VARARG_HANDLE_ARG(arg4);
- FAKE_VARARG_HANDLE_ARG(arg5);
- FAKE_VARARG_HANDLE_ARG(arg6);
- FAKE_VARARG_HANDLE_ARG(arg7);
- FAKE_VARARG_HANDLE_ARG(arg8);
- FAKE_VARARG_HANDLE_ARG(arg9);
- FAKE_VARARG_HANDLE_ARG(arg10);
- FAKE_VARARG_HANDLE_ARG(arg11);
- FAKE_VARARG_HANDLE_ARG(arg12);
- FAKE_VARARG_HANDLE_ARG(arg13);
- FAKE_VARARG_HANDLE_ARG(arg14);
- FAKE_VARARG_HANDLE_ARG(arg15);
- FAKE_VARARG_HANDLE_ARG(arg16);
- FAKE_VARARG_HANDLE_ARG(arg17);
- FAKE_VARARG_HANDLE_ARG(arg18);
- FAKE_VARARG_HANDLE_ARG(arg19);
- FAKE_VARARG_HANDLE_ARG(arg20);
- FAKE_VARARG_HANDLE_ARG(arg21);
- FAKE_VARARG_HANDLE_ARG(arg22);
- FAKE_VARARG_HANDLE_ARG(arg23);
- FAKE_VARARG_HANDLE_ARG(arg24);
- FAKE_VARARG_HANDLE_ARG(arg25);
- FAKE_VARARG_HANDLE_ARG(arg26);
- FAKE_VARARG_HANDLE_ARG(arg27);
- FAKE_VARARG_HANDLE_ARG(arg28);
- FAKE_VARARG_HANDLE_ARG(arg29);
- FAKE_VARARG_HANDLE_ARG(arg30);
- FAKE_VARARG_HANDLE_ARG(arg31);
- FAKE_VARARG_HANDLE_ARG(arg32);
- FAKE_VARARG_HANDLE_ARG(arg33);
-#undef FAKE_VARARG_HANDLE_ARG
- return current_arg_index;
+ int current_arg_index = 0;
+# define FAKE_VARARG_HANDLE_ARG(arg) \
+ do { \
+ if (arg.pointer != NULL) { \
+ opencl_assert(clSetKernelArg( \
+ kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \
+ ++current_arg_index; \
+ } \
+ else { \
+ return current_arg_index; \
+ } \
+ } while (false)
+ FAKE_VARARG_HANDLE_ARG(arg1);
+ FAKE_VARARG_HANDLE_ARG(arg2);
+ FAKE_VARARG_HANDLE_ARG(arg3);
+ FAKE_VARARG_HANDLE_ARG(arg4);
+ FAKE_VARARG_HANDLE_ARG(arg5);
+ FAKE_VARARG_HANDLE_ARG(arg6);
+ FAKE_VARARG_HANDLE_ARG(arg7);
+ FAKE_VARARG_HANDLE_ARG(arg8);
+ FAKE_VARARG_HANDLE_ARG(arg9);
+ FAKE_VARARG_HANDLE_ARG(arg10);
+ FAKE_VARARG_HANDLE_ARG(arg11);
+ FAKE_VARARG_HANDLE_ARG(arg12);
+ FAKE_VARARG_HANDLE_ARG(arg13);
+ FAKE_VARARG_HANDLE_ARG(arg14);
+ FAKE_VARARG_HANDLE_ARG(arg15);
+ FAKE_VARARG_HANDLE_ARG(arg16);
+ FAKE_VARARG_HANDLE_ARG(arg17);
+ FAKE_VARARG_HANDLE_ARG(arg18);
+ FAKE_VARARG_HANDLE_ARG(arg19);
+ FAKE_VARARG_HANDLE_ARG(arg20);
+ FAKE_VARARG_HANDLE_ARG(arg21);
+ FAKE_VARARG_HANDLE_ARG(arg22);
+ FAKE_VARARG_HANDLE_ARG(arg23);
+ FAKE_VARARG_HANDLE_ARG(arg24);
+ FAKE_VARARG_HANDLE_ARG(arg25);
+ FAKE_VARARG_HANDLE_ARG(arg26);
+ FAKE_VARARG_HANDLE_ARG(arg27);
+ FAKE_VARARG_HANDLE_ARG(arg28);
+ FAKE_VARARG_HANDLE_ARG(arg29);
+ FAKE_VARARG_HANDLE_ARG(arg30);
+ FAKE_VARARG_HANDLE_ARG(arg31);
+ FAKE_VARARG_HANDLE_ARG(arg32);
+ FAKE_VARARG_HANDLE_ARG(arg33);
+# undef FAKE_VARARG_HANDLE_ARG
+ return current_arg_index;
}
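The TODO above this function mentions variadic templates; a minimal sketch of what the fake-varargs ladder could collapse into, assuming C++11 and leaving out the NULL-sentinel early-out that ArgumentWrapper provides:

#include <CL/cl.h>

static int set_args_impl(cl_kernel /*kernel*/, int /*index*/)
{
  return 0; /* base case: no arguments left to set */
}

template<typename T, typename... Rest>
static int set_args_impl(cl_kernel kernel, int index, const T &arg, const Rest &... rest)
{
  clSetKernelArg(kernel, index, sizeof(T), &arg);
  return 1 + set_args_impl(kernel, index + 1, rest...);
}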
void OpenCLDevice::release_kernel_safe(cl_kernel kernel)
{
- if(kernel) {
- clReleaseKernel(kernel);
- }
+ if (kernel) {
+ clReleaseKernel(kernel);
+ }
}
void OpenCLDevice::release_mem_object_safe(cl_mem mem)
{
- if(mem != NULL) {
- clReleaseMemObject(mem);
- }
+ if (mem != NULL) {
+ clReleaseMemObject(mem);
+ }
}
void OpenCLDevice::release_program_safe(cl_program program)
{
- if(program) {
- clReleaseProgram(program);
- }
+ if (program) {
+ clReleaseProgram(program);
+ }
}
/* ** Those guys are for working around some compiler-specific bugs ** */
-cl_program OpenCLDevice::load_cached_kernel(ustring key,
- thread_scoped_lock& cache_locker)
+cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker)
{
- return OpenCLCache::get_program(cpPlatform,
- cdDevice,
- key,
- cache_locker);
+ return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker);
}
void OpenCLDevice::store_cached_kernel(cl_program program,
ustring key,
- thread_scoped_lock& cache_locker)
+ thread_scoped_lock &cache_locker)
{
- OpenCLCache::store_program(cpPlatform,
- cdDevice,
- program,
- key,
- cache_locker);
+ OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker);
}
-Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background)
+Device *opencl_create_split_device(DeviceInfo &info,
+ Stats &stats,
+ Profiler &profiler,
+ bool background)
{
- return new OpenCLDevice(info, stats, profiler, background);
+ return new OpenCLDevice(info, stats, profiler, background);
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp
index 5a1e12af8ab..cc40ad42b06 100644
--- a/intern/cycles/device/opencl/opencl_util.cpp
+++ b/intern/cycles/device/opencl/opencl_util.cpp
@@ -16,1059 +16,1017 @@
#ifdef WITH_OPENCL
-#include "device/opencl/opencl.h"
-#include "device/device_intern.h"
+# include "device/opencl/opencl.h"
+# include "device/device_intern.h"
-#include "util/util_debug.h"
-#include "util/util_logging.h"
-#include "util/util_md5.h"
-#include "util/util_path.h"
-#include "util/util_time.h"
-#include "util/util_system.h"
+# include "util/util_debug.h"
+# include "util/util_logging.h"
+# include "util/util_md5.h"
+# include "util/util_path.h"
+# include "util/util_time.h"
+# include "util/util_system.h"
using std::cerr;
using std::endl;
CCL_NAMESPACE_BEGIN
-OpenCLCache::Slot::ProgramEntry::ProgramEntry()
- : program(NULL),
- mutex(NULL)
+OpenCLCache::Slot::ProgramEntry::ProgramEntry() : program(NULL), mutex(NULL)
{
}
-OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry& rhs)
- : program(rhs.program),
- mutex(NULL)
+OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry &rhs)
+ : program(rhs.program), mutex(NULL)
{
}
OpenCLCache::Slot::ProgramEntry::~ProgramEntry()
{
- delete mutex;
+ delete mutex;
}
-OpenCLCache::Slot::Slot()
- : context_mutex(NULL),
- context(NULL)
+OpenCLCache::Slot::Slot() : context_mutex(NULL), context(NULL)
{
}
-OpenCLCache::Slot::Slot(const Slot& rhs)
- : context_mutex(NULL),
- context(NULL),
- programs(rhs.programs)
+OpenCLCache::Slot::Slot(const Slot &rhs)
+ : context_mutex(NULL), context(NULL), programs(rhs.programs)
{
}
OpenCLCache::Slot::~Slot()
{
- delete context_mutex;
+ delete context_mutex;
}
-OpenCLCache& OpenCLCache::global_instance()
+OpenCLCache &OpenCLCache::global_instance()
{
- static OpenCLCache instance;
- return instance;
+ static OpenCLCache instance;
+ return instance;
}
cl_context OpenCLCache::get_context(cl_platform_id platform,
cl_device_id device,
- thread_scoped_lock& slot_locker)
+ thread_scoped_lock &slot_locker)
{
- assert(platform != NULL);
+ assert(platform != NULL);
- OpenCLCache& self = global_instance();
+ OpenCLCache &self = global_instance();
- thread_scoped_lock cache_lock(self.cache_lock);
+ thread_scoped_lock cache_lock(self.cache_lock);
- pair<CacheMap::iterator,bool> ins = self.cache.insert(
- CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
+ pair<CacheMap::iterator, bool> ins = self.cache.insert(
+ CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
- Slot &slot = ins.first->second;
+ Slot &slot = ins.first->second;
- /* create slot lock only while holding cache lock */
- if(!slot.context_mutex)
- slot.context_mutex = new thread_mutex;
+ /* create slot lock only while holding cache lock */
+ if (!slot.context_mutex)
+ slot.context_mutex = new thread_mutex;
- /* need to unlock cache before locking slot, to allow store to complete */
- cache_lock.unlock();
+ /* need to unlock cache before locking slot, to allow store to complete */
+ cache_lock.unlock();
- /* lock the slot */
- slot_locker = thread_scoped_lock(*slot.context_mutex);
+ /* lock the slot */
+ slot_locker = thread_scoped_lock(*slot.context_mutex);
- /* If the thing isn't cached */
- if(slot.context == NULL) {
- /* return with the caller's lock holder holding the slot lock */
- return NULL;
- }
+ /* If the thing isn't cached */
+ if (slot.context == NULL) {
+ /* return with the caller's lock holder holding the slot lock */
+ return NULL;
+ }
- /* the item was already cached, release the slot lock */
- slot_locker.unlock();
+ /* the item was already cached, release the slot lock */
+ slot_locker.unlock();
- cl_int ciErr = clRetainContext(slot.context);
- assert(ciErr == CL_SUCCESS);
- (void) ciErr;
+ cl_int ciErr = clRetainContext(slot.context);
+ assert(ciErr == CL_SUCCESS);
+ (void)ciErr;
- return slot.context;
+ return slot.context;
}
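The get_context()/store_context() pair above implements a lock hand-off: on a cache miss the getter returns NULL while the caller's thread_scoped_lock still holds the slot mutex, so exactly one thread creates the context and publishes it. A minimal caller-side sketch of that protocol, assuming an already chosen platform/device pair; `props` and the omitted error handling are placeholders:

  thread_scoped_lock slot_locker;
  cl_context context = OpenCLCache::get_context(platform, device, slot_locker);
  if (context == NULL) {
    /* Cache miss: the slot lock is still held, so no other thread can race
     * the creation. */
    cl_int err;
    context = clCreateContext(props, 1, &device, NULL, NULL, &err);
    /* store_context() publishes the context, retains it and releases the
     * slot lock. */
    OpenCLCache::store_context(platform, device, context, slot_locker);
  }
  /* Both paths leave the caller owning one reference; call
   * clReleaseContext(context) when finished. */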
cl_program OpenCLCache::get_program(cl_platform_id platform,
cl_device_id device,
ustring key,
- thread_scoped_lock& slot_locker)
+ thread_scoped_lock &slot_locker)
{
- assert(platform != NULL);
+ assert(platform != NULL);
- OpenCLCache& self = global_instance();
+ OpenCLCache &self = global_instance();
- thread_scoped_lock cache_lock(self.cache_lock);
+ thread_scoped_lock cache_lock(self.cache_lock);
- pair<CacheMap::iterator,bool> ins = self.cache.insert(
- CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
+ pair<CacheMap::iterator, bool> ins = self.cache.insert(
+ CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
- Slot &slot = ins.first->second;
+ Slot &slot = ins.first->second;
- pair<Slot::EntryMap::iterator,bool> ins2 = slot.programs.insert(
- Slot::EntryMap::value_type(key, Slot::ProgramEntry()));
+ pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert(
+ Slot::EntryMap::value_type(key, Slot::ProgramEntry()));
- Slot::ProgramEntry &entry = ins2.first->second;
+ Slot::ProgramEntry &entry = ins2.first->second;
- /* create slot lock only while holding cache lock */
- if(!entry.mutex)
- entry.mutex = new thread_mutex;
+ /* create slot lock only while holding cache lock */
+ if (!entry.mutex)
+ entry.mutex = new thread_mutex;
- /* need to unlock cache before locking slot, to allow store to complete */
- cache_lock.unlock();
+ /* need to unlock cache before locking slot, to allow store to complete */
+ cache_lock.unlock();
- /* lock the slot */
- slot_locker = thread_scoped_lock(*entry.mutex);
+ /* lock the slot */
+ slot_locker = thread_scoped_lock(*entry.mutex);
- /* If the thing isn't cached */
- if(entry.program == NULL) {
- /* return with the caller's lock holder holding the slot lock */
- return NULL;
- }
+  /* If the program isn't cached yet. */
+ if (entry.program == NULL) {
+ /* return with the caller's lock holder holding the slot lock */
+ return NULL;
+ }
- /* the item was already cached, release the slot lock */
- slot_locker.unlock();
+ /* the item was already cached, release the slot lock */
+ slot_locker.unlock();
- cl_int ciErr = clRetainProgram(entry.program);
- assert(ciErr == CL_SUCCESS);
- (void) ciErr;
+ cl_int ciErr = clRetainProgram(entry.program);
+ assert(ciErr == CL_SUCCESS);
+ (void)ciErr;
- return entry.program;
+ return entry.program;
}
void OpenCLCache::store_context(cl_platform_id platform,
cl_device_id device,
cl_context context,
- thread_scoped_lock& slot_locker)
+ thread_scoped_lock &slot_locker)
{
- assert(platform != NULL);
- assert(device != NULL);
- assert(context != NULL);
+ assert(platform != NULL);
+ assert(device != NULL);
+ assert(context != NULL);
- OpenCLCache &self = global_instance();
+ OpenCLCache &self = global_instance();
- thread_scoped_lock cache_lock(self.cache_lock);
- CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
- cache_lock.unlock();
+ thread_scoped_lock cache_lock(self.cache_lock);
+ CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
+ cache_lock.unlock();
- Slot &slot = i->second;
+ Slot &slot = i->second;
- /* sanity check */
- assert(i != self.cache.end());
- assert(slot.context == NULL);
+ /* sanity check */
+ assert(i != self.cache.end());
+ assert(slot.context == NULL);
- slot.context = context;
+ slot.context = context;
- /* unlock the slot */
- slot_locker.unlock();
+ /* unlock the slot */
+ slot_locker.unlock();
- /* increment reference count in OpenCL.
- * The caller is going to release the object when done with it. */
- cl_int ciErr = clRetainContext(context);
- assert(ciErr == CL_SUCCESS);
- (void) ciErr;
+ /* increment reference count in OpenCL.
+ * The caller is going to release the object when done with it. */
+ cl_int ciErr = clRetainContext(context);
+ assert(ciErr == CL_SUCCESS);
+ (void)ciErr;
}
void OpenCLCache::store_program(cl_platform_id platform,
cl_device_id device,
cl_program program,
ustring key,
- thread_scoped_lock& slot_locker)
+ thread_scoped_lock &slot_locker)
{
- assert(platform != NULL);
- assert(device != NULL);
- assert(program != NULL);
+ assert(platform != NULL);
+ assert(device != NULL);
+ assert(program != NULL);
- OpenCLCache &self = global_instance();
+ OpenCLCache &self = global_instance();
- thread_scoped_lock cache_lock(self.cache_lock);
+ thread_scoped_lock cache_lock(self.cache_lock);
- CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
- assert(i != self.cache.end());
- Slot &slot = i->second;
+ CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
+ assert(i != self.cache.end());
+ Slot &slot = i->second;
- Slot::EntryMap::iterator i2 = slot.programs.find(key);
- assert(i2 != slot.programs.end());
- Slot::ProgramEntry &entry = i2->second;
+ Slot::EntryMap::iterator i2 = slot.programs.find(key);
+ assert(i2 != slot.programs.end());
+ Slot::ProgramEntry &entry = i2->second;
- assert(entry.program == NULL);
+ assert(entry.program == NULL);
- cache_lock.unlock();
+ cache_lock.unlock();
- entry.program = program;
+ entry.program = program;
- /* unlock the slot */
- slot_locker.unlock();
+ /* unlock the slot */
+ slot_locker.unlock();
- /* Increment reference count in OpenCL.
- * The caller is going to release the object when done with it.
- */
- cl_int ciErr = clRetainProgram(program);
- assert(ciErr == CL_SUCCESS);
- (void) ciErr;
+ /* Increment reference count in OpenCL.
+ * The caller is going to release the object when done with it.
+ */
+ cl_int ciErr = clRetainProgram(program);
+ assert(ciErr == CL_SUCCESS);
+ (void)ciErr;
}
string OpenCLCache::get_kernel_md5()
{
- OpenCLCache &self = global_instance();
- thread_scoped_lock lock(self.kernel_md5_lock);
+ OpenCLCache &self = global_instance();
+ thread_scoped_lock lock(self.kernel_md5_lock);
- if(self.kernel_md5.empty()) {
- self.kernel_md5 = path_files_md5_hash(path_get("source"));
- }
- return self.kernel_md5;
+ if (self.kernel_md5.empty()) {
+ self.kernel_md5 = path_files_md5_hash(path_get("source"));
+ }
+ return self.kernel_md5;
}
-static string get_program_source(const string& kernel_file)
+static string get_program_source(const string &kernel_file)
{
- string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
- /* We compile kernels consisting of many files. unfortunately OpenCL
- * kernel caches do not seem to recognize changes in included files.
- * so we force recompile on changes by adding the md5 hash of all files.
- */
- source = path_source_replace_includes(source, path_get("source"));
- source += "\n// " + util_md5_string(source) + "\n";
- return source;
+ string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n";
+  /* We compile kernels consisting of many files. Unfortunately, OpenCL
+   * kernel caches do not seem to recognize changes in included files,
+   * so we force a recompile on changes by appending the md5 hash of all files.
+   */
+ source = path_source_replace_includes(source, path_get("source"));
+ source += "\n// " + util_md5_string(source) + "\n";
+ return source;
}
OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device,
- const string& program_name,
- const string& kernel_file,
- const string& kernel_build_options,
- bool use_stdout)
- : device(device),
- program_name(program_name),
- kernel_file(kernel_file),
- kernel_build_options(kernel_build_options),
- use_stdout(use_stdout)
+ const string &program_name,
+ const string &kernel_file,
+ const string &kernel_build_options,
+ bool use_stdout)
+ : device(device),
+ program_name(program_name),
+ kernel_file(kernel_file),
+ kernel_build_options(kernel_build_options),
+ use_stdout(use_stdout)
{
- loaded = false;
- needs_compiling = true;
- program = NULL;
+ loaded = false;
+ needs_compiling = true;
+ program = NULL;
}
OpenCLDevice::OpenCLProgram::~OpenCLProgram()
{
- release();
+ release();
}
void OpenCLDevice::OpenCLProgram::release()
{
- for(map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); ++kernel) {
- if(kernel->second) {
- clReleaseKernel(kernel->second);
- kernel->second = NULL;
- }
- }
- if(program) {
- clReleaseProgram(program);
- program = NULL;
- }
+ for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end();
+ ++kernel) {
+ if (kernel->second) {
+ clReleaseKernel(kernel->second);
+ kernel->second = NULL;
+ }
+ }
+ if (program) {
+ clReleaseProgram(program);
+ program = NULL;
+ }
}
-void OpenCLDevice::OpenCLProgram::add_log(const string& msg, bool debug)
+void OpenCLDevice::OpenCLProgram::add_log(const string &msg, bool debug)
{
- if(!use_stdout) {
- log += msg + "\n";
- }
- else if(!debug) {
- printf("%s\n", msg.c_str());
- fflush(stdout);
- }
- else {
- VLOG(2) << msg;
- }
+ if (!use_stdout) {
+ log += msg + "\n";
+ }
+ else if (!debug) {
+ printf("%s\n", msg.c_str());
+ fflush(stdout);
+ }
+ else {
+ VLOG(2) << msg;
+ }
}
-void OpenCLDevice::OpenCLProgram::add_error(const string& msg)
+void OpenCLDevice::OpenCLProgram::add_error(const string &msg)
{
- if(use_stdout) {
- fprintf(stderr, "%s\n", msg.c_str());
- }
- if(error_msg == "") {
- error_msg += "\n";
- }
- error_msg += msg;
+ if (use_stdout) {
+ fprintf(stderr, "%s\n", msg.c_str());
+ }
+  if (error_msg != "") {
+ error_msg += "\n";
+ }
+ error_msg += msg;
}
void OpenCLDevice::OpenCLProgram::add_kernel(ustring name)
{
- if(!kernels.count(name)) {
- kernels[name] = NULL;
- }
+ if (!kernels.count(name)) {
+ kernels[name] = NULL;
+ }
}
bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src)
{
- string build_options;
- build_options = device->kernel_build_options(debug_src) + kernel_build_options;
+ string build_options;
+ build_options = device->kernel_build_options(debug_src) + kernel_build_options;
- VLOG(1) << "Build options passed to clBuildProgram: '"
- << build_options << "'.";
- cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
+ VLOG(1) << "Build options passed to clBuildProgram: '" << build_options << "'.";
+ cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
- /* show warnings even if build is successful */
- size_t ret_val_size = 0;
+ /* show warnings even if build is successful */
+ size_t ret_val_size = 0;
- clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+ clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
- if(ciErr != CL_SUCCESS) {
- add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) + ", errors in console.");
- }
+ if (ciErr != CL_SUCCESS) {
+ add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) +
+ ", errors in console.");
+ }
- if(ret_val_size > 1) {
- vector<char> build_log(ret_val_size + 1);
- clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
+ if (ret_val_size > 1) {
+ vector<char> build_log(ret_val_size + 1);
+ clGetProgramBuildInfo(
+ program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
- build_log[ret_val_size] = '\0';
- /* Skip meaningless empty output from the NVidia compiler. */
- if(!(ret_val_size == 2 && build_log[0] == '\n')) {
- add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]), ciErr == CL_SUCCESS);
- }
- }
+ build_log[ret_val_size] = '\0';
+ /* Skip meaningless empty output from the NVidia compiler. */
+ if (!(ret_val_size == 2 && build_log[0] == '\n')) {
+ add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]),
+ ciErr == CL_SUCCESS);
+ }
+ }
- return (ciErr == CL_SUCCESS);
+ return (ciErr == CL_SUCCESS);
}
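build_kernel() fetches the build log with the standard two-call OpenCL query: ask for the size first, then fetch the data. A standalone sketch of that idiom, assuming a valid `program` and a `cl_device_id device`, with error checks elided:

  size_t log_size = 0;
  clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
  if (log_size > 1) { /* The reported size includes the terminating '\0'. */
    vector<char> log(log_size);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, &log[0], NULL);
    printf("%s\n", &log[0]);
  }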
bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src)
{
- string source = get_program_source(kernel_file);
+ string source = get_program_source(kernel_file);
- if(debug_src) {
- path_write_text(*debug_src, source);
- }
+ if (debug_src) {
+ path_write_text(*debug_src, source);
+ }
- size_t source_len = source.size();
- const char *source_str = source.c_str();
- cl_int ciErr;
+ size_t source_len = source.size();
+ const char *source_str = source.c_str();
+ cl_int ciErr;
- program = clCreateProgramWithSource(device->cxContext,
- 1,
- &source_str,
- &source_len,
- &ciErr);
+ program = clCreateProgramWithSource(device->cxContext, 1, &source_str, &source_len, &ciErr);
- if(ciErr != CL_SUCCESS) {
- add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
- return false;
- }
+ if (ciErr != CL_SUCCESS) {
+ add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr));
+ return false;
+ }
- double starttime = time_dt();
- add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
- add_log(string("Build flags: ") + kernel_build_options, true);
+ double starttime = time_dt();
+ add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
+ add_log(string("Build flags: ") + kernel_build_options, true);
- if(!build_kernel(debug_src))
- return false;
+ if (!build_kernel(debug_src))
+ return false;
- double elapsed = time_dt() - starttime;
- add_log(string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), false);
+ double elapsed = time_dt() - starttime;
+ add_log(
+ string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
+ false);
- return true;
+ return true;
}
-static void escape_python_string(string& str)
+static void escape_python_string(string &str)
{
- /* Escape string to be passed as a Python raw string with '' quotes'. */
- string_replace(str, "'", "\'");
+  /* Escape string to be passed as a Python raw string with '' quotes. */
+  string_replace(str, "'", "\\'");
}
-bool OpenCLDevice::OpenCLProgram::compile_separate(const string& clbin)
+bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin)
{
- vector<string> args;
- args.push_back("--background");
- args.push_back("--factory-startup");
- args.push_back("--python-expr");
-
- int device_platform_id = device->device_num;
- string device_name = device->device_name;
- string platform_name = device->platform_name;
- string build_options = device->kernel_build_options(NULL) + kernel_build_options;
- string kernel_file_escaped = kernel_file;
- string clbin_escaped = clbin;
-
- escape_python_string(device_name);
- escape_python_string(platform_name);
- escape_python_string(build_options);
- escape_python_string(kernel_file_escaped);
- escape_python_string(clbin_escaped);
-
- args.push_back(
- string_printf(
- "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')",
- device_platform_id,
- device_name.c_str(),
- platform_name.c_str(),
- build_options.c_str(),
- kernel_file_escaped.c_str(),
- clbin_escaped.c_str()));
-
- double starttime = time_dt();
- add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
- add_log(string("Build flags: ") + kernel_build_options, true);
- if(!system_call_self(args) || !path_exists(clbin)) {
- return false;
- }
-
- double elapsed = time_dt() - starttime;
- add_log(string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), false);
-
- return load_binary(clbin);
+ vector<string> args;
+ args.push_back("--background");
+ args.push_back("--factory-startup");
+ args.push_back("--python-expr");
+
+ int device_platform_id = device->device_num;
+ string device_name = device->device_name;
+ string platform_name = device->platform_name;
+ string build_options = device->kernel_build_options(NULL) + kernel_build_options;
+ string kernel_file_escaped = kernel_file;
+ string clbin_escaped = clbin;
+
+ escape_python_string(device_name);
+ escape_python_string(platform_name);
+ escape_python_string(build_options);
+ escape_python_string(kernel_file_escaped);
+ escape_python_string(clbin_escaped);
+
+ args.push_back(string_printf(
+ "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')",
+ device_platform_id,
+ device_name.c_str(),
+ platform_name.c_str(),
+ build_options.c_str(),
+ kernel_file_escaped.c_str(),
+ clbin_escaped.c_str()));
+
+ double starttime = time_dt();
+ add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false);
+ add_log(string("Build flags: ") + kernel_build_options, true);
+ if (!system_call_self(args) || !path_exists(clbin)) {
+ return false;
+ }
+
+ double elapsed = time_dt() - starttime;
+ add_log(
+ string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed),
+ false);
+
+ return load_binary(clbin);
}
/* Compile OpenCL kernel. This method is called from the _cycles Python
 * module to compile kernels. Parameters must match the function above. */
-bool device_opencl_compile_kernel(const vector<string>& parameters)
+bool device_opencl_compile_kernel(const vector<string> &parameters)
{
- int device_platform_id = std::stoi(parameters[0]);
- const string& device_name = parameters[1];
- const string& platform_name = parameters[2];
- const string& build_options = parameters[3];
- const string& kernel_file = parameters[4];
- const string& binary_path = parameters[5];
-
- if(clewInit() != CLEW_SUCCESS) {
- return false;
- }
-
- vector<OpenCLPlatformDevice> usable_devices;
- OpenCLInfo::get_usable_devices(&usable_devices);
- if(device_platform_id >= usable_devices.size()) {
- return false;
- }
-
- OpenCLPlatformDevice& platform_device = usable_devices[device_platform_id];
- if(platform_device.platform_name != platform_name ||
- platform_device.device_name != device_name)
- {
- return false;
- }
-
- cl_platform_id platform = platform_device.platform_id;
- cl_device_id device = platform_device.device_id;
- const cl_context_properties context_props[] = {
- CL_CONTEXT_PLATFORM, (cl_context_properties) platform,
- 0, 0
- };
-
- cl_int err;
- cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err);
- if(err != CL_SUCCESS) {
- return false;
- }
-
- string source = get_program_source(kernel_file);
- size_t source_len = source.size();
- const char *source_str = source.c_str();
- cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err);
- bool result = false;
-
- if(err == CL_SUCCESS) {
- err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
-
- if(err == CL_SUCCESS) {
- size_t size = 0;
- clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
- if(size > 0) {
- vector<uint8_t> binary(size);
- uint8_t *bytes = &binary[0];
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
- result = path_write_binary(binary_path, binary);
- }
- }
- clReleaseProgram(program);
- }
-
- clReleaseContext(context);
-
- return result;
+ int device_platform_id = std::stoi(parameters[0]);
+ const string &device_name = parameters[1];
+ const string &platform_name = parameters[2];
+ const string &build_options = parameters[3];
+ const string &kernel_file = parameters[4];
+ const string &binary_path = parameters[5];
+
+ if (clewInit() != CLEW_SUCCESS) {
+ return false;
+ }
+
+ vector<OpenCLPlatformDevice> usable_devices;
+ OpenCLInfo::get_usable_devices(&usable_devices);
+ if (device_platform_id >= usable_devices.size()) {
+ return false;
+ }
+
+ OpenCLPlatformDevice &platform_device = usable_devices[device_platform_id];
+ if (platform_device.platform_name != platform_name ||
+ platform_device.device_name != device_name) {
+ return false;
+ }
+
+ cl_platform_id platform = platform_device.platform_id;
+ cl_device_id device = platform_device.device_id;
+ const cl_context_properties context_props[] = {
+ CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0, 0};
+
+ cl_int err;
+ cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err);
+ if (err != CL_SUCCESS) {
+ return false;
+ }
+
+ string source = get_program_source(kernel_file);
+ size_t source_len = source.size();
+ const char *source_str = source.c_str();
+ cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err);
+ bool result = false;
+
+ if (err == CL_SUCCESS) {
+ err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);
+
+ if (err == CL_SUCCESS) {
+ size_t size = 0;
+ clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
+ if (size > 0) {
+ vector<uint8_t> binary(size);
+ uint8_t *bytes = &binary[0];
+ clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL);
+ result = path_write_binary(binary_path, binary);
+ }
+ }
+ clReleaseProgram(program);
+ }
+
+ clReleaseContext(context);
+
+ return result;
}
-bool OpenCLDevice::OpenCLProgram::load_binary(const string& clbin,
- const string *debug_src)
+bool OpenCLDevice::OpenCLProgram::load_binary(const string &clbin, const string *debug_src)
{
- /* read binary into memory */
- vector<uint8_t> binary;
+ /* read binary into memory */
+ vector<uint8_t> binary;
- if(!path_read_binary(clbin, binary)) {
- add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
- return false;
- }
+ if (!path_read_binary(clbin, binary)) {
+ add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
+ return false;
+ }
- /* create program */
- cl_int status, ciErr;
- size_t size = binary.size();
- const uint8_t *bytes = &binary[0];
+ /* create program */
+ cl_int status, ciErr;
+ size_t size = binary.size();
+ const uint8_t *bytes = &binary[0];
- program = clCreateProgramWithBinary(device->cxContext, 1, &device->cdDevice,
- &size, &bytes, &status, &ciErr);
+ program = clCreateProgramWithBinary(
+ device->cxContext, 1, &device->cdDevice, &size, &bytes, &status, &ciErr);
- if(status != CL_SUCCESS || ciErr != CL_SUCCESS) {
- add_error(string("OpenCL failed create program from cached binary ") + clbin + ": "
- + clewErrorString(status) + " " + clewErrorString(ciErr));
- return false;
- }
+ if (status != CL_SUCCESS || ciErr != CL_SUCCESS) {
+ add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " +
+ clewErrorString(status) + " " + clewErrorString(ciErr));
+ return false;
+ }
- if(!build_kernel(debug_src))
- return false;
+ if (!build_kernel(debug_src))
+ return false;
- return true;
+ return true;
}
-bool OpenCLDevice::OpenCLProgram::save_binary(const string& clbin)
+bool OpenCLDevice::OpenCLProgram::save_binary(const string &clbin)
{
- size_t size = 0;
- clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
+ size_t size = 0;
+ clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
- if(!size)
- return false;
+ if (!size)
+ return false;
- vector<uint8_t> binary(size);
- uint8_t *bytes = &binary[0];
+ vector<uint8_t> binary(size);
+ uint8_t *bytes = &binary[0];
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
+ clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL);
- return path_write_binary(clbin, binary);
+ return path_write_binary(clbin, binary);
}
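save_binary() and load_binary() above give compiled programs a disk round trip. A hedged sketch of the same round trip at the raw API level, assuming a single-device `program` built for `device` in `context`; the disk I/O, error checks and `build_options` are placeholders:

  size_t size = 0;
  clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
  vector<uint8_t> binary(size);
  uint8_t *bytes = &binary[0];
  clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL);
  /* ... write `binary` to disk here, read it back later ... */
  cl_int status, err;
  const uint8_t *in_bytes = &binary[0];
  cl_program reloaded = clCreateProgramWithBinary(
      context, 1, &device, &size, &in_bytes, &status, &err);
  /* A program created from a binary must still be built before use. */
  clBuildProgram(reloaded, 0, NULL, build_options, NULL, NULL);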
bool OpenCLDevice::OpenCLProgram::load()
{
- loaded = false;
- string device_md5 = device->device_md5_hash(kernel_build_options);
-
- /* Try to use cached kernel. */
- thread_scoped_lock cache_locker;
- ustring cache_key(program_name + device_md5);
- program = device->load_cached_kernel(cache_key,
- cache_locker);
- if (!program) {
- add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
-
- /* need to create source to get md5 */
- string source = get_program_source(kernel_file);
-
- string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source);
- basename = path_cache_get(path_join("kernels", basename));
- string clbin = basename + ".clbin";
-
- /* If binary kernel exists already, try use it. */
- if(path_exists(clbin) && load_binary(clbin)) {
- /* Kernel loaded from binary, nothing to do. */
- add_log(string("Loaded program from ") + clbin + ".", true);
-
- /* Cache the program. */
- device->store_cached_kernel(program,
- cache_key,
- cache_locker);
- }
- else {
- add_log(string("OpenCL program ") + program_name + " not found on disk.", true);
- cache_locker.unlock();
- }
- }
-
- if (program) {
- create_kernels();
- loaded = true;
- needs_compiling = false;
- }
-
- return loaded;
+ loaded = false;
+ string device_md5 = device->device_md5_hash(kernel_build_options);
+
+ /* Try to use cached kernel. */
+ thread_scoped_lock cache_locker;
+ ustring cache_key(program_name + device_md5);
+ program = device->load_cached_kernel(cache_key, cache_locker);
+ if (!program) {
+ add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
+
+ /* need to create source to get md5 */
+ string source = get_program_source(kernel_file);
+
+ string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" +
+ util_md5_string(source);
+ basename = path_cache_get(path_join("kernels", basename));
+ string clbin = basename + ".clbin";
+
+    /* If the binary kernel already exists, try to use it. */
+ if (path_exists(clbin) && load_binary(clbin)) {
+ /* Kernel loaded from binary, nothing to do. */
+ add_log(string("Loaded program from ") + clbin + ".", true);
+
+ /* Cache the program. */
+ device->store_cached_kernel(program, cache_key, cache_locker);
+ }
+ else {
+ add_log(string("OpenCL program ") + program_name + " not found on disk.", true);
+ cache_locker.unlock();
+ }
+ }
+
+ if (program) {
+ create_kernels();
+ loaded = true;
+ needs_compiling = false;
+ }
+
+ return loaded;
}
void OpenCLDevice::OpenCLProgram::compile()
{
- assert(device);
-
- string device_md5 = device->device_md5_hash(kernel_build_options);
-
- /* Try to use cached kernel. */
- thread_scoped_lock cache_locker;
- ustring cache_key(program_name + device_md5);
- program = device->load_cached_kernel(cache_key,
- cache_locker);
-
- if (!program)
- {
-
- add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
-
- /* need to create source to get md5 */
- string source = get_program_source(kernel_file);
-
- string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source);
- basename = path_cache_get(path_join("kernels", basename));
- string clbin = basename + ".clbin";
-
- /* path to preprocessed source for debugging */
- string clsrc, *debug_src = NULL;
-
- if(OpenCLInfo::use_debug()) {
- clsrc = basename + ".cl";
- debug_src = &clsrc;
- }
-
- /* If binary kernel exists already, try use it. */
- if(compile_separate(clbin)) {
- add_log(string("Built and loaded program from ") + clbin + ".", true);
- loaded = true;
- }
- else {
- add_log(string("Separate-process building of ") + clbin + " failed, will fall back to regular building.", true);
-
- /* If does not exist or loading binary failed, compile kernel. */
- if(!compile_kernel(debug_src)) {
- needs_compiling = false;
- return;
- }
-
- /* Save binary for reuse. */
- if(!save_binary(clbin)) {
- add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true);
- }
- }
-
- /* Cache the program. */
- device->store_cached_kernel(program,
- cache_key,
- cache_locker);
- }
-
- create_kernels();
- needs_compiling = false;
- loaded = true;
+ assert(device);
+
+ string device_md5 = device->device_md5_hash(kernel_build_options);
+
+ /* Try to use cached kernel. */
+ thread_scoped_lock cache_locker;
+ ustring cache_key(program_name + device_md5);
+ program = device->load_cached_kernel(cache_key, cache_locker);
+
+ if (!program) {
+
+ add_log(string("OpenCL program ") + program_name + " not found in cache.", true);
+
+ /* need to create source to get md5 */
+ string source = get_program_source(kernel_file);
+
+ string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" +
+ util_md5_string(source);
+ basename = path_cache_get(path_join("kernels", basename));
+ string clbin = basename + ".clbin";
+
+ /* path to preprocessed source for debugging */
+ string clsrc, *debug_src = NULL;
+
+ if (OpenCLInfo::use_debug()) {
+ clsrc = basename + ".cl";
+ debug_src = &clsrc;
+ }
+
+    /* First try to build the program in a separate process and load the result. */
+ if (compile_separate(clbin)) {
+ add_log(string("Built and loaded program from ") + clbin + ".", true);
+ loaded = true;
+ }
+ else {
+ add_log(string("Separate-process building of ") + clbin +
+ " failed, will fall back to regular building.",
+ true);
+
+      /* If it does not exist or loading the binary failed, compile the kernel. */
+ if (!compile_kernel(debug_src)) {
+ needs_compiling = false;
+ return;
+ }
+
+ /* Save binary for reuse. */
+ if (!save_binary(clbin)) {
+ add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true);
+ }
+ }
+
+ /* Cache the program. */
+ device->store_cached_kernel(program, cache_key, cache_locker);
+ }
+
+ create_kernels();
+ needs_compiling = false;
+ loaded = true;
}
void OpenCLDevice::OpenCLProgram::create_kernels()
{
- for(map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); ++kernel) {
- assert(kernel->second == NULL);
- cl_int ciErr;
- string name = "kernel_ocl_" + kernel->first.string();
- kernel->second = clCreateKernel(program, name.c_str(), &ciErr);
- if(device->opencl_error(ciErr)) {
- add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " + clewErrorString(ciErr));
- return;
- }
- }
+ for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end();
+ ++kernel) {
+ assert(kernel->second == NULL);
+ cl_int ciErr;
+ string name = "kernel_ocl_" + kernel->first.string();
+ kernel->second = clCreateKernel(program, name.c_str(), &ciErr);
+ if (device->opencl_error(ciErr)) {
+ add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " +
+ clewErrorString(ciErr));
+ return;
+ }
+ }
}
bool OpenCLDevice::OpenCLProgram::wait_for_availability()
{
- add_log(string("Waiting for availability of ") + program_name + ".", true);
- while (needs_compiling) {
- time_sleep(0.1);
- }
- return loaded;
+ add_log(string("Waiting for availability of ") + program_name + ".", true);
+ while (needs_compiling) {
+ time_sleep(0.1);
+ }
+ return loaded;
}
void OpenCLDevice::OpenCLProgram::report_error()
{
- /* If loaded is true, there was no error. */
- if(loaded) return;
- /* if use_stdout is true, the error was already reported. */
- if(use_stdout) return;
-
- cerr << error_msg << endl;
- if(!compile_output.empty()) {
- cerr << "OpenCL kernel build output for " << program_name << ":" << endl;
- cerr << compile_output << endl;
- }
+ /* If loaded is true, there was no error. */
+ if (loaded)
+ return;
+ /* if use_stdout is true, the error was already reported. */
+ if (use_stdout)
+ return;
+
+ cerr << error_msg << endl;
+ if (!compile_output.empty()) {
+ cerr << "OpenCL kernel build output for " << program_name << ":" << endl;
+ cerr << compile_output << endl;
+ }
}
cl_kernel OpenCLDevice::OpenCLProgram::operator()()
{
- assert(kernels.size() == 1);
- return kernels.begin()->second;
+ assert(kernels.size() == 1);
+ return kernels.begin()->second;
}
cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name)
{
- assert(kernels.count(name));
- return kernels[name];
+ assert(kernels.count(name));
+ return kernels[name];
}
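Taken together, the OpenCLProgram methods above form a small life cycle: register kernel names, try the in-memory and on-disk caches via load(), compile() on a miss, then fetch cl_kernel handles by name. A hedged usage sketch, assuming an OpenCLDevice pointer `device`; the program, file and kernel names are purely illustrative:

  OpenCLDevice::OpenCLProgram program(device, "my_program", "my_kernels.cl", "", true);
  program.add_kernel(ustring("shader"));
  if (!program.load()) {
    program.compile();
  }
  program.report_error();
  /* Resolves the kernel function named kernel_ocl_shader in the program. */
  cl_kernel kernel = program(ustring("shader"));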
cl_device_type OpenCLInfo::device_type()
{
- switch(DebugFlags().opencl.device_type)
- {
- case DebugFlags::OpenCL::DEVICE_NONE:
- return 0;
- case DebugFlags::OpenCL::DEVICE_ALL:
- return CL_DEVICE_TYPE_ALL;
- case DebugFlags::OpenCL::DEVICE_DEFAULT:
- return CL_DEVICE_TYPE_DEFAULT;
- case DebugFlags::OpenCL::DEVICE_CPU:
- return CL_DEVICE_TYPE_CPU;
- case DebugFlags::OpenCL::DEVICE_GPU:
- return CL_DEVICE_TYPE_GPU;
- case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
- return CL_DEVICE_TYPE_ACCELERATOR;
- default:
- return CL_DEVICE_TYPE_ALL;
- }
+ switch (DebugFlags().opencl.device_type) {
+ case DebugFlags::OpenCL::DEVICE_NONE:
+ return 0;
+ case DebugFlags::OpenCL::DEVICE_ALL:
+ return CL_DEVICE_TYPE_ALL;
+ case DebugFlags::OpenCL::DEVICE_DEFAULT:
+ return CL_DEVICE_TYPE_DEFAULT;
+ case DebugFlags::OpenCL::DEVICE_CPU:
+ return CL_DEVICE_TYPE_CPU;
+ case DebugFlags::OpenCL::DEVICE_GPU:
+ return CL_DEVICE_TYPE_GPU;
+ case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
+ return CL_DEVICE_TYPE_ACCELERATOR;
+ default:
+ return CL_DEVICE_TYPE_ALL;
+ }
}
bool OpenCLInfo::use_debug()
{
- return DebugFlags().opencl.debug;
+ return DebugFlags().opencl.debug;
}
-bool OpenCLInfo::device_supported(const string& platform_name,
- const cl_device_id device_id)
+bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_id device_id)
{
- cl_device_type device_type;
- if(!get_device_type(device_id, &device_type)) {
- return false;
- }
- string device_name;
- if(!get_device_name(device_id, &device_name)) {
- return false;
- }
-
- int driver_major = 0;
- int driver_minor = 0;
- if(!get_driver_version(device_id, &driver_major, &driver_minor)) {
- return false;
- }
- VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor;
-
- /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework
- * (aka, it will not be on Intel framework). This isn't supported
- * and needs an explicit blacklist.
- */
- if(strstr(device_name.c_str(), "Iris")) {
- return false;
- }
- if(platform_name == "AMD Accelerated Parallel Processing" &&
- device_type == CL_DEVICE_TYPE_GPU)
- {
- if(driver_major < 2236) {
- VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported.";
- return false;
- }
- const char *blacklist[] = {
- /* GCN 1 */
- "Tahiti", "Pitcairn", "Capeverde", "Oland", "Hainan",
- NULL
- };
- for(int i = 0; blacklist[i] != NULL; i++) {
- if(device_name == blacklist[i]) {
- VLOG(1) << "AMD device " << device_name << " not supported";
- return false;
- }
- }
- return true;
- }
- if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
- return false;
- }
- return false;
+ cl_device_type device_type;
+ if (!get_device_type(device_id, &device_type)) {
+ return false;
+ }
+ string device_name;
+ if (!get_device_name(device_id, &device_name)) {
+ return false;
+ }
+
+ int driver_major = 0;
+ int driver_minor = 0;
+ if (!get_driver_version(device_id, &driver_major, &driver_minor)) {
+ return false;
+ }
+ VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor;
+
+  /* It is possible to have an Iris GPU on the AMD/Apple OpenCL framework
+   * (i.e. it will not be on the Intel framework). This isn't supported
+   * and needs an explicit blacklist.
+   */
+ if (strstr(device_name.c_str(), "Iris")) {
+ return false;
+ }
+ if (platform_name == "AMD Accelerated Parallel Processing" &&
+ device_type == CL_DEVICE_TYPE_GPU) {
+ if (driver_major < 2236) {
+ VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported.";
+ return false;
+ }
+ const char *blacklist[] = {/* GCN 1 */
+ "Tahiti",
+ "Pitcairn",
+ "Capeverde",
+ "Oland",
+ "Hainan",
+ NULL};
+ for (int i = 0; blacklist[i] != NULL; i++) {
+ if (device_name == blacklist[i]) {
+ VLOG(1) << "AMD device " << device_name << " not supported";
+ return false;
+ }
+ }
+ return true;
+ }
+ if (platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
+ return false;
+ }
+ return false;
}
-bool OpenCLInfo::platform_version_check(cl_platform_id platform,
- string *error)
+bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error)
{
- const int req_major = 1, req_minor = 1;
- int major, minor;
- char version[256];
- clGetPlatformInfo(platform,
- CL_PLATFORM_VERSION,
- sizeof(version),
- &version,
- NULL);
- if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
- if(error != NULL) {
- *error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
- }
- return false;
- }
- if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
- if(error != NULL) {
- *error = string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
- }
- return false;
- }
- if(error != NULL) {
- *error = "";
- }
- return true;
+ const int req_major = 1, req_minor = 1;
+ int major, minor;
+ char version[256];
+ clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
+ if (sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
+ if (error != NULL) {
+ *error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
+ }
+ return false;
+ }
+ if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
+ if (error != NULL) {
+ *error = string_printf(
+ "OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
+ }
+ return false;
+ }
+ if (error != NULL) {
+ *error = "";
+ }
+ return true;
}
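The gate above relies on CL_PLATFORM_VERSION strings beginning with "OpenCL <major>.<minor>", which is what the sscanf() pattern parses. The same check in isolation, assuming `version` was filled by clGetPlatformInfo() as above:

  int major = 0, minor = 0;
  if (sscanf(version, "OpenCL %d.%d", &major, &minor) == 2) {
    /* Mirrors the (major, minor) >= (1, 1) requirement above. */
    bool supported = (major > 1) || (major == 1 && minor >= 1);
    (void)supported;
  }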
-bool OpenCLInfo::device_version_check(cl_device_id device,
- string *error)
+bool OpenCLInfo::device_version_check(cl_device_id device, string *error)
{
- const int req_major = 1, req_minor = 1;
- int major, minor;
- char version[256];
- clGetDeviceInfo(device,
- CL_DEVICE_OPENCL_C_VERSION,
- sizeof(version),
- &version,
- NULL);
- if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
- if(error != NULL) {
- *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
- }
- return false;
- }
- if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
- if(error != NULL) {
- *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
- }
- return false;
- }
- if(error != NULL) {
- *error = "";
- }
- return true;
+ const int req_major = 1, req_minor = 1;
+ int major, minor;
+ char version[256];
+ clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
+ if (sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
+ if (error != NULL) {
+ *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
+ }
+ return false;
+ }
+ if (!((major == req_major && minor >= req_minor) || (major > req_major))) {
+ if (error != NULL) {
+ *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
+ }
+ return false;
+ }
+ if (error != NULL) {
+ *error = "";
+ }
+ return true;
}
-string OpenCLInfo::get_hardware_id(const string& platform_name, cl_device_id device_id)
+string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id device_id)
{
- if(platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") {
- /* Use cl_amd_device_topology extension. */
- cl_char topology[24];
- if(clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS && topology[0] == 1) {
- return string_printf("%02x:%02x.%01x",
- (unsigned int)topology[21],
- (unsigned int)topology[22],
- (unsigned int)topology[23]);
- }
- }
- else if(platform_name == "NVIDIA CUDA") {
- /* Use two undocumented options of the cl_nv_device_attribute_query extension. */
- cl_int bus_id, slot_id;
- if(clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS &&
- clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) {
- return string_printf("%02x:%02x.%01x",
- (unsigned int)(bus_id),
- (unsigned int)(slot_id >> 3),
- (unsigned int)(slot_id & 0x7));
- }
- }
- /* No general way to get a hardware ID from OpenCL => give up. */
- return "";
+ if (platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") {
+ /* Use cl_amd_device_topology extension. */
+ cl_char topology[24];
+ if (clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS &&
+ topology[0] == 1) {
+ return string_printf("%02x:%02x.%01x",
+ (unsigned int)topology[21],
+ (unsigned int)topology[22],
+ (unsigned int)topology[23]);
+ }
+ }
+ else if (platform_name == "NVIDIA CUDA") {
+ /* Use two undocumented options of the cl_nv_device_attribute_query extension. */
+ cl_int bus_id, slot_id;
+ if (clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS &&
+ clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) {
+ return string_printf("%02x:%02x.%01x",
+ (unsigned int)(bus_id),
+ (unsigned int)(slot_id >> 3),
+ (unsigned int)(slot_id & 0x7));
+ }
+ }
+ /* No general way to get a hardware ID from OpenCL => give up. */
+ return "";
}
-void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
- bool force_all)
+void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, bool force_all)
{
- const cl_device_type device_type = OpenCLInfo::device_type();
- static bool first_time = true;
-#define FIRST_VLOG(severity) if(first_time) VLOG(severity)
-
- usable_devices->clear();
-
- if(device_type == 0) {
- FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
- first_time = false;
- return;
- }
-
- cl_int error;
- vector<cl_device_id> device_ids;
- vector<cl_platform_id> platform_ids;
-
- /* Get platforms. */
- if(!get_platforms(&platform_ids, &error)) {
- FIRST_VLOG(2) << "Error fetching platforms:"
- << string(clewErrorString(error));
- first_time = false;
- return;
- }
- if(platform_ids.size() == 0) {
- FIRST_VLOG(2) << "No OpenCL platforms were found.";
- first_time = false;
- return;
- }
- /* Devices are numbered consecutively across platforms. */
- for(int platform = 0; platform < platform_ids.size(); platform++) {
- cl_platform_id platform_id = platform_ids[platform];
- string platform_name;
- if(!get_platform_name(platform_id, &platform_name)) {
- FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
- continue;
- }
- FIRST_VLOG(2) << "Enumerating devices for platform "
- << platform_name << ".";
- if(!platform_version_check(platform_id)) {
- FIRST_VLOG(2) << "Ignoring platform " << platform_name
- << " due to too old compiler version.";
- continue;
- }
- if(!get_platform_devices(platform_id,
- device_type,
- &device_ids,
- &error))
- {
- FIRST_VLOG(2) << "Ignoring platform " << platform_name
- << ", failed to fetch of devices: "
- << string(clewErrorString(error));
- continue;
- }
- if(device_ids.size() == 0) {
- FIRST_VLOG(2) << "Ignoring platform " << platform_name
- << ", it has no devices.";
- continue;
- }
- for(int num = 0; num < device_ids.size(); num++) {
- const cl_device_id device_id = device_ids[num];
- string device_name;
- if(!get_device_name(device_id, &device_name, &error)) {
- FIRST_VLOG(2) << "Failed to fetch device name: "
- << string(clewErrorString(error))
- << ", ignoring.";
- continue;
- }
- if(!device_version_check(device_id)) {
- FIRST_VLOG(2) << "Ignoring device " << device_name
- << " due to old compiler version.";
- continue;
- }
- if(force_all ||
- device_supported(platform_name, device_id))
- {
- cl_device_type device_type;
- if(!get_device_type(device_id, &device_type, &error)) {
- FIRST_VLOG(2) << "Ignoring device " << device_name
- << ", failed to fetch device type:"
- << string(clewErrorString(error));
- continue;
- }
- string readable_device_name =
- get_readable_device_name(device_id);
- if(readable_device_name != device_name) {
- FIRST_VLOG(2) << "Using more readable device name: "
- << readable_device_name;
- }
- FIRST_VLOG(2) << "Adding new device "
- << readable_device_name << ".";
- string hardware_id = get_hardware_id(platform_name, device_id);
- string device_extensions = get_device_extensions(device_id);
- usable_devices->push_back(OpenCLPlatformDevice(
- platform_id,
- platform_name,
- device_id,
- device_type,
- readable_device_name,
- hardware_id,
- device_extensions));
- }
- else {
- FIRST_VLOG(2) << "Ignoring device " << device_name
- << ", not officially supported yet.";
- }
- }
- }
- first_time = false;
+ const cl_device_type device_type = OpenCLInfo::device_type();
+ static bool first_time = true;
+# define FIRST_VLOG(severity) \
+ if (first_time) \
+ VLOG(severity)
+
+ usable_devices->clear();
+
+ if (device_type == 0) {
+ FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
+ first_time = false;
+ return;
+ }
+
+ cl_int error;
+ vector<cl_device_id> device_ids;
+ vector<cl_platform_id> platform_ids;
+
+ /* Get platforms. */
+ if (!get_platforms(&platform_ids, &error)) {
+ FIRST_VLOG(2) << "Error fetching platforms:" << string(clewErrorString(error));
+ first_time = false;
+ return;
+ }
+ if (platform_ids.size() == 0) {
+ FIRST_VLOG(2) << "No OpenCL platforms were found.";
+ first_time = false;
+ return;
+ }
+ /* Devices are numbered consecutively across platforms. */
+ for (int platform = 0; platform < platform_ids.size(); platform++) {
+ cl_platform_id platform_id = platform_ids[platform];
+ string platform_name;
+ if (!get_platform_name(platform_id, &platform_name)) {
+ FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
+ continue;
+ }
+ FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << ".";
+ if (!platform_version_check(platform_id)) {
+ FIRST_VLOG(2) << "Ignoring platform " << platform_name
+ << " due to too old compiler version.";
+ continue;
+ }
+ if (!get_platform_devices(platform_id, device_type, &device_ids, &error)) {
+ FIRST_VLOG(2) << "Ignoring platform " << platform_name
+ << ", failed to fetch of devices: " << string(clewErrorString(error));
+ continue;
+ }
+ if (device_ids.size() == 0) {
+ FIRST_VLOG(2) << "Ignoring platform " << platform_name << ", it has no devices.";
+ continue;
+ }
+ for (int num = 0; num < device_ids.size(); num++) {
+ const cl_device_id device_id = device_ids[num];
+ string device_name;
+ if (!get_device_name(device_id, &device_name, &error)) {
+ FIRST_VLOG(2) << "Failed to fetch device name: " << string(clewErrorString(error))
+ << ", ignoring.";
+ continue;
+ }
+ if (!device_version_check(device_id)) {
+ FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version.";
+ continue;
+ }
+ if (force_all || device_supported(platform_name, device_id)) {
+ cl_device_type device_type;
+ if (!get_device_type(device_id, &device_type, &error)) {
+ FIRST_VLOG(2) << "Ignoring device " << device_name
+ << ", failed to fetch device type:" << string(clewErrorString(error));
+ continue;
+ }
+ string readable_device_name = get_readable_device_name(device_id);
+ if (readable_device_name != device_name) {
+ FIRST_VLOG(2) << "Using more readable device name: " << readable_device_name;
+ }
+ FIRST_VLOG(2) << "Adding new device " << readable_device_name << ".";
+ string hardware_id = get_hardware_id(platform_name, device_id);
+ string device_extensions = get_device_extensions(device_id);
+ usable_devices->push_back(OpenCLPlatformDevice(platform_id,
+ platform_name,
+ device_id,
+ device_type,
+ readable_device_name,
+ hardware_id,
+ device_extensions));
+ }
+ else {
+ FIRST_VLOG(2) << "Ignoring device " << device_name << ", not officially supported yet.";
+ }
+ }
+ }
+ first_time = false;
}
-bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids,
- cl_int *error)
+bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error)
{
- /* Reset from possible previous state. */
- platform_ids->resize(0);
- cl_uint num_platforms;
- if(!get_num_platforms(&num_platforms, error)) {
- return false;
- }
- /* Get actual platforms. */
- cl_int err;
- platform_ids->resize(num_platforms);
- if((err = clGetPlatformIDs(num_platforms,
- &platform_ids->at(0),
- NULL)) != CL_SUCCESS) {
- if(error != NULL) {
- *error = err;
- }
- return false;
- }
- if(error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
+ /* Reset from possible previous state. */
+ platform_ids->resize(0);
+ cl_uint num_platforms;
+ if (!get_num_platforms(&num_platforms, error)) {
+ return false;
+ }
+ /* Get actual platforms. */
+ cl_int err;
+ platform_ids->resize(num_platforms);
+ if ((err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS) {
+ if (error != NULL) {
+ *error = err;
+ }
+ return false;
+ }
+ if (error != NULL) {
+ *error = CL_SUCCESS;
+ }
+ return true;
}
vector<cl_platform_id> OpenCLInfo::get_platforms()
{
- vector<cl_platform_id> platform_ids;
- get_platforms(&platform_ids);
- return platform_ids;
+ vector<cl_platform_id> platform_ids;
+ get_platforms(&platform_ids);
+ return platform_ids;
}
bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error)
{
- cl_int err;
- if((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) {
- if(error != NULL) {
- *error = err;
- }
- *num_platforms = 0;
- return false;
- }
- if(error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
+ cl_int err;
+ if ((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) {
+ if (error != NULL) {
+ *error = err;
+ }
+ *num_platforms = 0;
+ return false;
+ }
+ if (error != NULL) {
+ *error = CL_SUCCESS;
+ }
+ return true;
}
cl_uint OpenCLInfo::get_num_platforms()
{
- cl_uint num_platforms;
- if(!get_num_platforms(&num_platforms)) {
- return 0;
- }
- return num_platforms;
+ cl_uint num_platforms;
+ if (!get_num_platforms(&num_platforms)) {
+ return 0;
+ }
+ return num_platforms;
}
-bool OpenCLInfo::get_platform_name(cl_platform_id platform_id,
- string *platform_name)
+bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string *platform_name)
{
- char buffer[256];
- if(clGetPlatformInfo(platform_id,
- CL_PLATFORM_NAME,
- sizeof(buffer),
- &buffer,
- NULL) != CL_SUCCESS)
- {
- *platform_name = "";
- return false;
- }
- *platform_name = buffer;
- return true;
+ char buffer[256];
+ if (clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), &buffer, NULL) !=
+ CL_SUCCESS) {
+ *platform_name = "";
+ return false;
+ }
+ *platform_name = buffer;
+ return true;
}
string OpenCLInfo::get_platform_name(cl_platform_id platform_id)
{
- string platform_name;
- if(!get_platform_name(platform_id, &platform_name)) {
- return "";
- }
- return platform_name;
+ string platform_name;
+ if (!get_platform_name(platform_id, &platform_name)) {
+ return "";
+ }
+ return platform_name;
}
bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
@@ -1076,266 +1034,222 @@ bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
cl_uint *num_devices,
cl_int *error)
{
- cl_int err;
- if((err = clGetDeviceIDs(platform_id,
- device_type,
- 0,
- NULL,
- num_devices)) != CL_SUCCESS)
- {
- if(error != NULL) {
- *error = err;
- }
- *num_devices = 0;
- return false;
- }
- if(error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
+ cl_int err;
+ if ((err = clGetDeviceIDs(platform_id, device_type, 0, NULL, num_devices)) != CL_SUCCESS) {
+ if (error != NULL) {
+ *error = err;
+ }
+ *num_devices = 0;
+ return false;
+ }
+ if (error != NULL) {
+ *error = CL_SUCCESS;
+ }
+ return true;
}
cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id,
cl_device_type device_type)
{
- cl_uint num_devices;
- if(!get_num_platform_devices(platform_id,
- device_type,
- &num_devices))
- {
- return 0;
- }
- return num_devices;
+ cl_uint num_devices;
+ if (!get_num_platform_devices(platform_id, device_type, &num_devices)) {
+ return 0;
+ }
+ return num_devices;
}
bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
cl_device_type device_type,
vector<cl_device_id> *device_ids,
- cl_int* error)
+ cl_int *error)
{
- /* Reset from possible previous state. */
- device_ids->resize(0);
- /* Get number of devices to pre-allocate memory. */
- cl_uint num_devices;
- if(!get_num_platform_devices(platform_id,
- device_type,
- &num_devices,
- error))
- {
- return false;
- }
- /* Get actual device list. */
- device_ids->resize(num_devices);
- cl_int err;
- if((err = clGetDeviceIDs(platform_id,
- device_type,
- num_devices,
- &device_ids->at(0),
- NULL)) != CL_SUCCESS)
- {
- if(error != NULL) {
- *error = err;
- }
- return false;
- }
- if(error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
+ /* Reset from possible previous state. */
+ device_ids->resize(0);
+ /* Get number of devices to pre-allocate memory. */
+ cl_uint num_devices;
+ if (!get_num_platform_devices(platform_id, device_type, &num_devices, error)) {
+ return false;
+ }
+ /* Get actual device list. */
+ device_ids->resize(num_devices);
+ cl_int err;
+ if ((err = clGetDeviceIDs(platform_id, device_type, num_devices, &device_ids->at(0), NULL)) !=
+ CL_SUCCESS) {
+ if (error != NULL) {
+ *error = err;
+ }
+ return false;
+ }
+ if (error != NULL) {
+ *error = CL_SUCCESS;
+ }
+ return true;
}
vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id,
cl_device_type device_type)
{
- vector<cl_device_id> devices;
- get_platform_devices(platform_id, device_type, &devices);
- return devices;
+ vector<cl_device_id> devices;
+ get_platform_devices(platform_id, device_type, &devices);
+ return devices;
}
-bool OpenCLInfo::get_device_name(cl_device_id device_id,
- string *device_name,
- cl_int* error)
+bool OpenCLInfo::get_device_name(cl_device_id device_id, string *device_name, cl_int *error)
{
- char buffer[1024];
- cl_int err;
- if((err = clGetDeviceInfo(device_id,
- CL_DEVICE_NAME,
- sizeof(buffer),
- &buffer,
- NULL)) != CL_SUCCESS)
- {
- if(error != NULL) {
- *error = err;
- }
- *device_name = "";
- return false;
- }
- if(error != NULL) {
- *error = CL_SUCCESS;
- }
- *device_name = buffer;
- return true;
+ char buffer[1024];
+ cl_int err;
+ if ((err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), &buffer, NULL)) !=
+ CL_SUCCESS) {
+ if (error != NULL) {
+ *error = err;
+ }
+ *device_name = "";
+ return false;
+ }
+ if (error != NULL) {
+ *error = CL_SUCCESS;
+ }
+ *device_name = buffer;
+ return true;
}
string OpenCLInfo::get_device_name(cl_device_id device_id)
{
- string device_name;
- if(!get_device_name(device_id, &device_name)) {
- return "";
- }
- return device_name;
+ string device_name;
+ if (!get_device_name(device_id, &device_name)) {
+ return "";
+ }
+ return device_name;
}
bool OpenCLInfo::get_device_extensions(cl_device_id device_id,
- string *device_extensions,
- cl_int* error)
+ string *device_extensions,
+ cl_int *error)
{
- char buffer[1024];
- cl_int err;
- if((err = clGetDeviceInfo(device_id,
- CL_DEVICE_EXTENSIONS,
- sizeof(buffer),
- &buffer,
- NULL)) != CL_SUCCESS)
- {
- if(error != NULL) {
- *error = err;
- }
- *device_extensions = "";
- return false;
- }
- if(error != NULL) {
- *error = CL_SUCCESS;
- }
- *device_extensions = buffer;
- return true;
+ char buffer[1024];
+ cl_int err;
+ if ((err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, sizeof(buffer), &buffer, NULL)) !=
+ CL_SUCCESS) {
+ if (error != NULL) {
+ *error = err;
+ }
+ *device_extensions = "";
+ return false;
+ }
+ if (error != NULL) {
+ *error = CL_SUCCESS;
+ }
+ *device_extensions = buffer;
+ return true;
}
string OpenCLInfo::get_device_extensions(cl_device_id device_id)
{
- string device_extensions;
- if(!get_device_extensions(device_id, &device_extensions)) {
- return "";
- }
- return device_extensions;
+ string device_extensions;
+ if (!get_device_extensions(device_id, &device_extensions)) {
+ return "";
+ }
+ return device_extensions;
}
bool OpenCLInfo::get_device_type(cl_device_id device_id,
cl_device_type *device_type,
- cl_int* error)
+ cl_int *error)
{
- cl_int err;
- if((err = clGetDeviceInfo(device_id,
- CL_DEVICE_TYPE,
- sizeof(cl_device_type),
- device_type,
- NULL)) != CL_SUCCESS)
- {
- if(error != NULL) {
- *error = err;
- }
- *device_type = 0;
- return false;
- }
- if(error != NULL) {
- *error = CL_SUCCESS;
- }
- return true;
+ cl_int err;
+ if ((err = clGetDeviceInfo(
+ device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), device_type, NULL)) != CL_SUCCESS) {
+ if (error != NULL) {
+ *error = err;
+ }
+ *device_type = 0;
+ return false;
+ }
+ if (error != NULL) {
+ *error = CL_SUCCESS;
+ }
+ return true;
}
cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id)
{
- cl_device_type device_type;
- if(!get_device_type(device_id, &device_type)) {
- return 0;
- }
- return device_type;
+ cl_device_type device_type;
+ if (!get_device_type(device_id, &device_type)) {
+ return 0;
+ }
+ return device_type;
}
string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
{
- string name = "";
- char board_name[1024];
- size_t length = 0;
- if(clGetDeviceInfo(device_id,
- CL_DEVICE_BOARD_NAME_AMD,
- sizeof(board_name),
- &board_name,
- &length) == CL_SUCCESS)
- {
- if(length != 0 && board_name[0] != '\0') {
- name = board_name;
- }
- }
-
- /* Fallback to standard device name API. */
- if(name.empty()) {
- name = get_device_name(device_id);
- }
-
- /* Special exception for AMD Vega, need to be able to tell
- * Vega 56 from 64 apart.
- */
- if(name == "Radeon RX Vega") {
- cl_int max_compute_units = 0;
- if(clGetDeviceInfo(device_id,
- CL_DEVICE_MAX_COMPUTE_UNITS,
- sizeof(max_compute_units),
- &max_compute_units,
- NULL) == CL_SUCCESS)
- {
- name += " " + to_string(max_compute_units);
- }
- }
-
- /* Distinguish from our native CPU device. */
- if(get_device_type(device_id) & CL_DEVICE_TYPE_CPU) {
- name += " (OpenCL)";
- }
-
- return name;
+ string name = "";
+ char board_name[1024];
+ size_t length = 0;
+ if (clGetDeviceInfo(
+ device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, &length) ==
+ CL_SUCCESS) {
+ if (length != 0 && board_name[0] != '\0') {
+ name = board_name;
+ }
+ }
+
+  /* Fall back to the standard device name API. */
+ if (name.empty()) {
+ name = get_device_name(device_id);
+ }
+
+  /* Special exception for AMD Vega: we need to be able to tell
+   * Vega 56 and Vega 64 apart.
+   */
+ if (name == "Radeon RX Vega") {
+ cl_int max_compute_units = 0;
+ if (clGetDeviceInfo(device_id,
+ CL_DEVICE_MAX_COMPUTE_UNITS,
+ sizeof(max_compute_units),
+ &max_compute_units,
+ NULL) == CL_SUCCESS) {
+ name += " " + to_string(max_compute_units);
+ }
+ }
+
+ /* Distinguish from our native CPU device. */
+ if (get_device_type(device_id) & CL_DEVICE_TYPE_CPU) {
+ name += " (OpenCL)";
+ }
+
+ return name;
}
-bool OpenCLInfo::get_driver_version(cl_device_id device_id,
- int *major,
- int *minor,
- cl_int* error)
+bool OpenCLInfo::get_driver_version(cl_device_id device_id, int *major, int *minor, cl_int *error)
{
- char buffer[1024];
- cl_int err;
- if((err = clGetDeviceInfo(device_id,
- CL_DRIVER_VERSION,
- sizeof(buffer),
- &buffer,
- NULL)) != CL_SUCCESS)
- {
- if(error != NULL) {
- *error = err;
- }
- return false;
- }
- if(error != NULL) {
- *error = CL_SUCCESS;
- }
- if(sscanf(buffer, "%d.%d", major, minor) < 2) {
- VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer);
- return false;
- }
- return true;
+ char buffer[1024];
+ cl_int err;
+ if ((err = clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), &buffer, NULL)) !=
+ CL_SUCCESS) {
+ if (error != NULL) {
+ *error = err;
+ }
+ return false;
+ }
+ if (error != NULL) {
+ *error = CL_SUCCESS;
+ }
+ if (sscanf(buffer, "%d.%d", major, minor) < 2) {
+ VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer);
+ return false;
+ }
+ return true;
}
int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id)
{
- int base_align_bits;
- if(clGetDeviceInfo(device_id,
- CL_DEVICE_MEM_BASE_ADDR_ALIGN,
- sizeof(int),
- &base_align_bits,
- NULL) == CL_SUCCESS)
- {
- return base_align_bits/8;
- }
- return 1;
+ int base_align_bits;
+ if (clGetDeviceInfo(
+ device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &base_align_bits, NULL) ==
+ CL_SUCCESS) {
+ return base_align_bits / 8;
+ }
+ return 1;
}
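CL_DEVICE_MEM_BASE_ADDR_ALIGN is reported in bits, hence the division by 8 above; the resulting byte value constrains where device sub-buffers may start. A hedged usage sketch, assuming `mem` and `device_id` exist, the alignment is a power of two, and error handling is elided:

  size_t align = OpenCLInfo::mem_sub_ptr_alignment(device_id);
  size_t offset = 3000;
  /* Round the sub-buffer origin up to the device's base-address alignment. */
  offset = (offset + align - 1) & ~(align - 1);
  cl_buffer_region region = {offset, 1024};
  cl_int err;
  cl_mem sub = clCreateSubBuffer(
      mem, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);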
CCL_NAMESPACE_END