diff options
Diffstat (limited to 'intern/cycles/device')
23 files changed, 11227 insertions, 11040 deletions
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index d804a07bcab..75f4a72bee3 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -1,61 +1,61 @@ set(INC - .. - ../../glew-mx + .. + ../../glew-mx ) set(INC_SYS - ${GLEW_INCLUDE_DIR} - ../../../extern/clew/include + ${GLEW_INCLUDE_DIR} + ../../../extern/clew/include ) if(WITH_CUDA_DYNLOAD) - list(APPEND INC - ../../../extern/cuew/include - ) - add_definitions(-DWITH_CUDA_DYNLOAD) + list(APPEND INC + ../../../extern/cuew/include + ) + add_definitions(-DWITH_CUDA_DYNLOAD) else() - list(APPEND INC_SYS - ${CUDA_TOOLKIT_INCLUDE} - ) - add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}") + list(APPEND INC_SYS + ${CUDA_TOOLKIT_INCLUDE} + ) + add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}") endif() set(SRC - device.cpp - device_cpu.cpp - device_cuda.cpp - device_denoising.cpp - device_memory.cpp - device_multi.cpp - device_opencl.cpp - device_split_kernel.cpp - device_task.cpp + device.cpp + device_cpu.cpp + device_cuda.cpp + device_denoising.cpp + device_memory.cpp + device_multi.cpp + device_opencl.cpp + device_split_kernel.cpp + device_task.cpp ) set(SRC_OPENCL - opencl/opencl.h - opencl/memory_manager.h + opencl/opencl.h + opencl/memory_manager.h - opencl/opencl_split.cpp - opencl/opencl_util.cpp - opencl/memory_manager.cpp + opencl/opencl_split.cpp + opencl/opencl_util.cpp + opencl/memory_manager.cpp ) if(WITH_CYCLES_NETWORK) - list(APPEND SRC - device_network.cpp - ) + list(APPEND SRC + device_network.cpp + ) endif() set(SRC_HEADERS - device.h - device_denoising.h - device_memory.h - device_intern.h - device_network.h - device_split_kernel.h - device_task.h + device.h + device_denoising.h + device_memory.h + device_intern.h + device_network.h + device_split_kernel.h + device_task.h ) set(LIB @@ -63,27 +63,27 @@ set(LIB ) if(WITH_CUDA_DYNLOAD) - list(APPEND LIB - extern_cuew - ) + list(APPEND LIB 
+ extern_cuew + ) else() - list(APPEND LIB - ${CUDA_CUDA_LIBRARY} - ) + list(APPEND LIB + ${CUDA_CUDA_LIBRARY} + ) endif() add_definitions(${GL_DEFINITIONS}) if(WITH_CYCLES_NETWORK) - add_definitions(-DWITH_NETWORK) + add_definitions(-DWITH_NETWORK) endif() if(WITH_CYCLES_DEVICE_OPENCL) - add_definitions(-DWITH_OPENCL) + add_definitions(-DWITH_OPENCL) endif() if(WITH_CYCLES_DEVICE_CUDA) - add_definitions(-DWITH_CUDA) + add_definitions(-DWITH_CUDA) endif() if(WITH_CYCLES_DEVICE_MULTI) - add_definitions(-DWITH_MULTI) + add_definitions(-DWITH_MULTI) endif() include_directories(${INC}) diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index e74637472ef..16a68e8b855 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -44,572 +44,577 @@ uint Device::devices_initialized_mask = 0; /* Device Requested Features */ -std::ostream& operator <<(std::ostream &os, - const DeviceRequestedFeatures& requested_features) +std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features) { - os << "Experimental features: " - << (requested_features.experimental ? "On" : "Off") << std::endl; - os << "Max nodes group: " << requested_features.max_nodes_group << std::endl; - /* TODO(sergey): Decode bitflag into list of names. 
*/ - os << "Nodes features: " << requested_features.nodes_features << std::endl; - os << "Use Hair: " - << string_from_bool(requested_features.use_hair) << std::endl; - os << "Use Object Motion: " - << string_from_bool(requested_features.use_object_motion) << std::endl; - os << "Use Camera Motion: " - << string_from_bool(requested_features.use_camera_motion) << std::endl; - os << "Use Baking: " - << string_from_bool(requested_features.use_baking) << std::endl; - os << "Use Subsurface: " - << string_from_bool(requested_features.use_subsurface) << std::endl; - os << "Use Volume: " - << string_from_bool(requested_features.use_volume) << std::endl; - os << "Use Branched Integrator: " - << string_from_bool(requested_features.use_integrator_branched) << std::endl; - os << "Use Patch Evaluation: " - << string_from_bool(requested_features.use_patch_evaluation) << std::endl; - os << "Use Transparent Shadows: " - << string_from_bool(requested_features.use_transparent) << std::endl; - os << "Use Principled BSDF: " - << string_from_bool(requested_features.use_principled) << std::endl; - os << "Use Denoising: " - << string_from_bool(requested_features.use_denoising) << std::endl; - os << "Use Displacement: " - << string_from_bool(requested_features.use_true_displacement) << std::endl; - os << "Use Background Light: " - << string_from_bool(requested_features.use_background_light) << std::endl; - return os; + os << "Experimental features: " << (requested_features.experimental ? "On" : "Off") << std::endl; + os << "Max nodes group: " << requested_features.max_nodes_group << std::endl; + /* TODO(sergey): Decode bitflag into list of names. 
*/ + os << "Nodes features: " << requested_features.nodes_features << std::endl; + os << "Use Hair: " << string_from_bool(requested_features.use_hair) << std::endl; + os << "Use Object Motion: " << string_from_bool(requested_features.use_object_motion) + << std::endl; + os << "Use Camera Motion: " << string_from_bool(requested_features.use_camera_motion) + << std::endl; + os << "Use Baking: " << string_from_bool(requested_features.use_baking) << std::endl; + os << "Use Subsurface: " << string_from_bool(requested_features.use_subsurface) << std::endl; + os << "Use Volume: " << string_from_bool(requested_features.use_volume) << std::endl; + os << "Use Branched Integrator: " << string_from_bool(requested_features.use_integrator_branched) + << std::endl; + os << "Use Patch Evaluation: " << string_from_bool(requested_features.use_patch_evaluation) + << std::endl; + os << "Use Transparent Shadows: " << string_from_bool(requested_features.use_transparent) + << std::endl; + os << "Use Principled BSDF: " << string_from_bool(requested_features.use_principled) + << std::endl; + os << "Use Denoising: " << string_from_bool(requested_features.use_denoising) << std::endl; + os << "Use Displacement: " << string_from_bool(requested_features.use_true_displacement) + << std::endl; + os << "Use Background Light: " << string_from_bool(requested_features.use_background_light) + << std::endl; + return os; } /* Device */ Device::~Device() { - if(!background) { - if(vertex_buffer != 0) { - glDeleteBuffers(1, &vertex_buffer); - } - if(fallback_shader_program != 0) { - glDeleteProgram(fallback_shader_program); - } - } + if (!background) { + if (vertex_buffer != 0) { + glDeleteBuffers(1, &vertex_buffer); + } + if (fallback_shader_program != 0) { + glDeleteProgram(fallback_shader_program); + } + } } /* TODO move shaders to standalone .glsl file. 
*/ const char *FALLBACK_VERTEX_SHADER = -"#version 330\n" -"uniform vec2 fullscreen;\n" -"in vec2 texCoord;\n" -"in vec2 pos;\n" -"out vec2 texCoord_interp;\n" -"\n" -"vec2 normalize_coordinates()\n" -"{\n" -" return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n" -"}\n" -"\n" -"void main()\n" -"{\n" -" gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n" -" texCoord_interp = texCoord;\n" -"}\n\0"; + "#version 330\n" + "uniform vec2 fullscreen;\n" + "in vec2 texCoord;\n" + "in vec2 pos;\n" + "out vec2 texCoord_interp;\n" + "\n" + "vec2 normalize_coordinates()\n" + "{\n" + " return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n" + "}\n" + "\n" + "void main()\n" + "{\n" + " gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n" + " texCoord_interp = texCoord;\n" + "}\n\0"; const char *FALLBACK_FRAGMENT_SHADER = -"#version 330\n" -"uniform sampler2D image_texture;\n" -"in vec2 texCoord_interp;\n" -"out vec4 fragColor;\n" -"\n" -"void main()\n" -"{\n" -" fragColor = texture(image_texture, texCoord_interp);\n" -"}\n\0"; + "#version 330\n" + "uniform sampler2D image_texture;\n" + "in vec2 texCoord_interp;\n" + "out vec4 fragColor;\n" + "\n" + "void main()\n" + "{\n" + " fragColor = texture(image_texture, texCoord_interp);\n" + "}\n\0"; static void shader_print_errors(const char *task, const char *log, const char *code) { - LOG(ERROR) << "Shader: " << task << " error:"; - LOG(ERROR) << "===== shader string ===="; - - stringstream stream(code); - string partial; - - int line = 1; - while(getline(stream, partial, '\n')) { - if(line < 10) { - LOG(ERROR) << " " << line << " " << partial; - } - else { - LOG(ERROR) << line << " " << partial; - } - line++; - } - LOG(ERROR) << log; + LOG(ERROR) << "Shader: " << task << " error:"; + LOG(ERROR) << "===== shader string ===="; + + stringstream stream(code); + string partial; + + int line = 1; + while (getline(stream, partial, '\n')) { + if (line < 10) { + LOG(ERROR) << " " << line << " " << partial; + } + else { + 
LOG(ERROR) << line << " " << partial; + } + line++; + } + LOG(ERROR) << log; } static int bind_fallback_shader(void) { - GLint status; - GLchar log[5000]; - GLsizei length = 0; - GLuint program = 0; + GLint status; + GLchar log[5000]; + GLsizei length = 0; + GLuint program = 0; - struct Shader { - const char *source; - GLenum type; - } shaders[2] = { - {FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER}, - {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER} - }; + struct Shader { + const char *source; + GLenum type; + } shaders[2] = {{FALLBACK_VERTEX_SHADER, GL_VERTEX_SHADER}, + {FALLBACK_FRAGMENT_SHADER, GL_FRAGMENT_SHADER}}; - program = glCreateProgram(); + program = glCreateProgram(); - for(int i = 0; i < 2; i++) { - GLuint shader = glCreateShader(shaders[i].type); + for (int i = 0; i < 2; i++) { + GLuint shader = glCreateShader(shaders[i].type); - string source_str = shaders[i].source; - const char *c_str = source_str.c_str(); + string source_str = shaders[i].source; + const char *c_str = source_str.c_str(); - glShaderSource(shader, 1, &c_str, NULL); - glCompileShader(shader); + glShaderSource(shader, 1, &c_str, NULL); + glCompileShader(shader); - glGetShaderiv(shader, GL_COMPILE_STATUS, &status); + glGetShaderiv(shader, GL_COMPILE_STATUS, &status); - if(!status) { - glGetShaderInfoLog(shader, sizeof(log), &length, log); - shader_print_errors("compile", log, c_str); - return 0; - } + if (!status) { + glGetShaderInfoLog(shader, sizeof(log), &length, log); + shader_print_errors("compile", log, c_str); + return 0; + } - glAttachShader(program, shader); - } + glAttachShader(program, shader); + } - /* Link output. */ - glBindFragDataLocation(program, 0, "fragColor"); + /* Link output. */ + glBindFragDataLocation(program, 0, "fragColor"); - /* Link and error check. */ - glLinkProgram(program); + /* Link and error check. 
*/ + glLinkProgram(program); - glGetProgramiv(program, GL_LINK_STATUS, &status); - if(!status) { - glGetShaderInfoLog(program, sizeof(log), &length, log); - shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER); - shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER); - return 0; - } + glGetProgramiv(program, GL_LINK_STATUS, &status); + if (!status) { + glGetShaderInfoLog(program, sizeof(log), &length, log); + shader_print_errors("linking", log, FALLBACK_VERTEX_SHADER); + shader_print_errors("linking", log, FALLBACK_FRAGMENT_SHADER); + return 0; + } - return program; + return program; } bool Device::bind_fallback_display_space_shader(const float width, const float height) { - if(fallback_status == FALLBACK_SHADER_STATUS_ERROR) { - return false; - } - - if(fallback_status == FALLBACK_SHADER_STATUS_NONE) { - fallback_shader_program = bind_fallback_shader(); - fallback_status = FALLBACK_SHADER_STATUS_ERROR; - - if(fallback_shader_program == 0) { - return false; - } - - glUseProgram(fallback_shader_program); - image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture"); - if(image_texture_location < 0) { - LOG(ERROR) << "Shader doesn't containt the 'image_texture' uniform."; - return false; - } - - fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen"); - if(fullscreen_location < 0) { - LOG(ERROR) << "Shader doesn't containt the 'fullscreen' uniform."; - return false; - } - - fallback_status = FALLBACK_SHADER_STATUS_SUCCESS; - } - - /* Run this every time. 
*/ - glUseProgram(fallback_shader_program); - glUniform1i(image_texture_location, 0); - glUniform2f(fullscreen_location, width, height); - return true; + if (fallback_status == FALLBACK_SHADER_STATUS_ERROR) { + return false; + } + + if (fallback_status == FALLBACK_SHADER_STATUS_NONE) { + fallback_shader_program = bind_fallback_shader(); + fallback_status = FALLBACK_SHADER_STATUS_ERROR; + + if (fallback_shader_program == 0) { + return false; + } + + glUseProgram(fallback_shader_program); + image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture"); + if (image_texture_location < 0) { + LOG(ERROR) << "Shader doesn't containt the 'image_texture' uniform."; + return false; + } + + fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen"); + if (fullscreen_location < 0) { + LOG(ERROR) << "Shader doesn't containt the 'fullscreen' uniform."; + return false; + } + + fallback_status = FALLBACK_SHADER_STATUS_SUCCESS; + } + + /* Run this every time. 
*/ + glUseProgram(fallback_shader_program); + glUniform1i(image_texture_location, 0); + glUniform2f(fullscreen_location, width, height); + return true; } -void Device::draw_pixels( - device_memory& rgba, int y, - int w, int h, int width, int height, - int dx, int dy, int dw, int dh, - bool transparent, const DeviceDrawParams &draw_params) +void Device::draw_pixels(device_memory &rgba, + int y, + int w, + int h, + int width, + int height, + int dx, + int dy, + int dw, + int dh, + bool transparent, + const DeviceDrawParams &draw_params) { - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - - assert(rgba.type == MEM_PIXELS); - mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1)); - - GLuint texid; - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &texid); - glBindTexture(GL_TEXTURE_2D, texid); - - if(rgba.data_type == TYPE_HALF) { - GLhalf *data_pointer = (GLhalf*)rgba.host_pointer; - data_pointer += 4 * y * w; - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer); - } - else { - uint8_t *data_pointer = (uint8_t*)rgba.host_pointer; - data_pointer += 4 * y * w; - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer); - } - - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - - if(transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if(use_fallback_shader) { - if(!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if(!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */ - 
glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if(vpointer) { - /* texture coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = 1.0f; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = 1.0f; - vpointer[9] = 1.0f; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = 1.0f; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - if(vertex_buffer) { - glUnmapBuffer(GL_ARRAY_BUFFER); - } - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - glVertexAttribPointer(texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if(vertex_buffer) { - glBindBuffer(GL_ARRAY_BUFFER, 0); - } - - if(use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - glDeleteVertexArrays(1, &vertex_array_object); - glBindTexture(GL_TEXTURE_2D, 0); - glDeleteTextures(1, &texid); - - if(transparent) { - glDisable(GL_BLEND); - } + const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); + + assert(rgba.type == MEM_PIXELS); + mem_copy_from(rgba, y, w, h, rgba.memory_elements_size(1)); + + GLuint texid; + glActiveTexture(GL_TEXTURE0); + glGenTextures(1, &texid); + 
glBindTexture(GL_TEXTURE_2D, texid); + + if (rgba.data_type == TYPE_HALF) { + GLhalf *data_pointer = (GLhalf *)rgba.host_pointer; + data_pointer += 4 * y * w; + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, w, h, 0, GL_RGBA, GL_HALF_FLOAT, data_pointer); + } + else { + uint8_t *data_pointer = (uint8_t *)rgba.host_pointer; + data_pointer += 4 * y * w; + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, w, h, 0, GL_RGBA, GL_UNSIGNED_BYTE, data_pointer); + } + + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + + if (transparent) { + glEnable(GL_BLEND); + glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); + } + + GLint shader_program; + if (use_fallback_shader) { + if (!bind_fallback_display_space_shader(dw, dh)) { + return; + } + shader_program = fallback_shader_program; + } + else { + draw_params.bind_display_space_shader_cb(); + glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); + } + + if (!vertex_buffer) { + glGenBuffers(1, &vertex_buffer); + } + + glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); + /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */ + glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); + + float *vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); + + if (vpointer) { + /* texture coordinate - vertex pair */ + vpointer[0] = 0.0f; + vpointer[1] = 0.0f; + vpointer[2] = dx; + vpointer[3] = dy; + + vpointer[4] = 1.0f; + vpointer[5] = 0.0f; + vpointer[6] = (float)width + dx; + vpointer[7] = dy; + + vpointer[8] = 1.0f; + vpointer[9] = 1.0f; + vpointer[10] = (float)width + dx; + vpointer[11] = (float)height + dy; + + vpointer[12] = 0.0f; + vpointer[13] = 1.0f; + vpointer[14] = dx; + vpointer[15] = (float)height + dy; + + if (vertex_buffer) { + glUnmapBuffer(GL_ARRAY_BUFFER); + } + } + + GLuint vertex_array_object; + GLuint position_attribute, texcoord_attribute; + + glGenVertexArrays(1, 
&vertex_array_object); + glBindVertexArray(vertex_array_object); + + texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); + position_attribute = glGetAttribLocation(shader_program, "pos"); + + glEnableVertexAttribArray(texcoord_attribute); + glEnableVertexAttribArray(position_attribute); + + glVertexAttribPointer( + texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); + glVertexAttribPointer(position_attribute, + 2, + GL_FLOAT, + GL_FALSE, + 4 * sizeof(float), + (const GLvoid *)(sizeof(float) * 2)); + + glDrawArrays(GL_TRIANGLE_FAN, 0, 4); + + if (vertex_buffer) { + glBindBuffer(GL_ARRAY_BUFFER, 0); + } + + if (use_fallback_shader) { + glUseProgram(0); + } + else { + draw_params.unbind_display_space_shader_cb(); + } + + glDeleteVertexArrays(1, &vertex_array_object); + glBindTexture(GL_TEXTURE_2D, 0); + glDeleteTextures(1, &texid); + + if (transparent) { + glDisable(GL_BLEND); + } } -Device *Device::create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background) +Device *Device::create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) { - Device *device; + Device *device; - switch(info.type) { - case DEVICE_CPU: - device = device_cpu_create(info, stats, profiler, background); - break; + switch (info.type) { + case DEVICE_CPU: + device = device_cpu_create(info, stats, profiler, background); + break; #ifdef WITH_CUDA - case DEVICE_CUDA: - if(device_cuda_init()) - device = device_cuda_create(info, stats, profiler, background); - else - device = NULL; - break; + case DEVICE_CUDA: + if (device_cuda_init()) + device = device_cuda_create(info, stats, profiler, background); + else + device = NULL; + break; #endif #ifdef WITH_MULTI - case DEVICE_MULTI: - device = device_multi_create(info, stats, profiler, background); - break; + case DEVICE_MULTI: + device = device_multi_create(info, stats, profiler, background); + break; #endif #ifdef WITH_NETWORK - case DEVICE_NETWORK: - device = 
device_network_create(info, stats, profiler, "127.0.0.1"); - break; + case DEVICE_NETWORK: + device = device_network_create(info, stats, profiler, "127.0.0.1"); + break; #endif #ifdef WITH_OPENCL - case DEVICE_OPENCL: - if(device_opencl_init()) - device = device_opencl_create(info, stats, profiler, background); - else - device = NULL; - break; + case DEVICE_OPENCL: + if (device_opencl_init()) + device = device_opencl_create(info, stats, profiler, background); + else + device = NULL; + break; #endif - default: - return NULL; - } + default: + return NULL; + } - return device; + return device; } DeviceType Device::type_from_string(const char *name) { - if(strcmp(name, "CPU") == 0) - return DEVICE_CPU; - else if(strcmp(name, "CUDA") == 0) - return DEVICE_CUDA; - else if(strcmp(name, "OPENCL") == 0) - return DEVICE_OPENCL; - else if(strcmp(name, "NETWORK") == 0) - return DEVICE_NETWORK; - else if(strcmp(name, "MULTI") == 0) - return DEVICE_MULTI; - - return DEVICE_NONE; + if (strcmp(name, "CPU") == 0) + return DEVICE_CPU; + else if (strcmp(name, "CUDA") == 0) + return DEVICE_CUDA; + else if (strcmp(name, "OPENCL") == 0) + return DEVICE_OPENCL; + else if (strcmp(name, "NETWORK") == 0) + return DEVICE_NETWORK; + else if (strcmp(name, "MULTI") == 0) + return DEVICE_MULTI; + + return DEVICE_NONE; } string Device::string_from_type(DeviceType type) { - if(type == DEVICE_CPU) - return "CPU"; - else if(type == DEVICE_CUDA) - return "CUDA"; - else if(type == DEVICE_OPENCL) - return "OPENCL"; - else if(type == DEVICE_NETWORK) - return "NETWORK"; - else if(type == DEVICE_MULTI) - return "MULTI"; - - return ""; + if (type == DEVICE_CPU) + return "CPU"; + else if (type == DEVICE_CUDA) + return "CUDA"; + else if (type == DEVICE_OPENCL) + return "OPENCL"; + else if (type == DEVICE_NETWORK) + return "NETWORK"; + else if (type == DEVICE_MULTI) + return "MULTI"; + + return ""; } vector<DeviceType> Device::available_types() { - vector<DeviceType> types; - types.push_back(DEVICE_CPU); + 
vector<DeviceType> types; + types.push_back(DEVICE_CPU); #ifdef WITH_CUDA - types.push_back(DEVICE_CUDA); + types.push_back(DEVICE_CUDA); #endif #ifdef WITH_OPENCL - types.push_back(DEVICE_OPENCL); + types.push_back(DEVICE_OPENCL); #endif #ifdef WITH_NETWORK - types.push_back(DEVICE_NETWORK); + types.push_back(DEVICE_NETWORK); #endif - return types; + return types; } vector<DeviceInfo> Device::available_devices(uint mask) { - /* Lazy initialize devices. On some platforms OpenCL or CUDA drivers can - * be broken and cause crashes when only trying to get device info, so - * we don't want to do any initialization until the user chooses to. */ - thread_scoped_lock lock(device_mutex); - vector<DeviceInfo> devices; + /* Lazy initialize devices. On some platforms OpenCL or CUDA drivers can + * be broken and cause crashes when only trying to get device info, so + * we don't want to do any initialization until the user chooses to. */ + thread_scoped_lock lock(device_mutex); + vector<DeviceInfo> devices; #ifdef WITH_OPENCL - if(mask & DEVICE_MASK_OPENCL) { - if(!(devices_initialized_mask & DEVICE_MASK_OPENCL)) { - if(device_opencl_init()) { - device_opencl_info(opencl_devices); - } - devices_initialized_mask |= DEVICE_MASK_OPENCL; - } - foreach(DeviceInfo& info, opencl_devices) { - devices.push_back(info); - } - } + if (mask & DEVICE_MASK_OPENCL) { + if (!(devices_initialized_mask & DEVICE_MASK_OPENCL)) { + if (device_opencl_init()) { + device_opencl_info(opencl_devices); + } + devices_initialized_mask |= DEVICE_MASK_OPENCL; + } + foreach (DeviceInfo &info, opencl_devices) { + devices.push_back(info); + } + } #endif #ifdef WITH_CUDA - if(mask & DEVICE_MASK_CUDA) { - if(!(devices_initialized_mask & DEVICE_MASK_CUDA)) { - if(device_cuda_init()) { - device_cuda_info(cuda_devices); - } - devices_initialized_mask |= DEVICE_MASK_CUDA; - } - foreach(DeviceInfo& info, cuda_devices) { - devices.push_back(info); - } - } + if (mask & DEVICE_MASK_CUDA) { + if (!(devices_initialized_mask 
& DEVICE_MASK_CUDA)) { + if (device_cuda_init()) { + device_cuda_info(cuda_devices); + } + devices_initialized_mask |= DEVICE_MASK_CUDA; + } + foreach (DeviceInfo &info, cuda_devices) { + devices.push_back(info); + } + } #endif - if(mask & DEVICE_MASK_CPU) { - if(!(devices_initialized_mask & DEVICE_MASK_CPU)) { - device_cpu_info(cpu_devices); - devices_initialized_mask |= DEVICE_MASK_CPU; - } - foreach(DeviceInfo& info, cpu_devices) { - devices.push_back(info); - } - } + if (mask & DEVICE_MASK_CPU) { + if (!(devices_initialized_mask & DEVICE_MASK_CPU)) { + device_cpu_info(cpu_devices); + devices_initialized_mask |= DEVICE_MASK_CPU; + } + foreach (DeviceInfo &info, cpu_devices) { + devices.push_back(info); + } + } #ifdef WITH_NETWORK - if(mask & DEVICE_MASK_NETWORK) { - if(!(devices_initialized_mask & DEVICE_MASK_NETWORK)) { - device_network_info(network_devices); - devices_initialized_mask |= DEVICE_MASK_NETWORK; - } - foreach(DeviceInfo& info, network_devices) { - devices.push_back(info); - } - } + if (mask & DEVICE_MASK_NETWORK) { + if (!(devices_initialized_mask & DEVICE_MASK_NETWORK)) { + device_network_info(network_devices); + devices_initialized_mask |= DEVICE_MASK_NETWORK; + } + foreach (DeviceInfo &info, network_devices) { + devices.push_back(info); + } + } #endif - return devices; + return devices; } string Device::device_capabilities(uint mask) { - thread_scoped_lock lock(device_mutex); - string capabilities = ""; + thread_scoped_lock lock(device_mutex); + string capabilities = ""; - if(mask & DEVICE_MASK_CPU) { - capabilities += "\nCPU device capabilities: "; - capabilities += device_cpu_capabilities() + "\n"; - } + if (mask & DEVICE_MASK_CPU) { + capabilities += "\nCPU device capabilities: "; + capabilities += device_cpu_capabilities() + "\n"; + } #ifdef WITH_OPENCL - if(mask & DEVICE_MASK_OPENCL) { - if(device_opencl_init()) { - capabilities += "\nOpenCL device capabilities:\n"; - capabilities += device_opencl_capabilities(); - } - } + if (mask & 
DEVICE_MASK_OPENCL) { + if (device_opencl_init()) { + capabilities += "\nOpenCL device capabilities:\n"; + capabilities += device_opencl_capabilities(); + } + } #endif #ifdef WITH_CUDA - if(mask & DEVICE_MASK_CUDA) { - if(device_cuda_init()) { - capabilities += "\nCUDA device capabilities:\n"; - capabilities += device_cuda_capabilities(); - } - } + if (mask & DEVICE_MASK_CUDA) { + if (device_cuda_init()) { + capabilities += "\nCUDA device capabilities:\n"; + capabilities += device_cuda_capabilities(); + } + } #endif - return capabilities; + return capabilities; } -DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int threads, bool background) +DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, + int threads, + bool background) { - assert(subdevices.size() > 0); - - if(subdevices.size() == 1) { - /* No multi device needed. */ - return subdevices.front(); - } - - DeviceInfo info; - info.type = DEVICE_MULTI; - info.id = "MULTI"; - info.description = "Multi Device"; - info.num = 0; - - info.has_half_images = true; - info.has_volume_decoupled = true; - info.has_osl = true; - info.has_profiling = true; - - foreach(const DeviceInfo &device, subdevices) { - /* Ensure CPU device does not slow down GPU. */ - if(device.type == DEVICE_CPU && subdevices.size() > 1) { - if(background) { - int orig_cpu_threads = (threads)? threads: system_cpu_thread_count(); - int cpu_threads = max(orig_cpu_threads - (subdevices.size() - 1), 0); - - VLOG(1) << "CPU render threads reduced from " - << orig_cpu_threads << " to " << cpu_threads - << ", to dedicate to GPU."; - - if(cpu_threads >= 1) { - DeviceInfo cpu_device = device; - cpu_device.cpu_threads = cpu_threads; - info.multi_devices.push_back(cpu_device); - } - else { - continue; - } - } - else { - VLOG(1) << "CPU render threads disabled for interactive render."; - continue; - } - } - else { - info.multi_devices.push_back(device); - } - - /* Accumulate device info. 
*/ - info.has_half_images &= device.has_half_images; - info.has_volume_decoupled &= device.has_volume_decoupled; - info.has_osl &= device.has_osl; - info.has_profiling &= device.has_profiling; - } - - return info; + assert(subdevices.size() > 0); + + if (subdevices.size() == 1) { + /* No multi device needed. */ + return subdevices.front(); + } + + DeviceInfo info; + info.type = DEVICE_MULTI; + info.id = "MULTI"; + info.description = "Multi Device"; + info.num = 0; + + info.has_half_images = true; + info.has_volume_decoupled = true; + info.has_osl = true; + info.has_profiling = true; + + foreach (const DeviceInfo &device, subdevices) { + /* Ensure CPU device does not slow down GPU. */ + if (device.type == DEVICE_CPU && subdevices.size() > 1) { + if (background) { + int orig_cpu_threads = (threads) ? threads : system_cpu_thread_count(); + int cpu_threads = max(orig_cpu_threads - (subdevices.size() - 1), 0); + + VLOG(1) << "CPU render threads reduced from " << orig_cpu_threads << " to " << cpu_threads + << ", to dedicate to GPU."; + + if (cpu_threads >= 1) { + DeviceInfo cpu_device = device; + cpu_device.cpu_threads = cpu_threads; + info.multi_devices.push_back(cpu_device); + } + else { + continue; + } + } + else { + VLOG(1) << "CPU render threads disabled for interactive render."; + continue; + } + } + else { + info.multi_devices.push_back(device); + } + + /* Accumulate device info. 
*/ + info.has_half_images &= device.has_half_images; + info.has_volume_decoupled &= device.has_volume_decoupled; + info.has_osl &= device.has_osl; + info.has_profiling &= device.has_profiling; + } + + return info; } void Device::tag_update() { - free_memory(); + free_memory(); } void Device::free_memory() { - devices_initialized_mask = 0; - cuda_devices.free_memory(); - opencl_devices.free_memory(); - cpu_devices.free_memory(); - network_devices.free_memory(); + devices_initialized_mask = 0; + cuda_devices.free_memory(); + opencl_devices.free_memory(); + cpu_devices.free_memory(); + network_devices.free_memory(); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index aa0a8e434d2..15a0ceb4a19 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -40,384 +40,428 @@ class RenderTile; /* Device Types */ enum DeviceType { - DEVICE_NONE = 0, - DEVICE_CPU, - DEVICE_OPENCL, - DEVICE_CUDA, - DEVICE_NETWORK, - DEVICE_MULTI + DEVICE_NONE = 0, + DEVICE_CPU, + DEVICE_OPENCL, + DEVICE_CUDA, + DEVICE_NETWORK, + DEVICE_MULTI }; enum DeviceTypeMask { - DEVICE_MASK_CPU = (1 << DEVICE_CPU), - DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL), - DEVICE_MASK_CUDA = (1 << DEVICE_CUDA), - DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK), - DEVICE_MASK_ALL = ~0 + DEVICE_MASK_CPU = (1 << DEVICE_CPU), + DEVICE_MASK_OPENCL = (1 << DEVICE_OPENCL), + DEVICE_MASK_CUDA = (1 << DEVICE_CUDA), + DEVICE_MASK_NETWORK = (1 << DEVICE_NETWORK), + DEVICE_MASK_ALL = ~0 }; enum DeviceKernelStatus { - DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL = 0, - DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE, - DEVICE_KERNEL_USING_FEATURE_KERNEL, - DEVICE_KERNEL_FEATURE_KERNEL_INVALID, - DEVICE_KERNEL_UNKNOWN, + DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL = 0, + DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE, + DEVICE_KERNEL_USING_FEATURE_KERNEL, + DEVICE_KERNEL_FEATURE_KERNEL_INVALID, + DEVICE_KERNEL_UNKNOWN, }; #define DEVICE_MASK(type) (DeviceTypeMask)(1 << type) class 
DeviceInfo { -public: - DeviceType type; - string description; - string id; /* used for user preferences, should stay fixed with changing hardware config */ - int num; - bool display_device; /* GPU is used as a display device. */ - bool has_half_images; /* Support half-float textures. */ - bool has_volume_decoupled; /* Decoupled volume shading. */ - bool has_osl; /* Support Open Shading Language. */ - bool use_split_kernel; /* Use split or mega kernel. */ - bool has_profiling; /* Supports runtime collection of profiling info. */ - int cpu_threads; - vector<DeviceInfo> multi_devices; - - DeviceInfo() - { - type = DEVICE_CPU; - id = "CPU"; - num = 0; - cpu_threads = 0; - display_device = false; - has_half_images = false; - has_volume_decoupled = false; - has_osl = false; - use_split_kernel = false; - has_profiling = false; - } - - bool operator==(const DeviceInfo &info) { - /* Multiple Devices with the same ID would be very bad. */ - assert(id != info.id || (type == info.type && num == info.num && description == info.description)); - return id == info.id; - } + public: + DeviceType type; + string description; + string id; /* used for user preferences, should stay fixed with changing hardware config */ + int num; + bool display_device; /* GPU is used as a display device. */ + bool has_half_images; /* Support half-float textures. */ + bool has_volume_decoupled; /* Decoupled volume shading. */ + bool has_osl; /* Support Open Shading Language. */ + bool use_split_kernel; /* Use split or mega kernel. */ + bool has_profiling; /* Supports runtime collection of profiling info. 
*/ + int cpu_threads; + vector<DeviceInfo> multi_devices; + + DeviceInfo() + { + type = DEVICE_CPU; + id = "CPU"; + num = 0; + cpu_threads = 0; + display_device = false; + has_half_images = false; + has_volume_decoupled = false; + has_osl = false; + use_split_kernel = false; + has_profiling = false; + } + + bool operator==(const DeviceInfo &info) + { + /* Multiple Devices with the same ID would be very bad. */ + assert(id != info.id || + (type == info.type && num == info.num && description == info.description)); + return id == info.id; + } }; class DeviceRequestedFeatures { -public: - /* Use experimental feature set. */ - bool experimental; - - /* Selective nodes compilation. */ - - /* Identifier of a node group up to which all the nodes needs to be - * compiled in. Nodes from higher group indices will be ignores. - */ - int max_nodes_group; - - /* Features bitfield indicating which features from the requested group - * will be compiled in. Nodes which corresponds to features which are not - * in this bitfield will be ignored even if they're in the requested group. - */ - int nodes_features; - - /* BVH/sampling kernel features. */ - bool use_hair; - bool use_object_motion; - bool use_camera_motion; - - /* Denotes whether baking functionality is needed. */ - bool use_baking; - - /* Use subsurface scattering materials. */ - bool use_subsurface; - - /* Use volume materials. */ - bool use_volume; - - /* Use branched integrator. */ - bool use_integrator_branched; - - /* Use OpenSubdiv patch evaluation */ - bool use_patch_evaluation; - - /* Use Transparent shadows */ - bool use_transparent; - - /* Use various shadow tricks, such as shadow catcher. */ - bool use_shadow_tricks; - - /* Per-uber shader usage flags. */ - bool use_principled; - - /* Denoising features. */ - bool use_denoising; - - /* Use raytracing in shaders. 
*/ - bool use_shader_raytrace; - - /* Use true displacement */ - bool use_true_displacement; - - /* Use background lights */ - bool use_background_light; - - DeviceRequestedFeatures() - { - /* TODO(sergey): Find more meaningful defaults. */ - experimental = false; - max_nodes_group = 0; - nodes_features = 0; - use_hair = false; - use_object_motion = false; - use_camera_motion = false; - use_baking = false; - use_subsurface = false; - use_volume = false; - use_integrator_branched = false; - use_patch_evaluation = false; - use_transparent = false; - use_shadow_tricks = false; - use_principled = false; - use_denoising = false; - use_shader_raytrace = false; - use_true_displacement = false; - use_background_light = false; - } - - bool modified(const DeviceRequestedFeatures& requested_features) - { - return !(experimental == requested_features.experimental && - max_nodes_group == requested_features.max_nodes_group && - nodes_features == requested_features.nodes_features && - use_hair == requested_features.use_hair && - use_object_motion == requested_features.use_object_motion && - use_camera_motion == requested_features.use_camera_motion && - use_baking == requested_features.use_baking && - use_subsurface == requested_features.use_subsurface && - use_volume == requested_features.use_volume && - use_integrator_branched == requested_features.use_integrator_branched && - use_patch_evaluation == requested_features.use_patch_evaluation && - use_transparent == requested_features.use_transparent && - use_shadow_tricks == requested_features.use_shadow_tricks && - use_principled == requested_features.use_principled && - use_denoising == requested_features.use_denoising && - use_shader_raytrace == requested_features.use_shader_raytrace && - use_true_displacement == requested_features.use_true_displacement && - use_background_light == requested_features.use_background_light); - } - - /* Convert the requested features structure to a build options, - * which could then be passed to 
compilers. - */ - string get_build_options() const - { - string build_options = ""; - if(experimental) { - build_options += "-D__KERNEL_EXPERIMENTAL__ "; - } - build_options += "-D__NODES_MAX_GROUP__=" + - string_printf("%d", max_nodes_group); - build_options += " -D__NODES_FEATURES__=" + - string_printf("%d", nodes_features); - if(!use_hair) { - build_options += " -D__NO_HAIR__"; - } - if(!use_object_motion) { - build_options += " -D__NO_OBJECT_MOTION__"; - } - if(!use_camera_motion) { - build_options += " -D__NO_CAMERA_MOTION__"; - } - if(!use_baking) { - build_options += " -D__NO_BAKING__"; - } - if(!use_volume) { - build_options += " -D__NO_VOLUME__"; - } - if(!use_subsurface) { - build_options += " -D__NO_SUBSURFACE__"; - } - if(!use_integrator_branched) { - build_options += " -D__NO_BRANCHED_PATH__"; - } - if(!use_patch_evaluation) { - build_options += " -D__NO_PATCH_EVAL__"; - } - if(!use_transparent && !use_volume) { - build_options += " -D__NO_TRANSPARENT__"; - } - if(!use_shadow_tricks) { - build_options += " -D__NO_SHADOW_TRICKS__"; - } - if(!use_principled) { - build_options += " -D__NO_PRINCIPLED__"; - } - if(!use_denoising) { - build_options += " -D__NO_DENOISING__"; - } - if(!use_shader_raytrace) { - build_options += " -D__NO_SHADER_RAYTRACE__"; - } - return build_options; - } + public: + /* Use experimental feature set. */ + bool experimental; + + /* Selective nodes compilation. */ + + /* Identifier of a node group up to which all the nodes needs to be + * compiled in. Nodes from higher group indices will be ignores. + */ + int max_nodes_group; + + /* Features bitfield indicating which features from the requested group + * will be compiled in. Nodes which corresponds to features which are not + * in this bitfield will be ignored even if they're in the requested group. + */ + int nodes_features; + + /* BVH/sampling kernel features. 
*/ + bool use_hair; + bool use_object_motion; + bool use_camera_motion; + + /* Denotes whether baking functionality is needed. */ + bool use_baking; + + /* Use subsurface scattering materials. */ + bool use_subsurface; + + /* Use volume materials. */ + bool use_volume; + + /* Use branched integrator. */ + bool use_integrator_branched; + + /* Use OpenSubdiv patch evaluation */ + bool use_patch_evaluation; + + /* Use Transparent shadows */ + bool use_transparent; + + /* Use various shadow tricks, such as shadow catcher. */ + bool use_shadow_tricks; + + /* Per-uber shader usage flags. */ + bool use_principled; + + /* Denoising features. */ + bool use_denoising; + + /* Use raytracing in shaders. */ + bool use_shader_raytrace; + + /* Use true displacement */ + bool use_true_displacement; + + /* Use background lights */ + bool use_background_light; + + DeviceRequestedFeatures() + { + /* TODO(sergey): Find more meaningful defaults. */ + experimental = false; + max_nodes_group = 0; + nodes_features = 0; + use_hair = false; + use_object_motion = false; + use_camera_motion = false; + use_baking = false; + use_subsurface = false; + use_volume = false; + use_integrator_branched = false; + use_patch_evaluation = false; + use_transparent = false; + use_shadow_tricks = false; + use_principled = false; + use_denoising = false; + use_shader_raytrace = false; + use_true_displacement = false; + use_background_light = false; + } + + bool modified(const DeviceRequestedFeatures &requested_features) + { + return !(experimental == requested_features.experimental && + max_nodes_group == requested_features.max_nodes_group && + nodes_features == requested_features.nodes_features && + use_hair == requested_features.use_hair && + use_object_motion == requested_features.use_object_motion && + use_camera_motion == requested_features.use_camera_motion && + use_baking == requested_features.use_baking && + use_subsurface == requested_features.use_subsurface && + use_volume == 
requested_features.use_volume && + use_integrator_branched == requested_features.use_integrator_branched && + use_patch_evaluation == requested_features.use_patch_evaluation && + use_transparent == requested_features.use_transparent && + use_shadow_tricks == requested_features.use_shadow_tricks && + use_principled == requested_features.use_principled && + use_denoising == requested_features.use_denoising && + use_shader_raytrace == requested_features.use_shader_raytrace && + use_true_displacement == requested_features.use_true_displacement && + use_background_light == requested_features.use_background_light); + } + + /* Convert the requested features structure to a build options, + * which could then be passed to compilers. + */ + string get_build_options() const + { + string build_options = ""; + if (experimental) { + build_options += "-D__KERNEL_EXPERIMENTAL__ "; + } + build_options += "-D__NODES_MAX_GROUP__=" + string_printf("%d", max_nodes_group); + build_options += " -D__NODES_FEATURES__=" + string_printf("%d", nodes_features); + if (!use_hair) { + build_options += " -D__NO_HAIR__"; + } + if (!use_object_motion) { + build_options += " -D__NO_OBJECT_MOTION__"; + } + if (!use_camera_motion) { + build_options += " -D__NO_CAMERA_MOTION__"; + } + if (!use_baking) { + build_options += " -D__NO_BAKING__"; + } + if (!use_volume) { + build_options += " -D__NO_VOLUME__"; + } + if (!use_subsurface) { + build_options += " -D__NO_SUBSURFACE__"; + } + if (!use_integrator_branched) { + build_options += " -D__NO_BRANCHED_PATH__"; + } + if (!use_patch_evaluation) { + build_options += " -D__NO_PATCH_EVAL__"; + } + if (!use_transparent && !use_volume) { + build_options += " -D__NO_TRANSPARENT__"; + } + if (!use_shadow_tricks) { + build_options += " -D__NO_SHADOW_TRICKS__"; + } + if (!use_principled) { + build_options += " -D__NO_PRINCIPLED__"; + } + if (!use_denoising) { + build_options += " -D__NO_DENOISING__"; + } + if (!use_shader_raytrace) { + build_options += " 
-D__NO_SHADER_RAYTRACE__"; + } + return build_options; + } }; -std::ostream& operator <<(std::ostream &os, - const DeviceRequestedFeatures& requested_features); +std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &requested_features); /* Device */ struct DeviceDrawParams { - function<void()> bind_display_space_shader_cb; - function<void()> unbind_display_space_shader_cb; + function<void()> bind_display_space_shader_cb; + function<void()> unbind_display_space_shader_cb; }; class Device { - friend class device_sub_ptr; -protected: - enum { - FALLBACK_SHADER_STATUS_NONE = 0, - FALLBACK_SHADER_STATUS_ERROR, - FALLBACK_SHADER_STATUS_SUCCESS, - }; - - Device(DeviceInfo& info_, Stats &stats_, Profiler &profiler_, bool background) : background(background), - vertex_buffer(0), - fallback_status(FALLBACK_SHADER_STATUS_NONE), fallback_shader_program(0), - info(info_), stats(stats_), profiler(profiler_) {} - - bool background; - string error_msg; - - /* used for real time display */ - unsigned int vertex_buffer; - int fallback_status, fallback_shader_program; - int image_texture_location, fullscreen_location; - - bool bind_fallback_display_space_shader(const float width, const float height); - - virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/) - { - /* Only required for devices that implement denoising. 
*/ - assert(false); - return (device_ptr) 0; - } - virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {}; - -public: - virtual ~Device(); - - /* info */ - DeviceInfo info; - virtual const string& error_message() { return error_msg; } - bool have_error() { return !error_message().empty(); } - virtual void set_error(const string& error) - { - if(!have_error()) { - error_msg = error; - } - fprintf(stderr, "%s\n", error.c_str()); - fflush(stderr); - } - virtual bool show_samples() const { return false; } - virtual BVHLayoutMask get_bvh_layout_mask() const = 0; - - /* statistics */ - Stats &stats; - Profiler &profiler; - - /* memory alignment */ - virtual int mem_sub_ptr_alignment() { return MIN_ALIGNMENT_CPU_DATA_TYPES; } - - /* constant memory */ - virtual void const_copy_to(const char *name, void *host, size_t size) = 0; - - /* open shading language, only for CPU device */ - virtual void *osl_memory() { return NULL; } - - /* load/compile kernels, must be called before adding tasks */ - virtual bool load_kernels( - const DeviceRequestedFeatures& /*requested_features*/) - { return true; } - - /* Wait for device to become available to upload data and receive tasks - * This method is used by the OpenCL device to load the - * optimized kernels or when not (yet) available load the - * generic kernels (only during foreground rendering) */ - virtual bool wait_for_availability( - const DeviceRequestedFeatures& /*requested_features*/) - { return true; } - /* Check if there are 'better' kernels available to be used - * We can switch over to these kernels - * This method is used to determine if we can switch the preview kernels - * to regular kernels */ - virtual DeviceKernelStatus get_active_kernel_switch_state() - { return DEVICE_KERNEL_USING_FEATURE_KERNEL; } - - /* tasks */ - virtual int get_split_task_count(DeviceTask& task) = 0; - virtual void task_add(DeviceTask& task) = 0; - virtual void task_wait() = 0; - virtual void task_cancel() = 0; - - /* opengl drawing */ - virtual 
void draw_pixels(device_memory& mem, int y, - int w, int h, int width, int height, - int dx, int dy, int dw, int dh, - bool transparent, const DeviceDrawParams &draw_params); + friend class device_sub_ptr; + + protected: + enum { + FALLBACK_SHADER_STATUS_NONE = 0, + FALLBACK_SHADER_STATUS_ERROR, + FALLBACK_SHADER_STATUS_SUCCESS, + }; + + Device(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background) + : background(background), + vertex_buffer(0), + fallback_status(FALLBACK_SHADER_STATUS_NONE), + fallback_shader_program(0), + info(info_), + stats(stats_), + profiler(profiler_) + { + } + + bool background; + string error_msg; + + /* used for real time display */ + unsigned int vertex_buffer; + int fallback_status, fallback_shader_program; + int image_texture_location, fullscreen_location; + + bool bind_fallback_display_space_shader(const float width, const float height); + + virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/) + { + /* Only required for devices that implement denoising. 
*/ + assert(false); + return (device_ptr)0; + } + virtual void mem_free_sub_ptr(device_ptr /*ptr*/){}; + + public: + virtual ~Device(); + + /* info */ + DeviceInfo info; + virtual const string &error_message() + { + return error_msg; + } + bool have_error() + { + return !error_message().empty(); + } + virtual void set_error(const string &error) + { + if (!have_error()) { + error_msg = error; + } + fprintf(stderr, "%s\n", error.c_str()); + fflush(stderr); + } + virtual bool show_samples() const + { + return false; + } + virtual BVHLayoutMask get_bvh_layout_mask() const = 0; + + /* statistics */ + Stats &stats; + Profiler &profiler; + + /* memory alignment */ + virtual int mem_sub_ptr_alignment() + { + return MIN_ALIGNMENT_CPU_DATA_TYPES; + } + + /* constant memory */ + virtual void const_copy_to(const char *name, void *host, size_t size) = 0; + + /* open shading language, only for CPU device */ + virtual void *osl_memory() + { + return NULL; + } + + /* load/compile kernels, must be called before adding tasks */ + virtual bool load_kernels(const DeviceRequestedFeatures & /*requested_features*/) + { + return true; + } + + /* Wait for device to become available to upload data and receive tasks + * This method is used by the OpenCL device to load the + * optimized kernels or when not (yet) available load the + * generic kernels (only during foreground rendering) */ + virtual bool wait_for_availability(const DeviceRequestedFeatures & /*requested_features*/) + { + return true; + } + /* Check if there are 'better' kernels available to be used + * We can switch over to these kernels + * This method is used to determine if we can switch the preview kernels + * to regular kernels */ + virtual DeviceKernelStatus get_active_kernel_switch_state() + { + return DEVICE_KERNEL_USING_FEATURE_KERNEL; + } + + /* tasks */ + virtual int get_split_task_count(DeviceTask &task) = 0; + virtual void task_add(DeviceTask &task) = 0; + virtual void task_wait() = 0; + virtual void task_cancel() = 
0; + + /* opengl drawing */ + virtual void draw_pixels(device_memory &mem, + int y, + int w, + int h, + int width, + int height, + int dx, + int dy, + int dw, + int dh, + bool transparent, + const DeviceDrawParams &draw_params); #ifdef WITH_NETWORK - /* networking */ - void server_run(); + /* networking */ + void server_run(); #endif - /* multi device */ - virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {} - virtual int device_number(Device * /*sub_device*/) { return 0; } - virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} - virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {} - - /* static */ - static Device *create(DeviceInfo& info, Stats &stats, Profiler& profiler, bool background = true); - - static DeviceType type_from_string(const char *name); - static string string_from_type(DeviceType type); - static vector<DeviceType> available_types(); - static vector<DeviceInfo> available_devices(uint device_type_mask = DEVICE_MASK_ALL); - static string device_capabilities(uint device_type_mask = DEVICE_MASK_ALL); - static DeviceInfo get_multi_device(const vector<DeviceInfo>& subdevices, - int threads, - bool background); - - /* Tag devices lists for update. */ - static void tag_update(); - - static void free_memory(); - -protected: - /* Memory allocation, only accessed through device_memory. */ - friend class MultiDevice; - friend class DeviceServer; - friend class device_memory; - - virtual void mem_alloc(device_memory& mem) = 0; - virtual void mem_copy_to(device_memory& mem) = 0; - virtual void mem_copy_from(device_memory& mem, - int y, int w, int h, int elem) = 0; - virtual void mem_zero(device_memory& mem) = 0; - virtual void mem_free(device_memory& mem) = 0; - -private: - /* Indicted whether device types and devices lists were initialized. 
*/ - static bool need_types_update, need_devices_update; - static thread_mutex device_mutex; - static vector<DeviceInfo> cuda_devices; - static vector<DeviceInfo> opencl_devices; - static vector<DeviceInfo> cpu_devices; - static vector<DeviceInfo> network_devices; - static uint devices_initialized_mask; + /* multi device */ + virtual void map_tile(Device * /*sub_device*/, RenderTile & /*tile*/) + { + } + virtual int device_number(Device * /*sub_device*/) + { + return 0; + } + virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) + { + } + virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) + { + } + + /* static */ + static Device *create(DeviceInfo &info, + Stats &stats, + Profiler &profiler, + bool background = true); + + static DeviceType type_from_string(const char *name); + static string string_from_type(DeviceType type); + static vector<DeviceType> available_types(); + static vector<DeviceInfo> available_devices(uint device_type_mask = DEVICE_MASK_ALL); + static string device_capabilities(uint device_type_mask = DEVICE_MASK_ALL); + static DeviceInfo get_multi_device(const vector<DeviceInfo> &subdevices, + int threads, + bool background); + + /* Tag devices lists for update. */ + static void tag_update(); + + static void free_memory(); + + protected: + /* Memory allocation, only accessed through device_memory. */ + friend class MultiDevice; + friend class DeviceServer; + friend class device_memory; + + virtual void mem_alloc(device_memory &mem) = 0; + virtual void mem_copy_to(device_memory &mem) = 0; + virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) = 0; + virtual void mem_zero(device_memory &mem) = 0; + virtual void mem_free(device_memory &mem) = 0; + + private: + /* Indicted whether device types and devices lists were initialized. 
*/ + static bool need_types_update, need_devices_update; + static thread_mutex device_mutex; + static vector<DeviceInfo> cuda_devices; + static vector<DeviceInfo> opencl_devices; + static vector<DeviceInfo> cpu_devices; + static vector<DeviceInfo> network_devices; + static uint devices_initialized_mask; }; CCL_NAMESPACE_END -#endif /* __DEVICE_H__ */ +#endif /* __DEVICE_H__ */ diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 73f1fc02b08..837a8186064 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -61,1087 +61,1183 @@ class CPUDevice; /* Has to be outside of the class to be shared across template instantiations. */ static const char *logged_architecture = ""; -template<typename F> -class KernelFunctions { -public: - KernelFunctions() - { - kernel = (F)NULL; - } - - KernelFunctions(F kernel_default, - F kernel_sse2, - F kernel_sse3, - F kernel_sse41, - F kernel_avx, - F kernel_avx2) - { - const char *architecture_name = "default"; - kernel = kernel_default; - - /* Silence potential warnings about unused variables - * when compiling without some architectures. */ - (void) kernel_sse2; - (void) kernel_sse3; - (void) kernel_sse41; - (void) kernel_avx; - (void) kernel_avx2; +template<typename F> class KernelFunctions { + public: + KernelFunctions() + { + kernel = (F)NULL; + } + + KernelFunctions( + F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2) + { + const char *architecture_name = "default"; + kernel = kernel_default; + + /* Silence potential warnings about unused variables + * when compiling without some architectures. 
*/ + (void)kernel_sse2; + (void)kernel_sse3; + (void)kernel_sse41; + (void)kernel_avx; + (void)kernel_avx2; #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 - if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { - architecture_name = "AVX2"; - kernel = kernel_avx2; - } - else + if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { + architecture_name = "AVX2"; + kernel = kernel_avx2; + } + else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX - if(DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { - architecture_name = "AVX"; - kernel = kernel_avx; - } - else + if (DebugFlags().cpu.has_avx() && system_cpu_support_avx()) { + architecture_name = "AVX"; + kernel = kernel_avx; + } + else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 - if(DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { - architecture_name = "SSE4.1"; - kernel = kernel_sse41; - } - else + if (DebugFlags().cpu.has_sse41() && system_cpu_support_sse41()) { + architecture_name = "SSE4.1"; + kernel = kernel_sse41; + } + else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 - if(DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { - architecture_name = "SSE3"; - kernel = kernel_sse3; - } - else + if (DebugFlags().cpu.has_sse3() && system_cpu_support_sse3()) { + architecture_name = "SSE3"; + kernel = kernel_sse3; + } + else #endif #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 - if(DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { - architecture_name = "SSE2"; - kernel = kernel_sse2; - } + if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { + architecture_name = "SSE2"; + kernel = kernel_sse2; + } #endif - if(strcmp(architecture_name, logged_architecture) != 0) { - VLOG(1) << "Will be using " << architecture_name << " kernels."; - logged_architecture = architecture_name; - } - } - - inline F operator()() const { - assert(kernel); - return kernel; - } -protected: - F kernel; + if (strcmp(architecture_name, logged_architecture) != 0) { + VLOG(1) << "Will be 
using " << architecture_name << " kernels."; + logged_architecture = architecture_name; + } + } + + inline F operator()() const + { + assert(kernel); + return kernel; + } + + protected: + F kernel; }; class CPUSplitKernel : public DeviceSplitKernel { - CPUDevice *device; -public: - explicit CPUSplitKernel(CPUDevice *device); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data_, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs); - - virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures&); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); + CPUDevice *device; + + public: + explicit CPUSplitKernel(CPUDevice *device); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory &kernel_globals, + device_memory &kernel_data_, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs); + + virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &); + virtual int2 split_kernel_local_size(); + virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task); + virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); }; -class CPUDevice : public Device -{ -public: - TaskPool task_pool; - KernelGlobals kernel_globals; +class CPUDevice : public Device { + public: + TaskPool task_pool; + KernelGlobals 
kernel_globals; - device_vector<TextureInfo> texture_info; - bool need_texture_info; + device_vector<TextureInfo> texture_info; + bool need_texture_info; #ifdef WITH_OSL - OSLGlobals osl_globals; + OSLGlobals osl_globals; #endif - bool use_split_kernel; - - DeviceRequestedFeatures requested_features; - - KernelFunctions<void(*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel; - KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel; - KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel; - KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> shader_kernel; - - KernelFunctions<void(*)(int, TileInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel; - KernelFunctions<void(*)(int, TileInfo*, int, int, int, int, float*, float*, float, int*, int, int)> filter_get_feature_kernel; - KernelFunctions<void(*)(int, int, int, int*, float*, float*, int, int*)> filter_write_feature_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_detect_outliers_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel; - - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, int, float, float)> filter_nlm_calc_difference_kernel; - KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel; - KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, int, int, int)> filter_nlm_update_output_kernel; - KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; - - KernelFunctions<void(*)(float*, TileInfo*, int, int, int, float*, int*, int*, int, int, bool, int, float)> 
filter_construct_transform_kernel; - KernelFunctions<void(*)(int, int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel; - KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; - - KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, - int, int, int, int, int, int, int, int, ccl_global int*, int, - ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel; - unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels; + bool use_split_kernel; + + DeviceRequestedFeatures requested_features; + + KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel; + KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> + convert_to_half_float_kernel; + KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> + convert_to_byte_kernel; + KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> + shader_kernel; + + KernelFunctions<void (*)( + int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)> + filter_divide_shadow_kernel; + KernelFunctions<void (*)( + int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)> + filter_get_feature_kernel; + KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)> + filter_write_feature_kernel; + KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)> + filter_detect_outliers_kernel; + KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)> + filter_combine_halves_kernel; + + KernelFunctions<void (*)( + int, int, float *, float *, float *, float *, int *, int, int, int, float, float)> + filter_nlm_calc_difference_kernel; + 
KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel; + KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel; + KernelFunctions<void (*)( + int, int, float *, float *, float *, float *, float *, int *, int, int, int)> + filter_nlm_update_output_kernel; + KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel; + + KernelFunctions<void (*)( + float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)> + filter_construct_transform_kernel; + KernelFunctions<void (*)(int, + int, + int, + float *, + float *, + float *, + int *, + float *, + float3 *, + int *, + int *, + int, + int, + int, + int, + bool)> + filter_nlm_construct_gramian_kernel; + KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)> + filter_finalize_kernel; + + KernelFunctions<void (*)(KernelGlobals *, + ccl_constant KernelData *, + ccl_global void *, + int, + ccl_global char *, + int, + int, + int, + int, + int, + int, + int, + int, + ccl_global int *, + int, + ccl_global char *, + ccl_global unsigned int *, + unsigned int, + ccl_global float *)> + data_init_kernel; + unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels; #define KERNEL_FUNCTIONS(name) \ - KERNEL_NAME_EVAL(cpu, name), \ - KERNEL_NAME_EVAL(cpu_sse2, name), \ - KERNEL_NAME_EVAL(cpu_sse3, name), \ - KERNEL_NAME_EVAL(cpu_sse41, name), \ - KERNEL_NAME_EVAL(cpu_avx, name), \ - KERNEL_NAME_EVAL(cpu_avx2, name) - - CPUDevice(DeviceInfo& info_, Stats &stats_, Profiler &profiler_, bool background_) - : Device(info_, stats_, profiler_, background_), - texture_info(this, "__texture_info", MEM_TEXTURE), -#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name)) - REGISTER_KERNEL(path_trace), - REGISTER_KERNEL(convert_to_half_float), - REGISTER_KERNEL(convert_to_byte), - REGISTER_KERNEL(shader), - REGISTER_KERNEL(filter_divide_shadow), - 
REGISTER_KERNEL(filter_get_feature), - REGISTER_KERNEL(filter_write_feature), - REGISTER_KERNEL(filter_detect_outliers), - REGISTER_KERNEL(filter_combine_halves), - REGISTER_KERNEL(filter_nlm_calc_difference), - REGISTER_KERNEL(filter_nlm_blur), - REGISTER_KERNEL(filter_nlm_calc_weight), - REGISTER_KERNEL(filter_nlm_update_output), - REGISTER_KERNEL(filter_nlm_normalize), - REGISTER_KERNEL(filter_construct_transform), - REGISTER_KERNEL(filter_nlm_construct_gramian), - REGISTER_KERNEL(filter_finalize), - REGISTER_KERNEL(data_init) + KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \ + KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \ + KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name) + + CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_) + : Device(info_, stats_, profiler_, background_), + texture_info(this, "__texture_info", MEM_TEXTURE), +#define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name)) + REGISTER_KERNEL(path_trace), + REGISTER_KERNEL(convert_to_half_float), + REGISTER_KERNEL(convert_to_byte), + REGISTER_KERNEL(shader), + REGISTER_KERNEL(filter_divide_shadow), + REGISTER_KERNEL(filter_get_feature), + REGISTER_KERNEL(filter_write_feature), + REGISTER_KERNEL(filter_detect_outliers), + REGISTER_KERNEL(filter_combine_halves), + REGISTER_KERNEL(filter_nlm_calc_difference), + REGISTER_KERNEL(filter_nlm_blur), + REGISTER_KERNEL(filter_nlm_calc_weight), + REGISTER_KERNEL(filter_nlm_update_output), + REGISTER_KERNEL(filter_nlm_normalize), + REGISTER_KERNEL(filter_construct_transform), + REGISTER_KERNEL(filter_nlm_construct_gramian), + REGISTER_KERNEL(filter_finalize), + REGISTER_KERNEL(data_init) #undef REGISTER_KERNEL - { - if(info.cpu_threads == 0) { - info.cpu_threads = TaskScheduler::num_threads(); - } + { + if (info.cpu_threads == 0) { + info.cpu_threads = TaskScheduler::num_threads(); + } #ifdef WITH_OSL - kernel_globals.osl = &osl_globals; + kernel_globals.osl = 
&osl_globals; #endif - use_split_kernel = DebugFlags().cpu.split_kernel; - if(use_split_kernel) { - VLOG(1) << "Will be using split kernel."; - } - need_texture_info = false; - -#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name)) - REGISTER_SPLIT_KERNEL(path_init); - REGISTER_SPLIT_KERNEL(scene_intersect); - REGISTER_SPLIT_KERNEL(lamp_emission); - REGISTER_SPLIT_KERNEL(do_volume); - REGISTER_SPLIT_KERNEL(queue_enqueue); - REGISTER_SPLIT_KERNEL(indirect_background); - REGISTER_SPLIT_KERNEL(shader_setup); - REGISTER_SPLIT_KERNEL(shader_sort); - REGISTER_SPLIT_KERNEL(shader_eval); - REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); - REGISTER_SPLIT_KERNEL(subsurface_scatter); - REGISTER_SPLIT_KERNEL(direct_lighting); - REGISTER_SPLIT_KERNEL(shadow_blocked_ao); - REGISTER_SPLIT_KERNEL(shadow_blocked_dl); - REGISTER_SPLIT_KERNEL(enqueue_inactive); - REGISTER_SPLIT_KERNEL(next_iteration_setup); - REGISTER_SPLIT_KERNEL(indirect_subsurface); - REGISTER_SPLIT_KERNEL(buffer_update); + use_split_kernel = DebugFlags().cpu.split_kernel; + if (use_split_kernel) { + VLOG(1) << "Will be using split kernel."; + } + need_texture_info = false; + +#define REGISTER_SPLIT_KERNEL(name) \ + split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \ + KERNEL_FUNCTIONS(name)) + REGISTER_SPLIT_KERNEL(path_init); + REGISTER_SPLIT_KERNEL(scene_intersect); + REGISTER_SPLIT_KERNEL(lamp_emission); + REGISTER_SPLIT_KERNEL(do_volume); + REGISTER_SPLIT_KERNEL(queue_enqueue); + REGISTER_SPLIT_KERNEL(indirect_background); + REGISTER_SPLIT_KERNEL(shader_setup); + REGISTER_SPLIT_KERNEL(shader_sort); + REGISTER_SPLIT_KERNEL(shader_eval); + REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao); + REGISTER_SPLIT_KERNEL(subsurface_scatter); + REGISTER_SPLIT_KERNEL(direct_lighting); + REGISTER_SPLIT_KERNEL(shadow_blocked_ao); + REGISTER_SPLIT_KERNEL(shadow_blocked_dl); 
+ REGISTER_SPLIT_KERNEL(enqueue_inactive); + REGISTER_SPLIT_KERNEL(next_iteration_setup); + REGISTER_SPLIT_KERNEL(indirect_subsurface); + REGISTER_SPLIT_KERNEL(buffer_update); #undef REGISTER_SPLIT_KERNEL #undef KERNEL_FUNCTIONS - } - - ~CPUDevice() - { - task_pool.stop(); - texture_info.free(); - } - - virtual bool show_samples() const - { - return (info.cpu_threads == 1); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const { - BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; - if(DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { - bvh_layout_mask |= BVH_LAYOUT_BVH4; - } - if(DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { - bvh_layout_mask |= BVH_LAYOUT_BVH8; - } + } + + ~CPUDevice() + { + task_pool.stop(); + texture_info.free(); + } + + virtual bool show_samples() const + { + return (info.cpu_threads == 1); + } + + virtual BVHLayoutMask get_bvh_layout_mask() const + { + BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2; + if (DebugFlags().cpu.has_sse2() && system_cpu_support_sse2()) { + bvh_layout_mask |= BVH_LAYOUT_BVH4; + } + if (DebugFlags().cpu.has_avx2() && system_cpu_support_avx2()) { + bvh_layout_mask |= BVH_LAYOUT_BVH8; + } #ifdef WITH_EMBREE - bvh_layout_mask |= BVH_LAYOUT_EMBREE; -#endif /* WITH_EMBREE */ - return bvh_layout_mask; - } - - void load_texture_info() - { - if(need_texture_info) { - texture_info.copy_to_device(); - need_texture_info = false; - } - } - - void mem_alloc(device_memory& mem) - { - if(mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else { - if(mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. 
(" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - if(mem.type == MEM_DEVICE_ONLY) { - assert(!mem.host_pointer); - size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; - void *data = util_aligned_malloc(mem.memory_size(), alignment); - mem.device_pointer = (device_ptr)data; - } - else { - mem.device_pointer = (device_ptr)mem.host_pointer; - } - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - } - } - - void mem_copy_to(device_memory& mem) - { - if(mem.type == MEM_TEXTURE) { - tex_free(mem); - tex_alloc(mem); - } - else if(mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - /* copy is no-op */ - } - } - - void mem_copy_from(device_memory& /*mem*/, - int /*y*/, int /*w*/, int /*h*/, - int /*elem*/) - { - /* no-op */ - } - - void mem_zero(device_memory& mem) - { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - if(mem.device_pointer) { - memset((void*)mem.device_pointer, 0, mem.memory_size()); - } - } - - void mem_free(device_memory& mem) - { - if(mem.type == MEM_TEXTURE) { - tex_free(mem); - } - else if(mem.device_pointer) { - if(mem.type == MEM_DEVICE_ONLY) { - util_aligned_free((void*)mem.device_pointer); - } - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/) - { - return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); - } - - void const_copy_to(const char *name, void *host, size_t size) - { - kernel_const_copy(&kernel_globals, name, host, size); - } - - void tex_alloc(device_memory& mem) - { - VLOG(1) << "Texture allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - if(mem.interpolation == INTERPOLATION_NONE) { - /* Data texture. 
*/ - kernel_tex_copy(&kernel_globals, - mem.name, - mem.host_pointer, - mem.data_size); - } - else { - /* Image Texture. */ - int flat_slot = 0; - if(string_startswith(mem.name, "__tex_image")) { - int pos = string(mem.name).rfind("_"); - flat_slot = atoi(mem.name + pos + 1); - } - else { - assert(0); - } - - if(flat_slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount - * of re-allocations. */ - texture_info.resize(flat_slot + 128); - } - - TextureInfo& info = texture_info[flat_slot]; - info.data = (uint64_t)mem.host_pointer; - info.cl_buffer = 0; - info.interpolation = mem.interpolation; - info.extension = mem.extension; - info.width = mem.data_width; - info.height = mem.data_height; - info.depth = mem.data_depth; - - need_texture_info = true; - } - - mem.device_pointer = (device_ptr)mem.host_pointer; - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - } - - void tex_free(device_memory& mem) - { - if(mem.device_pointer) { - mem.device_pointer = 0; - stats.mem_free(mem.device_size); - mem.device_size = 0; - need_texture_info = true; - } - } - - void *osl_memory() - { + bvh_layout_mask |= BVH_LAYOUT_EMBREE; +#endif /* WITH_EMBREE */ + return bvh_layout_mask; + } + + void load_texture_info() + { + if (need_texture_info) { + texture_info.copy_to_device(); + need_texture_info = false; + } + } + + void mem_alloc(device_memory &mem) + { + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else { + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + if (mem.type == MEM_DEVICE_ONLY) { + assert(!mem.host_pointer); + size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES; + void *data = util_aligned_malloc(mem.memory_size(), alignment); + mem.device_pointer = (device_ptr)data; + } + else { + mem.device_pointer = (device_ptr)mem.host_pointer; + } + + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + } + } + + void mem_copy_to(device_memory &mem) + { + if (mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else if (mem.type == MEM_PIXELS) { + assert(!"mem_copy_to not supported for pixels."); + } + else { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + /* copy is no-op */ + } + } + + void mem_copy_from(device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) + { + /* no-op */ + } + + void mem_zero(device_memory &mem) + { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + if (mem.device_pointer) { + memset((void *)mem.device_pointer, 0, mem.memory_size()); + } + } + + void mem_free(device_memory &mem) + { + if (mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else if (mem.device_pointer) { + if (mem.type == MEM_DEVICE_ONLY) { + util_aligned_free((void *)mem.device_pointer); + } + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } + + virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) + { + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); + } + + void const_copy_to(const char *name, void *host, size_t size) + { + kernel_const_copy(&kernel_globals, name, host, size); + } + + void tex_alloc(device_memory &mem) + { + VLOG(1) << "Texture allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + if (mem.interpolation == INTERPOLATION_NONE) { + /* Data texture. 
*/ + kernel_tex_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size); + } + else { + /* Image Texture. */ + int flat_slot = 0; + if (string_startswith(mem.name, "__tex_image")) { + int pos = string(mem.name).rfind("_"); + flat_slot = atoi(mem.name + pos + 1); + } + else { + assert(0); + } + + if (flat_slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(flat_slot + 128); + } + + TextureInfo &info = texture_info[flat_slot]; + info.data = (uint64_t)mem.host_pointer; + info.cl_buffer = 0; + info.interpolation = mem.interpolation; + info.extension = mem.extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + + need_texture_info = true; + } + + mem.device_pointer = (device_ptr)mem.host_pointer; + mem.device_size = mem.memory_size(); + stats.mem_alloc(mem.device_size); + } + + void tex_free(device_memory &mem) + { + if (mem.device_pointer) { + mem.device_pointer = 0; + stats.mem_free(mem.device_size); + mem.device_size = 0; + need_texture_info = true; + } + } + + void *osl_memory() + { #ifdef WITH_OSL - return &osl_globals; + return &osl_globals; #else - return NULL; + return NULL; #endif - } - - void thread_run(DeviceTask *task) - { - if(task->type == DeviceTask::RENDER) { - thread_render(*task); - } - else if(task->type == DeviceTask::FILM_CONVERT) - thread_film_convert(*task); - else if(task->type == DeviceTask::SHADER) - thread_shader(*task); - } - - class CPUDeviceTask : public DeviceTask { - public: - CPUDeviceTask(CPUDevice *device, DeviceTask& task) - : DeviceTask(task) - { - run = function_bind(&CPUDevice::thread_run, device, this); - } - }; - - bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS); - - int4 rect = task->rect; - int r = 
task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int w = align_up(rect.z-rect.x, 4); - int h = rect.w-rect.y; - int stride = task->buffer.stride; - int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; - - float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer; - float *blurDifference = temporary_mem; - float *difference = temporary_mem + task->buffer.pass_stride; - float *weightAccum = temporary_mem + 2*task->buffer.pass_stride; - - memset(weightAccum, 0, sizeof(float)*w*h); - memset((float*) out_ptr, 0, sizeof(float)*w*h); - - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - int dy = i / (2*r+1) - r; - int dx = i % (2*r+1) - r; - - int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, dy, - (float*) guide_ptr, - (float*) variance_ptr, - NULL, - difference, - local_rect, - w, channel_offset, - 0, a, k_2); - - filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); - filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f); - - filter_nlm_update_output_kernel()(dx, dy, - blurDifference, - (float*) image_ptr, - difference, - (float*) out_ptr, - weightAccum, - local_rect, - channel_offset, - stride, f); - } - - int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y}; - filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w); - - return true; - } - - bool denoising_construct_transform(DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM); - - for(int y = 0; y < task->filter_area.w; y++) { - for(int x = 0; x < task->filter_area.z; x++) { - filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer, - task->tile_info, - x + task->filter_area.x, - y + task->filter_area.y, 
- y*task->filter_area.z + x, - (float*) task->storage.transform.device_pointer, - (int*) task->storage.rank.device_pointer, - &task->rect.x, - task->buffer.pass_stride, - task->buffer.frame_stride, - task->buffer.use_time, - task->radius, - task->pca_threshold); - } - } - return true; - } - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); - - float *temporary_mem = (float*) task->buffer.temporary_mem.device_pointer; - float *difference = temporary_mem; - float *blurDifference = temporary_mem + task->buffer.pass_stride; - - int r = task->radius; - int frame_offset = frame * task->buffer.frame_stride; - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - int dy = i / (2*r+1) - r; - int dx = i % (2*r+1) - r; - - int local_rect[4] = {max(0, -dx), max(0, -dy), - task->reconstruction_state.source_w - max(0, dx), - task->reconstruction_state.source_h - max(0, dy)}; - filter_nlm_calc_difference_kernel()(dx, dy, - (float*) color_ptr, - (float*) color_variance_ptr, - (float*) scale_ptr, - difference, - local_rect, - task->buffer.stride, - task->buffer.pass_stride, - frame_offset, - 1.0f, - task->nlm_k_2); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); - filter_nlm_construct_gramian_kernel()(dx, dy, - task->tile_info->frames[frame], - blurDifference, - (float*) task->buffer.mem.device_pointer, - (float*) task->storage.transform.device_pointer, - (int*) task->storage.rank.device_pointer, - (float*) task->storage.XtWX.device_pointer, - (float3*) task->storage.XtWY.device_pointer, - local_rect, - &task->reconstruction_state.filter_window.x, - task->buffer.stride, - 4, - 
task->buffer.pass_stride, - frame_offset, - task->buffer.use_time); - } - - return true; - } - - bool denoising_solve(device_ptr output_ptr, - DenoisingTask *task) - { - for(int y = 0; y < task->filter_area.w; y++) { - for(int x = 0; x < task->filter_area.z; x++) { - filter_finalize_kernel()(x, - y, - y*task->filter_area.z + x, - (float*) output_ptr, - (int*) task->storage.rank.device_pointer, - (float*) task->storage.XtWX.device_pointer, - (float3*) task->storage.XtWY.device_pointer, - &task->reconstruction_state.buffer_params.x, - task->render_buffer.samples); - } - } - return true; - } - - bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, - device_ptr mean_ptr, device_ptr variance_ptr, - int r, int4 rect, DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES); - - for(int y = rect.y; y < rect.w; y++) { - for(int x = rect.x; x < rect.z; x++) { - filter_combine_halves_kernel()(x, y, - (float*) mean_ptr, - (float*) variance_ptr, - (float*) a_ptr, - (float*) b_ptr, - &rect.x, - r); - } - } - return true; - } - - bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, - device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW); - - for(int y = task->rect.y; y < task->rect.w; y++) { - for(int x = task->rect.x; x < task->rect.z; x++) { - filter_divide_shadow_kernel()(task->render_buffer.samples, - task->tile_info, - x, y, - (float*) a_ptr, - (float*) b_ptr, - (float*) sample_variance_ptr, - (float*) sv_variance_ptr, - (float*) buffer_variance_ptr, - &task->rect.x, - task->render_buffer.pass_stride, - task->render_buffer.offset); - } - } - return true; - } - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, 
PROFILING_DENOISING_GET_FEATURE); - - for(int y = task->rect.y; y < task->rect.w; y++) { - for(int x = task->rect.x; x < task->rect.z; x++) { - filter_get_feature_kernel()(task->render_buffer.samples, - task->tile_info, - mean_offset, - variance_offset, - x, y, - (float*) mean_ptr, - (float*) variance_ptr, - scale, - &task->rect.x, - task->render_buffer.pass_stride, - task->render_buffer.offset); - } - } - return true; - } - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) - { - for(int y = 0; y < task->filter_area.w; y++) { - for(int x = 0; x < task->filter_area.z; x++) { - filter_write_feature_kernel()(task->render_buffer.samples, - x + task->filter_area.x, - y + task->filter_area.y, - &task->reconstruction_state.buffer_params.x, - (float*) from_ptr, - (float*) buffer_ptr, - out_offset, - &task->rect.x); - } - } - return true; - } - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) - { - ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS); - - for(int y = task->rect.y; y < task->rect.w; y++) { - for(int x = task->rect.x; x < task->rect.z; x++) { - filter_detect_outliers_kernel()(x, y, - (float*) image_ptr, - (float*) variance_ptr, - (float*) depth_ptr, - (float*) output_ptr, - &task->rect.x, - task->buffer.pass_stride); - } - } - return true; - } - - void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) - { - const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; - - scoped_timer timer(&tile.buffers->render_time); - - Coverage coverage(kg, tile); - if(use_coverage) { - coverage.init_path_trace(); - } - - float *render_buffer = (float*)tile.buffer; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - /* Needed for Embree. 
*/ - SIMD_SET_FLUSH_TO_ZERO; - - for(int sample = start_sample; sample < end_sample; sample++) { - if(task.get_cancel() || task_pool.canceled()) { - if(task.need_finish_queue == false) - break; - } - - for(int y = tile.y; y < tile.y + tile.h; y++) { - for(int x = tile.x; x < tile.x + tile.w; x++) { - if(use_coverage) { - coverage.init_pixel(x, y); - } - path_trace_kernel()(kg, render_buffer, - sample, x, y, tile.offset, tile.stride); - } - } - - tile.sample = sample + 1; - - task.update_progress(&tile, tile.w*tile.h); - } - if(use_coverage) { - coverage.finalize(); - } - } - - void denoise(DenoisingTask& denoising, RenderTile &tile) - { - ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); - - tile.sample = tile.start_sample + tile.num_samples; - - denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind(&CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); - denoising.render_buffer.samples = tile.sample; - 
denoising.buffer.gpu_temporary_mem = false; - - denoising.run_denoising(&tile); - } - - void thread_render(DeviceTask& task) - { - if(task_pool.canceled()) { - if(task.need_finish_queue == false) - return; - } - - /* allocate buffer for kernel globals */ - device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init()); - - profiler.add_state(&kg->profiler); - - CPUSplitKernel *split_kernel = NULL; - if(use_split_kernel) { - split_kernel = new CPUSplitKernel(this); - if(!split_kernel->load_kernels(requested_features)) { - thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); - kgbuffer.free(); - delete split_kernel; - return; - } - } - - RenderTile tile; - DenoisingTask denoising(this, task); - denoising.profiler = &kg->profiler; - - while(task.acquire_tile(this, tile)) { - if(tile.task == RenderTile::PATH_TRACE) { - if(use_split_kernel) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(&task, tile, kgbuffer, void_buffer); - } - else { - path_trace(task, tile, kg); - } - } - else if(tile.task == RenderTile::DENOISE) { - denoise(denoising, tile); - task.update_progress(&tile, tile.w*tile.h); - } - - task.release_tile(tile); - - if(task_pool.canceled()) { - if(task.need_finish_queue == false) - break; - } - } - - profiler.remove_state(&kg->profiler); - - thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer); - kg->~KernelGlobals(); - kgbuffer.free(); - delete split_kernel; - } - - void thread_film_convert(DeviceTask& task) - { - float sample_scale = 1.0f/(task.sample + 1); - - if(task.rgba_half) { - for(int y = task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); - } - else { - for(int y = 
task.y; y < task.y + task.h; y++) - for(int x = task.x; x < task.x + task.w; x++) - convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer, - sample_scale, x, y, task.offset, task.stride); - - } - } - - void thread_shader(DeviceTask& task) - { - KernelGlobals kg = kernel_globals; + } + + void thread_run(DeviceTask *task) + { + if (task->type == DeviceTask::RENDER) { + thread_render(*task); + } + else if (task->type == DeviceTask::FILM_CONVERT) + thread_film_convert(*task); + else if (task->type == DeviceTask::SHADER) + thread_shader(*task); + } + + class CPUDeviceTask : public DeviceTask { + public: + CPUDeviceTask(CPUDevice *device, DeviceTask &task) : DeviceTask(task) + { + run = function_bind(&CPUDevice::thread_run, device, this); + } + }; + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS); + + int4 rect = task->rect; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int w = align_up(rect.z - rect.x, 4); + int h = rect.w - rect.y; + int stride = task->buffer.stride; + int channel_offset = task->nlm_state.is_color ? 
task->buffer.pass_stride : 0; + + float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer; + float *blurDifference = temporary_mem; + float *difference = temporary_mem + task->buffer.pass_stride; + float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride; + + memset(weightAccum, 0, sizeof(float) * w * h); + memset((float *)out_ptr, 0, sizeof(float) * w * h); + + for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) { + int dy = i / (2 * r + 1) - r; + int dx = i % (2 * r + 1) - r; + + int local_rect[4] = { + max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, + dy, + (float *)guide_ptr, + (float *)variance_ptr, + NULL, + difference, + local_rect, + w, + channel_offset, + 0, + a, + k_2); + + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f); + + filter_nlm_update_output_kernel()(dx, + dy, + blurDifference, + (float *)image_ptr, + difference, + (float *)out_ptr, + weightAccum, + local_rect, + channel_offset, + stride, + f); + } + + int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y}; + filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w); + + return true; + } + + bool denoising_construct_transform(DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM); + + for (int y = 0; y < task->filter_area.w; y++) { + for (int x = 0; x < task->filter_area.z; x++) { + filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer, + task->tile_info, + x + task->filter_area.x, + y + task->filter_area.y, + y * task->filter_area.z + x, + (float *)task->storage.transform.device_pointer, + (int *)task->storage.rank.device_pointer, + &task->rect.x, + task->buffer.pass_stride, + task->buffer.frame_stride, + 
task->buffer.use_time, + task->radius, + task->pca_threshold); + } + } + return true; + } + + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT); + + float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer; + float *difference = temporary_mem; + float *blurDifference = temporary_mem + task->buffer.pass_stride; + + int r = task->radius; + int frame_offset = frame * task->buffer.frame_stride; + for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) { + int dy = i / (2 * r + 1) - r; + int dx = i % (2 * r + 1) - r; + + int local_rect[4] = {max(0, -dx), + max(0, -dy), + task->reconstruction_state.source_w - max(0, dx), + task->reconstruction_state.source_h - max(0, dy)}; + filter_nlm_calc_difference_kernel()(dx, + dy, + (float *)color_ptr, + (float *)color_variance_ptr, + (float *)scale_ptr, + difference, + local_rect, + task->buffer.stride, + task->buffer.pass_stride, + frame_offset, + 1.0f, + task->nlm_k_2); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); + filter_nlm_calc_weight_kernel()( + blurDifference, difference, local_rect, task->buffer.stride, 4); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); + filter_nlm_construct_gramian_kernel()(dx, + dy, + task->tile_info->frames[frame], + blurDifference, + (float *)task->buffer.mem.device_pointer, + (float *)task->storage.transform.device_pointer, + (int *)task->storage.rank.device_pointer, + (float *)task->storage.XtWX.device_pointer, + (float3 *)task->storage.XtWY.device_pointer, + local_rect, + &task->reconstruction_state.filter_window.x, + task->buffer.stride, + 4, + task->buffer.pass_stride, + frame_offset, + task->buffer.use_time); + } + + return true; + } + + bool denoising_solve(device_ptr output_ptr, DenoisingTask *task) + { + for (int y = 0; 
y < task->filter_area.w; y++) { + for (int x = 0; x < task->filter_area.z; x++) { + filter_finalize_kernel()(x, + y, + y * task->filter_area.z + x, + (float *)output_ptr, + (int *)task->storage.rank.device_pointer, + (float *)task->storage.XtWX.device_pointer, + (float3 *)task->storage.XtWY.device_pointer, + &task->reconstruction_state.buffer_params.x, + task->render_buffer.samples); + } + } + return true; + } + + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect, + DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES); + + for (int y = rect.y; y < rect.w; y++) { + for (int x = rect.x; x < rect.z; x++) { + filter_combine_halves_kernel()(x, + y, + (float *)mean_ptr, + (float *)variance_ptr, + (float *)a_ptr, + (float *)b_ptr, + &rect.x, + r); + } + } + return true; + } + + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW); + + for (int y = task->rect.y; y < task->rect.w; y++) { + for (int x = task->rect.x; x < task->rect.z; x++) { + filter_divide_shadow_kernel()(task->render_buffer.samples, + task->tile_info, + x, + y, + (float *)a_ptr, + (float *)b_ptr, + (float *)sample_variance_ptr, + (float *)sv_variance_ptr, + (float *)buffer_variance_ptr, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.offset); + } + } + return true; + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale, + DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE); + + for (int y = task->rect.y; y < task->rect.w; y++) { + for (int x = task->rect.x; x < task->rect.z; x++) { + 
filter_get_feature_kernel()(task->render_buffer.samples, + task->tile_info, + mean_offset, + variance_offset, + x, + y, + (float *)mean_ptr, + (float *)variance_ptr, + scale, + &task->rect.x, + task->render_buffer.pass_stride, + task->render_buffer.offset); + } + } + return true; + } + + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) + { + for (int y = 0; y < task->filter_area.w; y++) { + for (int x = 0; x < task->filter_area.z; x++) { + filter_write_feature_kernel()(task->render_buffer.samples, + x + task->filter_area.x, + y + task->filter_area.y, + &task->reconstruction_state.buffer_params.x, + (float *)from_ptr, + (float *)buffer_ptr, + out_offset, + &task->rect.x); + } + } + return true; + } + + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS); + + for (int y = task->rect.y; y < task->rect.w; y++) { + for (int x = task->rect.x; x < task->rect.z; x++) { + filter_detect_outliers_kernel()(x, + y, + (float *)image_ptr, + (float *)variance_ptr, + (float *)depth_ptr, + (float *)output_ptr, + &task->rect.x, + task->buffer.pass_stride); + } + } + return true; + } + + void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) + { + const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; + + scoped_timer timer(&tile.buffers->render_time); + + Coverage coverage(kg, tile); + if (use_coverage) { + coverage.init_path_trace(); + } + + float *render_buffer = (float *)tile.buffer; + int start_sample = tile.start_sample; + int end_sample = tile.start_sample + tile.num_samples; + + /* Needed for Embree. 
*/ + SIMD_SET_FLUSH_TO_ZERO; + + for (int sample = start_sample; sample < end_sample; sample++) { + if (task.get_cancel() || task_pool.canceled()) { + if (task.need_finish_queue == false) + break; + } + + for (int y = tile.y; y < tile.y + tile.h; y++) { + for (int x = tile.x; x < tile.x + tile.w; x++) { + if (use_coverage) { + coverage.init_pixel(x, y); + } + path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride); + } + } + + tile.sample = sample + 1; + + task.update_progress(&tile, tile.w * tile.h); + } + if (use_coverage) { + coverage.finalize(); + } + } + + void denoise(DenoisingTask &denoising, RenderTile &tile) + { + ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); + + tile.sample = tile.start_sample + tile.num_samples; + + denoising.functions.construct_transform = function_bind( + &CPUDevice::denoising_construct_transform, this, &denoising); + denoising.functions.accumulate = function_bind( + &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising); + denoising.functions.divide_shadow = function_bind( + &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind( + &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind( + &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind( + &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind( + &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising); + denoising.functions.detect_outliers = function_bind( + &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h); + 
denoising.render_buffer.samples = tile.sample; + denoising.buffer.gpu_temporary_mem = false; + + denoising.run_denoising(&tile); + } + + void thread_render(DeviceTask &task) + { + if (task_pool.canceled()) { + if (task.need_finish_queue == false) + return; + } + + /* allocate buffer for kernel globals */ + device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals"); + kgbuffer.alloc_to_device(1); + + KernelGlobals *kg = new ((void *)kgbuffer.device_pointer) + KernelGlobals(thread_kernel_globals_init()); + + profiler.add_state(&kg->profiler); + + CPUSplitKernel *split_kernel = NULL; + if (use_split_kernel) { + split_kernel = new CPUSplitKernel(this); + if (!split_kernel->load_kernels(requested_features)) { + thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); + kgbuffer.free(); + delete split_kernel; + return; + } + } + + RenderTile tile; + DenoisingTask denoising(this, task); + denoising.profiler = &kg->profiler; + + while (task.acquire_tile(this, tile)) { + if (tile.task == RenderTile::PATH_TRACE) { + if (use_split_kernel) { + device_only_memory<uchar> void_buffer(this, "void_buffer"); + split_kernel->path_trace(&task, tile, kgbuffer, void_buffer); + } + else { + path_trace(task, tile, kg); + } + } + else if (tile.task == RenderTile::DENOISE) { + denoise(denoising, tile); + task.update_progress(&tile, tile.w * tile.h); + } + + task.release_tile(tile); + + if (task_pool.canceled()) { + if (task.need_finish_queue == false) + break; + } + } + + profiler.remove_state(&kg->profiler); + + thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); + kg->~KernelGlobals(); + kgbuffer.free(); + delete split_kernel; + } + + void thread_film_convert(DeviceTask &task) + { + float sample_scale = 1.0f / (task.sample + 1); + + if (task.rgba_half) { + for (int y = task.y; y < task.y + task.h; y++) + for (int x = task.x; x < task.x + task.w; x++) + convert_to_half_float_kernel()(&kernel_globals, + (uchar4 *)task.rgba_half, + (float *)task.buffer, 
+ sample_scale, + x, + y, + task.offset, + task.stride); + } + else { + for (int y = task.y; y < task.y + task.h; y++) + for (int x = task.x; x < task.x + task.w; x++) + convert_to_byte_kernel()(&kernel_globals, + (uchar4 *)task.rgba_byte, + (float *)task.buffer, + sample_scale, + x, + y, + task.offset, + task.stride); + } + } + + void thread_shader(DeviceTask &task) + { + KernelGlobals kg = kernel_globals; #ifdef WITH_OSL - OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); + OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif - for(int sample = 0; sample < task.num_samples; sample++) { - for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) - shader_kernel()(&kg, - (uint4*)task.shader_input, - (float4*)task.shader_output, - task.shader_eval_type, - task.shader_filter, - x, - task.offset, - sample); - - if(task.get_cancel() || task_pool.canceled()) - break; - - task.update_progress(NULL); - - } + for (int sample = 0; sample < task.num_samples; sample++) { + for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++) + shader_kernel()(&kg, + (uint4 *)task.shader_input, + (float4 *)task.shader_output, + task.shader_eval_type, + task.shader_filter, + x, + task.offset, + sample); + + if (task.get_cancel() || task_pool.canceled()) + break; + + task.update_progress(NULL); + } #ifdef WITH_OSL - OSLShader::thread_free(&kg); + OSLShader::thread_free(&kg); #endif - } - - int get_split_task_count(DeviceTask& task) - { - if(task.type == DeviceTask::SHADER) - return task.get_subtask_count(info.cpu_threads, 256); - else - return task.get_subtask_count(info.cpu_threads); - } - - void task_add(DeviceTask& task) - { - /* Load texture info. 
*/ - load_texture_info(); - - /* split task into smaller ones */ - list<DeviceTask> tasks; - - if(task.type == DeviceTask::SHADER) - task.split(tasks, info.cpu_threads, 256); - else - task.split(tasks, info.cpu_threads); - - foreach(DeviceTask& task, tasks) - task_pool.push(new CPUDeviceTask(this, task)); - } - - void task_wait() - { - task_pool.wait_work(); - } - - void task_cancel() - { - task_pool.cancel(); - } - -protected: - inline KernelGlobals thread_kernel_globals_init() - { - KernelGlobals kg = kernel_globals; - kg.transparent_shadow_intersections = NULL; - const int decoupled_count = sizeof(kg.decoupled_volume_steps) / - sizeof(*kg.decoupled_volume_steps); - for(int i = 0; i < decoupled_count; ++i) { - kg.decoupled_volume_steps[i] = NULL; - } - kg.decoupled_volume_steps_index = 0; - kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL; + } + + int get_split_task_count(DeviceTask &task) + { + if (task.type == DeviceTask::SHADER) + return task.get_subtask_count(info.cpu_threads, 256); + else + return task.get_subtask_count(info.cpu_threads); + } + + void task_add(DeviceTask &task) + { + /* Load texture info. 
*/ + load_texture_info(); + + /* split task into smaller ones */ + list<DeviceTask> tasks; + + if (task.type == DeviceTask::SHADER) + task.split(tasks, info.cpu_threads, 256); + else + task.split(tasks, info.cpu_threads); + + foreach (DeviceTask &task, tasks) + task_pool.push(new CPUDeviceTask(this, task)); + } + + void task_wait() + { + task_pool.wait_work(); + } + + void task_cancel() + { + task_pool.cancel(); + } + + protected: + inline KernelGlobals thread_kernel_globals_init() + { + KernelGlobals kg = kernel_globals; + kg.transparent_shadow_intersections = NULL; + const int decoupled_count = sizeof(kg.decoupled_volume_steps) / + sizeof(*kg.decoupled_volume_steps); + for (int i = 0; i < decoupled_count; ++i) { + kg.decoupled_volume_steps[i] = NULL; + } + kg.decoupled_volume_steps_index = 0; + kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL; #ifdef WITH_OSL - OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); + OSLShader::thread_init(&kg, &kernel_globals, &osl_globals); #endif - return kg; - } - - inline void thread_kernel_globals_free(KernelGlobals *kg) - { - if(kg == NULL) { - return; - } - - if(kg->transparent_shadow_intersections != NULL) { - free(kg->transparent_shadow_intersections); - } - const int decoupled_count = sizeof(kg->decoupled_volume_steps) / - sizeof(*kg->decoupled_volume_steps); - for(int i = 0; i < decoupled_count; ++i) { - if(kg->decoupled_volume_steps[i] != NULL) { - free(kg->decoupled_volume_steps[i]); - } - } + return kg; + } + + inline void thread_kernel_globals_free(KernelGlobals *kg) + { + if (kg == NULL) { + return; + } + + if (kg->transparent_shadow_intersections != NULL) { + free(kg->transparent_shadow_intersections); + } + const int decoupled_count = sizeof(kg->decoupled_volume_steps) / + sizeof(*kg->decoupled_volume_steps); + for (int i = 0; i < decoupled_count; ++i) { + if (kg->decoupled_volume_steps[i] != NULL) { + free(kg->decoupled_volume_steps[i]); + } + } #ifdef WITH_OSL - 
OSLShader::thread_free(kg); + OSLShader::thread_free(kg); #endif - } + } - virtual bool load_kernels(const DeviceRequestedFeatures& requested_features_) { - requested_features = requested_features_; + virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_) + { + requested_features = requested_features_; - return true; - } + return true; + } }; /* split kernel */ class CPUSplitKernelFunction : public SplitKernelFunction { -public: - CPUDevice* device; - void (*func)(KernelGlobals *kg, KernelData *data); - - CPUSplitKernelFunction(CPUDevice* device) : device(device), func(NULL) {} - ~CPUSplitKernelFunction() {} - - virtual bool enqueue(const KernelDimensions& dim, device_memory& kernel_globals, device_memory& data) - { - if(!func) { - return false; - } - - KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; - kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); - - for(int y = 0; y < dim.global_size[1]; y++) { - for(int x = 0; x < dim.global_size[0]; x++) { - kg->global_id = make_int2(x, y); - - func(kg, (KernelData*)data.device_pointer); - } - } - - return true; - } + public: + CPUDevice *device; + void (*func)(KernelGlobals *kg, KernelData *data); + + CPUSplitKernelFunction(CPUDevice *device) : device(device), func(NULL) + { + } + ~CPUSplitKernelFunction() + { + } + + virtual bool enqueue(const KernelDimensions &dim, + device_memory &kernel_globals, + device_memory &data) + { + if (!func) { + return false; + } + + KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for (int y = 0; y < dim.global_size[1]; y++) { + for (int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + func(kg, (KernelData *)data.device_pointer); + } + } + + return true; + } }; CPUSplitKernel::CPUSplitKernel(CPUDevice *device) : DeviceSplitKernel(device), device(device) { } -bool 
CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, +bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, int num_global_elements, - device_memory& kernel_globals, - device_memory& data, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flags, - device_memory& work_pool_wgs) + device_memory &kernel_globals, + device_memory &data, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flags, + device_memory &work_pool_wgs) { - KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; - kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); - - for(int y = 0; y < dim.global_size[1]; y++) { - for(int x = 0; x < dim.global_size[0]; x++) { - kg->global_id = make_int2(x, y); - - device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer, - (KernelData*)data.device_pointer, - (void*)split_data.device_pointer, - num_global_elements, - (char*)ray_state.device_pointer, - rtile.start_sample, - rtile.start_sample + rtile.num_samples, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - (int*)queue_index.device_pointer, - dim.global_size[0] * dim.global_size[1], - (char*)use_queues_flags.device_pointer, - (uint*)work_pool_wgs.device_pointer, - rtile.num_samples, - (float*)rtile.buffer); - } - } - - return true; + KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; + kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]); + + for (int y = 0; y < dim.global_size[1]; y++) { + for (int x = 0; x < dim.global_size[0]; x++) { + kg->global_id = make_int2(x, y); + + device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer, + (KernelData *)data.device_pointer, + (void *)split_data.device_pointer, + num_global_elements, + (char *)ray_state.device_pointer, + 
rtile.start_sample, + rtile.start_sample + rtile.num_samples, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + (int *)queue_index.device_pointer, + dim.global_size[0] * dim.global_size[1], + (char *)use_queues_flags.device_pointer, + (uint *)work_pool_wgs.device_pointer, + rtile.num_samples, + (float *)rtile.buffer); + } + } + + return true; } -SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures&) +SplitKernelFunction *CPUSplitKernel::get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &) { - CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); + CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device); - kernel->func = device->split_kernels[kernel_name](); - if(!kernel->func) { - delete kernel; - return NULL; - } + kernel->func = device->split_kernels[kernel_name](); + if (!kernel->func) { + delete kernel; + return NULL; + } - return kernel; + return kernel; } int2 CPUSplitKernel::split_kernel_local_size() { - return make_int2(1, 1); + return make_int2(1, 1); } -int2 CPUSplitKernel::split_kernel_global_size(device_memory& /*kg*/, device_memory& /*data*/, DeviceTask * /*task*/) { - return make_int2(1, 1); +int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/, + device_memory & /*data*/, + DeviceTask * /*task*/) +{ + return make_int2(1, 1); } -uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device_memory& /*data*/, size_t num_threads) { - KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer; +uint64_t CPUSplitKernel::state_buffer_size(device_memory &kernel_globals, + device_memory & /*data*/, + size_t num_threads) +{ + KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer; - return split_data_buffer_size(kg, num_threads); + return split_data_buffer_size(kg, num_threads); } -Device *device_cpu_create(DeviceInfo& info, Stats &stats, Profiler 
&profiler, bool background) +Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) { - return new CPUDevice(info, stats, profiler, background); + return new CPUDevice(info, stats, profiler, background); } -void device_cpu_info(vector<DeviceInfo>& devices) +void device_cpu_info(vector<DeviceInfo> &devices) { - DeviceInfo info; - - info.type = DEVICE_CPU; - info.description = system_cpu_brand_string(); - info.id = "CPU"; - info.num = 0; - info.has_volume_decoupled = true; - info.has_osl = true; - info.has_half_images = true; - info.has_profiling = true; - - devices.insert(devices.begin(), info); + DeviceInfo info; + + info.type = DEVICE_CPU; + info.description = system_cpu_brand_string(); + info.id = "CPU"; + info.num = 0; + info.has_volume_decoupled = true; + info.has_osl = true; + info.has_half_images = true; + info.has_profiling = true; + + devices.insert(devices.begin(), info); } string device_cpu_capabilities() { - string capabilities = ""; - capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; - capabilities += system_cpu_support_sse3() ? "SSE3 " : ""; - capabilities += system_cpu_support_sse41() ? "SSE41 " : ""; - capabilities += system_cpu_support_avx() ? "AVX " : ""; - capabilities += system_cpu_support_avx2() ? "AVX2" : ""; - if(capabilities[capabilities.size() - 1] == ' ') - capabilities.resize(capabilities.size() - 1); - return capabilities; + string capabilities = ""; + capabilities += system_cpu_support_sse2() ? "SSE2 " : ""; + capabilities += system_cpu_support_sse3() ? "SSE3 " : ""; + capabilities += system_cpu_support_sse41() ? "SSE41 " : ""; + capabilities += system_cpu_support_avx() ? "AVX " : ""; + capabilities += system_cpu_support_avx2() ? 
"AVX2" : ""; + if (capabilities[capabilities.size() - 1] == ' ') + capabilities.resize(capabilities.size() - 1); + return capabilities; } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 3aa6bce155e..68bc3bd4045 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -62,2144 +62,2242 @@ namespace { const char *cuewErrorString(CUresult result) { - /* We can only give error code here without major code duplication, that - * should be enough since dynamic loading is only being disabled by folks - * who knows what they're doing anyway. - * - * NOTE: Avoid call from several threads. - */ - static string error; - error = string_printf("%d", result); - return error.c_str(); + /* We can only give error code here without major code duplication, that + * should be enough since dynamic loading is only being disabled by folks + * who knows what they're doing anyway. + * + * NOTE: Avoid call from several threads. 
+ */ + static string error; + error = string_printf("%d", result); + return error.c_str(); } const char *cuewCompilerPath() { - return CYCLES_CUDA_NVCC_EXECUTABLE; + return CYCLES_CUDA_NVCC_EXECUTABLE; } int cuewCompilerVersion() { - return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); + return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10); } -} /* namespace */ -#endif /* WITH_CUDA_DYNLOAD */ +} /* namespace */ +#endif /* WITH_CUDA_DYNLOAD */ class CUDADevice; class CUDASplitKernel : public DeviceSplitKernel { - CUDADevice *device; -public: - explicit CUDASplitKernel(CUDADevice *device); - - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data_, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs); - - virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures&); - virtual int2 split_kernel_local_size(); - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task); + CUDADevice *device; + + public: + explicit CUDASplitKernel(CUDADevice *device); + + virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory &kernel_globals, + device_memory &kernel_data_, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs); + + virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &); + virtual int2 split_kernel_local_size(); + 
virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task); }; /* Utility to push/pop CUDA context. */ class CUDAContextScope { -public: - CUDAContextScope(CUDADevice *device); - ~CUDAContextScope(); + public: + CUDAContextScope(CUDADevice *device); + ~CUDAContextScope(); -private: - CUDADevice *device; + private: + CUDADevice *device; }; -class CUDADevice : public Device -{ -public: - DedicatedTaskPool task_pool; - CUdevice cuDevice; - CUcontext cuContext; - CUmodule cuModule, cuFilterModule; - size_t device_texture_headroom; - size_t device_working_headroom; - bool move_texture_to_host; - size_t map_host_used; - size_t map_host_limit; - int can_map_host; - int cuDevId; - int cuDevArchitecture; - bool first_error; - CUDASplitKernel *split_kernel; - - struct CUDAMem { - CUDAMem() - : texobject(0), array(0), map_host_pointer(0), free_map_host(false) {} - - CUtexObject texobject; - CUarray array; - void *map_host_pointer; - bool free_map_host; - }; - typedef map<device_memory*, CUDAMem> CUDAMemMap; - CUDAMemMap cuda_mem_map; - - struct PixelMem { - GLuint cuPBO; - CUgraphicsResource cuPBOresource; - GLuint cuTexId; - int w, h; - }; - map<device_ptr, PixelMem> pixel_mem_map; - - /* Bindless Textures */ - device_vector<TextureInfo> texture_info; - bool need_texture_info; - - CUdeviceptr cuda_device_ptr(device_ptr mem) - { - return (CUdeviceptr)mem; - } - - static bool have_precompiled_kernels() - { - string cubins_path = path_get("lib"); - return path_exists(cubins_path); - } - - virtual bool show_samples() const - { - /* The CUDADevice only processes one tile at a time, so showing samples is fine. 
*/ - return true; - } - - virtual BVHLayoutMask get_bvh_layout_mask() const { - return BVH_LAYOUT_BVH2; - } - -/*#ifdef NDEBUG +class CUDADevice : public Device { + public: + DedicatedTaskPool task_pool; + CUdevice cuDevice; + CUcontext cuContext; + CUmodule cuModule, cuFilterModule; + size_t device_texture_headroom; + size_t device_working_headroom; + bool move_texture_to_host; + size_t map_host_used; + size_t map_host_limit; + int can_map_host; + int cuDevId; + int cuDevArchitecture; + bool first_error; + CUDASplitKernel *split_kernel; + + struct CUDAMem { + CUDAMem() : texobject(0), array(0), map_host_pointer(0), free_map_host(false) + { + } + + CUtexObject texobject; + CUarray array; + void *map_host_pointer; + bool free_map_host; + }; + typedef map<device_memory *, CUDAMem> CUDAMemMap; + CUDAMemMap cuda_mem_map; + + struct PixelMem { + GLuint cuPBO; + CUgraphicsResource cuPBOresource; + GLuint cuTexId; + int w, h; + }; + map<device_ptr, PixelMem> pixel_mem_map; + + /* Bindless Textures */ + device_vector<TextureInfo> texture_info; + bool need_texture_info; + + CUdeviceptr cuda_device_ptr(device_ptr mem) + { + return (CUdeviceptr)mem; + } + + static bool have_precompiled_kernels() + { + string cubins_path = path_get("lib"); + return path_exists(cubins_path); + } + + virtual bool show_samples() const + { + /* The CUDADevice only processes one tile at a time, so showing samples is fine. 
*/ + return true; + } + + virtual BVHLayoutMask get_bvh_layout_mask() const + { + return BVH_LAYOUT_BVH2; + } + + /*#ifdef NDEBUG #define cuda_abort() #else #define cuda_abort() abort() #endif*/ - void cuda_error_documentation() - { - if(first_error) { - fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); - fprintf(stderr, "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n"); - first_error = false; - } - } + void cuda_error_documentation() + { + if (first_error) { + fprintf(stderr, + "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } + } #define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - \ - if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ - if(error_msg == "") \ - error_msg = message; \ - fprintf(stderr, "%s\n", message.c_str()); \ - /*cuda_abort();*/ \ - cuda_error_documentation(); \ - } \ - } (void) 0 - - bool cuda_error_(CUresult result, const string& stmt) - { - if(result == CUDA_SUCCESS) - return false; - - string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result)); - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - cuda_error_documentation(); - return true; - } + { \ + CUresult result = stmt; \ +\ + if (result != CUDA_SUCCESS) { \ + string message = string_printf( \ + "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \ + if (error_msg == "") \ + error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + /*cuda_abort();*/ \ + cuda_error_documentation(); \ + } \ + } \ + (void)0 + + bool cuda_error_(CUresult result, const string &stmt) + { + if (result == CUDA_SUCCESS) + return false; + + string message = string_printf("CUDA 
error at %s: %s", stmt.c_str(), cuewErrorString(result)); + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); + cuda_error_documentation(); + return true; + } #define cuda_error(stmt) cuda_error_(stmt, #stmt) - void cuda_error_message(const string& message) - { - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - cuda_error_documentation(); - } - - CUDADevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), - texture_info(this, "__texture_info", MEM_TEXTURE) - { - first_error = true; - background = background_; - - cuDevId = info.num; - cuDevice = 0; - cuContext = 0; - - cuModule = 0; - cuFilterModule = 0; - - split_kernel = NULL; - - need_texture_info = false; - - device_texture_headroom = 0; - device_working_headroom = 0; - move_texture_to_host = false; - map_host_limit = 0; - map_host_used = 0; - can_map_host = 0; - - /* Intialize CUDA. */ - if(cuda_error(cuInit(0))) - return; - - /* Setup device and context. */ - if(cuda_error(cuDeviceGet(&cuDevice, cuDevId))) - return; - - /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. - * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, - * so we can predict which memory to map to host. */ - cuda_assert(cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); - - unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; - if(can_map_host) { - ctx_flags |= CU_CTX_MAP_HOST; - init_host_memory(); - } - - /* Create context. 
*/ - CUresult result; - - if(background) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - } - else { - result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice); - - if(result != CUDA_SUCCESS) { - result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); - background = true; - } - } - - if(cuda_error_(result, "cuCtxCreate")) - return; - - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - cuDevArchitecture = major*100 + minor*10; - - /* Pop context set by cuCtxCreate. */ - cuCtxPopCurrent(NULL); - } - - ~CUDADevice() - { - task_pool.stop(); - - delete split_kernel; - - texture_info.free(); - - cuda_assert(cuCtxDestroy(cuContext)); - } - - bool support_device(const DeviceRequestedFeatures& /*requested_features*/) - { - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* We only support sm_30 and above */ - if(major < 3) { - cuda_error_message(string_printf("CUDA device supported only with compute capability 3.0 or up, found %d.%d.", major, minor)); - return false; - } - - return true; - } - - bool use_adaptive_compilation() - { - return DebugFlags().cuda.adaptive_compile; - } - - bool use_split_kernel() - { - return DebugFlags().cuda.split_kernel; - } - - /* Common NVCC flags which stays the same regardless of shading model, - * kernel sources md5 and only depends on compiler or compilation settings. 
- */ - string compile_kernel_get_common_cflags( - const DeviceRequestedFeatures& requested_features, - bool filter=false, bool split=false) - { - const int machine = system_cpu_bits(); - const string source_path = path_get("source"); - const string include_path = source_path; - string cflags = string_printf("-m%d " - "--ptxas-options=\"-v\" " - "--use_fast_math " - "-DNVCC " - "-I\"%s\"", - machine, - include_path.c_str()); - if(!filter && use_adaptive_compilation()) { - cflags += " " + requested_features.get_build_options(); - } - const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); - if(extra_cflags) { - cflags += string(" ") + string(extra_cflags); - } + void cuda_error_message(const string &message) + { + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); + cuda_error_documentation(); + } + + CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) + : Device(info, stats, profiler, background_), + texture_info(this, "__texture_info", MEM_TEXTURE) + { + first_error = true; + background = background_; + + cuDevId = info.num; + cuDevice = 0; + cuContext = 0; + + cuModule = 0; + cuFilterModule = 0; + + split_kernel = NULL; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + map_host_used = 0; + can_map_host = 0; + + /* Intialize CUDA. */ + if (cuda_error(cuInit(0))) + return; + + /* Setup device and context. */ + if (cuda_error(cuDeviceGet(&cuDevice, cuDevId))) + return; + + /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. + * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, + * so we can predict which memory to map to host. 
*/ + cuda_assert( + cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + + unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; + if (can_map_host) { + ctx_flags |= CU_CTX_MAP_HOST; + init_host_memory(); + } + + /* Create context. */ + CUresult result; + + if (background) { + result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); + } + else { + result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice); + + if (result != CUDA_SUCCESS) { + result = cuCtxCreate(&cuContext, ctx_flags, cuDevice); + background = true; + } + } + + if (cuda_error_(result, "cuCtxCreate")) + return; + + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + cuDevArchitecture = major * 100 + minor * 10; + + /* Pop context set by cuCtxCreate. */ + cuCtxPopCurrent(NULL); + } + + ~CUDADevice() + { + task_pool.stop(); + + delete split_kernel; + + texture_info.free(); + + cuda_assert(cuCtxDestroy(cuContext)); + } + + bool support_device(const DeviceRequestedFeatures & /*requested_features*/) + { + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* We only support sm_30 and above */ + if (major < 3) { + cuda_error_message(string_printf( + "CUDA device supported only with compute capability 3.0 or up, found %d.%d.", + major, + minor)); + return false; + } + + return true; + } + + bool use_adaptive_compilation() + { + return DebugFlags().cuda.adaptive_compile; + } + + bool use_split_kernel() + { + return DebugFlags().cuda.split_kernel; + } + + /* Common NVCC flags which stays the same regardless of shading model, + * kernel sources md5 and only depends on compiler or compilation settings. 
+ */ + string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features, + bool filter = false, + bool split = false) + { + const int machine = system_cpu_bits(); + const string source_path = path_get("source"); + const string include_path = source_path; + string cflags = string_printf( + "-m%d " + "--ptxas-options=\"-v\" " + "--use_fast_math " + "-DNVCC " + "-I\"%s\"", + machine, + include_path.c_str()); + if (!filter && use_adaptive_compilation()) { + cflags += " " + requested_features.get_build_options(); + } + const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS"); + if (extra_cflags) { + cflags += string(" ") + string(extra_cflags); + } #ifdef WITH_CYCLES_DEBUG - cflags += " -D__KERNEL_DEBUG__"; + cflags += " -D__KERNEL_DEBUG__"; #endif - if(split) { - cflags += " -D__SPLIT__"; - } - - return cflags; - } - - bool compile_check_compiler() { - const char *nvcc = cuewCompilerPath(); - if(nvcc == NULL) { - cuda_error_message("CUDA nvcc compiler not found. 
" - "Install CUDA toolkit in default location."); - return false; - } - const int cuda_version = cuewCompilerVersion(); - VLOG(1) << "Found nvcc " << nvcc - << ", CUDA version " << cuda_version - << "."; - const int major = cuda_version / 10, minor = cuda_version % 10; - if(cuda_version == 0) { - cuda_error_message("CUDA nvcc compiler version could not be parsed."); - return false; - } - if(cuda_version < 80) { - printf("Unsupported CUDA version %d.%d detected, " - "you need CUDA 8.0 or newer.\n", - major, minor); - return false; - } - else if(cuda_version != 101) { - printf("CUDA version %d.%d detected, build may succeed but only " - "CUDA 10.1 is officially supported.\n", - major, minor); - } - return true; - } - - string compile_kernel(const DeviceRequestedFeatures& requested_features, - bool filter=false, bool split=false) - { - const char *name, *source; - if(filter) { - name = "filter"; - source = "filter.cu"; - } - else if(split) { - name = "kernel_split"; - source = "kernel_split.cu"; - } - else { - name = "kernel"; - source = "kernel.cu"; - } - /* Compute cubin name. */ - int major, minor; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); - cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); - - /* Attempt to use kernel provided with Blender. */ - if(!use_adaptive_compilation()) { - const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", - name, major, minor)); - VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; - if(path_exists(cubin)) { - VLOG(1) << "Using precompiled kernel."; - return cubin; - } - } - - const string common_cflags = - compile_kernel_get_common_cflags(requested_features, filter, split); - - /* Try to use locally compiled kernel. 
*/ - const string source_path = path_get("source"); - const string kernel_md5 = path_files_md5_hash(source_path); - - /* We include cflags into md5 so changing cuda toolkit or changing other - * compiler command line arguments makes sure cubin gets re-built. - */ - const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); - - const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin", - name, major, minor, - cubin_md5.c_str()); - const string cubin = path_cache_get(path_join("kernels", cubin_file)); - VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; - if(path_exists(cubin)) { - VLOG(1) << "Using locally compiled kernel."; - return cubin; - } + if (split) { + cflags += " -D__SPLIT__"; + } + + return cflags; + } + + bool compile_check_compiler() + { + const char *nvcc = cuewCompilerPath(); + if (nvcc == NULL) { + cuda_error_message( + "CUDA nvcc compiler not found. " + "Install CUDA toolkit in default location."); + return false; + } + const int cuda_version = cuewCompilerVersion(); + VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << cuda_version << "."; + const int major = cuda_version / 10, minor = cuda_version % 10; + if (cuda_version == 0) { + cuda_error_message("CUDA nvcc compiler version could not be parsed."); + return false; + } + if (cuda_version < 80) { + printf( + "Unsupported CUDA version %d.%d detected, " + "you need CUDA 8.0 or newer.\n", + major, + minor); + return false; + } + else if (cuda_version != 101) { + printf( + "CUDA version %d.%d detected, build may succeed but only " + "CUDA 10.1 is officially supported.\n", + major, + minor); + } + return true; + } + + string compile_kernel(const DeviceRequestedFeatures &requested_features, + bool filter = false, + bool split = false) + { + const char *name, *source; + if (filter) { + name = "filter"; + source = "filter.cu"; + } + else if (split) { + name = "kernel_split"; + source = "kernel_split.cu"; + } + else { + name = "kernel"; + source = "kernel.cu"; + } 
+ /* Compute cubin name. */ + int major, minor; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId); + cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId); + + /* Attempt to use kernel provided with Blender. */ + if (!use_adaptive_compilation()) { + const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using precompiled kernel."; + return cubin; + } + } + + const string common_cflags = compile_kernel_get_common_cflags( + requested_features, filter, split); + + /* Try to use locally compiled kernel. */ + const string source_path = path_get("source"); + const string kernel_md5 = path_files_md5_hash(source_path); + + /* We include cflags into md5 so changing cuda toolkit or changing other + * compiler command line arguments makes sure cubin gets re-built. + */ + const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags); + + const string cubin_file = string_printf( + "cycles_%s_sm%d%d_%s.cubin", name, major, minor, cubin_md5.c_str()); + const string cubin = path_cache_get(path_join("kernels", cubin_file)); + VLOG(1) << "Testing for locally compiled kernel " << cubin << "."; + if (path_exists(cubin)) { + VLOG(1) << "Using locally compiled kernel."; + return cubin; + } #ifdef _WIN32 - if(have_precompiled_kernels()) { - if(major < 3) { - cuda_error_message(string_printf( - "CUDA device requires compute capability 3.0 or up, " - "found %d.%d. Your GPU is not supported.", - major, minor)); - } - else { - cuda_error_message(string_printf( - "CUDA binary kernel for this graphics card compute " - "capability (%d.%d) not found.", - major, minor)); - } - return ""; - } + if (have_precompiled_kernels()) { + if (major < 3) { + cuda_error_message( + string_printf("CUDA device requires compute capability 3.0 or up, " + "found %d.%d. 
Your GPU is not supported.", + major, + minor)); + } + else { + cuda_error_message( + string_printf("CUDA binary kernel for this graphics card compute " + "capability (%d.%d) not found.", + major, + minor)); + } + return ""; + } #endif - /* Compile. */ - if(!compile_check_compiler()) { - return ""; - } - const char *nvcc = cuewCompilerPath(); - const string kernel = path_join( - path_join(source_path, "kernel"), - path_join("kernels", - path_join("cuda", source))); - double starttime = time_dt(); - printf("Compiling CUDA kernel ...\n"); - - path_create_directories(cubin); - - string command = string_printf("\"%s\" " - "-arch=sm_%d%d " - "--cubin \"%s\" " - "-o \"%s\" " - "%s ", - nvcc, - major, minor, - kernel.c_str(), - cubin.c_str(), - common_cflags.c_str()); - - printf("%s\n", command.c_str()); - - if(system(command.c_str()) == -1) { - cuda_error_message("Failed to execute compilation command, " - "see console for details."); - return ""; - } - - /* Verify if compilation succeeded */ - if(!path_exists(cubin)) { - cuda_error_message("CUDA kernel compilation failed, " - "see console for details."); - return ""; - } - - printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); - - return cubin; - } - - bool load_kernels(const DeviceRequestedFeatures& requested_features) - { - /* TODO(sergey): Support kernels re-load for CUDA devices. - * - * Currently re-loading kernel will invalidate memory pointers, - * causing problems in cuCtxSynchronize. 
- */ - if(cuFilterModule && cuModule) { - VLOG(1) << "Skipping kernel reload, not currently supported."; - return true; - } - - /* check if cuda init succeeded */ - if(cuContext == 0) - return false; - - /* check if GPU is supported */ - if(!support_device(requested_features)) - return false; - - /* get kernel */ - string cubin = compile_kernel(requested_features, false, use_split_kernel()); - if(cubin == "") - return false; - - string filter_cubin = compile_kernel(requested_features, true, false); - if(filter_cubin == "") - return false; - - /* open module */ - CUDAContextScope scope(this); - - string cubin_data; - CUresult result; - - if(path_read_text(cubin, cubin_data)) - result = cuModuleLoadData(&cuModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if(cuda_error_(result, "cuModuleLoad")) - cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); - - if(path_read_text(filter_cubin, cubin_data)) - result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); - else - result = CUDA_ERROR_FILE_NOT_FOUND; - - if(cuda_error_(result, "cuModuleLoad")) - cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); - - if(result == CUDA_SUCCESS) { - reserve_local_memory(requested_features); - } - - return (result == CUDA_SUCCESS); - } - - void reserve_local_memory(const DeviceRequestedFeatures& requested_features) - { - if(use_split_kernel()) { - /* Split kernel mostly uses global memory and adaptive compilation, - * difficult to predict how much is needed currently. */ - return; - } - - /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory - * needed for kernel launches, so that we can reliably figure out when - * to allocate scene data in mapped host memory. */ - CUDAContextScope scope(this); - - size_t total = 0, free_before = 0, free_after = 0; - cuMemGetInfo(&free_before, &total); - - /* Get kernel function. 
*/ - CUfunction cuPathTrace; - - if(requested_features.use_integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); - } - - cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); - - int min_blocks, num_threads_per_block; - cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); - - /* Launch kernel, using just 1 block appears sufficient to reserve - * memory for all multiprocessors. It would be good to do this in - * parallel for the multi GPU case still to make it faster. */ - CUdeviceptr d_work_tiles = 0; - uint total_work_size = 0; - - void *args[] = {&d_work_tiles, - &total_work_size}; - - cuda_assert(cuLaunchKernel(cuPathTrace, - 1, 1, 1, - num_threads_per_block, 1, 1, - 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - cuMemGetInfo(&free_after, &total); - VLOG(1) << "Local memory reserved " - << string_human_readable_number(free_before - free_after) << " bytes. (" - << string_human_readable_size(free_before - free_after) << ")"; + /* Compile. 
*/ + if (!compile_check_compiler()) { + return ""; + } + const char *nvcc = cuewCompilerPath(); + const string kernel = path_join(path_join(source_path, "kernel"), + path_join("kernels", path_join("cuda", source))); + double starttime = time_dt(); + printf("Compiling CUDA kernel ...\n"); + + path_create_directories(cubin); + + string command = string_printf( + "\"%s\" " + "-arch=sm_%d%d " + "--cubin \"%s\" " + "-o \"%s\" " + "%s ", + nvcc, + major, + minor, + kernel.c_str(), + cubin.c_str(), + common_cflags.c_str()); + + printf("%s\n", command.c_str()); + + if (system(command.c_str()) == -1) { + cuda_error_message( + "Failed to execute compilation command, " + "see console for details."); + return ""; + } + + /* Verify if compilation succeeded */ + if (!path_exists(cubin)) { + cuda_error_message( + "CUDA kernel compilation failed, " + "see console for details."); + return ""; + } + + printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); + + return cubin; + } + + bool load_kernels(const DeviceRequestedFeatures &requested_features) + { + /* TODO(sergey): Support kernels re-load for CUDA devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in cuCtxSynchronize. 
+ */ + if (cuFilterModule && cuModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + + /* check if cuda init succeeded */ + if (cuContext == 0) + return false; + + /* check if GPU is supported */ + if (!support_device(requested_features)) + return false; + + /* get kernel */ + string cubin = compile_kernel(requested_features, false, use_split_kernel()); + if (cubin == "") + return false; + + string filter_cubin = compile_kernel(requested_features, true, false); + if (filter_cubin == "") + return false; + + /* open module */ + CUDAContextScope scope(this); + + string cubin_data; + CUresult result; + + if (path_read_text(cubin, cubin_data)) + result = cuModuleLoadData(&cuModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if (cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); + + if (path_read_text(filter_cubin, cubin_data)) + result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str()); + else + result = CUDA_ERROR_FILE_NOT_FOUND; + + if (cuda_error_(result, "cuModuleLoad")) + cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str())); + + if (result == CUDA_SUCCESS) { + reserve_local_memory(requested_features); + } + + return (result == CUDA_SUCCESS); + } + + void reserve_local_memory(const DeviceRequestedFeatures &requested_features) + { + if (use_split_kernel()) { + /* Split kernel mostly uses global memory and adaptive compilation, + * difficult to predict how much is needed currently. */ + return; + } + + /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory + * needed for kernel launches, so that we can reliably figure out when + * to allocate scene data in mapped host memory. */ + CUDAContextScope scope(this); + + size_t total = 0, free_before = 0, free_after = 0; + cuMemGetInfo(&free_before, &total); + + /* Get kernel function. 
*/ + CUfunction cuPathTrace; + + if (requested_features.use_integrator_branched) { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); + } + else { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); + } + + cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); + + int min_blocks, num_threads_per_block; + cuda_assert(cuOccupancyMaxPotentialBlockSize( + &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); + + /* Launch kernel, using just 1 block appears sufficient to reserve + * memory for all multiprocessors. It would be good to do this in + * parallel for the multi GPU case still to make it faster. */ + CUdeviceptr d_work_tiles = 0; + uint total_work_size = 0; + + void *args[] = {&d_work_tiles, &total_work_size}; + + cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + cuda_assert(cuCtxSynchronize()); + + cuMemGetInfo(&free_after, &total); + VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) + << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; #if 0 - /* For testing mapped host memory, fill up device memory. */ - const size_t keep_mb = 1024; - - while(free_after > keep_mb * 1024 * 1024LL) { - CUdeviceptr tmp; - cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); - cuMemGetInfo(&free_after, &total); - } + /* For testing mapped host memory, fill up device memory. */ + const size_t keep_mb = 1024; + + while(free_after > keep_mb * 1024 * 1024LL) { + CUdeviceptr tmp; + cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL)); + cuMemGetInfo(&free_after, &total); + } #endif - } - - void init_host_memory() - { - /* Limit amount of host mapped memory, because allocating too much can - * cause system instability. Leave at least half or 4 GB of system - * memory free, whichever is smaller. 
*/ - size_t default_limit = 4 * 1024 * 1024 * 1024LL; - size_t system_ram = system_physical_ram(); - - if(system_ram > 0) { - if(system_ram / 2 > default_limit) { - map_host_limit = system_ram - default_limit; - } - else { - map_host_limit = system_ram / 2; - } - } - else { - VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; - map_host_limit = 0; - } - - /* Amount of device memory to keep is free after texture memory - * and working memory allocations respectively. We set the working - * memory limit headroom lower so that some space is left after all - * texture memory allocations. */ - device_working_headroom = 32 * 1024 * 1024LL; // 32MB - device_texture_headroom = 128 * 1024 * 1024LL; // 128MB - - VLOG(1) << "Mapped host memory limit set to " - << string_human_readable_number(map_host_limit) << " bytes. (" - << string_human_readable_size(map_host_limit) << ")"; - } - - void load_texture_info() - { - if(need_texture_info) { - texture_info.copy_to_device(); - need_texture_info = false; - } - } - - void move_textures_to_host(size_t size, bool for_texture) - { - /* Signal to reallocate textures in host memory only. */ - move_texture_to_host = true; - - while(size > 0) { - /* Find suitable memory allocation to move. */ - device_memory *max_mem = NULL; - size_t max_size = 0; - bool max_is_image = false; - - foreach(CUDAMemMap::value_type& pair, cuda_mem_map) { - device_memory& mem = *pair.first; - CUDAMem *cmem = &pair.second; - - bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - /* Can't move this type of memory. */ - if(!is_texture || cmem->array) { - continue; - } - - /* Already in host memory. */ - if(cmem->map_host_pointer) { - continue; - } - - /* For other textures, only move image textures. */ - if(for_texture && !is_image) { - continue; - } - - /* Try to move largest allocation, prefer moving images. 
*/ - if(is_image > max_is_image || - (is_image == max_is_image && mem.device_size > max_size)) { - max_is_image = is_image; - max_size = mem.device_size; - max_mem = &mem; - } - } - - /* Move to host memory. This part is mutex protected since - * multiple CUDA devices could be moving the memory. The - * first one will do it, and the rest will adopt the pointer. */ - if(max_mem) { - VLOG(1) << "Move memory from device to host: " << max_mem->name; - - static thread_mutex move_mutex; - thread_scoped_lock lock(move_mutex); - - /* Preserve the original device pointer, in case of multi device - * we can't change it because the pointer mapping would break. */ - device_ptr prev_pointer = max_mem->device_pointer; - size_t prev_size = max_mem->device_size; - - tex_free(*max_mem); - tex_alloc(*max_mem); - size = (max_size >= size)? 0: size - max_size; - - max_mem->device_pointer = prev_pointer; - max_mem->device_size = prev_size; - } - else { - break; - } - } - - /* Update texture info array with new pointers. */ - load_texture_info(); - - move_texture_to_host = false; - } - - CUDAMem *generic_alloc(device_memory& mem, size_t pitch_padding = 0) - { - CUDAContextScope scope(this); - - CUdeviceptr device_pointer = 0; - size_t size = mem.memory_size() + pitch_padding; - - CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; - const char *status = ""; - - /* First try allocating in device memory, respecting headroom. We make - * an exception for texture info. It is small and frequently accessed, - * so treat it as working memory. - * - * If there is not enough room for working memory, we will try to move - * textures to host memory, assuming the performance impact would have - * been worse for working memory. */ - bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - size_t headroom = (is_texture)? 
device_texture_headroom: - device_working_headroom; - - size_t total = 0, free = 0; - cuMemGetInfo(&free, &total); - - /* Move textures to host memory if needed. */ - if(!move_texture_to_host && !is_image && (size + headroom) >= free) { - move_textures_to_host(size + headroom - free, is_texture); - cuMemGetInfo(&free, &total); - } - - /* Allocate in device memory. */ - if(!move_texture_to_host && (size + headroom) < free) { - mem_alloc_result = cuMemAlloc(&device_pointer, size); - if(mem_alloc_result == CUDA_SUCCESS) { - status = " in device memory"; - } - } - - /* Fall back to mapped host memory if needed and possible. */ - void *map_host_pointer = 0; - bool free_map_host = false; - - if(mem_alloc_result != CUDA_SUCCESS && can_map_host && - map_host_used + size < map_host_limit) { - if(mem.shared_pointer) { - /* Another device already allocated host memory. */ - mem_alloc_result = CUDA_SUCCESS; - map_host_pointer = mem.shared_pointer; - } - else { - /* Allocate host memory ourselves. */ - mem_alloc_result = cuMemHostAlloc(&map_host_pointer, size, - CU_MEMHOSTALLOC_DEVICEMAP | - CU_MEMHOSTALLOC_WRITECOMBINED); - mem.shared_pointer = map_host_pointer; - free_map_host = true; - } - - if(mem_alloc_result == CUDA_SUCCESS) { - cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0)); - map_host_used += size; - status = " in host memory"; - - /* Replace host pointer with our host allocation. Only works if - * CUDA memory layout is the same and has no pitch padding. Also - * does not work if we move textures to host during a render, - * since other devices might be using the memory. 
*/ - if(!move_texture_to_host && pitch_padding == 0 && - mem.host_pointer && mem.host_pointer != mem.shared_pointer) { - memcpy(mem.shared_pointer, mem.host_pointer, size); - mem.host_free(); - mem.host_pointer = mem.shared_pointer; - } - } - else { - status = " failed, out of host memory"; - } - } - else if(mem_alloc_result != CUDA_SUCCESS) { - status = " failed, out of device and host memory"; - } - - if(mem_alloc_result != CUDA_SUCCESS) { - cuda_assert(mem_alloc_result); - } - - if(mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")" - << status; - } - - mem.device_pointer = (device_ptr)device_pointer; - mem.device_size = size; - stats.mem_alloc(size); - - if(!mem.device_pointer) { - return NULL; - } - - /* Insert into map of allocations. */ - CUDAMem *cmem = &cuda_mem_map[&mem]; - cmem->map_host_pointer = map_host_pointer; - cmem->free_map_host = free_map_host; - return cmem; - } - - void generic_copy_to(device_memory& mem) - { - if(mem.host_pointer && mem.device_pointer) { - CUDAContextScope scope(this); - - if(mem.host_pointer != mem.shared_pointer) { - cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), - mem.host_pointer, - mem.memory_size())); - } - } - } - - void generic_free(device_memory& mem) - { - if(mem.device_pointer) { - CUDAContextScope scope(this); - const CUDAMem& cmem = cuda_mem_map[&mem]; - - if(cmem.map_host_pointer) { - /* Free host memory. */ - if(cmem.free_map_host) { - cuMemFreeHost(cmem.map_host_pointer); - if(mem.host_pointer == mem.shared_pointer) { - mem.host_pointer = 0; - } - mem.shared_pointer = 0; - } - - map_host_used -= mem.device_size; - } - else { - /* Free device memory. 
*/ - cuMemFree(mem.device_pointer); - } - - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - } - - void mem_alloc(device_memory& mem) - { - if(mem.type == MEM_PIXELS && !background) { - pixels_alloc(mem); - } - else if(mem.type == MEM_TEXTURE) { - assert(!"mem_alloc not supported for textures."); - } - else { - generic_alloc(mem); - } - } - - void mem_copy_to(device_memory& mem) - { - if(mem.type == MEM_PIXELS) { - assert(!"mem_copy_to not supported for pixels."); - } - else if(mem.type == MEM_TEXTURE) { - tex_free(mem); - tex_alloc(mem); - } - else { - if(!mem.device_pointer) { - generic_alloc(mem); - } - - generic_copy_to(mem); - } - } - - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) - { - if(mem.type == MEM_PIXELS && !background) { - pixels_copy_from(mem, y, w, h); - } - else if(mem.type == MEM_TEXTURE) { - assert(!"mem_copy_from not supported for textures."); - } - else { - CUDAContextScope scope(this); - size_t offset = elem*y*w; - size_t size = elem*w*h; - - if(mem.host_pointer && mem.device_pointer) { - cuda_assert(cuMemcpyDtoH((uchar*)mem.host_pointer + offset, - (CUdeviceptr)(mem.device_pointer + offset), size)); - } - else if(mem.host_pointer) { - memset((char*)mem.host_pointer + offset, 0, size); - } - } - } - - void mem_zero(device_memory& mem) - { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - if(mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } - - if(mem.device_pointer && - (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) { - CUDAContextScope scope(this); - cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size())); - } - } - - void mem_free(device_memory& mem) - { - if(mem.type == MEM_PIXELS && !background) { - pixels_free(mem); - } - else if(mem.type == MEM_TEXTURE) { - tex_free(mem); - } - else { - generic_free(mem); - } - } - - virtual device_ptr 
mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/) - { - return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset)); - } - - void const_copy_to(const char *name, void *host, size_t size) - { - CUDAContextScope scope(this); - CUdeviceptr mem; - size_t bytes; - - cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); - //assert(bytes == size); - cuda_assert(cuMemcpyHtoD(mem, host, size)); - } - - void tex_alloc(device_memory& mem) - { - CUDAContextScope scope(this); - - /* General variables for both architectures */ - string bind_name = mem.name; - size_t dsize = datatype_size(mem.data_type); - size_t size = mem.memory_size(); - - CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; - switch(mem.extension) { - case EXTENSION_REPEAT: - address_mode = CU_TR_ADDRESS_MODE_WRAP; - break; - case EXTENSION_EXTEND: - address_mode = CU_TR_ADDRESS_MODE_CLAMP; - break; - case EXTENSION_CLIP: - address_mode = CU_TR_ADDRESS_MODE_BORDER; - break; - default: - assert(0); - break; - } - - CUfilter_mode filter_mode; - if(mem.interpolation == INTERPOLATION_CLOSEST) { - filter_mode = CU_TR_FILTER_MODE_POINT; - } - else { - filter_mode = CU_TR_FILTER_MODE_LINEAR; - } - - /* Data Storage */ - if(mem.interpolation == INTERPOLATION_NONE) { - generic_alloc(mem); - generic_copy_to(mem); - - CUdeviceptr cumem; - size_t cubytes; - - cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); - - if(cubytes == 8) { - /* 64 bit device pointer */ - uint64_t ptr = mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); - } - else { - /* 32 bit device pointer */ - uint32_t ptr = (uint32_t)mem.device_pointer; - cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes)); - } - return; - } - - /* Image Texture Storage */ - CUarray_format_enum format; - switch(mem.data_type) { - case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; - case TYPE_UINT16: format = CU_AD_FORMAT_UNSIGNED_INT16; break; - case 
TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; - case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; - case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; - case TYPE_HALF: format = CU_AD_FORMAT_HALF; break; - default: assert(0); return; - } - - CUDAMem *cmem = NULL; - CUarray array_3d = NULL; - size_t src_pitch = mem.data_width * dsize * mem.data_elements; - size_t dst_pitch = src_pitch; - - if(mem.data_depth > 1) { - /* 3D texture using array, there is no API for linear memory. */ - CUDA_ARRAY3D_DESCRIPTOR desc; - - desc.Width = mem.data_width; - desc.Height = mem.data_height; - desc.Depth = mem.data_depth; - desc.Format = format; - desc.NumChannels = mem.data_elements; - desc.Flags = 0; - - VLOG(1) << "Array 3D allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - cuda_assert(cuArray3DCreate(&array_3d, &desc)); - - if(!array_3d) { - return; - } - - CUDA_MEMCPY3D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_ARRAY; - param.dstArray = array_3d; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - param.Depth = mem.data_depth; - - cuda_assert(cuMemcpy3D(¶m)); - - mem.device_pointer = (device_ptr)array_3d; - mem.device_size = size; - stats.mem_alloc(size); - - cmem = &cuda_mem_map[&mem]; - cmem->texobject = 0; - cmem->array = array_3d; - } - else if(mem.data_height > 0) { - /* 2D texture, using pitch aligned linear memory. 
*/ - int alignment = 0; - cuda_assert(cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); - dst_pitch = align_up(src_pitch, alignment); - size_t dst_size = dst_pitch * mem.data_height; - - cmem = generic_alloc(mem, dst_size - mem.memory_size()); - if(!cmem) { - return; - } - - CUDA_MEMCPY2D param; - memset(¶m, 0, sizeof(param)); - param.dstMemoryType = CU_MEMORYTYPE_DEVICE; - param.dstDevice = mem.device_pointer; - param.dstPitch = dst_pitch; - param.srcMemoryType = CU_MEMORYTYPE_HOST; - param.srcHost = mem.host_pointer; - param.srcPitch = src_pitch; - param.WidthInBytes = param.srcPitch; - param.Height = mem.data_height; - - cuda_assert(cuMemcpy2DUnaligned(¶m)); - } - else { - /* 1D texture, using linear memory. */ - cmem = generic_alloc(mem); - if(!cmem) { - return; - } - - cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); - } - - /* Kepler+, bindless textures. */ - int flat_slot = 0; - if(string_startswith(mem.name, "__tex_image")) { - int pos = string(mem.name).rfind("_"); - flat_slot = atoi(mem.name + pos + 1); - } - else { - assert(0); - } - - CUDA_RESOURCE_DESC resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - - if(array_3d) { - resDesc.resType = CU_RESOURCE_TYPE_ARRAY; - resDesc.res.array.hArray = array_3d; - resDesc.flags = 0; - } - else if(mem.data_height > 0) { - resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; - resDesc.res.pitch2D.devPtr = mem.device_pointer; - resDesc.res.pitch2D.format = format; - resDesc.res.pitch2D.numChannels = mem.data_elements; - resDesc.res.pitch2D.height = mem.data_height; - resDesc.res.pitch2D.width = mem.data_width; - resDesc.res.pitch2D.pitchInBytes = dst_pitch; - } - else { - resDesc.resType = CU_RESOURCE_TYPE_LINEAR; - resDesc.res.linear.devPtr = mem.device_pointer; - resDesc.res.linear.format = format; - resDesc.res.linear.numChannels = mem.data_elements; - resDesc.res.linear.sizeInBytes = mem.device_size; - } - - CUDA_TEXTURE_DESC texDesc; - memset(&texDesc, 0, 
sizeof(texDesc)); - texDesc.addressMode[0] = address_mode; - texDesc.addressMode[1] = address_mode; - texDesc.addressMode[2] = address_mode; - texDesc.filterMode = filter_mode; - texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - - cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); - - /* Resize once */ - if(flat_slot >= texture_info.size()) { - /* Allocate some slots in advance, to reduce amount - * of re-allocations. */ - texture_info.resize(flat_slot + 128); - } - - /* Set Mapping and tag that we need to (re-)upload to device */ - TextureInfo& info = texture_info[flat_slot]; - info.data = (uint64_t)cmem->texobject; - info.cl_buffer = 0; - info.interpolation = mem.interpolation; - info.extension = mem.extension; - info.width = mem.data_width; - info.height = mem.data_height; - info.depth = mem.data_depth; - need_texture_info = true; - } - - void tex_free(device_memory& mem) - { - if(mem.device_pointer) { - CUDAContextScope scope(this); - const CUDAMem& cmem = cuda_mem_map[&mem]; - - if(cmem.texobject) { - /* Free bindless texture. */ - cuTexObjectDestroy(cmem.texobject); - } - - if(cmem.array) { - /* Free array. */ - cuArrayDestroy(cmem.array); - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; - - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } - else { - generic_free(mem); - } - } - } - -#define CUDA_GET_BLOCKSIZE(func, w, h) \ - int threads_per_block; \ - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int threads = (int)sqrt((float)threads_per_block); \ - int xblocks = ((w) + threads - 1)/threads; \ - int yblocks = ((h) + threads - 1)/threads; - -#define CUDA_LAUNCH_KERNEL(func, args) \ - cuda_assert(cuLaunchKernel(func, \ - xblocks, yblocks, 1, \ - threads, threads, 1, \ - 0, 0, args, 0)); + } + + void init_host_memory() + { + /* Limit amount of host mapped memory, because allocating too much can + * cause system instability. 
Leave at least half or 4 GB of system + * memory free, whichever is smaller. */ + size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + + if (system_ram > 0) { + if (system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + map_host_limit = 0; + } + + /* Amount of device memory to keep is free after texture memory + * and working memory allocations respectively. We set the working + * memory limit headroom lower so that some space is left after all + * texture memory allocations. */ + device_working_headroom = 32 * 1024 * 1024LL; // 32MB + device_texture_headroom = 128 * 1024 * 1024LL; // 128MB + + VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; + } + + void load_texture_info() + { + if (need_texture_info) { + texture_info.copy_to_device(); + need_texture_info = false; + } + } + + void move_textures_to_host(size_t size, bool for_texture) + { + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while (size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { + device_memory &mem = *pair.first; + CUDAMem *cmem = &pair.second; + + bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if (!is_texture || cmem->array) { + continue; + } + + /* Already in host memory. */ + if (cmem->map_host_pointer) { + continue; + } + + /* For other textures, only move image textures. 
*/ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + + /* Move to host memory. This part is mutex protected since + * multiple CUDA devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if (max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + /* Preserve the original device pointer, in case of multi device + * we can't change it because the pointer mapping would break. */ + device_ptr prev_pointer = max_mem->device_pointer; + size_t prev_size = max_mem->device_size; + + tex_free(*max_mem); + tex_alloc(*max_mem); + size = (max_size >= size) ? 0 : size - max_size; + + max_mem->device_pointer = prev_pointer; + max_mem->device_size = prev_size; + } + else { + break; + } + } + + /* Update texture info array with new pointers. */ + load_texture_info(); + + move_texture_to_host = false; + } + + CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0) + { + CUDAContextScope scope(this); + + CUdeviceptr device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. 
*/ + bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; + + size_t total = 0, free = 0; + cuMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free) { + move_textures_to_host(size + headroom - free, is_texture); + cuMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = cuMemAlloc(&device_pointer, size); + if (mem_alloc_result == CUDA_SUCCESS) { + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + void *map_host_pointer = 0; + bool free_map_host = false; + + if (mem_alloc_result != CUDA_SUCCESS && can_map_host && + map_host_used + size < map_host_limit) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = CUDA_SUCCESS; + map_host_pointer = mem.shared_pointer; + } + else { + /* Allocate host memory ourselves. */ + mem_alloc_result = cuMemHostAlloc( + &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); + mem.shared_pointer = map_host_pointer; + free_map_host = true; + } + + if (mem_alloc_result == CUDA_SUCCESS) { + cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0)); + map_host_used += size; + status = " in host memory"; + + /* Replace host pointer with our host allocation. Only works if + * CUDA memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. 
*/ + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != mem.shared_pointer) { + memcpy(mem.shared_pointer, mem.host_pointer, size); + mem.host_free(); + mem.host_pointer = mem.shared_pointer; + } + } + else { + status = " failed, out of host memory"; + } + } + else if (mem_alloc_result != CUDA_SUCCESS) { + status = " failed, out of device and host memory"; + } + + if (mem_alloc_result != CUDA_SUCCESS) { + cuda_assert(mem_alloc_result); + } + + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + CUDAMem *cmem = &cuda_mem_map[&mem]; + cmem->map_host_pointer = map_host_pointer; + cmem->free_map_host = free_map_host; + return cmem; + } + + void generic_copy_to(device_memory &mem) + { + if (mem.host_pointer && mem.device_pointer) { + CUDAContextScope scope(this); + + if (mem.host_pointer != mem.shared_pointer) { + cuda_assert(cuMemcpyHtoD( + cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size())); + } + } + } + + void generic_free(device_memory &mem) + { + if (mem.device_pointer) { + CUDAContextScope scope(this); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + if (cmem.map_host_pointer) { + /* Free host memory. */ + if (cmem.free_map_host) { + cuMemFreeHost(cmem.map_host_pointer); + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + mem.shared_pointer = 0; + } + + map_host_used -= mem.device_size; + } + else { + /* Free device memory. 
*/ + cuMemFree(mem.device_pointer); + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + } + + void mem_alloc(device_memory &mem) + { + if (mem.type == MEM_PIXELS && !background) { + pixels_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else { + generic_alloc(mem); + } + } + + void mem_copy_to(device_memory &mem) + { + if (mem.type == MEM_PIXELS) { + assert(!"mem_copy_to not supported for pixels."); + } + else if (mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + + generic_copy_to(mem); + } + } + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) + { + if (mem.type == MEM_PIXELS && !background) { + pixels_copy_from(mem, y, w, h); + } + else if (mem.type == MEM_TEXTURE) { + assert(!"mem_copy_from not supported for textures."); + } + else { + CUDAContextScope scope(this); + size_t offset = elem * y * w; + size_t size = elem * w * h; + + if (mem.host_pointer && mem.device_pointer) { + cuda_assert(cuMemcpyDtoH( + (uchar *)mem.host_pointer + offset, (CUdeviceptr)(mem.device_pointer + offset), size)); + } + else if (mem.host_pointer) { + memset((char *)mem.host_pointer + offset, 0, size); + } + } + } + + void mem_zero(device_memory &mem) + { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } + + if (mem.device_pointer && (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) { + CUDAContextScope scope(this); + cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size())); + } + } + + void mem_free(device_memory &mem) + { + if (mem.type == MEM_PIXELS && !background) { + pixels_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else { + generic_free(mem); + } + } + + virtual 
device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) + { + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); + } + + void const_copy_to(const char *name, void *host, size_t size) + { + CUDAContextScope scope(this); + CUdeviceptr mem; + size_t bytes; + + cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)); + //assert(bytes == size); + cuda_assert(cuMemcpyHtoD(mem, host, size)); + } + + void tex_alloc(device_memory &mem) + { + CUDAContextScope scope(this); + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP; + switch (mem.extension) { + case EXTENSION_REPEAT: + address_mode = CU_TR_ADDRESS_MODE_WRAP; + break; + case EXTENSION_EXTEND: + address_mode = CU_TR_ADDRESS_MODE_CLAMP; + break; + case EXTENSION_CLIP: + address_mode = CU_TR_ADDRESS_MODE_BORDER; + break; + default: + assert(0); + break; + } + + CUfilter_mode filter_mode; + if (mem.interpolation == INTERPOLATION_CLOSEST) { + filter_mode = CU_TR_FILTER_MODE_POINT; + } + else { + filter_mode = CU_TR_FILTER_MODE_LINEAR; + } + + /* Data Storage */ + if (mem.interpolation == INTERPOLATION_NONE) { + generic_alloc(mem); + generic_copy_to(mem); + + CUdeviceptr cumem; + size_t cubytes; + + cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str())); + + if (cubytes == 8) { + /* 64 bit device pointer */ + uint64_t ptr = mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes)); + } + else { + /* 32 bit device pointer */ + uint32_t ptr = (uint32_t)mem.device_pointer; + cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes)); + } + return; + } + + /* Image Texture Storage */ + CUarray_format_enum format; + switch (mem.data_type) { + case TYPE_UCHAR: + format = CU_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = CU_AD_FORMAT_UNSIGNED_INT16; 
+ break; + case TYPE_UINT: + format = CU_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = CU_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = CU_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = CU_AD_FORMAT_HALF; + break; + default: + assert(0); + return; + } + + CUDAMem *cmem = NULL; + CUarray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; + + if (mem.data_depth > 1) { + /* 3D texture using array, there is no API for linear memory. */ + CUDA_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + cuda_assert(cuArray3DCreate(&array_3d, &desc)); + + if (!array_3d) { + return; + } + + CUDA_MEMCPY3D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_ARRAY; + param.dstArray = array_3d; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + cuda_assert(cuMemcpy3D(¶m)); + + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); + + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. 
*/ + int alignment = 0; + cuda_assert( + cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); + dst_pitch = align_up(src_pitch, alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if (!cmem) { + return; + } + + CUDA_MEMCPY2D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = CU_MEMORYTYPE_DEVICE; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = CU_MEMORYTYPE_HOST; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + cuda_assert(cuMemcpy2DUnaligned(¶m)); + } + else { + /* 1D texture, using linear memory. */ + cmem = generic_alloc(mem); + if (!cmem) { + return; + } + + cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } + + /* Kepler+, bindless textures. */ + int flat_slot = 0; + if (string_startswith(mem.name, "__tex_image")) { + int pos = string(mem.name).rfind("_"); + flat_slot = atoi(mem.name + pos + 1); + } + else { + assert(0); + } + + CUDA_RESOURCE_DESC resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if (array_3d) { + resDesc.resType = CU_RESOURCE_TYPE_ARRAY; + resDesc.res.array.hArray = array_3d; + resDesc.flags = 0; + } + else if (mem.data_height > 0) { + resDesc.resType = CU_RESOURCE_TYPE_PITCH2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = CU_RESOURCE_TYPE_LINEAR; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } + + CUDA_TEXTURE_DESC texDesc; + memset(&texDesc, 
0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; + + cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + /* Resize once */ + if (flat_slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(flat_slot + 128); + } + + /* Set Mapping and tag that we need to (re-)upload to device */ + TextureInfo &info = texture_info[flat_slot]; + info.data = (uint64_t)cmem->texobject; + info.cl_buffer = 0; + info.interpolation = mem.interpolation; + info.extension = mem.extension; + info.width = mem.data_width; + info.height = mem.data_height; + info.depth = mem.data_depth; + need_texture_info = true; + } + + void tex_free(device_memory &mem) + { + if (mem.device_pointer) { + CUDAContextScope scope(this); + const CUDAMem &cmem = cuda_mem_map[&mem]; + + if (cmem.texobject) { + /* Free bindless texture. */ + cuTexObjectDestroy(cmem.texobject); + } + + if (cmem.array) { + /* Free array. */ + cuArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else { + generic_free(mem); + } + } + } + +#define CUDA_GET_BLOCKSIZE(func, w, h) \ + int threads_per_block; \ + cuda_assert( \ + cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int threads = (int)sqrt((float)threads_per_block); \ + int xblocks = ((w) + threads - 1) / threads; \ + int yblocks = ((h) + threads - 1) / threads; + +#define CUDA_LAUNCH_KERNEL(func, args) \ + cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); /* Similar as above, but for 1-dimensional blocks. 
*/ -#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ - int threads_per_block; \ - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ - int xblocks = ((w) + threads_per_block - 1)/threads_per_block; \ - int yblocks = h; - -#define CUDA_LAUNCH_KERNEL_1D(func, args) \ - cuda_assert(cuLaunchKernel(func, \ - xblocks, yblocks, 1, \ - threads_per_block, 1, 1, \ - 0, 0, args, 0)); - - bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, - DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0; - int frame_offset = 0; - - if(have_error()) - return false; - - CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); - CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; - CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts; - CUdeviceptr scale_ptr = 0; - - cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride)); - cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride)); - - { - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, 
"kernel_cuda_filter_nlm_update_output")); - - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); - - void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); - } - - { - CUfunction cuNLMNormalize; - cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); - cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); - void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; - CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); - CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); - cuda_assert(cuCtxSynchronize()); - } - - return !have_error(); - } - - bool denoising_construct_transform(DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterConstructTransform; - cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); - 
cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); - CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, - task->storage.w, - task->storage.h); - - void *args[] = {&task->buffer.mem.device_pointer, - &task->tile_info_mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->filter_area, - &task->rect, - &task->radius, - &task->pca_threshold, - &task->buffer.pass_stride, - &task->buffer.frame_stride, - &task->buffer.use_time}; - CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - int r = task->radius; - int f = 4; - float a = 1.0f; - float k_2 = task->nlm_k_2; - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - - if(have_error()) - return false; - - CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); - CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts; - - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - - 
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); - - CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, - task->reconstruction_state.source_w * task->reconstruction_state.source_h, - num_shifts); - - void *calc_difference_args[] = {&color_ptr, - &color_variance_ptr, - &scale_ptr, - &difference, - &w, &h, - &stride, &pass_stride, - &r, &pass_stride, - &frame_offset, - &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; - void *construct_gramian_args[] = {&t, - &blurDifference, - &task->buffer.mem.device_pointer, - &task->storage.transform.device_pointer, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->reconstruction_state.filter_window, - &w, &h, &stride, - &pass_stride, &r, - &f, - &frame_offset, - &task->buffer.use_time}; - - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_solve(device_ptr output_ptr, - DenoisingTask *task) - { - CUfunction cuFinalize; - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); - void *finalize_args[] = {&output_ptr, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - 
&task->storage.XtWY.device_pointer, - &task->filter_area, - &task->reconstruction_state.buffer_params.x, - &task->render_buffer.samples}; - CUDA_GET_BLOCKSIZE(cuFinalize, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, - device_ptr mean_ptr, device_ptr variance_ptr, - int r, int4 rect, DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterCombineHalves; - cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); - cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterCombineHalves, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - void *args[] = {&mean_ptr, - &variance_ptr, - &a_ptr, - &b_ptr, - &rect, - &r}; - CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, - device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDivideShadow; - cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterDivideShadow, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &a_ptr, - &b_ptr, - &sample_variance_ptr, - &sv_variance_ptr, - &buffer_variance_ptr, - &task->rect, - &task->render_buffer.pass_stride, - &task->render_buffer.offset}; - 
CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterGetFeature; - cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterGetFeature, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - void *args[] = {&task->render_buffer.samples, - &task->tile_info_mem.device_pointer, - &mean_offset, - &variance_offset, - &mean_ptr, - &variance_ptr, - &scale, - &task->rect, - &task->render_buffer.pass_stride, - &task->render_buffer.offset}; - CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_write_feature(int out_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task) - { - if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterWriteFeature; - cuda_assert(cuModuleGetFunction(&cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); - cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, - task->filter_area.z, - task->filter_area.w); - - void *args[] = {&task->render_buffer.samples, - &task->reconstruction_state.buffer_params, - &task->filter_area, - &from_ptr, - &buffer_ptr, - &out_offset, - &task->rect}; - CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task) - { - 
if(have_error()) - return false; - - CUDAContextScope scope(this); - - CUfunction cuFilterDetectOutliers; - cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); - cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); - CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - void *args[] = {&image_ptr, - &variance_ptr, - &depth_ptr, - &output_ptr, - &task->rect, - &task->buffer.pass_stride}; - - CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); - cuda_assert(cuCtxSynchronize()); - - return !have_error(); - } - - void denoise(RenderTile &rtile, DenoisingTask& denoising) - { - denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind(&CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - 
denoising.run_denoising(&rtile); - } - - void path_trace(DeviceTask& task, RenderTile& rtile, device_vector<WorkTile>& work_tiles) - { - scoped_timer timer(&rtile.buffers->render_time); - - if(have_error()) - return; - - CUDAContextScope scope(this); - CUfunction cuPathTrace; - - /* Get kernel function. */ - if(task.integrator_branched) { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); - } - - if(have_error()) { - return; - } - - cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); - - /* Allocate work tile. */ - work_tiles.alloc(1); - - WorkTile *wtile = work_tiles.data(); - wtile->x = rtile.x; - wtile->y = rtile.y; - wtile->w = rtile.w; - wtile->h = rtile.h; - wtile->offset = rtile.offset; - wtile->stride = rtile.stride; - wtile->buffer = (float*)cuda_device_ptr(rtile.buffer); - - /* Prepare work size. More step samples render faster, but for now we - * remain conservative for GPUs connected to a display to avoid driver - * timeouts and display freezing. */ - int min_blocks, num_threads_per_block; - cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); - if(!info.display_device) { - min_blocks *= 8; - } - - uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); - - /* Render all samples. */ - int start_sample = rtile.start_sample; - int end_sample = rtile.start_sample + rtile.num_samples; - - for(int sample = start_sample; sample < end_sample; sample += step_samples) { - /* Setup and copy work tile to device. 
*/ - wtile->start_sample = sample; - wtile->num_samples = min(step_samples, end_sample - sample); - work_tiles.copy_to_device(); - - CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); - uint total_work_size = wtile->w * wtile->h * wtile->num_samples; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - - /* Launch kernel. */ - void *args[] = {&d_work_tiles, - &total_work_size}; - - cuda_assert(cuLaunchKernel(cuPathTrace, - num_blocks, 1, 1, - num_threads_per_block, 1, 1, - 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - /* Update progress. */ - rtile.sample = sample + wtile->num_samples; - task.update_progress(&rtile, rtile.w*rtile.h*wtile->num_samples); - - if(task.get_cancel()) { - if(task.need_finish_queue == false) - break; - } - } - } - - void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) - { - if(have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuFilmConvert; - CUdeviceptr d_rgba = map_pixels((rgba_byte)? 
rgba_byte: rgba_half); - CUdeviceptr d_buffer = cuda_device_ptr(buffer); - - /* get kernel function */ - if(rgba_half) { - cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float")); - } - else { - cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte")); - } - - - float sample_scale = 1.0f/(task.sample + 1); - - /* pass in parameters */ - void *args[] = {&d_rgba, - &d_buffer, - &sample_scale, - &task.x, - &task.y, - &task.w, - &task.h, - &task.offset, - &task.stride}; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert)); - - int xthreads = (int)sqrt(threads_per_block); - int ythreads = (int)sqrt(threads_per_block); - int xblocks = (task.w + xthreads - 1)/xthreads; - int yblocks = (task.h + ythreads - 1)/ythreads; - - cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(cuFilmConvert, - xblocks , yblocks, 1, /* blocks */ - xthreads, ythreads, 1, /* threads */ - 0, 0, args, 0)); - - unmap_pixels((rgba_byte)? 
rgba_byte: rgba_half); - - cuda_assert(cuCtxSynchronize()); - } - - void shader(DeviceTask& task) - { - if(have_error()) - return; - - CUDAContextScope scope(this); - - CUfunction cuShader; - CUdeviceptr d_input = cuda_device_ptr(task.shader_input); - CUdeviceptr d_output = cuda_device_ptr(task.shader_output); - - /* get kernel function */ - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake")); - } - else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); - } - else { - cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); - } - - /* do tasks in smaller chunks, so we can cancel it */ - const int shader_chunk_size = 65536; - const int start = task.shader_x; - const int end = task.shader_x + task.shader_w; - int offset = task.offset; - - bool canceled = false; - for(int sample = 0; sample < task.num_samples && !canceled; sample++) { - for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) { - int shader_w = min(shader_chunk_size, end - shader_x); - - /* pass in parameters */ - void *args[8]; - int arg = 0; - args[arg++] = &d_input; - args[arg++] = &d_output; - args[arg++] = &task.shader_eval_type; - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - args[arg++] = &task.shader_filter; - } - args[arg++] = &shader_x; - args[arg++] = &shader_w; - args[arg++] = &offset; - args[arg++] = &sample; - - /* launch kernel */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); - - int xblocks = (shader_w + threads_per_block - 1)/threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuLaunchKernel(cuShader, - xblocks , 1, 1, /* blocks */ - threads_per_block, 1, 1, /* threads */ - 0, 0, args, 0)); - - cuda_assert(cuCtxSynchronize()); - - if(task.get_cancel()) 
{ - canceled = true; - break; - } - } - - task.update_progress(NULL); - } - } - - CUdeviceptr map_pixels(device_ptr mem) - { - if(!background) { - PixelMem pmem = pixel_mem_map[mem]; - CUdeviceptr buffer; - - size_t bytes; - cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0)); - cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource)); - - return buffer; - } - - return cuda_device_ptr(mem); - } - - void unmap_pixels(device_ptr mem) - { - if(!background) { - PixelMem pmem = pixel_mem_map[mem]; - - cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0)); - } - } - - void pixels_alloc(device_memory& mem) - { - PixelMem pmem; - - pmem.w = mem.data_width; - pmem.h = mem.data_height; - - CUDAContextScope scope(this); - - glGenBuffers(1, &pmem.cuPBO); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - if(mem.data_type == TYPE_HALF) - glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW); - else - glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - glActiveTexture(GL_TEXTURE0); - glGenTextures(1, &pmem.cuTexId); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if(mem.data_type == TYPE_HALF) - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); - else - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); - glBindTexture(GL_TEXTURE_2D, 0); - - CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); - - if(result == CUDA_SUCCESS) { - mem.device_pointer = pmem.cuTexId; - pixel_mem_map[mem.device_pointer] = pmem; - - mem.device_size = mem.memory_size(); - stats.mem_alloc(mem.device_size); - - return; - } - else { - /* failed 
to register buffer, fallback to no interop */ - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - background = true; - } - } - - void pixels_copy_from(device_memory& mem, int y, int w, int h) - { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); - size_t offset = sizeof(uchar)*4*y*w; - memcpy((uchar*)mem.host_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h); - glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - } - - void pixels_free(device_memory& mem) - { - if(mem.device_pointer) { - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - - CUDAContextScope scope(this); - - cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); - glDeleteBuffers(1, &pmem.cuPBO); - glDeleteTextures(1, &pmem.cuTexId); - - pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } - - void draw_pixels( - device_memory& mem, int y, - int w, int h, int width, int height, - int dx, int dy, int dw, int dh, bool transparent, - const DeviceDrawParams &draw_params) - { - assert(mem.type == MEM_PIXELS); - - if(!background) { - const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); - PixelMem pmem = pixel_mem_map[mem.device_pointer]; - float *vpointer; - - CUDAContextScope scope(this); - - /* for multi devices, this assumes the inefficient method that we allocate - * all pixels on the device even though we only render to a subset */ - size_t offset = 4*y*w; - - if(mem.data_type == TYPE_HALF) - offset *= sizeof(GLhalf); - else - offset *= sizeof(uint8_t); - - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); - glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); - if(mem.data_type == TYPE_HALF) { - 
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void*)offset); - } - else { - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset); - } - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); - - if(transparent) { - glEnable(GL_BLEND); - glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - } - - GLint shader_program; - if(use_fallback_shader) { - if(!bind_fallback_display_space_shader(dw, dh)) { - return; - } - shader_program = fallback_shader_program; - } - else { - draw_params.bind_display_space_shader_cb(); - glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); - } - - if(!vertex_buffer) { - glGenBuffers(1, &vertex_buffer); - } - - glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); - /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */ - glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); - - vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); - - if(vpointer) { - /* texture coordinate - vertex pair */ - vpointer[0] = 0.0f; - vpointer[1] = 0.0f; - vpointer[2] = dx; - vpointer[3] = dy; - - vpointer[4] = (float)w/(float)pmem.w; - vpointer[5] = 0.0f; - vpointer[6] = (float)width + dx; - vpointer[7] = dy; - - vpointer[8] = (float)w/(float)pmem.w; - vpointer[9] = (float)h/(float)pmem.h; - vpointer[10] = (float)width + dx; - vpointer[11] = (float)height + dy; - - vpointer[12] = 0.0f; - vpointer[13] = (float)h/(float)pmem.h; - vpointer[14] = dx; - vpointer[15] = (float)height + dy; - - glUnmapBuffer(GL_ARRAY_BUFFER); - } - - GLuint vertex_array_object; - GLuint position_attribute, texcoord_attribute; - - glGenVertexArrays(1, &vertex_array_object); - glBindVertexArray(vertex_array_object); - - texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); - position_attribute = glGetAttribLocation(shader_program, "pos"); - - glEnableVertexAttribArray(texcoord_attribute); - glEnableVertexAttribArray(position_attribute); - - 
glVertexAttribPointer(texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); - glVertexAttribPointer(position_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)(sizeof(float) * 2)); - - glDrawArrays(GL_TRIANGLE_FAN, 0, 4); - - if(use_fallback_shader) { - glUseProgram(0); - } - else { - draw_params.unbind_display_space_shader_cb(); - } - - if(transparent) { - glDisable(GL_BLEND); - } - - glBindTexture(GL_TEXTURE_2D, 0); - - return; - } - - Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); - } - - void thread_run(DeviceTask *task) - { - CUDAContextScope scope(this); - - if(task->type == DeviceTask::RENDER) { - DeviceRequestedFeatures requested_features; - if(use_split_kernel()) { - if(split_kernel == NULL) { - split_kernel = new CUDASplitKernel(this); - split_kernel->load_kernels(requested_features); - } - } - - device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - - /* keep rendering tiles until done */ - RenderTile tile; - DenoisingTask denoising(this, *task); - - while(task->acquire_tile(this, tile)) { - if(tile.task == RenderTile::PATH_TRACE) { - if(use_split_kernel()) { - device_only_memory<uchar> void_buffer(this, "void_buffer"); - split_kernel->path_trace(task, tile, void_buffer, void_buffer); - } - else { - path_trace(*task, tile, work_tiles); - } - } - else if(tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - - denoise(tile, denoising); - - task->update_progress(&tile, tile.w*tile.h); - } - - task->release_tile(tile); - - if(task->get_cancel()) { - if(task->need_finish_queue == false) - break; - } - } - - work_tiles.free(); - } - else if(task->type == DeviceTask::SHADER) { - shader(*task); - - cuda_assert(cuCtxSynchronize()); - } - } - - class CUDADeviceTask : public DeviceTask { - public: - CUDADeviceTask(CUDADevice *device, DeviceTask& task) - : DeviceTask(task) - { - run = 
function_bind(&CUDADevice::thread_run, device, this); - } - }; - - int get_split_task_count(DeviceTask& /*task*/) - { - return 1; - } - - void task_add(DeviceTask& task) - { - CUDAContextScope scope(this); - - /* Load texture info. */ - load_texture_info(); - - /* Synchronize all memory copies before executing task. */ - cuda_assert(cuCtxSynchronize()); - - if(task.type == DeviceTask::FILM_CONVERT) { - /* must be done in main thread due to opengl access */ - film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); - } - else { - task_pool.push(new CUDADeviceTask(this, task)); - } - } - - void task_wait() - { - task_pool.wait(); - } - - void task_cancel() - { - task_pool.cancel(); - } - - friend class CUDASplitKernelFunction; - friend class CUDASplitKernel; - friend class CUDAContextScope; +#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ + int threads_per_block; \ + cuda_assert( \ + cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \ + int yblocks = h; + +#define CUDA_LAUNCH_KERNEL_1D(func, args) \ + cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0)); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + int channel_offset = task->nlm_state.is_color ? 
task->buffer.pass_stride : 0; + int frame_offset = 0; + + if (have_error()) + return false; + + CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; + CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts; + CUdeviceptr scale_ptr = 0; + + cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride)); + + { + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; + cuda_assert(cuModuleGetFunction( + &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction( + &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction( + &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts); + + void *calc_difference_args[] = {&guide_ptr, + &variance_ptr, + &scale_ptr, + &difference, + &w, + &h, + &stride, + &pass_stride, + &r, + &channel_offset, + &frame_offset, + &a, + &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = { + &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *update_output_args[] = {&blurDifference, + &image_ptr, + &out_ptr, + &weightAccum, + &w, + &h, + &stride, + &pass_stride, + &channel_offset, + &r, + &f}; + + 
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); + } + + { + CUfunction cuNLMNormalize; + cuda_assert(cuModuleGetFunction( + &cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; + CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + cuda_assert(cuCtxSynchronize()); + } + + return !have_error(); + } + + bool denoising_construct_transform(DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterConstructTransform; + cuda_assert(cuModuleGetFunction( + &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform")); + cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED)); + CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h); + + void *args[] = {&task->buffer.mem.device_pointer, + &task->tile_info_mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->filter_area, + &task->rect, + &task->radius, + &task->pca_threshold, + &task->buffer.pass_stride, + &task->buffer.frame_stride, + &task->buffer.use_time}; + CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + int r = task->radius; + int f = 4; + float a = 1.0f; + float k_2 = task->nlm_k_2; + + int 
w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + + if (have_error()) + return false; + + CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer); + CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts; + + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; + cuda_assert(cuModuleGetFunction( + &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction( + &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction( + &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, + task->reconstruction_state.source_w * + task->reconstruction_state.source_h, + num_shifts); + + void *calc_difference_args[] = {&color_ptr, + &color_variance_ptr, + &scale_ptr, + &difference, + &w, + &h, + &stride, + &pass_stride, + &r, + &pass_stride, + &frame_offset, + &a, + &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f}; + void *calc_weight_args[] = { + &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f}; + void *construct_gramian_args[] = {&t, + &blurDifference, + 
&task->buffer.mem.device_pointer, + &task->storage.transform.device_pointer, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->reconstruction_state.filter_window, + &w, + &h, + &stride, + &pass_stride, + &r, + &f, + &frame_offset, + &task->buffer.use_time}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_solve(device_ptr output_ptr, DenoisingTask *task) + { + CUfunction cuFinalize; + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + void *finalize_args[] = {&output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_GET_BLOCKSIZE( + cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h); + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterCombineHalves; + cuda_assert(cuModuleGetFunction( + &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves")); + cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterCombineHalves, task->rect.z - task->rect.x, 
task->rect.w - task->rect.y); + + void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r}; + CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterDivideShadow; + cuda_assert(cuModuleGetFunction( + &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tile_info_mem.device_pointer, + &a_ptr, + &b_ptr, + &sample_variance_ptr, + &sv_variance_ptr, + &buffer_variance_ptr, + &task->rect, + &task->render_buffer.pass_stride, + &task->render_buffer.offset}; + CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterGetFeature; + cuda_assert(cuModuleGetFunction( + &cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&task->render_buffer.samples, + &task->tile_info_mem.device_pointer, + &mean_offset, + &variance_offset, + &mean_ptr, + &variance_ptr, + &scale, + &task->rect, + &task->render_buffer.pass_stride, + 
&task->render_buffer.offset}; + CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_write_feature(int out_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterWriteFeature; + cuda_assert(cuModuleGetFunction( + &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature")); + cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w); + + void *args[] = {&task->render_buffer.samples, + &task->reconstruction_state.buffer_params, + &task->filter_area, + &from_ptr, + &buffer_ptr, + &out_offset, + &task->rect}; + CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task) + { + if (have_error()) + return false; + + CUDAContextScope scope(this); + + CUfunction cuFilterDetectOutliers; + cuda_assert(cuModuleGetFunction( + &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers")); + cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1)); + CUDA_GET_BLOCKSIZE( + cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + void *args[] = {&image_ptr, + &variance_ptr, + &depth_ptr, + &output_ptr, + &task->rect, + &task->buffer.pass_stride}; + + CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args); + cuda_assert(cuCtxSynchronize()); + + return !have_error(); + } + + void denoise(RenderTile &rtile, DenoisingTask &denoising) + { + denoising.functions.construct_transform = function_bind( + &CUDADevice::denoising_construct_transform, this, &denoising); + denoising.functions.accumulate = 
function_bind( + &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising); + denoising.functions.divide_shadow = function_bind( + &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind( + &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind( + &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind( + &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind( + &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising); + denoising.functions.detect_outliers = function_bind( + &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + denoising.buffer.gpu_temporary_mem = true; + + denoising.run_denoising(&rtile); + } + + void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles) + { + scoped_timer timer(&rtile.buffers->render_time); + + if (have_error()) + return; + + CUDAContextScope scope(this); + CUfunction cuPathTrace; + + /* Get kernel function. */ + if (task.integrator_branched) { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace")); + } + else { + cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace")); + } + + if (have_error()) { + return; + } + + cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1)); + + /* Allocate work tile. 
*/ + work_tiles.alloc(1); + + WorkTile *wtile = work_tiles.data(); + wtile->x = rtile.x; + wtile->y = rtile.y; + wtile->w = rtile.w; + wtile->h = rtile.h; + wtile->offset = rtile.offset; + wtile->stride = rtile.stride; + wtile->buffer = (float *)cuda_device_ptr(rtile.buffer); + + /* Prepare work size. More step samples render faster, but for now we + * remain conservative for GPUs connected to a display to avoid driver + * timeouts and display freezing. */ + int min_blocks, num_threads_per_block; + cuda_assert(cuOccupancyMaxPotentialBlockSize( + &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0)); + if (!info.display_device) { + min_blocks *= 8; + } + + uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + + /* Render all samples. */ + int start_sample = rtile.start_sample; + int end_sample = rtile.start_sample + rtile.num_samples; + + for (int sample = start_sample; sample < end_sample; sample += step_samples) { + /* Setup and copy work tile to device. */ + wtile->start_sample = sample; + wtile->num_samples = min(step_samples, end_sample - sample); + work_tiles.copy_to_device(); + + CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer); + uint total_work_size = wtile->w * wtile->h * wtile->num_samples; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, &total_work_size}; + + cuda_assert(cuLaunchKernel( + cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + cuda_assert(cuCtxSynchronize()); + + /* Update progress. 
*/ + rtile.sample = sample + wtile->num_samples; + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + } + + void film_convert(DeviceTask &task, + device_ptr buffer, + device_ptr rgba_byte, + device_ptr rgba_half) + { + if (have_error()) + return; + + CUDAContextScope scope(this); + + CUfunction cuFilmConvert; + CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half); + CUdeviceptr d_buffer = cuda_device_ptr(buffer); + + /* get kernel function */ + if (rgba_half) { + cuda_assert( + cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float")); + } + else { + cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte")); + } + + float sample_scale = 1.0f / (task.sample + 1); + + /* pass in parameters */ + void *args[] = {&d_rgba, + &d_buffer, + &sample_scale, + &task.x, + &task.y, + &task.w, + &task.h, + &task.offset, + &task.stride}; + + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute( + &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert)); + + int xthreads = (int)sqrt(threads_per_block); + int ythreads = (int)sqrt(threads_per_block); + int xblocks = (task.w + xthreads - 1) / xthreads; + int yblocks = (task.h + ythreads - 1) / ythreads; + + cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1)); + + cuda_assert(cuLaunchKernel(cuFilmConvert, + xblocks, + yblocks, + 1, /* blocks */ + xthreads, + ythreads, + 1, /* threads */ + 0, + 0, + args, + 0)); + + unmap_pixels((rgba_byte) ? 
rgba_byte : rgba_half); + + cuda_assert(cuCtxSynchronize()); + } + + void shader(DeviceTask &task) + { + if (have_error()) + return; + + CUDAContextScope scope(this); + + CUfunction cuShader; + CUdeviceptr d_input = cuda_device_ptr(task.shader_input); + CUdeviceptr d_output = cuda_device_ptr(task.shader_output); + + /* get kernel function */ + if (task.shader_eval_type >= SHADER_EVAL_BAKE) { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake")); + } + else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace")); + } + else { + cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background")); + } + + /* do tasks in smaller chunks, so we can cancel it */ + const int shader_chunk_size = 65536; + const int start = task.shader_x; + const int end = task.shader_x + task.shader_w; + int offset = task.offset; + + bool canceled = false; + for (int sample = 0; sample < task.num_samples && !canceled; sample++) { + for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) { + int shader_w = min(shader_chunk_size, end - shader_x); + + /* pass in parameters */ + void *args[8]; + int arg = 0; + args[arg++] = &d_input; + args[arg++] = &d_output; + args[arg++] = &task.shader_eval_type; + if (task.shader_eval_type >= SHADER_EVAL_BAKE) { + args[arg++] = &task.shader_filter; + } + args[arg++] = &shader_x; + args[arg++] = &shader_w; + args[arg++] = &offset; + args[arg++] = &sample; + + /* launch kernel */ + int threads_per_block; + cuda_assert(cuFuncGetAttribute( + &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader)); + + int xblocks = (shader_w + threads_per_block - 1) / threads_per_block; + + cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuLaunchKernel(cuShader, + xblocks, + 1, + 1, /* blocks */ + threads_per_block, + 1, + 1, /* threads */ + 0, + 0, + args, + 0)); + + cuda_assert(cuCtxSynchronize()); 
+ + if (task.get_cancel()) { + canceled = true; + break; + } + } + + task.update_progress(NULL); + } + } + + CUdeviceptr map_pixels(device_ptr mem) + { + if (!background) { + PixelMem pmem = pixel_mem_map[mem]; + CUdeviceptr buffer; + + size_t bytes; + cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0)); + cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource)); + + return buffer; + } + + return cuda_device_ptr(mem); + } + + void unmap_pixels(device_ptr mem) + { + if (!background) { + PixelMem pmem = pixel_mem_map[mem]; + + cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0)); + } + } + + void pixels_alloc(device_memory &mem) + { + PixelMem pmem; + + pmem.w = mem.data_width; + pmem.h = mem.data_height; + + CUDAContextScope scope(this); + + glGenBuffers(1, &pmem.cuPBO); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); + if (mem.data_type == TYPE_HALF) + glBufferData( + GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW); + else + glBufferData( + GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + glActiveTexture(GL_TEXTURE0); + glGenTextures(1, &pmem.cuTexId); + glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); + if (mem.data_type == TYPE_HALF) + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL); + else + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_2D, 0); + + CUresult result = cuGraphicsGLRegisterBuffer( + &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); + + if (result == CUDA_SUCCESS) { + mem.device_pointer = pmem.cuTexId; + pixel_mem_map[mem.device_pointer] = pmem; + + mem.device_size = mem.memory_size(); + 
stats.mem_alloc(mem.device_size); + + return; + } + else { + /* failed to register buffer, fallback to no interop */ + glDeleteBuffers(1, &pmem.cuPBO); + glDeleteTextures(1, &pmem.cuTexId); + + background = true; + } + } + + void pixels_copy_from(device_memory &mem, int y, int w, int h) + { + PixelMem pmem = pixel_mem_map[mem.device_pointer]; + + CUDAContextScope scope(this); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); + uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY); + size_t offset = sizeof(uchar) * 4 * y * w; + memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h); + glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + } + + void pixels_free(device_memory &mem) + { + if (mem.device_pointer) { + PixelMem pmem = pixel_mem_map[mem.device_pointer]; + + CUDAContextScope scope(this); + + cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource)); + glDeleteBuffers(1, &pmem.cuPBO); + glDeleteTextures(1, &pmem.cuTexId); + + pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer)); + mem.device_pointer = 0; + + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } + + void draw_pixels(device_memory &mem, + int y, + int w, + int h, + int width, + int height, + int dx, + int dy, + int dw, + int dh, + bool transparent, + const DeviceDrawParams &draw_params) + { + assert(mem.type == MEM_PIXELS); + + if (!background) { + const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL); + PixelMem pmem = pixel_mem_map[mem.device_pointer]; + float *vpointer; + + CUDAContextScope scope(this); + + /* for multi devices, this assumes the inefficient method that we allocate + * all pixels on the device even though we only render to a subset */ + size_t offset = 4 * y * w; + + if (mem.data_type == TYPE_HALF) + offset *= sizeof(GLhalf); + else + offset *= sizeof(uint8_t); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO); + 
glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, pmem.cuTexId); + if (mem.data_type == TYPE_HALF) { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset); + } + else { + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset); + } + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + if (transparent) { + glEnable(GL_BLEND); + glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA); + } + + GLint shader_program; + if (use_fallback_shader) { + if (!bind_fallback_display_space_shader(dw, dh)) { + return; + } + shader_program = fallback_shader_program; + } + else { + draw_params.bind_display_space_shader_cb(); + glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program); + } + + if (!vertex_buffer) { + glGenBuffers(1, &vertex_buffer); + } + + glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer); + /* invalidate old contents - avoids stalling if buffer is still waiting in queue to be rendered */ + glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW); + + vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); + + if (vpointer) { + /* texture coordinate - vertex pair */ + vpointer[0] = 0.0f; + vpointer[1] = 0.0f; + vpointer[2] = dx; + vpointer[3] = dy; + + vpointer[4] = (float)w / (float)pmem.w; + vpointer[5] = 0.0f; + vpointer[6] = (float)width + dx; + vpointer[7] = dy; + + vpointer[8] = (float)w / (float)pmem.w; + vpointer[9] = (float)h / (float)pmem.h; + vpointer[10] = (float)width + dx; + vpointer[11] = (float)height + dy; + + vpointer[12] = 0.0f; + vpointer[13] = (float)h / (float)pmem.h; + vpointer[14] = dx; + vpointer[15] = (float)height + dy; + + glUnmapBuffer(GL_ARRAY_BUFFER); + } + + GLuint vertex_array_object; + GLuint position_attribute, texcoord_attribute; + + glGenVertexArrays(1, &vertex_array_object); + glBindVertexArray(vertex_array_object); + + texcoord_attribute = glGetAttribLocation(shader_program, "texCoord"); + position_attribute = glGetAttribLocation(shader_program, "pos"); + + 
glEnableVertexAttribArray(texcoord_attribute); + glEnableVertexAttribArray(position_attribute); + + glVertexAttribPointer( + texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0); + glVertexAttribPointer(position_attribute, + 2, + GL_FLOAT, + GL_FALSE, + 4 * sizeof(float), + (const GLvoid *)(sizeof(float) * 2)); + + glDrawArrays(GL_TRIANGLE_FAN, 0, 4); + + if (use_fallback_shader) { + glUseProgram(0); + } + else { + draw_params.unbind_display_space_shader_cb(); + } + + if (transparent) { + glDisable(GL_BLEND); + } + + glBindTexture(GL_TEXTURE_2D, 0); + + return; + } + + Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params); + } + + void thread_run(DeviceTask *task) + { + CUDAContextScope scope(this); + + if (task->type == DeviceTask::RENDER) { + DeviceRequestedFeatures requested_features; + if (use_split_kernel()) { + if (split_kernel == NULL) { + split_kernel = new CUDASplitKernel(this); + split_kernel->load_kernels(requested_features); + } + } + + device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); + + /* keep rendering tiles until done */ + RenderTile tile; + DenoisingTask denoising(this, *task); + + while (task->acquire_tile(this, tile)) { + if (tile.task == RenderTile::PATH_TRACE) { + if (use_split_kernel()) { + device_only_memory<uchar> void_buffer(this, "void_buffer"); + split_kernel->path_trace(task, tile, void_buffer, void_buffer); + } + else { + path_trace(*task, tile, work_tiles); + } + } + else if (tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + + denoise(tile, denoising); + + task->update_progress(&tile, tile.w * tile.h); + } + + task->release_tile(tile); + + if (task->get_cancel()) { + if (task->need_finish_queue == false) + break; + } + } + + work_tiles.free(); + } + else if (task->type == DeviceTask::SHADER) { + shader(*task); + + cuda_assert(cuCtxSynchronize()); + } + } + + class CUDADeviceTask : public DeviceTask { + 
public: + CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task) + { + run = function_bind(&CUDADevice::thread_run, device, this); + } + }; + + int get_split_task_count(DeviceTask & /*task*/) + { + return 1; + } + + void task_add(DeviceTask &task) + { + CUDAContextScope scope(this); + + /* Load texture info. */ + load_texture_info(); + + /* Synchronize all memory copies before executing task. */ + cuda_assert(cuCtxSynchronize()); + + if (task.type == DeviceTask::FILM_CONVERT) { + /* must be done in main thread due to opengl access */ + film_convert(task, task.buffer, task.rgba_byte, task.rgba_half); + } + else { + task_pool.push(new CUDADeviceTask(this, task)); + } + } + + void task_wait() + { + task_pool.wait(); + } + + void task_cancel() + { + task_pool.cancel(); + } + + friend class CUDASplitKernelFunction; + friend class CUDASplitKernel; + friend class CUDAContextScope; }; /* redefine the cuda_assert macro so it can be used outside of the CUDADevice class @@ -2207,496 +2305,501 @@ public: */ #undef cuda_assert #define cuda_assert(stmt) \ - { \ - CUresult result = stmt; \ - \ - if(result != CUDA_SUCCESS) { \ - string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ - if(device->error_msg == "") \ - device->error_msg = message; \ - fprintf(stderr, "%s\n", message.c_str()); \ - /*cuda_abort();*/ \ - device->cuda_error_documentation(); \ - } \ - } (void) 0 - + { \ + CUresult result = stmt; \ +\ + if (result != CUDA_SUCCESS) { \ + string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \ + if (device->error_msg == "") \ + device->error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + /*cuda_abort();*/ \ + device->cuda_error_documentation(); \ + } \ + } \ + (void)0 /* CUDA context scope. 
*/ -CUDAContextScope::CUDAContextScope(CUDADevice *device) -: device(device) +CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device) { - cuda_assert(cuCtxPushCurrent(device->cuContext)); + cuda_assert(cuCtxPushCurrent(device->cuContext)); } CUDAContextScope::~CUDAContextScope() { - cuda_assert(cuCtxPopCurrent(NULL)); + cuda_assert(cuCtxPopCurrent(NULL)); } /* split kernel */ -class CUDASplitKernelFunction : public SplitKernelFunction{ - CUDADevice* device; - CUfunction func; -public: - CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {} - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/) - { - return enqueue(dim, NULL); - } - - /* enqueue the kernel, returns false if there is an error */ - bool enqueue(const KernelDimensions &dim, void *args[]) - { - if(device->have_error()) - return false; - - CUDAContextScope scope(device); - - /* we ignore dim.local_size for now, as this is faster */ - int threads_per_block; - cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); - - int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block; - - cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); - - cuda_assert(cuLaunchKernel(func, - xblocks, 1, 1, /* blocks */ - threads_per_block, 1, 1, /* threads */ - 0, 0, args, 0)); - - return !device->have_error(); - } +class CUDASplitKernelFunction : public SplitKernelFunction { + CUDADevice *device; + CUfunction func; + + public: + CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) + { + } + + /* enqueue the kernel, returns false if there is an error */ + bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/) + { + return enqueue(dim, NULL); + } + + /* enqueue the kernel, returns false if there is an error 
*/ + bool enqueue(const KernelDimensions &dim, void *args[]) + { + if (device->have_error()) + return false; + + CUDAContextScope scope(device); + + /* we ignore dim.local_size for now, as this is faster */ + int threads_per_block; + cuda_assert( + cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); + + int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) / + threads_per_block; + + cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); + + cuda_assert(cuLaunchKernel(func, + xblocks, + 1, + 1, /* blocks */ + threads_per_block, + 1, + 1, /* threads */ + 0, + 0, + args, + 0)); + + return !device->have_error(); + } }; CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device) { } -uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads) +uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/, + device_memory & /*data*/, + size_t num_threads) { - CUDAContextScope scope(device); + CUDAContextScope scope(device); - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); + device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); + size_buffer.alloc(1); + size_buffer.zero_to_device(); - uint threads = num_threads; - CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); + uint threads = num_threads; + CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer); - struct args_t { - uint* num_threads; - CUdeviceptr* size; - }; + struct args_t { + uint *num_threads; + CUdeviceptr *size; + }; - args_t args = { - &threads, - &d_size - }; + args_t args = {&threads, &d_size}; - CUfunction state_buffer_size; - cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); + CUfunction state_buffer_size; + cuda_assert( + 
cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size")); - cuda_assert(cuLaunchKernel(state_buffer_size, - 1, 1, 1, - 1, 1, 1, - 0, 0, (void**)&args, 0)); + cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0)); - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); + size_buffer.copy_from_device(0, 1, 1); + size_t size = size_buffer[0]; + size_buffer.free(); - return size; + return size; } -bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& /*kernel_globals*/, - device_memory& /*kernel_data*/, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs) +bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory & /*kernel_globals*/, + device_memory & /*kernel_data*/, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs) { - CUDAContextScope scope(device); - - CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); - CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); - CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer); - CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer); - CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer); - - CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer); - - int end_sample = rtile.start_sample + rtile.num_samples; - int queue_size = dim.global_size[0] * dim.global_size[1]; - - struct args_t { - CUdeviceptr* split_data_buffer; - int* num_elements; - CUdeviceptr* ray_state; - int* start_sample; - int* 
end_sample; - int* sx; - int* sy; - int* sw; - int* sh; - int* offset; - int* stride; - CUdeviceptr* queue_index; - int* queuesize; - CUdeviceptr* use_queues_flag; - CUdeviceptr* work_pool_wgs; - int* num_samples; - CUdeviceptr* buffer; - }; - - args_t args = { - &d_split_data, - &num_global_elements, - &d_ray_state, - &rtile.start_sample, - &end_sample, - &rtile.x, - &rtile.y, - &rtile.w, - &rtile.h, - &rtile.offset, - &rtile.stride, - &d_queue_index, - &queue_size, - &d_use_queues_flag, - &d_work_pool_wgs, - &rtile.num_samples, - &d_buffer - }; - - CUfunction data_init; - cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); - if(device->have_error()) { - return false; - } - - CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args); - - return !device->have_error(); + CUDAContextScope scope(device); + + CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer); + CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer); + CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer); + CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer); + CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer); + + CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer); + + int end_sample = rtile.start_sample + rtile.num_samples; + int queue_size = dim.global_size[0] * dim.global_size[1]; + + struct args_t { + CUdeviceptr *split_data_buffer; + int *num_elements; + CUdeviceptr *ray_state; + int *start_sample; + int *end_sample; + int *sx; + int *sy; + int *sw; + int *sh; + int *offset; + int *stride; + CUdeviceptr *queue_index; + int *queuesize; + CUdeviceptr *use_queues_flag; + CUdeviceptr *work_pool_wgs; + int *num_samples; + CUdeviceptr *buffer; + }; + + args_t args = {&d_split_data, + &num_global_elements, + &d_ray_state, + &rtile.start_sample, + &end_sample, + &rtile.x, + &rtile.y, + 
&rtile.w, + &rtile.h, + &rtile.offset, + &rtile.stride, + &d_queue_index, + &queue_size, + &d_use_queues_flag, + &d_work_pool_wgs, + &rtile.num_samples, + &d_buffer}; + + CUfunction data_init; + cuda_assert( + cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init")); + if (device->have_error()) { + return false; + } + + CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args); + + return !device->have_error(); } -SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures&) +SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &) { - CUDAContextScope scope(device); - CUfunction func; - - cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); - if(device->have_error()) { - device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); - return NULL; - } - - return new CUDASplitKernelFunction(device, func); + CUDAContextScope scope(device); + CUfunction func; + + cuda_assert( + cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data())); + if (device->have_error()) { + device->cuda_error_message( + string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data())); + return NULL; + } + + return new CUDASplitKernelFunction(device, func); } int2 CUDASplitKernel::split_kernel_local_size() { - return make_int2(32, 1); + return make_int2(32, 1); } -int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/) +int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg, + device_memory &data, + DeviceTask * /*task*/) { - CUDAContextScope scope(device); - size_t free; - size_t total; + CUDAContextScope scope(device); + size_t free; + size_t total; - cuda_assert(cuMemGetInfo(&free, &total)); + 
cuda_assert(cuMemGetInfo(&free, &total)); - VLOG(1) << "Maximum device allocation size: " - << string_human_readable_number(free) << " bytes. (" - << string_human_readable_size(free) << ")."; + VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free) + << " bytes. (" << string_human_readable_size(free) << ")."; - size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); - size_t side = round_down((int)sqrt(num_elements), 32); - int2 global_size = make_int2(side, round_down(num_elements / side, 16)); - VLOG(1) << "Global size: " << global_size << "."; - return global_size; + size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2); + size_t side = round_down((int)sqrt(num_elements), 32); + int2 global_size = make_int2(side, round_down(num_elements / side, 16)); + VLOG(1) << "Global size: " << global_size << "."; + return global_size; } bool device_cuda_init() { #ifdef WITH_CUDA_DYNLOAD - static bool initialized = false; - static bool result = false; - - if(initialized) - return result; - - initialized = true; - int cuew_result = cuewInit(CUEW_INIT_CUDA); - if(cuew_result == CUEW_SUCCESS) { - VLOG(1) << "CUEW initialization succeeded"; - if(CUDADevice::have_precompiled_kernels()) { - VLOG(1) << "Found precompiled kernels"; - result = true; - } -#ifndef _WIN32 - else if(cuewCompilerPath() != NULL) { - VLOG(1) << "Found CUDA compiler " << cuewCompilerPath(); - result = true; - } - else { - VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found," - << " unable to use CUDA"; - } -#endif - } - else { - VLOG(1) << "CUEW initialization failed: " - << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED) - ? 
"Error setting up atexit() handler" - : "Error opening the library"); - } - - return result; + static bool initialized = false; + static bool result = false; + + if (initialized) + return result; + + initialized = true; + int cuew_result = cuewInit(CUEW_INIT_CUDA); + if (cuew_result == CUEW_SUCCESS) { + VLOG(1) << "CUEW initialization succeeded"; + if (CUDADevice::have_precompiled_kernels()) { + VLOG(1) << "Found precompiled kernels"; + result = true; + } +# ifndef _WIN32 + else if (cuewCompilerPath() != NULL) { + VLOG(1) << "Found CUDA compiler " << cuewCompilerPath(); + result = true; + } + else { + VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found," + << " unable to use CUDA"; + } +# endif + } + else { + VLOG(1) << "CUEW initialization failed: " + << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" : + "Error opening the library"); + } + + return result; #else /* WITH_CUDA_DYNLOAD */ - return true; -#endif /* WITH_CUDA_DYNLOAD */ + return true; +#endif /* WITH_CUDA_DYNLOAD */ } -Device *device_cuda_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background) +Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) { - return new CUDADevice(info, stats, profiler, background); + return new CUDADevice(info, stats, profiler, background); } static CUresult device_cuda_safe_init() { #ifdef _WIN32 - __try { - return cuInit(0); - } - __except(EXCEPTION_EXECUTE_HANDLER) { - /* Ignore crashes inside the CUDA driver and hope we can - * survive even with corrupted CUDA installs. */ - fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n"); - } - - return CUDA_ERROR_NO_DEVICE; + __try { + return cuInit(0); + } + __except (EXCEPTION_EXECUTE_HANDLER) { + /* Ignore crashes inside the CUDA driver and hope we can + * survive even with corrupted CUDA installs. 
*/ + fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n"); + } + + return CUDA_ERROR_NO_DEVICE; #else - return cuInit(0); + return cuInit(0); #endif } -void device_cuda_info(vector<DeviceInfo>& devices) +void device_cuda_info(vector<DeviceInfo> &devices) { - CUresult result = device_cuda_safe_init(); - if(result != CUDA_SUCCESS) { - if(result != CUDA_ERROR_NO_DEVICE) - fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result)); - return; - } - - int count = 0; - result = cuDeviceGetCount(&count); - if(result != CUDA_SUCCESS) { - fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result)); - return; - } - - vector<DeviceInfo> display_devices; - - for(int num = 0; num < count; num++) { - char name[256]; - - result = cuDeviceGetName(name, 256, num); - if(result != CUDA_SUCCESS) { - fprintf(stderr, "CUDA cuDeviceGetName: %s\n", cuewErrorString(result)); - continue; - } - - int major; - cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num); - if(major < 3) { - VLOG(1) << "Ignoring device \"" << name - << "\", this graphics card is no longer supported."; - continue; - } - - DeviceInfo info; - - info.type = DEVICE_CUDA; - info.description = string(name); - info.num = num; - - info.has_half_images = (major >= 3); - info.has_volume_decoupled = false; - - int pci_location[3] = {0, 0, 0}; - cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num); - cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num); - cuDeviceGetAttribute(&pci_location[2], CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, num); - info.id = string_printf("CUDA_%s_%04x:%02x:%02x", - name, - (unsigned int)pci_location[0], - (unsigned int)pci_location[1], - (unsigned int)pci_location[2]); - - /* If device has a kernel timeout and no compute preemption, we assume - * it is connected to a display and will freeze the display while doing - * computations. 
*/ - int timeout_attr = 0, preempt_attr = 0; - cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num); - cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num); - - if(timeout_attr && !preempt_attr) { - VLOG(1) << "Device is recognized as display."; - info.description += " (Display)"; - info.display_device = true; - display_devices.push_back(info); - } - else { - devices.push_back(info); - } - VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\"."; - } - - if(!display_devices.empty()) - devices.insert(devices.end(), display_devices.begin(), display_devices.end()); + CUresult result = device_cuda_safe_init(); + if (result != CUDA_SUCCESS) { + if (result != CUDA_ERROR_NO_DEVICE) + fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result)); + return; + } + + int count = 0; + result = cuDeviceGetCount(&count); + if (result != CUDA_SUCCESS) { + fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result)); + return; + } + + vector<DeviceInfo> display_devices; + + for (int num = 0; num < count; num++) { + char name[256]; + + result = cuDeviceGetName(name, 256, num); + if (result != CUDA_SUCCESS) { + fprintf(stderr, "CUDA cuDeviceGetName: %s\n", cuewErrorString(result)); + continue; + } + + int major; + cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num); + if (major < 3) { + VLOG(1) << "Ignoring device \"" << name << "\", this graphics card is no longer supported."; + continue; + } + + DeviceInfo info; + + info.type = DEVICE_CUDA; + info.description = string(name); + info.num = num; + + info.has_half_images = (major >= 3); + info.has_volume_decoupled = false; + + int pci_location[3] = {0, 0, 0}; + cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num); + cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num); + cuDeviceGetAttribute(&pci_location[2], CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, num); + info.id = 
string_printf("CUDA_%s_%04x:%02x:%02x", + name, + (unsigned int)pci_location[0], + (unsigned int)pci_location[1], + (unsigned int)pci_location[2]); + + /* If device has a kernel timeout and no compute preemption, we assume + * it is connected to a display and will freeze the display while doing + * computations. */ + int timeout_attr = 0, preempt_attr = 0; + cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num); + cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num); + + if (timeout_attr && !preempt_attr) { + VLOG(1) << "Device is recognized as display."; + info.description += " (Display)"; + info.display_device = true; + display_devices.push_back(info); + } + else { + devices.push_back(info); + } + VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\"."; + } + + if (!display_devices.empty()) + devices.insert(devices.end(), display_devices.begin(), display_devices.end()); } string device_cuda_capabilities() { - CUresult result = device_cuda_safe_init(); - if(result != CUDA_SUCCESS) { - if(result != CUDA_ERROR_NO_DEVICE) { - return string("Error initializing CUDA: ") + cuewErrorString(result); - } - return "No CUDA device found\n"; - } - - int count; - result = cuDeviceGetCount(&count); - if(result != CUDA_SUCCESS) { - return string("Error getting devices: ") + cuewErrorString(result); - } - - string capabilities = ""; - for(int num = 0; num < count; num++) { - char name[256]; - if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) { - continue; - } - capabilities += string("\t") + name + "\n"; - int value; + CUresult result = device_cuda_safe_init(); + if (result != CUDA_SUCCESS) { + if (result != CUDA_ERROR_NO_DEVICE) { + return string("Error initializing CUDA: ") + cuewErrorString(result); + } + return "No CUDA device found\n"; + } + + int count; + result = cuDeviceGetCount(&count); + if (result != CUDA_SUCCESS) { + return string("Error getting devices: ") + 
cuewErrorString(result); + } + + string capabilities = ""; + for (int num = 0; num < count; num++) { + char name[256]; + if (cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) { + continue; + } + capabilities += string("\t") + name + "\n"; + int value; #define GET_ATTR(attr) \ - { \ - if(cuDeviceGetAttribute(&value, \ - CU_DEVICE_ATTRIBUTE_##attr, \ - num) == CUDA_SUCCESS) \ - { \ - capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", \ - value); \ - } \ - } (void) 0 - /* TODO(sergey): Strip all attributes which are not useful for us - * or does not depend on the driver. - */ - GET_ATTR(MAX_THREADS_PER_BLOCK); - GET_ATTR(MAX_BLOCK_DIM_X); - GET_ATTR(MAX_BLOCK_DIM_Y); - GET_ATTR(MAX_BLOCK_DIM_Z); - GET_ATTR(MAX_GRID_DIM_X); - GET_ATTR(MAX_GRID_DIM_Y); - GET_ATTR(MAX_GRID_DIM_Z); - GET_ATTR(MAX_SHARED_MEMORY_PER_BLOCK); - GET_ATTR(SHARED_MEMORY_PER_BLOCK); - GET_ATTR(TOTAL_CONSTANT_MEMORY); - GET_ATTR(WARP_SIZE); - GET_ATTR(MAX_PITCH); - GET_ATTR(MAX_REGISTERS_PER_BLOCK); - GET_ATTR(REGISTERS_PER_BLOCK); - GET_ATTR(CLOCK_RATE); - GET_ATTR(TEXTURE_ALIGNMENT); - GET_ATTR(GPU_OVERLAP); - GET_ATTR(MULTIPROCESSOR_COUNT); - GET_ATTR(KERNEL_EXEC_TIMEOUT); - GET_ATTR(INTEGRATED); - GET_ATTR(CAN_MAP_HOST_MEMORY); - GET_ATTR(COMPUTE_MODE); - GET_ATTR(MAXIMUM_TEXTURE1D_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH); - GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES); - GET_ATTR(SURFACE_ALIGNMENT); - GET_ATTR(CONCURRENT_KERNELS); - GET_ATTR(ECC_ENABLED); - GET_ATTR(TCC_DRIVER); - GET_ATTR(MEMORY_CLOCK_RATE); - GET_ATTR(GLOBAL_MEMORY_BUS_WIDTH); - GET_ATTR(L2_CACHE_SIZE); - 
GET_ATTR(MAX_THREADS_PER_MULTIPROCESSOR); - GET_ATTR(ASYNC_ENGINE_COUNT); - GET_ATTR(UNIFIED_ADDRESSING); - GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_LAYERS); - GET_ATTR(CAN_TEX2D_GATHER); - GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE); - GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE); - GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE); - GET_ATTR(TEXTURE_PITCH_ALIGNMENT); - GET_ATTR(MAXIMUM_TEXTURECUBEMAP_WIDTH); - GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_SURFACE1D_WIDTH); - GET_ATTR(MAXIMUM_SURFACE2D_WIDTH); - GET_ATTR(MAXIMUM_SURFACE2D_HEIGHT); - GET_ATTR(MAXIMUM_SURFACE3D_WIDTH); - GET_ATTR(MAXIMUM_SURFACE3D_HEIGHT); - GET_ATTR(MAXIMUM_SURFACE3D_DEPTH); - GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_HEIGHT); - GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_SURFACECUBEMAP_WIDTH); - GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH); - GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS); - GET_ATTR(MAXIMUM_TEXTURE1D_LINEAR_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_HEIGHT); - GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_PITCH); - GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH); - GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT); - GET_ATTR(COMPUTE_CAPABILITY_MAJOR); - GET_ATTR(COMPUTE_CAPABILITY_MINOR); - GET_ATTR(MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH); - GET_ATTR(STREAM_PRIORITIES_SUPPORTED); - GET_ATTR(GLOBAL_L1_CACHE_SUPPORTED); - GET_ATTR(LOCAL_L1_CACHE_SUPPORTED); - GET_ATTR(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR); - GET_ATTR(MAX_REGISTERS_PER_MULTIPROCESSOR); - GET_ATTR(MANAGED_MEMORY); - GET_ATTR(MULTI_GPU_BOARD); - GET_ATTR(MULTI_GPU_BOARD_GROUP_ID); + { \ + if (cuDeviceGetAttribute(&value, 
CU_DEVICE_ATTRIBUTE_##attr, num) == CUDA_SUCCESS) { \ + capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", value); \ + } \ + } \ + (void)0 + /* TODO(sergey): Strip all attributes which are not useful for us + * or does not depend on the driver. + */ + GET_ATTR(MAX_THREADS_PER_BLOCK); + GET_ATTR(MAX_BLOCK_DIM_X); + GET_ATTR(MAX_BLOCK_DIM_Y); + GET_ATTR(MAX_BLOCK_DIM_Z); + GET_ATTR(MAX_GRID_DIM_X); + GET_ATTR(MAX_GRID_DIM_Y); + GET_ATTR(MAX_GRID_DIM_Z); + GET_ATTR(MAX_SHARED_MEMORY_PER_BLOCK); + GET_ATTR(SHARED_MEMORY_PER_BLOCK); + GET_ATTR(TOTAL_CONSTANT_MEMORY); + GET_ATTR(WARP_SIZE); + GET_ATTR(MAX_PITCH); + GET_ATTR(MAX_REGISTERS_PER_BLOCK); + GET_ATTR(REGISTERS_PER_BLOCK); + GET_ATTR(CLOCK_RATE); + GET_ATTR(TEXTURE_ALIGNMENT); + GET_ATTR(GPU_OVERLAP); + GET_ATTR(MULTIPROCESSOR_COUNT); + GET_ATTR(KERNEL_EXEC_TIMEOUT); + GET_ATTR(INTEGRATED); + GET_ATTR(CAN_MAP_HOST_MEMORY); + GET_ATTR(COMPUTE_MODE); + GET_ATTR(MAXIMUM_TEXTURE1D_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH); + GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES); + GET_ATTR(SURFACE_ALIGNMENT); + GET_ATTR(CONCURRENT_KERNELS); + GET_ATTR(ECC_ENABLED); + GET_ATTR(TCC_DRIVER); + GET_ATTR(MEMORY_CLOCK_RATE); + GET_ATTR(GLOBAL_MEMORY_BUS_WIDTH); + GET_ATTR(L2_CACHE_SIZE); + GET_ATTR(MAX_THREADS_PER_MULTIPROCESSOR); + GET_ATTR(ASYNC_ENGINE_COUNT); + GET_ATTR(UNIFIED_ADDRESSING); + GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_LAYERS); + GET_ATTR(CAN_TEX2D_GATHER); + GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_HEIGHT); + 
GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE); + GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE); + GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE); + GET_ATTR(TEXTURE_PITCH_ALIGNMENT); + GET_ATTR(MAXIMUM_TEXTURECUBEMAP_WIDTH); + GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_SURFACE1D_WIDTH); + GET_ATTR(MAXIMUM_SURFACE2D_WIDTH); + GET_ATTR(MAXIMUM_SURFACE2D_HEIGHT); + GET_ATTR(MAXIMUM_SURFACE3D_WIDTH); + GET_ATTR(MAXIMUM_SURFACE3D_HEIGHT); + GET_ATTR(MAXIMUM_SURFACE3D_DEPTH); + GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_HEIGHT); + GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_SURFACECUBEMAP_WIDTH); + GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH); + GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS); + GET_ATTR(MAXIMUM_TEXTURE1D_LINEAR_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_HEIGHT); + GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_PITCH); + GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH); + GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT); + GET_ATTR(COMPUTE_CAPABILITY_MAJOR); + GET_ATTR(COMPUTE_CAPABILITY_MINOR); + GET_ATTR(MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH); + GET_ATTR(STREAM_PRIORITIES_SUPPORTED); + GET_ATTR(GLOBAL_L1_CACHE_SUPPORTED); + GET_ATTR(LOCAL_L1_CACHE_SUPPORTED); + GET_ATTR(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR); + GET_ATTR(MAX_REGISTERS_PER_MULTIPROCESSOR); + GET_ATTR(MANAGED_MEMORY); + GET_ATTR(MULTI_GPU_BOARD); + GET_ATTR(MULTI_GPU_BOARD_GROUP_ID); #undef GET_ATTR - capabilities += "\n"; - } + capabilities += "\n"; + } - return capabilities; + return capabilities; } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 1bb144ef85a..05a7fb8ae4d 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ 
-21,314 +21,329 @@ CCL_NAMESPACE_BEGIN DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task) -: tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE), - profiler(NULL), - storage(device), - buffer(device), - device(device) + : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE), + profiler(NULL), + storage(device), + buffer(device), + device(device) { - radius = task.denoising.radius; - nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength)); - if(task.denoising.relative_pca) { - pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength)); - } - else { - pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength)); - } - - render_buffer.frame_stride = task.frame_stride; - render_buffer.pass_stride = task.pass_stride; - render_buffer.offset = task.pass_denoising_data; - - target_buffer.pass_stride = task.target_pass_stride; - target_buffer.denoising_clean_offset = task.pass_denoising_clean; - target_buffer.offset = 0; - - functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device); - functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device); - - tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int)); - tile_info->from_render = task.denoising_from_render? 
1 : 0; - - tile_info->frames[0] = 0; - tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES); - for(int i = 1; i < tile_info->num_frames; i++) { - tile_info->frames[i] = task.denoising_frames[i-1]; - } - - write_passes = task.denoising_write_passes; - do_filter = task.denoising_do_filter; + radius = task.denoising.radius; + nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength)); + if (task.denoising.relative_pca) { + pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength)); + } + else { + pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength)); + } + + render_buffer.frame_stride = task.frame_stride; + render_buffer.pass_stride = task.pass_stride; + render_buffer.offset = task.pass_denoising_data; + + target_buffer.pass_stride = task.target_pass_stride; + target_buffer.denoising_clean_offset = task.pass_denoising_clean; + target_buffer.offset = 0; + + functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device); + functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device); + + tile_info = (TileInfo *)tile_info_mem.alloc(sizeof(TileInfo) / sizeof(int)); + tile_info->from_render = task.denoising_from_render ? 
1 : 0; + + tile_info->frames[0] = 0; + tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES); + for (int i = 1; i < tile_info->num_frames; i++) { + tile_info->frames[i] = task.denoising_frames[i - 1]; + } + + write_passes = task.denoising_write_passes; + do_filter = task.denoising_do_filter; } DenoisingTask::~DenoisingTask() { - storage.XtWX.free(); - storage.XtWY.free(); - storage.transform.free(); - storage.rank.free(); - buffer.mem.free(); - buffer.temporary_mem.free(); - tile_info_mem.free(); + storage.XtWX.free(); + storage.XtWY.free(); + storage.transform.free(); + storage.rank.free(); + buffer.mem.free(); + buffer.temporary_mem.free(); + tile_info_mem.free(); } void DenoisingTask::set_render_buffer(RenderTile *rtiles) { - for(int i = 0; i < 9; i++) { - tile_info->offsets[i] = rtiles[i].offset; - tile_info->strides[i] = rtiles[i].stride; - tile_info->buffers[i] = rtiles[i].buffer; - } - tile_info->x[0] = rtiles[3].x; - tile_info->x[1] = rtiles[4].x; - tile_info->x[2] = rtiles[5].x; - tile_info->x[3] = rtiles[5].x + rtiles[5].w; - tile_info->y[0] = rtiles[1].y; - tile_info->y[1] = rtiles[4].y; - tile_info->y[2] = rtiles[7].y; - tile_info->y[3] = rtiles[7].y + rtiles[7].h; - - target_buffer.offset = rtiles[9].offset; - target_buffer.stride = rtiles[9].stride; - target_buffer.ptr = rtiles[9].buffer; - - if(write_passes && rtiles[9].buffers) { - target_buffer.denoising_output_offset = rtiles[9].buffers->params.get_denoising_prefiltered_offset(); - } - else { - target_buffer.denoising_output_offset = 0; - } - - tile_info_mem.copy_to_device(); + for (int i = 0; i < 9; i++) { + tile_info->offsets[i] = rtiles[i].offset; + tile_info->strides[i] = rtiles[i].stride; + tile_info->buffers[i] = rtiles[i].buffer; + } + tile_info->x[0] = rtiles[3].x; + tile_info->x[1] = rtiles[4].x; + tile_info->x[2] = rtiles[5].x; + tile_info->x[3] = rtiles[5].x + rtiles[5].w; + tile_info->y[0] = rtiles[1].y; + tile_info->y[1] = rtiles[4].y; + tile_info->y[2] = 
rtiles[7].y; + tile_info->y[3] = rtiles[7].y + rtiles[7].h; + + target_buffer.offset = rtiles[9].offset; + target_buffer.stride = rtiles[9].stride; + target_buffer.ptr = rtiles[9].buffer; + + if (write_passes && rtiles[9].buffers) { + target_buffer.denoising_output_offset = + rtiles[9].buffers->params.get_denoising_prefiltered_offset(); + } + else { + target_buffer.denoising_output_offset = 0; + } + + tile_info_mem.copy_to_device(); } void DenoisingTask::setup_denoising_buffer() { - /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ - rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); - rect = rect_expand(rect, radius); - rect = rect_clip(rect, make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); - - buffer.use_intensity = write_passes || (tile_info->num_frames > 1); - buffer.passes = buffer.use_intensity? 15 : 14; - buffer.width = rect.z - rect.x; - buffer.stride = align_up(buffer.width, 4); - buffer.h = rect.w - rect.y; - int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); - buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); - buffer.frame_stride = buffer.pass_stride * buffer.passes; - /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ - int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats); - buffer.mem.alloc_to_device(mem_size, false); - buffer.use_time = (tile_info->num_frames > 1); - - /* CPUs process shifts sequentially while GPUs process them in parallel. */ - int num_layers; - if(buffer.gpu_temporary_mem) { - /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. 
*/ - int max_radius = max(radius, 6); - int num_shifts = (2*max_radius + 1) * (2*max_radius + 1); - num_layers = 2*num_shifts + 1; - } - else { - num_layers = 3; - } - /* Allocate two layers per shift as well as one for the weight accumulation. */ - buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride); + /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ + rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); + rect = rect_expand(rect, radius); + rect = rect_clip(rect, + make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3])); + + buffer.use_intensity = write_passes || (tile_info->num_frames > 1); + buffer.passes = buffer.use_intensity ? 15 : 14; + buffer.width = rect.z - rect.x; + buffer.stride = align_up(buffer.width, 4); + buffer.h = rect.w - rect.y; + int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float)); + buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats); + buffer.frame_stride = buffer.pass_stride * buffer.passes; + /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */ + int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats); + buffer.mem.alloc_to_device(mem_size, false); + buffer.use_time = (tile_info->num_frames > 1); + + /* CPUs process shifts sequentially while GPUs process them in parallel. */ + int num_layers; + if (buffer.gpu_temporary_mem) { + /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */ + int max_radius = max(radius, 6); + int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1); + num_layers = 2 * num_shifts + 1; + } + else { + num_layers = 3; + } + /* Allocate two layers per shift as well as one for the weight accumulation. 
*/ + buffer.temporary_mem.alloc_to_device(num_layers * buffer.pass_stride); } void DenoisingTask::prefilter_shadowing() { - device_ptr null_ptr = (device_ptr) 0; - - device_sub_ptr unfiltered_a (buffer.mem, 0, buffer.pass_stride); - device_sub_ptr unfiltered_b (buffer.mem, 1*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var (buffer.mem, 2*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr sample_var_var (buffer.mem, 3*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr buffer_var (buffer.mem, 5*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr filtered_var (buffer.mem, 6*buffer.pass_stride, buffer.pass_stride); - - /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */ - functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); - - /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */ - nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false); - functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); - - /* Reuse memory, the previous data isn't needed anymore. */ - device_ptr filtered_a = *buffer_var, - filtered_b = *sample_var; - /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */ - nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false); - functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); - functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); - - device_ptr residual_var = *sample_var_var; - /* Estimate the residual variance between the two filtered halves. */ - functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); - - device_ptr final_a = *unfiltered_a, - final_b = *unfiltered_b; - /* Use the residual variance for a second filter pass. 
*/ - nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false); - functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); - functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); - - /* Combine the two double-filtered halves to a final shadow feature. */ - device_sub_ptr shadow_pass(buffer.mem, 4*buffer.pass_stride, buffer.pass_stride); - functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); + device_ptr null_ptr = (device_ptr)0; + + device_sub_ptr unfiltered_a(buffer.mem, 0, buffer.pass_stride); + device_sub_ptr unfiltered_b(buffer.mem, 1 * buffer.pass_stride, buffer.pass_stride); + device_sub_ptr sample_var(buffer.mem, 2 * buffer.pass_stride, buffer.pass_stride); + device_sub_ptr sample_var_var(buffer.mem, 3 * buffer.pass_stride, buffer.pass_stride); + device_sub_ptr buffer_var(buffer.mem, 5 * buffer.pass_stride, buffer.pass_stride); + device_sub_ptr filtered_var(buffer.mem, 6 * buffer.pass_stride, buffer.pass_stride); + + /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */ + functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var); + + /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */ + nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false); + functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var); + + /* Reuse memory, the previous data isn't needed anymore. */ + device_ptr filtered_a = *buffer_var, filtered_b = *sample_var; + /* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. 
*/ + nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false); + functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a); + functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b); + + device_ptr residual_var = *sample_var_var; + /* Estimate the residual variance between the two filtered halves. */ + functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect); + + device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b; + /* Use the residual variance for a second filter pass. */ + nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false); + functions.non_local_means(filtered_a, filtered_b, residual_var, final_a); + functions.non_local_means(filtered_b, filtered_a, residual_var, final_b); + + /* Combine the two double-filtered halves to a final shadow feature. */ + device_sub_ptr shadow_pass(buffer.mem, 4 * buffer.pass_stride, buffer.pass_stride); + functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect); } void DenoisingTask::prefilter_features() { - device_sub_ptr unfiltered (buffer.mem, 8*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr variance (buffer.mem, 9*buffer.pass_stride, buffer.pass_stride); - - int mean_from[] = { 0, 1, 2, 12, 6, 7, 8 }; - int variance_from[] = { 3, 4, 5, 13, 9, 10, 11}; - int pass_to[] = { 1, 2, 3, 0, 5, 6, 7}; - for(int pass = 0; pass < 7; pass++) { - device_sub_ptr feature_pass(buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride); - /* Get the unfiltered pass and its variance from the RenderBuffers. */ - functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance, 1.0f / render_buffer.samples); - /* Smooth the pass and store the result in the denoising buffers. 
*/ - nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false); - functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); - } + device_sub_ptr unfiltered(buffer.mem, 8 * buffer.pass_stride, buffer.pass_stride); + device_sub_ptr variance(buffer.mem, 9 * buffer.pass_stride, buffer.pass_stride); + + int mean_from[] = {0, 1, 2, 12, 6, 7, 8}; + int variance_from[] = {3, 4, 5, 13, 9, 10, 11}; + int pass_to[] = {1, 2, 3, 0, 5, 6, 7}; + for (int pass = 0; pass < 7; pass++) { + device_sub_ptr feature_pass( + buffer.mem, pass_to[pass] * buffer.pass_stride, buffer.pass_stride); + /* Get the unfiltered pass and its variance from the RenderBuffers. */ + functions.get_feature(mean_from[pass], + variance_from[pass], + *unfiltered, + *variance, + 1.0f / render_buffer.samples); + /* Smooth the pass and store the result in the denoising buffers. */ + nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false); + functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass); + } } void DenoisingTask::prefilter_color() { - int mean_from[] = {20, 21, 22}; - int variance_from[] = {23, 24, 25}; - int mean_to[] = { 8, 9, 10}; - int variance_to[] = {11, 12, 13}; - int num_color_passes = 3; - - device_only_memory<float> temporary_color(device, "denoising temporary color"); - temporary_color.alloc_to_device(3*buffer.pass_stride, false); - - for(int pass = 0; pass < num_color_passes; pass++) { - device_sub_ptr color_pass(temporary_color, pass*buffer.pass_stride, buffer.pass_stride); - device_sub_ptr color_var_pass(buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride); - functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass, 1.0f / render_buffer.samples); - } - - device_sub_ptr depth_pass (buffer.mem, 0, buffer.pass_stride); - device_sub_ptr color_var_pass(buffer.mem, variance_to[0]*buffer.pass_stride, 3*buffer.pass_stride); - device_sub_ptr output_pass (buffer.mem, mean_to[0]*buffer.pass_stride, 3*buffer.pass_stride); - 
functions.detect_outliers(temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); - - if(buffer.use_intensity) { - device_sub_ptr intensity_pass(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride); - nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2*4.0f, true); - functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass); - } + int mean_from[] = {20, 21, 22}; + int variance_from[] = {23, 24, 25}; + int mean_to[] = {8, 9, 10}; + int variance_to[] = {11, 12, 13}; + int num_color_passes = 3; + + device_only_memory<float> temporary_color(device, "denoising temporary color"); + temporary_color.alloc_to_device(3 * buffer.pass_stride, false); + + for (int pass = 0; pass < num_color_passes; pass++) { + device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride); + device_sub_ptr color_var_pass( + buffer.mem, variance_to[pass] * buffer.pass_stride, buffer.pass_stride); + functions.get_feature(mean_from[pass], + variance_from[pass], + *color_pass, + *color_var_pass, + 1.0f / render_buffer.samples); + } + + device_sub_ptr depth_pass(buffer.mem, 0, buffer.pass_stride); + device_sub_ptr color_var_pass( + buffer.mem, variance_to[0] * buffer.pass_stride, 3 * buffer.pass_stride); + device_sub_ptr output_pass(buffer.mem, mean_to[0] * buffer.pass_stride, 3 * buffer.pass_stride); + functions.detect_outliers( + temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass); + + if (buffer.use_intensity) { + device_sub_ptr intensity_pass(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride); + nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2 * 4.0f, true); + functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass); + } } void DenoisingTask::load_buffer() { - device_ptr null_ptr = (device_ptr) 0; - - int original_offset = render_buffer.offset; - - int num_passes = buffer.use_intensity? 
15 : 14; - for(int i = 0; i < tile_info->num_frames; i++) { - for(int pass = 0; pass < num_passes; pass++) { - device_sub_ptr to_pass(buffer.mem, i*buffer.frame_stride + pass*buffer.pass_stride, buffer.pass_stride); - bool is_variance = (pass >= 11) && (pass <= 13); - functions.get_feature(pass, -1, *to_pass, null_ptr, is_variance? (1.0f / render_buffer.samples) : 1.0f); - } - render_buffer.offset += render_buffer.frame_stride; - } - - render_buffer.offset = original_offset; + device_ptr null_ptr = (device_ptr)0; + + int original_offset = render_buffer.offset; + + int num_passes = buffer.use_intensity ? 15 : 14; + for (int i = 0; i < tile_info->num_frames; i++) { + for (int pass = 0; pass < num_passes; pass++) { + device_sub_ptr to_pass( + buffer.mem, i * buffer.frame_stride + pass * buffer.pass_stride, buffer.pass_stride); + bool is_variance = (pass >= 11) && (pass <= 13); + functions.get_feature( + pass, -1, *to_pass, null_ptr, is_variance ? (1.0f / render_buffer.samples) : 1.0f); + } + render_buffer.offset += render_buffer.frame_stride; + } + + render_buffer.offset = original_offset; } void DenoisingTask::write_buffer() { - reconstruction_state.buffer_params = make_int4(target_buffer.offset, - target_buffer.stride, - target_buffer.pass_stride, - target_buffer.denoising_clean_offset); - int num_passes = buffer.use_intensity? 15 : 14; - for(int pass = 0; pass < num_passes; pass++) { - device_sub_ptr from_pass(buffer.mem, pass*buffer.pass_stride, buffer.pass_stride); - int out_offset = pass + target_buffer.denoising_output_offset; - functions.write_feature(out_offset, *from_pass, target_buffer.ptr); - } + reconstruction_state.buffer_params = make_int4(target_buffer.offset, + target_buffer.stride, + target_buffer.pass_stride, + target_buffer.denoising_clean_offset); + int num_passes = buffer.use_intensity ? 
15 : 14; + for (int pass = 0; pass < num_passes; pass++) { + device_sub_ptr from_pass(buffer.mem, pass * buffer.pass_stride, buffer.pass_stride); + int out_offset = pass + target_buffer.denoising_output_offset; + functions.write_feature(out_offset, *from_pass, target_buffer.ptr); + } } void DenoisingTask::construct_transform() { - storage.w = filter_area.z; - storage.h = filter_area.w; + storage.w = filter_area.z; + storage.h = filter_area.w; - storage.transform.alloc_to_device(storage.w*storage.h*TRANSFORM_SIZE, false); - storage.rank.alloc_to_device(storage.w*storage.h, false); + storage.transform.alloc_to_device(storage.w * storage.h * TRANSFORM_SIZE, false); + storage.rank.alloc_to_device(storage.w * storage.h, false); - functions.construct_transform(); + functions.construct_transform(); } void DenoisingTask::reconstruct() { - storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false); - storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false); - storage.XtWX.zero_to_device(); - storage.XtWY.zero_to_device(); - - reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); - int tile_coordinate_offset = filter_area.y*target_buffer.stride + filter_area.x; - reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset, - target_buffer.stride, - target_buffer.pass_stride, - target_buffer.denoising_clean_offset); - reconstruction_state.source_w = rect.z-rect.x; - reconstruction_state.source_h = rect.w-rect.y; - - device_sub_ptr color_ptr (buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride); - device_sub_ptr color_var_ptr(buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride); - for(int f = 0; f < tile_info->num_frames; f++) { - device_ptr scale_ptr = 0; - device_sub_ptr *scale_sub_ptr = NULL; - if(tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) { - scale_sub_ptr = new device_sub_ptr(buffer.mem, 14*buffer.pass_stride, buffer.pass_stride); - 
scale_ptr = **scale_sub_ptr; - } - - functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f); - delete scale_sub_ptr; - } - functions.solve(target_buffer.ptr); + storage.XtWX.alloc_to_device(storage.w * storage.h * XTWX_SIZE, false); + storage.XtWY.alloc_to_device(storage.w * storage.h * XTWY_SIZE, false); + storage.XtWX.zero_to_device(); + storage.XtWY.zero_to_device(); + + reconstruction_state.filter_window = rect_from_shape( + filter_area.x - rect.x, filter_area.y - rect.y, storage.w, storage.h); + int tile_coordinate_offset = filter_area.y * target_buffer.stride + filter_area.x; + reconstruction_state.buffer_params = make_int4(target_buffer.offset + tile_coordinate_offset, + target_buffer.stride, + target_buffer.pass_stride, + target_buffer.denoising_clean_offset); + reconstruction_state.source_w = rect.z - rect.x; + reconstruction_state.source_h = rect.w - rect.y; + + device_sub_ptr color_ptr(buffer.mem, 8 * buffer.pass_stride, 3 * buffer.pass_stride); + device_sub_ptr color_var_ptr(buffer.mem, 11 * buffer.pass_stride, 3 * buffer.pass_stride); + for (int f = 0; f < tile_info->num_frames; f++) { + device_ptr scale_ptr = 0; + device_sub_ptr *scale_sub_ptr = NULL; + if (tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) { + scale_sub_ptr = new device_sub_ptr(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride); + scale_ptr = **scale_sub_ptr; + } + + functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f); + delete scale_sub_ptr; + } + functions.solve(target_buffer.ptr); } void DenoisingTask::run_denoising(RenderTile *tile) { - RenderTile rtiles[10]; - rtiles[4] = *tile; - functions.map_neighbor_tiles(rtiles); - set_render_buffer(rtiles); - - setup_denoising_buffer(); - - if(tile_info->from_render) { - prefilter_shadowing(); - prefilter_features(); - prefilter_color(); - } - else { - load_buffer(); - } - - if(do_filter) { - construct_transform(); - reconstruct(); - } - - if(write_passes) { - write_buffer(); - } - - 
functions.unmap_neighbor_tiles(rtiles); + RenderTile rtiles[10]; + rtiles[4] = *tile; + functions.map_neighbor_tiles(rtiles); + set_render_buffer(rtiles); + + setup_denoising_buffer(); + + if (tile_info->from_render) { + prefilter_shadowing(); + prefilter_features(); + prefilter_color(); + } + else { + load_buffer(); + } + + if (do_filter) { + construct_transform(); + reconstruct(); + } + + if (write_passes) { + write_buffer(); + } + + functions.unmap_neighbor_tiles(rtiles); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index 5869aa05390..bd1d0193dbd 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -28,165 +28,169 @@ CCL_NAMESPACE_BEGIN class DenoisingTask { -public: - /* Parameters of the denoising algorithm. */ - int radius; - float nlm_k_2; - float pca_threshold; - - /* Parameters of the RenderBuffers. */ - struct RenderBuffers { - int offset; - int pass_stride; - int frame_stride; - int samples; - } render_buffer; - - /* Pointer and parameters of the target buffer. */ - struct TargetBuffer { - int offset; - int stride; - int pass_stride; - int denoising_clean_offset; - int denoising_output_offset; - device_ptr ptr; - } target_buffer; - - TileInfo *tile_info; - device_vector<int> tile_info_mem; - - ProfilingState *profiler; - - int4 rect; - int4 filter_area; - - bool write_passes; - bool do_filter; - - struct DeviceFunctions { - function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */ - device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ - device_ptr variance_ptr, /* Contains the variance of the guide image. */ - device_ptr out_ptr /* The filtered output is written into this image. 
*/ - )> non_local_means; - function<bool(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame - )> accumulate; - function<bool(device_ptr output_ptr)> solve; - function<bool()> construct_transform; - - function<bool(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, - int4 rect - )> combine_halves; - function<bool(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr - )> divide_shadow; - function<bool(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale - )> get_feature; - function<bool(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr - )> detect_outliers; - function<bool(int out_offset, - device_ptr frop_ptr, - device_ptr buffer_ptr - )> write_feature; - function<void(RenderTile *rtiles)> map_neighbor_tiles; - function<void(RenderTile *rtiles)> unmap_neighbor_tiles; - } functions; - - /* Stores state of the current Reconstruction operation, - * which is accessed by the device in order to perform the operation. */ - struct ReconstructionState { - int4 filter_window; - int4 buffer_params; - - int source_w; - int source_h; - } reconstruction_state; - - /* Stores state of the current NLM operation, - * which is accessed by the device in order to perform the operation. */ - struct NLMState { - int r; /* Search radius of the filter. */ - int f; /* Patch size of the filter. */ - float a; /* Variance compensation factor in the MSE estimation. */ - float k_2; /* Squared value of the k parameter of the filter. 
*/ - bool is_color; - - void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) { r = r_; f = f_; a = a_, k_2 = k_2_; is_color = is_color_; } - } nlm_state; - - struct Storage { - device_only_memory<float> transform; - device_only_memory<int> rank; - device_only_memory<float> XtWX; - device_only_memory<float3> XtWY; - int w; - int h; - - Storage(Device *device) - : transform(device, "denoising transform"), - rank(device, "denoising rank"), - XtWX(device, "denoising XtWX"), - XtWY(device, "denoising XtWY") - {} - } storage; - - DenoisingTask(Device *device, const DeviceTask &task); - ~DenoisingTask(); - - void run_denoising(RenderTile *tile); - - struct DenoiseBuffers { - int pass_stride; - int passes; - int stride; - int h; - int width; - int frame_stride; - device_only_memory<float> mem; - device_only_memory<float> temporary_mem; - bool use_time; - bool use_intensity; - - bool gpu_temporary_mem; - - DenoiseBuffers(Device *device) - : mem(device, "denoising pixel buffer"), - temporary_mem(device, "denoising temporary mem") - {} - } buffer; - -protected: - Device *device; - - void set_render_buffer(RenderTile *rtiles); - void setup_denoising_buffer(); - void prefilter_shadowing(); - void prefilter_features(); - void prefilter_color(); - void construct_transform(); - void reconstruct(); - - void load_buffer(); - void write_buffer(); + public: + /* Parameters of the denoising algorithm. */ + int radius; + float nlm_k_2; + float pca_threshold; + + /* Parameters of the RenderBuffers. */ + struct RenderBuffers { + int offset; + int pass_stride; + int frame_stride; + int samples; + } render_buffer; + + /* Pointer and parameters of the target buffer. 
*/ + struct TargetBuffer { + int offset; + int stride; + int pass_stride; + int denoising_clean_offset; + int denoising_output_offset; + device_ptr ptr; + } target_buffer; + + TileInfo *tile_info; + device_vector<int> tile_info_mem; + + ProfilingState *profiler; + + int4 rect; + int4 filter_area; + + bool write_passes; + bool do_filter; + + struct DeviceFunctions { + function<bool( + device_ptr image_ptr, /* Contains the values that are smoothed. */ + device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */ + device_ptr variance_ptr, /* Contains the variance of the guide image. */ + device_ptr out_ptr /* The filtered output is written into this image. */ + )> + non_local_means; + function<bool( + device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)> + accumulate; + function<bool(device_ptr output_ptr)> solve; + function<bool()> construct_transform; + + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect)> + combine_halves; + function<bool(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr)> + divide_shadow; + function<bool(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale)> + get_feature; + function<bool(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr)> + detect_outliers; + function<bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature; + function<void(RenderTile *rtiles)> map_neighbor_tiles; + function<void(RenderTile *rtiles)> unmap_neighbor_tiles; + } functions; + + /* Stores state of the current Reconstruction operation, + * which is accessed by the device in order to perform the operation. 
*/ + struct ReconstructionState { + int4 filter_window; + int4 buffer_params; + + int source_w; + int source_h; + } reconstruction_state; + + /* Stores state of the current NLM operation, + * which is accessed by the device in order to perform the operation. */ + struct NLMState { + int r; /* Search radius of the filter. */ + int f; /* Patch size of the filter. */ + float a; /* Variance compensation factor in the MSE estimation. */ + float k_2; /* Squared value of the k parameter of the filter. */ + bool is_color; + + void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_) + { + r = r_; + f = f_; + a = a_, k_2 = k_2_; + is_color = is_color_; + } + } nlm_state; + + struct Storage { + device_only_memory<float> transform; + device_only_memory<int> rank; + device_only_memory<float> XtWX; + device_only_memory<float3> XtWY; + int w; + int h; + + Storage(Device *device) + : transform(device, "denoising transform"), + rank(device, "denoising rank"), + XtWX(device, "denoising XtWX"), + XtWY(device, "denoising XtWY") + { + } + } storage; + + DenoisingTask(Device *device, const DeviceTask &task); + ~DenoisingTask(); + + void run_denoising(RenderTile *tile); + + struct DenoiseBuffers { + int pass_stride; + int passes; + int stride; + int h; + int width; + int frame_stride; + device_only_memory<float> mem; + device_only_memory<float> temporary_mem; + bool use_time; + bool use_intensity; + + bool gpu_temporary_mem; + + DenoiseBuffers(Device *device) + : mem(device, "denoising pixel buffer"), temporary_mem(device, "denoising temporary mem") + { + } + } buffer; + + protected: + Device *device; + + void set_render_buffer(RenderTile *rtiles); + void setup_denoising_buffer(); + void prefilter_shadowing(); + void prefilter_features(); + void prefilter_color(); + void construct_transform(); + void reconstruct(); + + void load_buffer(); + void write_buffer(); }; CCL_NAMESPACE_END -#endif /* __DEVICE_DENOISING_H__ */ +#endif /* __DEVICE_DENOISING_H__ */ diff --git 
a/intern/cycles/device/device_intern.h b/intern/cycles/device/device_intern.h index 94df1e009eb..c393a3f9cda 100644 --- a/intern/cycles/device/device_intern.h +++ b/intern/cycles/device/device_intern.h @@ -21,19 +21,22 @@ CCL_NAMESPACE_BEGIN class Device; -Device *device_cpu_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background); +Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); bool device_opencl_init(); -Device *device_opencl_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background); -bool device_opencl_compile_kernel(const vector<string>& parameters); +Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); +bool device_opencl_compile_kernel(const vector<string> &parameters); bool device_cuda_init(); -Device *device_cuda_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background); -Device *device_network_create(DeviceInfo& info, Stats &stats, Profiler &profiler, const char *address); -Device *device_multi_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background); - -void device_cpu_info(vector<DeviceInfo>& devices); -void device_opencl_info(vector<DeviceInfo>& devices); -void device_cuda_info(vector<DeviceInfo>& devices); -void device_network_info(vector<DeviceInfo>& devices); +Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); +Device *device_network_create(DeviceInfo &info, + Stats &stats, + Profiler &profiler, + const char *address); +Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); + +void device_cpu_info(vector<DeviceInfo> &devices); +void device_opencl_info(vector<DeviceInfo> &devices); +void device_cuda_info(vector<DeviceInfo> &devices); +void device_network_info(vector<DeviceInfo> &devices); string device_cpu_capabilities(); string device_opencl_capabilities(); @@ -41,4 +44,4 @@ string
device_cuda_capabilities(); CCL_NAMESPACE_END -#endif /* __DEVICE_INTERN_H__ */ +#endif /* __DEVICE_INTERN_H__ */ diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp index a8d29896553..859535307f4 100644 --- a/intern/cycles/device/device_memory.cpp +++ b/intern/cycles/device/device_memory.cpp @@ -22,21 +22,21 @@ CCL_NAMESPACE_BEGIN /* Device Memory */ device_memory::device_memory(Device *device, const char *name, MemoryType type) -: data_type(device_type_traits<uchar>::data_type), - data_elements(device_type_traits<uchar>::num_elements), - data_size(0), - device_size(0), - data_width(0), - data_height(0), - data_depth(0), - type(type), - name(name), - interpolation(INTERPOLATION_NONE), - extension(EXTENSION_REPEAT), - device(device), - device_pointer(0), - host_pointer(0), - shared_pointer(0) + : data_type(device_type_traits<uchar>::data_type), + data_elements(device_type_traits<uchar>::num_elements), + data_size(0), + device_size(0), + data_width(0), + data_height(0), + data_depth(0), + type(type), + name(name), + interpolation(INTERPOLATION_NONE), + extension(EXTENSION_REPEAT), + device(device), + device_pointer(0), + host_pointer(0), + shared_pointer(0) { } @@ -46,95 +46,94 @@ device_memory::~device_memory() void *device_memory::host_alloc(size_t size) { - if(!size) { - return 0; - } + if (!size) { + return 0; + } - void *ptr = util_aligned_malloc(size, MIN_ALIGNMENT_CPU_DATA_TYPES); + void *ptr = util_aligned_malloc(size, MIN_ALIGNMENT_CPU_DATA_TYPES); - if(ptr) { - util_guarded_mem_alloc(size); - } - else { - throw std::bad_alloc(); - } + if (ptr) { + util_guarded_mem_alloc(size); + } + else { + throw std::bad_alloc(); + } - return ptr; + return ptr; } void device_memory::host_free() { - if(host_pointer) { - util_guarded_mem_free(memory_size()); - util_aligned_free((void*)host_pointer); - host_pointer = 0; - } + if (host_pointer) { + util_guarded_mem_free(memory_size()); + util_aligned_free((void *)host_pointer); + 
host_pointer = 0; + } } void device_memory::device_alloc() { - assert(!device_pointer && type != MEM_TEXTURE); - device->mem_alloc(*this); + assert(!device_pointer && type != MEM_TEXTURE); + device->mem_alloc(*this); } void device_memory::device_free() { - if(device_pointer) { - device->mem_free(*this); - } + if (device_pointer) { + device->mem_free(*this); + } } void device_memory::device_copy_to() { - if(host_pointer) { - device->mem_copy_to(*this); - } + if (host_pointer) { + device->mem_copy_to(*this); + } } void device_memory::device_copy_from(int y, int w, int h, int elem) { - assert(type != MEM_TEXTURE && type != MEM_READ_ONLY); - device->mem_copy_from(*this, y, w, h, elem); + assert(type != MEM_TEXTURE && type != MEM_READ_ONLY); + device->mem_copy_from(*this, y, w, h, elem); } void device_memory::device_zero() { - if(data_size) { - device->mem_zero(*this); - } + if (data_size) { + device->mem_zero(*this); + } } void device_memory::swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr) { - original_device = device; - original_device_size = device_size; - original_device_ptr = device_pointer; + original_device = device; + original_device_size = device_size; + original_device_ptr = device_pointer; - device = new_device; - device_size = new_device_size; - device_pointer = new_device_ptr; + device = new_device; + device_size = new_device_size; + device_pointer = new_device_ptr; } void device_memory::restore_device() { - device = original_device; - device_size = original_device_size; - device_pointer = original_device_ptr; + device = original_device; + device_size = original_device_size; + device_pointer = original_device_ptr; } /* Device Sub Ptr */ -device_sub_ptr::device_sub_ptr(device_memory& mem, int offset, int size) -: device(mem.device) +device_sub_ptr::device_sub_ptr(device_memory &mem, int offset, int size) : device(mem.device) { - ptr = device->mem_alloc_sub_ptr(mem, offset, size); + ptr = device->mem_alloc_sub_ptr(mem, 
offset, size); } device_sub_ptr::~device_sub_ptr() { - device->mem_free_sub_ptr(ptr); + device->mem_free_sub_ptr(ptr); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index e43834bdc8d..f50184efba7 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -31,152 +31,155 @@ CCL_NAMESPACE_BEGIN class Device; -enum MemoryType { - MEM_READ_ONLY, - MEM_READ_WRITE, - MEM_DEVICE_ONLY, - MEM_TEXTURE, - MEM_PIXELS -}; +enum MemoryType { MEM_READ_ONLY, MEM_READ_WRITE, MEM_DEVICE_ONLY, MEM_TEXTURE, MEM_PIXELS }; /* Supported Data Types */ enum DataType { - TYPE_UNKNOWN, - TYPE_UCHAR, - TYPE_UINT16, - TYPE_UINT, - TYPE_INT, - TYPE_FLOAT, - TYPE_HALF, - TYPE_UINT64, + TYPE_UNKNOWN, + TYPE_UCHAR, + TYPE_UINT16, + TYPE_UINT, + TYPE_INT, + TYPE_FLOAT, + TYPE_HALF, + TYPE_UINT64, }; static inline size_t datatype_size(DataType datatype) { - switch(datatype) { - case TYPE_UNKNOWN: return 1; - case TYPE_UCHAR: return sizeof(uchar); - case TYPE_FLOAT: return sizeof(float); - case TYPE_UINT: return sizeof(uint); - case TYPE_UINT16: return sizeof(uint16_t); - case TYPE_INT: return sizeof(int); - case TYPE_HALF: return sizeof(half); - case TYPE_UINT64: return sizeof(uint64_t); - default: return 0; - } + switch (datatype) { + case TYPE_UNKNOWN: + return 1; + case TYPE_UCHAR: + return sizeof(uchar); + case TYPE_FLOAT: + return sizeof(float); + case TYPE_UINT: + return sizeof(uint); + case TYPE_UINT16: + return sizeof(uint16_t); + case TYPE_INT: + return sizeof(int); + case TYPE_HALF: + return sizeof(half); + case TYPE_UINT64: + return sizeof(uint64_t); + default: + return 0; + } } /* Traits for data types */ template<typename T> struct device_type_traits { - static const DataType data_type = TYPE_UNKNOWN; - static const int num_elements = sizeof(T); + static const DataType data_type = TYPE_UNKNOWN; + static const int num_elements = sizeof(T); }; template<> struct device_type_traits<uchar> { - 
static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 1; + static const DataType data_type = TYPE_UCHAR; + static const int num_elements = 1; }; template<> struct device_type_traits<uchar2> { - static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 2; + static const DataType data_type = TYPE_UCHAR; + static const int num_elements = 2; }; template<> struct device_type_traits<uchar3> { - static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 3; + static const DataType data_type = TYPE_UCHAR; + static const int num_elements = 3; }; template<> struct device_type_traits<uchar4> { - static const DataType data_type = TYPE_UCHAR; - static const int num_elements = 4; + static const DataType data_type = TYPE_UCHAR; + static const int num_elements = 4; }; template<> struct device_type_traits<uint> { - static const DataType data_type = TYPE_UINT; - static const int num_elements = 1; + static const DataType data_type = TYPE_UINT; + static const int num_elements = 1; }; template<> struct device_type_traits<uint2> { - static const DataType data_type = TYPE_UINT; - static const int num_elements = 2; + static const DataType data_type = TYPE_UINT; + static const int num_elements = 2; }; template<> struct device_type_traits<uint3> { - static const DataType data_type = TYPE_UINT; - static const int num_elements = 3; + static const DataType data_type = TYPE_UINT; + static const int num_elements = 3; }; template<> struct device_type_traits<uint4> { - static const DataType data_type = TYPE_UINT; - static const int num_elements = 4; + static const DataType data_type = TYPE_UINT; + static const int num_elements = 4; }; template<> struct device_type_traits<int> { - static const DataType data_type = TYPE_INT; - static const int num_elements = 1; + static const DataType data_type = TYPE_INT; + static const int num_elements = 1; }; template<> struct device_type_traits<int2> { - static const DataType data_type = 
TYPE_INT; - static const int num_elements = 2; + static const DataType data_type = TYPE_INT; + static const int num_elements = 2; }; template<> struct device_type_traits<int3> { - static const DataType data_type = TYPE_INT; - static const int num_elements = 3; + static const DataType data_type = TYPE_INT; + static const int num_elements = 3; }; template<> struct device_type_traits<int4> { - static const DataType data_type = TYPE_INT; - static const int num_elements = 4; + static const DataType data_type = TYPE_INT; + static const int num_elements = 4; }; template<> struct device_type_traits<float> { - static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 1; + static const DataType data_type = TYPE_FLOAT; + static const int num_elements = 1; }; template<> struct device_type_traits<float2> { - static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 2; + static const DataType data_type = TYPE_FLOAT; + static const int num_elements = 2; }; template<> struct device_type_traits<float3> { - static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 4; + static const DataType data_type = TYPE_FLOAT; + static const int num_elements = 4; }; template<> struct device_type_traits<float4> { - static const DataType data_type = TYPE_FLOAT; - static const int num_elements = 4; + static const DataType data_type = TYPE_FLOAT; + static const int num_elements = 4; }; template<> struct device_type_traits<half> { - static const DataType data_type = TYPE_HALF; - static const int num_elements = 1; + static const DataType data_type = TYPE_HALF; + static const int num_elements = 1; }; template<> struct device_type_traits<ushort4> { - static const DataType data_type = TYPE_UINT16; - static const int num_elements = 4; + static const DataType data_type = TYPE_UINT16; + static const int num_elements = 4; }; template<> struct device_type_traits<uint16_t> { - static const DataType data_type = TYPE_UINT16; - static const int 
num_elements = 1; + static const DataType data_type = TYPE_UINT16; + static const int num_elements = 1; }; template<> struct device_type_traits<half4> { - static const DataType data_type = TYPE_HALF; - static const int num_elements = 4; + static const DataType data_type = TYPE_HALF; + static const int num_elements = 4; }; template<> struct device_type_traits<uint64_t> { - static const DataType data_type = TYPE_UINT64; - static const int num_elements = 1; + static const DataType data_type = TYPE_UINT64; + static const int num_elements = 1; }; /* Device Memory @@ -184,64 +187,67 @@ template<> struct device_type_traits<uint64_t> { * Base class for all device memory. This should not be allocated directly, * instead the appropriate subclass can be used. */ -class device_memory -{ -public: - size_t memory_size() { return data_size*data_elements*datatype_size(data_type); } - size_t memory_elements_size(int elements) { - return elements*data_elements*datatype_size(data_type); - } - - /* Data information. */ - DataType data_type; - int data_elements; - size_t data_size; - size_t device_size; - size_t data_width; - size_t data_height; - size_t data_depth; - MemoryType type; - const char *name; - InterpolationType interpolation; - ExtensionType extension; - - /* Pointers. */ - Device *device; - device_ptr device_pointer; - void *host_pointer; - void *shared_pointer; - - virtual ~device_memory(); - - void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr); - void restore_device(); - -protected: - friend class CUDADevice; - - /* Only create through subclasses. */ - device_memory(Device *device, const char *name, MemoryType type); - - /* No copying allowed. */ - device_memory(const device_memory&); - device_memory& operator = (const device_memory&); - - /* Host allocation on the device. All host_pointer memory should be - * allocated with these functions, for devices that support using - * the same pointer for host and device. 
*/ - void *host_alloc(size_t size); - void host_free(); - - /* Device memory allocation and copying. */ - void device_alloc(); - void device_free(); - void device_copy_to(); - void device_copy_from(int y, int w, int h, int elem); - void device_zero(); - - device_ptr original_device_ptr; - size_t original_device_size; - Device *original_device; +class device_memory { + public: + size_t memory_size() + { + return data_size * data_elements * datatype_size(data_type); + } + size_t memory_elements_size(int elements) + { + return elements * data_elements * datatype_size(data_type); + } + + /* Data information. */ + DataType data_type; + int data_elements; + size_t data_size; + size_t device_size; + size_t data_width; + size_t data_height; + size_t data_depth; + MemoryType type; + const char *name; + InterpolationType interpolation; + ExtensionType extension; + + /* Pointers. */ + Device *device; + device_ptr device_pointer; + void *host_pointer; + void *shared_pointer; + + virtual ~device_memory(); + + void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr); + void restore_device(); + + protected: + friend class CUDADevice; + + /* Only create through subclasses. */ + device_memory(Device *device, const char *name, MemoryType type); + + /* No copying allowed. */ + device_memory(const device_memory &); + device_memory &operator=(const device_memory &); + + /* Host allocation on the device. All host_pointer memory should be + * allocated with these functions, for devices that support using + * the same pointer for host and device. */ + void *host_alloc(size_t size); + void host_free(); + + /* Device memory allocation and copying. 
*/ + void device_alloc(); + void device_free(); + void device_copy_to(); + void device_copy_from(int y, int w, int h, int elem); + void device_zero(); + + device_ptr original_device_ptr; + size_t original_device_size; + Device *original_device; }; /* Device Only Memory @@ -249,51 +255,49 @@ protected: * Working memory only needed by the device, with no corresponding allocation * on the host. Only used internally in the device implementations. */ -template<typename T> -class device_only_memory : public device_memory -{ -public: - device_only_memory(Device *device, const char *name) - : device_memory(device, name, MEM_DEVICE_ONLY) - { - data_type = device_type_traits<T>::data_type; - data_elements = max(device_type_traits<T>::num_elements, 1); - } - - virtual ~device_only_memory() - { - free(); - } - - void alloc_to_device(size_t num, bool shrink_to_fit = true) - { - size_t new_size = num; - bool reallocate; - - if(shrink_to_fit) { - reallocate = (data_size != new_size); - } - else { - reallocate = (data_size < new_size); - } - - if(reallocate) { - device_free(); - data_size = new_size; - device_alloc(); - } - } - - void free() - { - device_free(); - data_size = 0; - } - - void zero_to_device() - { - device_zero(); - } +template<typename T> class device_only_memory : public device_memory { + public: + device_only_memory(Device *device, const char *name) + : device_memory(device, name, MEM_DEVICE_ONLY) + { + data_type = device_type_traits<T>::data_type; + data_elements = max(device_type_traits<T>::num_elements, 1); + } + + virtual ~device_only_memory() + { + free(); + } + + void alloc_to_device(size_t num, bool shrink_to_fit = true) + { + size_t new_size = num; + bool reallocate; + + if (shrink_to_fit) { + reallocate = (data_size != new_size); + } + else { + reallocate = (data_size < new_size); + } + + if (reallocate) { + device_free(); + data_size = new_size; + device_alloc(); + } + } + + void free() + { + device_free(); + data_size = 0; + } + + void zero_to_device() 
+ { + device_zero(); + } }; /* Device Vector @@ -307,135 +311,134 @@ public: * automatically attached to kernel globals, using the provided name * matching an entry in kernel_textures.h. */ -template<typename T> class device_vector : public device_memory -{ -public: - device_vector(Device *device, const char *name, MemoryType type) - : device_memory(device, name, type) - { - data_type = device_type_traits<T>::data_type; - data_elements = device_type_traits<T>::num_elements; - - assert(data_elements > 0); - } - - virtual ~device_vector() - { - free(); - } - - /* Host memory allocation. */ - T *alloc(size_t width, size_t height = 0, size_t depth = 0) - { - size_t new_size = size(width, height, depth); - - if(new_size != data_size) { - device_free(); - host_free(); - host_pointer = host_alloc(sizeof(T)*new_size); - assert(device_pointer == 0); - } - - data_size = new_size; - data_width = width; - data_height = height; - data_depth = depth; - - return data(); - } - - /* Host memory resize. Only use this if the original data needs to be - * preserved, it is faster to call alloc() if it can be discarded. */ - T *resize(size_t width, size_t height = 0, size_t depth = 0) - { - size_t new_size = size(width, height, depth); - - if(new_size != data_size) { - void *new_ptr = host_alloc(sizeof(T)*new_size); - - if(new_size && data_size) { - size_t min_size = ((new_size < data_size)? new_size: data_size); - memcpy((T*)new_ptr, (T*)host_pointer, sizeof(T)*min_size); - } - - device_free(); - host_free(); - host_pointer = new_ptr; - assert(device_pointer == 0); - } - - data_size = new_size; - data_width = width; - data_height = height; - data_depth = depth; - - return data(); - } - - /* Take over data from an existing array. 
*/ - void steal_data(array<T>& from) - { - device_free(); - host_free(); - - data_size = from.size(); - data_width = 0; - data_height = 0; - data_depth = 0; - host_pointer = from.steal_pointer(); - assert(device_pointer == 0); - } - - /* Free device and host memory. */ - void free() - { - device_free(); - host_free(); - - data_size = 0; - data_width = 0; - data_height = 0; - data_depth = 0; - host_pointer = 0; - assert(device_pointer == 0); - } - - size_t size() - { - return data_size; - } - - T* data() - { - return (T*)host_pointer; - } - - T& operator[](size_t i) - { - assert(i < data_size); - return data()[i]; - } - - void copy_to_device() - { - device_copy_to(); - } - - void copy_from_device(int y, int w, int h) - { - device_copy_from(y, w, h, sizeof(T)); - } - - void zero_to_device() - { - device_zero(); - } - -protected: - size_t size(size_t width, size_t height, size_t depth) - { - return width * ((height == 0)? 1: height) * ((depth == 0)? 1: depth); - } +template<typename T> class device_vector : public device_memory { + public: + device_vector(Device *device, const char *name, MemoryType type) + : device_memory(device, name, type) + { + data_type = device_type_traits<T>::data_type; + data_elements = device_type_traits<T>::num_elements; + + assert(data_elements > 0); + } + + virtual ~device_vector() + { + free(); + } + + /* Host memory allocation. */ + T *alloc(size_t width, size_t height = 0, size_t depth = 0) + { + size_t new_size = size(width, height, depth); + + if (new_size != data_size) { + device_free(); + host_free(); + host_pointer = host_alloc(sizeof(T) * new_size); + assert(device_pointer == 0); + } + + data_size = new_size; + data_width = width; + data_height = height; + data_depth = depth; + + return data(); + } + + /* Host memory resize. Only use this if the original data needs to be + * preserved, it is faster to call alloc() if it can be discarded. 
*/ + T *resize(size_t width, size_t height = 0, size_t depth = 0) + { + size_t new_size = size(width, height, depth); + + if (new_size != data_size) { + void *new_ptr = host_alloc(sizeof(T) * new_size); + + if (new_size && data_size) { + size_t min_size = ((new_size < data_size) ? new_size : data_size); + memcpy((T *)new_ptr, (T *)host_pointer, sizeof(T) * min_size); + } + + device_free(); + host_free(); + host_pointer = new_ptr; + assert(device_pointer == 0); + } + + data_size = new_size; + data_width = width; + data_height = height; + data_depth = depth; + + return data(); + } + + /* Take over data from an existing array. */ + void steal_data(array<T> &from) + { + device_free(); + host_free(); + + data_size = from.size(); + data_width = 0; + data_height = 0; + data_depth = 0; + host_pointer = from.steal_pointer(); + assert(device_pointer == 0); + } + + /* Free device and host memory. */ + void free() + { + device_free(); + host_free(); + + data_size = 0; + data_width = 0; + data_height = 0; + data_depth = 0; + host_pointer = 0; + assert(device_pointer == 0); + } + + size_t size() + { + return data_size; + } + + T *data() + { + return (T *)host_pointer; + } + + T &operator[](size_t i) + { + assert(i < data_size); + return data()[i]; + } + + void copy_to_device() + { + device_copy_to(); + } + + void copy_from_device(int y, int w, int h) + { + device_copy_from(y, w, h, sizeof(T)); + } + + void zero_to_device() + { + device_zero(); + } + + protected: + size_t size(size_t width, size_t height, size_t depth) + { + return width * ((height == 0) ? 1 : height) * ((depth == 0) ? 1 : depth); + } }; /* Pixel Memory @@ -443,28 +446,26 @@ protected: * Device memory to efficiently draw as pixels to the screen in interactive * rendering. Only copying pixels from the device is supported, not copying to. 
*/ -template<typename T> class device_pixels : public device_vector<T> -{ -public: - device_pixels(Device *device, const char *name) - : device_vector<T>(device, name, MEM_PIXELS) - { - } - - void alloc_to_device(size_t width, size_t height, size_t depth = 0) - { - device_vector<T>::alloc(width, height, depth); - - if(!device_memory::device_pointer) { - device_memory::device_alloc(); - } - } - - T *copy_from_device(int y, int w, int h) - { - device_memory::device_copy_from(y, w, h, sizeof(T)); - return device_vector<T>::data(); - } +template<typename T> class device_pixels : public device_vector<T> { + public: + device_pixels(Device *device, const char *name) : device_vector<T>(device, name, MEM_PIXELS) + { + } + + void alloc_to_device(size_t width, size_t height, size_t depth = 0) + { + device_vector<T>::alloc(width, height, depth); + + if (!device_memory::device_pointer) { + device_memory::device_alloc(); + } + } + + T *copy_from_device(int y, int w, int h) + { + device_memory::device_copy_from(y, w, h, sizeof(T)); + return device_vector<T>::data(); + } }; /* Device Sub Memory @@ -476,25 +477,24 @@ public: * Note: some devices require offset and size of the sub_ptr to be properly * aligned to device->mem_address_alingment(). */ -class device_sub_ptr -{ -public: - device_sub_ptr(device_memory& mem, int offset, int size); - ~device_sub_ptr(); +class device_sub_ptr { + public: + device_sub_ptr(device_memory &mem, int offset, int size); + ~device_sub_ptr(); - device_ptr operator*() const - { - return ptr; - } + device_ptr operator*() const + { + return ptr; + } -protected: - /* No copying. */ - device_sub_ptr& operator = (const device_sub_ptr&); + protected: + /* No copying. 
*/ + device_sub_ptr &operator=(const device_sub_ptr &); - Device *device; - device_ptr ptr; + Device *device; + device_ptr ptr; }; CCL_NAMESPACE_END -#endif /* __DEVICE_MEMORY_H__ */ +#endif /* __DEVICE_MEMORY_H__ */ diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index bdb7c87fa57..4a40e106115 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -31,391 +31,406 @@ CCL_NAMESPACE_BEGIN -class MultiDevice : public Device -{ -public: - struct SubDevice { - explicit SubDevice(Device *device_) - : device(device_) {} - - Device *device; - map<device_ptr, device_ptr> ptr_map; - }; - - list<SubDevice> devices; - device_ptr unique_key; - - MultiDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_) - : Device(info, stats, profiler, background_), unique_key(1) - { - foreach(DeviceInfo& subinfo, info.multi_devices) { - Device *device = Device::create(subinfo, sub_stats_, profiler, background); - - /* Always add CPU devices at the back since GPU devices can change - * host memory pointers, which CPU uses as device pointer. */ - if(subinfo.type == DEVICE_CPU) { - devices.push_back(SubDevice(device)); - } - else { - devices.push_front(SubDevice(device)); - } - } +class MultiDevice : public Device { + public: + struct SubDevice { + explicit SubDevice(Device *device_) : device(device_) + { + } + + Device *device; + map<device_ptr, device_ptr> ptr_map; + }; + + list<SubDevice> devices; + device_ptr unique_key; + + MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) + : Device(info, stats, profiler, background_), unique_key(1) + { + foreach (DeviceInfo &subinfo, info.multi_devices) { + Device *device = Device::create(subinfo, sub_stats_, profiler, background); + + /* Always add CPU devices at the back since GPU devices can change + * host memory pointers, which CPU uses as device pointer. 
*/ + if (subinfo.type == DEVICE_CPU) { + devices.push_back(SubDevice(device)); + } + else { + devices.push_front(SubDevice(device)); + } + } #ifdef WITH_NETWORK - /* try to add network devices */ - ServerDiscovery discovery(true); - time_sleep(1.0); + /* try to add network devices */ + ServerDiscovery discovery(true); + time_sleep(1.0); - vector<string> servers = discovery.get_server_list(); + vector<string> servers = discovery.get_server_list(); - foreach(string& server, servers) { - Device *device = device_network_create(info, stats, profiler, server.c_str()); - if(device) - devices.push_back(SubDevice(device)); - } + foreach (string &server, servers) { + Device *device = device_network_create(info, stats, profiler, server.c_str()); + if (device) + devices.push_back(SubDevice(device)); + } #endif - } - - ~MultiDevice() - { - foreach(SubDevice& sub, devices) - delete sub.device; - } - - const string& error_message() - { - foreach(SubDevice& sub, devices) { - if(sub.device->error_message() != "") { - if(error_msg == "") - error_msg = sub.device->error_message(); - break; - } - } - - return error_msg; - } - - virtual bool show_samples() const - { - if(devices.size() > 1) { - return false; - } - return devices.front().device->show_samples(); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const { - BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; - foreach(const SubDevice& sub_device, devices) { - bvh_layout_mask &= sub_device.device->get_bvh_layout_mask(); - } - return bvh_layout_mask; - } - - bool load_kernels(const DeviceRequestedFeatures& requested_features) - { - foreach(SubDevice& sub, devices) - if(!sub.device->load_kernels(requested_features)) - return false; - - return true; - } - - bool wait_for_availability(const DeviceRequestedFeatures& requested_features) - { - foreach(SubDevice& sub, devices) - if(!sub.device->wait_for_availability(requested_features)) - return false; - - return true; - } - - DeviceKernelStatus get_active_kernel_switch_state() - { - 
DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL; - - foreach(SubDevice& sub, devices) { - DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state(); - switch (subresult) { - case DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL: - result = subresult; - break; - - case DEVICE_KERNEL_FEATURE_KERNEL_INVALID: - case DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE: - return subresult; - - case DEVICE_KERNEL_USING_FEATURE_KERNEL: - case DEVICE_KERNEL_UNKNOWN: - break; - } - } - return result; - } - - void mem_alloc(device_memory& mem) - { - device_ptr key = unique_key++; - - foreach(SubDevice& sub, devices) { - mem.device = sub.device; - mem.device_pointer = 0; - mem.device_size = 0; - - sub.device->mem_alloc(mem); - sub.ptr_map[key] = mem.device_pointer; - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size); - } - - void mem_copy_to(device_memory& mem) - { - device_ptr existing_key = mem.device_pointer; - device_ptr key = (existing_key)? existing_key: unique_key++; - size_t existing_size = mem.device_size; - - foreach(SubDevice& sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key)? sub.ptr_map[existing_key]: 0; - mem.device_size = existing_size; - - sub.device->mem_copy_to(mem); - sub.ptr_map[key] = mem.device_pointer; - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size - existing_size); - } - - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) - { - device_ptr key = mem.device_pointer; - int i = 0, sub_h = h/devices.size(); - - foreach(SubDevice& sub, devices) { - int sy = y + i*sub_h; - int sh = (i == (int)devices.size() - 1)? 
h - sub_h*i: sub_h; - - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; - - sub.device->mem_copy_from(mem, sy, w, sh, elem); - i++; - } - - mem.device = this; - mem.device_pointer = key; - } - - void mem_zero(device_memory& mem) - { - device_ptr existing_key = mem.device_pointer; - device_ptr key = (existing_key)? existing_key: unique_key++; - size_t existing_size = mem.device_size; - - foreach(SubDevice& sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key)? sub.ptr_map[existing_key]: 0; - mem.device_size = existing_size; - - sub.device->mem_zero(mem); - sub.ptr_map[key] = mem.device_pointer; - } - - mem.device = this; - mem.device_pointer = key; - stats.mem_alloc(mem.device_size - existing_size); - } - - void mem_free(device_memory& mem) - { - device_ptr key = mem.device_pointer; - size_t existing_size = mem.device_size; - - foreach(SubDevice& sub, devices) { - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; - mem.device_size = existing_size; - - sub.device->mem_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(key)); - } - - mem.device = this; - mem.device_pointer = 0; - mem.device_size = 0; - stats.mem_free(existing_size); - } - - void const_copy_to(const char *name, void *host, size_t size) - { - foreach(SubDevice& sub, devices) - sub.device->const_copy_to(name, host, size); - } - - void draw_pixels( - device_memory& rgba, int y, - int w, int h, int width, int height, - int dx, int dy, int dw, int dh, - bool transparent, const DeviceDrawParams &draw_params) - { - device_ptr key = rgba.device_pointer; - int i = 0, sub_h = h/devices.size(); - int sub_height = height/devices.size(); - - foreach(SubDevice& sub, devices) { - int sy = y + i*sub_h; - int sh = (i == (int)devices.size() - 1)? h - sub_h*i: sub_h; - int sheight = (i == (int)devices.size() - 1)? 
height - sub_height*i: sub_height; - int sdy = dy + i*sub_height; - /* adjust math for w/width */ - - rgba.device_pointer = sub.ptr_map[key]; - sub.device->draw_pixels(rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params); - i++; - } - - rgba.device_pointer = key; - } - - void map_tile(Device *sub_device, RenderTile& tile) - { - foreach(SubDevice& sub, devices) { - if(sub.device == sub_device) { - if(tile.buffer) tile.buffer = sub.ptr_map[tile.buffer]; - } - } - } - - int device_number(Device *sub_device) - { - int i = 0; - - foreach(SubDevice& sub, devices) { - if(sub.device == sub_device) - return i; - i++; - } - - return -1; - } - - void map_neighbor_tiles(Device *sub_device, RenderTile *tiles) - { - for(int i = 0; i < 9; i++) { - if(!tiles[i].buffers) { - continue; - } - - /* If the tile was rendered on another device, copy its memory to - * to the current device now, for the duration of the denoising task. - * Note that this temporarily modifies the RenderBuffers and calls - * the device, so this function is not thread safe. */ - device_vector<float> &mem = tiles[i].buffers->buffer; - if(mem.device != sub_device) { - /* Only copy from device to host once. This is faster, but - * also required for the case where a CPU thread is denoising - * a tile rendered on the GPU. In that case we have to avoid - * overwriting the buffer being denoised by the CPU thread. */ - if(!tiles[i].buffers->map_neighbor_copied) { - tiles[i].buffers->map_neighbor_copied = true; - mem.copy_from_device(0, mem.data_size, 1); - } - - mem.swap_device(sub_device, 0, 0); - - mem.copy_to_device(); - tiles[i].buffer = mem.device_pointer; - tiles[i].device_size = mem.device_size; - - mem.restore_device(); - } - } - } - - void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles) - { - /* Copy denoised result back to the host. 
*/ - device_vector<float> &mem = tiles[9].buffers->buffer; - mem.swap_device(sub_device, tiles[9].device_size, tiles[9].buffer); - mem.copy_from_device(0, mem.data_size, 1); - mem.restore_device(); - /* Copy denoised result to the original device. */ - mem.copy_to_device(); - - for(int i = 0; i < 9; i++) { - if(!tiles[i].buffers) { - continue; - } - - device_vector<float> &mem = tiles[i].buffers->buffer; - if(mem.device != sub_device) { - mem.swap_device(sub_device, tiles[i].device_size, tiles[i].buffer); - sub_device->mem_free(mem); - mem.restore_device(); - } - } - } - - int get_split_task_count(DeviceTask& task) - { - int total_tasks = 0; - list<DeviceTask> tasks; - task.split(tasks, devices.size()); - foreach(SubDevice& sub, devices) { - if(!tasks.empty()) { - DeviceTask subtask = tasks.front(); - tasks.pop_front(); - - total_tasks += sub.device->get_split_task_count(subtask); - } - } - return total_tasks; - } - - void task_add(DeviceTask& task) - { - list<DeviceTask> tasks; - task.split(tasks, devices.size()); - - foreach(SubDevice& sub, devices) { - if(!tasks.empty()) { - DeviceTask subtask = tasks.front(); - tasks.pop_front(); - - if(task.buffer) subtask.buffer = sub.ptr_map[task.buffer]; - if(task.rgba_byte) subtask.rgba_byte = sub.ptr_map[task.rgba_byte]; - if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half]; - if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input]; - if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output]; - - sub.device->task_add(subtask); - } - } - } - - void task_wait() - { - foreach(SubDevice& sub, devices) - sub.device->task_wait(); - } - - void task_cancel() - { - foreach(SubDevice& sub, devices) - sub.device->task_cancel(); - } - -protected: - Stats sub_stats_; + } + + ~MultiDevice() + { + foreach (SubDevice &sub, devices) + delete sub.device; + } + + const string &error_message() + { + foreach (SubDevice &sub, devices) { + if (sub.device->error_message() != "") { + if 
(error_msg == "") + error_msg = sub.device->error_message(); + break; + } + } + + return error_msg; + } + + virtual bool show_samples() const + { + if (devices.size() > 1) { + return false; + } + return devices.front().device->show_samples(); + } + + virtual BVHLayoutMask get_bvh_layout_mask() const + { + BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL; + foreach (const SubDevice &sub_device, devices) { + bvh_layout_mask &= sub_device.device->get_bvh_layout_mask(); + } + return bvh_layout_mask; + } + + bool load_kernels(const DeviceRequestedFeatures &requested_features) + { + foreach (SubDevice &sub, devices) + if (!sub.device->load_kernels(requested_features)) + return false; + + return true; + } + + bool wait_for_availability(const DeviceRequestedFeatures &requested_features) + { + foreach (SubDevice &sub, devices) + if (!sub.device->wait_for_availability(requested_features)) + return false; + + return true; + } + + DeviceKernelStatus get_active_kernel_switch_state() + { + DeviceKernelStatus result = DEVICE_KERNEL_USING_FEATURE_KERNEL; + + foreach (SubDevice &sub, devices) { + DeviceKernelStatus subresult = sub.device->get_active_kernel_switch_state(); + switch (subresult) { + case DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL: + result = subresult; + break; + + case DEVICE_KERNEL_FEATURE_KERNEL_INVALID: + case DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE: + return subresult; + + case DEVICE_KERNEL_USING_FEATURE_KERNEL: + case DEVICE_KERNEL_UNKNOWN: + break; + } + } + return result; + } + + void mem_alloc(device_memory &mem) + { + device_ptr key = unique_key++; + + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + mem.device_pointer = 0; + mem.device_size = 0; + + sub.device->mem_alloc(mem); + sub.ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size); + } + + void mem_copy_to(device_memory &mem) + { + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key) ? 
existing_key : unique_key++; + size_t existing_size = mem.device_size; + + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + sub.device->mem_copy_to(mem); + sub.ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); + } + + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) + { + device_ptr key = mem.device_pointer; + int i = 0, sub_h = h / devices.size(); + + foreach (SubDevice &sub, devices) { + int sy = y + i * sub_h; + int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; + + mem.device = sub.device; + mem.device_pointer = sub.ptr_map[key]; + + sub.device->mem_copy_from(mem, sy, w, sh, elem); + i++; + } + + mem.device = this; + mem.device_pointer = key; + } + + void mem_zero(device_memory &mem) + { + device_ptr existing_key = mem.device_pointer; + device_ptr key = (existing_key) ? existing_key : unique_key++; + size_t existing_size = mem.device_size; + + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + mem.device_pointer = (existing_key) ? 
sub.ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + sub.device->mem_zero(mem); + sub.ptr_map[key] = mem.device_pointer; + } + + mem.device = this; + mem.device_pointer = key; + stats.mem_alloc(mem.device_size - existing_size); + } + + void mem_free(device_memory &mem) + { + device_ptr key = mem.device_pointer; + size_t existing_size = mem.device_size; + + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + mem.device_pointer = sub.ptr_map[key]; + mem.device_size = existing_size; + + sub.device->mem_free(mem); + sub.ptr_map.erase(sub.ptr_map.find(key)); + } + + mem.device = this; + mem.device_pointer = 0; + mem.device_size = 0; + stats.mem_free(existing_size); + } + + void const_copy_to(const char *name, void *host, size_t size) + { + foreach (SubDevice &sub, devices) + sub.device->const_copy_to(name, host, size); + } + + void draw_pixels(device_memory &rgba, + int y, + int w, + int h, + int width, + int height, + int dx, + int dy, + int dw, + int dh, + bool transparent, + const DeviceDrawParams &draw_params) + { + device_ptr key = rgba.device_pointer; + int i = 0, sub_h = h / devices.size(); + int sub_height = height / devices.size(); + + foreach (SubDevice &sub, devices) { + int sy = y + i * sub_h; + int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; + int sheight = (i == (int)devices.size() - 1) ? 
height - sub_height * i : sub_height; + int sdy = dy + i * sub_height; + /* adjust math for w/width */ + + rgba.device_pointer = sub.ptr_map[key]; + sub.device->draw_pixels( + rgba, sy, w, sh, width, sheight, dx, sdy, dw, dh, transparent, draw_params); + i++; + } + + rgba.device_pointer = key; + } + + void map_tile(Device *sub_device, RenderTile &tile) + { + foreach (SubDevice &sub, devices) { + if (sub.device == sub_device) { + if (tile.buffer) + tile.buffer = sub.ptr_map[tile.buffer]; + } + } + } + + int device_number(Device *sub_device) + { + int i = 0; + + foreach (SubDevice &sub, devices) { + if (sub.device == sub_device) + return i; + i++; + } + + return -1; + } + + void map_neighbor_tiles(Device *sub_device, RenderTile *tiles) + { + for (int i = 0; i < 9; i++) { + if (!tiles[i].buffers) { + continue; + } + + /* If the tile was rendered on another device, copy its memory to + * to the current device now, for the duration of the denoising task. + * Note that this temporarily modifies the RenderBuffers and calls + * the device, so this function is not thread safe. */ + device_vector<float> &mem = tiles[i].buffers->buffer; + if (mem.device != sub_device) { + /* Only copy from device to host once. This is faster, but + * also required for the case where a CPU thread is denoising + * a tile rendered on the GPU. In that case we have to avoid + * overwriting the buffer being denoised by the CPU thread. */ + if (!tiles[i].buffers->map_neighbor_copied) { + tiles[i].buffers->map_neighbor_copied = true; + mem.copy_from_device(0, mem.data_size, 1); + } + + mem.swap_device(sub_device, 0, 0); + + mem.copy_to_device(); + tiles[i].buffer = mem.device_pointer; + tiles[i].device_size = mem.device_size; + + mem.restore_device(); + } + } + } + + void unmap_neighbor_tiles(Device *sub_device, RenderTile *tiles) + { + /* Copy denoised result back to the host. 
*/ + device_vector<float> &mem = tiles[9].buffers->buffer; + mem.swap_device(sub_device, tiles[9].device_size, tiles[9].buffer); + mem.copy_from_device(0, mem.data_size, 1); + mem.restore_device(); + /* Copy denoised result to the original device. */ + mem.copy_to_device(); + + for (int i = 0; i < 9; i++) { + if (!tiles[i].buffers) { + continue; + } + + device_vector<float> &mem = tiles[i].buffers->buffer; + if (mem.device != sub_device) { + mem.swap_device(sub_device, tiles[i].device_size, tiles[i].buffer); + sub_device->mem_free(mem); + mem.restore_device(); + } + } + } + + int get_split_task_count(DeviceTask &task) + { + int total_tasks = 0; + list<DeviceTask> tasks; + task.split(tasks, devices.size()); + foreach (SubDevice &sub, devices) { + if (!tasks.empty()) { + DeviceTask subtask = tasks.front(); + tasks.pop_front(); + + total_tasks += sub.device->get_split_task_count(subtask); + } + } + return total_tasks; + } + + void task_add(DeviceTask &task) + { + list<DeviceTask> tasks; + task.split(tasks, devices.size()); + + foreach (SubDevice &sub, devices) { + if (!tasks.empty()) { + DeviceTask subtask = tasks.front(); + tasks.pop_front(); + + if (task.buffer) + subtask.buffer = sub.ptr_map[task.buffer]; + if (task.rgba_byte) + subtask.rgba_byte = sub.ptr_map[task.rgba_byte]; + if (task.rgba_half) + subtask.rgba_half = sub.ptr_map[task.rgba_half]; + if (task.shader_input) + subtask.shader_input = sub.ptr_map[task.shader_input]; + if (task.shader_output) + subtask.shader_output = sub.ptr_map[task.shader_output]; + + sub.device->task_add(subtask); + } + } + } + + void task_wait() + { + foreach (SubDevice &sub, devices) + sub.device->task_wait(); + } + + void task_cancel() + { + foreach (SubDevice &sub, devices) + sub.device->task_cancel(); + } + + protected: + Stats sub_stats_; }; -Device *device_multi_create(DeviceInfo& info, Stats &stats, Profiler& profiler, bool background) +Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool 
background) { - return new MultiDevice(info, stats, profiler, background); + return new MultiDevice(info, stats, profiler, background); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp index 6736480e95a..80334ad8f22 100644 --- a/intern/cycles/device/device_network.cpp +++ b/intern/cycles/device/device_network.cpp @@ -33,767 +33,776 @@ typedef map<device_ptr, DataVector> DataMap; typedef vector<RenderTile> TileList; /* search a list of tiles and find the one that matches the passed render tile */ -static TileList::iterator tile_list_find(TileList& tile_list, RenderTile& tile) +static TileList::iterator tile_list_find(TileList &tile_list, RenderTile &tile) { - for(TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it) - if(tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample) - return it; - return tile_list.end(); + for (TileList::iterator it = tile_list.begin(); it != tile_list.end(); ++it) + if (tile.x == it->x && tile.y == it->y && tile.start_sample == it->start_sample) + return it; + return tile_list.end(); } -class NetworkDevice : public Device -{ -public: - boost::asio::io_service io_service; - tcp::socket socket; - device_ptr mem_counter; - DeviceTask the_task; /* todo: handle multiple tasks */ - - thread_mutex rpc_lock; - - virtual bool show_samples() const - { - return false; - } - - NetworkDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, const char *address) - : Device(info, stats, profiler, true), socket(io_service) - { - error_func = NetworkError(); - stringstream portstr; - portstr << SERVER_PORT; - - tcp::resolver resolver(io_service); - tcp::resolver::query query(address, portstr.str()); - tcp::resolver::iterator endpoint_iterator = resolver.resolve(query); - tcp::resolver::iterator end; +class NetworkDevice : public Device { + public: + boost::asio::io_service io_service; + tcp::socket socket; + device_ptr mem_counter; + DeviceTask 
the_task; /* todo: handle multiple tasks */ + + thread_mutex rpc_lock; + + virtual bool show_samples() const + { + return false; + } + + NetworkDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, const char *address) + : Device(info, stats, profiler, true), socket(io_service) + { + error_func = NetworkError(); + stringstream portstr; + portstr << SERVER_PORT; + + tcp::resolver resolver(io_service); + tcp::resolver::query query(address, portstr.str()); + tcp::resolver::iterator endpoint_iterator = resolver.resolve(query); + tcp::resolver::iterator end; + + boost::system::error_code error = boost::asio::error::host_not_found; + while (error && endpoint_iterator != end) { + socket.close(); + socket.connect(*endpoint_iterator++, error); + } - boost::system::error_code error = boost::asio::error::host_not_found; - while(error && endpoint_iterator != end) - { - socket.close(); - socket.connect(*endpoint_iterator++, error); - } - - if(error) - error_func.network_error(error.message()); + if (error) + error_func.network_error(error.message()); - mem_counter = 0; - } + mem_counter = 0; + } - ~NetworkDevice() - { - RPCSend snd(socket, &error_func, "stop"); - snd.write(); - } - - virtual BVHLayoutMask get_bvh_layout_mask() const { - return BVH_LAYOUT_BVH2; - } - - void mem_alloc(device_memory& mem) - { - if(mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. 
(" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - thread_scoped_lock lock(rpc_lock); + ~NetworkDevice() + { + RPCSend snd(socket, &error_func, "stop"); + snd.write(); + } - mem.device_pointer = ++mem_counter; + virtual BVHLayoutMask get_bvh_layout_mask() const + { + return BVH_LAYOUT_BVH2; + } - RPCSend snd(socket, &error_func, "mem_alloc"); - snd.add(mem); - snd.write(); - } + void mem_alloc(device_memory &mem) + { + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } - void mem_copy_to(device_memory& mem) - { - thread_scoped_lock lock(rpc_lock); + thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "mem_copy_to"); + mem.device_pointer = ++mem_counter; - snd.add(mem); - snd.write(); - snd.write_buffer(mem.host_pointer, mem.memory_size()); - } + RPCSend snd(socket, &error_func, "mem_alloc"); + snd.add(mem); + snd.write(); + } - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem) - { - thread_scoped_lock lock(rpc_lock); + void mem_copy_to(device_memory &mem) + { + thread_scoped_lock lock(rpc_lock); - size_t data_size = mem.memory_size(); + RPCSend snd(socket, &error_func, "mem_copy_to"); - RPCSend snd(socket, &error_func, "mem_copy_from"); + snd.add(mem); + snd.write(); + snd.write_buffer(mem.host_pointer, mem.memory_size()); + } - snd.add(mem); - snd.add(y); - snd.add(w); - snd.add(h); - snd.add(elem); - snd.write(); + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) + { + thread_scoped_lock lock(rpc_lock); - RPCReceive rcv(socket, &error_func); - rcv.read_buffer(mem.host_pointer, data_size); - } + size_t data_size = mem.memory_size(); - void mem_zero(device_memory& mem) - { - thread_scoped_lock lock(rpc_lock); + RPCSend snd(socket, &error_func, "mem_copy_from"); - RPCSend snd(socket, &error_func, "mem_zero"); + snd.add(mem); + snd.add(y); + 
snd.add(w); + snd.add(h); + snd.add(elem); + snd.write(); - snd.add(mem); - snd.write(); - } + RPCReceive rcv(socket, &error_func); + rcv.read_buffer(mem.host_pointer, data_size); + } - void mem_free(device_memory& mem) - { - if(mem.device_pointer) { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "mem_free"); - - snd.add(mem); - snd.write(); - - mem.device_pointer = 0; - } - } + void mem_zero(device_memory &mem) + { + thread_scoped_lock lock(rpc_lock); - void const_copy_to(const char *name, void *host, size_t size) - { - thread_scoped_lock lock(rpc_lock); + RPCSend snd(socket, &error_func, "mem_zero"); - RPCSend snd(socket, &error_func, "const_copy_to"); + snd.add(mem); + snd.write(); + } - string name_string(name); - - snd.add(name_string); - snd.add(size); - snd.write(); - snd.write_buffer(host, size); - } - - bool load_kernels(const DeviceRequestedFeatures& requested_features) - { - if(error_func.have_error()) - return false; - - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "load_kernels"); - snd.add(requested_features.experimental); - snd.add(requested_features.max_closure); - snd.add(requested_features.max_nodes_group); - snd.add(requested_features.nodes_features); - snd.write(); - - bool result; - RPCReceive rcv(socket, &error_func); - rcv.read(result); - - return result; - } - - void task_add(DeviceTask& task) - { - thread_scoped_lock lock(rpc_lock); - - the_task = task; - - RPCSend snd(socket, &error_func, "task_add"); - snd.add(task); - snd.write(); - } - - void task_wait() - { - thread_scoped_lock lock(rpc_lock); - - RPCSend snd(socket, &error_func, "task_wait"); - snd.write(); - - lock.unlock(); - - TileList the_tiles; - - /* todo: run this threaded for connecting to multiple clients */ - for(;;) { - if(error_func.have_error()) - break; - - RenderTile tile; - - lock.lock(); - RPCReceive rcv(socket, &error_func); - - if(rcv.name == "acquire_tile") { - lock.unlock(); - - /* todo: watch out for recursive 
calls! */ - if(the_task.acquire_tile(this, tile)) { /* write return as bool */ - the_tiles.push_back(tile); - - lock.lock(); - RPCSend snd(socket, &error_func, "acquire_tile"); - snd.add(tile); - snd.write(); - lock.unlock(); - } - else { - lock.lock(); - RPCSend snd(socket, &error_func, "acquire_tile_none"); - snd.write(); - lock.unlock(); - } - } - else if(rcv.name == "release_tile") { - rcv.read(tile); - lock.unlock(); - - TileList::iterator it = tile_list_find(the_tiles, tile); - if(it != the_tiles.end()) { - tile.buffers = it->buffers; - the_tiles.erase(it); - } - - assert(tile.buffers != NULL); - - the_task.release_tile(tile); - - lock.lock(); - RPCSend snd(socket, &error_func, "release_tile"); - snd.write(); - lock.unlock(); - } - else if(rcv.name == "task_wait_done") { - lock.unlock(); - break; - } - else - lock.unlock(); - } - } - - void task_cancel() - { - thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "task_cancel"); - snd.write(); - } - - int get_split_task_count(DeviceTask&) - { - return 1; - } - -private: - NetworkError error_func; + void mem_free(device_memory &mem) + { + if (mem.device_pointer) { + thread_scoped_lock lock(rpc_lock); + + RPCSend snd(socket, &error_func, "mem_free"); + + snd.add(mem); + snd.write(); + + mem.device_pointer = 0; + } + } + + void const_copy_to(const char *name, void *host, size_t size) + { + thread_scoped_lock lock(rpc_lock); + + RPCSend snd(socket, &error_func, "const_copy_to"); + + string name_string(name); + + snd.add(name_string); + snd.add(size); + snd.write(); + snd.write_buffer(host, size); + } + + bool load_kernels(const DeviceRequestedFeatures &requested_features) + { + if (error_func.have_error()) + return false; + + thread_scoped_lock lock(rpc_lock); + + RPCSend snd(socket, &error_func, "load_kernels"); + snd.add(requested_features.experimental); + snd.add(requested_features.max_closure); + snd.add(requested_features.max_nodes_group); + snd.add(requested_features.nodes_features); + 
snd.write(); + + bool result; + RPCReceive rcv(socket, &error_func); + rcv.read(result); + + return result; + } + + void task_add(DeviceTask &task) + { + thread_scoped_lock lock(rpc_lock); + + the_task = task; + + RPCSend snd(socket, &error_func, "task_add"); + snd.add(task); + snd.write(); + } + + void task_wait() + { + thread_scoped_lock lock(rpc_lock); + + RPCSend snd(socket, &error_func, "task_wait"); + snd.write(); + + lock.unlock(); + + TileList the_tiles; + + /* todo: run this threaded for connecting to multiple clients */ + for (;;) { + if (error_func.have_error()) + break; + + RenderTile tile; + + lock.lock(); + RPCReceive rcv(socket, &error_func); + + if (rcv.name == "acquire_tile") { + lock.unlock(); + + /* todo: watch out for recursive calls! */ + if (the_task.acquire_tile(this, tile)) { /* write return as bool */ + the_tiles.push_back(tile); + + lock.lock(); + RPCSend snd(socket, &error_func, "acquire_tile"); + snd.add(tile); + snd.write(); + lock.unlock(); + } + else { + lock.lock(); + RPCSend snd(socket, &error_func, "acquire_tile_none"); + snd.write(); + lock.unlock(); + } + } + else if (rcv.name == "release_tile") { + rcv.read(tile); + lock.unlock(); + + TileList::iterator it = tile_list_find(the_tiles, tile); + if (it != the_tiles.end()) { + tile.buffers = it->buffers; + the_tiles.erase(it); + } + + assert(tile.buffers != NULL); + + the_task.release_tile(tile); + + lock.lock(); + RPCSend snd(socket, &error_func, "release_tile"); + snd.write(); + lock.unlock(); + } + else if (rcv.name == "task_wait_done") { + lock.unlock(); + break; + } + else + lock.unlock(); + } + } + + void task_cancel() + { + thread_scoped_lock lock(rpc_lock); + RPCSend snd(socket, &error_func, "task_cancel"); + snd.write(); + } + + int get_split_task_count(DeviceTask &) + { + return 1; + } + + private: + NetworkError error_func; }; -Device *device_network_create(DeviceInfo& info, Stats &stats, Profiler &profiler, const char *address) +Device *device_network_create(DeviceInfo 
&info, + Stats &stats, + Profiler &profiler, + const char *address) { - return new NetworkDevice(info, stats, profiler, address); + return new NetworkDevice(info, stats, profiler, address); } -void device_network_info(vector<DeviceInfo>& devices) +void device_network_info(vector<DeviceInfo> &devices) { - DeviceInfo info; + DeviceInfo info; - info.type = DEVICE_NETWORK; - info.description = "Network Device"; - info.id = "NETWORK"; - info.num = 0; + info.type = DEVICE_NETWORK; + info.description = "Network Device"; + info.id = "NETWORK"; + info.num = 0; - /* todo: get this info from device */ - info.has_volume_decoupled = false; - info.has_osl = false; + /* todo: get this info from device */ + info.has_volume_decoupled = false; + info.has_osl = false; - devices.push_back(info); + devices.push_back(info); } class DeviceServer { -public: - thread_mutex rpc_lock; - - void network_error(const string &message) { - error_func.network_error(message); - } - - bool have_error() { return error_func.have_error(); } - - DeviceServer(Device *device_, tcp::socket& socket_) - : device(device_), socket(socket_), stop(false), blocked_waiting(false) - { - error_func = NetworkError(); - } - - void listen() - { - /* receive remote function calls */ - for(;;) { - listen_step(); - - if(stop) - break; - } - } - -protected: - void listen_step() - { - thread_scoped_lock lock(rpc_lock); - RPCReceive rcv(socket, &error_func); - - if(rcv.name == "stop") - stop = true; - else - process(rcv, lock); - } - - /* create a memory buffer for a device buffer and insert it into mem_data */ - DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size) - { - /* create a new DataVector and insert it into mem_data */ - pair<DataMap::iterator,bool> data_ins = mem_data.insert( - DataMap::value_type(client_pointer, DataVector())); - - /* make sure it was a unique insertion */ - assert(data_ins.second); - - /* get a reference to the inserted vector */ - DataVector &data_v = 
data_ins.first->second; - - /* size the vector */ - data_v.resize(data_size); - - return data_v; - } - - DataVector &data_vector_find(device_ptr client_pointer) - { - DataMap::iterator i = mem_data.find(client_pointer); - assert(i != mem_data.end()); - return i->second; - } - - /* setup mapping and reverse mapping of client_pointer<->real_pointer */ - void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer) - { - pair<PtrMap::iterator,bool> mapins; - - /* insert mapping from client pointer to our real device pointer */ - mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer)); - assert(mapins.second); - - /* insert reverse mapping from real our device pointer to client pointer */ - mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer)); - assert(mapins.second); - } - - device_ptr device_ptr_from_client_pointer(device_ptr client_pointer) - { - PtrMap::iterator i = ptr_map.find(client_pointer); - assert(i != ptr_map.end()); - return i->second; - } - - device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer) - { - PtrMap::iterator i = ptr_map.find(client_pointer); - assert(i != ptr_map.end()); - - device_ptr result = i->second; - - /* erase the mapping */ - ptr_map.erase(i); - - /* erase the reverse mapping */ - PtrMap::iterator irev = ptr_imap.find(result); - assert(irev != ptr_imap.end()); - ptr_imap.erase(irev); - - /* erase the data vector */ - DataMap::iterator idata = mem_data.find(client_pointer); - assert(idata != mem_data.end()); - mem_data.erase(idata); - - return result; - } - - /* note that the lock must be already acquired upon entry. - * This is necessary because the caller often peeks at - * the header and delegates control to here when it doesn't - * specifically handle the current RPC. 
- * The lock must be unlocked before returning */ - void process(RPCReceive& rcv, thread_scoped_lock &lock) - { - if(rcv.name == "mem_alloc") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - /* Allocate host side data buffer. */ - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (data_size)? (void*)&(data_v[0]): 0; - - /* Perform the allocation on the actual device. */ - device->mem_alloc(mem); - - /* Store a mapping to/from client_pointer and real device pointer. */ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - else if(rcv.name == "mem_copy_to") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - if(client_pointer) { - /* Lookup existing host side data buffer. */ - DataVector &data_v = data_vector_find(client_pointer); - mem.host_pointer = (void*)&data_v[0]; - - /* Translate the client pointer to a real device pointer. */ - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - } - else { - /* Allocate host side data buffer. */ - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (data_size)? (void*)&(data_v[0]): 0; - } - - /* Copy data from network into memory buffer. */ - rcv.read_buffer((uint8_t*)mem.host_pointer, data_size); - - /* Copy the data from the memory buffer to the device buffer. */ - device->mem_copy_to(mem); - - if(!client_pointer) { - /* Store a mapping to/from client_pointer and real device pointer. 
*/ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - } - else if(rcv.name == "mem_copy_from") { - string name; - network_device_memory mem(device); - int y, w, h, elem; - - rcv.read(mem, name); - rcv.read(y); - rcv.read(w); - rcv.read(h); - rcv.read(elem); - - device_ptr client_pointer = mem.device_pointer; - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - - DataVector &data_v = data_vector_find(client_pointer); - - mem.host_pointer = (device_ptr)&(data_v[0]); - - device->mem_copy_from(mem, y, w, h, elem); - - size_t data_size = mem.memory_size(); - - RPCSend snd(socket, &error_func, "mem_copy_from"); - snd.write(); - snd.write_buffer((uint8_t*)mem.host_pointer, data_size); - lock.unlock(); - } - else if(rcv.name == "mem_zero") { - string name; - network_device_memory mem(device); - rcv.read(mem, name); - lock.unlock(); - - size_t data_size = mem.memory_size(); - device_ptr client_pointer = mem.device_pointer; - - if(client_pointer) { - /* Lookup existing host side data buffer. */ - DataVector &data_v = data_vector_find(client_pointer); - mem.host_pointer = (void*)&data_v[0]; - - /* Translate the client pointer to a real device pointer. */ - mem.device_pointer = device_ptr_from_client_pointer(client_pointer); - } - else { - /* Allocate host side data buffer. */ - DataVector &data_v = data_vector_insert(client_pointer, data_size); - mem.host_pointer = (void*)? (device_ptr)&(data_v[0]): 0; - } - - /* Zero memory. */ - device->mem_zero(mem); - - if(!client_pointer) { - /* Store a mapping to/from client_pointer and real device pointer. 
*/ - pointer_mapping_insert(client_pointer, mem.device_pointer); - } - } - else if(rcv.name == "mem_free") { - string name; - network_device_memory mem(device); - - rcv.read(mem, name); - lock.unlock(); - - device_ptr client_pointer = mem.device_pointer; - - mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer); - - device->mem_free(mem); - } - else if(rcv.name == "const_copy_to") { - string name_string; - size_t size; - - rcv.read(name_string); - rcv.read(size); - - vector<char> host_vector(size); - rcv.read_buffer(&host_vector[0], size); - lock.unlock(); - - device->const_copy_to(name_string.c_str(), &host_vector[0], size); - } - else if(rcv.name == "load_kernels") { - DeviceRequestedFeatures requested_features; - rcv.read(requested_features.experimental); - rcv.read(requested_features.max_closure); - rcv.read(requested_features.max_nodes_group); - rcv.read(requested_features.nodes_features); - - bool result; - result = device->load_kernels(requested_features); - RPCSend snd(socket, &error_func, "load_kernels"); - snd.add(result); - snd.write(); - lock.unlock(); - } - else if(rcv.name == "task_add") { - DeviceTask task; - - rcv.read(task); - lock.unlock(); - - if(task.buffer) - task.buffer = device_ptr_from_client_pointer(task.buffer); - - if(task.rgba_half) - task.rgba_half = device_ptr_from_client_pointer(task.rgba_half); - - if(task.rgba_byte) - task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte); - - if(task.shader_input) - task.shader_input = device_ptr_from_client_pointer(task.shader_input); - - if(task.shader_output) - task.shader_output = device_ptr_from_client_pointer(task.shader_output); - - task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2); - task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1); - task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample, this); - task.update_tile_sample = 
function_bind(&DeviceServer::task_update_tile_sample, this, _1); - task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this); - - device->task_add(task); - } - else if(rcv.name == "task_wait") { - lock.unlock(); - - blocked_waiting = true; - device->task_wait(); - blocked_waiting = false; - - lock.lock(); - RPCSend snd(socket, &error_func, "task_wait_done"); - snd.write(); - lock.unlock(); - } - else if(rcv.name == "task_cancel") { - lock.unlock(); - device->task_cancel(); - } - else if(rcv.name == "acquire_tile") { - AcquireEntry entry; - entry.name = rcv.name; - rcv.read(entry.tile); - acquire_queue.push_back(entry); - lock.unlock(); - } - else if(rcv.name == "acquire_tile_none") { - AcquireEntry entry; - entry.name = rcv.name; - acquire_queue.push_back(entry); - lock.unlock(); - } - else if(rcv.name == "release_tile") { - AcquireEntry entry; - entry.name = rcv.name; - acquire_queue.push_back(entry); - lock.unlock(); - } - else { - cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n"; - lock.unlock(); - } - } - - bool task_acquire_tile(Device *, RenderTile& tile) - { - thread_scoped_lock acquire_lock(acquire_mutex); - - bool result = false; - - RPCSend snd(socket, &error_func, "acquire_tile"); - snd.write(); - - do { - if(blocked_waiting) - listen_step(); - - /* todo: avoid busy wait loop */ - thread_scoped_lock lock(rpc_lock); - - if(!acquire_queue.empty()) { - AcquireEntry entry = acquire_queue.front(); - acquire_queue.pop_front(); - - if(entry.name == "acquire_tile") { - tile = entry.tile; - - if(tile.buffer) tile.buffer = ptr_map[tile.buffer]; - - result = true; - break; - } - else if(entry.name == "acquire_tile_none") { - break; - } - else { - cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n"; - } - } - } while(acquire_queue.empty() && !stop && !have_error()); - - return result; - } - - void task_update_progress_sample() - { - ; /* skip */ - } - - void task_update_tile_sample(RenderTile&) - { - ; /* skip 
*/ - } - - void task_release_tile(RenderTile& tile) - { - thread_scoped_lock acquire_lock(acquire_mutex); - - if(tile.buffer) tile.buffer = ptr_imap[tile.buffer]; - - { - thread_scoped_lock lock(rpc_lock); - RPCSend snd(socket, &error_func, "release_tile"); - snd.add(tile); - snd.write(); - lock.unlock(); - } - - do { - if(blocked_waiting) - listen_step(); - - /* todo: avoid busy wait loop */ - thread_scoped_lock lock(rpc_lock); - - if(!acquire_queue.empty()) { - AcquireEntry entry = acquire_queue.front(); - acquire_queue.pop_front(); - - if(entry.name == "release_tile") { - lock.unlock(); - break; - } - else { - cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n"; - } - } - } while(acquire_queue.empty() && !stop); - } - - bool task_get_cancel() - { - return false; - } - - /* properties */ - Device *device; - tcp::socket& socket; - - /* mapping of remote to local pointer */ - PtrMap ptr_map; - PtrMap ptr_imap; - DataMap mem_data; - - struct AcquireEntry { - string name; - RenderTile tile; - }; - - thread_mutex acquire_mutex; - list<AcquireEntry> acquire_queue; - - bool stop; - bool blocked_waiting; -private: - NetworkError error_func; - - /* todo: free memory and device (osl) on network error */ - + public: + thread_mutex rpc_lock; + + void network_error(const string &message) + { + error_func.network_error(message); + } + + bool have_error() + { + return error_func.have_error(); + } + + DeviceServer(Device *device_, tcp::socket &socket_) + : device(device_), socket(socket_), stop(false), blocked_waiting(false) + { + error_func = NetworkError(); + } + + void listen() + { + /* receive remote function calls */ + for (;;) { + listen_step(); + + if (stop) + break; + } + } + + protected: + void listen_step() + { + thread_scoped_lock lock(rpc_lock); + RPCReceive rcv(socket, &error_func); + + if (rcv.name == "stop") + stop = true; + else + process(rcv, lock); + } + + /* create a memory buffer for a device buffer and insert it into mem_data */ + 
DataVector &data_vector_insert(device_ptr client_pointer, size_t data_size) + { + /* create a new DataVector and insert it into mem_data */ + pair<DataMap::iterator, bool> data_ins = mem_data.insert( + DataMap::value_type(client_pointer, DataVector())); + + /* make sure it was a unique insertion */ + assert(data_ins.second); + + /* get a reference to the inserted vector */ + DataVector &data_v = data_ins.first->second; + + /* size the vector */ + data_v.resize(data_size); + + return data_v; + } + + DataVector &data_vector_find(device_ptr client_pointer) + { + DataMap::iterator i = mem_data.find(client_pointer); + assert(i != mem_data.end()); + return i->second; + } + + /* setup mapping and reverse mapping of client_pointer<->real_pointer */ + void pointer_mapping_insert(device_ptr client_pointer, device_ptr real_pointer) + { + pair<PtrMap::iterator, bool> mapins; + + /* insert mapping from client pointer to our real device pointer */ + mapins = ptr_map.insert(PtrMap::value_type(client_pointer, real_pointer)); + assert(mapins.second); + + /* insert reverse mapping from real our device pointer to client pointer */ + mapins = ptr_imap.insert(PtrMap::value_type(real_pointer, client_pointer)); + assert(mapins.second); + } + + device_ptr device_ptr_from_client_pointer(device_ptr client_pointer) + { + PtrMap::iterator i = ptr_map.find(client_pointer); + assert(i != ptr_map.end()); + return i->second; + } + + device_ptr device_ptr_from_client_pointer_erase(device_ptr client_pointer) + { + PtrMap::iterator i = ptr_map.find(client_pointer); + assert(i != ptr_map.end()); + + device_ptr result = i->second; + + /* erase the mapping */ + ptr_map.erase(i); + + /* erase the reverse mapping */ + PtrMap::iterator irev = ptr_imap.find(result); + assert(irev != ptr_imap.end()); + ptr_imap.erase(irev); + + /* erase the data vector */ + DataMap::iterator idata = mem_data.find(client_pointer); + assert(idata != mem_data.end()); + mem_data.erase(idata); + + return result; + } + + /* note 
that the lock must be already acquired upon entry. + * This is necessary because the caller often peeks at + * the header and delegates control to here when it doesn't + * specifically handle the current RPC. + * The lock must be unlocked before returning */ + void process(RPCReceive &rcv, thread_scoped_lock &lock) + { + if (rcv.name == "mem_alloc") { + string name; + network_device_memory mem(device); + rcv.read(mem, name); + lock.unlock(); + + /* Allocate host side data buffer. */ + size_t data_size = mem.memory_size(); + device_ptr client_pointer = mem.device_pointer; + + DataVector &data_v = data_vector_insert(client_pointer, data_size); + mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0; + + /* Perform the allocation on the actual device. */ + device->mem_alloc(mem); + + /* Store a mapping to/from client_pointer and real device pointer. */ + pointer_mapping_insert(client_pointer, mem.device_pointer); + } + else if (rcv.name == "mem_copy_to") { + string name; + network_device_memory mem(device); + rcv.read(mem, name); + lock.unlock(); + + size_t data_size = mem.memory_size(); + device_ptr client_pointer = mem.device_pointer; + + if (client_pointer) { + /* Lookup existing host side data buffer. */ + DataVector &data_v = data_vector_find(client_pointer); + mem.host_pointer = (void *)&data_v[0]; + + /* Translate the client pointer to a real device pointer. */ + mem.device_pointer = device_ptr_from_client_pointer(client_pointer); + } + else { + /* Allocate host side data buffer. */ + DataVector &data_v = data_vector_insert(client_pointer, data_size); + mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0; + } + + /* Copy data from network into memory buffer. */ + rcv.read_buffer((uint8_t *)mem.host_pointer, data_size); + + /* Copy the data from the memory buffer to the device buffer. */ + device->mem_copy_to(mem); + + if (!client_pointer) { + /* Store a mapping to/from client_pointer and real device pointer. 
*/
+        pointer_mapping_insert(client_pointer, mem.device_pointer);
+      }
+    }
+    else if (rcv.name == "mem_copy_from") {
+      string name;
+      network_device_memory mem(device);
+      int y, w, h, elem;
+
+      rcv.read(mem, name);
+      rcv.read(y);
+      rcv.read(w);
+      rcv.read(h);
+      rcv.read(elem);
+
+      device_ptr client_pointer = mem.device_pointer;
+      mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
+
+      DataVector &data_v = data_vector_find(client_pointer);
+
+      mem.host_pointer = (device_ptr) & (data_v[0]);
+
+      device->mem_copy_from(mem, y, w, h, elem);
+
+      size_t data_size = mem.memory_size();
+
+      RPCSend snd(socket, &error_func, "mem_copy_from");
+      snd.write();
+      snd.write_buffer((uint8_t *)mem.host_pointer, data_size);
+      lock.unlock();
+    }
+    else if (rcv.name == "mem_zero") {
+      string name;
+      network_device_memory mem(device);
+      rcv.read(mem, name);
+      lock.unlock();
+
+      size_t data_size = mem.memory_size();
+      device_ptr client_pointer = mem.device_pointer;
+
+      if (client_pointer) {
+        /* Lookup existing host side data buffer. */
+        DataVector &data_v = data_vector_find(client_pointer);
+        mem.host_pointer = (void *)&data_v[0];
+
+        /* Translate the client pointer to a real device pointer. */
+        mem.device_pointer = device_ptr_from_client_pointer(client_pointer);
+      }
+      else {
+        /* Allocate host side data buffer. */
+        DataVector &data_v = data_vector_insert(client_pointer, data_size);
+        mem.host_pointer = (data_size) ? (void *)&(data_v[0]) : 0;
+      }
+
+      /* Zero memory. */
+      device->mem_zero(mem);
+
+      if (!client_pointer) {
+        /* Store a mapping to/from client_pointer and real device pointer.
*/ + pointer_mapping_insert(client_pointer, mem.device_pointer); + } + } + else if (rcv.name == "mem_free") { + string name; + network_device_memory mem(device); + + rcv.read(mem, name); + lock.unlock(); + + device_ptr client_pointer = mem.device_pointer; + + mem.device_pointer = device_ptr_from_client_pointer_erase(client_pointer); + + device->mem_free(mem); + } + else if (rcv.name == "const_copy_to") { + string name_string; + size_t size; + + rcv.read(name_string); + rcv.read(size); + + vector<char> host_vector(size); + rcv.read_buffer(&host_vector[0], size); + lock.unlock(); + + device->const_copy_to(name_string.c_str(), &host_vector[0], size); + } + else if (rcv.name == "load_kernels") { + DeviceRequestedFeatures requested_features; + rcv.read(requested_features.experimental); + rcv.read(requested_features.max_closure); + rcv.read(requested_features.max_nodes_group); + rcv.read(requested_features.nodes_features); + + bool result; + result = device->load_kernels(requested_features); + RPCSend snd(socket, &error_func, "load_kernels"); + snd.add(result); + snd.write(); + lock.unlock(); + } + else if (rcv.name == "task_add") { + DeviceTask task; + + rcv.read(task); + lock.unlock(); + + if (task.buffer) + task.buffer = device_ptr_from_client_pointer(task.buffer); + + if (task.rgba_half) + task.rgba_half = device_ptr_from_client_pointer(task.rgba_half); + + if (task.rgba_byte) + task.rgba_byte = device_ptr_from_client_pointer(task.rgba_byte); + + if (task.shader_input) + task.shader_input = device_ptr_from_client_pointer(task.shader_input); + + if (task.shader_output) + task.shader_output = device_ptr_from_client_pointer(task.shader_output); + + task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2); + task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1); + task.update_progress_sample = function_bind(&DeviceServer::task_update_progress_sample, + this); + task.update_tile_sample = 
function_bind(&DeviceServer::task_update_tile_sample, this, _1); + task.get_cancel = function_bind(&DeviceServer::task_get_cancel, this); + + device->task_add(task); + } + else if (rcv.name == "task_wait") { + lock.unlock(); + + blocked_waiting = true; + device->task_wait(); + blocked_waiting = false; + + lock.lock(); + RPCSend snd(socket, &error_func, "task_wait_done"); + snd.write(); + lock.unlock(); + } + else if (rcv.name == "task_cancel") { + lock.unlock(); + device->task_cancel(); + } + else if (rcv.name == "acquire_tile") { + AcquireEntry entry; + entry.name = rcv.name; + rcv.read(entry.tile); + acquire_queue.push_back(entry); + lock.unlock(); + } + else if (rcv.name == "acquire_tile_none") { + AcquireEntry entry; + entry.name = rcv.name; + acquire_queue.push_back(entry); + lock.unlock(); + } + else if (rcv.name == "release_tile") { + AcquireEntry entry; + entry.name = rcv.name; + acquire_queue.push_back(entry); + lock.unlock(); + } + else { + cout << "Error: unexpected RPC receive call \"" + rcv.name + "\"\n"; + lock.unlock(); + } + } + + bool task_acquire_tile(Device *, RenderTile &tile) + { + thread_scoped_lock acquire_lock(acquire_mutex); + + bool result = false; + + RPCSend snd(socket, &error_func, "acquire_tile"); + snd.write(); + + do { + if (blocked_waiting) + listen_step(); + + /* todo: avoid busy wait loop */ + thread_scoped_lock lock(rpc_lock); + + if (!acquire_queue.empty()) { + AcquireEntry entry = acquire_queue.front(); + acquire_queue.pop_front(); + + if (entry.name == "acquire_tile") { + tile = entry.tile; + + if (tile.buffer) + tile.buffer = ptr_map[tile.buffer]; + + result = true; + break; + } + else if (entry.name == "acquire_tile_none") { + break; + } + else { + cout << "Error: unexpected acquire RPC receive call \"" + entry.name + "\"\n"; + } + } + } while (acquire_queue.empty() && !stop && !have_error()); + + return result; + } + + void task_update_progress_sample() + { + ; /* skip */ + } + + void task_update_tile_sample(RenderTile &) + 
{ + ; /* skip */ + } + + void task_release_tile(RenderTile &tile) + { + thread_scoped_lock acquire_lock(acquire_mutex); + + if (tile.buffer) + tile.buffer = ptr_imap[tile.buffer]; + + { + thread_scoped_lock lock(rpc_lock); + RPCSend snd(socket, &error_func, "release_tile"); + snd.add(tile); + snd.write(); + lock.unlock(); + } + + do { + if (blocked_waiting) + listen_step(); + + /* todo: avoid busy wait loop */ + thread_scoped_lock lock(rpc_lock); + + if (!acquire_queue.empty()) { + AcquireEntry entry = acquire_queue.front(); + acquire_queue.pop_front(); + + if (entry.name == "release_tile") { + lock.unlock(); + break; + } + else { + cout << "Error: unexpected release RPC receive call \"" + entry.name + "\"\n"; + } + } + } while (acquire_queue.empty() && !stop); + } + + bool task_get_cancel() + { + return false; + } + + /* properties */ + Device *device; + tcp::socket &socket; + + /* mapping of remote to local pointer */ + PtrMap ptr_map; + PtrMap ptr_imap; + DataMap mem_data; + + struct AcquireEntry { + string name; + RenderTile tile; + }; + + thread_mutex acquire_mutex; + list<AcquireEntry> acquire_queue; + + bool stop; + bool blocked_waiting; + + private: + NetworkError error_func; + + /* todo: free memory and device (osl) on network error */ }; void Device::server_run() { - try { - /* starts thread that responds to discovery requests */ - ServerDiscovery discovery; - - for(;;) { - /* accept connection */ - boost::asio::io_service io_service; - tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT)); - - tcp::socket socket(io_service); - acceptor.accept(socket); - - string remote_address = socket.remote_endpoint().address().to_string(); - printf("Connected to remote client at: %s\n", remote_address.c_str()); - - DeviceServer server(this, socket); - server.listen(); - - printf("Disconnected.\n"); - } - } - catch(exception& e) { - fprintf(stderr, "Network server exception: %s\n", e.what()); - } + try { + /* starts thread that responds to discovery 
requests */ + ServerDiscovery discovery; + + for (;;) { + /* accept connection */ + boost::asio::io_service io_service; + tcp::acceptor acceptor(io_service, tcp::endpoint(tcp::v4(), SERVER_PORT)); + + tcp::socket socket(io_service); + acceptor.accept(socket); + + string remote_address = socket.remote_endpoint().address().to_string(); + printf("Connected to remote client at: %s\n", remote_address.c_str()); + + DeviceServer server(this, socket); + server.listen(); + + printf("Disconnected.\n"); + } + } + catch (exception &e) { + fprintf(stderr, "Network server exception: %s\n", e.what()); + } } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h index 67626ae177f..5b69b815cc6 100644 --- a/intern/cycles/device/device_network.h +++ b/intern/cycles/device/device_network.h @@ -19,35 +19,35 @@ #ifdef WITH_NETWORK -#include <boost/archive/text_iarchive.hpp> -#include <boost/archive/text_oarchive.hpp> -#include <boost/archive/binary_iarchive.hpp> -#include <boost/archive/binary_oarchive.hpp> -#include <boost/array.hpp> -#include <boost/asio.hpp> -#include <boost/bind.hpp> -#include <boost/serialization/vector.hpp> -#include <boost/thread.hpp> - -#include <iostream> -#include <sstream> -#include <deque> - -#include "render/buffers.h" - -#include "util/util_foreach.h" -#include "util/util_list.h" -#include "util/util_map.h" -#include "util/util_param.h" -#include "util/util_string.h" +# include <boost/archive/text_iarchive.hpp> +# include <boost/archive/text_oarchive.hpp> +# include <boost/archive/binary_iarchive.hpp> +# include <boost/archive/binary_oarchive.hpp> +# include <boost/array.hpp> +# include <boost/asio.hpp> +# include <boost/bind.hpp> +# include <boost/serialization/vector.hpp> +# include <boost/thread.hpp> + +# include <iostream> +# include <sstream> +# include <deque> + +# include "render/buffers.h" + +# include "util/util_foreach.h" +# include "util/util_list.h" +# include "util/util_map.h" +# include 
"util/util_param.h" +# include "util/util_string.h" CCL_NAMESPACE_BEGIN -using std::cout; using std::cerr; +using std::cout; +using std::exception; using std::hex; using std::setw; -using std::exception; using boost::asio::ip::tcp; @@ -56,436 +56,435 @@ static const int DISCOVER_PORT = 5121; static const string DISCOVER_REQUEST_MSG = "REQUEST_RENDER_SERVER_IP"; static const string DISCOVER_REPLY_MSG = "REPLY_RENDER_SERVER_IP"; -#if 0 +# if 0 typedef boost::archive::text_oarchive o_archive; typedef boost::archive::text_iarchive i_archive; -#else +# else typedef boost::archive::binary_oarchive o_archive; typedef boost::archive::binary_iarchive i_archive; -#endif +# endif /* Serialization of device memory */ -class network_device_memory : public device_memory -{ -public: - network_device_memory(Device *device) - : device_memory(device, "", MEM_READ_ONLY) - { - } +class network_device_memory : public device_memory { + public: + network_device_memory(Device *device) : device_memory(device, "", MEM_READ_ONLY) + { + } - ~network_device_memory() - { - device_pointer = 0; - }; + ~network_device_memory() + { + device_pointer = 0; + }; - vector<char> local_data; + vector<char> local_data; }; /* Common netowrk error function / object for both DeviceNetwork and DeviceServer*/ class NetworkError { -public: - NetworkError() { - error = ""; - error_count = 0; - } - - ~NetworkError() {} - - void network_error(const string& message) { - error = message; - error_count += 1; - } - - bool have_error() { - return true ? error_count > 0 : false; - } - -private: - string error; - int error_count; + public: + NetworkError() + { + error = ""; + error_count = 0; + } + + ~NetworkError() + { + } + + void network_error(const string &message) + { + error = message; + error_count += 1; + } + + bool have_error() + { + return true ? 
error_count > 0 : false; + } + + private: + string error; + int error_count; }; - /* Remote procedure call Send */ class RPCSend { -public: - RPCSend(tcp::socket& socket_, NetworkError* e, const string& name_ = "") - : name(name_), socket(socket_), archive(archive_stream), sent(false) - { - archive & name_; - error_func = e; - fprintf(stderr, "rpc send %s\n", name.c_str()); - } - - ~RPCSend() - { - } - - void add(const device_memory& mem) - { - archive & mem.data_type & mem.data_elements & mem.data_size; - archive & mem.data_width & mem.data_height & mem.data_depth & mem.device_pointer; - archive & mem.type & string(mem.name); - archive & mem.interpolation & mem.extension; - archive & mem.device_pointer; - } - - template<typename T> void add(const T& data) - { - archive & data; - } - - void add(const DeviceTask& task) - { - int type = (int)task.type; - archive & type & task.x & task.y & task.w & task.h; - archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples; - archive & task.offset & task.stride; - archive & task.shader_input & task.shader_output & task.shader_eval_type; - archive & task.shader_x & task.shader_w; - archive & task.need_finish_queue; - } - - void add(const RenderTile& tile) - { - archive & tile.x & tile.y & tile.w & tile.h; - archive & tile.start_sample & tile.num_samples & tile.sample; - archive & tile.resolution & tile.offset & tile.stride; - archive & tile.buffer; - } - - void write() - { - boost::system::error_code error; - - /* get string from stream */ - string archive_str = archive_stream.str(); - - /* first send fixed size header with size of following data */ - ostringstream header_stream; - header_stream << setw(8) << hex << archive_str.size(); - string header_str = header_stream.str(); - - boost::asio::write(socket, - boost::asio::buffer(header_str), - boost::asio::transfer_all(), error); - - if(error.value()) - error_func->network_error(error.message()); - - /* then send actual data */ - 
boost::asio::write(socket, - boost::asio::buffer(archive_str), - boost::asio::transfer_all(), error); - - if(error.value()) - error_func->network_error(error.message()); - - sent = true; - } - - void write_buffer(void *buffer, size_t size) - { - boost::system::error_code error; - - boost::asio::write(socket, - boost::asio::buffer(buffer, size), - boost::asio::transfer_all(), error); - - if(error.value()) - error_func->network_error(error.message()); - } - -protected: - string name; - tcp::socket& socket; - ostringstream archive_stream; - o_archive archive; - bool sent; - NetworkError *error_func; + public: + RPCSend(tcp::socket &socket_, NetworkError *e, const string &name_ = "") + : name(name_), socket(socket_), archive(archive_stream), sent(false) + { + archive &name_; + error_func = e; + fprintf(stderr, "rpc send %s\n", name.c_str()); + } + + ~RPCSend() + { + } + + void add(const device_memory &mem) + { + archive &mem.data_type &mem.data_elements &mem.data_size; + archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer; + archive &mem.type &string(mem.name); + archive &mem.interpolation &mem.extension; + archive &mem.device_pointer; + } + + template<typename T> void add(const T &data) + { + archive &data; + } + + void add(const DeviceTask &task) + { + int type = (int)task.type; + archive &type &task.x &task.y &task.w &task.h; + archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples; + archive &task.offset &task.stride; + archive &task.shader_input &task.shader_output &task.shader_eval_type; + archive &task.shader_x &task.shader_w; + archive &task.need_finish_queue; + } + + void add(const RenderTile &tile) + { + archive &tile.x &tile.y &tile.w &tile.h; + archive &tile.start_sample &tile.num_samples &tile.sample; + archive &tile.resolution &tile.offset &tile.stride; + archive &tile.buffer; + } + + void write() + { + boost::system::error_code error; + + /* get string from stream */ + string archive_str = 
archive_stream.str(); + + /* first send fixed size header with size of following data */ + ostringstream header_stream; + header_stream << setw(8) << hex << archive_str.size(); + string header_str = header_stream.str(); + + boost::asio::write( + socket, boost::asio::buffer(header_str), boost::asio::transfer_all(), error); + + if (error.value()) + error_func->network_error(error.message()); + + /* then send actual data */ + boost::asio::write( + socket, boost::asio::buffer(archive_str), boost::asio::transfer_all(), error); + + if (error.value()) + error_func->network_error(error.message()); + + sent = true; + } + + void write_buffer(void *buffer, size_t size) + { + boost::system::error_code error; + + boost::asio::write( + socket, boost::asio::buffer(buffer, size), boost::asio::transfer_all(), error); + + if (error.value()) + error_func->network_error(error.message()); + } + + protected: + string name; + tcp::socket &socket; + ostringstream archive_stream; + o_archive archive; + bool sent; + NetworkError *error_func; }; /* Remote procedure call Receive */ class RPCReceive { -public: - RPCReceive(tcp::socket& socket_, NetworkError* e ) - : socket(socket_), archive_stream(NULL), archive(NULL) - { - error_func = e; - /* read head with fixed size */ - vector<char> header(8); - boost::system::error_code error; - size_t len = boost::asio::read(socket, boost::asio::buffer(header), error); - - if(error.value()) { - error_func->network_error(error.message()); - } - - /* verify if we got something */ - if(len == header.size()) { - /* decode header */ - string header_str(&header[0], header.size()); - istringstream header_stream(header_str); - - size_t data_size; - - if((header_stream >> hex >> data_size)) { - - vector<char> data(data_size); - size_t len = boost::asio::read(socket, boost::asio::buffer(data), error); - - if(error.value()) - error_func->network_error(error.message()); - - - if(len == data_size) { - archive_str = (data.size())? 
string(&data[0], data.size()): string(""); - - archive_stream = new istringstream(archive_str); - archive = new i_archive(*archive_stream); - - *archive & name; - fprintf(stderr, "rpc receive %s\n", name.c_str()); - } - else { - error_func->network_error("Network receive error: data size doesn't match header"); - } - } - else { - error_func->network_error("Network receive error: can't decode data size from header"); - } - } - else { - error_func->network_error("Network receive error: invalid header size"); - } - } - - ~RPCReceive() - { - delete archive; - delete archive_stream; - } - - void read(network_device_memory& mem, string& name) - { - *archive & mem.data_type & mem.data_elements & mem.data_size; - *archive & mem.data_width & mem.data_height & mem.data_depth & mem.device_pointer; - *archive & mem.type & name; - *archive & mem.interpolation & mem.extension; - *archive & mem.device_pointer; - - mem.name = name.c_str(); - mem.host_pointer = 0; - - /* Can't transfer OpenGL texture over network. 
*/ - if(mem.type == MEM_PIXELS) { - mem.type = MEM_READ_WRITE; - } - } - - template<typename T> void read(T& data) - { - *archive & data; - } - - void read_buffer(void *buffer, size_t size) - { - boost::system::error_code error; - size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error); - - if(error.value()) { - error_func->network_error(error.message()); - } - - if(len != size) - cout << "Network receive error: buffer size doesn't match expected size\n"; - } - - void read(DeviceTask& task) - { - int type; - - *archive & type & task.x & task.y & task.w & task.h; - *archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples; - *archive & task.offset & task.stride; - *archive & task.shader_input & task.shader_output & task.shader_eval_type; - *archive & task.shader_x & task.shader_w; - *archive & task.need_finish_queue; - - task.type = (DeviceTask::Type)type; - } - - void read(RenderTile& tile) - { - *archive & tile.x & tile.y & tile.w & tile.h; - *archive & tile.start_sample & tile.num_samples & tile.sample; - *archive & tile.resolution & tile.offset & tile.stride; - *archive & tile.buffer; - - tile.buffers = NULL; - } - - string name; - -protected: - tcp::socket& socket; - string archive_str; - istringstream *archive_stream; - i_archive *archive; - NetworkError *error_func; + public: + RPCReceive(tcp::socket &socket_, NetworkError *e) + : socket(socket_), archive_stream(NULL), archive(NULL) + { + error_func = e; + /* read head with fixed size */ + vector<char> header(8); + boost::system::error_code error; + size_t len = boost::asio::read(socket, boost::asio::buffer(header), error); + + if (error.value()) { + error_func->network_error(error.message()); + } + + /* verify if we got something */ + if (len == header.size()) { + /* decode header */ + string header_str(&header[0], header.size()); + istringstream header_stream(header_str); + + size_t data_size; + + if ((header_stream >> hex >> data_size)) { + + 
vector<char> data(data_size); + size_t len = boost::asio::read(socket, boost::asio::buffer(data), error); + + if (error.value()) + error_func->network_error(error.message()); + + if (len == data_size) { + archive_str = (data.size()) ? string(&data[0], data.size()) : string(""); + + archive_stream = new istringstream(archive_str); + archive = new i_archive(*archive_stream); + + *archive &name; + fprintf(stderr, "rpc receive %s\n", name.c_str()); + } + else { + error_func->network_error("Network receive error: data size doesn't match header"); + } + } + else { + error_func->network_error("Network receive error: can't decode data size from header"); + } + } + else { + error_func->network_error("Network receive error: invalid header size"); + } + } + + ~RPCReceive() + { + delete archive; + delete archive_stream; + } + + void read(network_device_memory &mem, string &name) + { + *archive &mem.data_type &mem.data_elements &mem.data_size; + *archive &mem.data_width &mem.data_height &mem.data_depth &mem.device_pointer; + *archive &mem.type &name; + *archive &mem.interpolation &mem.extension; + *archive &mem.device_pointer; + + mem.name = name.c_str(); + mem.host_pointer = 0; + + /* Can't transfer OpenGL texture over network. 
*/ + if (mem.type == MEM_PIXELS) { + mem.type = MEM_READ_WRITE; + } + } + + template<typename T> void read(T &data) + { + *archive &data; + } + + void read_buffer(void *buffer, size_t size) + { + boost::system::error_code error; + size_t len = boost::asio::read(socket, boost::asio::buffer(buffer, size), error); + + if (error.value()) { + error_func->network_error(error.message()); + } + + if (len != size) + cout << "Network receive error: buffer size doesn't match expected size\n"; + } + + void read(DeviceTask &task) + { + int type; + + *archive &type &task.x &task.y &task.w &task.h; + *archive &task.rgba_byte &task.rgba_half &task.buffer &task.sample &task.num_samples; + *archive &task.offset &task.stride; + *archive &task.shader_input &task.shader_output &task.shader_eval_type; + *archive &task.shader_x &task.shader_w; + *archive &task.need_finish_queue; + + task.type = (DeviceTask::Type)type; + } + + void read(RenderTile &tile) + { + *archive &tile.x &tile.y &tile.w &tile.h; + *archive &tile.start_sample &tile.num_samples &tile.sample; + *archive &tile.resolution &tile.offset &tile.stride; + *archive &tile.buffer; + + tile.buffers = NULL; + } + + string name; + + protected: + tcp::socket &socket; + string archive_str; + istringstream *archive_stream; + i_archive *archive; + NetworkError *error_func; }; /* Server auto discovery */ class ServerDiscovery { -public: - explicit ServerDiscovery(bool discover = false) - : listen_socket(io_service), collect_servers(false) - { - /* setup listen socket */ - listen_endpoint.address(boost::asio::ip::address_v4::any()); - listen_endpoint.port(DISCOVER_PORT); - - listen_socket.open(listen_endpoint.protocol()); - - boost::asio::socket_base::reuse_address option(true); - listen_socket.set_option(option); - - listen_socket.bind(listen_endpoint); - - /* setup receive callback */ - async_receive(); - - /* start server discovery */ - if(discover) { - collect_servers = true; - servers.clear(); - - 
broadcast_message(DISCOVER_REQUEST_MSG); - } - - /* start thread */ - work = new boost::asio::io_service::work(io_service); - thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service)); - } - - ~ServerDiscovery() - { - io_service.stop(); - thread->join(); - delete thread; - delete work; - } - - vector<string> get_server_list() - { - vector<string> result; - - mutex.lock(); - result = vector<string>(servers.begin(), servers.end()); - mutex.unlock(); - - return result; - } - -private: - void handle_receive_from(const boost::system::error_code& error, size_t size) - { - if(error) { - cout << "Server discovery receive error: " << error.message() << "\n"; - return; - } - - if(size > 0) { - string msg = string(receive_buffer, size); - - /* handle incoming message */ - if(collect_servers) { - if(msg == DISCOVER_REPLY_MSG) { - string address = receive_endpoint.address().to_string(); - - mutex.lock(); - - /* add address if it's not already in the list */ - bool found = std::find(servers.begin(), servers.end(), - address) != servers.end(); - - if(!found) - servers.push_back(address); - - mutex.unlock(); - } - } - else { - /* reply to request */ - if(msg == DISCOVER_REQUEST_MSG) - broadcast_message(DISCOVER_REPLY_MSG); - } - } - - async_receive(); - } - - void async_receive() - { - listen_socket.async_receive_from( - boost::asio::buffer(receive_buffer), receive_endpoint, - boost::bind(&ServerDiscovery::handle_receive_from, this, - boost::asio::placeholders::error, boost::asio::placeholders::bytes_transferred)); - } - - void broadcast_message(const string& msg) - { - /* setup broadcast socket */ - boost::asio::ip::udp::socket socket(io_service); - - socket.open(boost::asio::ip::udp::v4()); - - boost::asio::socket_base::broadcast option(true); - socket.set_option(option); - - boost::asio::ip::udp::endpoint broadcast_endpoint( - boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT); - - /* broadcast message */ - 
socket.send_to(boost::asio::buffer(msg), broadcast_endpoint); - } - - /* network service and socket */ - boost::asio::io_service io_service; - boost::asio::ip::udp::endpoint listen_endpoint; - boost::asio::ip::udp::socket listen_socket; - - /* threading */ - boost::thread *thread; - boost::asio::io_service::work *work; - boost::mutex mutex; - - /* buffer and endpoint for receiving messages */ - char receive_buffer[256]; - boost::asio::ip::udp::endpoint receive_endpoint; - - // os, version, devices, status, host name, group name, ip as far as fields go - struct ServerInfo { - string cycles_version; - string os; - int device_count; - string status; - string host_name; - string group_name; - string host_addr; - }; - - /* collection of server addresses in list */ - bool collect_servers; - vector<string> servers; + public: + explicit ServerDiscovery(bool discover = false) + : listen_socket(io_service), collect_servers(false) + { + /* setup listen socket */ + listen_endpoint.address(boost::asio::ip::address_v4::any()); + listen_endpoint.port(DISCOVER_PORT); + + listen_socket.open(listen_endpoint.protocol()); + + boost::asio::socket_base::reuse_address option(true); + listen_socket.set_option(option); + + listen_socket.bind(listen_endpoint); + + /* setup receive callback */ + async_receive(); + + /* start server discovery */ + if (discover) { + collect_servers = true; + servers.clear(); + + broadcast_message(DISCOVER_REQUEST_MSG); + } + + /* start thread */ + work = new boost::asio::io_service::work(io_service); + thread = new boost::thread(boost::bind(&boost::asio::io_service::run, &io_service)); + } + + ~ServerDiscovery() + { + io_service.stop(); + thread->join(); + delete thread; + delete work; + } + + vector<string> get_server_list() + { + vector<string> result; + + mutex.lock(); + result = vector<string>(servers.begin(), servers.end()); + mutex.unlock(); + + return result; + } + + private: + void handle_receive_from(const boost::system::error_code &error, size_t 
size) + { + if (error) { + cout << "Server discovery receive error: " << error.message() << "\n"; + return; + } + + if (size > 0) { + string msg = string(receive_buffer, size); + + /* handle incoming message */ + if (collect_servers) { + if (msg == DISCOVER_REPLY_MSG) { + string address = receive_endpoint.address().to_string(); + + mutex.lock(); + + /* add address if it's not already in the list */ + bool found = std::find(servers.begin(), servers.end(), address) != servers.end(); + + if (!found) + servers.push_back(address); + + mutex.unlock(); + } + } + else { + /* reply to request */ + if (msg == DISCOVER_REQUEST_MSG) + broadcast_message(DISCOVER_REPLY_MSG); + } + } + + async_receive(); + } + + void async_receive() + { + listen_socket.async_receive_from(boost::asio::buffer(receive_buffer), + receive_endpoint, + boost::bind(&ServerDiscovery::handle_receive_from, + this, + boost::asio::placeholders::error, + boost::asio::placeholders::bytes_transferred)); + } + + void broadcast_message(const string &msg) + { + /* setup broadcast socket */ + boost::asio::ip::udp::socket socket(io_service); + + socket.open(boost::asio::ip::udp::v4()); + + boost::asio::socket_base::broadcast option(true); + socket.set_option(option); + + boost::asio::ip::udp::endpoint broadcast_endpoint( + boost::asio::ip::address::from_string("255.255.255.255"), DISCOVER_PORT); + + /* broadcast message */ + socket.send_to(boost::asio::buffer(msg), broadcast_endpoint); + } + + /* network service and socket */ + boost::asio::io_service io_service; + boost::asio::ip::udp::endpoint listen_endpoint; + boost::asio::ip::udp::socket listen_socket; + + /* threading */ + boost::thread *thread; + boost::asio::io_service::work *work; + boost::mutex mutex; + + /* buffer and endpoint for receiving messages */ + char receive_buffer[256]; + boost::asio::ip::udp::endpoint receive_endpoint; + + // os, version, devices, status, host name, group name, ip as far as fields go + struct ServerInfo { + string 
cycles_version; + string os; + int device_count; + string status; + string host_name; + string group_name; + string host_addr; + }; + + /* collection of server addresses in list */ + bool collect_servers; + vector<string> servers; }; CCL_NAMESPACE_END #endif -#endif /* __DEVICE_NETWORK_H__ */ +#endif /* __DEVICE_NETWORK_H__ */ diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp index 4cefaa217f1..99a8d2438d6 100644 --- a/intern/cycles/device/device_opencl.cpp +++ b/intern/cycles/device/device_opencl.cpp @@ -16,218 +16,211 @@ #ifdef WITH_OPENCL -#include "device/opencl/opencl.h" +# include "device/opencl/opencl.h" -#include "device/device_intern.h" +# include "device/device_intern.h" -#include "util/util_foreach.h" -#include "util/util_logging.h" -#include "util/util_set.h" -#include "util/util_string.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_set.h" +# include "util/util_string.h" CCL_NAMESPACE_BEGIN -Device *device_opencl_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background) +Device *device_opencl_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) { - return opencl_create_split_device(info, stats, profiler, background); + return opencl_create_split_device(info, stats, profiler, background); } bool device_opencl_init() { - static bool initialized = false; - static bool result = false; - - if(initialized) - return result; - - initialized = true; - - if(OpenCLInfo::device_type() != 0) { - int clew_result = clewInit(); - if(clew_result == CLEW_SUCCESS) { - VLOG(1) << "CLEW initialization succeeded."; - result = true; - } - else { - VLOG(1) << "CLEW initialization failed: " - << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) - ? 
"Error setting up atexit() handler" - : "Error opening the library"); - } - } - else { - VLOG(1) << "Skip initializing CLEW, platform is force disabled."; - result = false; - } - - return result; + static bool initialized = false; + static bool result = false; + + if (initialized) + return result; + + initialized = true; + + if (OpenCLInfo::device_type() != 0) { + int clew_result = clewInit(); + if (clew_result == CLEW_SUCCESS) { + VLOG(1) << "CLEW initialization succeeded."; + result = true; + } + else { + VLOG(1) << "CLEW initialization failed: " + << ((clew_result == CLEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" : + "Error opening the library"); + } + } + else { + VLOG(1) << "Skip initializing CLEW, platform is force disabled."; + result = false; + } + + return result; } - static cl_int device_opencl_get_num_platforms_safe(cl_uint *num_platforms) { -#ifdef _WIN32 - __try { - return clGetPlatformIDs(0, NULL, num_platforms); - } - __except(EXCEPTION_EXECUTE_HANDLER) { - /* Ignore crashes inside the OpenCL driver and hope we can - * survive even with corrupted OpenCL installs. */ - fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n"); - } - - *num_platforms = 0; - return CL_DEVICE_NOT_FOUND; -#else - return clGetPlatformIDs(0, NULL, num_platforms); -#endif +# ifdef _WIN32 + __try { + return clGetPlatformIDs(0, NULL, num_platforms); + } + __except (EXCEPTION_EXECUTE_HANDLER) { + /* Ignore crashes inside the OpenCL driver and hope we can + * survive even with corrupted OpenCL installs. 
*/ + fprintf(stderr, "Cycles OpenCL: driver crashed, continuing without OpenCL.\n"); + } + + *num_platforms = 0; + return CL_DEVICE_NOT_FOUND; +# else + return clGetPlatformIDs(0, NULL, num_platforms); +# endif } -void device_opencl_info(vector<DeviceInfo>& devices) +void device_opencl_info(vector<DeviceInfo> &devices) { - cl_uint num_platforms = 0; - device_opencl_get_num_platforms_safe(&num_platforms); - if(num_platforms == 0) { - return; - } - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - /* Devices are numbered consecutively across platforms. */ - int num_devices = 0; - set<string> unique_ids; - foreach(OpenCLPlatformDevice& platform_device, usable_devices) { - /* Compute unique ID for persistent user preferences. */ - const string& platform_name = platform_device.platform_name; - const string& device_name = platform_device.device_name; - string hardware_id = platform_device.hardware_id; - if(hardware_id == "") { - hardware_id = string_printf("ID_%d", num_devices); - } - string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id; - - /* Hardware ID might not be unique, add device number in that case. */ - if(unique_ids.find(id) != unique_ids.end()) { - id += string_printf("_ID_%d", num_devices); - } - unique_ids.insert(id); - - /* Create DeviceInfo. */ - DeviceInfo info; - info.type = DEVICE_OPENCL; - info.description = string_remove_trademark(string(device_name)); - info.num = num_devices; - /* We don't know if it's used for display, but assume it is. 
*/ - info.display_device = true; - info.use_split_kernel = true; - info.has_volume_decoupled = false; - info.id = id; - - /* Check OpenCL extensions */ - info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos; - - devices.push_back(info); - num_devices++; - } + cl_uint num_platforms = 0; + device_opencl_get_num_platforms_safe(&num_platforms); + if (num_platforms == 0) { + return; + } + + vector<OpenCLPlatformDevice> usable_devices; + OpenCLInfo::get_usable_devices(&usable_devices); + /* Devices are numbered consecutively across platforms. */ + int num_devices = 0; + set<string> unique_ids; + foreach (OpenCLPlatformDevice &platform_device, usable_devices) { + /* Compute unique ID for persistent user preferences. */ + const string &platform_name = platform_device.platform_name; + const string &device_name = platform_device.device_name; + string hardware_id = platform_device.hardware_id; + if (hardware_id == "") { + hardware_id = string_printf("ID_%d", num_devices); + } + string id = string("OPENCL_") + platform_name + "_" + device_name + "_" + hardware_id; + + /* Hardware ID might not be unique, add device number in that case. */ + if (unique_ids.find(id) != unique_ids.end()) { + id += string_printf("_ID_%d", num_devices); + } + unique_ids.insert(id); + + /* Create DeviceInfo. */ + DeviceInfo info; + info.type = DEVICE_OPENCL; + info.description = string_remove_trademark(string(device_name)); + info.num = num_devices; + /* We don't know if it's used for display, but assume it is. 
*/ + info.display_device = true; + info.use_split_kernel = true; + info.has_volume_decoupled = false; + info.id = id; + + /* Check OpenCL extensions */ + info.has_half_images = platform_device.device_extensions.find("cl_khr_fp16") != string::npos; + + devices.push_back(info); + num_devices++; + } } string device_opencl_capabilities() { - if(OpenCLInfo::device_type() == 0) { - return "All OpenCL devices are forced to be OFF"; - } - string result = ""; - string error_msg = ""; /* Only used by opencl_assert(), but in the future - * it could also be nicely reported to the console. - */ - cl_uint num_platforms = 0; - opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms)); - if(num_platforms == 0) { - return "No OpenCL platforms found\n"; - } - result += string_printf("Number of platforms: %u\n", num_platforms); - - vector<cl_platform_id> platform_ids; - platform_ids.resize(num_platforms); - opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL)); - - typedef char cl_string[1024]; - -#define APPEND_INFO(func, id, name, what, type) \ - do { \ - type data; \ - memset(&data, 0, sizeof(data)); \ - opencl_assert(func(id, what, sizeof(data), &data, NULL)); \ - result += string_printf("%s: %s\n", name, to_string(data).c_str()); \ - } while(false) -#define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \ - do { \ - char data[1024] = "\0"; \ - size_t length = 0; \ - if(func(id, what, sizeof(data), &data, &length) == CL_SUCCESS) { \ - if(length != 0 && data[0] != '\0') { \ - result += string_printf("%s: %s\n", name, data); \ - } \ - } \ - } while(false) -#define APPEND_PLATFORM_INFO(id, name, what, type) \ - APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type) -#define APPEND_DEVICE_INFO(id, name, what, type) \ - APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type) -#define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \ - APPEND_STRING_EXTENSION_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what) - - 
vector<cl_device_id> device_ids; - for(cl_uint platform = 0; platform < num_platforms; ++platform) { - cl_platform_id platform_id = platform_ids[platform]; - - result += string_printf("Platform #%u\n", platform); - - APPEND_PLATFORM_INFO(platform_id, "Name", CL_PLATFORM_NAME, cl_string); - APPEND_PLATFORM_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR, cl_string); - APPEND_PLATFORM_INFO(platform_id, "Version", CL_PLATFORM_VERSION, cl_string); - APPEND_PLATFORM_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE, cl_string); - APPEND_PLATFORM_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS, cl_string); - - cl_uint num_devices = 0; - opencl_assert(clGetDeviceIDs(platform_ids[platform], - CL_DEVICE_TYPE_ALL, - 0, - NULL, - &num_devices)); - result += string_printf("\tNumber of devices: %u\n", num_devices); - - device_ids.resize(num_devices); - opencl_assert(clGetDeviceIDs(platform_ids[platform], - CL_DEVICE_TYPE_ALL, - num_devices, - &device_ids[0], - NULL)); - for(cl_uint device = 0; device < num_devices; ++device) { - cl_device_id device_id = device_ids[device]; - - result += string_printf("\t\tDevice: #%u\n", device); - - APPEND_DEVICE_INFO(device_id, "Name", CL_DEVICE_NAME, cl_string); - APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD); - APPEND_DEVICE_INFO(device_id, "Vendor", CL_DEVICE_VENDOR, cl_string); - APPEND_DEVICE_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION, cl_string); - APPEND_DEVICE_INFO(device_id, "Profile", CL_DEVICE_PROFILE, cl_string); - APPEND_DEVICE_INFO(device_id, "Version", CL_DEVICE_VERSION, cl_string); - APPEND_DEVICE_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS, cl_string); - APPEND_DEVICE_INFO(device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint); - APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint); - APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t); - } - } - -#undef 
APPEND_STRING_INFO -#undef APPEND_PLATFORM_STRING_INFO -#undef APPEND_DEVICE_STRING_INFO - - return result; + if (OpenCLInfo::device_type() == 0) { + return "All OpenCL devices are forced to be OFF"; + } + string result = ""; + string error_msg = ""; /* Only used by opencl_assert(), but in the future + * it could also be nicely reported to the console. + */ + cl_uint num_platforms = 0; + opencl_assert(device_opencl_get_num_platforms_safe(&num_platforms)); + if (num_platforms == 0) { + return "No OpenCL platforms found\n"; + } + result += string_printf("Number of platforms: %u\n", num_platforms); + + vector<cl_platform_id> platform_ids; + platform_ids.resize(num_platforms); + opencl_assert(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL)); + + typedef char cl_string[1024]; + +# define APPEND_INFO(func, id, name, what, type) \ + do { \ + type data; \ + memset(&data, 0, sizeof(data)); \ + opencl_assert(func(id, what, sizeof(data), &data, NULL)); \ + result += string_printf("%s: %s\n", name, to_string(data).c_str()); \ + } while (false) +# define APPEND_STRING_EXTENSION_INFO(func, id, name, what) \ + do { \ + char data[1024] = "\0"; \ + size_t length = 0; \ + if (func(id, what, sizeof(data), &data, &length) == CL_SUCCESS) { \ + if (length != 0 && data[0] != '\0') { \ + result += string_printf("%s: %s\n", name, data); \ + } \ + } \ + } while (false) +# define APPEND_PLATFORM_INFO(id, name, what, type) \ + APPEND_INFO(clGetPlatformInfo, id, "\tPlatform " name, what, type) +# define APPEND_DEVICE_INFO(id, name, what, type) \ + APPEND_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what, type) +# define APPEND_DEVICE_STRING_EXTENSION_INFO(id, name, what) \ + APPEND_STRING_EXTENSION_INFO(clGetDeviceInfo, id, "\t\t\tDevice " name, what) + + vector<cl_device_id> device_ids; + for (cl_uint platform = 0; platform < num_platforms; ++platform) { + cl_platform_id platform_id = platform_ids[platform]; + + result += string_printf("Platform #%u\n", platform); + + 
APPEND_PLATFORM_INFO(platform_id, "Name", CL_PLATFORM_NAME, cl_string); + APPEND_PLATFORM_INFO(platform_id, "Vendor", CL_PLATFORM_VENDOR, cl_string); + APPEND_PLATFORM_INFO(platform_id, "Version", CL_PLATFORM_VERSION, cl_string); + APPEND_PLATFORM_INFO(platform_id, "Profile", CL_PLATFORM_PROFILE, cl_string); + APPEND_PLATFORM_INFO(platform_id, "Extensions", CL_PLATFORM_EXTENSIONS, cl_string); + + cl_uint num_devices = 0; + opencl_assert( + clGetDeviceIDs(platform_ids[platform], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices)); + result += string_printf("\tNumber of devices: %u\n", num_devices); + + device_ids.resize(num_devices); + opencl_assert(clGetDeviceIDs( + platform_ids[platform], CL_DEVICE_TYPE_ALL, num_devices, &device_ids[0], NULL)); + for (cl_uint device = 0; device < num_devices; ++device) { + cl_device_id device_id = device_ids[device]; + + result += string_printf("\t\tDevice: #%u\n", device); + + APPEND_DEVICE_INFO(device_id, "Name", CL_DEVICE_NAME, cl_string); + APPEND_DEVICE_STRING_EXTENSION_INFO(device_id, "Board Name", CL_DEVICE_BOARD_NAME_AMD); + APPEND_DEVICE_INFO(device_id, "Vendor", CL_DEVICE_VENDOR, cl_string); + APPEND_DEVICE_INFO(device_id, "OpenCL C Version", CL_DEVICE_OPENCL_C_VERSION, cl_string); + APPEND_DEVICE_INFO(device_id, "Profile", CL_DEVICE_PROFILE, cl_string); + APPEND_DEVICE_INFO(device_id, "Version", CL_DEVICE_VERSION, cl_string); + APPEND_DEVICE_INFO(device_id, "Extensions", CL_DEVICE_EXTENSIONS, cl_string); + APPEND_DEVICE_INFO( + device_id, "Max clock frequency (MHz)", CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint); + APPEND_DEVICE_INFO(device_id, "Max compute units", CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint); + APPEND_DEVICE_INFO(device_id, "Max work group size", CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t); + } + } + +# undef APPEND_STRING_INFO +# undef APPEND_PLATFORM_STRING_INFO +# undef APPEND_DEVICE_STRING_INFO + + return result; } CCL_NAMESPACE_END -#endif /* WITH_OPENCL */ +#endif /* WITH_OPENCL */ diff --git 
a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index ee566e57918..42e597a34d7 100644 --- a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -27,299 +27,304 @@ CCL_NAMESPACE_BEGIN static const double alpha = 0.1; /* alpha for rolling average */ DeviceSplitKernel::DeviceSplitKernel(Device *device) -: device(device), - split_data(device, "split_data"), - ray_state(device, "ray_state", MEM_READ_WRITE), - queue_index(device, "queue_index"), - use_queues_flag(device, "use_queues_flag"), - work_pool_wgs(device, "work_pool_wgs"), - kernel_data_initialized(false) + : device(device), + split_data(device, "split_data"), + ray_state(device, "ray_state", MEM_READ_WRITE), + queue_index(device, "queue_index"), + use_queues_flag(device, "use_queues_flag"), + work_pool_wgs(device, "work_pool_wgs"), + kernel_data_initialized(false) { - avg_time_per_sample = 0.0; - - kernel_path_init = NULL; - kernel_scene_intersect = NULL; - kernel_lamp_emission = NULL; - kernel_do_volume = NULL; - kernel_queue_enqueue = NULL; - kernel_indirect_background = NULL; - kernel_shader_setup = NULL; - kernel_shader_sort = NULL; - kernel_shader_eval = NULL; - kernel_holdout_emission_blurring_pathtermination_ao = NULL; - kernel_subsurface_scatter = NULL; - kernel_direct_lighting = NULL; - kernel_shadow_blocked_ao = NULL; - kernel_shadow_blocked_dl = NULL; - kernel_enqueue_inactive = NULL; - kernel_next_iteration_setup = NULL; - kernel_indirect_subsurface = NULL; - kernel_buffer_update = NULL; + avg_time_per_sample = 0.0; + + kernel_path_init = NULL; + kernel_scene_intersect = NULL; + kernel_lamp_emission = NULL; + kernel_do_volume = NULL; + kernel_queue_enqueue = NULL; + kernel_indirect_background = NULL; + kernel_shader_setup = NULL; + kernel_shader_sort = NULL; + kernel_shader_eval = NULL; + kernel_holdout_emission_blurring_pathtermination_ao = NULL; + kernel_subsurface_scatter = NULL; + kernel_direct_lighting 
= NULL; + kernel_shadow_blocked_ao = NULL; + kernel_shadow_blocked_dl = NULL; + kernel_enqueue_inactive = NULL; + kernel_next_iteration_setup = NULL; + kernel_indirect_subsurface = NULL; + kernel_buffer_update = NULL; } DeviceSplitKernel::~DeviceSplitKernel() { - split_data.free(); - ray_state.free(); - use_queues_flag.free(); - queue_index.free(); - work_pool_wgs.free(); - - delete kernel_path_init; - delete kernel_scene_intersect; - delete kernel_lamp_emission; - delete kernel_do_volume; - delete kernel_queue_enqueue; - delete kernel_indirect_background; - delete kernel_shader_setup; - delete kernel_shader_sort; - delete kernel_shader_eval; - delete kernel_holdout_emission_blurring_pathtermination_ao; - delete kernel_subsurface_scatter; - delete kernel_direct_lighting; - delete kernel_shadow_blocked_ao; - delete kernel_shadow_blocked_dl; - delete kernel_enqueue_inactive; - delete kernel_next_iteration_setup; - delete kernel_indirect_subsurface; - delete kernel_buffer_update; + split_data.free(); + ray_state.free(); + use_queues_flag.free(); + queue_index.free(); + work_pool_wgs.free(); + + delete kernel_path_init; + delete kernel_scene_intersect; + delete kernel_lamp_emission; + delete kernel_do_volume; + delete kernel_queue_enqueue; + delete kernel_indirect_background; + delete kernel_shader_setup; + delete kernel_shader_sort; + delete kernel_shader_eval; + delete kernel_holdout_emission_blurring_pathtermination_ao; + delete kernel_subsurface_scatter; + delete kernel_direct_lighting; + delete kernel_shadow_blocked_ao; + delete kernel_shadow_blocked_dl; + delete kernel_enqueue_inactive; + delete kernel_next_iteration_setup; + delete kernel_indirect_subsurface; + delete kernel_buffer_update; } -bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features) +bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features) { #define LOAD_KERNEL(name) \ - kernel_##name = get_split_kernel_function(#name, 
requested_features); \ - if(!kernel_##name) { \ - device->set_error(string("Split kernel error: failed to load kernel_") + #name); \ - return false; \ - } - - LOAD_KERNEL(path_init); - LOAD_KERNEL(scene_intersect); - LOAD_KERNEL(lamp_emission); - if (requested_features.use_volume) { - LOAD_KERNEL(do_volume); - } - LOAD_KERNEL(queue_enqueue); - LOAD_KERNEL(indirect_background); - LOAD_KERNEL(shader_setup); - LOAD_KERNEL(shader_sort); - LOAD_KERNEL(shader_eval); - LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); - LOAD_KERNEL(subsurface_scatter); - LOAD_KERNEL(direct_lighting); - LOAD_KERNEL(shadow_blocked_ao); - LOAD_KERNEL(shadow_blocked_dl); - LOAD_KERNEL(enqueue_inactive); - LOAD_KERNEL(next_iteration_setup); - LOAD_KERNEL(indirect_subsurface); - LOAD_KERNEL(buffer_update); + kernel_##name = get_split_kernel_function(#name, requested_features); \ + if (!kernel_##name) { \ + device->set_error(string("Split kernel error: failed to load kernel_") + #name); \ + return false; \ + } + + LOAD_KERNEL(path_init); + LOAD_KERNEL(scene_intersect); + LOAD_KERNEL(lamp_emission); + if (requested_features.use_volume) { + LOAD_KERNEL(do_volume); + } + LOAD_KERNEL(queue_enqueue); + LOAD_KERNEL(indirect_background); + LOAD_KERNEL(shader_setup); + LOAD_KERNEL(shader_sort); + LOAD_KERNEL(shader_eval); + LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao); + LOAD_KERNEL(subsurface_scatter); + LOAD_KERNEL(direct_lighting); + LOAD_KERNEL(shadow_blocked_ao); + LOAD_KERNEL(shadow_blocked_dl); + LOAD_KERNEL(enqueue_inactive); + LOAD_KERNEL(next_iteration_setup); + LOAD_KERNEL(indirect_subsurface); + LOAD_KERNEL(buffer_update); #undef LOAD_KERNEL - /* Re-initialiaze kernel-dependent data when kernels change. */ - kernel_data_initialized = false; + /* Re-initialiaze kernel-dependent data when kernels change. 
*/ + kernel_data_initialized = false; - return true; + return true; } -size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size) +size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg, + device_memory &data, + uint64_t max_buffer_size) { - uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; - VLOG(1) << "Split state element size: " - << string_human_readable_number(size_per_element) << " bytes. (" - << string_human_readable_size(size_per_element) << ")."; - return max_buffer_size / size_per_element; + uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024; + VLOG(1) << "Split state element size: " << string_human_readable_number(size_per_element) + << " bytes. (" << string_human_readable_size(size_per_element) << ")."; + return max_buffer_size / size_per_element; } bool DeviceSplitKernel::path_trace(DeviceTask *task, - RenderTile& tile, - device_memory& kgbuffer, - device_memory& kernel_data) + RenderTile &tile, + device_memory &kgbuffer, + device_memory &kernel_data) { - if(device->have_error()) { - return false; - } + if (device->have_error()) { + return false; + } - /* Allocate all required global memory once. */ - if(!kernel_data_initialized) { - kernel_data_initialized = true; + /* Allocate all required global memory once. */ + if (!kernel_data_initialized) { + kernel_data_initialized = true; - /* Set local size */ - int2 lsize = split_kernel_local_size(); - local_size[0] = lsize[0]; - local_size[1] = lsize[1]; + /* Set local size */ + int2 lsize = split_kernel_local_size(); + local_size[0] = lsize[0]; + local_size[1] = lsize[1]; - /* Set global size */ - int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task); + /* Set global size */ + int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task); - /* Make sure that set work size is a multiple of local - * work size dimensions. 
- */ - global_size[0] = round_up(gsize[0], local_size[0]); - global_size[1] = round_up(gsize[1], local_size[1]); + /* Make sure that set work size is a multiple of local + * work size dimensions. + */ + global_size[0] = round_up(gsize[0], local_size[0]); + global_size[1] = round_up(gsize[1], local_size[1]); - int num_global_elements = global_size[0] * global_size[1]; - assert(num_global_elements % WORK_POOL_SIZE == 0); + int num_global_elements = global_size[0] * global_size[1]; + assert(num_global_elements % WORK_POOL_SIZE == 0); - /* Calculate max groups */ + /* Calculate max groups */ - /* Denotes the maximum work groups possible w.r.t. current requested tile size. */ - unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU : WORK_POOL_SIZE_GPU; - unsigned int max_work_groups = num_global_elements / work_pool_size + 1; + /* Denotes the maximum work groups possible w.r.t. current requested tile size. */ + unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU : + WORK_POOL_SIZE_GPU; + unsigned int max_work_groups = num_global_elements / work_pool_size + 1; - /* Allocate work_pool_wgs memory. */ - work_pool_wgs.alloc_to_device(max_work_groups); - queue_index.alloc_to_device(NUM_QUEUES); - use_queues_flag.alloc_to_device(1); - split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements)); - ray_state.alloc(num_global_elements); - } + /* Allocate work_pool_wgs memory. 
*/ + work_pool_wgs.alloc_to_device(max_work_groups); + queue_index.alloc_to_device(NUM_QUEUES); + use_queues_flag.alloc_to_device(1); + split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements)); + ray_state.alloc(num_global_elements); + } - /* Number of elements in the global state buffer */ - int num_global_elements = global_size[0] * global_size[1]; + /* Number of elements in the global state buffer */ + int num_global_elements = global_size[0] * global_size[1]; #define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \ - if(device->have_error()) { \ - return false; \ - } \ - if(!kernel_##name->enqueue(KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \ - return false; \ - } - - tile.sample = tile.start_sample; - - /* for exponential increase between tile updates */ - int time_multiplier = 1; - - while(tile.sample < tile.start_sample + tile.num_samples) { - /* to keep track of how long it takes to run a number of samples */ - double start_time = time_dt(); - - /* initial guess to start rolling average */ - const int initial_num_samples = 1; - /* approx number of samples per second */ - int samples_per_second = (avg_time_per_sample > 0.0) ? 
- int(double(time_multiplier) / avg_time_per_sample) + 1 : initial_num_samples; - - RenderTile subtile = tile; - subtile.start_sample = tile.sample; - subtile.num_samples = min(samples_per_second, tile.start_sample + tile.num_samples - tile.sample); - - if(device->have_error()) { - return false; - } - - /* reset state memory here as global size for data_init - * kernel might not be large enough to do in kernel - */ - work_pool_wgs.zero_to_device(); - split_data.zero_to_device(); - ray_state.zero_to_device(); - - if(!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), - subtile, - num_global_elements, - kgbuffer, - kernel_data, - split_data, - ray_state, - queue_index, - use_queues_flag, - work_pool_wgs)) - { - return false; - } - - ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); - - bool activeRaysAvailable = true; - double cancel_time = DBL_MAX; - - while(activeRaysAvailable) { - /* Do path-iteration in host [Enqueue Path-iteration kernels. */ - for(int PathIter = 0; PathIter < 16; PathIter++) { - ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); - if (kernel_do_volume) { - ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size); - } - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, 
global_size, local_size); - ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); - ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); - - if(task->get_cancel() && cancel_time == DBL_MAX) { - /* Wait up to twice as many seconds for current samples to finish - * to avoid artifacts in render result from ending too soon. - */ - cancel_time = time_dt() + 2.0 * time_multiplier; - } - - if(time_dt() > cancel_time) { - return true; - } - } - - /* Decide if we should exit path-iteration in host. */ - ray_state.copy_from_device(0, global_size[0] * global_size[1], 1); - - activeRaysAvailable = false; - - for(int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { - if(!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) { - if(IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) { - /* Something went wrong, abort to avoid looping endlessly. */ - device->set_error("Split kernel error: invalid ray state"); - return false; - } - - /* Not all rays are RAY_INACTIVE. 
*/ - activeRaysAvailable = true; - break; - } - } - - if(time_dt() > cancel_time) { - return true; - } - } - - double time_per_sample = ((time_dt()-start_time) / subtile.num_samples); - - if(avg_time_per_sample == 0.0) { - /* start rolling average */ - avg_time_per_sample = time_per_sample; - } - else { - avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample; - } + if (device->have_error()) { \ + return false; \ + } \ + if (!kernel_##name->enqueue( \ + KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \ + return false; \ + } + + tile.sample = tile.start_sample; + + /* for exponential increase between tile updates */ + int time_multiplier = 1; + + while (tile.sample < tile.start_sample + tile.num_samples) { + /* to keep track of how long it takes to run a number of samples */ + double start_time = time_dt(); + + /* initial guess to start rolling average */ + const int initial_num_samples = 1; + /* approx number of samples per second */ + int samples_per_second = (avg_time_per_sample > 0.0) ? 
+ int(double(time_multiplier) / avg_time_per_sample) + 1 : + initial_num_samples; + + RenderTile subtile = tile; + subtile.start_sample = tile.sample; + subtile.num_samples = min(samples_per_second, + tile.start_sample + tile.num_samples - tile.sample); + + if (device->have_error()) { + return false; + } + + /* reset state memory here as global size for data_init + * kernel might not be large enough to do in kernel + */ + work_pool_wgs.zero_to_device(); + split_data.zero_to_device(); + ray_state.zero_to_device(); + + if (!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size), + subtile, + num_global_elements, + kgbuffer, + kernel_data, + split_data, + ray_state, + queue_index, + use_queues_flag, + work_pool_wgs)) { + return false; + } + + ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size); + + bool activeRaysAvailable = true; + double cancel_time = DBL_MAX; + + while (activeRaysAvailable) { + /* Do path-iteration in host [Enqueue Path-iteration kernels. */ + for (int PathIter = 0; PathIter < 16; PathIter++) { + ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size); + if (kernel_do_volume) { + ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size); + } + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size); + ENQUEUE_SPLIT_KERNEL( + holdout_emission_blurring_pathtermination_ao, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, 
global_size, local_size); + ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size); + ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size); + + if (task->get_cancel() && cancel_time == DBL_MAX) { + /* Wait up to twice as many seconds for current samples to finish + * to avoid artifacts in render result from ending too soon. + */ + cancel_time = time_dt() + 2.0 * time_multiplier; + } + + if (time_dt() > cancel_time) { + return true; + } + } + + /* Decide if we should exit path-iteration in host. */ + ray_state.copy_from_device(0, global_size[0] * global_size[1], 1); + + activeRaysAvailable = false; + + for (int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) { + if (!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) { + if (IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) { + /* Something went wrong, abort to avoid looping endlessly. */ + device->set_error("Split kernel error: invalid ray state"); + return false; + } + + /* Not all rays are RAY_INACTIVE. 
*/ + activeRaysAvailable = true; + break; + } + } + + if (time_dt() > cancel_time) { + return true; + } + } + + double time_per_sample = ((time_dt() - start_time) / subtile.num_samples); + + if (avg_time_per_sample == 0.0) { + /* start rolling average */ + avg_time_per_sample = time_per_sample; + } + else { + avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample; + } #undef ENQUEUE_SPLIT_KERNEL - tile.sample += subtile.num_samples; - task->update_progress(&tile, tile.w*tile.h*subtile.num_samples); + tile.sample += subtile.num_samples; + task->update_progress(&tile, tile.w * tile.h * subtile.num_samples); - time_multiplier = min(time_multiplier << 1, 10); + time_multiplier = min(time_multiplier << 1, 10); - if(task->get_cancel()) { - return true; - } - } + if (task->get_cancel()) { + return true; + } + } - return true; + return true; } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 622733b843f..c9fb2ac844f 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -27,106 +27,115 @@ CCL_NAMESPACE_BEGIN * Since some bytes may be needed for aligning chunks of memory; * This is the amount of memory that we dedicate for that purpose. 
*/ -#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB +#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB /* Types used for split kernel */ class KernelDimensions { -public: - size_t global_size[2]; - size_t local_size[2]; - - KernelDimensions(size_t global_size_[2], size_t local_size_[2]) - { - memcpy(global_size, global_size_, sizeof(global_size)); - memcpy(local_size, local_size_, sizeof(local_size)); - } + public: + size_t global_size[2]; + size_t local_size[2]; + + KernelDimensions(size_t global_size_[2], size_t local_size_[2]) + { + memcpy(global_size, global_size_, sizeof(global_size)); + memcpy(local_size, local_size_, sizeof(local_size)); + } }; class SplitKernelFunction { -public: - virtual ~SplitKernelFunction() {} + public: + virtual ~SplitKernelFunction() + { + } - /* enqueue the kernel, returns false if there is an error */ - virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) = 0; + /* enqueue the kernel, returns false if there is an error */ + virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) = 0; }; class DeviceSplitKernel { -private: - Device *device; - - SplitKernelFunction *kernel_path_init; - SplitKernelFunction *kernel_scene_intersect; - SplitKernelFunction *kernel_lamp_emission; - SplitKernelFunction *kernel_do_volume; - SplitKernelFunction *kernel_queue_enqueue; - SplitKernelFunction *kernel_indirect_background; - SplitKernelFunction *kernel_shader_setup; - SplitKernelFunction *kernel_shader_sort; - SplitKernelFunction *kernel_shader_eval; - SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; - SplitKernelFunction *kernel_subsurface_scatter; - SplitKernelFunction *kernel_direct_lighting; - SplitKernelFunction *kernel_shadow_blocked_ao; - SplitKernelFunction *kernel_shadow_blocked_dl; - SplitKernelFunction *kernel_enqueue_inactive; - SplitKernelFunction *kernel_next_iteration_setup; - SplitKernelFunction *kernel_indirect_subsurface; - 
SplitKernelFunction *kernel_buffer_update; - - /* Global memory variables [porting]; These memory is used for - * co-operation between different kernels; Data written by one - * kernel will be available to another kernel via this global - * memory. - */ - device_only_memory<uchar> split_data; - device_vector<uchar> ray_state; - device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */ - - /* Flag to make sceneintersect and lampemission kernel use queues. */ - device_only_memory<char> use_queues_flag; - - /* Approximate time it takes to complete one sample */ - double avg_time_per_sample; - - /* Work pool with respect to each work group. */ - device_only_memory<unsigned int> work_pool_wgs; - - /* Cached kernel-dependent data, initialized once. */ - bool kernel_data_initialized; - size_t local_size[2]; - size_t global_size[2]; - -public: - explicit DeviceSplitKernel(Device* device); - virtual ~DeviceSplitKernel(); - - bool load_kernels(const DeviceRequestedFeatures& requested_features); - bool path_trace(DeviceTask *task, - RenderTile& rtile, - device_memory& kgbuffer, - device_memory& kernel_data); - - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) = 0; - size_t max_elements_for_max_buffer_size(device_memory& kg, device_memory& data, uint64_t max_buffer_size); - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data_, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs) = 0; - - virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures&) = 0; - virtual int2 split_kernel_local_size() = 0; - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task) = 0; + 
private: + Device *device; + + SplitKernelFunction *kernel_path_init; + SplitKernelFunction *kernel_scene_intersect; + SplitKernelFunction *kernel_lamp_emission; + SplitKernelFunction *kernel_do_volume; + SplitKernelFunction *kernel_queue_enqueue; + SplitKernelFunction *kernel_indirect_background; + SplitKernelFunction *kernel_shader_setup; + SplitKernelFunction *kernel_shader_sort; + SplitKernelFunction *kernel_shader_eval; + SplitKernelFunction *kernel_holdout_emission_blurring_pathtermination_ao; + SplitKernelFunction *kernel_subsurface_scatter; + SplitKernelFunction *kernel_direct_lighting; + SplitKernelFunction *kernel_shadow_blocked_ao; + SplitKernelFunction *kernel_shadow_blocked_dl; + SplitKernelFunction *kernel_enqueue_inactive; + SplitKernelFunction *kernel_next_iteration_setup; + SplitKernelFunction *kernel_indirect_subsurface; + SplitKernelFunction *kernel_buffer_update; + + /* Global memory variables [porting]; These memory is used for + * co-operation between different kernels; Data written by one + * kernel will be available to another kernel via this global + * memory. + */ + device_only_memory<uchar> split_data; + device_vector<uchar> ray_state; + device_only_memory<int> + queue_index; /* Array of size num_queues that tracks the size of each queue. */ + + /* Flag to make sceneintersect and lampemission kernel use queues. */ + device_only_memory<char> use_queues_flag; + + /* Approximate time it takes to complete one sample */ + double avg_time_per_sample; + + /* Work pool with respect to each work group. */ + device_only_memory<unsigned int> work_pool_wgs; + + /* Cached kernel-dependent data, initialized once. 
*/ + bool kernel_data_initialized; + size_t local_size[2]; + size_t global_size[2]; + + public: + explicit DeviceSplitKernel(Device *device); + virtual ~DeviceSplitKernel(); + + bool load_kernels(const DeviceRequestedFeatures &requested_features); + bool path_trace(DeviceTask *task, + RenderTile &rtile, + device_memory &kgbuffer, + device_memory &kernel_data); + + virtual uint64_t state_buffer_size(device_memory &kg, + device_memory &data, + size_t num_threads) = 0; + size_t max_elements_for_max_buffer_size(device_memory &kg, + device_memory &data, + uint64_t max_buffer_size); + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory &kernel_globals, + device_memory &kernel_data_, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs) = 0; + + virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name, + const DeviceRequestedFeatures &) = 0; + virtual int2 split_kernel_local_size() = 0; + virtual int2 split_kernel_global_size(device_memory &kg, + device_memory &data, + DeviceTask *task) = 0; }; CCL_NAMESPACE_END -#endif /* __DEVICE_SPLIT_KERNEL_H__ */ +#endif /* __DEVICE_SPLIT_KERNEL_H__ */ diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp index 8310863886c..376ad06a734 100644 --- a/intern/cycles/device/device_task.cpp +++ b/intern/cycles/device/device_task.cpp @@ -29,100 +29,111 @@ CCL_NAMESPACE_BEGIN /* Device Task */ DeviceTask::DeviceTask(Type type_) -: type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0), - sample(0), num_samples(1), - shader_input(0), shader_output(0), - shader_eval_type(0), shader_filter(0), shader_x(0), shader_w(0) + : type(type_), + x(0), + y(0), + w(0), + h(0), + rgba_byte(0), + rgba_half(0), + buffer(0), + sample(0), + num_samples(1), + shader_input(0), + shader_output(0), + 
shader_eval_type(0), + shader_filter(0), + shader_x(0), + shader_w(0) { - last_update_time = time_dt(); + last_update_time = time_dt(); } int DeviceTask::get_subtask_count(int num, int max_size) { - if(max_size != 0) { - int max_size_num; - - if(type == SHADER) { - max_size_num = (shader_w + max_size - 1)/max_size; - } - else { - max_size = max(1, max_size/w); - max_size_num = (h + max_size - 1)/max_size; - } - - num = max(max_size_num, num); - } - - if(type == SHADER) { - num = min(shader_w, num); - } - else if(type == RENDER) { - } - else { - num = min(h, num); - } - - return num; + if (max_size != 0) { + int max_size_num; + + if (type == SHADER) { + max_size_num = (shader_w + max_size - 1) / max_size; + } + else { + max_size = max(1, max_size / w); + max_size_num = (h + max_size - 1) / max_size; + } + + num = max(max_size_num, num); + } + + if (type == SHADER) { + num = min(shader_w, num); + } + else if (type == RENDER) { + } + else { + num = min(h, num); + } + + return num; } -void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size) +void DeviceTask::split(list<DeviceTask> &tasks, int num, int max_size) { - num = get_subtask_count(num, max_size); - - if(type == SHADER) { - for(int i = 0; i < num; i++) { - int tx = shader_x + (shader_w/num)*i; - int tw = (i == num-1)? shader_w - i*(shader_w/num): shader_w/num; - - DeviceTask task = *this; - - task.shader_x = tx; - task.shader_w = tw; - - tasks.push_back(task); - } - } - else if(type == RENDER) { - for(int i = 0; i < num; i++) - tasks.push_back(*this); - } - else { - for(int i = 0; i < num; i++) { - int ty = y + (h/num)*i; - int th = (i == num-1)? h - i*(h/num): h/num; - - DeviceTask task = *this; - - task.y = ty; - task.h = th; - - tasks.push_back(task); - } - } + num = get_subtask_count(num, max_size); + + if (type == SHADER) { + for (int i = 0; i < num; i++) { + int tx = shader_x + (shader_w / num) * i; + int tw = (i == num - 1) ? 
shader_w - i * (shader_w / num) : shader_w / num; + + DeviceTask task = *this; + + task.shader_x = tx; + task.shader_w = tw; + + tasks.push_back(task); + } + } + else if (type == RENDER) { + for (int i = 0; i < num; i++) + tasks.push_back(*this); + } + else { + for (int i = 0; i < num; i++) { + int ty = y + (h / num) * i; + int th = (i == num - 1) ? h - i * (h / num) : h / num; + + DeviceTask task = *this; + + task.y = ty; + task.h = th; + + tasks.push_back(task); + } + } } void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples) { - if((type != RENDER) && - (type != SHADER)) - return; - - if(update_progress_sample) { - if(pixel_samples == -1) { - pixel_samples = shader_w; - } - update_progress_sample(pixel_samples, rtile? rtile->sample : 0); - } - - if(update_tile_sample) { - double current_time = time_dt(); - - if(current_time - last_update_time >= 1.0) { - update_tile_sample(*rtile); - - last_update_time = current_time; - } - } + if ((type != RENDER) && (type != SHADER)) + return; + + if (update_progress_sample) { + if (pixel_samples == -1) { + pixel_samples = shader_w; + } + update_progress_sample(pixel_samples, rtile ? rtile->sample : 0); + } + + if (update_tile_sample) { + double current_time = time_dt(); + + if (current_time - last_update_time >= 1.0) { + update_tile_sample(*rtile); + + last_update_time = current_time; + } + } } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index f1fd4246868..5cc2e5e25db 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -33,87 +33,88 @@ class RenderTile; class Tile; class DenoiseParams { -public: - /* Pixel radius for neighbouring pixels to take into account. */ - int radius; - /* Controls neighbor pixel weighting for the denoising filter. */ - float strength; - /* Preserve more or less detail based on feature passes. 
*/ - float feature_strength; - /* When removing pixels that don't carry information, use a relative threshold instead of an absolute one. */ - bool relative_pca; - /* How many frames before and after the current center frame are included. */ - int neighbor_frames; - /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */ - bool clamp_input; - - DenoiseParams() - { - radius = 8; - strength = 0.5f; - feature_strength = 0.5f; - relative_pca = false; - neighbor_frames = 2; - clamp_input = true; - } + public: + /* Pixel radius for neighbouring pixels to take into account. */ + int radius; + /* Controls neighbor pixel weighting for the denoising filter. */ + float strength; + /* Preserve more or less detail based on feature passes. */ + float feature_strength; + /* When removing pixels that don't carry information, use a relative threshold instead of an absolute one. */ + bool relative_pca; + /* How many frames before and after the current center frame are included. */ + int neighbor_frames; + /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. 
*/ + bool clamp_input; + + DenoiseParams() + { + radius = 8; + strength = 0.5f; + feature_strength = 0.5f; + relative_pca = false; + neighbor_frames = 2; + clamp_input = true; + } }; class DeviceTask : public Task { -public: - typedef enum { RENDER, FILM_CONVERT, SHADER } Type; - Type type; - - int x, y, w, h; - device_ptr rgba_byte; - device_ptr rgba_half; - device_ptr buffer; - int sample; - int num_samples; - int offset, stride; - - device_ptr shader_input; - device_ptr shader_output; - int shader_eval_type; - int shader_filter; - int shader_x, shader_w; - - int passes_size; - - explicit DeviceTask(Type type = RENDER); - - int get_subtask_count(int num, int max_size = 0); - void split(list<DeviceTask>& tasks, int num, int max_size = 0); - - void update_progress(RenderTile *rtile, int pixel_samples = -1); - - function<bool(Device *device, RenderTile&)> acquire_tile; - function<void(long, int)> update_progress_sample; - function<void(RenderTile&)> update_tile_sample; - function<void(RenderTile&)> release_tile; - function<bool()> get_cancel; - function<void(RenderTile*, Device*)> map_neighbor_tiles; - function<void(RenderTile*, Device*)> unmap_neighbor_tiles; - - DenoiseParams denoising; - bool denoising_from_render; - vector<int> denoising_frames; - - bool denoising_do_filter; - bool denoising_write_passes; - - int pass_stride; - int frame_stride; - int target_pass_stride; - int pass_denoising_data; - int pass_denoising_clean; - - bool need_finish_queue; - bool integrator_branched; - int2 requested_tile_size; -protected: - double last_update_time; + public: + typedef enum { RENDER, FILM_CONVERT, SHADER } Type; + Type type; + + int x, y, w, h; + device_ptr rgba_byte; + device_ptr rgba_half; + device_ptr buffer; + int sample; + int num_samples; + int offset, stride; + + device_ptr shader_input; + device_ptr shader_output; + int shader_eval_type; + int shader_filter; + int shader_x, shader_w; + + int passes_size; + + explicit DeviceTask(Type type = RENDER); + + int 
get_subtask_count(int num, int max_size = 0); + void split(list<DeviceTask> &tasks, int num, int max_size = 0); + + void update_progress(RenderTile *rtile, int pixel_samples = -1); + + function<bool(Device *device, RenderTile &)> acquire_tile; + function<void(long, int)> update_progress_sample; + function<void(RenderTile &)> update_tile_sample; + function<void(RenderTile &)> release_tile; + function<bool()> get_cancel; + function<void(RenderTile *, Device *)> map_neighbor_tiles; + function<void(RenderTile *, Device *)> unmap_neighbor_tiles; + + DenoiseParams denoising; + bool denoising_from_render; + vector<int> denoising_frames; + + bool denoising_do_filter; + bool denoising_write_passes; + + int pass_stride; + int frame_stride; + int target_pass_stride; + int pass_denoising_data; + int pass_denoising_clean; + + bool need_finish_queue; + bool integrator_branched; + int2 requested_tile_size; + + protected: + double last_update_time; }; CCL_NAMESPACE_END -#endif /* __DEVICE_TASK_H__ */ +#endif /* __DEVICE_TASK_H__ */ diff --git a/intern/cycles/device/opencl/memory_manager.cpp b/intern/cycles/device/opencl/memory_manager.cpp index 9cb105982aa..f85aadce1c2 100644 --- a/intern/cycles/device/opencl/memory_manager.cpp +++ b/intern/cycles/device/opencl/memory_manager.cpp @@ -16,241 +16,246 @@ #ifdef WITH_OPENCL -#include "util/util_foreach.h" +# include "util/util_foreach.h" -#include "device/opencl/opencl.h" -#include "device/opencl/memory_manager.h" +# include "device/opencl/opencl.h" +# include "device/opencl/memory_manager.h" CCL_NAMESPACE_BEGIN -void MemoryManager::DeviceBuffer::add_allocation(Allocation& allocation) +void MemoryManager::DeviceBuffer::add_allocation(Allocation &allocation) { - allocations.push_back(&allocation); + allocations.push_back(&allocation); } void MemoryManager::DeviceBuffer::update_device_memory(OpenCLDevice *device) { - bool need_realloc = false; - - /* Calculate total size and remove any freed. 
*/ - size_t total_size = 0; - - for(int i = allocations.size()-1; i >= 0; i--) { - Allocation* allocation = allocations[i]; - - /* Remove allocations that have been freed. */ - if(!allocation->mem || allocation->mem->memory_size() == 0) { - allocation->device_buffer = NULL; - allocation->size = 0; - - allocations.erase(allocations.begin()+i); - - need_realloc = true; - - continue; - } - - /* Get actual size for allocation. */ - size_t alloc_size = align_up(allocation->mem->memory_size(), 16); - - if(allocation->size != alloc_size) { - /* Allocation is either new or resized. */ - allocation->size = alloc_size; - allocation->needs_copy_to_device = true; - - need_realloc = true; - } - - total_size += alloc_size; - } - - if(need_realloc) { - cl_ulong max_buffer_size; - clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if(total_size > max_buffer_size) { - device->set_error("Scene too complex to fit in available memory."); - return; - } - - device_only_memory<uchar> *new_buffer = - new device_only_memory<uchar>(device, "memory manager buffer"); - - new_buffer->alloc_to_device(total_size); - - size_t offset = 0; - - foreach(Allocation* allocation, allocations) { - if(allocation->needs_copy_to_device) { - /* Copy from host to device. */ - opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(new_buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, NULL, NULL - )); - - allocation->needs_copy_to_device = false; - } - else { - /* Fast copy from memory already on device. 
*/ - opencl_device_assert(device, clEnqueueCopyBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_MEM_PTR(new_buffer->device_pointer), - allocation->desc.offset, - offset, - allocation->mem->memory_size(), - 0, NULL, NULL - )); - } - - allocation->desc.offset = offset; - offset += allocation->size; - } - - delete buffer; - - buffer = new_buffer; - } - else { - assert(total_size == buffer->data_size); - - size_t offset = 0; - - foreach(Allocation* allocation, allocations) { - if(allocation->needs_copy_to_device) { - /* Copy from host to device. */ - opencl_device_assert(device, clEnqueueWriteBuffer(device->cqCommandQueue, - CL_MEM_PTR(buffer->device_pointer), - CL_FALSE, - offset, - allocation->mem->memory_size(), - allocation->mem->host_pointer, - 0, NULL, NULL - )); - - allocation->needs_copy_to_device = false; - } - - offset += allocation->size; - } - } - - /* Not really necessary, but seems to improve responsiveness for some reason. */ - clFinish(device->cqCommandQueue); + bool need_realloc = false; + + /* Calculate total size and remove any freed. */ + size_t total_size = 0; + + for (int i = allocations.size() - 1; i >= 0; i--) { + Allocation *allocation = allocations[i]; + + /* Remove allocations that have been freed. */ + if (!allocation->mem || allocation->mem->memory_size() == 0) { + allocation->device_buffer = NULL; + allocation->size = 0; + + allocations.erase(allocations.begin() + i); + + need_realloc = true; + + continue; + } + + /* Get actual size for allocation. */ + size_t alloc_size = align_up(allocation->mem->memory_size(), 16); + + if (allocation->size != alloc_size) { + /* Allocation is either new or resized. 
*/ + allocation->size = alloc_size; + allocation->needs_copy_to_device = true; + + need_realloc = true; + } + + total_size += alloc_size; + } + + if (need_realloc) { + cl_ulong max_buffer_size; + clGetDeviceInfo( + device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + + if (total_size > max_buffer_size) { + device->set_error("Scene too complex to fit in available memory."); + return; + } + + device_only_memory<uchar> *new_buffer = new device_only_memory<uchar>(device, + "memory manager buffer"); + + new_buffer->alloc_to_device(total_size); + + size_t offset = 0; + + foreach (Allocation *allocation, allocations) { + if (allocation->needs_copy_to_device) { + /* Copy from host to device. */ + opencl_device_assert(device, + clEnqueueWriteBuffer(device->cqCommandQueue, + CL_MEM_PTR(new_buffer->device_pointer), + CL_FALSE, + offset, + allocation->mem->memory_size(), + allocation->mem->host_pointer, + 0, + NULL, + NULL)); + + allocation->needs_copy_to_device = false; + } + else { + /* Fast copy from memory already on device. */ + opencl_device_assert(device, + clEnqueueCopyBuffer(device->cqCommandQueue, + CL_MEM_PTR(buffer->device_pointer), + CL_MEM_PTR(new_buffer->device_pointer), + allocation->desc.offset, + offset, + allocation->mem->memory_size(), + 0, + NULL, + NULL)); + } + + allocation->desc.offset = offset; + offset += allocation->size; + } + + delete buffer; + + buffer = new_buffer; + } + else { + assert(total_size == buffer->data_size); + + size_t offset = 0; + + foreach (Allocation *allocation, allocations) { + if (allocation->needs_copy_to_device) { + /* Copy from host to device. 
*/ + opencl_device_assert(device, + clEnqueueWriteBuffer(device->cqCommandQueue, + CL_MEM_PTR(buffer->device_pointer), + CL_FALSE, + offset, + allocation->mem->memory_size(), + allocation->mem->host_pointer, + 0, + NULL, + NULL)); + + allocation->needs_copy_to_device = false; + } + + offset += allocation->size; + } + } + + /* Not really necessary, but seems to improve responsiveness for some reason. */ + clFinish(device->cqCommandQueue); } void MemoryManager::DeviceBuffer::free(OpenCLDevice *) { - buffer->free(); + buffer->free(); } -MemoryManager::DeviceBuffer* MemoryManager::smallest_device_buffer() +MemoryManager::DeviceBuffer *MemoryManager::smallest_device_buffer() { - DeviceBuffer* smallest = device_buffers; + DeviceBuffer *smallest = device_buffers; - foreach(DeviceBuffer& device_buffer, device_buffers) { - if(device_buffer.size < smallest->size) { - smallest = &device_buffer; - } - } + foreach (DeviceBuffer &device_buffer, device_buffers) { + if (device_buffer.size < smallest->size) { + smallest = &device_buffer; + } + } - return smallest; + return smallest; } -MemoryManager::MemoryManager(OpenCLDevice *device) -: device(device), need_update(false) +MemoryManager::MemoryManager(OpenCLDevice *device) : device(device), need_update(false) { - foreach(DeviceBuffer& device_buffer, device_buffers) { - device_buffer.buffer = - new device_only_memory<uchar>(device, "memory manager buffer"); - } + foreach (DeviceBuffer &device_buffer, device_buffers) { + device_buffer.buffer = new device_only_memory<uchar>(device, "memory manager buffer"); + } } void MemoryManager::free() { - foreach(DeviceBuffer& device_buffer, device_buffers) { - device_buffer.free(device); - } + foreach (DeviceBuffer &device_buffer, device_buffers) { + device_buffer.free(device); + } } -void MemoryManager::alloc(const char *name, device_memory& mem) +void MemoryManager::alloc(const char *name, device_memory &mem) { - Allocation& allocation = allocations[name]; + Allocation &allocation = 
allocations[name]; - allocation.mem = &mem; - allocation.needs_copy_to_device = true; + allocation.mem = &mem; + allocation.needs_copy_to_device = true; - if(!allocation.device_buffer) { - DeviceBuffer* device_buffer = smallest_device_buffer(); - allocation.device_buffer = device_buffer; + if (!allocation.device_buffer) { + DeviceBuffer *device_buffer = smallest_device_buffer(); + allocation.device_buffer = device_buffer; - allocation.desc.device_buffer = device_buffer - device_buffers; + allocation.desc.device_buffer = device_buffer - device_buffers; - device_buffer->add_allocation(allocation); + device_buffer->add_allocation(allocation); - device_buffer->size += mem.memory_size(); - } + device_buffer->size += mem.memory_size(); + } - need_update = true; + need_update = true; } -bool MemoryManager::free(device_memory& mem) +bool MemoryManager::free(device_memory &mem) { - foreach(AllocationsMap::value_type& value, allocations) { - Allocation& allocation = value.second; - if(allocation.mem == &mem) { + foreach (AllocationsMap::value_type &value, allocations) { + Allocation &allocation = value.second; + if (allocation.mem == &mem) { - allocation.device_buffer->size -= mem.memory_size(); + allocation.device_buffer->size -= mem.memory_size(); - allocation.mem = NULL; - allocation.needs_copy_to_device = false; + allocation.mem = NULL; + allocation.needs_copy_to_device = false; - need_update = true; - return true; - } - } + need_update = true; + return true; + } + } - return false; + return false; } MemoryManager::BufferDescriptor MemoryManager::get_descriptor(string name) { - update_device_memory(); + update_device_memory(); - Allocation& allocation = allocations[name]; - return allocation.desc; + Allocation &allocation = allocations[name]; + return allocation.desc; } void MemoryManager::update_device_memory() { - if(!need_update) { - return; - } + if (!need_update) { + return; + } - need_update = false; + need_update = false; - foreach(DeviceBuffer& device_buffer, 
device_buffers) { - device_buffer.update_device_memory(device); - } + foreach (DeviceBuffer &device_buffer, device_buffers) { + device_buffer.update_device_memory(device); + } } void MemoryManager::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) { - update_device_memory(); - - foreach(DeviceBuffer& device_buffer, device_buffers) { - if(device_buffer.buffer->device_pointer) { - device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer); - } - else { - device->kernel_set_args(kernel, (*narg)++, device->null_mem); - } - } + update_device_memory(); + + foreach (DeviceBuffer &device_buffer, device_buffers) { + if (device_buffer.buffer->device_pointer) { + device->kernel_set_args(kernel, (*narg)++, *device_buffer.buffer); + } + else { + device->kernel_set_args(kernel, (*narg)++, device->null_mem); + } + } } CCL_NAMESPACE_END -#endif /* WITH_OPENCL */ +#endif /* WITH_OPENCL */ diff --git a/intern/cycles/device/opencl/memory_manager.h b/intern/cycles/device/opencl/memory_manager.h index 8fcc4440369..2fbc97a0756 100644 --- a/intern/cycles/device/opencl/memory_manager.h +++ b/intern/cycles/device/opencl/memory_manager.h @@ -29,78 +29,77 @@ CCL_NAMESPACE_BEGIN class OpenCLDevice; class MemoryManager { -public: - static const int NUM_DEVICE_BUFFERS = 8; + public: + static const int NUM_DEVICE_BUFFERS = 8; - struct BufferDescriptor { - uint device_buffer; - cl_ulong offset; - }; + struct BufferDescriptor { + uint device_buffer; + cl_ulong offset; + }; -private: - struct DeviceBuffer; + private: + struct DeviceBuffer; - struct Allocation { - device_memory *mem; + struct Allocation { + device_memory *mem; - DeviceBuffer *device_buffer; - size_t size; /* Size of actual allocation, may be larger than requested. */ + DeviceBuffer *device_buffer; + size_t size; /* Size of actual allocation, may be larger than requested. 
*/ - BufferDescriptor desc; + BufferDescriptor desc; - bool needs_copy_to_device; + bool needs_copy_to_device; - Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false) - { - } - }; + Allocation() : mem(NULL), device_buffer(NULL), size(0), needs_copy_to_device(false) + { + } + }; - struct DeviceBuffer { - device_only_memory<uchar> *buffer; - vector<Allocation*> allocations; - size_t size; /* Size of all allocations. */ + struct DeviceBuffer { + device_only_memory<uchar> *buffer; + vector<Allocation *> allocations; + size_t size; /* Size of all allocations. */ - DeviceBuffer() - : buffer(NULL), size(0) - { - } + DeviceBuffer() : buffer(NULL), size(0) + { + } - ~DeviceBuffer() - { - delete buffer; - buffer = NULL; - } + ~DeviceBuffer() + { + delete buffer; + buffer = NULL; + } - void add_allocation(Allocation& allocation); + void add_allocation(Allocation &allocation); - void update_device_memory(OpenCLDevice *device); + void update_device_memory(OpenCLDevice *device); - void free(OpenCLDevice *device); - }; + void free(OpenCLDevice *device); + }; - OpenCLDevice *device; + OpenCLDevice *device; - DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; + DeviceBuffer device_buffers[NUM_DEVICE_BUFFERS]; - typedef unordered_map<string, Allocation> AllocationsMap; - AllocationsMap allocations; + typedef unordered_map<string, Allocation> AllocationsMap; + AllocationsMap allocations; - bool need_update; + bool need_update; - DeviceBuffer* smallest_device_buffer(); + DeviceBuffer *smallest_device_buffer(); -public: - MemoryManager(OpenCLDevice *device); + public: + MemoryManager(OpenCLDevice *device); - void free(); /* Free all memory. */ + void free(); /* Free all memory. 
*/ - void alloc(const char *name, device_memory& mem); - bool free(device_memory& mem); + void alloc(const char *name, device_memory &mem); + bool free(device_memory &mem); - BufferDescriptor get_descriptor(string name); + BufferDescriptor get_descriptor(string name); - void update_device_memory(); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); + void update_device_memory(); + void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index 89761293638..e7bafa0b8a8 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -16,645 +16,641 @@ #ifdef WITH_OPENCL -#include "device/device.h" -#include "device/device_denoising.h" -#include "device/device_split_kernel.h" +# include "device/device.h" +# include "device/device_denoising.h" +# include "device/device_split_kernel.h" -#include "util/util_map.h" -#include "util/util_param.h" -#include "util/util_string.h" +# include "util/util_map.h" +# include "util/util_param.h" +# include "util/util_string.h" -#include "clew.h" +# include "clew.h" -#include "device/opencl/memory_manager.h" +# include "device/opencl/memory_manager.h" CCL_NAMESPACE_BEGIN /* Disable workarounds, seems to be working fine on latest drivers. */ -#define CYCLES_DISABLE_DRIVER_WORKAROUNDS +# define CYCLES_DISABLE_DRIVER_WORKAROUNDS /* Define CYCLES_DISABLE_DRIVER_WORKAROUNDS to disable workaounds for testing */ -#ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS +# ifndef CYCLES_DISABLE_DRIVER_WORKAROUNDS /* Work around AMD driver hangs by ensuring each command is finished before doing anything else. 
*/ -# undef clEnqueueNDRangeKernel -# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); +# undef clEnqueueNDRangeKernel +# define clEnqueueNDRangeKernel(a, b, c, d, e, f, g, h, i) \ + CLEW_GET_FUN(__clewEnqueueNDRangeKernel)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); -# undef clEnqueueWriteBuffer -# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); +# undef clEnqueueWriteBuffer +# define clEnqueueWriteBuffer(a, b, c, d, e, f, g, h, i) \ + CLEW_GET_FUN(__clewEnqueueWriteBuffer)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); -# undef clEnqueueReadBuffer -# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ - CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ - clFinish(a); -#endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ +# undef clEnqueueReadBuffer +# define clEnqueueReadBuffer(a, b, c, d, e, f, g, h, i) \ + CLEW_GET_FUN(__clewEnqueueReadBuffer)(a, b, c, d, e, f, g, h, i); \ + clFinish(a); +# endif /* CYCLES_DISABLE_DRIVER_WORKAROUNDS */ -#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) +# define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p)) struct OpenCLPlatformDevice { - OpenCLPlatformDevice(cl_platform_id platform_id, - const string& platform_name, - cl_device_id device_id, - cl_device_type device_type, - const string& device_name, - const string& hardware_id, - const string& device_extensions) - : platform_id(platform_id), - platform_name(platform_name), - device_id(device_id), - device_type(device_type), - device_name(device_name), - hardware_id(hardware_id), - device_extensions(device_extensions) {} - cl_platform_id platform_id; - string platform_name; - cl_device_id device_id; - cl_device_type device_type; - string device_name; - string hardware_id; - string device_extensions; + OpenCLPlatformDevice(cl_platform_id platform_id, + const string &platform_name, + 
cl_device_id device_id, + cl_device_type device_type, + const string &device_name, + const string &hardware_id, + const string &device_extensions) + : platform_id(platform_id), + platform_name(platform_name), + device_id(device_id), + device_type(device_type), + device_name(device_name), + hardware_id(hardware_id), + device_extensions(device_extensions) + { + } + cl_platform_id platform_id; + string platform_name; + cl_device_id device_id; + cl_device_type device_type; + string device_name; + string hardware_id; + string device_extensions; }; /* Contains all static OpenCL helper functions. */ -class OpenCLInfo -{ -public: - static cl_device_type device_type(); - static bool use_debug(); - static bool device_supported(const string& platform_name, - const cl_device_id device_id); - static bool platform_version_check(cl_platform_id platform, - string *error = NULL); - static bool device_version_check(cl_device_id device, - string *error = NULL); - static string get_hardware_id(const string& platform_name, - cl_device_id device_id); - static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, - bool force_all = false); - - /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */ - - /* Platform information. 
*/ - static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL); - static cl_uint get_num_platforms(); - - static bool get_platforms(vector<cl_platform_id> *platform_ids, - cl_int *error = NULL); - static vector<cl_platform_id> get_platforms(); - - static bool get_platform_name(cl_platform_id platform_id, - string *platform_name); - static string get_platform_name(cl_platform_id platform_id); - - static bool get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - cl_uint *num_devices, - cl_int *error = NULL); - static cl_uint get_num_platform_devices(cl_platform_id platform_id, - cl_device_type device_type); - - static bool get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type, - vector<cl_device_id> *device_ids, - cl_int* error = NULL); - static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id, - cl_device_type device_type); - - /* Device information. */ - static bool get_device_name(cl_device_id device_id, - string *device_name, - cl_int* error = NULL); - - static string get_device_name(cl_device_id device_id); - - static bool get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int* error = NULL); - - static string get_device_extensions(cl_device_id device_id); - - static bool get_device_type(cl_device_id device_id, - cl_device_type *device_type, - cl_int* error = NULL); - static cl_device_type get_device_type(cl_device_id device_id); - - static bool get_driver_version(cl_device_id device_id, - int *major, - int *minor, - cl_int* error = NULL); - - static int mem_sub_ptr_alignment(cl_device_id device_id); - - /* Get somewhat more readable device name. - * Main difference is AMD OpenCL here which only gives code name - * for the regular device name. This will give more sane device - * name using some extensions. 
- */ - static string get_readable_device_name(cl_device_id device_id); +class OpenCLInfo { + public: + static cl_device_type device_type(); + static bool use_debug(); + static bool device_supported(const string &platform_name, const cl_device_id device_id); + static bool platform_version_check(cl_platform_id platform, string *error = NULL); + static bool device_version_check(cl_device_id device, string *error = NULL); + static string get_hardware_id(const string &platform_name, cl_device_id device_id); + static void get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, + bool force_all = false); + + /* ** Some handy shortcuts to low level cl*GetInfo() functions. ** */ + + /* Platform information. */ + static bool get_num_platforms(cl_uint *num_platforms, cl_int *error = NULL); + static cl_uint get_num_platforms(); + + static bool get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error = NULL); + static vector<cl_platform_id> get_platforms(); + + static bool get_platform_name(cl_platform_id platform_id, string *platform_name); + static string get_platform_name(cl_platform_id platform_id); + + static bool get_num_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + cl_uint *num_devices, + cl_int *error = NULL); + static cl_uint get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type); + + static bool get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type, + vector<cl_device_id> *device_ids, + cl_int *error = NULL); + static vector<cl_device_id> get_platform_devices(cl_platform_id platform_id, + cl_device_type device_type); + + /* Device information. 
*/ + static bool get_device_name(cl_device_id device_id, string *device_name, cl_int *error = NULL); + + static string get_device_name(cl_device_id device_id); + + static bool get_device_extensions(cl_device_id device_id, + string *device_extensions, + cl_int *error = NULL); + + static string get_device_extensions(cl_device_id device_id); + + static bool get_device_type(cl_device_id device_id, + cl_device_type *device_type, + cl_int *error = NULL); + static cl_device_type get_device_type(cl_device_id device_id); + + static bool get_driver_version(cl_device_id device_id, + int *major, + int *minor, + cl_int *error = NULL); + + static int mem_sub_ptr_alignment(cl_device_id device_id); + + /* Get somewhat more readable device name. + * Main difference is AMD OpenCL here which only gives code name + * for the regular device name. This will give more sane device + * name using some extensions. + */ + static string get_readable_device_name(cl_device_id device_id); }; /* Thread safe cache for contexts and programs. */ -class OpenCLCache -{ - struct Slot - { - struct ProgramEntry - { - ProgramEntry(); - ProgramEntry(const ProgramEntry& rhs); - ~ProgramEntry(); - cl_program program; - thread_mutex *mutex; - }; - - Slot(); - Slot(const Slot& rhs); - ~Slot(); - - thread_mutex *context_mutex; - cl_context context; - typedef map<ustring, ProgramEntry> EntryMap; - EntryMap programs; - - }; - - /* key is combination of platform ID and device ID */ - typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair; - - /* map of Slot objects */ - typedef map<PlatformDevicePair, Slot> CacheMap; - CacheMap cache; - - /* MD5 hash of the kernel source. */ - string kernel_md5; - - thread_mutex cache_lock; - thread_mutex kernel_md5_lock; - - /* lazy instantiate */ - static OpenCLCache& global_instance(); - -public: - - enum ProgramName { - OCL_DEV_BASE_PROGRAM, - OCL_DEV_MEGAKERNEL_PROGRAM, - }; - - /* Lookup context in the cache. 
If this returns NULL, slot_locker - * will be holding a lock for the cache. slot_locker should refer to a - * default constructed thread_scoped_lock. */ - static cl_context get_context(cl_platform_id platform, - cl_device_id device, - thread_scoped_lock& slot_locker); - /* Same as above. */ - static cl_program get_program(cl_platform_id platform, - cl_device_id device, - ustring key, - thread_scoped_lock& slot_locker); - - /* Store context in the cache. You MUST have tried to get the item before storing to it. */ - static void store_context(cl_platform_id platform, - cl_device_id device, - cl_context context, - thread_scoped_lock& slot_locker); - /* Same as above. */ - static void store_program(cl_platform_id platform, - cl_device_id device, - cl_program program, - ustring key, - thread_scoped_lock& slot_locker); - - static string get_kernel_md5(); +class OpenCLCache { + struct Slot { + struct ProgramEntry { + ProgramEntry(); + ProgramEntry(const ProgramEntry &rhs); + ~ProgramEntry(); + cl_program program; + thread_mutex *mutex; + }; + + Slot(); + Slot(const Slot &rhs); + ~Slot(); + + thread_mutex *context_mutex; + cl_context context; + typedef map<ustring, ProgramEntry> EntryMap; + EntryMap programs; + }; + + /* key is combination of platform ID and device ID */ + typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair; + + /* map of Slot objects */ + typedef map<PlatformDevicePair, Slot> CacheMap; + CacheMap cache; + + /* MD5 hash of the kernel source. */ + string kernel_md5; + + thread_mutex cache_lock; + thread_mutex kernel_md5_lock; + + /* lazy instantiate */ + static OpenCLCache &global_instance(); + + public: + enum ProgramName { + OCL_DEV_BASE_PROGRAM, + OCL_DEV_MEGAKERNEL_PROGRAM, + }; + + /* Lookup context in the cache. If this returns NULL, slot_locker + * will be holding a lock for the cache. slot_locker should refer to a + * default constructed thread_scoped_lock. 
*/ + static cl_context get_context(cl_platform_id platform, + cl_device_id device, + thread_scoped_lock &slot_locker); + /* Same as above. */ + static cl_program get_program(cl_platform_id platform, + cl_device_id device, + ustring key, + thread_scoped_lock &slot_locker); + + /* Store context in the cache. You MUST have tried to get the item before storing to it. */ + static void store_context(cl_platform_id platform, + cl_device_id device, + cl_context context, + thread_scoped_lock &slot_locker); + /* Same as above. */ + static void store_program(cl_platform_id platform, + cl_device_id device, + cl_program program, + ustring key, + thread_scoped_lock &slot_locker); + + static string get_kernel_md5(); }; -#define opencl_device_assert(device, stmt) \ - { \ - cl_int err = stmt; \ - \ - if(err != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if((device)->error_message() == "") \ - (device)->set_error(message); \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } (void) 0 - -#define opencl_assert(stmt) \ - { \ - cl_int err = stmt; \ - \ - if(err != CL_SUCCESS) { \ - string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ - if(error_msg == "") \ - error_msg = message; \ - fprintf(stderr, "%s\n", message.c_str()); \ - } \ - } (void) 0 - -class OpenCLDevice : public Device -{ -public: - DedicatedTaskPool task_pool; - - /* Task pool for required kernels (base, AO kernels during foreground rendering) */ - TaskPool load_required_kernel_task_pool; - /* Task pool for optional kernels (feature kernels during foreground rendering) */ - TaskPool load_kernel_task_pool; - cl_context cxContext; - cl_command_queue cqCommandQueue; - cl_platform_id cpPlatform; - cl_device_id cdDevice; - cl_int ciErr; - int device_num; - bool use_preview_kernels; - - class OpenCLProgram { - public: - OpenCLProgram() : loaded(false), 
needs_compiling(true), program(NULL), device(NULL) {} - OpenCLProgram(OpenCLDevice *device, - const string& program_name, - const string& kernel_name, - const string& kernel_build_options, - bool use_stdout = true); - ~OpenCLProgram(); - - void add_kernel(ustring name); - - /* Try to load the program from device cache or disk */ - bool load(); - /* Compile the kernel (first separate, failback to local) */ - void compile(); - /* Create the OpenCL kernels after loading or compiling */ - void create_kernels(); - - bool is_loaded() const { return loaded; } - const string& get_log() const { return log; } - void report_error(); - - /* Wait until this kernel is available to be used - * It will return true when the kernel is available. - * It will return false when the kernel is not available - * or could not be loaded. */ - bool wait_for_availability(); - - cl_kernel operator()(); - cl_kernel operator()(ustring name); - - void release(); - - private: - bool build_kernel(const string *debug_src); - /* Build the program by calling the own process. - * This is required for multithreaded OpenCL compilation, since most Frameworks serialize - * build calls internally if they come from the same process. - * If that is not supported, this function just returns false. - */ - bool compile_separate(const string& clbin); - /* Build the program by calling OpenCL directly. */ - bool compile_kernel(const string *debug_src); - /* Loading and saving the program from/to disk. */ - bool load_binary(const string& clbin, const string *debug_src = NULL); - bool save_binary(const string& clbin); - - void add_log(const string& msg, bool is_debug); - void add_error(const string& msg); - - bool loaded; - bool needs_compiling; - - cl_program program; - OpenCLDevice *device; - - /* Used for the OpenCLCache key. 
*/ - string program_name; - - string kernel_file, kernel_build_options, device_md5; - - bool use_stdout; - string log, error_msg; - string compile_output; - - map<ustring, cl_kernel> kernels; - }; - - /* Container for all types of split programs. */ - class OpenCLSplitPrograms { - public: - OpenCLDevice *device; - OpenCLProgram program_split; - OpenCLProgram program_lamp_emission; - OpenCLProgram program_do_volume; - OpenCLProgram program_indirect_background; - OpenCLProgram program_shader_eval; - OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; - OpenCLProgram program_subsurface_scatter; - OpenCLProgram program_direct_lighting; - OpenCLProgram program_shadow_blocked_ao; - OpenCLProgram program_shadow_blocked_dl; - - OpenCLSplitPrograms(OpenCLDevice *device); - ~OpenCLSplitPrograms(); - - /* Load the kernels and put the created kernels in the given `programs` - * paramter. */ - void load_kernels(vector<OpenCLProgram*> &programs, - const DeviceRequestedFeatures& requested_features, - bool is_preview=false); - }; - - DeviceSplitKernel *split_kernel; - - OpenCLProgram base_program; - OpenCLProgram bake_program; - OpenCLProgram displace_program; - OpenCLProgram background_program; - OpenCLProgram denoising_program; - - OpenCLSplitPrograms kernel_programs; - OpenCLSplitPrograms preview_programs; - - typedef map<string, device_vector<uchar>*> ConstMemMap; - typedef map<string, device_ptr> MemMap; - - ConstMemMap const_mem_map; - MemMap mem_map; - device_ptr null_mem; - - bool device_initialized; - string platform_name; - string device_name; - - bool opencl_error(cl_int err); - void opencl_error(const string& message); - void opencl_assert_err(cl_int err, const char* where); - - OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background); - ~OpenCLDevice(); - - static void CL_CALLBACK context_notify_callback(const char *err_info, - const void * /*private_info*/, size_t /*cb*/, void *user_data); - - bool opencl_version_check(); - 
OpenCLSplitPrograms* get_split_programs(); - - string device_md5_hash(string kernel_custom_build_options = ""); - bool load_kernels(const DeviceRequestedFeatures& requested_features); - void load_required_kernels(const DeviceRequestedFeatures& requested_features); - void load_preview_kernels(); - - bool wait_for_availability(const DeviceRequestedFeatures& requested_features); - DeviceKernelStatus get_active_kernel_switch_state(); - - /* Get the name of the opencl program for the given kernel */ - const string get_opencl_program_name(const string& kernel_name); - /* Get the program file name to compile (*.cl) for the given kernel */ - const string get_opencl_program_filename(const string& kernel_name); - string get_build_options(const DeviceRequestedFeatures& requested_features, - const string& opencl_program_name, - bool preview_kernel=false); - /* Enable the default features to reduce recompilation events */ - void enable_default_features(DeviceRequestedFeatures& features); - - void mem_alloc(device_memory& mem); - void mem_copy_to(device_memory& mem); - void mem_copy_from(device_memory& mem, int y, int w, int h, int elem); - void mem_zero(device_memory& mem); - void mem_free(device_memory& mem); - - int mem_sub_ptr_alignment(); - - void const_copy_to(const char *name, void *host, size_t size); - void tex_alloc(device_memory& mem); - void tex_free(device_memory& mem); - - size_t global_size_round_up(int group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, - bool x_workgroups = false, - size_t max_workgroup_size = -1); - void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); - void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); - - void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half); - void shader(DeviceTask& task); - - void denoise(RenderTile& tile, DenoisingTask& denoising); - - class OpenCLDeviceTask : public DeviceTask { - public: - 
OpenCLDeviceTask(OpenCLDevice *device, DeviceTask& task) - : DeviceTask(task) - { - run = function_bind(&OpenCLDevice::thread_run, - device, - this); - } - }; - - int get_split_task_count(DeviceTask& /*task*/) - { - return 1; - } - - void task_add(DeviceTask& task) - { - task_pool.push(new OpenCLDeviceTask(this, task)); - } - - void task_wait() - { - task_pool.wait(); - } - - void task_cancel() - { - task_pool.cancel(); - } - - void thread_run(DeviceTask *task); - - virtual BVHLayoutMask get_bvh_layout_mask() const { - return BVH_LAYOUT_BVH2; - } - - virtual bool show_samples() const { - return true; - } - - -protected: - string kernel_build_options(const string *debug_src = NULL); - - void mem_zero_kernel(device_ptr ptr, size_t size); - - bool denoising_non_local_means(device_ptr image_ptr, - device_ptr guide_ptr, - device_ptr variance_ptr, - device_ptr out_ptr, - DenoisingTask *task); - bool denoising_construct_transform(DenoisingTask *task); - bool denoising_accumulate(device_ptr color_ptr, - device_ptr color_variance_ptr, - device_ptr scale_ptr, - int frame, - DenoisingTask *task); - bool denoising_solve(device_ptr output_ptr, - DenoisingTask *task); - bool denoising_combine_halves(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr mean_ptr, - device_ptr variance_ptr, - int r, int4 rect, - DenoisingTask *task); - bool denoising_divide_shadow(device_ptr a_ptr, - device_ptr b_ptr, - device_ptr sample_variance_ptr, - device_ptr sv_variance_ptr, - device_ptr buffer_variance_ptr, - DenoisingTask *task); - bool denoising_get_feature(int mean_offset, - int variance_offset, - device_ptr mean_ptr, - device_ptr variance_ptr, - float scale, - DenoisingTask *task); - bool denoising_write_feature(int to_offset, - device_ptr from_ptr, - device_ptr buffer_ptr, - DenoisingTask *task); - bool denoising_detect_outliers(device_ptr image_ptr, - device_ptr variance_ptr, - device_ptr depth_ptr, - device_ptr output_ptr, - DenoisingTask *task); - - device_ptr 
mem_alloc_sub_ptr(device_memory& mem, int offset, int size); - void mem_free_sub_ptr(device_ptr ptr); - - class ArgumentWrapper { - public: - ArgumentWrapper() : size(0), pointer(NULL) - { - } - - ArgumentWrapper(device_memory& argument) : size(sizeof(void*)), - pointer((void*)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_vector<T>& argument) : size(sizeof(void*)), - pointer((void*)(&argument.device_pointer)) - { - } - - template<typename T> - ArgumentWrapper(device_only_memory<T>& argument) : size(sizeof(void*)), - pointer((void*)(&argument.device_pointer)) - { - } - template<typename T> - ArgumentWrapper(T& argument) : size(sizeof(argument)), - pointer(&argument) - { - } - - ArgumentWrapper(int argument) : size(sizeof(int)), - int_value(argument), - pointer(&int_value) - { - } - - ArgumentWrapper(float argument) : size(sizeof(float)), - float_value(argument), - pointer(&float_value) - { - } - - size_t size; - int int_value; - float float_value; - void *pointer; - }; - - /* TODO(sergey): In the future we can use variadic templates, once - * C++0x is allowed. Should allow to clean this up a bit. 
- */ - int kernel_set_args(cl_kernel kernel, - int start_argument_index, - const ArgumentWrapper& arg1 = ArgumentWrapper(), - const ArgumentWrapper& arg2 = ArgumentWrapper(), - const ArgumentWrapper& arg3 = ArgumentWrapper(), - const ArgumentWrapper& arg4 = ArgumentWrapper(), - const ArgumentWrapper& arg5 = ArgumentWrapper(), - const ArgumentWrapper& arg6 = ArgumentWrapper(), - const ArgumentWrapper& arg7 = ArgumentWrapper(), - const ArgumentWrapper& arg8 = ArgumentWrapper(), - const ArgumentWrapper& arg9 = ArgumentWrapper(), - const ArgumentWrapper& arg10 = ArgumentWrapper(), - const ArgumentWrapper& arg11 = ArgumentWrapper(), - const ArgumentWrapper& arg12 = ArgumentWrapper(), - const ArgumentWrapper& arg13 = ArgumentWrapper(), - const ArgumentWrapper& arg14 = ArgumentWrapper(), - const ArgumentWrapper& arg15 = ArgumentWrapper(), - const ArgumentWrapper& arg16 = ArgumentWrapper(), - const ArgumentWrapper& arg17 = ArgumentWrapper(), - const ArgumentWrapper& arg18 = ArgumentWrapper(), - const ArgumentWrapper& arg19 = ArgumentWrapper(), - const ArgumentWrapper& arg20 = ArgumentWrapper(), - const ArgumentWrapper& arg21 = ArgumentWrapper(), - const ArgumentWrapper& arg22 = ArgumentWrapper(), - const ArgumentWrapper& arg23 = ArgumentWrapper(), - const ArgumentWrapper& arg24 = ArgumentWrapper(), - const ArgumentWrapper& arg25 = ArgumentWrapper(), - const ArgumentWrapper& arg26 = ArgumentWrapper(), - const ArgumentWrapper& arg27 = ArgumentWrapper(), - const ArgumentWrapper& arg28 = ArgumentWrapper(), - const ArgumentWrapper& arg29 = ArgumentWrapper(), - const ArgumentWrapper& arg30 = ArgumentWrapper(), - const ArgumentWrapper& arg31 = ArgumentWrapper(), - const ArgumentWrapper& arg32 = ArgumentWrapper(), - const ArgumentWrapper& arg33 = ArgumentWrapper()); - - void release_kernel_safe(cl_kernel kernel); - void release_mem_object_safe(cl_mem mem); - void release_program_safe(cl_program program); - - /* ** Those guys are for workign around some compiler-specific bugs ** */ 
- - cl_program load_cached_kernel( - ustring key, - thread_scoped_lock& cache_locker); - - void store_cached_kernel( - cl_program program, - ustring key, - thread_scoped_lock& cache_locker); - -private: - MemoryManager memory_manager; - friend class MemoryManager; - - static_assert_align(TextureInfo, 16); - device_vector<TextureInfo> texture_info; - - typedef map<string, device_memory*> TexturesMap; - TexturesMap textures; - - bool textures_need_update; - -protected: - void flush_texture_buffers(); - - friend class OpenCLSplitKernel; - friend class OpenCLSplitKernelFunction; +# define opencl_device_assert(device, stmt) \ + { \ + cl_int err = stmt; \ +\ + if (err != CL_SUCCESS) { \ + string message = string_printf( \ + "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ + if ((device)->error_message() == "") \ + (device)->set_error(message); \ + fprintf(stderr, "%s\n", message.c_str()); \ + } \ + } \ + (void)0 + +# define opencl_assert(stmt) \ + { \ + cl_int err = stmt; \ +\ + if (err != CL_SUCCESS) { \ + string message = string_printf( \ + "OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \ + if (error_msg == "") \ + error_msg = message; \ + fprintf(stderr, "%s\n", message.c_str()); \ + } \ + } \ + (void)0 + +class OpenCLDevice : public Device { + public: + DedicatedTaskPool task_pool; + + /* Task pool for required kernels (base, AO kernels during foreground rendering) */ + TaskPool load_required_kernel_task_pool; + /* Task pool for optional kernels (feature kernels during foreground rendering) */ + TaskPool load_kernel_task_pool; + cl_context cxContext; + cl_command_queue cqCommandQueue; + cl_platform_id cpPlatform; + cl_device_id cdDevice; + cl_int ciErr; + int device_num; + bool use_preview_kernels; + + class OpenCLProgram { + public: + OpenCLProgram() : loaded(false), needs_compiling(true), program(NULL), device(NULL) + { + } + OpenCLProgram(OpenCLDevice *device, + const string &program_name, + 
const string &kernel_name, + const string &kernel_build_options, + bool use_stdout = true); + ~OpenCLProgram(); + + void add_kernel(ustring name); + + /* Try to load the program from device cache or disk */ + bool load(); + /* Compile the kernel (first separate, failback to local) */ + void compile(); + /* Create the OpenCL kernels after loading or compiling */ + void create_kernels(); + + bool is_loaded() const + { + return loaded; + } + const string &get_log() const + { + return log; + } + void report_error(); + + /* Wait until this kernel is available to be used + * It will return true when the kernel is available. + * It will return false when the kernel is not available + * or could not be loaded. */ + bool wait_for_availability(); + + cl_kernel operator()(); + cl_kernel operator()(ustring name); + + void release(); + + private: + bool build_kernel(const string *debug_src); + /* Build the program by calling the own process. + * This is required for multithreaded OpenCL compilation, since most Frameworks serialize + * build calls internally if they come from the same process. + * If that is not supported, this function just returns false. + */ + bool compile_separate(const string &clbin); + /* Build the program by calling OpenCL directly. */ + bool compile_kernel(const string *debug_src); + /* Loading and saving the program from/to disk. */ + bool load_binary(const string &clbin, const string *debug_src = NULL); + bool save_binary(const string &clbin); + + void add_log(const string &msg, bool is_debug); + void add_error(const string &msg); + + bool loaded; + bool needs_compiling; + + cl_program program; + OpenCLDevice *device; + + /* Used for the OpenCLCache key. */ + string program_name; + + string kernel_file, kernel_build_options, device_md5; + + bool use_stdout; + string log, error_msg; + string compile_output; + + map<ustring, cl_kernel> kernels; + }; + + /* Container for all types of split programs. 
*/ + class OpenCLSplitPrograms { + public: + OpenCLDevice *device; + OpenCLProgram program_split; + OpenCLProgram program_lamp_emission; + OpenCLProgram program_do_volume; + OpenCLProgram program_indirect_background; + OpenCLProgram program_shader_eval; + OpenCLProgram program_holdout_emission_blurring_pathtermination_ao; + OpenCLProgram program_subsurface_scatter; + OpenCLProgram program_direct_lighting; + OpenCLProgram program_shadow_blocked_ao; + OpenCLProgram program_shadow_blocked_dl; + + OpenCLSplitPrograms(OpenCLDevice *device); + ~OpenCLSplitPrograms(); + + /* Load the kernels and put the created kernels in the given `programs` + * paramter. */ + void load_kernels(vector<OpenCLProgram *> &programs, + const DeviceRequestedFeatures &requested_features, + bool is_preview = false); + }; + + DeviceSplitKernel *split_kernel; + + OpenCLProgram base_program; + OpenCLProgram bake_program; + OpenCLProgram displace_program; + OpenCLProgram background_program; + OpenCLProgram denoising_program; + + OpenCLSplitPrograms kernel_programs; + OpenCLSplitPrograms preview_programs; + + typedef map<string, device_vector<uchar> *> ConstMemMap; + typedef map<string, device_ptr> MemMap; + + ConstMemMap const_mem_map; + MemMap mem_map; + device_ptr null_mem; + + bool device_initialized; + string platform_name; + string device_name; + + bool opencl_error(cl_int err); + void opencl_error(const string &message); + void opencl_assert_err(cl_int err, const char *where); + + OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background); + ~OpenCLDevice(); + + static void CL_CALLBACK context_notify_callback(const char *err_info, + const void * /*private_info*/, + size_t /*cb*/, + void *user_data); + + bool opencl_version_check(); + OpenCLSplitPrograms *get_split_programs(); + + string device_md5_hash(string kernel_custom_build_options = ""); + bool load_kernels(const DeviceRequestedFeatures &requested_features); + void load_required_kernels(const 
DeviceRequestedFeatures &requested_features); + void load_preview_kernels(); + + bool wait_for_availability(const DeviceRequestedFeatures &requested_features); + DeviceKernelStatus get_active_kernel_switch_state(); + + /* Get the name of the opencl program for the given kernel */ + const string get_opencl_program_name(const string &kernel_name); + /* Get the program file name to compile (*.cl) for the given kernel */ + const string get_opencl_program_filename(const string &kernel_name); + string get_build_options(const DeviceRequestedFeatures &requested_features, + const string &opencl_program_name, + bool preview_kernel = false); + /* Enable the default features to reduce recompilation events */ + void enable_default_features(DeviceRequestedFeatures &features); + + void mem_alloc(device_memory &mem); + void mem_copy_to(device_memory &mem); + void mem_copy_from(device_memory &mem, int y, int w, int h, int elem); + void mem_zero(device_memory &mem); + void mem_free(device_memory &mem); + + int mem_sub_ptr_alignment(); + + void const_copy_to(const char *name, void *host, size_t size); + void tex_alloc(device_memory &mem); + void tex_free(device_memory &mem); + + size_t global_size_round_up(int group_size, int global_size); + void enqueue_kernel(cl_kernel kernel, + size_t w, + size_t h, + bool x_workgroups = false, + size_t max_workgroup_size = -1); + void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); + void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); + + void film_convert(DeviceTask &task, + device_ptr buffer, + device_ptr rgba_byte, + device_ptr rgba_half); + void shader(DeviceTask &task); + + void denoise(RenderTile &tile, DenoisingTask &denoising); + + class OpenCLDeviceTask : public DeviceTask { + public: + OpenCLDeviceTask(OpenCLDevice *device, DeviceTask &task) : DeviceTask(task) + { + run = function_bind(&OpenCLDevice::thread_run, device, this); + } + }; + + int get_split_task_count(DeviceTask & /*task*/) + { + return 1; + 
} + + void task_add(DeviceTask &task) + { + task_pool.push(new OpenCLDeviceTask(this, task)); + } + + void task_wait() + { + task_pool.wait(); + } + + void task_cancel() + { + task_pool.cancel(); + } + + void thread_run(DeviceTask *task); + + virtual BVHLayoutMask get_bvh_layout_mask() const + { + return BVH_LAYOUT_BVH2; + } + + virtual bool show_samples() const + { + return true; + } + + protected: + string kernel_build_options(const string *debug_src = NULL); + + void mem_zero_kernel(device_ptr ptr, size_t size); + + bool denoising_non_local_means(device_ptr image_ptr, + device_ptr guide_ptr, + device_ptr variance_ptr, + device_ptr out_ptr, + DenoisingTask *task); + bool denoising_construct_transform(DenoisingTask *task); + bool denoising_accumulate(device_ptr color_ptr, + device_ptr color_variance_ptr, + device_ptr scale_ptr, + int frame, + DenoisingTask *task); + bool denoising_solve(device_ptr output_ptr, DenoisingTask *task); + bool denoising_combine_halves(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr mean_ptr, + device_ptr variance_ptr, + int r, + int4 rect, + DenoisingTask *task); + bool denoising_divide_shadow(device_ptr a_ptr, + device_ptr b_ptr, + device_ptr sample_variance_ptr, + device_ptr sv_variance_ptr, + device_ptr buffer_variance_ptr, + DenoisingTask *task); + bool denoising_get_feature(int mean_offset, + int variance_offset, + device_ptr mean_ptr, + device_ptr variance_ptr, + float scale, + DenoisingTask *task); + bool denoising_write_feature(int to_offset, + device_ptr from_ptr, + device_ptr buffer_ptr, + DenoisingTask *task); + bool denoising_detect_outliers(device_ptr image_ptr, + device_ptr variance_ptr, + device_ptr depth_ptr, + device_ptr output_ptr, + DenoisingTask *task); + + device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int size); + void mem_free_sub_ptr(device_ptr ptr); + + class ArgumentWrapper { + public: + ArgumentWrapper() : size(0), pointer(NULL) + { + } + + ArgumentWrapper(device_memory &argument) + : 
size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) + { + } + + template<typename T> + ArgumentWrapper(device_vector<T> &argument) + : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) + { + } + + template<typename T> + ArgumentWrapper(device_only_memory<T> &argument) + : size(sizeof(void *)), pointer((void *)(&argument.device_pointer)) + { + } + template<typename T> ArgumentWrapper(T &argument) : size(sizeof(argument)), pointer(&argument) + { + } + + ArgumentWrapper(int argument) : size(sizeof(int)), int_value(argument), pointer(&int_value) + { + } + + ArgumentWrapper(float argument) + : size(sizeof(float)), float_value(argument), pointer(&float_value) + { + } + + size_t size; + int int_value; + float float_value; + void *pointer; + }; + + /* TODO(sergey): In the future we can use variadic templates, once + * C++0x is allowed. Should allow to clean this up a bit. + */ + int kernel_set_args(cl_kernel kernel, + int start_argument_index, + const ArgumentWrapper &arg1 = ArgumentWrapper(), + const ArgumentWrapper &arg2 = ArgumentWrapper(), + const ArgumentWrapper &arg3 = ArgumentWrapper(), + const ArgumentWrapper &arg4 = ArgumentWrapper(), + const ArgumentWrapper &arg5 = ArgumentWrapper(), + const ArgumentWrapper &arg6 = ArgumentWrapper(), + const ArgumentWrapper &arg7 = ArgumentWrapper(), + const ArgumentWrapper &arg8 = ArgumentWrapper(), + const ArgumentWrapper &arg9 = ArgumentWrapper(), + const ArgumentWrapper &arg10 = ArgumentWrapper(), + const ArgumentWrapper &arg11 = ArgumentWrapper(), + const ArgumentWrapper &arg12 = ArgumentWrapper(), + const ArgumentWrapper &arg13 = ArgumentWrapper(), + const ArgumentWrapper &arg14 = ArgumentWrapper(), + const ArgumentWrapper &arg15 = ArgumentWrapper(), + const ArgumentWrapper &arg16 = ArgumentWrapper(), + const ArgumentWrapper &arg17 = ArgumentWrapper(), + const ArgumentWrapper &arg18 = ArgumentWrapper(), + const ArgumentWrapper &arg19 = ArgumentWrapper(), + const ArgumentWrapper &arg20 = 
ArgumentWrapper(), + const ArgumentWrapper &arg21 = ArgumentWrapper(), + const ArgumentWrapper &arg22 = ArgumentWrapper(), + const ArgumentWrapper &arg23 = ArgumentWrapper(), + const ArgumentWrapper &arg24 = ArgumentWrapper(), + const ArgumentWrapper &arg25 = ArgumentWrapper(), + const ArgumentWrapper &arg26 = ArgumentWrapper(), + const ArgumentWrapper &arg27 = ArgumentWrapper(), + const ArgumentWrapper &arg28 = ArgumentWrapper(), + const ArgumentWrapper &arg29 = ArgumentWrapper(), + const ArgumentWrapper &arg30 = ArgumentWrapper(), + const ArgumentWrapper &arg31 = ArgumentWrapper(), + const ArgumentWrapper &arg32 = ArgumentWrapper(), + const ArgumentWrapper &arg33 = ArgumentWrapper()); + + void release_kernel_safe(cl_kernel kernel); + void release_mem_object_safe(cl_mem mem); + void release_program_safe(cl_program program); + + /* ** Those guys are for workign around some compiler-specific bugs ** */ + + cl_program load_cached_kernel(ustring key, thread_scoped_lock &cache_locker); + + void store_cached_kernel(cl_program program, ustring key, thread_scoped_lock &cache_locker); + + private: + MemoryManager memory_manager; + friend class MemoryManager; + + static_assert_align(TextureInfo, 16); + device_vector<TextureInfo> texture_info; + + typedef map<string, device_memory *> TexturesMap; + TexturesMap textures; + + bool textures_need_update; + + protected: + void flush_texture_buffers(); + + friend class OpenCLSplitKernel; + friend class OpenCLSplitKernelFunction; }; -Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background); +Device *opencl_create_split_device(DeviceInfo &info, + Stats &stats, + Profiler &profiler, + bool background); CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl_split.cpp b/intern/cycles/device/opencl/opencl_split.cpp index 489d10b7087..70b1a643044 100644 --- a/intern/cycles/device/opencl/opencl_split.cpp +++ b/intern/cycles/device/opencl/opencl_split.cpp @@ -16,273 +16,278 @@ 
#ifdef WITH_OPENCL -#include "device/opencl/opencl.h" +# include "device/opencl/opencl.h" -#include "kernel/kernel_types.h" -#include "kernel/split/kernel_split_data_types.h" +# include "kernel/kernel_types.h" +# include "kernel/split/kernel_split_data_types.h" -#include "util/util_algorithm.h" -#include "util/util_debug.h" -#include "util/util_foreach.h" -#include "util/util_logging.h" -#include "util/util_md5.h" -#include "util/util_path.h" -#include "util/util_time.h" +# include "util/util_algorithm.h" +# include "util/util_debug.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_md5.h" +# include "util/util_path.h" +# include "util/util_time.h" CCL_NAMESPACE_BEGIN struct texture_slot_t { - texture_slot_t(const string& name, int slot) - : name(name), - slot(slot) { - } - string name; - int slot; + texture_slot_t(const string &name, int slot) : name(name), slot(slot) + { + } + string name; + int slot; }; static const string NON_SPLIT_KERNELS = - "denoising " - "base " - "background " - "displace "; + "denoising " + "base " + "background " + "displace "; static const string SPLIT_BUNDLE_KERNELS = - "data_init " - "path_init " - "state_buffer_size " - "scene_intersect " - "queue_enqueue " - "shader_setup " - "shader_sort " - "enqueue_inactive " - "next_iteration_setup " - "indirect_subsurface " - "buffer_update"; - -const string OpenCLDevice::get_opencl_program_name(const string& kernel_name) + "data_init " + "path_init " + "state_buffer_size " + "scene_intersect " + "queue_enqueue " + "shader_setup " + "shader_sort " + "enqueue_inactive " + "next_iteration_setup " + "indirect_subsurface " + "buffer_update"; + +const string OpenCLDevice::get_opencl_program_name(const string &kernel_name) { - if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) { - return kernel_name; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "split_bundle"; - } - else { - return "split_" + kernel_name; - } 
+ if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) { + return kernel_name; + } + else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { + return "split_bundle"; + } + else { + return "split_" + kernel_name; + } } -const string OpenCLDevice::get_opencl_program_filename(const string& kernel_name) +const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name) { - if (kernel_name == "denoising") { - return "filter.cl"; - } - else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { - return "kernel_split_bundle.cl"; - } - else { - return "kernel_" + kernel_name + ".cl"; - } + if (kernel_name == "denoising") { + return "filter.cl"; + } + else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) { + return "kernel_split_bundle.cl"; + } + else { + return "kernel_" + kernel_name + ".cl"; + } } /* Enable features that we always want to compile to reduce recompilation events */ -void OpenCLDevice::enable_default_features(DeviceRequestedFeatures& features) +void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features) { - features.use_transparent = true; - features.use_shadow_tricks = true; - features.use_principled = true; - features.use_denoising = true; - - if (!background) - { - features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_hair = true; - features.use_subsurface = true; - features.use_camera_motion = false; - features.use_object_motion = false; - } + features.use_transparent = true; + features.use_shadow_tricks = true; + features.use_principled = true; + features.use_denoising = true; + + if (!background) { + features.max_nodes_group = NODE_GROUP_LEVEL_MAX; + features.nodes_features = NODE_FEATURE_ALL; + features.use_hair = true; + features.use_subsurface = true; + features.use_camera_motion = false; + features.use_object_motion = false; + } } -string OpenCLDevice::get_build_options(const DeviceRequestedFeatures& 
requested_features, const string& opencl_program_name, bool preview_kernel) +string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features, + const string &opencl_program_name, + bool preview_kernel) { - /* first check for non-split kernel programs */ - if (opencl_program_name == "base" || opencl_program_name == "denoising") { - return ""; - } - else if (opencl_program_name == "bake") { - /* Note: get_build_options for bake is only requested when baking is enabled. - * displace and background are always requested. - * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_hair = true; - features.use_subsurface = true; - features.max_nodes_group = NODE_GROUP_LEVEL_MAX; - features.nodes_features = NODE_FEATURE_ALL; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "displace") { - /* As displacement does not use any nodes from the Shading group (eg BSDF). - * We disable all features that are related to shading. 
*/ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_denoising = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_baking = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_subsurface = false; - features.use_volume = false; - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_denoising = false; - features.use_principled = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - else if (opencl_program_name == "background") { - /* Background uses Background shading - * It is save to disable shadow features, subsurface and volumetric. */ - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - features.use_baking = false; - features.use_object_motion = false; - features.use_camera_motion = false; - features.use_transparent = false; - features.use_shadow_tricks = false; - features.use_denoising = false; - /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node. - * Perhaps we should remove them in UI as it does not make any sense when - * rendering background. */ - features.nodes_features &= ~NODE_FEATURE_VOLUME; - features.use_subsurface = false; - features.use_volume = false; - features.use_shader_raytrace = false; - features.use_patch_evaluation = false; - features.use_integrator_branched = false; - return features.get_build_options(); - } - - string build_options = "-D__SPLIT_KERNEL__ "; - /* Set compute device build option. 
*/ - cl_device_type device_type; - OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr); - assert(this->ciErr == CL_SUCCESS); - if(device_type == CL_DEVICE_TYPE_GPU) { - build_options += "-D__COMPUTE_DEVICE_GPU__ "; - } - - DeviceRequestedFeatures nofeatures; - enable_default_features(nofeatures); - - /* Add program specific optimized compile directives */ - if (preview_kernel) { - DeviceRequestedFeatures preview_features; - preview_features.use_hair = true; - build_options += "-D__KERNEL_AO_PREVIEW__ "; - build_options += preview_features.get_build_options(); - } - else if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) { - build_options += nofeatures.get_build_options(); - } - else { - DeviceRequestedFeatures features(requested_features); - enable_default_features(features); - - /* Always turn off baking at this point. Baking is only usefull when building the bake kernel. - * this also makes sure that the kernels that are build during baking can be reused - * when not doing any baking. */ - features.use_baking = false; - - /* Do not vary on shaders when program doesn't do any shading. - * We have bundled them in a single program. */ - if (opencl_program_name == "split_bundle") { - features.max_nodes_group = 0; - features.nodes_features = 0; - features.use_shader_raytrace = false; - } - - /* No specific settings, just add the regular ones */ - build_options += features.get_build_options(); - } - - return build_options; + /* first check for non-split kernel programs */ + if (opencl_program_name == "base" || opencl_program_name == "denoising") { + return ""; + } + else if (opencl_program_name == "bake") { + /* Note: get_build_options for bake is only requested when baking is enabled. + * displace and background are always requested. 
+ * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */ + DeviceRequestedFeatures features(requested_features); + enable_default_features(features); + features.use_denoising = false; + features.use_object_motion = false; + features.use_camera_motion = false; + features.use_hair = true; + features.use_subsurface = true; + features.max_nodes_group = NODE_GROUP_LEVEL_MAX; + features.nodes_features = NODE_FEATURE_ALL; + features.use_integrator_branched = false; + return features.get_build_options(); + } + else if (opencl_program_name == "displace") { + /* As displacement does not use any nodes from the Shading group (eg BSDF). + * We disable all features that are related to shading. */ + DeviceRequestedFeatures features(requested_features); + enable_default_features(features); + features.use_denoising = false; + features.use_object_motion = false; + features.use_camera_motion = false; + features.use_baking = false; + features.use_transparent = false; + features.use_shadow_tricks = false; + features.use_subsurface = false; + features.use_volume = false; + features.nodes_features &= ~NODE_FEATURE_VOLUME; + features.use_denoising = false; + features.use_principled = false; + features.use_integrator_branched = false; + return features.get_build_options(); + } + else if (opencl_program_name == "background") { + /* Background uses Background shading + * It is save to disable shadow features, subsurface and volumetric. */ + DeviceRequestedFeatures features(requested_features); + enable_default_features(features); + features.use_baking = false; + features.use_object_motion = false; + features.use_camera_motion = false; + features.use_transparent = false; + features.use_shadow_tricks = false; + features.use_denoising = false; + /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node. + * Perhaps we should remove them in UI as it does not make any sense when + * rendering background. 
*/ + features.nodes_features &= ~NODE_FEATURE_VOLUME; + features.use_subsurface = false; + features.use_volume = false; + features.use_shader_raytrace = false; + features.use_patch_evaluation = false; + features.use_integrator_branched = false; + return features.get_build_options(); + } + + string build_options = "-D__SPLIT_KERNEL__ "; + /* Set compute device build option. */ + cl_device_type device_type; + OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr); + assert(this->ciErr == CL_SUCCESS); + if (device_type == CL_DEVICE_TYPE_GPU) { + build_options += "-D__COMPUTE_DEVICE_GPU__ "; + } + + DeviceRequestedFeatures nofeatures; + enable_default_features(nofeatures); + + /* Add program specific optimized compile directives */ + if (preview_kernel) { + DeviceRequestedFeatures preview_features; + preview_features.use_hair = true; + build_options += "-D__KERNEL_AO_PREVIEW__ "; + build_options += preview_features.get_build_options(); + } + else if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) { + build_options += nofeatures.get_build_options(); + } + else { + DeviceRequestedFeatures features(requested_features); + enable_default_features(features); + + /* Always turn off baking at this point. Baking is only usefull when building the bake kernel. + * this also makes sure that the kernels that are build during baking can be reused + * when not doing any baking. */ + features.use_baking = false; + + /* Do not vary on shaders when program doesn't do any shading. + * We have bundled them in a single program. 
*/ + if (opencl_program_name == "split_bundle") { + features.max_nodes_group = 0; + features.nodes_features = 0; + features.use_shader_raytrace = false; + } + + /* No specific settings, just add the regular ones */ + build_options += features.get_build_options(); + } + + return build_options; } OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_) { - device = device_; + device = device_; } OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms() { - program_split.release(); - program_lamp_emission.release(); - program_do_volume.release(); - program_indirect_background.release(); - program_shader_eval.release(); - program_holdout_emission_blurring_pathtermination_ao.release(); - program_subsurface_scatter.release(); - program_direct_lighting.release(); - program_shadow_blocked_ao.release(); - program_shadow_blocked_dl.release(); + program_split.release(); + program_lamp_emission.release(); + program_do_volume.release(); + program_indirect_background.release(); + program_shader_eval.release(); + program_holdout_emission_blurring_pathtermination_ao.release(); + program_subsurface_scatter.release(); + program_direct_lighting.release(); + program_shadow_blocked_ao.release(); + program_shadow_blocked_dl.release(); } -void OpenCLDevice::OpenCLSplitPrograms::load_kernels(vector<OpenCLProgram*> &programs, const DeviceRequestedFeatures& requested_features, bool is_preview) +void OpenCLDevice::OpenCLSplitPrograms::load_kernels( + vector<OpenCLProgram *> &programs, + const DeviceRequestedFeatures &requested_features, + bool is_preview) { - if (!requested_features.use_baking) { -#define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) program_split.add_kernel(ustring("path_trace_"#kernel_name)); -#define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \ - const string program_name_##kernel_name = "split_"#kernel_name; \ - program_##kernel_name = \ - OpenCLDevice::OpenCLProgram(device, \ - program_name_##kernel_name, \ - "kernel_"#kernel_name".cl", \ - 
device->get_build_options(requested_features, program_name_##kernel_name, is_preview)); \ - program_##kernel_name.add_kernel(ustring("path_trace_"#kernel_name)); \ - programs.push_back(&program_##kernel_name); - - /* Ordered with most complex kernels first, to reduce overall compile time. */ - ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter); - if (requested_features.use_volume || is_preview) { - ADD_SPLIT_KERNEL_PROGRAM(do_volume); - } - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl); - ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao); - ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao); - ADD_SPLIT_KERNEL_PROGRAM(lamp_emission); - ADD_SPLIT_KERNEL_PROGRAM(direct_lighting); - ADD_SPLIT_KERNEL_PROGRAM(indirect_background); - ADD_SPLIT_KERNEL_PROGRAM(shader_eval); - - /* Quick kernels bundled in a single program to reduce overhead of starting - * Blender processes. */ - program_split = OpenCLDevice::OpenCLProgram(device, - "split_bundle" , - "kernel_split_bundle.cl", - device->get_build_options(requested_features, "split_bundle", is_preview)); - - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface); - ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update); - programs.push_back(&program_split); - -#undef ADD_SPLIT_KERNEL_PROGRAM -#undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM - } + if (!requested_features.use_baking) { +# define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \ + program_split.add_kernel(ustring("path_trace_" #kernel_name)); +# define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \ + const string 
program_name_##kernel_name = "split_" #kernel_name; \ + program_##kernel_name = OpenCLDevice::OpenCLProgram( \ + device, \ + program_name_##kernel_name, \ + "kernel_" #kernel_name ".cl", \ + device->get_build_options(requested_features, program_name_##kernel_name, is_preview)); \ + program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \ + programs.push_back(&program_##kernel_name); + + /* Ordered with most complex kernels first, to reduce overall compile time. */ + ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter); + if (requested_features.use_volume || is_preview) { + ADD_SPLIT_KERNEL_PROGRAM(do_volume); + } + ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl); + ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao); + ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao); + ADD_SPLIT_KERNEL_PROGRAM(lamp_emission); + ADD_SPLIT_KERNEL_PROGRAM(direct_lighting); + ADD_SPLIT_KERNEL_PROGRAM(indirect_background); + ADD_SPLIT_KERNEL_PROGRAM(shader_eval); + + /* Quick kernels bundled in a single program to reduce overhead of starting + * Blender processes. 
*/ + program_split = OpenCLDevice::OpenCLProgram( + device, + "split_bundle", + "kernel_split_bundle.cl", + device->get_build_options(requested_features, "split_bundle", is_preview)); + + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface); + ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update); + programs.push_back(&program_split); + +# undef ADD_SPLIT_KERNEL_PROGRAM +# undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM + } } namespace { @@ -291,1126 +296,1108 @@ namespace { * fetch its size. */ typedef struct KernelGlobalsDummy { - ccl_constant KernelData *data; - ccl_global char *buffers[8]; + ccl_constant KernelData *data; + ccl_global char *buffers[8]; -#define KERNEL_TEX(type, name) \ - TextureInfo name; +# define KERNEL_TEX(type, name) TextureInfo name; # include "kernel/kernel_textures.h" -#undef KERNEL_TEX - SplitData split_data; - SplitParams split_param_data; +# undef KERNEL_TEX + SplitData split_data; + SplitParams split_param_data; } KernelGlobalsDummy; } // namespace - struct CachedSplitMemory { - int id; - device_memory *split_data; - device_memory *ray_state; - device_memory *queue_index; - device_memory *use_queues_flag; - device_memory *work_pools; - device_ptr *buffer; + int id; + device_memory *split_data; + device_memory *ray_state; + device_memory *queue_index; + device_memory *use_queues_flag; + device_memory *work_pools; + device_ptr *buffer; }; class OpenCLSplitKernelFunction : public SplitKernelFunction { -public: - OpenCLDevice* device; - OpenCLDevice::OpenCLProgram program; - CachedSplitMemory& cached_memory; 
- int cached_id; - - OpenCLSplitKernelFunction(OpenCLDevice* device, CachedSplitMemory& cached_memory) : - device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1) - { - } - - ~OpenCLSplitKernelFunction() - { - program.release(); - } - - virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data) - { - if(cached_id != cached_memory.id) { - cl_uint start_arg_index = - device->kernel_set_args(program(), - 0, - kg, - data, - *cached_memory.split_data, - *cached_memory.ray_state); - - device->set_kernel_arg_buffers(program(), &start_arg_index); - - start_arg_index += - device->kernel_set_args(program(), - start_arg_index, - *cached_memory.queue_index, - *cached_memory.use_queues_flag, - *cached_memory.work_pools, - *cached_memory.buffer); - - cached_id = cached_memory.id; - } - - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - program(), - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if(device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - return true; - } + public: + OpenCLDevice *device; + OpenCLDevice::OpenCLProgram program; + CachedSplitMemory &cached_memory; + int cached_id; + + OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory) + : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1) + { + } + + ~OpenCLSplitKernelFunction() + { + program.release(); + } + + virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data) + { + if (cached_id != cached_memory.id) { + cl_uint start_arg_index = device->kernel_set_args( + program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state); + + device->set_kernel_arg_buffers(program(), &start_arg_index); + + 
start_arg_index += device->kernel_set_args(program(), + start_arg_index, + *cached_memory.queue_index, + *cached_memory.use_queues_flag, + *cached_memory.work_pools, + *cached_memory.buffer); + + cached_id = cached_memory.id; + } + + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + program(), + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if (device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return false; + } + + return true; + } }; class OpenCLSplitKernel : public DeviceSplitKernel { - OpenCLDevice *device; - CachedSplitMemory cached_memory; -public: - explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) { - } - - virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name, - const DeviceRequestedFeatures& requested_features) - { - OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory); - - const string program_name = device->get_opencl_program_name(kernel_name); - kernel->program = - OpenCLDevice::OpenCLProgram(device, - program_name, - device->get_opencl_program_filename(kernel_name), - device->get_build_options(requested_features, - program_name, - device->use_preview_kernels)); - - kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); - kernel->program.load(); - - if(!kernel->program.is_loaded()) { - delete kernel; - return NULL; - } - - return kernel; - } - - virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads) - { - device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); - size_buffer.alloc(1); - size_buffer.zero_to_device(); - - uint threads = num_threads; - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel 
kernel_state_buffer_size = programs->program_split(ustring("path_trace_state_buffer_size")); - device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer); - - size_t global_size = 64; - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_state_buffer_size, - 1, - NULL, - &global_size, - NULL, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - size_buffer.copy_from_device(0, 1, 1); - size_t size = size_buffer[0]; - size_buffer.free(); - - if(device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return 0; - } - - return size; - } - - virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim, - RenderTile& rtile, - int num_global_elements, - device_memory& kernel_globals, - device_memory& kernel_data, - device_memory& split_data, - device_memory& ray_state, - device_memory& queue_index, - device_memory& use_queues_flag, - device_memory& work_pool_wgs - ) - { - cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; - - /* Set the range of samples to be processed for every ray in - * path-regeneration logic. 
- */ - cl_int start_sample = rtile.start_sample; - cl_int end_sample = rtile.start_sample + rtile.num_samples; - - OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); - cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init")); - - cl_uint start_arg_index = - device->kernel_set_args(kernel_data_init, - 0, - kernel_globals, - kernel_data, - split_data, - num_global_elements, - ray_state); - - device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index); - - start_arg_index += - device->kernel_set_args(kernel_data_init, - start_arg_index, - start_sample, - end_sample, - rtile.x, - rtile.y, - rtile.w, - rtile.h, - rtile.offset, - rtile.stride, - queue_index, - dQueue_size, - use_queues_flag, - work_pool_wgs, - rtile.num_samples, - rtile.buffer); - - /* Enqueue ckPathTraceKernel_data_init kernel. */ - device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, - kernel_data_init, - 2, - NULL, - dim.global_size, - dim.local_size, - 0, - NULL, - NULL); - - device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); - - if(device->ciErr != CL_SUCCESS) { - string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", - clewErrorString(device->ciErr)); - device->opencl_error(message); - return false; - } - - cached_memory.split_data = &split_data; - cached_memory.ray_state = &ray_state; - cached_memory.queue_index = &queue_index; - cached_memory.use_queues_flag = &use_queues_flag; - cached_memory.work_pools = &work_pool_wgs; - cached_memory.buffer = &rtile.buffer; - cached_memory.id++; - - return true; - } - - virtual int2 split_kernel_local_size() - { - return make_int2(64, 1); - } - - virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/) - { - cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); - /* Use small global size on CPU devices as it seems to be much faster. 
*/ - if(type == CL_DEVICE_TYPE_CPU) { - VLOG(1) << "Global size: (64, 64)."; - return make_int2(64, 64); - } - - cl_ulong max_buffer_size; - clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); - - if(DebugFlags().opencl.mem_limit) { - max_buffer_size = min(max_buffer_size, - cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); - } - - VLOG(1) << "Maximum device allocation size: " - << string_human_readable_number(max_buffer_size) << " bytes. (" - << string_human_readable_size(max_buffer_size) << ")."; - - /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */ - max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l*1024*1024*1024); - - size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size); - int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements)); - VLOG(1) << "Global size: " << global_size << "."; - return global_size; - } + OpenCLDevice *device; + CachedSplitMemory cached_memory; + + public: + explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) + { + } + + virtual SplitKernelFunction *get_split_kernel_function( + const string &kernel_name, const DeviceRequestedFeatures &requested_features) + { + OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory); + + const string program_name = device->get_opencl_program_name(kernel_name); + kernel->program = OpenCLDevice::OpenCLProgram( + device, + program_name, + device->get_opencl_program_filename(kernel_name), + device->get_build_options(requested_features, program_name, device->use_preview_kernels)); + + kernel->program.add_kernel(ustring("path_trace_" + kernel_name)); + kernel->program.load(); + + if (!kernel->program.is_loaded()) { + delete kernel; + return NULL; + } + + return kernel; + } + + virtual uint64_t state_buffer_size(device_memory &kg, device_memory 
&data, size_t num_threads) + { + device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE); + size_buffer.alloc(1); + size_buffer.zero_to_device(); + + uint threads = num_threads; + OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); + cl_kernel kernel_state_buffer_size = programs->program_split( + ustring("path_trace_state_buffer_size")); + device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer); + + size_t global_size = 64; + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + kernel_state_buffer_size, + 1, + NULL, + &global_size, + NULL, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + size_buffer.copy_from_device(0, 1, 1); + size_t size = size_buffer[0]; + size_buffer.free(); + + if (device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return 0; + } + + return size; + } + + virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, + RenderTile &rtile, + int num_global_elements, + device_memory &kernel_globals, + device_memory &kernel_data, + device_memory &split_data, + device_memory &ray_state, + device_memory &queue_index, + device_memory &use_queues_flag, + device_memory &work_pool_wgs) + { + cl_int dQueue_size = dim.global_size[0] * dim.global_size[1]; + + /* Set the range of samples to be processed for every ray in + * path-regeneration logic. 
+ */ + cl_int start_sample = rtile.start_sample; + cl_int end_sample = rtile.start_sample + rtile.num_samples; + + OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs(); + cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init")); + + cl_uint start_arg_index = device->kernel_set_args(kernel_data_init, + 0, + kernel_globals, + kernel_data, + split_data, + num_global_elements, + ray_state); + + device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index); + + start_arg_index += device->kernel_set_args(kernel_data_init, + start_arg_index, + start_sample, + end_sample, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + rtile.offset, + rtile.stride, + queue_index, + dQueue_size, + use_queues_flag, + work_pool_wgs, + rtile.num_samples, + rtile.buffer); + + /* Enqueue ckPathTraceKernel_data_init kernel. */ + device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue, + kernel_data_init, + 2, + NULL, + dim.global_size, + dim.local_size, + 0, + NULL, + NULL); + + device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel"); + + if (device->ciErr != CL_SUCCESS) { + string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", + clewErrorString(device->ciErr)); + device->opencl_error(message); + return false; + } + + cached_memory.split_data = &split_data; + cached_memory.ray_state = &ray_state; + cached_memory.queue_index = &queue_index; + cached_memory.use_queues_flag = &use_queues_flag; + cached_memory.work_pools = &work_pool_wgs; + cached_memory.buffer = &rtile.buffer; + cached_memory.id++; + + return true; + } + + virtual int2 split_kernel_local_size() + { + return make_int2(64, 1); + } + + virtual int2 split_kernel_global_size(device_memory &kg, + device_memory &data, + DeviceTask * /*task*/) + { + cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice); + /* Use small global size on CPU devices as it seems to be much faster. 
*/ + if (type == CL_DEVICE_TYPE_CPU) { + VLOG(1) << "Global size: (64, 64)."; + return make_int2(64, 64); + } + + cl_ulong max_buffer_size; + clGetDeviceInfo( + device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL); + + if (DebugFlags().opencl.mem_limit) { + max_buffer_size = min(max_buffer_size, + cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used)); + } + + VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size) + << " bytes. (" << string_human_readable_size(max_buffer_size) << ")."; + + /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */ + max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024); + + size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size); + int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), + (int)sqrt(num_elements)); + VLOG(1) << "Global size: " << global_size << "."; + return global_size; + } }; bool OpenCLDevice::opencl_error(cl_int err) { - if(err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); - return true; - } - - return false; + if (err != CL_SUCCESS) { + string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err)); + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); + return true; + } + + return false; } -void OpenCLDevice::opencl_error(const string& message) +void OpenCLDevice::opencl_error(const string &message) { - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); } -void OpenCLDevice::opencl_assert_err(cl_int err, const char* where) +void OpenCLDevice::opencl_assert_err(cl_int err, const char *where) { - 
if(err != CL_SUCCESS) { - string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where); - if(error_msg == "") - error_msg = message; - fprintf(stderr, "%s\n", message.c_str()); -#ifndef NDEBUG - abort(); -#endif - } + if (err != CL_SUCCESS) { + string message = string_printf( + "OpenCL error (%d): %s in %s", err, clewErrorString(err), where); + if (error_msg == "") + error_msg = message; + fprintf(stderr, "%s\n", message.c_str()); +# ifndef NDEBUG + abort(); +# endif + } } -OpenCLDevice::OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background) -: Device(info, stats, profiler, background), - kernel_programs(this), - preview_programs(this), - memory_manager(this), - texture_info(this, "__texture_info", MEM_TEXTURE) +OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) + : Device(info, stats, profiler, background), + kernel_programs(this), + preview_programs(this), + memory_manager(this), + texture_info(this, "__texture_info", MEM_TEXTURE) { - cpPlatform = NULL; - cdDevice = NULL; - cxContext = NULL; - cqCommandQueue = NULL; - null_mem = 0; - device_initialized = false; - textures_need_update = true; - use_preview_kernels = !background; - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if(usable_devices.size() == 0) { - opencl_error("OpenCL: no devices found."); - return; - } - assert(info.num < usable_devices.size()); - OpenCLPlatformDevice& platform_device = usable_devices[info.num]; - device_num = info.num; - cpPlatform = platform_device.platform_id; - cdDevice = platform_device.device_id; - platform_name = platform_device.platform_name; - device_name = platform_device.device_name; - VLOG(2) << "Creating new Cycles device for OpenCL platform " - << platform_name << ", device " - << device_name << "."; - - { - /* try to use cached context */ - thread_scoped_lock cache_locker; - cxContext = 
OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker); - - if(cxContext == NULL) { - /* create context properties array to specify platform */ - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, - 0, 0 - }; - - /* create context */ - cxContext = clCreateContext(context_props, 1, &cdDevice, - context_notify_callback, cdDevice, &ciErr); - - if(opencl_error(ciErr)) { - opencl_error("OpenCL: clCreateContext failed"); - return; - } - - /* cache it */ - OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker); - } - } - - cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); - if(opencl_error(ciErr)) { - opencl_error("OpenCL: Error creating command queue"); - return; - } - - null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); - if(opencl_error(ciErr)) { - opencl_error("OpenCL: Error creating memory buffer for NULL"); - return; - } - - /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */ - texture_info.resize(1); - memory_manager.alloc("texture_info", texture_info); - - device_initialized = true; - - split_kernel = new OpenCLSplitKernel(this); - if (!background) { - load_preview_kernels(); - } + cpPlatform = NULL; + cdDevice = NULL; + cxContext = NULL; + cqCommandQueue = NULL; + null_mem = 0; + device_initialized = false; + textures_need_update = true; + use_preview_kernels = !background; + + vector<OpenCLPlatformDevice> usable_devices; + OpenCLInfo::get_usable_devices(&usable_devices); + if (usable_devices.size() == 0) { + opencl_error("OpenCL: no devices found."); + return; + } + assert(info.num < usable_devices.size()); + OpenCLPlatformDevice &platform_device = usable_devices[info.num]; + device_num = info.num; + cpPlatform = platform_device.platform_id; + cdDevice = platform_device.device_id; + platform_name = platform_device.platform_name; + device_name = platform_device.device_name; + 
VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device " + << device_name << "."; + + { + /* try to use cached context */ + thread_scoped_lock cache_locker; + cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker); + + if (cxContext == NULL) { + /* create context properties array to specify platform */ + const cl_context_properties context_props[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0}; + + /* create context */ + cxContext = clCreateContext( + context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr); + + if (opencl_error(ciErr)) { + opencl_error("OpenCL: clCreateContext failed"); + return; + } + + /* cache it */ + OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker); + } + } + + cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr); + if (opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating command queue"); + return; + } + + null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr); + if (opencl_error(ciErr)) { + opencl_error("OpenCL: Error creating memory buffer for NULL"); + return; + } + + /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */ + texture_info.resize(1); + memory_manager.alloc("texture_info", texture_info); + + device_initialized = true; + + split_kernel = new OpenCLSplitKernel(this); + if (!background) { + load_preview_kernels(); + } } OpenCLDevice::~OpenCLDevice() { - task_pool.stop(); - load_required_kernel_task_pool.stop(); - load_kernel_task_pool.stop(); + task_pool.stop(); + load_required_kernel_task_pool.stop(); + load_kernel_task_pool.stop(); - memory_manager.free(); + memory_manager.free(); - if(null_mem) - clReleaseMemObject(CL_MEM_PTR(null_mem)); + if (null_mem) + clReleaseMemObject(CL_MEM_PTR(null_mem)); - ConstMemMap::iterator mt; - for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { - delete mt->second; - 
} + ConstMemMap::iterator mt; + for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) { + delete mt->second; + } - base_program.release(); - bake_program.release(); - displace_program.release(); - background_program.release(); - denoising_program.release(); + base_program.release(); + bake_program.release(); + displace_program.release(); + background_program.release(); + denoising_program.release(); - if(cqCommandQueue) - clReleaseCommandQueue(cqCommandQueue); - if(cxContext) - clReleaseContext(cxContext); + if (cqCommandQueue) + clReleaseCommandQueue(cqCommandQueue); + if (cxContext) + clReleaseContext(cxContext); - delete split_kernel; + delete split_kernel; } void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info, - const void * /*private_info*/, size_t /*cb*/, void *user_data) + const void * /*private_info*/, + size_t /*cb*/, + void *user_data) { - string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); - fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); + string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data); + fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info); } bool OpenCLDevice::opencl_version_check() { - string error; - if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) { - opencl_error(error); - return false; - } - if(!OpenCLInfo::device_version_check(cdDevice, &error)) { - opencl_error(error); - return false; - } - return true; + string error; + if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) { + opencl_error(error); + return false; + } + if (!OpenCLInfo::device_version_check(cdDevice, &error)) { + opencl_error(error); + return false; + } + return true; } string OpenCLDevice::device_md5_hash(string kernel_custom_build_options) { - MD5Hash md5; - char version[256], driver[256], name[256], vendor[256]; + MD5Hash md5; + char version[256], driver[256], name[256], vendor[256]; - clGetPlatformInfo(cpPlatform, 
CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL); - clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL); + clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL); + clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL); + clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL); + clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL); - md5.append((uint8_t*)vendor, strlen(vendor)); - md5.append((uint8_t*)version, strlen(version)); - md5.append((uint8_t*)name, strlen(name)); - md5.append((uint8_t*)driver, strlen(driver)); + md5.append((uint8_t *)vendor, strlen(vendor)); + md5.append((uint8_t *)version, strlen(version)); + md5.append((uint8_t *)name, strlen(name)); + md5.append((uint8_t *)driver, strlen(driver)); - string options = kernel_build_options(); - options += kernel_custom_build_options; - md5.append((uint8_t*)options.c_str(), options.size()); + string options = kernel_build_options(); + options += kernel_custom_build_options; + md5.append((uint8_t *)options.c_str(), options.size()); - return md5.get_hex(); + return md5.get_hex(); } -bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures& requested_features) +bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features) { - VLOG(2) << "Loading kernels for platform " << platform_name - << ", device " << device_name << "."; - /* Verify if device was initialized. */ - if(!device_initialized) { - fprintf(stderr, "OpenCL: failed to initialize device.\n"); - return false; - } - - /* Verify we have right opencl version. 
*/ - if(!opencl_version_check()) - return false; - - load_required_kernels(requested_features); - - vector<OpenCLProgram*> programs; - kernel_programs.load_kernels(programs, requested_features, false); - - if (!requested_features.use_baking && requested_features.use_denoising) { - denoising_program = OpenCLProgram(this, "denoising", "filter.cl", get_build_options(requested_features, "denoising")); - denoising_program.add_kernel(ustring("filter_divide_shadow")); - denoising_program.add_kernel(ustring("filter_get_feature")); - denoising_program.add_kernel(ustring("filter_write_feature")); - denoising_program.add_kernel(ustring("filter_detect_outliers")); - denoising_program.add_kernel(ustring("filter_combine_halves")); - denoising_program.add_kernel(ustring("filter_construct_transform")); - denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); - denoising_program.add_kernel(ustring("filter_nlm_blur")); - denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); - denoising_program.add_kernel(ustring("filter_nlm_update_output")); - denoising_program.add_kernel(ustring("filter_nlm_normalize")); - denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); - denoising_program.add_kernel(ustring("filter_finalize")); - programs.push_back(&denoising_program); - } - - load_required_kernel_task_pool.wait_work(); - - /* Parallel compilation of Cycles kernels, this launches multiple - * processes to workaround OpenCL frameworks serializing the calls - * internally within a single process. */ - foreach(OpenCLProgram *program, programs) { - if (!program->load()) { - load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); - } - } - return true; + VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << "."; + /* Verify if device was initialized. 
*/ + if (!device_initialized) { + fprintf(stderr, "OpenCL: failed to initialize device.\n"); + return false; + } + + /* Verify we have right opencl version. */ + if (!opencl_version_check()) + return false; + + load_required_kernels(requested_features); + + vector<OpenCLProgram *> programs; + kernel_programs.load_kernels(programs, requested_features, false); + + if (!requested_features.use_baking && requested_features.use_denoising) { + denoising_program = OpenCLProgram( + this, "denoising", "filter.cl", get_build_options(requested_features, "denoising")); + denoising_program.add_kernel(ustring("filter_divide_shadow")); + denoising_program.add_kernel(ustring("filter_get_feature")); + denoising_program.add_kernel(ustring("filter_write_feature")); + denoising_program.add_kernel(ustring("filter_detect_outliers")); + denoising_program.add_kernel(ustring("filter_combine_halves")); + denoising_program.add_kernel(ustring("filter_construct_transform")); + denoising_program.add_kernel(ustring("filter_nlm_calc_difference")); + denoising_program.add_kernel(ustring("filter_nlm_blur")); + denoising_program.add_kernel(ustring("filter_nlm_calc_weight")); + denoising_program.add_kernel(ustring("filter_nlm_update_output")); + denoising_program.add_kernel(ustring("filter_nlm_normalize")); + denoising_program.add_kernel(ustring("filter_nlm_construct_gramian")); + denoising_program.add_kernel(ustring("filter_finalize")); + programs.push_back(&denoising_program); + } + + load_required_kernel_task_pool.wait_work(); + + /* Parallel compilation of Cycles kernels, this launches multiple + * processes to workaround OpenCL frameworks serializing the calls + * internally within a single process. 
*/ + foreach (OpenCLProgram *program, programs) { + if (!program->load()) { + load_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); + } + } + return true; } -void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures& requested_features) +void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features) { - vector<OpenCLProgram*> programs; - base_program = OpenCLProgram(this, "base", "kernel_base.cl", get_build_options(requested_features, "base")); - base_program.add_kernel(ustring("convert_to_byte")); - base_program.add_kernel(ustring("convert_to_half_float")); - base_program.add_kernel(ustring("zero_buffer")); - programs.push_back(&base_program); - - if (requested_features.use_true_displacement) { - displace_program = OpenCLProgram(this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace")); - displace_program.add_kernel(ustring("displace")); - programs.push_back(&displace_program); - } - - if (requested_features.use_background_light) { - background_program = OpenCLProgram(this, "background", "kernel_background.cl", get_build_options(requested_features, "background")); - background_program.add_kernel(ustring("background")); - programs.push_back(&background_program); - } - - if (requested_features.use_baking) { - bake_program = OpenCLProgram(this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake")); - bake_program.add_kernel(ustring("bake")); - programs.push_back(&bake_program); - } - - foreach(OpenCLProgram *program, programs) { - if (!program->load()) { - load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); - } - } + vector<OpenCLProgram *> programs; + base_program = OpenCLProgram( + this, "base", "kernel_base.cl", get_build_options(requested_features, "base")); + base_program.add_kernel(ustring("convert_to_byte")); + base_program.add_kernel(ustring("convert_to_half_float")); + 
base_program.add_kernel(ustring("zero_buffer")); + programs.push_back(&base_program); + + if (requested_features.use_true_displacement) { + displace_program = OpenCLProgram( + this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace")); + displace_program.add_kernel(ustring("displace")); + programs.push_back(&displace_program); + } + + if (requested_features.use_background_light) { + background_program = OpenCLProgram(this, + "background", + "kernel_background.cl", + get_build_options(requested_features, "background")); + background_program.add_kernel(ustring("background")); + programs.push_back(&background_program); + } + + if (requested_features.use_baking) { + bake_program = OpenCLProgram( + this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake")); + bake_program.add_kernel(ustring("bake")); + programs.push_back(&bake_program); + } + + foreach (OpenCLProgram *program, programs) { + if (!program->load()) { + load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); + } + } } void OpenCLDevice::load_preview_kernels() { - DeviceRequestedFeatures no_features; - vector<OpenCLProgram*> programs; - preview_programs.load_kernels(programs, no_features, true); - - foreach(OpenCLProgram *program, programs) { - if (!program->load()) { - load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); - } - } + DeviceRequestedFeatures no_features; + vector<OpenCLProgram *> programs; + preview_programs.load_kernels(programs, no_features, true); + + foreach (OpenCLProgram *program, programs) { + if (!program->load()) { + load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program)); + } + } } -bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures& requested_features) +bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features) { - if (background) { - load_kernel_task_pool.wait_work(); - use_preview_kernels 
= false; - } - else { - /* We use a device setting to determine to load preview kernels or not - * Better to check on device level than per kernel as mixing preview and - * non-preview kernels does not work due to different data types */ - if (use_preview_kernels) { - use_preview_kernels = !load_kernel_task_pool.finished(); - } - } - return split_kernel->load_kernels(requested_features); + if (background) { + load_kernel_task_pool.wait_work(); + use_preview_kernels = false; + } + else { + /* We use a device setting to determine to load preview kernels or not + * Better to check on device level than per kernel as mixing preview and + * non-preview kernels does not work due to different data types */ + if (use_preview_kernels) { + use_preview_kernels = !load_kernel_task_pool.finished(); + } + } + return split_kernel->load_kernels(requested_features); } -OpenCLDevice::OpenCLSplitPrograms* OpenCLDevice::get_split_programs() +OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs() { - return use_preview_kernels?&preview_programs:&kernel_programs; + return use_preview_kernels ? 
&preview_programs : &kernel_programs; } DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state() { - /* Do not switch kernels for background renderings - * We do foreground rendering but use the preview kernels - * Check for the optimized kernels - * - * This works also the other way around, where we are using - * optimized kernels but new ones are being compiled due - * to other features that are needed */ - if (background) { - /* The if-statements below would find the same result, - * But as the `finished` method uses a mutex we added - * this as an early exit */ - return DEVICE_KERNEL_USING_FEATURE_KERNEL; - } - - bool other_kernels_finished = load_kernel_task_pool.finished(); - if (use_preview_kernels) { - if (other_kernels_finished) { - return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE; - } - else { - return DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL; - } - } - else { - if (other_kernels_finished) { - return DEVICE_KERNEL_USING_FEATURE_KERNEL; - } - else { - return DEVICE_KERNEL_FEATURE_KERNEL_INVALID; - } - } + /* Do not switch kernels for background renderings + * We do foreground rendering but use the preview kernels + * Check for the optimized kernels + * + * This works also the other way around, where we are using + * optimized kernels but new ones are being compiled due + * to other features that are needed */ + if (background) { + /* The if-statements below would find the same result, + * But as the `finished` method uses a mutex we added + * this as an early exit */ + return DEVICE_KERNEL_USING_FEATURE_KERNEL; + } + + bool other_kernels_finished = load_kernel_task_pool.finished(); + if (use_preview_kernels) { + if (other_kernels_finished) { + return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE; + } + else { + return DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL; + } + } + else { + if (other_kernels_finished) { + return DEVICE_KERNEL_USING_FEATURE_KERNEL; + } + else { + return DEVICE_KERNEL_FEATURE_KERNEL_INVALID; + } + } } -void 
OpenCLDevice::mem_alloc(device_memory& mem) +void OpenCLDevice::mem_alloc(device_memory &mem) { - if(mem.name) { - VLOG(1) << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - } - - size_t size = mem.memory_size(); - - /* check there is enough memory available for the allocation */ - cl_ulong max_alloc_size = 0; - clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); - - if(DebugFlags().opencl.mem_limit) { - max_alloc_size = min(max_alloc_size, - cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); - } - - if(size > max_alloc_size) { - string error = "Scene too complex to fit in available memory."; - if(mem.name != NULL) { - error += string_printf(" (allocating buffer %s failed.)", mem.name); - } - set_error(error); - - return; - } - - cl_mem_flags mem_flag; - void *mem_ptr = NULL; - - if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - /* Zero-size allocation might be invoked by render, but not really - * supported by OpenCL. Using NULL as device pointer also doesn't really - * work for some reason, so for the time being we'll use special case - * will null_mem buffer. - */ - if(size != 0) { - mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, - mem_flag, - size, - mem_ptr, - &ciErr); - opencl_assert_err(ciErr, "clCreateBuffer"); - } - else { - mem.device_pointer = null_mem; - } - - stats.mem_alloc(size); - mem.device_size = size; + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + size_t size = mem.memory_size(); + + /* check there is enough memory available for the allocation */ + cl_ulong max_alloc_size = 0; + clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL); + + if (DebugFlags().opencl.mem_limit) { + max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used)); + } + + if (size > max_alloc_size) { + string error = "Scene too complex to fit in available memory."; + if (mem.name != NULL) { + error += string_printf(" (allocating buffer %s failed.)", mem.name); + } + set_error(error); + + return; + } + + cl_mem_flags mem_flag; + void *mem_ptr = NULL; + + if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) + mem_flag = CL_MEM_READ_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + /* Zero-size allocation might be invoked by render, but not really + * supported by OpenCL. Using NULL as device pointer also doesn't really + * work for some reason, so for the time being we'll use special case + * will null_mem buffer. 
+ */ + if (size != 0) { + mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer"); + } + else { + mem.device_pointer = null_mem; + } + + stats.mem_alloc(size); + mem.device_size = size; } -void OpenCLDevice::mem_copy_to(device_memory& mem) +void OpenCLDevice::mem_copy_to(device_memory &mem) { - if(mem.type == MEM_TEXTURE) { - tex_free(mem); - tex_alloc(mem); - } - else { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - /* this is blocking */ - size_t size = mem.memory_size(); - if(size != 0) { - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - size, - mem.host_pointer, - 0, - NULL, NULL)); - } - } + if (mem.type == MEM_TEXTURE) { + tex_free(mem); + tex_alloc(mem); + } + else { + if (!mem.device_pointer) { + mem_alloc(mem); + } + + /* this is blocking */ + size_t size = mem.memory_size(); + if (size != 0) { + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + size, + mem.host_pointer, + 0, + NULL, + NULL)); + } + } } -void OpenCLDevice::mem_copy_from(device_memory& mem, int y, int w, int h, int elem) +void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) { - size_t offset = elem*y*w; - size_t size = elem*w*h; - assert(size != 0); - opencl_assert(clEnqueueReadBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - offset, - size, - (uchar*)mem.host_pointer + offset, - 0, - NULL, NULL)); + size_t offset = elem * y * w; + size_t size = elem * w * h; + assert(size != 0); + opencl_assert(clEnqueueReadBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + offset, + size, + (uchar *)mem.host_pointer + offset, + 0, + NULL, + NULL)); } void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size) { - base_program.wait_for_availability(); - cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); - - size_t global_size[] 
= {1024, 1024}; - size_t num_threads = global_size[0] * global_size[1]; - - cl_mem d_buffer = CL_MEM_PTR(mem); - cl_ulong d_offset = 0; - cl_ulong d_size = 0; - - while(d_offset < size) { - d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset); - - kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); - - ciErr = clEnqueueNDRangeKernel(cqCommandQueue, - ckZeroBuffer, - 2, - NULL, - global_size, - NULL, - 0, - NULL, - NULL); - opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); - - d_offset += d_size; - } + base_program.wait_for_availability(); + cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer")); + + size_t global_size[] = {1024, 1024}; + size_t num_threads = global_size[0] * global_size[1]; + + cl_mem d_buffer = CL_MEM_PTR(mem); + cl_ulong d_offset = 0; + cl_ulong d_size = 0; + + while (d_offset < size) { + d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset); + + kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset); + + ciErr = clEnqueueNDRangeKernel( + cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL); + opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); + + d_offset += d_size; + } } -void OpenCLDevice::mem_zero(device_memory& mem) +void OpenCLDevice::mem_zero(device_memory &mem) { - if(!mem.device_pointer) { - mem_alloc(mem); - } - - if(mem.device_pointer) { - if(base_program.is_loaded()) { - mem_zero_kernel(mem.device_pointer, mem.memory_size()); - } - - if(mem.host_pointer) { - memset(mem.host_pointer, 0, mem.memory_size()); - } - - if(!base_program.is_loaded()) { - void* zero = mem.host_pointer; - - if(!mem.host_pointer) { - zero = util_aligned_malloc(mem.memory_size(), 16); - memset(zero, 0, mem.memory_size()); - } - - opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, - CL_MEM_PTR(mem.device_pointer), - CL_TRUE, - 0, - mem.memory_size(), - zero, - 0, - NULL, NULL)); - - if(!mem.host_pointer) { - util_aligned_free(zero); - } - } - } + if (!mem.device_pointer) { + 
mem_alloc(mem); + } + + if (mem.device_pointer) { + if (base_program.is_loaded()) { + mem_zero_kernel(mem.device_pointer, mem.memory_size()); + } + + if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } + + if (!base_program.is_loaded()) { + void *zero = mem.host_pointer; + + if (!mem.host_pointer) { + zero = util_aligned_malloc(mem.memory_size(), 16); + memset(zero, 0, mem.memory_size()); + } + + opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, + CL_MEM_PTR(mem.device_pointer), + CL_TRUE, + 0, + mem.memory_size(), + zero, + 0, + NULL, + NULL)); + + if (!mem.host_pointer) { + util_aligned_free(zero); + } + } + } } -void OpenCLDevice::mem_free(device_memory& mem) +void OpenCLDevice::mem_free(device_memory &mem) { - if(mem.type == MEM_TEXTURE) { - tex_free(mem); - } - else { - if(mem.device_pointer) { - if(mem.device_pointer != null_mem) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); - } - mem.device_pointer = 0; - - stats.mem_free(mem.device_size); - mem.device_size = 0; - } - } + if (mem.type == MEM_TEXTURE) { + tex_free(mem); + } + else { + if (mem.device_pointer) { + if (mem.device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer))); + } + mem.device_pointer = 0; + + stats.mem_free(mem.device_size); + mem.device_size = 0; + } + } } int OpenCLDevice::mem_sub_ptr_alignment() { - return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); + return OpenCLInfo::mem_sub_ptr_alignment(cdDevice); } -device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory& mem, int offset, int size) +device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size) { - cl_mem_flags mem_flag; - if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) - mem_flag = CL_MEM_READ_ONLY; - else - mem_flag = CL_MEM_READ_WRITE; - - cl_buffer_region info; - info.origin = mem.memory_elements_size(offset); - info.size = mem.memory_elements_size(size); - - device_ptr sub_buf = (device_ptr) 
clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer), - mem_flag, - CL_BUFFER_CREATE_TYPE_REGION, - &info, - &ciErr); - opencl_assert_err(ciErr, "clCreateSubBuffer"); - return sub_buf; + cl_mem_flags mem_flag; + if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE) + mem_flag = CL_MEM_READ_ONLY; + else + mem_flag = CL_MEM_READ_WRITE; + + cl_buffer_region info; + info.origin = mem.memory_elements_size(offset); + info.size = mem.memory_elements_size(size); + + device_ptr sub_buf = (device_ptr)clCreateSubBuffer( + CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr); + opencl_assert_err(ciErr, "clCreateSubBuffer"); + return sub_buf; } void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer) { - if(device_pointer && device_pointer != null_mem) { - opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); - } + if (device_pointer && device_pointer != null_mem) { + opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer))); + } } void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size) { - ConstMemMap::iterator i = const_mem_map.find(name); - device_vector<uchar> *data; - - if(i == const_mem_map.end()) { - data = new device_vector<uchar>(this, name, MEM_READ_ONLY); - data->alloc(size); - const_mem_map.insert(ConstMemMap::value_type(name, data)); - } - else { - data = i->second; - } - - memcpy(data->data(), host, size); - data->copy_to_device(); + ConstMemMap::iterator i = const_mem_map.find(name); + device_vector<uchar> *data; + + if (i == const_mem_map.end()) { + data = new device_vector<uchar>(this, name, MEM_READ_ONLY); + data->alloc(size); + const_mem_map.insert(ConstMemMap::value_type(name, data)); + } + else { + data = i->second; + } + + memcpy(data->data(), host, size); + data->copy_to_device(); } -void OpenCLDevice::tex_alloc(device_memory& mem) +void OpenCLDevice::tex_alloc(device_memory &mem) { - VLOG(1) << "Texture allocate: " << mem.name << ", " - << 
string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")"; - - memory_manager.alloc(mem.name, mem); - /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */ - mem.device_pointer = 1; - textures[mem.name] = &mem; - textures_need_update = true; + VLOG(1) << "Texture allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + memory_manager.alloc(mem.name, mem); + /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */ + mem.device_pointer = 1; + textures[mem.name] = &mem; + textures_need_update = true; } -void OpenCLDevice::tex_free(device_memory& mem) +void OpenCLDevice::tex_free(device_memory &mem) { - if(mem.device_pointer) { - mem.device_pointer = 0; - - if(memory_manager.free(mem)) { - textures_need_update = true; - } - - foreach(TexturesMap::value_type& value, textures) { - if(value.second == &mem) { - textures.erase(value.first); - break; - } - } - } + if (mem.device_pointer) { + mem.device_pointer = 0; + + if (memory_manager.free(mem)) { + textures_need_update = true; + } + + foreach (TexturesMap::value_type &value, textures) { + if (value.second == &mem) { + textures.erase(value.first); + break; + } + } + } } size_t OpenCLDevice::global_size_round_up(int group_size, int global_size) { - int r = global_size % group_size; - return global_size + ((r == 0)? 0: group_size - r); + int r = global_size % group_size; + return global_size + ((r == 0) ? 
0 : group_size - r); } -void OpenCLDevice::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) +void OpenCLDevice::enqueue_kernel( + cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) { - size_t workgroup_size, max_work_items[3]; - - clGetKernelWorkGroupInfo(kernel, cdDevice, - CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); - clGetDeviceInfo(cdDevice, - CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL); - - if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { - workgroup_size = max_workgroup_size; - } - - /* Try to divide evenly over 2 dimensions. */ - size_t local_size[2]; - if(x_workgroups) { - local_size[0] = workgroup_size; - local_size[1] = 1; - } - else { - size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); - local_size[0] = local_size[1] = sqrt_workgroup_size; - } - - /* Some implementations have max size 1 on 2nd dimension. */ - if(local_size[1] > max_work_items[1]) { - local_size[0] = workgroup_size/max_work_items[1]; - local_size[1] = max_work_items[1]; - } - - size_t global_size[2] = {global_size_round_up(local_size[0], w), - global_size_round_up(local_size[1], h)}; - - /* Vertical size of 1 is coming from bake/shade kernels where we should - * not round anything up because otherwise we'll either be doing too - * much work per pixel (if we don't check global ID on Y axis) or will - * be checking for global ID to always have Y of 0. 
- */ - if(h == 1) { - global_size[h] = 1; - } - - /* run kernel */ - opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); - opencl_assert(clFlush(cqCommandQueue)); + size_t workgroup_size, max_work_items[3]; + + clGetKernelWorkGroupInfo( + kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); + clGetDeviceInfo( + cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL); + + if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) { + workgroup_size = max_workgroup_size; + } + + /* Try to divide evenly over 2 dimensions. */ + size_t local_size[2]; + if (x_workgroups) { + local_size[0] = workgroup_size; + local_size[1] = 1; + } + else { + size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); + local_size[0] = local_size[1] = sqrt_workgroup_size; + } + + /* Some implementations have max size 1 on 2nd dimension. */ + if (local_size[1] > max_work_items[1]) { + local_size[0] = workgroup_size / max_work_items[1]; + local_size[1] = max_work_items[1]; + } + + size_t global_size[2] = {global_size_round_up(local_size[0], w), + global_size_round_up(local_size[1], h)}; + + /* Vertical size of 1 is coming from bake/shade kernels where we should + * not round anything up because otherwise we'll either be doing too + * much work per pixel (if we don't check global ID on Y axis) or will + * be checking for global ID to always have Y of 0. 
+ */ + if (h == 1) { + global_size[h] = 1; + } + + /* run kernel */ + opencl_assert( + clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); + opencl_assert(clFlush(cqCommandQueue)); } void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name) { - cl_mem ptr; - - MemMap::iterator i = mem_map.find(name); - if(i != mem_map.end()) { - ptr = CL_MEM_PTR(i->second); - } - else { - /* work around NULL not working, even though the spec says otherwise */ - ptr = CL_MEM_PTR(null_mem); - } - - opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr)); + cl_mem ptr; + + MemMap::iterator i = mem_map.find(name); + if (i != mem_map.end()) { + ptr = CL_MEM_PTR(i->second); + } + else { + /* work around NULL not working, even though the spec says otherwise */ + ptr = CL_MEM_PTR(null_mem); + } + + opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr)); } void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg) { - flush_texture_buffers(); + flush_texture_buffers(); - memory_manager.set_kernel_arg_buffers(kernel, narg); + memory_manager.set_kernel_arg_buffers(kernel, narg); } void OpenCLDevice::flush_texture_buffers() { - if(!textures_need_update) { - return; - } - textures_need_update = false; - - /* Setup slots for textures. 
*/ - int num_slots = 0; - - vector<texture_slot_t> texture_slots; - -#define KERNEL_TEX(type, name) \ - if(textures.find(#name) != textures.end()) { \ - texture_slots.push_back(texture_slot_t(#name, num_slots)); \ - } \ - num_slots++; -#include "kernel/kernel_textures.h" - - int num_data_slots = num_slots; - - foreach(TexturesMap::value_type& tex, textures) { - string name = tex.first; - - if(string_startswith(name, "__tex_image")) { - int pos = name.rfind("_"); - int id = atoi(name.data() + pos + 1); - texture_slots.push_back(texture_slot_t(name, - num_data_slots + id)); - num_slots = max(num_slots, num_data_slots + id + 1); - } - } - - /* Realloc texture descriptors buffer. */ - memory_manager.free(texture_info); - texture_info.resize(num_slots); - memory_manager.alloc("texture_info", texture_info); - - /* Fill in descriptors */ - foreach(texture_slot_t& slot, texture_slots) { - TextureInfo& info = texture_info[slot.slot]; - - MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); - info.data = desc.offset; - info.cl_buffer = desc.device_buffer; - - if(string_startswith(slot.name, "__tex_image")) { - device_memory *mem = textures[slot.name]; - - info.width = mem->data_width; - info.height = mem->data_height; - info.depth = mem->data_depth; - - info.interpolation = mem->interpolation; - info.extension = mem->extension; - } - } - - /* Force write of descriptors. */ - memory_manager.free(texture_info); - memory_manager.alloc("texture_info", texture_info); -} + if (!textures_need_update) { + return; + } + textures_need_update = false; + + /* Setup slots for textures. 
*/ + int num_slots = 0; + + vector<texture_slot_t> texture_slots; + +# define KERNEL_TEX(type, name) \ + if (textures.find(#name) != textures.end()) { \ + texture_slots.push_back(texture_slot_t(#name, num_slots)); \ + } \ + num_slots++; +# include "kernel/kernel_textures.h" + + int num_data_slots = num_slots; + + foreach (TexturesMap::value_type &tex, textures) { + string name = tex.first; + + if (string_startswith(name, "__tex_image")) { + int pos = name.rfind("_"); + int id = atoi(name.data() + pos + 1); + texture_slots.push_back(texture_slot_t(name, num_data_slots + id)); + num_slots = max(num_slots, num_data_slots + id + 1); + } + } + + /* Realloc texture descriptors buffer. */ + memory_manager.free(texture_info); + texture_info.resize(num_slots); + memory_manager.alloc("texture_info", texture_info); + + /* Fill in descriptors */ + foreach (texture_slot_t &slot, texture_slots) { + TextureInfo &info = texture_info[slot.slot]; + + MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name); + info.data = desc.offset; + info.cl_buffer = desc.device_buffer; + if (string_startswith(slot.name, "__tex_image")) { + device_memory *mem = textures[slot.name]; + + info.width = mem->data_width; + info.height = mem->data_height; + info.depth = mem->data_depth; + + info.interpolation = mem->interpolation; + info.extension = mem->extension; + } + } + + /* Force write of descriptors. 
*/ + memory_manager.free(texture_info); + memory_manager.alloc("texture_info", texture_info); +} void OpenCLDevice::thread_run(DeviceTask *task) { - flush_texture_buffers(); - - if(task->type == DeviceTask::FILM_CONVERT) { - film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); - } - else if(task->type == DeviceTask::SHADER) { - shader(*task); - } - else if(task->type == DeviceTask::RENDER) { - RenderTile tile; - DenoisingTask denoising(this, *task); - - /* Allocate buffer for kernel globals */ - device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); - kgbuffer.alloc_to_device(1); - - /* Keep rendering tiles until done. */ - while(task->acquire_tile(this, tile)) { - if(tile.task == RenderTile::PATH_TRACE) { - assert(tile.task == RenderTile::PATH_TRACE); - scoped_timer timer(&tile.buffers->render_time); - - split_kernel->path_trace(task, - tile, - kgbuffer, - *const_mem_map["__data"]); - - /* Complete kernel execution before release tile. */ - /* This helps in multi-device render; - * The device that reaches the critical-section function - * release_tile waits (stalling other devices from entering - * release_tile) for all kernels to complete. If device1 (a - * slow-render device) reaches release_tile first then it would - * stall device2 (a fast-render device) from proceeding to render - * next tile. 
- */ - clFinish(cqCommandQueue); - } - else if(tile.task == RenderTile::DENOISE) { - tile.sample = tile.start_sample + tile.num_samples; - denoise(tile, denoising); - task->update_progress(&tile, tile.w*tile.h); - } - - task->release_tile(tile); - } - - kgbuffer.free(); - } + flush_texture_buffers(); + + if (task->type == DeviceTask::FILM_CONVERT) { + film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half); + } + else if (task->type == DeviceTask::SHADER) { + shader(*task); + } + else if (task->type == DeviceTask::RENDER) { + RenderTile tile; + DenoisingTask denoising(this, *task); + + /* Allocate buffer for kernel globals */ + device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals"); + kgbuffer.alloc_to_device(1); + + /* Keep rendering tiles until done. */ + while (task->acquire_tile(this, tile)) { + if (tile.task == RenderTile::PATH_TRACE) { + assert(tile.task == RenderTile::PATH_TRACE); + scoped_timer timer(&tile.buffers->render_time); + + split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]); + + /* Complete kernel execution before release tile. */ + /* This helps in multi-device render; + * The device that reaches the critical-section function + * release_tile waits (stalling other devices from entering + * release_tile) for all kernels to complete. If device1 (a + * slow-render device) reaches release_tile first then it would + * stall device2 (a fast-render device) from proceeding to render + * next tile. 
+ */ + clFinish(cqCommandQueue); + } + else if (tile.task == RenderTile::DENOISE) { + tile.sample = tile.start_sample + tile.num_samples; + denoise(tile, denoising); + task->update_progress(&tile, tile.w * tile.h); + } + + task->release_tile(tile); + } + + kgbuffer.free(); + } } -void OpenCLDevice::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half) +void OpenCLDevice::film_convert(DeviceTask &task, + device_ptr buffer, + device_ptr rgba_byte, + device_ptr rgba_half) { - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half); - cl_mem d_buffer = CL_MEM_PTR(buffer); - cl_int d_x = task.x; - cl_int d_y = task.y; - cl_int d_w = task.w; - cl_int d_h = task.h; - cl_float d_sample_scale = 1.0f/(task.sample + 1); - cl_int d_offset = task.offset; - cl_int d_stride = task.stride; - - - cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float")); - - cl_uint start_arg_index = - kernel_set_args(ckFilmConvertKernel, - 0, - d_data, - d_rgba, - d_buffer); - - set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); - - start_arg_index += kernel_set_args(ckFilmConvertKernel, - start_arg_index, - d_sample_scale, - d_x, - d_y, - d_w, - d_h, - d_offset, - d_stride); - - enqueue_kernel(ckFilmConvertKernel, d_w, d_h); + /* cast arguments to cl types */ + cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); + cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half); + cl_mem d_buffer = CL_MEM_PTR(buffer); + cl_int d_x = task.x; + cl_int d_y = task.y; + cl_int d_w = task.w; + cl_int d_h = task.h; + cl_float d_sample_scale = 1.0f / (task.sample + 1); + cl_int d_offset = task.offset; + cl_int d_stride = task.stride; + + cl_kernel ckFilmConvertKernel = (rgba_byte) ? 
base_program(ustring("convert_to_byte")) : + base_program(ustring("convert_to_half_float")); + + cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer); + + set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index); + + start_arg_index += kernel_set_args(ckFilmConvertKernel, + start_arg_index, + d_sample_scale, + d_x, + d_y, + d_w, + d_h, + d_offset, + d_stride); + + enqueue_kernel(ckFilmConvertKernel, d_w, d_h); } bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr, @@ -1419,123 +1406,119 @@ bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr, device_ptr out_ptr, DenoisingTask *task) { - int stride = task->buffer.stride; - int w = task->buffer.width; - int h = task->buffer.h; - int r = task->nlm_state.r; - int f = task->nlm_state.f; - float a = task->nlm_state.a; - float k_2 = task->nlm_state.k_2; - - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - int channel_offset = task->nlm_state.is_color? 
task->buffer.pass_stride : 0; - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); - device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); - device_sub_ptr weightAccum(task->buffer.temporary_mem, 2*pass_stride*num_shifts, pass_stride); - cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - cl_mem image_mem = CL_MEM_PTR(image_ptr); - cl_mem guide_mem = CL_MEM_PTR(guide_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem out_mem = CL_MEM_PTR(out_ptr); - cl_mem scale_mem = NULL; - - mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride); - mem_zero_kernel(out_ptr, sizeof(float)*pass_stride); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); - cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); - - kernel_set_args(ckNLMCalcDifference, 0, - guide_mem, - variance_mem, - scale_mem, - difference_mem, - w, h, stride, - pass_stride, - r, channel_offset, - 0, a, k_2); - kernel_set_args(ckNLMBlur, 0, - difference_mem, - blurDifference_mem, - w, h, stride, - pass_stride, - r, f); - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference_mem, - difference_mem, - w, h, stride, - pass_stride, - r, f); - kernel_set_args(ckNLMUpdateOutput, 0, - blurDifference_mem, - image_mem, - out_mem, - weightAccum_mem, - w, h, stride, - pass_stride, - channel_offset, - r, f); - - enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, 
num_shifts, true); - enqueue_kernel(ckNLMUpdateOutput, w*h, num_shifts, true); - - kernel_set_args(ckNLMNormalize, 0, - out_mem, weightAccum_mem, w, h, stride); - enqueue_kernel(ckNLMNormalize, w, h); - - return true; + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; + int r = task->nlm_state.r; + int f = task->nlm_state.f; + float a = task->nlm_state.a; + float k_2 = task->nlm_state.k_2; + + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0; + + device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); + device_sub_ptr blurDifference( + task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); + device_sub_ptr weightAccum( + task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride); + cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum); + cl_mem difference_mem = CL_MEM_PTR(*difference); + cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); + + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem guide_mem = CL_MEM_PTR(guide_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem out_mem = CL_MEM_PTR(out_ptr); + cl_mem scale_mem = NULL; + + mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride); + mem_zero_kernel(out_ptr, sizeof(float) * pass_stride); + + cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); + cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); + + kernel_set_args(ckNLMCalcDifference, + 0, + guide_mem, + variance_mem, + scale_mem, + difference_mem, + w, + h, + stride, + pass_stride, + r, + channel_offset, + 0, + a, 
+ k_2); + kernel_set_args( + ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f); + kernel_set_args( + ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f); + kernel_set_args(ckNLMUpdateOutput, + 0, + blurDifference_mem, + image_mem, + out_mem, + weightAccum_mem, + w, + h, + stride, + pass_stride, + channel_offset, + r, + f); + + enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); + enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true); + + kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride); + enqueue_kernel(ckNLMNormalize, w, h); + + return true; } bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task) { - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - char use_time = task->buffer.use_time? 
1 : 0; - - cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); - - int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, - buffer_mem, - tile_info_mem); - cl_mem buffers[9]; - for(int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterConstructTransform, - arg_ofs, - buffers[i]); - } - kernel_set_args(ckFilterConstructTransform, - arg_ofs, - transform_mem, - rank_mem, - task->filter_area, - task->rect, - task->buffer.pass_stride, - task->buffer.frame_stride, - use_time, - task->radius, - task->pca_threshold); - - enqueue_kernel(ckFilterConstructTransform, - task->storage.w, - task->storage.h, - 256); - - return true; + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + char use_time = task->buffer.use_time ? 
1 : 0; + + cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform")); + + int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem); + cl_mem buffers[9]; + for (int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]); + } + kernel_set_args(ckFilterConstructTransform, + arg_ofs, + transform_mem, + rank_mem, + task->filter_area, + task->rect, + task->buffer.pass_stride, + task->buffer.frame_stride, + use_time, + task->radius, + task->pca_threshold); + + enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256); + + return true; } bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr, @@ -1544,136 +1527,130 @@ bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr, int frame, DenoisingTask *task) { - cl_mem color_mem = CL_MEM_PTR(color_ptr); - cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); - cl_mem scale_mem = CL_MEM_PTR(scale_ptr); - - cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); - cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference")); - cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); - cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); - cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - int stride = task->buffer.stride; - int frame_offset = frame * task->buffer.frame_stride; - int t = task->tile_info->frames[frame]; - char 
use_time = task->buffer.use_time? 1 : 0; - - int r = task->radius; - int pass_stride = task->buffer.pass_stride; - int num_shifts = (2*r+1)*(2*r+1); - - device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts); - device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts); - cl_mem difference_mem = CL_MEM_PTR(*difference); - cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); - - kernel_set_args(ckNLMCalcDifference, 0, - color_mem, - color_variance_mem, - scale_mem, - difference_mem, - w, h, stride, - pass_stride, - r, - pass_stride, - frame_offset, - 1.0f, task->nlm_k_2); - kernel_set_args(ckNLMBlur, 0, - difference_mem, - blurDifference_mem, - w, h, stride, - pass_stride, - r, 4); - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference_mem, - difference_mem, - w, h, stride, - pass_stride, - r, 4); - kernel_set_args(ckNLMConstructGramian, 0, - t, - blurDifference_mem, - buffer_mem, - transform_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->reconstruction_state.filter_window, - w, h, stride, - pass_stride, - r, 4, - frame_offset, - use_time); - - enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); - enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); - enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); - - return true; + cl_mem color_mem = CL_MEM_PTR(color_ptr); + cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr); + cl_mem scale_mem = CL_MEM_PTR(scale_ptr); + + cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer); + cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + cl_kernel ckNLMCalcDifference = 
denoising_program(ustring("filter_nlm_calc_difference")); + cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur")); + cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight")); + cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + int frame_offset = frame * task->buffer.frame_stride; + int t = task->tile_info->frames[frame]; + char use_time = task->buffer.use_time ? 1 : 0; + + int r = task->radius; + int pass_stride = task->buffer.pass_stride; + int num_shifts = (2 * r + 1) * (2 * r + 1); + + device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts); + device_sub_ptr blurDifference( + task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts); + cl_mem difference_mem = CL_MEM_PTR(*difference); + cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference); + + kernel_set_args(ckNLMCalcDifference, + 0, + color_mem, + color_variance_mem, + scale_mem, + difference_mem, + w, + h, + stride, + pass_stride, + r, + pass_stride, + frame_offset, + 1.0f, + task->nlm_k_2); + kernel_set_args( + ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4); + kernel_set_args( + ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4); + kernel_set_args(ckNLMConstructGramian, + 0, + t, + blurDifference_mem, + buffer_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->reconstruction_state.filter_window, + w, + h, + stride, + pass_stride, + r, + 4, + frame_offset, + use_time); + + enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w * h, num_shifts, true); + enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, 
true, 256); + + return true; } -bool OpenCLDevice::denoising_solve(device_ptr output_ptr, - DenoisingTask *task) +bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task) { - cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); - - cl_mem output_mem = CL_MEM_PTR(output_ptr); - cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); - cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); - cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); - - int w = task->reconstruction_state.source_w; - int h = task->reconstruction_state.source_h; - - kernel_set_args(ckFinalize, 0, - output_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - task->filter_area, - task->reconstruction_state.buffer_params, - task->render_buffer.samples); - enqueue_kernel(ckFinalize, w, h); - - return true; + cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); + + cl_mem output_mem = CL_MEM_PTR(output_ptr); + cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer); + cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer); + cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer); + + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + + kernel_set_args(ckFinalize, + 0, + output_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->filter_area, + task->reconstruction_state.buffer_params, + task->render_buffer.samples); + enqueue_kernel(ckFinalize, w, h); + + return true; } bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, device_ptr mean_ptr, device_ptr variance_ptr, - int r, int4 rect, + int r, + int4 rect, DenoisingTask *task) { - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); - - kernel_set_args(ckFilterCombineHalves, 0, - 
mean_mem, - variance_mem, - a_mem, - b_mem, - rect, - r); - enqueue_kernel(ckFilterCombineHalves, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem mean_mem = CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves")); + + kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r); + enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + return true; } bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr, @@ -1683,39 +1660,36 @@ bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr, device_ptr buffer_variance_ptr, DenoisingTask *task) { - cl_mem a_mem = CL_MEM_PTR(a_ptr); - cl_mem b_mem = CL_MEM_PTR(b_ptr); - cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr); - cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); - cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); - - int arg_ofs = kernel_set_args(ckFilterDivideShadow, 0, - task->render_buffer.samples, - tile_info_mem); - cl_mem buffers[9]; - for(int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, - buffers[i]); - } - kernel_set_args(ckFilterDivideShadow, arg_ofs, - a_mem, - b_mem, - sample_variance_mem, - sv_variance_mem, - buffer_variance_mem, - task->rect, - task->render_buffer.pass_stride, - task->render_buffer.offset); - enqueue_kernel(ckFilterDivideShadow, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; + cl_mem a_mem = CL_MEM_PTR(a_ptr); + cl_mem b_mem = CL_MEM_PTR(b_ptr); + cl_mem sample_variance_mem = 
CL_MEM_PTR(sample_variance_ptr); + cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr); + cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr); + + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow")); + + int arg_ofs = kernel_set_args( + ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem); + cl_mem buffers[9]; + for (int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]); + } + kernel_set_args(ckFilterDivideShadow, + arg_ofs, + a_mem, + b_mem, + sample_variance_mem, + sv_variance_mem, + buffer_variance_mem, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.offset); + enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + return true; } bool OpenCLDevice::denoising_get_feature(int mean_offset, @@ -1725,36 +1699,32 @@ bool OpenCLDevice::denoising_get_feature(int mean_offset, float scale, DenoisingTask *task) { - cl_mem mean_mem = CL_MEM_PTR(mean_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - - cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); - - cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); - - int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, - task->render_buffer.samples, - tile_info_mem); - cl_mem buffers[9]; - for(int i = 0; i < 9; i++) { - buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); - arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, - buffers[i]); - } - kernel_set_args(ckFilterGetFeature, arg_ofs, - mean_offset, - variance_offset, - mean_mem, - variance_mem, - scale, - task->rect, - task->render_buffer.pass_stride, - task->render_buffer.offset); - enqueue_kernel(ckFilterGetFeature, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; + cl_mem mean_mem = 
CL_MEM_PTR(mean_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + + cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer); + + cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature")); + + int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem); + cl_mem buffers[9]; + for (int i = 0; i < 9; i++) { + buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]); + arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]); + } + kernel_set_args(ckFilterGetFeature, + arg_ofs, + mean_offset, + variance_offset, + mean_mem, + variance_mem, + scale, + task->rect, + task->render_buffer.pass_stride, + task->render_buffer.offset); + enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + return true; } bool OpenCLDevice::denoising_write_feature(int out_offset, @@ -1762,24 +1732,23 @@ bool OpenCLDevice::denoising_write_feature(int out_offset, device_ptr buffer_ptr, DenoisingTask *task) { - cl_mem from_mem = CL_MEM_PTR(from_ptr); - cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); - - cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); - - kernel_set_args(ckFilterWriteFeature, 0, - task->render_buffer.samples, - task->reconstruction_state.buffer_params, - task->filter_area, - from_mem, - buffer_mem, - out_offset, - task->rect); - enqueue_kernel(ckFilterWriteFeature, - task->filter_area.z, - task->filter_area.w); - - return true; + cl_mem from_mem = CL_MEM_PTR(from_ptr); + cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr); + + cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature")); + + kernel_set_args(ckFilterWriteFeature, + 0, + task->render_buffer.samples, + task->reconstruction_state.buffer_params, + task->filter_area, + from_mem, + buffer_mem, + out_offset, + task->rect); + enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w); + + return true; } bool 
OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr, @@ -1788,155 +1757,155 @@ bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr, device_ptr output_ptr, DenoisingTask *task) { - cl_mem image_mem = CL_MEM_PTR(image_ptr); - cl_mem variance_mem = CL_MEM_PTR(variance_ptr); - cl_mem depth_mem = CL_MEM_PTR(depth_ptr); - cl_mem output_mem = CL_MEM_PTR(output_ptr); - - cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); - - kernel_set_args(ckFilterDetectOutliers, 0, - image_mem, - variance_mem, - depth_mem, - output_mem, - task->rect, - task->buffer.pass_stride); - enqueue_kernel(ckFilterDetectOutliers, - task->rect.z-task->rect.x, - task->rect.w-task->rect.y); - - return true; + cl_mem image_mem = CL_MEM_PTR(image_ptr); + cl_mem variance_mem = CL_MEM_PTR(variance_ptr); + cl_mem depth_mem = CL_MEM_PTR(depth_ptr); + cl_mem output_mem = CL_MEM_PTR(output_ptr); + + cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers")); + + kernel_set_args(ckFilterDetectOutliers, + 0, + image_mem, + variance_mem, + depth_mem, + output_mem, + task->rect, + task->buffer.pass_stride); + enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y); + + return true; } -void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask& denoising) +void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising) { - denoising.functions.construct_transform = function_bind(&OpenCLDevice::denoising_construct_transform, this, &denoising); - denoising.functions.accumulate = function_bind(&OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); - denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising); - denoising.functions.divide_shadow = function_bind(&OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.non_local_means = 
function_bind(&OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); - denoising.functions.combine_halves = function_bind(&OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); - denoising.functions.get_feature = function_bind(&OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); - denoising.functions.write_feature = function_bind(&OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); - denoising.functions.detect_outliers = function_bind(&OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); - - denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h); - denoising.render_buffer.samples = rtile.sample; - denoising.buffer.gpu_temporary_mem = true; - - denoising.run_denoising(&rtile); + denoising.functions.construct_transform = function_bind( + &OpenCLDevice::denoising_construct_transform, this, &denoising); + denoising.functions.accumulate = function_bind( + &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising); + denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising); + denoising.functions.divide_shadow = function_bind( + &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.non_local_means = function_bind( + &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising); + denoising.functions.combine_halves = function_bind( + &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising); + denoising.functions.get_feature = function_bind( + &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising); + denoising.functions.write_feature = function_bind( + &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising); + denoising.functions.detect_outliers = function_bind( + &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising); + + denoising.filter_area = 
make_int4(rtile.x, rtile.y, rtile.w, rtile.h); + denoising.render_buffer.samples = rtile.sample; + denoising.buffer.gpu_temporary_mem = true; + + denoising.run_denoising(&rtile); } -void OpenCLDevice::shader(DeviceTask& task) +void OpenCLDevice::shader(DeviceTask &task) { - /* cast arguments to cl types */ - cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); - cl_mem d_input = CL_MEM_PTR(task.shader_input); - cl_mem d_output = CL_MEM_PTR(task.shader_output); - cl_int d_shader_eval_type = task.shader_eval_type; - cl_int d_shader_filter = task.shader_filter; - cl_int d_shader_x = task.shader_x; - cl_int d_shader_w = task.shader_w; - cl_int d_offset = task.offset; - - OpenCLDevice::OpenCLProgram *program = &background_program; - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - program = &bake_program; - } - else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) { - program = &displace_program; - } - program->wait_for_availability(); - cl_kernel kernel = (*program)(); - - cl_uint start_arg_index = - kernel_set_args(kernel, - 0, - d_data, - d_input, - d_output); - - set_kernel_arg_buffers(kernel, &start_arg_index); - - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_eval_type); - if(task.shader_eval_type >= SHADER_EVAL_BAKE) { - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_filter); - } - start_arg_index += kernel_set_args(kernel, - start_arg_index, - d_shader_x, - d_shader_w, - d_offset); - - for(int sample = 0; sample < task.num_samples; sample++) { - - if(task.get_cancel()) - break; - - kernel_set_args(kernel, start_arg_index, sample); - - enqueue_kernel(kernel, task.shader_w, 1); - - clFinish(cqCommandQueue); - - task.update_progress(NULL); - } + /* cast arguments to cl types */ + cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer); + cl_mem d_input = CL_MEM_PTR(task.shader_input); + cl_mem d_output = CL_MEM_PTR(task.shader_output); + cl_int d_shader_eval_type = 
task.shader_eval_type; + cl_int d_shader_filter = task.shader_filter; + cl_int d_shader_x = task.shader_x; + cl_int d_shader_w = task.shader_w; + cl_int d_offset = task.offset; + + OpenCLDevice::OpenCLProgram *program = &background_program; + if (task.shader_eval_type >= SHADER_EVAL_BAKE) { + program = &bake_program; + } + else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) { + program = &displace_program; + } + program->wait_for_availability(); + cl_kernel kernel = (*program)(); + + cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output); + + set_kernel_arg_buffers(kernel, &start_arg_index); + + start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type); + if (task.shader_eval_type >= SHADER_EVAL_BAKE) { + start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter); + } + start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset); + + for (int sample = 0; sample < task.num_samples; sample++) { + + if (task.get_cancel()) + break; + + kernel_set_args(kernel, start_arg_index, sample); + + enqueue_kernel(kernel, task.shader_w, 1); + + clFinish(cqCommandQueue); + + task.update_progress(NULL); + } } string OpenCLDevice::kernel_build_options(const string *debug_src) { - string build_options = "-cl-no-signed-zeros -cl-mad-enable "; - - if(platform_name == "NVIDIA CUDA") { - build_options += "-D__KERNEL_OPENCL_NVIDIA__ " - "-cl-nv-maxrregcount=32 " - "-cl-nv-verbose "; - - uint compute_capability_major, compute_capability_minor; - clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, - sizeof(cl_uint), &compute_capability_major, NULL); - clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, - sizeof(cl_uint), &compute_capability_minor, NULL); - - build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ", - compute_capability_major * 100 + - compute_capability_minor * 10); - } - - else if(platform_name == "Apple") - build_options += 
"-D__KERNEL_OPENCL_APPLE__ "; - - else if(platform_name == "AMD Accelerated Parallel Processing") - build_options += "-D__KERNEL_OPENCL_AMD__ "; - - else if(platform_name == "Intel(R) OpenCL") { - build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; - - /* Options for gdb source level kernel debugging. - * this segfaults on linux currently. - */ - if(OpenCLInfo::use_debug() && debug_src) - build_options += "-g -s \"" + *debug_src + "\" "; - } - - if(info.has_half_images) { - build_options += "-D__KERNEL_CL_KHR_FP16__ "; - } - - if(OpenCLInfo::use_debug()) { - build_options += "-D__KERNEL_OPENCL_DEBUG__ "; - } - -#ifdef WITH_CYCLES_DEBUG - build_options += "-D__KERNEL_DEBUG__ "; -#endif - - return build_options; + string build_options = "-cl-no-signed-zeros -cl-mad-enable "; + + if (platform_name == "NVIDIA CUDA") { + build_options += + "-D__KERNEL_OPENCL_NVIDIA__ " + "-cl-nv-maxrregcount=32 " + "-cl-nv-verbose "; + + uint compute_capability_major, compute_capability_minor; + clGetDeviceInfo(cdDevice, + CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), + &compute_capability_major, + NULL); + clGetDeviceInfo(cdDevice, + CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), + &compute_capability_minor, + NULL); + + build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ", + compute_capability_major * 100 + compute_capability_minor * 10); + } + + else if (platform_name == "Apple") + build_options += "-D__KERNEL_OPENCL_APPLE__ "; + + else if (platform_name == "AMD Accelerated Parallel Processing") + build_options += "-D__KERNEL_OPENCL_AMD__ "; + + else if (platform_name == "Intel(R) OpenCL") { + build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ "; + + /* Options for gdb source level kernel debugging. + * this segfaults on linux currently. 
+ */ + if (OpenCLInfo::use_debug() && debug_src) + build_options += "-g -s \"" + *debug_src + "\" "; + } + + if (info.has_half_images) { + build_options += "-D__KERNEL_CL_KHR_FP16__ "; + } + + if (OpenCLInfo::use_debug()) { + build_options += "-D__KERNEL_OPENCL_DEBUG__ "; + } + +# ifdef WITH_CYCLES_DEBUG + build_options += "-D__KERNEL_DEBUG__ "; +# endif + + return build_options; } /* TODO(sergey): In the future we can use variadic templates, once @@ -1944,137 +1913,130 @@ string OpenCLDevice::kernel_build_options(const string *debug_src) */ int OpenCLDevice::kernel_set_args(cl_kernel kernel, int start_argument_index, - const ArgumentWrapper& arg1, - const ArgumentWrapper& arg2, - const ArgumentWrapper& arg3, - const ArgumentWrapper& arg4, - const ArgumentWrapper& arg5, - const ArgumentWrapper& arg6, - const ArgumentWrapper& arg7, - const ArgumentWrapper& arg8, - const ArgumentWrapper& arg9, - const ArgumentWrapper& arg10, - const ArgumentWrapper& arg11, - const ArgumentWrapper& arg12, - const ArgumentWrapper& arg13, - const ArgumentWrapper& arg14, - const ArgumentWrapper& arg15, - const ArgumentWrapper& arg16, - const ArgumentWrapper& arg17, - const ArgumentWrapper& arg18, - const ArgumentWrapper& arg19, - const ArgumentWrapper& arg20, - const ArgumentWrapper& arg21, - const ArgumentWrapper& arg22, - const ArgumentWrapper& arg23, - const ArgumentWrapper& arg24, - const ArgumentWrapper& arg25, - const ArgumentWrapper& arg26, - const ArgumentWrapper& arg27, - const ArgumentWrapper& arg28, - const ArgumentWrapper& arg29, - const ArgumentWrapper& arg30, - const ArgumentWrapper& arg31, - const ArgumentWrapper& arg32, - const ArgumentWrapper& arg33) + const ArgumentWrapper &arg1, + const ArgumentWrapper &arg2, + const ArgumentWrapper &arg3, + const ArgumentWrapper &arg4, + const ArgumentWrapper &arg5, + const ArgumentWrapper &arg6, + const ArgumentWrapper &arg7, + const ArgumentWrapper &arg8, + const ArgumentWrapper &arg9, + const ArgumentWrapper &arg10, + const 
ArgumentWrapper &arg11, + const ArgumentWrapper &arg12, + const ArgumentWrapper &arg13, + const ArgumentWrapper &arg14, + const ArgumentWrapper &arg15, + const ArgumentWrapper &arg16, + const ArgumentWrapper &arg17, + const ArgumentWrapper &arg18, + const ArgumentWrapper &arg19, + const ArgumentWrapper &arg20, + const ArgumentWrapper &arg21, + const ArgumentWrapper &arg22, + const ArgumentWrapper &arg23, + const ArgumentWrapper &arg24, + const ArgumentWrapper &arg25, + const ArgumentWrapper &arg26, + const ArgumentWrapper &arg27, + const ArgumentWrapper &arg28, + const ArgumentWrapper &arg29, + const ArgumentWrapper &arg30, + const ArgumentWrapper &arg31, + const ArgumentWrapper &arg32, + const ArgumentWrapper &arg33) { - int current_arg_index = 0; -#define FAKE_VARARG_HANDLE_ARG(arg) \ - do { \ - if(arg.pointer != NULL) { \ - opencl_assert(clSetKernelArg( \ - kernel, \ - start_argument_index + current_arg_index, \ - arg.size, arg.pointer)); \ - ++current_arg_index; \ - } \ - else { \ - return current_arg_index; \ - } \ - } while(false) - FAKE_VARARG_HANDLE_ARG(arg1); - FAKE_VARARG_HANDLE_ARG(arg2); - FAKE_VARARG_HANDLE_ARG(arg3); - FAKE_VARARG_HANDLE_ARG(arg4); - FAKE_VARARG_HANDLE_ARG(arg5); - FAKE_VARARG_HANDLE_ARG(arg6); - FAKE_VARARG_HANDLE_ARG(arg7); - FAKE_VARARG_HANDLE_ARG(arg8); - FAKE_VARARG_HANDLE_ARG(arg9); - FAKE_VARARG_HANDLE_ARG(arg10); - FAKE_VARARG_HANDLE_ARG(arg11); - FAKE_VARARG_HANDLE_ARG(arg12); - FAKE_VARARG_HANDLE_ARG(arg13); - FAKE_VARARG_HANDLE_ARG(arg14); - FAKE_VARARG_HANDLE_ARG(arg15); - FAKE_VARARG_HANDLE_ARG(arg16); - FAKE_VARARG_HANDLE_ARG(arg17); - FAKE_VARARG_HANDLE_ARG(arg18); - FAKE_VARARG_HANDLE_ARG(arg19); - FAKE_VARARG_HANDLE_ARG(arg20); - FAKE_VARARG_HANDLE_ARG(arg21); - FAKE_VARARG_HANDLE_ARG(arg22); - FAKE_VARARG_HANDLE_ARG(arg23); - FAKE_VARARG_HANDLE_ARG(arg24); - FAKE_VARARG_HANDLE_ARG(arg25); - FAKE_VARARG_HANDLE_ARG(arg26); - FAKE_VARARG_HANDLE_ARG(arg27); - FAKE_VARARG_HANDLE_ARG(arg28); - 
FAKE_VARARG_HANDLE_ARG(arg29); - FAKE_VARARG_HANDLE_ARG(arg30); - FAKE_VARARG_HANDLE_ARG(arg31); - FAKE_VARARG_HANDLE_ARG(arg32); - FAKE_VARARG_HANDLE_ARG(arg33); -#undef FAKE_VARARG_HANDLE_ARG - return current_arg_index; + int current_arg_index = 0; +# define FAKE_VARARG_HANDLE_ARG(arg) \ + do { \ + if (arg.pointer != NULL) { \ + opencl_assert(clSetKernelArg( \ + kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \ + ++current_arg_index; \ + } \ + else { \ + return current_arg_index; \ + } \ + } while (false) + FAKE_VARARG_HANDLE_ARG(arg1); + FAKE_VARARG_HANDLE_ARG(arg2); + FAKE_VARARG_HANDLE_ARG(arg3); + FAKE_VARARG_HANDLE_ARG(arg4); + FAKE_VARARG_HANDLE_ARG(arg5); + FAKE_VARARG_HANDLE_ARG(arg6); + FAKE_VARARG_HANDLE_ARG(arg7); + FAKE_VARARG_HANDLE_ARG(arg8); + FAKE_VARARG_HANDLE_ARG(arg9); + FAKE_VARARG_HANDLE_ARG(arg10); + FAKE_VARARG_HANDLE_ARG(arg11); + FAKE_VARARG_HANDLE_ARG(arg12); + FAKE_VARARG_HANDLE_ARG(arg13); + FAKE_VARARG_HANDLE_ARG(arg14); + FAKE_VARARG_HANDLE_ARG(arg15); + FAKE_VARARG_HANDLE_ARG(arg16); + FAKE_VARARG_HANDLE_ARG(arg17); + FAKE_VARARG_HANDLE_ARG(arg18); + FAKE_VARARG_HANDLE_ARG(arg19); + FAKE_VARARG_HANDLE_ARG(arg20); + FAKE_VARARG_HANDLE_ARG(arg21); + FAKE_VARARG_HANDLE_ARG(arg22); + FAKE_VARARG_HANDLE_ARG(arg23); + FAKE_VARARG_HANDLE_ARG(arg24); + FAKE_VARARG_HANDLE_ARG(arg25); + FAKE_VARARG_HANDLE_ARG(arg26); + FAKE_VARARG_HANDLE_ARG(arg27); + FAKE_VARARG_HANDLE_ARG(arg28); + FAKE_VARARG_HANDLE_ARG(arg29); + FAKE_VARARG_HANDLE_ARG(arg30); + FAKE_VARARG_HANDLE_ARG(arg31); + FAKE_VARARG_HANDLE_ARG(arg32); + FAKE_VARARG_HANDLE_ARG(arg33); +# undef FAKE_VARARG_HANDLE_ARG + return current_arg_index; } void OpenCLDevice::release_kernel_safe(cl_kernel kernel) { - if(kernel) { - clReleaseKernel(kernel); - } + if (kernel) { + clReleaseKernel(kernel); + } } void OpenCLDevice::release_mem_object_safe(cl_mem mem) { - if(mem != NULL) { - clReleaseMemObject(mem); - } + if (mem != NULL) { + clReleaseMemObject(mem); + } } 
void OpenCLDevice::release_program_safe(cl_program program) { - if(program) { - clReleaseProgram(program); - } + if (program) { + clReleaseProgram(program); + } } /* ** Those guys are for workign around some compiler-specific bugs ** */ -cl_program OpenCLDevice::load_cached_kernel(ustring key, - thread_scoped_lock& cache_locker) +cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker) { - return OpenCLCache::get_program(cpPlatform, - cdDevice, - key, - cache_locker); + return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker); } void OpenCLDevice::store_cached_kernel(cl_program program, ustring key, - thread_scoped_lock& cache_locker) + thread_scoped_lock &cache_locker) { - OpenCLCache::store_program(cpPlatform, - cdDevice, - program, - key, - cache_locker); + OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker); } -Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background) +Device *opencl_create_split_device(DeviceInfo &info, + Stats &stats, + Profiler &profiler, + bool background) { - return new OpenCLDevice(info, stats, profiler, background); + return new OpenCLDevice(info, stats, profiler, background); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/opencl/opencl_util.cpp b/intern/cycles/device/opencl/opencl_util.cpp index 5a1e12af8ab..cc40ad42b06 100644 --- a/intern/cycles/device/opencl/opencl_util.cpp +++ b/intern/cycles/device/opencl/opencl_util.cpp @@ -16,1059 +16,1017 @@ #ifdef WITH_OPENCL -#include "device/opencl/opencl.h" -#include "device/device_intern.h" +# include "device/opencl/opencl.h" +# include "device/device_intern.h" -#include "util/util_debug.h" -#include "util/util_logging.h" -#include "util/util_md5.h" -#include "util/util_path.h" -#include "util/util_time.h" -#include "util/util_system.h" +# include "util/util_debug.h" +# include "util/util_logging.h" +# include "util/util_md5.h" +# include "util/util_path.h" +# 
include "util/util_time.h" +# include "util/util_system.h" using std::cerr; using std::endl; CCL_NAMESPACE_BEGIN -OpenCLCache::Slot::ProgramEntry::ProgramEntry() - : program(NULL), - mutex(NULL) +OpenCLCache::Slot::ProgramEntry::ProgramEntry() : program(NULL), mutex(NULL) { } -OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry& rhs) - : program(rhs.program), - mutex(NULL) +OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry &rhs) + : program(rhs.program), mutex(NULL) { } OpenCLCache::Slot::ProgramEntry::~ProgramEntry() { - delete mutex; + delete mutex; } -OpenCLCache::Slot::Slot() - : context_mutex(NULL), - context(NULL) +OpenCLCache::Slot::Slot() : context_mutex(NULL), context(NULL) { } -OpenCLCache::Slot::Slot(const Slot& rhs) - : context_mutex(NULL), - context(NULL), - programs(rhs.programs) +OpenCLCache::Slot::Slot(const Slot &rhs) + : context_mutex(NULL), context(NULL), programs(rhs.programs) { } OpenCLCache::Slot::~Slot() { - delete context_mutex; + delete context_mutex; } -OpenCLCache& OpenCLCache::global_instance() +OpenCLCache &OpenCLCache::global_instance() { - static OpenCLCache instance; - return instance; + static OpenCLCache instance; + return instance; } cl_context OpenCLCache::get_context(cl_platform_id platform, cl_device_id device, - thread_scoped_lock& slot_locker) + thread_scoped_lock &slot_locker) { - assert(platform != NULL); + assert(platform != NULL); - OpenCLCache& self = global_instance(); + OpenCLCache &self = global_instance(); - thread_scoped_lock cache_lock(self.cache_lock); + thread_scoped_lock cache_lock(self.cache_lock); - pair<CacheMap::iterator,bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); + pair<CacheMap::iterator, bool> ins = self.cache.insert( + CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - Slot &slot = ins.first->second; + Slot &slot = ins.first->second; - /* create slot lock only while holding cache lock */ - 
if(!slot.context_mutex) - slot.context_mutex = new thread_mutex; + /* create slot lock only while holding cache lock */ + if (!slot.context_mutex) + slot.context_mutex = new thread_mutex; - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); + /* need to unlock cache before locking slot, to allow store to complete */ + cache_lock.unlock(); - /* lock the slot */ - slot_locker = thread_scoped_lock(*slot.context_mutex); + /* lock the slot */ + slot_locker = thread_scoped_lock(*slot.context_mutex); - /* If the thing isn't cached */ - if(slot.context == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } + /* If the thing isn't cached */ + if (slot.context == NULL) { + /* return with the caller's lock holder holding the slot lock */ + return NULL; + } - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); + /* the item was already cached, release the slot lock */ + slot_locker.unlock(); - cl_int ciErr = clRetainContext(slot.context); - assert(ciErr == CL_SUCCESS); - (void) ciErr; + cl_int ciErr = clRetainContext(slot.context); + assert(ciErr == CL_SUCCESS); + (void)ciErr; - return slot.context; + return slot.context; } cl_program OpenCLCache::get_program(cl_platform_id platform, cl_device_id device, ustring key, - thread_scoped_lock& slot_locker) + thread_scoped_lock &slot_locker) { - assert(platform != NULL); + assert(platform != NULL); - OpenCLCache& self = global_instance(); + OpenCLCache &self = global_instance(); - thread_scoped_lock cache_lock(self.cache_lock); + thread_scoped_lock cache_lock(self.cache_lock); - pair<CacheMap::iterator,bool> ins = self.cache.insert( - CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); + pair<CacheMap::iterator, bool> ins = self.cache.insert( + CacheMap::value_type(PlatformDevicePair(platform, device), Slot())); - Slot &slot = ins.first->second; + Slot &slot = ins.first->second; - 
pair<Slot::EntryMap::iterator,bool> ins2 = slot.programs.insert( - Slot::EntryMap::value_type(key, Slot::ProgramEntry())); + pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert( + Slot::EntryMap::value_type(key, Slot::ProgramEntry())); - Slot::ProgramEntry &entry = ins2.first->second; + Slot::ProgramEntry &entry = ins2.first->second; - /* create slot lock only while holding cache lock */ - if(!entry.mutex) - entry.mutex = new thread_mutex; + /* create slot lock only while holding cache lock */ + if (!entry.mutex) + entry.mutex = new thread_mutex; - /* need to unlock cache before locking slot, to allow store to complete */ - cache_lock.unlock(); + /* need to unlock cache before locking slot, to allow store to complete */ + cache_lock.unlock(); - /* lock the slot */ - slot_locker = thread_scoped_lock(*entry.mutex); + /* lock the slot */ + slot_locker = thread_scoped_lock(*entry.mutex); - /* If the thing isn't cached */ - if(entry.program == NULL) { - /* return with the caller's lock holder holding the slot lock */ - return NULL; - } + /* If the thing isn't cached */ + if (entry.program == NULL) { + /* return with the caller's lock holder holding the slot lock */ + return NULL; + } - /* the item was already cached, release the slot lock */ - slot_locker.unlock(); + /* the item was already cached, release the slot lock */ + slot_locker.unlock(); - cl_int ciErr = clRetainProgram(entry.program); - assert(ciErr == CL_SUCCESS); - (void) ciErr; + cl_int ciErr = clRetainProgram(entry.program); + assert(ciErr == CL_SUCCESS); + (void)ciErr; - return entry.program; + return entry.program; } void OpenCLCache::store_context(cl_platform_id platform, cl_device_id device, cl_context context, - thread_scoped_lock& slot_locker) + thread_scoped_lock &slot_locker) { - assert(platform != NULL); - assert(device != NULL); - assert(context != NULL); + assert(platform != NULL); + assert(device != NULL); + assert(context != NULL); - OpenCLCache &self = global_instance(); + 
OpenCLCache &self = global_instance(); - thread_scoped_lock cache_lock(self.cache_lock); - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - cache_lock.unlock(); + thread_scoped_lock cache_lock(self.cache_lock); + CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); + cache_lock.unlock(); - Slot &slot = i->second; + Slot &slot = i->second; - /* sanity check */ - assert(i != self.cache.end()); - assert(slot.context == NULL); + /* sanity check */ + assert(i != self.cache.end()); + assert(slot.context == NULL); - slot.context = context; + slot.context = context; - /* unlock the slot */ - slot_locker.unlock(); + /* unlock the slot */ + slot_locker.unlock(); - /* increment reference count in OpenCL. - * The caller is going to release the object when done with it. */ - cl_int ciErr = clRetainContext(context); - assert(ciErr == CL_SUCCESS); - (void) ciErr; + /* increment reference count in OpenCL. + * The caller is going to release the object when done with it. 
*/ + cl_int ciErr = clRetainContext(context); + assert(ciErr == CL_SUCCESS); + (void)ciErr; } void OpenCLCache::store_program(cl_platform_id platform, cl_device_id device, cl_program program, ustring key, - thread_scoped_lock& slot_locker) + thread_scoped_lock &slot_locker) { - assert(platform != NULL); - assert(device != NULL); - assert(program != NULL); + assert(platform != NULL); + assert(device != NULL); + assert(program != NULL); - OpenCLCache &self = global_instance(); + OpenCLCache &self = global_instance(); - thread_scoped_lock cache_lock(self.cache_lock); + thread_scoped_lock cache_lock(self.cache_lock); - CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); - assert(i != self.cache.end()); - Slot &slot = i->second; + CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device)); + assert(i != self.cache.end()); + Slot &slot = i->second; - Slot::EntryMap::iterator i2 = slot.programs.find(key); - assert(i2 != slot.programs.end()); - Slot::ProgramEntry &entry = i2->second; + Slot::EntryMap::iterator i2 = slot.programs.find(key); + assert(i2 != slot.programs.end()); + Slot::ProgramEntry &entry = i2->second; - assert(entry.program == NULL); + assert(entry.program == NULL); - cache_lock.unlock(); + cache_lock.unlock(); - entry.program = program; + entry.program = program; - /* unlock the slot */ - slot_locker.unlock(); + /* unlock the slot */ + slot_locker.unlock(); - /* Increment reference count in OpenCL. - * The caller is going to release the object when done with it. - */ - cl_int ciErr = clRetainProgram(program); - assert(ciErr == CL_SUCCESS); - (void) ciErr; + /* Increment reference count in OpenCL. + * The caller is going to release the object when done with it. 
+ */ + cl_int ciErr = clRetainProgram(program); + assert(ciErr == CL_SUCCESS); + (void)ciErr; } string OpenCLCache::get_kernel_md5() { - OpenCLCache &self = global_instance(); - thread_scoped_lock lock(self.kernel_md5_lock); + OpenCLCache &self = global_instance(); + thread_scoped_lock lock(self.kernel_md5_lock); - if(self.kernel_md5.empty()) { - self.kernel_md5 = path_files_md5_hash(path_get("source")); - } - return self.kernel_md5; + if (self.kernel_md5.empty()) { + self.kernel_md5 = path_files_md5_hash(path_get("source")); + } + return self.kernel_md5; } -static string get_program_source(const string& kernel_file) +static string get_program_source(const string &kernel_file) { - string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; - /* We compile kernels consisting of many files. unfortunately OpenCL - * kernel caches do not seem to recognize changes in included files. - * so we force recompile on changes by adding the md5 hash of all files. - */ - source = path_source_replace_includes(source, path_get("source")); - source += "\n// " + util_md5_string(source) + "\n"; - return source; + string source = "#include \"kernel/kernels/opencl/" + kernel_file + "\"\n"; + /* We compile kernels consisting of many files. unfortunately OpenCL + * kernel caches do not seem to recognize changes in included files. + * so we force recompile on changes by adding the md5 hash of all files. 
+ */ + source = path_source_replace_includes(source, path_get("source")); + source += "\n// " + util_md5_string(source) + "\n"; + return source; } OpenCLDevice::OpenCLProgram::OpenCLProgram(OpenCLDevice *device, - const string& program_name, - const string& kernel_file, - const string& kernel_build_options, - bool use_stdout) - : device(device), - program_name(program_name), - kernel_file(kernel_file), - kernel_build_options(kernel_build_options), - use_stdout(use_stdout) + const string &program_name, + const string &kernel_file, + const string &kernel_build_options, + bool use_stdout) + : device(device), + program_name(program_name), + kernel_file(kernel_file), + kernel_build_options(kernel_build_options), + use_stdout(use_stdout) { - loaded = false; - needs_compiling = true; - program = NULL; + loaded = false; + needs_compiling = true; + program = NULL; } OpenCLDevice::OpenCLProgram::~OpenCLProgram() { - release(); + release(); } void OpenCLDevice::OpenCLProgram::release() { - for(map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); ++kernel) { - if(kernel->second) { - clReleaseKernel(kernel->second); - kernel->second = NULL; - } - } - if(program) { - clReleaseProgram(program); - program = NULL; - } + for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); + ++kernel) { + if (kernel->second) { + clReleaseKernel(kernel->second); + kernel->second = NULL; + } + } + if (program) { + clReleaseProgram(program); + program = NULL; + } } -void OpenCLDevice::OpenCLProgram::add_log(const string& msg, bool debug) +void OpenCLDevice::OpenCLProgram::add_log(const string &msg, bool debug) { - if(!use_stdout) { - log += msg + "\n"; - } - else if(!debug) { - printf("%s\n", msg.c_str()); - fflush(stdout); - } - else { - VLOG(2) << msg; - } + if (!use_stdout) { + log += msg + "\n"; + } + else if (!debug) { + printf("%s\n", msg.c_str()); + fflush(stdout); + } + else { + VLOG(2) << msg; + } } -void 
OpenCLDevice::OpenCLProgram::add_error(const string& msg) +void OpenCLDevice::OpenCLProgram::add_error(const string &msg) { - if(use_stdout) { - fprintf(stderr, "%s\n", msg.c_str()); - } - if(error_msg == "") { - error_msg += "\n"; - } - error_msg += msg; + if (use_stdout) { + fprintf(stderr, "%s\n", msg.c_str()); + } + if (error_msg == "") { + error_msg += "\n"; + } + error_msg += msg; } void OpenCLDevice::OpenCLProgram::add_kernel(ustring name) { - if(!kernels.count(name)) { - kernels[name] = NULL; - } + if (!kernels.count(name)) { + kernels[name] = NULL; + } } bool OpenCLDevice::OpenCLProgram::build_kernel(const string *debug_src) { - string build_options; - build_options = device->kernel_build_options(debug_src) + kernel_build_options; + string build_options; + build_options = device->kernel_build_options(debug_src) + kernel_build_options; - VLOG(1) << "Build options passed to clBuildProgram: '" - << build_options << "'."; - cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); + VLOG(1) << "Build options passed to clBuildProgram: '" << build_options << "'."; + cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - /* show warnings even if build is successful */ - size_t ret_val_size = 0; + /* show warnings even if build is successful */ + size_t ret_val_size = 0; - clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); + clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); - if(ciErr != CL_SUCCESS) { - add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) + ", errors in console."); - } + if (ciErr != CL_SUCCESS) { + add_error(string("OpenCL build failed with error ") + clewErrorString(ciErr) + + ", errors in console."); + } - if(ret_val_size > 1) { - vector<char> build_log(ret_val_size + 1); - clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], 
NULL); + if (ret_val_size > 1) { + vector<char> build_log(ret_val_size + 1); + clGetProgramBuildInfo( + program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL); - build_log[ret_val_size] = '\0'; - /* Skip meaningless empty output from the NVidia compiler. */ - if(!(ret_val_size == 2 && build_log[0] == '\n')) { - add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]), ciErr == CL_SUCCESS); - } - } + build_log[ret_val_size] = '\0'; + /* Skip meaningless empty output from the NVidia compiler. */ + if (!(ret_val_size == 2 && build_log[0] == '\n')) { + add_log(string("OpenCL program ") + program_name + " build output: " + string(&build_log[0]), + ciErr == CL_SUCCESS); + } + } - return (ciErr == CL_SUCCESS); + return (ciErr == CL_SUCCESS); } bool OpenCLDevice::OpenCLProgram::compile_kernel(const string *debug_src) { - string source = get_program_source(kernel_file); + string source = get_program_source(kernel_file); - if(debug_src) { - path_write_text(*debug_src, source); - } + if (debug_src) { + path_write_text(*debug_src, source); + } - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_int ciErr; + size_t source_len = source.size(); + const char *source_str = source.c_str(); + cl_int ciErr; - program = clCreateProgramWithSource(device->cxContext, - 1, - &source_str, - &source_len, - &ciErr); + program = clCreateProgramWithSource(device->cxContext, 1, &source_str, &source_len, &ciErr); - if(ciErr != CL_SUCCESS) { - add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr)); - return false; - } + if (ciErr != CL_SUCCESS) { + add_error(string("OpenCL program creation failed: ") + clewErrorString(ciErr)); + return false; + } - double starttime = time_dt(); - add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); + double starttime = time_dt(); + 
add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); + add_log(string("Build flags: ") + kernel_build_options, true); - if(!build_kernel(debug_src)) - return false; + if (!build_kernel(debug_src)) + return false; - double elapsed = time_dt() - starttime; - add_log(string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), false); + double elapsed = time_dt() - starttime; + add_log( + string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), + false); - return true; + return true; } -static void escape_python_string(string& str) +static void escape_python_string(string &str) { - /* Escape string to be passed as a Python raw string with '' quotes'. */ - string_replace(str, "'", "\'"); + /* Escape string to be passed as a Python raw string with '' quotes'. */ + string_replace(str, "'", "\'"); } -bool OpenCLDevice::OpenCLProgram::compile_separate(const string& clbin) +bool OpenCLDevice::OpenCLProgram::compile_separate(const string &clbin) { - vector<string> args; - args.push_back("--background"); - args.push_back("--factory-startup"); - args.push_back("--python-expr"); - - int device_platform_id = device->device_num; - string device_name = device->device_name; - string platform_name = device->platform_name; - string build_options = device->kernel_build_options(NULL) + kernel_build_options; - string kernel_file_escaped = kernel_file; - string clbin_escaped = clbin; - - escape_python_string(device_name); - escape_python_string(platform_name); - escape_python_string(build_options); - escape_python_string(kernel_file_escaped); - escape_python_string(clbin_escaped); - - args.push_back( - string_printf( - "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')", - device_platform_id, - device_name.c_str(), - platform_name.c_str(), - build_options.c_str(), - kernel_file_escaped.c_str(), - clbin_escaped.c_str())); - - double starttime = time_dt(); - 
add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); - add_log(string("Build flags: ") + kernel_build_options, true); - if(!system_call_self(args) || !path_exists(clbin)) { - return false; - } - - double elapsed = time_dt() - starttime; - add_log(string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), false); - - return load_binary(clbin); + vector<string> args; + args.push_back("--background"); + args.push_back("--factory-startup"); + args.push_back("--python-expr"); + + int device_platform_id = device->device_num; + string device_name = device->device_name; + string platform_name = device->platform_name; + string build_options = device->kernel_build_options(NULL) + kernel_build_options; + string kernel_file_escaped = kernel_file; + string clbin_escaped = clbin; + + escape_python_string(device_name); + escape_python_string(platform_name); + escape_python_string(build_options); + escape_python_string(kernel_file_escaped); + escape_python_string(clbin_escaped); + + args.push_back(string_printf( + "import _cycles; _cycles.opencl_compile(r'%d', r'%s', r'%s', r'%s', r'%s', r'%s')", + device_platform_id, + device_name.c_str(), + platform_name.c_str(), + build_options.c_str(), + kernel_file_escaped.c_str(), + clbin_escaped.c_str())); + + double starttime = time_dt(); + add_log(string("Cycles: compiling OpenCL program ") + program_name + "...", false); + add_log(string("Build flags: ") + kernel_build_options, true); + if (!system_call_self(args) || !path_exists(clbin)) { + return false; + } + + double elapsed = time_dt() - starttime; + add_log( + string_printf("Kernel compilation of %s finished in %.2lfs.", program_name.c_str(), elapsed), + false); + + return load_binary(clbin); } /* Compile opencl kernel. This method is called from the _cycles Python * module compile kernels. Parameters must match function above. 
*/ -bool device_opencl_compile_kernel(const vector<string>& parameters) +bool device_opencl_compile_kernel(const vector<string> ¶meters) { - int device_platform_id = std::stoi(parameters[0]); - const string& device_name = parameters[1]; - const string& platform_name = parameters[2]; - const string& build_options = parameters[3]; - const string& kernel_file = parameters[4]; - const string& binary_path = parameters[5]; - - if(clewInit() != CLEW_SUCCESS) { - return false; - } - - vector<OpenCLPlatformDevice> usable_devices; - OpenCLInfo::get_usable_devices(&usable_devices); - if(device_platform_id >= usable_devices.size()) { - return false; - } - - OpenCLPlatformDevice& platform_device = usable_devices[device_platform_id]; - if(platform_device.platform_name != platform_name || - platform_device.device_name != device_name) - { - return false; - } - - cl_platform_id platform = platform_device.platform_id; - cl_device_id device = platform_device.device_id; - const cl_context_properties context_props[] = { - CL_CONTEXT_PLATFORM, (cl_context_properties) platform, - 0, 0 - }; - - cl_int err; - cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err); - if(err != CL_SUCCESS) { - return false; - } - - string source = get_program_source(kernel_file); - size_t source_len = source.size(); - const char *source_str = source.c_str(); - cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err); - bool result = false; - - if(err == CL_SUCCESS) { - err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); - - if(err == CL_SUCCESS) { - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - if(size > 0) { - vector<uint8_t> binary(size); - uint8_t *bytes = &binary[0]; - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL); - result = path_write_binary(binary_path, binary); - } - } - clReleaseProgram(program); - } - - 
clReleaseContext(context); - - return result; + int device_platform_id = std::stoi(parameters[0]); + const string &device_name = parameters[1]; + const string &platform_name = parameters[2]; + const string &build_options = parameters[3]; + const string &kernel_file = parameters[4]; + const string &binary_path = parameters[5]; + + if (clewInit() != CLEW_SUCCESS) { + return false; + } + + vector<OpenCLPlatformDevice> usable_devices; + OpenCLInfo::get_usable_devices(&usable_devices); + if (device_platform_id >= usable_devices.size()) { + return false; + } + + OpenCLPlatformDevice &platform_device = usable_devices[device_platform_id]; + if (platform_device.platform_name != platform_name || + platform_device.device_name != device_name) { + return false; + } + + cl_platform_id platform = platform_device.platform_id; + cl_device_id device = platform_device.device_id; + const cl_context_properties context_props[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0, 0}; + + cl_int err; + cl_context context = clCreateContext(context_props, 1, &device, NULL, NULL, &err); + if (err != CL_SUCCESS) { + return false; + } + + string source = get_program_source(kernel_file); + size_t source_len = source.size(); + const char *source_str = source.c_str(); + cl_program program = clCreateProgramWithSource(context, 1, &source_str, &source_len, &err); + bool result = false; + + if (err == CL_SUCCESS) { + err = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL); + + if (err == CL_SUCCESS) { + size_t size = 0; + clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); + if (size > 0) { + vector<uint8_t> binary(size); + uint8_t *bytes = &binary[0]; + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); + result = path_write_binary(binary_path, binary); + } + } + clReleaseProgram(program); + } + + clReleaseContext(context); + + return result; } -bool OpenCLDevice::OpenCLProgram::load_binary(const string& clbin, - 
const string *debug_src) +bool OpenCLDevice::OpenCLProgram::load_binary(const string &clbin, const string *debug_src) { - /* read binary into memory */ - vector<uint8_t> binary; + /* read binary into memory */ + vector<uint8_t> binary; - if(!path_read_binary(clbin, binary)) { - add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str())); - return false; - } + if (!path_read_binary(clbin, binary)) { + add_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str())); + return false; + } - /* create program */ - cl_int status, ciErr; - size_t size = binary.size(); - const uint8_t *bytes = &binary[0]; + /* create program */ + cl_int status, ciErr; + size_t size = binary.size(); + const uint8_t *bytes = &binary[0]; - program = clCreateProgramWithBinary(device->cxContext, 1, &device->cdDevice, - &size, &bytes, &status, &ciErr); + program = clCreateProgramWithBinary( + device->cxContext, 1, &device->cdDevice, &size, &bytes, &status, &ciErr); - if(status != CL_SUCCESS || ciErr != CL_SUCCESS) { - add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " - + clewErrorString(status) + " " + clewErrorString(ciErr)); - return false; - } + if (status != CL_SUCCESS || ciErr != CL_SUCCESS) { + add_error(string("OpenCL failed create program from cached binary ") + clbin + ": " + + clewErrorString(status) + " " + clewErrorString(ciErr)); + return false; + } - if(!build_kernel(debug_src)) - return false; + if (!build_kernel(debug_src)) + return false; - return true; + return true; } -bool OpenCLDevice::OpenCLProgram::save_binary(const string& clbin) +bool OpenCLDevice::OpenCLProgram::save_binary(const string &clbin) { - size_t size = 0; - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); + size_t size = 0; + clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); - if(!size) - return false; + if (!size) + return false; - vector<uint8_t> binary(size); - uint8_t 
*bytes = &binary[0]; + vector<uint8_t> binary(size); + uint8_t *bytes = &binary[0]; - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL); + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &bytes, NULL); - return path_write_binary(clbin, binary); + return path_write_binary(clbin, binary); } bool OpenCLDevice::OpenCLProgram::load() { - loaded = false; - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. */ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, - cache_locker); - if (!program) { - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* If binary kernel exists already, try use it. */ - if(path_exists(clbin) && load_binary(clbin)) { - /* Kernel loaded from binary, nothing to do. */ - add_log(string("Loaded program from ") + clbin + ".", true); - - /* Cache the program. */ - device->store_cached_kernel(program, - cache_key, - cache_locker); - } - else { - add_log(string("OpenCL program ") + program_name + " not found on disk.", true); - cache_locker.unlock(); - } - } - - if (program) { - create_kernels(); - loaded = true; - needs_compiling = false; - } - - return loaded; + loaded = false; + string device_md5 = device->device_md5_hash(kernel_build_options); + + /* Try to use cached kernel. 
*/ + thread_scoped_lock cache_locker; + ustring cache_key(program_name + device_md5); + program = device->load_cached_kernel(cache_key, cache_locker); + if (!program) { + add_log(string("OpenCL program ") + program_name + " not found in cache.", true); + + /* need to create source to get md5 */ + string source = get_program_source(kernel_file); + + string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + + util_md5_string(source); + basename = path_cache_get(path_join("kernels", basename)); + string clbin = basename + ".clbin"; + + /* If binary kernel exists already, try use it. */ + if (path_exists(clbin) && load_binary(clbin)) { + /* Kernel loaded from binary, nothing to do. */ + add_log(string("Loaded program from ") + clbin + ".", true); + + /* Cache the program. */ + device->store_cached_kernel(program, cache_key, cache_locker); + } + else { + add_log(string("OpenCL program ") + program_name + " not found on disk.", true); + cache_locker.unlock(); + } + } + + if (program) { + create_kernels(); + loaded = true; + needs_compiling = false; + } + + return loaded; } void OpenCLDevice::OpenCLProgram::compile() { - assert(device); - - string device_md5 = device->device_md5_hash(kernel_build_options); - - /* Try to use cached kernel. 
*/ - thread_scoped_lock cache_locker; - ustring cache_key(program_name + device_md5); - program = device->load_cached_kernel(cache_key, - cache_locker); - - if (!program) - { - - add_log(string("OpenCL program ") + program_name + " not found in cache.", true); - - /* need to create source to get md5 */ - string source = get_program_source(kernel_file); - - string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + util_md5_string(source); - basename = path_cache_get(path_join("kernels", basename)); - string clbin = basename + ".clbin"; - - /* path to preprocessed source for debugging */ - string clsrc, *debug_src = NULL; - - if(OpenCLInfo::use_debug()) { - clsrc = basename + ".cl"; - debug_src = &clsrc; - } - - /* If binary kernel exists already, try use it. */ - if(compile_separate(clbin)) { - add_log(string("Built and loaded program from ") + clbin + ".", true); - loaded = true; - } - else { - add_log(string("Separate-process building of ") + clbin + " failed, will fall back to regular building.", true); - - /* If does not exist or loading binary failed, compile kernel. */ - if(!compile_kernel(debug_src)) { - needs_compiling = false; - return; - } - - /* Save binary for reuse. */ - if(!save_binary(clbin)) { - add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true); - } - } - - /* Cache the program. */ - device->store_cached_kernel(program, - cache_key, - cache_locker); - } - - create_kernels(); - needs_compiling = false; - loaded = true; + assert(device); + + string device_md5 = device->device_md5_hash(kernel_build_options); + + /* Try to use cached kernel. 
*/ + thread_scoped_lock cache_locker; + ustring cache_key(program_name + device_md5); + program = device->load_cached_kernel(cache_key, cache_locker); + + if (!program) { + + add_log(string("OpenCL program ") + program_name + " not found in cache.", true); + + /* need to create source to get md5 */ + string source = get_program_source(kernel_file); + + string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + + util_md5_string(source); + basename = path_cache_get(path_join("kernels", basename)); + string clbin = basename + ".clbin"; + + /* path to preprocessed source for debugging */ + string clsrc, *debug_src = NULL; + + if (OpenCLInfo::use_debug()) { + clsrc = basename + ".cl"; + debug_src = &clsrc; + } + + /* If binary kernel exists already, try use it. */ + if (compile_separate(clbin)) { + add_log(string("Built and loaded program from ") + clbin + ".", true); + loaded = true; + } + else { + add_log(string("Separate-process building of ") + clbin + + " failed, will fall back to regular building.", + true); + + /* If does not exist or loading binary failed, compile kernel. */ + if (!compile_kernel(debug_src)) { + needs_compiling = false; + return; + } + + /* Save binary for reuse. */ + if (!save_binary(clbin)) { + add_log(string("Saving compiled OpenCL kernel to ") + clbin + " failed!", true); + } + } + + /* Cache the program. 
*/ + device->store_cached_kernel(program, cache_key, cache_locker); + } + + create_kernels(); + needs_compiling = false; + loaded = true; } void OpenCLDevice::OpenCLProgram::create_kernels() { - for(map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); ++kernel) { - assert(kernel->second == NULL); - cl_int ciErr; - string name = "kernel_ocl_" + kernel->first.string(); - kernel->second = clCreateKernel(program, name.c_str(), &ciErr); - if(device->opencl_error(ciErr)) { - add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " + clewErrorString(ciErr)); - return; - } - } + for (map<ustring, cl_kernel>::iterator kernel = kernels.begin(); kernel != kernels.end(); + ++kernel) { + assert(kernel->second == NULL); + cl_int ciErr; + string name = "kernel_ocl_" + kernel->first.string(); + kernel->second = clCreateKernel(program, name.c_str(), &ciErr); + if (device->opencl_error(ciErr)) { + add_error(string("Error getting kernel ") + name + " from program " + program_name + ": " + + clewErrorString(ciErr)); + return; + } + } } bool OpenCLDevice::OpenCLProgram::wait_for_availability() { - add_log(string("Waiting for availability of ") + program_name + ".", true); - while (needs_compiling) { - time_sleep(0.1); - } - return loaded; + add_log(string("Waiting for availability of ") + program_name + ".", true); + while (needs_compiling) { + time_sleep(0.1); + } + return loaded; } void OpenCLDevice::OpenCLProgram::report_error() { - /* If loaded is true, there was no error. */ - if(loaded) return; - /* if use_stdout is true, the error was already reported. */ - if(use_stdout) return; - - cerr << error_msg << endl; - if(!compile_output.empty()) { - cerr << "OpenCL kernel build output for " << program_name << ":" << endl; - cerr << compile_output << endl; - } + /* If loaded is true, there was no error. */ + if (loaded) + return; + /* if use_stdout is true, the error was already reported. 
*/ + if (use_stdout) + return; + + cerr << error_msg << endl; + if (!compile_output.empty()) { + cerr << "OpenCL kernel build output for " << program_name << ":" << endl; + cerr << compile_output << endl; + } } cl_kernel OpenCLDevice::OpenCLProgram::operator()() { - assert(kernels.size() == 1); - return kernels.begin()->second; + assert(kernels.size() == 1); + return kernels.begin()->second; } cl_kernel OpenCLDevice::OpenCLProgram::operator()(ustring name) { - assert(kernels.count(name)); - return kernels[name]; + assert(kernels.count(name)); + return kernels[name]; } cl_device_type OpenCLInfo::device_type() { - switch(DebugFlags().opencl.device_type) - { - case DebugFlags::OpenCL::DEVICE_NONE: - return 0; - case DebugFlags::OpenCL::DEVICE_ALL: - return CL_DEVICE_TYPE_ALL; - case DebugFlags::OpenCL::DEVICE_DEFAULT: - return CL_DEVICE_TYPE_DEFAULT; - case DebugFlags::OpenCL::DEVICE_CPU: - return CL_DEVICE_TYPE_CPU; - case DebugFlags::OpenCL::DEVICE_GPU: - return CL_DEVICE_TYPE_GPU; - case DebugFlags::OpenCL::DEVICE_ACCELERATOR: - return CL_DEVICE_TYPE_ACCELERATOR; - default: - return CL_DEVICE_TYPE_ALL; - } + switch (DebugFlags().opencl.device_type) { + case DebugFlags::OpenCL::DEVICE_NONE: + return 0; + case DebugFlags::OpenCL::DEVICE_ALL: + return CL_DEVICE_TYPE_ALL; + case DebugFlags::OpenCL::DEVICE_DEFAULT: + return CL_DEVICE_TYPE_DEFAULT; + case DebugFlags::OpenCL::DEVICE_CPU: + return CL_DEVICE_TYPE_CPU; + case DebugFlags::OpenCL::DEVICE_GPU: + return CL_DEVICE_TYPE_GPU; + case DebugFlags::OpenCL::DEVICE_ACCELERATOR: + return CL_DEVICE_TYPE_ACCELERATOR; + default: + return CL_DEVICE_TYPE_ALL; + } } bool OpenCLInfo::use_debug() { - return DebugFlags().opencl.debug; + return DebugFlags().opencl.debug; } -bool OpenCLInfo::device_supported(const string& platform_name, - const cl_device_id device_id) +bool OpenCLInfo::device_supported(const string &platform_name, const cl_device_id device_id) { - cl_device_type device_type; - if(!get_device_type(device_id, 
&device_type)) { - return false; - } - string device_name; - if(!get_device_name(device_id, &device_name)) { - return false; - } - - int driver_major = 0; - int driver_minor = 0; - if(!get_driver_version(device_id, &driver_major, &driver_minor)) { - return false; - } - VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor; - - /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework - * (aka, it will not be on Intel framework). This isn't supported - * and needs an explicit blacklist. - */ - if(strstr(device_name.c_str(), "Iris")) { - return false; - } - if(platform_name == "AMD Accelerated Parallel Processing" && - device_type == CL_DEVICE_TYPE_GPU) - { - if(driver_major < 2236) { - VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; - return false; - } - const char *blacklist[] = { - /* GCN 1 */ - "Tahiti", "Pitcairn", "Capeverde", "Oland", "Hainan", - NULL - }; - for(int i = 0; blacklist[i] != NULL; i++) { - if(device_name == blacklist[i]) { - VLOG(1) << "AMD device " << device_name << " not supported"; - return false; - } - } - return true; - } - if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { - return false; - } - return false; + cl_device_type device_type; + if (!get_device_type(device_id, &device_type)) { + return false; + } + string device_name; + if (!get_device_name(device_id, &device_name)) { + return false; + } + + int driver_major = 0; + int driver_minor = 0; + if (!get_driver_version(device_id, &driver_major, &driver_minor)) { + return false; + } + VLOG(3) << "OpenCL driver version " << driver_major << "." << driver_minor; + + /* It is possible tyo have Iris GPU on AMD/Apple OpenCL framework + * (aka, it will not be on Intel framework). This isn't supported + * and needs an explicit blacklist. 
+ */ + if (strstr(device_name.c_str(), "Iris")) { + return false; + } + if (platform_name == "AMD Accelerated Parallel Processing" && + device_type == CL_DEVICE_TYPE_GPU) { + if (driver_major < 2236) { + VLOG(1) << "AMD driver version " << driver_major << "." << driver_minor << " not supported."; + return false; + } + const char *blacklist[] = {/* GCN 1 */ + "Tahiti", + "Pitcairn", + "Capeverde", + "Oland", + "Hainan", + NULL}; + for (int i = 0; blacklist[i] != NULL; i++) { + if (device_name == blacklist[i]) { + VLOG(1) << "AMD device " << device_name << " not supported"; + return false; + } + } + return true; + } + if (platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) { + return false; + } + return false; } -bool OpenCLInfo::platform_version_check(cl_platform_id platform, - string *error) +bool OpenCLInfo::platform_version_check(cl_platform_id platform, string *error) { - const int req_major = 1, req_minor = 1; - int major, minor; - char version[256]; - clGetPlatformInfo(platform, - CL_PLATFORM_VERSION, - sizeof(version), - &version, - NULL); - if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) { - if(error != NULL) { - *error = string_printf("OpenCL: failed to parse platform version string (%s).", version); - } - return false; - } - if(!((major == req_major && minor >= req_minor) || (major > req_major))) { - if(error != NULL) { - *error = string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if(error != NULL) { - *error = ""; - } - return true; + const int req_major = 1, req_minor = 1; + int major, minor; + char version[256]; + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL); + if (sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) { + if (error != NULL) { + *error = string_printf("OpenCL: failed to parse platform version string (%s).", version); + } + return false; + } + if (!((major == req_major && minor >= req_minor) || (major > req_major))) { 
+ if (error != NULL) { + *error = string_printf( + "OpenCL: platform version 1.1 or later required, found %d.%d", major, minor); + } + return false; + } + if (error != NULL) { + *error = ""; + } + return true; } -bool OpenCLInfo::device_version_check(cl_device_id device, - string *error) +bool OpenCLInfo::device_version_check(cl_device_id device, string *error) { - const int req_major = 1, req_minor = 1; - int major, minor; - char version[256]; - clGetDeviceInfo(device, - CL_DEVICE_OPENCL_C_VERSION, - sizeof(version), - &version, - NULL); - if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) { - if(error != NULL) { - *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version); - } - return false; - } - if(!((major == req_major && minor >= req_minor) || (major > req_major))) { - if(error != NULL) { - *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor); - } - return false; - } - if(error != NULL) { - *error = ""; - } - return true; + const int req_major = 1, req_minor = 1; + int major, minor; + char version[256]; + clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL); + if (sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) { + if (error != NULL) { + *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version); + } + return false; + } + if (!((major == req_major && minor >= req_minor) || (major > req_major))) { + if (error != NULL) { + *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor); + } + return false; + } + if (error != NULL) { + *error = ""; + } + return true; } -string OpenCLInfo::get_hardware_id(const string& platform_name, cl_device_id device_id) +string OpenCLInfo::get_hardware_id(const string &platform_name, cl_device_id device_id) { - if(platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { - /* Use cl_amd_device_topology extension. 
*/ - cl_char topology[24]; - if(clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS && topology[0] == 1) { - return string_printf("%02x:%02x.%01x", - (unsigned int)topology[21], - (unsigned int)topology[22], - (unsigned int)topology[23]); - } - } - else if(platform_name == "NVIDIA CUDA") { - /* Use two undocumented options of the cl_nv_device_attribute_query extension. */ - cl_int bus_id, slot_id; - if(clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS && - clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) { - return string_printf("%02x:%02x.%01x", - (unsigned int)(bus_id), - (unsigned int)(slot_id >> 3), - (unsigned int)(slot_id & 0x7)); - } - } - /* No general way to get a hardware ID from OpenCL => give up. */ - return ""; + if (platform_name == "AMD Accelerated Parallel Processing" || platform_name == "Apple") { + /* Use cl_amd_device_topology extension. */ + cl_char topology[24]; + if (clGetDeviceInfo(device_id, 0x4037, sizeof(topology), topology, NULL) == CL_SUCCESS && + topology[0] == 1) { + return string_printf("%02x:%02x.%01x", + (unsigned int)topology[21], + (unsigned int)topology[22], + (unsigned int)topology[23]); + } + } + else if (platform_name == "NVIDIA CUDA") { + /* Use two undocumented options of the cl_nv_device_attribute_query extension. */ + cl_int bus_id, slot_id; + if (clGetDeviceInfo(device_id, 0x4008, sizeof(cl_int), &bus_id, NULL) == CL_SUCCESS && + clGetDeviceInfo(device_id, 0x4009, sizeof(cl_int), &slot_id, NULL) == CL_SUCCESS) { + return string_printf("%02x:%02x.%01x", + (unsigned int)(bus_id), + (unsigned int)(slot_id >> 3), + (unsigned int)(slot_id & 0x7)); + } + } + /* No general way to get a hardware ID from OpenCL => give up. 
*/ + return ""; } -void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, - bool force_all) +void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices, bool force_all) { - const cl_device_type device_type = OpenCLInfo::device_type(); - static bool first_time = true; -#define FIRST_VLOG(severity) if(first_time) VLOG(severity) - - usable_devices->clear(); - - if(device_type == 0) { - FIRST_VLOG(2) << "OpenCL devices are forced to be disabled."; - first_time = false; - return; - } - - cl_int error; - vector<cl_device_id> device_ids; - vector<cl_platform_id> platform_ids; - - /* Get platforms. */ - if(!get_platforms(&platform_ids, &error)) { - FIRST_VLOG(2) << "Error fetching platforms:" - << string(clewErrorString(error)); - first_time = false; - return; - } - if(platform_ids.size() == 0) { - FIRST_VLOG(2) << "No OpenCL platforms were found."; - first_time = false; - return; - } - /* Devices are numbered consecutively across platforms. */ - for(int platform = 0; platform < platform_ids.size(); platform++) { - cl_platform_id platform_id = platform_ids[platform]; - string platform_name; - if(!get_platform_name(platform_id, &platform_name)) { - FIRST_VLOG(2) << "Failed to get platform name, ignoring."; - continue; - } - FIRST_VLOG(2) << "Enumerating devices for platform " - << platform_name << "."; - if(!platform_version_check(platform_id)) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << " due to too old compiler version."; - continue; - } - if(!get_platform_devices(platform_id, - device_type, - &device_ids, - &error)) - { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", failed to fetch of devices: " - << string(clewErrorString(error)); - continue; - } - if(device_ids.size() == 0) { - FIRST_VLOG(2) << "Ignoring platform " << platform_name - << ", it has no devices."; - continue; - } - for(int num = 0; num < device_ids.size(); num++) { - const cl_device_id device_id = device_ids[num]; - 
string device_name; - if(!get_device_name(device_id, &device_name, &error)) { - FIRST_VLOG(2) << "Failed to fetch device name: " - << string(clewErrorString(error)) - << ", ignoring."; - continue; - } - if(!device_version_check(device_id)) { - FIRST_VLOG(2) << "Ignoring device " << device_name - << " due to old compiler version."; - continue; - } - if(force_all || - device_supported(platform_name, device_id)) - { - cl_device_type device_type; - if(!get_device_type(device_id, &device_type, &error)) { - FIRST_VLOG(2) << "Ignoring device " << device_name - << ", failed to fetch device type:" - << string(clewErrorString(error)); - continue; - } - string readable_device_name = - get_readable_device_name(device_id); - if(readable_device_name != device_name) { - FIRST_VLOG(2) << "Using more readable device name: " - << readable_device_name; - } - FIRST_VLOG(2) << "Adding new device " - << readable_device_name << "."; - string hardware_id = get_hardware_id(platform_name, device_id); - string device_extensions = get_device_extensions(device_id); - usable_devices->push_back(OpenCLPlatformDevice( - platform_id, - platform_name, - device_id, - device_type, - readable_device_name, - hardware_id, - device_extensions)); - } - else { - FIRST_VLOG(2) << "Ignoring device " << device_name - << ", not officially supported yet."; - } - } - } - first_time = false; + const cl_device_type device_type = OpenCLInfo::device_type(); + static bool first_time = true; +# define FIRST_VLOG(severity) \ + if (first_time) \ + VLOG(severity) + + usable_devices->clear(); + + if (device_type == 0) { + FIRST_VLOG(2) << "OpenCL devices are forced to be disabled."; + first_time = false; + return; + } + + cl_int error; + vector<cl_device_id> device_ids; + vector<cl_platform_id> platform_ids; + + /* Get platforms. 
*/ + if (!get_platforms(&platform_ids, &error)) { + FIRST_VLOG(2) << "Error fetching platforms:" << string(clewErrorString(error)); + first_time = false; + return; + } + if (platform_ids.size() == 0) { + FIRST_VLOG(2) << "No OpenCL platforms were found."; + first_time = false; + return; + } + /* Devices are numbered consecutively across platforms. */ + for (int platform = 0; platform < platform_ids.size(); platform++) { + cl_platform_id platform_id = platform_ids[platform]; + string platform_name; + if (!get_platform_name(platform_id, &platform_name)) { + FIRST_VLOG(2) << "Failed to get platform name, ignoring."; + continue; + } + FIRST_VLOG(2) << "Enumerating devices for platform " << platform_name << "."; + if (!platform_version_check(platform_id)) { + FIRST_VLOG(2) << "Ignoring platform " << platform_name + << " due to too old compiler version."; + continue; + } + if (!get_platform_devices(platform_id, device_type, &device_ids, &error)) { + FIRST_VLOG(2) << "Ignoring platform " << platform_name + << ", failed to fetch of devices: " << string(clewErrorString(error)); + continue; + } + if (device_ids.size() == 0) { + FIRST_VLOG(2) << "Ignoring platform " << platform_name << ", it has no devices."; + continue; + } + for (int num = 0; num < device_ids.size(); num++) { + const cl_device_id device_id = device_ids[num]; + string device_name; + if (!get_device_name(device_id, &device_name, &error)) { + FIRST_VLOG(2) << "Failed to fetch device name: " << string(clewErrorString(error)) + << ", ignoring."; + continue; + } + if (!device_version_check(device_id)) { + FIRST_VLOG(2) << "Ignoring device " << device_name << " due to old compiler version."; + continue; + } + if (force_all || device_supported(platform_name, device_id)) { + cl_device_type device_type; + if (!get_device_type(device_id, &device_type, &error)) { + FIRST_VLOG(2) << "Ignoring device " << device_name + << ", failed to fetch device type:" << string(clewErrorString(error)); + continue; + } + string 
readable_device_name = get_readable_device_name(device_id); + if (readable_device_name != device_name) { + FIRST_VLOG(2) << "Using more readable device name: " << readable_device_name; + } + FIRST_VLOG(2) << "Adding new device " << readable_device_name << "."; + string hardware_id = get_hardware_id(platform_name, device_id); + string device_extensions = get_device_extensions(device_id); + usable_devices->push_back(OpenCLPlatformDevice(platform_id, + platform_name, + device_id, + device_type, + readable_device_name, + hardware_id, + device_extensions)); + } + else { + FIRST_VLOG(2) << "Ignoring device " << device_name << ", not officially supported yet."; + } + } + } + first_time = false; } -bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, - cl_int *error) +bool OpenCLInfo::get_platforms(vector<cl_platform_id> *platform_ids, cl_int *error) { - /* Reset from possible previous state. */ - platform_ids->resize(0); - cl_uint num_platforms; - if(!get_num_platforms(&num_platforms, error)) { - return false; - } - /* Get actual platforms. */ - cl_int err; - platform_ids->resize(num_platforms); - if((err = clGetPlatformIDs(num_platforms, - &platform_ids->at(0), - NULL)) != CL_SUCCESS) { - if(error != NULL) { - *error = err; - } - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + /* Reset from possible previous state. */ + platform_ids->resize(0); + cl_uint num_platforms; + if (!get_num_platforms(&num_platforms, error)) { + return false; + } + /* Get actual platforms. 
*/ + cl_int err; + platform_ids->resize(num_platforms); + if ((err = clGetPlatformIDs(num_platforms, &platform_ids->at(0), NULL)) != CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } vector<cl_platform_id> OpenCLInfo::get_platforms() { - vector<cl_platform_id> platform_ids; - get_platforms(&platform_ids); - return platform_ids; + vector<cl_platform_id> platform_ids; + get_platforms(&platform_ids); + return platform_ids; } bool OpenCLInfo::get_num_platforms(cl_uint *num_platforms, cl_int *error) { - cl_int err; - if((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) { - if(error != NULL) { - *error = err; - } - *num_platforms = 0; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + cl_int err; + if ((err = clGetPlatformIDs(0, NULL, num_platforms)) != CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *num_platforms = 0; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } cl_uint OpenCLInfo::get_num_platforms() { - cl_uint num_platforms; - if(!get_num_platforms(&num_platforms)) { - return 0; - } - return num_platforms; + cl_uint num_platforms; + if (!get_num_platforms(&num_platforms)) { + return 0; + } + return num_platforms; } -bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, - string *platform_name) +bool OpenCLInfo::get_platform_name(cl_platform_id platform_id, string *platform_name) { - char buffer[256]; - if(clGetPlatformInfo(platform_id, - CL_PLATFORM_NAME, - sizeof(buffer), - &buffer, - NULL) != CL_SUCCESS) - { - *platform_name = ""; - return false; - } - *platform_name = buffer; - return true; + char buffer[256]; + if (clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(buffer), &buffer, NULL) != + CL_SUCCESS) { + *platform_name = ""; + return false; + } + *platform_name = buffer; + return true; } string OpenCLInfo::get_platform_name(cl_platform_id platform_id) 
{ - string platform_name; - if(!get_platform_name(platform_id, &platform_name)) { - return ""; - } - return platform_name; + string platform_name; + if (!get_platform_name(platform_id, &platform_name)) { + return ""; + } + return platform_name; } bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, @@ -1076,266 +1034,222 @@ bool OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, cl_uint *num_devices, cl_int *error) { - cl_int err; - if((err = clGetDeviceIDs(platform_id, - device_type, - 0, - NULL, - num_devices)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - *num_devices = 0; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + cl_int err; + if ((err = clGetDeviceIDs(platform_id, device_type, 0, NULL, num_devices)) != CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *num_devices = 0; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } cl_uint OpenCLInfo::get_num_platform_devices(cl_platform_id platform_id, cl_device_type device_type) { - cl_uint num_devices; - if(!get_num_platform_devices(platform_id, - device_type, - &num_devices)) - { - return 0; - } - return num_devices; + cl_uint num_devices; + if (!get_num_platform_devices(platform_id, device_type, &num_devices)) { + return 0; + } + return num_devices; } bool OpenCLInfo::get_platform_devices(cl_platform_id platform_id, cl_device_type device_type, vector<cl_device_id> *device_ids, - cl_int* error) + cl_int *error) { - /* Reset from possible previous state. */ - device_ids->resize(0); - /* Get number of devices to pre-allocate memory. */ - cl_uint num_devices; - if(!get_num_platform_devices(platform_id, - device_type, - &num_devices, - error)) - { - return false; - } - /* Get actual device list. 
*/ - device_ids->resize(num_devices); - cl_int err; - if((err = clGetDeviceIDs(platform_id, - device_type, - num_devices, - &device_ids->at(0), - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + /* Reset from possible previous state. */ + device_ids->resize(0); + /* Get number of devices to pre-allocate memory. */ + cl_uint num_devices; + if (!get_num_platform_devices(platform_id, device_type, &num_devices, error)) { + return false; + } + /* Get actual device list. */ + device_ids->resize(num_devices); + cl_int err; + if ((err = clGetDeviceIDs(platform_id, device_type, num_devices, &device_ids->at(0), NULL)) != + CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } vector<cl_device_id> OpenCLInfo::get_platform_devices(cl_platform_id platform_id, cl_device_type device_type) { - vector<cl_device_id> devices; - get_platform_devices(platform_id, device_type, &devices); - return devices; + vector<cl_device_id> devices; + get_platform_devices(platform_id, device_type, &devices); + return devices; } -bool OpenCLInfo::get_device_name(cl_device_id device_id, - string *device_name, - cl_int* error) +bool OpenCLInfo::get_device_name(cl_device_id device_id, string *device_name, cl_int *error) { - char buffer[1024]; - cl_int err; - if((err = clGetDeviceInfo(device_id, - CL_DEVICE_NAME, - sizeof(buffer), - &buffer, - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - *device_name = ""; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - *device_name = buffer; - return true; + char buffer[1024]; + cl_int err; + if ((err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), &buffer, NULL)) != + CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *device_name = ""; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + 
*device_name = buffer; + return true; } string OpenCLInfo::get_device_name(cl_device_id device_id) { - string device_name; - if(!get_device_name(device_id, &device_name)) { - return ""; - } - return device_name; + string device_name; + if (!get_device_name(device_id, &device_name)) { + return ""; + } + return device_name; } bool OpenCLInfo::get_device_extensions(cl_device_id device_id, - string *device_extensions, - cl_int* error) + string *device_extensions, + cl_int *error) { - char buffer[1024]; - cl_int err; - if((err = clGetDeviceInfo(device_id, - CL_DEVICE_EXTENSIONS, - sizeof(buffer), - &buffer, - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - *device_extensions = ""; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - *device_extensions = buffer; - return true; + char buffer[1024]; + cl_int err; + if ((err = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, sizeof(buffer), &buffer, NULL)) != + CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *device_extensions = ""; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + *device_extensions = buffer; + return true; } string OpenCLInfo::get_device_extensions(cl_device_id device_id) { - string device_extensions; - if(!get_device_extensions(device_id, &device_extensions)) { - return ""; - } - return device_extensions; + string device_extensions; + if (!get_device_extensions(device_id, &device_extensions)) { + return ""; + } + return device_extensions; } bool OpenCLInfo::get_device_type(cl_device_id device_id, cl_device_type *device_type, - cl_int* error) + cl_int *error) { - cl_int err; - if((err = clGetDeviceInfo(device_id, - CL_DEVICE_TYPE, - sizeof(cl_device_type), - device_type, - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - *device_type = 0; - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - return true; + cl_int err; + if ((err = clGetDeviceInfo( + device_id, CL_DEVICE_TYPE, 
sizeof(cl_device_type), device_type, NULL)) != CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + *device_type = 0; + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + return true; } cl_device_type OpenCLInfo::get_device_type(cl_device_id device_id) { - cl_device_type device_type; - if(!get_device_type(device_id, &device_type)) { - return 0; - } - return device_type; + cl_device_type device_type; + if (!get_device_type(device_id, &device_type)) { + return 0; + } + return device_type; } string OpenCLInfo::get_readable_device_name(cl_device_id device_id) { - string name = ""; - char board_name[1024]; - size_t length = 0; - if(clGetDeviceInfo(device_id, - CL_DEVICE_BOARD_NAME_AMD, - sizeof(board_name), - &board_name, - &length) == CL_SUCCESS) - { - if(length != 0 && board_name[0] != '\0') { - name = board_name; - } - } - - /* Fallback to standard device name API. */ - if(name.empty()) { - name = get_device_name(device_id); - } - - /* Special exception for AMD Vega, need to be able to tell - * Vega 56 from 64 apart. - */ - if(name == "Radeon RX Vega") { - cl_int max_compute_units = 0; - if(clGetDeviceInfo(device_id, - CL_DEVICE_MAX_COMPUTE_UNITS, - sizeof(max_compute_units), - &max_compute_units, - NULL) == CL_SUCCESS) - { - name += " " + to_string(max_compute_units); - } - } - - /* Distinguish from our native CPU device. */ - if(get_device_type(device_id) & CL_DEVICE_TYPE_CPU) { - name += " (OpenCL)"; - } - - return name; + string name = ""; + char board_name[1024]; + size_t length = 0; + if (clGetDeviceInfo( + device_id, CL_DEVICE_BOARD_NAME_AMD, sizeof(board_name), &board_name, &length) == + CL_SUCCESS) { + if (length != 0 && board_name[0] != '\0') { + name = board_name; + } + } + + /* Fallback to standard device name API. */ + if (name.empty()) { + name = get_device_name(device_id); + } + + /* Special exception for AMD Vega, need to be able to tell + * Vega 56 from 64 apart. 
+ */ + if (name == "Radeon RX Vega") { + cl_int max_compute_units = 0; + if (clGetDeviceInfo(device_id, + CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(max_compute_units), + &max_compute_units, + NULL) == CL_SUCCESS) { + name += " " + to_string(max_compute_units); + } + } + + /* Distinguish from our native CPU device. */ + if (get_device_type(device_id) & CL_DEVICE_TYPE_CPU) { + name += " (OpenCL)"; + } + + return name; } -bool OpenCLInfo::get_driver_version(cl_device_id device_id, - int *major, - int *minor, - cl_int* error) +bool OpenCLInfo::get_driver_version(cl_device_id device_id, int *major, int *minor, cl_int *error) { - char buffer[1024]; - cl_int err; - if((err = clGetDeviceInfo(device_id, - CL_DRIVER_VERSION, - sizeof(buffer), - &buffer, - NULL)) != CL_SUCCESS) - { - if(error != NULL) { - *error = err; - } - return false; - } - if(error != NULL) { - *error = CL_SUCCESS; - } - if(sscanf(buffer, "%d.%d", major, minor) < 2) { - VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); - return false; - } - return true; + char buffer[1024]; + cl_int err; + if ((err = clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), &buffer, NULL)) != + CL_SUCCESS) { + if (error != NULL) { + *error = err; + } + return false; + } + if (error != NULL) { + *error = CL_SUCCESS; + } + if (sscanf(buffer, "%d.%d", major, minor) < 2) { + VLOG(1) << string_printf("OpenCL: failed to parse driver version string (%s).", buffer); + return false; + } + return true; } int OpenCLInfo::mem_sub_ptr_alignment(cl_device_id device_id) { - int base_align_bits; - if(clGetDeviceInfo(device_id, - CL_DEVICE_MEM_BASE_ADDR_ALIGN, - sizeof(int), - &base_align_bits, - NULL) == CL_SUCCESS) - { - return base_align_bits/8; - } - return 1; + int base_align_bits; + if (clGetDeviceInfo( + device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &base_align_bits, NULL) == + CL_SUCCESS) { + return base_align_bits / 8; + } + return 1; } CCL_NAMESPACE_END |