diff options
Diffstat (limited to 'intern')
136 files changed, 4980 insertions, 1390 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index 17096d441f0..2018c1d9648 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -297,6 +297,7 @@ endif() if(WITH_CYCLES_STANDALONE) set(WITH_CYCLES_DEVICE_CUDA TRUE) + set(WITH_CYCLES_DEVICE_HIP TRUE) endif() # TODO(sergey): Consider removing it, only causes confusion in interface. set(WITH_CYCLES_DEVICE_MULTI TRUE) diff --git a/intern/cycles/app/CMakeLists.txt b/intern/cycles/app/CMakeLists.txt index f9dc5f00802..3ed3f54ef9f 100644 --- a/intern/cycles/app/CMakeLists.txt +++ b/intern/cycles/app/CMakeLists.txt @@ -64,6 +64,8 @@ if(WITH_CYCLES_STANDALONE) cycles_standalone.cpp cycles_xml.cpp cycles_xml.h + oiio_output_driver.cpp + oiio_output_driver.h ) add_executable(cycles ${SRC} ${INC} ${INC_SYS}) unset(SRC) @@ -73,7 +75,7 @@ if(WITH_CYCLES_STANDALONE) if(APPLE) if(WITH_OPENCOLORIO) - set_property(TARGET cycles APPEND_STRING PROPERTY LINK_FLAGS " -framework IOKit") + set_property(TARGET cycles APPEND_STRING PROPERTY LINK_FLAGS " -framework IOKit -framework Carbon") endif() if(WITH_OPENIMAGEDENOISE AND "${CMAKE_OSX_ARCHITECTURES}" STREQUAL "arm64") # OpenImageDenoise uses BNNS from the Accelerate framework. 
diff --git a/intern/cycles/app/cycles_standalone.cpp b/intern/cycles/app/cycles_standalone.cpp index 270096d70b0..00dc140648a 100644 --- a/intern/cycles/app/cycles_standalone.cpp +++ b/intern/cycles/app/cycles_standalone.cpp @@ -36,6 +36,9 @@ #include "util/util_unique_ptr.h" #include "util/util_version.h" +#include "app/cycles_xml.h" +#include "app/oiio_output_driver.h" + #ifdef WITH_CYCLES_STANDALONE_GUI # include "util/util_view.h" #endif @@ -53,7 +56,8 @@ struct Options { SessionParams session_params; bool quiet; bool show_help, interactive, pause; - string output_path; + string output_filepath; + string output_pass; } options; static void session_print(const string &str) @@ -89,30 +93,6 @@ static void session_print_status() session_print(status); } -static bool write_render(const uchar *pixels, int w, int h, int channels) -{ - string msg = string_printf("Writing image %s", options.output_path.c_str()); - session_print(msg); - - unique_ptr<ImageOutput> out = unique_ptr<ImageOutput>(ImageOutput::create(options.output_path)); - if (!out) { - return false; - } - - ImageSpec spec(w, h, channels, TypeDesc::UINT8); - if (!out->open(options.output_path, spec)) { - return false; - } - - /* conversion for different top/bottom convention */ - out->write_image( - TypeDesc::UINT8, pixels + (h - 1) * w * channels, AutoStride, -w * channels, AutoStride); - - out->close(); - - return true; -} - static BufferParams &session_buffer_params() { static BufferParams buffer_params; @@ -147,9 +127,14 @@ static void scene_init() static void session_init() { - options.session_params.write_render_cb = write_render; + options.output_pass = "combined"; options.session = new Session(options.session_params, options.scene_params); + if (!options.output_filepath.empty()) { + options.session->set_output_driver(make_unique<OIIOOutputDriver>( + options.output_filepath, options.output_pass, session_print)); + } + if (options.session_params.background && !options.quiet) 
options.session->progress.set_update_callback(function_bind(&session_print_status)); #ifdef WITH_CYCLES_STANDALONE_GUI @@ -160,7 +145,12 @@ static void session_init() /* load scene */ scene_init(); - options.session->reset(session_buffer_params(), options.session_params.samples); + /* add pass for output. */ + Pass *pass = options.scene->create_node<Pass>(); + pass->set_name(ustring(options.output_pass.c_str())); + pass->set_type(PASS_COMBINED); + + options.session->reset(options.session_params, session_buffer_params()); options.session->start(); } @@ -222,9 +212,7 @@ static void display_info(Progress &progress) static void display() { - static DeviceDrawParams draw_params = DeviceDrawParams(); - - options.session->draw(session_buffer_params(), draw_params); + options.session->draw(); display_info(options.session->progress); } @@ -254,7 +242,7 @@ static void motion(int x, int y, int button) options.session->scene->camera->need_flags_update = true; options.session->scene->camera->need_device_update = true; - options.session->reset(session_buffer_params(), options.session_params.samples); + options.session->reset(options.session_params, session_buffer_params()); } } @@ -271,7 +259,7 @@ static void resize(int width, int height) options.session->scene->camera->need_flags_update = true; options.session->scene->camera->need_device_update = true; - options.session->reset(session_buffer_params(), options.session_params.samples); + options.session->reset(options.session_params, session_buffer_params()); } } @@ -283,7 +271,7 @@ static void keyboard(unsigned char key) /* Reset */ else if (key == 'r') - options.session->reset(session_buffer_params(), options.session_params.samples); + options.session->reset(options.session_params, session_buffer_params()); /* Cancel */ else if (key == 27) // escape @@ -320,7 +308,7 @@ static void keyboard(unsigned char key) options.session->scene->camera->need_flags_update = true; options.session->scene->camera->need_device_update = true; - 
options.session->reset(session_buffer_params(), options.session_params.samples); + options.session->reset(options.session_params, session_buffer_params()); } /* Set Max Bounces */ @@ -346,7 +334,7 @@ static void keyboard(unsigned char key) options.session->scene->integrator->set_max_bounce(bounce); - options.session->reset(session_buffer_params(), options.session_params.samples); + options.session->reset(options.session_params, session_buffer_params()); } } #endif @@ -361,11 +349,13 @@ static int files_parse(int argc, const char *argv[]) static void options_parse(int argc, const char **argv) { - options.width = 0; - options.height = 0; + options.width = 1024; + options.height = 512; options.filepath = ""; options.session = NULL; options.quiet = false; + options.session_params.use_auto_tile = false; + options.session_params.tile_size = 0; /* device names */ string device_names = ""; @@ -411,7 +401,7 @@ static void options_parse(int argc, const char **argv) &options.session_params.samples, "Number of samples to render", "--output %s", - &options.output_path, + &options.output_filepath, "File path to write output image", "--threads %d", &options.session_params.threads, @@ -422,12 +412,9 @@ static void options_parse(int argc, const char **argv) "--height %d", &options.height, "Window height in pixel", - "--tile-width %d", - &options.session_params.tile_size.x, - "Tile width in pixels", - "--tile-height %d", - &options.session_params.tile_size.y, - "Tile height in pixels", + "--tile-size %d", + &options.session_params.tile_size, + "Tile size in pixels", "--list-devices", &list, "List information about all available devices", @@ -489,8 +476,9 @@ static void options_parse(int argc, const char **argv) options.session_params.background = true; #endif - /* Use progressive rendering */ - options.session_params.progressive = true; + if (options.session_params.tile_size > 0) { + options.session_params.use_auto_tile = true; + } /* find matching device */ DeviceType device_type = 
Device::type_from_string(devicename.c_str()); diff --git a/intern/cycles/app/cycles_xml.cpp b/intern/cycles/app/cycles_xml.cpp index 54f97fddbd9..0b83c60f32d 100644 --- a/intern/cycles/app/cycles_xml.cpp +++ b/intern/cycles/app/cycles_xml.cpp @@ -333,6 +333,7 @@ static void xml_read_shader_graph(XMLReadState &state, Shader *shader, xml_node } snode = (ShaderNode *)node_type->create(node_type); + snode->set_owner(graph); } xml_read_node(graph_reader, snode, node); diff --git a/intern/cycles/app/oiio_output_driver.cpp b/intern/cycles/app/oiio_output_driver.cpp new file mode 100644 index 00000000000..d791c89772f --- /dev/null +++ b/intern/cycles/app/oiio_output_driver.cpp @@ -0,0 +1,71 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "app/oiio_output_driver.h" + +CCL_NAMESPACE_BEGIN + +OIIOOutputDriver::OIIOOutputDriver(const string_view filepath, + const string_view pass, + LogFunction log) + : filepath_(filepath), pass_(pass), log_(log) +{ +} + +OIIOOutputDriver::~OIIOOutputDriver() +{ +} + +void OIIOOutputDriver::write_render_tile(const Tile &tile) +{ + /* Only write the full buffer, no intermediate tiles. 
*/ + if (!(tile.size == tile.full_size)) { + return; + } + + log_(string_printf("Writing image %s", filepath_.c_str())); + + unique_ptr<ImageOutput> image_output(ImageOutput::create(filepath_)); + if (image_output == nullptr) { + log_("Failed to create image file"); + return; + } + + const int width = tile.size.x; + const int height = tile.size.y; + + ImageSpec spec(width, height, 4, TypeDesc::FLOAT); + if (!image_output->open(filepath_, spec)) { + log_("Failed to create image file"); + return; + } + + vector<float> pixels(width * height * 4); + if (!tile.get_pass_pixels(pass_, 4, pixels.data())) { + log_("Failed to read render pass pixels"); + return; + } + + /* Manipulate offset and stride to convert from bottom-up to top-down convention. */ + image_output->write_image(TypeDesc::FLOAT, + pixels.data() + (height - 1) * width * 4, + AutoStride, + -width * 4 * sizeof(float), + AutoStride); + image_output->close(); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/app/oiio_output_driver.h b/intern/cycles/app/oiio_output_driver.h new file mode 100644 index 00000000000..cdc4085d962 --- /dev/null +++ b/intern/cycles/app/oiio_output_driver.h @@ -0,0 +1,42 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "render/output_driver.h" + +#include "util/util_function.h" +#include "util/util_image.h" +#include "util/util_string.h" +#include "util/util_unique_ptr.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class OIIOOutputDriver : public OutputDriver { + public: + typedef function<void(const string &)> LogFunction; + + OIIOOutputDriver(const string_view filepath, const string_view pass, LogFunction log); + virtual ~OIIOOutputDriver(); + + void write_render_tile(const Tile &tile) override; + + protected: + string filepath_; + string pass_; + LogFunction log_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/blender/CMakeLists.txt b/intern/cycles/blender/CMakeLists.txt index 5bdcfd56a4d..a0442b3394b 100644 --- a/intern/cycles/blender/CMakeLists.txt +++ b/intern/cycles/blender/CMakeLists.txt @@ -31,13 +31,14 @@ set(INC_SYS set(SRC blender_camera.cpp blender_device.cpp + blender_display_driver.cpp blender_image.cpp blender_geometry.cpp - blender_gpu_display.cpp blender_light.cpp blender_mesh.cpp blender_object.cpp blender_object_cull.cpp + blender_output_driver.cpp blender_particles.cpp blender_curves.cpp blender_logging.cpp @@ -51,10 +52,11 @@ set(SRC CCL_api.h blender_device.h - blender_gpu_display.h + blender_display_driver.h blender_id_map.h blender_image.h blender_object_cull.h + blender_output_driver.h blender_sync.h blender_session.h blender_texture.h @@ -95,6 +97,9 @@ set(ADDON_FILES add_definitions(${GL_DEFINITIONS}) +if(WITH_CYCLES_DEVICE_HIP) + add_definitions(-DWITH_HIP) +endif() if(WITH_MOD_FLUID) add_definitions(-DWITH_FLUID) endif() diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index e0e8ca10bef..d729cb1ee69 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -28,7 +28,7 @@ def _configure_argument_parser(): action='store_true') parser.add_argument("--cycles-device", help="Set the device to use for Cycles, overriding user preferences and 
the scene setting." - "Valid options are 'CPU', 'CUDA' or 'OPTIX'." + "Valid options are 'CPU', 'CUDA', 'OPTIX', or 'HIP'" "Additionally, you can append '+CPU' to any GPU type for hybrid rendering.", default=None) return parser diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 5fb0eeed925..cea70033784 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -111,6 +111,7 @@ enum_device_type = ( ('CPU', "CPU", "CPU", 0), ('CUDA', "CUDA", "CUDA", 1), ('OPTIX', "OptiX", "OptiX", 3), + ("HIP", "HIP", "HIP", 4) ) enum_texture_limit = ( @@ -123,7 +124,7 @@ enum_texture_limit = ( ('4096', "4096", "Limit texture size to 4096 pixels", 6), ('8192', "8192", "Limit texture size to 8192 pixels", 7), ) - + # NOTE: Identifiers are expected to be an upper case version of identifiers from `Pass::get_type_enum()` enum_view3d_shading_render_pass = ( ('', "General", ""), @@ -739,7 +740,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup): use_auto_tile: BoolProperty( name="Auto Tiles", - description="Automatically split image into tiles", + description="Automatically render high resolution images in tiles to reduce memory usage, using the specified tile size. 
Tiles are cached to disk while rendering to save memory", default=True, ) tile_size: IntProperty( @@ -1266,12 +1267,16 @@ class CyclesPreferences(bpy.types.AddonPreferences): def get_device_types(self, context): import _cycles - has_cuda, has_optix = _cycles.get_device_types() + has_cuda, has_optix, has_hip = _cycles.get_device_types() + list = [('NONE', "None", "Don't use compute device", 0)] if has_cuda: list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1)) if has_optix: list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3)) + if has_hip: + list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4)) + return list compute_device_type: EnumProperty( @@ -1296,7 +1301,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): def update_device_entries(self, device_list): for device in device_list: - if not device[1] in {'CUDA', 'OPTIX', 'CPU'}: + if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP'}: continue # Try to find existing Device entry entry = self.find_existing_device_entry(device) @@ -1330,7 +1335,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): elif entry.type == 'CPU': cpu_devices.append(entry) # Extend all GPU devices with CPU. - if compute_device_type != 'CPU': + if compute_device_type != 'CPU' and compute_device_type != 'HIP': devices.extend(cpu_devices) return devices @@ -1340,7 +1345,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): import _cycles # Ensure `self.devices` is not re-allocated when the second call to # get_devices_for_type is made, freeing items from the first list. - for device_type in ('CUDA', 'OPTIX', 'OPENCL'): + for device_type in ('CUDA', 'OPTIX', 'HIP'): self.update_device_entries(_cycles.available_devices(device_type)) # Deprecated: use refresh_devices instead. 
diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index d02627b9936..c4a1844480c 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -99,6 +99,11 @@ def use_cuda(context): return (get_device_type(context) == 'CUDA' and cscene.device == 'GPU') +def use_hip(context): + cscene = context.scene.cycles + + return (get_device_type(context) == 'HIP' and cscene.device == 'GPU') + def use_optix(context): cscene = context.scene.cycles @@ -613,8 +618,8 @@ class CYCLES_RENDER_PT_performance_threads(CyclesButtonsPanel, Panel): sub.prop(rd, "threads") -class CYCLES_RENDER_PT_performance_tiles(CyclesButtonsPanel, Panel): - bl_label = "Tiles" +class CYCLES_RENDER_PT_performance_memory(CyclesButtonsPanel, Panel): + bl_label = "Memory" bl_parent_id = "CYCLES_RENDER_PT_performance" def draw(self, context): @@ -2107,7 +2112,7 @@ classes = ( CYCLES_RENDER_PT_film_transparency, CYCLES_RENDER_PT_performance, CYCLES_RENDER_PT_performance_threads, - CYCLES_RENDER_PT_performance_tiles, + CYCLES_RENDER_PT_performance_memory, CYCLES_RENDER_PT_performance_acceleration_structure, CYCLES_RENDER_PT_performance_final_render, CYCLES_RENDER_PT_performance_viewport, diff --git a/intern/cycles/blender/blender_curves.cpp b/intern/cycles/blender/blender_curves.cpp index 6fe5ea41fff..b6b4f206620 100644 --- a/intern/cycles/blender/blender_curves.cpp +++ b/intern/cycles/blender/blender_curves.cpp @@ -283,10 +283,13 @@ static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CDa return; Attribute *attr_intercept = NULL; + Attribute *attr_length = NULL; Attribute *attr_random = NULL; if (hair->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT)) attr_intercept = hair->attributes.add(ATTR_STD_CURVE_INTERCEPT); + if (hair->need_attribute(scene, ATTR_STD_CURVE_LENGTH)) + attr_length = hair->attributes.add(ATTR_STD_CURVE_LENGTH); if (hair->need_attribute(scene, ATTR_STD_CURVE_RANDOM)) attr_random = 
hair->attributes.add(ATTR_STD_CURVE_RANDOM); @@ -336,6 +339,10 @@ static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CDa num_curve_keys++; } + if (attr_length != NULL) { + attr_length->add(CData->curve_length[curve]); + } + if (attr_random != NULL) { attr_random->add(hash_uint2_to_float(num_curves, 0)); } @@ -657,11 +664,15 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair) /* Add requested attributes. */ Attribute *attr_intercept = NULL; + Attribute *attr_length = NULL; Attribute *attr_random = NULL; if (hair->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT)) { attr_intercept = hair->attributes.add(ATTR_STD_CURVE_INTERCEPT); } + if (hair->need_attribute(scene, ATTR_STD_CURVE_LENGTH)) { + attr_length = hair->attributes.add(ATTR_STD_CURVE_LENGTH); + } if (hair->need_attribute(scene, ATTR_STD_CURVE_RANDOM)) { attr_random = hair->attributes.add(ATTR_STD_CURVE_RANDOM); } @@ -714,6 +725,10 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair) } } + if (attr_length) { + attr_length->add(length); + } + /* Random number per curve. */ if (attr_random != NULL) { attr_random->add(hash_uint2_to_float(b_curve.index(), 0)); diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp index ce1770f18a3..7bed33855c2 100644 --- a/intern/cycles/blender/blender_device.cpp +++ b/intern/cycles/blender/blender_device.cpp @@ -26,6 +26,7 @@ enum ComputeDevice { COMPUTE_DEVICE_CPU = 0, COMPUTE_DEVICE_CUDA = 1, COMPUTE_DEVICE_OPTIX = 3, + COMPUTE_DEVICE_HIP = 4, COMPUTE_DEVICE_NUM }; @@ -81,6 +82,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen else if (compute_device == COMPUTE_DEVICE_OPTIX) { mask |= DEVICE_MASK_OPTIX; } + else if (compute_device == COMPUTE_DEVICE_HIP) { + mask |= DEVICE_MASK_HIP; + } vector<DeviceInfo> devices = Device::available_devices(mask); /* Match device preferences and available devices. 
*/ diff --git a/intern/cycles/blender/blender_gpu_display.cpp b/intern/cycles/blender/blender_display_driver.cpp index c5c3a2bd155..f55a8ce8c4e 100644 --- a/intern/cycles/blender/blender_gpu_display.cpp +++ b/intern/cycles/blender/blender_display_driver.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "blender/blender_gpu_display.h" +#include "blender/blender_display_driver.h" #include "device/device.h" #include "util/util_logging.h" @@ -273,17 +273,17 @@ uint BlenderDisplaySpaceShader::get_shader_program() } /* -------------------------------------------------------------------- - * BlenderGPUDisplay. + * BlenderDisplayDriver. */ -BlenderGPUDisplay::BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene) +BlenderDisplayDriver::BlenderDisplayDriver(BL::RenderEngine &b_engine, BL::Scene &b_scene) : b_engine_(b_engine), display_shader_(BlenderDisplayShader::create(b_engine, b_scene)) { /* Create context while on the main thread. */ gl_context_create(); } -BlenderGPUDisplay::~BlenderGPUDisplay() +BlenderDisplayDriver::~BlenderDisplayDriver() { gl_resources_destroy(); } @@ -292,19 +292,18 @@ BlenderGPUDisplay::~BlenderGPUDisplay() * Update procedure. */ -bool BlenderGPUDisplay::do_update_begin(const GPUDisplayParams ¶ms, +bool BlenderDisplayDriver::update_begin(const Params ¶ms, int texture_width, int texture_height) { - /* Note that it's the responsibility of BlenderGPUDisplay to ensure updating and drawing + /* Note that it's the responsibility of BlenderDisplayDriver to ensure updating and drawing * the texture does not happen at the same time. This is achieved indirectly. * * When enabling the OpenGL context, it uses an internal mutex lock DST.gl_context_lock. * This same lock is also held when do_draw() is called, which together ensure mutual * exclusion. * - * This locking is not performed at the GPU display level, because that would cause lock - * inversion. 
*/ + * This locking is not performed on the Cycles side, because that would cause lock inversion. */ if (!gl_context_enable()) { return false; } @@ -361,7 +360,7 @@ bool BlenderGPUDisplay::do_update_begin(const GPUDisplayParams ¶ms, return true; } -void BlenderGPUDisplay::do_update_end() +void BlenderDisplayDriver::update_end() { gl_upload_sync_ = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); glFlush(); @@ -370,53 +369,17 @@ void BlenderGPUDisplay::do_update_end() } /* -------------------------------------------------------------------- - * Texture update from CPU buffer. - */ - -void BlenderGPUDisplay::do_copy_pixels_to_texture( - const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height) -{ - /* This call copies pixels to a Pixel Buffer Object (PBO) which is much cheaper from CPU time - * point of view than to copy data directly to the OpenGL texture. - * - * The possible downside of this approach is that it might require a higher peak memory when - * doing partial updates of the texture (although, in practice even partial updates might peak - * with a full-frame buffer stored on the CPU if the GPU is currently occupied). */ - - half4 *mapped_rgba_pixels = map_texture_buffer(); - if (!mapped_rgba_pixels) { - return; - } - - if (texture_x == 0 && texture_y == 0 && pixels_width == texture_.width && - pixels_height == texture_.height) { - const size_t size_in_bytes = sizeof(half4) * texture_.width * texture_.height; - memcpy(mapped_rgba_pixels, rgba_pixels, size_in_bytes); - } - else { - const half4 *rgba_row = rgba_pixels; - half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_.width + texture_x; - for (int y = 0; y < pixels_height; - ++y, rgba_row += pixels_width, mapped_rgba_row += texture_.width) { - memcpy(mapped_rgba_row, rgba_row, sizeof(half4) * pixels_width); - } - } - - unmap_texture_buffer(); -} - -/* -------------------------------------------------------------------- * Texture buffer mapping. 
*/ -half4 *BlenderGPUDisplay::do_map_texture_buffer() +half4 *BlenderDisplayDriver::map_texture_buffer() { glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id); half4 *mapped_rgba_pixels = reinterpret_cast<half4 *>( glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY)); if (!mapped_rgba_pixels) { - LOG(ERROR) << "Error mapping BlenderGPUDisplay pixel buffer object."; + LOG(ERROR) << "Error mapping BlenderDisplayDriver pixel buffer object."; } if (texture_.need_clear) { @@ -431,7 +394,7 @@ half4 *BlenderGPUDisplay::do_map_texture_buffer() return mapped_rgba_pixels; } -void BlenderGPUDisplay::do_unmap_texture_buffer() +void BlenderDisplayDriver::unmap_texture_buffer() { glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); @@ -442,9 +405,9 @@ void BlenderGPUDisplay::do_unmap_texture_buffer() * Graphics interoperability. */ -DeviceGraphicsInteropDestination BlenderGPUDisplay::do_graphics_interop_get() +BlenderDisplayDriver::GraphicsInterop BlenderDisplayDriver::graphics_interop_get() { - DeviceGraphicsInteropDestination interop_dst; + GraphicsInterop interop_dst; interop_dst.buffer_width = texture_.buffer_width; interop_dst.buffer_height = texture_.buffer_height; @@ -456,12 +419,12 @@ DeviceGraphicsInteropDestination BlenderGPUDisplay::do_graphics_interop_get() return interop_dst; } -void BlenderGPUDisplay::graphics_interop_activate() +void BlenderDisplayDriver::graphics_interop_activate() { gl_context_enable(); } -void BlenderGPUDisplay::graphics_interop_deactivate() +void BlenderDisplayDriver::graphics_interop_deactivate() { gl_context_disable(); } @@ -470,27 +433,21 @@ void BlenderGPUDisplay::graphics_interop_deactivate() * Drawing. 
*/ -void BlenderGPUDisplay::clear() +void BlenderDisplayDriver::clear() { texture_.need_clear = true; } -void BlenderGPUDisplay::set_zoom(float zoom_x, float zoom_y) +void BlenderDisplayDriver::set_zoom(float zoom_x, float zoom_y) { zoom_ = make_float2(zoom_x, zoom_y); } -void BlenderGPUDisplay::do_draw(const GPUDisplayParams ¶ms) +void BlenderDisplayDriver::draw(const Params ¶ms) { /* See do_update_begin() for why no locking is required here. */ const bool transparent = true; // TODO(sergey): Derive this from Film. - if (texture_.need_clear) { - /* Texture is requested to be cleared and was not yet cleared. - * Do early return which should be equivalent of drawing all-zero texture. */ - return; - } - if (!gl_draw_resources_ensure()) { return; } @@ -499,6 +456,16 @@ void BlenderGPUDisplay::do_draw(const GPUDisplayParams ¶ms) gl_context_mutex_.lock(); } + if (texture_.need_clear) { + /* Texture is requested to be cleared and was not yet cleared. + * + * Do early return which should be equivalent of drawing all-zero texture. + * Watch out for the lock though so that the clear happening during update is properly + * synchronized here. */ + gl_context_mutex_.unlock(); + return; + } + if (gl_upload_sync_) { glWaitSync((GLsync)gl_upload_sync_, 0, GL_TIMEOUT_IGNORED); } @@ -524,7 +491,7 @@ void BlenderGPUDisplay::do_draw(const GPUDisplayParams ¶ms) const float zoomed_width = params.size.x * zoom_.x; const float zoomed_height = params.size.y * zoom_.y; if (texture_.width != params.size.x || texture_.height != params.size.y) { - /* Resolution divider is different from 1, force enarest interpolation. */ + /* Resolution divider is different from 1, force nearest interpolation. 
*/ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); } else if (zoomed_width - params.size.x > 0.5f || zoomed_height - params.size.y > 0.5f) { @@ -580,7 +547,7 @@ void BlenderGPUDisplay::do_draw(const GPUDisplayParams ¶ms) } } -void BlenderGPUDisplay::gl_context_create() +void BlenderDisplayDriver::gl_context_create() { /* When rendering in viewport there is no render context available via engine. * Check whether own context is to be created here. @@ -609,7 +576,7 @@ void BlenderGPUDisplay::gl_context_create() } } -bool BlenderGPUDisplay::gl_context_enable() +bool BlenderDisplayDriver::gl_context_enable() { if (use_gl_context_) { if (!gl_context_) { @@ -624,7 +591,7 @@ bool BlenderGPUDisplay::gl_context_enable() return true; } -void BlenderGPUDisplay::gl_context_disable() +void BlenderDisplayDriver::gl_context_disable() { if (use_gl_context_) { if (gl_context_) { @@ -637,7 +604,7 @@ void BlenderGPUDisplay::gl_context_disable() RE_engine_render_context_disable(reinterpret_cast<RenderEngine *>(b_engine_.ptr.data)); } -void BlenderGPUDisplay::gl_context_dispose() +void BlenderDisplayDriver::gl_context_dispose() { if (gl_context_) { const bool drw_state = DRW_opengl_context_release(); @@ -649,7 +616,7 @@ void BlenderGPUDisplay::gl_context_dispose() } } -bool BlenderGPUDisplay::gl_draw_resources_ensure() +bool BlenderDisplayDriver::gl_draw_resources_ensure() { if (!texture_.gl_id) { /* If there is no texture allocated, there is nothing to draw. 
Inform the draw call that it can @@ -676,7 +643,7 @@ bool BlenderGPUDisplay::gl_draw_resources_ensure() return true; } -void BlenderGPUDisplay::gl_resources_destroy() +void BlenderDisplayDriver::gl_resources_destroy() { gl_context_enable(); @@ -699,7 +666,7 @@ void BlenderGPUDisplay::gl_resources_destroy() gl_context_dispose(); } -bool BlenderGPUDisplay::gl_texture_resources_ensure() +bool BlenderDisplayDriver::gl_texture_resources_ensure() { if (texture_.creation_attempted) { return texture_.is_created; @@ -736,7 +703,7 @@ bool BlenderGPUDisplay::gl_texture_resources_ensure() return true; } -void BlenderGPUDisplay::texture_update_if_needed() +void BlenderDisplayDriver::texture_update_if_needed() { if (!texture_.need_update) { return; @@ -750,7 +717,7 @@ void BlenderGPUDisplay::texture_update_if_needed() texture_.need_update = false; } -void BlenderGPUDisplay::vertex_buffer_update(const GPUDisplayParams ¶ms) +void BlenderDisplayDriver::vertex_buffer_update(const Params ¶ms) { /* Invalidate old contents - avoids stalling if the buffer is still waiting in queue to be * rendered. 
*/ @@ -763,23 +730,23 @@ void BlenderGPUDisplay::vertex_buffer_update(const GPUDisplayParams ¶ms) vpointer[0] = 0.0f; vpointer[1] = 0.0f; - vpointer[2] = params.offset.x; - vpointer[3] = params.offset.y; + vpointer[2] = params.full_offset.x; + vpointer[3] = params.full_offset.y; vpointer[4] = 1.0f; vpointer[5] = 0.0f; - vpointer[6] = (float)params.size.x + params.offset.x; - vpointer[7] = params.offset.y; + vpointer[6] = (float)params.size.x + params.full_offset.x; + vpointer[7] = params.full_offset.y; vpointer[8] = 1.0f; vpointer[9] = 1.0f; - vpointer[10] = (float)params.size.x + params.offset.x; - vpointer[11] = (float)params.size.y + params.offset.y; + vpointer[10] = (float)params.size.x + params.full_offset.x; + vpointer[11] = (float)params.size.y + params.full_offset.y; vpointer[12] = 0.0f; vpointer[13] = 1.0f; - vpointer[14] = params.offset.x; - vpointer[15] = (float)params.size.y + params.offset.y; + vpointer[14] = params.full_offset.x; + vpointer[15] = (float)params.size.y + params.full_offset.y; glUnmapBuffer(GL_ARRAY_BUFFER); } diff --git a/intern/cycles/blender/blender_gpu_display.h b/intern/cycles/blender/blender_display_driver.h index 89420567037..558997c6b4f 100644 --- a/intern/cycles/blender/blender_gpu_display.h +++ b/intern/cycles/blender/blender_display_driver.h @@ -22,12 +22,14 @@ #include "RNA_blender_cpp.h" -#include "render/gpu_display.h" +#include "render/display_driver.h" + +#include "util/util_thread.h" #include "util/util_unique_ptr.h" CCL_NAMESPACE_BEGIN -/* Base class of shader used for GPU display rendering. */ +/* Base class of shader used for display driver rendering. */ class BlenderDisplayShader { public: static constexpr const char *position_attribute_name = "pos"; @@ -96,11 +98,11 @@ class BlenderDisplaySpaceShader : public BlenderDisplayShader { uint shader_program_ = 0; }; -/* GPU display implementation which is specific for Blender viewport integration. 
*/ -class BlenderGPUDisplay : public GPUDisplay { +/* Display driver implementation which is specific for Blender viewport integration. */ +class BlenderDisplayDriver : public DisplayDriver { public: - BlenderGPUDisplay(BL::RenderEngine &b_engine, BL::Scene &b_scene); - ~BlenderGPUDisplay(); + BlenderDisplayDriver(BL::RenderEngine &b_engine, BL::Scene &b_scene); + ~BlenderDisplayDriver(); virtual void graphics_interop_activate() override; virtual void graphics_interop_deactivate() override; @@ -110,22 +112,15 @@ class BlenderGPUDisplay : public GPUDisplay { void set_zoom(float zoom_x, float zoom_y); protected: - virtual bool do_update_begin(const GPUDisplayParams ¶ms, - int texture_width, - int texture_height) override; - virtual void do_update_end() override; + virtual bool update_begin(const Params ¶ms, int texture_width, int texture_height) override; + virtual void update_end() override; - virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels, - int texture_x, - int texture_y, - int pixels_width, - int pixels_height) override; - virtual void do_draw(const GPUDisplayParams ¶ms) override; + virtual half4 *map_texture_buffer() override; + virtual void unmap_texture_buffer() override; - virtual half4 *do_map_texture_buffer() override; - virtual void do_unmap_texture_buffer() override; + virtual GraphicsInterop graphics_interop_get() override; - virtual DeviceGraphicsInteropDestination do_graphics_interop_get() override; + virtual void draw(const Params ¶ms) override; /* Helper function which allocates new GPU context. */ void gl_context_create(); @@ -152,13 +147,13 @@ class BlenderGPUDisplay : public GPUDisplay { * This buffer is used to render texture in the viewport. * * NOTE: The buffer needs to be bound. */ - void vertex_buffer_update(const GPUDisplayParams ¶ms); + void vertex_buffer_update(const Params ¶ms); BL::RenderEngine b_engine_; /* OpenGL context which is used the render engine doesn't have its own. 
*/ void *gl_context_ = nullptr; - /* The when Blender RenderEngine side context is not available and the GPUDisplay is to create + /* The when Blender RenderEngine side context is not available and the DisplayDriver is to create * its own context. */ bool use_gl_context_ = false; /* Mutex used to guard the `gl_context_`. */ diff --git a/intern/cycles/blender/blender_geometry.cpp b/intern/cycles/blender/blender_geometry.cpp index fca8cb9eda3..7b49bb7fbb7 100644 --- a/intern/cycles/blender/blender_geometry.cpp +++ b/intern/cycles/blender/blender_geometry.cpp @@ -80,8 +80,10 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph, { /* Test if we can instance or if the object is modified. */ Geometry::Type geom_type = determine_geom_type(b_ob_info, use_particle_hair); - BL::ID b_key_id = (BKE_object_is_modified(b_ob_info.real_object)) ? b_ob_info.real_object : - b_ob_info.object_data; + BL::ID b_key_id = (b_ob_info.is_real_object_data() && + BKE_object_is_modified(b_ob_info.real_object)) ? + b_ob_info.real_object : + b_ob_info.object_data; GeometryKey key(b_key_id.ptr.data, geom_type); /* Find shader indices. */ diff --git a/intern/cycles/blender/blender_output_driver.cpp b/intern/cycles/blender/blender_output_driver.cpp new file mode 100644 index 00000000000..f380b7b3bb1 --- /dev/null +++ b/intern/cycles/blender/blender_output_driver.cpp @@ -0,0 +1,127 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "blender/blender_output_driver.h" + +CCL_NAMESPACE_BEGIN + +BlenderOutputDriver::BlenderOutputDriver(BL::RenderEngine &b_engine) : b_engine_(b_engine) +{ +} + +BlenderOutputDriver::~BlenderOutputDriver() +{ +} + +bool BlenderOutputDriver::read_render_tile(const Tile &tile) +{ + /* Get render result. */ + BL::RenderResult b_rr = b_engine_.begin_result(tile.offset.x, + tile.offset.y, + tile.size.x, + tile.size.y, + tile.layer.c_str(), + tile.view.c_str()); + + /* Can happen if the intersected rectangle gives 0 width or height. */ + if (b_rr.ptr.data == NULL) { + return false; + } + + BL::RenderResult::layers_iterator b_single_rlay; + b_rr.layers.begin(b_single_rlay); + + /* layer will be missing if it was disabled in the UI */ + if (b_single_rlay == b_rr.layers.end()) { + return false; + } + + BL::RenderLayer b_rlay = *b_single_rlay; + + vector<float> pixels(tile.size.x * tile.size.y * 4); + + /* Copy each pass. + * TODO:copy only the required ones for better performance? */ + for (BL::RenderPass &b_pass : b_rlay.passes) { + tile.set_pass_pixels(b_pass.name(), b_pass.channels(), (float *)b_pass.rect()); + } + + b_engine_.end_result(b_rr, false, false, false); + + return true; +} + +bool BlenderOutputDriver::update_render_tile(const Tile &tile) +{ + /* Use final write for preview renders, otherwise render result wouldn't be be updated + * quickly on Blender side. For all other cases we use the display driver. */ + if (b_engine_.is_preview()) { + write_render_tile(tile); + return true; + } + else { + /* Don't highlight full-frame tile. */ + if (!(tile.size == tile.full_size)) { + b_engine_.tile_highlight_clear_all(); + b_engine_.tile_highlight_set(tile.offset.x, tile.offset.y, tile.size.x, tile.size.y, true); + } + + return false; + } +} + +void BlenderOutputDriver::write_render_tile(const Tile &tile) +{ + b_engine_.tile_highlight_clear_all(); + + /* Get render result. 
*/ + BL::RenderResult b_rr = b_engine_.begin_result(tile.offset.x, + tile.offset.y, + tile.size.x, + tile.size.y, + tile.layer.c_str(), + tile.view.c_str()); + + /* Can happen if the intersected rectangle gives 0 width or height. */ + if (b_rr.ptr.data == NULL) { + return; + } + + BL::RenderResult::layers_iterator b_single_rlay; + b_rr.layers.begin(b_single_rlay); + + /* Layer will be missing if it was disabled in the UI. */ + if (b_single_rlay == b_rr.layers.end()) { + return; + } + + BL::RenderLayer b_rlay = *b_single_rlay; + + vector<float> pixels(tile.size.x * tile.size.y * 4); + + /* Copy each pass. */ + for (BL::RenderPass &b_pass : b_rlay.passes) { + if (!tile.get_pass_pixels(b_pass.name(), b_pass.channels(), &pixels[0])) { + memset(&pixels[0], 0, pixels.size() * sizeof(float)); + } + + b_pass.rect(&pixels[0]); + } + + b_engine_.end_result(b_rr, true, false, true); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_output_driver.h b/intern/cycles/blender/blender_output_driver.h new file mode 100644 index 00000000000..8a1cf92d7c7 --- /dev/null +++ b/intern/cycles/blender/blender_output_driver.h @@ -0,0 +1,40 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "MEM_guardedalloc.h" + +#include "RNA_blender_cpp.h" + +#include "render/output_driver.h" + +CCL_NAMESPACE_BEGIN + +class BlenderOutputDriver : public OutputDriver { + public: + BlenderOutputDriver(BL::RenderEngine &b_engine); + ~BlenderOutputDriver(); + + virtual void write_render_tile(const Tile &tile) override; + virtual bool update_render_tile(const Tile &tile) override; + virtual bool read_render_tile(const Tile &tile) override; + + protected: + BL::RenderEngine b_engine_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 694d8454422..d681517c9e1 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -911,14 +911,16 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args* static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/) { vector<DeviceType> device_types = Device::available_types(); - bool has_cuda = false, has_optix = false; + bool has_cuda = false, has_optix = false, has_hip = false; foreach (DeviceType device_type, device_types) { has_cuda |= (device_type == DEVICE_CUDA); has_optix |= (device_type == DEVICE_OPTIX); + has_hip |= (device_type == DEVICE_HIP); } - PyObject *list = PyTuple_New(2); + PyObject *list = PyTuple_New(3); PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda)); PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix)); + PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip)); return list; } @@ -944,6 +946,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg) else if (override == "OPTIX") { BlenderSession::device_override = DEVICE_MASK_OPTIX; } + else if (override == "HIP") { + BlenderSession::device_override = DEVICE_MASK_HIP; + } else { printf("\nError: %s is not a valid Cycles device.\n", override.c_str()); Py_RETURN_FALSE; diff --git a/intern/cycles/blender/blender_session.cpp 
b/intern/cycles/blender/blender_session.cpp index d65d89a7ddd..3be7ff32bd8 100644 --- a/intern/cycles/blender/blender_session.cpp +++ b/intern/cycles/blender/blender_session.cpp @@ -42,7 +42,8 @@ #include "util/util_progress.h" #include "util/util_time.h" -#include "blender/blender_gpu_display.h" +#include "blender/blender_display_driver.h" +#include "blender/blender_output_driver.h" #include "blender/blender_session.h" #include "blender/blender_sync.h" #include "blender/blender_util.h" @@ -71,7 +72,8 @@ BlenderSession::BlenderSession(BL::RenderEngine &b_engine, width(0), height(0), preview_osl(preview_osl), - python_thread_state(NULL) + python_thread_state(NULL), + use_developer_ui(false) { /* offline render */ background = true; @@ -156,11 +158,13 @@ void BlenderSession::create_session() b_v3d, b_rv3d, scene->camera, width, height); session->reset(session_params, buffer_params); - /* Create GPU display. */ + /* Create GPU display. + * TODO(sergey): Investigate whether DisplayDriver can be used for the preview as well. 
*/ if (!b_engine.is_preview() && !headless) { - unique_ptr<BlenderGPUDisplay> gpu_display = make_unique<BlenderGPUDisplay>(b_engine, b_scene); - gpu_display_ = gpu_display.get(); - session->set_gpu_display(move(gpu_display)); + unique_ptr<BlenderDisplayDriver> display_driver = make_unique<BlenderDisplayDriver>(b_engine, + b_scene); + display_driver_ = display_driver.get(); + session->set_display_driver(move(display_driver)); } /* Viewport and preview (as in, material preview) does not do tiled rendering, so can inform @@ -277,94 +281,6 @@ void BlenderSession::free_session() session = nullptr; } -void BlenderSession::read_render_tile() -{ - const int2 tile_offset = session->get_render_tile_offset(); - const int2 tile_size = session->get_render_tile_size(); - - /* get render result */ - BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x, - tile_offset.y, - tile_size.x, - tile_size.y, - b_rlay_name.c_str(), - b_rview_name.c_str()); - - /* can happen if the intersected rectangle gives 0 width or height */ - if (b_rr.ptr.data == NULL) { - return; - } - - BL::RenderResult::layers_iterator b_single_rlay; - b_rr.layers.begin(b_single_rlay); - - /* layer will be missing if it was disabled in the UI */ - if (b_single_rlay == b_rr.layers.end()) - return; - - BL::RenderLayer b_rlay = *b_single_rlay; - - vector<float> pixels(tile_size.x * tile_size.y * 4); - - /* Copy each pass. - * TODO:copy only the required ones for better performance? 
*/ - for (BL::RenderPass &b_pass : b_rlay.passes) { - session->set_render_tile_pixels(b_pass.name(), b_pass.channels(), (float *)b_pass.rect()); - } -} - -void BlenderSession::write_render_tile() -{ - const int2 tile_offset = session->get_render_tile_offset(); - const int2 tile_size = session->get_render_tile_size(); - - const string_view render_layer_name = session->get_render_tile_layer(); - const string_view render_view_name = session->get_render_tile_view(); - - b_engine.tile_highlight_clear_all(); - - /* get render result */ - BL::RenderResult b_rr = b_engine.begin_result(tile_offset.x, - tile_offset.y, - tile_size.x, - tile_size.y, - render_layer_name.c_str(), - render_view_name.c_str()); - - /* can happen if the intersected rectangle gives 0 width or height */ - if (b_rr.ptr.data == NULL) { - return; - } - - BL::RenderResult::layers_iterator b_single_rlay; - b_rr.layers.begin(b_single_rlay); - - /* layer will be missing if it was disabled in the UI */ - if (b_single_rlay == b_rr.layers.end()) { - return; - } - - BL::RenderLayer b_rlay = *b_single_rlay; - - write_render_result(b_rlay); - - b_engine.end_result(b_rr, true, false, true); -} - -void BlenderSession::update_render_tile() -{ - if (!session->has_multiple_render_tiles()) { - /* Don't highlight full-frame tile. */ - return; - } - - const int2 tile_offset = session->get_render_tile_offset(); - const int2 tile_size = session->get_render_tile_size(); - - b_engine.tile_highlight_clear_all(); - b_engine.tile_highlight_set(tile_offset.x, tile_offset.y, tile_size.x, tile_size.y, true); -} - void BlenderSession::full_buffer_written(string_view filename) { full_buffer_files_.emplace_back(filename); @@ -438,18 +354,8 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) return; } - /* set callback to write out render results */ - session->write_render_tile_cb = [&]() { write_render_tile(); }; - - /* Use final write for preview renders, otherwise render result wouldn't be be updated on Blender - * side. 
*/ - /* TODO(sergey): Investigate whether GPUDisplay can be used for the preview as well. */ - if (b_engine.is_preview()) { - session->update_render_tile_cb = [&]() { write_render_tile(); }; - } - else { - session->update_render_tile_cb = [&]() { update_render_tile(); }; - } + /* Create driver to write out render results. */ + session->set_output_driver(make_unique<BlenderOutputDriver>(b_engine)); session->full_buffer_written_cb = [&](string_view filename) { full_buffer_written(filename); }; @@ -557,6 +463,11 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_) /* free result without merging */ b_engine.end_result(b_rr, true, false, false); + /* When tiled rendering is used there will be no "write" done for the tile. Forcefully clear + * highlighted tiles now, so that the highlight will be removed while processing full frame from + * file. */ + b_engine.tile_highlight_clear_all(); + double total_time, render_time; session->progress.get_time(total_time, render_time); VLOG(1) << "Total render time: " << total_time; @@ -581,12 +492,17 @@ void BlenderSession::render_frame_finish() for (string_view filename : full_buffer_files_) { session->process_full_buffer_from_disk(filename); + if (check_and_report_session_error()) { + break; + } + } + + for (string_view filename : full_buffer_files_) { path_remove(filename); } - /* clear callback */ - session->write_render_tile_cb = function_null; - session->update_render_tile_cb = function_null; + /* Clear driver. 
*/ + session->set_output_driver(nullptr); session->full_buffer_written_cb = function_null; } @@ -692,9 +608,8 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_, pass->set_type(bake_type_to_pass(bake_type, bake_filter)); pass->set_include_albedo((bake_filter & BL::BakeSettings::pass_filter_COLOR)); - session->read_render_tile_cb = [&]() { read_render_tile(); }; - session->write_render_tile_cb = [&]() { write_render_tile(); }; - session->set_gpu_display(nullptr); + session->set_display_driver(nullptr); + session->set_output_driver(make_unique<BlenderOutputDriver>(b_engine)); if (!session->progress.get_cancel()) { /* Sync scene. */ @@ -737,43 +652,7 @@ void BlenderSession::bake(BL::Depsgraph &b_depsgraph_, session->wait(); } - session->read_render_tile_cb = function_null; - session->write_render_tile_cb = function_null; -} - -void BlenderSession::write_render_result(BL::RenderLayer &b_rlay) -{ - if (!session->copy_render_tile_from_device()) { - return; - } - - const int2 tile_size = session->get_render_tile_size(); - vector<float> pixels(tile_size.x * tile_size.y * 4); - - /* Copy each pass. */ - for (BL::RenderPass &b_pass : b_rlay.passes) { - if (!session->get_render_tile_pixels(b_pass.name(), b_pass.channels(), &pixels[0])) { - memset(&pixels[0], 0, pixels.size() * sizeof(float)); - } - - b_pass.rect(&pixels[0]); - } -} - -void BlenderSession::update_render_result(BL::RenderLayer &b_rlay) -{ - if (!session->copy_render_tile_from_device()) { - return; - } - - const int2 tile_size = session->get_render_tile_size(); - vector<float> pixels(tile_size.x * tile_size.y * 4); - - /* Copy combined pass. 
*/ - BL::RenderPass b_combined_pass(b_rlay.passes.find_by_name("Combined", b_rview_name.c_str())); - if (session->get_render_tile_pixels("Combined", b_combined_pass.channels(), &pixels[0])) { - b_combined_pass.rect(&pixels[0]); - } + session->set_output_driver(nullptr); } void BlenderSession::synchronize(BL::Depsgraph &b_depsgraph_) @@ -881,7 +760,7 @@ void BlenderSession::draw(BL::SpaceImageEditor &space_image) } BL::Array<float, 2> zoom = space_image.zoom(); - gpu_display_->set_zoom(zoom[0], zoom[1]); + display_driver_->set_zoom(zoom[0], zoom[1]); session->draw(); } @@ -988,8 +867,9 @@ void BlenderSession::update_status_progress() get_status(status, substatus); get_progress(progress, total_time, render_time); - if (progress > 0) - remaining_time = (1.0 - (double)progress) * (render_time / (double)progress); + if (progress > 0) { + remaining_time = session->get_estimated_remaining_time(); + } if (background) { if (scene) @@ -1027,20 +907,27 @@ void BlenderSession::update_status_progress() last_progress = progress; } - if (session->progress.get_error()) { - string error = session->progress.get_error_message(); - if (error != last_error) { - /* TODO(sergey): Currently C++ RNA API doesn't let us to - * use mnemonic name for the variable. Would be nice to - * have this figured out. - * - * For until then, 1 << 5 means RPT_ERROR. - */ - b_engine.report(1 << 5, error.c_str()); - b_engine.error_set(error.c_str()); - last_error = error; - } + check_and_report_session_error(); +} + +bool BlenderSession::check_and_report_session_error() +{ + if (!session->progress.get_error()) { + return false; } + + const string error = session->progress.get_error_message(); + if (error != last_error) { + /* TODO(sergey): Currently C++ RNA API doesn't let us to use mnemonic name for the variable. + * Would be nice to have this figured out. + * + * For until then, 1 << 5 means RPT_ERROR. 
*/ + b_engine.report(1 << 5, error.c_str()); + b_engine.error_set(error.c_str()); + last_error = error; + } + + return true; } void BlenderSession::tag_update() diff --git a/intern/cycles/blender/blender_session.h b/intern/cycles/blender/blender_session.h index 11e2657a325..fef6ad1adfc 100644 --- a/intern/cycles/blender/blender_session.h +++ b/intern/cycles/blender/blender_session.h @@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN -class BlenderGPUDisplay; +class BlenderDisplayDriver; class BlenderSync; class ImageMetaData; class Scene; @@ -70,20 +70,7 @@ class BlenderSession { const int bake_width, const int bake_height); - void write_render_result(BL::RenderLayer &b_rlay); - void write_render_tile(); - - void update_render_tile(); - void full_buffer_written(string_view filename); - - /* update functions are used to update display buffer only after sample was rendered - * only needed for better visual feedback */ - void update_render_result(BL::RenderLayer &b_rlay); - - /* read functions for baking input */ - void read_render_tile(); - /* interactive updates */ void synchronize(BL::Depsgraph &b_depsgraph); @@ -110,8 +97,7 @@ class BlenderSession { BL::RenderSettings b_render; BL::Depsgraph b_depsgraph; /* NOTE: Blender's scene might become invalid after call - * free_blender_memory_if_possible(). - */ + * #free_blender_memory_if_possible(). */ BL::Scene b_scene; BL::SpaceView3D b_v3d; BL::RegionView3D b_rv3d; @@ -147,6 +133,11 @@ class BlenderSession { protected: void stamp_view_layer_metadata(Scene *scene, const string &view_layer_name); + /* Check whether session error happened. + * If so, it is reported to the render engine and true is returned. + * Otherwise false is returned. */ + bool check_and_report_session_error(); + void builtin_images_load(); /* Is used after each render layer synchronization is done with the goal @@ -160,8 +151,8 @@ class BlenderSession { int last_pass_index = -1; } draw_state_; - /* NOTE: The BlenderSession references the GPU display. 
*/ - BlenderGPUDisplay *gpu_display_ = nullptr; + /* NOTE: The BlenderSession references the display driver. */ + BlenderDisplayDriver *display_driver_ = nullptr; vector<string> full_buffer_files_; }; diff --git a/intern/cycles/blender/blender_shader.cpp b/intern/cycles/blender/blender_shader.cpp index 8c4f789ffd0..0b8aea15d6c 100644 --- a/intern/cycles/blender/blender_shader.cpp +++ b/intern/cycles/blender/blender_shader.cpp @@ -279,7 +279,7 @@ static ShaderNode *add_node(Scene *scene, array<float3> curve_mapping_curves; float min_x, max_x; curvemapping_color_to_array(mapping, curve_mapping_curves, RAMP_TABLE_SIZE, true); - curvemapping_minmax(mapping, true, &min_x, &max_x); + curvemapping_minmax(mapping, 4, &min_x, &max_x); curves->set_min_x(min_x); curves->set_max_x(max_x); curves->set_curves(curve_mapping_curves); @@ -292,12 +292,25 @@ static ShaderNode *add_node(Scene *scene, array<float3> curve_mapping_curves; float min_x, max_x; curvemapping_color_to_array(mapping, curve_mapping_curves, RAMP_TABLE_SIZE, false); - curvemapping_minmax(mapping, false, &min_x, &max_x); + curvemapping_minmax(mapping, 3, &min_x, &max_x); curves->set_min_x(min_x); curves->set_max_x(max_x); curves->set_curves(curve_mapping_curves); node = curves; } + else if (b_node.is_a(&RNA_ShaderNodeFloatCurve)) { + BL::ShaderNodeFloatCurve b_curve_node(b_node); + BL::CurveMapping mapping(b_curve_node.mapping()); + FloatCurveNode *curve = graph->create_node<FloatCurveNode>(); + array<float> curve_mapping_curve; + float min_x, max_x; + curvemapping_float_to_array(mapping, curve_mapping_curve, RAMP_TABLE_SIZE); + curvemapping_minmax(mapping, 1, &min_x, &max_x); + curve->set_min_x(min_x); + curve->set_max_x(max_x); + curve->set_curve(curve_mapping_curve); + node = curve; + } else if (b_node.is_a(&RNA_ShaderNodeValToRGB)) { RGBRampNode *ramp = graph->create_node<RGBRampNode>(); BL::ShaderNodeValToRGB b_ramp_node(b_node); diff --git a/intern/cycles/blender/blender_util.h 
b/intern/cycles/blender/blender_util.h index 04008d77d89..77b2bd5ac4f 100644 --- a/intern/cycles/blender/blender_util.h +++ b/intern/cycles/blender/blender_util.h @@ -90,26 +90,27 @@ static inline BL::Mesh object_to_mesh(BL::BlendData & /*data*/, } #endif - BL::Mesh mesh(PointerRNA_NULL); - if (b_ob_info.object_data.is_a(&RNA_Mesh)) { - /* TODO: calc_undeformed is not used. */ - mesh = BL::Mesh(b_ob_info.object_data); - - /* Make a copy to split faces if we use autosmooth, otherwise not needed. - * Also in edit mode do we need to make a copy, to ensure data layers like - * UV are not empty. */ - if (mesh.is_editmode() || - (mesh.use_auto_smooth() && subdivision_type == Mesh::SUBDIVISION_NONE)) { + BL::Mesh mesh = (b_ob_info.object_data.is_a(&RNA_Mesh)) ? BL::Mesh(b_ob_info.object_data) : + BL::Mesh(PointerRNA_NULL); + + if (b_ob_info.is_real_object_data()) { + if (mesh) { + /* Make a copy to split faces if we use autosmooth, otherwise not needed. + * Also in edit mode do we need to make a copy, to ensure data layers like + * UV are not empty. */ + if (mesh.is_editmode() || + (mesh.use_auto_smooth() && subdivision_type == Mesh::SUBDIVISION_NONE)) { + BL::Depsgraph depsgraph(PointerRNA_NULL); + mesh = b_ob_info.real_object.to_mesh(false, depsgraph); + } + } + else { BL::Depsgraph depsgraph(PointerRNA_NULL); - assert(b_ob_info.is_real_object_data()); mesh = b_ob_info.real_object.to_mesh(false, depsgraph); } } else { - BL::Depsgraph depsgraph(PointerRNA_NULL); - if (b_ob_info.is_real_object_data()) { - mesh = b_ob_info.real_object.to_mesh(false, depsgraph); - } + /* TODO: what to do about non-mesh geometry instances? */ } #if 0 @@ -170,12 +171,11 @@ static inline void curvemap_minmax_curve(/*const*/ BL::CurveMap &curve, float *m } static inline void curvemapping_minmax(/*const*/ BL::CurveMapping &cumap, - bool rgb_curve, + int num_curves, float *min_x, float *max_x) { // const int num_curves = cumap.curves.length(); /* Gives linking error so far. 
*/ - const int num_curves = rgb_curve ? 4 : 3; *min_x = FLT_MAX; *max_x = -FLT_MAX; for (int i = 0; i < num_curves; ++i) { @@ -195,6 +195,28 @@ static inline void curvemapping_to_array(BL::CurveMapping &cumap, array<float> & } } +static inline void curvemapping_float_to_array(BL::CurveMapping &cumap, + array<float> &data, + int size) +{ + float min = 0.0f, max = 1.0f; + + curvemapping_minmax(cumap, 1, &min, &max); + + const float range = max - min; + + cumap.update(); + + BL::CurveMap map = cumap.curves[0]; + + data.resize(size); + + for (int i = 0; i < size; i++) { + float t = min + (float)i / (float)(size - 1) * range; + data[i] = cumap.evaluate(map, t); + } +} + static inline void curvemapping_color_to_array(BL::CurveMapping &cumap, array<float3> &data, int size, @@ -213,7 +235,8 @@ static inline void curvemapping_color_to_array(BL::CurveMapping &cumap, * * There might be some better estimations here tho. */ - curvemapping_minmax(cumap, rgb_curve, &min_x, &max_x); + const int num_curves = rgb_curve ? 4 : 3; + curvemapping_minmax(cumap, num_curves, &min_x, &max_x); const float range_x = max_x - min_x; diff --git a/intern/cycles/bvh/bvh_embree.cpp b/intern/cycles/bvh/bvh_embree.cpp index 96852510b63..20430cb164c 100644 --- a/intern/cycles/bvh/bvh_embree.cpp +++ b/intern/cycles/bvh/bvh_embree.cpp @@ -213,7 +213,7 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args) if (ctx->num_hits < ctx->max_hits) { Intersection current_isect; kernel_embree_convert_hit(kg, ray, hit, &current_isect); - for (size_t i = 0; i < ctx->max_hits; ++i) { + for (size_t i = 0; i < ctx->num_hits; ++i) { if (current_isect.object == ctx->isect_s[i].object && current_isect.prim == ctx->isect_s[i].prim && current_isect.t == ctx->isect_s[i].t) { /* This intersection was already recorded, skip it. 
*/ diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index da259171844..b966edd4298 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -532,4 +532,13 @@ if(WITH_CYCLES_CUDA_BINARIES OR NOT WITH_CUDA_DYNLOAD) endif() endif() + +########################################################################### +# HIP +########################################################################### + +if(NOT WITH_HIP_DYNLOAD) + set(WITH_HIP_DYNLOAD ON) +endif() + unset(_cycles_lib_dir) diff --git a/intern/cycles/cmake/macros.cmake b/intern/cycles/cmake/macros.cmake index 47196dfd1ce..a470fb9c574 100644 --- a/intern/cycles/cmake/macros.cmake +++ b/intern/cycles/cmake/macros.cmake @@ -156,10 +156,16 @@ macro(cycles_target_link_libraries target) ${PLATFORM_LINKLIBS} ) - if(WITH_CUDA_DYNLOAD) - target_link_libraries(${target} extern_cuew) - else() - target_link_libraries(${target} ${CUDA_CUDA_LIBRARY}) + if(WITH_CYCLES_DEVICE_CUDA OR WITH_CYCLES_DEVICE_OPTIX) + if(WITH_CUDA_DYNLOAD) + target_link_libraries(${target} extern_cuew) + else() + target_link_libraries(${target} ${CUDA_CUDA_LIBRARY}) + endif() + endif() + + if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD) + target_link_libraries(${target} extern_hipew) endif() if(CYCLES_STANDALONE_REPOSITORY) diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index d18f4360aef..6d33a6f107f 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -22,16 +22,25 @@ set(INC_SYS ../../../extern/clew/include ) -if(WITH_CUDA_DYNLOAD) +if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA) + if(WITH_CUDA_DYNLOAD) + list(APPEND INC + ../../../extern/cuew/include + ) + add_definitions(-DWITH_CUDA_DYNLOAD) + else() + list(APPEND INC_SYS + ${CUDA_TOOLKIT_INCLUDE} + ) + add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}") + endif() +endif() + +if(WITH_CYCLES_DEVICE_HIP 
AND WITH_HIP_DYNLOAD) list(APPEND INC - ../../../extern/cuew/include - ) - add_definitions(-DWITH_CUDA_DYNLOAD) -else() - list(APPEND INC_SYS - ${CUDA_TOOLKIT_INCLUDE} + ../../../extern/hipew/include ) - add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}") + add_definitions(-DWITH_HIP_DYNLOAD) endif() set(SRC @@ -70,6 +79,21 @@ set(SRC_CUDA cuda/util.h ) +set(SRC_HIP + hip/device.cpp + hip/device.h + hip/device_impl.cpp + hip/device_impl.h + hip/graphics_interop.cpp + hip/graphics_interop.h + hip/kernel.cpp + hip/kernel.h + hip/queue.cpp + hip/queue.h + hip/util.cpp + hip/util.h +) + set(SRC_DUMMY dummy/device.cpp dummy/device.h @@ -105,13 +129,21 @@ set(LIB ${CYCLES_GL_LIBRARIES} ) -if(WITH_CUDA_DYNLOAD) - list(APPEND LIB - extern_cuew - ) -else() +if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA) + if(WITH_CUDA_DYNLOAD) + list(APPEND LIB + extern_cuew + ) + else() + list(APPEND LIB + ${CUDA_CUDA_LIBRARY} + ) + endif() +endif() + +if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD) list(APPEND LIB - ${CUDA_CUDA_LIBRARY} + extern_hipew ) endif() @@ -120,6 +152,9 @@ add_definitions(${GL_DEFINITIONS}) if(WITH_CYCLES_DEVICE_CUDA) add_definitions(-DWITH_CUDA) endif() +if(WITH_CYCLES_DEVICE_HIP) + add_definitions(-DWITH_HIP) +endif() if(WITH_CYCLES_DEVICE_OPTIX) add_definitions(-DWITH_OPTIX) endif() @@ -140,6 +175,7 @@ cycles_add_library(cycles_device "${LIB}" ${SRC} ${SRC_CPU} ${SRC_CUDA} + ${SRC_HIP} ${SRC_DUMMY} ${SRC_MULTI} ${SRC_OPTIX} diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp index 3b0db6bdd0e..d02c18daee9 100644 --- a/intern/cycles/device/cpu/device_impl.cpp +++ b/intern/cycles/device/cpu/device_impl.cpp @@ -54,7 +54,6 @@ #include "util/util_function.h" #include "util/util_logging.h" #include "util/util_map.h" -#include "util/util_opengl.h" #include "util/util_openimagedenoise.h" #include "util/util_optimization.h" #include "util/util_progress.h" @@ -170,7 +169,7 @@ void 
CPUDevice::mem_copy_to(device_memory &mem) } void CPUDevice::mem_copy_from( - device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) + device_memory & /*mem*/, size_t /*y*/, size_t /*w*/, size_t /*h*/, size_t /*elem*/) { /* no-op */ } @@ -204,7 +203,7 @@ void CPUDevice::mem_free(device_memory &mem) } } -device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) +device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) { return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); } @@ -298,154 +297,6 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) Device::build_bvh(bvh, progress, refit); } -#if 0 -void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg) -{ - const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE; - - scoped_timer timer(&tile.buffers->render_time); - - Coverage coverage(kg, tile); - if (use_coverage) { - coverage.init_path_trace(); - } - - float *render_buffer = (float *)tile.buffer; - int start_sample = tile.start_sample; - int end_sample = tile.start_sample + tile.num_samples; - - /* Needed for Embree. 
*/ - SIMD_SET_FLUSH_TO_ZERO; - - for (int sample = start_sample; sample < end_sample; sample++) { - if (task.get_cancel() || TaskPool::canceled()) { - if (task.need_finish_queue == false) - break; - } - - if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) { - tile.stealing_state = RenderTile::WAS_STOLEN; - break; - } - - if (tile.task == RenderTile::PATH_TRACE) { - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - if (use_coverage) { - coverage.init_pixel(x, y); - } - kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride); - } - } - } - else { - for (int y = tile.y; y < tile.y + tile.h; y++) { - for (int x = tile.x; x < tile.x + tile.w; x++) { - kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride); - } - } - } - tile.sample = sample + 1; - - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) { - const bool stop = adaptive_sampling_filter(kg, tile, sample); - if (stop) { - const int num_progress_samples = end_sample - sample; - tile.sample = end_sample; - task.update_progress(&tile, tile.w * tile.h * num_progress_samples); - break; - } - } - - task.update_progress(&tile, tile.w * tile.h); - } - if (use_coverage) { - coverage.finalize(); - } - - if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) { - adaptive_sampling_post(tile, kg); - } -} - -void CPUDevice::thread_render(DeviceTask &task) -{ - if (TaskPool::canceled()) { - if (task.need_finish_queue == false) - return; - } - - /* allocate buffer for kernel globals */ - CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory()); - - profiler.add_state(&kg.profiler); - - /* NLM denoiser. */ - DenoisingTask *denoising = NULL; - - /* OpenImageDenoise: we can only denoise with one thread at a time, so to - * avoid waiting with mutex locks in the denoiser, we let only a single - * thread acquire denoising tiles. 
*/ - uint tile_types = task.tile_types; - bool hold_denoise_lock = false; - if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) { - if (!oidn_task_lock.try_lock()) { - tile_types &= ~RenderTile::DENOISE; - hold_denoise_lock = true; - } - } - - RenderTile tile; - while (task.acquire_tile(this, tile, tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - render(task, tile, &kg); - } - else if (tile.task == RenderTile::BAKE) { - render(task, tile, &kg); - } - else if (tile.task == RenderTile::DENOISE) { - denoise_openimagedenoise(task, tile); - task.update_progress(&tile, tile.w * tile.h); - } - - task.release_tile(tile); - - if (TaskPool::canceled()) { - if (task.need_finish_queue == false) - break; - } - } - - if (hold_denoise_lock) { - oidn_task_lock.unlock(); - } - - profiler.remove_state(&kg.profiler); - - delete denoising; -} - -void CPUDevice::thread_denoise(DeviceTask &task) -{ - RenderTile tile; - tile.x = task.x; - tile.y = task.y; - tile.w = task.w; - tile.h = task.h; - tile.buffer = task.buffer; - tile.sample = task.sample + task.num_samples; - tile.num_samples = task.num_samples; - tile.start_sample = task.sample; - tile.offset = task.offset; - tile.stride = task.stride; - tile.buffers = task.buffers; - - denoise_openimagedenoise(task, tile); - - task.update_progress(&tile, tile.w * tile.h); -} -#endif - const CPUKernels *CPUDevice::get_cpu_kernels() const { return &kernels; diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h index 7d222808652..371d2258104 100644 --- a/intern/cycles/device/cpu/device_impl.h +++ b/intern/cycles/device/cpu/device_impl.h @@ -72,10 +72,13 @@ class CPUDevice : public Device { virtual void mem_alloc(device_memory &mem) override; virtual void mem_copy_to(device_memory &mem) override; - virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; + virtual void mem_copy_from( + device_memory &mem, size_t y, size_t w, 
size_t h, size_t elem) override; virtual void mem_zero(device_memory &mem) override; virtual void mem_free(device_memory &mem) override; - virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; + virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, + size_t offset, + size_t /*size*/) override; virtual void const_copy_to(const char *name, void *host, size_t size) override; diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp index 37fab8f8293..5e1a63c04df 100644 --- a/intern/cycles/device/cuda/device_impl.cpp +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -31,7 +31,6 @@ # include "util/util_logging.h" # include "util/util_map.h" # include "util/util_md5.h" -# include "util/util_opengl.h" # include "util/util_path.h" # include "util/util_string.h" # include "util/util_system.h" @@ -837,7 +836,7 @@ void CUDADevice::mem_copy_to(device_memory &mem) } } -void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem) +void CUDADevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) { if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { assert(!"mem_copy_from not supported for textures."); @@ -891,7 +890,7 @@ void CUDADevice::mem_free(device_memory &mem) } } -device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) +device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) { return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); } @@ -1169,141 +1168,6 @@ void CUDADevice::tex_free(device_texture &mem) } } -# if 0 -void CUDADevice::render(DeviceTask &task, - RenderTile &rtile, - device_vector<KernelWorkTile> &work_tiles) -{ - scoped_timer timer(&rtile.buffers->render_time); - - if (have_error()) - return; - - CUDAContextScope scope(this); - CUfunction cuRender; - - /* Get kernel function. 
*/ - if (rtile.task == RenderTile::BAKE) { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake")); - } - else { - cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace")); - } - - if (have_error()) { - return; - } - - cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1)); - - /* Allocate work tile. */ - work_tiles.alloc(1); - - KernelWorkTile *wtile = work_tiles.data(); - wtile->x = rtile.x; - wtile->y = rtile.y; - wtile->w = rtile.w; - wtile->h = rtile.h; - wtile->offset = rtile.offset; - wtile->stride = rtile.stride; - wtile->buffer = (float *)(CUdeviceptr)rtile.buffer; - - /* Prepare work size. More step samples render faster, but for now we - * remain conservative for GPUs connected to a display to avoid driver - * timeouts and display freezing. */ - int min_blocks, num_threads_per_block; - cuda_assert( - cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0)); - if (!info.display_device) { - min_blocks *= 8; - } - - uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); - - /* Render all samples. */ - uint start_sample = rtile.start_sample; - uint end_sample = rtile.start_sample + rtile.num_samples; - - for (int sample = start_sample; sample < end_sample;) { - /* Setup and copy work tile to device. */ - wtile->start_sample = sample; - wtile->num_samples = step_samples; - if (task.adaptive_sampling.use) { - wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); - } - wtile->num_samples = min(wtile->num_samples, end_sample - sample); - work_tiles.copy_to_device(); - - CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; - uint total_work_size = wtile->w * wtile->h * wtile->num_samples; - uint num_blocks = divide_up(total_work_size, num_threads_per_block); - - /* Launch kernel. 
*/ - void *args[] = {&d_work_tiles, &total_work_size}; - - cuda_assert( - cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); - - /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ - uint filter_sample = sample + wtile->num_samples - 1; - if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { - adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); - } - - cuda_assert(cuCtxSynchronize()); - - /* Update progress. */ - sample += wtile->num_samples; - rtile.sample = sample; - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - /* Finalize adaptive sampling. */ - if (task.adaptive_sampling.use) { - CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer; - adaptive_sampling_post(rtile, wtile, d_work_tiles); - cuda_assert(cuCtxSynchronize()); - task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); - } -} - -void CUDADevice::thread_run(DeviceTask &task) -{ - CUDAContextScope scope(this); - - if (task.type == DeviceTask::RENDER) { - device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); - - /* keep rendering tiles until done */ - RenderTile tile; - DenoisingTask denoising(this, task); - - while (task.acquire_tile(this, tile, task.tile_types)) { - if (tile.task == RenderTile::PATH_TRACE) { - render(task, tile, work_tiles); - } - else if (tile.task == RenderTile::BAKE) { - render(task, tile, work_tiles); - } - - task.release_tile(tile); - - if (task.get_cancel()) { - if (task.need_finish_queue == false) - break; - } - } - - work_tiles.free(); - } -} -# endif - unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create() { return make_unique<CUDADeviceQueue>(this); diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h index 6b27db54ab4..c0316d18ba0 100644 --- 
a/intern/cycles/device/cuda/device_impl.h +++ b/intern/cycles/device/cuda/device_impl.h @@ -26,7 +26,6 @@ # ifdef WITH_CUDA_DYNLOAD # include "cuew.h" # else -# include "util/util_opengl.h" # include <cuda.h> # include <cudaGL.h> # endif @@ -120,13 +119,13 @@ class CUDADevice : public Device { void mem_copy_to(device_memory &mem) override; - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override; + void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override; void mem_zero(device_memory &mem) override; void mem_free(device_memory &mem) override; - device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override; + device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override; virtual void const_copy_to(const char *name, void *host, size_t size) override; diff --git a/intern/cycles/device/cuda/graphics_interop.cpp b/intern/cycles/device/cuda/graphics_interop.cpp index e8ca8b90eae..30efefd9b6b 100644 --- a/intern/cycles/device/cuda/graphics_interop.cpp +++ b/intern/cycles/device/cuda/graphics_interop.cpp @@ -37,14 +37,15 @@ CUDADeviceGraphicsInterop::~CUDADeviceGraphicsInterop() } } -void CUDADeviceGraphicsInterop::set_destination( - const DeviceGraphicsInteropDestination &destination) +void CUDADeviceGraphicsInterop::set_display_interop( + const DisplayDriver::GraphicsInterop &display_interop) { - const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height; + const int64_t new_buffer_area = int64_t(display_interop.buffer_width) * + display_interop.buffer_height; - need_clear_ = destination.need_clear; + need_clear_ = display_interop.need_clear; - if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) { + if (opengl_pbo_id_ == display_interop.opengl_pbo_id && buffer_area_ == new_buffer_area) { return; } @@ -55,12 +56,12 @@ void CUDADeviceGraphicsInterop::set_destination( } const CUresult result = 
cuGraphicsGLRegisterBuffer( - &cu_graphics_resource_, destination.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); + &cu_graphics_resource_, display_interop.opengl_pbo_id, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE); if (result != CUDA_SUCCESS) { LOG(ERROR) << "Error registering OpenGL buffer: " << cuewErrorString(result); } - opengl_pbo_id_ = destination.opengl_pbo_id; + opengl_pbo_id_ = display_interop.opengl_pbo_id; buffer_area_ = new_buffer_area; } diff --git a/intern/cycles/device/cuda/graphics_interop.h b/intern/cycles/device/cuda/graphics_interop.h index 8a70c8aa71d..ec480f20c86 100644 --- a/intern/cycles/device/cuda/graphics_interop.h +++ b/intern/cycles/device/cuda/graphics_interop.h @@ -41,7 +41,7 @@ class CUDADeviceGraphicsInterop : public DeviceGraphicsInterop { CUDADeviceGraphicsInterop &operator=(const CUDADeviceGraphicsInterop &other) = delete; CUDADeviceGraphicsInterop &operator=(CUDADeviceGraphicsInterop &&other) = delete; - virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override; + virtual void set_display_interop(const DisplayDriver::GraphicsInterop &display_interop) override; virtual device_ptr map() override; virtual void unmap() override; diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp index b7f86c10553..1149a835b14 100644 --- a/intern/cycles/device/cuda/queue.cpp +++ b/intern/cycles/device/cuda/queue.cpp @@ -116,18 +116,18 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar } /* Launch kernel. 
*/ - cuda_device_assert(cuda_device_, - cuLaunchKernel(cuda_kernel.function, - num_blocks, - 1, - 1, - num_threads_per_block, - 1, - 1, - shared_mem_bytes, - cuda_stream_, - args, - 0)); + assert_success(cuLaunchKernel(cuda_kernel.function, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + shared_mem_bytes, + cuda_stream_, + args, + 0), + "enqueue"); return !(cuda_device_->have_error()); } @@ -139,7 +139,8 @@ bool CUDADeviceQueue::synchronize() } const CUDAContextScope scope(cuda_device_); - cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_)); + assert_success(cuStreamSynchronize(cuda_stream_), "synchronize"); + debug_synchronize(); return !(cuda_device_->have_error()); @@ -162,9 +163,9 @@ void CUDADeviceQueue::zero_to_device(device_memory &mem) assert(mem.device_pointer != 0); const CUDAContextScope scope(cuda_device_); - cuda_device_assert( - cuda_device_, - cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_)); + assert_success( + cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_), + "zero_to_device"); } void CUDADeviceQueue::copy_to_device(device_memory &mem) @@ -185,10 +186,10 @@ void CUDADeviceQueue::copy_to_device(device_memory &mem) /* Copy memory to device. */ const CUDAContextScope scope(cuda_device_); - cuda_device_assert( - cuda_device_, + assert_success( cuMemcpyHtoDAsync( - (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_)); + (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_), + "copy_to_device"); } void CUDADeviceQueue::copy_from_device(device_memory &mem) @@ -204,10 +205,19 @@ void CUDADeviceQueue::copy_from_device(device_memory &mem) /* Copy memory from device. 
*/ const CUDAContextScope scope(cuda_device_); - cuda_device_assert( - cuda_device_, + assert_success( cuMemcpyDtoHAsync( - mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_)); + mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_), + "copy_from_device"); +} + +void CUDADeviceQueue::assert_success(CUresult result, const char *operation) +{ + if (result != CUDA_SUCCESS) { + const char *name = cuewErrorString(result); + cuda_device_->set_error(string_printf( + "%s in CUDA queue %s (%s)", name, operation, debug_active_kernels().c_str())); + } } unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create() diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h index 62e3aa3d6c2..4d1995ed69e 100644 --- a/intern/cycles/device/cuda/queue.h +++ b/intern/cycles/device/cuda/queue.h @@ -60,6 +60,8 @@ class CUDADeviceQueue : public DeviceQueue { protected: CUDADevice *cuda_device_; CUstream cuda_stream_; + + void assert_success(CUresult result, const char *operation); }; CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 6ccedcf54ef..81574e8b184 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -25,6 +25,7 @@ #include "device/cpu/device.h" #include "device/cuda/device.h" #include "device/dummy/device.h" +#include "device/hip/device.h" #include "device/multi/device.h" #include "device/optix/device.h" @@ -32,7 +33,6 @@ #include "util/util_half.h" #include "util/util_logging.h" #include "util/util_math.h" -#include "util/util_opengl.h" #include "util/util_string.h" #include "util/util_system.h" #include "util/util_time.h" @@ -47,6 +47,7 @@ thread_mutex Device::device_mutex; vector<DeviceInfo> Device::cuda_devices; vector<DeviceInfo> Device::optix_devices; vector<DeviceInfo> Device::cpu_devices; +vector<DeviceInfo> Device::hip_devices; uint Device::devices_initialized_mask = 0; /* Device 
*/ @@ -97,6 +98,14 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler) device = device_optix_create(info, stats, profiler); break; #endif + +#ifdef WITH_HIP + case DEVICE_HIP: + if (device_hip_init()) + device = device_hip_create(info, stats, profiler); + break; +#endif + default: break; } @@ -118,6 +127,8 @@ DeviceType Device::type_from_string(const char *name) return DEVICE_OPTIX; else if (strcmp(name, "MULTI") == 0) return DEVICE_MULTI; + else if (strcmp(name, "HIP") == 0) + return DEVICE_HIP; return DEVICE_NONE; } @@ -132,6 +143,8 @@ string Device::string_from_type(DeviceType type) return "OPTIX"; else if (type == DEVICE_MULTI) return "MULTI"; + else if (type == DEVICE_HIP) + return "HIP"; return ""; } @@ -146,6 +159,10 @@ vector<DeviceType> Device::available_types() #ifdef WITH_OPTIX types.push_back(DEVICE_OPTIX); #endif +#ifdef WITH_HIP + types.push_back(DEVICE_HIP); +#endif + return types; } @@ -187,6 +204,20 @@ vector<DeviceInfo> Device::available_devices(uint mask) } #endif +#ifdef WITH_HIP + if (mask & DEVICE_MASK_HIP) { + if (!(devices_initialized_mask & DEVICE_MASK_HIP)) { + if (device_hip_init()) { + device_hip_info(hip_devices); + } + devices_initialized_mask |= DEVICE_MASK_HIP; + } + foreach (DeviceInfo &info, hip_devices) { + devices.push_back(info); + } + } +#endif + if (mask & DEVICE_MASK_CPU) { if (!(devices_initialized_mask & DEVICE_MASK_CPU)) { device_cpu_info(cpu_devices); @@ -227,6 +258,15 @@ string Device::device_capabilities(uint mask) } #endif +#ifdef WITH_HIP + if (mask & DEVICE_MASK_HIP) { + if (device_hip_init()) { + capabilities += "\nHIP device capabilities:\n"; + capabilities += device_hip_capabilities(); + } + } +#endif + return capabilities; } @@ -315,6 +355,7 @@ void Device::free_memory() devices_initialized_mask = 0; cuda_devices.free_memory(); optix_devices.free_memory(); + hip_devices.free_memory(); cpu_devices.free_memory(); } diff --git a/intern/cycles/device/device.h 
b/intern/cycles/device/device.h index 399d5eb91df..c73d74cdccc 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -51,6 +51,7 @@ enum DeviceType { DEVICE_CUDA, DEVICE_MULTI, DEVICE_OPTIX, + DEVICE_HIP, DEVICE_DUMMY, }; @@ -58,6 +59,7 @@ enum DeviceTypeMask { DEVICE_MASK_CPU = (1 << DEVICE_CPU), DEVICE_MASK_CUDA = (1 << DEVICE_CUDA), DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX), + DEVICE_MASK_HIP = (1 << DEVICE_HIP), DEVICE_MASK_ALL = ~0 }; @@ -119,7 +121,7 @@ class Device { string error_msg; - virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/) + virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, size_t /*offset*/, size_t /*size*/) { /* Only required for devices that implement denoising. */ assert(false); @@ -273,7 +275,7 @@ class Device { virtual void mem_alloc(device_memory &mem) = 0; virtual void mem_copy_to(device_memory &mem) = 0; - virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) = 0; + virtual void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) = 0; virtual void mem_zero(device_memory &mem) = 0; virtual void mem_free(device_memory &mem) = 0; @@ -284,6 +286,7 @@ class Device { static vector<DeviceInfo> cuda_devices; static vector<DeviceInfo> optix_devices; static vector<DeviceInfo> cpu_devices; + static vector<DeviceInfo> hip_devices; static uint devices_initialized_mask; }; diff --git a/intern/cycles/device/device_graphics_interop.h b/intern/cycles/device/device_graphics_interop.h index 671b1c189d7..eaf76077141 100644 --- a/intern/cycles/device/device_graphics_interop.h +++ b/intern/cycles/device/device_graphics_interop.h @@ -16,25 +16,12 @@ #pragma once +#include "render/display_driver.h" + #include "util/util_types.h" CCL_NAMESPACE_BEGIN -/* Information about interoperability destination. - * Is provided by the GPUDisplay. */ -class DeviceGraphicsInteropDestination { - public: - /* Dimensions of the buffer, in pixels. 
*/ - int buffer_width = 0; - int buffer_height = 0; - - /* OpenGL pixel buffer object. */ - int opengl_pbo_id = 0; - - /* Clear the entire destination before doing partial write to it. */ - bool need_clear = false; -}; - /* Device-side graphics interoperability support. * * Takes care of holding all the handlers needed by the device to implement interoperability with @@ -46,7 +33,7 @@ class DeviceGraphicsInterop { /* Update this device-side graphics interoperability object with the given destination resource * information. */ - virtual void set_destination(const DeviceGraphicsInteropDestination &destination) = 0; + virtual void set_display_interop(const DisplayDriver::GraphicsInterop &display_interop) = 0; virtual device_ptr map() = 0; virtual void unmap() = 0; diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp index c4d45829b83..c0ab2e17cae 100644 --- a/intern/cycles/device/device_memory.cpp +++ b/intern/cycles/device/device_memory.cpp @@ -136,7 +136,7 @@ void device_memory::device_copy_to() } } -void device_memory::device_copy_from(int y, int w, int h, int elem) +void device_memory::device_copy_from(size_t y, size_t w, size_t h, size_t elem) { assert(type != MEM_TEXTURE && type != MEM_READ_ONLY && type != MEM_GLOBAL); device->mem_copy_from(*this, y, w, h, elem); @@ -181,7 +181,7 @@ bool device_memory::is_resident(Device *sub_device) const /* Device Sub Ptr */ -device_sub_ptr::device_sub_ptr(device_memory &mem, int offset, int size) : device(mem.device) +device_sub_ptr::device_sub_ptr(device_memory &mem, size_t offset, size_t size) : device(mem.device) { ptr = device->mem_alloc_sub_ptr(mem, offset, size); } diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index c51594b8580..be6123e09b2 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -81,154 +81,154 @@ static constexpr size_t datatype_size(DataType datatype) template<typename T> struct 
device_type_traits { static const DataType data_type = TYPE_UNKNOWN; - static const int num_elements_cpu = sizeof(T); - static const int num_elements_gpu = sizeof(T); + static const size_t num_elements_cpu = sizeof(T); + static const size_t num_elements_gpu = sizeof(T); }; template<> struct device_type_traits<uchar> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements_cpu = 1; - static const int num_elements_gpu = 1; + static const size_t num_elements_cpu = 1; + static const size_t num_elements_gpu = 1; static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar2> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements_cpu = 2; - static const int num_elements_gpu = 2; + static const size_t num_elements_cpu = 2; + static const size_t num_elements_gpu = 2; static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar3> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements_cpu = 3; - static const int num_elements_gpu = 3; + static const size_t num_elements_cpu = 3; + static const size_t num_elements_gpu = 3; static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uchar4> { static const DataType data_type = TYPE_UCHAR; - static const int num_elements_cpu = 4; - static const int num_elements_gpu = 4; + static const size_t num_elements_cpu = 4; + static const size_t num_elements_gpu = 4; static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint> { static const DataType data_type = TYPE_UINT; - static const int num_elements_cpu = 1; - static const int num_elements_gpu = 1; + static const size_t num_elements_cpu = 1; + static const size_t num_elements_gpu = 1; static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type)); }; template<> 
struct device_type_traits<uint2> { static const DataType data_type = TYPE_UINT; - static const int num_elements_cpu = 2; - static const int num_elements_gpu = 2; + static const size_t num_elements_cpu = 2; + static const size_t num_elements_gpu = 2; static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint3> { static const DataType data_type = TYPE_UINT; - static const int num_elements_cpu = 3; - static const int num_elements_gpu = 3; + static const size_t num_elements_cpu = 3; + static const size_t num_elements_gpu = 3; static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint4> { static const DataType data_type = TYPE_UINT; - static const int num_elements_cpu = 4; - static const int num_elements_gpu = 4; + static const size_t num_elements_cpu = 4; + static const size_t num_elements_gpu = 4; static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int> { static const DataType data_type = TYPE_INT; - static const int num_elements_cpu = 1; - static const int num_elements_gpu = 1; + static const size_t num_elements_cpu = 1; + static const size_t num_elements_gpu = 1; static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int2> { static const DataType data_type = TYPE_INT; - static const int num_elements_cpu = 2; - static const int num_elements_gpu = 2; + static const size_t num_elements_cpu = 2; + static const size_t num_elements_gpu = 2; static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<int3> { static const DataType data_type = TYPE_INT; - static const int num_elements_cpu = 4; - static const int num_elements_gpu = 3; + static const size_t num_elements_cpu = 4; + static const size_t num_elements_gpu = 3; static_assert(sizeof(int3) == num_elements_cpu * 
datatype_size(data_type)); }; template<> struct device_type_traits<int4> { static const DataType data_type = TYPE_INT; - static const int num_elements_cpu = 4; - static const int num_elements_gpu = 4; + static const size_t num_elements_cpu = 4; + static const size_t num_elements_gpu = 4; static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements_cpu = 1; - static const int num_elements_gpu = 1; + static const size_t num_elements_cpu = 1; + static const size_t num_elements_gpu = 1; static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float2> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements_cpu = 2; - static const int num_elements_gpu = 2; + static const size_t num_elements_cpu = 2; + static const size_t num_elements_gpu = 2; static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float3> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements_cpu = 4; - static const int num_elements_gpu = 3; + static const size_t num_elements_cpu = 4; + static const size_t num_elements_gpu = 3; static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<float4> { static const DataType data_type = TYPE_FLOAT; - static const int num_elements_cpu = 4; - static const int num_elements_gpu = 4; + static const size_t num_elements_cpu = 4; + static const size_t num_elements_gpu = 4; static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<half> { static const DataType data_type = TYPE_HALF; - static const int num_elements_cpu = 1; - static const int num_elements_gpu = 1; + static const size_t num_elements_cpu = 1; + static const size_t num_elements_gpu = 
1; static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<ushort4> { static const DataType data_type = TYPE_UINT16; - static const int num_elements_cpu = 4; - static const int num_elements_gpu = 4; + static const size_t num_elements_cpu = 4; + static const size_t num_elements_gpu = 4; static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint16_t> { static const DataType data_type = TYPE_UINT16; - static const int num_elements_cpu = 1; - static const int num_elements_gpu = 1; + static const size_t num_elements_cpu = 1; + static const size_t num_elements_gpu = 1; static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<half4> { static const DataType data_type = TYPE_HALF; - static const int num_elements_cpu = 4; - static const int num_elements_gpu = 4; + static const size_t num_elements_cpu = 4; + static const size_t num_elements_gpu = 4; static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type)); }; template<> struct device_type_traits<uint64_t> { static const DataType data_type = TYPE_UINT64; - static const int num_elements_cpu = 1; - static const int num_elements_gpu = 1; + static const size_t num_elements_cpu = 1; + static const size_t num_elements_gpu = 1; static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type)); }; @@ -277,6 +277,7 @@ class device_memory { protected: friend class CUDADevice; friend class OptiXDevice; + friend class HIPDevice; /* Only create through subclasses. 
*/ device_memory(Device *device, const char *name, MemoryType type); @@ -296,7 +297,7 @@ class device_memory { void device_alloc(); void device_free(); void device_copy_to(); - void device_copy_from(int y, int w, int h, int elem); + void device_copy_from(size_t y, size_t w, size_t h, size_t elem); void device_zero(); bool device_is_cpu(); @@ -565,7 +566,7 @@ template<typename T> class device_vector : public device_memory { device_copy_from(0, data_width, (data_height == 0) ? 1 : data_height, sizeof(T)); } - void copy_from_device(int y, int w, int h) + void copy_from_device(size_t y, size_t w, size_t h) { device_copy_from(y, w, h, sizeof(T)); } @@ -601,7 +602,7 @@ template<typename T> class device_vector : public device_memory { class device_sub_ptr { public: - device_sub_ptr(device_memory &mem, int offset, int size); + device_sub_ptr(device_memory &mem, size_t offset, size_t size); ~device_sub_ptr(); device_ptr operator*() const diff --git a/intern/cycles/device/device_queue.cpp b/intern/cycles/device/device_queue.cpp index a89ba68d62c..f2b2f3496e0 100644 --- a/intern/cycles/device/device_queue.cpp +++ b/intern/cycles/device/device_queue.cpp @@ -57,8 +57,9 @@ void DeviceQueue::debug_init_execution() { if (VLOG_IS_ON(3)) { last_sync_time_ = time_dt(); - last_kernels_enqueued_ = 0; } + + last_kernels_enqueued_ = 0; } void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size) @@ -66,8 +67,9 @@ void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size) if (VLOG_IS_ON(3)) { VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size " << work_size; - last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel); } + + last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel); } void DeviceQueue::debug_synchronize() @@ -80,8 +82,14 @@ void DeviceQueue::debug_synchronize() stats_kernel_time_[last_kernels_enqueued_] += elapsed_time; last_sync_time_ = new_time; - last_kernels_enqueued_ = 0; } + + last_kernels_enqueued_ = 
0; +} + +string DeviceQueue::debug_active_kernels() +{ + return device_kernel_mask_as_string(last_kernels_enqueued_); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h index edda3e61d51..e6835b787cf 100644 --- a/intern/cycles/device/device_queue.h +++ b/intern/cycles/device/device_queue.h @@ -21,6 +21,7 @@ #include "device/device_graphics_interop.h" #include "util/util_logging.h" #include "util/util_map.h" +#include "util/util_string.h" #include "util/util_unique_ptr.h" CCL_NAMESPACE_BEGIN @@ -101,6 +102,7 @@ class DeviceQueue { void debug_init_execution(); void debug_enqueue(DeviceKernel kernel, const int work_size); void debug_synchronize(); + string debug_active_kernels(); /* Combination of kernels enqueued together sync last synchronize. */ DeviceKernelMask last_kernels_enqueued_; diff --git a/intern/cycles/device/dummy/device.cpp b/intern/cycles/device/dummy/device.cpp index 678276ed025..e3cea272300 100644 --- a/intern/cycles/device/dummy/device.cpp +++ b/intern/cycles/device/dummy/device.cpp @@ -48,7 +48,7 @@ class DummyDevice : public Device { { } - virtual void mem_copy_from(device_memory &, int, int, int, int) override + virtual void mem_copy_from(device_memory &, size_t, size_t, size_t, size_t) override { } diff --git a/intern/cycles/device/hip/device.cpp b/intern/cycles/device/hip/device.cpp new file mode 100644 index 00000000000..90028ac7f10 --- /dev/null +++ b/intern/cycles/device/hip/device.cpp @@ -0,0 +1,276 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "device/hip/device.h" + +#include "util/util_logging.h" + +#ifdef WITH_HIP +# include "device/device.h" +# include "device/hip/device_impl.h" + +# include "util/util_string.h" +# include "util/util_windows.h" +#endif /* WITH_HIP */ + +CCL_NAMESPACE_BEGIN + +bool device_hip_init() +{ +#if !defined(WITH_HIP) + return false; +#elif defined(WITH_HIP_DYNLOAD) + static bool initialized = false; + static bool result = false; + + if (initialized) + return result; + + initialized = true; + int hipew_result = hipewInit(HIPEW_INIT_HIP); + if (hipew_result == HIPEW_SUCCESS) { + VLOG(1) << "HIPEW initialization succeeded"; + if (HIPDevice::have_precompiled_kernels()) { + VLOG(1) << "Found precompiled kernels"; + result = true; + } + else if (hipewCompilerPath() != NULL) { + VLOG(1) << "Found HIPCC " << hipewCompilerPath(); + result = true; + } + else { + VLOG(1) << "Neither precompiled kernels nor HIPCC was found," + << " unable to use HIP"; + } + } + else { + VLOG(1) << "HIPEW initialization failed: " + << ((hipew_result == HIPEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" : + "Error opening the library"); + } + + return result; +#else /* WITH_HIP_DYNLOAD */ + return true; +#endif /* WITH_HIP_DYNLOAD */ +} + +Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ +#ifdef WITH_HIP + return new HIPDevice(info, stats, profiler); +#else + (void)info; + (void)stats; + (void)profiler; + + LOG(FATAL) << "Request to create HIP device without compiled-in support. 
Should never happen."; + + return nullptr; +#endif +} + +#ifdef WITH_HIP +static hipError_t device_hip_safe_init() +{ +# ifdef _WIN32 + __try { + return hipInit(0); + } + __except (EXCEPTION_EXECUTE_HANDLER) { + /* Ignore crashes inside the HIP driver and hope we can + * survive even with corrupted HIP installs. */ + fprintf(stderr, "Cycles HIP: driver crashed, continuing without HIP.\n"); + } + + return hipErrorNoDevice; +# else + return hipInit(0); +# endif +} +#endif /* WITH_HIP */ + +void device_hip_info(vector<DeviceInfo> &devices) +{ +#ifdef WITH_HIP + hipError_t result = device_hip_safe_init(); + if (result != hipSuccess) { + if (result != hipErrorNoDevice) + fprintf(stderr, "HIP hipInit: %s\n", hipewErrorString(result)); + return; + } + + int count = 0; + result = hipGetDeviceCount(&count); + if (result != hipSuccess) { + fprintf(stderr, "HIP hipGetDeviceCount: %s\n", hipewErrorString(result)); + return; + } + + vector<DeviceInfo> display_devices; + + for (int num = 0; num < count; num++) { + char name[256]; + + result = hipDeviceGetName(name, 256, num); + if (result != hipSuccess) { + fprintf(stderr, "HIP :hipDeviceGetName: %s\n", hipewErrorString(result)); + continue; + } + + int major; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, num); + // TODO : (Arya) What is the last major version we are supporting? + + DeviceInfo info; + + info.type = DEVICE_HIP; + info.description = string(name); + info.num = num; + + info.has_half_images = (major >= 3); + info.has_nanovdb = true; + info.denoisers = 0; + + info.has_gpu_queue = true; + /* Check if the device has P2P access to any other device in the system. 
*/ + for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) { + if (num != peer_num) { + int can_access = 0; + hipDeviceCanAccessPeer(&can_access, num, peer_num); + info.has_peer_memory = (can_access != 0); + } + } + + int pci_location[3] = {0, 0, 0}; + hipDeviceGetAttribute(&pci_location[0], hipDeviceAttributePciDomainID, num); + hipDeviceGetAttribute(&pci_location[1], hipDeviceAttributePciBusId, num); + hipDeviceGetAttribute(&pci_location[2], hipDeviceAttributePciDeviceId, num); + info.id = string_printf("HIP_%s_%04x:%02x:%02x", + name, + (unsigned int)pci_location[0], + (unsigned int)pci_location[1], + (unsigned int)pci_location[2]); + + /* If device has a kernel timeout and no compute preemption, we assume + * it is connected to a display and will freeze the display while doing + * computations. */ + int timeout_attr = 0, preempt_attr = 0; + hipDeviceGetAttribute(&timeout_attr, hipDeviceAttributeKernelExecTimeout, num); + + if (timeout_attr && !preempt_attr) { + VLOG(1) << "Device is recognized as display."; + info.description += " (Display)"; + info.display_device = true; + display_devices.push_back(info); + } + else { + VLOG(1) << "Device has compute preemption or is not used for display."; + devices.push_back(info); + } + VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\"."; + } + + if (!display_devices.empty()) + devices.insert(devices.end(), display_devices.begin(), display_devices.end()); +#else /* WITH_HIP */ + (void)devices; +#endif /* WITH_HIP */ +} + +string device_hip_capabilities() +{ +#ifdef WITH_HIP + hipError_t result = device_hip_safe_init(); + if (result != hipSuccess) { + if (result != hipErrorNoDevice) { + return string("Error initializing HIP: ") + hipewErrorString(result); + } + return "No HIP device found\n"; + } + + int count; + result = hipGetDeviceCount(&count); + if (result != hipSuccess) { + return string("Error getting devices: ") + hipewErrorString(result); + } + + string capabilities = 
""; + for (int num = 0; num < count; num++) { + char name[256]; + if (hipDeviceGetName(name, 256, num) != hipSuccess) { + continue; + } + capabilities += string("\t") + name + "\n"; + int value; +# define GET_ATTR(attr) \ + { \ + if (hipDeviceGetAttribute(&value, hipDeviceAttribute##attr, num) == hipSuccess) { \ + capabilities += string_printf("\t\thipDeviceAttribute" #attr "\t\t\t%d\n", value); \ + } \ + } \ + (void)0 + /* TODO(sergey): Strip all attributes which are not useful for us + * or does not depend on the driver. + */ + GET_ATTR(MaxThreadsPerBlock); + GET_ATTR(MaxBlockDimX); + GET_ATTR(MaxBlockDimY); + GET_ATTR(MaxBlockDimZ); + GET_ATTR(MaxGridDimX); + GET_ATTR(MaxGridDimY); + GET_ATTR(MaxGridDimZ); + GET_ATTR(MaxSharedMemoryPerBlock); + GET_ATTR(TotalConstantMemory); + GET_ATTR(WarpSize); + GET_ATTR(MaxPitch); + GET_ATTR(MaxRegistersPerBlock); + GET_ATTR(ClockRate); + GET_ATTR(TextureAlignment); + GET_ATTR(MultiprocessorCount); + GET_ATTR(KernelExecTimeout); + GET_ATTR(Integrated); + GET_ATTR(CanMapHostMemory); + GET_ATTR(ComputeMode); + GET_ATTR(MaxTexture1DWidth); + GET_ATTR(MaxTexture2DWidth); + GET_ATTR(MaxTexture2DHeight); + GET_ATTR(MaxTexture3DWidth); + GET_ATTR(MaxTexture3DHeight); + GET_ATTR(MaxTexture3DDepth); + GET_ATTR(ConcurrentKernels); + GET_ATTR(EccEnabled); + GET_ATTR(MemoryClockRate); + GET_ATTR(MemoryBusWidth); + GET_ATTR(L2CacheSize); + GET_ATTR(MaxThreadsPerMultiProcessor); + GET_ATTR(ComputeCapabilityMajor); + GET_ATTR(ComputeCapabilityMinor); + GET_ATTR(MaxSharedMemoryPerMultiprocessor); + GET_ATTR(ManagedMemory); + GET_ATTR(IsMultiGpuBoard); +# undef GET_ATTR + capabilities += "\n"; + } + + return capabilities; + +#else /* WITH_HIP */ + return ""; +#endif /* WITH_HIP */ +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/hip/device.h b/intern/cycles/device/hip/device.h new file mode 100644 index 00000000000..965fd9e484b --- /dev/null +++ b/intern/cycles/device/hip/device.h @@ -0,0 +1,37 @@ +/* + * Copyright 2011-2021 Blender 
Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_string.h" +#include "util/util_vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_hip_init(); + +Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_hip_info(vector<DeviceInfo> &devices); + +string device_hip_capabilities(); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp new file mode 100644 index 00000000000..0e5ac6ce401 --- /dev/null +++ b/intern/cycles/device/hip/device_impl.cpp @@ -0,0 +1,1343 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef WITH_HIP + +# include <climits> +# include <limits.h> +# include <stdio.h> +# include <stdlib.h> +# include <string.h> + +# include "device/hip/device_impl.h" + +# include "render/buffers.h" + +# include "util/util_debug.h" +# include "util/util_foreach.h" +# include "util/util_logging.h" +# include "util/util_map.h" +# include "util/util_md5.h" +# include "util/util_opengl.h" +# include "util/util_path.h" +# include "util/util_string.h" +# include "util/util_system.h" +# include "util/util_time.h" +# include "util/util_types.h" +# include "util/util_windows.h" + +CCL_NAMESPACE_BEGIN + +class HIPDevice; + +bool HIPDevice::have_precompiled_kernels() +{ + string fatbins_path = path_get("lib"); + return path_exists(fatbins_path); +} + +bool HIPDevice::show_samples() const +{ + /* The HIPDevice only processes one tile at a time, so showing samples is fine. */ + return true; +} + +BVHLayoutMask HIPDevice::get_bvh_layout_mask() const +{ + return BVH_LAYOUT_BVH2; +} + +void HIPDevice::set_error(const string &error) +{ + Device::set_error(error); + + if (first_error) { + fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n"); + fprintf(stderr, + "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n"); + first_error = false; + } +} + +HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) + : Device(info, stats, profiler), texture_info(this, "__texture_info", MEM_GLOBAL) +{ + first_error = true; + + hipDevId = info.num; + hipDevice = 0; + hipContext = 0; + + hipModule = 0; + + need_texture_info = false; + + device_texture_headroom = 0; + device_working_headroom = 0; + move_texture_to_host = false; + map_host_limit = 0; + map_host_used = 0; + can_map_host = 0; + pitch_alignment = 0; + + /* Initialize HIP. 
*/ + hipError_t result = hipInit(0); + if (result != hipSuccess) { + set_error(string_printf("Failed to initialize HIP runtime (%s)", hipewErrorString(result))); + return; + } + + /* Setup device and context. */ + result = hipGetDevice(&hipDevice, hipDevId); + if (result != hipSuccess) { + set_error(string_printf("Failed to get HIP device handle from ordinal (%s)", + hipewErrorString(result))); + return; + } + + hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice)); + + hip_assert( + hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice)); + + unsigned int ctx_flags = hipDeviceLmemResizeToMax; + if (can_map_host) { + ctx_flags |= hipDeviceMapHost; + init_host_memory(); + } + + /* Create context. */ + result = hipCtxCreate(&hipContext, ctx_flags, hipDevice); + + if (result != hipSuccess) { + set_error(string_printf("Failed to create HIP context (%s)", hipewErrorString(result))); + return; + } + + int major, minor; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); + hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); + hipDevArchitecture = major * 100 + minor * 10; + + /* Pop context set by hipCtxCreate. */ + hipCtxPopCurrent(NULL); +} + +HIPDevice::~HIPDevice() +{ + texture_info.free(); + + hip_assert(hipCtxDestroy(hipContext)); +} + +bool HIPDevice::support_device(const uint /*kernel_features*/) +{ + int major, minor; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); + hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); + + // TODO : (Arya) What versions do we plan to support? 
+ return true; +} + +bool HIPDevice::check_peer_access(Device *peer_device) +{ + if (peer_device == this) { + return false; + } + if (peer_device->info.type != DEVICE_HIP && peer_device->info.type != DEVICE_OPTIX) { + return false; + } + + HIPDevice *const peer_device_hip = static_cast<HIPDevice *>(peer_device); + + int can_access = 0; + hip_assert(hipDeviceCanAccessPeer(&can_access, hipDevice, peer_device_hip->hipDevice)); + if (can_access == 0) { + return false; + } + + // Ensure array access over the link is possible as well (for 3D textures) + hip_assert(hipDeviceGetP2PAttribute( + &can_access, hipDevP2PAttrHipArrayAccessSupported, hipDevice, peer_device_hip->hipDevice)); + if (can_access == 0) { + return false; + } + + // Enable peer access in both directions + { + const HIPContextScope scope(this); + hipError_t result = hipCtxEnablePeerAccess(peer_device_hip->hipContext, 0); + if (result != hipSuccess) { + set_error(string_printf("Failed to enable peer access on HIP context (%s)", + hipewErrorString(result))); + return false; + } + } + { + const HIPContextScope scope(peer_device_hip); + hipError_t result = hipCtxEnablePeerAccess(hipContext, 0); + if (result != hipSuccess) { + set_error(string_printf("Failed to enable peer access on HIP context (%s)", + hipewErrorString(result))); + return false; + } + } + + return true; +} + +bool HIPDevice::use_adaptive_compilation() +{ + return DebugFlags().hip.adaptive_compile; +} + +/* Common NVCC flags which stays the same regardless of shading model, + * kernel sources md5 and only depends on compiler or compilation settings. 
+ */ +string HIPDevice::compile_kernel_get_common_cflags(const uint kernel_features) +{ + const int machine = system_cpu_bits(); + const string source_path = path_get("source"); + const string include_path = source_path; + string cflags = string_printf( + "-m%d " + "--ptxas-options=\"-v\" " + "--use_fast_math " + "-DHIPCC " + "-I\"%s\"", + machine, + include_path.c_str()); + if (use_adaptive_compilation()) { + cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features); + } + return cflags; +} + +string HIPDevice::compile_kernel(const uint kernel_features, + const char *name, + const char *base, + bool force_ptx) +{ + /* Compute kernel name. */ + int major, minor; + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId); + hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId); + + /* Attempt to use kernel provided with Blender. */ + if (!use_adaptive_compilation()) { + if (!force_ptx) { + const string fatbin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << fatbin << "."; + if (path_exists(fatbin)) { + VLOG(1) << "Using precompiled kernel."; + return fatbin; + } + } + + /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */ + int ptx_major = major, ptx_minor = minor; + while (ptx_major >= 3) { + const string ptx = path_get( + string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + return ptx; + } + + if (ptx_minor > 0) { + ptx_minor--; + } + else { + ptx_major--; + ptx_minor = 9; + } + } + } + + /* Try to use locally compiled kernel. 
*/ + string source_path = path_get("source"); + const string source_md5 = path_files_md5_hash(source_path); + + /* We include cflags into md5 so changing hip toolkit or changing other + * compiler command line arguments makes sure fatbin gets re-built. + */ + string common_cflags = compile_kernel_get_common_cflags(kernel_features); + const string kernel_md5 = util_md5_string(source_md5 + common_cflags); + + const char *const kernel_ext = "genco"; +# ifdef _WIN32 + const char *const options = + "save-temps -Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp"; +# else + const char *const options = + "save-temps -Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp -O3 -ggdb"; +# endif + const string include_path = source_path; + const char *const kernel_arch = force_ptx ? "compute" : "sm"; + const string fatbin_file = string_printf( + "cycles_%s_%s_%d%d_%s", name, kernel_arch, major, minor, kernel_md5.c_str()); + const string fatbin = path_cache_get(path_join("kernels", fatbin_file)); + VLOG(1) << "Testing for locally compiled kernel " << fatbin << "."; + if (path_exists(fatbin)) { + VLOG(1) << "Using locally compiled kernel."; + return fatbin; + } + +# ifdef _WIN32 + if (!use_adaptive_compilation() && have_precompiled_kernels()) { + if (major < 3) { + set_error( + string_printf("HIP backend requires compute capability 3.0 or up, but found %d.%d. " + "Your GPU is not supported.", + major, + minor)); + } + else { + set_error( + string_printf("HIP binary kernel for this graphics card compute " + "capability (%d.%d) not found.", + major, + minor)); + } + return string(); + } +# endif + + /* Compile. */ + const char *const hipcc = hipewCompilerPath(); + if (hipcc == NULL) { + set_error( + "HIP hipcc compiler not found. 
" + "Install HIP toolkit in default location."); + return string(); + } + + const int hipcc_hip_version = hipewCompilerVersion(); + VLOG(1) << "Found hipcc " << hipcc << ", HIP version " << hipcc_hip_version << "."; + if (hipcc_hip_version < 40) { + printf( + "Unsupported HIP version %d.%d detected, " + "you need HIP 4.0 or newer.\n", + hipcc_hip_version / 10, + hipcc_hip_version % 10); + return string(); + } + + double starttime = time_dt(); + + path_create_directories(fatbin); + + source_path = path_join(path_join(source_path, "kernel"), + path_join("device", path_join(base, string_printf("%s.cpp", name)))); + + string command = string_printf("%s -%s -I %s --%s %s -o \"%s\"", + hipcc, + options, + include_path.c_str(), + kernel_ext, + source_path.c_str(), + fatbin.c_str()); + + printf("Compiling HIP kernel ...\n%s\n", command.c_str()); + +# ifdef _WIN32 + command = "call " + command; +# endif + if (system(command.c_str()) != 0) { + set_error( + "Failed to execute compilation command, " + "see console for details."); + return string(); + } + + /* Verify if compilation succeeded */ + if (!path_exists(fatbin)) { + set_error( + "HIP kernel compilation failed, " + "see console for details."); + return string(); + } + + printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime); + + return fatbin; +} + +bool HIPDevice::load_kernels(const uint kernel_features) +{ + /* TODO(sergey): Support kernels re-load for HIP devices. + * + * Currently re-loading kernel will invalidate memory pointers, + * causing problems in hipCtxSynchronize. 
+ */ + if (hipModule) { + VLOG(1) << "Skipping kernel reload, not currently supported."; + return true; + } + + /* check if hip init succeeded */ + if (hipContext == 0) + return false; + + /* check if GPU is supported */ + if (!support_device(kernel_features)) + return false; + + /* get kernel */ + const char *kernel_name = "kernel"; + string fatbin = compile_kernel(kernel_features, kernel_name); + if (fatbin.empty()) + return false; + + /* open module */ + HIPContextScope scope(this); + + string fatbin_data; + hipError_t result; + + if (path_read_text(fatbin, fatbin_data)) + result = hipModuleLoadData(&hipModule, fatbin_data.c_str()); + else + result = hipErrorFileNotFound; + + if (result != hipSuccess) + set_error(string_printf( + "Failed to load HIP kernel from '%s' (%s)", fatbin.c_str(), hipewErrorString(result))); + + if (result == hipSuccess) { + kernels.load(this); + reserve_local_memory(kernel_features); + } + + return (result == hipSuccess); +} + +void HIPDevice::reserve_local_memory(const uint) +{ + /* Together with hipDeviceLmemResizeToMax, this reserves local memory + * needed for kernel launches, so that we can reliably figure out when + * to allocate scene data in mapped host memory. */ + size_t total = 0, free_before = 0, free_after = 0; + + { + HIPContextScope scope(this); + hipMemGetInfo(&free_before, &total); + } + + { + /* Use the biggest kernel for estimation. */ + const DeviceKernel test_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE; + + /* Launch kernel, using just 1 block appears sufficient to reserve memory for all + * multiprocessors. It would be good to do this in parallel for the multi GPU case + * still to make it faster. 
*/ + HIPDeviceQueue queue(this); + + void *d_path_index = nullptr; + void *d_render_buffer = nullptr; + int d_work_size = 0; + void *args[] = {&d_path_index, &d_render_buffer, &d_work_size}; + + queue.init_execution(); + queue.enqueue(test_kernel, 1, args); + queue.synchronize(); + } + + { + HIPContextScope scope(this); + hipMemGetInfo(&free_after, &total); + } + + VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after) + << " bytes. (" << string_human_readable_size(free_before - free_after) << ")"; + +# if 0 + /* For testing mapped host memory, fill up device memory. */ + const size_t keep_mb = 1024; + + while (free_after > keep_mb * 1024 * 1024LL) { + hipDeviceptr_t tmp; + hip_assert(hipMalloc(&tmp, 10 * 1024 * 1024LL)); + hipMemGetInfo(&free_after, &total); + } +# endif +} + +void HIPDevice::init_host_memory() +{ + /* Limit amount of host mapped memory, because allocating too much can + * cause system instability. Leave at least half or 4 GB of system + * memory free, whichever is smaller. */ + size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + + if (system_ram > 0) { + if (system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG(1) << "Mapped host memory disabled, failed to get system RAM"; + map_host_limit = 0; + } + + /* Amount of device memory to keep is free after texture memory + * and working memory allocations respectively. We set the working + * memory limit headroom lower so that some space is left after all + * texture memory allocations. */ + device_working_headroom = 32 * 1024 * 1024LL; // 32MB + device_texture_headroom = 128 * 1024 * 1024LL; // 128MB + + VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes. 
(" << string_human_readable_size(map_host_limit) << ")"; +} + +void HIPDevice::load_texture_info() +{ + if (need_texture_info) { + /* Unset flag before copying, so this does not loop indefinitely if the copy below calls + * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ + need_texture_info = false; + texture_info.copy_to_device(); + } +} + +void HIPDevice::move_textures_to_host(size_t size, bool for_texture) +{ + /* Break out of recursive call, which can happen when moving memory on a multi device. */ + static bool any_device_moving_textures_to_host = false; + if (any_device_moving_textures_to_host) { + return; + } + + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while (size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + thread_scoped_lock lock(hip_mem_map_mutex); + foreach (HIPMemMap::value_type &pair, hip_mem_map) { + device_memory &mem = *pair.first; + HIPMem *cmem = &pair.second; + + /* Can only move textures allocated on this device (and not those from peer devices). + * And need to ignore memory that is already on the host. */ + if (!mem.is_resident(this) || cmem->use_mapped_host) { + continue; + } + + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && + (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if (!is_texture || cmem->array) { + continue; + } + + /* For other textures, only move image textures. */ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + lock.unlock(); + + /* Move to host memory. 
This part is mutex protected since + * multiple HIP devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if (max_mem) { + VLOG(1) << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + any_device_moving_textures_to_host = true; + + /* Potentially need to call back into multi device, so pointer mapping + * and peer devices are updated. This is also necessary since the device + * pointer may just be a key here, so cannot be accessed and freed directly. + * Unfortunately it does mean that memory is reallocated on all other + * devices as well, which is potentially dangerous when still in use (since + * a thread rendering on another devices would only be caught in this mutex + * if it so happens to do an allocation at the same time as well. */ + max_mem->device_copy_to(); + size = (max_size >= size) ? 0 : size - max_size; + + any_device_moving_textures_to_host = false; + } + else { + break; + } + } + + /* Unset flag before texture info is reloaded, since it should stay in device memory. */ + move_texture_to_host = false; + + /* Update texture info array with new pointers. */ + load_texture_info(); +} + +HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding) +{ + HIPContextScope scope(this); + + hipDeviceptr_t device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + hipError_t mem_alloc_result = hipErrorOutOfMemory; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. 
*/ + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; + + size_t total = 0, free = 0; + hipMemGetInfo(&free, &total); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { + move_textures_to_host(size + headroom - free, is_texture); + hipMemGetInfo(&free, &total); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = hipMalloc(&device_pointer, size); + if (mem_alloc_result == hipSuccess) { + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + + void *shared_pointer = 0; + + if (mem_alloc_result != hipSuccess && can_map_host) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = hipSuccess; + shared_pointer = mem.shared_pointer; + } + else if (map_host_used + size < map_host_limit) { + /* Allocate host memory ourselves. */ + mem_alloc_result = hipHostMalloc(&shared_pointer, size); + + assert((mem_alloc_result == hipSuccess && shared_pointer != 0) || + (mem_alloc_result != hipSuccess && shared_pointer == 0)); + } + + if (mem_alloc_result == hipSuccess) { + hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0)); + map_host_used += size; + status = " in host memory"; + } + } + + if (mem_alloc_result != hipSuccess) { + status = " failed, out of device and host memory"; + set_error("System is out of GPU and shared host memory"); + } + + if (mem.name) { + VLOG(1) << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. 
(" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + thread_scoped_lock lock(hip_mem_map_mutex); + HIPMem *cmem = &hip_mem_map[&mem]; + if (shared_pointer != 0) { + /* Replace host pointer with our host allocation. Only works if + * HIP memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. */ + + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != shared_pointer) { + memcpy(shared_pointer, mem.host_pointer, size); + + /* A Call to device_memory::host_free() should be preceded by + * a call to device_memory::device_free() for host memory + * allocated by a device to be handled properly. Two exceptions + * are here and a call in OptiXDevice::generic_alloc(), where + * the current host memory can be assumed to be allocated by + * device_memory::host_alloc(), not by a device */ + + mem.host_free(); + mem.host_pointer = shared_pointer; + } + mem.shared_pointer = shared_pointer; + mem.shared_counter++; + cmem->use_mapped_host = true; + } + else { + cmem->use_mapped_host = false; + } + + return cmem; +} + +void HIPDevice::generic_copy_to(device_memory &mem) +{ + if (!mem.host_pointer || !mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, the current device only uses device memory allocated by + * hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from + * mem.host_pointer. 
*/ + thread_scoped_lock lock(hip_mem_map_mutex); + if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const HIPContextScope scope(this); + hip_assert( + hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size())); + } +} + +void HIPDevice::generic_free(device_memory &mem) +{ + if (mem.device_pointer) { + HIPContextScope scope(this); + thread_scoped_lock lock(hip_mem_map_mutex); + const HIPMem &cmem = hip_mem_map[&mem]; + + /* If cmem.use_mapped_host is true, reference counting is used + * to safely free a mapped host memory. */ + + if (cmem.use_mapped_host) { + assert(mem.shared_pointer); + if (mem.shared_pointer) { + assert(mem.shared_counter > 0); + if (--mem.shared_counter == 0) { + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + hipHostFree(mem.shared_pointer); + mem.shared_pointer = 0; + } + } + map_host_used -= mem.device_size; + } + else { + /* Free device memory. */ + hip_assert(hipFree(mem.device_pointer)); + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + hip_mem_map.erase(hip_mem_map.find(&mem)); + } +} + +void HIPDevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + generic_alloc(mem); + } +} + +void HIPDevice::mem_copy_to(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) { + generic_alloc(mem); + } + generic_copy_to(mem); + } +} + +void HIPDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) +{ + if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { + assert(!"mem_copy_from not supported for textures."); + } 
+ else if (mem.host_pointer) { + const size_t size = elem * w * h; + const size_t offset = elem * y * w; + + if (mem.device_pointer) { + const HIPContextScope scope(this); + hip_assert(hipMemcpyDtoH( + (char *)mem.host_pointer + offset, (hipDeviceptr_t)mem.device_pointer + offset, size)); + } + else { + memset((char *)mem.host_pointer + offset, 0, size); + } + } +} + +void HIPDevice::mem_zero(device_memory &mem) +{ + if (!mem.device_pointer) { + mem_alloc(mem); + } + if (!mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory + * regardless of mem.host_pointer and mem.shared_pointer. */ + thread_scoped_lock lock(hip_mem_map_mutex); + if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + const HIPContextScope scope(this); + hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size())); + } + else if (mem.host_pointer) { + memset(mem.host_pointer, 0, mem.memory_size()); + } +} + +void HIPDevice::mem_free(device_memory &mem) +{ + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else { + generic_free(mem); + } +} + +device_ptr HIPDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) +{ + return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset)); +} + +void HIPDevice::const_copy_to(const char *name, void *host, size_t size) +{ + HIPContextScope scope(this); + hipDeviceptr_t mem; + size_t bytes; + + hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, name)); + assert(bytes == size); + hip_assert(hipMemcpyHtoD(mem, host, size)); +} + +void HIPDevice::global_alloc(device_memory &mem) +{ + if (mem.is_resident(this)) { + generic_alloc(mem); + generic_copy_to(mem); + } + + const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); +} + +void HIPDevice::global_free(device_memory &mem) +{ + if 
(mem.is_resident(this) && mem.device_pointer) { + generic_free(mem); + } +} + +void HIPDevice::tex_alloc(device_texture &mem) +{ + HIPContextScope scope(this); + + /* General variables for both architectures */ + string bind_name = mem.name; + size_t dsize = datatype_size(mem.data_type); + size_t size = mem.memory_size(); + + hipTextureAddressMode address_mode = hipAddressModeWrap; + switch (mem.info.extension) { + case EXTENSION_REPEAT: + address_mode = hipAddressModeWrap; + break; + case EXTENSION_EXTEND: + address_mode = hipAddressModeClamp; + break; + case EXTENSION_CLIP: + // TODO : (Arya) setting this to Mode Clamp instead of Mode Border because it's unsupported + // in hip + address_mode = hipAddressModeClamp; + break; + default: + assert(0); + break; + } + + hipTextureFilterMode filter_mode; + if (mem.info.interpolation == INTERPOLATION_CLOSEST) { + filter_mode = hipFilterModePoint; + } + else { + filter_mode = hipFilterModeLinear; + } + + /* Image Texture Storage */ + hipArray_Format format; + switch (mem.data_type) { + case TYPE_UCHAR: + format = HIP_AD_FORMAT_UNSIGNED_INT8; + break; + case TYPE_UINT16: + format = HIP_AD_FORMAT_UNSIGNED_INT16; + break; + case TYPE_UINT: + format = HIP_AD_FORMAT_UNSIGNED_INT32; + break; + case TYPE_INT: + format = HIP_AD_FORMAT_SIGNED_INT32; + break; + case TYPE_FLOAT: + format = HIP_AD_FORMAT_FLOAT; + break; + case TYPE_HALF: + format = HIP_AD_FORMAT_HALF; + break; + default: + assert(0); + return; + } + + HIPMem *cmem = NULL; + hArray array_3d = NULL; + size_t src_pitch = mem.data_width * dsize * mem.data_elements; + size_t dst_pitch = src_pitch; + + if (!mem.is_resident(this)) { + thread_scoped_lock lock(hip_mem_map_mutex); + cmem = &hip_mem_map[&mem]; + cmem->texobject = 0; + + if (mem.data_depth > 1) { + array_3d = (hArray)mem.device_pointer; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + dst_pitch = align_up(src_pitch, pitch_alignment); + } + } + else if (mem.data_depth > 1) { + /* 3D texture using 
array, there is no API for linear memory. */ + HIP_ARRAY3D_DESCRIPTOR desc; + + desc.Width = mem.data_width; + desc.Height = mem.data_height; + desc.Depth = mem.data_depth; + desc.Format = format; + desc.NumChannels = mem.data_elements; + desc.Flags = 0; + + VLOG(1) << "Array 3D allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + + hip_assert(hipArray3DCreate(&array_3d, &desc)); + + if (!array_3d) { + return; + } + + HIP_MEMCPY3D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = hipMemoryTypeArray; + param.dstArray = &array_3d; + param.srcMemoryType = hipMemoryTypeHost; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + param.Depth = mem.data_depth; + + hip_assert(hipDrvMemcpy3D(¶m)); + + mem.device_pointer = (device_ptr)array_3d; + mem.device_size = size; + stats.mem_alloc(size); + + thread_scoped_lock lock(hip_mem_map_mutex); + cmem = &hip_mem_map[&mem]; + cmem->texobject = 0; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + /* 2D texture, using pitch aligned linear memory. */ + dst_pitch = align_up(src_pitch, pitch_alignment); + size_t dst_size = dst_pitch * mem.data_height; + + cmem = generic_alloc(mem, dst_size - mem.memory_size()); + if (!cmem) { + return; + } + + hip_Memcpy2D param; + memset(¶m, 0, sizeof(param)); + param.dstMemoryType = hipMemoryTypeDevice; + param.dstDevice = mem.device_pointer; + param.dstPitch = dst_pitch; + param.srcMemoryType = hipMemoryTypeHost; + param.srcHost = mem.host_pointer; + param.srcPitch = src_pitch; + param.WidthInBytes = param.srcPitch; + param.Height = mem.data_height; + + hip_assert(hipDrvMemcpy2DUnaligned(¶m)); + } + else { + /* 1D texture, using linear memory. 
*/ + cmem = generic_alloc(mem); + if (!cmem) { + return; + } + + hip_assert(hipMemcpyHtoD(mem.device_pointer, mem.host_pointer, size)); + } + + /* Resize once */ + const uint slot = mem.slot; + if (slot >= texture_info.size()) { + /* Allocate some slots in advance, to reduce amount + * of re-allocations. */ + texture_info.resize(slot + 128); + } + + /* Set Mapping and tag that we need to (re-)upload to device */ + texture_info[slot] = mem.info; + need_texture_info = true; + + if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT && + mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + /* Kepler+, bindless textures. */ + hipResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + + if (array_3d) { + resDesc.resType = hipResourceTypeArray; + resDesc.res.array.h_Array = &array_3d; + resDesc.flags = 0; + } + else if (mem.data_height > 0) { + resDesc.resType = hipResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = mem.device_pointer; + resDesc.res.pitch2D.format = format; + resDesc.res.pitch2D.numChannels = mem.data_elements; + resDesc.res.pitch2D.height = mem.data_height; + resDesc.res.pitch2D.width = mem.data_width; + resDesc.res.pitch2D.pitchInBytes = dst_pitch; + } + else { + resDesc.resType = hipResourceTypeLinear; + resDesc.res.linear.devPtr = mem.device_pointer; + resDesc.res.linear.format = format; + resDesc.res.linear.numChannels = mem.data_elements; + resDesc.res.linear.sizeInBytes = mem.device_size; + } + + hipTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.addressMode[0] = address_mode; + texDesc.addressMode[1] = address_mode; + texDesc.addressMode[2] = address_mode; + texDesc.filterMode = filter_mode; + texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES; + + thread_scoped_lock lock(hip_mem_map_mutex); + cmem = &hip_mem_map[&mem]; + + hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); + + texture_info[slot].data = (uint64_t)cmem->texobject; + } + else { + texture_info[slot].data = 
(uint64_t)mem.device_pointer; + } +} + +void HIPDevice::tex_free(device_texture &mem) +{ + if (mem.device_pointer) { + HIPContextScope scope(this); + thread_scoped_lock lock(hip_mem_map_mutex); + const HIPMem &cmem = hip_mem_map[&mem]; + + if (cmem.texobject) { + /* Free bindless texture. */ + hipTexObjectDestroy(cmem.texobject); + } + + if (!mem.is_resident(this)) { + /* Do not free memory here, since it was allocated on a different device. */ + hip_mem_map.erase(hip_mem_map.find(&mem)); + } + else if (cmem.array) { + /* Free array. */ + hipArrayDestroy(cmem.array); + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + hip_mem_map.erase(hip_mem_map.find(&mem)); + } + else { + lock.unlock(); + generic_free(mem); + } + } +} + +# if 0 +void HIPDevice::render(DeviceTask &task, + RenderTile &rtile, + device_vector<KernelWorkTile> &work_tiles) +{ + scoped_timer timer(&rtile.buffers->render_time); + + if (have_error()) + return; + + HIPContextScope scope(this); + hipFunction_t hipRender; + + /* Get kernel function. */ + if (rtile.task == RenderTile::BAKE) { + hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_bake")); + } + else { + hip_assert(hipModuleGetFunction(&hipRender, hipModule, "kernel_hip_path_trace")); + } + + if (have_error()) { + return; + } + + hip_assert(hipFuncSetCacheConfig(hipRender, hipFuncCachePreferL1)); + + /* Allocate work tile. */ + work_tiles.alloc(1); + + KernelWorkTile *wtile = work_tiles.data(); + wtile->x = rtile.x; + wtile->y = rtile.y; + wtile->w = rtile.w; + wtile->h = rtile.h; + wtile->offset = rtile.offset; + wtile->stride = rtile.stride; + wtile->buffer = (float *)(hipDeviceptr_t)rtile.buffer; + + /* Prepare work size. More step samples render faster, but for now we + * remain conservative for GPUs connected to a display to avoid driver + * timeouts and display freezing. 
*/ + int min_blocks, num_threads_per_block; + hip_assert( + hipModuleOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, hipRender, NULL, 0, 0)); + if (!info.display_device) { + min_blocks *= 8; + } + + uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h); + + /* Render all samples. */ + uint start_sample = rtile.start_sample; + uint end_sample = rtile.start_sample + rtile.num_samples; + + for (int sample = start_sample; sample < end_sample;) { + /* Setup and copy work tile to device. */ + wtile->start_sample = sample; + wtile->num_samples = step_samples; + if (task.adaptive_sampling.use) { + wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples); + } + wtile->num_samples = min(wtile->num_samples, end_sample - sample); + work_tiles.copy_to_device(); + + hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer; + uint total_work_size = wtile->w * wtile->h * wtile->num_samples; + uint num_blocks = divide_up(total_work_size, num_threads_per_block); + + /* Launch kernel. */ + void *args[] = {&d_work_tiles, &total_work_size}; + + hip_assert( + hipModuleLaunchKernel(hipRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0)); + + /* Run the adaptive sampling kernels at selected samples aligned to step samples. */ + uint filter_sample = sample + wtile->num_samples - 1; + if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) { + adaptive_sampling_filter(filter_sample, wtile, d_work_tiles); + } + + hip_assert(hipDeviceSynchronize()); + + /* Update progress. */ + sample += wtile->num_samples; + rtile.sample = sample; + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + /* Finalize adaptive sampling. 
*/ + if (task.adaptive_sampling.use) { + hipDeviceptr_t d_work_tiles = (hipDeviceptr_t)work_tiles.device_pointer; + adaptive_sampling_post(rtile, wtile, d_work_tiles); + hip_assert(hipDeviceSynchronize()); + task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples); + } +} + +void HIPDevice::thread_run(DeviceTask &task) +{ + HIPContextScope scope(this); + + if (task.type == DeviceTask::RENDER) { + device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY); + + /* keep rendering tiles until done */ + RenderTile tile; + DenoisingTask denoising(this, task); + + while (task.acquire_tile(this, tile, task.tile_types)) { + if (tile.task == RenderTile::PATH_TRACE) { + render(task, tile, work_tiles); + } + else if (tile.task == RenderTile::BAKE) { + render(task, tile, work_tiles); + } + + task.release_tile(tile); + + if (task.get_cancel()) { + if (task.need_finish_queue == false) + break; + } + } + + work_tiles.free(); + } +} +# endif + +unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create() +{ + return make_unique<HIPDeviceQueue>(this); +} + +bool HIPDevice::should_use_graphics_interop() +{ + /* Check whether this device is part of OpenGL context. + * + * Using HIP device for graphics interoperability which is not part of the OpenGL context is + * possible, but from the empiric measurements it can be considerably slower than using naive + * pixels copy. 
*/ + + HIPContextScope scope(this); + + int num_all_devices = 0; + hip_assert(hipGetDeviceCount(&num_all_devices)); + + if (num_all_devices == 0) { + return false; + } + + vector<hipDevice_t> gl_devices(num_all_devices); + uint num_gl_devices = 0; + hipGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, hipGLDeviceListAll); + + for (hipDevice_t gl_device : gl_devices) { + if (gl_device == hipDevice) { + return true; + } + } + + return false; +} + +int HIPDevice::get_num_multiprocessors() +{ + return get_device_default_attribute(hipDeviceAttributeMultiprocessorCount, 0); +} + +int HIPDevice::get_max_num_threads_per_multiprocessor() +{ + return get_device_default_attribute(hipDeviceAttributeMaxThreadsPerMultiProcessor, 0); +} + +bool HIPDevice::get_device_attribute(hipDeviceAttribute_t attribute, int *value) +{ + HIPContextScope scope(this); + + return hipDeviceGetAttribute(value, attribute, hipDevice) == hipSuccess; +} + +int HIPDevice::get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value) +{ + int value = 0; + if (!get_device_attribute(attribute, &value)) { + return default_value; + } + return value; +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h new file mode 100644 index 00000000000..1d138ee9856 --- /dev/null +++ b/intern/cycles/device/hip/device_impl.h @@ -0,0 +1,153 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/* Cycles render device implemented on top of the HIP API.
 *
 * Holds the HIP device/context/module handles, the per-buffer bookkeeping used by
 * the memory functions, the bindless texture table and the cache of loaded
 * kernels. */
class HIPDevice : public Device {

  /* HIPContextScope pushes/pops hipContext and therefore needs member access. */
  friend class HIPContextScope;

 public:
  /* HIP handles of this device. */
  hipDevice_t hipDevice;
  hipCtx_t hipContext;
  hipModule_t hipModule;
  size_t device_texture_headroom;
  size_t device_working_headroom;
  bool move_texture_to_host;
  /* Amount of mapped host memory in use; generic_free() subtracts the buffer's
   * device_size from it when a mapped-host buffer is released. */
  size_t map_host_used;
  size_t map_host_limit;
  int can_map_host;
  /* Row pitch alignment applied by tex_alloc() to 2D textures. */
  int pitch_alignment;
  int hipDevId;
  int hipDevArchitecture;
  bool first_error;

  /* Per-buffer bookkeeping stored in hip_mem_map. */
  struct HIPMem {
    HIPMem() : texobject(0), array(0), use_mapped_host(false)
    {
    }

    /* Bindless texture object, 0 when none was created. */
    hipTextureObject_t texobject;
    /* 3D array backing the texture, 0 for linear/pitched storage. */
    hArray array;

    /* If true, a mapped host memory in shared_pointer is being used. */
    bool use_mapped_host;
  };
  typedef map<device_memory *, HIPMem> HIPMemMap;
  /* Bookkeeping entry for every buffer allocated through this device. The memory
   * functions take hip_mem_map_mutex before touching the map. */
  HIPMemMap hip_mem_map;
  thread_mutex hip_mem_map_mutex;

  /* Bindless Textures */
  /* Table indexed by texture slot; tex_alloc() writes an entry and raises
   * need_texture_info so the table is uploaded again. */
  device_vector<TextureInfo> texture_info;
  bool need_texture_info;

  /* Functions and launch parameters of the loaded kernels. */
  HIPDeviceKernels kernels;

  static bool have_precompiled_kernels();

  virtual bool show_samples() const override;

  virtual BVHLayoutMask get_bvh_layout_mask() const override;

  void set_error(const string &error) override;

  HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);

  virtual ~HIPDevice();

  bool support_device(const uint /*kernel_features*/);

  bool check_peer_access(Device *peer_device) override;

  bool use_adaptive_compilation();

  virtual string compile_kernel_get_common_cflags(const uint kernel_features);

  string compile_kernel(const uint kernel_features,
                        const char *name,
                        const char *base = "hip",
                        bool force_ptx = false);

  virtual bool load_kernels(const uint kernel_features) override;
  void reserve_local_memory(const uint kernel_features);

  void init_host_memory();

  void load_texture_info();

  void move_textures_to_host(size_t size, bool for_texture);

  /* Generic buffer helpers shared by the mem_*, global_* and tex_* functions.
   * generic_alloc() returns the bookkeeping entry; callers treat a null result as
   * allocation failure. */
  HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);

  void generic_copy_to(device_memory &mem);

  void generic_free(device_memory &mem);

  /* Device memory interface. mem_alloc() and mem_copy_from() assert on MEM_TEXTURE
   * and MEM_GLOBAL buffers; mem_copy_to() and mem_free() dispatch those types to
   * the global_* and tex_* functions. */
  void mem_alloc(device_memory &mem) override;

  void mem_copy_to(device_memory &mem) override;

  /* Copies rows [y, y + h) of width w and element size elem back to the host. */
  void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;

  void mem_zero(device_memory &mem) override;

  void mem_free(device_memory &mem) override;

  device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;

  /* Copies `size` bytes from `host` into the module global called `name`. */
  virtual void const_copy_to(const char *name, void *host, size_t size) override;

  void global_alloc(device_memory &mem);

  void global_free(device_memory &mem);

  void tex_alloc(device_texture &mem);

  void tex_free(device_texture &mem);

  /* Graphics resources interoperability. */
  virtual bool should_use_graphics_interop() override;

  virtual unique_ptr<DeviceQueue> gpu_queue_create() override;

  /* Device attribute queries; both return 0 when the attribute cannot be read. */
  int get_num_multiprocessors();
  int get_max_num_threads_per_multiprocessor();

 protected:
  /* Reads a device attribute into *value, returns false when the query fails. */
  bool get_device_attribute(hipDeviceAttribute_t attribute, int *value);
  /* Same query, but yields default_value when it fails. */
  int get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value);
};
+ */ + +#ifdef WITH_HIP + +# include "device/hip/graphics_interop.h" + +# include "device/hip/device_impl.h" +# include "device/hip/util.h" + +CCL_NAMESPACE_BEGIN + +HIPDeviceGraphicsInterop::HIPDeviceGraphicsInterop(HIPDeviceQueue *queue) + : queue_(queue), device_(static_cast<HIPDevice *>(queue->device)) +{ +} + +HIPDeviceGraphicsInterop::~HIPDeviceGraphicsInterop() +{ + HIPContextScope scope(device_); + + if (hip_graphics_resource_) { + hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_)); + } +} + +void HIPDeviceGraphicsInterop::set_display_interop( + const DisplayDriver::GraphicsInterop &display_interop) +{ + const int64_t new_buffer_area = int64_t(display_interop.buffer_width) * + display_interop.buffer_height; + + need_clear_ = display_interop.need_clear; + + if (opengl_pbo_id_ == display_interop.opengl_pbo_id && buffer_area_ == new_buffer_area) { + return; + } + + HIPContextScope scope(device_); + + if (hip_graphics_resource_) { + hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_)); + } + + const hipError_t result = hipGraphicsGLRegisterBuffer( + &hip_graphics_resource_, display_interop.opengl_pbo_id, hipGraphicsRegisterFlagsNone); + if (result != hipSuccess) { + LOG(ERROR) << "Error registering OpenGL buffer: " << hipewErrorString(result); + } + + opengl_pbo_id_ = display_interop.opengl_pbo_id; + buffer_area_ = new_buffer_area; +} + +device_ptr HIPDeviceGraphicsInterop::map() +{ + if (!hip_graphics_resource_) { + return 0; + } + + HIPContextScope scope(device_); + + hipDeviceptr_t hip_buffer; + size_t bytes; + + hip_device_assert(device_, + hipGraphicsMapResources(1, &hip_graphics_resource_, queue_->stream())); + hip_device_assert( + device_, hipGraphicsResourceGetMappedPointer(&hip_buffer, &bytes, hip_graphics_resource_)); + + if (need_clear_) { + hip_device_assert( + device_, + hipMemsetD8Async(static_cast<hipDeviceptr_t>(hip_buffer), 0, bytes, queue_->stream())); + + need_clear_ = false; + } + + 
return static_cast<device_ptr>(hip_buffer); +} + +void HIPDeviceGraphicsInterop::unmap() +{ + HIPContextScope scope(device_); + + hip_device_assert(device_, + hipGraphicsUnmapResources(1, &hip_graphics_resource_, queue_->stream())); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/hip/graphics_interop.h b/intern/cycles/device/hip/graphics_interop.h new file mode 100644 index 00000000000..2b2d287ff6c --- /dev/null +++ b/intern/cycles/device/hip/graphics_interop.h @@ -0,0 +1,64 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/* Graphics interoperability between a HIP device queue and an OpenGL pixel buffer.
 *
 * The object registers the PBO as a HIP graphics resource and hands out a device
 * pointer to it through map()/unmap(). It is neither copyable nor movable. */
class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop {
 public:
  explicit HIPDeviceGraphicsInterop(HIPDeviceQueue *queue);

  HIPDeviceGraphicsInterop(const HIPDeviceGraphicsInterop &other) = delete;
  HIPDeviceGraphicsInterop(HIPDeviceGraphicsInterop &&other) noexcept = delete;

  /* Unregisters the graphics resource, if one is registered. */
  ~HIPDeviceGraphicsInterop();

  HIPDeviceGraphicsInterop &operator=(const HIPDeviceGraphicsInterop &other) = delete;
  HIPDeviceGraphicsInterop &operator=(HIPDeviceGraphicsInterop &&other) = delete;

  /* (Re-)registers the PBO when its id or the buffer area changed. */
  virtual void set_display_interop(const DisplayDriver::GraphicsInterop &display_interop) override;

  /* map() returns 0 when no resource is registered; when a clear was requested the
   * mapped buffer is zeroed asynchronously on the queue's stream. */
  virtual device_ptr map() override;
  virtual void unmap() override;

 protected:
  /* Queue whose stream is used for mapping, and the device that owns that queue. */
  HIPDeviceQueue *queue_ = nullptr;
  HIPDevice *device_ = nullptr;

  /* OpenGL PBO which is currently registered as the destination for the HIP buffer. */
  uint opengl_pbo_id_ = 0;
  /* Buffer area in pixels of the corresponding PBO. */
  int64_t buffer_area_ = 0;

  /* The destination was requested to be cleared. */
  bool need_clear_ = false;

  /* Registered graphics resource, null while nothing is registered. */
  hipGraphicsResource hip_graphics_resource_ = nullptr;
};
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/hip/kernel.h" +# include "device/hip/device_impl.h" + +CCL_NAMESPACE_BEGIN + +void HIPDeviceKernels::load(HIPDevice *device) +{ + hipModule_t hipModule = device->hipModule; + + for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) { + HIPDeviceKernel &kernel = kernels_[i]; + + /* No mega-kernel used for GPU. */ + if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) { + continue; + } + + const std::string function_name = std::string("kernel_gpu_") + + device_kernel_as_string((DeviceKernel)i); + hip_device_assert(device, + hipModuleGetFunction(&kernel.function, hipModule, function_name.c_str())); + + if (kernel.function) { + hip_device_assert(device, hipFuncSetCacheConfig(kernel.function, hipFuncCachePreferL1)); + + hip_device_assert( + device, + hipModuleOccupancyMaxPotentialBlockSize( + &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, 0, 0)); + } + else { + LOG(ERROR) << "Unable to load kernel " << function_name; + } + } + + loaded = true; +} + +const HIPDeviceKernel &HIPDeviceKernels::get(DeviceKernel kernel) const +{ + return kernels_[(int)kernel]; +} + +bool HIPDeviceKernels::available(DeviceKernel kernel) const +{ + return kernels_[(int)kernel].function != nullptr; +} + +CCL_NAMESPACE_END + +#endif /* WITH_HIP*/ diff --git a/intern/cycles/device/hip/kernel.h b/intern/cycles/device/hip/kernel.h new file mode 100644 index 00000000000..3301731f56e --- /dev/null +++ b/intern/cycles/device/hip/kernel.h @@ -0,0 +1,54 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache 
/* Cache of HIP kernels for each DeviceKernel.
 *
 * load() fills one entry per DeviceKernel from the module of the given device;
 * the mega-kernel entry is skipped and keeps a null function. */
class HIPDeviceKernels {
 public:
  /* Resolves all kernel functions and their launch parameters. */
  void load(HIPDevice *device);
  /* Returns the cached entry; the index is not range-checked. */
  const HIPDeviceKernel &get(DeviceKernel kernel) const;
  /* True when a function handle was found for the kernel. */
  bool available(DeviceKernel kernel) const;

 protected:
  /* Indexed by DeviceKernel value. */
  HIPDeviceKernel kernels_[DEVICE_KERNEL_NUM];
  /* Set once load() has run, regardless of whether individual lookups failed. */
  bool loaded = false;
};
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/hip/queue.h" + +# include "device/hip/device_impl.h" +# include "device/hip/graphics_interop.h" +# include "device/hip/kernel.h" + +CCL_NAMESPACE_BEGIN + +/* HIPDeviceQueue */ + +HIPDeviceQueue::HIPDeviceQueue(HIPDevice *device) + : DeviceQueue(device), hip_device_(device), hip_stream_(nullptr) +{ + const HIPContextScope scope(hip_device_); + hip_device_assert(hip_device_, hipStreamCreateWithFlags(&hip_stream_, hipStreamNonBlocking)); +} + +HIPDeviceQueue::~HIPDeviceQueue() +{ + const HIPContextScope scope(hip_device_); + hipStreamDestroy(hip_stream_); +} + +int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const +{ + /* TODO: compute automatically. */ + /* TODO: must have at least num_threads_per_block. */ + return 14416128; +} + +int HIPDeviceQueue::num_concurrent_busy_states() const +{ + const int max_num_threads = hip_device_->get_num_multiprocessors() * + hip_device_->get_max_num_threads_per_multiprocessor(); + + if (max_num_threads == 0) { + return 65536; + } + + return 4 * max_num_threads; +} + +void HIPDeviceQueue::init_execution() +{ + /* Synchronize all textures and memory copies before executing task. */ + HIPContextScope scope(hip_device_); + hip_device_->load_texture_info(); + hip_device_assert(hip_device_, hipDeviceSynchronize()); + + debug_init_execution(); +} + +bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const +{ + return hip_device_->kernels.available(kernel); +} + +bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[]) +{ + if (hip_device_->have_error()) { + return false; + } + + debug_enqueue(kernel, work_size); + + const HIPContextScope scope(hip_device_); + const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel); + + /* Compute kernel launch parameters. 
*/ + const int num_threads_per_block = hip_kernel.num_threads_per_block; + const int num_blocks = divide_up(work_size, num_threads_per_block); + + int shared_mem_bytes = 0; + + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + /* See parall_active_index.h for why this amount of shared memory is needed. */ + shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int); + break; + default: + break; + } + + /* Launch kernel. */ + hip_device_assert(hip_device_, + hipModuleLaunchKernel(hip_kernel.function, + num_blocks, + 1, + 1, + num_threads_per_block, + 1, + 1, + shared_mem_bytes, + hip_stream_, + args, + 0)); + return !(hip_device_->have_error()); +} + +bool HIPDeviceQueue::synchronize() +{ + if (hip_device_->have_error()) { + return false; + } + + const HIPContextScope scope(hip_device_); + hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_)); + debug_synchronize(); + + return !(hip_device_->have_error()); +} + +void HIPDeviceQueue::zero_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. */ + if (mem.device_pointer == 0) { + hip_device_->mem_alloc(mem); + } + + /* Zero memory on device. */ + assert(mem.device_pointer != 0); + + const HIPContextScope scope(hip_device_); + hip_device_assert( + hip_device_, + hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_)); +} + +void HIPDeviceQueue::copy_to_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + /* Allocate on demand. 
*/ + if (mem.device_pointer == 0) { + hip_device_->mem_alloc(mem); + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory to device. */ + const HIPContextScope scope(hip_device_); + hip_device_assert( + hip_device_, + hipMemcpyHtoDAsync( + (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_)); +} + +void HIPDeviceQueue::copy_from_device(device_memory &mem) +{ + assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE); + + if (mem.memory_size() == 0) { + return; + } + + assert(mem.device_pointer != 0); + assert(mem.host_pointer != nullptr); + + /* Copy memory from device. */ + const HIPContextScope scope(hip_device_); + hip_device_assert( + hip_device_, + hipMemcpyDtoHAsync( + mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_)); +} + +// TODO : (Arya) Enable this after stabilizing dev branch +unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create() +{ + return make_unique<HIPDeviceGraphicsInterop>(this); +} + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/hip/queue.h b/intern/cycles/device/hip/queue.h new file mode 100644 index 00000000000..04c8a5982ce --- /dev/null +++ b/intern/cycles/device/hip/queue.h @@ -0,0 +1,68 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef WITH_HIP + +# include "device/device_kernel.h" +# include "device/device_memory.h" +# include "device/device_queue.h" + +# include "device/hip/util.h" + +CCL_NAMESPACE_BEGIN + +class HIPDevice; +class device_memory; + +/* Base class for HIP queues. */ +class HIPDeviceQueue : public DeviceQueue { + public: + HIPDeviceQueue(HIPDevice *device); + ~HIPDeviceQueue(); + + virtual int num_concurrent_states(const size_t state_size) const override; + virtual int num_concurrent_busy_states() const override; + + virtual void init_execution() override; + + virtual bool kernel_available(DeviceKernel kernel) const override; + + virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override; + + virtual bool synchronize() override; + + virtual void zero_to_device(device_memory &mem) override; + virtual void copy_to_device(device_memory &mem) override; + virtual void copy_from_device(device_memory &mem) override; + + virtual hipStream_t stream() + { + return hip_stream_; + } + + // TODO : (Arya) Enable this after stabilizing the dev branch + virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override; + + protected: + HIPDevice *hip_device_; + hipStream_t hip_stream_; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/hip/util.cpp b/intern/cycles/device/hip/util.cpp new file mode 100644 index 00000000000..44f52c4e17b --- /dev/null +++ b/intern/cycles/device/hip/util.cpp @@ -0,0 +1,61 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef WITH_HIP + +# include "device/hip/util.h" +# include "device/hip/device_impl.h" + +CCL_NAMESPACE_BEGIN + +HIPContextScope::HIPContextScope(HIPDevice *device) : device(device) +{ + hip_device_assert(device, hipCtxPushCurrent(device->hipContext)); +} + +HIPContextScope::~HIPContextScope() +{ + hip_device_assert(device, hipCtxPopCurrent(NULL)); +} + +# ifndef WITH_HIP_DYNLOAD +const char *hipewErrorString(hipError_t result) +{ + /* We can only give error code here without major code duplication, that + * should be enough since dynamic loading is only being disabled by folks + * who know what they're doing anyway. + * + * NOTE: Avoid calling from several threads. + */ + static string error; + error = string_printf("%d", result); + return error.c_str(); +} + +const char *hipewCompilerPath() +{ + return CYCLES_HIP_HIPCC_EXECUTABLE; +} + +int hipewCompilerVersion() +{ + return (HIP_VERSION / 100) + (HIP_VERSION % 100 / 10); +} +# endif + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/hip/util.h b/intern/cycles/device/hip/util.h new file mode 100644 index 00000000000..0db5174a3db --- /dev/null +++ b/intern/cycles/device/hip/util.h @@ -0,0 +1,63 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef WITH_HIP + +# ifdef WITH_HIP_DYNLOAD +# include "hipew.h" +# endif + +CCL_NAMESPACE_BEGIN + +class HIPDevice; + +/* Utility to push/pop HIP context. */ +class HIPContextScope { + public: + HIPContextScope(HIPDevice *device); + ~HIPContextScope(); + + private: + HIPDevice *device; +}; + +/* Utility for checking return values of HIP function calls. */ +# define hip_device_assert(hip_device, stmt) \ + { \ + hipError_t result = stmt; \ + if (result != hipSuccess) { \ + const char *name = hipewErrorString(result); \ + hip_device->set_error( \ + string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \ + } \ + } \ + (void)0 + +# define hip_assert(stmt) hip_device_assert(this, stmt) + +# ifndef WITH_HIP_DYNLOAD +/* Transparently implement some functions, so majority of the file does not need + * to worry about difference between dynamically loaded and linked HIP at all. 
*/ +const char *hipewErrorString(hipError_t result); +const char *hipewCompilerPath(); +int hipewCompilerVersion(); +# endif /* WITH_HIP_DYNLOAD */ + +CCL_NAMESPACE_END + +#endif /* WITH_HIP */ diff --git a/intern/cycles/device/multi/device.cpp b/intern/cycles/device/multi/device.cpp index 6dbcce2d9a5..4f995abf2c4 100644 --- a/intern/cycles/device/multi/device.cpp +++ b/intern/cycles/device/multi/device.cpp @@ -315,14 +315,14 @@ class MultiDevice : public Device { stats.mem_alloc(mem.device_size - existing_size); } - void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override + void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override { device_ptr key = mem.device_pointer; - int i = 0, sub_h = h / devices.size(); + size_t i = 0, sub_h = h / devices.size(); foreach (SubDevice &sub, devices) { - int sy = y + i * sub_h; - int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; + size_t sy = y + i * sub_h; + size_t sh = (i == (size_t)devices.size() - 1) ? 
h - sub_h * i : sub_h; SubDevice *owner_sub = find_matching_mem_device(key, sub); mem.device = owner_sub->device; diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp index b54d423a183..49d4e22143f 100644 --- a/intern/cycles/device/optix/device_impl.cpp +++ b/intern/cycles/device/optix/device_impl.cpp @@ -315,6 +315,11 @@ bool OptiXDevice::load_kernels(const uint kernel_features) group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; group_descs[PG_HITS].hitgroup.moduleAH = optix_module; group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit"; + group_descs[PG_HITV].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; + group_descs[PG_HITV].hitgroup.moduleCH = optix_module; + group_descs[PG_HITV].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit"; + group_descs[PG_HITV].hitgroup.moduleAH = optix_module; + group_descs[PG_HITV].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_volume_test"; if (kernel_features & KERNEL_FEATURE_HAIR) { if (kernel_features & KERNEL_FEATURE_HAIR_THICK) { @@ -397,6 +402,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); + trace_css = std::max(trace_css, stack_size[PG_HITV].cssIS + stack_size[PG_HITV].cssAH); trace_css = std::max(trace_css, stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH); trace_css = std::max(trace_css, @@ -421,6 +427,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.push_back(groups[PG_HITD]); pipeline_groups.push_back(groups[PG_HITS]); pipeline_groups.push_back(groups[PG_HITL]); + pipeline_groups.push_back(groups[PG_HITV]); if (motion_blur) { pipeline_groups.push_back(groups[PG_HITD_MOTION]); 
pipeline_groups.push_back(groups[PG_HITS_MOTION]); @@ -459,6 +466,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features) pipeline_groups.push_back(groups[PG_HITD]); pipeline_groups.push_back(groups[PG_HITS]); pipeline_groups.push_back(groups[PG_HITL]); + pipeline_groups.push_back(groups[PG_HITV]); if (motion_blur) { pipeline_groups.push_back(groups[PG_HITD_MOTION]); pipeline_groups.push_back(groups[PG_HITS_MOTION]); @@ -1390,25 +1398,33 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) /* Set user instance ID to object index (but leave low bit blank). */ instance.instanceId = ob->get_device_index() << 1; - /* Have to have at least one bit in the mask, or else instance would always be culled. */ - instance.visibilityMask = 1; + /* Add some of the object visibility bits to the mask. + * __prim_visibility contains the combined visibility bits of all instances, so is not + * reliable if they differ between instances. But the OptiX visibility mask can only contain + * 8 bits, so have to trade-off here and select just a few important ones. + */ + instance.visibilityMask = ob->visibility_for_tracing() & 0xFF; - if (ob->get_geometry()->has_volume) { - /* Volumes have a special bit set in the visibility mask so a trace can mask only volumes. - */ - instance.visibilityMask |= 2; + /* Have to have at least one bit in the mask, or else instance would always be culled. */ + if (0 == instance.visibilityMask) { + instance.visibilityMask = 0xFF; } - if (ob->get_geometry()->geometry_type == Geometry::HAIR) { - /* Same applies to curves (so they can be skipped in local trace calls). 
*/ - instance.visibilityMask |= 4; - - if (motion_blur && ob->get_geometry()->has_motion_blur() && - static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { + if (ob->get_geometry()->geometry_type == Geometry::HAIR && + static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) { + if (motion_blur && ob->get_geometry()->has_motion_blur()) { /* Select between motion blur and non-motion blur built-in intersection module. */ instance.sbtOffset = PG_HITD_MOTION - PG_HITD; } } + else { + /* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves, + * since it needs to filter out end-caps there). + * It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit + * programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT. + */ + instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT; + } /* Insert motion traversable if object has motion. */ if (motion_blur && ob->use_motion()) { @@ -1474,7 +1490,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) delete[] reinterpret_cast<uint8_t *>(&motion_transform); /* Disable instance transform if object uses motion transform already. */ - instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; /* Get traversable handle to motion transform. */ optixConvertPointerToTraversableHandle(context, @@ -1491,7 +1507,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit) } else { /* Disable instance transform if geometry already has it applied to vertex data. */ - instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; + instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; /* Non-instanced objects read ID from 'prim_object', so distinguish * them from instanced objects with the low bit set. 
*/ instance.instanceId |= 1; diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h index 91ef52e0a5a..3695ac6afc2 100644 --- a/intern/cycles/device/optix/device_impl.h +++ b/intern/cycles/device/optix/device_impl.h @@ -40,6 +40,7 @@ enum { PG_HITD, /* Default hit group. */ PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */ PG_HITL, /* __BVH_LOCAL__ hit group (only used for triangles). */ + PG_HITV, /* __VOLUME__ hit group. */ PG_HITD_MOTION, PG_HITS_MOTION, PG_CALL_SVM_AO, @@ -51,7 +52,7 @@ enum { static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS; static const int NUM_MIS_PROGRAM_GROUPS = 1; static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD; -static const int NUM_HIT_PROGRAM_GROUPS = 5; +static const int NUM_HIT_PROGRAM_GROUPS = 6; static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO; static const int NUM_CALLABLE_PROGRAM_GROUPS = 3; diff --git a/intern/cycles/integrator/CMakeLists.txt b/intern/cycles/integrator/CMakeLists.txt index bfabd35d7c3..949254606b8 100644 --- a/intern/cycles/integrator/CMakeLists.txt +++ b/intern/cycles/integrator/CMakeLists.txt @@ -27,6 +27,8 @@ set(SRC pass_accessor.cpp pass_accessor_cpu.cpp pass_accessor_gpu.cpp + path_trace_display.cpp + path_trace_tile.cpp path_trace_work.cpp path_trace_work_cpu.cpp path_trace_work_gpu.cpp @@ -47,6 +49,8 @@ set(SRC_HEADERS pass_accessor.h pass_accessor_cpu.h pass_accessor_gpu.h + path_trace_display.h + path_trace_tile.h path_trace_work.h path_trace_work_cpu.h path_trace_work_gpu.h diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp index b62a06aea43..7624b244175 100644 --- a/intern/cycles/integrator/path_trace.cpp +++ b/intern/cycles/integrator/path_trace.cpp @@ -19,8 +19,9 @@ #include "device/cpu/device.h" #include "device/device.h" #include "integrator/pass_accessor.h" +#include "integrator/path_trace_display.h" +#include "integrator/path_trace_tile.h" #include "integrator/render_scheduler.h" -#include 
"render/gpu_display.h" #include "render/pass.h" #include "render/scene.h" #include "render/tile.h" @@ -67,11 +68,11 @@ PathTrace::PathTrace(Device *device, PathTrace::~PathTrace() { /* Destroy any GPU resource which was used for graphics interop. - * Need to have access to the GPUDisplay as it is the only source of drawing context which is - * used for interop. */ - if (gpu_display_) { + * Need to have access to the PathTraceDisplay as it is the only source of drawing context which + * is used for interop. */ + if (display_) { for (auto &&path_trace_work : path_trace_works_) { - path_trace_work->destroy_gpu_resources(gpu_display_.get()); + path_trace_work->destroy_gpu_resources(display_.get()); } } } @@ -94,7 +95,7 @@ bool PathTrace::ready_to_reset() { /* The logic here is optimized for the best feedback in the viewport, which implies having a GPU * display. Of there is no such display, the logic here will break. */ - DCHECK(gpu_display_); + DCHECK(display_); /* The logic here tries to provide behavior which feels the most interactive feel to artists. * General idea is to be able to reset as quickly as possible, while still providing interactive @@ -126,8 +127,8 @@ void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_t /* NOTE: GPU display checks for buffer modification and avoids unnecessary re-allocation. * It is requires to inform about reset whenever it happens, so that the redraw state tracking is * properly updated. */ - if (gpu_display_) { - gpu_display_->reset(full_params); + if (display_) { + display_->reset(full_params); } render_state_.has_denoised_result = false; @@ -244,7 +245,7 @@ static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>> const int slice_height = max(lround(height * weight), 1); /* Disallow negative values to deal with situations when there are more compute devices than - * scanlines. */ + * scan-lines. 
*/ const int remaining_height = max(0, height - current_y); BufferParams slide_params = buffer_params; @@ -535,25 +536,35 @@ void PathTrace::denoise(const RenderWork &render_work) render_scheduler_.report_denoise_time(render_work, time_dt() - start_time); } -void PathTrace::set_gpu_display(unique_ptr<GPUDisplay> gpu_display) +void PathTrace::set_output_driver(unique_ptr<OutputDriver> driver) { - gpu_display_ = move(gpu_display); + output_driver_ = move(driver); } -void PathTrace::clear_gpu_display() +void PathTrace::set_display_driver(unique_ptr<DisplayDriver> driver) { - if (gpu_display_) { - gpu_display_->clear(); + if (driver) { + display_ = make_unique<PathTraceDisplay>(move(driver)); + } + else { + display_ = nullptr; + } +} + +void PathTrace::clear_display() +{ + if (display_) { + display_->clear(); } } void PathTrace::draw() { - if (!gpu_display_) { + if (!display_) { return; } - did_draw_after_reset_ |= gpu_display_->draw(); + did_draw_after_reset_ |= display_->draw(); } void PathTrace::update_display(const RenderWork &render_work) @@ -562,31 +573,32 @@ void PathTrace::update_display(const RenderWork &render_work) return; } - if (!gpu_display_ && !tile_buffer_update_cb) { + if (!display_ && !output_driver_) { VLOG(3) << "Ignore display update."; return; } if (full_params_.width == 0 || full_params_.height == 0) { - VLOG(3) << "Skipping GPUDisplay update due to 0 size of the render buffer."; + VLOG(3) << "Skipping PathTraceDisplay update due to 0 size of the render buffer."; return; } const double start_time = time_dt(); - if (tile_buffer_update_cb) { + if (output_driver_) { VLOG(3) << "Invoke buffer update callback."; - tile_buffer_update_cb(); + PathTraceTile tile(*this); + output_driver_->update_render_tile(tile); } - if (gpu_display_) { + if (display_) { VLOG(3) << "Perform copy to GPUDisplay work."; const int resolution_divider = render_work.resolution_divider; const int texture_width = max(1, full_params_.width / resolution_divider); const int 
texture_height = max(1, full_params_.height / resolution_divider); - if (!gpu_display_->update_begin(texture_width, texture_height)) { + if (!display_->update_begin(texture_width, texture_height)) { LOG(ERROR) << "Error beginning GPUDisplay update."; return; } @@ -600,10 +612,10 @@ void PathTrace::update_display(const RenderWork &render_work) * all works in parallel. */ const int num_samples = get_num_samples_in_buffer(); for (auto &&path_trace_work : path_trace_works_) { - path_trace_work->copy_to_gpu_display(gpu_display_.get(), pass_mode, num_samples); + path_trace_work->copy_to_display(display_.get(), pass_mode, num_samples); } - gpu_display_->update_end(); + display_->update_end(); } render_scheduler_.report_display_update_time(render_work, time_dt() - start_time); @@ -753,20 +765,26 @@ bool PathTrace::is_cancel_requested() void PathTrace::tile_buffer_write() { - if (!tile_buffer_write_cb) { + if (!output_driver_) { return; } - tile_buffer_write_cb(); + PathTraceTile tile(*this); + output_driver_->write_render_tile(tile); } void PathTrace::tile_buffer_read() { - if (!tile_buffer_read_cb) { + if (!device_scene_->data.bake.use) { return; } - if (tile_buffer_read_cb()) { + if (!output_driver_) { + return; + } + + PathTraceTile tile(*this); + if (output_driver_->read_render_tile(tile)) { tbb::parallel_for_each(path_trace_works_, [](unique_ptr<PathTraceWork> &path_trace_work) { path_trace_work->copy_render_buffers_to_device(); }); @@ -801,7 +819,7 @@ void PathTrace::tile_buffer_write_to_disk() } if (!tile_manager_.write_tile(*buffers)) { - LOG(ERROR) << "Error writing tile to file."; + device_->set_error("Error writing tile to file"); } } @@ -894,7 +912,14 @@ void PathTrace::process_full_buffer_from_disk(string_view filename) DenoiseParams denoise_params; if (!tile_manager_.read_full_buffer_from_disk(filename, &full_frame_buffers, &denoise_params)) { - LOG(ERROR) << "Error reading tiles from file."; + const string error_message = "Error reading tiles from file"; + 
if (progress_) { + progress_->set_error(error_message); + progress_->set_cancel(error_message); + } + else { + LOG(ERROR) << error_message; + } return; } @@ -998,6 +1023,11 @@ int2 PathTrace::get_render_tile_offset() const return make_int2(tile.x, tile.y); } +int2 PathTrace::get_render_size() const +{ + return tile_manager_.get_size(); +} + const BufferParams &PathTrace::get_render_tile_params() const { if (full_frame_state_.render_buffers) { @@ -1028,6 +1058,8 @@ static const char *device_type_for_description(const DeviceType type) return "CUDA"; case DEVICE_OPTIX: return "OptiX"; + case DEVICE_HIP: + return "HIP"; case DEVICE_DUMMY: return "Dummy"; case DEVICE_MULTI: diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h index fc7713e6df9..dbb22c204d9 100644 --- a/intern/cycles/integrator/path_trace.h +++ b/intern/cycles/integrator/path_trace.h @@ -31,12 +31,14 @@ CCL_NAMESPACE_BEGIN class AdaptiveSampling; class Device; class DeviceScene; +class DisplayDriver; class Film; class RenderBuffers; class RenderScheduler; class RenderWork; +class PathTraceDisplay; +class OutputDriver; class Progress; -class GPUDisplay; class TileManager; /* PathTrace class takes care of kernel graph and scheduling on a (multi)device. It takes care of @@ -98,13 +100,16 @@ class PathTrace { * Use this to configure the adaptive sampler before rendering any samples. */ void set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling); - /* Set GPU display which takes care of drawing the render result. */ - void set_gpu_display(unique_ptr<GPUDisplay> gpu_display); + /* Sets output driver for render buffer output. */ + void set_output_driver(unique_ptr<OutputDriver> driver); - /* Clear the GPU display by filling it in with all zeroes. */ - void clear_gpu_display(); + /* Set display driver for interactive render buffer display. */ + void set_display_driver(unique_ptr<DisplayDriver> driver); - /* Perform drawing of the current state of the GPUDisplay. 
*/ + /* Clear the display buffer by filling it in with all zeroes. */ + void clear_display(); + + /* Perform drawing of the current state of the DisplayDriver. */ void draw(); /* Cancel rendering process as soon as possible, without waiting for full tile to be sampled. @@ -157,6 +162,7 @@ class PathTrace { * instead. */ int2 get_render_tile_size() const; int2 get_render_tile_offset() const; + int2 get_render_size() const; /* Get buffer parameters of the current tile. * @@ -168,18 +174,6 @@ class PathTrace { * times, and so on. */ string full_report() const; - /* Callback which communicates an updates state of the render buffer of the current big tile. - * Is called during path tracing to communicate work-in-progress state of the final buffer. */ - function<void(void)> tile_buffer_update_cb; - - /* Callback which communicates final rendered buffer. Is called after path-tracing is done. */ - function<void(void)> tile_buffer_write_cb; - - /* Callback which initializes rendered buffer. Is called before path-tracing starts. - * - * This is used for baking. */ - function<bool(void)> tile_buffer_read_cb; - /* Callback which is called to report current rendering progress. * * It is supposed to be cheaper than buffer update/write, hence can be called more often. @@ -252,7 +246,11 @@ class PathTrace { RenderScheduler &render_scheduler_; TileManager &tile_manager_; - unique_ptr<GPUDisplay> gpu_display_; + /* Display driver for interactive render buffer display. */ + unique_ptr<PathTraceDisplay> display_; + + /* Output driver to write render buffer to. */ + unique_ptr<OutputDriver> output_driver_; /* Per-compute device descriptors of work which is responsible for path tracing on its configured * device. */ @@ -286,7 +284,7 @@ class PathTrace { /* Parameters of the big tile with the current resolution divider applied. */ BufferParams effective_big_tile_params; - /* Denosier was run and there are denoised versions of the passes in the render buffers. 
*/ + /* Denoiser was run and there are denoised versions of the passes in the render buffers. */ bool has_denoised_result = false; /* Current tile has been written (to either disk or callback. diff --git a/intern/cycles/render/gpu_display.cpp b/intern/cycles/integrator/path_trace_display.cpp index a8f0cc50583..28f0a7f7745 100644 --- a/intern/cycles/render/gpu_display.cpp +++ b/intern/cycles/integrator/path_trace_display.cpp @@ -14,20 +14,25 @@ * limitations under the License. */ -#include "render/gpu_display.h" +#include "integrator/path_trace_display.h" #include "render/buffers.h" + #include "util/util_logging.h" CCL_NAMESPACE_BEGIN -void GPUDisplay::reset(const BufferParams &buffer_params) +PathTraceDisplay::PathTraceDisplay(unique_ptr<DisplayDriver> driver) : driver_(move(driver)) +{ +} + +void PathTraceDisplay::reset(const BufferParams &buffer_params) { thread_scoped_lock lock(mutex_); - const GPUDisplayParams old_params = params_; + const DisplayDriver::Params old_params = params_; - params_.offset = make_int2(buffer_params.full_x, buffer_params.full_y); + params_.full_offset = make_int2(buffer_params.full_x, buffer_params.full_y); params_.full_size = make_int2(buffer_params.full_width, buffer_params.full_height); params_.size = make_int2(buffer_params.width, buffer_params.height); @@ -44,7 +49,7 @@ void GPUDisplay::reset(const BufferParams &buffer_params) texture_state_.is_outdated = true; } -void GPUDisplay::mark_texture_updated() +void PathTraceDisplay::mark_texture_updated() { texture_state_.is_outdated = false; texture_state_.is_usable = true; @@ -54,7 +59,7 @@ void GPUDisplay::mark_texture_updated() * Update procedure. 
*/ -bool GPUDisplay::update_begin(int texture_width, int texture_height) +bool PathTraceDisplay::update_begin(int texture_width, int texture_height) { DCHECK(!update_state_.is_active); @@ -66,15 +71,15 @@ bool GPUDisplay::update_begin(int texture_width, int texture_height) /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time. * The update itself is non-blocking however, for better performance and to avoid * potential deadlocks due to locks held by the subclass. */ - GPUDisplayParams params; + DisplayDriver::Params params; { thread_scoped_lock lock(mutex_); params = params_; texture_state_.size = make_int2(texture_width, texture_height); } - if (!do_update_begin(params, texture_width, texture_height)) { - LOG(ERROR) << "GPUDisplay implementation could not begin update."; + if (!driver_->update_begin(params, texture_width, texture_height)) { + LOG(ERROR) << "PathTraceDisplay implementation could not begin update."; return false; } @@ -83,7 +88,7 @@ bool GPUDisplay::update_begin(int texture_width, int texture_height) return true; } -void GPUDisplay::update_end() +void PathTraceDisplay::update_end() { DCHECK(update_state_.is_active); @@ -92,12 +97,12 @@ void GPUDisplay::update_end() return; } - do_update_end(); + driver_->update_end(); update_state_.is_active = false; } -int2 GPUDisplay::get_texture_size() const +int2 PathTraceDisplay::get_texture_size() const { return texture_state_.size; } @@ -106,25 +111,54 @@ int2 GPUDisplay::get_texture_size() const * Texture update from CPU buffer. 
*/ -void GPUDisplay::copy_pixels_to_texture( +void PathTraceDisplay::copy_pixels_to_texture( const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height) { DCHECK(update_state_.is_active); if (!update_state_.is_active) { - LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update."; + LOG(ERROR) << "Attempt to copy pixels data outside of PathTraceDisplay update."; return; } mark_texture_updated(); - do_copy_pixels_to_texture(rgba_pixels, texture_x, texture_y, pixels_width, pixels_height); + + /* This call copies pixels to a mapped texture buffer which is typically much cheaper from CPU + * time point of view than to copy data directly to a texture. + * + * The possible downside of this approach is that it might require a higher peak memory when + * doing partial updates of the texture (although, in practice even partial updates might peak + * with a full-frame buffer stored on the CPU if the GPU is currently occupied). */ + half4 *mapped_rgba_pixels = map_texture_buffer(); + if (!mapped_rgba_pixels) { + return; + } + + const int texture_width = texture_state_.size.x; + const int texture_height = texture_state_.size.y; + + if (texture_x == 0 && texture_y == 0 && pixels_width == texture_width && + pixels_height == texture_height) { + const size_t size_in_bytes = sizeof(half4) * texture_width * texture_height; + memcpy(mapped_rgba_pixels, rgba_pixels, size_in_bytes); + } + else { + const half4 *rgba_row = rgba_pixels; + half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_width + texture_x; + for (int y = 0; y < pixels_height; + ++y, rgba_row += pixels_width, mapped_rgba_row += texture_width) { + memcpy(mapped_rgba_row, rgba_row, sizeof(half4) * pixels_width); + } + } + + unmap_texture_buffer(); } /* -------------------------------------------------------------------- * Texture buffer mapping. 
*/ -half4 *GPUDisplay::map_texture_buffer() +half4 *PathTraceDisplay::map_texture_buffer() { DCHECK(!texture_buffer_state_.is_mapped); DCHECK(update_state_.is_active); @@ -135,11 +169,11 @@ half4 *GPUDisplay::map_texture_buffer() } if (!update_state_.is_active) { - LOG(ERROR) << "Attempt to copy pixels data outside of GPUDisplay update."; + LOG(ERROR) << "Attempt to copy pixels data outside of PathTraceDisplay update."; return nullptr; } - half4 *mapped_rgba_pixels = do_map_texture_buffer(); + half4 *mapped_rgba_pixels = driver_->map_texture_buffer(); if (mapped_rgba_pixels) { texture_buffer_state_.is_mapped = true; @@ -148,7 +182,7 @@ half4 *GPUDisplay::map_texture_buffer() return mapped_rgba_pixels; } -void GPUDisplay::unmap_texture_buffer() +void PathTraceDisplay::unmap_texture_buffer() { DCHECK(texture_buffer_state_.is_mapped); @@ -160,14 +194,14 @@ void GPUDisplay::unmap_texture_buffer() texture_buffer_state_.is_mapped = false; mark_texture_updated(); - do_unmap_texture_buffer(); + driver_->unmap_texture_buffer(); } /* -------------------------------------------------------------------- * Graphics interoperability. 
*/ -DeviceGraphicsInteropDestination GPUDisplay::graphics_interop_get() +DisplayDriver::GraphicsInterop PathTraceDisplay::graphics_interop_get() { DCHECK(!texture_buffer_state_.is_mapped); DCHECK(update_state_.is_active); @@ -175,38 +209,45 @@ DeviceGraphicsInteropDestination GPUDisplay::graphics_interop_get() if (texture_buffer_state_.is_mapped) { LOG(ERROR) << "Attempt to use graphics interoperability mode while the texture buffer is mapped."; - return DeviceGraphicsInteropDestination(); + return DisplayDriver::GraphicsInterop(); } if (!update_state_.is_active) { - LOG(ERROR) << "Attempt to use graphics interoperability outside of GPUDisplay update."; - return DeviceGraphicsInteropDestination(); + LOG(ERROR) << "Attempt to use graphics interoperability outside of PathTraceDisplay update."; + return DisplayDriver::GraphicsInterop(); } /* Assume that interop will write new values to the texture. */ mark_texture_updated(); - return do_graphics_interop_get(); + return driver_->graphics_interop_get(); } -void GPUDisplay::graphics_interop_activate() +void PathTraceDisplay::graphics_interop_activate() { + driver_->graphics_interop_activate(); } -void GPUDisplay::graphics_interop_deactivate() +void PathTraceDisplay::graphics_interop_deactivate() { + driver_->graphics_interop_deactivate(); } /* -------------------------------------------------------------------- * Drawing. */ -bool GPUDisplay::draw() +void PathTraceDisplay::clear() +{ + driver_->clear(); +} + +bool PathTraceDisplay::draw() { /* Get parameters within a mutex lock, to avoid reset() modifying them at the same time. * The drawing itself is non-blocking however, for better performance and to avoid * potential deadlocks due to locks held by the subclass. 
*/ - GPUDisplayParams params; + DisplayDriver::Params params; bool is_usable; bool is_outdated; @@ -218,7 +259,7 @@ bool GPUDisplay::draw() } if (is_usable) { - do_draw(params); + driver_->draw(params); } return !is_outdated; diff --git a/intern/cycles/render/gpu_display.h b/intern/cycles/integrator/path_trace_display.h index a01348d28d5..24aaa0df6b1 100644 --- a/intern/cycles/render/gpu_display.h +++ b/intern/cycles/integrator/path_trace_display.h @@ -16,52 +16,30 @@ #pragma once -#include "device/device_graphics_interop.h" +#include "render/display_driver.h" + #include "util/util_half.h" #include "util/util_thread.h" #include "util/util_types.h" +#include "util/util_unique_ptr.h" CCL_NAMESPACE_BEGIN class BufferParams; -/* GPUDisplay class takes care of drawing render result in a viewport. The render result is stored - * in a GPU-side texture, which is updated from a path tracer and drawn by an application. +/* PathTraceDisplay is used for efficient render buffer display. * - * The base GPUDisplay does some special texture state tracking, which allows render Session to - * make decisions on whether reset for an updated state is possible or not. This state should only - * be tracked in a base class and a particular implementation should not worry about it. + * The host application implements a DisplayDriver, storing a render pass in a GPU-side + * texture. This texture is continuously updated by the path tracer and drawn by the host + * application. * - * The subclasses should only implement the pure virtual methods, which allows them to not worry - * about parent method calls, which helps them to be as small and reliable as possible. */ - -class GPUDisplayParams { - public: - /* Offset of the display within a viewport. - * For example, set to a lower-bottom corner of border render in Blender's viewport. */ - int2 offset = make_int2(0, 0); - - /* Full viewport size. - * - * NOTE: Is not affected by the resolution divider. 
*/ - int2 full_size = make_int2(0, 0); - - /* Effective vieport size. - * In the case of border render, size of the border rectangle. - * - * NOTE: Is not affected by the resolution divider. */ - int2 size = make_int2(0, 0); - - bool modified(const GPUDisplayParams &other) const - { - return !(offset == other.offset && full_size == other.full_size && size == other.size); - } -}; + * PathTraceDisplay is a wrapper around the DisplayDriver, adding thread safety, state tracking + * and error checking. */ -class GPUDisplay { +class PathTraceDisplay { public: - GPUDisplay() = default; - virtual ~GPUDisplay() = default; + PathTraceDisplay(unique_ptr<DisplayDriver> driver); + virtual ~PathTraceDisplay() = default; /* Reset the display for the new state of render session. Is called whenever session is reset, * which happens on changes like viewport navigation or viewport dimension change. @@ -69,11 +47,6 @@ class GPUDisplay { * This call will configure parameters for a changed buffer and reset the texture state. */ void reset(const BufferParams &buffer_params); - const GPUDisplayParams &get_params() const - { - return params_; - } - /* -------------------------------------------------------------------- * Update procedure. * @@ -94,7 +67,8 @@ class GPUDisplay { /* -------------------------------------------------------------------- * Texture update from CPU buffer. * - * NOTE: The GPUDisplay should be marked for an update being in process with `update_begin()`. + * NOTE: The PathTraceDisplay should be marked for an update being in process with + * `update_begin()`. * * Most portable implementation, which must be supported by all platforms. Might not be the most * efficient one. @@ -115,7 +89,8 @@ class GPUDisplay { * This functionality is used to update GPU-side texture content without need to maintain CPU * side buffer on the caller. * - * NOTE: The GPUDisplay should be marked for an update being in process with `update_begin()`. 
+ * NOTE: The PathTraceDisplay should be marked for an update being in process with + * `update_begin()`. * * NOTE: Texture buffer can not be mapped while graphics interoperability is active. This means * that `map_texture_buffer()` is not allowed between `graphics_interop_begin()` and @@ -145,14 +120,14 @@ class GPUDisplay { * that `graphics_interop_get()` is not allowed between `map_texture_buffer()` and * `unmap_texture_buffer()` calls. */ - /* Get GPUDisplay graphics interoperability information which acts as a destination for the + /* Get PathTraceDisplay graphics interoperability information which acts as a destination for the * device API. */ - DeviceGraphicsInteropDestination graphics_interop_get(); + DisplayDriver::GraphicsInterop graphics_interop_get(); /* (De)activate GPU display for graphics interoperability outside of regular display update * routines. */ - virtual void graphics_interop_activate(); - virtual void graphics_interop_deactivate(); + void graphics_interop_activate(); + void graphics_interop_deactivate(); /* -------------------------------------------------------------------- * Drawing. @@ -163,47 +138,26 @@ class GPUDisplay { * This call might happen in parallel with draw, but can never happen in parallel with the * update. * - * The actual zero-ing can be deferred to a later moment. What is important is that after clear + * The actual zeroing can be deferred to a later moment. What is important is that after clear * and before pixels update the drawing texture will be fully empty, and that partial update * after clear will write new pixel values for an updating area, leaving everything else zeroed. * * If the GPU display supports graphics interoperability then the zeroing the display is to be - * delegated to the device via the `DeviceGraphicsInteropDestination`. */ - virtual void clear() = 0; + * delegated to the device via the `DisplayDriver::GraphicsInterop`. */ + void clear(); /* Draw the current state of the texture. 
* * Returns true if this call did draw an updated state of the texture. */ bool draw(); - protected: - /* Implementation-specific calls which subclasses are to implement. - * These `do_foo()` method corresponds to their `foo()` calls, but they are purely virtual to - * simplify their particular implementation. */ - virtual bool do_update_begin(const GPUDisplayParams ¶ms, - int texture_width, - int texture_height) = 0; - virtual void do_update_end() = 0; - - virtual void do_copy_pixels_to_texture(const half4 *rgba_pixels, - int texture_x, - int texture_y, - int pixels_width, - int pixels_height) = 0; - - virtual half4 *do_map_texture_buffer() = 0; - virtual void do_unmap_texture_buffer() = 0; - - /* Note that this might be called in parallel to do_update_begin() and do_update_end(), - * the subclass is responsible for appropriate mutex locks to avoid multiple threads - * editing and drawing the texture at the same time. */ - virtual void do_draw(const GPUDisplayParams ¶ms) = 0; - - virtual DeviceGraphicsInteropDestination do_graphics_interop_get() = 0; - private: + /* Display driver implemented by the host application. */ + unique_ptr<DisplayDriver> driver_; + + /* Current display parameters */ thread_mutex mutex_; - GPUDisplayParams params_; + DisplayDriver::Params params_; /* Mark texture as its content has been updated. * Used from places which knows that the texture content has been brought up-to-date, so that the diff --git a/intern/cycles/integrator/path_trace_tile.cpp b/intern/cycles/integrator/path_trace_tile.cpp new file mode 100644 index 00000000000..540f4aa5f68 --- /dev/null +++ b/intern/cycles/integrator/path_trace_tile.cpp @@ -0,0 +1,107 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "integrator/path_trace_tile.h" +#include "integrator/pass_accessor_cpu.h" +#include "integrator/path_trace.h" + +#include "render/buffers.h" +#include "render/film.h" +#include "render/pass.h" +#include "render/scene.h" + +CCL_NAMESPACE_BEGIN + +PathTraceTile::PathTraceTile(PathTrace &path_trace) + : OutputDriver::Tile(path_trace.get_render_tile_offset(), + path_trace.get_render_tile_size(), + path_trace.get_render_size(), + path_trace.get_render_tile_params().layer, + path_trace.get_render_tile_params().view), + path_trace_(path_trace), + copied_from_device_(false) +{ +} + +bool PathTraceTile::get_pass_pixels(const string_view pass_name, + const int num_channels, + float *pixels) const +{ + /* NOTE: The code relies on a fact that session is fully update and no scene/buffer modification + * is happening while this function runs. */ + + if (!copied_from_device_) { + /* Copy from device on demand. */ + path_trace_.copy_render_tile_from_device(); + const_cast<PathTraceTile *>(this)->copied_from_device_ = true; + } + + const BufferParams &buffer_params = path_trace_.get_render_tile_params(); + + const BufferPass *pass = buffer_params.find_pass(pass_name); + if (pass == nullptr) { + return false; + } + + const bool has_denoised_result = path_trace_.has_denoised_result(); + if (pass->mode == PassMode::DENOISED && !has_denoised_result) { + pass = buffer_params.find_pass(pass->type); + if (pass == nullptr) { + /* Happens when denoised result pass is requested but is never written by the kernel. 
*/ + return false; + } + } + + pass = buffer_params.get_actual_display_pass(pass); + + const float exposure = buffer_params.exposure; + const int num_samples = path_trace_.get_num_render_tile_samples(); + + PassAccessor::PassAccessInfo pass_access_info(*pass); + pass_access_info.use_approximate_shadow_catcher = buffer_params.use_approximate_shadow_catcher; + pass_access_info.use_approximate_shadow_catcher_background = + pass_access_info.use_approximate_shadow_catcher && !buffer_params.use_transparent_background; + + const PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples); + const PassAccessor::Destination destination(pixels, num_channels); + + return path_trace_.get_render_tile_pixels(pass_accessor, destination); +} + +bool PathTraceTile::set_pass_pixels(const string_view pass_name, + const int num_channels, + const float *pixels) const +{ + /* NOTE: The code relies on a fact that session is fully update and no scene/buffer modification + * is happening while this function runs. */ + + const BufferParams &buffer_params = path_trace_.get_render_tile_params(); + const BufferPass *pass = buffer_params.find_pass(pass_name); + if (!pass) { + return false; + } + + const float exposure = buffer_params.exposure; + const int num_samples = 1; + + const PassAccessor::PassAccessInfo pass_access_info(*pass); + PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples); + PassAccessor::Source source(pixels, num_channels); + + return path_trace_.set_render_tile_pixels(pass_accessor, source); +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_tile.h b/intern/cycles/integrator/path_trace_tile.h new file mode 100644 index 00000000000..fd3e2969f6c --- /dev/null +++ b/intern/cycles/integrator/path_trace_tile.h @@ -0,0 +1,43 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "render/output_driver.h" + +CCL_NAMESPACE_BEGIN + +/* PathTraceTile + * + * Implementation of OutputDriver::Tile interface for path tracer. */ + +class PathTrace; + +class PathTraceTile : public OutputDriver::Tile { + public: + PathTraceTile(PathTrace &path_trace); + + bool get_pass_pixels(const string_view pass_name, const int num_channels, float *pixels) const; + bool set_pass_pixels(const string_view pass_name, + const int num_channels, + const float *pixels) const; + + private: + PathTrace &path_trace_; + bool copied_from_device_; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/integrator/path_trace_work.cpp b/intern/cycles/integrator/path_trace_work.cpp index d9634acac10..c29177907c9 100644 --- a/intern/cycles/integrator/path_trace_work.cpp +++ b/intern/cycles/integrator/path_trace_work.cpp @@ -16,12 +16,12 @@ #include "device/device.h" +#include "integrator/path_trace_display.h" #include "integrator/path_trace_work.h" #include "integrator/path_trace_work_cpu.h" #include "integrator/path_trace_work_gpu.h" #include "render/buffers.h" #include "render/film.h" -#include "render/gpu_display.h" #include "render/scene.h" #include "kernel/kernel_types.h" @@ -185,12 +185,12 @@ PassAccessor::PassAccessInfo PathTraceWork::get_display_pass_access_info(PassMod return pass_access_info; } -PassAccessor::Destination PathTraceWork::get_gpu_display_destination_template( - const GPUDisplay *gpu_display) const +PassAccessor::Destination PathTraceWork::get_display_destination_template( + const PathTraceDisplay *display) const 
{ PassAccessor::Destination destination(film_->get_display_pass()); - const int2 display_texture_size = gpu_display->get_texture_size(); + const int2 display_texture_size = display->get_texture_size(); const int texture_x = effective_buffer_params_.full_x - effective_full_params_.full_x; const int texture_y = effective_buffer_params_.full_y - effective_full_params_.full_y; diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h index 8c9c8811199..404165b7c55 100644 --- a/intern/cycles/integrator/path_trace_work.h +++ b/intern/cycles/integrator/path_trace_work.h @@ -28,7 +28,7 @@ class BufferParams; class Device; class DeviceScene; class Film; -class GPUDisplay; +class PathTraceDisplay; class RenderBuffers; class PathTraceWork { @@ -83,11 +83,9 @@ class PathTraceWork { * noisy pass mode will be passed here when it is known that the buffer does not have denoised * passes yet (because denoiser did not run). If the denoised pass is requested and denoiser is * not used then this function will fall-back to the noisy pass instead. */ - virtual void copy_to_gpu_display(GPUDisplay *gpu_display, - PassMode pass_mode, - int num_samples) = 0; + virtual void copy_to_display(PathTraceDisplay *display, PassMode pass_mode, int num_samples) = 0; - virtual void destroy_gpu_resources(GPUDisplay *gpu_display) = 0; + virtual void destroy_gpu_resources(PathTraceDisplay *display) = 0; /* Copy data from/to given render buffers. * Will copy pixels from a corresponding place (from multi-device point of view) of the render @@ -104,7 +102,7 @@ class PathTraceWork { * - Copies work's render buffer to its device. */ void copy_from_render_buffers(const RenderBuffers *render_buffers); - /* Special version of the `copy_from_render_buffers()` which only copies denosied passes from the + /* Special version of the `copy_from_render_buffers()` which only copies denoised passes from the * given render buffers, leaving rest of the passes. 
* * Same notes about device copying applies to this call as well. */ @@ -162,8 +160,8 @@ class PathTraceWork { /* Get destination which offset and stride are configured so that writing to it will write to a * proper location of GPU display texture, taking current tile and device slice into account. */ - PassAccessor::Destination get_gpu_display_destination_template( - const GPUDisplay *gpu_display) const; + PassAccessor::Destination get_display_destination_template( + const PathTraceDisplay *display) const; /* Device which will be used for path tracing. * Note that it is an actual render device (and never is a multi-device). */ diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp index b9a33b64051..18a5365453d 100644 --- a/intern/cycles/integrator/path_trace_work_cpu.cpp +++ b/intern/cycles/integrator/path_trace_work_cpu.cpp @@ -19,10 +19,12 @@ #include "device/cpu/kernel.h" #include "device/device.h" +#include "kernel/kernel_path_state.h" + #include "integrator/pass_accessor_cpu.h" +#include "integrator/path_trace_display.h" #include "render/buffers.h" -#include "render/gpu_display.h" #include "render/scene.h" #include "util/util_atomic.h" @@ -116,13 +118,17 @@ void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_global const KernelWorkTile &work_tile, const int samples_num) { - const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher; const bool has_bake = device_scene_->data.bake.use; - IntegratorStateCPU integrator_states[2] = {}; + IntegratorStateCPU integrator_states[2]; IntegratorStateCPU *state = &integrator_states[0]; - IntegratorStateCPU *shadow_catcher_state = &integrator_states[1]; + IntegratorStateCPU *shadow_catcher_state = nullptr; + + if (device_scene_->data.integrator.has_shadow_catcher) { + shadow_catcher_state = &integrator_states[1]; + path_state_init_queues(kernel_globals, shadow_catcher_state); + } KernelWorkTile sample_work_tile = 
work_tile; float *render_buffer = buffers_->buffer.data(); @@ -147,7 +153,7 @@ void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_global kernels_.integrator_megakernel(kernel_globals, state, render_buffer); - if (has_shadow_catcher) { + if (shadow_catcher_state) { kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer); } @@ -155,14 +161,14 @@ void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_global } } -void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display, - PassMode pass_mode, - int num_samples) +void PathTraceWorkCPU::copy_to_display(PathTraceDisplay *display, + PassMode pass_mode, + int num_samples) { - half4 *rgba_half = gpu_display->map_texture_buffer(); + half4 *rgba_half = display->map_texture_buffer(); if (!rgba_half) { - /* TODO(sergey): Look into using copy_to_gpu_display() if mapping failed. Might be needed for - * some implementations of GPUDisplay which can not map memory? */ + /* TODO(sergey): Look into using copy_to_display() if mapping failed. Might be needed for + * some implementations of PathTraceDisplay which can not map memory? 
*/ return; } @@ -172,7 +178,7 @@ void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display, const PassAccessorCPU pass_accessor(pass_access_info, kfilm.exposure, num_samples); - PassAccessor::Destination destination = get_gpu_display_destination_template(gpu_display); + PassAccessor::Destination destination = get_display_destination_template(display); destination.pixels_half_rgba = rgba_half; tbb::task_arena local_arena = local_tbb_arena_create(device_); @@ -180,10 +186,10 @@ void PathTraceWorkCPU::copy_to_gpu_display(GPUDisplay *gpu_display, pass_accessor.get_render_tile_pixels(buffers_.get(), effective_buffer_params_, destination); }); - gpu_display->unmap_texture_buffer(); + display->unmap_texture_buffer(); } -void PathTraceWorkCPU::destroy_gpu_resources(GPUDisplay * /*gpu_display*/) +void PathTraceWorkCPU::destroy_gpu_resources(PathTraceDisplay * /*display*/) { } diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h index ab729bbf879..d011e8d05bd 100644 --- a/intern/cycles/integrator/path_trace_work_cpu.h +++ b/intern/cycles/integrator/path_trace_work_cpu.h @@ -50,10 +50,10 @@ class PathTraceWorkCPU : public PathTraceWork { int start_sample, int samples_num) override; - virtual void copy_to_gpu_display(GPUDisplay *gpu_display, - PassMode pass_mode, - int num_samples) override; - virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override; + virtual void copy_to_display(PathTraceDisplay *display, + PassMode pass_mode, + int num_samples) override; + virtual void destroy_gpu_resources(PathTraceDisplay *display) override; virtual bool copy_render_buffers_from_device() override; virtual bool copy_render_buffers_to_device() override; diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp index 135466becc6..17c49f244d2 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.cpp +++ b/intern/cycles/integrator/path_trace_work_gpu.cpp @@ 
-15,12 +15,12 @@ */ #include "integrator/path_trace_work_gpu.h" +#include "integrator/path_trace_display.h" #include "device/device.h" #include "integrator/pass_accessor_gpu.h" #include "render/buffers.h" -#include "render/gpu_display.h" #include "render/scene.h" #include "util/util_logging.h" #include "util/util_tbb.h" @@ -46,7 +46,7 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device, queued_paths_(device, "queued_paths", MEM_READ_WRITE), num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE), work_tiles_(device, "work_tiles", MEM_READ_WRITE), - gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE), + display_rgba_half_(device, "display buffer half", MEM_READ_WRITE), max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorStateCPU))), min_num_active_paths_(queue_->num_concurrent_busy_states()), max_active_path_index_(0) @@ -95,8 +95,8 @@ void PathTraceWorkGPU::alloc_integrator_soa() #define KERNEL_STRUCT_END(name) \ break; \ } -#define KERNEL_STRUCT_END_ARRAY(name, array_size) \ - if (array_index == array_size - 1) { \ +#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \ + if (array_index == gpu_array_size - 1) { \ break; \ } \ } @@ -652,7 +652,7 @@ int PathTraceWorkGPU::get_num_active_paths() bool PathTraceWorkGPU::should_use_graphics_interop() { /* There are few aspects with the graphics interop when using multiple devices caused by the fact - * that the GPUDisplay has a single texture: + * that the PathTraceDisplay has a single texture: * * CUDA will return `CUDA_ERROR_NOT_SUPPORTED` from `cuGraphicsGLRegisterBuffer()` when * attempting to register OpenGL PBO which has been mapped. 
Which makes sense, because @@ -678,9 +678,9 @@ bool PathTraceWorkGPU::should_use_graphics_interop() return interop_use_; } -void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display, - PassMode pass_mode, - int num_samples) +void PathTraceWorkGPU::copy_to_display(PathTraceDisplay *display, + PassMode pass_mode, + int num_samples) { if (device_->have_error()) { /* Don't attempt to update GPU display if the device has errors: the error state will make @@ -694,7 +694,7 @@ void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display, } if (should_use_graphics_interop()) { - if (copy_to_gpu_display_interop(gpu_display, pass_mode, num_samples)) { + if (copy_to_display_interop(display, pass_mode, num_samples)) { return; } @@ -703,12 +703,12 @@ void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display, interop_use_ = false; } - copy_to_gpu_display_naive(gpu_display, pass_mode, num_samples); + copy_to_display_naive(display, pass_mode, num_samples); } -void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display, - PassMode pass_mode, - int num_samples) +void PathTraceWorkGPU::copy_to_display_naive(PathTraceDisplay *display, + PassMode pass_mode, + int num_samples) { const int full_x = effective_buffer_params_.full_x; const int full_y = effective_buffer_params_.full_y; @@ -725,43 +725,42 @@ void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display, * NOTE: allocation happens to the final resolution so that no re-allocation happens on every * change of the resolution divider. However, if the display becomes smaller, shrink the * allocated memory as well. 
*/ - if (gpu_display_rgba_half_.data_width != final_width || - gpu_display_rgba_half_.data_height != final_height) { - gpu_display_rgba_half_.alloc(final_width, final_height); + if (display_rgba_half_.data_width != final_width || + display_rgba_half_.data_height != final_height) { + display_rgba_half_.alloc(final_width, final_height); /* TODO(sergey): There should be a way to make sure device-side memory is allocated without * transferring zeroes to the device. */ - queue_->zero_to_device(gpu_display_rgba_half_); + queue_->zero_to_device(display_rgba_half_); } PassAccessor::Destination destination(film_->get_display_pass()); - destination.d_pixels_half_rgba = gpu_display_rgba_half_.device_pointer; + destination.d_pixels_half_rgba = display_rgba_half_.device_pointer; get_render_tile_film_pixels(destination, pass_mode, num_samples); - gpu_display_rgba_half_.copy_from_device(); + queue_->copy_from_device(display_rgba_half_); + queue_->synchronize(); - gpu_display->copy_pixels_to_texture( - gpu_display_rgba_half_.data(), texture_x, texture_y, width, height); + display->copy_pixels_to_texture(display_rgba_half_.data(), texture_x, texture_y, width, height); } -bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display, - PassMode pass_mode, - int num_samples) +bool PathTraceWorkGPU::copy_to_display_interop(PathTraceDisplay *display, + PassMode pass_mode, + int num_samples) { if (!device_graphics_interop_) { device_graphics_interop_ = queue_->graphics_interop_create(); } - const DeviceGraphicsInteropDestination graphics_interop_dst = - gpu_display->graphics_interop_get(); - device_graphics_interop_->set_destination(graphics_interop_dst); + const DisplayDriver::GraphicsInterop graphics_interop_dst = display->graphics_interop_get(); + device_graphics_interop_->set_display_interop(graphics_interop_dst); const device_ptr d_rgba_half = device_graphics_interop_->map(); if (!d_rgba_half) { return false; } - PassAccessor::Destination destination = 
get_gpu_display_destination_template(gpu_display); + PassAccessor::Destination destination = get_display_destination_template(display); destination.d_pixels_half_rgba = d_rgba_half; get_render_tile_film_pixels(destination, pass_mode, num_samples); @@ -771,14 +770,14 @@ bool PathTraceWorkGPU::copy_to_gpu_display_interop(GPUDisplay *gpu_display, return true; } -void PathTraceWorkGPU::destroy_gpu_resources(GPUDisplay *gpu_display) +void PathTraceWorkGPU::destroy_gpu_resources(PathTraceDisplay *display) { if (!device_graphics_interop_) { return; } - gpu_display->graphics_interop_activate(); + display->graphics_interop_activate(); device_graphics_interop_ = nullptr; - gpu_display->graphics_interop_deactivate(); + display->graphics_interop_deactivate(); } void PathTraceWorkGPU::get_render_tile_film_pixels(const PassAccessor::Destination &destination, diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h index 38788122b0d..9212537d2fd 100644 --- a/intern/cycles/integrator/path_trace_work_gpu.h +++ b/intern/cycles/integrator/path_trace_work_gpu.h @@ -48,10 +48,10 @@ class PathTraceWorkGPU : public PathTraceWork { int start_sample, int samples_num) override; - virtual void copy_to_gpu_display(GPUDisplay *gpu_display, - PassMode pass_mode, - int num_samples) override; - virtual void destroy_gpu_resources(GPUDisplay *gpu_display) override; + virtual void copy_to_display(PathTraceDisplay *display, + PassMode pass_mode, + int num_samples) override; + virtual void destroy_gpu_resources(PathTraceDisplay *display) override; virtual bool copy_render_buffers_from_device() override; virtual bool copy_render_buffers_to_device() override; @@ -88,16 +88,16 @@ class PathTraceWorkGPU : public PathTraceWork { int get_num_active_paths(); - /* Check whether graphics interop can be used for the GPUDisplay update. */ + /* Check whether graphics interop can be used for the PathTraceDisplay update. 
*/ bool should_use_graphics_interop(); - /* Naive implementation of the `copy_to_gpu_display()` which performs film conversion on the - * device, then copies pixels to the host and pushes them to the `gpu_display`. */ - void copy_to_gpu_display_naive(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples); + /* Naive implementation of the `copy_to_display()` which performs film conversion on the + * device, then copies pixels to the host and pushes them to the `display`. */ + void copy_to_display_naive(PathTraceDisplay *display, PassMode pass_mode, int num_samples); - /* Implementation of `copy_to_gpu_display()` which uses driver's OpenGL/GPU interoperability + /* Implementation of `copy_to_display()` which uses driver's OpenGL/GPU interoperability * functionality, avoiding copy of pixels to the host. */ - bool copy_to_gpu_display_interop(GPUDisplay *gpu_display, PassMode pass_mode, int num_samples); + bool copy_to_display_interop(PathTraceDisplay *display, PassMode pass_mode, int num_samples); /* Synchronously run film conversion kernel and store display result in the given destination. */ void get_render_tile_film_pixels(const PassAccessor::Destination &destination, @@ -139,9 +139,9 @@ class PathTraceWorkGPU : public PathTraceWork { /* Temporary buffer for passing work tiles to kernel. */ device_vector<KernelWorkTile> work_tiles_; - /* Temporary buffer used by the copy_to_gpu_display() whenever graphics interoperability is not + /* Temporary buffer used by the copy_to_display() whenever graphics interoperability is not * available. Is allocated on-demand. 
*/ - device_vector<half4> gpu_display_rgba_half_; + device_vector<half4> display_rgba_half_; unique_ptr<DeviceGraphicsInterop> device_graphics_interop_; diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp index 3e5b3417a6a..322d3d5f94c 100644 --- a/intern/cycles/integrator/render_scheduler.cpp +++ b/intern/cycles/integrator/render_scheduler.cpp @@ -384,7 +384,7 @@ bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work) } if (denoiser_params_.use && !state_.last_work_tile_was_denoised) { - render_work->tile.denoise = true; + render_work->tile.denoise = !tile_manager_.has_multiple_tiles(); any_scheduled = true; } @@ -903,6 +903,12 @@ bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display) return false; } + /* When multiple tiles are used the full frame will be denoised. + * Avoid per-tile denoising to save up render time. */ + if (tile_manager_.has_multiple_tiles()) { + return false; + } + if (done()) { /* Always denoise at the last sample. */ return true; diff --git a/intern/cycles/integrator/render_scheduler.h b/intern/cycles/integrator/render_scheduler.h index b7b598fb10c..c4ab15e54ba 100644 --- a/intern/cycles/integrator/render_scheduler.h +++ b/intern/cycles/integrator/render_scheduler.h @@ -31,7 +31,7 @@ class RenderWork { int resolution_divider = 1; /* Initialize render buffers. - * Includes steps like zero-ing the buffer on the device, and optional reading of pixels from the + * Includes steps like zeroing the buffer on the device, and optional reading of pixels from the * baking target. */ bool init_render_buffers = false; @@ -344,7 +344,7 @@ class RenderScheduler { /* Number of rendered samples on top of the start sample. */ int num_rendered_samples = 0; - /* Point in time the latest GPUDisplay work has been scheduled. */ + /* Point in time the latest PathTraceDisplay work has been scheduled. 
*/ double last_display_update_time = 0.0; /* Value of -1 means display was never updated. */ int last_display_update_sample = -1; diff --git a/intern/cycles/integrator/shader_eval.cpp b/intern/cycles/integrator/shader_eval.cpp index d35ff4cd03f..a14e41ec5be 100644 --- a/intern/cycles/integrator/shader_eval.cpp +++ b/intern/cycles/integrator/shader_eval.cpp @@ -149,14 +149,14 @@ bool ShaderEval::eval_gpu(Device *device, /* Execute work on GPU in chunk, so we can cancel. * TODO : query appropriate size from device.*/ - const int chunk_size = 65536; + const int64_t chunk_size = 65536; - const int work_size = output.size(); + const int64_t work_size = output.size(); void *d_input = (void *)input.device_pointer; void *d_output = (void *)output.device_pointer; - for (int d_offset = 0; d_offset < work_size; d_offset += chunk_size) { - int d_work_size = min(chunk_size, work_size - d_offset); + for (int64_t d_offset = 0; d_offset < work_size; d_offset += chunk_size) { + int64_t d_work_size = std::min(chunk_size, work_size - d_offset); void *args[] = {&d_input, &d_output, &d_offset, &d_work_size}; queue->enqueue(kernel, d_work_size, args); diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 4196539a9b1..7b56216e887 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -35,6 +35,10 @@ set(SRC_DEVICE_CUDA device/cuda/kernel.cu ) +set(SRC_DEVICE_HIP + device/hip/kernel.cpp +) + set(SRC_DEVICE_OPTIX device/optix/kernel.cu device/optix/kernel_shader_raytrace.cu @@ -106,6 +110,12 @@ set(SRC_DEVICE_CUDA_HEADERS device/cuda/globals.h ) +set(SRC_DEVICE_HIP_HEADERS + device/hip/compat.h + device/hip/config.h + device/hip/globals.h +) + set(SRC_DEVICE_OPTIX_HEADERS device/optix/compat.h device/optix/globals.h @@ -458,6 +468,104 @@ if(WITH_CYCLES_CUDA_BINARIES) cycles_set_solution_folder(cycles_kernel_cuda) endif() +####################################################### START + +# HIP module + 
+if(WITH_CYCLES_HIP_BINARIES) + # 64 bit only + set(HIP_BITS 64) + + # HIP version + execute_process(COMMAND ${HIP_HIPCC_EXECUTABLE} "--version" OUTPUT_VARIABLE HIPCC_OUT) + string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" HIP_VERSION_MAJOR "${HIPCC_OUT}") + string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" HIP_VERSION_MINOR "${HIPCC_OUT}") + set(HIP_VERSION "${HIP_VERSION_MAJOR}${HIP_VERSION_MINOR}") + + + message(WARNING + "HIP version ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR} detected") + + # build for each arch + set(hip_sources device/hip/kernel.cpp + ${SRC_HEADERS} + ${SRC_DEVICE_HIP_HEADERS} + ${SRC_BVH_HEADERS} + ${SRC_SVM_HEADERS} + ${SRC_GEOM_HEADERS} + ${SRC_INTEGRATOR_HEADERS} + ${SRC_CLOSURE_HEADERS} + ${SRC_UTIL_HEADERS} + ) + set(hip_fatbins) + + macro(CYCLES_HIP_KERNEL_ADD arch prev_arch name flags sources experimental) + if(${arch} MATCHES "compute_.*") + set(format "ptx") + else() + set(format "fatbin") + endif() + set(hip_file ${name}_${arch}.${format}) + + set(kernel_sources ${sources}) + if(NOT ${prev_arch} STREQUAL "none") + if(${prev_arch} MATCHES "compute_.*") + set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.ptx) + else() + set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.fatbin) + endif() + endif() + + set(hip_kernel_src "/device/hip/${name}.cpp") + + set(hip_flags ${flags} + -D CCL_NAMESPACE_BEGIN= + -D CCL_NAMESPACE_END= + -D HIPCC + -m ${HIP_BITS} + -I ${CMAKE_CURRENT_SOURCE_DIR}/.. 
+ -I ${CMAKE_CURRENT_SOURCE_DIR}/device/hip + --use_fast_math + -o ${CMAKE_CURRENT_BINARY_DIR}/${hip_file}) + + if(${experimental}) + set(hip_flags ${hip_flags} -D __KERNEL_EXPERIMENTAL__) + set(name ${name}_experimental) + endif() + + if(WITH_CYCLES_DEBUG) + set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__) + endif() + + if(WITH_NANOVDB) + set(hip_flags ${hip_flags} + -D WITH_NANOVDB + -I "${NANOVDB_INCLUDE_DIR}") + endif() + endmacro() + + set(prev_arch "none") + foreach(arch ${CYCLES_HIP_BINARIES_ARCH}) + set(hip_hipcc_executable ${HIP_HIPCC_EXECUTABLE}) + set(hip_toolkit_root_dir ${HIP_TOOLKIT_ROOT_DIR}) + if(DEFINED hip_hipcc_executable AND DEFINED hip_toolkit_root_dir) + # Compile regular kernel + CYCLES_HIP_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${hip_sources}" FALSE) + + if(WITH_CYCLES_HIP_BUILD_SERIAL) + set(prev_arch ${arch}) + endif() + + unset(hip_hipcc_executable) + unset(hip_toolkit_root_dir) + endif() + endforeach() + + add_custom_target(cycles_kernel_hip ALL DEPENDS ${hip_fatbins}) + cycles_set_solution_folder(cycles_kernel_hip) +endif() + +####################################################### END # OptiX PTX modules if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) @@ -602,11 +710,13 @@ endif() cycles_add_library(cycles_kernel "${LIB}" ${SRC_DEVICE_CPU} ${SRC_DEVICE_CUDA} + ${SRC_DEVICE_HIP} ${SRC_DEVICE_OPTIX} ${SRC_HEADERS} ${SRC_DEVICE_CPU_HEADERS} ${SRC_DEVICE_GPU_HEADERS} ${SRC_DEVICE_CUDA_HEADERS} + ${SRC_DEVICE_HIP_HEADERS} ${SRC_DEVICE_OPTIX_HEADERS} ${SRC_BVH_HEADERS} ${SRC_CLOSURE_HEADERS} @@ -621,6 +731,7 @@ source_group("geom" FILES ${SRC_GEOM_HEADERS}) source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS}) source_group("kernel" FILES ${SRC_HEADERS}) source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS}) +source_group("device\\hip" FILES ${SRC_DEVICE_HIP} ${SRC_DEVICE_HIP_HEADERS}) source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS}) source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} 
${SRC_DEVICE_CUDA_HEADERS}) source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS}) @@ -632,14 +743,19 @@ endif() if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) add_dependencies(cycles_kernel cycles_kernel_optix) endif() +if(WITH_CYCLES_HIP) + add_dependencies(cycles_kernel cycles_kernel_hip) +endif() # Install kernel source for runtime compilation delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda) +delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure) diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h index 539e9fd05fb..0b44cc5db34 100644 --- a/intern/cycles/kernel/bvh/bvh.h +++ b/intern/cycles/kernel/bvh/bvh.h @@ -167,15 +167,25 @@ ccl_device_intersect bool scene_intersect(const KernelGlobals *kg, uint p4 = visibility; uint p5 = PRIMITIVE_NONE; + uint ray_mask = visibility & 0xFF; + uint ray_flags = OPTIX_RAY_FLAG_NONE; + if (0 == ray_mask && (visibility & ~0xFF) != 
0) { + ray_mask = 0xFF; + ray_flags = OPTIX_RAY_FLAG_ENFORCE_ANYHIT; + } + else if (visibility & PATH_RAY_SHADOW_OPAQUE) { + ray_flags = OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT; + } + optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0, ray->P, ray->D, 0.0f, ray->t, ray->time, - 0xF, - OPTIX_RAY_FLAG_NONE, - 0, // SBT offset for PG_HITD + ray_mask, + ray_flags, + 0, /* SBT offset for PG_HITD */ 0, 0, p0, @@ -251,11 +261,11 @@ ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg, uint p2 = ((uint64_t)local_isect) & 0xFFFFFFFF; uint p3 = (((uint64_t)local_isect) >> 32) & 0xFFFFFFFF; uint p4 = local_object; - // Is set to zero on miss or if ray is aborted, so can be used as return value + /* Is set to zero on miss or if ray is aborted, so can be used as return value. */ uint p5 = max_hits; if (local_isect) { - local_isect->num_hits = 0; // Initialize hit count to zero + local_isect->num_hits = 0; /* Initialize hit count to zero. */ } optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0, ray->P, @@ -263,11 +273,10 @@ ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg, 0.0f, ray->t, ray->time, - // Skip curves - 0x3, - // Need to always call into __anyhit__kernel_optix_local_hit + 0xFF, + /* Need to always call into __anyhit__kernel_optix_local_hit. */ OPTIX_RAY_FLAG_ENFORCE_ANYHIT, - 2, // SBT offset for PG_HITL + 2, /* SBT offset for PG_HITL */ 0, 0, p0, @@ -365,17 +374,22 @@ ccl_device_intersect bool scene_intersect_shadow_all(const KernelGlobals *kg, uint p4 = visibility; uint p5 = false; - *num_hits = 0; // Initialize hit count to zero + uint ray_mask = visibility & 0xFF; + if (0 == ray_mask && (visibility & ~0xFF) != 0) { + ray_mask = 0xFF; + } + + *num_hits = 0; /* Initialize hit count to zero. */ optixTrace(scene_intersect_valid(ray) ? 
kernel_data.bvh.scene : 0, ray->P, ray->D, 0.0f, ray->t, ray->time, - 0xF, - // Need to always call into __anyhit__kernel_optix_shadow_all_hit + ray_mask, + /* Need to always call into __anyhit__kernel_optix_shadow_all_hit. */ OPTIX_RAY_FLAG_ENFORCE_ANYHIT, - 1, // SBT offset for PG_HITS + 1, /* SBT offset for PG_HITS */ 0, 0, p0, @@ -444,16 +458,21 @@ ccl_device_intersect bool scene_intersect_volume(const KernelGlobals *kg, uint p4 = visibility; uint p5 = PRIMITIVE_NONE; + uint ray_mask = visibility & 0xFF; + if (0 == ray_mask && (visibility & ~0xFF) != 0) { + ray_mask = 0xFF; + } + optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0, ray->P, ray->D, 0.0f, ray->t, ray->time, - // Skip everything but volumes - 0x2, - OPTIX_RAY_FLAG_NONE, - 0, // SBT offset for PG_HITD + ray_mask, + /* Need to always call into __anyhit__kernel_optix_volume_test. */ + OPTIX_RAY_FLAG_ENFORCE_ANYHIT, + 3, /* SBT offset for PG_HITV */ 0, 0, p0, diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index 85500bf4d07..db4a4bf71e0 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -21,11 +21,15 @@ CCL_NAMESPACE_BEGIN /* Given an array of states, build an array of indices for which the states * are active. * - * Shared memory requirement is sizeof(int) * (number_of_warps + 1) */ + * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. 
*/ #include "util/util_atomic.h" -#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 +#endif template<uint blocksize, typename IsActiveOp> __device__ void gpu_parallel_active_index_array(const uint num_states, diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index f609520b8b4..a1349e82efb 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -27,7 +27,11 @@ CCL_NAMESPACE_BEGIN #include "util/util_atomic.h" -#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 +#endif template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values) { diff --git a/intern/cycles/kernel/device/gpu/parallel_reduce.h b/intern/cycles/kernel/device/gpu/parallel_reduce.h index 65b1990dbb8..b60dceb2ed0 100644 --- a/intern/cycles/kernel/device/gpu/parallel_reduce.h +++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h @@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN * the overall cost of the algorithm while keeping the work complexity O(n) and * the step complexity O(log n). 
(Brent's Theorem optimization) */ -#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512 +#endif template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp> __device__ void gpu_parallel_sum( diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index 99b35468517..9bca1fad22f 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN #include "util/util_atomic.h" -#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 +#ifdef __HIP__ +# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024 +#else +# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512 +#endif #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) template<uint blocksize, typename GetKeyOp> diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h new file mode 100644 index 00000000000..95338fe7d6e --- /dev/null +++ b/intern/cycles/kernel/device/hip/compat.h @@ -0,0 +1,121 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#define __KERNEL_GPU__ +#define __KERNEL_HIP__ +#define CCL_NAMESPACE_BEGIN +#define CCL_NAMESPACE_END + +#ifndef ATTR_FALLTHROUGH +# define ATTR_FALLTHROUGH +#endif + +#ifdef __HIPCC_RTC__ +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +#else +# include <stdint.h> +#endif + +#ifdef CYCLES_HIPBIN_CC +# define FLT_MIN 1.175494350822287507969e-38f +# define FLT_MAX 340282346638528859811704183484516925440.0f +# define FLT_EPSILON 1.192092896e-07F +#endif + +/* Qualifiers */ + +#define ccl_device __device__ __inline__ +#define ccl_device_inline __device__ __inline__ +#define ccl_device_forceinline __device__ __forceinline__ +#define ccl_device_noinline __device__ __noinline__ +#define ccl_device_noinline_cpu ccl_device +#define ccl_global +#define ccl_static_constant __constant__ +#define ccl_device_constant __constant__ __device__ +#define ccl_constant const +#define ccl_gpu_shared __shared__ +#define ccl_private +#define ccl_may_alias +#define ccl_addr_space +#define ccl_restrict __restrict__ +#define ccl_loop_no_unroll +#define ccl_align(n) __align__(n) +#define ccl_optional_struct_init + +#define kernel_assert(cond) + +/* Types */ +#ifdef __HIP__ +# include "hip/hip_fp16.h" +# include "hip/hip_runtime.h" +#endif + +#ifdef _MSC_VER +# include <immintrin.h> +#endif + +#define ccl_gpu_thread_idx_x (threadIdx.x) +#define ccl_gpu_block_dim_x (blockDim.x) +#define ccl_gpu_block_idx_x (blockIdx.x) +#define ccl_gpu_grid_dim_x (gridDim.x) +#define ccl_gpu_warp_size (warpSize) + +#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) +#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) + +/* GPU warp synchronization */ + +#define ccl_gpu_syncthreads() __syncthreads() +#define ccl_gpu_ballot(predicate) __ballot(predicate) +#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla) +#define ccl_gpu_popc(x) __popc(x) + +/* GPU texture objects */ +typedef 
hipTextureObject_t ccl_gpu_tex_object; + +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj, + const float x, + const float y) +{ + return tex2D<T>(texobj, x, y); +} + +template<typename T> +ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj, + const float x, + const float y, + const float z) +{ + return tex3D<T>(texobj, x, y, z); +} + +/* Use fast math functions */ + +#define cosf(x) __cosf(((float)(x))) +#define sinf(x) __sinf(((float)(x))) +#define powf(x, y) __powf(((float)(x)), ((float)(y))) +#define tanf(x) __tanf(((float)(x))) +#define logf(x) __logf(((float)(x))) +#define expf(x) __expf(((float)(x))) + +/* Types */ + +#include "util/util_half.h" +#include "util/util_types.h" diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h new file mode 100644 index 00000000000..2fde0d46015 --- /dev/null +++ b/intern/cycles/kernel/device/hip/config.h @@ -0,0 +1,57 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Device data taken from HIP occupancy calculator. 
+ * + * Terminology + * - HIP GPUs have multiple streaming multiprocessors + * - Each multiprocessor executes multiple thread blocks + * - Each thread block contains a number of threads, also known as the block size + * - Multiprocessors have a fixed number of registers, and the amount of registers + * used by each threads limits the number of threads per block. + */ + +/* Launch Bound Definitions */ +#define GPU_MULTIPRESSOR_MAX_REGISTERS 65536 +#define GPU_MULTIPROCESSOR_MAX_BLOCKS 64 +#define GPU_BLOCK_MAX_THREADS 1024 +#define GPU_THREAD_MAX_REGISTERS 255 + +#define GPU_KERNEL_BLOCK_NUM_THREADS 1024 +#define GPU_KERNEL_MAX_REGISTERS 64 + +/* Compute number of threads per block and minimum blocks per multiprocessor + * given the maximum number of registers per thread. */ + +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ + extern "C" __global__ void __launch_bounds__(block_num_threads, \ + GPU_MULTIPRESSOR_MAX_REGISTERS / \ + (block_num_threads * thread_num_registers)) + +/* sanity checks */ + +#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS +# error "Maximum number of threads per block exceeded" +#endif + +#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \ + GPU_MULTIPROCESSOR_MAX_BLOCKS +# error "Maximum number of blocks per multiprocessor exceeded" +#endif + +#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS +# error "Maximum number of registers per thread exceeded" +#endif diff --git a/intern/cycles/kernel/device/hip/globals.h b/intern/cycles/kernel/device/hip/globals.h new file mode 100644 index 00000000000..39978ae7899 --- /dev/null +++ b/intern/cycles/kernel/device/hip/globals.h @@ -0,0 +1,49 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#pragma once + +#include "kernel/kernel_profiling.h" +#include "kernel/kernel_types.h" + +#include "kernel/integrator/integrator_state.h" + +CCL_NAMESPACE_BEGIN + +/* Not actually used, just a NULL pointer that gets passed everywhere, which we + * hope gets optimized out by the compiler. */ +struct KernelGlobals { + /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */ + int unused[1]; +}; + +/* Global scene data and textures */ +__constant__ KernelData __data; +#define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name; +#include "kernel/kernel_textures.h" + +/* Integrator state */ +__constant__ IntegratorStateGPU __integrator_state; + +/* Abstraction macros */ +#define kernel_data __data +#define kernel_tex_fetch(t, index) t[(index)] +#define kernel_tex_array(t) (t) +#define kernel_integrator_state __integrator_state + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/hip/kernel.cpp b/intern/cycles/kernel/device/hip/kernel.cpp new file mode 100644 index 00000000000..c801320a2e1 --- /dev/null +++ b/intern/cycles/kernel/device/hip/kernel.cpp @@ -0,0 +1,28 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* HIP kernel entry points */ + +#ifdef __HIP_DEVICE_COMPILE__ + +# include "kernel/device/hip/compat.h" +# include "kernel/device/hip/config.h" +# include "kernel/device/hip/globals.h" + +# include "kernel/device/gpu/image.h" +# include "kernel/device/gpu/kernel.h" + +#endif diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu index c1e36febfc0..7a79e0c4823 100644 --- a/intern/cycles/kernel/device/optix/kernel.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -19,7 +19,7 @@ #include "kernel/device/optix/compat.h" #include "kernel/device/optix/globals.h" -#include "kernel/device/gpu/image.h" // Texture lookup uses normal CUDA intrinsics +#include "kernel/device/gpu/image.h" /* Texture lookup uses normal CUDA intrinsics. */ #include "kernel/integrator/integrator_state.h" #include "kernel/integrator/integrator_state_flow.h" @@ -44,18 +44,18 @@ template<typename T> ccl_device_forceinline T *get_payload_ptr_2() template<bool always = false> ccl_device_forceinline uint get_object_id() { #ifdef __OBJECT_MOTION__ - // Always get the the instance ID from the TLAS - // There might be a motion transform node between TLAS and BLAS which does not have one + /* Always get the the instance ID from the TLAS. + * There might be a motion transform node between TLAS and BLAS which does not have one. 
*/ uint object = optixGetInstanceIdFromHandle(optixGetTransformListHandle(0)); #else uint object = optixGetInstanceId(); #endif - // Choose between always returning object ID or only for instances + /* Choose between always returning object ID or only for instances. */ if (always || (object & 1) == 0) - // Can just remove the low bit since instance always contains object ID + /* Can just remove the low bit since instance always contains object ID. */ return object >> 1; else - // Set to OBJECT_NONE if this is not an instanced object + /* Set to OBJECT_NONE if this is not an instanced object. */ return OBJECT_NONE; } @@ -93,23 +93,30 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_st extern "C" __global__ void __miss__kernel_optix_miss() { - // 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss + /* 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss. */ optixSetPayload_0(__float_as_uint(optixGetRayTmax())); optixSetPayload_5(PRIMITIVE_NONE); } extern "C" __global__ void __anyhit__kernel_optix_local_hit() { +#ifdef __HAIR__ + if (!optixIsTriangleHit()) { + /* Ignore curves. */ + return optixIgnoreIntersection(); + } +#endif + #ifdef __BVH_LOCAL__ const uint object = get_object_id<true>(); if (object != optixGetPayload_4() /* local_object */) { - // Only intersect with matching object + /* Only intersect with matching object. 
*/ return optixIgnoreIntersection(); } const uint max_hits = optixGetPayload_5(); if (max_hits == 0) { - // Special case for when no hit information is requested, just report that something was hit + /* Special case for when no hit information is requested, just report that something was hit */ optixSetPayload_5(true); return optixTerminateRay(); } @@ -136,8 +143,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit() } else { if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) { - // Record closest intersection only - // Do not terminate ray here, since there is no guarantee about distance ordering in any-hit + /* Record closest intersection only. + * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit. + */ return optixIgnoreIntersection(); } @@ -154,14 +162,14 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit() isect->u = 1.0f - barycentrics.y - barycentrics.x; isect->v = barycentrics.x; - // Record geometric normal + /* Record geometric normal. */ const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0)); const float3 tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1)); const float3 tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2)); local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); - // Continue tracing (without this the trace call would return after the first hit) + /* Continue tracing (without this the trace call would return after the first hit). */ optixIgnoreIntersection(); #endif } @@ -190,7 +198,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() u = __uint_as_float(optixGetAttribute_0()); v = __uint_as_float(optixGetAttribute_1()); - // Filter out curve endcaps + /* Filter out curve endcaps. 
*/ if (u == 0.0f || u == 1.0f) { ignore_intersection = true; } @@ -241,10 +249,10 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() isect->type = kernel_tex_fetch(__prim_type, prim); # ifdef __TRANSPARENT_SHADOWS__ - // Detect if this surface has a shader with transparent shadows + /* Detect if this surface has a shader with transparent shadows. */ if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) { # endif - // If no transparent shadows, all light is blocked and we can stop immediately + /* If no transparent shadows, all light is blocked and we can stop immediately. */ optixSetPayload_5(true); return optixTerminateRay(); # ifdef __TRANSPARENT_SHADOWS__ @@ -252,24 +260,39 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() # endif } - // Continue tracing + /* Continue tracing. */ optixIgnoreIntersection(); #endif } -extern "C" __global__ void __anyhit__kernel_optix_visibility_test() +extern "C" __global__ void __anyhit__kernel_optix_volume_test() { - uint visibility = optixGetPayload_4(); +#ifdef __HAIR__ + if (!optixIsTriangleHit()) { + /* Ignore curves. */ + return optixIgnoreIntersection(); + } +#endif + #ifdef __VISIBILITY_FLAG__ const uint prim = optixGetPrimitiveIndex(); + const uint visibility = optixGetPayload_4(); if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { return optixIgnoreIntersection(); } #endif + const uint object = get_object_id<true>(); + if ((kernel_tex_fetch(__object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) { + return optixIgnoreIntersection(); + } +} + +extern "C" __global__ void __anyhit__kernel_optix_visibility_test() +{ #ifdef __HAIR__ if (!optixIsTriangleHit()) { - // Filter out curve endcaps + /* Filter out curve endcaps. 
*/ const float u = __uint_as_float(optixGetAttribute_0()); if (u == 0.0f || u == 1.0f) { return optixIgnoreIntersection(); @@ -277,18 +300,26 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test() } #endif - // Shadow ray early termination +#ifdef __VISIBILITY_FLAG__ + const uint prim = optixGetPrimitiveIndex(); + const uint visibility = optixGetPayload_4(); + if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { + return optixIgnoreIntersection(); + } + + /* Shadow ray early termination. */ if (visibility & PATH_RAY_SHADOW_OPAQUE) { return optixTerminateRay(); } +#endif } extern "C" __global__ void __closesthit__kernel_optix_hit() { - optixSetPayload_0(__float_as_uint(optixGetRayTmax())); // Intersection distance + optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */ optixSetPayload_3(optixGetPrimitiveIndex()); optixSetPayload_4(get_object_id()); - // Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index + /* Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index. 
*/ optixSetPayload_5(kernel_tex_fetch(__prim_type, optixGetPrimitiveIndex())); if (optixIsTriangleHit()) { @@ -297,7 +328,7 @@ extern "C" __global__ void __closesthit__kernel_optix_hit() optixSetPayload_2(__float_as_uint(barycentrics.x)); } else { - optixSetPayload_1(optixGetAttribute_0()); // Same as 'optixGetCurveParameter()' + optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */ optixSetPayload_2(optixGetAttribute_1()); } } @@ -311,7 +342,7 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type float3 P = optixGetObjectRayOrigin(); float3 dir = optixGetObjectRayDirection(); - // The direction is not normalized by default, but the curve intersection routine expects that + /* The direction is not normalized by default, but the curve intersection routine expects that */ float len; dir = normalize_len(dir, &len); @@ -323,15 +354,15 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type Intersection isect; isect.t = optixGetRayTmax(); - // Transform maximum distance into object space + /* Transform maximum distance into object space. 
*/ if (isect.t != FLT_MAX) isect.t *= len; if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) { optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL, - __float_as_int(isect.u), // Attribute_0 - __float_as_int(isect.v)); // Attribute_1 + __float_as_int(isect.u), /* Attribute_0 */ + __float_as_int(isect.v)); /* Attribute_1 */ } } diff --git a/intern/cycles/kernel/geom/geom_curve_intersect.h b/intern/cycles/kernel/geom/geom_curve_intersect.h index 213f3e62ee0..a068e93790a 100644 --- a/intern/cycles/kernel/geom/geom_curve_intersect.h +++ b/intern/cycles/kernel/geom/geom_curve_intersect.h @@ -713,7 +713,7 @@ ccl_device_inline void curve_shader_setup(const KernelGlobals *kg, P = transform_point(&tfm, P); D = transform_direction(&tfm, D * t); - D = normalize_len(D, &t); + D = safe_normalize_len(D, &t); } int prim = kernel_tex_fetch(__prim_index, isect_prim); @@ -764,8 +764,10 @@ ccl_device_inline void curve_shader_setup(const KernelGlobals *kg, /* Thick curves, compute normal using direction from inside the curve. * This could be optimized by recording the normal in the intersection, * however for Optix this would go beyond the size of the payload. */ + /* NOTE: It is possible that P will be the same as P_inside (precision issues, or very small + * radius). In this case use the view direction to approximate the normal. */ const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, sd->u)); - const float3 Ng = normalize(P - P_inside); + const float3 Ng = (!isequal_float3(P, P_inside)) ? 
normalize(P - P_inside) : -sd->I; sd->N = Ng; sd->Ng = Ng; diff --git a/intern/cycles/kernel/geom/geom_motion_triangle.h b/intern/cycles/kernel/geom/geom_motion_triangle.h index eb4a39e062b..239bd0a37b2 100644 --- a/intern/cycles/kernel/geom/geom_motion_triangle.h +++ b/intern/cycles/kernel/geom/geom_motion_triangle.h @@ -41,7 +41,18 @@ ccl_device_inline int find_attribute_motion(const KernelGlobals *kg, uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset); while (attr_map.x != id) { - attr_offset += ATTR_PRIM_TYPES; + if (UNLIKELY(attr_map.x == ATTR_STD_NONE)) { + if (UNLIKELY(attr_map.y == 0)) { + return (int)ATTR_STD_NOT_FOUND; + } + else { + /* Chain jump to a different part of the table. */ + attr_offset = attr_map.z; + } + } + else { + attr_offset += ATTR_PRIM_TYPES; + } attr_map = kernel_tex_fetch(__attributes_map, attr_offset); } diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h index 73b7cad32be..a24473addcc 100644 --- a/intern/cycles/kernel/integrator/integrator_shade_surface.h +++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h @@ -365,19 +365,16 @@ ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS, #ifdef __VOLUME__ if (!(sd.flag & SD_HAS_ONLY_VOLUME)) { #endif + const int path_flag = INTEGRATOR_STATE(path, flag); - { - const int path_flag = INTEGRATOR_STATE(path, flag); #ifdef __SUBSURFACE__ - /* Can skip shader evaluation for BSSRDF exit point without bump mapping. */ - if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP))) + /* Can skip shader evaluation for BSSRDF exit point without bump mapping. */ + if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP))) #endif - { - /* Evaluate shader. */ - PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL); - shader_eval_surface<node_feature_mask>( - INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag); - } + { + /* Evaluate shader. 
*/ + PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL); + shader_eval_surface<node_feature_mask>(INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag); } #ifdef __SUBSURFACE__ @@ -417,17 +414,20 @@ ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS, /* Perform path termination. Most paths have already been terminated in * the intersect_closest kernel, this is just for emission and for dividing - * throughput by the probability at the right moment. */ - const int path_flag = INTEGRATOR_STATE(path, flag); - const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ? - 0.0f : - path_state_continuation_probability(INTEGRATOR_STATE_PASS, - path_flag); - if (probability == 0.0f) { - return false; - } - else if (probability != 1.0f) { - INTEGRATOR_STATE_WRITE(path, throughput) /= probability; + * throughput by the probability at the right moment. + * + * Also ensure we don't do it twice for SSS at both the entry and exit point. */ + if (!(path_flag & PATH_RAY_SUBSURFACE)) { + const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ? 
+ 0.0f : + path_state_continuation_probability(INTEGRATOR_STATE_PASS, + path_flag); + if (probability == 0.0f) { + return false; + } + else if (probability != 1.0f) { + INTEGRATOR_STATE_WRITE(path, throughput) /= probability; + } } #ifdef __DENOISING_FEATURES__ diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h index 095a28ac505..dac3efb3996 100644 --- a/intern/cycles/kernel/integrator/integrator_shade_volume.h +++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h @@ -74,7 +74,7 @@ ccl_device_inline bool shadow_volume_shader_sample(INTEGRATOR_STATE_ARGS, ShaderData *ccl_restrict sd, float3 *ccl_restrict extinction) { - shader_eval_volume(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) { + shader_eval_volume<true>(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) { return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i); }); @@ -93,7 +93,7 @@ ccl_device_inline bool volume_shader_sample(INTEGRATOR_STATE_ARGS, VolumeShaderCoefficients *coeff) { const int path_flag = INTEGRATOR_STATE(path, flag); - shader_eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) { + shader_eval_volume<false>(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) { return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i); }); diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h index 094446be02c..f745ad3f4b9 100644 --- a/intern/cycles/kernel/integrator/integrator_state.h +++ b/intern/cycles/kernel/integrator/integrator_state.h @@ -60,7 +60,15 @@ CCL_NAMESPACE_BEGIN * TODO: these could be made dynamic depending on the features used in the scene. 
*/ #define INTEGRATOR_VOLUME_STACK_SIZE VOLUME_STACK_SIZE -#define INTEGRATOR_SHADOW_ISECT_SIZE 4 + +#define INTEGRATOR_SHADOW_ISECT_SIZE_CPU 1024 +#define INTEGRATOR_SHADOW_ISECT_SIZE_GPU 4 + +#ifdef __KERNEL_CPU__ +# define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_CPU +#else +# define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_GPU +#endif /* Data structures */ @@ -74,9 +82,9 @@ typedef struct IntegratorStateCPU { #define KERNEL_STRUCT_END(name) \ } \ name; -#define KERNEL_STRUCT_END_ARRAY(name, size) \ +#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \ } \ - name[size]; + name[cpu_size]; #include "kernel/integrator/integrator_state_template.h" #undef KERNEL_STRUCT_BEGIN #undef KERNEL_STRUCT_MEMBER @@ -103,9 +111,9 @@ typedef struct IntegratorStateGPU { #define KERNEL_STRUCT_END(name) \ } \ name; -#define KERNEL_STRUCT_END_ARRAY(name, size) \ +#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \ } \ - name[size]; + name[gpu_size]; #include "kernel/integrator/integrator_state_template.h" #undef KERNEL_STRUCT_BEGIN #undef KERNEL_STRUCT_MEMBER diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/integrator_state_template.h index 41dd1bfcdbf..0d8126c64aa 100644 --- a/intern/cycles/kernel/integrator/integrator_state_template.h +++ b/intern/cycles/kernel/integrator/integrator_state_template.h @@ -107,7 +107,7 @@ KERNEL_STRUCT_END(subsurface) KERNEL_STRUCT_BEGIN(volume_stack) KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, object, KERNEL_FEATURE_VOLUME) KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME) -KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE) +KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE, INTEGRATOR_VOLUME_STACK_SIZE) /********************************* Shadow Path State **************************/ @@ -153,11 +153,15 @@ KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, 
KERNEL_FEATURE_PATH_TRACIN KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING) /* TODO: exclude for GPU. */ KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING) -KERNEL_STRUCT_END_ARRAY(shadow_isect, INTEGRATOR_SHADOW_ISECT_SIZE) +KERNEL_STRUCT_END_ARRAY(shadow_isect, + INTEGRATOR_SHADOW_ISECT_SIZE_CPU, + INTEGRATOR_SHADOW_ISECT_SIZE_GPU) /**************************** Shadow Volume Stack *****************************/ KERNEL_STRUCT_BEGIN(shadow_volume_stack) KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME) KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME) -KERNEL_STRUCT_END_ARRAY(shadow_volume_stack, INTEGRATOR_VOLUME_STACK_SIZE) +KERNEL_STRUCT_END_ARRAY(shadow_volume_stack, + INTEGRATOR_VOLUME_STACK_SIZE, + INTEGRATOR_VOLUME_STACK_SIZE) diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h index cdf412fe22f..08d6cb00114 100644 --- a/intern/cycles/kernel/integrator/integrator_state_util.h +++ b/intern/cycles/kernel/integrator/integrator_state_util.h @@ -217,10 +217,10 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state while (false) \ ; -# define KERNEL_STRUCT_END_ARRAY(name, array_size) \ +# define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \ ++index; \ } \ - while (index < array_size) \ + while (index < gpu_array_size) \ ; # include "kernel/integrator/integrator_state_template.h" @@ -264,7 +264,12 @@ ccl_device_inline void integrator_state_shadow_catcher_split(INTEGRATOR_STATE_AR IntegratorStateCPU *ccl_restrict split_state = state + 1; - *split_state = *state; + /* Only copy the required subset, since shadow intersections are big and irrelevant here. 
*/ + split_state->path = state->path; + split_state->ray = state->ray; + split_state->isect = state->isect; + memcpy(split_state->volume_stack, state->volume_stack, sizeof(state->volume_stack)); + split_state->shadow_path = state->shadow_path; split_state->path.flag |= PATH_RAY_SHADOW_CATCHER_PASS; #endif diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h index 9e12d24dcf4..f4d00e4c20c 100644 --- a/intern/cycles/kernel/kernel_accumulate.h +++ b/intern/cycles/kernel/kernel_accumulate.h @@ -386,7 +386,7 @@ ccl_device_inline void kernel_accum_light(INTEGRATOR_STATE_CONST_ARGS, { /* The throughput for shadow paths already contains the light shader evaluation. */ float3 contribution = INTEGRATOR_STATE(shadow_path, throughput); - kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce) - 1); + kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce)); ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS, render_buffer); diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index e025bcd6674..abb1ba455e6 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -42,6 +42,16 @@ ccl_device void kernel_displace_evaluate(const KernelGlobals *kg, object_inverse_dir_transform(kg, &sd, &D); +#ifdef __KERNEL_DEBUG_NAN__ + if (!isfinite3_safe(D)) { + kernel_assert(!"Cycles displacement with non-finite value detected"); + } +#endif + + /* Ensure finite displacement, preventing BVH from becoming degenerate and avoiding possible + * traversal issues caused by non-finite math. */ + D = ensure_finite3(D); + /* Write output. 
*/ output[offset] += make_float4(D.x, D.y, D.z, 0.0f); } @@ -66,7 +76,16 @@ ccl_device void kernel_background_evaluate(const KernelGlobals *kg, const int path_flag = PATH_RAY_EMISSION; shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>( INTEGRATOR_STATE_PASS_NULL, &sd, NULL, path_flag); - const float3 color = shader_background_eval(&sd); + float3 color = shader_background_eval(&sd); + +#ifdef __KERNEL_DEBUG_NAN__ + if (!isfinite3_safe(color)) { + kernel_assert(!"Cycles background with non-finite value detected"); + } +#endif + + /* Ensure finite color, avoiding possible numerical instabilities in the path tracing kernels. */ + color = ensure_finite3(color); /* Write output. */ output[offset] += make_float4(color.x, color.y, color.z, 0.0f); diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h index 715d764fb31..e8f4a21878e 100644 --- a/intern/cycles/kernel/kernel_film.h +++ b/intern/cycles/kernel/kernel_film.h @@ -394,7 +394,7 @@ film_calculate_shadow_catcher(const KernelFilmConvert *ccl_restrict kfilm_conver /* NOTE: It is possible that the Shadow Catcher pass is requested as an output without actual * shadow catcher objects in the scene. In this case there will be no auxiliary passes required - * for the devision (to save up memory). So delay the asserts to this point so that the number of + * for the decision (to save up memory). So delay the asserts to this point so that the number of * samples check handles such configuration. 
*/ kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED); kernel_assert(kfilm_convert->pass_combined != PASS_UNUSED); diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h index 354e8115538..1beaf3cc2b2 100644 --- a/intern/cycles/kernel/kernel_jitter.h +++ b/intern/cycles/kernel/kernel_jitter.h @@ -74,10 +74,6 @@ ccl_device_inline float cmj_randfloat_simple(uint i, uint p) ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension) { - /* The PMJ sample sets contain a sample with (x,y) with NUM_PMJ_SAMPLES so for 1D - * the x part is used as the sample (TODO(@leesonw): Add using both x and y parts - * independently). */ - /* Perform Owen shuffle of the sample number to reorder the samples. */ #ifdef _SIMPLE_HASH_ const uint rv = cmj_hash_simple(dimension, rng_hash); @@ -95,7 +91,10 @@ ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_ha const uint sample_set = s / NUM_PMJ_SAMPLES; const uint d = (dimension + sample_set); const uint dim = d % NUM_PMJ_PATTERNS; - int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES)); + + /* The PMJ sample sets contain a sample with (x,y) with NUM_PMJ_SAMPLES so for 1D + * the x part is used for even dims and the y for odd. */ + int index = 2 * ((dim >> 1) * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES)) + (dim & 1); float fx = kernel_tex_fetch(__sample_pattern_lut, index); @@ -104,12 +103,11 @@ ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_ha # ifdef _SIMPLE_HASH_ float dx = cmj_randfloat_simple(d, rng_hash); # else - /* Only jitter within the grid interval. */ float dx = cmj_randfloat(d, rng_hash); # endif - fx = fx + dx * (1.0f / NUM_PMJ_SAMPLES); + /* Jitter sample locations and map back into [0 1]. */ + fx = fx + dx; fx = fx - floorf(fx); - #else # warning "Not using Cranley-Patterson Rotation." 
#endif @@ -136,7 +134,7 @@ ccl_device void pmj_sample_2D( /* Based on the sample number a sample pattern is selected and offset by the dimension. */ const uint sample_set = s / NUM_PMJ_SAMPLES; const uint d = (dimension + sample_set); - const uint dim = d % NUM_PMJ_PATTERNS; + uint dim = d % NUM_PMJ_PATTERNS; int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES)); float fx = kernel_tex_fetch(__sample_pattern_lut, index); @@ -151,17 +149,17 @@ ccl_device void pmj_sample_2D( float dx = cmj_randfloat(d, rng_hash); float dy = cmj_randfloat(d + 1, rng_hash); # endif - /* Only jitter within the grid cells. */ - fx = fx + dx * (1.0f / NUM_PMJ_DIVISIONS); - fy = fy + dy * (1.0f / NUM_PMJ_DIVISIONS); - fx = fx - floorf(fx); - fy = fy - floorf(fy); + /* Jitter sample locations and map back to the unit square [0 1]x[0 1]. */ + float sx = fx + dx; + float sy = fy + dy; + sx = sx - floorf(sx); + sy = sy - floorf(sy); #else # warning "Not using Cranley Patterson Rotation." #endif - (*x) = fx; - (*y) = fy; + (*x) = sx; + (*y) = sy; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 3052bb53040..e7133724c85 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -186,8 +186,8 @@ ccl_device_inline float _shader_bsdf_multi_eval(const KernelGlobals *kg, float sum_sample_weight, const uint light_shader_flags) { - /* this is the veach one-sample model with balance heuristic, some pdf - * factors drop out when using balance heuristic weighting */ + /* This is the veach one-sample model with balance heuristic, + * some PDF factors drop out when using balance heuristic weighting. 
*/ for (int i = 0; i < sd->num_closure; i++) { const ShaderClosure *sc = &sd->closure[i]; @@ -750,7 +750,7 @@ ccl_device int shader_phase_sample_closure(const KernelGlobals *kg, /* Volume Evaluation */ -template<typename StackReadOp> +template<const bool shadow, typename StackReadOp> ccl_device_inline void shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS, ShaderData *ccl_restrict sd, const int path_flag, @@ -815,8 +815,11 @@ ccl_device_inline void shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS, # endif /* Merge closures to avoid exceeding number of closures limit. */ - if (i > 0) - shader_merge_volume_closures(sd); + if (!shadow) { + if (i > 0) { + shader_merge_volume_closures(sd); + } + } } } diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 66b7310ab65..3cc42bf7a85 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -572,6 +572,7 @@ typedef enum AttributeStandard { ATTR_STD_MOTION_VERTEX_NORMAL, ATTR_STD_PARTICLE, ATTR_STD_CURVE_INTERCEPT, + ATTR_STD_CURVE_LENGTH, ATTR_STD_CURVE_RANDOM, ATTR_STD_PTEX_FACE_ID, ATTR_STD_PTEX_UV, diff --git a/intern/cycles/kernel/osl/osl_services.cpp b/intern/cycles/kernel/osl/osl_services.cpp index 396f42080e4..4fc46a255a8 100644 --- a/intern/cycles/kernel/osl/osl_services.cpp +++ b/intern/cycles/kernel/osl/osl_services.cpp @@ -107,6 +107,7 @@ ustring OSLRenderServices::u_geom_undisplaced("geom:undisplaced"); ustring OSLRenderServices::u_is_smooth("geom:is_smooth"); ustring OSLRenderServices::u_is_curve("geom:is_curve"); ustring OSLRenderServices::u_curve_thickness("geom:curve_thickness"); +ustring OSLRenderServices::u_curve_length("geom:curve_length"); ustring OSLRenderServices::u_curve_tangent_normal("geom:curve_tangent_normal"); ustring OSLRenderServices::u_curve_random("geom:curve_random"); ustring OSLRenderServices::u_path_ray_length("path:ray_length"); diff --git a/intern/cycles/kernel/osl/osl_services.h 
b/intern/cycles/kernel/osl/osl_services.h index 58accb46e7d..2a5400282b3 100644 --- a/intern/cycles/kernel/osl/osl_services.h +++ b/intern/cycles/kernel/osl/osl_services.h @@ -294,6 +294,7 @@ class OSLRenderServices : public OSL::RendererServices { static ustring u_is_smooth; static ustring u_is_curve; static ustring u_curve_thickness; + static ustring u_curve_length; static ustring u_curve_tangent_normal; static ustring u_curve_random; static ustring u_path_ray_length; diff --git a/intern/cycles/kernel/shaders/CMakeLists.txt b/intern/cycles/kernel/shaders/CMakeLists.txt index 02be7813369..6b62e7bb52f 100644 --- a/intern/cycles/kernel/shaders/CMakeLists.txt +++ b/intern/cycles/kernel/shaders/CMakeLists.txt @@ -41,6 +41,7 @@ set(SRC_OSL node_vector_displacement.osl node_emission.osl node_environment_texture.osl + node_float_curve.osl node_fresnel.osl node_gamma.osl node_geometry.osl diff --git a/intern/cycles/kernel/shaders/node_float_curve.osl b/intern/cycles/kernel/shaders/node_float_curve.osl new file mode 100644 index 00000000000..f1f05fd88a9 --- /dev/null +++ b/intern/cycles/kernel/shaders/node_float_curve.osl @@ -0,0 +1,32 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "node_ramp_util.h" +#include "stdcycles.h" + +shader node_float_curve(float ramp[] = {0.0}, + float min_x = 0.0, + float max_x = 1.0, + float ValueIn = 0.0, + float Factor = 0.0, + output float ValueOut = 0.0) +{ + float c = (ValueIn - min_x) / (max_x - min_x); + + ValueOut = rgb_ramp_lookup(ramp, c, 1, 1); + + ValueOut = mix(ValueIn, ValueOut, Factor); +} diff --git a/intern/cycles/kernel/shaders/node_hair_info.osl b/intern/cycles/kernel/shaders/node_hair_info.osl index ee08ea57e68..ddc2e28b83a 100644 --- a/intern/cycles/kernel/shaders/node_hair_info.osl +++ b/intern/cycles/kernel/shaders/node_hair_info.osl @@ -18,12 +18,14 @@ shader node_hair_info(output float IsStrand = 0.0, output float Intercept = 0.0, + output float Length = 0.0, output float Thickness = 0.0, output normal TangentNormal = N, output float Random = 0) { getattribute("geom:is_curve", IsStrand); getattribute("geom:curve_intercept", Intercept); + getattribute("geom:curve_length", Length); getattribute("geom:curve_thickness", Thickness); getattribute("geom:curve_tangent_normal", TangentNormal); getattribute("geom:curve_random", Random); diff --git a/intern/cycles/kernel/svm/svm.h b/intern/cycles/kernel/svm/svm.h index 4aee1ef11b3..ad609b15f86 100644 --- a/intern/cycles/kernel/svm/svm.h +++ b/intern/cycles/kernel/svm/svm.h @@ -493,11 +493,13 @@ ccl_device void svm_eval_nodes(INTEGRATOR_STATE_CONST_ARGS, case NODE_IES: svm_node_ies(kg, sd, stack, node); break; - case NODE_RGB_CURVES: case NODE_VECTOR_CURVES: offset = svm_node_curves(kg, sd, stack, node, offset); break; + case NODE_FLOAT_CURVE: + offset = svm_node_curve(kg, sd, stack, node, offset); + break; case NODE_TANGENT: svm_node_tangent(kg, sd, stack, node); break; diff --git a/intern/cycles/kernel/svm/svm_geometry.h b/intern/cycles/kernel/svm/svm_geometry.h index 10e9f291d0e..432529eb061 100644 --- a/intern/cycles/kernel/svm/svm_geometry.h +++ b/intern/cycles/kernel/svm/svm_geometry.h @@ -213,6 +213,8 @@ ccl_device_noinline 
void svm_node_hair_info( } case NODE_INFO_CURVE_INTERCEPT: break; /* handled as attribute */ + case NODE_INFO_CURVE_LENGTH: + break; /* handled as attribute */ case NODE_INFO_CURVE_RANDOM: break; /* handled as attribute */ case NODE_INFO_CURVE_THICKNESS: { diff --git a/intern/cycles/kernel/svm/svm_ramp.h b/intern/cycles/kernel/svm/svm_ramp.h index e92df3c093c..563e5bcb5e4 100644 --- a/intern/cycles/kernel/svm/svm_ramp.h +++ b/intern/cycles/kernel/svm/svm_ramp.h @@ -21,6 +21,48 @@ CCL_NAMESPACE_BEGIN /* NOTE: svm_ramp.h, svm_ramp_util.h and node_ramp_util.h must stay consistent */ +ccl_device_inline float fetch_float(const KernelGlobals *kg, int offset) +{ + uint4 node = kernel_tex_fetch(__svm_nodes, offset); + return __uint_as_float(node.x); +} + +ccl_device_inline float float_ramp_lookup(const KernelGlobals *kg, + int offset, + float f, + bool interpolate, + bool extrapolate, + int table_size) +{ + if ((f < 0.0f || f > 1.0f) && extrapolate) { + float t0, dy; + if (f < 0.0f) { + t0 = fetch_float(kg, offset); + dy = t0 - fetch_float(kg, offset + 1); + f = -f; + } + else { + t0 = fetch_float(kg, offset + table_size - 1); + dy = t0 - fetch_float(kg, offset + table_size - 2); + f = f - 1.0f; + } + return t0 + dy * f * (table_size - 1); + } + + f = saturate(f) * (table_size - 1); + + /* clamp int as well in case of NaN */ + int i = clamp(float_to_int(f), 0, table_size - 1); + float t = f - (float)i; + + float a = fetch_float(kg, offset + i); + + if (interpolate && t > 0.0f) + a = (1.0f - t) * a + t * fetch_float(kg, offset + i + 1); + + return a; +} + ccl_device_inline float4 rgb_ramp_lookup(const KernelGlobals *kg, int offset, float f, @@ -105,6 +147,30 @@ ccl_device_noinline int svm_node_curves( return offset; } +ccl_device_noinline int svm_node_curve( + const KernelGlobals *kg, ShaderData *sd, float *stack, uint4 node, int offset) +{ + uint fac_offset, value_in_offset, out_offset; + svm_unpack_node_uchar3(node.y, &fac_offset, &value_in_offset, &out_offset); + + uint 
table_size = read_node(kg, &offset).x; + + float fac = stack_load_float(stack, fac_offset); + float in = stack_load_float(stack, value_in_offset); + + const float min = __int_as_float(node.z), max = __int_as_float(node.w); + const float range = max - min; + const float relpos = (in - min) / range; + + float v = float_ramp_lookup(kg, offset, relpos, true, true, table_size); + + in = (1.0f - fac) * in + fac * v; + stack_store_float(stack, out_offset, in); + + offset += table_size; + return offset; +} + CCL_NAMESPACE_END #endif /* __SVM_RAMP_H__ */ diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h index c053be96c51..59a0e33acbc 100644 --- a/intern/cycles/kernel/svm/svm_types.h +++ b/intern/cycles/kernel/svm/svm_types.h @@ -122,6 +122,7 @@ typedef enum ShaderNodeType { NODE_AOV_START, NODE_AOV_COLOR, NODE_AOV_VALUE, + NODE_FLOAT_CURVE, /* NOTE: for best OpenCL performance, item definition in the enum must * match the switch case order in svm.h. */ } ShaderNodeType; @@ -173,6 +174,7 @@ typedef enum NodeParticleInfo { typedef enum NodeHairInfo { NODE_INFO_CURVE_IS_STRAND, NODE_INFO_CURVE_INTERCEPT, + NODE_INFO_CURVE_LENGTH, NODE_INFO_CURVE_THICKNESS, /* Fade for minimum hair width transiency. 
*/ // NODE_INFO_CURVE_FADE, diff --git a/intern/cycles/render/CMakeLists.txt b/intern/cycles/render/CMakeLists.txt index 6edb5261b32..323222b8c85 100644 --- a/intern/cycles/render/CMakeLists.txt +++ b/intern/cycles/render/CMakeLists.txt @@ -35,7 +35,6 @@ set(SRC denoising.cpp film.cpp geometry.cpp - gpu_display.cpp graph.cpp hair.cpp image.cpp @@ -78,9 +77,10 @@ set(SRC_HEADERS colorspace.h constant_fold.h denoising.h + display_driver.h + output_driver.h film.h geometry.h - gpu_display.h graph.h hair.h image.h diff --git a/intern/cycles/render/attribute.cpp b/intern/cycles/render/attribute.cpp index ea5a5f50f2d..aaf21ad9fd2 100644 --- a/intern/cycles/render/attribute.cpp +++ b/intern/cycles/render/attribute.cpp @@ -342,6 +342,8 @@ const char *Attribute::standard_name(AttributeStandard std) return "particle"; case ATTR_STD_CURVE_INTERCEPT: return "curve_intercept"; + case ATTR_STD_CURVE_LENGTH: + return "curve_length"; case ATTR_STD_CURVE_RANDOM: return "curve_random"; case ATTR_STD_PTEX_FACE_ID: @@ -586,6 +588,9 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name) case ATTR_STD_CURVE_INTERCEPT: attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE_KEY); break; + case ATTR_STD_CURVE_LENGTH: + attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE); + break; case ATTR_STD_CURVE_RANDOM: attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE); break; diff --git a/intern/cycles/render/buffers.cpp b/intern/cycles/render/buffers.cpp index 1882510cd70..3682b55049a 100644 --- a/intern/cycles/render/buffers.cpp +++ b/intern/cycles/render/buffers.cpp @@ -22,7 +22,6 @@ #include "util/util_foreach.h" #include "util/util_hash.h" #include "util/util_math.h" -#include "util/util_opengl.h" #include "util/util_time.h" #include "util/util_types.h" diff --git a/intern/cycles/render/display_driver.h b/intern/cycles/render/display_driver.h new file mode 100644 index 00000000000..85f305034d7 --- /dev/null +++ b/intern/cycles/render/display_driver.h @@ -0,0 +1,131 
@@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_half.h" +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +/* Display driver for efficient interactive display of renders. + * + * Host applications implement this interface for viewport rendering. For best performance, we + * recommend: + * - Allocating a texture on the GPU to be interactively updated + * - Using the graphics interop mechanism to avoid CPU-GPU copying overhead + * - Using a dedicated or thread-safe graphics API context for updates, to avoid + * blocking the host application. + */ +class DisplayDriver { + public: + DisplayDriver() = default; + virtual ~DisplayDriver() = default; + + /* Render buffer parameters. */ + struct Params { + public: + /* Render resolution, ignoring progressive resolution changes. + * The texture buffer should be allocated with this size. */ + int2 size = make_int2(0, 0); + + /* For border rendering, the full resolution of the render, and the offset within that larger + * render. */ + int2 full_size = make_int2(0, 0); + int2 full_offset = make_int2(0, 0); + + bool modified(const Params &other) const + { + return !(full_offset == other.full_offset && full_size == other.full_size && + size == other.size); + } + }; + + /* Update the render from the rendering thread. + * + * Cycles periodically updates the render to be displayed. 
For multithreaded updates with + * potentially multiple rendering devices, it will call these methods as follows. + * + * if (driver.update_begin(params, width, height)) { + * parallel_for_each(rendering_device) { + * buffer = driver.map_texture_buffer(); + * if (buffer) { + * fill(buffer); + * driver.unmap_texture_buffer(); + * } + * } + * driver.update_end(); + * } + * + * The parameters may dynamically change due to camera changes in the scene, and resources should + * be re-allocated accordingly. + * + * The width and height passed to update_begin() are the effective render resolution taking into + * account progressive resolution changes, which may be equal to or smaller than the params.size. + * For efficiency, changes in this resolution should be handled without re-allocating resources, + * but rather by using a subset of the full resolution buffer. */ + virtual bool update_begin(const Params &params, int width, int height) = 0; + virtual void update_end() = 0; + + virtual half4 *map_texture_buffer() = 0; + virtual void unmap_texture_buffer() = 0; + + /* Optionally return a handle to a native graphics API texture buffer. If supported, + * the rendering device may write directly to this buffer instead of calling + * map_texture_buffer() and unmap_texture_buffer(). */ + class GraphicsInterop { + public: + /* Dimensions of the buffer, in pixels. */ + int buffer_width = 0; + int buffer_height = 0; + + /* OpenGL pixel buffer object. */ + int opengl_pbo_id = 0; + + /* Clear the entire buffer before doing partial write to it. */ + bool need_clear = false; + }; + + virtual GraphicsInterop graphics_interop_get() + { + return GraphicsInterop(); + } + + /* (De)activate graphics context required for editing or deleting the graphics interop + * object. + * + * For example, destruction of the CUDA object associated with an OpenGL requires the + * OpenGL context to be active.
*/ + virtual void graphics_interop_activate(){}; + virtual void graphics_interop_deactivate(){}; + + /* Clear the display buffer by filling it with zeros. */ + virtual void clear() = 0; + + /* Draw the render using the native graphics API. + * + * Note that this may be called in parallel to updates. The implementation is responsible for + * mutex locking or other mechanisms to avoid conflicts. + * + * The parameters may have changed since the last update. The implementation is responsible for + * deciding to skip or adjust render display for such changes. + * + * Host application drawing the render buffer should use Session.draw(), which will + * call this method. */ + virtual void draw(const Params &params) = 0; +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/film.cpp b/intern/cycles/render/film.cpp index 8e14b338bd3..ad3336ca089 100644 --- a/intern/cycles/render/film.cpp +++ b/intern/cycles/render/film.cpp @@ -434,7 +434,8 @@ void Film::update_passes(Scene *scene, bool add_sample_count_pass) const ObjectManager *object_manager = scene->object_manager; Integrator *integrator = scene->integrator; - if (!is_modified() && !object_manager->need_update() && !integrator->is_modified()) { + if (!is_modified() && !object_manager->need_update() && !integrator->is_modified() && + !background->is_modified()) { return; } diff --git a/intern/cycles/render/integrator.h b/intern/cycles/render/integrator.h index 32e108d62ca..5ad419e02ca 100644 --- a/intern/cycles/render/integrator.h +++ b/intern/cycles/render/integrator.h @@ -19,7 +19,7 @@ #include "kernel/kernel_types.h" -#include "device/device_denoise.h" /* For the paramaters and type enum. */ +#include "device/device_denoise.h" /* For the parameters and type enum.
*/ #include "graph/node.h" #include "integrator/adaptive_sampling.h" diff --git a/intern/cycles/render/nodes.cpp b/intern/cycles/render/nodes.cpp index 03b79d7de3e..1629895ff6e 100644 --- a/intern/cycles/render/nodes.cpp +++ b/intern/cycles/render/nodes.cpp @@ -4368,6 +4368,7 @@ NODE_DEFINE(HairInfoNode) SOCKET_OUT_FLOAT(is_strand, "Is Strand"); SOCKET_OUT_FLOAT(intercept, "Intercept"); + SOCKET_OUT_FLOAT(size, "Length"); SOCKET_OUT_FLOAT(thickness, "Thickness"); SOCKET_OUT_NORMAL(tangent_normal, "Tangent Normal"); #if 0 /* Output for minimum hair width transparency - deactivated. */ @@ -4390,6 +4391,9 @@ void HairInfoNode::attributes(Shader *shader, AttributeRequestSet *attributes) if (!intercept_out->links.empty()) attributes->add(ATTR_STD_CURVE_INTERCEPT); + if (!output("Length")->links.empty()) + attributes->add(ATTR_STD_CURVE_LENGTH); + if (!output("Random")->links.empty()) attributes->add(ATTR_STD_CURVE_RANDOM); } @@ -4412,6 +4416,12 @@ void HairInfoNode::compile(SVMCompiler &compiler) compiler.add_node(NODE_ATTR, attr, compiler.stack_assign(out), NODE_ATTR_OUTPUT_FLOAT); } + out = output("Length"); + if (!out->links.empty()) { + int attr = compiler.attribute(ATTR_STD_CURVE_LENGTH); + compiler.add_node(NODE_ATTR, attr, compiler.stack_assign(out), NODE_ATTR_OUTPUT_FLOAT); + } + out = output("Thickness"); if (!out->links.empty()) { compiler.add_node(NODE_HAIR_INFO, NODE_INFO_CURVE_THICKNESS, compiler.stack_assign(out)); @@ -6372,7 +6382,7 @@ void BumpNode::constant_fold(const ConstantFolder &folder) /* TODO(sergey): Ignore bump with zero strength. 
*/ } -/* Curve node */ +/* Curves node */ CurvesNode::CurvesNode(const NodeType *node_type) : ShaderNode(node_type) { @@ -6521,6 +6531,83 @@ void VectorCurvesNode::compile(OSLCompiler &compiler) CurvesNode::compile(compiler, "node_vector_curves"); } +/* FloatCurveNode */ + +NODE_DEFINE(FloatCurveNode) +{ + NodeType *type = NodeType::add("float_curve", create, NodeType::SHADER); + + SOCKET_FLOAT_ARRAY(curve, "Curve", array<float>()); + SOCKET_FLOAT(min_x, "Min X", 0.0f); + SOCKET_FLOAT(max_x, "Max X", 1.0f); + + SOCKET_IN_FLOAT(fac, "Factor", 0.0f); + SOCKET_IN_FLOAT(value, "Value", 0.0f); + + SOCKET_OUT_FLOAT(value, "Value"); + + return type; +} + +FloatCurveNode::FloatCurveNode() : ShaderNode(get_node_type()) +{ +} + +void FloatCurveNode::constant_fold(const ConstantFolder &folder) +{ + ShaderInput *value_in = input("Value"); + ShaderInput *fac_in = input("Factor"); + + /* evaluate fully constant node */ + if (folder.all_inputs_constant()) { + if (curve.size() == 0) { + return; + } + + float pos = (value - min_x) / (max_x - min_x); + float result = float_ramp_lookup(curve.data(), pos, true, true, curve.size()); + + folder.make_constant(value + fac * (result - value)); + } + /* remove no-op node */ + else if (!fac_in->link && fac == 0.0f) { + /* link is not null because otherwise all inputs are constant */ + folder.bypass(value_in->link); + } +} + +void FloatCurveNode::compile(SVMCompiler &compiler) +{ + if (curve.size() == 0) + return; + + ShaderInput *value_in = input("Value"); + ShaderInput *fac_in = input("Factor"); + ShaderOutput *value_out = output("Value"); + + compiler.add_node(NODE_FLOAT_CURVE, + compiler.encode_uchar4(compiler.stack_assign(fac_in), + compiler.stack_assign(value_in), + compiler.stack_assign(value_out)), + __float_as_int(min_x), + __float_as_int(max_x)); + + compiler.add_node(curve.size()); + for (int i = 0; i < curve.size(); i++) + compiler.add_node(make_float4(curve[i])); +} + +void FloatCurveNode::compile(OSLCompiler &compiler) +{ + if 
(curve.size() == 0) + return; + + compiler.parameter_array("ramp", curve.data(), curve.size()); + compiler.parameter(this, "min_x"); + compiler.parameter(this, "max_x"); + compiler.add(this, "node_float_curve"); +} + /* RGBRampNode */ NODE_DEFINE(RGBRampNode) diff --git a/intern/cycles/render/nodes.h b/intern/cycles/render/nodes.h index 22bdb06b059..5ac72835ac5 100644 --- a/intern/cycles/render/nodes.h +++ b/intern/cycles/render/nodes.h @@ -1398,6 +1398,18 @@ class VectorCurvesNode : public CurvesNode { void constant_fold(const ConstantFolder &folder); }; +class FloatCurveNode : public ShaderNode { + public: + SHADER_NODE_CLASS(FloatCurveNode) + void constant_fold(const ConstantFolder &folder); + + NODE_SOCKET_API_ARRAY(array<float>, curve) + NODE_SOCKET_API(float, min_x) + NODE_SOCKET_API(float, max_x) + NODE_SOCKET_API(float, fac) + NODE_SOCKET_API(float, value) +}; + class RGBRampNode : public ShaderNode { public: SHADER_NODE_CLASS(RGBRampNode) diff --git a/intern/cycles/render/osl.cpp b/intern/cycles/render/osl.cpp index d28b222c10e..5a43b641872 100644 --- a/intern/cycles/render/osl.cpp +++ b/intern/cycles/render/osl.cpp @@ -727,8 +727,8 @@ void OSLCompiler::add(ShaderNode *node, const char *name, bool isfilepath) } } - /* create shader of the appropriate type. OSL only distinguishes between "surface" - * and "displacement" atm */ + /* Create shader of the appropriate type. OSL only distinguishes between "surface" + * and "displacement" at the moment. 
*/ if (current_type == SHADER_TYPE_SURFACE) ss->Shader("surface", name, id(node).c_str()); else if (current_type == SHADER_TYPE_VOLUME) diff --git a/intern/cycles/render/output_driver.h b/intern/cycles/render/output_driver.h new file mode 100644 index 00000000000..b7e980d71d4 --- /dev/null +++ b/intern/cycles/render/output_driver.h @@ -0,0 +1,82 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "util/util_math.h" +#include "util/util_string.h" +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +/* Output driver for reading render buffers. + * + * Host applications implement this interface for outputting render buffers for offline rendering. + * Drivers can be used to copy the buffers into the host application or write them directly to + * disk. This interface may also be used for interactive display, however the DisplayDriver is more + * efficient for that purpose. 
+ */ +class OutputDriver { + public: + OutputDriver() = default; + virtual ~OutputDriver() = default; + + class Tile { + public: + Tile(const int2 offset, + const int2 size, + const int2 full_size, + const string_view layer, + const string_view view) + : offset(offset), size(size), full_size(full_size), layer(layer), view(view) + { + } + virtual ~Tile() = default; + + const int2 offset; + const int2 size; + const int2 full_size; + const string layer; + const string view; + + virtual bool get_pass_pixels(const string_view pass_name, + const int num_channels, + float *pixels) const = 0; + virtual bool set_pass_pixels(const string_view pass_name, + const int num_channels, + const float *pixels) const = 0; + }; + + /* Write tile once it has finished rendering. */ + virtual void write_render_tile(const Tile &tile) = 0; + + /* Update tile while rendering is in progress. Return true if any update + * was performed. */ + virtual bool update_render_tile(const Tile & /* tile */) + { + return false; + } + + /* For baking, read render pass PASS_BAKE_PRIMITIVE and PASS_BAKE_DIFFERENTIAL + * to determine which shading points to use for baking at each pixel. Return + * true if any data was read. 
*/ + virtual bool read_render_tile(const Tile & /* tile */) + { + return false; + } +}; + +CCL_NAMESPACE_END diff --git a/intern/cycles/render/session.cpp b/intern/cycles/render/session.cpp index 823c34ed519..550188b196a 100644 --- a/intern/cycles/render/session.cpp +++ b/intern/cycles/render/session.cpp @@ -25,12 +25,13 @@ #include "render/bake.h" #include "render/buffers.h" #include "render/camera.h" -#include "render/gpu_display.h" +#include "render/display_driver.h" #include "render/graph.h" #include "render/integrator.h" #include "render/light.h" #include "render/mesh.h" #include "render/object.h" +#include "render/output_driver.h" #include "render/scene.h" #include "render/session.h" @@ -38,7 +39,6 @@ #include "util/util_function.h" #include "util/util_logging.h" #include "util/util_math.h" -#include "util/util_opengl.h" #include "util/util_task.h" #include "util/util_time.h" @@ -65,25 +65,6 @@ Session::Session(const SessionParams &params_, const SceneParams &scene_params) path_trace_ = make_unique<PathTrace>( device, scene->film, &scene->dscene, render_scheduler_, tile_manager_); path_trace_->set_progress(&progress); - path_trace_->tile_buffer_update_cb = [&]() { - if (!update_render_tile_cb) { - return; - } - update_render_tile_cb(); - }; - path_trace_->tile_buffer_write_cb = [&]() { - if (!write_render_tile_cb) { - return; - } - write_render_tile_cb(); - }; - path_trace_->tile_buffer_read_cb = [&]() -> bool { - if (!read_render_tile_cb) { - return false; - } - read_render_tile_cb(); - return true; - }; path_trace_->progress_update_cb = [&]() { update_status_time(); }; tile_manager_.full_buffer_written_cb = [&](string_view filename) { @@ -98,24 +79,6 @@ Session::~Session() { cancel(); - /* TODO(sergey): Bring the passes in viewport back. - * It is unclear why there is such an exception needed though. 
*/ -#if 0 - if (buffers && params.write_render_cb) { - /* Copy to display buffer and write out image if requested */ - delete display; - - display = new DisplayBuffer(device, false); - display->reset(buffers->params); - copy_to_display_buffer(params.samples); - - int w = display->draw_width; - int h = display->draw_height; - uchar4 *pixels = display->rgba_byte.copy_from_device(0, w, h); - params.write_render_cb((uchar *)pixels, w, h, 4); - } -#endif - /* Make sure path tracer is destroyed before the device. This is needed because destruction might * need to access device for device memory free. */ /* TODO(sergey): Convert device to be unique_ptr, and rely on C++ to destruct objects in the @@ -163,7 +126,7 @@ bool Session::ready_to_reset() void Session::run_main_render_loop() { - path_trace_->clear_gpu_display(); + path_trace_->clear_display(); while (true) { RenderWork render_work = run_update_for_next_iteration(); @@ -397,8 +360,8 @@ int2 Session::get_effective_tile_size() const /* TODO(sergey): Take available memory into account, and if there is enough memory do not tile * and prefer optimal performance. 
*/ - - return make_int2(params.tile_size, params.tile_size); + const int tile_size = tile_manager_.compute_render_tile_size(params.tile_size); + return make_int2(tile_size, tile_size); } void Session::do_delayed_reset() @@ -515,9 +478,33 @@ void Session::set_pause(bool pause) } } -void Session::set_gpu_display(unique_ptr<GPUDisplay> gpu_display) +void Session::set_output_driver(unique_ptr<OutputDriver> driver) { - path_trace_->set_gpu_display(move(gpu_display)); + path_trace_->set_output_driver(move(driver)); +} + +void Session::set_display_driver(unique_ptr<DisplayDriver> driver) +{ + path_trace_->set_display_driver(move(driver)); +} + +double Session::get_estimated_remaining_time() const +{ + const float completed = progress.get_progress(); + if (completed == 0.0f) { + return 0.0; + } + + double total_time, render_time; + progress.get_time(total_time, render_time); + double remaining = (1.0 - (double)completed) * (render_time / (double)completed); + + const double time_limit = render_scheduler_.get_time_limit(); + if (time_limit != 0.0) { + remaining = min(remaining, max(time_limit - render_time, 0.0)); + } + + return remaining; } void Session::wait() @@ -619,101 +606,6 @@ void Session::collect_statistics(RenderStats *render_stats) } /* -------------------------------------------------------------------- - * Tile and tile pixels access. 
- */ - -bool Session::has_multiple_render_tiles() const -{ - return tile_manager_.has_multiple_tiles(); -} - -int2 Session::get_render_tile_size() const -{ - return path_trace_->get_render_tile_size(); -} - -int2 Session::get_render_tile_offset() const -{ - return path_trace_->get_render_tile_offset(); -} - -string_view Session::get_render_tile_layer() const -{ - const BufferParams &buffer_params = path_trace_->get_render_tile_params(); - return buffer_params.layer; -} - -string_view Session::get_render_tile_view() const -{ - const BufferParams &buffer_params = path_trace_->get_render_tile_params(); - return buffer_params.view; -} - -bool Session::copy_render_tile_from_device() -{ - return path_trace_->copy_render_tile_from_device(); -} - -bool Session::get_render_tile_pixels(const string &pass_name, int num_components, float *pixels) -{ - /* NOTE: The code relies on a fact that session is fully update and no scene/buffer modification - * is happening while this function runs. */ - - const BufferParams &buffer_params = path_trace_->get_render_tile_params(); - - const BufferPass *pass = buffer_params.find_pass(pass_name); - if (pass == nullptr) { - return false; - } - - const bool has_denoised_result = path_trace_->has_denoised_result(); - if (pass->mode == PassMode::DENOISED && !has_denoised_result) { - pass = buffer_params.find_pass(pass->type); - if (pass == nullptr) { - /* Happens when denoised result pass is requested but is never written by the kernel. 
*/ - return false; - } - } - - pass = buffer_params.get_actual_display_pass(pass); - - const float exposure = buffer_params.exposure; - const int num_samples = path_trace_->get_num_render_tile_samples(); - - PassAccessor::PassAccessInfo pass_access_info(*pass); - pass_access_info.use_approximate_shadow_catcher = buffer_params.use_approximate_shadow_catcher; - pass_access_info.use_approximate_shadow_catcher_background = - pass_access_info.use_approximate_shadow_catcher && !buffer_params.use_transparent_background; - - const PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples); - const PassAccessor::Destination destination(pixels, num_components); - - return path_trace_->get_render_tile_pixels(pass_accessor, destination); -} - -bool Session::set_render_tile_pixels(const string &pass_name, - int num_components, - const float *pixels) -{ - /* NOTE: The code relies on a fact that session is fully update and no scene/buffer modification - * is happening while this function runs. */ - - const BufferPass *pass = buffer_params_.find_pass(pass_name); - if (!pass) { - return false; - } - - const float exposure = scene->film->get_exposure(); - const int num_samples = render_scheduler_.get_num_rendered_samples(); - - const PassAccessor::PassAccessInfo pass_access_info(*pass); - PassAccessorCPU pass_accessor(pass_access_info, exposure, num_samples); - PassAccessor::Source source(pixels, num_components); - - return path_trace_->set_render_tile_pixels(pass_accessor, source); -} - -/* -------------------------------------------------------------------- * Full-frame on-disk storage. 
*/ diff --git a/intern/cycles/render/session.h b/intern/cycles/render/session.h index 5623604bfe8..46c964bc98c 100644 --- a/intern/cycles/render/session.h +++ b/intern/cycles/render/session.h @@ -35,9 +35,10 @@ CCL_NAMESPACE_BEGIN class BufferParams; class Device; class DeviceScene; +class DisplayDriver; +class OutputDriver; class PathTrace; class Progress; -class GPUDisplay; class RenderBuffers; class Scene; class SceneParams; @@ -67,8 +68,6 @@ class SessionParams { ShadingSystem shadingsystem; - function<bool(const uchar *pixels, int width, int height, int channels)> write_render_cb; - SessionParams() { headless = false; @@ -114,10 +113,6 @@ class Session { Stats stats; Profiler profiler; - function<void(void)> write_render_tile_cb; - function<void(void)> update_render_tile_cb; - function<void(void)> read_render_tile_cb; - /* Callback is invoked by tile manager whenever on-dist tiles storage file is closed after * writing. Allows an engine integration to keep track of those files without worry about * transferring the information when it needs to re-create session during rendering. */ @@ -143,7 +138,10 @@ class Session { void set_samples(int samples); void set_time_limit(double time_limit); - void set_gpu_display(unique_ptr<GPUDisplay> gpu_display); + void set_output_driver(unique_ptr<OutputDriver> driver); + void set_display_driver(unique_ptr<DisplayDriver> driver); + + double get_estimated_remaining_time() const; void device_free(); @@ -154,24 +152,6 @@ class Session { void collect_statistics(RenderStats *stats); /* -------------------------------------------------------------------- - * Tile and tile pixels access. - */ - - bool has_multiple_render_tiles() const; - - /* Get size and offset (relative to the buffer's full x/y) of the currently rendering tile. 
*/ - int2 get_render_tile_size() const; - int2 get_render_tile_offset() const; - - string_view get_render_tile_layer() const; - string_view get_render_tile_view() const; - - bool copy_render_tile_from_device(); - - bool get_render_tile_pixels(const string &pass_name, int num_components, float *pixels); - bool set_render_tile_pixels(const string &pass_name, int num_components, const float *pixels); - - /* -------------------------------------------------------------------- * Full-frame on-disk storage. */ diff --git a/intern/cycles/render/tile.cpp b/intern/cycles/render/tile.cpp index 28910bffa7b..7e53a9d0911 100644 --- a/intern/cycles/render/tile.cpp +++ b/intern/cycles/render/tile.cpp @@ -307,8 +307,8 @@ static bool configure_image_spec_from_buffer(ImageSpec *image_spec, DCHECK_GT(tile_size.x, 0); DCHECK_GT(tile_size.y, 0); - image_spec->tile_width = tile_size.x; - image_spec->tile_height = tile_size.y; + image_spec->tile_width = min(TileManager::IMAGE_TILE_SIZE, tile_size.x); + image_spec->tile_height = min(TileManager::IMAGE_TILE_SIZE, tile_size.y); } return true; @@ -335,6 +335,15 @@ TileManager::~TileManager() { } +int TileManager::compute_render_tile_size(const int suggested_tile_size) const +{ + /* Must be a multiple of IMAGE_TILE_SIZE so that we can write render tiles into the image file + * aligned on image tile boundaries. We can't set IMAGE_TILE_SIZE equal to the render tile size + * because too big tile size leads to integer overflow inside OpenEXR. */ + return (suggested_tile_size <= IMAGE_TILE_SIZE) ? 
suggested_tile_size : + align_up(suggested_tile_size, IMAGE_TILE_SIZE); +} + void TileManager::reset_scheduling(const BufferParams &params, int2 tile_size) { VLOG(3) << "Using tile size of " << tile_size; @@ -411,6 +420,11 @@ const Tile &TileManager::get_current_tile() const return tile_state_.current_tile; } +const int2 TileManager::get_size() const +{ + return make_int2(buffer_params_.width, buffer_params_.height); +} + bool TileManager::open_tile_output() { write_state_.filename = path_temp_get("cycles-tile-buffer-" + tile_file_unique_part_ + "-" + @@ -427,7 +441,12 @@ bool TileManager::open_tile_output() return false; } - write_state_.tile_out->open(write_state_.filename, write_state_.image_spec); + if (!write_state_.tile_out->open(write_state_.filename, write_state_.image_spec)) { + LOG(ERROR) << "Error opening tile file: " << write_state_.tile_out->geterror(); + write_state_.tile_out = nullptr; + return false; + } + write_state_.num_tiles_written = 0; VLOG(3) << "Opened tile file " << write_state_.filename; @@ -466,33 +485,29 @@ bool TileManager::write_tile(const RenderBuffers &tile_buffers) const BufferParams &tile_params = tile_buffers.params; - vector<float> pixel_storage; const float *pixels = tile_buffers.buffer.data(); - - /* Tiled writing expects pixels to contain data for an entire tile. Pad the render buffers with - * empty pixels for tiles which are on the image boundary. 
*/ - if (tile_params.width != tile_size_.x || tile_params.height != tile_size_.y) { - const int64_t pass_stride = tile_params.pass_stride; - const int64_t src_row_stride = tile_params.width * pass_stride; - - const int64_t dst_row_stride = tile_size_.x * pass_stride; - pixel_storage.resize(dst_row_stride * tile_size_.y); - - const float *src = tile_buffers.buffer.data(); - float *dst = pixel_storage.data(); - pixels = dst; - - for (int y = 0; y < tile_params.height; ++y, src += src_row_stride, dst += dst_row_stride) { - memcpy(dst, src, src_row_stride * sizeof(float)); - } - } - const int tile_x = tile_params.full_x - buffer_params_.full_x; const int tile_y = tile_params.full_y - buffer_params_.full_y; VLOG(3) << "Write tile at " << tile_x << ", " << tile_y; - if (!write_state_.tile_out->write_tile(tile_x, tile_y, 0, TypeDesc::FLOAT, pixels)) { + + /* The image tile sizes in the OpenEXR file are different from the size of our big tiles. The + * write_tiles() method expects a contiguous image region that will be split into tiles + * internally. OpenEXR expects the size of this region to be a multiple of the tile size, + * however OpenImageIO automatically adds the required padding. + * + * The only thing we have to ensure is that the tile_x and tile_y are a multiple of the + * image tile size, which happens in compute_render_tile_size. 
*/ + if (!write_state_.tile_out->write_tiles(tile_x, + tile_x + tile_params.width, + tile_y, + tile_y + tile_params.height, + 0, + 1, + TypeDesc::FLOAT, + pixels)) { LOG(ERROR) << "Error writing tile " << write_state_.tile_out->geterror(); + return false; } ++write_state_.num_tiles_written; @@ -518,7 +533,14 @@ void TileManager::finish_write_tiles() VLOG(3) << "Write dummy tile at " << tile.x << ", " << tile.y; - write_state_.tile_out->write_tile(tile.x, tile.y, 0, TypeDesc::FLOAT, pixel_storage.data()); + write_state_.tile_out->write_tiles(tile.x, + tile.x + tile.width, + tile.y, + tile.y + tile.height, + 0, + 1, + TypeDesc::FLOAT, + pixel_storage.data()); } } diff --git a/intern/cycles/render/tile.h b/intern/cycles/render/tile.h index 71b9e966278..08eaa4034f0 100644 --- a/intern/cycles/render/tile.h +++ b/intern/cycles/render/tile.h @@ -82,6 +82,7 @@ class TileManager { bool done(); const Tile &get_current_tile() const; + const int2 get_size() const; /* Write render buffer of a tile to a file on disk. * @@ -107,6 +108,12 @@ class TileManager { RenderBuffers *buffers, DenoiseParams *denoise_params); + /* Compute valid tile size compatible with image saving. */ + int compute_render_tile_size(const int suggested_tile_size) const; + + /* Tile size in the image file. */ + static const int IMAGE_TILE_SIZE = 128; + protected: /* Get tile configuration for its index. * The tile index must be within [0, state_.tile_state_). 
*/ diff --git a/intern/cycles/util/util_atomic.h b/intern/cycles/util/util_atomic.h index de17efafcf2..faba411c769 100644 --- a/intern/cycles/util/util_atomic.h +++ b/intern/cycles/util/util_atomic.h @@ -34,7 +34,7 @@ #else /* __KERNEL_GPU__ */ -# ifdef __KERNEL_CUDA__ +# if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) # define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x)) diff --git a/intern/cycles/util/util_debug.cpp b/intern/cycles/util/util_debug.cpp index 1d598725c84..2245668d02f 100644 --- a/intern/cycles/util/util_debug.cpp +++ b/intern/cycles/util/util_debug.cpp @@ -59,12 +59,23 @@ DebugFlags::CUDA::CUDA() : adaptive_compile(false) reset(); } +DebugFlags::HIP::HIP() : adaptive_compile(false) +{ + reset(); +} + void DebugFlags::CUDA::reset() { if (getenv("CYCLES_CUDA_ADAPTIVE_COMPILE") != NULL) adaptive_compile = true; } +void DebugFlags::HIP::reset() +{ + if (getenv("CYCLES_HIP_ADAPTIVE_COMPILE") != NULL) + adaptive_compile = true; +} + DebugFlags::OptiX::OptiX() { reset(); @@ -103,6 +114,10 @@ std::ostream &operator<<(std::ostream &os, DebugFlagsConstRef debug_flags) os << "OptiX flags:\n" << " Debug : " << string_from_bool(debug_flags.optix.use_debug) << "\n"; + + os << "HIP flags:\n" + << " HIP streams : " << string_from_bool(debug_flags.hip.adaptive_compile) << "\n"; + return os; } diff --git a/intern/cycles/util/util_debug.h b/intern/cycles/util/util_debug.h index 99e2723180c..81677201790 100644 --- a/intern/cycles/util/util_debug.h +++ b/intern/cycles/util/util_debug.h @@ -89,7 +89,18 @@ class DebugFlags { void reset(); /* Whether adaptive feature based runtime compile is enabled or not. - * Requires the CUDA Toolkit and only works on Linux atm. */ + * Requires the CUDA Toolkit and only works on Linux at the moment. */ + bool adaptive_compile; + }; + + /* Descriptor of HIP feature-set to be used. */ + struct HIP { + HIP(); + + /* Reset flags to their defaults. 
*/ + void reset(); + + /* Whether adaptive feature based runtime compile is enabled or not.*/ bool adaptive_compile; }; @@ -124,6 +135,9 @@ class DebugFlags { /* Requested OptiX flags. */ OptiX optix; + /* Requested HIP flags. */ + HIP hip; + private: DebugFlags(); diff --git a/intern/cycles/util/util_half.h b/intern/cycles/util/util_half.h index d9edfec5da3..f36a492a1b0 100644 --- a/intern/cycles/util/util_half.h +++ b/intern/cycles/util/util_half.h @@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN /* Half Floats */ /* CUDA has its own half data type, no need to define then */ -#ifndef __KERNEL_CUDA__ +#if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from * unsigned shorts. */ class half { @@ -59,7 +59,7 @@ struct half4 { half x, y, z, w; }; -#ifdef __KERNEL_CUDA__ +#if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) ccl_device_inline void float4_store_half(half *h, float4 f) { @@ -73,6 +73,7 @@ ccl_device_inline void float4_store_half(half *h, float4 f) ccl_device_inline void float4_store_half(half *h, float4 f) { + # ifndef __KERNEL_SSE2__ for (int i = 0; i < 4; i++) { /* optimized float to half for pixels: @@ -109,6 +110,8 @@ ccl_device_inline void float4_store_half(half *h, float4 f) # endif } +# ifndef __KERNEL_HIP__ + ccl_device_inline float half_to_float(half h) { float f; @@ -117,6 +120,23 @@ ccl_device_inline float half_to_float(half h) return f; } +# else + +ccl_device_inline float half_to_float(std::uint32_t a) noexcept +{ + + std::uint32_t u = ((a << 13) + 0x70000000U) & 0x8fffe000U; + + std::uint32_t v = __float_as_uint(__uint_as_float(u) * + __uint_as_float(0x77800000U) /*0x1.0p+112f*/) + + 0x38000000U; + + u = (a & 0x7fff) != 0 ? 
v : u; + + return __uint_as_float(u) * __uint_as_float(0x07800000U) /*0x1.0p-112f*/; +} + +# endif /* __KERNEL_HIP__ */ ccl_device_inline float4 half4_to_float4(half4 h) { diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 6d728dde679..cb1e94c838c 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -26,6 +26,10 @@ # include <cmath> #endif +#ifdef __HIP__ +# include <hip/hip_vector_types.h> +#endif + #include <float.h> #include <math.h> #include <stdio.h> @@ -83,7 +87,8 @@ CCL_NAMESPACE_BEGIN /* Scalar */ -#ifdef _WIN32 +#ifndef __HIP__ +# ifdef _WIN32 ccl_device_inline float fmaxf(float a, float b) { return (a > b) ? a : b; @@ -93,7 +98,9 @@ ccl_device_inline float fminf(float a, float b) { return (a < b) ? a : b; } -#endif /* _WIN32 */ + +# endif /* _WIN32 */ +#endif /* __HIP__ */ #ifndef __KERNEL_GPU__ using std::isfinite; @@ -199,6 +206,7 @@ ccl_device_inline uint as_uint(float f) return u.i; } +#ifndef __HIP__ ccl_device_inline int __float_as_int(float f) { union { @@ -238,6 +246,7 @@ ccl_device_inline float __uint_as_float(uint i) u.i = i; return u.f; } +#endif ccl_device_inline int4 __float4_as_int4(float4 f) { @@ -669,7 +678,7 @@ ccl_device float bits_to_01(uint bits) ccl_device_inline uint count_leading_zeros(uint x) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) +#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return __clz(x); #else assert(x != 0); @@ -685,7 +694,7 @@ ccl_device_inline uint count_leading_zeros(uint x) ccl_device_inline uint count_trailing_zeros(uint x) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) +#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return (__ffs(x) - 1); #else assert(x != 0); @@ -701,7 +710,7 @@ ccl_device_inline uint count_trailing_zeros(uint x) ccl_device_inline uint find_first_set(uint x) { -#if defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) +#if 
defined(__KERNEL_CUDA__) || defined(__KERNEL_OPTIX__) || defined(__KERNEL_HIP__) return __ffs(x); #else # ifdef _MSC_VER diff --git a/intern/cycles/util/util_math_intersect.h b/intern/cycles/util/util_math_intersect.h index fa3a541eea9..fd0c9124345 100644 --- a/intern/cycles/util/util_math_intersect.h +++ b/intern/cycles/util/util_math_intersect.h @@ -40,7 +40,7 @@ ccl_device bool ray_sphere_intersect(float3 ray_P, /* Ray points away from sphere. */ return false; } - const float dsq = tsq - tp * tp; /* pythagoras */ + const float dsq = tsq - tp * tp; /* Pythagoras. */ if (dsq > radiussq) { /* Closest point on ray outside sphere. */ return false; diff --git a/intern/cycles/util/util_progress.h b/intern/cycles/util/util_progress.h index dca8d3d0ab5..176ee11e1e9 100644 --- a/intern/cycles/util/util_progress.h +++ b/intern/cycles/util/util_progress.h @@ -100,7 +100,7 @@ class Progress { cancel = true; } - bool get_cancel() + bool get_cancel() const { if (!cancel && cancel_cb) cancel_cb(); @@ -108,7 +108,7 @@ class Progress { return cancel; } - string get_cancel_message() + string get_cancel_message() const { thread_scoped_lock lock(progress_mutex); return cancel_message; @@ -130,12 +130,12 @@ class Progress { cancel = true; } - bool get_error() + bool get_error() const { return error; } - string get_error_message() + string get_error_message() const { thread_scoped_lock lock(progress_mutex); return error_message; @@ -168,7 +168,7 @@ class Progress { } } - void get_time(double &total_time_, double &render_time_) + void get_time(double &total_time_, double &render_time_) const { thread_scoped_lock lock(progress_mutex); @@ -200,7 +200,7 @@ class Progress { total_pixel_samples = total_pixel_samples_; } - float get_progress() + float get_progress() const { thread_scoped_lock lock(progress_mutex); @@ -236,7 +236,7 @@ class Progress { } } - int get_current_sample() + int get_current_sample() const { thread_scoped_lock lock(progress_mutex); /* Note that the value here always 
belongs to the last tile that updated, @@ -244,13 +244,13 @@ class Progress { return current_tile_sample; } - int get_rendered_tiles() + int get_rendered_tiles() const { thread_scoped_lock lock(progress_mutex); return rendered_tiles; } - int get_denoised_tiles() + int get_denoised_tiles() const { thread_scoped_lock lock(progress_mutex); return denoised_tiles; @@ -300,7 +300,7 @@ class Progress { set_update(); } - void get_status(string &status_, string &substatus_) + void get_status(string &status_, string &substatus_) const { thread_scoped_lock lock(progress_mutex); @@ -330,8 +330,8 @@ class Progress { } protected: - thread_mutex progress_mutex; - thread_mutex update_mutex; + mutable thread_mutex progress_mutex; + mutable thread_mutex update_mutex; function<void()> update_cb; function<void()> cancel_cb; diff --git a/intern/ghost/GHOST_IWindow.h b/intern/ghost/GHOST_IWindow.h index 5f9bd808c8c..91f576ca304 100644 --- a/intern/ghost/GHOST_IWindow.h +++ b/intern/ghost/GHOST_IWindow.h @@ -40,7 +40,7 @@ * There are two coordinate systems: * * - The screen coordinate system. The origin of the screen is located in the - * upper left corner of the screen.</li> + * upper left corner of the screen. * - The client rectangle coordinate system. The client rectangle of a window * is the area that is drawable by the application (excluding title bars etc.). 
*/ diff --git a/intern/ghost/intern/GHOST_DisplayManagerSDL.cpp b/intern/ghost/intern/GHOST_DisplayManagerSDL.cpp index 5b026eb1632..09b2e4dfe2b 100644 --- a/intern/ghost/intern/GHOST_DisplayManagerSDL.cpp +++ b/intern/ghost/intern/GHOST_DisplayManagerSDL.cpp @@ -101,8 +101,7 @@ GHOST_TSuccess GHOST_DisplayManagerSDL::setCurrentDisplaySetting( uint8_t display, const GHOST_DisplaySetting &setting) { /* - * Mode switching code ported from Quake 2 version 3.21 and bzflag version - * 2.4.0: + * Mode switching code ported from Quake 2 version 3.21 and BZFLAG version 2.4.0: * ftp://ftp.idsoftware.com/idstuff/source/q2source-3.21.zip * See linux/gl_glx.c:GLimp_SetMode * http://wiki.bzflag.org/BZFlag_Source diff --git a/intern/ghost/intern/GHOST_SystemSDL.cpp b/intern/ghost/intern/GHOST_SystemSDL.cpp index 35c7a7ef463..5370d4df857 100644 --- a/intern/ghost/intern/GHOST_SystemSDL.cpp +++ b/intern/ghost/intern/GHOST_SystemSDL.cpp @@ -374,8 +374,8 @@ void GHOST_SystemSDL::processEvent(SDL_Event *sdl_event) if (window->getCursorGrabBounds(bounds) == GHOST_kFailure) window->getClientBounds(bounds); - /* Could also clamp to screen bounds wrap with a window outside the view will fail atm. - * Use offset of 8 in case the window is at screen bounds. */ + /* Could also clamp to screen bounds wrap with a window outside the view will + * fail at the moment. Use offset of 8 in case the window is at screen bounds. */ bounds.wrapPoint(x_new, y_new, 8, window->getCursorGrabAxis()); window->getCursorGrabAccum(x_accum, y_accum); diff --git a/intern/ghost/intern/GHOST_SystemWin32.cpp b/intern/ghost/intern/GHOST_SystemWin32.cpp index f44107ee000..482f20f5cd1 100644 --- a/intern/ghost/intern/GHOST_SystemWin32.cpp +++ b/intern/ghost/intern/GHOST_SystemWin32.cpp @@ -1100,8 +1100,8 @@ GHOST_EventCursor *GHOST_SystemWin32::processCursorEvent(GHOST_WindowWin32 *wind window->getClientBounds(bounds); } - /* Could also clamp to screen bounds wrap with a window outside the view will fail atm. 
- * Use inset in case the window is at screen bounds. */ + /* Could also clamp to screen bounds wrap with a window outside the view will + * fail at the moment. Use inset in case the window is at screen bounds. */ bounds.wrapPoint(x_new, y_new, 2, window->getCursorGrabAxis()); window->getCursorGrabAccum(x_accum, y_accum); diff --git a/intern/ghost/intern/GHOST_SystemX11.cpp b/intern/ghost/intern/GHOST_SystemX11.cpp index 10ccb00cc15..86b4245ca67 100644 --- a/intern/ghost/intern/GHOST_SystemX11.cpp +++ b/intern/ghost/intern/GHOST_SystemX11.cpp @@ -973,8 +973,8 @@ void GHOST_SystemX11::processEvent(XEvent *xe) if (window->getCursorGrabBounds(bounds) == GHOST_kFailure) window->getClientBounds(bounds); - /* Could also clamp to screen bounds wrap with a window outside the view will fail atm. - * Use offset of 8 in case the window is at screen bounds. */ + /* Could also clamp to screen bounds wrap with a window outside the view will + * fail at the moment. Use offset of 8 in case the window is at screen bounds. */ bounds.wrapPoint(x_new, y_new, 8, window->getCursorGrabAxis()); window->getCursorGrabAccum(x_accum, y_accum); @@ -1528,13 +1528,13 @@ void GHOST_SystemX11::processEvent(XEvent *xe) window->GetTabletData().Pressure = axis_value / ((float)xtablet.PressureLevels); } - /* the (short) cast and the & 0xffff is bizarre and unexplained anywhere, - * but I got garbage data without it. Found it in the xidump.c source --matt + /* NOTE(@broken): the (short) cast and the & 0xffff is bizarre and unexplained anywhere, + * but I got garbage data without it. Found it in the `xidump.c` source. * - * The '& 0xffff' just truncates the value to its two lowest bytes, this probably means - * some drivers do not properly set the whole int value? Since we convert to float - * afterward, I don't think we need to cast to short here, but do not have a device to - * check this. 
--mont29 + * NOTE(@mont29): The '& 0xffff' just truncates the value to its two lowest bytes, + * this probably means some drivers do not properly set the whole int value? + * Since we convert to float afterward, + * I don't think we need to cast to short here, but do not have a device to check this. */ if (AXIS_VALUE_GET(3, axis_value)) { window->GetTabletData().Xtilt = (short)(axis_value & 0xffff) / diff --git a/intern/ghost/intern/GHOST_WindowX11.cpp b/intern/ghost/intern/GHOST_WindowX11.cpp index de389951613..8b44403c598 100644 --- a/intern/ghost/intern/GHOST_WindowX11.cpp +++ b/intern/ghost/intern/GHOST_WindowX11.cpp @@ -1092,9 +1092,9 @@ GHOST_TSuccess GHOST_WindowX11::setOrder(GHOST_TWindowOrder order) XWindowAttributes attr; Atom atom; - /* We use both XRaiseWindow and _NET_ACTIVE_WINDOW, since some - * window managers ignore the former (e.g. kwin from kde) and others - * don't implement the latter (e.g. fluxbox pre 0.9.9) */ + /* We use both #XRaiseWindow and #_NET_ACTIVE_WINDOW, since some + * window managers ignore the former (e.g. KWIN from KDE) and others + * don't implement the latter (e.g. FLUXBOX before 0.9.9). */ XRaiseWindow(m_display, m_window); diff --git a/intern/guardedalloc/intern/mallocn_guarded_impl.c b/intern/guardedalloc/intern/mallocn_guarded_impl.c index 98a8553a3eb..bba72c907eb 100644 --- a/intern/guardedalloc/intern/mallocn_guarded_impl.c +++ b/intern/guardedalloc/intern/mallocn_guarded_impl.c @@ -89,7 +89,7 @@ typedef struct localListBase { void *first, *last; } localListBase; -/* note: keep this struct aligned (e.g., irix/gcc) - Hos */ +/* NOTE(@hos): keep this struct aligned (e.g., IRIX/GCC). */ typedef struct MemHead { int tag1; size_t len; @@ -98,9 +98,8 @@ typedef struct MemHead { const char *nextname; int tag2; short pad1; - short alignment; /* if non-zero aligned alloc was used - * and alignment is stored here. - */ + /* if non-zero aligned allocation was used and alignment is stored here. 
*/ + short alignment; #ifdef DEBUG_MEMCOUNTER int _count; #endif diff --git a/intern/opensubdiv/internal/evaluator/evaluator_impl.cc b/intern/opensubdiv/internal/evaluator/evaluator_impl.cc index b3fc021e1ee..4f4f332ff15 100644 --- a/intern/opensubdiv/internal/evaluator/evaluator_impl.cc +++ b/intern/opensubdiv/internal/evaluator/evaluator_impl.cc @@ -553,7 +553,7 @@ void convertPatchCoordsToArray(const OpenSubdiv_PatchCoord *patch_coords, } // namespace -// Note: Define as a class instead of typedcef to make it possible +// Note: Define as a class instead of typedef to make it possible // to have anonymous class in opensubdiv_evaluator_internal.h class CpuEvalOutput : public VolatileEvalOutput<CpuVertexBuffer, CpuVertexBuffer, diff --git a/intern/opensubdiv/opensubdiv_capi_type.h b/intern/opensubdiv/opensubdiv_capi_type.h index e759c5f43b0..e78842036be 100644 --- a/intern/opensubdiv/opensubdiv_capi_type.h +++ b/intern/opensubdiv/opensubdiv_capi_type.h @@ -23,7 +23,7 @@ extern "C" { #endif -// Keep this a bitmask os it's possible to pass available +// Keep this a bitmask so it's possible to pass available // evaluators to Blender. typedef enum eOpenSubdivEvaluator { OPENSUBDIV_EVALUATOR_CPU = (1 << 0), |