diff options
Diffstat (limited to 'intern/cycles')
-rw-r--r-- | intern/cycles/blender/addon/properties.py | 27 | ||||
-rw-r--r-- | intern/cycles/blender/blender_device.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/blender/blender_python.cpp | 3 | ||||
-rw-r--r-- | intern/cycles/device/cuda/device_cuda.h | 3 | ||||
-rw-r--r-- | intern/cycles/device/cuda/device_cuda_impl.cpp | 127 | ||||
-rw-r--r-- | intern/cycles/device/device.cpp | 2 | ||||
-rw-r--r-- | intern/cycles/device/device.h | 13 | ||||
-rw-r--r-- | intern/cycles/device/device_cuda.cpp | 9 | ||||
-rw-r--r-- | intern/cycles/device/device_memory.cpp | 5 | ||||
-rw-r--r-- | intern/cycles/device/device_memory.h | 2 | ||||
-rw-r--r-- | intern/cycles/device/device_multi.cpp | 273 |
11 files changed, 371 insertions, 97 deletions
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index da18ac7c693..1635afab210 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -1535,6 +1535,12 @@ class CyclesPreferences(bpy.types.AddonPreferences): devices: bpy.props.CollectionProperty(type=CyclesDeviceSettings) + peer_memory: BoolProperty( + name="Distribute memory across devices", + description="Make more room for large scenes to fit by distributing memory across interconnected devices (e.g. via NVLink) rather than duplicating it", + default=False, + ) + def find_existing_device_entry(self, device): for device_entry in self.devices: if device_entry.id == device[2] and device_entry.type == device[1]: @@ -1632,14 +1638,21 @@ class CyclesPreferences(bpy.types.AddonPreferences): row = layout.row() row.prop(self, "compute_device_type", expand=True) - devices = self.get_devices_for_type(self.compute_device_type) + if self.compute_device_type == 'NONE': + return row = layout.row() - if self.compute_device_type == 'CUDA': - self._draw_devices(row, 'CUDA', devices) - elif self.compute_device_type == 'OPTIX': - self._draw_devices(row, 'OPTIX', devices) - elif self.compute_device_type == 'OPENCL': - self._draw_devices(row, 'OPENCL', devices) + devices = self.get_devices_for_type(self.compute_device_type) + self._draw_devices(row, self.compute_device_type, devices) + + import _cycles + has_peer_memory = 0 + for device in _cycles.available_devices(self.compute_device_type): + if device[3] and self.find_existing_device_entry(device).use: + has_peer_memory += 1 + if has_peer_memory > 1: + row = layout.row() + row.use_property_split = True + row.prop(self, "peer_memory") def draw(self, context): self.draw_impl(self.layout, context) diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp index 5140f190f36..3a923459782 100644 --- a/intern/cycles/blender/blender_device.cpp +++ 
b/intern/cycles/blender/blender_device.cpp @@ -113,6 +113,10 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen device = Device::get_multi_device(used_devices, threads, background); } /* Else keep using the CPU device that was set before. */ + + if (!get_boolean(cpreferences, "peer_memory")) { + device.has_peer_memory = false; + } } } diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp index 79c16856462..0be19dbffd1 100644 --- a/intern/cycles/blender/blender_python.cpp +++ b/intern/cycles/blender/blender_python.cpp @@ -416,10 +416,11 @@ static PyObject *available_devices_func(PyObject * /*self*/, PyObject *args) for (size_t i = 0; i < devices.size(); i++) { DeviceInfo &device = devices[i]; string type_name = Device::string_from_type(device.type); - PyObject *device_tuple = PyTuple_New(3); + PyObject *device_tuple = PyTuple_New(4); PyTuple_SET_ITEM(device_tuple, 0, pyunicode_from_string(device.description.c_str())); PyTuple_SET_ITEM(device_tuple, 1, pyunicode_from_string(type_name.c_str())); PyTuple_SET_ITEM(device_tuple, 2, pyunicode_from_string(device.id.c_str())); + PyTuple_SET_ITEM(device_tuple, 3, PyBool_FromLong(device.has_peer_memory)); PyTuple_SET_ITEM(ret, i, device_tuple); } diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h index 3f23f0fe4c5..9f31ed12cf4 100644 --- a/intern/cycles/device/cuda/device_cuda.h +++ b/intern/cycles/device/cuda/device_cuda.h @@ -51,6 +51,7 @@ class CUDADevice : public Device { size_t map_host_used; size_t map_host_limit; int can_map_host; + int pitch_alignment; int cuDevId; int cuDevArchitecture; bool first_error; @@ -111,6 +112,8 @@ class CUDADevice : public Device { bool support_device(const DeviceRequestedFeatures & /*requested_features*/); + bool check_peer_access(Device *peer_device); + bool use_adaptive_compilation(); bool use_split_kernel(); diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp 
b/intern/cycles/device/cuda/device_cuda_impl.cpp index acf53c3eb1b..64c7f5e7d34 100644 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ b/intern/cycles/device/cuda/device_cuda_impl.cpp @@ -207,6 +207,7 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool map_host_limit = 0; map_host_used = 0; can_map_host = 0; + pitch_alignment = 0; functions.loaded = false; @@ -224,6 +225,9 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool cuda_assert( cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + cuda_assert(cuDeviceGetAttribute( + &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); + unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX; if (can_map_host) { ctx_flags |= CU_CTX_MAP_HOST; @@ -286,6 +290,49 @@ bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_feat return true; } +bool CUDADevice::check_peer_access(Device *peer_device) +{ + if (peer_device == this) { + return false; + } + if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) { + return false; + } + + CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device); + + int can_access = 0; + cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice)); + if (can_access == 0) { + return false; + } + + // Ensure array access over the link is possible as well (for 3D textures) + cuda_assert(cuDeviceGetP2PAttribute(&can_access, + CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED, + cuDevice, + peer_device_cuda->cuDevice)); + if (can_access == 0) { + return false; + } + + // Enable peer access in both directions + { + const CUDAContextScope scope(this); + if (cuda_error(cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0))) { + return false; + } + } + { + const CUDAContextScope scope(peer_device_cuda); + if (cuda_error(cuCtxEnablePeerAccess(cuContext, 0))) { + return false; + } + } + + return 
true; +} + bool CUDADevice::use_adaptive_compilation() { return DebugFlags().cuda.adaptive_compile; @@ -674,6 +721,12 @@ void CUDADevice::load_texture_info() void CUDADevice::move_textures_to_host(size_t size, bool for_texture) { + /* Break out of recursive call, which can happen when moving memory on a multi device. */ + static bool any_device_moving_textures_to_host = false; + if (any_device_moving_textures_to_host) { + return; + } + /* Signal to reallocate textures in host memory only. */ move_texture_to_host = true; @@ -687,6 +740,12 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture) device_memory &mem = *pair.first; CUDAMem *cmem = &pair.second; + /* Can only move textures allocated on this device (and not those from peer devices). + * And need to ignore memory that is already on the host. */ + if (!mem.is_resident(this) || cmem->use_mapped_host) { + continue; + } + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); bool is_image = is_texture && (mem.data_height > 1); @@ -696,11 +755,6 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture) continue; } - /* Already in host memory. */ - if (cmem->use_mapped_host) { - continue; - } - /* For other textures, only move image textures. */ if (for_texture && !is_image) { continue; @@ -723,26 +777,30 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture) static thread_mutex move_mutex; thread_scoped_lock lock(move_mutex); - /* Preserve the original device pointer, in case of multi device - * we can't change it because the pointer mapping would break. */ - device_ptr prev_pointer = max_mem->device_pointer; - size_t prev_size = max_mem->device_size; + any_device_moving_textures_to_host = true; - mem_copy_to(*max_mem); + /* Potentially need to call back into multi device, so pointer mapping + * and peer devices are updated. 
This is also necessary since the device + * pointer may just be a key here, so cannot be accessed and freed directly. + * Unfortunately it does mean that memory is reallocated on all other + * devices as well, which is potentially dangerous when still in use (since + * a thread rendering on another device would only be caught in this mutex + * if it so happens to do an allocation at the same time as well). */ + max_mem->device_copy_to(); size = (max_size >= size) ? 0 : size - max_size; - max_mem->device_pointer = prev_pointer; - max_mem->device_size = prev_size; + any_device_moving_textures_to_host = false; } else { break; } } + /* Unset flag before texture info is reloaded, since it should stay in device memory. */ + move_texture_to_host = false; + /* Update texture info array with new pointers. */ load_texture_info(); - - move_texture_to_host = false; } CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding) @@ -808,9 +866,6 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_ map_host_used += size; status = " in host memory"; } - else { - status = " failed, out of host memory"; - } } if (mem_alloc_result != CUDA_SUCCESS) { @@ -906,7 +961,7 @@ void CUDADevice::generic_free(device_memory &mem) } else { /* Free device memory. 
*/ - cuMemFree(mem.device_pointer); + cuda_assert(cuMemFree(mem.device_pointer)); } stats.mem_free(mem.device_size); @@ -1032,18 +1087,17 @@ void CUDADevice::const_copy_to(const char *name, void *host, size_t size) void CUDADevice::global_alloc(device_memory &mem) { - CUDAContextScope scope(this); - - generic_alloc(mem); - generic_copy_to(mem); + if (mem.is_resident(this)) { + generic_alloc(mem); + generic_copy_to(mem); + } const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer)); } void CUDADevice::global_free(device_memory &mem) { - if (mem.device_pointer) { - CUDAContextScope scope(this); + if (mem.is_resident(this) && mem.device_pointer) { generic_free(mem); } } @@ -1112,7 +1166,19 @@ void CUDADevice::tex_alloc(device_texture &mem) size_t src_pitch = mem.data_width * dsize * mem.data_elements; size_t dst_pitch = src_pitch; - if (mem.data_depth > 1) { + if (!mem.is_resident(this)) { + cmem = &cuda_mem_map[&mem]; + cmem->texobject = 0; + + if (mem.data_depth > 1) { + array_3d = (CUarray)mem.device_pointer; + cmem->array = array_3d; + } + else if (mem.data_height > 0) { + dst_pitch = align_up(src_pitch, pitch_alignment); + } + } + else if (mem.data_depth > 1) { /* 3D texture using array, there is no API for linear memory. */ CUDA_ARRAY3D_DESCRIPTOR desc; @@ -1156,10 +1222,7 @@ void CUDADevice::tex_alloc(device_texture &mem) } else if (mem.data_height > 0) { /* 2D texture, using pitch aligned linear memory. 
*/ - int alignment = 0; - cuda_assert( - cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); - dst_pitch = align_up(src_pitch, alignment); + dst_pitch = align_up(src_pitch, pitch_alignment); size_t dst_size = dst_pitch * mem.data_height; cmem = generic_alloc(mem, dst_size - mem.memory_size()); @@ -1251,7 +1314,11 @@ void CUDADevice::tex_free(device_texture &mem) cuTexObjectDestroy(cmem.texobject); } - if (cmem.array) { + if (!mem.is_resident(this)) { + /* Do not free memory here, since it was allocated on a different device. */ + cuda_mem_map.erase(cuda_mem_map.find(&mem)); + } + else if (cmem.array) { /* Free array. */ cuArrayDestroy(cmem.array); stats.mem_free(mem.device_size); diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index bad156d40bf..41dd7894d93 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -602,6 +602,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, info.has_adaptive_stop_per_sample = true; info.has_osl = true; info.has_profiling = true; + info.has_peer_memory = false; foreach (const DeviceInfo &device, subdevices) { /* Ensure CPU device does not slow down GPU. */ @@ -645,6 +646,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices, info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample; info.has_osl &= device.has_osl; info.has_profiling &= device.has_profiling; + info.has_peer_memory |= device.has_peer_memory; } return info; diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index c55dfb3a83b..dff981080a5 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -82,6 +82,7 @@ class DeviceInfo { bool has_osl; /* Support Open Shading Language. */ bool use_split_kernel; /* Use split or mega kernel. */ bool has_profiling; /* Supports runtime collection of profiling info. 
*/ + bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */ int cpu_threads; vector<DeviceInfo> multi_devices; vector<DeviceInfo> denoising_devices; @@ -99,6 +100,7 @@ class DeviceInfo { has_osl = false; use_split_kernel = false; has_profiling = false; + has_peer_memory = false; } bool operator==(const DeviceInfo &info) @@ -435,6 +437,17 @@ class Device { { } + virtual bool is_resident(device_ptr /*key*/, Device *sub_device) + { + /* Memory is always resident if this is not a multi device, regardless of whether the pointer + * is valid or not (since it may not have been allocated yet). */ + return sub_device == this; + } + virtual bool check_peer_access(Device * /*peer_device*/) + { + return false; + } + /* static */ static Device *create(DeviceInfo &info, Stats &stats, diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 4a53fcd151d..04c04761311 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -131,6 +131,15 @@ void device_cuda_info(vector<DeviceInfo> &devices) info.has_volume_decoupled = false; info.has_adaptive_stop_per_sample = false; + /* Check if the device has P2P access to any other device in the system. 
*/ + for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) { + if (num != peer_num) { + int can_access = 0; + cuDeviceCanAccessPeer(&can_access, num, peer_num); + info.has_peer_memory = (can_access != 0); + } + } + int pci_location[3] = {0, 0, 0}; cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num); cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num); diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp index 671cd7c29f3..8064d50d31f 100644 --- a/intern/cycles/device/device_memory.cpp +++ b/intern/cycles/device/device_memory.cpp @@ -125,6 +125,11 @@ void device_memory::restore_device() device_pointer = original_device_ptr; } +bool device_memory::is_resident(Device *sub_device) const +{ + return device->is_resident(device_pointer, sub_device); +} + /* Device Sub Ptr */ device_sub_ptr::device_sub_ptr(device_memory &mem, int offset, int size) : device(mem.device) diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index 1c20db900bc..32654e62a6f 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -230,6 +230,8 @@ class device_memory { void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr); void restore_device(); + bool is_resident(Device *sub_device) const; + protected: friend class CUDADevice; friend class OptiXDevice; diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index 3636ecaa7a1..77ede3bf62a 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -34,37 +34,66 @@ CCL_NAMESPACE_BEGIN class MultiDevice : public Device { public: struct SubDevice { - explicit SubDevice(Device *device_) : device(device_) - { - } - + Stats stats; Device *device; map<device_ptr, device_ptr> ptr_map; + int peer_island_index = -1; }; list<SubDevice> devices, denoising_devices; 
device_ptr unique_key; + vector<vector<SubDevice *>> peer_islands; MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_) : Device(info, stats, profiler, background_), unique_key(1) { foreach (DeviceInfo &subinfo, info.multi_devices) { - Device *device = Device::create(subinfo, sub_stats_, profiler, background); - /* Always add CPU devices at the back since GPU devices can change * host memory pointers, which CPU uses as device pointer. */ + SubDevice *sub; if (subinfo.type == DEVICE_CPU) { - devices.push_back(SubDevice(device)); + devices.emplace_back(); + sub = &devices.back(); } else { - devices.push_front(SubDevice(device)); + devices.emplace_front(); + sub = &devices.front(); } + + /* The pointer to 'sub->stats' will stay valid even after new devices + * are added, since 'devices' is a linked list. */ + sub->device = Device::create(subinfo, sub->stats, profiler, background); } foreach (DeviceInfo &subinfo, info.denoising_devices) { - Device *device = Device::create(subinfo, sub_stats_, profiler, background); + denoising_devices.emplace_back(); + SubDevice *sub = &denoising_devices.back(); + + sub->device = Device::create(subinfo, sub->stats, profiler, background); + } + + /* Build a list of peer islands for the available render devices */ + foreach (SubDevice &sub, devices) { + /* First ensure that every device is in at least once peer island */ + if (sub.peer_island_index < 0) { + peer_islands.emplace_back(); + sub.peer_island_index = (int)peer_islands.size() - 1; + peer_islands[sub.peer_island_index].push_back(&sub); + } + + if (!info.has_peer_memory) { + continue; + } - denoising_devices.push_back(SubDevice(device)); + /* Second check peer access between devices and fill up the islands accordingly */ + foreach (SubDevice &peer_sub, devices) { + if (peer_sub.peer_island_index < 0 && + peer_sub.device->info.type == sub.device->info.type && + peer_sub.device->check_peer_access(sub.device)) { + peer_sub.peer_island_index = 
sub.peer_island_index; + peer_islands[sub.peer_island_index].push_back(&peer_sub); + } + } } #ifdef WITH_NETWORK @@ -175,11 +204,11 @@ class MultiDevice : public Device { bool build_optix_bvh(BVH *bvh) { - // Broadcast acceleration structure build to all render devices - foreach (SubDevice &sub, devices) + /* Broadcast acceleration structure build to all render devices */ + foreach (SubDevice &sub, devices) { if (!sub.device->build_optix_bvh(bvh)) return false; - + } return true; } @@ -191,17 +220,82 @@ class MultiDevice : public Device { return devices.front().device->osl_memory(); } + bool is_resident(device_ptr key, Device *sub_device) override + { + foreach (SubDevice &sub, devices) { + if (sub.device == sub_device) { + return find_matching_mem_device(key, sub)->device == sub_device; + } + } + return false; + } + + SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub) + { + assert(sub.peer_island_index >= 0 && key != 0); + + /* Get the memory owner of this key (first try current device, then peer devices) */ + SubDevice *owner_sub = &sub; + if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) { + foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) { + if (island_sub != owner_sub && + island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) { + owner_sub = island_sub; + } + } + } + return owner_sub; + } + + SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island) + { + assert(!island.empty()); + + /* Get the memory owner of this key or the device with the lowest memory usage when new */ + SubDevice *owner_sub = island.front(); + foreach (SubDevice *island_sub, island) { + if (key ? 
(island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) : + (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) { + owner_sub = island_sub; + } + } + return owner_sub; + } + + inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub) + { + return find_matching_mem_device(key, sub)->ptr_map[key]; + } + void mem_alloc(device_memory &mem) { device_ptr key = unique_key++; - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = 0; - mem.device_size = 0; + if (mem.type == MEM_PIXELS) { + /* Always allocate pixels memory on all devices + * This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses */ + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + mem.device_pointer = 0; + mem.device_size = 0; - sub.device->mem_alloc(mem); - sub.ptr_map[key] = mem.device_pointer; + sub.device->mem_alloc(mem); + sub.ptr_map[key] = mem.device_pointer; + } + } + else { + assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE || + mem.type == MEM_DEVICE_ONLY); + /* The remaining memory types can be distributed across devices */ + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(key, island); + mem.device = owner_sub->device; + mem.device_pointer = 0; + mem.device_size = 0; + + owner_sub->device->mem_alloc(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + } } mem.device = this; @@ -215,13 +309,36 @@ class MultiDevice : public Device { device_ptr key = (existing_key) ? existing_key : unique_key++; size_t existing_size = mem.device_size; - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? 
sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; + /* The tile buffers are allocated on each device (see below), so copy to all of them */ + if (strcmp(mem.name, "RenderBuffers") == 0) { + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + sub.device->mem_copy_to(mem); + sub.ptr_map[key] = mem.device_pointer; + } + } + else { + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); + mem.device = owner_sub->device; + mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + owner_sub->device->mem_copy_to(mem); + owner_sub->ptr_map[key] = mem.device_pointer; - sub.device->mem_copy_to(mem); - sub.ptr_map[key] = mem.device_pointer; + if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) { + /* Need to create texture objects and update pointer in kernel globals on all devices */ + foreach (SubDevice *island_sub, island) { + if (island_sub != owner_sub) { + island_sub->device->mem_copy_to(mem); + } + } + } + } } mem.device = this; @@ -238,10 +355,11 @@ class MultiDevice : public Device { int sy = y + i * sub_h; int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h; - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; + SubDevice *owner_sub = find_matching_mem_device(key, sub); + mem.device = owner_sub->device; + mem.device_pointer = owner_sub->ptr_map[key]; - sub.device->mem_copy_from(mem, sy, w, sh, elem); + owner_sub->device->mem_copy_from(mem, sy, w, sh, elem); i++; } @@ -255,16 +373,18 @@ class MultiDevice : public Device { device_ptr key = (existing_key) ? existing_key : unique_key++; size_t existing_size = mem.device_size; - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = (existing_key) ? 
sub.ptr_map[existing_key] : 0; - mem.device_size = existing_size; - - sub.device->mem_zero(mem); - sub.ptr_map[key] = mem.device_pointer; - } - + /* This is a hack to only allocate the tile buffers on denoising devices + * Similarly the tile buffers also need to be allocated separately on all devices so any + * overlap rendered for denoising does not interfere with each other */ if (strcmp(mem.name, "RenderBuffers") == 0) { + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + sub.device->mem_zero(mem); + sub.ptr_map[key] = mem.device_pointer; + } foreach (SubDevice &sub, denoising_devices) { mem.device = sub.device; mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0; @@ -274,6 +394,17 @@ class MultiDevice : public Device { sub.ptr_map[key] = mem.device_pointer; } } + else { + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_suitable_mem_device(existing_key, island); + mem.device = owner_sub->device; + mem.device_pointer = (existing_key) ? 
owner_sub->ptr_map[existing_key] : 0; + mem.device_size = existing_size; + + owner_sub->device->mem_zero(mem); + owner_sub->ptr_map[key] = mem.device_pointer; + } + } mem.device = this; mem.device_pointer = key; @@ -285,16 +416,16 @@ class MultiDevice : public Device { device_ptr key = mem.device_pointer; size_t existing_size = mem.device_size; - foreach (SubDevice &sub, devices) { - mem.device = sub.device; - mem.device_pointer = sub.ptr_map[key]; - mem.device_size = existing_size; - - sub.device->mem_free(mem); - sub.ptr_map.erase(sub.ptr_map.find(key)); - } + /* Free memory that was allocated for all devices (see above) on each device */ + if (strcmp(mem.name, "RenderBuffers") == 0 || mem.type == MEM_PIXELS) { + foreach (SubDevice &sub, devices) { + mem.device = sub.device; + mem.device_pointer = sub.ptr_map[key]; + mem.device_size = existing_size; - if (strcmp(mem.name, "RenderBuffers") == 0) { + sub.device->mem_free(mem); + sub.ptr_map.erase(sub.ptr_map.find(key)); + } foreach (SubDevice &sub, denoising_devices) { mem.device = sub.device; mem.device_pointer = sub.ptr_map[key]; @@ -304,6 +435,26 @@ class MultiDevice : public Device { sub.ptr_map.erase(sub.ptr_map.find(key)); } } + else { + foreach (const vector<SubDevice *> &island, peer_islands) { + SubDevice *owner_sub = find_matching_mem_device(key, *island.front()); + mem.device = owner_sub->device; + mem.device_pointer = owner_sub->ptr_map[key]; + mem.device_size = existing_size; + + owner_sub->device->mem_free(mem); + owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key)); + + if (mem.type == MEM_TEXTURE) { + /* Free texture objects on all devices */ + foreach (SubDevice *island_sub, island) { + if (island_sub != owner_sub) { + island_sub->device->mem_free(mem); + } + } + } + } + } mem.device = this; mem.device_pointer = 0; @@ -330,6 +481,8 @@ class MultiDevice : public Device { bool transparent, const DeviceDrawParams &draw_params) { + assert(rgba.type == MEM_PIXELS); + device_ptr key = 
rgba.device_pointer; int i = 0, sub_h = h / devices.size(); int sub_height = height / devices.size(); @@ -358,7 +511,7 @@ class MultiDevice : public Device { foreach (SubDevice &sub, devices) { if (sub.device == sub_device) { - tile.buffer = sub.ptr_map[tile.buffer]; + tile.buffer = find_matching_mem(tile.buffer, sub); return; } } @@ -517,16 +670,21 @@ class MultiDevice : public Device { DeviceTask subtask = tasks.front(); tasks.pop_front(); - if (task.buffer) + if (task.type == DeviceTask::DENOISE_BUFFER && !denoising_devices.empty()) { subtask.buffer = sub.ptr_map[task.buffer]; - if (task.rgba_byte) - subtask.rgba_byte = sub.ptr_map[task.rgba_byte]; - if (task.rgba_half) - subtask.rgba_half = sub.ptr_map[task.rgba_half]; - if (task.shader_input) - subtask.shader_input = sub.ptr_map[task.shader_input]; - if (task.shader_output) - subtask.shader_output = sub.ptr_map[task.shader_output]; + } + else { + if (task.buffer) + subtask.buffer = find_matching_mem(task.buffer, sub); + if (task.rgba_byte) + subtask.rgba_byte = sub.ptr_map[task.rgba_byte]; + if (task.rgba_half) + subtask.rgba_half = sub.ptr_map[task.rgba_half]; + if (task.shader_input) + subtask.shader_input = find_matching_mem(task.shader_input, sub); + if (task.shader_output) + subtask.shader_output = find_matching_mem(task.shader_output, sub); + } sub.device->task_add(subtask); } @@ -548,9 +706,6 @@ class MultiDevice : public Device { foreach (SubDevice &sub, denoising_devices) sub.device->task_cancel(); } - - protected: - Stats sub_stats_; }; Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background) |