git.blender.org/blender.git
 intern/cycles/blender/addon/properties.py      |  27
 intern/cycles/blender/blender_device.cpp       |   4
 intern/cycles/blender/blender_python.cpp       |   3
 intern/cycles/device/cuda/device_cuda.h        |   3
 intern/cycles/device/cuda/device_cuda_impl.cpp | 127
 intern/cycles/device/device.cpp                |   2
 intern/cycles/device/device.h                  |  13
 intern/cycles/device/device_cuda.cpp           |   9
 intern/cycles/device/device_memory.cpp         |   5
 intern/cycles/device/device_memory.h           |   2
 intern/cycles/device/device_multi.cpp          | 273
 11 files changed, 371 insertions(+), 97 deletions(-)
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index da18ac7c693..1635afab210 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1535,6 +1535,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
devices: bpy.props.CollectionProperty(type=CyclesDeviceSettings)
+ peer_memory: BoolProperty(
+ name="Distribute memory across devices",
+ description="Make more room for large scenes to fit by distributing memory across interconnected devices (e.g. via NVLink) rather than duplicating it",
+ default=False,
+ )
+
def find_existing_device_entry(self, device):
for device_entry in self.devices:
if device_entry.id == device[2] and device_entry.type == device[1]:
@@ -1632,14 +1638,21 @@ class CyclesPreferences(bpy.types.AddonPreferences):
row = layout.row()
row.prop(self, "compute_device_type", expand=True)
- devices = self.get_devices_for_type(self.compute_device_type)
+ if self.compute_device_type == 'NONE':
+ return
row = layout.row()
- if self.compute_device_type == 'CUDA':
- self._draw_devices(row, 'CUDA', devices)
- elif self.compute_device_type == 'OPTIX':
- self._draw_devices(row, 'OPTIX', devices)
- elif self.compute_device_type == 'OPENCL':
- self._draw_devices(row, 'OPENCL', devices)
+ devices = self.get_devices_for_type(self.compute_device_type)
+ self._draw_devices(row, self.compute_device_type, devices)
+
+ import _cycles
+ has_peer_memory = 0
+ for device in _cycles.available_devices(self.compute_device_type):
+ if device[3] and self.find_existing_device_entry(device).use:
+ has_peer_memory += 1
+ if has_peer_memory > 1:
+ row = layout.row()
+ row.use_property_split = True
+ row.prop(self, "peer_memory")
def draw(self, context):
self.draw_impl(self.layout, context)
diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp
index 5140f190f36..3a923459782 100644
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -113,6 +113,10 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
device = Device::get_multi_device(used_devices, threads, background);
}
/* Else keep using the CPU device that was set before. */
+
+ if (!get_boolean(cpreferences, "peer_memory")) {
+ device.has_peer_memory = false;
+ }
}
}
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 79c16856462..0be19dbffd1 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -416,10 +416,11 @@ static PyObject *available_devices_func(PyObject * /*self*/, PyObject *args)
for (size_t i = 0; i < devices.size(); i++) {
DeviceInfo &device = devices[i];
string type_name = Device::string_from_type(device.type);
- PyObject *device_tuple = PyTuple_New(3);
+ PyObject *device_tuple = PyTuple_New(4);
PyTuple_SET_ITEM(device_tuple, 0, pyunicode_from_string(device.description.c_str()));
PyTuple_SET_ITEM(device_tuple, 1, pyunicode_from_string(type_name.c_str()));
PyTuple_SET_ITEM(device_tuple, 2, pyunicode_from_string(device.id.c_str()));
+ PyTuple_SET_ITEM(device_tuple, 3, PyBool_FromLong(device.has_peer_memory));
PyTuple_SET_ITEM(ret, i, device_tuple);
}
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
index 3f23f0fe4c5..9f31ed12cf4 100644
--- a/intern/cycles/device/cuda/device_cuda.h
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -51,6 +51,7 @@ class CUDADevice : public Device {
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
+ int pitch_alignment;
int cuDevId;
int cuDevArchitecture;
bool first_error;
@@ -111,6 +112,8 @@ class CUDADevice : public Device {
bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
+ bool check_peer_access(Device *peer_device);
+
bool use_adaptive_compilation();
bool use_split_kernel();
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index acf53c3eb1b..64c7f5e7d34 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -207,6 +207,7 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
+ pitch_alignment = 0;
functions.loaded = false;
@@ -224,6 +225,9 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
cuda_assert(
cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+ cuda_assert(cuDeviceGetAttribute(
+ &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+
unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
if (can_map_host) {
ctx_flags |= CU_CTX_MAP_HOST;
@@ -286,6 +290,49 @@ bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_feat
return true;
}
+bool CUDADevice::check_peer_access(Device *peer_device)
+{
+ if (peer_device == this) {
+ return false;
+ }
+ if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
+ return false;
+ }
+
+ CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
+
+ int can_access = 0;
+ cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ // Ensure array access over the link is possible as well (for 3D textures)
+ cuda_assert(cuDeviceGetP2PAttribute(&can_access,
+ CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
+ cuDevice,
+ peer_device_cuda->cuDevice));
+ if (can_access == 0) {
+ return false;
+ }
+
+ // Enable peer access in both directions
+ {
+ const CUDAContextScope scope(this);
+ if (cuda_error(cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0))) {
+ return false;
+ }
+ }
+ {
+ const CUDAContextScope scope(peer_device_cuda);
+ if (cuda_error(cuCtxEnablePeerAccess(cuContext, 0))) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
bool CUDADevice::use_adaptive_compilation()
{
return DebugFlags().cuda.adaptive_compile;
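
For reference, a minimal standalone sketch of the bidirectional handshake that check_peer_access() performs, written against CUDA primary contexts rather than Cycles' CUDAContextScope (an illustrative assumption; error handling trimmed). cuCtxEnablePeerAccess() only grants the *current* context access to the peer's allocations, which is why the patch enables access once per direction:

    #include <cuda.h>

    /* Hypothetical helper, not part of the patch: returns true when the two
     * devices can both map each other's allocations. */
    static bool enable_bidirectional_peer_access(CUdevice dev_a, CUdevice dev_b)
    {
      int can_access = 0;
      cuDeviceCanAccessPeer(&can_access, dev_a, dev_b);
      if (!can_access) {
        return false;
      }

      CUcontext ctx_a, ctx_b;
      cuDevicePrimaryCtxRetain(&ctx_a, dev_a);
      cuDevicePrimaryCtxRetain(&ctx_b, dev_b);

      /* Grant the context on A access to B's memory (flags must be zero). */
      cuCtxPushCurrent(ctx_a);
      CUresult result = cuCtxEnablePeerAccess(ctx_b, 0);
      cuCtxPopCurrent(NULL);

      if (result == CUDA_SUCCESS) {
        /* And the reverse direction, so either GPU can read the other's data. */
        cuCtxPushCurrent(ctx_b);
        result = cuCtxEnablePeerAccess(ctx_a, 0);
        cuCtxPopCurrent(NULL);
      }
      return result == CUDA_SUCCESS;
    }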
@@ -674,6 +721,12 @@ void CUDADevice::load_texture_info()
void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
{
+ /* Break out of recursive call, which can happen when moving memory on a multi device. */
+ static bool any_device_moving_textures_to_host = false;
+ if (any_device_moving_textures_to_host) {
+ return;
+ }
+
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
@@ -687,6 +740,12 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
device_memory &mem = *pair.first;
CUDAMem *cmem = &pair.second;
+ /* Can only move textures allocated on this device (and not those from peer devices).
+ * And need to ignore memory that is already on the host. */
+ if (!mem.is_resident(this) || cmem->use_mapped_host) {
+ continue;
+ }
+
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
@@ -696,11 +755,6 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
continue;
}
- /* Already in host memory. */
- if (cmem->use_mapped_host) {
- continue;
- }
-
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
@@ -723,26 +777,30 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
- /* Preserve the original device pointer, in case of multi device
- * we can't change it because the pointer mapping would break. */
- device_ptr prev_pointer = max_mem->device_pointer;
- size_t prev_size = max_mem->device_size;
+ any_device_moving_textures_to_host = true;
- mem_copy_to(*max_mem);
+ /* Potentially need to call back into multi device, so pointer mapping
+ * and peer devices are updated. This is also necessary since the device
+ * pointer may just be a key here, so cannot be accessed and freed directly.
+ * Unfortunately it does mean that memory is reallocated on all other
+ * devices as well, which is potentially dangerous when still in use (since
+ * a thread rendering on another device would only be caught in this mutex
+ * if it so happens to do an allocation at the same time as well). */
+ max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
- max_mem->device_pointer = prev_pointer;
- max_mem->device_size = prev_size;
+ any_device_moving_textures_to_host = false;
}
else {
break;
}
}
+ /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+ move_texture_to_host = false;
+
/* Update texture info array with new pointers. */
load_texture_info();
-
- move_texture_to_host = false;
}
CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
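
A note on the recursion guard introduced in move_textures_to_host() above, reconstructed from the patch comments rather than authoritative: moving a texture now goes through device_copy_to(), which on a multi device re-enters MultiDevice::mem_copy_to() and may reallocate the buffer on peer devices; if one of those devices is also short on memory, it would call move_textures_to_host() again from inside the first call. A plausible re-entry path:

    /* CUDADevice::move_textures_to_host()            (device A low on memory)
     *   -> device_memory::device_copy_to()
     *     -> MultiDevice::mem_copy_to()              (updates pointer map and peers)
     *       -> CUDADevice::generic_alloc()           (device B also low on memory)
     *         -> CUDADevice::move_textures_to_host() <- cut short by the static flag */

Since the flag is static, one device moving textures suppresses the move on all CUDA devices until the triggering copy completes, matching the "break out of recursive call" comment.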
@@ -808,9 +866,6 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
map_host_used += size;
status = " in host memory";
}
- else {
- status = " failed, out of host memory";
- }
}
if (mem_alloc_result != CUDA_SUCCESS) {
@@ -906,7 +961,7 @@ void CUDADevice::generic_free(device_memory &mem)
}
else {
/* Free device memory. */
- cuMemFree(mem.device_pointer);
+ cuda_assert(cuMemFree(mem.device_pointer));
}
stats.mem_free(mem.device_size);
@@ -1032,18 +1087,17 @@ void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
void CUDADevice::global_alloc(device_memory &mem)
{
- CUDAContextScope scope(this);
-
- generic_alloc(mem);
- generic_copy_to(mem);
+ if (mem.is_resident(this)) {
+ generic_alloc(mem);
+ generic_copy_to(mem);
+ }
const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}
void CUDADevice::global_free(device_memory &mem)
{
- if (mem.device_pointer) {
- CUDAContextScope scope(this);
+ if (mem.is_resident(this) && mem.device_pointer) {
generic_free(mem);
}
}
@@ -1112,7 +1166,19 @@ void CUDADevice::tex_alloc(device_texture &mem)
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
- if (mem.data_depth > 1) {
+ if (!mem.is_resident(this)) {
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+
+ if (mem.data_depth > 1) {
+ array_3d = (CUarray)mem.device_pointer;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ dst_pitch = align_up(src_pitch, pitch_alignment);
+ }
+ }
+ else if (mem.data_depth > 1) {
/* 3D texture using array, there is no API for linear memory. */
CUDA_ARRAY3D_DESCRIPTOR desc;
@@ -1156,10 +1222,7 @@ void CUDADevice::tex_alloc(device_texture &mem)
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
- int alignment = 0;
- cuda_assert(
- cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
- dst_pitch = align_up(src_pitch, alignment);
+ dst_pitch = align_up(src_pitch, pitch_alignment);
size_t dst_size = dst_pitch * mem.data_height;
cmem = generic_alloc(mem, dst_size - mem.memory_size());
@@ -1251,7 +1314,11 @@ void CUDADevice::tex_free(device_texture &mem)
cuTexObjectDestroy(cmem.texobject);
}
- if (cmem.array) {
+ if (!mem.is_resident(this)) {
+ /* Do not free memory here, since it was allocated on a different device. */
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else if (cmem.array) {
/* Free array. */
cuArrayDestroy(cmem.array);
stats.mem_free(mem.device_size);
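
Two smaller details in this file: cuMemFree() is now wrapped in cuda_assert() so driver errors during frees are no longer silently dropped, and the texture pitch alignment is queried once in the constructor instead of per allocation. Caching it also lets tex_alloc() compute the pitch of a 2D texture that is resident on a peer device without switching to that device's context. A sketch of the rounding involved, assuming the usual power-of-two align_up() from Cycles' utility headers:

    #include <stddef.h>

    /* Round 'offset' up to the next multiple of 'alignment' (a power of two). */
    static size_t align_up(size_t offset, size_t alignment)
    {
      return (offset + alignment - 1) & ~(alignment - 1);
    }

    /* e.g. align_up(1000, 32) == 1024: with a 32-byte texture pitch alignment
     * (hypothetical value), every row of the linear 2D texture starts on a
     * boundary the texture hardware can address. */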
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index bad156d40bf..41dd7894d93 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -602,6 +602,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.has_adaptive_stop_per_sample = true;
info.has_osl = true;
info.has_profiling = true;
+ info.has_peer_memory = false;
foreach (const DeviceInfo &device, subdevices) {
/* Ensure CPU device does not slow down GPU. */
@@ -645,6 +646,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample;
info.has_osl &= device.has_osl;
info.has_profiling &= device.has_profiling;
+ info.has_peer_memory |= device.has_peer_memory;
}
return info;
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index c55dfb3a83b..dff981080a5 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -82,6 +82,7 @@ class DeviceInfo {
bool has_osl; /* Support Open Shading Language. */
bool use_split_kernel; /* Use split or mega kernel. */
bool has_profiling; /* Supports runtime collection of profiling info. */
+ bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
int cpu_threads;
vector<DeviceInfo> multi_devices;
vector<DeviceInfo> denoising_devices;
@@ -99,6 +100,7 @@ class DeviceInfo {
has_osl = false;
use_split_kernel = false;
has_profiling = false;
+ has_peer_memory = false;
}
bool operator==(const DeviceInfo &info)
@@ -435,6 +437,17 @@ class Device {
{
}
+ virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
+ {
+ /* Memory is always resident if this is not a multi device, regardless of whether the pointer
+ * is valid or not (since it may not have been allocated yet). */
+ return sub_device == this;
+ }
+ virtual bool check_peer_access(Device * /*peer_device*/)
+ {
+ return false;
+ }
+
/* static */
static Device *create(DeviceInfo &info,
Stats &stats,
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 4a53fcd151d..04c04761311 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -131,6 +131,15 @@ void device_cuda_info(vector<DeviceInfo> &devices)
info.has_volume_decoupled = false;
info.has_adaptive_stop_per_sample = false;
+ /* Check if the device has P2P access to any other device in the system. */
+ for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
+ if (num != peer_num) {
+ int can_access = 0;
+ cuDeviceCanAccessPeer(&can_access, num, peer_num);
+ info.has_peer_memory = (can_access != 0);
+ }
+ }
+
int pci_location[3] = {0, 0, 0};
cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num);
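
The new loop stops at the first reachable peer, since a single link is enough to advertise the device as peer-memory capable. A standalone probe of the same reachability matrix might look like the following (an illustration built from the same driver-API call, plus cuInit/cuDeviceGet):

    #include <cuda.h>
    #include <stdio.h>

    int main(void)
    {
      cuInit(0);
      int count = 0;
      cuDeviceGetCount(&count);

      for (int num = 0; num < count; num++) {
        CUdevice device;
        cuDeviceGet(&device, num);
        for (int peer_num = 0; peer_num < count; peer_num++) {
          if (num == peer_num) {
            continue;
          }
          CUdevice peer_device;
          cuDeviceGet(&peer_device, peer_num);
          int can_access = 0;
          cuDeviceCanAccessPeer(&can_access, device, peer_device);
          printf("GPU %d -> GPU %d: %s\n", num, peer_num, can_access ? "peer" : "-");
        }
      }
      return 0;
    }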
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index 671cd7c29f3..8064d50d31f 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -125,6 +125,11 @@ void device_memory::restore_device()
device_pointer = original_device_ptr;
}
+bool device_memory::is_resident(Device *sub_device) const
+{
+ return device->is_resident(device_pointer, sub_device);
+}
+
/* Device Sub Ptr */
device_sub_ptr::device_sub_ptr(device_memory &mem, int offset, int size) : device(mem.device)
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 1c20db900bc..32654e62a6f 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -230,6 +230,8 @@ class device_memory {
void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr);
void restore_device();
+ bool is_resident(Device *sub_device) const;
+
protected:
friend class CUDADevice;
friend class OptiXDevice;
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 3636ecaa7a1..77ede3bf62a 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -34,37 +34,66 @@ CCL_NAMESPACE_BEGIN
class MultiDevice : public Device {
public:
struct SubDevice {
- explicit SubDevice(Device *device_) : device(device_)
- {
- }
-
+ Stats stats;
Device *device;
map<device_ptr, device_ptr> ptr_map;
+ int peer_island_index = -1;
};
list<SubDevice> devices, denoising_devices;
device_ptr unique_key;
+ vector<vector<SubDevice *>> peer_islands;
MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
: Device(info, stats, profiler, background_), unique_key(1)
{
foreach (DeviceInfo &subinfo, info.multi_devices) {
- Device *device = Device::create(subinfo, sub_stats_, profiler, background);
-
/* Always add CPU devices at the back since GPU devices can change
* host memory pointers, which CPU uses as device pointer. */
+ SubDevice *sub;
if (subinfo.type == DEVICE_CPU) {
- devices.push_back(SubDevice(device));
+ devices.emplace_back();
+ sub = &devices.back();
}
else {
- devices.push_front(SubDevice(device));
+ devices.emplace_front();
+ sub = &devices.front();
}
+
+ /* The pointer to 'sub->stats' will stay valid even after new devices
+ * are added, since 'devices' is a linked list. */
+ sub->device = Device::create(subinfo, sub->stats, profiler, background);
}
foreach (DeviceInfo &subinfo, info.denoising_devices) {
- Device *device = Device::create(subinfo, sub_stats_, profiler, background);
+ denoising_devices.emplace_back();
+ SubDevice *sub = &denoising_devices.back();
+
+ sub->device = Device::create(subinfo, sub->stats, profiler, background);
+ }
+
+ /* Build a list of peer islands for the available render devices */
+ foreach (SubDevice &sub, devices) {
+ /* First, ensure that every device is in at least one peer island */
+ if (sub.peer_island_index < 0) {
+ peer_islands.emplace_back();
+ sub.peer_island_index = (int)peer_islands.size() - 1;
+ peer_islands[sub.peer_island_index].push_back(&sub);
+ }
+
+ if (!info.has_peer_memory) {
+ continue;
+ }
- denoising_devices.push_back(SubDevice(device));
+ /* Second, check peer access between devices and fill the islands accordingly */
+ foreach (SubDevice &peer_sub, devices) {
+ if (peer_sub.peer_island_index < 0 &&
+ peer_sub.device->info.type == sub.device->info.type &&
+ peer_sub.device->check_peer_access(sub.device)) {
+ peer_sub.peer_island_index = sub.peer_island_index;
+ peer_islands[sub.peer_island_index].push_back(&peer_sub);
+ }
+ }
}
#ifdef WITH_NETWORK
@@ -175,11 +204,11 @@ class MultiDevice : public Device {
bool build_optix_bvh(BVH *bvh)
{
- // Broadcast acceleration structure build to all render devices
- foreach (SubDevice &sub, devices)
+ /* Broadcast acceleration structure build to all render devices */
+ foreach (SubDevice &sub, devices) {
if (!sub.device->build_optix_bvh(bvh))
return false;
-
+ }
return true;
}
@@ -191,17 +220,82 @@ class MultiDevice : public Device {
return devices.front().device->osl_memory();
}
+ bool is_resident(device_ptr key, Device *sub_device) override
+ {
+ foreach (SubDevice &sub, devices) {
+ if (sub.device == sub_device) {
+ return find_matching_mem_device(key, sub)->device == sub_device;
+ }
+ }
+ return false;
+ }
+
+ SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
+ {
+ assert(sub.peer_island_index >= 0 && key != 0);
+
+ /* Get the memory owner of this key (first try current device, then peer devices) */
+ SubDevice *owner_sub = &sub;
+ if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
+ foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
+ if (island_sub != owner_sub &&
+ island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
+ owner_sub = island_sub;
+ }
+ }
+ }
+ return owner_sub;
+ }
+
+ SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
+ {
+ assert(!island.empty());
+
+ /* Get the memory owner of this key or the device with the lowest memory usage when new */
+ SubDevice *owner_sub = island.front();
+ foreach (SubDevice *island_sub, island) {
+ if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
+ (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
+ owner_sub = island_sub;
+ }
+ }
+ return owner_sub;
+ }
+
+ inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
+ {
+ return find_matching_mem_device(key, sub)->ptr_map[key];
+ }
+
void mem_alloc(device_memory &mem)
{
device_ptr key = unique_key++;
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = 0;
- mem.device_size = 0;
+ if (mem.type == MEM_PIXELS) {
+ /* Always allocate pixels memory on all devices.
+ * This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses. */
+ foreach (SubDevice &sub, devices) {
+ mem.device = sub.device;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
- sub.device->mem_alloc(mem);
- sub.ptr_map[key] = mem.device_pointer;
+ sub.device->mem_alloc(mem);
+ sub.ptr_map[key] = mem.device_pointer;
+ }
+ }
+ else {
+ assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE ||
+ mem.type == MEM_DEVICE_ONLY);
+ /* The remaining memory types can be distributed across devices */
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ owner_sub->device->mem_alloc(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
}
mem.device = this;
@@ -215,13 +309,36 @@ class MultiDevice : public Device {
device_ptr key = (existing_key) ? existing_key : unique_key++;
size_t existing_size = mem.device_size;
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
+ /* The tile buffers are allocated on each device (see below), so copy to all of them */
+ if (strcmp(mem.name, "RenderBuffers") == 0) {
+ foreach (SubDevice &sub, devices) {
+ mem.device = sub.device;
+ mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ sub.device->mem_copy_to(mem);
+ sub.ptr_map[key] = mem.device_pointer;
+ }
+ }
+ else {
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_copy_to(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
- sub.device->mem_copy_to(mem);
- sub.ptr_map[key] = mem.device_pointer;
+ if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
+ /* Need to create texture objects and update pointer in kernel globals on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_copy_to(mem);
+ }
+ }
+ }
+ }
}
mem.device = this;
@@ -238,10 +355,11 @@ class MultiDevice : public Device {
int sy = y + i * sub_h;
int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
+ SubDevice *owner_sub = find_matching_mem_device(key, sub);
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
- sub.device->mem_copy_from(mem, sy, w, sh, elem);
+ owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
i++;
}
@@ -255,16 +373,18 @@ class MultiDevice : public Device {
device_ptr key = (existing_key) ? existing_key : unique_key++;
size_t existing_size = mem.device_size;
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
- mem.device_size = existing_size;
-
- sub.device->mem_zero(mem);
- sub.ptr_map[key] = mem.device_pointer;
- }
-
+ /* This is a hack to only allocate the tile buffers on denoising devices.
+ * Similarly, the tile buffers also need to be allocated separately on all devices so that
+ * overlap rendered for denoising on one device does not interfere with the others. */
if (strcmp(mem.name, "RenderBuffers") == 0) {
+ foreach (SubDevice &sub, devices) {
+ mem.device = sub.device;
+ mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ sub.device->mem_zero(mem);
+ sub.ptr_map[key] = mem.device_pointer;
+ }
foreach (SubDevice &sub, denoising_devices) {
mem.device = sub.device;
mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
@@ -274,6 +394,17 @@ class MultiDevice : public Device {
sub.ptr_map[key] = mem.device_pointer;
}
}
+ else {
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+ mem.device = owner_sub->device;
+ mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_zero(mem);
+ owner_sub->ptr_map[key] = mem.device_pointer;
+ }
+ }
mem.device = this;
mem.device_pointer = key;
@@ -285,16 +416,16 @@ class MultiDevice : public Device {
device_ptr key = mem.device_pointer;
size_t existing_size = mem.device_size;
- foreach (SubDevice &sub, devices) {
- mem.device = sub.device;
- mem.device_pointer = sub.ptr_map[key];
- mem.device_size = existing_size;
-
- sub.device->mem_free(mem);
- sub.ptr_map.erase(sub.ptr_map.find(key));
- }
+ /* Free memory that was allocated for all devices (see above) on each device */
+ if (strcmp(mem.name, "RenderBuffers") == 0 || mem.type == MEM_PIXELS) {
+ foreach (SubDevice &sub, devices) {
+ mem.device = sub.device;
+ mem.device_pointer = sub.ptr_map[key];
+ mem.device_size = existing_size;
- if (strcmp(mem.name, "RenderBuffers") == 0) {
+ sub.device->mem_free(mem);
+ sub.ptr_map.erase(sub.ptr_map.find(key));
+ }
foreach (SubDevice &sub, denoising_devices) {
mem.device = sub.device;
mem.device_pointer = sub.ptr_map[key];
@@ -304,6 +435,26 @@ class MultiDevice : public Device {
sub.ptr_map.erase(sub.ptr_map.find(key));
}
}
+ else {
+ foreach (const vector<SubDevice *> &island, peer_islands) {
+ SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
+ mem.device = owner_sub->device;
+ mem.device_pointer = owner_sub->ptr_map[key];
+ mem.device_size = existing_size;
+
+ owner_sub->device->mem_free(mem);
+ owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
+
+ if (mem.type == MEM_TEXTURE) {
+ /* Free texture objects on all devices */
+ foreach (SubDevice *island_sub, island) {
+ if (island_sub != owner_sub) {
+ island_sub->device->mem_free(mem);
+ }
+ }
+ }
+ }
+ }
mem.device = this;
mem.device_pointer = 0;
@@ -330,6 +481,8 @@ class MultiDevice : public Device {
bool transparent,
const DeviceDrawParams &draw_params)
{
+ assert(rgba.type == MEM_PIXELS);
+
device_ptr key = rgba.device_pointer;
int i = 0, sub_h = h / devices.size();
int sub_height = height / devices.size();
@@ -358,7 +511,7 @@ class MultiDevice : public Device {
foreach (SubDevice &sub, devices) {
if (sub.device == sub_device) {
- tile.buffer = sub.ptr_map[tile.buffer];
+ tile.buffer = find_matching_mem(tile.buffer, sub);
return;
}
}
@@ -517,16 +670,21 @@ class MultiDevice : public Device {
DeviceTask subtask = tasks.front();
tasks.pop_front();
- if (task.buffer)
+ if (task.type == DeviceTask::DENOISE_BUFFER && !denoising_devices.empty()) {
subtask.buffer = sub.ptr_map[task.buffer];
- if (task.rgba_byte)
- subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
- if (task.rgba_half)
- subtask.rgba_half = sub.ptr_map[task.rgba_half];
- if (task.shader_input)
- subtask.shader_input = sub.ptr_map[task.shader_input];
- if (task.shader_output)
- subtask.shader_output = sub.ptr_map[task.shader_output];
+ }
+ else {
+ if (task.buffer)
+ subtask.buffer = find_matching_mem(task.buffer, sub);
+ if (task.rgba_byte)
+ subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
+ if (task.rgba_half)
+ subtask.rgba_half = sub.ptr_map[task.rgba_half];
+ if (task.shader_input)
+ subtask.shader_input = find_matching_mem(task.shader_input, sub);
+ if (task.shader_output)
+ subtask.shader_output = find_matching_mem(task.shader_output, sub);
+ }
sub.device->task_add(subtask);
}
@@ -548,9 +706,6 @@ class MultiDevice : public Device {
foreach (SubDevice &sub, denoising_devices)
sub.device->task_cancel();
}
-
- protected:
- Stats sub_stats_;
};
Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
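
Taken together, the constructor changes partition the render devices into peer islands (groups with mutual memory access), and mem_alloc()/mem_copy_to()/mem_zero()/mem_free() then keep one copy of each distributable buffer per island instead of one per device. Below is a condensed, self-contained sketch of the island construction, with a hypothetical can_peer() standing in for the type check plus Device::check_peer_access() used above. (The original stores SubDevice in a linked list so pointers like &sub->stats stay valid while devices are appended; this sketch sidesteps that by filling the vector first.)

    #include <cstdio>
    #include <vector>

    struct SubDevice {
      int id;
      int peer_island_index = -1;
    };

    /* Hypothetical adjacency: pretend consecutive id pairs share an NVLink
     * bridge. Cycles instead calls Device::check_peer_access(). */
    static bool can_peer(const SubDevice &a, const SubDevice &b)
    {
      return a.id / 2 == b.id / 2;
    }

    static std::vector<std::vector<SubDevice *>> build_islands(std::vector<SubDevice> &devices)
    {
      std::vector<std::vector<SubDevice *>> islands;
      for (SubDevice &sub : devices) {
        /* First, ensure that every device is in at least one peer island. */
        if (sub.peer_island_index < 0) {
          sub.peer_island_index = (int)islands.size();
          islands.push_back({&sub});
        }
        /* Then pull in every still-unassigned device that can peer with it. */
        for (SubDevice &peer_sub : devices) {
          if (peer_sub.peer_island_index < 0 && can_peer(peer_sub, sub)) {
            peer_sub.peer_island_index = sub.peer_island_index;
            islands[sub.peer_island_index].push_back(&peer_sub);
          }
        }
      }
      return islands;
    }

    int main()
    {
      std::vector<SubDevice> devices = {{0}, {1}, {2}, {3}};
      for (const std::vector<SubDevice *> &island : build_islands(devices)) {
        for (const SubDevice *sub : island) {
          printf("%d ", sub->id); /* Prints "0 1" and "2 3" with can_peer() above. */
        }
        printf("\n");
      }
      return 0;
    }

Within an island, find_suitable_mem_device() picks either the existing owner of a key or, for a new allocation, the member with the lowest stats.mem_used, so buffers spread across the island while later updates stay on their owning device.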