11 files changed, 371 insertions, 97 deletions
diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index da18ac7c693..1635afab210 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1535,6 +1535,12 @@ class CyclesPreferences(bpy.types.AddonPreferences):
 
     devices: bpy.props.CollectionProperty(type=CyclesDeviceSettings)
 
+    peer_memory: BoolProperty(
+        name="Distribute memory across devices",
+        description="Make more room for large scenes to fit by distributing memory across interconnected devices (e.g. via NVLink) rather than duplicating it",
+        default=False,
+    )
+
     def find_existing_device_entry(self, device):
         for device_entry in self.devices:
             if device_entry.id == device[2] and device_entry.type == device[1]:
@@ -1632,14 +1638,21 @@ class CyclesPreferences(bpy.types.AddonPreferences):
         row = layout.row()
         row.prop(self, "compute_device_type", expand=True)
 
-        devices = self.get_devices_for_type(self.compute_device_type)
+        if self.compute_device_type == 'NONE':
+            return
         row = layout.row()
-        if self.compute_device_type == 'CUDA':
-            self._draw_devices(row, 'CUDA', devices)
-        elif self.compute_device_type == 'OPTIX':
-            self._draw_devices(row, 'OPTIX', devices)
-        elif self.compute_device_type == 'OPENCL':
-            self._draw_devices(row, 'OPENCL', devices)
+        devices = self.get_devices_for_type(self.compute_device_type)
+        self._draw_devices(row, self.compute_device_type, devices)
+
+        import _cycles
+        has_peer_memory = 0
+        for device in _cycles.available_devices(self.compute_device_type):
+            if device[3] and self.find_existing_device_entry(device).use:
+                has_peer_memory += 1
+        if has_peer_memory > 1:
+            row = layout.row()
+            row.use_property_split = True
+            row.prop(self, "peer_memory")
 
     def draw(self, context):
         self.draw_impl(self.layout, context)
diff --git a/intern/cycles/blender/blender_device.cpp b/intern/cycles/blender/blender_device.cpp
index 5140f190f36..3a923459782 100644
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -113,6 +113,10 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
         device = Device::get_multi_device(used_devices, threads, background);
       }
       /* Else keep using the CPU device that was set before. */
+
+      if (!get_boolean(cpreferences, "peer_memory")) {
+        device.has_peer_memory = false;
+      }
     }
   }
 
diff --git a/intern/cycles/blender/blender_python.cpp b/intern/cycles/blender/blender_python.cpp
index 79c16856462..0be19dbffd1 100644
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -416,10 +416,11 @@ static PyObject *available_devices_func(PyObject * /*self*/, PyObject *args)
   for (size_t i = 0; i < devices.size(); i++) {
     DeviceInfo &device = devices[i];
     string type_name = Device::string_from_type(device.type);
-    PyObject *device_tuple = PyTuple_New(3);
+    PyObject *device_tuple = PyTuple_New(4);
     PyTuple_SET_ITEM(device_tuple, 0, pyunicode_from_string(device.description.c_str()));
     PyTuple_SET_ITEM(device_tuple, 1, pyunicode_from_string(type_name.c_str()));
     PyTuple_SET_ITEM(device_tuple, 2, pyunicode_from_string(device.id.c_str()));
+    PyTuple_SET_ITEM(device_tuple, 3, PyBool_FromLong(device.has_peer_memory));
     PyTuple_SET_ITEM(ret, i, device_tuple);
   }
 
diff --git a/intern/cycles/device/cuda/device_cuda.h b/intern/cycles/device/cuda/device_cuda.h
index 3f23f0fe4c5..9f31ed12cf4 100644
--- a/intern/cycles/device/cuda/device_cuda.h
+++ b/intern/cycles/device/cuda/device_cuda.h
@@ -51,6 +51,7 @@ class CUDADevice : public Device {
   size_t map_host_used;
   size_t map_host_limit;
   int can_map_host;
+  int pitch_alignment;
   int cuDevId;
   int cuDevArchitecture;
   bool first_error;
@@ -111,6 +112,8 @@ class CUDADevice : public Device {
 
   bool support_device(const DeviceRequestedFeatures & /*requested_features*/);
 
+  bool check_peer_access(Device *peer_device);
+
   bool use_adaptive_compilation();
 
   bool use_split_kernel();
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp
index acf53c3eb1b..64c7f5e7d34 100644
--- a/intern/cycles/device/cuda/device_cuda_impl.cpp
+++ b/intern/cycles/device/cuda/device_cuda_impl.cpp
@@ -207,6 +207,7 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
   map_host_limit = 0;
   map_host_used = 0;
   can_map_host = 0;
+  pitch_alignment = 0;
 
   functions.loaded = false;
 
@@ -224,6 +225,9 @@ CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool
   cuda_assert(
       cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
 
+  cuda_assert(cuDeviceGetAttribute(
+      &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+
   unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
   if (can_map_host) {
     ctx_flags |= CU_CTX_MAP_HOST;
@@ -286,6 +290,49 @@ bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_feat
   return true;
 }
 
+bool CUDADevice::check_peer_access(Device *peer_device)
+{
+  if (peer_device == this) {
+    return false;
+  }
+  if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
+    return false;
+  }
+
+  CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
+
+  int can_access = 0;
+  cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
+  if (can_access == 0) {
+    return false;
+  }
+
+  // Ensure array access over the link is possible as well (for 3D textures)
+  cuda_assert(cuDeviceGetP2PAttribute(&can_access,
+                                      CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
+                                      cuDevice,
+                                      peer_device_cuda->cuDevice));
+  if (can_access == 0) {
+    return false;
+  }
+
+  // Enable peer access in both directions
+  {
+    const CUDAContextScope scope(this);
+    if (cuda_error(cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0))) {
+      return false;
+    }
+  }
+  {
+    const CUDAContextScope scope(peer_device_cuda);
+    if (cuda_error(cuCtxEnablePeerAccess(cuContext, 0))) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
 bool CUDADevice::use_adaptive_compilation()
 {
   return DebugFlags().cuda.adaptive_compile;
@@ -674,6 +721,12 @@ void CUDADevice::load_texture_info()
 
 void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
 {
+  /* Break out of recursive call, which can happen when moving memory on a multi device. */
+  static bool any_device_moving_textures_to_host = false;
+  if (any_device_moving_textures_to_host) {
+    return;
+  }
+
   /* Signal to reallocate textures in host memory only. */
   move_texture_to_host = true;
 
@@ -687,6 +740,12 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
       device_memory &mem = *pair.first;
       CUDAMem *cmem = &pair.second;
 
+      /* Can only move textures allocated on this device (and not those from peer devices).
+       * And need to ignore memory that is already on the host. */
+      if (!mem.is_resident(this) || cmem->use_mapped_host) {
+        continue;
+      }
+
       bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
                         (&mem != &texture_info);
       bool is_image = is_texture && (mem.data_height > 1);
@@ -696,11 +755,6 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
         continue;
       }
 
-      /* Already in host memory. */
-      if (cmem->use_mapped_host) {
-        continue;
-      }
-
       /* For other textures, only move image textures. */
       if (for_texture && !is_image) {
         continue;
@@ -723,26 +777,30 @@ void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
       static thread_mutex move_mutex;
       thread_scoped_lock lock(move_mutex);
 
-      /* Preserve the original device pointer, in case of multi device
-       * we can't change it because the pointer mapping would break. */
-      device_ptr prev_pointer = max_mem->device_pointer;
-      size_t prev_size = max_mem->device_size;
+      any_device_moving_textures_to_host = true;
 
-      mem_copy_to(*max_mem);
+      /* Potentially need to call back into multi device, so pointer mapping
+       * and peer devices are updated. This is also necessary since the device
+       * pointer may just be a key here, so cannot be accessed and freed directly.
+       * Unfortunately it does mean that memory is reallocated on all other
+       * devices as well, which is potentially dangerous when still in use (since
+       * a thread rendering on another device would only be caught in this mutex
+       * if it so happens to do an allocation at the same time as well). */
+      max_mem->device_copy_to();
       size = (max_size >= size) ? 0 : size - max_size;
 
-      max_mem->device_pointer = prev_pointer;
-      max_mem->device_size = prev_size;
+      any_device_moving_textures_to_host = false;
     }
     else {
       break;
     }
   }
 
+  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+  move_texture_to_host = false;
+
   /* Update texture info array with new pointers. */
   load_texture_info();
-
-  move_texture_to_host = false;
 }
 
 CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
@@ -808,9 +866,6 @@ CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_
       map_host_used += size;
       status = " in host memory";
     }
-    else {
-      status = " failed, out of host memory";
-    }
   }
 
   if (mem_alloc_result != CUDA_SUCCESS) {
@@ -906,7 +961,7 @@ void CUDADevice::generic_free(device_memory &mem)
     }
     else {
       /* Free device memory. */
-      cuMemFree(mem.device_pointer);
+      cuda_assert(cuMemFree(mem.device_pointer));
     }
 
     stats.mem_free(mem.device_size);
@@ -1032,18 +1087,17 @@ void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
 
 void CUDADevice::global_alloc(device_memory &mem)
 {
-  CUDAContextScope scope(this);
-
-  generic_alloc(mem);
-  generic_copy_to(mem);
+  if (mem.is_resident(this)) {
+    generic_alloc(mem);
+    generic_copy_to(mem);
+  }
 
   const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
 }
 
 void CUDADevice::global_free(device_memory &mem)
 {
-  if (mem.device_pointer) {
-    CUDAContextScope scope(this);
+  if (mem.is_resident(this) && mem.device_pointer) {
     generic_free(mem);
   }
 }
@@ -1112,7 +1166,19 @@ void CUDADevice::tex_alloc(device_texture &mem)
   size_t src_pitch = mem.data_width * dsize * mem.data_elements;
   size_t dst_pitch = src_pitch;
 
-  if (mem.data_depth > 1) {
+  if (!mem.is_resident(this)) {
+    cmem = &cuda_mem_map[&mem];
+    cmem->texobject = 0;
+
+    if (mem.data_depth > 1) {
+      array_3d = (CUarray)mem.device_pointer;
+      cmem->array = array_3d;
+    }
+    else if (mem.data_height > 0) {
+      dst_pitch = align_up(src_pitch, pitch_alignment);
+    }
+  }
+  else if (mem.data_depth > 1) {
     /* 3D texture using array, there is no API for linear memory. */
     CUDA_ARRAY3D_DESCRIPTOR desc;
 
@@ -1156,10 +1222,7 @@ void CUDADevice::tex_alloc(device_texture &mem)
   }
   else if (mem.data_height > 0) {
     /* 2D texture, using pitch aligned linear memory. */
-    int alignment = 0;
-    cuda_assert(
-        cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
-    dst_pitch = align_up(src_pitch, alignment);
+    dst_pitch = align_up(src_pitch, pitch_alignment);
     size_t dst_size = dst_pitch * mem.data_height;
 
     cmem = generic_alloc(mem, dst_size - mem.memory_size());
@@ -1251,7 +1314,11 @@ void CUDADevice::tex_free(device_texture &mem)
       cuTexObjectDestroy(cmem.texobject);
     }
 
-    if (cmem.array) {
+    if (!mem.is_resident(this)) {
+      /* Do not free memory here, since it was allocated on a different device. */
+      cuda_mem_map.erase(cuda_mem_map.find(&mem));
+    }
+    else if (cmem.array) {
       /* Free array. */
       cuArrayDestroy(cmem.array);
       stats.mem_free(mem.device_size);
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index bad156d40bf..41dd7894d93 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -602,6 +602,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
   info.has_adaptive_stop_per_sample = true;
   info.has_osl = true;
   info.has_profiling = true;
+  info.has_peer_memory = false;
 
   foreach (const DeviceInfo &device, subdevices) {
     /* Ensure CPU device does not slow down GPU. */
@@ -645,6 +646,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
     info.has_adaptive_stop_per_sample &= device.has_adaptive_stop_per_sample;
     info.has_osl &= device.has_osl;
     info.has_profiling &= device.has_profiling;
+    info.has_peer_memory |= device.has_peer_memory;
   }
 
   return info;
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index c55dfb3a83b..dff981080a5 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -82,6 +82,7 @@ class DeviceInfo {
   bool has_osl;                      /* Support Open Shading Language. */
   bool use_split_kernel;             /* Use split or mega kernel. */
   bool has_profiling;                /* Supports runtime collection of profiling info. */
+  bool has_peer_memory;              /* GPU has P2P access to memory of another GPU. */
   int cpu_threads;
   vector<DeviceInfo> multi_devices;
   vector<DeviceInfo> denoising_devices;
@@ -99,6 +100,7 @@ class DeviceInfo {
     has_osl = false;
     use_split_kernel = false;
     has_profiling = false;
+    has_peer_memory = false;
   }
 
   bool operator==(const DeviceInfo &info)
@@ -435,6 +437,17 @@ class Device {
   {
   }
 
+  virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
+  {
+    /* Memory is always resident if this is not a multi device, regardless of whether the pointer
+     * is valid or not (since it may not have been allocated yet). */
+    return sub_device == this;
+  }
+  virtual bool check_peer_access(Device * /*peer_device*/)
+  {
+    return false;
+  }
+
   /* static */
   static Device *create(DeviceInfo &info,
                         Stats &stats,
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 4a53fcd151d..04c04761311 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -131,6 +131,15 @@ void device_cuda_info(vector<DeviceInfo> &devices)
     info.has_volume_decoupled = false;
     info.has_adaptive_stop_per_sample = false;
 
+    /* Check if the device has P2P access to any other device in the system. */
+    for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
+      if (num != peer_num) {
+        int can_access = 0;
+        cuDeviceCanAccessPeer(&can_access, num, peer_num);
+        info.has_peer_memory = (can_access != 0);
+      }
+    }
+
     int pci_location[3] = {0, 0, 0};
     cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
     cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num);
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index 671cd7c29f3..8064d50d31f 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -125,6 +125,11 @@ void device_memory::restore_device()
   device_pointer = original_device_ptr;
 }
 
+bool device_memory::is_resident(Device *sub_device) const
+{
+  return device->is_resident(device_pointer, sub_device);
+}
+
 /* Device Sub Ptr */
 
 device_sub_ptr::device_sub_ptr(device_memory &mem, int offset, int size) : device(mem.device)
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 1c20db900bc..32654e62a6f 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -230,6 +230,8 @@ class device_memory {
   void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr);
   void restore_device();
 
+  bool is_resident(Device *sub_device) const;
+
  protected:
   friend class CUDADevice;
   friend class OptiXDevice;
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 3636ecaa7a1..77ede3bf62a 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -34,37 +34,66 @@ CCL_NAMESPACE_BEGIN
 class MultiDevice : public Device {
  public:
   struct SubDevice {
-    explicit SubDevice(Device *device_) : device(device_)
-    {
-    }
-
+    Stats stats;
     Device *device;
     map<device_ptr, device_ptr> ptr_map;
+    int peer_island_index = -1;
   };
 
   list<SubDevice> devices, denoising_devices;
   device_ptr unique_key;
+  vector<vector<SubDevice *>> peer_islands;
 
   MultiDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
       : Device(info, stats, profiler, background_), unique_key(1)
   {
     foreach (DeviceInfo &subinfo, info.multi_devices) {
-      Device *device = Device::create(subinfo, sub_stats_, profiler, background);
-
       /* Always add CPU devices at the back since GPU devices can change
        * host memory pointers, which CPU uses as device pointer. */
+      SubDevice *sub;
       if (subinfo.type == DEVICE_CPU) {
-        devices.push_back(SubDevice(device));
+        devices.emplace_back();
+        sub = &devices.back();
       }
       else {
-        devices.push_front(SubDevice(device));
+        devices.emplace_front();
+        sub = &devices.front();
       }
+
+      /* The pointer to 'sub->stats' will stay valid even after new devices
+       * are added, since 'devices' is a linked list. */
+      sub->device = Device::create(subinfo, sub->stats, profiler, background);
     }
 
     foreach (DeviceInfo &subinfo, info.denoising_devices) {
-      Device *device = Device::create(subinfo, sub_stats_, profiler, background);
+      denoising_devices.emplace_back();
+      SubDevice *sub = &denoising_devices.back();
+
+      sub->device = Device::create(subinfo, sub->stats, profiler, background);
+    }
+
+    /* Build a list of peer islands for the available render devices */
+    foreach (SubDevice &sub, devices) {
+      /* First ensure that every device is in at least one peer island */
+      if (sub.peer_island_index < 0) {
+        peer_islands.emplace_back();
+        sub.peer_island_index = (int)peer_islands.size() - 1;
+        peer_islands[sub.peer_island_index].push_back(&sub);
+      }
+
+      if (!info.has_peer_memory) {
+        continue;
+      }
 
-      denoising_devices.push_back(SubDevice(device));
+      /* Second check peer access between devices and fill up the islands accordingly */
+      foreach (SubDevice &peer_sub, devices) {
+        if (peer_sub.peer_island_index < 0 &&
+            peer_sub.device->info.type == sub.device->info.type &&
+            peer_sub.device->check_peer_access(sub.device)) {
+          peer_sub.peer_island_index = sub.peer_island_index;
+          peer_islands[sub.peer_island_index].push_back(&peer_sub);
+        }
+      }
     }
 
 #ifdef WITH_NETWORK
@@ -175,11 +204,11 @@ class MultiDevice : public Device {
 
   bool build_optix_bvh(BVH *bvh)
   {
-    // Broadcast acceleration structure build to all render devices
-    foreach (SubDevice &sub, devices)
+    /* Broadcast acceleration structure build to all render devices */
+    foreach (SubDevice &sub, devices) {
       if (!sub.device->build_optix_bvh(bvh))
         return false;
-
+    }
     return true;
   }
 
@@ -191,17 +220,82 @@ class MultiDevice : public Device {
     return devices.front().device->osl_memory();
   }
 
+  bool is_resident(device_ptr key, Device *sub_device) override
+  {
+    foreach (SubDevice &sub, devices) {
+      if (sub.device == sub_device) {
+        return find_matching_mem_device(key, sub)->device == sub_device;
+      }
+    }
+    return false;
+  }
+
+  SubDevice *find_matching_mem_device(device_ptr key, SubDevice &sub)
+  {
+    assert(sub.peer_island_index >= 0 && key != 0);
+
+    /* Get the memory owner of this key (first try current device, then peer devices) */
+    SubDevice *owner_sub = &sub;
+    if (owner_sub->ptr_map.find(key) == owner_sub->ptr_map.end()) {
+      foreach (SubDevice *island_sub, peer_islands[sub.peer_island_index]) {
+        if (island_sub != owner_sub &&
+            island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) {
+          owner_sub = island_sub;
+        }
+      }
+    }
+    return owner_sub;
+  }
+
+  SubDevice *find_suitable_mem_device(device_ptr key, const vector<SubDevice *> &island)
+  {
+    assert(!island.empty());
+
+    /* Get the memory owner of this key or the device with the lowest memory usage when new */
+    SubDevice *owner_sub = island.front();
+    foreach (SubDevice *island_sub, island) {
+      if (key ? (island_sub->ptr_map.find(key) != island_sub->ptr_map.end()) :
+                (island_sub->device->stats.mem_used < owner_sub->device->stats.mem_used)) {
+        owner_sub = island_sub;
+      }
+    }
+    return owner_sub;
+  }
+
+  inline device_ptr find_matching_mem(device_ptr key, SubDevice &sub)
+  {
+    return find_matching_mem_device(key, sub)->ptr_map[key];
+  }
+
   void mem_alloc(device_memory &mem)
   {
     device_ptr key = unique_key++;
 
-    foreach (SubDevice &sub, devices) {
-      mem.device = sub.device;
-      mem.device_pointer = 0;
-      mem.device_size = 0;
+    if (mem.type == MEM_PIXELS) {
+      /* Always allocate pixels memory on all devices
+       * This is necessary to ensure PBOs are registered everywhere, which FILM_CONVERT uses */
+      foreach (SubDevice &sub, devices) {
+        mem.device = sub.device;
+        mem.device_pointer = 0;
+        mem.device_size = 0;
 
-      sub.device->mem_alloc(mem);
-      sub.ptr_map[key] = mem.device_pointer;
+        sub.device->mem_alloc(mem);
+        sub.ptr_map[key] = mem.device_pointer;
+      }
+    }
+    else {
+      assert(mem.type == MEM_READ_ONLY || mem.type == MEM_READ_WRITE ||
+             mem.type == MEM_DEVICE_ONLY);
+      /* The remaining memory types can be distributed across devices */
+      foreach (const vector<SubDevice *> &island, peer_islands) {
+        SubDevice *owner_sub = find_suitable_mem_device(key, island);
+        mem.device = owner_sub->device;
+        mem.device_pointer = 0;
+        mem.device_size = 0;
+
+        owner_sub->device->mem_alloc(mem);
+        owner_sub->ptr_map[key] = mem.device_pointer;
+      }
     }
 
     mem.device = this;
@@ -215,13 +309,36 @@ class MultiDevice : public Device {
     device_ptr key = (existing_key) ? existing_key : unique_key++;
     size_t existing_size = mem.device_size;
 
-    foreach (SubDevice &sub, devices) {
-      mem.device = sub.device;
-      mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
-      mem.device_size = existing_size;
+    /* The tile buffers are allocated on each device (see below), so copy to all of them */
+    if (strcmp(mem.name, "RenderBuffers") == 0) {
+      foreach (SubDevice &sub, devices) {
+        mem.device = sub.device;
+        mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
+        mem.device_size = existing_size;
+
+        sub.device->mem_copy_to(mem);
+        sub.ptr_map[key] = mem.device_pointer;
+      }
+    }
+    else {
+      foreach (const vector<SubDevice *> &island, peer_islands) {
+        SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+        mem.device = owner_sub->device;
+        mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+        mem.device_size = existing_size;
+
+        owner_sub->device->mem_copy_to(mem);
+        owner_sub->ptr_map[key] = mem.device_pointer;
 
-      sub.device->mem_copy_to(mem);
-      sub.ptr_map[key] = mem.device_pointer;
+        if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
+          /* Need to create texture objects and update pointer in kernel globals on all devices */
+          foreach (SubDevice *island_sub, island) {
+            if (island_sub != owner_sub) {
+              island_sub->device->mem_copy_to(mem);
+            }
+          }
+        }
+      }
     }
 
     mem.device = this;
@@ -238,10 +355,11 @@ class MultiDevice : public Device {
       int sy = y + i * sub_h;
       int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
 
-      mem.device = sub.device;
-      mem.device_pointer = sub.ptr_map[key];
+      SubDevice *owner_sub = find_matching_mem_device(key, sub);
+      mem.device = owner_sub->device;
+      mem.device_pointer = owner_sub->ptr_map[key];
 
-      sub.device->mem_copy_from(mem, sy, w, sh, elem);
+      owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
       i++;
     }
 
@@ -255,16 +373,18 @@ class MultiDevice : public Device {
     device_ptr key = (existing_key) ? existing_key : unique_key++;
     size_t existing_size = mem.device_size;
 
-    foreach (SubDevice &sub, devices) {
-      mem.device = sub.device;
-      mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
-      mem.device_size = existing_size;
-
-      sub.device->mem_zero(mem);
-      sub.ptr_map[key] = mem.device_pointer;
-    }
-
+    /* This is a hack to only allocate the tile buffers on denoising devices.
+     * Similarly the tile buffers also need to be allocated separately on all devices so any
+     * overlap rendered for denoising does not interfere with each other. */
     if (strcmp(mem.name, "RenderBuffers") == 0) {
+      foreach (SubDevice &sub, devices) {
+        mem.device = sub.device;
+        mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
+        mem.device_size = existing_size;
+
+        sub.device->mem_zero(mem);
+        sub.ptr_map[key] = mem.device_pointer;
+      }
       foreach (SubDevice &sub, denoising_devices) {
         mem.device = sub.device;
         mem.device_pointer = (existing_key) ? sub.ptr_map[existing_key] : 0;
@@ -274,6 +394,17 @@ class MultiDevice : public Device {
         sub.ptr_map[key] = mem.device_pointer;
       }
     }
+    else {
+      foreach (const vector<SubDevice *> &island, peer_islands) {
+        SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
+        mem.device = owner_sub->device;
+        mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+        mem.device_size = existing_size;
+
+        owner_sub->device->mem_zero(mem);
+        owner_sub->ptr_map[key] = mem.device_pointer;
+      }
+    }
 
     mem.device = this;
     mem.device_pointer = key;
@@ -285,16 +416,16 @@ class MultiDevice : public Device {
     device_ptr key = mem.device_pointer;
     size_t existing_size = mem.device_size;
 
-    foreach (SubDevice &sub, devices) {
-      mem.device = sub.device;
-      mem.device_pointer = sub.ptr_map[key];
-      mem.device_size = existing_size;
-
-      sub.device->mem_free(mem);
-      sub.ptr_map.erase(sub.ptr_map.find(key));
-    }
+    /* Free memory that was allocated for all devices (see above) on each device */
+    if (strcmp(mem.name, "RenderBuffers") == 0 || mem.type == MEM_PIXELS) {
+      foreach (SubDevice &sub, devices) {
+        mem.device = sub.device;
+        mem.device_pointer = sub.ptr_map[key];
+        mem.device_size = existing_size;
 
-    if (strcmp(mem.name, "RenderBuffers") == 0) {
+        sub.device->mem_free(mem);
+        sub.ptr_map.erase(sub.ptr_map.find(key));
+      }
       foreach (SubDevice &sub, denoising_devices) {
         mem.device = sub.device;
         mem.device_pointer = sub.ptr_map[key];
@@ -304,6 +435,26 @@ class MultiDevice : public Device {
         sub.ptr_map.erase(sub.ptr_map.find(key));
       }
     }
+    else {
+      foreach (const vector<SubDevice *> &island, peer_islands) {
+        SubDevice *owner_sub = find_matching_mem_device(key, *island.front());
+        mem.device = owner_sub->device;
+        mem.device_pointer = owner_sub->ptr_map[key];
+        mem.device_size = existing_size;
+
+        owner_sub->device->mem_free(mem);
+        owner_sub->ptr_map.erase(owner_sub->ptr_map.find(key));
+
+        if (mem.type == MEM_TEXTURE) {
+          /* Free texture objects on all devices */
+          foreach (SubDevice *island_sub, island) {
+            if (island_sub != owner_sub) {
+              island_sub->device->mem_free(mem);
+            }
+          }
+        }
+      }
+    }
 
     mem.device = this;
     mem.device_pointer = 0;
@@ -330,6 +481,8 @@ class MultiDevice : public Device {
                    bool transparent,
                    const DeviceDrawParams &draw_params)
   {
+    assert(rgba.type == MEM_PIXELS);
+
     device_ptr key = rgba.device_pointer;
     int i = 0, sub_h = h / devices.size();
     int sub_height = height / devices.size();
@@ -358,7 +511,7 @@ class MultiDevice : public Device {
 
     foreach (SubDevice &sub, devices) {
       if (sub.device == sub_device) {
-        tile.buffer = sub.ptr_map[tile.buffer];
+        tile.buffer = find_matching_mem(tile.buffer, sub);
         return;
       }
     }
@@ -517,16 +670,21 @@ class MultiDevice : public Device {
         DeviceTask subtask = tasks.front();
         tasks.pop_front();
 
-        if (task.buffer)
+        if (task.type == DeviceTask::DENOISE_BUFFER && !denoising_devices.empty()) {
           subtask.buffer = sub.ptr_map[task.buffer];
-        if (task.rgba_byte)
-          subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
-        if (task.rgba_half)
-          subtask.rgba_half = sub.ptr_map[task.rgba_half];
-        if (task.shader_input)
-          subtask.shader_input = sub.ptr_map[task.shader_input];
-        if (task.shader_output)
-          subtask.shader_output = sub.ptr_map[task.shader_output];
+        }
+        else {
+          if (task.buffer)
+            subtask.buffer = find_matching_mem(task.buffer, sub);
+          if (task.rgba_byte)
+            subtask.rgba_byte = sub.ptr_map[task.rgba_byte];
+          if (task.rgba_half)
+            subtask.rgba_half = sub.ptr_map[task.rgba_half];
+          if (task.shader_input)
+            subtask.shader_input = find_matching_mem(task.shader_input, sub);
+          if (task.shader_output)
+            subtask.shader_output = find_matching_mem(task.shader_output, sub);
+        }
 
         sub.device->task_add(subtask);
       }
@@ -548,9 +706,6 @@ class MultiDevice : public Device {
     foreach (SubDevice &sub, denoising_devices)
       sub.device->task_cancel();
   }
-
- protected:
-  Stats sub_stats_;
 };
 
 Device *device_multi_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)