git.blender.org/blender.git
author     Patrick Mours <pmours@nvidia.com>  2019-10-18 13:06:28 +0300
committer  Patrick Mours <pmours@nvidia.com>  2019-10-18 13:23:27 +0300
commit     8378db40c71af53e3a48ef797cdacb38a1a26edc (patch)
tree       90dcc39dc70702f1e64cc7291f3778ccdfa17bdb /intern/cycles
parent     16665ad753cd511a57108c6491f7b4bebe4dffd4 (diff)
Cycles: Fix out of memory when rendering some scenes with OptiX that work with CUDA
The OptiX implementation wasn't trying to allocate memory on the host if device allocation failed, while the CUDA implementation did. This copies the implementation over to OptiX to remedy that.

Differential Revision: https://developer.blender.org/D6068
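For context, a minimal sketch of the fallback this change ports from the CUDA device: try a device allocation first, and only if it fails (or would exhaust a configured headroom) fall back to pinned, device-mapped host memory. The helper name and headroom constant below are illustrative and assume an active CUDA driver context (as CUDAContextScope provides in the patch); this is not the exact Cycles code.

// Hypothetical helper, not part of Cycles: device memory first, then pinned
// host memory the GPU can address directly, so rendering can continue.
#include <cuda.h>
#include <cstddef>

static CUresult alloc_with_host_fallback(size_t size, CUdeviceptr *device_ptr, void **host_ptr)
{
  size_t free_mem = 0, total_mem = 0;
  cuMemGetInfo(&free_mem, &total_mem);

  const size_t headroom = 32 * 1024 * 1024;  // keep some device memory free for working buffers
  if (size + headroom < free_mem && cuMemAlloc(device_ptr, size) == CUDA_SUCCESS) {
    *host_ptr = NULL;  // allocation lives entirely in device memory
    return CUDA_SUCCESS;
  }

  // Device allocation failed or would not fit: allocate pinned host memory and
  // map it into the device address space (slower, but avoids out-of-memory).
  CUresult result = cuMemHostAlloc(
      host_ptr, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
  if (result != CUDA_SUCCESS) {
    return result;  // out of both device and host memory
  }
  return cuMemHostGetDevicePointer(device_ptr, *host_ptr, 0);
}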
Diffstat (limited to 'intern/cycles')
-rw-r--r--  intern/cycles/device/device_memory.h  |   1
-rw-r--r--  intern/cycles/device/device_optix.cpp | 675
2 files changed, 503 insertions(+), 173 deletions(-)
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index 5b43ce8b0bc..272da6a9ad3 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -224,6 +224,7 @@ class device_memory {
protected:
friend class CUDADevice;
+ friend class OptiXDevice;
/* Only create through subclasses. */
device_memory(Device *device, const char *name, MemoryType type);
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index 6f4734059da..e230662e698 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -177,7 +177,14 @@ class OptiXDevice : public Device {
vector<device_only_memory<uint8_t>> blas;
OptixTraversableHandle tlas_handle = 0;
+ // TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually
+ int can_map_host = 0;
+ size_t map_host_used = 0;
+ size_t map_host_limit = 0;
+ size_t device_working_headroom = 32 * 1024 * 1024LL; // 32MB
+ size_t device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
map<device_memory *, CUDAMem> cuda_mem_map;
+ bool move_texture_to_host = false;
public:
OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
@@ -199,6 +206,25 @@ class OptiXDevice : public Device {
// Make that CUDA context current
const CUDAContextScope scope(cuda_context);
+ // Limit amount of host mapped memory (see init_host_memory in device_cuda.cpp)
+ size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+ size_t system_ram = system_physical_ram();
+ if (system_ram > 0) {
+ if (system_ram / 2 > default_limit) {
+ map_host_limit = system_ram - default_limit;
+ }
+ else {
+ map_host_limit = system_ram / 2;
+ }
+ }
+ else {
+ VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+ }
+
+ // Check device support for pinned host memory
+ check_result_cuda(
+ cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuda_device));
+
// Create OptiX context for this device
OptixDeviceContextOptions options = {};
# ifdef WITH_CYCLES_LOGGING
@@ -826,6 +852,7 @@ class OptiXDevice : public Device {
device_only_memory<char> temp_mem(this, "temp_build_mem");
temp_mem.alloc_to_device(sizes.tempSizeInBytes);
+ out_data.type = MEM_DEVICE_ONLY;
out_data.data_type = TYPE_UNKNOWN;
out_data.data_elements = 1;
out_data.data_size = sizes.outputSizeInBytes;
@@ -1168,131 +1195,162 @@ class OptiXDevice : public Device {
void mem_alloc(device_memory &mem) override
{
- const CUDAContextScope scope(cuda_context);
+ if (mem.type == MEM_PIXELS && !background) {
+ assert(!"mem_alloc not supported for pixels.");
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_alloc not supported for textures.");
+ }
+ else {
+ generic_alloc(mem);
+ }
+ }
- mem.device_size = mem.memory_size();
-
- if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) {
- CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information
-
- CUDA_TEXTURE_DESC tex_desc = {};
- tex_desc.flags = CU_TRSF_NORMALIZED_COORDINATES;
- CUDA_RESOURCE_DESC res_desc = {};
-
- switch (mem.extension) {
- default:
- assert(0);
- case EXTENSION_REPEAT:
- tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] =
- CU_TR_ADDRESS_MODE_WRAP;
- break;
- case EXTENSION_EXTEND:
- tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] =
- CU_TR_ADDRESS_MODE_CLAMP;
- break;
- case EXTENSION_CLIP:
- tex_desc.addressMode[0] = tex_desc.addressMode[1] = tex_desc.addressMode[2] =
- CU_TR_ADDRESS_MODE_BORDER;
- break;
- }
+ CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0)
+ {
+ CUDAContextScope scope(cuda_context);
- switch (mem.interpolation) {
- default: // Default to linear for unsupported interpolation types
- case INTERPOLATION_LINEAR:
- tex_desc.filterMode = CU_TR_FILTER_MODE_LINEAR;
- break;
- case INTERPOLATION_CLOSEST:
- tex_desc.filterMode = CU_TR_FILTER_MODE_POINT;
- break;
- }
+ CUdeviceptr device_pointer = 0;
+ size_t size = mem.memory_size() + pitch_padding;
- CUarray_format format;
- switch (mem.data_type) {
- default:
- assert(0);
- case TYPE_UCHAR:
- format = CU_AD_FORMAT_UNSIGNED_INT8;
- break;
- case TYPE_UINT16:
- format = CU_AD_FORMAT_UNSIGNED_INT16;
- break;
- case TYPE_UINT:
- format = CU_AD_FORMAT_UNSIGNED_INT32;
- break;
- case TYPE_INT:
- format = CU_AD_FORMAT_SIGNED_INT32;
- break;
- case TYPE_FLOAT:
- format = CU_AD_FORMAT_FLOAT;
- break;
- case TYPE_HALF:
- format = CU_AD_FORMAT_HALF;
- break;
- }
+ CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+ const char *status = "";
- if (mem.data_depth > 1) { /* 3D texture using array. */
- CUDA_ARRAY3D_DESCRIPTOR desc;
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Depth = mem.data_depth;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
- desc.Flags = 0;
+ /* First try allocating in device memory, respecting headroom. We make
+ * an exception for texture info. It is small and frequently accessed,
+ * so treat it as working memory.
+ *
+ * If there is not enough room for working memory, we will try to move
+ * textures to host memory, assuming the performance impact would have
+ * been worse for working memory. */
+ bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
- check_result_cuda(cuArray3DCreate(&cmem.array, &desc));
- mem.device_pointer = (device_ptr)cmem.array;
+ size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
- res_desc.resType = CU_RESOURCE_TYPE_ARRAY;
- res_desc.res.array.hArray = cmem.array;
+ size_t total = 0, free = 0;
+ cuMemGetInfo(&free, &total);
+
+ /* Move textures to host memory if needed. */
+ if (!move_texture_to_host && !is_image && (size + headroom) >= free) {
+ move_textures_to_host(size + headroom - free, is_texture);
+ cuMemGetInfo(&free, &total);
+ }
+
+ /* Allocate in device memory. */
+ if (!move_texture_to_host && (size + headroom) < free) {
+ mem_alloc_result = cuMemAlloc(&device_pointer, size);
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ status = " in device memory";
}
- else if (mem.data_height > 0) { /* 2D texture using array. */
- CUDA_ARRAY_DESCRIPTOR desc;
- desc.Width = mem.data_width;
- desc.Height = mem.data_height;
- desc.Format = format;
- desc.NumChannels = mem.data_elements;
-
- check_result_cuda(cuArrayCreate(&cmem.array, &desc));
- mem.device_pointer = (device_ptr)cmem.array;
-
- res_desc.resType = CU_RESOURCE_TYPE_ARRAY;
- res_desc.res.array.hArray = cmem.array;
+ }
+
+ /* Fall back to mapped host memory if needed and possible. */
+ void *map_host_pointer = 0;
+ bool free_map_host = false;
+
+ if (mem_alloc_result != CUDA_SUCCESS && can_map_host &&
+ map_host_used + size < map_host_limit) {
+ if (mem.shared_pointer) {
+ /* Another device already allocated host memory. */
+ mem_alloc_result = CUDA_SUCCESS;
+ map_host_pointer = mem.shared_pointer;
}
else {
- check_result_cuda(cuMemAlloc((CUdeviceptr *)&mem.device_pointer, mem.device_size));
+ /* Allocate host memory ourselves. */
+ mem_alloc_result = cuMemHostAlloc(
+ &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+ mem.shared_pointer = map_host_pointer;
+ free_map_host = true;
+ }
- res_desc.resType = CU_RESOURCE_TYPE_LINEAR;
- res_desc.res.linear.devPtr = (CUdeviceptr)mem.device_pointer;
- res_desc.res.linear.format = format;
- res_desc.res.linear.numChannels = mem.data_elements;
- res_desc.res.linear.sizeInBytes = mem.device_size;
+ if (mem_alloc_result == CUDA_SUCCESS) {
+ cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0);
+ map_host_used += size;
+ status = " in host memory";
+
+ /* Replace host pointer with our host allocation. Only works if
+ * CUDA memory layout is the same and has no pitch padding. Also
+ * does not work if we move textures to host during a render,
+ * since other devices might be using the memory. */
+ if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+ mem.host_pointer != mem.shared_pointer) {
+ memcpy(mem.shared_pointer, mem.host_pointer, size);
+ mem.host_free();
+ mem.host_pointer = mem.shared_pointer;
+ }
}
+ else {
+ status = " failed, out of host memory";
+ }
+ }
+ else if (mem_alloc_result != CUDA_SUCCESS) {
+ status = " failed, out of device and host memory";
+ }
- check_result_cuda(cuTexObjectCreate(&cmem.texobject, &res_desc, &tex_desc, NULL));
+ if (mem.name) {
+ VLOG(1) << "Buffer allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")" << status;
+ }
- int flat_slot = 0;
- if (string_startswith(mem.name, "__tex_image")) {
- flat_slot = atoi(mem.name + string(mem.name).rfind("_") + 1);
- }
+ if (mem_alloc_result != CUDA_SUCCESS) {
+ set_error(string_printf("Buffer allocate %s", status));
+ return NULL;
+ }
+
+ mem.device_pointer = (device_ptr)device_pointer;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ if (!mem.device_pointer) {
+ return NULL;
+ }
- if (flat_slot >= texture_info.size())
- texture_info.resize(flat_slot + 128);
+ /* Insert into map of allocations. */
+ CUDAMem *cmem = &cuda_mem_map[&mem];
+ cmem->map_host_pointer = map_host_pointer;
+ cmem->free_map_host = free_map_host;
+ return cmem;
+ }
- TextureInfo &info = texture_info[flat_slot];
- info.data = (uint64_t)cmem.texobject;
- info.cl_buffer = 0;
- info.interpolation = mem.interpolation;
- info.extension = mem.extension;
- info.width = mem.data_width;
- info.height = mem.data_height;
- info.depth = mem.data_depth;
+ void tex_alloc(device_memory &mem)
+ {
+ CUDAContextScope scope(cuda_context);
+
+ /* General variables for both architectures */
+ string bind_name = mem.name;
+ size_t dsize = datatype_size(mem.data_type);
+ size_t size = mem.memory_size();
+
+ CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ switch (mem.extension) {
+ case EXTENSION_REPEAT:
+ address_mode = CU_TR_ADDRESS_MODE_WRAP;
+ break;
+ case EXTENSION_EXTEND:
+ address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+ break;
+ case EXTENSION_CLIP:
+ address_mode = CU_TR_ADDRESS_MODE_BORDER;
+ break;
+ default:
+ assert(0);
+ break;
+ }
- // Texture information has changed and needs an update, delay this to next launch
- need_texture_info = true;
+ CUfilter_mode filter_mode;
+ if (mem.interpolation == INTERPOLATION_CLOSEST) {
+ filter_mode = CU_TR_FILTER_MODE_POINT;
}
else {
- // This is not a texture but simple linear memory
- check_result_cuda(cuMemAlloc((CUdeviceptr *)&mem.device_pointer, mem.device_size));
+ filter_mode = CU_TR_FILTER_MODE_LINEAR;
+ }
+
+ /* Data Storage */
+ if (mem.interpolation == INTERPOLATION_NONE) {
+ generic_alloc(mem);
+ generic_copy_to(mem);
// Update data storage pointers in launch parameters
# define KERNEL_TEX(data_type, tex_name) \
@@ -1301,77 +1359,233 @@ class OptiXDevice : public Device {
mem.name, offsetof(KernelParams, tex_name), &mem.device_pointer, sizeof(device_ptr));
# include "kernel/kernel_textures.h"
# undef KERNEL_TEX
+ return;
}
- stats.mem_alloc(mem.device_size);
- }
+ /* Image Texture Storage */
+ CUarray_format_enum format;
+ switch (mem.data_type) {
+ case TYPE_UCHAR:
+ format = CU_AD_FORMAT_UNSIGNED_INT8;
+ break;
+ case TYPE_UINT16:
+ format = CU_AD_FORMAT_UNSIGNED_INT16;
+ break;
+ case TYPE_UINT:
+ format = CU_AD_FORMAT_UNSIGNED_INT32;
+ break;
+ case TYPE_INT:
+ format = CU_AD_FORMAT_SIGNED_INT32;
+ break;
+ case TYPE_FLOAT:
+ format = CU_AD_FORMAT_FLOAT;
+ break;
+ case TYPE_HALF:
+ format = CU_AD_FORMAT_HALF;
+ break;
+ default:
+ assert(0);
+ return;
+ }
- void mem_copy_to(device_memory &mem) override
- {
- if (!mem.host_pointer || mem.host_pointer == mem.shared_pointer)
- return;
- if (!mem.device_pointer)
- mem_alloc(mem); // Need to allocate memory first if it does not exist yet
+ CUDAMem *cmem = NULL;
+ CUarray array_3d = NULL;
+ size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+ size_t dst_pitch = src_pitch;
- const CUDAContextScope scope(cuda_context);
+ if (mem.data_depth > 1) {
+ /* 3D texture using array, there is no API for linear memory. */
+ CUDA_ARRAY3D_DESCRIPTOR desc;
- if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) {
- const CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information
+ desc.Width = mem.data_width;
+ desc.Height = mem.data_height;
+ desc.Depth = mem.data_depth;
+ desc.Format = format;
+ desc.NumChannels = mem.data_elements;
+ desc.Flags = 0;
- size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
+ VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+ << string_human_readable_number(mem.memory_size()) << " bytes. ("
+ << string_human_readable_size(mem.memory_size()) << ")";
- if (mem.data_depth > 1) {
- CUDA_MEMCPY3D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = cmem.array;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
- param.Depth = mem.data_depth;
+ check_result_cuda(cuArray3DCreate(&array_3d, &desc));
- check_result_cuda(cuMemcpy3D(&param));
+ if (!array_3d) {
+ return;
}
- else if (mem.data_height > 0) {
- CUDA_MEMCPY2D param;
- memset(&param, 0, sizeof(param));
- param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- param.dstArray = cmem.array;
- param.srcMemoryType = CU_MEMORYTYPE_HOST;
- param.srcHost = mem.host_pointer;
- param.srcPitch = src_pitch;
- param.WidthInBytes = param.srcPitch;
- param.Height = mem.data_height;
-
- check_result_cuda(cuMemcpy2D(&param));
+
+ CUDA_MEMCPY3D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ param.dstArray = array_3d;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+ param.Depth = mem.data_depth;
+
+ check_result_cuda(cuMemcpy3D(&param));
+
+ mem.device_pointer = (device_ptr)array_3d;
+ mem.device_size = size;
+ stats.mem_alloc(size);
+
+ cmem = &cuda_mem_map[&mem];
+ cmem->texobject = 0;
+ cmem->array = array_3d;
+ }
+ else if (mem.data_height > 0) {
+ /* 2D texture, using pitch aligned linear memory. */
+ int alignment = 0;
+ check_result_cuda(cuDeviceGetAttribute(
+ &alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuda_device));
+ dst_pitch = align_up(src_pitch, alignment);
+ size_t dst_size = dst_pitch * mem.data_height;
+
+ cmem = generic_alloc(mem, dst_size - mem.memory_size());
+ if (!cmem) {
+ return;
}
- else {
- check_result_cuda(
- cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.device_size));
+
+ CUDA_MEMCPY2D param;
+ memset(&param, 0, sizeof(param));
+ param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ param.dstDevice = mem.device_pointer;
+ param.dstPitch = dst_pitch;
+ param.srcMemoryType = CU_MEMORYTYPE_HOST;
+ param.srcHost = mem.host_pointer;
+ param.srcPitch = src_pitch;
+ param.WidthInBytes = param.srcPitch;
+ param.Height = mem.data_height;
+
+ check_result_cuda(cuMemcpy2DUnaligned(&param));
+ }
+ else {
+ /* 1D texture, using linear memory. */
+ cmem = generic_alloc(mem);
+ if (!cmem) {
+ return;
}
+
+ check_result_cuda(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+ }
+
+ /* Kepler+, bindless textures. */
+ int flat_slot = 0;
+ if (string_startswith(mem.name, "__tex_image")) {
+ int pos = string(mem.name).rfind("_");
+ flat_slot = atoi(mem.name + pos + 1);
}
else {
- // This is not a texture but simple linear memory
- check_result_cuda(
- cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.device_size));
+ assert(0);
+ }
+
+ CUDA_RESOURCE_DESC resDesc;
+ memset(&resDesc, 0, sizeof(resDesc));
+
+ if (array_3d) {
+ resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ resDesc.res.array.hArray = array_3d;
+ resDesc.flags = 0;
+ }
+ else if (mem.data_height > 0) {
+ resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+ resDesc.res.pitch2D.devPtr = mem.device_pointer;
+ resDesc.res.pitch2D.format = format;
+ resDesc.res.pitch2D.numChannels = mem.data_elements;
+ resDesc.res.pitch2D.height = mem.data_height;
+ resDesc.res.pitch2D.width = mem.data_width;
+ resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+ }
+ else {
+ resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+ resDesc.res.linear.devPtr = mem.device_pointer;
+ resDesc.res.linear.format = format;
+ resDesc.res.linear.numChannels = mem.data_elements;
+ resDesc.res.linear.sizeInBytes = mem.device_size;
}
+
+ CUDA_TEXTURE_DESC texDesc;
+ memset(&texDesc, 0, sizeof(texDesc));
+ texDesc.addressMode[0] = address_mode;
+ texDesc.addressMode[1] = address_mode;
+ texDesc.addressMode[2] = address_mode;
+ texDesc.filterMode = filter_mode;
+ texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+ check_result_cuda(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+ /* Resize once */
+ if (flat_slot >= texture_info.size()) {
+ /* Allocate some slots in advance, to reduce amount
+ * of re-allocations. */
+ texture_info.resize(flat_slot + 128);
+ }
+
+ /* Set Mapping and tag that we need to (re-)upload to device */
+ TextureInfo &info = texture_info[flat_slot];
+ info.data = (uint64_t)cmem->texobject;
+ info.cl_buffer = 0;
+ info.interpolation = mem.interpolation;
+ info.extension = mem.extension;
+ info.width = mem.data_width;
+ info.height = mem.data_height;
+ info.depth = mem.data_depth;
+ need_texture_info = true;
}
- void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
+ void mem_copy_to(device_memory &mem) override
{
- // Calculate linear memory offset and size
- const size_t size = elem * w * h;
- const size_t offset = elem * y * w;
+ if (mem.type == MEM_PIXELS) {
+ assert(!"mem_copy_to not supported for pixels.");
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free(mem);
+ tex_alloc(mem);
+ }
+ else {
+ if (!mem.device_pointer) {
+ generic_alloc(mem);
+ }
+ generic_copy_to(mem);
+ }
+ }
+
+ void generic_copy_to(device_memory &mem)
+ {
if (mem.host_pointer && mem.device_pointer) {
- const CUDAContextScope scope(cuda_context);
- check_result_cuda(cuMemcpyDtoH(
- (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
+ CUDAContextScope scope(cuda_context);
+
+ if (mem.host_pointer != mem.shared_pointer) {
+ check_result_cuda(
+ cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
+ }
+ }
+ }
+
+ void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
+ {
+ if (mem.type == MEM_PIXELS && !background) {
+ assert(!"mem_copy_from not supported for pixels.");
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ assert(!"mem_copy_from not supported for textures.");
}
- else if (mem.host_pointer) {
- memset((char *)mem.host_pointer + offset, 0, size);
+ else {
+ // Calculate linear memory offset and size
+ const size_t size = elem * w * h;
+ const size_t offset = elem * y * w;
+
+ if (mem.host_pointer && mem.device_pointer) {
+ const CUDAContextScope scope(cuda_context);
+ check_result_cuda(cuMemcpyDtoH(
+ (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
+ }
+ else if (mem.host_pointer) {
+ memset((char *)mem.host_pointer + offset, 0, size);
+ }
}
}
@@ -1391,30 +1605,145 @@ class OptiXDevice : public Device {
void mem_free(device_memory &mem) override
{
- assert(mem.device_pointer);
+ if (mem.type == MEM_PIXELS && !background) {
+ assert(!"mem_free not supported for pixels.");
+ }
+ else if (mem.type == MEM_TEXTURE) {
+ tex_free(mem);
+ }
+ else {
+ generic_free(mem);
+ }
+ }
- const CUDAContextScope scope(cuda_context);
+ void generic_free(device_memory &mem)
+ {
+ if (mem.device_pointer) {
+ CUDAContextScope scope(cuda_context);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ if (cmem.map_host_pointer) {
+ /* Free host memory. */
+ if (cmem.free_map_host) {
+ cuMemFreeHost(cmem.map_host_pointer);
+ if (mem.host_pointer == mem.shared_pointer) {
+ mem.host_pointer = 0;
+ }
+ mem.shared_pointer = 0;
+ }
- if (mem.type == MEM_TEXTURE && mem.interpolation != INTERPOLATION_NONE) {
- CUDAMem &cmem = cuda_mem_map[&mem]; // Lock and get associated memory information
+ map_host_used -= mem.device_size;
+ }
+ else {
+ /* Free device memory. */
+ cuMemFree(mem.device_pointer);
+ }
- if (cmem.array)
- cuArrayDestroy(cmem.array);
- else
- cuMemFree((CUdeviceptr)mem.device_pointer);
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
- if (cmem.texobject)
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ }
+
+ void tex_free(device_memory &mem)
+ {
+ if (mem.device_pointer) {
+ CUDAContextScope scope(cuda_context);
+ const CUDAMem &cmem = cuda_mem_map[&mem];
+
+ if (cmem.texobject) {
+ /* Free bindless texture. */
cuTexObjectDestroy(cmem.texobject);
+ }
+
+ if (cmem.array) {
+ /* Free array. */
+ cuArrayDestroy(cmem.array);
+ stats.mem_free(mem.device_size);
+ mem.device_pointer = 0;
+ mem.device_size = 0;
+
+ cuda_mem_map.erase(cuda_mem_map.find(&mem));
+ }
+ else {
+ generic_free(mem);
+ }
}
- else {
- // This is not a texture but simple linear memory
- cuMemFree((CUdeviceptr)mem.device_pointer);
+ }
+
+ void move_textures_to_host(size_t size, bool for_texture)
+ {
+ /* Signal to reallocate textures in host memory only. */
+ move_texture_to_host = true;
+
+ while (size > 0) {
+ /* Find suitable memory allocation to move. */
+ device_memory *max_mem = NULL;
+ size_t max_size = 0;
+ bool max_is_image = false;
+
+ foreach (auto &pair, cuda_mem_map) {
+ device_memory &mem = *pair.first;
+ CUDAMem *cmem = &pair.second;
+
+ bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+ bool is_image = is_texture && (mem.data_height > 1);
+
+ /* Can't move this type of memory. */
+ if (!is_texture || cmem->array) {
+ continue;
+ }
+
+ /* Already in host memory. */
+ if (cmem->map_host_pointer) {
+ continue;
+ }
+
+ /* For other textures, only move image textures. */
+ if (for_texture && !is_image) {
+ continue;
+ }
+
+ /* Try to move largest allocation, prefer moving images. */
+ if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+ max_is_image = is_image;
+ max_size = mem.device_size;
+ max_mem = &mem;
+ }
+ }
+
+ /* Move to host memory. This part is mutex protected since
+ * multiple CUDA devices could be moving the memory. The
+ * first one will do it, and the rest will adopt the pointer. */
+ if (max_mem) {
+ VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+ static thread_mutex move_mutex;
+ thread_scoped_lock lock(move_mutex);
+
+ /* Preserve the original device pointer, in case of multi device
+ * we can't change it because the pointer mapping would break. */
+ device_ptr prev_pointer = max_mem->device_pointer;
+ size_t prev_size = max_mem->device_size;
+
+ tex_free(*max_mem);
+ tex_alloc(*max_mem);
+ size = (max_size >= size) ? 0 : size - max_size;
+
+ max_mem->device_pointer = prev_pointer;
+ max_mem->device_size = prev_size;
+ }
+ else {
+ break;
+ }
}
- stats.mem_free(mem.device_size);
+ /* Update texture info array with new pointers. */
+ update_texture_info();
- mem.device_size = 0;
- mem.device_pointer = 0;
+ move_texture_to_host = false;
}
void const_copy_to(const char *name, void *host, size_t size) override