4 files changed, 101 insertions, 77 deletions
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index c106b4505db..3a99a49dffc 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -47,31 +47,6 @@ device_memory::~device_memory()
   assert(shared_counter == 0);
 }
 
-device_memory::device_memory(device_memory &&other)
-    : data_type(other.data_type),
-      data_elements(other.data_elements),
-      data_size(other.data_size),
-      device_size(other.device_size),
-      data_width(other.data_width),
-      data_height(other.data_height),
-      data_depth(other.data_depth),
-      type(other.type),
-      name(other.name),
-      interpolation(other.interpolation),
-      extension(other.extension),
-      device(other.device),
-      device_pointer(other.device_pointer),
-      host_pointer(other.host_pointer),
-      shared_pointer(other.shared_pointer),
-      shared_counter(other.shared_counter)
-{
-  other.device_size = 0;
-  other.device_pointer = 0;
-  other.host_pointer = 0;
-  other.shared_pointer = 0;
-  other.shared_counter = 0;
-}
-
 void *device_memory::host_alloc(size_t size)
 {
   if (!size) {
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index f8324e2a214..60740807568 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -235,9 +235,6 @@ class device_memory {
   device_memory(const device_memory &) = delete;
   device_memory &operator=(const device_memory &) = delete;
 
-  /* But moving is possible. */
-  device_memory(device_memory &&);
-
   /* Host allocation on the device. All host_pointer memory should be
    * allocated with these functions, for devices that support using
    * the same pointer for host and device. */
@@ -275,11 +272,6 @@ template<typename T> class device_only_memory : public device_memory {
     free();
   }
 
-  device_only_memory(device_only_memory &&other)
-      : device_memory(static_cast<device_memory &&>(other))
-  {
-  }
-
   void alloc_to_device(size_t num, bool shrink_to_fit = true)
   {
     size_t new_size = num;
@@ -338,10 +330,6 @@ template<typename T> class device_vector : public device_memory {
     free();
   }
 
-  device_vector(device_vector &&other) : device_memory(static_cast<device_memory &&>(other))
-  {
-  }
-
   /* Host memory allocation. */
   T *alloc(size_t width, size_t height = 0, size_t depth = 0)
   {
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index ae3ab7e1fc2..7335e0bc64d 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -174,7 +174,7 @@ class OptiXDevice : public Device {
   device_vector<SbtRecord> sbt_data;
   device_vector<TextureInfo> texture_info;
   device_only_memory<KernelParams> launch_params;
-  vector<device_only_memory<uint8_t>> as_mem;
+  vector<CUdeviceptr> as_mem;
   OptixTraversableHandle tlas_handle = 0;
 
   // TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually
@@ -269,6 +269,9 @@ class OptiXDevice : public Device {
     task_pool.stop();
 
     // Free all acceleration structures
+    for (CUdeviceptr mem : as_mem) {
+      cuMemFree(mem);
+    }
     as_mem.clear();
 
     sbt_data.free();
@@ -831,7 +834,6 @@ class OptiXDevice : public Device {
 
   bool build_optix_bvh(const OptixBuildInput &build_input,
                        uint16_t num_motion_steps,
-                       device_memory &out_data,
                        OptixTraversableHandle &out_handle)
   {
     out_handle = 0;
@@ -842,7 +844,15 @@ class OptiXDevice : public Device {
     OptixAccelBufferSizes sizes = {};
     OptixAccelBuildOptions options;
     options.operation = OPTIX_BUILD_OPERATION_BUILD;
-    options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE;
+    if (background) {
+      // Prefer best performance and lowest memory consumption in background
+      options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
+    }
+    else {
+      // Prefer fast updates in viewport
+      options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD;
+    }
+
     options.motionOptions.numKeys = num_motion_steps;
     options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
     options.motionOptions.timeBegin = 0.0f;
@@ -853,31 +863,75 @@ class OptiXDevice : public Device {
 
     // Allocate required output buffers
     device_only_memory<char> temp_mem(this, "temp_build_mem");
-    temp_mem.alloc_to_device(sizes.tempSizeInBytes);
+    temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
+    if (!temp_mem.device_pointer)
+      return false;  // Make sure temporary memory allocation succeeded
+
+    // Move textures to host memory if there is not enough room
+    size_t size = 0, free = 0;
+    cuMemGetInfo(&free, &size);
+    size = sizes.outputSizeInBytes + device_working_headroom;
+    if (size >= free && can_map_host) {
+      move_textures_to_host(size - free, false);
+    }
 
-    out_data.type = MEM_DEVICE_ONLY;
-    out_data.data_type = TYPE_UNKNOWN;
-    out_data.data_elements = 1;
-    out_data.data_size = sizes.outputSizeInBytes;
-    mem_alloc(out_data);
+    CUdeviceptr out_data = 0;
+    check_result_cuda_ret(cuMemAlloc(&out_data, sizes.outputSizeInBytes));
+    as_mem.push_back(out_data);
 
     // Finally build the acceleration structure
+    OptixAccelEmitDesc compacted_size_prop;
+    compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
+    // Written to the 8-byte aligned slot allocated past the scratch area of the temporary buffer
+    // Only requested in background, the only mode that sets OPTIX_BUILD_FLAG_ALLOW_COMPACTION
+    compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+
     check_result_optix_ret(optixAccelBuild(context,
                                            NULL,
                                            &options,
                                            &build_input,
                                            1,
                                            temp_mem.device_pointer,
-                                           sizes.tempSizeInBytes,
-                                           out_data.device_pointer,
+                                           sizes.tempSizeInBytes,  // Excludes property slot
+                                           out_data,
                                            sizes.outputSizeInBytes,
                                            &out_handle,
-                                           NULL,
-                                           0));
+                                           background ? &compacted_size_prop : NULL,
+                                           background ? 1 : 0));
 
     // Wait for all operations to finish
     check_result_cuda_ret(cuStreamSynchronize(NULL));
 
+    // Compact acceleration structure to save memory (do not do this in viewport for faster builds)
+    if (background) {
+      uint64_t compacted_size = sizes.outputSizeInBytes;
+      check_result_cuda_ret(
+          cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
+
+      // Temporary memory is no longer needed, so free it now to make space
+      temp_mem.free();
+
+      // There is no point compacting if the size does not change
+      if (compacted_size < sizes.outputSizeInBytes) {
+        CUdeviceptr compacted_data = 0;
+        if (cuMemAlloc(&compacted_data, compacted_size) != CUDA_SUCCESS)
+          // Do not compact if memory allocation for compacted acceleration structure fails
+          // Can just use the uncompacted one then, so succeed here regardless
+          return true;
+        as_mem.push_back(compacted_data);
+
+        check_result_optix_ret(optixAccelCompact(
+            context, NULL, out_handle, compacted_data, compacted_size, &out_handle));
+
+        // Wait for compaction to finish
+        check_result_cuda_ret(cuStreamSynchronize(NULL));
+
+        // Free uncompacted acceleration structure
+        cuMemFree(out_data);
+        as_mem.erase(as_mem.end() - 2);  // Remove 'out_data' from 'as_mem' array
+      }
+    }
+
     return true;
   }
 
@@ -889,7 +943,10 @@ class OptiXDevice : public Device {
     unordered_map<Mesh *, vector<OptixTraversableHandle>> meshes;
     meshes.reserve(bvh->meshes.size());
 
-    // Free all previous acceleration structure
+    // Free all previous acceleration structures
+    for (CUdeviceptr mem : as_mem) {
+      cuMemFree(mem);
+    }
     as_mem.clear();
 
     // Build bottom level acceleration structures (BLAS)
@@ -968,9 +1025,8 @@ class OptiXDevice : public Device {
         build_input.aabbArray.primitiveIndexOffset = mesh->prim_offset;
 
         // Allocate memory for new BLAS and build it
-        as_mem.emplace_back(this, "blas");
         handles.emplace_back();
-        if (!build_optix_bvh(build_input, num_motion_steps, as_mem.back(), handles.back()))
+        if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
           return false;
       }
 
@@ -1034,9 +1090,8 @@ class OptiXDevice : public Device {
         build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset + mesh->num_segments();
 
         // Allocate memory for new BLAS and build it
-        as_mem.emplace_back(this, "blas");
         handles.emplace_back();
-        if (!build_optix_bvh(build_input, num_motion_steps, as_mem.back(), handles.back()))
+        if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
           return false;
       }
 
@@ -1081,15 +1136,17 @@ class OptiXDevice : public Device {
 
         // Insert motion traversable if object has motion
         if (motion_blur && ob->use_motion()) {
-          as_mem.emplace_back(this, "motion_transform");
-          device_only_memory<uint8_t> &motion_transform_gpu = as_mem.back();
-          motion_transform_gpu.alloc_to_device(sizeof(OptixSRTMotionTransform) +
-                                               (max(ob->motion.size(), 2) - 2) *
-                                                   sizeof(OptixSRTData));
+          size_t motion_keys = max(ob->motion.size(), 2) - 2;
+          size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+                                         motion_keys * sizeof(OptixSRTData);
+
+          CUdeviceptr motion_transform_gpu = 0;
+          check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size));
+          as_mem.push_back(motion_transform_gpu);
 
           // Allocate host side memory for motion transform and fill it with transform data
           OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
-              motion_transform_gpu.host_pointer = new uint8_t[motion_transform_gpu.memory_size()]);
+              new uint8_t[motion_transform_size]);
           motion_transform.child = handle;
           motion_transform.motionOptions.numKeys = ob->motion.size();
           motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
@@ -1101,38 +1158,43 @@ class OptiXDevice : public Device {
           transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size());
 
           for (size_t i = 0; i < ob->motion.size(); ++i) {
-            // scaling
-            srt_data[i].a = decomp[i].z.x;   // scale.x.y
-            srt_data[i].b = decomp[i].z.y;   // scale.x.z
-            srt_data[i].c = decomp[i].w.x;   // scale.y.z
+            // Scale
             srt_data[i].sx = decomp[i].y.w;  // scale.x.x
             srt_data[i].sy = decomp[i].z.w;  // scale.y.y
             srt_data[i].sz = decomp[i].w.w;  // scale.z.z
-            srt_data[i].pvx = 0;
-            srt_data[i].pvy = 0;
-            srt_data[i].pvz = 0;
-            // rotation
+
+            // Shear
+            srt_data[i].a = decomp[i].z.x;  // scale.x.y
+            srt_data[i].b = decomp[i].z.y;  // scale.x.z
+            srt_data[i].c = decomp[i].w.x;  // scale.y.z
+
+            // Pivot point
+            srt_data[i].pvx = 0.0f;
+            srt_data[i].pvy = 0.0f;
+            srt_data[i].pvz = 0.0f;
+
+            // Rotation
             srt_data[i].qx = decomp[i].x.x;
             srt_data[i].qy = decomp[i].x.y;
             srt_data[i].qz = decomp[i].x.z;
             srt_data[i].qw = decomp[i].x.w;
-            // transform
+
+            // Translation
             srt_data[i].tx = decomp[i].y.x;
             srt_data[i].ty = decomp[i].y.y;
             srt_data[i].tz = decomp[i].y.z;
           }
 
           // Upload motion transform to GPU
-          mem_copy_to(motion_transform_gpu);
-          delete[] reinterpret_cast<uint8_t *>(motion_transform_gpu.host_pointer);
-          motion_transform_gpu.host_pointer = 0;
+          cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+          delete[] reinterpret_cast<uint8_t *>(&motion_transform);
 
           // Disable instance transform if object uses motion transform already
           instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
 
           // Get traversable handle to motion transform
           optixConvertPointerToTraversableHandle(context,
-                                                 motion_transform_gpu.device_pointer,
+                                                 motion_transform_gpu,
                                                  OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
                                                  &instance.traversableHandle);
         }
@@ -1168,8 +1230,7 @@ class OptiXDevice : public Device {
     build_input.instanceArray.aabbs = aabbs.device_pointer;
     build_input.instanceArray.numAabbs = num_instances;
 
-    as_mem.emplace_back(this, "tlas");
-    return build_optix_bvh(build_input, 0, as_mem.back(), tlas_handle);
+    return build_optix_bvh(build_input, 0, tlas_handle);
   }
 
   void update_texture_info()
diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h
index 434502f31f9..bf5957ec9e4 100644
--- a/intern/cycles/kernel/svm/svm_bevel.h
+++ b/intern/cycles/kernel/svm/svm_bevel.h
@@ -53,7 +53,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
   float3 sum_N = make_float3(0.0f, 0.0f, 0.0f);
 
   for (int sample = 0; sample < num_samples; sample++) {
-    float disk_u = 0.0f, disk_v = 0.0f;
+    float disk_u, disk_v;
     path_branched_rng_2D(
         kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);