diff options
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/device/device_memory.cpp | 25 | ||||
-rw-r--r-- | intern/cycles/device/device_memory.h | 12 | ||||
-rw-r--r-- | intern/cycles/device/device_optix.cpp | 139 | ||||
-rw-r--r-- | intern/cycles/kernel/svm/svm_bevel.h | 2 |
4 files changed, 101 insertions, 77 deletions
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp index c106b4505db..3a99a49dffc 100644 --- a/intern/cycles/device/device_memory.cpp +++ b/intern/cycles/device/device_memory.cpp @@ -47,31 +47,6 @@ device_memory::~device_memory() assert(shared_counter == 0); } -device_memory::device_memory(device_memory &&other) - : data_type(other.data_type), - data_elements(other.data_elements), - data_size(other.data_size), - device_size(other.device_size), - data_width(other.data_width), - data_height(other.data_height), - data_depth(other.data_depth), - type(other.type), - name(other.name), - interpolation(other.interpolation), - extension(other.extension), - device(other.device), - device_pointer(other.device_pointer), - host_pointer(other.host_pointer), - shared_pointer(other.shared_pointer), - shared_counter(other.shared_counter) -{ - other.device_size = 0; - other.device_pointer = 0; - other.host_pointer = 0; - other.shared_pointer = 0; - other.shared_counter = 0; -} - void *device_memory::host_alloc(size_t size) { if (!size) { diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h index f8324e2a214..60740807568 100644 --- a/intern/cycles/device/device_memory.h +++ b/intern/cycles/device/device_memory.h @@ -235,9 +235,6 @@ class device_memory { device_memory(const device_memory &) = delete; device_memory &operator=(const device_memory &) = delete; - /* But moving is possible. */ - device_memory(device_memory &&); - /* Host allocation on the device. All host_pointer memory should be * allocated with these functions, for devices that support using * the same pointer for host and device. */ @@ -275,11 +272,6 @@ template<typename T> class device_only_memory : public device_memory { free(); } - device_only_memory(device_only_memory &&other) - : device_memory(static_cast<device_memory &&>(other)) - { - } - void alloc_to_device(size_t num, bool shrink_to_fit = true) { size_t new_size = num; @@ -338,10 +330,6 @@ template<typename T> class device_vector : public device_memory { free(); } - device_vector(device_vector &&other) : device_memory(static_cast<device_memory &&>(other)) - { - } - /* Host memory allocation. */ T *alloc(size_t width, size_t height = 0, size_t depth = 0) { diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp index ae3ab7e1fc2..7335e0bc64d 100644 --- a/intern/cycles/device/device_optix.cpp +++ b/intern/cycles/device/device_optix.cpp @@ -174,7 +174,7 @@ class OptiXDevice : public Device { device_vector<SbtRecord> sbt_data; device_vector<TextureInfo> texture_info; device_only_memory<KernelParams> launch_params; - vector<device_only_memory<uint8_t>> as_mem; + vector<CUdeviceptr> as_mem; OptixTraversableHandle tlas_handle = 0; // TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually @@ -269,6 +269,9 @@ class OptiXDevice : public Device { task_pool.stop(); // Free all acceleration structures + for (CUdeviceptr mem : as_mem) { + cuMemFree(mem); + } as_mem.clear(); sbt_data.free(); @@ -831,7 +834,6 @@ class OptiXDevice : public Device { bool build_optix_bvh(const OptixBuildInput &build_input, uint16_t num_motion_steps, - device_memory &out_data, OptixTraversableHandle &out_handle) { out_handle = 0; @@ -842,7 +844,15 @@ class OptiXDevice : public Device { OptixAccelBufferSizes sizes = {}; OptixAccelBuildOptions options; options.operation = OPTIX_BUILD_OPERATION_BUILD; - options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE; + if (background) { + // Prefer best performance and lowest memory consumption in background + options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION; + } + else { + // Prefer fast updates in viewport + options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD; + } + options.motionOptions.numKeys = num_motion_steps; options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH; options.motionOptions.timeBegin = 0.0f; @@ -853,31 +863,75 @@ class OptiXDevice : public Device { // Allocate required output buffers device_only_memory<char> temp_mem(this, "temp_build_mem"); - temp_mem.alloc_to_device(sizes.tempSizeInBytes); + temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8); + if (!temp_mem.device_pointer) + return false; // Make sure temporary memory allocation succeeded + + // Move textures to host memory if there is not enough room + size_t size = 0, free = 0; + cuMemGetInfo(&free, &size); + size = sizes.outputSizeInBytes + device_working_headroom; + if (size >= free && can_map_host) { + move_textures_to_host(size - free, false); + } - out_data.type = MEM_DEVICE_ONLY; - out_data.data_type = TYPE_UNKNOWN; - out_data.data_elements = 1; - out_data.data_size = sizes.outputSizeInBytes; - mem_alloc(out_data); + CUdeviceptr out_data = 0; + check_result_cuda_ret(cuMemAlloc(&out_data, sizes.outputSizeInBytes)); + as_mem.push_back(out_data); // Finally build the acceleration structure + OptixAccelEmitDesc compacted_size_prop; + compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE; + // A tiny space was allocated for this property at the end of the temporary buffer above + // Make sure this pointer is 8-byte aligned + compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8); + check_result_optix_ret(optixAccelBuild(context, NULL, &options, &build_input, 1, temp_mem.device_pointer, - sizes.tempSizeInBytes, - out_data.device_pointer, + temp_mem.device_size, + out_data, sizes.outputSizeInBytes, &out_handle, - NULL, - 0)); + &compacted_size_prop, + 1)); // Wait for all operations to finish check_result_cuda_ret(cuStreamSynchronize(NULL)); + // Compact acceleration structure to save memory (do not do this in viewport for faster builds) + if (background) { + uint64_t compacted_size = sizes.outputSizeInBytes; + check_result_cuda_ret( + cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size))); + + // Temporary memory is no longer needed, so free it now to make space + temp_mem.free(); + + // There is no point compacting if the size does not change + if (compacted_size < sizes.outputSizeInBytes) { + CUdeviceptr compacted_data = 0; + if (cuMemAlloc(&compacted_data, compacted_size) != CUDA_SUCCESS) + // Do not compact if memory allocation for compacted acceleration structure fails + // Can just use the uncompacted one then, so succeed here regardless + return true; + as_mem.push_back(compacted_data); + + check_result_optix_ret(optixAccelCompact( + context, NULL, out_handle, compacted_data, compacted_size, &out_handle)); + + // Wait for compaction to finish + check_result_cuda_ret(cuStreamSynchronize(NULL)); + + // Free uncompacted acceleration structure + cuMemFree(out_data); + as_mem.erase(as_mem.end() - 2); // Remove 'out_data' from 'as_mem' array + } + } + return true; } @@ -889,7 +943,10 @@ class OptiXDevice : public Device { unordered_map<Mesh *, vector<OptixTraversableHandle>> meshes; meshes.reserve(bvh->meshes.size()); - // Free all previous acceleration structure + // Free all previous acceleration structures + for (CUdeviceptr mem : as_mem) { + cuMemFree(mem); + } as_mem.clear(); // Build bottom level acceleration structures (BLAS) @@ -968,9 +1025,8 @@ class OptiXDevice : public Device { build_input.aabbArray.primitiveIndexOffset = mesh->prim_offset; // Allocate memory for new BLAS and build it - as_mem.emplace_back(this, "blas"); handles.emplace_back(); - if (!build_optix_bvh(build_input, num_motion_steps, as_mem.back(), handles.back())) + if (!build_optix_bvh(build_input, num_motion_steps, handles.back())) return false; } @@ -1034,9 +1090,8 @@ class OptiXDevice : public Device { build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset + mesh->num_segments(); // Allocate memory for new BLAS and build it - as_mem.emplace_back(this, "blas"); handles.emplace_back(); - if (!build_optix_bvh(build_input, num_motion_steps, as_mem.back(), handles.back())) + if (!build_optix_bvh(build_input, num_motion_steps, handles.back())) return false; } @@ -1081,15 +1136,17 @@ class OptiXDevice : public Device { // Insert motion traversable if object has motion if (motion_blur && ob->use_motion()) { - as_mem.emplace_back(this, "motion_transform"); - device_only_memory<uint8_t> &motion_transform_gpu = as_mem.back(); - motion_transform_gpu.alloc_to_device(sizeof(OptixSRTMotionTransform) + - (max(ob->motion.size(), 2) - 2) * - sizeof(OptixSRTData)); + size_t motion_keys = max(ob->motion.size(), 2) - 2; + size_t motion_transform_size = sizeof(OptixSRTMotionTransform) + + motion_keys * sizeof(OptixSRTData); + + CUdeviceptr motion_transform_gpu = 0; + check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size)); + as_mem.push_back(motion_transform_gpu); // Allocate host side memory for motion transform and fill it with transform data OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>( - motion_transform_gpu.host_pointer = new uint8_t[motion_transform_gpu.memory_size()]); + new uint8_t[motion_transform_size]); motion_transform.child = handle; motion_transform.motionOptions.numKeys = ob->motion.size(); motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE; @@ -1101,38 +1158,43 @@ class OptiXDevice : public Device { transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size()); for (size_t i = 0; i < ob->motion.size(); ++i) { - // scaling - srt_data[i].a = decomp[i].z.x; // scale.x.y - srt_data[i].b = decomp[i].z.y; // scale.x.z - srt_data[i].c = decomp[i].w.x; // scale.y.z + // Scale srt_data[i].sx = decomp[i].y.w; // scale.x.x srt_data[i].sy = decomp[i].z.w; // scale.y.y srt_data[i].sz = decomp[i].w.w; // scale.z.z - srt_data[i].pvx = 0; - srt_data[i].pvy = 0; - srt_data[i].pvz = 0; - // rotation + + // Shear + srt_data[i].a = decomp[i].z.x; // scale.x.y + srt_data[i].b = decomp[i].z.y; // scale.x.z + srt_data[i].c = decomp[i].w.x; // scale.y.z + + // Pivot point + srt_data[i].pvx = 0.0f; + srt_data[i].pvy = 0.0f; + srt_data[i].pvz = 0.0f; + + // Rotation srt_data[i].qx = decomp[i].x.x; srt_data[i].qy = decomp[i].x.y; srt_data[i].qz = decomp[i].x.z; srt_data[i].qw = decomp[i].x.w; - // transform + + // Translation srt_data[i].tx = decomp[i].y.x; srt_data[i].ty = decomp[i].y.y; srt_data[i].tz = decomp[i].y.z; } // Upload motion transform to GPU - mem_copy_to(motion_transform_gpu); - delete[] reinterpret_cast<uint8_t *>(motion_transform_gpu.host_pointer); - motion_transform_gpu.host_pointer = 0; + cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size); + delete[] reinterpret_cast<uint8_t *>(&motion_transform); // Disable instance transform if object uses motion transform already instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM; // Get traversable handle to motion transform optixConvertPointerToTraversableHandle(context, - motion_transform_gpu.device_pointer, + motion_transform_gpu, OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM, &instance.traversableHandle); } @@ -1168,8 +1230,7 @@ class OptiXDevice : public Device { build_input.instanceArray.aabbs = aabbs.device_pointer; build_input.instanceArray.numAabbs = num_instances; - as_mem.emplace_back(this, "tlas"); - return build_optix_bvh(build_input, 0, as_mem.back(), tlas_handle); + return build_optix_bvh(build_input, 0, tlas_handle); } void update_texture_info() diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h index 434502f31f9..bf5957ec9e4 100644 --- a/intern/cycles/kernel/svm/svm_bevel.h +++ b/intern/cycles/kernel/svm/svm_bevel.h @@ -53,7 +53,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg, float3 sum_N = make_float3(0.0f, 0.0f, 0.0f); for (int sample = 0; sample < num_samples; sample++) { - float disk_u = 0.0f, disk_v = 0.0f; + float disk_u, disk_v; path_branched_rng_2D( kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v); |