git.blender.org/blender.git
Diffstat (limited to 'intern/cycles')
-rw-r--r--  intern/cycles/device/device_memory.cpp  |  25
-rw-r--r--  intern/cycles/device/device_memory.h    |  12
-rw-r--r--  intern/cycles/device/device_optix.cpp   | 139
-rw-r--r--  intern/cycles/kernel/svm/svm_bevel.h    |   2
4 files changed, 101 insertions(+), 77 deletions(-)
diff --git a/intern/cycles/device/device_memory.cpp b/intern/cycles/device/device_memory.cpp
index c106b4505db..3a99a49dffc 100644
--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -47,31 +47,6 @@ device_memory::~device_memory()
assert(shared_counter == 0);
}
-device_memory::device_memory(device_memory &&other)
- : data_type(other.data_type),
- data_elements(other.data_elements),
- data_size(other.data_size),
- device_size(other.device_size),
- data_width(other.data_width),
- data_height(other.data_height),
- data_depth(other.data_depth),
- type(other.type),
- name(other.name),
- interpolation(other.interpolation),
- extension(other.extension),
- device(other.device),
- device_pointer(other.device_pointer),
- host_pointer(other.host_pointer),
- shared_pointer(other.shared_pointer),
- shared_counter(other.shared_counter)
-{
- other.device_size = 0;
- other.device_pointer = 0;
- other.host_pointer = 0;
- other.shared_pointer = 0;
- other.shared_counter = 0;
-}
-
void *device_memory::host_alloc(size_t size)
{
if (!size) {
diff --git a/intern/cycles/device/device_memory.h b/intern/cycles/device/device_memory.h
index f8324e2a214..60740807568 100644
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -235,9 +235,6 @@ class device_memory {
device_memory(const device_memory &) = delete;
device_memory &operator=(const device_memory &) = delete;
- /* But moving is possible. */
- device_memory(device_memory &&);
-
/* Host allocation on the device. All host_pointer memory should be
* allocated with these functions, for devices that support using
* the same pointer for host and device. */
@@ -275,11 +272,6 @@ template<typename T> class device_only_memory : public device_memory {
free();
}
- device_only_memory(device_only_memory &&other)
- : device_memory(static_cast<device_memory &&>(other))
- {
- }
-
void alloc_to_device(size_t num, bool shrink_to_fit = true)
{
size_t new_size = num;
@@ -338,10 +330,6 @@ template<typename T> class device_vector : public device_memory {
free();
}
- device_vector(device_vector &&other) : device_memory(static_cast<device_memory &&>(other))
- {
- }
-
/* Host memory allocation. */
T *alloc(size_t width, size_t height = 0, size_t depth = 0)
{
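
Note on the two device_memory hunks above: the move constructors were only there so the OptiX backend could keep its acceleration-structure buffers in a std::vector<device_only_memory<uint8_t>>; a std::vector must be able to move (or copy) its elements when it grows, and copying is deleted for device_memory. With the device_optix.cpp changes below those buffers become plain CUdeviceptr allocations, so the move support can go. A minimal sketch of the constraint, using an illustrative Buffer type rather than the real classes:

#include <vector>

// Non-copyable and non-movable, like device_memory after this patch.
struct Buffer {
  Buffer() = default;
  Buffer(const Buffer &) = delete;
  Buffer &operator=(const Buffer &) = delete;
};

int main()
{
  std::vector<Buffer> buffers;
  // Does not compile if uncommented: emplace_back requires Buffer to be
  // move-insertable, which is why device_memory previously declared a
  // move constructor.
  // buffers.emplace_back();
  return 0;
}
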
diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp
index ae3ab7e1fc2..7335e0bc64d 100644
--- a/intern/cycles/device/device_optix.cpp
+++ b/intern/cycles/device/device_optix.cpp
@@ -174,7 +174,7 @@ class OptiXDevice : public Device {
device_vector<SbtRecord> sbt_data;
device_vector<TextureInfo> texture_info;
device_only_memory<KernelParams> launch_params;
- vector<device_only_memory<uint8_t>> as_mem;
+ vector<CUdeviceptr> as_mem;
OptixTraversableHandle tlas_handle = 0;
// TODO(pmours): This is copied from device_cuda.cpp, so move to common code eventually
@@ -269,6 +269,9 @@ class OptiXDevice : public Device {
task_pool.stop();
// Free all acceleration structures
+ for (CUdeviceptr mem : as_mem) {
+ cuMemFree(mem);
+ }
as_mem.clear();
sbt_data.free();
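
The destructor hunk above shows the replacement bookkeeping: acceleration-structure memory is now a list of raw CUdeviceptr values, allocated with cuMemAlloc and released with cuMemFree. A sketch of the pattern, assuming a current CUDA context; the helper names are illustrative, not from the patch:

#include <cuda.h>
#include <vector>

static std::vector<CUdeviceptr> as_mem;  // Mirrors the member of the same name

static CUdeviceptr alloc_as_buffer(size_t size_in_bytes)
{
  CUdeviceptr ptr = 0;
  if (cuMemAlloc(&ptr, size_in_bytes) != CUDA_SUCCESS)
    return 0;               // Caller handles allocation failure
  as_mem.push_back(ptr);    // Track it so teardown or a rebuild can free it
  return ptr;
}

static void free_as_buffers()
{
  for (CUdeviceptr mem : as_mem)
    cuMemFree(mem);
  as_mem.clear();
}
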
@@ -831,7 +834,6 @@ class OptiXDevice : public Device {
bool build_optix_bvh(const OptixBuildInput &build_input,
uint16_t num_motion_steps,
- device_memory &out_data,
OptixTraversableHandle &out_handle)
{
out_handle = 0;
@@ -842,7 +844,15 @@ class OptiXDevice : public Device {
OptixAccelBufferSizes sizes = {};
OptixAccelBuildOptions options;
options.operation = OPTIX_BUILD_OPERATION_BUILD;
- options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE;
+ if (background) {
+ // Prefer best performance and lowest memory consumption in background
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
+ }
+ else {
+ // Prefer fast updates in viewport
+ options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD;
+ }
+
options.motionOptions.numKeys = num_motion_steps;
options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
options.motionOptions.timeBegin = 0.0f;
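
Note on the build flags above: optixAccelCompact is only valid for acceleration structures built with OPTIX_BUILD_FLAG_ALLOW_COMPACTION and with the compacted size emitted at build time; the next hunk adds both, and only for background renders, while viewport builds keep OPTIX_BUILD_FLAG_PREFER_FAST_BUILD so interactive geometry updates stay cheap.
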
@@ -853,31 +863,75 @@ class OptiXDevice : public Device {
// Allocate required output buffers
device_only_memory<char> temp_mem(this, "temp_build_mem");
- temp_mem.alloc_to_device(sizes.tempSizeInBytes);
+ temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
+ if (!temp_mem.device_pointer)
+ return false; // Make sure temporary memory allocation succeeded
+
+ // Move textures to host memory if there is not enough room
+ size_t size = 0, free = 0;
+ cuMemGetInfo(&free, &size);
+ size = sizes.outputSizeInBytes + device_working_headroom;
+ if (size >= free && can_map_host) {
+ move_textures_to_host(size - free, false);
+ }
- out_data.type = MEM_DEVICE_ONLY;
- out_data.data_type = TYPE_UNKNOWN;
- out_data.data_elements = 1;
- out_data.data_size = sizes.outputSizeInBytes;
- mem_alloc(out_data);
+ CUdeviceptr out_data = 0;
+ check_result_cuda_ret(cuMemAlloc(&out_data, sizes.outputSizeInBytes));
+ as_mem.push_back(out_data);
// Finally build the acceleration structure
+ OptixAccelEmitDesc compacted_size_prop;
+ compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
+ // A tiny space was allocated for this property at the end of the temporary buffer above
+ // Make sure this pointer is 8-byte aligned
+ compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
+
check_result_optix_ret(optixAccelBuild(context,
NULL,
&options,
&build_input,
1,
temp_mem.device_pointer,
- sizes.tempSizeInBytes,
- out_data.device_pointer,
+ temp_mem.device_size,
+ out_data,
sizes.outputSizeInBytes,
&out_handle,
- NULL,
- 0));
+ &compacted_size_prop,
+ 1));
// Wait for all operations to finish
check_result_cuda_ret(cuStreamSynchronize(NULL));
+ // Compact acceleration structure to save memory (do not do this in viewport for faster builds)
+ if (background) {
+ uint64_t compacted_size = sizes.outputSizeInBytes;
+ check_result_cuda_ret(
+ cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
+
+ // Temporary memory is no longer needed, so free it now to make space
+ temp_mem.free();
+
+ // There is no point compacting if the size does not change
+ if (compacted_size < sizes.outputSizeInBytes) {
+ CUdeviceptr compacted_data = 0;
+ if (cuMemAlloc(&compacted_data, compacted_size) != CUDA_SUCCESS)
+ // Do not compact if memory allocation for compacted acceleration structure fails
+ // Can just use the uncompacted one then, so succeed here regardless
+ return true;
+ as_mem.push_back(compacted_data);
+
+ check_result_optix_ret(optixAccelCompact(
+ context, NULL, out_handle, compacted_data, compacted_size, &out_handle));
+
+ // Wait for compaction to finish
+ check_result_cuda_ret(cuStreamSynchronize(NULL));
+
+ // Free uncompacted acceleration structure
+ cuMemFree(out_data);
+ as_mem.erase(as_mem.end() - 2); // Remove 'out_data' from 'as_mem' array
+ }
+ }
+
return true;
}
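
For reference, here is the build-plus-compact sequence from the hunk above reduced to its essentials. This is a sketch against the OptiX 7 and CUDA driver APIs, not the patch itself: the function name is made up, the check_result_* macros are replaced by plain early returns, and align_up stands in for the round-up helper the patch uses to give the emitted compacted size an 8-byte-aligned slot at the end of the temporary buffer.

#include <cuda.h>
#include <optix.h>
#include <stdint.h>
#include <vector>

static uint64_t align_up(uint64_t offset, uint64_t alignment)
{
  return (offset + alignment - 1) & ~(alignment - 1);
}

// Illustrative only: build an acceleration structure, then compact it if that saves memory.
static bool build_and_compact(OptixDeviceContext context,
                              const OptixBuildInput &build_input,
                              const OptixAccelBuildOptions &options,
                              std::vector<CUdeviceptr> &as_mem,
                              OptixTraversableHandle &out_handle)
{
  OptixAccelBufferSizes sizes = {};
  if (optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes) != OPTIX_SUCCESS)
    return false;

  // Temporary build memory, with 8 extra bytes so the compacted size can be written
  // to an 8-byte-aligned address right behind the build scratch space.
  CUdeviceptr temp_mem = 0;
  const size_t temp_size = align_up(sizes.tempSizeInBytes, 8) + 8;
  if (cuMemAlloc(&temp_mem, temp_size) != CUDA_SUCCESS)
    return false;

  // Output buffer for the uncompacted acceleration structure.
  CUdeviceptr out_data = 0;
  if (cuMemAlloc(&out_data, sizes.outputSizeInBytes) != CUDA_SUCCESS)
    return false;
  as_mem.push_back(out_data);

  // Ask the build to report how small the structure would be after compaction.
  OptixAccelEmitDesc compacted_size_prop;
  compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
  compacted_size_prop.result = align_up(temp_mem + sizes.tempSizeInBytes, 8);

  if (optixAccelBuild(context, NULL, &options, &build_input, 1,
                      temp_mem, temp_size, out_data, sizes.outputSizeInBytes,
                      &out_handle, &compacted_size_prop, 1) != OPTIX_SUCCESS)
    return false;
  cuStreamSynchronize(NULL);

  uint64_t compacted_size = sizes.outputSizeInBytes;
  cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size));
  cuMemFree(temp_mem);  // Build is done, temporary memory is no longer needed

  // Only compact if it actually shrinks the structure; if the extra allocation fails,
  // keep using the uncompacted one.
  if (compacted_size < sizes.outputSizeInBytes) {
    CUdeviceptr compacted_data = 0;
    if (cuMemAlloc(&compacted_data, compacted_size) == CUDA_SUCCESS) {
      as_mem.push_back(compacted_data);
      if (optixAccelCompact(context, NULL, out_handle,
                            compacted_data, compacted_size, &out_handle) != OPTIX_SUCCESS)
        return false;
      cuStreamSynchronize(NULL);
      cuMemFree(out_data);             // Free the uncompacted buffer
      as_mem.erase(as_mem.end() - 2);  // And drop it from the tracking list
    }
  }
  return true;
}
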
@@ -889,7 +943,10 @@ class OptiXDevice : public Device {
unordered_map<Mesh *, vector<OptixTraversableHandle>> meshes;
meshes.reserve(bvh->meshes.size());
- // Free all previous acceleration structure
+ // Free all previous acceleration structures
+ for (CUdeviceptr mem : as_mem) {
+ cuMemFree(mem);
+ }
as_mem.clear();
// Build bottom level acceleration structures (BLAS)
@@ -968,9 +1025,8 @@ class OptiXDevice : public Device {
build_input.aabbArray.primitiveIndexOffset = mesh->prim_offset;
// Allocate memory for new BLAS and build it
- as_mem.emplace_back(this, "blas");
handles.emplace_back();
- if (!build_optix_bvh(build_input, num_motion_steps, as_mem.back(), handles.back()))
+ if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
return false;
}
@@ -1034,9 +1090,8 @@ class OptiXDevice : public Device {
build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset + mesh->num_segments();
// Allocate memory for new BLAS and build it
- as_mem.emplace_back(this, "blas");
handles.emplace_back();
- if (!build_optix_bvh(build_input, num_motion_steps, as_mem.back(), handles.back()))
+ if (!build_optix_bvh(build_input, num_motion_steps, handles.back()))
return false;
}
@@ -1081,15 +1136,17 @@ class OptiXDevice : public Device {
// Insert motion traversable if object has motion
if (motion_blur && ob->use_motion()) {
- as_mem.emplace_back(this, "motion_transform");
- device_only_memory<uint8_t> &motion_transform_gpu = as_mem.back();
- motion_transform_gpu.alloc_to_device(sizeof(OptixSRTMotionTransform) +
- (max(ob->motion.size(), 2) - 2) *
- sizeof(OptixSRTData));
+ size_t motion_keys = max(ob->motion.size(), 2) - 2;
+ size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
+ motion_keys * sizeof(OptixSRTData);
+
+ CUdeviceptr motion_transform_gpu = 0;
+ check_result_cuda_ret(cuMemAlloc(&motion_transform_gpu, motion_transform_size));
+ as_mem.push_back(motion_transform_gpu);
// Allocate host side memory for motion transform and fill it with transform data
OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
- motion_transform_gpu.host_pointer = new uint8_t[motion_transform_gpu.memory_size()]);
+ new uint8_t[motion_transform_size]);
motion_transform.child = handle;
motion_transform.motionOptions.numKeys = ob->motion.size();
motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
@@ -1101,38 +1158,43 @@ class OptiXDevice : public Device {
transform_motion_decompose(decomp.data(), ob->motion.data(), ob->motion.size());
for (size_t i = 0; i < ob->motion.size(); ++i) {
- // scaling
- srt_data[i].a = decomp[i].z.x; // scale.x.y
- srt_data[i].b = decomp[i].z.y; // scale.x.z
- srt_data[i].c = decomp[i].w.x; // scale.y.z
+ // Scale
srt_data[i].sx = decomp[i].y.w; // scale.x.x
srt_data[i].sy = decomp[i].z.w; // scale.y.y
srt_data[i].sz = decomp[i].w.w; // scale.z.z
- srt_data[i].pvx = 0;
- srt_data[i].pvy = 0;
- srt_data[i].pvz = 0;
- // rotation
+
+ // Shear
+ srt_data[i].a = decomp[i].z.x; // scale.x.y
+ srt_data[i].b = decomp[i].z.y; // scale.x.z
+ srt_data[i].c = decomp[i].w.x; // scale.y.z
+
+ // Pivot point
+ srt_data[i].pvx = 0.0f;
+ srt_data[i].pvy = 0.0f;
+ srt_data[i].pvz = 0.0f;
+
+ // Rotation
srt_data[i].qx = decomp[i].x.x;
srt_data[i].qy = decomp[i].x.y;
srt_data[i].qz = decomp[i].x.z;
srt_data[i].qw = decomp[i].x.w;
- // transform
+
+ // Translation
srt_data[i].tx = decomp[i].y.x;
srt_data[i].ty = decomp[i].y.y;
srt_data[i].tz = decomp[i].y.z;
}
// Upload motion transform to GPU
- mem_copy_to(motion_transform_gpu);
- delete[] reinterpret_cast<uint8_t *>(motion_transform_gpu.host_pointer);
- motion_transform_gpu.host_pointer = 0;
+ cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
+ delete[] reinterpret_cast<uint8_t *>(&motion_transform);
// Disable instance transform if object uses motion transform already
instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
// Get traversable handle to motion transform
optixConvertPointerToTraversableHandle(context,
- motion_transform_gpu.device_pointer,
+ motion_transform_gpu,
OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
&instance.traversableHandle);
}
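
One detail of the motion transform hunks above that is easy to misread: OptixSRTMotionTransform already embeds storage for two motion keys (its trailing srtData[2] array), so only the keys beyond the first two need extra space, which is where max(ob->motion.size(), 2) - 2 comes from. A small sketch of the size calculation, assuming the OptiX 7 headers; the helper name is illustrative:

#include <optix.h>
#include <stddef.h>

// Bytes needed for an OptixSRTMotionTransform holding num_keys motion keys.
static size_t srt_motion_transform_size(size_t num_keys)
{
  const size_t extra_keys = (num_keys > 2) ? (num_keys - 2) : 0;  // max(num_keys, 2) - 2
  return sizeof(OptixSRTMotionTransform) + extra_keys * sizeof(OptixSRTData);
}

// Example: 3 motion keys need sizeof(OptixSRTMotionTransform) + 1 * sizeof(OptixSRTData) bytes.

The host-side copy follows the same idea as the patch: the transform is filled in a heap buffer of that size and uploaded with a single cuMemcpyHtoD before the buffer is deleted.
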
@@ -1168,8 +1230,7 @@ class OptiXDevice : public Device {
build_input.instanceArray.aabbs = aabbs.device_pointer;
build_input.instanceArray.numAabbs = num_instances;
- as_mem.emplace_back(this, "tlas");
- return build_optix_bvh(build_input, 0, as_mem.back(), tlas_handle);
+ return build_optix_bvh(build_input, 0, tlas_handle);
}
void update_texture_info()
diff --git a/intern/cycles/kernel/svm/svm_bevel.h b/intern/cycles/kernel/svm/svm_bevel.h
index 434502f31f9..bf5957ec9e4 100644
--- a/intern/cycles/kernel/svm/svm_bevel.h
+++ b/intern/cycles/kernel/svm/svm_bevel.h
@@ -53,7 +53,7 @@ ccl_device_noinline float3 svm_bevel(KernelGlobals *kg,
float3 sum_N = make_float3(0.0f, 0.0f, 0.0f);
for (int sample = 0; sample < num_samples; sample++) {
- float disk_u = 0.0f, disk_v = 0.0f;
+ float disk_u, disk_v;
path_branched_rng_2D(
kg, state->rng_hash, state, sample, num_samples, PRNG_BEVEL_U, &disk_u, &disk_v);