diff options
Diffstat (limited to 'intern/cycles/device')
-rw-r--r-- | intern/cycles/device/cuda/device_cuda_impl.cpp | 25 | ||||
-rw-r--r-- | intern/cycles/device/device.cpp | 4 | ||||
-rw-r--r-- | intern/cycles/device/device.h | 4 | ||||
-rw-r--r-- | intern/cycles/device/device_cpu.cpp | 235 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.cpp | 52 | ||||
-rw-r--r-- | intern/cycles/device/device_denoising.h | 8 | ||||
-rw-r--r-- | intern/cycles/device/device_multi.cpp | 47 | ||||
-rw-r--r-- | intern/cycles/device/device_optix.cpp | 395 | ||||
-rw-r--r-- | intern/cycles/device/device_task.h | 21 | ||||
-rw-r--r-- | intern/cycles/device/opencl/device_opencl_impl.cpp | 2 |
10 files changed, 592 insertions, 201 deletions
diff --git a/intern/cycles/device/cuda/device_cuda_impl.cpp b/intern/cycles/device/cuda/device_cuda_impl.cpp index b9bbeb9a25b..3a2eb8df95b 100644 --- a/intern/cycles/device/cuda/device_cuda_impl.cpp +++ b/intern/cycles/device/cuda/device_cuda_impl.cpp @@ -383,11 +383,24 @@ string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_featu } } - const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor)); - VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; - if (path_exists(ptx)) { - VLOG(1) << "Using precompiled kernel."; - return ptx; + /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */ + int ptx_major = major, ptx_minor = minor; + while (ptx_major >= 3) { + const string ptx = path_get( + string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + return ptx; + } + + if (ptx_minor > 0) { + ptx_minor--; + } + else { + ptx_major--; + ptx_minor = 9; + } } } @@ -1760,7 +1773,7 @@ void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising) denoising.render_buffer.samples = rtile.sample; denoising.buffer.gpu_temporary_mem = true; - denoising.run_denoising(&rtile); + denoising.run_denoising(rtile); } void CUDADevice::adaptive_sampling_filter(uint filter_sample, diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 9dbb33980b4..407f73e8451 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -209,13 +209,13 @@ bool Device::bind_fallback_display_space_shader(const float width, const float h glUseProgram(fallback_shader_program); image_texture_location = glGetUniformLocation(fallback_shader_program, "image_texture"); if (image_texture_location < 0) { - LOG(ERROR) << "Shader doesn't containt the 'image_texture' uniform."; + LOG(ERROR) << "Shader doesn't contain the 'image_texture' uniform."; return false; } fullscreen_location = glGetUniformLocation(fallback_shader_program, "fullscreen"); if (fullscreen_location < 0) { - LOG(ERROR) << "Shader doesn't containt the 'fullscreen' uniform."; + LOG(ERROR) << "Shader doesn't contain the 'fullscreen' uniform."; return false; } diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index a5833369a17..115b05e3911 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -439,10 +439,10 @@ class Device { { return 0; } - virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) + virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/) { } - virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) + virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTileNeighbors & /*neighbors*/) { } diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 8f68e66a1b4..ee3a3ddea64 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -182,6 +182,7 @@ class CPUDevice : public Device { oidn::DeviceRef oidn_device; oidn::FilterRef oidn_filter; #endif + thread_spin_lock oidn_task_lock; bool use_split_kernel; @@ -948,12 +949,25 @@ class CPUDevice : public Device { } } - void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile) + void denoise_openimagedenoise_buffer(DeviceTask &task, + float *buffer, + const size_t offset, + const size_t stride, + const size_t x, + const size_t y, + const size_t w, + const size_t h, + const float scale) { #ifdef WITH_OPENIMAGEDENOISE assert(openimagedenoise_supported()); - /* Only one at a time, since OpenImageDenoise itself is multithreaded. */ + /* Only one at a time, since OpenImageDenoise itself is multithreaded for full + * buffers, and for tiled rendering because creating multiple devices and filters + * is slow and memory hungry as well. + * + * TODO: optimize tiled rendering case, by batching together denoising of many + * tiles somehow? */ static thread_mutex mutex; thread_scoped_lock lock(mutex); @@ -964,54 +978,192 @@ class CPUDevice : public Device { } if (!oidn_filter) { oidn_filter = oidn_device.newFilter("RT"); + oidn_filter.set("hdr", true); + oidn_filter.set("srgb", false); } - /* Copy pixels from compute device to CPU (no-op for CPU device). */ - rtile.buffers->buffer.copy_from_device(); - /* Set images with appropriate stride for our interleaved pass storage. */ - const struct { + struct { const char *name; - int offset; - } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR}, - {"normal", task.pass_denoising_data + DENOISING_PASS_NORMAL}, - {"albedo", task.pass_denoising_data + DENOISING_PASS_ALBEDO}, - {"output", 0}, + const int offset; + const bool scale; + const bool use; + array<float> scaled_buffer; + } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true}, + {"albedo", + task.pass_denoising_data + DENOISING_PASS_ALBEDO, + true, + task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO}, + {"normal", + task.pass_denoising_data + DENOISING_PASS_NORMAL, + true, + task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL}, + {"output", 0, false, true}, { NULL, 0 }}; for (int i = 0; passes[i].name; i++) { - const int64_t offset = rtile.offset + rtile.x + rtile.y * rtile.stride; - const int64_t buffer_offset = (offset * task.pass_stride + passes[i].offset) * sizeof(float); - const int64_t pixel_stride = task.pass_stride * sizeof(float); - const int64_t row_stride = rtile.stride * pixel_stride; + if (!passes[i].use) { + continue; + } - oidn_filter.setImage(passes[i].name, - (char *)rtile.buffer + buffer_offset, - oidn::Format::Float3, - rtile.w, - rtile.h, - 0, - pixel_stride, - row_stride); + const int64_t pixel_offset = offset + x + y * stride; + const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset); + const int64_t pixel_stride = task.pass_stride; + const int64_t row_stride = stride * pixel_stride; + + if (passes[i].scale && scale != 1.0f) { + /* Normalize albedo and normal passes as they are scaled by the number of samples. + * For the color passes OIDN will perform auto-exposure making it unnecessary. */ + array<float> &scaled_buffer = passes[i].scaled_buffer; + scaled_buffer.resize(w * h * 3); + + for (int y = 0; y < h; y++) { + const float *pass_row = buffer + buffer_offset + y * row_stride; + float *scaled_row = scaled_buffer.data() + y * w * 3; + + for (int x = 0; x < w; x++) { + scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale; + scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale; + scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale; + } + } + + oidn_filter.setImage( + passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0); + } + else { + oidn_filter.setImage(passes[i].name, + buffer + buffer_offset, + oidn::Format::Float3, + w, + h, + 0, + pixel_stride * sizeof(float), + row_stride * sizeof(float)); + } } /* Execute filter. */ - oidn_filter.set("hdr", true); - oidn_filter.set("srgb", false); oidn_filter.commit(); oidn_filter.execute(); - - /* todo: it may be possible to avoid this copy, but we have to ensure that - * when other code copies data from the device it doesn't overwrite the - * denoiser buffers. */ - rtile.buffers->buffer.copy_to_device(); #else (void)task; - (void)rtile; + (void)buffer; + (void)offset; + (void)stride; + (void)x; + (void)y; + (void)w; + (void)h; + (void)scale; #endif } + void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile) + { + if (task.type == DeviceTask::DENOISE_BUFFER) { + /* Copy pixels from compute device to CPU (no-op for CPU device). */ + rtile.buffers->buffer.copy_from_device(); + + denoise_openimagedenoise_buffer(task, + (float *)rtile.buffer, + rtile.offset, + rtile.stride, + rtile.x, + rtile.y, + rtile.w, + rtile.h, + 1.0f / rtile.sample); + + /* todo: it may be possible to avoid this copy, but we have to ensure that + * when other code copies data from the device it doesn't overwrite the + * denoiser buffers. */ + rtile.buffers->buffer.copy_to_device(); + } + else { + /* Per-tile denoising. */ + rtile.sample = rtile.start_sample + rtile.num_samples; + const float scale = 1.0f / rtile.sample; + const float invscale = rtile.sample; + const size_t pass_stride = task.pass_stride; + + /* Map neighboring tiles into one buffer for denoising. */ + RenderTileNeighbors neighbors(rtile); + task.map_neighbor_tiles(neighbors, this); + RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; + rtile = center_tile; + + /* Calculate size of the tile to denoise (including overlap). The overlap + * size was chosen empirically. OpenImageDenoise specifies an overlap size + * of 128 but this is significantly bigger than typical tile size. */ + const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds()); + const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); + + /* Adjacent tiles are in separate memory regions, copy into single buffer. */ + array<float> merged(rect_size.x * rect_size.y * task.pass_stride); + + for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { + RenderTile &ntile = neighbors.tiles[i]; + if (!ntile.buffer) { + continue; + } + + const int xmin = max(ntile.x, rect.x); + const int ymin = max(ntile.y, rect.y); + const int xmax = min(ntile.x + ntile.w, rect.z); + const int ymax = min(ntile.y + ntile.h, rect.w); + + const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; + const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; + + const size_t merged_stride = rect_size.x; + const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; + float *merged_buffer = merged.data() + merged_offset * pass_stride; + + for (int y = ymin; y < ymax; y++) { + for (int x = 0; x < pass_stride * (xmax - xmin); x++) { + merged_buffer[x] = tile_buffer[x] * scale; + } + tile_buffer += ntile.stride * pass_stride; + merged_buffer += merged_stride * pass_stride; + } + } + + /* Denoise */ + denoise_openimagedenoise_buffer( + task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f); + + /* Copy back result from merged buffer. */ + RenderTile &ntile = neighbors.target; + if (ntile.buffer) { + const int xmin = max(ntile.x, rect.x); + const int ymin = max(ntile.y, rect.y); + const int xmax = min(ntile.x + ntile.w, rect.z); + const int ymax = min(ntile.y + ntile.h, rect.w); + + const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride; + float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride; + + const size_t merged_stride = rect_size.x; + const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride; + const float *merged_buffer = merged.data() + merged_offset * pass_stride; + + for (int y = ymin; y < ymax; y++) { + for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) { + tile_buffer[x + 0] = merged_buffer[x + 0] * invscale; + tile_buffer[x + 1] = merged_buffer[x + 1] * invscale; + tile_buffer[x + 2] = merged_buffer[x + 2] * invscale; + } + tile_buffer += ntile.stride * pass_stride; + merged_buffer += merged_stride * pass_stride; + } + } + + task.unmap_neighbor_tiles(neighbors, this); + } + } + void denoise_nlm(DenoisingTask &denoising, RenderTile &tile) { ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING); @@ -1040,7 +1192,7 @@ class CPUDevice : public Device { denoising.render_buffer.samples = tile.sample; denoising.buffer.gpu_temporary_mem = false; - denoising.run_denoising(&tile); + denoising.run_denoising(tile); } void thread_render(DeviceTask &task) @@ -1070,10 +1222,23 @@ class CPUDevice : public Device { } } + /* NLM denoiser. */ DenoisingTask *denoising = NULL; + /* OpenImageDenoise: we can only denoise with one thread at a time, so to + * avoid waiting with mutex locks in the denoiser, we let only a single + * thread acquire denoising tiles. */ + uint tile_types = task.tile_types; + bool hold_denoise_lock = false; + if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) { + if (!oidn_task_lock.try_lock()) { + tile_types &= ~RenderTile::DENOISE; + hold_denoise_lock = true; + } + } + RenderTile tile; - while (task.acquire_tile(this, tile, task.tile_types)) { + while (task.acquire_tile(this, tile, tile_types)) { if (tile.task == RenderTile::PATH_TRACE) { if (use_split_kernel) { device_only_memory<uchar> void_buffer(this, "void_buffer"); @@ -1108,6 +1273,10 @@ class CPUDevice : public Device { } } + if (hold_denoise_lock) { + oidn_task_lock.unlock(); + } + profiler.remove_state(&kg->profiler); thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer); diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 89de80a5bcd..38c42d15cab 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -71,29 +71,30 @@ DenoisingTask::~DenoisingTask() tile_info_mem.free(); } -void DenoisingTask::set_render_buffer(RenderTile *rtiles) +void DenoisingTask::set_render_buffer(RenderTileNeighbors &neighbors) { - for (int i = 0; i < 9; i++) { - tile_info->offsets[i] = rtiles[i].offset; - tile_info->strides[i] = rtiles[i].stride; - tile_info->buffers[i] = rtiles[i].buffer; + for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { + RenderTile &rtile = neighbors.tiles[i]; + tile_info->offsets[i] = rtile.offset; + tile_info->strides[i] = rtile.stride; + tile_info->buffers[i] = rtile.buffer; } - tile_info->x[0] = rtiles[3].x; - tile_info->x[1] = rtiles[4].x; - tile_info->x[2] = rtiles[5].x; - tile_info->x[3] = rtiles[5].x + rtiles[5].w; - tile_info->y[0] = rtiles[1].y; - tile_info->y[1] = rtiles[4].y; - tile_info->y[2] = rtiles[7].y; - tile_info->y[3] = rtiles[7].y + rtiles[7].h; - - target_buffer.offset = rtiles[9].offset; - target_buffer.stride = rtiles[9].stride; - target_buffer.ptr = rtiles[9].buffer; - - if (do_prefilter && rtiles[9].buffers) { + tile_info->x[0] = neighbors.tiles[3].x; + tile_info->x[1] = neighbors.tiles[4].x; + tile_info->x[2] = neighbors.tiles[5].x; + tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; + tile_info->y[0] = neighbors.tiles[1].y; + tile_info->y[1] = neighbors.tiles[4].y; + tile_info->y[2] = neighbors.tiles[7].y; + tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; + + target_buffer.offset = neighbors.target.offset; + target_buffer.stride = neighbors.target.stride; + target_buffer.ptr = neighbors.target.buffer; + + if (do_prefilter && neighbors.target.buffers) { target_buffer.denoising_output_offset = - rtiles[9].buffers->params.get_denoising_prefiltered_offset(); + neighbors.target.buffers->params.get_denoising_prefiltered_offset(); } else { target_buffer.denoising_output_offset = 0; @@ -320,12 +321,11 @@ void DenoisingTask::reconstruct() functions.solve(target_buffer.ptr); } -void DenoisingTask::run_denoising(RenderTile *tile) +void DenoisingTask::run_denoising(RenderTile &tile) { - RenderTile rtiles[10]; - rtiles[4] = *tile; - functions.map_neighbor_tiles(rtiles); - set_render_buffer(rtiles); + RenderTileNeighbors neighbors(tile); + functions.map_neighbor_tiles(neighbors); + set_render_buffer(neighbors); setup_denoising_buffer(); @@ -347,7 +347,7 @@ void DenoisingTask::run_denoising(RenderTile *tile) write_buffer(); } - functions.unmap_neighbor_tiles(rtiles); + functions.unmap_neighbor_tiles(neighbors); } CCL_NAMESPACE_END diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index 4c122e981eb..2c0dc23b44a 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -102,8 +102,8 @@ class DenoisingTask { device_ptr output_ptr)> detect_outliers; function<bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature; - function<void(RenderTile *rtiles)> map_neighbor_tiles; - function<void(RenderTile *rtiles)> unmap_neighbor_tiles; + function<void(RenderTileNeighbors &neighbors)> map_neighbor_tiles; + function<void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles; } functions; /* Stores state of the current Reconstruction operation, @@ -154,7 +154,7 @@ class DenoisingTask { DenoisingTask(Device *device, const DeviceTask &task); ~DenoisingTask(); - void run_denoising(RenderTile *tile); + void run_denoising(RenderTile &tile); struct DenoiseBuffers { int pass_stride; @@ -179,7 +179,7 @@ class DenoisingTask { protected: Device *device; - void set_render_buffer(RenderTile *rtiles); + void set_render_buffer(RenderTileNeighbors &neighbors); void setup_denoising_buffer(); void prefilter_shadowing(); void prefilter_features(); diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp index fd14bbdccc5..9ea8782d0f0 100644 --- a/intern/cycles/device/device_multi.cpp +++ b/intern/cycles/device/device_multi.cpp @@ -177,8 +177,11 @@ class MultiDevice : public Device { return false; if (requested_features.use_denoising) { + /* Only need denoising feature, everything else is unused. */ + DeviceRequestedFeatures denoising_features; + denoising_features.use_denoising = true; foreach (SubDevice &sub, denoising_devices) - if (!sub.device->load_kernels(requested_features)) + if (!sub.device->load_kernels(denoising_features)) return false; } @@ -581,20 +584,22 @@ class MultiDevice : public Device { return -1; } - void map_neighbor_tiles(Device *sub_device, RenderTile *tiles) + void map_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) { - for (int i = 0; i < 9; i++) { - if (!tiles[i].buffers) { + for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { + RenderTile &tile = neighbors.tiles[i]; + + if (!tile.buffers) { continue; } - device_vector<float> &mem = tiles[i].buffers->buffer; - tiles[i].buffer = mem.device_pointer; + device_vector<float> &mem = tile.buffers->buffer; + tile.buffer = mem.device_pointer; if (mem.device == this && matching_rendering_and_denoising_devices) { /* Skip unnecessary copies in viewport mode (buffer covers the * whole image), but still need to fix up the tile device pointer. */ - map_tile(sub_device, tiles[i]); + map_tile(sub_device, tile); continue; } @@ -607,15 +612,15 @@ class MultiDevice : public Device { * also required for the case where a CPU thread is denoising * a tile rendered on the GPU. In that case we have to avoid * overwriting the buffer being de-noised by the CPU thread. */ - if (!tiles[i].buffers->map_neighbor_copied) { - tiles[i].buffers->map_neighbor_copied = true; + if (!tile.buffers->map_neighbor_copied) { + tile.buffers->map_neighbor_copied = true; mem.copy_from_device(); } if (mem.device == this) { /* Can re-use memory if tile is already allocated on the sub device. */ - map_tile(sub_device, tiles[i]); - mem.swap_device(sub_device, mem.device_size, tiles[i].buffer); + map_tile(sub_device, tile); + mem.swap_device(sub_device, mem.device_size, tile.buffer); } else { mem.swap_device(sub_device, 0, 0); @@ -623,40 +628,42 @@ class MultiDevice : public Device { mem.copy_to_device(); - tiles[i].buffer = mem.device_pointer; - tiles[i].device_size = mem.device_size; + tile.buffer = mem.device_pointer; + tile.device_size = mem.device_size; mem.restore_device(); } } } - void unmap_neighbor_tiles(Device *sub_device, RenderTile *tiles) + void unmap_neighbor_tiles(Device *sub_device, RenderTileNeighbors &neighbors) { - device_vector<float> &mem = tiles[9].buffers->buffer; + RenderTile &target_tile = neighbors.target; + device_vector<float> &mem = target_tile.buffers->buffer; if (mem.device == this && matching_rendering_and_denoising_devices) { return; } /* Copy denoised result back to the host. */ - mem.swap_device(sub_device, tiles[9].device_size, tiles[9].buffer); + mem.swap_device(sub_device, target_tile.device_size, target_tile.buffer); mem.copy_from_device(); mem.restore_device(); /* Copy denoised result to the original device. */ mem.copy_to_device(); - for (int i = 0; i < 9; i++) { - if (!tiles[i].buffers) { + for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { + RenderTile &tile = neighbors.tiles[i]; + if (!tile.buffers) { continue; } - device_vector<float> &mem = tiles[i].buffers->buffer; + device_vector<float> &mem = tile.buffers->buffer; if (mem.device != sub_device && mem.device != this) { /* Free up memory again if it was allocated for the copy above. */ - mem.swap_device(sub_device, tiles[i].device_size, tiles[i].buffer); + mem.swap_device(sub_device, tile.device_size, tile.buffer); sub_device->mem_free(mem); mem.restore_device(); } diff --git a/intern/cycles/device/device_optix.cpp b/intern/cycles/device/device_optix.cpp index ececca3df53..1cc45983565 100644 --- a/intern/cycles/device/device_optix.cpp +++ b/intern/cycles/device/device_optix.cpp @@ -131,8 +131,12 @@ class OptiXDevice : public CUDADevice { PG_RGEN, PG_MISS, PG_HITD, // Default hit group - PG_HITL, // __BVH_LOCAL__ hit group PG_HITS, // __SHADOW_RECORD_ALL__ hit group + PG_HITL, // __BVH_LOCAL__ hit group (only used for triangles) +# if OPTIX_ABI_VERSION >= 36 + PG_HITD_MOTION, + PG_HITS_MOTION, +# endif # ifdef WITH_CYCLES_DEBUG PG_EXCP, # endif @@ -177,6 +181,7 @@ class OptiXDevice : public CUDADevice { OptixDeviceContext context = NULL; OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module + OptixModule builtin_modules[2] = {}; OptixPipeline pipelines[NUM_PIPELINES] = {}; bool motion_blur = false; @@ -264,6 +269,9 @@ class OptiXDevice : public CUDADevice { // Unload modules if (optix_module != NULL) optixModuleDestroy(optix_module); + for (unsigned int i = 0; i < 2; ++i) + if (builtin_modules[i] != NULL) + optixModuleDestroy(builtin_modules[i]); for (unsigned int i = 0; i < NUM_PIPELINES; ++i) if (pipelines[i] != NULL) optixPipelineDestroy(pipelines[i]); @@ -338,6 +346,12 @@ class OptiXDevice : public CUDADevice { optixModuleDestroy(optix_module); optix_module = NULL; } + for (unsigned int i = 0; i < 2; ++i) { + if (builtin_modules[i] != NULL) { + optixModuleDestroy(builtin_modules[i]); + builtin_modules[i] = NULL; + } + } for (unsigned int i = 0; i < NUM_PIPELINES; ++i) { if (pipelines[i] != NULL) { optixPipelineDestroy(pipelines[i]); @@ -369,6 +383,18 @@ class OptiXDevice : public CUDADevice { # endif pipeline_options.pipelineLaunchParamsVariableName = "__params"; // See kernel_globals.h +# if OPTIX_ABI_VERSION >= 36 + pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE; + if (requested_features.use_hair) { + if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE; + } + else { + pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM; + } + } +# endif + // Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds // This is necessary since objects may be reported to have motion if the Vector pass is // active, but may still need to be rendered without motion blur if that isn't active as well @@ -442,6 +468,34 @@ class OptiXDevice : public CUDADevice { group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon"; } + +# if OPTIX_ABI_VERSION >= 36 + if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) { + OptixBuiltinISOptions builtin_options; + builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; + builtin_options.usesMotionBlur = false; + + check_result_optix_ret(optixBuiltinISModuleGet( + context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0])); + + group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0]; + group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr; + group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0]; + group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr; + + if (motion_blur) { + builtin_options.usesMotionBlur = true; + + check_result_optix_ret(optixBuiltinISModuleGet( + context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1])); + + group_descs[PG_HITD_MOTION] = group_descs[PG_HITD]; + group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1]; + group_descs[PG_HITS_MOTION] = group_descs[PG_HITS]; + group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1]; + } + } +# endif } if (requested_features.use_subsurface || requested_features.use_shader_raytrace) { @@ -493,8 +547,14 @@ class OptiXDevice : public CUDADevice { unsigned int trace_css = stack_size[PG_HITD].cssCH; // This is based on the maximum of closest-hit and any-hit/intersection programs trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH); - trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH); + trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH); +# if OPTIX_ABI_VERSION >= 36 + trace_css = std::max(trace_css, + stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH); + trace_css = std::max(trace_css, + stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH); +# endif OptixPipelineLinkOptions link_options; link_options.maxTraceDepth = 1; @@ -503,17 +563,23 @@ class OptiXDevice : public CUDADevice { # else link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO; # endif - link_options.overrideUsesMotionBlur = pipeline_options.usesMotionBlur; +# if OPTIX_ABI_VERSION < 24 + link_options.overrideUsesMotionBlur = motion_blur; +# endif { // Create path tracing pipeline OptixProgramGroup pipeline_groups[] = { - groups[PG_RGEN], - groups[PG_MISS], - groups[PG_HITD], - groups[PG_HITS], - groups[PG_HITL], + groups[PG_RGEN], + groups[PG_MISS], + groups[PG_HITD], + groups[PG_HITS], + groups[PG_HITL], +# if OPTIX_ABI_VERSION >= 36 + groups[PG_HITD_MOTION], + groups[PG_HITS_MOTION], +# endif # ifdef WITH_CYCLES_DEBUG - groups[PG_EXCP], + groups[PG_EXCP], # endif }; check_result_optix_ret( @@ -530,8 +596,8 @@ class OptiXDevice : public CUDADevice { const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css; // Set stack size depending on pipeline options - check_result_optix_ret(optixPipelineSetStackSize( - pipelines[PIP_PATH_TRACE], 0, 0, css, (pipeline_options.usesMotionBlur ? 3 : 2))); + check_result_optix_ret( + optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE], 0, 0, css, (motion_blur ? 3 : 2))); } // Only need to create shader evaluation pipeline if one of these features is used: @@ -541,15 +607,19 @@ class OptiXDevice : public CUDADevice { if (use_shader_eval_pipeline) { // Create shader evaluation pipeline OptixProgramGroup pipeline_groups[] = { - groups[PG_BAKE], - groups[PG_DISP], - groups[PG_BACK], - groups[PG_MISS], - groups[PG_HITD], - groups[PG_HITS], - groups[PG_HITL], + groups[PG_BAKE], + groups[PG_DISP], + groups[PG_BACK], + groups[PG_MISS], + groups[PG_HITD], + groups[PG_HITS], + groups[PG_HITL], +# if OPTIX_ABI_VERSION >= 36 + groups[PG_HITD_MOTION], + groups[PG_HITS_MOTION], +# endif # ifdef WITH_CYCLES_DEBUG - groups[PG_EXCP], + groups[PG_EXCP], # endif }; check_result_optix_ret( @@ -672,7 +742,11 @@ class OptiXDevice : public CUDADevice { sbt_params.missRecordCount = 1; sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITL, PG_HITS +# if OPTIX_ABI_VERSION >= 36 + sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL +# else + sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL +# endif // Launch the ray generation program check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE], @@ -727,19 +801,18 @@ class OptiXDevice : public CUDADevice { // 0 1 2 // 3 4 5 // 6 7 8 9 - RenderTile rtiles[10]; - rtiles[4] = rtile; - task.map_neighbor_tiles(rtiles, this); - rtile = rtiles[4]; // Tile may have been modified by mapping code + RenderTileNeighbors neighbors(rtile); + task.map_neighbor_tiles(neighbors, this); + RenderTile ¢er_tile = neighbors.tiles[RenderTileNeighbors::CENTER]; + RenderTile &target_tile = neighbors.target; + rtile = center_tile; // Tile may have been modified by mapping code // Calculate size of the tile to denoise (including overlap) - int4 rect = make_int4( - rtiles[4].x, rtiles[4].y, rtiles[4].x + rtiles[4].w, rtiles[4].y + rtiles[4].h); + int4 rect = center_tile.bounds(); // Overlap between tiles has to be at least 64 pixels // TODO(pmours): Query this value from OptiX rect = rect_expand(rect, 64); - int4 clip_rect = make_int4( - rtiles[3].x, rtiles[1].y, rtiles[5].x + rtiles[5].w, rtiles[7].y + rtiles[7].h); + int4 clip_rect = neighbors.bounds(); rect = rect_clip(rect, clip_rect); int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y); int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y); @@ -760,14 +833,14 @@ class OptiXDevice : public CUDADevice { device_only_memory<float> input(this, "denoiser input"); device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_WRITE); - if ((!rtiles[0].buffer || rtiles[0].buffer == rtile.buffer) && - (!rtiles[1].buffer || rtiles[1].buffer == rtile.buffer) && - (!rtiles[2].buffer || rtiles[2].buffer == rtile.buffer) && - (!rtiles[3].buffer || rtiles[3].buffer == rtile.buffer) && - (!rtiles[5].buffer || rtiles[5].buffer == rtile.buffer) && - (!rtiles[6].buffer || rtiles[6].buffer == rtile.buffer) && - (!rtiles[7].buffer || rtiles[7].buffer == rtile.buffer) && - (!rtiles[8].buffer || rtiles[8].buffer == rtile.buffer)) { + bool contiguous_memory = true; + for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { + if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) { + contiguous_memory = false; + } + } + + if (contiguous_memory) { // Tiles are in continous memory, so can just subtract overlap offset input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride; // Stride covers the whole width of the image and not just a single tile @@ -782,19 +855,19 @@ class OptiXDevice : public CUDADevice { input_stride *= rect_size.x; TileInfo *tile_info = tile_info_mem.alloc(1); - for (int i = 0; i < 9; i++) { - tile_info->offsets[i] = rtiles[i].offset; - tile_info->strides[i] = rtiles[i].stride; - tile_info->buffers[i] = rtiles[i].buffer; + for (int i = 0; i < RenderTileNeighbors::SIZE; i++) { + tile_info->offsets[i] = neighbors.tiles[i].offset; + tile_info->strides[i] = neighbors.tiles[i].stride; + tile_info->buffers[i] = neighbors.tiles[i].buffer; } - tile_info->x[0] = rtiles[3].x; - tile_info->x[1] = rtiles[4].x; - tile_info->x[2] = rtiles[5].x; - tile_info->x[3] = rtiles[5].x + rtiles[5].w; - tile_info->y[0] = rtiles[1].y; - tile_info->y[1] = rtiles[4].y; - tile_info->y[2] = rtiles[7].y; - tile_info->y[3] = rtiles[7].y + rtiles[7].h; + tile_info->x[0] = neighbors.tiles[3].x; + tile_info->x[1] = neighbors.tiles[4].x; + tile_info->x[2] = neighbors.tiles[5].x; + tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w; + tile_info->y[0] = neighbors.tiles[1].y; + tile_info->y[1] = neighbors.tiles[4].y; + tile_info->y[2] = neighbors.tiles[7].y; + tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h; tile_info_mem.copy_to_device(); void *args[] = { @@ -804,7 +877,7 @@ class OptiXDevice : public CUDADevice { # if OPTIX_DENOISER_NO_PIXEL_STRIDE device_only_memory<float> input_rgb(this, "denoiser input rgb"); - input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.optix_input_passes); + input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes); void *input_args[] = {&input_rgb.device_pointer, &input_ptr, @@ -813,7 +886,7 @@ class OptiXDevice : public CUDADevice { &input_stride, &task.pass_stride, const_cast<int *>(pass_offset), - &task.denoising.optix_input_passes, + &task.denoising.input_passes, &rtile.sample}; launch_filter_kernel( "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args); @@ -824,7 +897,7 @@ class OptiXDevice : public CUDADevice { # endif const bool recreate_denoiser = (denoiser == NULL) || - (task.denoising.optix_input_passes != denoiser_input_passes); + (task.denoising.input_passes != denoiser_input_passes); if (recreate_denoiser) { // Destroy existing handle before creating new one if (denoiser != NULL) { @@ -833,23 +906,29 @@ class OptiXDevice : public CUDADevice { // Create OptiX denoiser handle on demand when it is first used OptixDenoiserOptions denoiser_options; - assert(task.denoising.optix_input_passes >= 1 && task.denoising.optix_input_passes <= 3); + assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3); denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>( - OPTIX_DENOISER_INPUT_RGB + (task.denoising.optix_input_passes - 1)); + OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1)); +# if OPTIX_ABI_VERSION < 28 denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3; +# endif check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser)); check_result_optix_ret( optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0)); // OptiX denoiser handle was created with the requested number of input passes - denoiser_input_passes = task.denoising.optix_input_passes; + denoiser_input_passes = task.denoising.input_passes; } OptixDenoiserSizes sizes = {}; check_result_optix_ret( optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes)); +# if OPTIX_ABI_VERSION < 28 const size_t scratch_size = sizes.recommendedScratchSizeInBytes; +# else + const size_t scratch_size = sizes.withOverlapScratchSizeInBytes; +# endif const size_t scratch_offset = sizes.stateSizeInBytes; // Allocate denoiser state if tile size has changed since last setup @@ -897,10 +976,10 @@ class OptiXDevice : public CUDADevice { int2 output_offset = overlap_offset; overlap_offset = make_int2(0, 0); // Not supported by denoiser API, so apply manually # else - output_layers[0].data = rtiles[9].buffer + pixel_offset; - output_layers[0].width = rtiles[9].w; - output_layers[0].height = rtiles[9].h; - output_layers[0].rowStrideInBytes = rtiles[9].stride * pixel_stride; + output_layers[0].data = target_tile.buffer + pixel_offset; + output_layers[0].width = target_tile.w; + output_layers[0].height = target_tile.h; + output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride; output_layers[0].pixelStrideInBytes = pixel_stride; # endif output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3; @@ -913,7 +992,7 @@ class OptiXDevice : public CUDADevice { denoiser_state.device_pointer, scratch_offset, input_layers, - task.denoising.optix_input_passes, + task.denoising.input_passes, overlap_offset.x, overlap_offset.y, output_layers, @@ -922,26 +1001,26 @@ class OptiXDevice : public CUDADevice { # if OPTIX_DENOISER_NO_PIXEL_STRIDE void *output_args[] = {&input_ptr, - &rtiles[9].buffer, + &target_tile.buffer, &output_offset.x, &output_offset.y, &rect_size.x, &rect_size.y, - &rtiles[9].x, - &rtiles[9].y, - &rtiles[9].w, - &rtiles[9].h, - &rtiles[9].offset, - &rtiles[9].stride, + &target_tile.x, + &target_tile.y, + &target_tile.w, + &target_tile.h, + &target_tile.offset, + &target_tile.stride, &task.pass_stride, &rtile.sample}; launch_filter_kernel( - "kernel_cuda_filter_convert_from_rgb", rtiles[9].w, rtiles[9].h, output_args); + "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args); # endif check_result_cuda_ret(cuStreamSynchronize(0)); - task.unmap_neighbor_tiles(rtiles, this); + task.unmap_neighbor_tiles(neighbors, this); } else { // Run CUDA denoising kernels @@ -993,7 +1072,11 @@ class OptiXDevice : public CUDADevice { sbt_params.missRecordCount = 1; sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord); sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord); - sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITL, PG_HITS +# if OPTIX_ABI_VERSION >= 36 + sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL +# else + sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL +# endif check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL], cuda_stream[thread_index], @@ -1070,7 +1153,7 @@ class OptiXDevice : public CUDADevice { &build_input, 1, temp_mem.device_pointer, - temp_mem.device_size, + sizes.tempSizeInBytes, out_data, sizes.outputSizeInBytes, &out_handle, @@ -1142,7 +1225,6 @@ class OptiXDevice : public CUDADevice { continue; } - const size_t num_curves = hair->num_curves(); const size_t num_segments = hair->num_segments(); size_t num_motion_steps = 1; @@ -1152,7 +1234,18 @@ class OptiXDevice : public CUDADevice { } device_vector<OptixAabb> aabb_data(this, "temp_aabb_data", MEM_READ_ONLY); - aabb_data.alloc(num_segments * num_motion_steps); +# if OPTIX_ABI_VERSION >= 36 + device_vector<int> index_data(this, "temp_index_data", MEM_READ_ONLY); + device_vector<float4> vertex_data(this, "temp_vertex_data", MEM_READ_ONLY); + // Four control points for each curve segment + const size_t num_vertices = num_segments * 4; + if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { + index_data.alloc(num_segments); + vertex_data.alloc(num_vertices * num_motion_steps); + } + else +# endif + aabb_data.alloc(num_segments * num_motion_steps); // Get AABBs for each motion step for (size_t step = 0; step < num_motion_steps; ++step) { @@ -1165,44 +1258,127 @@ class OptiXDevice : public CUDADevice { keys = motion_keys->data_float3() + attr_offset * hair->curve_keys.size(); } - size_t i = step * num_segments; - for (size_t j = 0; j < num_curves; ++j) { - const Hair::Curve c = hair->get_curve(j); - - for (size_t k = 0; k < c.num_segments(); ++i, ++k) { - BoundBox bounds = BoundBox::empty; - c.bounds_grow(k, keys, hair->curve_radius.data(), bounds); - - aabb_data[i].minX = bounds.min.x; - aabb_data[i].minY = bounds.min.y; - aabb_data[i].minZ = bounds.min.z; - aabb_data[i].maxX = bounds.max.x; - aabb_data[i].maxY = bounds.max.y; - aabb_data[i].maxZ = bounds.max.z; + for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) { + const Hair::Curve curve = hair->get_curve(j); + + for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) { +# if OPTIX_ABI_VERSION >= 36 + if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { + int k0 = curve.first_key + segment; + int k1 = k0 + 1; + int ka = max(k0 - 1, curve.first_key); + int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1); + + const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x); + const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y); + const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z); + const float4 pw = make_float4(hair->curve_radius[ka], + hair->curve_radius[k0], + hair->curve_radius[k1], + hair->curve_radius[kb]); + + // Convert Catmull-Rom data to Bezier spline + static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f; + static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f; + static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f; + static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f; + + index_data[i] = i * 4; + float4 *const v = vertex_data.data() + step * num_vertices + index_data[i]; + v[0] = make_float4( + dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw)); + v[1] = make_float4( + dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw)); + v[2] = make_float4( + dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw)); + v[3] = make_float4( + dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw)); + } + else +# endif + { + BoundBox bounds = BoundBox::empty; + curve.bounds_grow(segment, keys, hair->curve_radius.data(), bounds); + + const size_t index = step * num_segments + i; + aabb_data[index].minX = bounds.min.x; + aabb_data[index].minY = bounds.min.y; + aabb_data[index].minZ = bounds.min.z; + aabb_data[index].maxX = bounds.max.x; + aabb_data[index].maxY = bounds.max.y; + aabb_data[index].maxZ = bounds.max.z; + } } } } // Upload AABB data to GPU aabb_data.copy_to_device(); +# if OPTIX_ABI_VERSION >= 36 + index_data.copy_to_device(); + vertex_data.copy_to_device(); +# endif vector<device_ptr> aabb_ptrs; aabb_ptrs.reserve(num_motion_steps); +# if OPTIX_ABI_VERSION >= 36 + vector<device_ptr> width_ptrs; + vector<device_ptr> vertex_ptrs; + width_ptrs.reserve(num_motion_steps); + vertex_ptrs.reserve(num_motion_steps); +# endif for (size_t step = 0; step < num_motion_steps; ++step) { aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb)); +# if OPTIX_ABI_VERSION >= 36 + const device_ptr base_ptr = vertex_data.device_pointer + + step * num_vertices * sizeof(float4); + width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size + vertex_ptrs.push_back(base_ptr); +# endif } - // Disable visibility test anyhit program, since it is already checked during intersection - // Those trace calls that require anyhit can force it with OPTIX_RAY_FLAG_ENFORCE_ANYHIT - unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; + // Force a single any-hit call, so shadow record-all behavior works correctly + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; OptixBuildInput build_input = {}; - build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; - build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); - build_input.aabbArray.numPrimitives = num_segments; - build_input.aabbArray.strideInBytes = sizeof(OptixAabb); - build_input.aabbArray.flags = &build_flags; - build_input.aabbArray.numSbtRecords = 1; - build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset; +# if OPTIX_ABI_VERSION >= 36 + if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) { + build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES; + build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE; + build_input.curveArray.numPrimitives = num_segments; + build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); + build_input.curveArray.numVertices = num_vertices; + build_input.curveArray.vertexStrideInBytes = sizeof(float4); + build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data(); + build_input.curveArray.widthStrideInBytes = sizeof(float4); + build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer; + build_input.curveArray.indexStrideInBytes = sizeof(int); + build_input.curveArray.flag = build_flags; + build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset; + } + else +# endif + { + // Disable visibility test any-hit program, since it is already checked during + // intersection. Those trace calls that require anyhit can force it with a ray flag. + build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT; + + build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES; +# if OPTIX_ABI_VERSION < 23 + build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); + build_input.aabbArray.numPrimitives = num_segments; + build_input.aabbArray.strideInBytes = sizeof(OptixAabb); + build_input.aabbArray.flags = &build_flags; + build_input.aabbArray.numSbtRecords = 1; + build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset; +# else + build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data(); + build_input.customPrimitiveArray.numPrimitives = num_segments; + build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb); + build_input.customPrimitiveArray.flags = &build_flags; + build_input.customPrimitiveArray.numSbtRecords = 1; + build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset; +# endif + } // Allocate memory for new BLAS and build it OptixTraversableHandle handle; @@ -1257,8 +1433,8 @@ class OptiXDevice : public CUDADevice { vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3)); } - // No special build flags for triangle primitives - unsigned int build_flags = OPTIX_GEOMETRY_FLAG_NONE; + // Force a single any-hit call, so shadow record-all behavior works correctly + unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL; OptixBuildInput build_input = {}; build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES; build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data(); @@ -1324,9 +1500,26 @@ class OptiXDevice : public CUDADevice { // Set user instance ID to object index instance.instanceId = ob->get_device_index(); - // Volumes have a special bit set in the visibility mask so a trace can mask only volumes - // See 'scene_intersect_volume' in bvh.h - instance.visibilityMask = (ob->geometry->has_volume ? 3 : 1); + // Have to have at least one bit in the mask, or else instance would always be culled + instance.visibilityMask = 1; + + if (ob->geometry->has_volume) { + // Volumes have a special bit set in the visibility mask so a trace can mask only volumes + instance.visibilityMask |= 2; + } + + if (ob->geometry->type == Geometry::HAIR) { + // Same applies to curves (so they can be skipped in local trace calls) + instance.visibilityMask |= 4; + +# if OPTIX_ABI_VERSION >= 36 + if (motion_blur && ob->geometry->has_motion_blur() && DebugFlags().optix.curves_api && + static_cast<const Hair *>(ob->geometry)->curve_shape == CURVE_THICK) { + // Select between motion blur and non-motion blur built-in intersection module + instance.sbtOffset = PG_HITD_MOTION - PG_HITD; + } +# endif + } // Insert motion traversable if object has motion if (motion_blur && ob->use_motion()) { diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h index 600973b8100..fd380788282 100644 --- a/intern/cycles/device/device_task.h +++ b/intern/cycles/device/device_task.h @@ -29,6 +29,7 @@ CCL_NAMESPACE_BEGIN class Device; class RenderBuffers; class RenderTile; +class RenderTileNeighbors; class Tile; enum DenoiserType { @@ -41,6 +42,14 @@ enum DenoiserType { DENOISER_ALL = ~0, }; +enum DenoiserInput { + DENOISER_INPUT_RGB = 1, + DENOISER_INPUT_RGB_ALBEDO = 2, + DENOISER_INPUT_RGB_ALBEDO_NORMAL = 3, + + DENOISER_INPUT_NUM, +}; + typedef int DenoiserTypeMask; class DenoiseParams { @@ -72,10 +81,10 @@ class DenoiseParams { /* Clamp the input to the range of +-1e8. Should be enough for any legitimate data. */ bool clamp_input; - /** Optix Denoiser **/ + /** OIDN/Optix Denoiser **/ - /* Passes handed over to the OptiX denoiser (default to color + albedo). */ - int optix_input_passes; + /* Passes handed over to the OIDN/OptiX denoiser (default to color + albedo). */ + DenoiserInput input_passes; DenoiseParams() { @@ -91,7 +100,7 @@ class DenoiseParams { neighbor_frames = 2; clamp_input = true; - optix_input_passes = 2; + input_passes = DENOISER_INPUT_RGB_ALBEDO_NORMAL; start_sample = 0; } @@ -150,8 +159,8 @@ class DeviceTask { function<void(RenderTile &)> update_tile_sample; function<void(RenderTile &)> release_tile; function<bool()> get_cancel; - function<void(RenderTile *, Device *)> map_neighbor_tiles; - function<void(RenderTile *, Device *)> unmap_neighbor_tiles; + function<void(RenderTileNeighbors &, Device *)> map_neighbor_tiles; + function<void(RenderTileNeighbors &, Device *)> unmap_neighbor_tiles; uint tile_types; DenoiseParams denoising; diff --git a/intern/cycles/device/opencl/device_opencl_impl.cpp b/intern/cycles/device/opencl/device_opencl_impl.cpp index 8c94815b193..e851749949d 100644 --- a/intern/cycles/device/opencl/device_opencl_impl.cpp +++ b/intern/cycles/device/opencl/device_opencl_impl.cpp @@ -1850,7 +1850,7 @@ void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising) denoising.render_buffer.samples = rtile.sample; denoising.buffer.gpu_temporary_mem = true; - denoising.run_denoising(&rtile); + denoising.run_denoising(rtile); } void OpenCLDevice::shader(DeviceTask &task) |