diff options
Diffstat (limited to 'intern/cycles/kernel/kernels/cuda')
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/filter.cu | 413 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/kernel.cu | 232 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/kernel_config.h | 121 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h | 265 | ||||
-rw-r--r-- | intern/cycles/kernel/kernels/cuda/kernel_split.cu | 156 |
5 files changed, 0 insertions, 1187 deletions
diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu deleted file mode 100644 index 6c9642d1f03..00000000000 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Copyright 2011-2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CUDA kernel entry points */ - -#ifdef __CUDA_ARCH__ - -#include "kernel_config.h" - -#include "kernel/kernel_compat_cuda.h" - -#include "kernel/filter/filter_kernel.h" - -/* kernels */ - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_copy_input(float *buffer, - CCL_FILTER_TILE_INFO, - int4 prefilter_rect, - int buffer_pass_stride) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - int xtile = (x < tile_info->x[1]) ? 0 : ((x < tile_info->x[2]) ? 1 : 2); - int ytile = (y < tile_info->y[1]) ? 0 : ((y < tile_info->y[2]) ? 1 : 2); - int itile = ytile * 3 + xtile; - float *const in = ((float *)ccl_get_tile_buffer(itile)) + - (tile_info->offsets[itile] + y * tile_info->strides[itile] + x) * buffer_pass_stride; - buffer += ((y - prefilter_rect.y) * (prefilter_rect.z - prefilter_rect.x) + (x - prefilter_rect.x)) * buffer_pass_stride; - for (int i = 0; i < buffer_pass_stride; ++i) - buffer[i] = in[i]; - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_convert_to_rgb(float *rgb, float *buf, int sw, int sh, int stride, int pass_stride, int3 pass_offset, int num_inputs, int num_samples) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < sw && y < sh) { - if (num_inputs > 0) { - float *in = buf + x * pass_stride + (y * stride + pass_offset.x) / sizeof(float); - float *out = rgb + (x + y * sw) * 3; - out[0] = clamp(in[0] / num_samples, 0.0f, 10000.0f); - out[1] = clamp(in[1] / num_samples, 0.0f, 10000.0f); - out[2] = clamp(in[2] / num_samples, 0.0f, 10000.0f); - } - if (num_inputs > 1) { - float *in = buf + x * pass_stride + (y * stride + pass_offset.y) / sizeof(float); - float *out = rgb + (x + y * sw) * 3 + (sw * sh) * 3; - out[0] = in[0] / num_samples; - out[1] = in[1] / num_samples; - out[2] = in[2] / num_samples; - } - if (num_inputs > 2) { - float *in = buf + x * pass_stride + (y * stride + pass_offset.z) / sizeof(float); - float *out = rgb + (x + y * sw) * 3 + (sw * sh * 2) * 3; - out[0] = in[0] / num_samples; - out[1] = in[1] / num_samples; - out[2] = in[2] / num_samples; - } - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_convert_from_rgb(float *rgb, float *buf, int ix, int iy, int iw, int ih, int sx, int sy, int sw, int sh, int offset, int stride, int pass_stride, int num_samples) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < sw && y < sh) { - float *in = rgb + ((ix + x) + (iy + y) * iw) * 3; - float *out = buf + (offset + (sx + x) + (sy + y) * stride) * pass_stride; - out[0] = in[0] * num_samples; - out[1] = in[1] * num_samples; - out[2] = in[2] * num_samples; - } -} - - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_divide_shadow(int sample, - CCL_FILTER_TILE_INFO, - float *unfilteredA, - float *unfilteredB, - float *sampleVariance, - float *sampleVarianceV, - float *bufferVariance, - int4 prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_divide_shadow(sample, - tile_info, - x, y, - unfilteredA, - unfilteredB, - sampleVariance, - sampleVarianceV, - bufferVariance, - prefilter_rect, - buffer_pass_stride, - buffer_denoising_offset); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_get_feature(int sample, - CCL_FILTER_TILE_INFO, - int m_offset, - int v_offset, - float *mean, - float *variance, - float scale, - int4 prefilter_rect, - int buffer_pass_stride, - int buffer_denoising_offset) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_get_feature(sample, - tile_info, - m_offset, v_offset, - x, y, - mean, variance, - scale, - prefilter_rect, - buffer_pass_stride, - buffer_denoising_offset); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_write_feature(int sample, - int4 buffer_params, - int4 filter_area, - float *from, - float *buffer, - int out_offset, - int4 prefilter_rect) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < filter_area.z && y < filter_area.w) { - kernel_filter_write_feature(sample, - x + filter_area.x, - y + filter_area.y, - buffer_params, - from, - buffer, - out_offset, - prefilter_rect); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_detect_outliers(float *image, - float *variance, - float *depth, - float *output, - int4 prefilter_rect, - int pass_stride) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r) -{ - int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x; - int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y; - if(x < prefilter_rect.z && y < prefilter_rect.w) { - kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, - CCL_FILTER_TILE_INFO, - float *transform, int *rank, - int4 filter_area, int4 rect, - int radius, float pca_threshold, - int pass_stride, int frame_stride, - bool use_time) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < filter_area.z && y < filter_area.w) { - int *l_rank = rank + y*filter_area.z + x; - float *l_transform = transform + y*filter_area.z + x; - kernel_filter_construct_transform(buffer, - tile_info, - x + filter_area.x, y + filter_area.y, - rect, - pass_stride, frame_stride, - use_time, - l_transform, l_rank, - radius, pca_threshold, - filter_area.z*filter_area.w, - threadIdx.y*blockDim.x + threadIdx.x); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, - const float *ccl_restrict variance_image, - const float *ccl_restrict scale_image, - float *difference_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int channel_offset, - int frame_offset, - float a, - float k_2) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, - weight_image, - variance_image, - scale_image, - difference_image + ofs, - rect, stride, - channel_offset, - frame_offset, - a, k_2); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, - float *out_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_blur(co.x, co.y, - difference_image + ofs, - out_image + ofs, - rect, stride, f); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, - float *out_image, - int w, - int h, - int stride, - int pass_stride, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_calc_weight(co.x, co.y, - difference_image + ofs, - out_image + ofs, - rect, stride, f); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, - const float *ccl_restrict image, - float *out_image, - float *accum_image, - int w, - int h, - int stride, - int pass_stride, - int channel_offset, - int r, - int f) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) { - kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, - difference_image + ofs, - image, - out_image, - accum_image, - rect, - channel_offset, - stride, f); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_normalize(float *out_image, - const float *ccl_restrict accum_image, - int w, - int h, - int stride) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < w && y < h) { - kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_construct_gramian(int t, - const float *ccl_restrict difference_image, - const float *ccl_restrict buffer, - float const* __restrict__ transform, - int *rank, - float *XtWX, - float3 *XtWY, - int4 filter_window, - int w, - int h, - int stride, - int pass_stride, - int r, - int f, - int frame_offset, - bool use_time) -{ - int4 co, rect; - int ofs; - if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) { - kernel_filter_nlm_construct_gramian(co.x, co.y, - co.z, co.w, - t, - difference_image + ofs, - buffer, - transform, rank, - XtWX, XtWY, - rect, filter_window, - stride, f, - pass_stride, - frame_offset, - use_time, - threadIdx.y*blockDim.x + threadIdx.x); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_finalize(float *buffer, - int *rank, - float *XtWX, - float3 *XtWY, - int4 filter_area, - int4 buffer_params, - int sample) -{ - int x = blockDim.x*blockIdx.x + threadIdx.x; - int y = blockDim.y*blockIdx.y + threadIdx.y; - if(x < filter_area.z && y < filter_area.w) { - int storage_ofs = y*filter_area.z+x; - rank += storage_ofs; - XtWX += storage_ofs; - XtWY += storage_ofs; - kernel_filter_finalize(x, y, buffer, rank, - filter_area.z*filter_area.w, - XtWX, XtWY, - buffer_params, sample); - } -} - -#endif - diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu deleted file mode 100644 index cf62b6e781e..00000000000 --- a/intern/cycles/kernel/kernels/cuda/kernel.cu +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CUDA kernel entry points */ - -#ifdef __CUDA_ARCH__ - -#include "kernel/kernel_compat_cuda.h" -#include "kernel_config.h" - -#include "util/util_atomic.h" - -#include "kernel/kernel_math.h" -#include "kernel/kernel_types.h" -#include "kernel/kernel_globals.h" -#include "kernel/kernel_color.h" -#include "kernel/kernels/cuda/kernel_cuda_image.h" -#include "kernel/kernel_film.h" -#include "kernel/kernel_path.h" -#include "kernel/kernel_path_branched.h" -#include "kernel/kernel_bake.h" -#include "kernel/kernel_work_stealing.h" -#include "kernel/kernel_adaptive_sampling.h" - -/* kernels */ -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_path_trace(WorkTile *tile, uint total_work_size) -{ - int work_index = ccl_global_id(0); - bool thread_is_active = work_index < total_work_size; - uint x, y, sample; - KernelGlobals kg; - if(thread_is_active) { - get_work_pixel(tile, work_index, &x, &y, &sample); - - kernel_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } - - if(kernel_data.film.cryptomatte_passes) { - __syncthreads(); - if(thread_is_active) { - kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } - } -} - -#ifdef __BRANCHED_PATH__ -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS) -kernel_cuda_branched_path_trace(WorkTile *tile, uint total_work_size) -{ - int work_index = ccl_global_id(0); - bool thread_is_active = work_index < total_work_size; - uint x, y, sample; - KernelGlobals kg; - if(thread_is_active) { - get_work_pixel(tile, work_index, &x, &y, &sample); - - kernel_branched_path_trace(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } - - if(kernel_data.film.cryptomatte_passes) { - __syncthreads(); - if(thread_is_active) { - kernel_cryptomatte_post(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } - } -} -#endif - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_adaptive_stopping(WorkTile *tile, int sample, uint total_work_size) -{ - int work_index = ccl_global_id(0); - bool thread_is_active = work_index < total_work_size; - KernelGlobals kg; - if(thread_is_active && kernel_data.film.pass_adaptive_aux_buffer) { - uint x = tile->x + work_index % tile->w; - uint y = tile->y + work_index / tile->w; - int index = tile->offset + x + y * tile->stride; - ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; - kernel_do_adaptive_stopping(&kg, buffer, sample); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_adaptive_filter_x(WorkTile *tile, int sample, uint) -{ - KernelGlobals kg; - if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) { - if(ccl_global_id(0) < tile->h) { - int y = tile->y + ccl_global_id(0); - kernel_do_adaptive_filter_x(&kg, y, tile); - } - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_adaptive_filter_y(WorkTile *tile, int sample, uint) -{ - KernelGlobals kg; - if(kernel_data.film.pass_adaptive_aux_buffer && sample > kernel_data.integrator.adaptive_min_samples) { - if(ccl_global_id(0) < tile->w) { - int x = tile->x + ccl_global_id(0); - kernel_do_adaptive_filter_y(&kg, x, tile); - } - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_adaptive_scale_samples(WorkTile *tile, int start_sample, int sample, uint total_work_size) -{ - if(kernel_data.film.pass_adaptive_aux_buffer) { - int work_index = ccl_global_id(0); - bool thread_is_active = work_index < total_work_size; - KernelGlobals kg; - if(thread_is_active) { - uint x = tile->x + work_index % tile->w; - uint y = tile->y + work_index / tile->w; - int index = tile->offset + x + y * tile->stride; - ccl_global float *buffer = tile->buffer + index * kernel_data.film.pass_stride; - if(buffer[kernel_data.film.pass_sample_count] < 0.0f) { - buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count]; - float sample_multiplier = sample / buffer[kernel_data.film.pass_sample_count]; - if(sample_multiplier != 1.0f) { - kernel_adaptive_post_adjust(&kg, buffer, sample_multiplier); - } - } - else { - kernel_adaptive_post_adjust(&kg, buffer, sample / (sample - 1.0f)); - } - } - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) { - kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) { - kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_displace(uint4 *input, - float4 *output, - int type, - int sx, - int sw, - int offset, - int sample) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - - if(x < sx + sw) { - KernelGlobals kg; - kernel_displace_evaluate(&kg, input, output, x); - } -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_background(uint4 *input, - float4 *output, - int type, - int sx, - int sw, - int offset, - int sample) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - - if(x < sx + sw) { - KernelGlobals kg; - kernel_background_evaluate(&kg, input, output, x); - } -} - -#ifdef __BAKING__ -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_bake(WorkTile *tile, uint total_work_size) -{ - int work_index = ccl_global_id(0); - - if(work_index < total_work_size) { - uint x, y, sample; - get_work_pixel(tile, work_index, &x, &y, &sample); - - KernelGlobals kg; - kernel_bake_evaluate(&kg, tile->buffer, sample, x, y, tile->offset, tile->stride); - } -} -#endif - -#endif - diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h deleted file mode 100644 index 2e47ce2de6c..00000000000 --- a/intern/cycles/kernel/kernels/cuda/kernel_config.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright 2011-2013 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* device data taken from CUDA occupancy calculator */ - -/* 3.0 and 3.5 */ -#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.2 */ -#elif __CUDA_ARCH__ == 320 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 63 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 3.7 */ -#elif __CUDA_ARCH__ == 370 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 63 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 5.x, 6.x */ -#elif __CUDA_ARCH__ <= 699 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of - * registers */ -# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600 -# define CUDA_KERNEL_MAX_REGISTERS 64 -# else -# define CUDA_KERNEL_MAX_REGISTERS 48 -# endif -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63 - -/* 7.x, 8.x */ -#elif __CUDA_ARCH__ <= 899 -# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536 -# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32 -# define CUDA_BLOCK_MAX_THREADS 1024 -# define CUDA_THREAD_MAX_REGISTERS 255 - -/* tunable parameters */ -# define CUDA_THREADS_BLOCK_WIDTH 16 -# define CUDA_KERNEL_MAX_REGISTERS 64 -# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 72 - -/* unknown architecture */ -#else -# error "Unknown or unsupported CUDA architecture, can't determine launch bounds" -#endif - -/* For split kernel using all registers seems fastest for now, but this - * is unlikely to be optimal once we resolve other bottlenecks. */ - -#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS - -/* Compute number of threads per block and minimum blocks per multiprocessor - * given the maximum number of registers per thread. */ - -#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \ - __launch_bounds__(threads_block_width *threads_block_width, \ - CUDA_MULTIPRESSOR_MAX_REGISTERS / \ - (threads_block_width * threads_block_width * thread_num_registers)) - -/* sanity checks */ - -#if CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS -# error "Maximum number of threads per block exceeded" -#endif - -#if CUDA_MULTIPRESSOR_MAX_REGISTERS / \ - (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH * CUDA_KERNEL_MAX_REGISTERS) > \ - CUDA_MULTIPROCESSOR_MAX_BLOCKS -# error "Maximum number of blocks per multiprocessor exceeded" -#endif - -#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif - -#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS -# error "Maximum number of registers per thread exceeded" -#endif diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h deleted file mode 100644 index 132653fa7ca..00000000000 --- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright 2017 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef WITH_NANOVDB -# define NDEBUG /* Disable "assert" in device code */ -# define NANOVDB_USE_INTRINSICS -# include "nanovdb/NanoVDB.h" -# include "nanovdb/util/SampleFromVoxels.h" -#endif - -/* w0, w1, w2, and w3 are the four cubic B-spline basis functions. */ -ccl_device float cubic_w0(float a) -{ - return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f); -} -ccl_device float cubic_w1(float a) -{ - return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f); -} -ccl_device float cubic_w2(float a) -{ - return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f); -} -ccl_device float cubic_w3(float a) -{ - return (1.0f / 6.0f) * (a * a * a); -} - -/* g0 and g1 are the two amplitude functions. */ -ccl_device float cubic_g0(float a) -{ - return cubic_w0(a) + cubic_w1(a); -} -ccl_device float cubic_g1(float a) -{ - return cubic_w2(a) + cubic_w3(a); -} - -/* h0 and h1 are the two offset functions */ -ccl_device float cubic_h0(float a) -{ - return (cubic_w1(a) / cubic_g0(a)) - 1.0f; -} -ccl_device float cubic_h1(float a) -{ - return (cubic_w3(a) / cubic_g1(a)) + 1.0f; -} - -/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */ -template<typename T> -ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y) -{ - CUtexObject tex = (CUtexObject)info.data; - - x = (x * info.width) - 0.5f; - y = (y * info.height) - 0.5f; - - float px = floorf(x); - float py = floorf(y); - float fx = x - px; - float fy = y - py; - - float g0x = cubic_g0(fx); - float g1x = cubic_g1(fx); - /* Note +0.5 offset to compensate for CUDA linear filtering convention. */ - float x0 = (px + cubic_h0(fx) + 0.5f) / info.width; - float x1 = (px + cubic_h1(fx) + 0.5f) / info.width; - float y0 = (py + cubic_h0(fy) + 0.5f) / info.height; - float y1 = (py + cubic_h1(fy) + 0.5f) / info.height; - - return cubic_g0(fy) * (g0x * tex2D<T>(tex, x0, y0) + g1x * tex2D<T>(tex, x1, y0)) + - cubic_g1(fy) * (g0x * tex2D<T>(tex, x0, y1) + g1x * tex2D<T>(tex, x1, y1)); -} - -/* Fast tricubic texture lookup using 8 trilinear lookups. */ -template<typename T> -ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z) -{ - CUtexObject tex = (CUtexObject)info.data; - - x = (x * info.width) - 0.5f; - y = (y * info.height) - 0.5f; - z = (z * info.depth) - 0.5f; - - float px = floorf(x); - float py = floorf(y); - float pz = floorf(z); - float fx = x - px; - float fy = y - py; - float fz = z - pz; - - float g0x = cubic_g0(fx); - float g1x = cubic_g1(fx); - float g0y = cubic_g0(fy); - float g1y = cubic_g1(fy); - float g0z = cubic_g0(fz); - float g1z = cubic_g1(fz); - - /* Note +0.5 offset to compensate for CUDA linear filtering convention. */ - float x0 = (px + cubic_h0(fx) + 0.5f) / info.width; - float x1 = (px + cubic_h1(fx) + 0.5f) / info.width; - float y0 = (py + cubic_h0(fy) + 0.5f) / info.height; - float y1 = (py + cubic_h1(fy) + 0.5f) / info.height; - float z0 = (pz + cubic_h0(fz) + 0.5f) / info.depth; - float z1 = (pz + cubic_h1(fz) + 0.5f) / info.depth; - - return g0z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z0) + g1x * tex3D<T>(tex, x1, y0, z0)) + - g1y * (g0x * tex3D<T>(tex, x0, y1, z0) + g1x * tex3D<T>(tex, x1, y1, z0))) + - g1z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z1) + g1x * tex3D<T>(tex, x1, y0, z1)) + - g1y * (g0x * tex3D<T>(tex, x0, y1, z1) + g1x * tex3D<T>(tex, x1, y1, z1))); -} - -#ifdef WITH_NANOVDB -template<typename T, typename S> -ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, float z) -{ - float px = floorf(x); - float py = floorf(y); - float pz = floorf(z); - float fx = x - px; - float fy = y - py; - float fz = z - pz; - - float g0x = cubic_g0(fx); - float g1x = cubic_g1(fx); - float g0y = cubic_g0(fy); - float g1y = cubic_g1(fy); - float g0z = cubic_g0(fz); - float g1z = cubic_g1(fz); - - float x0 = px + cubic_h0(fx); - float x1 = px + cubic_h1(fx); - float y0 = py + cubic_h0(fy); - float y1 = py + cubic_h1(fy); - float z0 = pz + cubic_h0(fz); - float z1 = pz + cubic_h1(fz); - - using namespace nanovdb; - - return g0z * (g0y * (g0x * s(Vec3f(x0, y0, z0)) + g1x * s(Vec3f(x1, y0, z0))) + - g1y * (g0x * s(Vec3f(x0, y1, z0)) + g1x * s(Vec3f(x1, y1, z0)))) + - g1z * (g0y * (g0x * s(Vec3f(x0, y0, z1)) + g1x * s(Vec3f(x1, y0, z1))) + - g1y * (g0x * s(Vec3f(x0, y1, z1)) + g1x * s(Vec3f(x1, y1, z1)))); -} - -template<typename T> -ccl_device_inline T kernel_tex_image_interp_nanovdb( - const TextureInfo &info, float x, float y, float z, uint interpolation) -{ - using namespace nanovdb; - - NanoGrid<T> *const grid = (NanoGrid<T> *)info.data; - typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType; - AccessorType acc = grid->getAccessor(); - - switch (interpolation) { - case INTERPOLATION_CLOSEST: - return SampleFromVoxels<AccessorType, 0, false>(acc)(Vec3f(x, y, z)); - case INTERPOLATION_LINEAR: - return SampleFromVoxels<AccessorType, 1, false>(acc)(Vec3f(x - 0.5f, y - 0.5f, z - 0.5f)); - default: - SampleFromVoxels<AccessorType, 1, false> s(acc); - return kernel_tex_image_interp_tricubic_nanovdb<T>(s, x - 0.5f, y - 0.5f, z - 0.5f); - } -} -#endif - -ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y) -{ - const TextureInfo &info = kernel_tex_fetch(__texture_info, id); - - /* float4, byte4, ushort4 and half4 */ - const int texture_type = info.data_type; - if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 || - texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) { - if (info.interpolation == INTERPOLATION_CUBIC) { - return kernel_tex_image_interp_bicubic<float4>(info, x, y); - } - else { - CUtexObject tex = (CUtexObject)info.data; - return tex2D<float4>(tex, x, y); - } - } - /* float, byte and half */ - else { - float f; - - if (info.interpolation == INTERPOLATION_CUBIC) { - f = kernel_tex_image_interp_bicubic<float>(info, x, y); - } - else { - CUtexObject tex = (CUtexObject)info.data; - f = tex2D<float>(tex, x, y); - } - - return make_float4(f, f, f, 1.0f); - } -} - -ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals *kg, - int id, - float3 P, - InterpolationType interp) -{ - const TextureInfo &info = kernel_tex_fetch(__texture_info, id); - - if (info.use_transform_3d) { - P = transform_point(&info.transform_3d, P); - } - - const float x = P.x; - const float y = P.y; - const float z = P.z; - - uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp; - const int texture_type = info.data_type; - -#ifdef WITH_NANOVDB - if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) { - float f = kernel_tex_image_interp_nanovdb<float>(info, x, y, z, interpolation); - return make_float4(f, f, f, 1.0f); - } - if (texture_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { - nanovdb::Vec3f f = kernel_tex_image_interp_nanovdb<nanovdb::Vec3f>( - info, x, y, z, interpolation); - return make_float4(f[0], f[1], f[2], 1.0f); - } -#endif - if (texture_type == IMAGE_DATA_TYPE_FLOAT4 || texture_type == IMAGE_DATA_TYPE_BYTE4 || - texture_type == IMAGE_DATA_TYPE_HALF4 || texture_type == IMAGE_DATA_TYPE_USHORT4) { - if (interpolation == INTERPOLATION_CUBIC) { - return kernel_tex_image_interp_tricubic<float4>(info, x, y, z); - } - else { - CUtexObject tex = (CUtexObject)info.data; - return tex3D<float4>(tex, x, y, z); - } - } - else { - float f; - - if (interpolation == INTERPOLATION_CUBIC) { - f = kernel_tex_image_interp_tricubic<float>(info, x, y, z); - } - else { - CUtexObject tex = (CUtexObject)info.data; - f = tex3D<float>(tex, x, y, z); - } - - return make_float4(f, f, f, 1.0f); - } -} diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu deleted file mode 100644 index 95ad7599cf1..00000000000 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2011-2016 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* CUDA split kernel entry points */ - -#ifdef __CUDA_ARCH__ - -#define __SPLIT_KERNEL__ - -#include "kernel/kernel_compat_cuda.h" -#include "kernel_config.h" - -#include "kernel/split/kernel_split_common.h" -#include "kernel/split/kernel_data_init.h" -#include "kernel/split/kernel_path_init.h" -#include "kernel/split/kernel_scene_intersect.h" -#include "kernel/split/kernel_lamp_emission.h" -#include "kernel/split/kernel_do_volume.h" -#include "kernel/split/kernel_queue_enqueue.h" -#include "kernel/split/kernel_indirect_background.h" -#include "kernel/split/kernel_shader_setup.h" -#include "kernel/split/kernel_shader_sort.h" -#include "kernel/split/kernel_shader_eval.h" -#include "kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h" -#include "kernel/split/kernel_subsurface_scatter.h" -#include "kernel/split/kernel_direct_lighting.h" -#include "kernel/split/kernel_shadow_blocked_ao.h" -#include "kernel/split/kernel_shadow_blocked_dl.h" -#include "kernel/split/kernel_enqueue_inactive.h" -#include "kernel/split/kernel_next_iteration_setup.h" -#include "kernel/split/kernel_indirect_subsurface.h" -#include "kernel/split/kernel_buffer_update.h" -#include "kernel/split/kernel_adaptive_stopping.h" -#include "kernel/split/kernel_adaptive_filter_x.h" -#include "kernel/split/kernel_adaptive_filter_y.h" -#include "kernel/split/kernel_adaptive_adjust_samples.h" - -#include "kernel/kernel_film.h" - -/* kernels */ -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_state_buffer_size(uint num_threads, uint64_t *size) -{ - *size = split_data_buffer_size(NULL, num_threads); -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_path_trace_data_init( - ccl_global void *split_data_buffer, - int num_elements, - ccl_global char *ray_state, - int start_sample, - int end_sample, - int sx, int sy, int sw, int sh, int offset, int stride, - ccl_global int *Queue_index, - int queuesize, - ccl_global char *use_queues_flag, - ccl_global unsigned int *work_pool_wgs, - unsigned int num_samples, - ccl_global float *buffer) -{ - kernel_data_init(NULL, - NULL, - split_data_buffer, - num_elements, - ray_state, - start_sample, - end_sample, - sx, sy, sw, sh, offset, stride, - Queue_index, - queuesize, - use_queues_flag, - work_pool_wgs, - num_samples, - buffer); -} - -#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \ - extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ - kernel_cuda_##name() \ - { \ - kernel_##name(NULL); \ - } - -#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \ - extern "C" __global__ void \ - CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_SPLIT_MAX_REGISTERS) \ - kernel_cuda_##name() \ - { \ - ccl_local type locals; \ - kernel_##name(NULL, &locals); \ - } - -DEFINE_SPLIT_KERNEL_FUNCTION(path_init) -DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect) -DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission) -DEFINE_SPLIT_KERNEL_FUNCTION(do_volume) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(queue_enqueue, QueueEnqueueLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(indirect_background) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_setup, uint) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(shader_sort, ShaderSortLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(holdout_emission_blurring_pathtermination_ao, BackgroundAOLocals) -DEFINE_SPLIT_KERNEL_FUNCTION(subsurface_scatter) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(direct_lighting, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_ao) -DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(enqueue_inactive, uint) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface) -DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_stopping) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_x) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_filter_y) -DEFINE_SPLIT_KERNEL_FUNCTION(adaptive_adjust_samples) - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride); -} - -extern "C" __global__ void -CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride) -{ - int x = sx + blockDim.x*blockIdx.x + threadIdx.x; - int y = sy + blockDim.y*blockIdx.y + threadIdx.y; - - if(x < sx + sw && y < sy + sh) - kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride); -} - -#endif - |