Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMai Lavelle <mai.lavelle@gmail.com>2017-02-14 13:50:29 +0300
committerMai Lavelle <mai.lavelle@gmail.com>2017-03-08 09:24:53 +0300
commit817873cc83034c460f1be6bf410c95ff009f3ae2 (patch)
treed50373c256ff02d5f12b067be50c7401c326332b /intern/cycles/kernel/kernels
parent0892352bfe6d5a9aa6ec4c088e67f8bbbbfae610 (diff)
Cycles: CUDA implementation of split kernel
Diffstat (limited to 'intern/cycles/kernel/kernels')
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel.cu100
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel_config.h110
-rw-r--r--intern/cycles/kernel/kernels/cuda/kernel_split.cu118
3 files changed, 231 insertions, 97 deletions
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 090ab2c50c2..52e541321e3 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -16,7 +16,10 @@
/* CUDA kernel entry points */
+#ifdef __CUDA_ARCH__
+
#include "../../kernel_compat_cuda.h"
+#include "kernel_config.h"
#include "../../kernel_math.h"
#include "../../kernel_types.h"
#include "../../kernel_globals.h"
@@ -25,104 +28,7 @@
#include "../../kernel_path_branched.h"
#include "../../kernel_bake.h"
-/* device data taken from CUDA occupancy calculator */
-
-#ifdef __CUDA_ARCH__
-
-/* 2.0 and 2.1 */
-#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 32
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
-
-/* 3.0 and 3.5 */
-#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.2 */
-#elif __CUDA_ARCH__ == 320
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 63
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 3.7 */
-#elif __CUDA_ARCH__ == 370
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 63
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* 5.0, 5.2, 5.3, 6.0, 6.1 */
-#elif __CUDA_ARCH__ >= 500
-# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
-# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
-# define CUDA_BLOCK_MAX_THREADS 1024
-# define CUDA_THREAD_MAX_REGISTERS 255
-
-/* tunable parameters */
-# define CUDA_THREADS_BLOCK_WIDTH 16
-# define CUDA_KERNEL_MAX_REGISTERS 48
-# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
-
-/* unknown architecture */
-#else
-# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
-#endif
-
-/* compute number of threads per block and minimum blocks per multiprocessor
- * given the maximum number of registers per thread */
-
-#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
- __launch_bounds__( \
- threads_block_width*threads_block_width, \
- CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
- )
-
-/* sanity checks */
-
-#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
-# error "Maximum number of threads per block exceeded"
-#endif
-
-#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
-# error "Maximum number of blocks per multiprocessor exceeded"
-#endif
-
-#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
-
-#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
-# error "Maximum number of registers per thread exceeded"
-#endif
-
/* kernels */
-
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_path_trace(float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_config.h b/intern/cycles/kernel/kernels/cuda/kernel_config.h
new file mode 100644
index 00000000000..9fa39dc9ebb
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_config.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* device data taken from CUDA occupancy calculator */
+
+/* 2.0 and 2.1 */
+#if __CUDA_ARCH__ == 200 || __CUDA_ARCH__ == 210
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 8
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 32
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 40
+
+/* 3.0 and 3.5 */
+#elif __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 63
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.2 */
+#elif __CUDA_ARCH__ == 320
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 63
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 63
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 3.7 */
+#elif __CUDA_ARCH__ == 370
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 63
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* 5.0, 5.2, 5.3, 6.0, 6.1 */
+#elif __CUDA_ARCH__ >= 500
+# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
+# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
+# define CUDA_BLOCK_MAX_THREADS 1024
+# define CUDA_THREAD_MAX_REGISTERS 255
+
+/* tunable parameters */
+# define CUDA_THREADS_BLOCK_WIDTH 16
+# define CUDA_KERNEL_MAX_REGISTERS 48
+# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
+
+/* unknown architecture */
+#else
+# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
+#endif
+
+/* compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread */
+
+#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
+ __launch_bounds__( \
+ threads_block_width*threads_block_width, \
+ CUDA_MULTIPRESSOR_MAX_REGISTERS/(threads_block_width*threads_block_width*thread_num_registers) \
+ )
+
+/* sanity checks */
+
+#if CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
+# error "Maximum number of threads per block exceeded"
+#endif
+
+#if CUDA_MULTIPRESSOR_MAX_REGISTERS/(CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH*CUDA_KERNEL_MAX_REGISTERS) > CUDA_MULTIPROCESSOR_MAX_BLOCKS
+# error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
+
+#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
+# error "Maximum number of registers per thread exceeded"
+#endif
+
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
new file mode 100644
index 00000000000..441cd96fafa
--- /dev/null
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* CUDA split kernel entry points */
+
+#ifdef __CUDA_ARCH__
+
+#define __SPLIT_KERNEL__
+
+#include "../../kernel_compat_cuda.h"
+#include "kernel_config.h"
+
+#include "../../split/kernel_split_common.h"
+#include "../../split/kernel_data_init.h"
+#include "../../split/kernel_scene_intersect.h"
+#include "../../split/kernel_lamp_emission.h"
+#include "../../split/kernel_queue_enqueue.h"
+#include "../../split/kernel_background_buffer_update.h"
+#include "../../split/kernel_shader_eval.h"
+#include "../../split/kernel_holdout_emission_blurring_pathtermination_ao.h"
+#include "../../split/kernel_direct_lighting.h"
+#include "../../split/kernel_shadow_blocked.h"
+#include "../../split/kernel_next_iteration_setup.h"
+#include "../../split/kernel_sum_all_radiance.h"
+
+#include "../../kernel_film.h"
+
+/* kernels */
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_path_trace_data_init(
+ ccl_global void *split_data_buffer,
+ int num_elements,
+ ccl_global char *ray_state,
+ ccl_global uint *rng_state,
+ int start_sample,
+ int end_sample,
+ int sx, int sy, int sw, int sh, int offset, int stride,
+ ccl_global int *Queue_index,
+ int queuesize,
+ ccl_global char *use_queues_flag,
+ ccl_global unsigned int *work_pool_wgs,
+ unsigned int num_samples,
+ ccl_global float *buffer)
+{
+ kernel_data_init(NULL,
+ NULL,
+ split_data_buffer,
+ num_elements,
+ ray_state,
+ rng_state,
+ start_sample,
+ end_sample,
+ sx, sy, sw, sh, offset, stride,
+ Queue_index,
+ queuesize,
+ use_queues_flag,
+ work_pool_wgs,
+ num_samples,
+ buffer);
+}
+
+#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
+ extern "C" __global__ void \
+ CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) \
+ kernel_cuda_##name() \
+ { \
+ kernel_##name(NULL); \
+ }
+
+DEFINE_SPLIT_KERNEL_FUNCTION(scene_intersect)
+DEFINE_SPLIT_KERNEL_FUNCTION(lamp_emission)
+DEFINE_SPLIT_KERNEL_FUNCTION(queue_enqueue)
+DEFINE_SPLIT_KERNEL_FUNCTION(background_buffer_update)
+DEFINE_SPLIT_KERNEL_FUNCTION(shader_eval)
+DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
+DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
+DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
+DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
+DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_byte(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+ if(x < sx + sw && y < sy + sh)
+ kernel_film_convert_to_byte(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+extern "C" __global__ void
+CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
+kernel_cuda_convert_to_half_float(uchar4 *rgba, float *buffer, float sample_scale, int sx, int sy, int sw, int sh, int offset, int stride)
+{
+ int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
+ int y = sy + blockDim.y*blockIdx.y + threadIdx.y;
+
+ if(x < sx + sw && y < sy + sh)
+ kernel_film_convert_to_half_float(NULL, rgba, buffer, sample_scale, x, y, offset, stride);
+}
+
+#endif
+