diff options
author | Brecht Van Lommel <brechtvanlommel@pandora.be> | 2013-06-12 01:58:48 +0400 |
---|---|---|
committer | Brecht Van Lommel <brechtvanlommel@pandora.be> | 2013-06-12 01:58:48 +0400 |
commit | 37f92119e449a56116bc7a78aaafeaa67ee4c493 (patch) | |
tree | 1075e1fde2056b7cf66c6b94593a847393d5174a /intern | |
parent | 3d21bf96887cf0f936d9f280516c94a814b9fbf2 (diff) |
Fix #35665: more CUDA issues with recent kernel changes, tested on sm_20, sm_21
and sm_30 cards, so hopefully it should all work now.
Also includes some warning fixes related to nvcc compiler arguments, which should
make no difference otherwise.
Diffstat (limited to 'intern')
-rw-r--r-- | intern/cycles/kernel/CMakeLists.txt | 13 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_jitter.h | 5 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_path.h | 59 | ||||
-rw-r--r-- | intern/cycles/kernel/kernel_random.h | 59 |
4 files changed, 60 insertions, 76 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index f87f5dec741..8b4466863e0 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -129,9 +129,20 @@ if(WITH_CYCLES_CUDA_BINARIES) foreach(arch ${CYCLES_CUDA_BINARIES_ARCH}) set(cuda_cubin kernel_${arch}.cubin) + if(${arch} MATCHES "sm_1[0-9]") + # sm_1x + set(cuda_arch_flags "--maxrregcount=24 --opencc-options -OPT:Olimit=0") + elseif(${arch} MATCHES "sm_2[0-9]") + # sm_2x + set(cuda_arch_flags "--maxrregcount=24") + else() + # sm_3x + set(cuda_arch_flags "--maxrregcount=32") + endif() + add_custom_command( OUTPUT ${cuda_cubin} - COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" --maxrregcount=24 --opencc-options -OPT:Olimit=0 -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC + COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} -m${CUDA_BITS} --cubin ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cu -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin} --ptxas-options="-v" ${cuda_arch_flags} -I${CMAKE_CURRENT_SOURCE_DIR}/../util -I${CMAKE_CURRENT_SOURCE_DIR}/svm -DCCL_NAMESPACE_BEGIN= -DCCL_NAMESPACE_END= -DNVCC DEPENDS ${cuda_sources}) delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) diff --git a/intern/cycles/kernel/kernel_jitter.h b/intern/cycles/kernel/kernel_jitter.h index 5ea44cd0cad..15d2151228f 100644 --- a/intern/cycles/kernel/kernel_jitter.h +++ b/intern/cycles/kernel/kernel_jitter.h @@ -146,7 +146,7 @@ __device_noinline float cmj_sample_1D(int s, int N, int p) return (x + jx)*invN; } -__device_noinline float2 cmj_sample_2D(int s, int N, int p) +__device_noinline void cmj_sample_2D(int s, int N, int p, float *fx, float *fy) { int m = float_to_int(sqrtf(N)); int n = (N + m - 1)/m; @@ -173,7 +173,8 @@ __device_noinline 
float2 cmj_sample_2D(int s, int N, int p) float jx = cmj_randfloat(s, p * 0x967a889b); float jy = cmj_randfloat(s, p * 0x368cc8b7); - return make_float2((sx + (sy + jx)*invn)*invm, (s + jy)*invN); + *fx = (sx + (sy + jx)*invn)*invm; + *fy = (s + jy)*invN; } #endif diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index f58f83e2f82..b895d1fcf52 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -409,9 +409,8 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample, /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { /* todo: solve correlation */ - float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U); - float bsdf_u = bsdf_uv.x; - float bsdf_v = bsdf_uv.y; + float bsdf_u, bsdf_v; + path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v); float ao_factor = kernel_data.background.ao_factor; float3 ao_N; @@ -450,9 +449,8 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample, #else float light_o = path_rng_1D(kg, rng, sample, num_samples, rng_offset + PRNG_LIGHT_F); #endif - float2 light_uv = path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_LIGHT_U); - float light_u = light_uv.x; - float light_v = light_uv.y; + float light_u, light_v; + path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_LIGHT_U, &light_u, &light_v); Ray light_ray; BsdfEval L_light; @@ -484,9 +482,8 @@ __device float4 kernel_path_progressive(KernelGlobals *kg, RNG *rng, int sample, BsdfEval bsdf_eval; float3 bsdf_omega_in; differential3 bsdf_domega_in; - float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U); - float bsdf_u = bsdf_uv.x; - float bsdf_v = bsdf_uv.y; + float bsdf_u, bsdf_v; + path_rng_2D(kg, rng, sample, num_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v); int label; label = shader_bsdf_sample(kg, &sd, 
bsdf_u, bsdf_v, &bsdf_eval, @@ -653,10 +650,8 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray #ifdef __AO__ /* ambient occlusion */ if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) { - /* todo: solve correlation */ - float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U); - float bsdf_u = bsdf_uv.x; - float bsdf_v = bsdf_uv.y; + float bsdf_u, bsdf_v; + path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v); float ao_factor = kernel_data.background.ao_factor; float3 ao_N; @@ -695,9 +690,8 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray #else float light_o = path_rng_1D(kg, rng, sample, num_total_samples, rng_offset + PRNG_LIGHT_F); #endif - float2 light_uv = path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_LIGHT_U); - float light_u = light_uv.x; - float light_v = light_uv.y; + float light_u, light_v; + path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_LIGHT_U, &light_u, &light_v); Ray light_ray; BsdfEval L_light; @@ -730,9 +724,8 @@ __device void kernel_path_indirect(KernelGlobals *kg, RNG *rng, int sample, Ray BsdfEval bsdf_eval; float3 bsdf_omega_in; differential3 bsdf_domega_in; - float2 bsdf_uv = path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U); - float bsdf_u = bsdf_uv.x; - float bsdf_v = bsdf_uv.y; + float bsdf_u, bsdf_v; + path_rng_2D(kg, rng, sample, num_total_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v); int label; label = shader_bsdf_sample(kg, &sd, bsdf_u, bsdf_v, &bsdf_eval, @@ -784,10 +777,8 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R float3 ao_bsdf = shader_bsdf_ao(kg, sd, ao_factor, &ao_N); for(int j = 0; j < num_samples; j++) { - /* todo: solve correlation */ - float2 bsdf_uv = path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U); - float 
bsdf_u = bsdf_uv.x; - float bsdf_v = bsdf_uv.y; + float bsdf_u, bsdf_v; + path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v); float3 ao_D; float ao_pdf; @@ -836,9 +827,8 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R num_samples_inv *= 0.5f; for(int j = 0; j < num_samples; j++) { - float2 light_uv = path_rng_2D(kg, &lamp_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U); - float light_u = light_uv.x; - float light_v = light_uv.y; + float light_u, light_v; + path_rng_2D(kg, &lamp_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U, &light_u, &light_v); if(direct_emission(kg, sd, i, 0.0f, 0.0f, light_u, light_v, &light_ray, &L_light, &is_lamp)) { /* trace shadow ray */ @@ -862,9 +852,8 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R for(int j = 0; j < num_samples; j++) { float light_t = path_rng_1D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT); - float2 light_uv = path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U); - float light_u = light_uv.x; - float light_v = light_uv.y; + float light_u, light_v; + path_rng_2D(kg, rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_LIGHT_U, &light_u, &light_v); /* only sample triangle lights */ if(kernel_data.integrator.num_all_lights) @@ -913,9 +902,8 @@ __device_noinline void kernel_path_non_progressive_lighting(KernelGlobals *kg, R BsdfEval bsdf_eval; float3 bsdf_omega_in; differential3 bsdf_domega_in; - float2 bsdf_uv = path_rng_2D(kg, &bsdf_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U); - float bsdf_u = bsdf_uv.x; - float bsdf_v = bsdf_uv.y; + float bsdf_u, bsdf_v; + path_rng_2D(kg, &bsdf_rng, sample*num_samples + j, aa_samples*num_samples, rng_offset + PRNG_BSDF_U, &bsdf_u, &bsdf_v); int label; label = 
shader_bsdf_sample_closure(kg, sd, sc, bsdf_u, bsdf_v, &bsdf_eval, @@ -1162,11 +1150,8 @@ __device void kernel_path_trace(KernelGlobals *kg, float lens_u = 0.0f, lens_v = 0.0f; - if(kernel_data.cam.aperturesize > 0.0f) { - float2 lens_uv = path_rng_2D(kg, &rng, sample, num_samples, PRNG_LENS_U); - lens_u = lens_uv.x; - lens_v = lens_uv.y; - } + if(kernel_data.cam.aperturesize > 0.0f) + path_rng_2D(kg, &rng, sample, num_samples, PRNG_LENS_U, &lens_u, &lens_v); float time = 0.0f; diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index b5f824d5cce..20fc1fe2253 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -102,8 +102,16 @@ __device uint sobol_lookup(const uint m, const uint frame, const uint ex, const return index; } -__device_inline float path_rng(KernelGlobals *kg, RNG *rng, int sample, int dimension) +__device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension) { +#ifdef __CMJ__ + if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { + /* correlated multi-jittered */ + int p = *rng + dimension; + return cmj_sample_1D(sample, num_samples, p); + } +#endif + #ifdef __SOBOL_FULL_SCREEN__ uint result = sobol_dimension(kg, *rng, dimension); float r = (float)result * (1.0f/(float)0xFFFFFFFF); @@ -117,41 +125,27 @@ __device_inline float path_rng(KernelGlobals *kg, RNG *rng, int sample, int dime float shift; if(dimension & 1) - shift = (*rng >> 16)*(1.0f/(float)0xFFFF); + shift = (*rng >> 16)/((float)0xFFFF); else - shift = (*rng & 0xFFFF)*(1.0f/(float)0xFFFF); + shift = (*rng & 0xFFFF)/((float)0xFFFF); return r + shift - floorf(r + shift); #endif } -__device_inline float path_rng_1D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension) -{ -#ifdef __CMJ__ - if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { - /* correlated multi-jittered */ - int p = *rng + dimension; - return 
cmj_sample_1D(sample, num_samples, p); - } -#endif - - /* sobol */ - return path_rng(kg, rng, sample, dimension); -} - -__device_inline float2 path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension) +__device_inline void path_rng_2D(KernelGlobals *kg, RNG *rng, int sample, int num_samples, int dimension, float *fx, float *fy) { #ifdef __CMJ__ if(kernel_data.integrator.sampling_pattern == SAMPLING_PATTERN_CMJ) { /* correlated multi-jittered */ int p = *rng + dimension; - return cmj_sample_2D(sample, num_samples, p); + cmj_sample_2D(sample, num_samples, p, fx, fy); } #endif /* sobol */ - return make_float2(path_rng(kg, rng, sample, dimension), - path_rng(kg, rng, sample, dimension + 1)); + *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); + *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); } __device_inline void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy) @@ -184,10 +178,7 @@ __device_inline void path_rng_init(KernelGlobals *kg, __global uint *rng_state, *fy = 0.5f; } else { - float2 fxy = path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U); - - *fx = fxy.x; - *fy = fxy.y; + path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy); } #endif } @@ -203,20 +194,19 @@ __device void path_rng_end(KernelGlobals *kg, __global uint *rng_state, RNG rng) __device float path_rng(KernelGlobals *kg, RNG& rng, int sample, int dimension) { - /* implicit mod 2^32 */ - rng = (1103515245*(rng) + 12345); - return (float)rng * (1.0f/(float)0xFFFFFFFF); } __device_inline float path_rng_1D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension) { - return path_rng(kg, rng, sample, dimension); + /* implicit mod 2^32 */ + rng = (1103515245*(rng) + 12345); + return (float)rng * (1.0f/(float)0xFFFFFFFF); } -__device_inline float2 path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension) 
+__device_inline void path_rng_2D(KernelGlobals *kg, RNG& rng, int sample, int num_samples, int dimension, float *fx, float *fy) { - return make_float2(path_rng(kg, rng, sample, dimension), - path_rng(kg, rng, sample, dimension + 1)); + *fx = path_rng_1D(kg, rng, sample, num_samples, dimension); + *fy = path_rng_1D(kg, rng, sample, num_samples, dimension + 1); } __device void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sample, int num_samples, RNG *rng, int x, int y, float *fx, float *fy) @@ -231,10 +221,7 @@ __device void path_rng_init(KernelGlobals *kg, __global uint *rng_state, int sam *fy = 0.5f; } else { - float2 fxy = path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U); - - *fx = fxy.x; - *fy = fxy.y; + path_rng_2D(kg, rng, sample, num_samples, PRNG_FILTER_U, fx, fy); } } |