diff options
author | Sergey Sharybin <sergey.vfx@gmail.com> | 2017-09-22 11:26:49 +0300 |
---|---|---|
committer | Sergey Sharybin <sergey.vfx@gmail.com> | 2017-09-22 11:26:49 +0300 |
commit | 128c7c3ba1d15e08241ec3e90fa694cfb210d2b9 (patch) | |
tree | b463c1b7df7de21673d4c5dca319390a3f1d1106 /intern | |
parent | 225e52eaf89039dccf66a4f503fdb7007b1a0394 (diff) | |
parent | f320d0e0a86207983ca4f7e7afb4fa7a140acf99 (diff) |
Merge branch 'master' into blender2.8
Diffstat (limited to 'intern')
23 files changed, 310 insertions, 409 deletions
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp index 20f0fb04c89..19f4d19179d 100644 --- a/intern/cycles/blender/blender_mesh.cpp +++ b/intern/cycles/blender/blender_mesh.cpp @@ -63,9 +63,8 @@ inline void face_split_tri_indices(const int face_flag, tri_b[1] = 3; tri_b[2] = 1; } - else /*if(face_flag & FACE_FLAG_DIVIDE_13)*/ { - assert(face_flag & FACE_FLAG_DIVIDE_13); - + else { + /* Quad with FACE_FLAG_DIVIDE_13 or single triangle. */ tri_a[0] = 0; tri_a[1] = 1; tri_a[2] = 2; diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h index 06221189060..267aeea6e86 100644 --- a/intern/cycles/kernel/closure/bssrdf.h +++ b/intern/cycles/kernel/closure/bssrdf.h @@ -400,7 +400,7 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type) bssrdf_burley_setup(bssrdf); } - return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF; + return SD_BSSRDF; } } diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h index b05f6e9ed5e..f06005c5072 100644 --- a/intern/cycles/kernel/kernel_bake.h +++ b/intern/cycles/kernel/kernel_bake.h @@ -51,8 +51,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg, path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL); /* evaluate surface shader */ - float rbsdf = path_state_rng_1D(kg, &state, PRNG_BSDF); - shader_eval_surface(kg, sd, &state, rbsdf, state.flag); + shader_eval_surface(kg, sd, &state, state.flag); /* TODO, disable more closures we don't need besides transparent */ shader_bsdf_disable_transparency(kg, sd); @@ -241,12 +240,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg, } else { /* surface color of the pass only */ - shader_eval_surface(kg, sd, state, 0.0f, 0); + shader_eval_surface(kg, sd, state, 0); return kernel_bake_shader_bsdf(kg, sd, type); } } else { - shader_eval_surface(kg, sd, state, 0.0f, 0); + shader_eval_surface(kg, sd, state, 0); color = kernel_bake_shader_bsdf(kg, sd, type); } @@ -338,7 +337,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, case SHADER_EVAL_NORMAL: { if((sd.flag & SD_HAS_BUMP)) { - shader_eval_surface(kg, &sd, &state, 0.f, 0); + shader_eval_surface(kg, &sd, &state, 0); } /* encoding: normal = (2 * color) - 1 */ @@ -352,7 +351,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input, } case SHADER_EVAL_EMISSION: { - shader_eval_surface(kg, &sd, &state, 0.f, 0); + shader_eval_surface(kg, &sd, &state, 0); out = shader_emissive_eval(kg, &sd); break; } diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h index 13d4759a9ec..45b8c6311e1 100644 --- a/intern/cycles/kernel/kernel_emission.h +++ b/intern/cycles/kernel/kernel_emission.h @@ -70,7 +70,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg, /* no path flag, we're evaluating this for all closures. that's weak but * we'd have to do multiple evaluations otherwise */ path_state_modify_bounce(state, true); - shader_eval_surface(kg, emission_sd, state, 0.0f, 0); + shader_eval_surface(kg, emission_sd, state, 0); path_state_modify_bounce(state, false); /* evaluate emissive closure */ diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h index 3a242a06a72..d43d6374c13 100644 --- a/intern/cycles/kernel/kernel_path.h +++ b/intern/cycles/kernel/kernel_path.h @@ -210,8 +210,8 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume( /* indirect sample. if we use distance sampling and take just * one sample for direct and indirect light, we could share * this computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); + float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); result = kernel_volume_decoupled_scatter(kg, state, &volume_ray, sd, throughput, @@ -434,11 +434,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, sd, &isect, ray); - float rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF); - shader_eval_surface(kg, sd, state, rbsdf, state->flag); -#ifdef __BRANCHED_PATH__ - shader_merge_closures(sd); -#endif /* __BRANCHED_PATH__ */ + shader_eval_surface(kg, sd, state, state->flag); + shader_prepare_closures(sd, state); /* Apply shadow catcher, holdout, emission. */ if(!kernel_path_shader_apply(kg, @@ -462,7 +459,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, state, PRNG_TERMINATE); + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); if(terminate >= probability) break; @@ -483,21 +480,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg, /* bssrdf scatter to a different location on the same object, replacing * the closures with a diffuse BSDF */ if(sd->flag & SD_BSSRDF) { - float bssrdf_probability; - ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, + state, + PRNG_BSDF_U, + &bssrdf_u, &bssrdf_v); - /* modify throughput for picking bssrdf or bsdf */ - throughput *= bssrdf_probability; + const ShaderClosure *sc = shader_bssrdf_pick(sd, &throughput, &bssrdf_u); /* do bssrdf scatter step if we picked a bssrdf closure */ if(sc) { uint lcg_state = lcg_state_init(state, 0x68bc21eb); - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, - state, - PRNG_BSDF_U, - &bssrdf_u, &bssrdf_v); subsurface_scatter_step(kg, sd, state, @@ -591,8 +585,8 @@ ccl_device_forceinline void kernel_path_integrate( /* Setup and evaluate shader. */ shader_setup_from_ray(kg, &sd, &isect, ray); - float rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF); - shader_eval_surface(kg, &sd, state, rbsdf, state->flag); + shader_eval_surface(kg, &sd, state, state->flag); + shader_prepare_closures(&sd, state); /* Apply shadow catcher, holdout, emission. */ if(!kernel_path_shader_apply(kg, @@ -616,7 +610,7 @@ ccl_device_forceinline void kernel_path_integrate( break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, state, PRNG_TERMINATE); + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); if(terminate >= probability) break; diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h index 3994d8d4954..010988d2a02 100644 --- a/intern/cycles/kernel/kernel_path_branched.h +++ b/intern/cycles/kernel/kernel_path_branched.h @@ -339,8 +339,8 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, /* scatter sample. if we use distance sampling and take just one * sample for direct and indirect light, we could share this * computation, but makes code a bit complex */ - float rphase = path_state_rng_1D_for_decision(kg, &ps, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, &ps, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL); + float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, &ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false); @@ -439,7 +439,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, /* Setup and evaluate shader. */ shader_setup_from_ray(kg, &sd, &isect, &ray); - shader_eval_surface(kg, &sd, &state, 0.0f, state.flag); + shader_eval_surface(kg, &sd, &state, state.flag); shader_merge_closures(&sd); /* Apply shadow catcher, holdout, emission. */ @@ -466,7 +466,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg, break; } else if(probability != 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, &state, PRNG_TERMINATE); + float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE); if(terminate >= probability) break; diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h index bb09b4ac080..eccee54c0e3 100644 --- a/intern/cycles/kernel/kernel_path_state.h +++ b/intern/cycles/kernel/kernel_path_state.h @@ -76,12 +76,12 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta state->flag |= PATH_RAY_TRANSPARENT; state->transparent_bounce++; - /* don't increase random number generator offset here, to avoid some - * unwanted patterns, see path_state_rng_1D_for_decision */ - if(!kernel_data.integrator.transparent_shadows) state->flag |= PATH_RAY_MIS_SKIP; + /* random number generator next bounce */ + state->rng_offset += PRNG_BOUNCE_NUM; + return; } diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h index 619d57e71fb..1753618607a 100644 --- a/intern/cycles/kernel/kernel_path_subsurface.h +++ b/intern/cycles/kernel/kernel_path_subsurface.h @@ -32,11 +32,10 @@ bool kernel_path_subsurface_scatter( ccl_addr_space float3 *throughput, ccl_addr_space SubsurfaceIndirectRays *ss_indirect) { - float bssrdf_probability; - ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); - /* modify throughput for picking bssrdf or bsdf */ - *throughput *= bssrdf_probability; + const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u); /* do bssrdf scatter step if we picked a bssrdf closure */ if(sc) { @@ -49,8 +48,6 @@ bool kernel_path_subsurface_scatter( uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb); SubsurfaceIntersection ss_isect; - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v); int num_hits = subsurface_scatter_multi_intersect(kg, &ss_isect, sd, diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h index e7e24f853c2..3cf897ac49c 100644 --- a/intern/cycles/kernel/kernel_path_volume.h +++ b/intern/cycles/kernel/kernel_path_volume.h @@ -77,7 +77,7 @@ bool kernel_path_volume_bounce( float3 phase_omega_in; differential3 phase_domega_in; float phase_u, phase_v; - path_state_rng_2D(kg, state, PRNG_PHASE_U, &phase_u, &phase_v); + path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v); int label; label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval, @@ -155,8 +155,8 @@ ccl_device void kernel_branched_path_volume_connect_light( float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_branched_rng_1D_for_decision(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE); - float rscatter = path_branched_rng_1D_for_decision(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); + float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL); + float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); @@ -201,8 +201,8 @@ ccl_device void kernel_branched_path_volume_connect_light( float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_branched_rng_1D_for_decision(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE); - float rscatter = path_branched_rng_1D_for_decision(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); + float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL); + float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); @@ -238,8 +238,8 @@ ccl_device void kernel_branched_path_volume_connect_light( float3 tp = throughput; /* sample position on volume segment */ - float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE); - float rscatter = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); + float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg, state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false); diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h index b35ed3bd279..11798d87cb5 100644 --- a/intern/cycles/kernel/kernel_random.h +++ b/intern/cycles/kernel/kernel_random.h @@ -30,12 +30,6 @@ CCL_NAMESPACE_BEGIN #ifdef __SOBOL__ -/* Skip initial numbers that are not as well distributed, especially the - * first sequence is just 0 everywhere, which can be problematic for e.g. - * path termination. - */ -#define SOBOL_SKIP 64 - ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension) { uint result = 0; @@ -73,7 +67,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg, #ifdef __SOBOL__ /* Sobol sequence value using direction vectors. */ - uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension); + uint result = sobol_dimension(kg, sample, dimension); float r = (float)result * (1.0f/(float)0xFFFFFFFF); /* Cranly-Patterson rotation using rng seed */ @@ -186,25 +180,6 @@ ccl_device_inline float path_state_rng_1D(KernelGlobals *kg, state->rng_offset + dimension); } -ccl_device_inline float path_state_rng_1D_for_decision( - KernelGlobals *kg, - const ccl_addr_space PathState *state, - int dimension) -{ - /* The rng_offset is not increased for transparent bounces. if we do then - * fully transparent objects can become subtly visible by the different - * sampling patterns used where the transparent object is. - * - * however for some random numbers that will determine if we next bounce - * is transparent we do need to increase the offset to avoid always making - * the same decision. */ - const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM; - return path_rng_1D(kg, - state->rng_hash, - state->sample, state->num_samples, - rng_offset + dimension); -} - ccl_device_inline void path_state_rng_2D(KernelGlobals *kg, const ccl_addr_space PathState *state, int dimension, @@ -232,22 +207,6 @@ ccl_device_inline float path_branched_rng_1D( state->rng_offset + dimension); } -ccl_device_inline float path_branched_rng_1D_for_decision( - KernelGlobals *kg, - uint rng_hash, - const ccl_addr_space PathState *state, - int branch, - int num_branches, - int dimension) -{ - const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM; - return path_rng_1D(kg, - rng_hash, - state->sample * num_branches + branch, - state->num_samples * num_branches, - rng_offset + dimension); -} - ccl_device_inline void path_branched_rng_2D( KernelGlobals *kg, uint rng_hash, @@ -273,7 +232,7 @@ ccl_device_inline float path_state_rng_light_termination( const ccl_addr_space PathState *state) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_state_rng_1D_for_decision(kg, state, PRNG_LIGHT_TERMINATE); + return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE); } return 0.0f; } @@ -286,12 +245,12 @@ ccl_device_inline float path_branched_rng_light_termination( int num_branches) { if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) { - return path_branched_rng_1D_for_decision(kg, - rng_hash, - state, - branch, - num_branches, - PRNG_LIGHT_TERMINATE); + return path_branched_rng_1D(kg, + rng_hash, + state, + branch, + num_branches, + PRNG_LIGHT_TERMINATE); } return 0.0f; } diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h index 5964aca0c78..bb3add5d7ca 100644 --- a/intern/cycles/kernel/kernel_shader.h +++ b/intern/cycles/kernel/kernel_shader.h @@ -494,20 +494,45 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd) } #endif +/* Defensive sampling. */ + +ccl_device_inline void shader_prepare_closures(ShaderData *sd, + ccl_addr_space PathState *state) +{ + /* We can likely also do defensive sampling at deeper bounces, particularly + * for cases like a perfect mirror but possibly also others. This will need + * a good heuristic. */ + if(state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) { + float sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + sum += sc->sample_weight; + } + } + + for(int i = 0; i < sd->num_closure; i++) { + ShaderClosure *sc = &sd->closure[i]; + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + sc->sample_weight = max(sc->sample_weight, 0.125f * sum); + } + } + } +} + + /* BSDF */ ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd, const float3 omega_in, float *pdf, - int skip_bsdf, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight) + const ShaderClosure *skip_sc, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight) { /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ for(int i = 0; i < sd->num_closure; i++) { - if(i == skip_bsdf) - continue; - const ShaderClosure *sc = &sd->closure[i]; - if(CLOSURE_IS_BSDF(sc->type)) { + if(sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) { float bsdf_pdf = 0.0f; float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf); @@ -570,7 +595,7 @@ void shader_bsdf_eval(KernelGlobals *kg, #endif { float pdf; - _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f); + _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f); if(use_mis) { float weight = power_heuristic(light_pdf, pdf); bsdf_eval_mis(eval, weight); @@ -578,48 +603,120 @@ void shader_bsdf_eval(KernelGlobals *kg, } } -ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, - ShaderData *sd, - float randu, float randv, - BsdfEval *bsdf_eval, - float3 *omega_in, - differential3 *domega_in, - float *pdf) +ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd, + float *randu) { int sampled = 0; if(sd->num_closure > 1) { - /* pick a BSDF closure based on sample weights */ + /* Pick a BSDF or based on sample weights. */ float sum = 0.0f; - for(sampled = 0; sampled < sd->num_closure; sampled++) { - const ShaderClosure *sc = &sd->closure[sampled]; - - if(CLOSURE_IS_BSDF(sc->type)) + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_BSDF(sc->type)) { sum += sc->sample_weight; + } } - float r = sd->randb_closure*sum; - sum = 0.0f; + float r = (*randu)*sum; + float partial_sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; - for(sampled = 0; sampled < sd->num_closure; sampled++) { - const ShaderClosure *sc = &sd->closure[sampled]; - if(CLOSURE_IS_BSDF(sc->type)) { - sum += sc->sample_weight; + float next_sum = partial_sum + sc->sample_weight; + + if(r < next_sum) { + sampled = i; - if(r <= sum) + /* Rescale to reuse for direction sample, to better + * preserve stratifaction. */ + *randu = (r - partial_sum) / sc->sample_weight; break; + } + + partial_sum = next_sum; } } + } - if(sampled == sd->num_closure) { - *pdf = 0.0f; - return LABEL_NONE; + return &sd->closure[sampled]; +} + +ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd, + ccl_addr_space float3 *throughput, + float *randu) +{ + int sampled = 0; + + if(sd->num_closure > 1) { + /* Pick a BSDF or BSSRDF or based on sample weights. */ + float sum_bsdf = 0.0f; + float sum_bssrdf = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_BSDF(sc->type)) { + sum_bsdf += sc->sample_weight; + } + else if(CLOSURE_IS_BSSRDF(sc->type)) { + sum_bssrdf += sc->sample_weight; + } + } + + float r = (*randu)*(sum_bsdf + sum_bssrdf); + float partial_sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + const ShaderClosure *sc = &sd->closure[i]; + + if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) { + float next_sum = partial_sum + sc->sample_weight; + + if(r < next_sum) { + if(CLOSURE_IS_BSDF(sc->type)) { + *throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf; + return NULL; + } + else { + *throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf; + sampled = i; + + /* Rescale to reuse for direction sample, to better + * preserve stratifaction. */ + *randu = (r - partial_sum) / sc->sample_weight; + break; + } + } + + partial_sum = next_sum; + } } } - const ShaderClosure *sc = &sd->closure[sampled]; + return &sd->closure[sampled]; +} + +ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, + ShaderData *sd, + float randu, float randv, + BsdfEval *bsdf_eval, + float3 *omega_in, + differential3 *domega_in, + float *pdf) +{ + const ShaderClosure *sc = shader_bsdf_pick(sd, &randu); + if(!sc) { + *pdf = 0.0f; + return LABEL_NONE; + } + + /* BSSRDF should already have been handled elsewhere. */ + kernel_assert(CLOSURE_IS_BSDF(sc->type)); int label; float3 eval; @@ -632,7 +729,7 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg, if(sd->num_closure > 1) { float sweight = sc->sample_weight; - _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight); + _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf*sweight, sweight); } } @@ -868,11 +965,10 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd) /* Surface Evaluation */ ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd, - ccl_addr_space PathState *state, float randb, int path_flag) + ccl_addr_space PathState *state, int path_flag) { sd->num_closure = 0; sd->num_closure_extra = 0; - sd->randb_closure = randb; #ifdef __OSL__ if(kg->osl) @@ -903,7 +999,6 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd, { sd->num_closure = 0; sd->num_closure_extra = 0; - sd->randb_closure = 0.0f; #ifdef __SVM__ #ifdef __OSL__ @@ -985,17 +1080,22 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *s sum += sc->sample_weight; } - float r = sd->randb_closure*sum; - sum = 0.0f; + float r = randu*sum; + float partial_sum = 0.0f; for(sampled = 0; sampled < sd->num_closure; sampled++) { const ShaderClosure *sc = &sd->closure[sampled]; if(CLOSURE_IS_PHASE(sc->type)) { - sum += sc->sample_weight; + float next_sum = partial_sum + sc->sample_weight; - if(r <= sum) + if(r <= next_sum) { + /* Rescale to reuse for BSDF direction sample. */ + randu = (r - partial_sum) / sc->sample_weight; break; + } + + partial_sum = next_sum; } } @@ -1099,7 +1199,6 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_ { sd->num_closure = 0; sd->num_closure_extra = 0; - sd->randb_closure = 0.0f; /* this will modify sd->P */ #ifdef __SVM__ diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h index e02494ec1b0..065f9b184e2 100644 --- a/intern/cycles/kernel/kernel_shadow.h +++ b/intern/cycles/kernel/kernel_shadow.h @@ -86,7 +86,6 @@ ccl_device_forceinline bool shadow_handle_transparent_isect( shader_eval_surface(kg, shadow_sd, state, - 0.0f, PATH_RAY_SHADOW); path_state_modify_bounce(state, false); *throughput *= shader_bsdf_transparency(kg, shadow_sd); diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h index 26ec6383b73..23a09e5e2ca 100644 --- a/intern/cycles/kernel/kernel_subsurface.h +++ b/intern/cycles/kernel/kernel_subsurface.h @@ -28,87 +28,31 @@ CCL_NAMESPACE_BEGIN * - try to reduce one sample model variance */ -#define BSSRDF_MULTI_EVAL - -ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, ShaderData *sd, float *probability) -{ - /* sum sample weights of bssrdf and bsdf */ - float bsdf_sum = 0.0f; - float bssrdf_sum = 0.0f; - - for(int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if(CLOSURE_IS_BSDF(sc->type)) - bsdf_sum += sc->sample_weight; - else if(CLOSURE_IS_BSSRDF(sc->type)) - bssrdf_sum += sc->sample_weight; - } - - /* use bsdf or bssrdf? */ - float r = sd->randb_closure*(bsdf_sum + bssrdf_sum); - - if(r < bsdf_sum) { - /* use bsdf, and adjust randb so we can reuse it for picking a bsdf */ - sd->randb_closure = r/bsdf_sum; - *probability = (bsdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bsdf_sum: 1.0f; - return NULL; - } - - /* use bssrdf */ - r -= bsdf_sum; - - float sum = 0.0f; - - for(int i = 0; i < sd->num_closure; i++) { - ShaderClosure *sc = &sd->closure[i]; - - if(CLOSURE_IS_BSSRDF(sc->type)) { - sum += sc->sample_weight; - - if(r <= sum) { - sd->randb_closure = (r - (sum - sc->sample_weight))/sc->sample_weight; - -#ifdef BSSRDF_MULTI_EVAL - *probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bssrdf_sum: 1.0f; -#else - *probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/sc->sample_weight: 1.0f; -#endif - return sc; - } - } - } - - /* should never happen */ - sd->randb_closure = 0.0f; - *probability = 1.0f; - return NULL; -} - ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd, - ShaderClosure *sc, + const ShaderClosure *sc, float disk_r, float r, bool all) { -#ifdef BSSRDF_MULTI_EVAL /* this is the veach one-sample model with balance heuristic, some pdf * factors drop out when using balance heuristic weighting */ float3 eval_sum = make_float3(0.0f, 0.0f, 0.0f); float pdf_sum = 0.0f; - float sample_weight_sum = 0.0f; - int num_bssrdf = 0; + float sample_weight_inv = 0.0f; - for(int i = 0; i < sd->num_closure; i++) { - sc = &sd->closure[i]; - - if(CLOSURE_IS_BSSRDF(sc->type)) { - float sample_weight = (all)? 1.0f: sc->sample_weight; - sample_weight_sum += sample_weight; + if(!all) { + float sample_weight_sum = 0.0f; + + for(int i = 0; i < sd->num_closure; i++) { + sc = &sd->closure[i]; + + if(CLOSURE_IS_BSSRDF(sc->type)) { + sample_weight_sum += sc->sample_weight; + } } - } - float sample_weight_inv = 1.0f/sample_weight_sum; + sample_weight_inv = 1.0f/sample_weight_sum; + } for(int i = 0; i < sd->num_closure; i++) { sc = &sd->closure[i]; @@ -125,25 +69,16 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd, /* TODO power heuristic is not working correct here */ eval_sum += sc->weight*pdf; //*sample_weight*disk_pdf; pdf_sum += sample_weight*disk_pdf; //*sample_weight*disk_pdf; - - num_bssrdf++; } } return (pdf_sum > 0.0f)? eval_sum / pdf_sum : make_float3(0.0f, 0.0f, 0.0f); -#else - float pdf = bssrdf_pdf(pick_sc, r); - float disk_pdf = bssrdf_pdf(pick_sc, disk_r); - - return pick_sc->weight * pdf / disk_pdf; -#endif } /* replace closures with a single diffuse bsdf closure after scatter step */ -ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, ShaderClosure *sc, float3 weight, bool hit, float3 N) +ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, const ShaderClosure *sc, float3 weight, bool hit, float3 N) { sd->flag &= ~SD_CLOSURE_FLAGS; - sd->randb_closure = 0.0f; sd->num_closure = 0; sd->num_closure_extra = 0; @@ -219,7 +154,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg, if(bump || texture_blur > 0.0f) { /* average color and normal at incoming point */ - shader_eval_surface(kg, sd, state, 0.0f, state_flag); + shader_eval_surface(kg, sd, state, state_flag); float3 in_color = shader_bssrdf_sum(sd, (bump)? N: NULL, NULL); /* we simply divide out the average color and multiply with the average @@ -242,7 +177,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect( KernelGlobals *kg, SubsurfaceIntersection *ss_isect, ShaderData *sd, - ShaderClosure *sc, + const ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, @@ -255,26 +190,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect( disk_N = sd->Ng; make_orthonormals(disk_N, &disk_T, &disk_B); - /* reusing variable for picking the closure gives a bit nicer stratification - * for path tracer, for branched we do all closures so it doesn't help */ - float axisu = (all)? disk_u: sd->randb_closure; - - if(axisu < 0.5f) { + if(disk_u < 0.5f) { pick_pdf_N = 0.5f; pick_pdf_T = 0.25f; pick_pdf_B = 0.25f; - if(all) - disk_u *= 2.0f; + disk_u *= 2.0f; } - else if(axisu < 0.75f) { + else if(disk_u < 0.75f) { float3 tmp = disk_N; disk_N = disk_T; disk_T = tmp; pick_pdf_N = 0.25f; pick_pdf_T = 0.5f; pick_pdf_B = 0.25f; - if(all) - disk_u = (disk_u - 0.5f)*4.0f; + disk_u = (disk_u - 0.5f)*4.0f; } else { float3 tmp = disk_N; @@ -283,8 +212,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect( pick_pdf_N = 0.25f; pick_pdf_T = 0.25f; pick_pdf_B = 0.5f; - if(all) - disk_u = (disk_u - 0.75f)*4.0f; + disk_u = (disk_u - 0.75f)*4.0f; } /* sample point on disk */ @@ -390,7 +318,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup( ShaderData *sd, ccl_addr_space PathState *state, int state_flag, - ShaderClosure *sc, + const ShaderClosure *sc, bool all) { #ifdef __SPLIT_KERNEL__ @@ -419,7 +347,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup( /* subsurface scattering step, from a point on the surface to another nearby point on the same object */ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state, - int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) + int state_flag, const ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all) { float3 eval = make_float3(0.0f, 0.0f, 0.0f); @@ -430,18 +358,20 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_a disk_N = sd->Ng; make_orthonormals(disk_N, &disk_T, &disk_B); - if(sd->randb_closure < 0.5f) { + if(disk_u < 0.5f) { pick_pdf_N = 0.5f; pick_pdf_T = 0.25f; pick_pdf_B = 0.25f; + disk_u *= 2.0f; } - else if(sd->randb_closure < 0.75f) { + else if(disk_u < 0.75f) { float3 tmp = disk_N; disk_N = disk_T; disk_T = tmp; pick_pdf_N = 0.25f; pick_pdf_T = 0.5f; pick_pdf_B = 0.25f; + disk_u = (disk_u - 0.5f)*4.0f; } else { float3 tmp = disk_N; @@ -450,6 +380,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_a pick_pdf_N = 0.25f; pick_pdf_T = 0.25f; pick_pdf_B = 0.5f; + disk_u = (disk_u - 0.75f)*4.0f; } /* sample point on disk */ diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h index 1b4e926ca28..1853fab1967 100644 --- a/intern/cycles/kernel/kernel_types.h +++ b/intern/cycles/kernel/kernel_types.h @@ -281,31 +281,21 @@ enum PathTraceDimension { PRNG_FILTER_V = 1, PRNG_LENS_U = 2, PRNG_LENS_V = 3, -#ifdef __CAMERA_MOTION__ PRNG_TIME = 4, PRNG_UNUSED_0 = 5, PRNG_UNUSED_1 = 6, /* for some reason (6, 7) is a bad sobol pattern */ PRNG_UNUSED_2 = 7, /* with a low number of samples (< 64) */ -#endif - PRNG_BASE_NUM = 8, + PRNG_BASE_NUM = 10, PRNG_BSDF_U = 0, PRNG_BSDF_V = 1, - PRNG_BSDF = 2, - PRNG_UNUSED3 = 3, - PRNG_LIGHT_U = 4, - PRNG_LIGHT_V = 5, - PRNG_LIGHT_TERMINATE = 6, - PRNG_TERMINATE = 7, - -#ifdef __VOLUME__ - PRNG_PHASE_U = 8, - PRNG_PHASE_V = 9, - PRNG_PHASE = 10, - PRNG_SCATTER_DISTANCE = 11, -#endif - - PRNG_BOUNCE_NUM = 12, + PRNG_LIGHT_U = 2, + PRNG_LIGHT_V = 3, + PRNG_LIGHT_TERMINATE = 4, + PRNG_TERMINATE = 5, + PRNG_PHASE_CHANNEL = 6, + PRNG_SCATTER_DISTANCE = 7, + PRNG_BOUNCE_NUM = 8, }; enum SamplingPattern { diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h index d8e8e192ab2..d9c310a893e 100644 --- a/intern/cycles/kernel/kernel_volume.h +++ b/intern/cycles/kernel/kernel_volume.h @@ -379,13 +379,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous( /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ - float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE); + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); int channel = (int)(rphase*3.0f); - sd->randb_closure = rphase*3.0f - channel; /* decide if we will hit or miss */ bool scatter = true; - float xi = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE); + float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); if(probalistic_scatter) { float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel); @@ -483,10 +482,9 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance( /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ - float xi = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE); - float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE); + float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE); + float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL); int channel = (int)(rphase*3.0f); - sd->randb_closure = rphase*3.0f - channel; bool has_scatter = false; for(int i = 0; i < max_steps; i++) { @@ -843,7 +841,6 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter( /* pick random color channel, we use the Veach one-sample * model with balance heuristic for the channels */ int channel = (int)(rphase*3.0f); - sd->randb_closure = rphase*3.0f - channel; float xi = rscatter; /* probabilistic scattering decision based on transmittance */ diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h index 28fc5ce1c30..0c11158e8da 100644 --- a/intern/cycles/kernel/kernel_work_stealing.h +++ b/intern/cycles/kernel/kernel_work_stealing.h @@ -27,90 +27,54 @@ CCL_NAMESPACE_BEGIN # pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #endif -ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg) -{ - return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples; -} - -ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg) -{ - return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE; -} - -ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index) -{ - return ray_index / WORK_POOL_SIZE; -} - -ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool) -{ - uint total_work_size = kernel_total_work_size(kg); - uint num_pools = kernel_num_work_pools(kg); - - if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) { - return 0; - } - - uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE; - - uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE)); - if(work_pool < remainder / WORK_POOL_SIZE) { - work_size += WORK_POOL_SIZE; - } - else if(work_pool == remainder / WORK_POOL_SIZE) { - work_size += remainder % WORK_POOL_SIZE; - } - - return work_size; -} - -ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index) -{ - uint num_pools = kernel_num_work_pools(kg); - uint pool = work_pool_from_ray_index(kg, ray_index); - - return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE) - + (pool * WORK_POOL_SIZE) - + (work_index % WORK_POOL_SIZE); -} - /* Returns true if there is work */ -ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index) +ccl_device bool get_next_work(KernelGlobals *kg, + uint thread_index, + ccl_private uint *global_work_index) { - uint work_pool = work_pool_from_ray_index(kg, ray_index); - uint pool_size = work_pool_work_size(kg, work_pool); + uint total_work_size = kernel_split_params.w + * kernel_split_params.h + * kernel_split_params.num_samples; - if(pool_size == 0) { + /* With a small amount of work there may be more threads than work due to + * rounding up of global size, stop such threads immediately. */ + if(thread_index >= total_work_size) { return false; } - *work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]); - return (*work_index < pool_size); -} + /* Increase atomic work index counter in pool. */ + uint pool = thread_index / WORK_POOL_SIZE; + uint work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[pool]); -/* This function assumes that the passed `work` is valid. */ -/* Decode sample number w.r.t. assigned `work`. */ -ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index) -{ - return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h); -} + /* Map per-pool work index to a global work index. */ + uint global_size = ccl_global_size(0) * ccl_global_size(1); + kernel_assert(global_size % WORK_POOL_SIZE == 0); + kernel_assert(thread_index < global_size); -/* Decode pixel and tile position w.r.t. assigned `work`. */ -ccl_device void get_work_pixel_tile_position(KernelGlobals *kg, - ccl_private uint *pixel_x, - ccl_private uint *pixel_y, - ccl_private uint *tile_x, - ccl_private uint *tile_y, - uint work_index, - uint ray_index) -{ - uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h); + *global_work_index = (work_index / WORK_POOL_SIZE) * global_size + + (pool * WORK_POOL_SIZE) + + (work_index % WORK_POOL_SIZE); - *tile_x = pixel_index % kernel_split_params.w; - *tile_y = pixel_index / kernel_split_params.w; + /* Test if all work for this pool is done. */ + return (*global_work_index < total_work_size); +} - *pixel_x = *tile_x + kernel_split_params.x; - *pixel_y = *tile_y + kernel_split_params.y; +/* Map global work index to pixel X/Y and sample. */ +ccl_device_inline void get_work_pixel(KernelGlobals *kg, + uint global_work_index, + ccl_private uint *x, + ccl_private uint *y, + ccl_private uint *sample) +{ + uint tile_pixels = kernel_split_params.w * kernel_split_params.h; + uint sample_offset = global_work_index / tile_pixels; + uint pixel_offset = global_work_index - sample_offset * tile_pixels; + uint y_offset = pixel_offset / kernel_split_params.w; + uint x_offset = pixel_offset - y_offset * kernel_split_params.w; + + *x = kernel_split_params.x + x_offset; + *y = kernel_split_params.y + y_offset; + *sample = kernel_split_params.start_sample + sample_offset; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h index 7b4d1299c12..c9e7deddafa 100644 --- a/intern/cycles/kernel/split/kernel_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_buffer_update.h @@ -84,14 +84,9 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { - uint work_index = kernel_split_state.work_array[ray_index]; - uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; - - uint tile_x, tile_y, pixel_x, pixel_y; - get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index); - - ccl_global float *buffer = kernel_split_params.buffer; - buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; + uint sample = state->sample; + uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; + ccl_global float *buffer = kernel_split_params.buffer + buffer_offset; /* accumulate result in output buffer */ kernel_write_result(kg, buffer, sample, L); @@ -102,31 +97,26 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { /* We have completed current work; So get next work */ uint work_index; - int valid_work = get_next_work(kg, &work_index, ray_index); - if(!valid_work) { + if(!get_next_work(kg, ray_index, &work_index)) { /* If work is invalid, this means no more work is available and the thread may exit */ ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE); } if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) { - kernel_split_state.work_array[ray_index] = work_index; - /* Get the sample associated with the current work */ - uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; - /* Get pixel and tile position associated with current work */ - uint tile_x, tile_y, pixel_x, pixel_y; - get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index); - - /* Remap rng_state according to the current work */ + uint x, y, sample; + get_work_pixel(kg, work_index, &x, &y, &sample); + + /* Remap rng_state to current pixel. */ ccl_global uint *rng_state = kernel_split_params.rng_state; - rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride; + rng_state += kernel_split_params.offset + x + y*stride; - /* Remap buffer according to the current work */ - ccl_global float *buffer = kernel_split_params.buffer; - buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; + /* Store buffer offset for writing to passes. */ + uint buffer_offset = (kernel_split_params.offset + x + y*stride) * kernel_data.film.pass_stride; + kernel_split_state.buffer_offset[ray_index] = buffer_offset; /* Initialize random numbers and ray. */ uint rng_hash; - kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng_hash, ray); + kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, ray); if(ray->t != 0.0f) { /* Initialize throughput, path radiance, Ray, PathState; @@ -145,6 +135,7 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg, /* These rays do not participate in path-iteration. */ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* Accumulate result in output buffer. */ + ccl_global float *buffer = kernel_split_params.buffer + buffer_offset; kernel_write_pass_float4(buffer, sample, L_rad); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index 9036b1e473d..dffd291012d 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -90,8 +90,6 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( if(ray_index != QUEUE_EMPTY_SLOT) { #endif - int stride = kernel_split_params.stride; - ccl_global PathState *state = 0x0; float3 throughput; @@ -99,15 +97,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( ShaderData *sd = &kernel_split_state.sd[ray_index]; if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { - uint work_index = kernel_split_state.work_array[ray_index]; - uint pixel_x, pixel_y, tile_x, tile_y; - get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, - &tile_x, &tile_y, - work_index, - ray_index); - - ccl_global float *buffer = kernel_split_params.buffer; - buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride; + uint buffer_offset = kernel_split_state.buffer_offset[ray_index]; + ccl_global float *buffer = kernel_split_params.buffer + buffer_offset; ccl_global Ray *ray = &kernel_split_state.ray[ray_index]; ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index]; @@ -140,7 +131,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao( kernel_split_path_end(kg, ray_index); } else if(probability < 1.0f) { - float terminate = path_state_rng_1D_for_decision(kg, state, PRNG_TERMINATE); + float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE); if(terminate >= probability) { kernel_split_path_end(kg, ray_index); } diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h index c75931855b2..0ab2289348b 100644 --- a/intern/cycles/kernel/split/kernel_path_init.h +++ b/intern/cycles/kernel/split/kernel_path_init.h @@ -29,38 +29,32 @@ ccl_device void kernel_path_init(KernelGlobals *kg) { */ kernel_split_state.ray_state[ray_index] = RAY_ACTIVE; - uint work_index = 0; /* Get work. */ - if(!get_next_work(kg, &work_index, ray_index)) { + uint work_index; + if(!get_next_work(kg, ray_index, &work_index)) { /* No more work, mark ray as inactive */ kernel_split_state.ray_state[ray_index] = RAY_INACTIVE; return; } - /* Get the sample associated with the work. */ - uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; - - /* Get pixel and tile position associated with the work. */ - uint pixel_x, pixel_y, tile_x, tile_y; - get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, - &tile_x, &tile_y, - work_index, - ray_index); - kernel_split_state.work_array[ray_index] = work_index; + uint x, y, sample; + get_work_pixel(kg, work_index, &x, &y, &sample); + /* Remap rng_state and buffer to current pixel. */ ccl_global uint *rng_state = kernel_split_params.rng_state; - rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride; + rng_state += kernel_split_params.offset + x + y*kernel_split_params.stride; - ccl_global float *buffer = kernel_split_params.buffer; - buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride; + /* Store buffer offset for writing to passes. */ + uint buffer_offset = (kernel_split_params.offset + x + y*kernel_split_params.stride) * kernel_data.film.pass_stride; + kernel_split_state.buffer_offset[ray_index] = buffer_offset; /* Initialize random numbers and ray. */ uint rng_hash; kernel_path_trace_setup(kg, rng_state, sample, - pixel_x, pixel_y, + x, y, &rng_hash, &kernel_split_state.ray[ray_index]); @@ -84,6 +78,7 @@ ccl_device void kernel_path_init(KernelGlobals *kg) { /* These rays do not participate in path-iteration. */ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* Accumulate result in output buffer. */ + ccl_global float *buffer = kernel_split_params.buffer + buffer_offset; kernel_write_pass_float4(buffer, sample, L_rad); ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE); } diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h index eac29dcd0d1..7032461b04a 100644 --- a/intern/cycles/kernel/split/kernel_shader_eval.h +++ b/intern/cycles/kernel/split/kernel_shader_eval.h @@ -50,20 +50,16 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg) if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { ccl_global PathState *state = &kernel_split_state.path_state[ray_index]; -#ifndef __BRANCHED_PATH__ - float rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF); - shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, rbsdf, state->flag); -#else - float rbsdf = 0.0f; - - if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { - rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF); - + shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, state->flag); +#ifdef __BRANCHED_PATH__ + if(kernel_data.integrator.branched) { + shader_merge_closures(&kernel_split_state.sd[ray_index]); + } + else +#endif + { + shader_prepare_closures(&kernel_split_state.sd[ray_index], state); } - - shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, rbsdf, state->flag); - shader_merge_closures(&kernel_split_state.sd[ray_index]); -#endif /* __BRANCHED_PATH__ */ } } diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h index e08afc22b20..c58c8463f5c 100644 --- a/intern/cycles/kernel/split/kernel_split_data_types.h +++ b/intern/cycles/kernel/split/kernel_split_data_types.h @@ -122,7 +122,7 @@ typedef ccl_global struct SplitBranchedState { SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \ SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \ SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \ - SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \ + SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd, 1) \ SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \ SPLIT_DATA_SUBSURFACE_ENTRIES \ diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h index a487e53df5c..3b957856aea 100644 --- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h +++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h @@ -250,20 +250,17 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg) #ifdef __BRANCHED_PATH__ } else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) { - float bssrdf_probability; - ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability); + float bssrdf_u, bssrdf_v; + path_state_rng_2D(kg, + state, + PRNG_BSDF_U, + &bssrdf_u, &bssrdf_v); - /* modify throughput for picking bssrdf or bsdf */ - *throughput *= bssrdf_probability; + const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u); /* do bssrdf scatter step if we picked a bssrdf closure */ if(sc) { uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb); - float bssrdf_u, bssrdf_v; - path_state_rng_2D(kg, - state, - PRNG_BSDF_U, - &bssrdf_u, &bssrdf_v); subsurface_scatter_step(kg, sd, state, diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h index 0382c0811dd..3c5785c4807 100644 --- a/intern/cycles/util/util_optimization.h +++ b/intern/cycles/util/util_optimization.h @@ -25,6 +25,9 @@ #if defined(i386) || defined(_M_IX86) +/* We require minimum SSE2 support on x86, so auto enable. */ +# define __KERNEL_SSE2__ + # ifdef WITH_KERNEL_SSE2 # define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 # endif |