Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/intern
diff options
context:
space:
mode:
authorSergey Sharybin <sergey.vfx@gmail.com>2017-09-22 11:26:49 +0300
committerSergey Sharybin <sergey.vfx@gmail.com>2017-09-22 11:26:49 +0300
commit128c7c3ba1d15e08241ec3e90fa694cfb210d2b9 (patch)
treeb463c1b7df7de21673d4c5dca319390a3f1d1106 /intern
parent225e52eaf89039dccf66a4f503fdb7007b1a0394 (diff)
parentf320d0e0a86207983ca4f7e7afb4fa7a140acf99 (diff)
Merge branch 'master' into blender2.8
Diffstat (limited to 'intern')
-rw-r--r--intern/cycles/blender/blender_mesh.cpp5
-rw-r--r--intern/cycles/kernel/closure/bssrdf.h2
-rw-r--r--intern/cycles/kernel/kernel_bake.h11
-rw-r--r--intern/cycles/kernel/kernel_emission.h2
-rw-r--r--intern/cycles/kernel/kernel_path.h34
-rw-r--r--intern/cycles/kernel/kernel_path_branched.h8
-rw-r--r--intern/cycles/kernel/kernel_path_state.h6
-rw-r--r--intern/cycles/kernel/kernel_path_subsurface.h9
-rw-r--r--intern/cycles/kernel/kernel_path_volume.h14
-rw-r--r--intern/cycles/kernel/kernel_random.h57
-rw-r--r--intern/cycles/kernel/kernel_shader.h175
-rw-r--r--intern/cycles/kernel/kernel_shadow.h1
-rw-r--r--intern/cycles/kernel/kernel_subsurface.h125
-rw-r--r--intern/cycles/kernel/kernel_types.h26
-rw-r--r--intern/cycles/kernel/kernel_volume.h11
-rw-r--r--intern/cycles/kernel/kernel_work_stealing.h112
-rw-r--r--intern/cycles/kernel/split/kernel_buffer_update.h37
-rw-r--r--intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h15
-rw-r--r--intern/cycles/kernel/split/kernel_path_init.h27
-rw-r--r--intern/cycles/kernel/split/kernel_shader_eval.h22
-rw-r--r--intern/cycles/kernel/split/kernel_split_data_types.h2
-rw-r--r--intern/cycles/kernel/split/kernel_subsurface_scatter.h15
-rw-r--r--intern/cycles/util/util_optimization.h3
23 files changed, 310 insertions, 409 deletions
diff --git a/intern/cycles/blender/blender_mesh.cpp b/intern/cycles/blender/blender_mesh.cpp
index 20f0fb04c89..19f4d19179d 100644
--- a/intern/cycles/blender/blender_mesh.cpp
+++ b/intern/cycles/blender/blender_mesh.cpp
@@ -63,9 +63,8 @@ inline void face_split_tri_indices(const int face_flag,
tri_b[1] = 3;
tri_b[2] = 1;
}
- else /*if(face_flag & FACE_FLAG_DIVIDE_13)*/ {
- assert(face_flag & FACE_FLAG_DIVIDE_13);
-
+ else {
+ /* Quad with FACE_FLAG_DIVIDE_13 or single triangle. */
tri_a[0] = 0;
tri_a[1] = 1;
tri_a[2] = 2;
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index 06221189060..267aeea6e86 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -400,7 +400,7 @@ ccl_device int bssrdf_setup(Bssrdf *bssrdf, ClosureType type)
bssrdf_burley_setup(bssrdf);
}
- return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
+ return SD_BSSRDF;
}
}
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index b05f6e9ed5e..f06005c5072 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -51,8 +51,7 @@ ccl_device_inline void compute_light_pass(KernelGlobals *kg,
path_state_init(kg, &emission_sd, &state, rng_hash, sample, NULL);
/* evaluate surface shader */
- float rbsdf = path_state_rng_1D(kg, &state, PRNG_BSDF);
- shader_eval_surface(kg, sd, &state, rbsdf, state.flag);
+ shader_eval_surface(kg, sd, &state, state.flag);
/* TODO, disable more closures we don't need besides transparent */
shader_bsdf_disable_transparency(kg, sd);
@@ -241,12 +240,12 @@ ccl_device float3 kernel_bake_evaluate_direct_indirect(KernelGlobals *kg,
}
else {
/* surface color of the pass only */
- shader_eval_surface(kg, sd, state, 0.0f, 0);
+ shader_eval_surface(kg, sd, state, 0);
return kernel_bake_shader_bsdf(kg, sd, type);
}
}
else {
- shader_eval_surface(kg, sd, state, 0.0f, 0);
+ shader_eval_surface(kg, sd, state, 0);
color = kernel_bake_shader_bsdf(kg, sd, type);
}
@@ -338,7 +337,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
case SHADER_EVAL_NORMAL:
{
if((sd.flag & SD_HAS_BUMP)) {
- shader_eval_surface(kg, &sd, &state, 0.f, 0);
+ shader_eval_surface(kg, &sd, &state, 0);
}
/* encoding: normal = (2 * color) - 1 */
@@ -352,7 +351,7 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
}
case SHADER_EVAL_EMISSION:
{
- shader_eval_surface(kg, &sd, &state, 0.f, 0);
+ shader_eval_surface(kg, &sd, &state, 0);
out = shader_emissive_eval(kg, &sd);
break;
}
diff --git a/intern/cycles/kernel/kernel_emission.h b/intern/cycles/kernel/kernel_emission.h
index 13d4759a9ec..45b8c6311e1 100644
--- a/intern/cycles/kernel/kernel_emission.h
+++ b/intern/cycles/kernel/kernel_emission.h
@@ -70,7 +70,7 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
/* no path flag, we're evaluating this for all closures. that's weak but
* we'd have to do multiple evaluations otherwise */
path_state_modify_bounce(state, true);
- shader_eval_surface(kg, emission_sd, state, 0.0f, 0);
+ shader_eval_surface(kg, emission_sd, state, 0);
path_state_modify_bounce(state, false);
/* evaluate emissive closure */
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 3a242a06a72..d43d6374c13 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -210,8 +210,8 @@ ccl_device_forceinline VolumeIntegrateResult kernel_path_volume(
/* indirect sample. if we use distance sampling and take just
* one sample for direct and indirect light, we could share
* this computation, but makes code a bit complex */
- float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE);
- float rscatter = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE);
+ float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
+ float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
result = kernel_volume_decoupled_scatter(kg,
state, &volume_ray, sd, throughput,
@@ -434,11 +434,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
sd,
&isect,
ray);
- float rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF);
- shader_eval_surface(kg, sd, state, rbsdf, state->flag);
-#ifdef __BRANCHED_PATH__
- shader_merge_closures(sd);
-#endif /* __BRANCHED_PATH__ */
+ shader_eval_surface(kg, sd, state, state->flag);
+ shader_prepare_closures(sd, state);
/* Apply shadow catcher, holdout, emission. */
if(!kernel_path_shader_apply(kg,
@@ -462,7 +459,7 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
break;
}
else if(probability != 1.0f) {
- float terminate = path_state_rng_1D_for_decision(kg, state, PRNG_TERMINATE);
+ float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
if(terminate >= probability)
break;
@@ -483,21 +480,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
/* bssrdf scatter to a different location on the same object, replacing
* the closures with a diffuse BSDF */
if(sd->flag & SD_BSSRDF) {
- float bssrdf_probability;
- ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg,
+ state,
+ PRNG_BSDF_U,
+ &bssrdf_u, &bssrdf_v);
- /* modify throughput for picking bssrdf or bsdf */
- throughput *= bssrdf_probability;
+ const ShaderClosure *sc = shader_bssrdf_pick(sd, &throughput, &bssrdf_u);
/* do bssrdf scatter step if we picked a bssrdf closure */
if(sc) {
uint lcg_state = lcg_state_init(state, 0x68bc21eb);
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg,
- state,
- PRNG_BSDF_U,
- &bssrdf_u, &bssrdf_v);
subsurface_scatter_step(kg,
sd,
state,
@@ -591,8 +585,8 @@ ccl_device_forceinline void kernel_path_integrate(
/* Setup and evaluate shader. */
shader_setup_from_ray(kg, &sd, &isect, ray);
- float rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF);
- shader_eval_surface(kg, &sd, state, rbsdf, state->flag);
+ shader_eval_surface(kg, &sd, state, state->flag);
+ shader_prepare_closures(&sd, state);
/* Apply shadow catcher, holdout, emission. */
if(!kernel_path_shader_apply(kg,
@@ -616,7 +610,7 @@ ccl_device_forceinline void kernel_path_integrate(
break;
}
else if(probability != 1.0f) {
- float terminate = path_state_rng_1D_for_decision(kg, state, PRNG_TERMINATE);
+ float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
if(terminate >= probability)
break;
diff --git a/intern/cycles/kernel/kernel_path_branched.h b/intern/cycles/kernel/kernel_path_branched.h
index 3994d8d4954..010988d2a02 100644
--- a/intern/cycles/kernel/kernel_path_branched.h
+++ b/intern/cycles/kernel/kernel_path_branched.h
@@ -339,8 +339,8 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
/* scatter sample. if we use distance sampling and take just one
* sample for direct and indirect light, we could share this
* computation, but makes code a bit complex */
- float rphase = path_state_rng_1D_for_decision(kg, &ps, PRNG_PHASE);
- float rscatter = path_state_rng_1D_for_decision(kg, &ps, PRNG_SCATTER_DISTANCE);
+ float rphase = path_state_rng_1D(kg, &ps, PRNG_PHASE_CHANNEL);
+ float rscatter = path_state_rng_1D(kg, &ps, PRNG_SCATTER_DISTANCE);
VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
&ps, &pray, &sd, &tp, rphase, rscatter, &volume_segment, NULL, false);
@@ -439,7 +439,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
/* Setup and evaluate shader. */
shader_setup_from_ray(kg, &sd, &isect, &ray);
- shader_eval_surface(kg, &sd, &state, 0.0f, state.flag);
+ shader_eval_surface(kg, &sd, &state, state.flag);
shader_merge_closures(&sd);
/* Apply shadow catcher, holdout, emission. */
@@ -466,7 +466,7 @@ ccl_device void kernel_branched_path_integrate(KernelGlobals *kg,
break;
}
else if(probability != 1.0f) {
- float terminate = path_state_rng_1D_for_decision(kg, &state, PRNG_TERMINATE);
+ float terminate = path_state_rng_1D(kg, &state, PRNG_TERMINATE);
if(terminate >= probability)
break;
diff --git a/intern/cycles/kernel/kernel_path_state.h b/intern/cycles/kernel/kernel_path_state.h
index bb09b4ac080..eccee54c0e3 100644
--- a/intern/cycles/kernel/kernel_path_state.h
+++ b/intern/cycles/kernel/kernel_path_state.h
@@ -76,12 +76,12 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
state->flag |= PATH_RAY_TRANSPARENT;
state->transparent_bounce++;
- /* don't increase random number generator offset here, to avoid some
- * unwanted patterns, see path_state_rng_1D_for_decision */
-
if(!kernel_data.integrator.transparent_shadows)
state->flag |= PATH_RAY_MIS_SKIP;
+ /* random number generator next bounce */
+ state->rng_offset += PRNG_BOUNCE_NUM;
+
return;
}
diff --git a/intern/cycles/kernel/kernel_path_subsurface.h b/intern/cycles/kernel/kernel_path_subsurface.h
index 619d57e71fb..1753618607a 100644
--- a/intern/cycles/kernel/kernel_path_subsurface.h
+++ b/intern/cycles/kernel/kernel_path_subsurface.h
@@ -32,11 +32,10 @@ bool kernel_path_subsurface_scatter(
ccl_addr_space float3 *throughput,
ccl_addr_space SubsurfaceIndirectRays *ss_indirect)
{
- float bssrdf_probability;
- ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
- /* modify throughput for picking bssrdf or bsdf */
- *throughput *= bssrdf_probability;
+ const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
/* do bssrdf scatter step if we picked a bssrdf closure */
if(sc) {
@@ -49,8 +48,6 @@ bool kernel_path_subsurface_scatter(
uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
SubsurfaceIntersection ss_isect;
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg, state, PRNG_BSDF_U, &bssrdf_u, &bssrdf_v);
int num_hits = subsurface_scatter_multi_intersect(kg,
&ss_isect,
sd,
diff --git a/intern/cycles/kernel/kernel_path_volume.h b/intern/cycles/kernel/kernel_path_volume.h
index e7e24f853c2..3cf897ac49c 100644
--- a/intern/cycles/kernel/kernel_path_volume.h
+++ b/intern/cycles/kernel/kernel_path_volume.h
@@ -77,7 +77,7 @@ bool kernel_path_volume_bounce(
float3 phase_omega_in;
differential3 phase_domega_in;
float phase_u, phase_v;
- path_state_rng_2D(kg, state, PRNG_PHASE_U, &phase_u, &phase_v);
+ path_state_rng_2D(kg, state, PRNG_BSDF_U, &phase_u, &phase_v);
int label;
label = shader_volume_phase_sample(kg, sd, phase_u, phase_v, &phase_eval,
@@ -155,8 +155,8 @@ ccl_device void kernel_branched_path_volume_connect_light(
float3 tp = throughput;
/* sample position on volume segment */
- float rphase = path_branched_rng_1D_for_decision(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE);
- float rscatter = path_branched_rng_1D_for_decision(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+ float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
+ float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
@@ -201,8 +201,8 @@ ccl_device void kernel_branched_path_volume_connect_light(
float3 tp = throughput;
/* sample position on volume segment */
- float rphase = path_branched_rng_1D_for_decision(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE);
- float rscatter = path_branched_rng_1D_for_decision(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
+ float rphase = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_PHASE_CHANNEL);
+ float rscatter = path_branched_rng_1D(kg, state->rng_hash, state, j, num_samples, PRNG_SCATTER_DISTANCE);
VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
@@ -238,8 +238,8 @@ ccl_device void kernel_branched_path_volume_connect_light(
float3 tp = throughput;
/* sample position on volume segment */
- float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE);
- float rscatter = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE);
+ float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
+ float rscatter = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
VolumeIntegrateResult result = kernel_volume_decoupled_scatter(kg,
state, ray, sd, &tp, rphase, rscatter, segment, (ls.t != FLT_MAX)? &ls.P: NULL, false);
diff --git a/intern/cycles/kernel/kernel_random.h b/intern/cycles/kernel/kernel_random.h
index b35ed3bd279..11798d87cb5 100644
--- a/intern/cycles/kernel/kernel_random.h
+++ b/intern/cycles/kernel/kernel_random.h
@@ -30,12 +30,6 @@ CCL_NAMESPACE_BEGIN
#ifdef __SOBOL__
-/* Skip initial numbers that are not as well distributed, especially the
- * first sequence is just 0 everywhere, which can be problematic for e.g.
- * path termination.
- */
-#define SOBOL_SKIP 64
-
ccl_device uint sobol_dimension(KernelGlobals *kg, int index, int dimension)
{
uint result = 0;
@@ -73,7 +67,7 @@ ccl_device_forceinline float path_rng_1D(KernelGlobals *kg,
#ifdef __SOBOL__
/* Sobol sequence value using direction vectors. */
- uint result = sobol_dimension(kg, sample + SOBOL_SKIP, dimension);
+ uint result = sobol_dimension(kg, sample, dimension);
float r = (float)result * (1.0f/(float)0xFFFFFFFF);
/* Cranly-Patterson rotation using rng seed */
@@ -186,25 +180,6 @@ ccl_device_inline float path_state_rng_1D(KernelGlobals *kg,
state->rng_offset + dimension);
}
-ccl_device_inline float path_state_rng_1D_for_decision(
- KernelGlobals *kg,
- const ccl_addr_space PathState *state,
- int dimension)
-{
- /* The rng_offset is not increased for transparent bounces. if we do then
- * fully transparent objects can become subtly visible by the different
- * sampling patterns used where the transparent object is.
- *
- * however for some random numbers that will determine if we next bounce
- * is transparent we do need to increase the offset to avoid always making
- * the same decision. */
- const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM;
- return path_rng_1D(kg,
- state->rng_hash,
- state->sample, state->num_samples,
- rng_offset + dimension);
-}
-
ccl_device_inline void path_state_rng_2D(KernelGlobals *kg,
const ccl_addr_space PathState *state,
int dimension,
@@ -232,22 +207,6 @@ ccl_device_inline float path_branched_rng_1D(
state->rng_offset + dimension);
}
-ccl_device_inline float path_branched_rng_1D_for_decision(
- KernelGlobals *kg,
- uint rng_hash,
- const ccl_addr_space PathState *state,
- int branch,
- int num_branches,
- int dimension)
-{
- const int rng_offset = state->rng_offset + state->transparent_bounce * PRNG_BOUNCE_NUM;
- return path_rng_1D(kg,
- rng_hash,
- state->sample * num_branches + branch,
- state->num_samples * num_branches,
- rng_offset + dimension);
-}
-
ccl_device_inline void path_branched_rng_2D(
KernelGlobals *kg,
uint rng_hash,
@@ -273,7 +232,7 @@ ccl_device_inline float path_state_rng_light_termination(
const ccl_addr_space PathState *state)
{
if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_state_rng_1D_for_decision(kg, state, PRNG_LIGHT_TERMINATE);
+ return path_state_rng_1D(kg, state, PRNG_LIGHT_TERMINATE);
}
return 0.0f;
}
@@ -286,12 +245,12 @@ ccl_device_inline float path_branched_rng_light_termination(
int num_branches)
{
if(kernel_data.integrator.light_inv_rr_threshold > 0.0f) {
- return path_branched_rng_1D_for_decision(kg,
- rng_hash,
- state,
- branch,
- num_branches,
- PRNG_LIGHT_TERMINATE);
+ return path_branched_rng_1D(kg,
+ rng_hash,
+ state,
+ branch,
+ num_branches,
+ PRNG_LIGHT_TERMINATE);
}
return 0.0f;
}
diff --git a/intern/cycles/kernel/kernel_shader.h b/intern/cycles/kernel/kernel_shader.h
index 5964aca0c78..bb3add5d7ca 100644
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -494,20 +494,45 @@ ccl_device_inline void shader_merge_closures(ShaderData *sd)
}
#endif
+/* Defensive sampling. */
+
+ccl_device_inline void shader_prepare_closures(ShaderData *sd,
+ ccl_addr_space PathState *state)
+{
+ /* We can likely also do defensive sampling at deeper bounces, particularly
+ * for cases like a perfect mirror but possibly also others. This will need
+ * a good heuristic. */
+ if(state->bounce + state->transparent_bounce == 0 && sd->num_closure > 1) {
+ float sum = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+ if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+ sum += sc->sample_weight;
+ }
+ }
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ ShaderClosure *sc = &sd->closure[i];
+ if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+ sc->sample_weight = max(sc->sample_weight, 0.125f * sum);
+ }
+ }
+ }
+}
+
+
/* BSDF */
ccl_device_inline void _shader_bsdf_multi_eval(KernelGlobals *kg, ShaderData *sd, const float3 omega_in, float *pdf,
- int skip_bsdf, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
+ const ShaderClosure *skip_sc, BsdfEval *result_eval, float sum_pdf, float sum_sample_weight)
{
/* this is the veach one-sample model with balance heuristic, some pdf
* factors drop out when using balance heuristic weighting */
for(int i = 0; i < sd->num_closure; i++) {
- if(i == skip_bsdf)
- continue;
-
const ShaderClosure *sc = &sd->closure[i];
- if(CLOSURE_IS_BSDF(sc->type)) {
+ if(sc != skip_sc && CLOSURE_IS_BSDF(sc->type)) {
float bsdf_pdf = 0.0f;
float3 eval = bsdf_eval(kg, sd, sc, omega_in, &bsdf_pdf);
@@ -570,7 +595,7 @@ void shader_bsdf_eval(KernelGlobals *kg,
#endif
{
float pdf;
- _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, -1, eval, 0.0f, 0.0f);
+ _shader_bsdf_multi_eval(kg, sd, omega_in, &pdf, NULL, eval, 0.0f, 0.0f);
if(use_mis) {
float weight = power_heuristic(light_pdf, pdf);
bsdf_eval_mis(eval, weight);
@@ -578,48 +603,120 @@ void shader_bsdf_eval(KernelGlobals *kg,
}
}
-ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
- ShaderData *sd,
- float randu, float randv,
- BsdfEval *bsdf_eval,
- float3 *omega_in,
- differential3 *domega_in,
- float *pdf)
+ccl_device_inline const ShaderClosure *shader_bsdf_pick(ShaderData *sd,
+ float *randu)
{
int sampled = 0;
if(sd->num_closure > 1) {
- /* pick a BSDF closure based on sample weights */
+ /* Pick a BSDF or based on sample weights. */
float sum = 0.0f;
- for(sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
-
- if(CLOSURE_IS_BSDF(sc->type))
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ if(CLOSURE_IS_BSDF(sc->type)) {
sum += sc->sample_weight;
+ }
}
- float r = sd->randb_closure*sum;
- sum = 0.0f;
+ float r = (*randu)*sum;
+ float partial_sum = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
- for(sampled = 0; sampled < sd->num_closure; sampled++) {
- const ShaderClosure *sc = &sd->closure[sampled];
-
if(CLOSURE_IS_BSDF(sc->type)) {
- sum += sc->sample_weight;
+ float next_sum = partial_sum + sc->sample_weight;
+
+ if(r < next_sum) {
+ sampled = i;
- if(r <= sum)
+ /* Rescale to reuse for direction sample, to better
+ * preserve stratifaction. */
+ *randu = (r - partial_sum) / sc->sample_weight;
break;
+ }
+
+ partial_sum = next_sum;
}
}
+ }
- if(sampled == sd->num_closure) {
- *pdf = 0.0f;
- return LABEL_NONE;
+ return &sd->closure[sampled];
+}
+
+ccl_device_inline const ShaderClosure *shader_bssrdf_pick(ShaderData *sd,
+ ccl_addr_space float3 *throughput,
+ float *randu)
+{
+ int sampled = 0;
+
+ if(sd->num_closure > 1) {
+ /* Pick a BSDF or BSSRDF or based on sample weights. */
+ float sum_bsdf = 0.0f;
+ float sum_bssrdf = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ if(CLOSURE_IS_BSDF(sc->type)) {
+ sum_bsdf += sc->sample_weight;
+ }
+ else if(CLOSURE_IS_BSSRDF(sc->type)) {
+ sum_bssrdf += sc->sample_weight;
+ }
+ }
+
+ float r = (*randu)*(sum_bsdf + sum_bssrdf);
+ float partial_sum = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ const ShaderClosure *sc = &sd->closure[i];
+
+ if(CLOSURE_IS_BSDF_OR_BSSRDF(sc->type)) {
+ float next_sum = partial_sum + sc->sample_weight;
+
+ if(r < next_sum) {
+ if(CLOSURE_IS_BSDF(sc->type)) {
+ *throughput *= (sum_bsdf + sum_bssrdf) / sum_bsdf;
+ return NULL;
+ }
+ else {
+ *throughput *= (sum_bsdf + sum_bssrdf) / sum_bssrdf;
+ sampled = i;
+
+ /* Rescale to reuse for direction sample, to better
+ * preserve stratifaction. */
+ *randu = (r - partial_sum) / sc->sample_weight;
+ break;
+ }
+ }
+
+ partial_sum = next_sum;
+ }
}
}
- const ShaderClosure *sc = &sd->closure[sampled];
+ return &sd->closure[sampled];
+}
+
+ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
+ ShaderData *sd,
+ float randu, float randv,
+ BsdfEval *bsdf_eval,
+ float3 *omega_in,
+ differential3 *domega_in,
+ float *pdf)
+{
+ const ShaderClosure *sc = shader_bsdf_pick(sd, &randu);
+ if(!sc) {
+ *pdf = 0.0f;
+ return LABEL_NONE;
+ }
+
+ /* BSSRDF should already have been handled elsewhere. */
+ kernel_assert(CLOSURE_IS_BSDF(sc->type));
int label;
float3 eval;
@@ -632,7 +729,7 @@ ccl_device_inline int shader_bsdf_sample(KernelGlobals *kg,
if(sd->num_closure > 1) {
float sweight = sc->sample_weight;
- _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sampled, bsdf_eval, *pdf*sweight, sweight);
+ _shader_bsdf_multi_eval(kg, sd, *omega_in, pdf, sc, bsdf_eval, *pdf*sweight, sweight);
}
}
@@ -868,11 +965,10 @@ ccl_device float3 shader_holdout_eval(KernelGlobals *kg, ShaderData *sd)
/* Surface Evaluation */
ccl_device void shader_eval_surface(KernelGlobals *kg, ShaderData *sd,
- ccl_addr_space PathState *state, float randb, int path_flag)
+ ccl_addr_space PathState *state, int path_flag)
{
sd->num_closure = 0;
sd->num_closure_extra = 0;
- sd->randb_closure = randb;
#ifdef __OSL__
if(kg->osl)
@@ -903,7 +999,6 @@ ccl_device float3 shader_eval_background(KernelGlobals *kg, ShaderData *sd,
{
sd->num_closure = 0;
sd->num_closure_extra = 0;
- sd->randb_closure = 0.0f;
#ifdef __SVM__
#ifdef __OSL__
@@ -985,17 +1080,22 @@ ccl_device int shader_volume_phase_sample(KernelGlobals *kg, const ShaderData *s
sum += sc->sample_weight;
}
- float r = sd->randb_closure*sum;
- sum = 0.0f;
+ float r = randu*sum;
+ float partial_sum = 0.0f;
for(sampled = 0; sampled < sd->num_closure; sampled++) {
const ShaderClosure *sc = &sd->closure[sampled];
if(CLOSURE_IS_PHASE(sc->type)) {
- sum += sc->sample_weight;
+ float next_sum = partial_sum + sc->sample_weight;
- if(r <= sum)
+ if(r <= next_sum) {
+ /* Rescale to reuse for BSDF direction sample. */
+ randu = (r - partial_sum) / sc->sample_weight;
break;
+ }
+
+ partial_sum = next_sum;
}
}
@@ -1099,7 +1199,6 @@ ccl_device void shader_eval_displacement(KernelGlobals *kg, ShaderData *sd, ccl_
{
sd->num_closure = 0;
sd->num_closure_extra = 0;
- sd->randb_closure = 0.0f;
/* this will modify sd->P */
#ifdef __SVM__
diff --git a/intern/cycles/kernel/kernel_shadow.h b/intern/cycles/kernel/kernel_shadow.h
index e02494ec1b0..065f9b184e2 100644
--- a/intern/cycles/kernel/kernel_shadow.h
+++ b/intern/cycles/kernel/kernel_shadow.h
@@ -86,7 +86,6 @@ ccl_device_forceinline bool shadow_handle_transparent_isect(
shader_eval_surface(kg,
shadow_sd,
state,
- 0.0f,
PATH_RAY_SHADOW);
path_state_modify_bounce(state, false);
*throughput *= shader_bsdf_transparency(kg, shadow_sd);
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index 26ec6383b73..23a09e5e2ca 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -28,87 +28,31 @@ CCL_NAMESPACE_BEGIN
* - try to reduce one sample model variance
*/
-#define BSSRDF_MULTI_EVAL
-
-ccl_device ShaderClosure *subsurface_scatter_pick_closure(KernelGlobals *kg, ShaderData *sd, float *probability)
-{
- /* sum sample weights of bssrdf and bsdf */
- float bsdf_sum = 0.0f;
- float bssrdf_sum = 0.0f;
-
- for(int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if(CLOSURE_IS_BSDF(sc->type))
- bsdf_sum += sc->sample_weight;
- else if(CLOSURE_IS_BSSRDF(sc->type))
- bssrdf_sum += sc->sample_weight;
- }
-
- /* use bsdf or bssrdf? */
- float r = sd->randb_closure*(bsdf_sum + bssrdf_sum);
-
- if(r < bsdf_sum) {
- /* use bsdf, and adjust randb so we can reuse it for picking a bsdf */
- sd->randb_closure = r/bsdf_sum;
- *probability = (bsdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bsdf_sum: 1.0f;
- return NULL;
- }
-
- /* use bssrdf */
- r -= bsdf_sum;
-
- float sum = 0.0f;
-
- for(int i = 0; i < sd->num_closure; i++) {
- ShaderClosure *sc = &sd->closure[i];
-
- if(CLOSURE_IS_BSSRDF(sc->type)) {
- sum += sc->sample_weight;
-
- if(r <= sum) {
- sd->randb_closure = (r - (sum - sc->sample_weight))/sc->sample_weight;
-
-#ifdef BSSRDF_MULTI_EVAL
- *probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/bssrdf_sum: 1.0f;
-#else
- *probability = (bssrdf_sum > 0.0f)? (bsdf_sum + bssrdf_sum)/sc->sample_weight: 1.0f;
-#endif
- return sc;
- }
- }
- }
-
- /* should never happen */
- sd->randb_closure = 0.0f;
- *probability = 1.0f;
- return NULL;
-}
-
ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
- ShaderClosure *sc,
+ const ShaderClosure *sc,
float disk_r,
float r,
bool all)
{
-#ifdef BSSRDF_MULTI_EVAL
/* this is the veach one-sample model with balance heuristic, some pdf
* factors drop out when using balance heuristic weighting */
float3 eval_sum = make_float3(0.0f, 0.0f, 0.0f);
float pdf_sum = 0.0f;
- float sample_weight_sum = 0.0f;
- int num_bssrdf = 0;
+ float sample_weight_inv = 0.0f;
- for(int i = 0; i < sd->num_closure; i++) {
- sc = &sd->closure[i];
-
- if(CLOSURE_IS_BSSRDF(sc->type)) {
- float sample_weight = (all)? 1.0f: sc->sample_weight;
- sample_weight_sum += sample_weight;
+ if(!all) {
+ float sample_weight_sum = 0.0f;
+
+ for(int i = 0; i < sd->num_closure; i++) {
+ sc = &sd->closure[i];
+
+ if(CLOSURE_IS_BSSRDF(sc->type)) {
+ sample_weight_sum += sc->sample_weight;
+ }
}
- }
- float sample_weight_inv = 1.0f/sample_weight_sum;
+ sample_weight_inv = 1.0f/sample_weight_sum;
+ }
for(int i = 0; i < sd->num_closure; i++) {
sc = &sd->closure[i];
@@ -125,25 +69,16 @@ ccl_device_inline float3 subsurface_scatter_eval(ShaderData *sd,
/* TODO power heuristic is not working correct here */
eval_sum += sc->weight*pdf; //*sample_weight*disk_pdf;
pdf_sum += sample_weight*disk_pdf; //*sample_weight*disk_pdf;
-
- num_bssrdf++;
}
}
return (pdf_sum > 0.0f)? eval_sum / pdf_sum : make_float3(0.0f, 0.0f, 0.0f);
-#else
- float pdf = bssrdf_pdf(pick_sc, r);
- float disk_pdf = bssrdf_pdf(pick_sc, disk_r);
-
- return pick_sc->weight * pdf / disk_pdf;
-#endif
}
/* replace closures with a single diffuse bsdf closure after scatter step */
-ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, ShaderClosure *sc, float3 weight, bool hit, float3 N)
+ccl_device void subsurface_scatter_setup_diffuse_bsdf(ShaderData *sd, const ShaderClosure *sc, float3 weight, bool hit, float3 N)
{
sd->flag &= ~SD_CLOSURE_FLAGS;
- sd->randb_closure = 0.0f;
sd->num_closure = 0;
sd->num_closure_extra = 0;
@@ -219,7 +154,7 @@ ccl_device void subsurface_color_bump_blur(KernelGlobals *kg,
if(bump || texture_blur > 0.0f) {
/* average color and normal at incoming point */
- shader_eval_surface(kg, sd, state, 0.0f, state_flag);
+ shader_eval_surface(kg, sd, state, state_flag);
float3 in_color = shader_bssrdf_sum(sd, (bump)? N: NULL, NULL);
/* we simply divide out the average color and multiply with the average
@@ -242,7 +177,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
KernelGlobals *kg,
SubsurfaceIntersection *ss_isect,
ShaderData *sd,
- ShaderClosure *sc,
+ const ShaderClosure *sc,
uint *lcg_state,
float disk_u,
float disk_v,
@@ -255,26 +190,20 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
disk_N = sd->Ng;
make_orthonormals(disk_N, &disk_T, &disk_B);
- /* reusing variable for picking the closure gives a bit nicer stratification
- * for path tracer, for branched we do all closures so it doesn't help */
- float axisu = (all)? disk_u: sd->randb_closure;
-
- if(axisu < 0.5f) {
+ if(disk_u < 0.5f) {
pick_pdf_N = 0.5f;
pick_pdf_T = 0.25f;
pick_pdf_B = 0.25f;
- if(all)
- disk_u *= 2.0f;
+ disk_u *= 2.0f;
}
- else if(axisu < 0.75f) {
+ else if(disk_u < 0.75f) {
float3 tmp = disk_N;
disk_N = disk_T;
disk_T = tmp;
pick_pdf_N = 0.25f;
pick_pdf_T = 0.5f;
pick_pdf_B = 0.25f;
- if(all)
- disk_u = (disk_u - 0.5f)*4.0f;
+ disk_u = (disk_u - 0.5f)*4.0f;
}
else {
float3 tmp = disk_N;
@@ -283,8 +212,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect(
pick_pdf_N = 0.25f;
pick_pdf_T = 0.25f;
pick_pdf_B = 0.5f;
- if(all)
- disk_u = (disk_u - 0.75f)*4.0f;
+ disk_u = (disk_u - 0.75f)*4.0f;
}
/* sample point on disk */
@@ -390,7 +318,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
ShaderData *sd,
ccl_addr_space PathState *state,
int state_flag,
- ShaderClosure *sc,
+ const ShaderClosure *sc,
bool all)
{
#ifdef __SPLIT_KERNEL__
@@ -419,7 +347,7 @@ ccl_device_noinline void subsurface_scatter_multi_setup(
/* subsurface scattering step, from a point on the surface to another nearby point on the same object */
ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_addr_space PathState *state,
- int state_flag, ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
+ int state_flag, const ShaderClosure *sc, uint *lcg_state, float disk_u, float disk_v, bool all)
{
float3 eval = make_float3(0.0f, 0.0f, 0.0f);
@@ -430,18 +358,20 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_a
disk_N = sd->Ng;
make_orthonormals(disk_N, &disk_T, &disk_B);
- if(sd->randb_closure < 0.5f) {
+ if(disk_u < 0.5f) {
pick_pdf_N = 0.5f;
pick_pdf_T = 0.25f;
pick_pdf_B = 0.25f;
+ disk_u *= 2.0f;
}
- else if(sd->randb_closure < 0.75f) {
+ else if(disk_u < 0.75f) {
float3 tmp = disk_N;
disk_N = disk_T;
disk_T = tmp;
pick_pdf_N = 0.25f;
pick_pdf_T = 0.5f;
pick_pdf_B = 0.25f;
+ disk_u = (disk_u - 0.5f)*4.0f;
}
else {
float3 tmp = disk_N;
@@ -450,6 +380,7 @@ ccl_device void subsurface_scatter_step(KernelGlobals *kg, ShaderData *sd, ccl_a
pick_pdf_N = 0.25f;
pick_pdf_T = 0.25f;
pick_pdf_B = 0.5f;
+ disk_u = (disk_u - 0.75f)*4.0f;
}
/* sample point on disk */
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 1b4e926ca28..1853fab1967 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -281,31 +281,21 @@ enum PathTraceDimension {
PRNG_FILTER_V = 1,
PRNG_LENS_U = 2,
PRNG_LENS_V = 3,
-#ifdef __CAMERA_MOTION__
PRNG_TIME = 4,
PRNG_UNUSED_0 = 5,
PRNG_UNUSED_1 = 6, /* for some reason (6, 7) is a bad sobol pattern */
PRNG_UNUSED_2 = 7, /* with a low number of samples (< 64) */
-#endif
- PRNG_BASE_NUM = 8,
+ PRNG_BASE_NUM = 10,
PRNG_BSDF_U = 0,
PRNG_BSDF_V = 1,
- PRNG_BSDF = 2,
- PRNG_UNUSED3 = 3,
- PRNG_LIGHT_U = 4,
- PRNG_LIGHT_V = 5,
- PRNG_LIGHT_TERMINATE = 6,
- PRNG_TERMINATE = 7,
-
-#ifdef __VOLUME__
- PRNG_PHASE_U = 8,
- PRNG_PHASE_V = 9,
- PRNG_PHASE = 10,
- PRNG_SCATTER_DISTANCE = 11,
-#endif
-
- PRNG_BOUNCE_NUM = 12,
+ PRNG_LIGHT_U = 2,
+ PRNG_LIGHT_V = 3,
+ PRNG_LIGHT_TERMINATE = 4,
+ PRNG_TERMINATE = 5,
+ PRNG_PHASE_CHANNEL = 6,
+ PRNG_SCATTER_DISTANCE = 7,
+ PRNG_BOUNCE_NUM = 8,
};
enum SamplingPattern {
diff --git a/intern/cycles/kernel/kernel_volume.h b/intern/cycles/kernel/kernel_volume.h
index d8e8e192ab2..d9c310a893e 100644
--- a/intern/cycles/kernel/kernel_volume.h
+++ b/intern/cycles/kernel/kernel_volume.h
@@ -379,13 +379,12 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_homogeneous(
/* pick random color channel, we use the Veach one-sample
* model with balance heuristic for the channels */
- float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE);
+ float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
int channel = (int)(rphase*3.0f);
- sd->randb_closure = rphase*3.0f - channel;
/* decide if we will hit or miss */
bool scatter = true;
- float xi = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE);
+ float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
if(probalistic_scatter) {
float sample_sigma_t = kernel_volume_channel_get(sigma_t, channel);
@@ -483,10 +482,9 @@ ccl_device VolumeIntegrateResult kernel_volume_integrate_heterogeneous_distance(
/* pick random color channel, we use the Veach one-sample
* model with balance heuristic for the channels */
- float xi = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE);
- float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE);
+ float xi = path_state_rng_1D(kg, state, PRNG_SCATTER_DISTANCE);
+ float rphase = path_state_rng_1D(kg, state, PRNG_PHASE_CHANNEL);
int channel = (int)(rphase*3.0f);
- sd->randb_closure = rphase*3.0f - channel;
bool has_scatter = false;
for(int i = 0; i < max_steps; i++) {
@@ -843,7 +841,6 @@ ccl_device VolumeIntegrateResult kernel_volume_decoupled_scatter(
/* pick random color channel, we use the Veach one-sample
* model with balance heuristic for the channels */
int channel = (int)(rphase*3.0f);
- sd->randb_closure = rphase*3.0f - channel;
float xi = rscatter;
/* probabilistic scattering decision based on transmittance */
diff --git a/intern/cycles/kernel/kernel_work_stealing.h b/intern/cycles/kernel/kernel_work_stealing.h
index 28fc5ce1c30..0c11158e8da 100644
--- a/intern/cycles/kernel/kernel_work_stealing.h
+++ b/intern/cycles/kernel/kernel_work_stealing.h
@@ -27,90 +27,54 @@ CCL_NAMESPACE_BEGIN
# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif
-ccl_device_inline uint kernel_total_work_size(KernelGlobals *kg)
-{
- return kernel_split_params.w * kernel_split_params.h * kernel_split_params.num_samples;
-}
-
-ccl_device_inline uint kernel_num_work_pools(KernelGlobals *kg)
-{
- return ccl_global_size(0) * ccl_global_size(1) / WORK_POOL_SIZE;
-}
-
-ccl_device_inline uint work_pool_from_ray_index(KernelGlobals *kg, uint ray_index)
-{
- return ray_index / WORK_POOL_SIZE;
-}
-
-ccl_device_inline uint work_pool_work_size(KernelGlobals *kg, uint work_pool)
-{
- uint total_work_size = kernel_total_work_size(kg);
- uint num_pools = kernel_num_work_pools(kg);
-
- if(work_pool >= num_pools || work_pool * WORK_POOL_SIZE >= total_work_size) {
- return 0;
- }
-
- uint work_size = (total_work_size / (num_pools * WORK_POOL_SIZE)) * WORK_POOL_SIZE;
-
- uint remainder = (total_work_size % (num_pools * WORK_POOL_SIZE));
- if(work_pool < remainder / WORK_POOL_SIZE) {
- work_size += WORK_POOL_SIZE;
- }
- else if(work_pool == remainder / WORK_POOL_SIZE) {
- work_size += remainder % WORK_POOL_SIZE;
- }
-
- return work_size;
-}
-
-ccl_device_inline uint get_global_work_index(KernelGlobals *kg, uint work_index, uint ray_index)
-{
- uint num_pools = kernel_num_work_pools(kg);
- uint pool = work_pool_from_ray_index(kg, ray_index);
-
- return (work_index / WORK_POOL_SIZE) * (num_pools * WORK_POOL_SIZE)
- + (pool * WORK_POOL_SIZE)
- + (work_index % WORK_POOL_SIZE);
-}
-
/* Returns true if there is work */
-ccl_device bool get_next_work(KernelGlobals *kg, ccl_private uint *work_index, uint ray_index)
+ccl_device bool get_next_work(KernelGlobals *kg,
+ uint thread_index,
+ ccl_private uint *global_work_index)
{
- uint work_pool = work_pool_from_ray_index(kg, ray_index);
- uint pool_size = work_pool_work_size(kg, work_pool);
+ uint total_work_size = kernel_split_params.w
+ * kernel_split_params.h
+ * kernel_split_params.num_samples;
- if(pool_size == 0) {
+ /* With a small amount of work there may be more threads than work due to
+ * rounding up of global size, stop such threads immediately. */
+ if(thread_index >= total_work_size) {
return false;
}
- *work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[work_pool]);
- return (*work_index < pool_size);
-}
+ /* Increase atomic work index counter in pool. */
+ uint pool = thread_index / WORK_POOL_SIZE;
+ uint work_index = atomic_fetch_and_inc_uint32(&kernel_split_params.work_pools[pool]);
-/* This function assumes that the passed `work` is valid. */
-/* Decode sample number w.r.t. assigned `work`. */
-ccl_device uint get_work_sample(KernelGlobals *kg, uint work_index, uint ray_index)
-{
- return get_global_work_index(kg, work_index, ray_index) / (kernel_split_params.w * kernel_split_params.h);
-}
+ /* Map per-pool work index to a global work index. */
+ uint global_size = ccl_global_size(0) * ccl_global_size(1);
+ kernel_assert(global_size % WORK_POOL_SIZE == 0);
+ kernel_assert(thread_index < global_size);
-/* Decode pixel and tile position w.r.t. assigned `work`. */
-ccl_device void get_work_pixel_tile_position(KernelGlobals *kg,
- ccl_private uint *pixel_x,
- ccl_private uint *pixel_y,
- ccl_private uint *tile_x,
- ccl_private uint *tile_y,
- uint work_index,
- uint ray_index)
-{
- uint pixel_index = get_global_work_index(kg, work_index, ray_index) % (kernel_split_params.w*kernel_split_params.h);
+ *global_work_index = (work_index / WORK_POOL_SIZE) * global_size
+ + (pool * WORK_POOL_SIZE)
+ + (work_index % WORK_POOL_SIZE);
- *tile_x = pixel_index % kernel_split_params.w;
- *tile_y = pixel_index / kernel_split_params.w;
+ /* Test if all work for this pool is done. */
+ return (*global_work_index < total_work_size);
+}
- *pixel_x = *tile_x + kernel_split_params.x;
- *pixel_y = *tile_y + kernel_split_params.y;
+/* Map global work index to pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(KernelGlobals *kg,
+ uint global_work_index,
+ ccl_private uint *x,
+ ccl_private uint *y,
+ ccl_private uint *sample)
+{
+ uint tile_pixels = kernel_split_params.w * kernel_split_params.h;
+ uint sample_offset = global_work_index / tile_pixels;
+ uint pixel_offset = global_work_index - sample_offset * tile_pixels;
+ uint y_offset = pixel_offset / kernel_split_params.w;
+ uint x_offset = pixel_offset - y_offset * kernel_split_params.w;
+
+ *x = kernel_split_params.x + x_offset;
+ *y = kernel_split_params.y + y_offset;
+ *sample = kernel_split_params.start_sample + sample_offset;
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_buffer_update.h b/intern/cycles/kernel/split/kernel_buffer_update.h
index 7b4d1299c12..c9e7deddafa 100644
--- a/intern/cycles/kernel/split/kernel_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_buffer_update.h
@@ -84,14 +84,9 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
- uint work_index = kernel_split_state.work_array[ray_index];
- uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-
- uint tile_x, tile_y, pixel_x, pixel_y;
- get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
-
- ccl_global float *buffer = kernel_split_params.buffer;
- buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
+ uint sample = state->sample;
+ uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+ ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
/* accumulate result in output buffer */
kernel_write_result(kg, buffer, sample, L);
@@ -102,31 +97,26 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
/* We have completed current work; So get next work */
uint work_index;
- int valid_work = get_next_work(kg, &work_index, ray_index);
- if(!valid_work) {
+ if(!get_next_work(kg, ray_index, &work_index)) {
/* If work is invalid, this means no more work is available and the thread may exit */
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
}
if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
- kernel_split_state.work_array[ray_index] = work_index;
- /* Get the sample associated with the current work */
- uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
- /* Get pixel and tile position associated with current work */
- uint tile_x, tile_y, pixel_x, pixel_y;
- get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
-
- /* Remap rng_state according to the current work */
+ uint x, y, sample;
+ get_work_pixel(kg, work_index, &x, &y, &sample);
+
+ /* Remap rng_state to current pixel. */
ccl_global uint *rng_state = kernel_split_params.rng_state;
- rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride;
+ rng_state += kernel_split_params.offset + x + y*stride;
- /* Remap buffer according to the current work */
- ccl_global float *buffer = kernel_split_params.buffer;
- buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
+ /* Store buffer offset for writing to passes. */
+ uint buffer_offset = (kernel_split_params.offset + x + y*stride) * kernel_data.film.pass_stride;
+ kernel_split_state.buffer_offset[ray_index] = buffer_offset;
/* Initialize random numbers and ray. */
uint rng_hash;
- kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, &rng_hash, ray);
+ kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng_hash, ray);
if(ray->t != 0.0f) {
/* Initialize throughput, path radiance, Ray, PathState;
@@ -145,6 +135,7 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
/* These rays do not participate in path-iteration. */
float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* Accumulate result in output buffer. */
+ ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
kernel_write_pass_float4(buffer, sample, L_rad);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index 9036b1e473d..dffd291012d 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -90,8 +90,6 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
if(ray_index != QUEUE_EMPTY_SLOT) {
#endif
- int stride = kernel_split_params.stride;
-
ccl_global PathState *state = 0x0;
float3 throughput;
@@ -99,15 +97,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
ShaderData *sd = &kernel_split_state.sd[ray_index];
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
- uint work_index = kernel_split_state.work_array[ray_index];
- uint pixel_x, pixel_y, tile_x, tile_y;
- get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
- &tile_x, &tile_y,
- work_index,
- ray_index);
-
- ccl_global float *buffer = kernel_split_params.buffer;
- buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride;
+ uint buffer_offset = kernel_split_state.buffer_offset[ray_index];
+ ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
ccl_global Ray *ray = &kernel_split_state.ray[ray_index];
ShaderData *emission_sd = &kernel_split_state.sd_DL_shadow[ray_index];
@@ -140,7 +131,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
kernel_split_path_end(kg, ray_index);
}
else if(probability < 1.0f) {
- float terminate = path_state_rng_1D_for_decision(kg, state, PRNG_TERMINATE);
+ float terminate = path_state_rng_1D(kg, state, PRNG_TERMINATE);
if(terminate >= probability) {
kernel_split_path_end(kg, ray_index);
}
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
index c75931855b2..0ab2289348b 100644
--- a/intern/cycles/kernel/split/kernel_path_init.h
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -29,38 +29,32 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
*/
kernel_split_state.ray_state[ray_index] = RAY_ACTIVE;
- uint work_index = 0;
/* Get work. */
- if(!get_next_work(kg, &work_index, ray_index)) {
+ uint work_index;
+ if(!get_next_work(kg, ray_index, &work_index)) {
/* No more work, mark ray as inactive */
kernel_split_state.ray_state[ray_index] = RAY_INACTIVE;
return;
}
- /* Get the sample associated with the work. */
- uint sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
-
- /* Get pixel and tile position associated with the work. */
- uint pixel_x, pixel_y, tile_x, tile_y;
- get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
- &tile_x, &tile_y,
- work_index,
- ray_index);
- kernel_split_state.work_array[ray_index] = work_index;
+ uint x, y, sample;
+ get_work_pixel(kg, work_index, &x, &y, &sample);
+ /* Remap rng_state and buffer to current pixel. */
ccl_global uint *rng_state = kernel_split_params.rng_state;
- rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
+ rng_state += kernel_split_params.offset + x + y*kernel_split_params.stride;
- ccl_global float *buffer = kernel_split_params.buffer;
- buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride;
+ /* Store buffer offset for writing to passes. */
+ uint buffer_offset = (kernel_split_params.offset + x + y*kernel_split_params.stride) * kernel_data.film.pass_stride;
+ kernel_split_state.buffer_offset[ray_index] = buffer_offset;
/* Initialize random numbers and ray. */
uint rng_hash;
kernel_path_trace_setup(kg,
rng_state,
sample,
- pixel_x, pixel_y,
+ x, y,
&rng_hash,
&kernel_split_state.ray[ray_index]);
@@ -84,6 +78,7 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
/* These rays do not participate in path-iteration. */
float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* Accumulate result in output buffer. */
+ ccl_global float *buffer = kernel_split_params.buffer + buffer_offset;
kernel_write_pass_float4(buffer, sample, L_rad);
ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
}
diff --git a/intern/cycles/kernel/split/kernel_shader_eval.h b/intern/cycles/kernel/split/kernel_shader_eval.h
index eac29dcd0d1..7032461b04a 100644
--- a/intern/cycles/kernel/split/kernel_shader_eval.h
+++ b/intern/cycles/kernel/split/kernel_shader_eval.h
@@ -50,20 +50,16 @@ ccl_device void kernel_shader_eval(KernelGlobals *kg)
if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
ccl_global PathState *state = &kernel_split_state.path_state[ray_index];
-#ifndef __BRANCHED_PATH__
- float rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF);
- shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, rbsdf, state->flag);
-#else
- float rbsdf = 0.0f;
-
- if(!kernel_data.integrator.branched || IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
- rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF);
-
+ shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, state->flag);
+#ifdef __BRANCHED_PATH__
+ if(kernel_data.integrator.branched) {
+ shader_merge_closures(&kernel_split_state.sd[ray_index]);
+ }
+ else
+#endif
+ {
+ shader_prepare_closures(&kernel_split_state.sd[ray_index], state);
}
-
- shader_eval_surface(kg, &kernel_split_state.sd[ray_index], state, rbsdf, state->flag);
- shader_merge_closures(&kernel_split_state.sd[ray_index]);
-#endif /* __BRANCHED_PATH__ */
}
}
diff --git a/intern/cycles/kernel/split/kernel_split_data_types.h b/intern/cycles/kernel/split/kernel_split_data_types.h
index e08afc22b20..c58c8463f5c 100644
--- a/intern/cycles/kernel/split/kernel_split_data_types.h
+++ b/intern/cycles/kernel/split/kernel_split_data_types.h
@@ -122,7 +122,7 @@ typedef ccl_global struct SplitBranchedState {
SPLIT_DATA_ENTRY(ccl_global int, is_lamp, 1) \
SPLIT_DATA_ENTRY(ccl_global Ray, light_ray, 1) \
SPLIT_DATA_ENTRY(ccl_global int, queue_data, (NUM_QUEUES*2)) /* TODO(mai): this is too large? */ \
- SPLIT_DATA_ENTRY(ccl_global uint, work_array, 1) \
+ SPLIT_DATA_ENTRY(ccl_global uint, buffer_offset, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd, 1) \
SPLIT_DATA_ENTRY(ShaderData, sd_DL_shadow, 1) \
SPLIT_DATA_SUBSURFACE_ENTRIES \
diff --git a/intern/cycles/kernel/split/kernel_subsurface_scatter.h b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
index a487e53df5c..3b957856aea 100644
--- a/intern/cycles/kernel/split/kernel_subsurface_scatter.h
+++ b/intern/cycles/kernel/split/kernel_subsurface_scatter.h
@@ -250,20 +250,17 @@ ccl_device void kernel_subsurface_scatter(KernelGlobals *kg)
#ifdef __BRANCHED_PATH__
}
else if(IS_FLAG(ray_state, ray_index, RAY_BRANCHED_INDIRECT)) {
- float bssrdf_probability;
- ShaderClosure *sc = subsurface_scatter_pick_closure(kg, sd, &bssrdf_probability);
+ float bssrdf_u, bssrdf_v;
+ path_state_rng_2D(kg,
+ state,
+ PRNG_BSDF_U,
+ &bssrdf_u, &bssrdf_v);
- /* modify throughput for picking bssrdf or bsdf */
- *throughput *= bssrdf_probability;
+ const ShaderClosure *sc = shader_bssrdf_pick(sd, throughput, &bssrdf_u);
/* do bssrdf scatter step if we picked a bssrdf closure */
if(sc) {
uint lcg_state = lcg_state_init_addrspace(state, 0x68bc21eb);
- float bssrdf_u, bssrdf_v;
- path_state_rng_2D(kg,
- state,
- PRNG_BSDF_U,
- &bssrdf_u, &bssrdf_v);
subsurface_scatter_step(kg,
sd,
state,
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index 0382c0811dd..3c5785c4807 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -25,6 +25,9 @@
#if defined(i386) || defined(_M_IX86)
+/* We require minimum SSE2 support on x86, so auto enable. */
+# define __KERNEL_SSE2__
+
# ifdef WITH_KERNEL_SSE2
# define WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# endif