diff options
author | Clément Foucault <foucault.clem@gmail.com> | 2022-08-11 09:13:47 +0300 |
---|---|---|
committer | Clément Foucault <foucault.clem@gmail.com> | 2022-08-14 21:40:04 +0300 |
commit | 67d7792503e4f598d8620818b1d9887670e144da (patch) | |
tree | 0d89e573208d3c651017f94f2a8ccc47463815a8 /source/blender/draw/engines/eevee_next/shaders | |
parent | 1226f5848dd9aef235e20d898e2c0c65f25da0b0 (diff) |
EEVEE-Next: Light: New light module
Compared to the previous implementation this has a limit of 65536 lights
per scene. Lights exceeding this limit will be ignored.
This also introduce fine grained GPU light culling, making rendering
many lights in a scene more efficient as long they don't overlap much.
Compatible light panels have been unhidden.
Note: This commit does not include surface evaluation, only light culling.
Diffstat (limited to 'source/blender/draw/engines/eevee_next/shaders')
10 files changed, 1200 insertions, 0 deletions
diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_debug_frag.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_debug_frag.glsl new file mode 100644 index 00000000000..321c99f7952 --- /dev/null +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_debug_frag.glsl @@ -0,0 +1,52 @@ + +/** + * Debug Shader outputing a gradient of orange - white - blue to mark culling hotspots. + * Green pixels are error pixels that are missing lights from the culling pass (i.e: when culling + * pass is not conservative enough). + */ + +#pragma BLENDER_REQUIRE(common_view_lib.glsl) +#pragma BLENDER_REQUIRE(common_math_lib.glsl) +#pragma BLENDER_REQUIRE(eevee_light_lib.glsl) +#pragma BLENDER_REQUIRE(eevee_light_iter_lib.glsl) + +void main() +{ + ivec2 texel = ivec2(gl_FragCoord.xy); + + float depth = texelFetch(depth_tx, texel, 0).r; + float vP_z = get_view_z_from_depth(depth); + vec3 P = get_world_space_from_depth(uvcoordsvar.xy, depth); + + float light_count = 0.0; + uint light_cull = 0u; + vec2 px = gl_FragCoord.xy; + LIGHT_FOREACH_BEGIN_LOCAL(light_cull_buf, light_zbin_buf, light_tile_buf, px, vP_z, l_idx) + { + LightData light = light_buf[l_idx]; + light_cull |= 1u << l_idx; + light_count += 1.0; + } + LIGHT_FOREACH_END + + uint light_nocull = 0u; + LIGHT_FOREACH_BEGIN_LOCAL_NO_CULL(light_cull_buf, l_idx) + { + LightData light = light_buf[l_idx]; + vec3 L; + float dist; + light_vector_get(light, P, L, dist); + if (light_attenuation(light_buf[l_idx], L, dist) > 0.0) { + light_nocull |= 1u << l_idx; + } + } + LIGHT_FOREACH_END + + if ((light_cull & light_nocull) != light_nocull) { + /* ERROR. Some lights were culled incorrectly. */ + out_debug_color = vec4(0.0, 1.0, 0.0, 1.0); + } + else { + out_debug_color = vec4(heatmap_gradient(light_count / 4.0), 1.0); + } +}
\ No newline at end of file diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_select_comp.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_select_comp.glsl new file mode 100644 index 00000000000..9c12b0e50e6 --- /dev/null +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_select_comp.glsl @@ -0,0 +1,62 @@ + +/** + * Select the visible items inside the active view and put them inside the sorting buffer. + */ + +#pragma BLENDER_REQUIRE(common_view_lib.glsl) +#pragma BLENDER_REQUIRE(common_math_geom_lib.glsl) +#pragma BLENDER_REQUIRE(common_intersect_lib.glsl) + +void main() +{ + uint l_idx = gl_GlobalInvocationID.x; + if (l_idx >= light_cull_buf.items_count) { + return; + } + + LightData light = in_light_buf[l_idx]; + + /* Do not select 0 power lights. */ + if (light.influence_radius_max < 1e-8) { + return; + } + + /* Sun lights are packed at the end of the array. Perform early copy. */ + if (light.type == LIGHT_SUN) { + /* NOTE: We know the index because sun lights are packed at the start of the input buffer. */ + out_light_buf[light_cull_buf.local_lights_len + l_idx] = light; + return; + } + + Sphere sphere; + switch (light.type) { + case LIGHT_SPOT: + /* Only for < ~170° Cone due to plane extraction precision. */ + if (light.spot_tan < 10.0) { + Pyramid pyramid = shape_pyramid_non_oblique( + light._position, + light._position - light._back * light.influence_radius_max, + light._right * light.influence_radius_max * light.spot_tan / light.spot_size_inv.x, + light._up * light.influence_radius_max * light.spot_tan / light.spot_size_inv.y); + if (!intersect_view(pyramid)) { + return; + } + } + case LIGHT_RECT: + case LIGHT_ELLIPSE: + case LIGHT_POINT: + sphere = Sphere(light._position, light.influence_radius_max); + break; + } + + /* TODO(fclem): HiZ culling? Could be quite beneficial given the nature of the 2.5D culling. */ + + /* TODO(fclem): Small light culling / fading? */ + + if (intersect_view(sphere)) { + uint index = atomicAdd(light_cull_buf.visible_count, 1u); + + out_zdist_buf[index] = dot(cameraForward, light._position) - dot(cameraForward, cameraPos); + out_key_buf[index] = l_idx; + } +} diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_sort_comp.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_sort_comp.glsl new file mode 100644 index 00000000000..daf2016cd35 --- /dev/null +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_sort_comp.glsl @@ -0,0 +1,57 @@ + +/** + * Sort the lights by their Z distance to the camera. + * Outputs ordered light buffer. + * One thread processes one Light entity. + */ + +#pragma BLENDER_REQUIRE(common_math_lib.glsl) + +shared float zdists_cache[gl_WorkGroupSize.x]; + +void main() +{ + uint src_index = gl_GlobalInvocationID.x; + bool valid_thread = true; + + if (src_index >= light_cull_buf.visible_count) { + /* Do not return because we use barriers later on (which need uniform control flow). + * Just process the same last item but avoid insertion. */ + src_index = light_cull_buf.visible_count - 1; + valid_thread = false; + } + + float local_zdist = in_zdist_buf[src_index]; + + int prefix_sum = 0; + /* Iterate over the whole key buffer. */ + uint iter = divide_ceil_u(light_cull_buf.visible_count, gl_WorkGroupSize.x); + for (uint i = 0u; i < iter; i++) { + uint index = gl_WorkGroupSize.x * i + gl_LocalInvocationID.x; + /* NOTE: This will load duplicated values, but they will be discarded. */ + index = min(index, light_cull_buf.visible_count - 1); + zdists_cache[gl_LocalInvocationID.x] = in_zdist_buf[index]; + + barrier(); + + /* Iterate over the cache line. */ + uint line_end = min(gl_WorkGroupSize.x, light_cull_buf.visible_count - gl_WorkGroupSize.x * i); + for (uint j = 0u; j < line_end; j++) { + if (zdists_cache[j] < local_zdist) { + prefix_sum++; + } + else if (zdists_cache[j] == local_zdist) { + /* Same depth, use index to order and avoid same prefix for 2 different lights. */ + if ((gl_WorkGroupSize.x * i + j) < src_index) { + prefix_sum++; + } + } + } + } + + if (valid_thread) { + /* Copy sorted light to render light buffer. */ + uint input_index = in_key_buf[src_index]; + out_light_buf[prefix_sum] = in_light_buf[input_index]; + } +} diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_tile_comp.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_tile_comp.glsl new file mode 100644 index 00000000000..37705e22b22 --- /dev/null +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_tile_comp.glsl @@ -0,0 +1,188 @@ + +/** + * 2D Culling pass for lights. + * We iterate over all items and check if they intersect with the tile frustum. + * Dispatch one thread per word. + */ + +#pragma BLENDER_REQUIRE(common_view_lib.glsl) +#pragma BLENDER_REQUIRE(common_intersect_lib.glsl) +#pragma BLENDER_REQUIRE(eevee_light_iter_lib.glsl) + +/* ---------------------------------------------------------------------- */ +/** \name Culling shapes extraction + * \{ */ + +struct CullingTile { + IsectFrustum frustum; + vec4 bounds; +}; + +/* Corners are expected to be in viewspace so that the cone is starting from the origin. + * Corner order does not matter. */ +vec4 tile_bound_cone(vec3 v00, vec3 v01, vec3 v10, vec3 v11) +{ + v00 = normalize(v00); + v01 = normalize(v01); + v10 = normalize(v10); + v11 = normalize(v11); + vec3 center = normalize(v00 + v01 + v10 + v11); + float angle_cosine = dot(center, v00); + angle_cosine = max(angle_cosine, dot(center, v01)); + angle_cosine = max(angle_cosine, dot(center, v10)); + angle_cosine = max(angle_cosine, dot(center, v11)); + return vec4(center, angle_cosine); +} + +/* Corners are expected to be in viewspace. Returns Z-aligned bounding cylinder. + * Corner order does not matter. */ +vec4 tile_bound_cylinder(vec3 v00, vec3 v01, vec3 v10, vec3 v11) +{ + vec3 center = (v00 + v01 + v10 + v11) * 0.25; + vec4 corners_dist; + float dist_sqr = distance_squared(center, v00); + dist_sqr = max(dist_sqr, distance_squared(center, v01)); + dist_sqr = max(dist_sqr, distance_squared(center, v10)); + dist_sqr = max(dist_sqr, distance_squared(center, v11)); + /* Return a cone. Later converted to cylinder. */ + return vec4(center, sqrt(dist_sqr)); +} + +vec2 tile_to_ndc(vec2 tile_co, vec2 offset) +{ + /* Add a margin to prevent culling too much if the frustum becomes too much unstable. */ + const float margin = 0.02; + tile_co += margin * (offset * 2.0 - 1.0); + + tile_co += offset; + return tile_co * light_cull_buf.tile_to_uv_fac * 2.0 - 1.0; +} + +CullingTile tile_culling_get(uvec2 tile_co) +{ + vec2 ftile = vec2(tile_co); + /* Culling frustum corners for this tile. */ + vec3 corners[8]; + /* Follow same corners order as view frustum. */ + corners[1].xy = corners[0].xy = tile_to_ndc(ftile, vec2(0, 0)); + corners[5].xy = corners[4].xy = tile_to_ndc(ftile, vec2(1, 0)); + corners[6].xy = corners[7].xy = tile_to_ndc(ftile, vec2(1, 1)); + corners[2].xy = corners[3].xy = tile_to_ndc(ftile, vec2(0, 1)); + corners[1].z = corners[5].z = corners[6].z = corners[2].z = -1.0; + corners[0].z = corners[4].z = corners[7].z = corners[3].z = 1.0; + + for (int i = 0; i < 8; i++) { + /* Culling in view space for precision. */ + corners[i] = project_point(ProjectionMatrixInverse, corners[i]); + } + + bool is_persp = ProjectionMatrix[3][3] == 0.0; + CullingTile tile; + tile.bounds = (is_persp) ? tile_bound_cone(corners[0], corners[4], corners[7], corners[3]) : + tile_bound_cylinder(corners[0], corners[4], corners[7], corners[3]); + + tile.frustum = isect_data_setup(shape_frustum(corners)); + return tile; +} + +/** \} */ + +/* ---------------------------------------------------------------------- */ +/** \name Intersection Tests + * \{ */ + +bool intersect(CullingTile tile, Sphere sphere) +{ + bool isect = true; + /* Test tile intersection using bounding cone or bounding cylinder. + * This has less false positive cases when the sphere is large. */ + if (ProjectionMatrix[3][3] == 0.0) { + isect = intersect(shape_cone(tile.bounds.xyz, tile.bounds.w), sphere); + } + else { + /* Simplify to a 2D circle test on the view Z axis plane. */ + isect = intersect(shape_circle(tile.bounds.xy, tile.bounds.w), + shape_circle(sphere.center.xy, sphere.radius)); + } + /* Refine using frustum test. If the sphere is small it avoids intersection + * with a neighbor tile. */ + if (isect) { + isect = intersect(tile.frustum, sphere); + } + return isect; +} + +bool intersect(CullingTile tile, Box bbox) +{ + return intersect(tile.frustum, bbox); +} + +bool intersect(CullingTile tile, Pyramid pyramid) +{ + return intersect(tile.frustum, pyramid); +} + +/** \} */ + +void main() +{ + uint word_idx = gl_GlobalInvocationID.x % light_cull_buf.tile_word_len; + uint tile_idx = gl_GlobalInvocationID.x / light_cull_buf.tile_word_len; + uvec2 tile_co = uvec2(tile_idx % light_cull_buf.tile_x_len, + tile_idx / light_cull_buf.tile_x_len); + + if (tile_co.y >= light_cull_buf.tile_y_len) { + return; + } + + /* TODO(fclem): We could stop the tile at the HiZ depth. */ + CullingTile tile = tile_culling_get(tile_co); + + uint l_idx = word_idx * 32u; + uint l_end = min(l_idx + 32u, light_cull_buf.visible_count); + uint word = 0u; + for (; l_idx < l_end; l_idx++) { + LightData light = light_buf[l_idx]; + + /* Culling in view space for precision and simplicity. */ + vec3 vP = transform_point(ViewMatrix, light._position); + vec3 v_right = transform_direction(ViewMatrix, light._right); + vec3 v_up = transform_direction(ViewMatrix, light._up); + vec3 v_back = transform_direction(ViewMatrix, light._back); + float radius = light.influence_radius_max; + + Sphere sphere = shape_sphere(vP, radius); + bool intersect_tile = intersect(tile, sphere); + + switch (light.type) { + case LIGHT_SPOT: + /* Only for < ~170° Cone due to plane extraction precision. */ + if (light.spot_tan < 10.0) { + Pyramid pyramid = shape_pyramid_non_oblique( + vP, + vP - v_back * radius, + v_right * radius * light.spot_tan / light.spot_size_inv.x, + v_up * radius * light.spot_tan / light.spot_size_inv.y); + intersect_tile = intersect_tile && intersect(tile, pyramid); + break; + } + /* Fallthrough to the hemispheric case. */ + case LIGHT_RECT: + case LIGHT_ELLIPSE: + vec3 v000 = vP - v_right * radius - v_up * radius; + vec3 v100 = v000 + v_right * (radius * 2.0); + vec3 v010 = v000 + v_up * (radius * 2.0); + vec3 v001 = v000 - v_back * radius; + Box bbox = shape_box(v000, v100, v010, v001); + intersect_tile = intersect_tile && intersect(tile, bbox); + default: + break; + } + + if (intersect_tile) { + word |= 1u << (l_idx % 32u); + } + } + + out_light_tile_buf[gl_GlobalInvocationID.x] = word; +} diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_zbin_comp.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_zbin_comp.glsl new file mode 100644 index 00000000000..d96f191fb77 --- /dev/null +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_zbin_comp.glsl @@ -0,0 +1,56 @@ + +/** + * Create the Zbins from Z-sorted lights. + * Perform min-max operation in LDS memory for speed. + * For this reason, we only dispatch 1 thread group. + */ + +#pragma BLENDER_REQUIRE(common_view_lib.glsl) +#pragma BLENDER_REQUIRE(eevee_light_iter_lib.glsl) + +/* Fits the limit of 32KB. */ +shared uint zbin_max[CULLING_ZBIN_COUNT]; +shared uint zbin_min[CULLING_ZBIN_COUNT]; + +void main() +{ + const uint zbin_iter = CULLING_ZBIN_COUNT / gl_WorkGroupSize.x; + const uint zbin_local = gl_LocalInvocationID.x * zbin_iter; + + uint src_index = gl_GlobalInvocationID.x; + + for (uint i = 0u, l = zbin_local; i < zbin_iter; i++, l++) { + zbin_max[l] = 0x0u; + zbin_min[l] = ~0x0u; + } + barrier(); + + uint light_iter = divide_ceil_u(light_cull_buf.visible_count, gl_WorkGroupSize.x); + for (uint i = 0u; i < light_iter; i++) { + uint index = i * gl_WorkGroupSize.x + gl_LocalInvocationID.x; + if (index >= light_cull_buf.visible_count) { + continue; + } + vec3 P = light_buf[index]._position; + /* TODO(fclem): Could have better bounds for spot and area lights. */ + float radius = light_buf[index].influence_radius_max; + float z_dist = dot(cameraForward, P) - dot(cameraForward, cameraPos); + int z_min = culling_z_to_zbin( + light_cull_buf.zbin_scale, light_cull_buf.zbin_bias, z_dist + radius); + int z_max = culling_z_to_zbin( + light_cull_buf.zbin_scale, light_cull_buf.zbin_bias, z_dist - radius); + z_min = clamp(z_min, 0, CULLING_ZBIN_COUNT - 1); + z_max = clamp(z_max, 0, CULLING_ZBIN_COUNT - 1); + /* Register to Z bins. */ + for (int z = z_min; z <= z_max; z++) { + atomicMin(zbin_min[z], index); + atomicMax(zbin_max[z], index); + } + } + barrier(); + + /* Write result to zbins buffer. Pack min & max into 1 uint. */ + for (uint i = 0u, l = zbin_local; i < zbin_iter; i++, l++) { + out_zbin_buf[l] = (zbin_max[l] << 16u) | (zbin_min[l] & 0xFFFFu); + } +} diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_light_eval_lib.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_light_eval_lib.glsl new file mode 100644 index 00000000000..d4abdd43aa4 --- /dev/null +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_light_eval_lib.glsl @@ -0,0 +1,129 @@ + +/** + * The resources expected to be defined are: + * - light_buf + * - light_zbin_buf + * - light_cull_buf + * - light_tile_buf + * - shadow_atlas_tx + * - shadow_tilemaps_tx + * - sss_transmittance_tx + * - utility_tx + */ + +#pragma BLENDER_REQUIRE(eevee_light_lib.glsl) +#pragma BLENDER_REQUIRE(gpu_shader_codegen_lib.glsl) + +/* TODO(fclem): We could reduce register pressure by only having static branches for sun lights. */ +void light_eval_ex(ClosureDiffuse diffuse, + ClosureReflection reflection, + const bool is_directional, + vec3 P, + vec3 V, + float vP_z, + float thickness, + vec4 ltc_mat, + uint l_idx, + inout vec3 out_diffuse, + inout vec3 out_specular) +{ + LightData light = light_buf[l_idx]; + vec3 L; + float dist; + light_vector_get(light, P, L, dist); + + float visibility = light_attenuation(light, L, dist); + +#if 0 /* TODO(fclem): Shadows */ + if ((light.shadow_id != LIGHT_NO_SHADOW) && (visibility > 0.0)) { + vec3 lL = light_world_to_local(light, -L) * dist; + + float shadow_delta = shadow_delta_get( + shadow_atlas_tx, shadow_tilemaps_tx, light, light.shadow_data, lL, dist, P); + +# ifdef SSS_TRANSMITTANCE + /* Transmittance evaluation first to use initial visibility. */ + if (diffuse.sss_id != 0u && light.diffuse_power > 0.0) { + float delta = max(thickness, shadow_delta); + + vec3 intensity = visibility * light.transmit_power * + light_translucent(sss_transmittance_tx, + is_directional, + light, + diffuse.N, + L, + dist, + diffuse.sss_radius, + delta); + out_diffuse += light.color * intensity; + } +# endif + + visibility *= float(shadow_delta - light.shadow_data.bias <= 0.0); + } +#endif + + if (visibility < 1e-6) { + return; + } + + if (light.diffuse_power > 0.0) { + float intensity = visibility * light.diffuse_power * + light_diffuse(utility_tx, is_directional, light, diffuse.N, V, L, dist); + out_diffuse += light.color * intensity; + } + + if (light.specular_power > 0.0) { + float intensity = visibility * light.specular_power * + light_ltc( + utility_tx, is_directional, light, reflection.N, V, L, dist, ltc_mat); + out_specular += light.color * intensity; + } +} + +void light_eval(ClosureDiffuse diffuse, + ClosureReflection reflection, + vec3 P, + vec3 V, + float vP_z, + float thickness, + inout vec3 out_diffuse, + inout vec3 out_specular) +{ + vec2 uv = vec2(reflection.roughness, safe_sqrt(1.0 - dot(reflection.N, V))); + uv = uv * UTIL_TEX_UV_SCALE + UTIL_TEX_UV_BIAS; + vec4 ltc_mat = utility_tx_sample(utility_tx, uv, UTIL_LTC_MAT_LAYER); + + LIGHT_FOREACH_BEGIN_DIRECTIONAL(light_cull_buf, l_idx) + { + light_eval_ex(diffuse, + reflection, + true, + P, + V, + vP_z, + thickness, + ltc_mat, + l_idx, + out_diffuse, + out_specular); + } + LIGHT_FOREACH_END + + vec2 px = gl_FragCoord.xy; + LIGHT_FOREACH_BEGIN_LOCAL(light_cull_buf, light_zbin_buf, light_tile_buf, px, vP_z, l_idx) + { + light_eval_ex(diffuse, + reflection, + false, + P, + V, + vP_z, + thickness, + ltc_mat, + l_idx, + out_diffuse, + out_specular); + } + LIGHT_FOREACH_END +} diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_light_iter_lib.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_light_iter_lib.glsl new file mode 100644 index 00000000000..22a5f98e6c3 --- /dev/null +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_light_iter_lib.glsl @@ -0,0 +1,72 @@ + +#pragma BLENDER_REQUIRE(common_math_lib.glsl) + +uint zbin_mask(uint word_index, uint zbin_min, uint zbin_max) +{ + uint word_start = word_index * 32u; + uint word_end = word_start + 31u; + uint local_min = max(zbin_min, word_start); + uint local_max = min(zbin_max, word_end); + uint mask_width = local_max - local_min + 1; + return bit_field_mask(mask_width, local_min); +} + +int culling_z_to_zbin(float scale, float bias, float z) +{ + return int(z * scale + bias); +} + +/* Waiting to implement extensions support. We need: + * - GL_KHR_shader_subgroup_ballot + * - GL_KHR_shader_subgroup_arithmetic + * or + * - Vulkan 1.1 + */ +#if 1 +# define subgroupMin(a) a +# define subgroupMax(a) a +# define subgroupOr(a) a +# define subgroupBroadcastFirst(a) a +#endif + +#define LIGHT_FOREACH_BEGIN_DIRECTIONAL(_culling, _index) \ + { \ + { \ + for (uint _index = _culling.local_lights_len; _index < _culling.items_count; _index++) { + +#define LIGHT_FOREACH_BEGIN_LOCAL(_culling, _zbins, _words, _pixel, _linearz, _item_index) \ + { \ + uvec2 tile_co = uvec2(_pixel / _culling.tile_size); \ + uint tile_word_offset = (tile_co.x + tile_co.y * _culling.tile_x_len) * \ + _culling.tile_word_len; \ + int zbin_index = culling_z_to_zbin(_culling.zbin_scale, _culling.zbin_bias, _linearz); \ + zbin_index = clamp(zbin_index, 0, CULLING_ZBIN_COUNT - 1); \ + uint zbin_data = _zbins[zbin_index]; \ + uint min_index = zbin_data & 0xFFFFu; \ + uint max_index = zbin_data >> 16u; \ + /* Ensure all threads inside a subgroup get the same value to reduce VGPR usage. */ \ + min_index = subgroupBroadcastFirst(subgroupMin(min_index)); \ + max_index = subgroupBroadcastFirst(subgroupMax(max_index)); \ + /* Same as divide by 32 but avoid interger division. */ \ + uint word_min = min_index >> 5u; \ + uint word_max = max_index >> 5u; \ + for (uint word_idx = word_min; word_idx <= word_max; word_idx++) { \ + uint word = _words[tile_word_offset + word_idx]; \ + word &= zbin_mask(word_idx, min_index, max_index); \ + /* Ensure all threads inside a subgroup get the same value to reduce VGPR usage. */ \ + word = subgroupBroadcastFirst(subgroupOr(word)); \ + int bit_index; \ + while ((bit_index = findLSB(word)) != -1) { \ + word &= ~1u << uint(bit_index); \ + uint _item_index = word_idx * 32u + bit_index; + +/* No culling. Iterate over all items. */ +#define LIGHT_FOREACH_BEGIN_LOCAL_NO_CULL(_culling, _item_index) \ + { \ + { \ + for (uint _item_index = 0; _item_index < _culling.visible_count; _item_index++) { + +#define LIGHT_FOREACH_END \ + } \ + } \ + } diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_light_lib.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_light_lib.glsl new file mode 100644 index 00000000000..58608f6e1f0 --- /dev/null +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_light_lib.glsl @@ -0,0 +1,209 @@ + +#pragma BLENDER_REQUIRE(common_math_geom_lib.glsl) +#pragma BLENDER_REQUIRE(eevee_ltc_lib.glsl) +#pragma BLENDER_REQUIRE(eevee_light_iter_lib.glsl) + +/* ---------------------------------------------------------------------- */ +/** \name Light Functions + * \{ */ + +void light_vector_get(LightData ld, vec3 P, out vec3 L, out float dist) +{ + if (ld.type == LIGHT_SUN) { + L = ld._back; + dist = 1.0; + } + else { + L = ld._position - P; + dist = inversesqrt(len_squared(L)); + L *= dist; + dist = 1.0 / dist; + } +} + +/* Rotate vector to light's local space. Does not translate. */ +vec3 light_world_to_local(LightData ld, vec3 L) +{ + /* Avoid relying on compiler to optimize this. + * vec3 lL = transpose(mat3(ld.object_mat)) * L; */ + vec3 lL; + lL.x = dot(ld.object_mat[0].xyz, L); + lL.y = dot(ld.object_mat[1].xyz, L); + lL.z = dot(ld.object_mat[2].xyz, L); + return lL; +} + +/* From Frostbite PBR Course + * Distance based attenuation + * http://www.frostbite.com/wp-content/uploads/2014/11/course_notes_moving_frostbite_to_pbr.pdf */ +float light_influence_attenuation(float dist, float inv_sqr_influence) +{ + float factor = sqr(dist) * inv_sqr_influence; + float fac = saturate(1.0 - sqr(factor)); + return sqr(fac); +} + +float light_spot_attenuation(LightData ld, vec3 L) +{ + vec3 lL = light_world_to_local(ld, L); + float ellipse = inversesqrt(1.0 + len_squared(lL.xy * ld.spot_size_inv / lL.z)); + float spotmask = smoothstep(0.0, 1.0, ellipse * ld._spot_mul + ld._spot_bias); + return spotmask; +} + +float light_attenuation(LightData ld, vec3 L, float dist) +{ + float vis = 1.0; + if (ld.type == LIGHT_SPOT) { + vis *= light_spot_attenuation(ld, L); + } + if (ld.type >= LIGHT_SPOT) { + vis *= step(0.0, -dot(L, -ld._back)); + } + if (ld.type != LIGHT_SUN) { +#ifdef VOLUME_LIGHTING + vis *= light_influence_attenuation(dist, ld.influence_radius_invsqr_volume); +#else + vis *= light_influence_attenuation(dist, ld.influence_radius_invsqr_surface); +#endif + } + return vis; +} + +/* Cheaper alternative than evaluating the LTC. + * The result needs to be multiplied by BSDF or Phase Function. */ +float light_point_light(LightData ld, const bool is_directional, vec3 L, float dist) +{ + if (is_directional) { + return 1.0; + } + /** + * Using "Point Light Attenuation Without Singularity" from Cem Yuksel + * http://www.cemyuksel.com/research/pointlightattenuation/pointlightattenuation.pdf + * http://www.cemyuksel.com/research/pointlightattenuation/ + **/ + float d_sqr = sqr(dist); + float r_sqr = ld.radius_squared; + /* Using reformulation that has better numerical percision. */ + float power = 2.0 / (d_sqr + r_sqr + dist * sqrt(d_sqr + r_sqr)); + + if (is_area_light(ld.type)) { + /* Modulate by light plane orientation / solid angle. */ + power *= saturate(dot(ld._back, L)); + } + return power; +} + +float light_diffuse(sampler2DArray utility_tx, + const bool is_directional, + LightData ld, + vec3 N, + vec3 V, + vec3 L, + float dist) +{ + if (is_directional || !is_area_light(ld.type)) { + float radius = ld._radius / dist; + return ltc_evaluate_disk_simple(utility_tx, radius, dot(N, L)); + } + else if (ld.type == LIGHT_RECT) { + vec3 corners[4]; + corners[0] = ld._right * ld._area_size_x + ld._up * -ld._area_size_y; + corners[1] = ld._right * ld._area_size_x + ld._up * ld._area_size_y; + corners[2] = -corners[0]; + corners[3] = -corners[1]; + + corners[0] = normalize(L * dist + corners[0]); + corners[1] = normalize(L * dist + corners[1]); + corners[2] = normalize(L * dist + corners[2]); + corners[3] = normalize(L * dist + corners[3]); + + return ltc_evaluate_quad(utility_tx, corners, N); + } + else /* (ld.type == LIGHT_ELLIPSE) */ { + vec3 points[3]; + points[0] = ld._right * -ld._area_size_x + ld._up * -ld._area_size_y; + points[1] = ld._right * ld._area_size_x + ld._up * -ld._area_size_y; + points[2] = -points[0]; + + points[0] += L * dist; + points[1] += L * dist; + points[2] += L * dist; + + return ltc_evaluate_disk(utility_tx, N, V, mat3(1.0), points); + } +} + +float light_ltc(sampler2DArray utility_tx, + const bool is_directional, + LightData ld, + vec3 N, + vec3 V, + vec3 L, + float dist, + vec4 ltc_mat) +{ + if (is_directional || ld.type != LIGHT_RECT) { + vec3 Px = ld._right; + vec3 Py = ld._up; + + if (is_directional || !is_area_light(ld.type)) { + make_orthonormal_basis(L, Px, Py); + } + + vec3 points[3]; + points[0] = Px * -ld._area_size_x + Py * -ld._area_size_y; + points[1] = Px * ld._area_size_x + Py * -ld._area_size_y; + points[2] = -points[0]; + + points[0] += L * dist; + points[1] += L * dist; + points[2] += L * dist; + + return ltc_evaluate_disk(utility_tx, N, V, ltc_matrix(ltc_mat), points); + } + else { + vec3 corners[4]; + corners[0] = ld._right * ld._area_size_x + ld._up * -ld._area_size_y; + corners[1] = ld._right * ld._area_size_x + ld._up * ld._area_size_y; + corners[2] = -corners[0]; + corners[3] = -corners[1]; + + corners[0] += L * dist; + corners[1] += L * dist; + corners[2] += L * dist; + corners[3] += L * dist; + + ltc_transform_quad(N, V, ltc_matrix(ltc_mat), corners); + + return ltc_evaluate_quad(utility_tx, corners, vec3(0.0, 0.0, 1.0)); + } +} + +vec3 light_translucent(sampler1D transmittance_tx, + const bool is_directional, + LightData ld, + vec3 N, + vec3 L, + float dist, + vec3 sss_radius, + float delta) +{ + /* TODO(fclem): We should compute the power at the entry point. */ + /* NOTE(fclem): we compute the light attenuation using the light vector but the transmittance + * using the shadow depth delta. */ + float power = light_point_light(ld, is_directional, L, dist); + /* Do not add more energy on front faces. Also apply lambertian BSDF. */ + power *= max(0.0, dot(-N, L)) * M_1_PI; + + sss_radius *= SSS_TRANSMIT_LUT_RADIUS; + vec3 channels_co = saturate(delta / sss_radius) * SSS_TRANSMIT_LUT_SCALE + SSS_TRANSMIT_LUT_BIAS; + + vec3 translucency; + translucency.x = (sss_radius.x > 0.0) ? texture(transmittance_tx, channels_co.x).r : 0.0; + translucency.y = (sss_radius.y > 0.0) ? texture(transmittance_tx, channels_co.y).r : 0.0; + translucency.z = (sss_radius.z > 0.0) ? texture(transmittance_tx, channels_co.z).r : 0.0; + return translucency * power; +} + +/** \} */ diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_ltc_lib.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_ltc_lib.glsl new file mode 100644 index 00000000000..57e92b0b9b4 --- /dev/null +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_ltc_lib.glsl @@ -0,0 +1,299 @@ + +/** + * Adapted from : + * Real-Time Polygonal-Light Shading with Linearly Transformed Cosines. + * Eric Heitz, Jonathan Dupuy, Stephen Hill and David Neubelt. + * ACM Transactions on Graphics (Proceedings of ACM SIGGRAPH 2016) 35(4), 2016. + * Project page: https://eheitzresearch.wordpress.com/415-2/ + */ + +/* Diffuse *clipped* sphere integral. */ +float ltc_diffuse_sphere_integral(sampler2DArray utility_tx, float avg_dir_z, float form_factor) +{ +#if 1 + /* use tabulated horizon-clipped sphere */ + vec2 uv = vec2(avg_dir_z * 0.5 + 0.5, form_factor); + uv = uv * UTIL_TEX_UV_SCALE + UTIL_TEX_UV_BIAS; + + return texture(utility_tx, vec3(uv, UTIL_DISK_INTEGRAL_LAYER))[UTIL_DISK_INTEGRAL_COMP]; +#else + /* Cheap approximation. Less smooth and have energy issues. */ + return max((form_factor * form_factor + avg_dir_z) / (form_factor + 1.0), 0.0); +#endif +} + +/** + * An extended version of the implementation from + * "How to solve a cubic equation, revisited" + * http://momentsingraphics.de/?p=105 + */ +vec3 ltc_solve_cubic(vec4 coefs) +{ + /* Normalize the polynomial */ + coefs.xyz /= coefs.w; + /* Divide middle coefficients by three */ + coefs.yz /= 3.0; + + float A = coefs.w; + float B = coefs.z; + float C = coefs.y; + float D = coefs.x; + + /* Compute the Hessian and the discriminant */ + vec3 delta = vec3(-coefs.zy * coefs.zz + coefs.yx, dot(vec2(coefs.z, -coefs.y), coefs.xy)); + + /* Discriminant */ + float discr = dot(vec2(4.0 * delta.x, -delta.y), delta.zy); + + /* Clamping avoid NaN output on some platform. (see T67060) */ + float sqrt_discr = sqrt(clamp(discr, 0.0, FLT_MAX)); + + vec2 xlc, xsc; + + /* Algorithm A */ + { + float A_a = 1.0; + float C_a = delta.x; + float D_a = -2.0 * B * delta.x + delta.y; + + /* Take the cubic root of a normalized complex number */ + float theta = atan(sqrt_discr, -D_a) / 3.0; + + float _2_sqrt_C_a = 2.0 * sqrt(-C_a); + float x_1a = _2_sqrt_C_a * cos(theta); + float x_3a = _2_sqrt_C_a * cos(theta + (2.0 / 3.0) * M_PI); + + float xl; + if ((x_1a + x_3a) > 2.0 * B) { + xl = x_1a; + } + else { + xl = x_3a; + } + + xlc = vec2(xl - B, A); + } + + /* Algorithm D */ + { + float A_d = D; + float C_d = delta.z; + float D_d = -D * delta.y + 2.0 * C * delta.z; + + /* Take the cubic root of a normalized complex number */ + float theta = atan(D * sqrt_discr, -D_d) / 3.0; + + float _2_sqrt_C_d = 2.0 * sqrt(-C_d); + float x_1d = _2_sqrt_C_d * cos(theta); + float x_3d = _2_sqrt_C_d * cos(theta + (2.0 / 3.0) * M_PI); + + float xs; + if (x_1d + x_3d < 2.0 * C) { + xs = x_1d; + } + else { + xs = x_3d; + } + + xsc = vec2(-D, xs + C); + } + + float E = xlc.y * xsc.y; + float F = -xlc.x * xsc.y - xlc.y * xsc.x; + float G = xlc.x * xsc.x; + + vec2 xmc = vec2(C * F - B * G, -B * F + C * E); + + vec3 root = vec3(xsc.x / xsc.y, xmc.x / xmc.y, xlc.x / xlc.y); + + if (root.x < root.y && root.x < root.z) { + root.xyz = root.yxz; + } + else if (root.z < root.x && root.z < root.y) { + root.xyz = root.xzy; + } + + return root; +} + +/* from Real-Time Area Lighting: a Journey from Research to Production + * Stephen Hill and Eric Heitz */ +vec3 ltc_edge_integral_vec(vec3 v1, vec3 v2) +{ + float x = dot(v1, v2); + float y = abs(x); + + float a = 0.8543985 + (0.4965155 + 0.0145206 * y) * y; + float b = 3.4175940 + (4.1616724 + y) * y; + float v = a / b; + + float theta_sintheta = (x > 0.0) ? v : 0.5 * inversesqrt(max(1.0 - x * x, 1e-7)) - v; + + return cross(v1, v2) * theta_sintheta; +} + +mat3 ltc_matrix(vec4 lut) +{ + /* Load inverse matrix. */ + return mat3(vec3(lut.x, 0, lut.y), vec3(0, 1, 0), vec3(lut.z, 0, lut.w)); +} + +void ltc_transform_quad(vec3 N, vec3 V, mat3 Minv, inout vec3 corners[4]) +{ + /* Avoid dot(N, V) == 1 in ortho mode, leading T1 normalize to fail. */ + V = normalize(V + 1e-8); + + /* Construct orthonormal basis around N. */ + vec3 T1, T2; + T1 = normalize(V - N * dot(N, V)); + T2 = cross(N, T1); + + /* Rotate area light in (T1, T2, R) basis. */ + Minv = Minv * transpose(mat3(T1, T2, N)); + + /* Apply LTC inverse matrix. */ + corners[0] = normalize(Minv * corners[0]); + corners[1] = normalize(Minv * corners[1]); + corners[2] = normalize(Minv * corners[2]); + corners[3] = normalize(Minv * corners[3]); +} + +/* If corners have already pass through ltc_transform_quad(), + * then N **MUST** be vec3(0.0, 0.0, 1.0), corresponding to the Up axis of the shading basis. */ +float ltc_evaluate_quad(sampler2DArray utility_tx, vec3 corners[4], vec3 N) +{ + /* Approximation using a sphere of the same solid angle than the quad. + * Finding the clipped sphere diffuse integral is easier than clipping the quad. */ + vec3 avg_dir; + avg_dir = ltc_edge_integral_vec(corners[0], corners[1]); + avg_dir += ltc_edge_integral_vec(corners[1], corners[2]); + avg_dir += ltc_edge_integral_vec(corners[2], corners[3]); + avg_dir += ltc_edge_integral_vec(corners[3], corners[0]); + + float form_factor = length(avg_dir); + float avg_dir_z = dot(N, avg_dir / form_factor); + return form_factor * ltc_diffuse_sphere_integral(utility_tx, avg_dir_z, form_factor); +} + +/* If disk does not need to be transformed and is already front facing. */ +float ltc_evaluate_disk_simple(sampler2DArray utility_tx, float disk_radius, float NL) +{ + float r_sqr = disk_radius * disk_radius; + float one_r_sqr = 1.0 + r_sqr; + float form_factor = r_sqr * inversesqrt(one_r_sqr * one_r_sqr); + return form_factor * ltc_diffuse_sphere_integral(utility_tx, NL, form_factor); +} + +/* disk_points are WS vectors from the shading point to the disk "bounding domain" */ +float ltc_evaluate_disk(sampler2DArray utility_tx, vec3 N, vec3 V, mat3 Minv, vec3 disk_points[3]) +{ + /* Avoid dot(N, V) == 1 in ortho mode, leading T1 normalize to fail. */ + V = normalize(V + 1e-8); + + /* construct orthonormal basis around N */ + vec3 T1, T2; + T1 = normalize(V - N * dot(V, N)); + T2 = cross(N, T1); + + /* rotate area light in (T1, T2, R) basis */ + mat3 R = transpose(mat3(T1, T2, N)); + + /* Intermediate step: init ellipse. */ + vec3 L_[3]; + L_[0] = mul(R, disk_points[0]); + L_[1] = mul(R, disk_points[1]); + L_[2] = mul(R, disk_points[2]); + + vec3 C = 0.5 * (L_[0] + L_[2]); + vec3 V1 = 0.5 * (L_[1] - L_[2]); + vec3 V2 = 0.5 * (L_[1] - L_[0]); + + /* Transform ellipse by Minv. */ + C = Minv * C; + V1 = Minv * V1; + V2 = Minv * V2; + + /* Compute eigenvectors of new ellipse. */ + + float d11 = dot(V1, V1); + float d22 = dot(V2, V2); + float d12 = dot(V1, V2); + float a, b; /* Eigenvalues */ + const float threshold = 0.0007; /* Can be adjusted. Fix artifacts. */ + if (abs(d12) / sqrt(d11 * d22) > threshold) { + float tr = d11 + d22; + float det = -d12 * d12 + d11 * d22; + + /* use sqrt matrix to solve for eigenvalues */ + det = sqrt(det); + float u = 0.5 * sqrt(tr - 2.0 * det); + float v = 0.5 * sqrt(tr + 2.0 * det); + float e_max = (u + v); + float e_min = (u - v); + e_max *= e_max; + e_min *= e_min; + + vec3 V1_, V2_; + if (d11 > d22) { + V1_ = d12 * V1 + (e_max - d11) * V2; + V2_ = d12 * V1 + (e_min - d11) * V2; + } + else { + V1_ = d12 * V2 + (e_max - d22) * V1; + V2_ = d12 * V2 + (e_min - d22) * V1; + } + + a = 1.0 / e_max; + b = 1.0 / e_min; + V1 = normalize(V1_); + V2 = normalize(V2_); + } + else { + a = 1.0 / d11; + b = 1.0 / d22; + V1 *= sqrt(a); + V2 *= sqrt(b); + } + + /* Now find front facing ellipse with same solid angle. */ + + vec3 V3 = normalize(cross(V1, V2)); + if (dot(C, V3) < 0.0) { + V3 *= -1.0; + } + + float L = dot(V3, C); + float inv_L = 1.0 / L; + float x0 = dot(V1, C) * inv_L; + float y0 = dot(V2, C) * inv_L; + + float L_sqr = L * L; + a *= L_sqr; + b *= L_sqr; + + float t = 1.0 + x0 * x0; + float c0 = a * b; + float c1 = c0 * (t + y0 * y0) - a - b; + float c2 = (1.0 - a * t) - b * (1.0 + y0 * y0); + float c3 = 1.0; + + vec3 roots = ltc_solve_cubic(vec4(c0, c1, c2, c3)); + float e1 = roots.x; + float e2 = roots.y; + float e3 = roots.z; + + vec3 avg_dir = vec3(a * x0 / (a - e2), b * y0 / (b - e2), 1.0); + + mat3 rotate = mat3(V1, V2, V3); + + avg_dir = rotate * avg_dir; + avg_dir = normalize(avg_dir); + + /* L1, L2 are the extends of the front facing ellipse. */ + float L1 = sqrt(-e2 / e3); + float L2 = sqrt(-e2 / e1); + + /* Find the sphere and compute lighting. */ + float form_factor = max(0.0, L1 * L2 * inversesqrt((1.0 + L1 * L1) * (1.0 + L2 * L2))); + return form_factor * ltc_diffuse_sphere_integral(utility_tx, avg_dir.z, form_factor); +} diff --git a/source/blender/draw/engines/eevee_next/shaders/infos/eevee_light_culling_info.hh b/source/blender/draw/engines/eevee_next/shaders/infos/eevee_light_culling_info.hh new file mode 100644 index 00000000000..56fda25ed13 --- /dev/null +++ b/source/blender/draw/engines/eevee_next/shaders/infos/eevee_light_culling_info.hh @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include "eevee_defines.hh" +#include "gpu_shader_create_info.hh" + +/* -------------------------------------------------------------------- */ +/** \name Shared + * \{ */ + +GPU_SHADER_CREATE_INFO(eevee_light_data) + .storage_buf(0, Qualifier::READ, "LightCullingData", "light_cull_buf") + .storage_buf(1, Qualifier::READ, "LightData", "light_buf[]") + .storage_buf(2, Qualifier::READ, "uint", "light_zbin_buf[]") + .storage_buf(3, Qualifier::READ, "uint", "light_tile_buf[]"); + +/** \} */ + +/* -------------------------------------------------------------------- */ +/** \name Culling + * \{ */ + +GPU_SHADER_CREATE_INFO(eevee_light_culling_select) + .do_static_compilation(true) + .additional_info("eevee_shared", "draw_view") + .local_group_size(CULLING_SELECT_GROUP_SIZE) + .storage_buf(0, Qualifier::READ_WRITE, "LightCullingData", "light_cull_buf") + .storage_buf(1, Qualifier::READ, "LightData", "in_light_buf[]") + .storage_buf(2, Qualifier::WRITE, "LightData", "out_light_buf[]") + .storage_buf(3, Qualifier::WRITE, "float", "out_zdist_buf[]") + .storage_buf(4, Qualifier::WRITE, "uint", "out_key_buf[]") + .compute_source("eevee_light_culling_select_comp.glsl"); + +GPU_SHADER_CREATE_INFO(eevee_light_culling_sort) + .do_static_compilation(true) + .additional_info("eevee_shared", "draw_view") + .storage_buf(0, Qualifier::READ, "LightCullingData", "light_cull_buf") + .storage_buf(1, Qualifier::READ, "LightData", "in_light_buf[]") + .storage_buf(2, Qualifier::WRITE, "LightData", "out_light_buf[]") + .storage_buf(3, Qualifier::READ, "float", "in_zdist_buf[]") + .storage_buf(4, Qualifier::READ, "uint", "in_key_buf[]") + .local_group_size(CULLING_SORT_GROUP_SIZE) + .compute_source("eevee_light_culling_sort_comp.glsl"); + +GPU_SHADER_CREATE_INFO(eevee_light_culling_zbin) + .do_static_compilation(true) + .additional_info("eevee_shared", "draw_view") + .local_group_size(CULLING_ZBIN_GROUP_SIZE) + .storage_buf(0, Qualifier::READ, "LightCullingData", "light_cull_buf") + .storage_buf(1, Qualifier::READ, "LightData", "light_buf[]") + .storage_buf(2, Qualifier::WRITE, "uint", "out_zbin_buf[]") + .compute_source("eevee_light_culling_zbin_comp.glsl"); + +GPU_SHADER_CREATE_INFO(eevee_light_culling_tile) + .do_static_compilation(true) + .additional_info("eevee_shared", "draw_view") + .local_group_size(CULLING_TILE_GROUP_SIZE) + .storage_buf(0, Qualifier::READ, "LightCullingData", "light_cull_buf") + .storage_buf(1, Qualifier::READ, "LightData", "light_buf[]") + .storage_buf(2, Qualifier::WRITE, "uint", "out_light_tile_buf[]") + .compute_source("eevee_light_culling_tile_comp.glsl"); + +/** \} */ + +/* -------------------------------------------------------------------- */ +/** \name Debug + * \{ */ + +GPU_SHADER_CREATE_INFO(eevee_light_culling_debug) + .do_static_compilation(true) + .sampler(0, ImageType::DEPTH_2D, "depth_tx") + .fragment_out(0, Type::VEC4, "out_debug_color") + .additional_info("eevee_shared", "draw_view") + .fragment_source("eevee_light_culling_debug_frag.glsl") + .additional_info("draw_fullscreen", "eevee_light_data"); + +/** \} */ |