19 files changed, 552 insertions, 205 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index f6b4b963a7a..ea0f16c9233 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -93,6 +93,7 @@ set(SRC_BVH_HEADERS
   bvh/bvh_local.h
   bvh/bvh_traversal.h
   bvh/bvh_types.h
+  bvh/bvh_util.h
   bvh/bvh_volume.h
   bvh/bvh_volume_all.h
   bvh/bvh_embree.h
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 3049f243ae9..3a3f38539c5 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -29,9 +29,10 @@
 #  include "kernel/bvh/bvh_embree.h"
 #endif
 
-CCL_NAMESPACE_BEGIN
-
 #include "kernel/bvh/bvh_types.h"
+#include "kernel/bvh/bvh_util.h"
+
+CCL_NAMESPACE_BEGIN
 
 #ifndef __KERNEL_OPTIX__
 
@@ -533,97 +534,4 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals *kg,
 }
 #endif /* __VOLUME_RECORD_ALL__ */
 
-/* Ray offset to avoid self intersection.
- *
- * This function should be used to compute a modified ray start position for
- * rays leaving from a surface. */
-
-ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
-{
-#ifdef __INTERSECTION_REFINE__
-  const float epsilon_f = 1e-5f;
-  /* ideally this should match epsilon_f, but instancing and motion blur
-   * precision makes it problematic */
-  const float epsilon_test = 1.0f;
-  const int epsilon_i = 32;
-
-  float3 res;
-
-  /* x component */
-  if (fabsf(P.x) < epsilon_test) {
-    res.x = P.x + Ng.x * epsilon_f;
-  }
-  else {
-    uint ix = __float_as_uint(P.x);
-    ix += ((ix ^ __float_as_uint(Ng.x)) >> 31) ? -epsilon_i : epsilon_i;
-    res.x = __uint_as_float(ix);
-  }
-
-  /* y component */
-  if (fabsf(P.y) < epsilon_test) {
-    res.y = P.y + Ng.y * epsilon_f;
-  }
-  else {
-    uint iy = __float_as_uint(P.y);
-    iy += ((iy ^ __float_as_uint(Ng.y)) >> 31) ? -epsilon_i : epsilon_i;
-    res.y = __uint_as_float(iy);
-  }
-
-  /* z component */
-  if (fabsf(P.z) < epsilon_test) {
-    res.z = P.z + Ng.z * epsilon_f;
-  }
-  else {
-    uint iz = __float_as_uint(P.z);
-    iz += ((iz ^ __float_as_uint(Ng.z)) >> 31) ? -epsilon_i : epsilon_i;
-    res.z = __uint_as_float(iz);
-  }
-
-  return res;
-#else
-  const float epsilon_f = 1e-4f;
-  return P + epsilon_f * Ng;
-#endif
-}
-
-#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__))
-/* ToDo: Move to another file? */
-ccl_device int intersections_compare(const void *a, const void *b)
-{
-  const Intersection *isect_a = (const Intersection *)a;
-  const Intersection *isect_b = (const Intersection *)b;
-
-  if (isect_a->t < isect_b->t)
-    return -1;
-  else if (isect_a->t > isect_b->t)
-    return 1;
-  else
-    return 0;
-}
-#endif
-
-#if defined(__SHADOW_RECORD_ALL__)
-ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
-{
-#  ifdef __KERNEL_GPU__
-  /* Use bubble sort which has more friendly memory pattern on GPU. */
-  bool swapped;
-  do {
-    swapped = false;
-    for (int j = 0; j < num_hits - 1; ++j) {
-      if (hits[j].t > hits[j + 1].t) {
-        struct Intersection tmp = hits[j];
-        hits[j] = hits[j + 1];
-        hits[j + 1] = tmp;
-        swapped = true;
-      }
-    }
-    --num_hits;
-  } while (swapped);
-#  else
-  qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
-#  endif
-}
-#endif /* __SHADOW_RECORD_ALL__ | __VOLUME_RECORD_ALL__ */
-
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index dccd257d2de..2e94b1d7c37 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -180,25 +180,10 @@ ccl_device_inline
 
               /* todo: optimize so primitive visibility flag indicates if
                * the primitive has a transparent shadow shader? */
-              int prim = kernel_tex_fetch(__prim_index, isect_array->prim);
-              int shader = 0;
-
-#ifdef __HAIR__
-              if (kernel_tex_fetch(__prim_type, isect_array->prim) & PRIMITIVE_ALL_TRIANGLE)
-#endif
-              {
-                shader = kernel_tex_fetch(__tri_shader, prim);
-              }
-#ifdef __HAIR__
-              else {
-                float4 str = kernel_tex_fetch(__curves, prim);
-                shader = __float_as_int(str.z);
-              }
-#endif
-              int flag = kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
+              const int flags = intersection_get_shader_flags(kg, isect_array);
 
               /* if no transparent shadows, all light is blocked */
-              if (!(flag & SD_HAS_TRANSPARENT_SHADOW)) {
+              if (!(flags & SD_HAS_TRANSPARENT_SHADOW)) {
                 return true;
               }
               /* if maximum number of hits reached, block all light */
diff --git a/intern/cycles/kernel/bvh/bvh_util.h b/intern/cycles/kernel/bvh/bvh_util.h
new file mode 100644
index 00000000000..a694e4dc259
--- /dev/null
+++ b/intern/cycles/kernel/bvh/bvh_util.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright 2011-2013 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/* Ray offset to avoid self intersection.
+ *
+ * Computes a modified ray start position for rays leaving from a surface:
+ * returns P with each component pushed in the direction of Ng. */
+
+ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
+{
+#ifdef __INTERSECTION_REFINE__
+  const float epsilon_f = 1e-5f; /* Scale of the Ng offset for small coordinates. */
+  /* Ideally this should match epsilon_f, but instancing and motion blur
+   * precision make it problematic. */
+  const float epsilon_test = 1.0f;
+  const int epsilon_i = 32; /* Step applied to the float bit pattern otherwise. */
+
+  float3 res;
+
+  /* x component: add Ng.x * epsilon_f when small, else step the bits by the sign of Ng.x. */
+  if (fabsf(P.x) < epsilon_test) {
+    res.x = P.x + Ng.x * epsilon_f;
+  }
+  else {
+    uint ix = __float_as_uint(P.x);
+    ix += ((ix ^ __float_as_uint(Ng.x)) >> 31) ? -epsilon_i : epsilon_i;
+    res.x = __uint_as_float(ix);
+  }
+
+  /* y component, same scheme as x. */
+  if (fabsf(P.y) < epsilon_test) {
+    res.y = P.y + Ng.y * epsilon_f;
+  }
+  else {
+    uint iy = __float_as_uint(P.y);
+    iy += ((iy ^ __float_as_uint(Ng.y)) >> 31) ? -epsilon_i : epsilon_i;
+    res.y = __uint_as_float(iy);
+  }
+
+  /* z component, same scheme as x. */
+  if (fabsf(P.z) < epsilon_test) {
+    res.z = P.z + Ng.z * epsilon_f;
+  }
+  else {
+    uint iz = __float_as_uint(P.z);
+    iz += ((iz ^ __float_as_uint(Ng.z)) >> 31) ? -epsilon_i : epsilon_i;
+    res.z = __uint_as_float(iz);
+  }
+
+  return res;
+#else
+  const float epsilon_f = 1e-4f;
+  return P + epsilon_f * Ng;
+#endif
+}
+
+#if defined(__VOLUME_RECORD_ALL__) || (defined(__SHADOW_RECORD_ALL__) && defined(__KERNEL_CPU__))
+/* qsort() comparator ordering intersections by increasing t. ToDo: Move to another file? */
+ccl_device int intersections_compare(const void *a, const void *b)
+{
+  const Intersection *isect_a = (const Intersection *)a;
+  const Intersection *isect_b = (const Intersection *)b;
+
+  if (isect_a->t < isect_b->t)
+    return -1;
+  else if (isect_a->t > isect_b->t)
+    return 1;
+  else
+    return 0; /* Equal t, or a comparison involving NaN. */
+}
+#endif
+
+#if defined(__SHADOW_RECORD_ALL__)
+ccl_device_inline void sort_intersections(Intersection *hits, uint num_hits)
+{
+  kernel_assert(num_hits > 0); /* The unsigned num_hits - 1 below would wrap around for zero. */
+
+#  ifdef __KERNEL_GPU__
+  /* Use bubble sort, which has a more friendly memory pattern on GPU. */
+  bool swapped;
+  do {
+    swapped = false;
+    for (int j = 0; j < num_hits - 1; ++j) {
+      if (hits[j].t > hits[j + 1].t) {
+        struct Intersection tmp = hits[j];
+        hits[j] = hits[j + 1];
+        hits[j + 1] = tmp;
+        swapped = true;
+      }
+    }
+    --num_hits; /* The farthest remaining hit is in place now, skip it on the next pass. */
+  } while (swapped);
+#  else
+  qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
+#  endif
+}
+#endif /* __SHADOW_RECORD_ALL__ */
+
+/* Utility to quickly get the shader flags of the intersected triangle or (__HAIR__) curve. */
+
+ccl_device_forceinline int intersection_get_shader_flags(KernelGlobals *ccl_restrict kg,
+                                                         const Intersection *isect)
+{
+  const int prim = kernel_tex_fetch(__prim_index, isect->prim);
+  int shader = 0;
+
+#ifdef __HAIR__
+  if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+  {
+    shader = kernel_tex_fetch(__tri_shader, prim);
+  }
+#ifdef __HAIR__
+  else {
+    float4 str = kernel_tex_fetch(__curves, prim);
+    shader = __float_as_int(str.z); /* Taken from the z component of the __curves entry. */
+  }
+#endif
+
+  return kernel_tex_fetch(__shaders, (shader & SHADER_MASK)).flags;
+}
+
+ccl_device_forceinline int intersection_get_shader(KernelGlobals *ccl_restrict kg,
+                                                   const Intersection *isect)
+{
+  const int prim = kernel_tex_fetch(__prim_index, isect->prim);
+  int shader = 0;
+
+#ifdef __HAIR__
+  if (kernel_tex_fetch(__prim_type, isect->prim) & PRIMITIVE_ALL_TRIANGLE)
+#endif
+  {
+    shader = kernel_tex_fetch(__tri_shader, prim);
+  }
+#ifdef __HAIR__
+  else {
+    float4 str = kernel_tex_fetch(__curves, prim);
+    shader = __float_as_int(str.z); /* Taken from the z component of the __curves entry. */
+  }
+#endif
+
+  return shader & SHADER_MASK; /* Masked value, as used to index __shaders for the flags. */
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/closure/alloc.h b/intern/cycles/kernel/closure/alloc.h
index 341d1e16eb1..99a5a675976 100644
--- a/intern/cycles/kernel/closure/alloc.h
+++ b/intern/cycles/kernel/closure/alloc.h
@@ -57,14 +57,24 @@ ccl_device ccl_addr_space void *closure_alloc_extra(ShaderData *sd, int size)
 
 ccl_device_inline ShaderClosure *bsdf_alloc(ShaderData *sd, int size, float3 weight)
 {
-  ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
+  kernel_assert(isfinite3_safe(weight));
 
-  if (sc == NULL)
-    return NULL;
+  const float sample_weight = fabsf(average(weight));
+
+  /* Use comparison this way to help deal with non-finite weight: if the average is NaN the
+   * comparison is false and no new closure is allocated. */
+  if (sample_weight >= CLOSURE_WEIGHT_CUTOFF) {
+    ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
+    if (sc == NULL) {
+      return NULL;
+    }
+
+    sc->sample_weight = sample_weight;
 
-  float sample_weight = fabsf(average(weight));
-  sc->sample_weight = sample_weight;
-  return (sample_weight >= CLOSURE_WEIGHT_CUTOFF) ? sc : NULL;
+    return sc;
+  }
+
+  return NULL;
 }
 
 #ifdef __OSL__
@@ -73,17 +83,27 @@ ccl_device_inline ShaderClosure *bsdf_alloc_osl(ShaderData *sd,
                                                 float3 weight,
                                                 void *data)
 {
-  ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
+  kernel_assert(isfinite3_safe(weight));
 
-  if (!sc)
-    return NULL;
+  const float sample_weight = fabsf(average(weight));
 
-  memcpy((void *)sc, data, size);
+  /* Use comparison this way to help deal with non-finite weight: if the average is NaN the
+   * comparison is false and no new closure is allocated. */
+  if (sample_weight >= CLOSURE_WEIGHT_CUTOFF) {
+    ShaderClosure *sc = closure_alloc(sd, size, CLOSURE_NONE_ID, weight);
+    if (!sc) {
+      return NULL;
+    }
 
-  float sample_weight = fabsf(average(weight));
-  sc->weight = weight;
-  sc->sample_weight = sample_weight;
-  return (sample_weight >= CLOSURE_WEIGHT_CUTOFF) ? sc : NULL;
+    memcpy((void *)sc, data, size);
+
+    sc->weight = weight;
+    sc->sample_weight = sample_weight;
+
+    return sc;
+  }
+
+  return NULL;
 }
 #endif
 
diff --git a/intern/cycles/kernel/kernel_light.h b/intern/cycles/kernel/kernel_light.h
index 9650b85a5c2..42a834d2ce3 100644
--- a/intern/cycles/kernel/kernel_light.h
+++ b/intern/cycles/kernel/kernel_light.h
@@ -119,11 +119,11 @@ ccl_device_inline bool lamp_light_sample(
           klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
       float3 axisv = make_float3(
           klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
-      float3 D = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
+      float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
       float invarea = fabsf(klight->area.invarea);
       bool is_round = (klight->area.invarea < 0.0f);
 
-      if (dot(ls->P - P, D) > 0.0f) {
+      if (dot(ls->P - P, Ng) > 0.0f) {
         return false;
       }
 
@@ -136,19 +136,37 @@ ccl_device_inline bool lamp_light_sample(
       }
       else {
         inplane = ls->P;
-        ls->pdf = rect_light_sample(P, &ls->P, axisu, axisv, randu, randv, true);
+
+        float3 sample_axisu = axisu;
+        float3 sample_axisv = axisv;
+
+        if (klight->area.tan_spread > 0.0f) {
+          if (!light_spread_clamp_area_light(
+                  P, Ng, &ls->P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) {
+            return false;
+          }
+        }
+
+        ls->pdf = rect_light_sample(P, &ls->P, sample_axisu, sample_axisv, randu, randv, true);
         inplane = ls->P - inplane;
       }
 
       ls->u = dot(inplane, axisu) * (1.0f / dot(axisu, axisu)) + 0.5f;
       ls->v = dot(inplane, axisv) * (1.0f / dot(axisv, axisv)) + 0.5f;
 
-      ls->Ng = D;
+      ls->Ng = Ng;
       ls->D = normalize_len(ls->P - P, &ls->t);
 
       ls->eval_fac = 0.25f * invarea;
+
+      if (klight->area.tan_spread > 0.0f) {
+        /* Area Light spread angle attenuation */
+        ls->eval_fac *= light_spread_attenuation(
+            ls->D, ls->Ng, klight->area.tan_spread, klight->area.normalize_spread);
+      }
+
       if (is_round) {
-        ls->pdf *= lamp_light_pdf(kg, D, -ls->D, ls->t);
+        ls->pdf *= lamp_light_pdf(kg, Ng, -ls->D, ls->t);
       }
     }
   }
@@ -283,9 +301,28 @@ ccl_device bool lamp_light_eval(
       ls->pdf = invarea * lamp_light_pdf(kg, Ng, -D, ls->t);
     }
     else {
-      ls->pdf = rect_light_sample(P, &light_P, axisu, axisv, 0, 0, false);
+      float3 sample_axisu = axisu;
+      float3 sample_axisv = axisv;
+
+      if (klight->area.tan_spread > 0.0f) {
+        if (!light_spread_clamp_area_light(
+                P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) {
+          return false;
+        }
+      }
+
+      ls->pdf = rect_light_sample(P, &light_P, sample_axisu, sample_axisv, 0, 0, false);
     }
     ls->eval_fac = 0.25f * invarea;
+
+    if (klight->area.tan_spread > 0.0f) {
+      /* Area Light spread angle attenuation */
+      ls->eval_fac *= light_spread_attenuation(
+          ls->D, ls->Ng, klight->area.tan_spread, klight->area.normalize_spread);
+      if (ls->eval_fac == 0.0f) {
+        return false;
+      }
+    }
   }
   else {
     return false;
diff --git a/intern/cycles/kernel/kernel_light_common.h b/intern/cycles/kernel/kernel_light_common.h
index 39503a4b479..4a683d36226 100644
--- a/intern/cycles/kernel/kernel_light_common.h
+++ b/intern/cycles/kernel/kernel_light_common.h
@@ -146,6 +146,70 @@ ccl_device float spot_light_attenuation(float3 dir, float spot_angle, float spot
   return attenuation;
 }
 
+ccl_device float light_spread_attenuation(const float3 D,
+                                          const float3 lightNg,
+                                          const float tan_spread,
+                                          const float normalize_spread)
+{
+  /* Model a soft-box grid, computing the ratio of light not hidden by the
+   * slats of the grid at a given angle (see D10594), clamped to be non-negative. */
+  const float cos_a = -dot(D, lightNg); /* Assumes D and lightNg are unit length. */
+  const float sin_a = safe_sqrtf(1.0f - sqr(cos_a));
+  const float tan_a = sin_a / cos_a;
+  return max((1.0f - (tan_spread * tan_a)) * normalize_spread, 0.0f);
+}
+
+/* Compute subset of area light that actually has an influence on the shading point, to reduce
+ * noise with low spread. Updates lightP/axisu/axisv in place; returns false if empty. */
+ccl_device bool light_spread_clamp_area_light(const float3 P,
+                                              const float3 lightNg,
+                                              float3 *lightP,
+                                              float3 *axisu,
+                                              float3 *axisv,
+                                              const float tan_spread)
+{
+  /* Closest point in area light plane and distance to that plane. */
+  const float3 closest_P = P - dot(lightNg, P - *lightP) * lightNg;
+  const float t = len(closest_P - P);
+
+  /* Radius of circle on area light that actually affects the shading point. */
+  const float radius = t / tan_spread;
+
+  /* TODO: would be faster to store as normalized vector + length, also in rect_light_sample. */
+  float len_u, len_v;
+  const float3 u = normalize_len(*axisu, &len_u);
+  const float3 v = normalize_len(*axisv, &len_v);
+
+  /* Local uv coordinates of closest point. */
+  const float closest_u = dot(u, closest_P - *lightP);
+  const float closest_v = dot(v, closest_P - *lightP);
+
+  /* Compute rectangle encompassing the circle that affects the shading point,
+   * clamped to the bounds of the area light. */
+  const float min_u = max(closest_u - radius, -len_u * 0.5f);
+  const float max_u = min(closest_u + radius, len_u * 0.5f);
+  const float min_v = max(closest_v - radius, -len_v * 0.5f);
+  const float max_v = min(closest_v + radius, len_v * 0.5f);
+
+  /* Skip if rectangle is empty. */
+  if (min_u >= max_u || min_v >= max_v) {
+    return false;
+  }
+
+  /* Compute new area light center position and axes from rectangle in local
+   * uv coordinates. */
+  const float new_center_u = 0.5f * (min_u + max_u);
+  const float new_center_v = 0.5f * (min_v + max_v);
+  const float new_len_u = max_u - min_u;
+  const float new_len_v = max_v - min_v;
+
+  *lightP = *lightP + new_center_u * u + new_center_v * v;
+  *axisu = u * new_len_u;
+  *axisv = v * new_len_v;
+
+  return true;
+}
+
 ccl_device float lamp_light_pdf(KernelGlobals *kg, const float3 Ng, const float3 I, float t)
 {
   float cos_pi = dot(Ng, I);
diff --git a/intern/cycles/kernel/kernel_montecarlo.h b/intern/cycles/kernel/kernel_montecarlo.h
index ba25c0e24e4..ce37bd0b15e 100644
--- a/intern/cycles/kernel/kernel_montecarlo.h
+++ b/intern/cycles/kernel/kernel_montecarlo.h
@@ -195,31 +195,108 @@ ccl_device float2 regular_polygon_sample(float corners, float rotation, float u,
 
 ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
 {
-  float3 R;
-  float NI = dot(N, I);
-  float NgR, threshold;
-
-  /* Check if the incident ray is coming from behind normal N. */
-  if (NI > 0) {
-    /* Normal reflection */
-    R = (2 * NI) * N - I;
-    NgR = dot(Ng, R);
-
-    /* Reflection rays may always be at least as shallow as the incoming ray. */
-    threshold = min(0.9f * dot(Ng, I), 0.01f);
-    if (NgR >= threshold) {
-      return N;
+  float3 R = 2 * dot(N, I) * N - I;
+
+  /* Reflection rays may always be at least as shallow as the incoming ray. */
+  float threshold = min(0.9f * dot(Ng, I), 0.01f);
+  if (dot(Ng, R) >= threshold) {
+    return N;
+  }
+
+  /* Form coordinate system with Ng as the Z axis and N inside the X-Z-plane.
+   * The X axis is found by normalizing the component of N that's orthogonal to Ng.
+   * The Y axis isn't actually needed.
+   */
+  float NdotNg = dot(N, Ng);
+  float3 X = normalize(N - NdotNg * Ng);
+
+  /* Keep math expressions. */
+  /* clang-format off */
+  /* Calculate N.z and N.x in the local coordinate system.
+   *
+   * The goal of this computation is to find a N' that is rotated towards Ng just enough
+   * to lift R' above the threshold (here called t), therefore dot(R', Ng) = t.
+   *
+   * According to the standard reflection equation,
+   * this means that we want dot(2*dot(N', I)*N' - I, Ng) = t.
+   *
+   * Since the Z axis of our local coordinate system is Ng, dot(x, Ng) is just x.z, so we get
+   * 2*dot(N', I)*N'.z - I.z = t.
+   *
+   * The rotation is simple to express in the coordinate system we formed -
+   * since N lies in the X-Z-plane, we know that N' will also lie in the X-Z-plane,
+   * so N'.y = 0 and therefore dot(N', I) = N'.x*I.x + N'.z*I.z .
+   *
+   * Furthermore, we want N' to be normalized, so N'.x = sqrt(1 - N'.z^2).
+   *
+   * With these simplifications,
+   * we get the final equation 2*(sqrt(1 - N'.z^2)*I.x + N'.z*I.z)*N'.z - I.z = t.
+   *
+   * The only unknown here is N'.z, so we can solve for that.
+   *
+   * The equation has four solutions in general:
+   *
+   * N'.z = +-sqrt(0.5*(+-sqrt(I.x^2*(I.x^2 + I.z^2 - t^2)) + t*I.z + I.x^2 + I.z^2)/(I.x^2 + I.z^2))
+   * We can simplify this expression a bit by grouping terms:
+   *
+   * a = I.x^2 + I.z^2
+   * b = sqrt(I.x^2 * (a - t^2))
+   * c = I.z*t + a
+   * N'.z = +-sqrt(0.5*(+-b + c)/a)
+   *
+   * Two solutions can immediately be discarded because they're negative so N' would lie in the
+   * lower hemisphere.
+   */
+  /* clang-format on */
+
+  float Ix = dot(I, X), Iz = dot(I, Ng);
+  float Ix2 = sqr(Ix), Iz2 = sqr(Iz);
+  float a = Ix2 + Iz2;
+
+  float b = safe_sqrtf(Ix2 * (a - sqr(threshold)));
+  float c = Iz * threshold + a;
+
+  /* Evaluate both solutions.
+   * In many cases one can be immediately discarded (if N'.z would be imaginary or larger than
+   * one), so check for that first. If no option is viable (might happen in extreme cases like N
+   * being in the wrong hemisphere), give up and return Ng. */
+  float fac = 0.5f / a;
+  float N1_z2 = fac * (b + c), N2_z2 = fac * (-b + c);
+  bool valid1 = (N1_z2 > 1e-5f) && (N1_z2 <= (1.0f + 1e-5f));
+  bool valid2 = (N2_z2 > 1e-5f) && (N2_z2 <= (1.0f + 1e-5f));
+
+  float2 N_new;
+  if (valid1 && valid2) {
+    /* If both are possible, do the expensive reflection-based check. */
+    float2 N1 = make_float2(safe_sqrtf(1.0f - N1_z2), safe_sqrtf(N1_z2));
+    float2 N2 = make_float2(safe_sqrtf(1.0f - N2_z2), safe_sqrtf(N2_z2));
+
+    float R1 = 2 * (N1.x * Ix + N1.y * Iz) * N1.y - Iz;
+    float R2 = 2 * (N2.x * Ix + N2.y * Iz) * N2.y - Iz;
+
+    valid1 = (R1 >= 1e-5f);
+    valid2 = (R2 >= 1e-5f);
+    if (valid1 && valid2) {
+      /* If both solutions are valid, return the one with the shallower reflection since it will be
+       * closer to the input (if the original reflection wasn't shallow, we would not be in this
+       * part of the function). */
+      N_new = (R1 < R2) ? N1 : N2;
     }
+    else {
+      /* If only one reflection is valid (= positive), pick that one. */
+      N_new = (R1 > R2) ? N1 : N2;
+    }
+  }
+  else if (valid1 || valid2) {
+    /* Only one solution passes the N'.z criterion, so pick that one. */
+    float Nz2 = valid1 ? N1_z2 : N2_z2;
+    N_new = make_float2(safe_sqrtf(1.0f - Nz2), safe_sqrtf(Nz2));
   }
   else {
-    /* Bad incident */
-    R = -I;
-    NgR = dot(Ng, R);
-    threshold = 0.01f;
+    return Ng;
   }
 
-  R = R + Ng * (threshold - NgR);            /* Lift the reflection above the threshold. */
-  return normalize(I * len(R) + R * len(I)); /* Find a bisector. */
+  return N_new.x * X + N_new.y * Ng;
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index 5681510fc25..dd2390808ea 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -65,7 +65,6 @@ ccl_device_forceinline bool kernel_path_scene_intersect(KernelGlobals *kg,
   uint visibility = path_state_ray_visibility(kg, state);
 
   if (path_state_ao_bounce(kg, state)) {
-    visibility = PATH_RAY_SHADOW;
     ray->t = kernel_data.background.ao_distance;
   }
 
@@ -416,7 +415,13 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
         break;
       }
       else if (path_state_ao_bounce(kg, state)) {
-        break;
+        if (intersection_get_shader_flags(kg, &isect) &
+            (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
+          state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+        }
+        else {
+          break;
+        }
       }
 
       /* Setup shader data. */
@@ -554,7 +559,13 @@ ccl_device_forceinline void kernel_path_integrate(KernelGlobals *kg,
         break;
       }
       else if (path_state_ao_bounce(kg, state)) {
-        break;
+        if (intersection_get_shader_flags(kg, &isect) &
+            (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
+          state->flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
+        }
+        else {
+          break;
+        }
       }
 
       /* Setup shader data. */
diff --git a/intern/cycles/kernel/kernel_subsurface.h b/intern/cycles/kernel/kernel_subsurface.h
index c75958e79c5..dd922b86722 100644
--- a/intern/cycles/kernel/kernel_subsurface.h
+++ b/intern/cycles/kernel/kernel_subsurface.h
@@ -25,8 +25,9 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline float3
 subsurface_scatter_eval(ShaderData *sd, const ShaderClosure *sc, float disk_r, float r, bool all)
 {
-  /* this is the veach one-sample model with balance heuristic, some pdf
-   * factors drop out when using balance heuristic weighting */
+  /* This is the Veach one-sample model with balance heuristic, some pdf
+   * factors drop out when using balance heuristic weighting. For branched
+   * path tracing (all) we sample all closures and don't use MIS. */
   float3 eval_sum = zero_float3();
   float pdf_sum = 0.0f;
   float sample_weight_inv = 0.0f;
@@ -65,6 +66,30 @@ subsurface_scatter_eval(ShaderData *sd, const ShaderClosure *sc, float disk_r, f
   return (pdf_sum > 0.0f) ? eval_sum / pdf_sum : zero_float3();
 }
 
+ccl_device_inline float3 subsurface_scatter_walk_eval(ShaderData *sd,
+                                                      const ShaderClosure *sc,
+                                                      float3 throughput,
+                                                      bool all)
+{
+  /* This is the Veach one-sample model with balance heuristic, some pdf
+   * factors drop out when using balance heuristic weighting. For branched
+   * path tracing (all) we sample all closures and don't use MIS. */
+  if (!all) {
+    float bssrdf_weight = 0.0f;
+    float weight = sc->sample_weight; /* Read first: sc is reused as the loop cursor below. */
+
+    for (int i = 0; i < sd->num_closure; i++) {
+      sc = &sd->closure[i];
+
+      if (CLOSURE_IS_BSSRDF(sc->type)) {
+        bssrdf_weight += sc->sample_weight;
+      }
+    }
+    throughput *= bssrdf_weight / weight; /* Summed BSSRDF weight over the input closure's. */
+  }
+  return throughput;
+}
+
 /* replace closures with a single diffuse bsdf closure after scatter step */
 ccl_device void subsurface_scatter_setup_diffuse_bsdf(
     KernelGlobals *kg, ShaderData *sd, ClosureType type, float roughness, float3 weight, float3 N)
@@ -437,7 +462,8 @@ ccl_device_noinline
                            ccl_addr_space PathState *state,
                            const ShaderClosure *sc,
                            const float bssrdf_u,
-                           const float bssrdf_v)
+                           const float bssrdf_v,
+                           bool all)
 {
   /* Sample diffuse surface scatter into the object. */
   float3 D;
@@ -605,6 +631,13 @@ ccl_device_noinline
     if (hit) {
       t = ray->t;
     }
+    else if (bounce == 0) {
+      /* Restore original position if nothing was hit after the first bounce,
+       * without the ray_offset() that was added to avoid self-intersection.
+       * Otherwise if that offset is relatively large compared to the scattering
+       * radius, we never go back up high enough to exit the surface. */
+      ray->P = sd->P;
+    }
 
     /* Advance to new scatter location. */
     ray->P += t * ray->D;
@@ -662,7 +695,7 @@ ccl_device_noinline
   /* TODO: gain back performance lost from merging with disk BSSRDF. We
    * only need to return on hit so this indirect ray push/pop overhead
    * is not actually needed, but it does keep the code simpler. */
-  ss_isect->weight[0] = throughput;
+  ss_isect->weight[0] = subsurface_scatter_walk_eval(sd, sc, throughput, all);
 #ifdef __SPLIT_KERNEL__
   ss_isect->ray = *ray;
 #endif
@@ -684,7 +717,7 @@ ccl_device_inline int subsurface_scatter_multi_intersect(KernelGlobals *kg,
     return subsurface_scatter_disk(kg, ss_isect, sd, sc, lcg_state, bssrdf_u, bssrdf_v, all);
   }
   else {
-    return subsurface_random_walk(kg, ss_isect, sd, state, sc, bssrdf_u, bssrdf_v);
+    return subsurface_random_walk(kg, ss_isect, sd, state, sc, bssrdf_u, bssrdf_v, all);
   }
 }
 
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index df56360b1df..74fa2826cd4 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -99,27 +99,23 @@ CCL_NAMESPACE_BEGIN
 #define __AO__
 #define __PASSES__
 #define __HAIR__
-
-/* Without these we get an AO render, used by OpenCL preview kernel. */
-#ifndef __KERNEL_AO_PREVIEW__
-#  define __SVM__
-#  define __EMISSION__
-#  define __HOLDOUT__
-#  define __MULTI_CLOSURE__
-#  define __TRANSPARENT_SHADOWS__
-#  define __BACKGROUND_MIS__
-#  define __LAMP_MIS__
-#  define __CAMERA_MOTION__
-#  define __OBJECT_MOTION__
-#  define __BAKING__
-#  define __PRINCIPLED__
-#  define __SUBSURFACE__
-#  define __VOLUME__
-#  define __VOLUME_SCATTER__
-#  define __CMJ__
-#  define __SHADOW_RECORD_ALL__
-#  define __BRANCHED_PATH__
-#endif
+#define __SVM__
+#define __EMISSION__
+#define __HOLDOUT__
+#define __MULTI_CLOSURE__
+#define __TRANSPARENT_SHADOWS__
+#define __BACKGROUND_MIS__
+#define __LAMP_MIS__
+#define __CAMERA_MOTION__
+#define __OBJECT_MOTION__
+#define __BAKING__
+#define __PRINCIPLED__
+#define __SUBSURFACE__
+#define __VOLUME__
+#define __VOLUME_SCATTER__
+#define __CMJ__
+#define __SHADOW_RECORD_ALL__
+#define __BRANCHED_PATH__
 
 /* Device specific features */
 #ifdef __KERNEL_CPU__
@@ -895,6 +891,8 @@ enum ShaderDataFlag {
   SD_HAS_CONSTANT_EMISSION = (1 << 27),
   /* Needs to access attributes for volume rendering */
   SD_NEED_VOLUME_ATTRIBUTES = (1 << 28),
+  /* Shader has emission */
+  SD_HAS_EMISSION = (1 << 29),
 
   SD_SHADER_FLAGS = (SD_USE_MIS | SD_HAS_TRANSPARENT_SHADOW | SD_HAS_VOLUME | SD_HAS_ONLY_VOLUME |
                      SD_HETEROGENEOUS_VOLUME | SD_HAS_BSSRDF_BUMP | SD_VOLUME_EQUIANGULAR |
@@ -1501,9 +1499,9 @@ typedef struct KernelAreaLight {
   float axisu[3];
   float invarea;
   float axisv[3];
-  float pad1;
+  float tan_spread;
   float dir[3];
-  float pad2;
+  float normalize_spread;
 } KernelAreaLight;
 
 typedef struct KernelDistantLight {
diff --git a/intern/cycles/kernel/shaders/node_noise_texture.osl b/intern/cycles/kernel/shaders/node_noise_texture.osl
index 61c0216910b..01196ab633a 100644
--- a/intern/cycles/kernel/shaders/node_noise_texture.osl
+++ b/intern/cycles/kernel/shaders/node_noise_texture.osl
@@ -25,7 +25,7 @@
  * coordinates to act as a seed since the noise functions don't have seed values.
  * A seed value is needed for generating distortion textures and color outputs.
  * The offset's components are in the range [100, 200], not too high to cause
- * bad precision and not to small to be noticeable. We use float seed because
+ * bad precision and not too small to be noticeable. We use float seed because
  * OSL only support float hashes.
  */
 
diff --git a/intern/cycles/kernel/shaders/node_vector_math.osl b/intern/cycles/kernel/shaders/node_vector_math.osl
index 3963c23ea9c..c08d75b99ef 100644
--- a/intern/cycles/kernel/shaders/node_vector_math.osl
+++ b/intern/cycles/kernel/shaders/node_vector_math.osl
@@ -52,6 +52,9 @@ shader node_vector_math(string math_type = "add",
   else if (math_type == "faceforward") {
     Vector = compatible_faceforward(Vector1, Vector2, Vector3);
   }
+  else if (math_type == "multiply_add") {
+    Vector = Vector1 * Vector2 + Vector3;
+  }
   else if (math_type == "dot_product") {
     Value = dot(Vector1, Vector2);
   }
diff --git a/intern/cycles/kernel/shaders/stdcycles.h b/intern/cycles/kernel/shaders/stdcycles.h
index af7b645d9a2..dd604da68ce 100644
--- a/intern/cycles/kernel/shaders/stdcycles.h
+++ b/intern/cycles/kernel/shaders/stdcycles.h
@@ -84,30 +84,67 @@ closure color principled_hair(normal N,
 closure color henyey_greenstein(float g) BUILTIN;
 closure color absorption() BUILTIN;
 
-normal ensure_valid_reflection(normal Ng, normal I, normal N)
+normal ensure_valid_reflection(normal Ng, vector I, normal N)
 {
   /* The implementation here mirrors the one in kernel_montecarlo.h,
    * check there for an explanation of the algorithm. */
-  vector R;
-  float NI = dot(N, I);
-  float NgR, threshold;
-
-  if (NI > 0) {
-    R = (2 * NI) * N - I;
-    NgR = dot(Ng, R);
-    threshold = min(0.9 * dot(Ng, I), 0.01);
-    if (NgR >= threshold) {
-      return N;
+
+  float sqr(float x)
+  {
+    return x * x;
+  }
+
+  vector R = 2 * dot(N, I) * N - I; /* I mirrored about N. */
+
+  float threshold = min(0.9 * dot(Ng, I), 0.01); /* Minimum accepted dot(Ng, R). */
+  if (dot(Ng, R) >= threshold) {
+    return N; /* Reflection already passes the test: N is returned unchanged. */
+  }
+
+  float NdotNg = dot(N, Ng);
+  vector X = normalize(N - NdotNg * Ng); /* N with its Ng part removed (assumes unit Ng). */
+
+  float Ix = dot(I, X), Iz = dot(I, Ng); /* Components of I along X and Ng. */
+  float Ix2 = sqr(Ix), Iz2 = sqr(Iz);
+  float a = Ix2 + Iz2;
+
+  float b = sqrt(Ix2 * (a - sqr(threshold)));
+  float c = Iz * threshold + a;
+
+  float fac = 0.5 / a;
+  float N1_z2 = fac * (b + c), N2_z2 = fac * (-b + c); /* Two candidate squared Ng-components. */
+  int valid1 = (N1_z2 > 1e-5) && (N1_z2 <= (1.0 + 1e-5)); /* Range check, 1e-5 tolerance. */
+  int valid2 = (N2_z2 > 1e-5) && (N2_z2 <= (1.0 + 1e-5));
+
+  float N_new_x, N_new_z;
+  if (valid1 && valid2) { /* Both candidates in range: compare their reflections. */
+    float N1_x = sqrt(1.0 - N1_z2), N1_z = sqrt(N1_z2);
+    float N2_x = sqrt(1.0 - N2_z2), N2_z = sqrt(N2_z2);
+
+    float R1 = 2 * (N1_x * Ix + N1_z * Iz) * N1_z - Iz; /* Ng-component of I mirrored about N1. */
+    float R2 = 2 * (N2_x * Ix + N2_z * Iz) * N2_z - Iz; /* Ng-component of I mirrored about N2. */
+
+    valid1 = (R1 >= 1e-5); /* Re-validate each candidate on its reflected Ng-component. */
+    valid2 = (R2 >= 1e-5);
+    if (valid1 && valid2) {
+      N_new_x = (R1 < R2) ? N1_x : N2_x; /* Both pass: take the smaller reflected component. */
+      N_new_z = (R1 < R2) ? N1_z : N2_z;
+    }
+    else {
+      N_new_x = (R1 > R2) ? N1_x : N2_x; /* Otherwise: take the larger reflected component. */
+      N_new_z = (R1 > R2) ? N1_z : N2_z;
     }
   }
+  else if (valid1 || valid2) { /* Exactly one candidate in range: use it directly. */
+    float Nz2 = valid1 ? N1_z2 : N2_z2;
+    N_new_x = sqrt(1.0 - Nz2);
+    N_new_z = sqrt(Nz2);
+  }
   else {
-    R = -I;
-    NgR = dot(Ng, R);
-    threshold = 0.01;
+    return Ng; /* No candidate in range: fall back to Ng. */
   }
 
-  R = R + Ng * (threshold - NgR);
-  return normalize(I * length(R) + R * length(I));
+  return N_new_x * X + N_new_z * Ng; /* Rebuild the normal from its X and Ng components. */
 }
 
 #endif /* CCL_STDOSL_H */
diff --git a/intern/cycles/kernel/svm/svm_math.h b/intern/cycles/kernel/svm/svm_math.h
index dda2e50f916..733ea28f9e5 100644
--- a/intern/cycles/kernel/svm/svm_math.h
+++ b/intern/cycles/kernel/svm/svm_math.h
@@ -58,7 +58,8 @@ ccl_device void svm_node_vector_math(KernelGlobals *kg,
   float3 vector;
 
   /* 3 Vector Operators */
-  if (type == NODE_VECTOR_MATH_WRAP || type == NODE_VECTOR_MATH_FACEFORWARD) {
+  if (type == NODE_VECTOR_MATH_WRAP || type == NODE_VECTOR_MATH_FACEFORWARD ||
+      type == NODE_VECTOR_MATH_MULTIPLY_ADD) {
     uint4 extra_node = read_node(kg, offset);
     c = stack_load_float3(stack, extra_node.x);
   }
diff --git a/intern/cycles/kernel/svm/svm_math_util.h b/intern/cycles/kernel/svm/svm_math_util.h
index 389c44ab1da..9e654f2247f 100644
--- a/intern/cycles/kernel/svm/svm_math_util.h
+++ b/intern/cycles/kernel/svm/svm_math_util.h
@@ -52,6 +52,9 @@ ccl_device void svm_vector_math(float *value,
     case NODE_VECTOR_MATH_FACEFORWARD:
       *vector = faceforward(a, b, c);
       break;
+    case NODE_VECTOR_MATH_MULTIPLY_ADD:
+      *vector = a * b + c;
+      break;
     case NODE_VECTOR_MATH_DOT_PRODUCT:
       *value = dot(a, b);
       break;
@@ -242,12 +245,15 @@ ccl_device float3 svm_math_blackbody_color(float t)
     return make_float3(4.70366907f, 0.0f, 0.0f);
   }
 
+  /* Manually align for readability. */
+  /* clang-format off */
   int i = (t >= 6365.0f) ? 5 :
           (t >= 3315.0f) ? 4 :
           (t >= 1902.0f) ? 3 :
           (t >= 1449.0f) ? 2 :
           (t >= 1167.0f) ? 1 :
                            0;
+  /* clang-format on */
 
   ccl_constant float *r = blackbody_table_r[i];
   ccl_constant float *g = blackbody_table_g[i];
diff --git a/intern/cycles/kernel/svm/svm_noisetex.h b/intern/cycles/kernel/svm/svm_noisetex.h
index 920dd7d9d02..61fd9553802 100644
--- a/intern/cycles/kernel/svm/svm_noisetex.h
+++ b/intern/cycles/kernel/svm/svm_noisetex.h
@@ -20,7 +20,7 @@ CCL_NAMESPACE_BEGIN
  * coordinates to act as a seed since the noise functions don't have seed values.
  * A seed value is needed for generating distortion textures and color outputs.
  * The offset's components are in the range [100, 200], not too high to cause
- * bad precision and not to small to be noticeable. We use float seed because
+ * bad precision and not too small to be noticeable. We use float seed because
  * OSL only support float hashes.
  */
 
diff --git a/intern/cycles/kernel/svm/svm_tex_coord.h b/intern/cycles/kernel/svm/svm_tex_coord.h
index 4fe940f1a67..fc46bb584be 100644
--- a/intern/cycles/kernel/svm/svm_tex_coord.h
+++ b/intern/cycles/kernel/svm/svm_tex_coord.h
@@ -370,10 +370,13 @@ ccl_device void svm_node_tangent(KernelGlobals *kg, ShaderData *sd, float *stack
 
   if (direction_type == NODE_TANGENT_UVMAP) {
     /* UV map */
-    if (desc.offset == ATTR_STD_NOT_FOUND)
-      tangent = make_float3(0.0f, 0.0f, 0.0f);
-    else
+    if (desc.offset == ATTR_STD_NOT_FOUND) {
+      stack_store_float3(stack, tangent_offset, zero_float3());
+      return;
+    }
+    else {
       tangent = attribute_value;
+    }
   }
   else {
     /* radial */
diff --git a/intern/cycles/kernel/svm/svm_types.h b/intern/cycles/kernel/svm/svm_types.h
index 64a8f82a094..062afcfa5ac 100644
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -341,6 +341,7 @@ typedef enum NodeVectorMathType {
   NODE_VECTOR_MATH_TANGENT,
   NODE_VECTOR_MATH_REFRACT,
   NODE_VECTOR_MATH_FACEFORWARD,
+  NODE_VECTOR_MATH_MULTIPLY_ADD,
 } NodeVectorMathType;
 
 typedef enum NodeClampType {