diff options
Diffstat (limited to 'intern')
18 files changed, 361 insertions, 260 deletions
diff --git a/intern/CMakeLists.txt b/intern/CMakeLists.txt index 4493c68a88d..e6b561c39b9 100644 --- a/intern/CMakeLists.txt +++ b/intern/CMakeLists.txt @@ -80,4 +80,3 @@ endif() if(WITH_QUADRIFLOW) add_subdirectory(quadriflow) endif() - diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 61ac24f7f07..013d86a560b 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -262,13 +262,13 @@ def register_passes(engine, scene, srl): if crl.use_pass_crypto_object: for i in range(0, crl.pass_crypto_depth, 2): - engine.register_pass(scene, srl, "CryptoObject" + '{:02d}'.format(i), 4, "RGBA", 'COLOR') + engine.register_pass(scene, srl, "CryptoObject" + '{:02d}'.format(i//2), 4, "RGBA", 'COLOR') if crl.use_pass_crypto_material: for i in range(0, crl.pass_crypto_depth, 2): - engine.register_pass(scene, srl, "CryptoMaterial" + '{:02d}'.format(i), 4, "RGBA", 'COLOR') + engine.register_pass(scene, srl, "CryptoMaterial" + '{:02d}'.format(i//2), 4, "RGBA", 'COLOR') if srl.cycles.use_pass_crypto_asset: for i in range(0, srl.cycles.pass_crypto_depth, 2): - engine.register_pass(scene, srl, "CryptoAsset" + '{:02d}'.format(i), 4, "RGBA", 'COLOR') + engine.register_pass(scene, srl, "CryptoAsset" + '{:02d}'.format(i//2), 4, "RGBA", 'COLOR') if crl.use_denoising or crl.denoising_store_passes: engine.register_pass(scene, srl, "Noisy Image", 4, "RGBA", 'COLOR') diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 00dd37f089c..b5e10b0c2cb 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -454,6 +454,12 @@ class CUDADevice : public Device { VLOG(1) << "Using precompiled kernel."; return cubin; } + const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + 
return ptx; + } } const string common_cflags = compile_kernel_get_common_cflags( diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 782553e405c..4077a1ad516 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -389,11 +389,20 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_cubins) macro(CYCLES_CUDA_KERNEL_ADD arch prev_arch name flags sources experimental) - set(cuda_cubin ${name}_${arch}.cubin) + if(${arch} MATCHES "compute_.*") + set(format "ptx") + else() + set(format "cubin") + endif() + set(cuda_file ${name}_${arch}.${format}) set(kernel_sources ${sources}) if(NOT ${prev_arch} STREQUAL "none") - set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.cubin) + if(${prev_arch} MATCHES "compute_.*") + set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.ptx) + else() + set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.cubin) + endif() endif() set(cuda_kernel_src "/kernels/cuda/${name}.cu") @@ -406,7 +415,7 @@ if(WITH_CYCLES_CUDA_BINARIES) -I ${CMAKE_CURRENT_SOURCE_DIR}/.. 
-I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda --use_fast_math - -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}) + -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file}) if(${experimental}) set(cuda_flags ${cuda_flags} -D __KERNEL_EXPERIMENTAL__) @@ -440,20 +449,21 @@ if(WITH_CYCLES_CUDA_BINARIES) -v -cuda-toolkit-dir "${CUDA_TOOLKIT_ROOT_DIR}" DEPENDS ${kernel_sources} cycles_cubin_cc) + set(cuda_file ${cuda_cubin}) else() add_custom_command( - OUTPUT ${cuda_cubin} + OUTPUT ${cuda_file} COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} ${CUDA_NVCC_FLAGS} - --cubin + --${format} ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src} --ptxas-options="-v" ${cuda_flags} DEPENDS ${kernel_sources}) endif() - delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) - list(APPEND cuda_cubins ${cuda_cubin}) + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_file}" ${CYCLES_INSTALL_PATH}/lib) + list(APPEND cuda_cubins ${cuda_file}) unset(cuda_debug_flags) endmacro() diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h index 41bbadb621d..adc85881fe5 100644 --- a/intern/cycles/kernel/filter/filter_transform_gpu.h +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -76,9 +76,9 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re filter_calculate_scale(feature_scale, use_time); /* === Generate the feature transformation. === - * This transformation maps the num_features-dimentional feature space to a reduced feature - * (r-feature) space which generally has fewer dimensions. This mainly helps to prevent - * overfitting. */ + * This transformation maps the num_features-dimensional feature space to a reduced feature + * (r-feature) space which generally has fewer dimensions. + * This mainly helps to prevent overfitting. 
*/ float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; math_matrix_zero(feature_matrix, num_features); FOR_PIXEL_WINDOW diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index 830444645d7..5a124b5d73b 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -73,9 +73,9 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff filter_calculate_scale_sse(feature_scale, use_time); /* === Generate the feature transformation. === - * This transformation maps the num_features-dimentional feature space to a reduced feature - * (r-feature) space which generally has fewer dimensions. This mainly helps to prevent - * overfitting. */ + * This transformation maps the num_features-dimensional feature space to a reduced feature + * (r-feature) space which generally has fewer dimensions. + * This mainly helps to prevent over-fitting. */ float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES]; math_matrix_zero_sse(feature_matrix_sse, num_features); FOR_PIXEL_WINDOW_SSE diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h index 2778cffba3a..b29d4163133 100644 --- a/intern/cycles/util/util_defines.h +++ b/intern/cycles/util/util_defines.h @@ -16,127 +16,127 @@ */ #ifndef __UTIL_DEFINES_H__ -# define __UTIL_DEFINES_H__ +#define __UTIL_DEFINES_H__ /* Bitness */ -# if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || \ - defined(_M_X64) -# define __KERNEL_64_BIT__ -# endif +#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || \ + defined(_M_X64) +# define __KERNEL_64_BIT__ +#endif /* Qualifiers for kernel code shared by CPU and GPU */ -# ifndef __KERNEL_GPU__ -# define ccl_device static inline -# define ccl_device_noinline static -# define ccl_device_noinline_cpu ccl_device_noinline -# define ccl_global -# 
define ccl_static_constant static const -# define ccl_constant const -# define ccl_local -# define ccl_local_param -# define ccl_private -# define ccl_restrict __restrict -# define ccl_ref & -# define ccl_optional_struct_init -# define __KERNEL_WITH_SSE_ALIGN__ - -# if defined(_WIN32) && !defined(FREE_WINDOWS) -# define ccl_device_inline static __forceinline -# define ccl_device_forceinline static __forceinline -# define ccl_align(...) __declspec(align(__VA_ARGS__)) -# ifdef __KERNEL_64_BIT__ -# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) -# else /* __KERNEL_64_BIT__ */ -# undef __KERNEL_WITH_SSE_ALIGN__ +#ifndef __KERNEL_GPU__ +# define ccl_device static inline +# define ccl_device_noinline static +# define ccl_device_noinline_cpu ccl_device_noinline +# define ccl_global +# define ccl_static_constant static const +# define ccl_constant const +# define ccl_local +# define ccl_local_param +# define ccl_private +# define ccl_restrict __restrict +# define ccl_ref & +# define ccl_optional_struct_init +# define __KERNEL_WITH_SSE_ALIGN__ + +# if defined(_WIN32) && !defined(FREE_WINDOWS) +# define ccl_device_inline static __forceinline +# define ccl_device_forceinline static __forceinline +# define ccl_align(...) __declspec(align(__VA_ARGS__)) +# ifdef __KERNEL_64_BIT__ +# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) +# else /* __KERNEL_64_BIT__ */ +# undef __KERNEL_WITH_SSE_ALIGN__ /* No support for function arguments (error C2719). */ -# define ccl_try_align(...) -# endif /* __KERNEL_64_BIT__ */ -# define ccl_may_alias -# define ccl_always_inline __forceinline -# define ccl_never_inline __declspec(noinline) -# define ccl_maybe_unused -# else /* _WIN32 && !FREE_WINDOWS */ -# define ccl_device_inline static inline __attribute__((always_inline)) -# define ccl_device_forceinline static inline __attribute__((always_inline)) -# define ccl_align(...) 
__attribute__((aligned(__VA_ARGS__))) -# ifndef FREE_WINDOWS64 -# define __forceinline inline __attribute__((always_inline)) -# endif -# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) -# define ccl_may_alias __attribute__((__may_alias__)) -# define ccl_always_inline __attribute__((always_inline)) -# define ccl_never_inline __attribute__((noinline)) -# define ccl_maybe_unused __attribute__((used)) -# endif /* _WIN32 && !FREE_WINDOWS */ +# define ccl_try_align(...) +# endif /* __KERNEL_64_BIT__ */ +# define ccl_may_alias +# define ccl_always_inline __forceinline +# define ccl_never_inline __declspec(noinline) +# define ccl_maybe_unused +# else /* _WIN32 && !FREE_WINDOWS */ +# define ccl_device_inline static inline __attribute__((always_inline)) +# define ccl_device_forceinline static inline __attribute__((always_inline)) +# define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) +# ifndef FREE_WINDOWS64 +# define __forceinline inline __attribute__((always_inline)) +# endif +# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) +# define ccl_may_alias __attribute__((__may_alias__)) +# define ccl_always_inline __attribute__((always_inline)) +# define ccl_never_inline __attribute__((noinline)) +# define ccl_maybe_unused __attribute__((used)) +# endif /* _WIN32 && !FREE_WINDOWS */ /* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). 
*/ -# ifndef ATTR_FALLTHROUGH -# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ -# define ATTR_FALLTHROUGH __attribute__((fallthrough)) -# else -# define ATTR_FALLTHROUGH ((void)0) -# endif +# ifndef ATTR_FALLTHROUGH +# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ +# define ATTR_FALLTHROUGH __attribute__((fallthrough)) +# else +# define ATTR_FALLTHROUGH ((void)0) # endif -# endif /* __KERNEL_GPU__ */ +# endif +#endif /* __KERNEL_GPU__ */ /* macros */ /* hints for branch prediction, only use in code that runs a _lot_ */ -# if defined(__GNUC__) && defined(__KERNEL_CPU__) -# define LIKELY(x) __builtin_expect(!!(x), 1) -# define UNLIKELY(x) __builtin_expect(!!(x), 0) -# else -# define LIKELY(x) (x) -# define UNLIKELY(x) (x) -# endif - -# if defined(__GNUC__) || defined(__clang__) -# if defined(__cplusplus) +#if defined(__GNUC__) && defined(__KERNEL_CPU__) +# define LIKELY(x) __builtin_expect(!!(x), 1) +# define UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +# define LIKELY(x) (x) +# define UNLIKELY(x) (x) +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(__cplusplus) /* Some magic to be sure we don't have reference in the type. */ template<typename T> static inline T decltype_helper(T x) { return x; } -# define TYPEOF(x) decltype(decltype_helper(x)) -# else -# define TYPEOF(x) typeof(x) -# endif +# define TYPEOF(x) decltype(decltype_helper(x)) +# else +# define TYPEOF(x) typeof(x) # endif +#endif /* Causes warning: * incompatible types when assigning to type 'Foo' from type 'Bar' * ... 
the compiler optimizes away the temp var */ -# ifdef __GNUC__ -# define CHECK_TYPE(var, type) \ - { \ - TYPEOF(var) * __tmp; \ - __tmp = (type *)NULL; \ - (void)__tmp; \ - } \ - (void)0 - -# define CHECK_TYPE_PAIR(var_a, var_b) \ - { \ - TYPEOF(var_a) * __tmp; \ - __tmp = (typeof(var_b) *)NULL; \ - (void)__tmp; \ - } \ - (void)0 -# else -# define CHECK_TYPE(var, type) -# define CHECK_TYPE_PAIR(var_a, var_b) -# endif +#ifdef __GNUC__ +# define CHECK_TYPE(var, type) \ + { \ + TYPEOF(var) * __tmp; \ + __tmp = (type *)NULL; \ + (void)__tmp; \ + } \ + (void)0 + +# define CHECK_TYPE_PAIR(var_a, var_b) \ + { \ + TYPEOF(var_a) * __tmp; \ + __tmp = (typeof(var_b) *)NULL; \ + (void)__tmp; \ + } \ + (void)0 +#else +# define CHECK_TYPE(var, type) +# define CHECK_TYPE_PAIR(var_a, var_b) +#endif /* can be used in simple macros */ -# define CHECK_TYPE_INLINE(val, type) ((void)(((type)0) != (val))) - -# ifndef __KERNEL_GPU__ -# include <cassert> -# define util_assert(statement) assert(statement) -# else -# define util_assert(statement) -# endif +#define CHECK_TYPE_INLINE(val, type) ((void)(((type)0) != (val))) + +#ifndef __KERNEL_GPU__ +# include <cassert> +# define util_assert(statement) assert(statement) +#else +# define util_assert(statement) +#endif #endif /* __UTIL_DEFINES_H__ */ diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h index b4b972a4036..ceb52830319 100644 --- a/intern/cycles/util/util_static_assert.h +++ b/intern/cycles/util/util_static_assert.h @@ -15,18 +15,18 @@ */ #ifndef __UTIL_STATIC_ASSERT_H__ -# define __UTIL_STATIC_ASSERT_H__ +#define __UTIL_STATIC_ASSERT_H__ CCL_NAMESPACE_BEGIN /* TODO(sergey): In theory CUDA might work with own static assert * implementation since it's just pure C++. 
-# ifdef __KERNEL_GPU__ -# ifndef static_assert -# define static_assert(statement, message) -# endif -# endif /* __KERNEL_GPU__ */ +#ifdef __KERNEL_GPU__ +# ifndef static_assert +# define static_assert(statement, message) +# endif +#endif /* __KERNEL_GPU__ */ /* TODO(sergey): For until C++11 is a bare minimum for us, * we do a bit of a trickery to show meaningful message so @@ -42,8 +42,8 @@ CCL_NAMESPACE_BEGIN * After C++11 bump it should be possible to glue structure * name to the error message, */ -# define static_assert_align(st, align) \ - static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned") // NOLINT +#define static_assert_align(st, align) \ + static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned") // NOLINT CCL_NAMESPACE_END diff --git a/intern/ghost/intern/GHOST_DisplayManagerWin32.cpp b/intern/ghost/intern/GHOST_DisplayManagerWin32.cpp index 3b28f055191..aabaffc7732 100644 --- a/intern/ghost/intern/GHOST_DisplayManagerWin32.cpp +++ b/intern/ghost/intern/GHOST_DisplayManagerWin32.cpp @@ -24,8 +24,6 @@ #include "GHOST_DisplayManagerWin32.h" #include "GHOST_Debug.h" -#undef _WIN32_WINNT -#define _WIN32_WINNT 0x501 // require Windows XP or newer #define WIN32_LEAN_AND_MEAN #include <windows.h> diff --git a/intern/ghost/intern/GHOST_SystemPathsWin32.cpp b/intern/ghost/intern/GHOST_SystemPathsWin32.cpp index bdc403b947e..63a6b7224b5 100644 --- a/intern/ghost/intern/GHOST_SystemPathsWin32.cpp +++ b/intern/ghost/intern/GHOST_SystemPathsWin32.cpp @@ -39,15 +39,16 @@ GHOST_SystemPathsWin32::~GHOST_SystemPathsWin32() const GHOST_TUns8 *GHOST_SystemPathsWin32::getSystemDir(int, const char *versionstr) const { - static char knownpath[MAX_PATH * 3 + 128] = { - 0}; /* 1 utf-16 might translante into 3 utf-8. 2 utf-16 translates into 4 utf-8*/ - wchar_t knownpath_16[MAX_PATH]; + /* 1 utf-16 might translate into 3 utf-8.
2 utf-16 translates into 4 utf-8*/ + static char knownpath[MAX_PATH * 3 + 128] = {0}; + PWSTR knownpath_16 = NULL; - HRESULT hResult = SHGetFolderPathW( - NULL, CSIDL_COMMON_APPDATA, NULL, SHGFP_TYPE_CURRENT, knownpath_16); + HRESULT hResult = SHGetKnownFolderPath( + FOLDERID_ProgramData, KF_FLAG_DEFAULT, NULL, &knownpath_16); if (hResult == S_OK) { conv_utf_16_to_8(knownpath_16, knownpath, MAX_PATH * 3); + CoTaskMemFree(knownpath_16); strcat(knownpath, "\\Blender Foundation\\Blender\\"); strcat(knownpath, versionstr); return (GHOST_TUns8 *)knownpath; @@ -59,12 +60,14 @@ const GHOST_TUns8 *GHOST_SystemPathsWin32::getSystemDir(int, const char *version const GHOST_TUns8 *GHOST_SystemPathsWin32::getUserDir(int, const char *versionstr) const { static char knownpath[MAX_PATH * 3 + 128] = {0}; - wchar_t knownpath_16[MAX_PATH]; + PWSTR knownpath_16 = NULL; - HRESULT hResult = SHGetFolderPathW(NULL, CSIDL_APPDATA, NULL, SHGFP_TYPE_CURRENT, knownpath_16); + HRESULT hResult = SHGetKnownFolderPath( + FOLDERID_RoamingAppData, KF_FLAG_DEFAULT, NULL, &knownpath_16); if (hResult == S_OK) { conv_utf_16_to_8(knownpath_16, knownpath, MAX_PATH * 3); + CoTaskMemFree(knownpath_16); strcat(knownpath, "\\Blender Foundation\\Blender\\"); strcat(knownpath, versionstr); return (GHOST_TUns8 *)knownpath; diff --git a/intern/ghost/intern/GHOST_SystemPathsWin32.h b/intern/ghost/intern/GHOST_SystemPathsWin32.h index 49d241df633..f1924ea51bc 100644 --- a/intern/ghost/intern/GHOST_SystemPathsWin32.h +++ b/intern/ghost/intern/GHOST_SystemPathsWin32.h @@ -28,8 +28,6 @@ # error WIN32 only! #endif // WIN32 -#undef _WIN32_WINNT -#define _WIN32_WINNT 0x501 // require Windows XP or newer #define WIN32_LEAN_AND_MEAN #include <windows.h> diff --git a/intern/ghost/intern/GHOST_TaskbarWin32.h b/intern/ghost/intern/GHOST_TaskbarWin32.h index b1b81337494..abf1172cea8 100644 --- a/intern/ghost/intern/GHOST_TaskbarWin32.h +++ b/intern/ghost/intern/GHOST_TaskbarWin32.h @@ -24,10 +24,6 @@ # error WIN32 only! 
#endif // WIN32 -/* require Windows XP or newer */ -#undef _WIN32_WINNT -#define _WIN32_WINNT 0x501 - #define WIN32_LEAN_AND_MEAN #include <windows.h> #include <shlobj.h> diff --git a/intern/ghost/intern/GHOST_WindowViewCocoa.h b/intern/ghost/intern/GHOST_WindowViewCocoa.h index 9ed339c9992..cee40924b73 100644 --- a/intern/ghost/intern/GHOST_WindowViewCocoa.h +++ b/intern/ghost/intern/GHOST_WindowViewCocoa.h @@ -58,6 +58,11 @@ return YES; } +- (BOOL)acceptsFirstMouse:(NSEvent *)event +{ + return YES; +} + // The trick to prevent Cocoa from complaining (beeping) - (void)keyDown:(NSEvent *)event { diff --git a/intern/opensubdiv/internal/opensubdiv_evaluator.cc b/intern/opensubdiv/internal/opensubdiv_evaluator.cc index 2500691885c..4f5a1db82ca 100644 --- a/intern/opensubdiv/internal/opensubdiv_evaluator.cc +++ b/intern/opensubdiv/internal/opensubdiv_evaluator.cc @@ -102,6 +102,17 @@ void evaluateLimit(OpenSubdiv_Evaluator *evaluator, evaluator->internal->eval_output->evaluateLimit(ptex_face_index, face_u, face_v, P, dPdu, dPdv); } +void evaluatePatchesLimit(OpenSubdiv_Evaluator *evaluator, + const OpenSubdiv_PatchCoord *patch_coords, + const int num_patch_coords, + float *P, + float *dPdu, + float *dPdv) +{ + evaluator->internal->eval_output->evaluatePatchesLimit( + patch_coords, num_patch_coords, P, dPdu, dPdv); +} + void evaluateVarying(OpenSubdiv_Evaluator *evaluator, const int ptex_face_index, float face_u, @@ -137,6 +148,8 @@ void assignFunctionPointers(OpenSubdiv_Evaluator *evaluator) evaluator->evaluateLimit = evaluateLimit; evaluator->evaluateVarying = evaluateVarying; evaluator->evaluateFaceVarying = evaluateFaceVarying; + + evaluator->evaluatePatchesLimit = evaluatePatchesLimit; } } // namespace diff --git a/intern/opensubdiv/internal/opensubdiv_evaluator_internal.cc b/intern/opensubdiv/internal/opensubdiv_evaluator_internal.cc index fa45c0119ec..c5dd4509976 100644 --- a/intern/opensubdiv/internal/opensubdiv_evaluator_internal.cc +++ 
b/intern/opensubdiv/internal/opensubdiv_evaluator_internal.cc @@ -54,109 +54,133 @@ using OpenSubdiv::Osd::CpuPatchTable; using OpenSubdiv::Osd::CpuVertexBuffer; using OpenSubdiv::Osd::PatchCoord; -// TODO(sergey): Remove after official requirement bump for OSD version. -#if OPENSUBDIV_VERSION_NUMBER >= 30200 -# define OPENSUBDIV_HAS_FVAR_EVALUATION -#else -# undef OPENSUBDIV_HAS_FVAR_EVALUATION -#endif - namespace opensubdiv_capi { namespace { -// Helper class to wrap numerous of patch coordinates into a buffer. -// Used to pass coordinates to the CPU evaluator. Other evaluators are not -// supported. -class PatchCoordBuffer : public vector<PatchCoord> { +// Array implementation which stores small data on stack (or, rather, in the class itself). +template<typename T, int kNumMaxElementsOnStack> class StackOrHeapArray { public: - static PatchCoordBuffer *Create(int size) + StackOrHeapArray() + : num_elements_(0), heap_elements_(NULL), num_heap_elements_(0), effective_elements_(NULL) { - PatchCoordBuffer *buffer = new PatchCoordBuffer(); - buffer->resize(size); - return buffer; } - PatchCoord *BindCpuBuffer() + explicit StackOrHeapArray(int size) : StackOrHeapArray() { - return reinterpret_cast<PatchCoord *>(&(*this)[0]); + resize(size); } - int GetNumVertices() + ~StackOrHeapArray() { - return size(); + delete[] heap_elements_; } - void UpdateData(const PatchCoord *patch_coords, int num_patch_coords) + int size() const { - memcpy(&(*this)[0], - reinterpret_cast<const void *>(patch_coords), - sizeof(PatchCoord) * num_patch_coords); - } -}; + return num_elements_; + }; -// Helper class to wrap single of patch coord into a buffer. Used to pass -// coordinates to the CPU evaluator. Other evaluators are not supported. 
-class SinglePatchCoordBuffer { - public: - static SinglePatchCoordBuffer *Create() + T *data() { - return new SinglePatchCoordBuffer(); + return effective_elements_; } - SinglePatchCoordBuffer() + void resize(int num_elements) { + const int old_num_elements = num_elements_; + num_elements_ = num_elements; + // Early output if allocation size did not change, or allocation size is smaller. + // We never re-allocate, sacrificing some memory over performance. + if (old_num_elements >= num_elements) { + return; + } + // Simple case: no previously allocated buffer, can simply do one allocation. + if (effective_elements_ == NULL) { + effective_elements_ = allocate(num_elements); + return; + } + // Make new allocation, and copy elements if needed. + T *old_buffer = effective_elements_; + effective_elements_ = allocate(num_elements); + if (old_buffer != effective_elements_) { + memcpy(effective_elements_, old_buffer, sizeof(T) * min(old_num_elements, num_elements)); + } + if (old_buffer != stack_elements_) { + delete[] old_buffer; + } } - explicit SinglePatchCoordBuffer(const PatchCoord &patch_coord) : patch_coord_(patch_coord) + protected: + T *allocate(int num_elements) { + if (num_elements < kNumMaxElementsOnStack) { + return stack_elements_; + } + heap_elements_ = new T[num_elements]; + return heap_elements_; } - PatchCoord *BindCpuBuffer() - { - return &patch_coord_; - } + // Number of elements in the buffer. + int num_elements_; - int GetNumVertices() + // Elements which are allocated on a stack (or, rather, in the same allocation as the buffer + // itself). + // Is used as long as buffer is smaller than kNumMaxElementsOnStack. + T stack_elements_[kNumMaxElementsOnStack]; + + // Heap storage for buffer larger than kNumMaxElementsOnStack. + T *heap_elements_; + int num_heap_elements_; + + // Depending on the current buffer size points to either stack_elements_ or heap_elements_.
+ T *effective_elements_; +}; + +// 32 is a number of inner vertices along the patch size at subdivision level 6. +typedef StackOrHeapArray<PatchCoord, 32 * 32> StackOrHeapPatchCoordArray; + +// Buffer which implements API required by OpenSubdiv and uses an existing memory as an underlying +// storage. +template<typename T> class RawDataWrapperBuffer { + public: + RawDataWrapperBuffer(T *data) : data_(data) { - return 1; } - void UpdateData(const PatchCoord &patch_coord) + T *BindCpuBuffer() { - patch_coord_ = patch_coord; + return data_; } + // TODO(sergey): Support UpdateData(). + protected: - PatchCoord patch_coord_; + T *data_; }; -// Helper class which is aimed to be used in cases when buffer is small enough -// and better to be allocated in stack rather than in heap. -// -// TODO(sergey): Check if bare arrays could be used by CPU evaluator. -template<int element_size, int num_vertices> class StackAllocatedBuffer { +template<typename T> class RawDataWrapperVertexBuffer : public RawDataWrapperBuffer<T> { public: - static PatchCoordBuffer *Create(int /*size*/) + RawDataWrapperVertexBuffer(T *data, int num_vertices) + : RawDataWrapperBuffer<T>(data), num_vertices_(num_vertices) { - // TODO(sergey): Validate that requested size is smaller than static - // stack memory size. - return new StackAllocatedBuffer<element_size, num_vertices>(); - } - - float *BindCpuBuffer() - { - return &data_[0]; } int GetNumVertices() { - return num_vertices; + return num_vertices_; } - // TODO(sergey): Support UpdateData(). 
protected: - float data_[element_size * num_vertices]; + int num_vertices_; +}; + +class ConstPatchCoordWrapperBuffer : public RawDataWrapperVertexBuffer<const PatchCoord> { + public: + ConstPatchCoordWrapperBuffer(const PatchCoord *data, int num_vertices) + : RawDataWrapperVertexBuffer(data, num_vertices) + { + } }; template<typename EVAL_VERTEX_BUFFER, @@ -217,11 +241,12 @@ class FaceVaryingVolatileEval { device_context_); } - void evalPatch(const PatchCoord &patch_coord, float face_varying[2]) + // NOTE: face_varying must point to a memory of at least float[2]*num_patch_coords. + void evalPatches(const PatchCoord *patch_coord, const int num_patch_coords, float *face_varying) { - StackAllocatedBuffer<2, 1> face_varying_data; + RawDataWrapperBuffer<float> face_varying_data(face_varying); BufferDescriptor face_varying_desc(0, 2, 2); - SinglePatchCoordBuffer patch_coord_buffer(patch_coord); + ConstPatchCoordWrapperBuffer patch_coord_buffer(patch_coord, num_patch_coords); const EVALUATOR *eval_instance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>( evaluator_cache_, src_face_varying_desc_, face_varying_desc, device_context_); EVALUATOR::EvalPatchesFaceVarying(src_face_varying_data_, @@ -234,8 +259,6 @@ class FaceVaryingVolatileEval { face_varying_channel_, eval_instance, device_context_); - const float *refined_face_varying = face_varying_data.BindCpuBuffer(); - memcpy(face_varying, refined_face_varying, sizeof(float) * 2); } protected: @@ -297,7 +320,6 @@ class VolatileEvalOutput { src_data_ = SRC_VERTEX_BUFFER::Create(3, num_total_vertices, device_context_); src_varying_data_ = SRC_VERTEX_BUFFER::Create(3, num_total_vertices, device_context_); patch_table_ = PATCH_TABLE::Create(patch_table, device_context_); - patch_coords_ = NULL; vertex_stencils_ = convertToCompatibleStencilTable<STENCIL_TABLE>(vertex_stencils, device_context_); varying_stencils_ = convertToCompatibleStencilTable<STENCIL_TABLE>(varying_stencils, @@ -398,74 +420,66 @@ class VolatileEvalOutput { } } - 
void evalPatchCoord(const PatchCoord &patch_coord, float P[3]) + // NOTE: P must point to a memory of at least float[3]*num_patch_coords. + void evalPatches(const PatchCoord *patch_coord, const int num_patch_coords, float *P) { - StackAllocatedBuffer<6, 1> vertex_data; - // TODO(sergey): Varying data is interleaved in vertex array, so need to - // adjust stride if there is a varying data. - // BufferDescriptor vertex_desc(0, 3, 6); - BufferDescriptor vertex_desc(0, 3, 3); - SinglePatchCoordBuffer patch_coord_buffer(patch_coord); + RawDataWrapperBuffer<float> P_data(P); + // TODO(sergey): Support interleaved vertex-varying data. + BufferDescriptor P_desc(0, 3, 3); + ConstPatchCoordWrapperBuffer patch_coord_buffer(patch_coord, num_patch_coords); const EVALUATOR *eval_instance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>( - evaluator_cache_, src_desc_, vertex_desc, device_context_); + evaluator_cache_, src_desc_, P_desc, device_context_); EVALUATOR::EvalPatches(src_data_, src_desc_, - &vertex_data, - vertex_desc, + &P_data, + P_desc, patch_coord_buffer.GetNumVertices(), &patch_coord_buffer, patch_table_, eval_instance, device_context_); - const float *refined_vertices = vertex_data.BindCpuBuffer(); - memcpy(P, refined_vertices, sizeof(float) * 3); } - void evalPatchesWithDerivatives(const PatchCoord &patch_coord, - float P[3], - float dPdu[3], - float dPdv[3]) + // NOTE: P, dPdu, dPdv must point to a memory of at least float[3]*num_patch_coords. + void evalPatchesWithDerivatives(const PatchCoord *patch_coord, + const int num_patch_coords, + float *P, + float *dPdu, + float *dPdv) { - StackAllocatedBuffer<6, 1> vertex_data, derivatives; - // TODO(sergey): Varying data is interleaved in vertex array, so need to - // adjust stride if there is a varying data. 
- // BufferDescriptor vertex_desc(0, 3, 6); - BufferDescriptor vertex_desc(0, 3, 3); - BufferDescriptor du_desc(0, 3, 6), dv_desc(3, 3, 6); - SinglePatchCoordBuffer patch_coord_buffer(patch_coord); + assert(dPdu); + assert(dPdv); + RawDataWrapperBuffer<float> P_data(P); + RawDataWrapperBuffer<float> dPdu_data(dPdu), dPdv_data(dPdv); + // TODO(sergey): Support interleaved vertex-varying data. + BufferDescriptor P_desc(0, 3, 3); + BufferDescriptor dpDu_desc(0, 3, 3), pPdv_desc(0, 3, 3); + ConstPatchCoordWrapperBuffer patch_coord_buffer(patch_coord, num_patch_coords); const EVALUATOR *eval_instance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>( - evaluator_cache_, src_desc_, vertex_desc, du_desc, dv_desc, device_context_); + evaluator_cache_, src_desc_, P_desc, dpDu_desc, pPdv_desc, device_context_); EVALUATOR::EvalPatches(src_data_, src_desc_, - &vertex_data, - vertex_desc, - &derivatives, - du_desc, - &derivatives, - dv_desc, + &P_data, + P_desc, + &dPdu_data, + dpDu_desc, + &dPdv_data, + pPdv_desc, patch_coord_buffer.GetNumVertices(), &patch_coord_buffer, patch_table_, eval_instance, device_context_); - const float *refined_vertices = vertex_data.BindCpuBuffer(); - memcpy(P, refined_vertices, sizeof(float) * 3); - if (dPdu != NULL || dPdv != NULL) { - const float *refined_derivatives = derivatives.BindCpuBuffer(); - if (dPdu != NULL) { - memcpy(dPdu, refined_derivatives, sizeof(float) * 3); - } - if (dPdv != NULL) { - memcpy(dPdv, refined_derivatives + 3, sizeof(float) * 3); - } - } } - void evalPatchVarying(const PatchCoord &patch_coord, float varying[3]) + // NOTE: varying must point to a memory of at least float[3]*num_patch_coords. 
+ void evalPatchesVarying(const PatchCoord *patch_coord, + const int num_patch_coords, + float *varying) { - StackAllocatedBuffer<6, 1> varying_data; + RawDataWrapperBuffer<float> varying_data(varying); BufferDescriptor varying_desc(3, 3, 6); - SinglePatchCoordBuffer patch_coord_buffer(patch_coord); + ConstPatchCoordWrapperBuffer patch_coord_buffer(patch_coord, num_patch_coords); const EVALUATOR *eval_instance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>( evaluator_cache_, src_varying_desc_, varying_desc, device_context_); EVALUATOR::EvalPatchesVarying(src_varying_data_, @@ -477,23 +491,22 @@ class VolatileEvalOutput { patch_table_, eval_instance, device_context_); - const float *refined_varying = varying_data.BindCpuBuffer(); - memcpy(varying, refined_varying, sizeof(float) * 3); } - void evalPatchFaceVarying(const int face_varying_channel, - const PatchCoord &patch_coord, - float face_varying[2]) + void evalPatchesFaceVarying(const int face_varying_channel, + const PatchCoord *patch_coord, + const int num_patch_coords, + float face_varying[2]) { assert(face_varying_channel >= 0); assert(face_varying_channel < face_varying_evaluators.size()); - face_varying_evaluators[face_varying_channel]->evalPatch(patch_coord, face_varying); + face_varying_evaluators[face_varying_channel]->evalPatches( + patch_coord, num_patch_coords, face_varying); } private: SRC_VERTEX_BUFFER *src_data_; SRC_VERTEX_BUFFER *src_varying_data_; - PatchCoordBuffer *patch_coords_; PATCH_TABLE *patch_table_; BufferDescriptor src_desc_; BufferDescriptor src_varying_desc_; @@ -510,6 +523,19 @@ class VolatileEvalOutput { DEVICE_CONTEXT *device_context_; }; +void convertPatchCoordsToArray(const OpenSubdiv_PatchCoord *patch_coords, + const int num_patch_coords, + const OpenSubdiv::Far::PatchMap *patch_map, + StackOrHeapPatchCoordArray *array) +{ + array->resize(num_patch_coords); + for (int i = 0; i < num_patch_coords; ++i) { + const PatchTable::PatchHandle *handle = patch_map->FindPatch( + 
patch_coords[i].ptex_face, patch_coords[i].u, patch_coords[i].v); + (array->data())[i] = PatchCoord(*handle, patch_coords[i].u, patch_coords[i].v); + } +} + } // namespace // Note: Define as a class instead of typedcef to make it possible @@ -653,10 +679,10 @@ void CpuEvalOutputAPI::evaluateLimit(const int ptex_face_index, const PatchTable::PatchHandle *handle = patch_map_->FindPatch(ptex_face_index, face_u, face_v); PatchCoord patch_coord(*handle, face_u, face_v); if (dPdu != NULL || dPdv != NULL) { - implementation_->evalPatchesWithDerivatives(patch_coord, P, dPdu, dPdv); + implementation_->evalPatchesWithDerivatives(&patch_coord, 1, P, dPdu, dPdv); } else { - implementation_->evalPatchCoord(patch_coord, P); + implementation_->evalPatches(&patch_coord, 1, P); } } @@ -671,7 +697,7 @@ void CpuEvalOutputAPI::evaluateVarying(const int ptex_face_index, assert(face_v <= 1.0f); const PatchTable::PatchHandle *handle = patch_map_->FindPatch(ptex_face_index, face_u, face_v); PatchCoord patch_coord(*handle, face_u, face_v); - implementation_->evalPatchVarying(patch_coord, varying); + implementation_->evalPatchesVarying(&patch_coord, 1, varying); } void CpuEvalOutputAPI::evaluateFaceVarying(const int face_varying_channel, @@ -686,7 +712,24 @@ void CpuEvalOutputAPI::evaluateFaceVarying(const int face_varying_channel, assert(face_v <= 1.0f); const PatchTable::PatchHandle *handle = patch_map_->FindPatch(ptex_face_index, face_u, face_v); PatchCoord patch_coord(*handle, face_u, face_v); - implementation_->evalPatchFaceVarying(face_varying_channel, patch_coord, face_varying); + implementation_->evalPatchesFaceVarying(face_varying_channel, &patch_coord, 1, face_varying); +} + +void CpuEvalOutputAPI::evaluatePatchesLimit(const OpenSubdiv_PatchCoord *patch_coords, + const int num_patch_coords, + float *P, + float *dPdu, + float *dPdv) +{ + StackOrHeapPatchCoordArray patch_coords_array; + convertPatchCoordsToArray(patch_coords, num_patch_coords, patch_map_, &patch_coords_array); + if 
(dPdu != NULL || dPdv != NULL) { + implementation_->evalPatchesWithDerivatives( + patch_coords_array.data(), num_patch_coords, P, dPdu, dPdv); + } + else { + implementation_->evalPatches(patch_coords_array.data(), num_patch_coords, P); + } } } // namespace opensubdiv_capi @@ -757,7 +800,6 @@ OpenSubdiv_EvaluatorInternal *openSubdiv_createEvaluatorInternal( } // Face warying stencil. vector<const StencilTable *> all_face_varying_stencils; -#ifdef OPENSUBDIV_HAS_FVAR_EVALUATION all_face_varying_stencils.reserve(num_face_varying_channels); for (int face_varying_channel = 0; face_varying_channel < num_face_varying_channels; ++face_varying_channel) { @@ -769,7 +811,6 @@ OpenSubdiv_EvaluatorInternal *openSubdiv_createEvaluatorInternal( all_face_varying_stencils.push_back( StencilTableFactory::Create(*refiner, face_varying_stencil_options)); } -#endif // Generate bi-cubic patch table for the limit surface. // TODO(sergey): Ideally we would want to expose end-cap settings via // C-API to make it more generic. Currently it matches old Blender's @@ -800,7 +841,6 @@ OpenSubdiv_EvaluatorInternal *openSubdiv_createEvaluatorInternal( varying_stencils = table; } } -#ifdef OPENSUBDIV_HAS_FVAR_EVALUATION for (int face_varying_channel = 0; face_varying_channel < num_face_varying_channels; ++face_varying_channel) { const StencilTable *table = StencilTableFactory::AppendLocalPointStencilTableFaceVarying( @@ -813,7 +853,6 @@ OpenSubdiv_EvaluatorInternal *openSubdiv_createEvaluatorInternal( all_face_varying_stencils[face_varying_channel] = table; } } -#endif // Create OpenSubdiv's CPU side evaluator. // TODO(sergey): Make it possible to use different evaluators. 
opensubdiv_capi::CpuEvalOutput *eval_output = new opensubdiv_capi::CpuEvalOutput( diff --git a/intern/opensubdiv/internal/opensubdiv_evaluator_internal.h b/intern/opensubdiv/internal/opensubdiv_evaluator_internal.h index 7c963227d17..392633944c6 100644 --- a/intern/opensubdiv/internal/opensubdiv_evaluator_internal.h +++ b/intern/opensubdiv/internal/opensubdiv_evaluator_internal.h @@ -26,6 +26,7 @@ #include <opensubdiv/far/patchMap.h> #include <opensubdiv/far/patchTable.h> +struct OpenSubdiv_PatchCoord; struct OpenSubdiv_TopologyRefiner; namespace opensubdiv_capi { @@ -114,6 +115,18 @@ class CpuEvalOutputAPI { float face_v, float face_varying[2]); + // Batched evaluation of multiple input coordinates. + + // Evaluate given ptex face at given bilinear coordinate. + // If derivatives are NULL, they will not be evaluated. + // + // NOTE: Output arrays must point to a memory of size float[3]*num_patch_coords. + void evaluatePatchesLimit(const OpenSubdiv_PatchCoord *patch_coords, + const int num_patch_coords, + float *P, + float *dPdu, + float *dPdv); + protected: CpuEvalOutput *implementation_; OpenSubdiv::Far::PatchMap *patch_map_; diff --git a/intern/opensubdiv/opensubdiv_capi_type.h b/intern/opensubdiv/opensubdiv_capi_type.h index 35eeb71dede..e759c5f43b0 100644 --- a/intern/opensubdiv/opensubdiv_capi_type.h +++ b/intern/opensubdiv/opensubdiv_capi_type.h @@ -58,6 +58,13 @@ typedef enum OpenSubdiv_FVarLinearInterpolation { OSD_FVAR_LINEAR_INTERPOLATION_ALL, } OpenSubdiv_FVarLinearInterpolation; +typedef struct OpenSubdiv_PatchCoord { + int ptex_face; + + // Parametric location on patch. 
+ float u, v; +} OpenSubdiv_PatchCoord; + #ifdef __cplusplus } #endif diff --git a/intern/opensubdiv/opensubdiv_evaluator_capi.h b/intern/opensubdiv/opensubdiv_evaluator_capi.h index ceb0c58feba..1572d01b851 100644 --- a/intern/opensubdiv/opensubdiv_evaluator_capi.h +++ b/intern/opensubdiv/opensubdiv_evaluator_capi.h @@ -24,6 +24,7 @@ extern "C" { #endif struct OpenSubdiv_EvaluatorInternal; +struct OpenSubdiv_PatchCoord; struct OpenSubdiv_TopologyRefiner; typedef struct OpenSubdiv_Evaluator { @@ -108,6 +109,19 @@ typedef struct OpenSubdiv_Evaluator { float face_v, float face_varying[2]); + // Batched evaluation of multiple input coordinates. + + // Evaluate limit surface. + // If derivatives are NULL, they will not be evaluated. + // + // NOTE: Output arrays must point to a memory of size float[3]*num_patch_coords. + void (*evaluatePatchesLimit)(struct OpenSubdiv_Evaluator *evaluator, + const struct OpenSubdiv_PatchCoord *patch_coords, + const int num_patch_coords, + float *P, + float *dPdu, + float *dPdv); + // Internal storage for the use in this module only. // // This is where actual OpenSubdiv's evaluator is living. |