git.blender.org/blender.git
Diffstat (limited to 'intern')
 -rw-r--r--  intern/cycles/device/device_cuda.cpp                  6
 -rw-r--r--  intern/cycles/kernel/CMakeLists.txt                  24
 -rw-r--r--  intern/cycles/kernel/filter/filter_transform_gpu.h    6
 -rw-r--r--  intern/cycles/kernel/filter/filter_transform_sse.h    6
 -rw-r--r--  intern/cycles/util/util_defines.h                    192
 -rw-r--r--  intern/cycles/util/util_static_assert.h               16
 6 files changed, 133 insertions(+), 117 deletions(-)
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 00dd37f089c..b5e10b0c2cb 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -454,6 +454,12 @@ class CUDADevice : public Device {
VLOG(1) << "Using precompiled kernel.";
return cubin;
}
+ const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor));
+ VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
+ if (path_exists(ptx)) {
+ VLOG(1) << "Using precompiled kernel.";
+ return ptx;
+ }
}
const string common_cflags = compile_kernel_get_common_cflags(
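
Note on the hunk above: the device now falls back to a precompiled PTX when no cubin for the exact architecture exists, so newer GPUs can have the driver JIT-compile the PTX instead of requiring a full set of cubins. A minimal sketch of that lookup order, using std::filesystem instead of Cycles' path_get/path_exists helpers; the cubin name pattern and lib_dir layout below are illustrative, not the actual device code:

    #include <filesystem>
    #include <string>

    /* Sketch only: prefer an exact cubin for this GPU, otherwise fall back to
     * a PTX built for the same compute capability so the driver can JIT it. */
    static std::string find_precompiled_kernel(const std::string &lib_dir,
                                               const std::string &name,
                                               int major, int minor)
    {
      namespace fs = std::filesystem;
      const std::string arch = std::to_string(major) + std::to_string(minor);
      const std::string cubin = lib_dir + "/" + name + "_sm_" + arch + ".cubin";
      if (fs::exists(cubin)) {
        return cubin; /* exact native binary */
      }
      const std::string ptx = lib_dir + "/" + name + "_compute_" + arch + ".ptx";
      if (fs::exists(ptx)) {
        return ptx; /* portable fallback, JIT-compiled by the driver at load time */
      }
      return std::string(); /* nothing precompiled; caller compiles from source */
    }
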
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index ea8aa197b6f..78da584e132 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -389,11 +389,20 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_cubins)
macro(CYCLES_CUDA_KERNEL_ADD arch prev_arch name flags sources experimental)
- set(cuda_cubin ${name}_${arch}.cubin)
+ if(${arch} MATCHES "compute_.*")
+ set(format "ptx")
+ else()
+ set(format "cubin")
+ endif()
+ set(cuda_file ${name}_${arch}.${format})
set(kernel_sources ${sources})
if(NOT ${prev_arch} STREQUAL "none")
- set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.cubin)
+ if(${prev_arch} MATCHES "compute_.*")
+ set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.ptx)
+ else()
+ set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.cubin)
+ endif()
endif()
set(cuda_kernel_src "/kernels/cuda/${name}.cu")
@@ -406,7 +415,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
-I ${CMAKE_CURRENT_SOURCE_DIR}/..
-I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda
--use_fast_math
- -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin})
+ -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file})
if(${experimental})
set(cuda_flags ${cuda_flags} -D __KERNEL_EXPERIMENTAL__)
@@ -440,20 +449,21 @@ if(WITH_CYCLES_CUDA_BINARIES)
-v
-cuda-toolkit-dir "${CUDA_TOOLKIT_ROOT_DIR}"
DEPENDS ${kernel_sources} cycles_cubin_cc)
+ set(cuda_file ${cuda_cubin})
else()
add_custom_command(
- OUTPUT ${cuda_cubin}
+ OUTPUT ${cuda_file}
COMMAND ${CUDA_NVCC_EXECUTABLE}
-arch=${arch}
${CUDA_NVCC_FLAGS}
- --cubin
+ --${format}
${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
--ptxas-options="-v"
${cuda_flags}
DEPENDS ${kernel_sources})
endif()
- delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
- list(APPEND cuda_cubins ${cuda_cubin})
+ delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_file}" ${CYCLES_INSTALL_PATH}/lib)
+ list(APPEND cuda_cubins ${cuda_file})
unset(cuda_debug_flags)
endmacro()
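
For reference, the build change above branches on the target name: virtual architectures (compute_*) are now built to PTX with nvcc's --ptx, while real architectures (sm_*) keep producing cubins via --cubin. The same mapping, expressed as a small C++ helper purely to illustrate the naming convention; the real logic lives in the CMake macro above:

    #include <string>

    /* Illustration only: mirrors the CMake branch that picks the output format
     * and file name from the CUDA architecture string. */
    struct KernelTarget {
      std::string format; /* "ptx" or "cubin" */
      std::string file;   /* e.g. kernel_compute_75.ptx or kernel_sm_61.cubin */
    };

    static KernelTarget kernel_target(const std::string &name, const std::string &arch)
    {
      const bool is_virtual = arch.rfind("compute_", 0) == 0; /* starts with "compute_" */
      const std::string format = is_virtual ? "ptx" : "cubin";
      return {format, name + "_" + arch + "." + format};
    }
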
diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h
index 41bbadb621d..adc85881fe5 100644
--- a/intern/cycles/kernel/filter/filter_transform_gpu.h
+++ b/intern/cycles/kernel/filter/filter_transform_gpu.h
@@ -76,9 +76,9 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re
filter_calculate_scale(feature_scale, use_time);
/* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimentional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions. This mainly helps to prevent
- * overfitting. */
+ * This transformation maps the num_features-dimensional feature space to a reduced feature
+ * (r-feature) space which generally has fewer dimensions.
+ * This mainly helps to prevent overfitting. */
float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES];
math_matrix_zero(feature_matrix, num_features);
FOR_PIXEL_WINDOW
diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h
index 830444645d7..5a124b5d73b 100644
--- a/intern/cycles/kernel/filter/filter_transform_sse.h
+++ b/intern/cycles/kernel/filter/filter_transform_sse.h
@@ -73,9 +73,9 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff
filter_calculate_scale_sse(feature_scale, use_time);
/* === Generate the feature transformation. ===
- * This transformation maps the num_features-dimentional feature space to a reduced feature
- * (r-feature) space which generally has fewer dimensions. This mainly helps to prevent
- * overfitting. */
+ * This transformation maps the num_features-dimensional feature space to a reduced feature
+ * (r-feature) space which generally has fewer dimensions.
+ * This mainly helps to prevent overfitting. */
float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES];
math_matrix_zero_sse(feature_matrix_sse, num_features);
FOR_PIXEL_WINDOW_SSE
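
The comment corrected in both hunks above describes the denoiser's feature transform. In math terms (notation mine, not part of the patch), the transform is a matrix

    T \in \mathbb{R}^{r \times n}, \qquad \tilde{f} = T f, \qquad r \le n = \text{num\_features},

so each n-dimensional feature vector f is projected into an r-dimensional reduced ("r-feature") space, which limits the number of fitted parameters and thus overfitting.
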
diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h
index 2778cffba3a..b29d4163133 100644
--- a/intern/cycles/util/util_defines.h
+++ b/intern/cycles/util/util_defines.h
@@ -16,127 +16,127 @@
*/
#ifndef __UTIL_DEFINES_H__
-# define __UTIL_DEFINES_H__
+#define __UTIL_DEFINES_H__
/* Bitness */
-# if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || \
- defined(_M_X64)
-# define __KERNEL_64_BIT__
-# endif
+#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || \
+ defined(_M_X64)
+# define __KERNEL_64_BIT__
+#endif
/* Qualifiers for kernel code shared by CPU and GPU */
-# ifndef __KERNEL_GPU__
-# define ccl_device static inline
-# define ccl_device_noinline static
-# define ccl_device_noinline_cpu ccl_device_noinline
-# define ccl_global
-# define ccl_static_constant static const
-# define ccl_constant const
-# define ccl_local
-# define ccl_local_param
-# define ccl_private
-# define ccl_restrict __restrict
-# define ccl_ref &
-# define ccl_optional_struct_init
-# define __KERNEL_WITH_SSE_ALIGN__
-
-# if defined(_WIN32) && !defined(FREE_WINDOWS)
-# define ccl_device_inline static __forceinline
-# define ccl_device_forceinline static __forceinline
-# define ccl_align(...) __declspec(align(__VA_ARGS__))
-# ifdef __KERNEL_64_BIT__
-# define ccl_try_align(...) __declspec(align(__VA_ARGS__))
-# else /* __KERNEL_64_BIT__ */
-# undef __KERNEL_WITH_SSE_ALIGN__
+#ifndef __KERNEL_GPU__
+# define ccl_device static inline
+# define ccl_device_noinline static
+# define ccl_device_noinline_cpu ccl_device_noinline
+# define ccl_global
+# define ccl_static_constant static const
+# define ccl_constant const
+# define ccl_local
+# define ccl_local_param
+# define ccl_private
+# define ccl_restrict __restrict
+# define ccl_ref &
+# define ccl_optional_struct_init
+# define __KERNEL_WITH_SSE_ALIGN__
+
+# if defined(_WIN32) && !defined(FREE_WINDOWS)
+# define ccl_device_inline static __forceinline
+# define ccl_device_forceinline static __forceinline
+# define ccl_align(...) __declspec(align(__VA_ARGS__))
+# ifdef __KERNEL_64_BIT__
+# define ccl_try_align(...) __declspec(align(__VA_ARGS__))
+# else /* __KERNEL_64_BIT__ */
+# undef __KERNEL_WITH_SSE_ALIGN__
/* No support for function arguments (error C2719). */
-# define ccl_try_align(...)
-# endif /* __KERNEL_64_BIT__ */
-# define ccl_may_alias
-# define ccl_always_inline __forceinline
-# define ccl_never_inline __declspec(noinline)
-# define ccl_maybe_unused
-# else /* _WIN32 && !FREE_WINDOWS */
-# define ccl_device_inline static inline __attribute__((always_inline))
-# define ccl_device_forceinline static inline __attribute__((always_inline))
-# define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
-# ifndef FREE_WINDOWS64
-# define __forceinline inline __attribute__((always_inline))
-# endif
-# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
-# define ccl_may_alias __attribute__((__may_alias__))
-# define ccl_always_inline __attribute__((always_inline))
-# define ccl_never_inline __attribute__((noinline))
-# define ccl_maybe_unused __attribute__((used))
-# endif /* _WIN32 && !FREE_WINDOWS */
+# define ccl_try_align(...)
+# endif /* __KERNEL_64_BIT__ */
+# define ccl_may_alias
+# define ccl_always_inline __forceinline
+# define ccl_never_inline __declspec(noinline)
+# define ccl_maybe_unused
+# else /* _WIN32 && !FREE_WINDOWS */
+# define ccl_device_inline static inline __attribute__((always_inline))
+# define ccl_device_forceinline static inline __attribute__((always_inline))
+# define ccl_align(...) __attribute__((aligned(__VA_ARGS__)))
+# ifndef FREE_WINDOWS64
+# define __forceinline inline __attribute__((always_inline))
+# endif
+# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
+# define ccl_may_alias __attribute__((__may_alias__))
+# define ccl_always_inline __attribute__((always_inline))
+# define ccl_never_inline __attribute__((noinline))
+# define ccl_maybe_unused __attribute__((used))
+# endif /* _WIN32 && !FREE_WINDOWS */
/* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). */
-# ifndef ATTR_FALLTHROUGH
-# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */
-# define ATTR_FALLTHROUGH __attribute__((fallthrough))
-# else
-# define ATTR_FALLTHROUGH ((void)0)
-# endif
+# ifndef ATTR_FALLTHROUGH
+# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */
+# define ATTR_FALLTHROUGH __attribute__((fallthrough))
+# else
+# define ATTR_FALLTHROUGH ((void)0)
# endif
-# endif /* __KERNEL_GPU__ */
+# endif
+#endif /* __KERNEL_GPU__ */
/* macros */
/* hints for branch prediction, only use in code that runs a _lot_ */
-# if defined(__GNUC__) && defined(__KERNEL_CPU__)
-# define LIKELY(x) __builtin_expect(!!(x), 1)
-# define UNLIKELY(x) __builtin_expect(!!(x), 0)
-# else
-# define LIKELY(x) (x)
-# define UNLIKELY(x) (x)
-# endif
-
-# if defined(__GNUC__) || defined(__clang__)
-# if defined(__cplusplus)
+#if defined(__GNUC__) && defined(__KERNEL_CPU__)
+# define LIKELY(x) __builtin_expect(!!(x), 1)
+# define UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+# define LIKELY(x) (x)
+# define UNLIKELY(x) (x)
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+# if defined(__cplusplus)
/* Some magic to be sure we don't have reference in the type. */
template<typename T> static inline T decltype_helper(T x)
{
return x;
}
-# define TYPEOF(x) decltype(decltype_helper(x))
-# else
-# define TYPEOF(x) typeof(x)
-# endif
+# define TYPEOF(x) decltype(decltype_helper(x))
+# else
+# define TYPEOF(x) typeof(x)
# endif
+#endif
/* Causes warning:
* incompatible types when assigning to type 'Foo' from type 'Bar'
* ... the compiler optimizes away the temp var */
-# ifdef __GNUC__
-# define CHECK_TYPE(var, type) \
- { \
- TYPEOF(var) * __tmp; \
- __tmp = (type *)NULL; \
- (void)__tmp; \
- } \
- (void)0
-
-# define CHECK_TYPE_PAIR(var_a, var_b) \
- { \
- TYPEOF(var_a) * __tmp; \
- __tmp = (typeof(var_b) *)NULL; \
- (void)__tmp; \
- } \
- (void)0
-# else
-# define CHECK_TYPE(var, type)
-# define CHECK_TYPE_PAIR(var_a, var_b)
-# endif
+#ifdef __GNUC__
+# define CHECK_TYPE(var, type) \
+ { \
+ TYPEOF(var) * __tmp; \
+ __tmp = (type *)NULL; \
+ (void)__tmp; \
+ } \
+ (void)0
+
+# define CHECK_TYPE_PAIR(var_a, var_b) \
+ { \
+ TYPEOF(var_a) * __tmp; \
+ __tmp = (typeof(var_b) *)NULL; \
+ (void)__tmp; \
+ } \
+ (void)0
+#else
+# define CHECK_TYPE(var, type)
+# define CHECK_TYPE_PAIR(var_a, var_b)
+#endif
/* can be used in simple macros */
-# define CHECK_TYPE_INLINE(val, type) ((void)(((type)0) != (val)))
-
-# ifndef __KERNEL_GPU__
-# include <cassert>
-# define util_assert(statement) assert(statement)
-# else
-# define util_assert(statement)
-# endif
+#define CHECK_TYPE_INLINE(val, type) ((void)(((type)0) != (val)))
+
+#ifndef __KERNEL_GPU__
+# include <cassert>
+# define util_assert(statement) assert(statement)
+#else
+# define util_assert(statement)
+#endif
#endif /* __UTIL_DEFINES_H__ */
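
The util_defines.h hunk above is purely a re-indentation: the include guard no longer adds a preprocessor indentation level, and no macro definitions change. For context, a hypothetical snippet showing how these qualifiers are typically used in shared CPU/GPU kernel code (not taken from the patch; include path illustrative):

    #include "util/util_defines.h"

    /* On a CPU build, ccl_device expands to "static inline", ccl_global and
     * ccl_restrict to plain C++ equivalents, and ccl_align(16) to the
     * compiler-specific alignment attribute. */
    struct ccl_align(16) Splat {
      float weight[4];
    };

    ccl_device_inline float clamp_weight(float w)
    {
      /* LIKELY is a branch hint on GCC/Clang CPU builds, a no-op elsewhere. */
      return LIKELY(w >= 0.0f) ? w : 0.0f;
    }

    ccl_device void accumulate(ccl_global float *ccl_restrict dst, const Splat &s)
    {
      for (int i = 0; i < 4; i++) {
        dst[i] += clamp_weight(s.weight[i]);
      }
    }
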
diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h
index b4b972a4036..ceb52830319 100644
--- a/intern/cycles/util/util_static_assert.h
+++ b/intern/cycles/util/util_static_assert.h
@@ -15,18 +15,18 @@
*/
#ifndef __UTIL_STATIC_ASSERT_H__
-# define __UTIL_STATIC_ASSERT_H__
+#define __UTIL_STATIC_ASSERT_H__
CCL_NAMESPACE_BEGIN
/* TODO(sergey): In theory CUDA might work with own static assert
* implementation since it's just pure C++.
*/
-# ifdef __KERNEL_GPU__
-# ifndef static_assert
-# define static_assert(statement, message)
-# endif
-# endif /* __KERNEL_GPU__ */
+#ifdef __KERNEL_GPU__
+# ifndef static_assert
+# define static_assert(statement, message)
+# endif
+#endif /* __KERNEL_GPU__ */
/* TODO(sergey): For until C++11 is a bare minimum for us,
* we do a bit of a trickery to show meaningful message so
@@ -42,8 +42,8 @@ CCL_NAMESPACE_BEGIN
* After C++11 bump it should be possible to glue structure
* name to the error message,
*/
-# define static_assert_align(st, align) \
- static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned") // NOLINT
+#define static_assert_align(st, align) \
+ static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned") // NOLINT
CCL_NAMESPACE_END
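
The util_static_assert.h hunk is likewise indentation-only. A hypothetical use of static_assert_align, which the header keeps providing unchanged (assumes the Cycles headers are on the include path):

    #include "util/util_static_assert.h"

    /* static_assert_align fails compilation if the structure's size is not a
     * multiple of the requested alignment. */
    struct float4_storage {
      float x, y, z, w;
    };

    static_assert_align(float4_storage, 16); /* sizeof == 16, so this passes */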