diff options
Diffstat (limited to 'intern')
18 files changed, 361 insertions, 260 deletions
diff --git a/intern/CMakeLists.txt b/intern/CMakeLists.txt index 4493c68a88d..e6b561c39b9 100644 --- a/intern/CMakeLists.txt +++ b/intern/CMakeLists.txt @@ -80,4 +80,3 @@ endif() if(WITH_QUADRIFLOW) add_subdirectory(quadriflow) endif() - diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py index 61ac24f7f07..013d86a560b 100644 --- a/intern/cycles/blender/addon/engine.py +++ b/intern/cycles/blender/addon/engine.py @@ -262,13 +262,13 @@ def register_passes(engine, scene, srl): if crl.use_pass_crypto_object: for i in range(0, crl.pass_crypto_depth, 2): - engine.register_pass(scene, srl, "CryptoObject" + '{:02d}'.format(i), 4, "RGBA", 'COLOR') + engine.register_pass(scene, srl, "CryptoObject" + '{:02d}'.format(i//2), 4, "RGBA", 'COLOR') if crl.use_pass_crypto_material: for i in range(0, crl.pass_crypto_depth, 2): - engine.register_pass(scene, srl, "CryptoMaterial" + '{:02d}'.format(i), 4, "RGBA", 'COLOR') + engine.register_pass(scene, srl, "CryptoMaterial" + '{:02d}'.format(i//2), 4, "RGBA", 'COLOR') if srl.cycles.use_pass_crypto_asset: for i in range(0, srl.cycles.pass_crypto_depth, 2): - engine.register_pass(scene, srl, "CryptoAsset" + '{:02d}'.format(i), 4, "RGBA", 'COLOR') + engine.register_pass(scene, srl, "CryptoAsset" + '{:02d}'.format(i//2), 4, "RGBA", 'COLOR') if crl.use_denoising or crl.denoising_store_passes: engine.register_pass(scene, srl, "Noisy Image", 4, "RGBA", 'COLOR') diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index 00dd37f089c..b5e10b0c2cb 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -454,6 +454,12 @@ class CUDADevice : public Device { VLOG(1) << "Using precompiled kernel."; return cubin; } + const string ptx = path_get(string_printf("lib/%s_compute_%d%d.ptx", name, major, minor)); + VLOG(1) << "Testing for pre-compiled kernel " << ptx << "."; + if (path_exists(ptx)) { + VLOG(1) << "Using precompiled kernel."; + 
return ptx; + } } const string common_cflags = compile_kernel_get_common_cflags( diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index 782553e405c..4077a1ad516 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -389,11 +389,20 @@ if(WITH_CYCLES_CUDA_BINARIES) set(cuda_cubins) macro(CYCLES_CUDA_KERNEL_ADD arch prev_arch name flags sources experimental) - set(cuda_cubin ${name}_${arch}.cubin) + if(${arch} MATCHES "compute_.*") + set(format "ptx") + else() + set(format "cubin") + endif() + set(cuda_file ${name}_${arch}.${format}) set(kernel_sources ${sources}) if(NOT ${prev_arch} STREQUAL "none") - set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.cubin) + if(${prev_arch} MATCHES "compute_.*") + set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.ptx) + else() + set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.cubin) + endif() endif() set(cuda_kernel_src "/kernels/cuda/${name}.cu") @@ -406,7 +415,7 @@ if(WITH_CYCLES_CUDA_BINARIES) -I ${CMAKE_CURRENT_SOURCE_DIR}/.. 
-I ${CMAKE_CURRENT_SOURCE_DIR}/kernels/cuda --use_fast_math - -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_cubin}) + -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file}) if(${experimental}) set(cuda_flags ${cuda_flags} -D __KERNEL_EXPERIMENTAL__) @@ -440,20 +449,21 @@ if(WITH_CYCLES_CUDA_BINARIES) -v -cuda-toolkit-dir "${CUDA_TOOLKIT_ROOT_DIR}" DEPENDS ${kernel_sources} cycles_cubin_cc) + set(cuda_file ${cuda_cubin}) else() add_custom_command( - OUTPUT ${cuda_cubin} + OUTPUT ${cuda_file} COMMAND ${CUDA_NVCC_EXECUTABLE} -arch=${arch} ${CUDA_NVCC_FLAGS} - --cubin + --${format} ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src} --ptxas-options="-v" ${cuda_flags} DEPENDS ${kernel_sources}) endif() - delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib) - list(APPEND cuda_cubins ${cuda_cubin}) + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_file}" ${CYCLES_INSTALL_PATH}/lib) + list(APPEND cuda_cubins ${cuda_file}) unset(cuda_debug_flags) endmacro() diff --git a/intern/cycles/kernel/filter/filter_transform_gpu.h b/intern/cycles/kernel/filter/filter_transform_gpu.h index 41bbadb621d..adc85881fe5 100644 --- a/intern/cycles/kernel/filter/filter_transform_gpu.h +++ b/intern/cycles/kernel/filter/filter_transform_gpu.h @@ -76,9 +76,9 @@ ccl_device void kernel_filter_construct_transform(const ccl_global float *ccl_re filter_calculate_scale(feature_scale, use_time); /* === Generate the feature transformation. === - * This transformation maps the num_features-dimentional feature space to a reduced feature - * (r-feature) space which generally has fewer dimensions. This mainly helps to prevent - * overfitting. */ + * This transformation maps the num_features-dimensional feature space to a reduced feature + * (r-feature) space which generally has fewer dimensions. + * This mainly helps to prevent overfitting. 
*/ float feature_matrix[DENOISE_FEATURES * DENOISE_FEATURES]; math_matrix_zero(feature_matrix, num_features); FOR_PIXEL_WINDOW diff --git a/intern/cycles/kernel/filter/filter_transform_sse.h b/intern/cycles/kernel/filter/filter_transform_sse.h index 830444645d7..5a124b5d73b 100644 --- a/intern/cycles/kernel/filter/filter_transform_sse.h +++ b/intern/cycles/kernel/filter/filter_transform_sse.h @@ -73,9 +73,9 @@ ccl_device void kernel_filter_construct_transform(const float *ccl_restrict buff filter_calculate_scale_sse(feature_scale, use_time); /* === Generate the feature transformation. === - * This transformation maps the num_features-dimentional feature space to a reduced feature - * (r-feature) space which generally has fewer dimensions. This mainly helps to prevent - * overfitting. */ + * This transformation maps the num_features-dimensional feature space to a reduced feature + * (r-feature) space which generally has fewer dimensions. + * This mainly helps to prevent over-fitting. */ float4 feature_matrix_sse[DENOISE_FEATURES * DENOISE_FEATURES]; math_matrix_zero_sse(feature_matrix_sse, num_features); FOR_PIXEL_WINDOW_SSE diff --git a/intern/cycles/util/util_defines.h b/intern/cycles/util/util_defines.h index 2778cffba3a..b29d4163133 100644 --- a/intern/cycles/util/util_defines.h +++ b/intern/cycles/util/util_defines.h @@ -16,127 +16,127 @@ */ #ifndef __UTIL_DEFINES_H__ -# define __UTIL_DEFINES_H__ +#define __UTIL_DEFINES_H__ /* Bitness */ -# if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || \ - defined(_M_X64) -# define __KERNEL_64_BIT__ -# endif +#if defined(__ppc64__) || defined(__PPC64__) || defined(__x86_64__) || defined(__ia64__) || \ + defined(_M_X64) +# define __KERNEL_64_BIT__ +#endif /* Qualifiers for kernel code shared by CPU and GPU */ -# ifndef __KERNEL_GPU__ -# define ccl_device static inline -# define ccl_device_noinline static -# define ccl_device_noinline_cpu ccl_device_noinline -# define ccl_global -# 
define ccl_static_constant static const -# define ccl_constant const -# define ccl_local -# define ccl_local_param -# define ccl_private -# define ccl_restrict __restrict -# define ccl_ref & -# define ccl_optional_struct_init -# define __KERNEL_WITH_SSE_ALIGN__ - -# if defined(_WIN32) && !defined(FREE_WINDOWS) -# define ccl_device_inline static __forceinline -# define ccl_device_forceinline static __forceinline -# define ccl_align(...) __declspec(align(__VA_ARGS__)) -# ifdef __KERNEL_64_BIT__ -# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) -# else /* __KERNEL_64_BIT__ */ -# undef __KERNEL_WITH_SSE_ALIGN__ +#ifndef __KERNEL_GPU__ +# define ccl_device static inline +# define ccl_device_noinline static +# define ccl_device_noinline_cpu ccl_device_noinline +# define ccl_global +# define ccl_static_constant static const +# define ccl_constant const +# define ccl_local +# define ccl_local_param +# define ccl_private +# define ccl_restrict __restrict +# define ccl_ref & +# define ccl_optional_struct_init +# define __KERNEL_WITH_SSE_ALIGN__ + +# if defined(_WIN32) && !defined(FREE_WINDOWS) +# define ccl_device_inline static __forceinline +# define ccl_device_forceinline static __forceinline +# define ccl_align(...) __declspec(align(__VA_ARGS__)) +# ifdef __KERNEL_64_BIT__ +# define ccl_try_align(...) __declspec(align(__VA_ARGS__)) +# else /* __KERNEL_64_BIT__ */ +# undef __KERNEL_WITH_SSE_ALIGN__ /* No support for function arguments (error C2719). */ -# define ccl_try_align(...) -# endif /* __KERNEL_64_BIT__ */ -# define ccl_may_alias -# define ccl_always_inline __forceinline -# define ccl_never_inline __declspec(noinline) -# define ccl_maybe_unused -# else /* _WIN32 && !FREE_WINDOWS */ -# define ccl_device_inline static inline __attribute__((always_inline)) -# define ccl_device_forceinline static inline __attribute__((always_inline)) -# define ccl_align(...) 
__attribute__((aligned(__VA_ARGS__))) -# ifndef FREE_WINDOWS64 -# define __forceinline inline __attribute__((always_inline)) -# endif -# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) -# define ccl_may_alias __attribute__((__may_alias__)) -# define ccl_always_inline __attribute__((always_inline)) -# define ccl_never_inline __attribute__((noinline)) -# define ccl_maybe_unused __attribute__((used)) -# endif /* _WIN32 && !FREE_WINDOWS */ +# define ccl_try_align(...) +# endif /* __KERNEL_64_BIT__ */ +# define ccl_may_alias +# define ccl_always_inline __forceinline +# define ccl_never_inline __declspec(noinline) +# define ccl_maybe_unused +# else /* _WIN32 && !FREE_WINDOWS */ +# define ccl_device_inline static inline __attribute__((always_inline)) +# define ccl_device_forceinline static inline __attribute__((always_inline)) +# define ccl_align(...) __attribute__((aligned(__VA_ARGS__))) +# ifndef FREE_WINDOWS64 +# define __forceinline inline __attribute__((always_inline)) +# endif +# define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) +# define ccl_may_alias __attribute__((__may_alias__)) +# define ccl_always_inline __attribute__((always_inline)) +# define ccl_never_inline __attribute__((noinline)) +# define ccl_maybe_unused __attribute__((used)) +# endif /* _WIN32 && !FREE_WINDOWS */ /* Use to suppress '-Wimplicit-fallthrough' (in place of 'break'). 
*/ -# ifndef ATTR_FALLTHROUGH -# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ -# define ATTR_FALLTHROUGH __attribute__((fallthrough)) -# else -# define ATTR_FALLTHROUGH ((void)0) -# endif +# ifndef ATTR_FALLTHROUGH +# if defined(__GNUC__) && (__GNUC__ >= 7) /* gcc7.0+ only */ +# define ATTR_FALLTHROUGH __attribute__((fallthrough)) +# else +# define ATTR_FALLTHROUGH ((void)0) # endif -# endif /* __KERNEL_GPU__ */ +# endif +#endif /* __KERNEL_GPU__ */ /* macros */ /* hints for branch prediction, only use in code that runs a _lot_ */ -# if defined(__GNUC__) && defined(__KERNEL_CPU__) -# define LIKELY(x) __builtin_expect(!!(x), 1) -# define UNLIKELY(x) __builtin_expect(!!(x), 0) -# else -# define LIKELY(x) (x) -# define UNLIKELY(x) (x) -# endif - -# if defined(__GNUC__) || defined(__clang__) -# if defined(__cplusplus) +#if defined(__GNUC__) && defined(__KERNEL_CPU__) +# define LIKELY(x) __builtin_expect(!!(x), 1) +# define UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +# define LIKELY(x) (x) +# define UNLIKELY(x) (x) +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(__cplusplus) /* Some magic to be sure we don't have reference in the type. */ template<typename T> static inline T decltype_helper(T x) { return x; } -# define TYPEOF(x) decltype(decltype_helper(x)) -# else -# define TYPEOF(x) typeof(x) -# endif +# define TYPEOF(x) decltype(decltype_helper(x)) +# else +# define TYPEOF(x) typeof(x) # endif +#endif /* Causes warning: * incompatible types when assigning to type 'Foo' from type 'Bar' * ... 
the compiler optimizes away the temp var */ -# ifdef __GNUC__ -# define CHECK_TYPE(var, type) \ - { \ - TYPEOF(var) * __tmp; \ - __tmp = (type *)NULL; \ - (void)__tmp; \ - } \ - (void)0 - -# define CHECK_TYPE_PAIR(var_a, var_b) \ - { \ - TYPEOF(var_a) * __tmp; \ - __tmp = (typeof(var_b) *)NULL; \ - (void)__tmp; \ - } \ - (void)0 -# else -# define CHECK_TYPE(var, type) -# define CHECK_TYPE_PAIR(var_a, var_b) -# endif +#ifdef __GNUC__ +# define CHECK_TYPE(var, type) \ + { \ + TYPEOF(var) * __tmp; \ + __tmp = (type *)NULL; \ + (void)__tmp; \ + } \ + (void)0 + +# define CHECK_TYPE_PAIR(var_a, var_b) \ + { \ + TYPEOF(var_a) * __tmp; \ + __tmp = (typeof(var_b) *)NULL; \ + (void)__tmp; \ + } \ + (void)0 +#else +# define CHECK_TYPE(var, type) +# define CHECK_TYPE_PAIR(var_a, var_b) +#endif /* can be used in simple macros */ -# define CHECK_TYPE_INLINE(val, type) ((void)(((type)0) != (val))) - -# ifndef __KERNEL_GPU__ -# include <cassert> -# define util_assert(statement) assert(statement) -# else -# define util_assert(statement) -# endif +#define CHECK_TYPE_INLINE(val, type) ((void)(((type)0) != (val))) + +#ifndef __KERNEL_GPU__ +# include <cassert> +# define util_assert(statement) assert(statement) +#else +# define util_assert(statement) +#endif #endif /* __UTIL_DEFINES_H__ */ diff --git a/intern/cycles/util/util_static_assert.h b/intern/cycles/util/util_static_assert.h index b4b972a4036..ceb52830319 100644 --- a/intern/cycles/util/util_static_assert.h +++ b/intern/cycles/util/util_static_assert.h @@ -15,18 +15,18 @@ */ #ifndef __UTIL_STATIC_ASSERT_H__ -# define __UTIL_STATIC_ASSERT_H__ +#define __UTIL_STATIC_ASSERT_H__ CCL_NAMESPACE_BEGIN /* TODO(sergey): In theory CUDA might work with own static assert * implementation since it's just pure C++. 
-# ifdef __KERNEL_GPU__ -# ifndef static_assert -# define static_assert(statement, message) -# endif -# endif /* __KERNEL_GPU__ */ +#ifdef __KERNEL_GPU__ +# ifndef static_assert +# define static_assert(statement, message) +# endif +#endif /* __KERNEL_GPU__ */ /* TODO(sergey): For until C++11 is a bare minimum for us, * we do a bit of a trickery to show meaningful message so @@ -42,8 +42,8 @@ CCL_NAMESPACE_BEGIN * After C++11 bump it should be possible to glue structure * name to the error message, */ -# define static_assert_align(st, align) \ - static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned") // NOLINT +#define static_assert_align(st, align) \ + static_assert((sizeof(st) % (align) == 0), "Structure must be strictly aligned") // NOLINT CCL_NAMESPACE_END diff --git a/intern/ghost/intern/GHOST_DisplayManagerWin32.cpp b/intern/ghost/intern/GHOST_DisplayManagerWin32.cpp index 3b28f055191..aabaffc7732 100644 --- a/intern/ghost/intern/GHOST_DisplayManagerWin32.cpp +++ b/intern/ghost/intern/GHOST_DisplayManagerWin32.cpp @@ -24,8 +24,6 @@ #include "GHOST_DisplayManagerWin32.h" #include "GHOST_Debug.h" -#undef _WIN32_WINNT -#define _WIN32_WINNT 0x501 // require Windows XP or newer #define WIN32_LEAN_AND_MEAN #include <windows.h> diff --git a/intern/ghost/intern/GHOST_SystemPathsWin32.cpp b/intern/ghost/intern/GHOST_SystemPathsWin32.cpp index bdc403b947e..63a6b7224b5 100644 --- a/intern/ghost/intern/GHOST_SystemPathsWin32.cpp +++ b/intern/ghost/intern/GHOST_SystemPathsWin32.cpp @@ -39,15 +39,16 @@ GHOST_SystemPathsWin32::~GHOST_SystemPathsWin32() const GHOST_TUns8 *GHOST_SystemPathsWin32::getSystemDir(int, const char *versionstr) const { - static char knownpath[MAX_PATH * 3 + 128] = { - 0}; /* 1 utf-16 might translante into 3 utf-8. 2 utf-16 translates into 4 utf-8*/ - wchar_t knownpath_16[MAX_PATH]; + /* 1 utf-16 might translate into 3 utf-8.
2 utf-16 translates into 4 utf-8*/ + static char knownpath[MAX_PATH * 3 + 128] = {0}; + PWSTR knownpath_16 = NULL; - HRESULT hResult = SHGetFolderPathW( - NULL, CSIDL_COMMON_APPDATA, NULL, SHGFP_TYPE_CURRENT, knownpath_16); + HRESULT hResult = SHGetKnownFolderPath( + FOLDERID_ProgramData, KF_FLAG_DEFAULT, NULL, &knownpath_16); if (hResult == S_OK) { conv_utf_16_to_8(knownpath_16, knownpath, MAX_PATH * 3); + CoTaskMemFree(knownpath_16); strcat(knownpath, "\\Blender Foundation\\Blender\\"); strcat(knownpath, versionstr); return (GHOST_TUns8 *)knownpath; @@ -59,12 +60,14 @@ const GHOST_TUns8 *GHOST_SystemPathsWin32::getSystemDir(int, const char *version const GHOST_TUns8 *GHOST_SystemPathsWin32::getUserDir(int, const char *versionstr) const { static char knownpath[MAX_PATH * 3 + 128] = {0}; - wchar_t knownpath_16[MAX_PATH]; + PWSTR knownpath_16 = NULL; - HRESULT hResult = SHGetFolderPathW(NULL, CSIDL_APPDATA, NULL, SHGFP_TYPE_CURRENT, knownpath_16); + HRESULT hResult = SHGetKnownFolderPath( + FOLDERID_RoamingAppData, KF_FLAG_DEFAULT, NULL, &knownpath_16); if (hResult == S_OK) { conv_utf_16_to_8(knownpath_16, knownpath, MAX_PATH * 3); + CoTaskMemFree(knownpath_16); strcat(knownpath, "\\Blender Foundation\\Blender\\"); strcat(knownpath, versionstr); return (GHOST_TUns8 *)knownpath; diff --git a/intern/ghost/intern/GHOST_SystemPathsWin32.h b/intern/ghost/intern/GHOST_SystemPathsWin32.h index 49d241df633..f1924ea51bc 100644 --- a/intern/ghost/intern/GHOST_SystemPathsWin32.h +++ b/intern/ghost/intern/GHOST_SystemPathsWin32.h @@ -28,8 +28,6 @@ # error WIN32 only! #endif // WIN32 -#undef _WIN32_WINNT -#define _WIN32_WINNT 0x501 // require Windows XP or newer #define WIN32_LEAN_AND_MEAN #include <windows.h> diff --git a/intern/ghost/intern/GHOST_TaskbarWin32.h b/intern/ghost/intern/GHOST_TaskbarWin32.h index b1b81337494..abf1172cea8 100644 --- a/intern/ghost/intern/GHOST_TaskbarWin32.h +++ b/intern/ghost/intern/GHOST_TaskbarWin32.h @@ -24,10 +24,6 @@ # error WIN32 only! 
#endif // WIN32 -/* require Windows XP or newer */ -#undef _WIN32_WINNT -#define _WIN32_WINNT 0x501 - #define WIN32_LEAN_AND_MEAN #include <windows.h> #include <shlobj.h> diff --git a/intern/ghost/intern/GHOST_WindowViewCocoa.h b/intern/ghost/intern/GHOST_WindowViewCocoa.h index 9ed339c9992..cee40924b73 100644 --- a/intern/ghost/intern/GHOST_WindowViewCocoa.h +++ b/intern/ghost/intern/GHOST_WindowViewCocoa.h @@ -58,6 +58,11 @@ return YES; } +- (BOOL)acceptsFirstMouse:(NSEvent *)event +{ + return YES; +} + // The trick to prevent Cocoa from complaining (beeping) - (void)keyDown:(NSEvent *)event { diff --git a/intern/opensubdiv/internal/opensubdiv_evaluator.cc b/intern/opensubdiv/internal/opensubdiv_evaluator.cc index 2500691885c..4f5a1db82ca 100644 --- a/intern/opensubdiv/internal/opensubdiv_evaluator.cc +++ b/intern/opensubdiv/internal/opensubdiv_evaluator.cc @@ -102,6 +102,17 @@ void evaluateLimit(OpenSubdiv_Evaluator *evaluator, evaluator->internal->eval_output->evaluateLimit(ptex_face_index, face_u, face_v, P, dPdu, dPdv); } +void evaluatePatchesLimit(OpenSubdiv_Evaluator *evaluator, + const OpenSubdiv_PatchCoord *patch_coords, + const int num_patch_coords, + float *P, + float *dPdu, + float *dPdv) +{ + evaluator->internal->eval_output->evaluatePatchesLimit( + patch_coords, num_patch_coords, P, dPdu, dPdv); +} + void evaluateVarying(OpenSubdiv_Evaluator *evaluator, const int ptex_face_index, float face_u, @@ -137,6 +148,8 @@ void assignFunctionPointers(OpenSubdiv_Evaluator *evaluator) evaluator->evaluateLimit = evaluateLimit; evaluator->evaluateVarying = evaluateVarying; evaluator->evaluateFaceVarying = evaluateFaceVarying; + + evaluator->evaluatePatchesLimit = evaluatePatchesLimit; } } // namespace diff --git a/intern/opensubdiv/internal/opensubdiv_evaluator_internal.cc b/intern/opensubdiv/internal/opensubdiv_evaluator_internal.cc index fa45c0119ec..c5dd4509976 100644 --- a/intern/opensubdiv/internal/opensubdiv_evaluator_internal.cc +++ 
b/intern/opensubdiv/internal/opensubdiv_evaluator_internal.cc @@ -54,109 +54,133 @@ using OpenSubdiv::Osd::CpuPatchTable; using OpenSubdiv::Osd::CpuVertexBuffer; using OpenSubdiv::Osd::PatchCoord; -// TODO(sergey): Remove after official requirement bump for OSD version. -#if OPENSUBDIV_VERSION_NUMBER >= 30200 -# define OPENSUBDIV_HAS_FVAR_EVALUATION -#else -# undef OPENSUBDIV_HAS_FVAR_EVALUATION -#endif - namespace opensubdiv_capi { namespace { -// Helper class to wrap numerous of patch coordinates into a buffer. -// Used to pass coordinates to the CPU evaluator. Other evaluators are not -// supported. -class PatchCoordBuffer : public vector<PatchCoord> { +// Array implementation which stores small data on stack (or, rather, in the class itself). +template<typename T, int kNumMaxElementsOnStack> class StackOrHeapArray { public: - static PatchCoordBuffer *Create(int size) + StackOrHeapArray() + : num_elements_(0), heap_elements_(NULL), num_heap_elements_(0), effective_elements_(NULL) { - PatchCoordBuffer *buffer = new PatchCoordBuffer(); - buffer->resize(size); - return buffer; } - PatchCoord *BindCpuBuffer() + explicit StackOrHeapArray(int size) : StackOrHeapArray() { - return reinterpret_cast<PatchCoord *>(&(*this)[0]); + resize(size); } - int GetNumVertices() + ~StackOrHeapArray() { - return size(); + delete[] heap_elements_; } - void UpdateData(const PatchCoord *patch_coords, int num_patch_coords) + int size() const { - memcpy(&(*this)[0], - reinterpret_cast<const void *>(patch_coords), - sizeof(PatchCoord) * num_patch_coords); - } -}; + return num_elements_; + }; -// Helper class to wrap single of patch coord into a buffer. Used to pass -// coordinates to the CPU evaluator. Other evaluators are not supported. 
-class SinglePatchCoordBuffer { - public: - static SinglePatchCoordBuffer *Create() + T *data() { - return new SinglePatchCoordBuffer(); + return effective_elements_; } - SinglePatchCoordBuffer() + void resize(int num_elements) { + const int old_num_elements = num_elements_; + num_elements_ = num_elements; + // Early output if allocation size did not change, or allocation size is smaller. + // We never re-allocate, sacrificing some memory over performance. + if (old_num_elements >= num_elements) { + return; + } + // Simple case: no previously allocated buffer, can simply do one allocation. + if (effective_elements_ == NULL) { + effective_elements_ = allocate(num_elements); + return; + } + // Make new allocation, and copy elements if needed. + T *old_buffer = effective_elements_; + effective_elements_ = allocate(num_elements); + if (old_buffer != effective_elements_) { + memcpy(effective_elements_, old_buffer, sizeof(T) * min(old_num_elements, num_elements)); + } + if (old_buffer != stack_elements_) { + delete[] old_buffer; + } } - explicit SinglePatchCoordBuffer(const PatchCoord &patch_coord) : patch_coord_(patch_coord) + protected: + T *allocate(int num_elements) { + if (num_elements < kNumMaxElementsOnStack) { + return stack_elements_; + } + heap_elements_ = new T[num_elements]; + return heap_elements_; } - PatchCoord *BindCpuBuffer() - { - return &patch_coord_; - } + // Number of elements in the buffer. + int num_elements_; - int GetNumVertices() + // Elements which are allocated on a stack (or, rather, in the same allocation as the buffer + // itself). + // Is used as long as buffer is smaller than kNumMaxElementsOnStack. + T stack_elements_[kNumMaxElementsOnStack]; + + // Heap storage for buffer larger than kNumMaxElementsOnStack. + T *heap_elements_; + int num_heap_elements_; + + // Depending on the current buffer size points to either stack_elements_ or heap_elements_.
+ T *effective_elements_; +}; + +// 32 is a number of inner vertices along the patch size at subdivision level 6. +typedef StackOrHeapArray<PatchCoord, 32 * 32> StackOrHeapPatchCoordArray; + +// Buffer which implements API required by OpenSubdiv and uses an existing memory as an underlying +// storage. +template<typename T> class RawDataWrapperBuffer { + public: + RawDataWrapperBuffer(T *data) : data_(data) { - return 1; } - void UpdateData(const PatchCoord &patch_coord) + T *BindCpuBuffer() { - patch_coord_ = patch_coord; + return data_; } + // TODO(sergey): Support UpdateData(). + protected: - PatchCoord patch_coord_; + T *data_; }; -// Helper class which is aimed to be used in cases when buffer is small enough -// and better to be allocated in stack rather than in heap. -// -// TODO(sergey): Check if bare arrays could be used by CPU evaluator. -template<int element_size, int num_vertices> class StackAllocatedBuffer { +template<typename T> class RawDataWrapperVertexBuffer : public RawDataWrapperBuffer<T> { public: - static PatchCoordBuffer *Create(int /*size*/) + RawDataWrapperVertexBuffer(T *data, int num_vertices) + : RawDataWrapperBuffer<T>(data), num_vertices_(num_vertices) { - // TODO(sergey): Validate that requested size is smaller than static - // stack memory size. - return new StackAllocatedBuffer<element_size, num_vertices>(); - } - - float *BindCpuBuffer() - { - return &data_[0]; } int GetNumVertices() { - return num_vertices; + return num_vertices_; } - // TODO(sergey): Support UpdateData(). 
protected: - float data_[element_size * num_vertices]; + int num_vertices_; +}; + +class ConstPatchCoordWrapperBuffer : public RawDataWrapperVertexBuffer<const PatchCoord> { + public: + ConstPatchCoordWrapperBuffer(const PatchCoord *data, int num_vertices) + : RawDataWrapperVertexBuffer(data, num_vertices) + { + } }; template<typename EVAL_VERTEX_BUFFER, @@ -217,11 +241,12 @@ class FaceVaryingVolatileEval { device_context_); } - void evalPatch(const PatchCoord &patch_coord, float face_varying[2]) + // NOTE: face_varying must point to a memory of at least float[2]*num_patch_coords. + void evalPatches(const PatchCoord *patch_coord, const int num_patch_coords, float *face_varying) { - StackAllocatedBuffer<2, 1> face_varying_data; + RawDataWrapperBuffer<float> face_varying_data(face_varying); BufferDescriptor face_varying_desc(0, 2, 2); - SinglePatchCoordBuffer patch_coord_buffer(patch_coord); + ConstPatchCoordWrapperBuffer patch_coord_buffer(patch_coord, num_patch_coords); const EVALUATOR *eval_instance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>( evaluator_cache_, src_face_varying_desc_, face_varying_desc, device_context_); EVALUATOR::EvalPatchesFaceVarying(src_face_varying_data_, @@ -234,8 +259,6 @@ class FaceVaryingVolatileEval { face_varying_channel_, eval_instance, device_context_); - const float *refined_face_varying = face_varying_data.BindCpuBuffer(); - memcpy(face_varying, refined_face_varying, sizeof(float) * 2); } protected: @@ -297,7 +320,6 @@ class VolatileEvalOutput { src_data_ = SRC_VERTEX_BUFFER::Create(3, num_total_vertices, device_context_); src_varying_data_ = SRC_VERTEX_BUFFER::Create(3, num_total_vertices, device_context_); patch_table_ = PATCH_TABLE::Create(patch_table, device_context_); - patch_coords_ = NULL; vertex_stencils_ = convertToCompatibleStencilTable<STENCIL_TABLE>(vertex_stencils, device_context_); varying_stencils_ = convertToCompatibleStencilTable<STENCIL_TABLE>(varying_stencils, @@ -398,74 +420,66 @@ class VolatileEvalOutput { } } - 
void evalPatchCoord(const PatchCoord &patch_coord, float P[3]) + // NOTE: P must point to a memory of at least float[3]*num_patch_coords. + void evalPatches(const PatchCoord *patch_coord, const int num_patch_coords, float *P) { - StackAllocatedBuffer<6, 1> vertex_data; - // TODO(sergey): Varying data is interleaved in vertex array, so need to - // adjust stride if there is a varying data. - // BufferDescriptor vertex_desc(0, 3, 6); - BufferDescriptor vertex_desc(0, 3, 3); - SinglePatchCoordBuffer patch_coord_buffer(patch_coord); + RawDataWrapperBuffer<float> P_data(P); + // TODO(sergey): Support interleaved vertex-varying data. + BufferDescriptor P_desc(0, 3, 3); + ConstPatchCoordWrapperBuffer patch_coord_buffer(patch_coord, num_patch_coords); const EVALUATOR *eval_instance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>( - evaluator_cache_, src_desc_, vertex_desc, device_context_); + evaluator_cache_, src_desc_, P_desc, device_context_); EVALUATOR::EvalPatches(src_data_, src_desc_, - &vertex_data, - vertex_desc, + &P_data, + P_desc, patch_coord_buffer.GetNumVertices(), &patch_coord_buffer, patch_table_, eval_instance, device_context_); - const float *refined_vertices = vertex_data.BindCpuBuffer(); - memcpy(P, refined_vertices, sizeof(float) * 3); } - void evalPatchesWithDerivatives(const PatchCoord &patch_coord, - float P[3], - float dPdu[3], - float dPdv[3]) + // NOTE: P, dPdu, dPdv must point to a memory of at least float[3]*num_patch_coords. + void evalPatchesWithDerivatives(const PatchCoord *patch_coord, + const int num_patch_coords, + float *P, + float *dPdu, + float *dPdv) { - StackAllocatedBuffer<6, 1> vertex_data, derivatives; - // TODO(sergey): Varying data is interleaved in vertex array, so need to - // adjust stride if there is a varying data. 
- // BufferDescriptor vertex_desc(0, 3, 6); - BufferDescriptor vertex_desc(0, 3, 3); - BufferDescriptor du_desc(0, 3, 6), dv_desc(3, 3, 6); - SinglePatchCoordBuffer patch_coord_buffer(patch_coord); + assert(dPdu); + assert(dPdv); + RawDataWrapperBuffer<float> P_data(P); + RawDataWrapperBuffer<float> dPdu_data(dPdu), dPdv_data(dPdv); + // TODO(sergey): Support interleaved vertex-varying data. + BufferDescriptor P_desc(0, 3, 3); + BufferDescriptor dpDu_desc(0, 3, 3), pPdv_desc(0, 3, 3); + ConstPatchCoordWrapperBuffer patch_coord_buffer(patch_coord, num_patch_coords); const EVALUATOR *eval_instance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>( - evaluator_cache_, src_desc_, vertex_desc, du_desc, dv_desc, device_context_); + evaluator_cache_, src_desc_, P_desc, dpDu_desc, pPdv_desc, device_context_); EVALUATOR::EvalPatches(src_data_, src_desc_, - &vertex_data, - vertex_desc, - &derivatives, - du_desc, - &derivatives, - dv_desc, + &P_data, + P_desc, + &dPdu_data, + dpDu_desc, + &dPdv_data, + pPdv_desc, patch_coord_buffer.GetNumVertices(), &patch_coord_buffer, patch_table_, eval_instance, device_context_); - const float *refined_vertices = vertex_data.BindCpuBuffer(); - memcpy(P, refined_vertices, sizeof(float) * 3); - if (dPdu != NULL || dPdv != NULL) { - const float *refined_derivatives = derivatives.BindCpuBuffer(); - if (dPdu != NULL) { - memcpy(dPdu, refined_derivatives, sizeof(float) * 3); - } - if (dPdv != NULL) { - memcpy(dPdv, refined_derivatives + 3, sizeof(float) * 3); - } - } } - void evalPatchVarying(const PatchCoord &patch_coord, float varying[3]) + // NOTE: varying must point to a memory of at least float[3]*num_patch_coords. 
+ void evalPatchesVarying(const PatchCoord *patch_coord, + const int num_patch_coords, + float *varying) { - StackAllocatedBuffer<6, 1> varying_data; + RawDataWrapperBuffer<float> varying_data(varying); BufferDescriptor varying_desc(3, 3, 6); - SinglePatchCoordBuffer patch_coord_buffer(patch_coord); + ConstPatchCoordWrapperBuffer patch_coord_buffer(patch_coord, num_patch_coords); const EVALUATOR *eval_instance = OpenSubdiv::Osd::GetEvaluator<EVALUATOR>( evaluator_cache_, src_varying_desc_, varying_desc, device_context_); EVALUATOR::EvalPatchesVarying(src_varying_data_, @@ -477,23 +491,22 @@ class VolatileEvalOutput { patch_table_, eval_instance, device_context_); - const float *refined_varying = varying_data.BindCpuBuffer(); - memcpy(varying, refined_varying, sizeof(float) * 3); } - void evalPatchFaceVarying(const int face_varying_channel, - const PatchCoord &patch_coord, - float face_varying[2]) + void evalPatchesFaceVarying(const int face_varying_channel, + const PatchCoord *patch_coord, + const int num_patch_coords, + float face_varying[2]) { assert(face_varying_channel >= 0); assert(face_varying_channel < face_varying_evaluators.size()); - face_varying_evaluators[face_varying_channel]->evalPatch(patch_coord, face_varying); + face_varying_evaluators[face_varying_channel]->evalPatches( + patch_coord, num_patch_coords, face_varying); } private: SRC_VERTEX_BUFFER *src_data_; SRC_VERTEX_BUFFER *src_varying_data_; - PatchCoordBuffer *patch_coords_; PATCH_TABLE *patch_table_; BufferDescriptor src_desc_; BufferDescriptor src_varying_desc_; @@ -510,6 +523,19 @@ class VolatileEvalOutput { DEVICE_CONTEXT *device_context_; }; +void convertPatchCoordsToArray(const OpenSubdiv_PatchCoord *patch_coords, + const int num_patch_coords, + const OpenSubdiv::Far::PatchMap *patch_map, + StackOrHeapPatchCoordArray *array) +{ + array->resize(num_patch_coords); + for (int i = 0; i < num_patch_coords; ++i) { + const PatchTable::PatchHandle *handle = patch_map->FindPatch( + 
patch_coords[i].ptex_face, patch_coords[i].u, patch_coords[i].v); + (array->data())[i] = PatchCoord(*handle, patch_coords[i].u, patch_coords[i].v); + } +} + } // namespace // Note: Define as a class instead of typedcef to make it possible @@ -653,10 +679,10 @@ void CpuEvalOutputAPI::evaluateLimit(const int ptex_face_index, const PatchTable::PatchHandle *handle = patch_map_->FindPatch(ptex_face_index, face_u, face_v); PatchCoord patch_coord(*handle, face_u, face_v); if (dPdu != NULL || dPdv != NULL) { - implementation_->evalPatchesWithDerivatives(patch_coord, P, dPdu, dPdv); + implementation_->evalPatchesWithDerivatives(&patch_coord, 1, P, dPdu, dPdv); } else { - implementation_->evalPatchCoord(patch_coord, P); + implementation_->evalPatches(&patch_coord, 1, P); } } @@ -671,7 +697,7 @@ void CpuEvalOutputAPI::evaluateVarying(const int ptex_face_index, assert(face_v <= 1.0f); const PatchTable::PatchHandle *handle = patch_map_->FindPatch(ptex_face_index, face_u, face_v); PatchCoord patch_coord(*handle, face_u, face_v); - implementation_->evalPatchVarying(patch_coord, varying); + implementation_->evalPatchesVarying(&patch_coord, 1, varying); } void CpuEvalOutputAPI::evaluateFaceVarying(const int face_varying_channel, @@ -686,7 +712,24 @@ void CpuEvalOutputAPI::evaluateFaceVarying(const int face_varying_channel, assert(face_v <= 1.0f); const PatchTable::PatchHandle *handle = patch_map_->FindPatch(ptex_face_index, face_u, face_v); PatchCoord patch_coord(*handle, face_u, face_v); - implementation_->evalPatchFaceVarying(face_varying_channel, patch_coord, face_varying); + implementation_->evalPatchesFaceVarying(face_varying_channel, &patch_coord, 1, face_varying); +} + +void CpuEvalOutputAPI::evaluatePatchesLimit(const OpenSubdiv_PatchCoord *patch_coords, + const int num_patch_coords, + float *P, + float *dPdu, + float *dPdv) +{ + StackOrHeapPatchCoordArray patch_coords_array; + convertPatchCoordsToArray(patch_coords, num_patch_coords, patch_map_, &patch_coords_array); + if 
(dPdu != NULL || dPdv != NULL) { + implementation_->evalPatchesWithDerivatives( + patch_coords_array.data(), num_patch_coords, P, dPdu, dPdv); + } + else { + implementation_->evalPatches(patch_coords_array.data(), num_patch_coords, P); + } } } // namespace opensubdiv_capi @@ -757,7 +800,6 @@ OpenSubdiv_EvaluatorInternal *openSubdiv_createEvaluatorInternal( } // Face warying stencil. vector<const StencilTable *> all_face_varying_stencils; -#ifdef OPENSUBDIV_HAS_FVAR_EVALUATION all_face_varying_stencils.reserve(num_face_varying_channels); for (int face_varying_channel = 0; face_varying_channel < num_face_varying_channels; ++face_varying_channel) { @@ -769,7 +811,6 @@ OpenSubdiv_EvaluatorInternal *openSubdiv_createEvaluatorInternal( all_face_varying_stencils.push_back( StencilTableFactory::Create(*refiner, face_varying_stencil_options)); } -#endif // Generate bi-cubic patch table for the limit surface. // TODO(sergey): Ideally we would want to expose end-cap settings via // C-API to make it more generic. Currently it matches old Blender's @@ -800,7 +841,6 @@ OpenSubdiv_EvaluatorInternal *openSubdiv_createEvaluatorInternal( varying_stencils = table; } } -#ifdef OPENSUBDIV_HAS_FVAR_EVALUATION for (int face_varying_channel = 0; face_varying_channel < num_face_varying_channels; ++face_varying_channel) { const StencilTable *table = StencilTableFactory::AppendLocalPointStencilTableFaceVarying( @@ -813,7 +853,6 @@ OpenSubdiv_EvaluatorInternal *openSubdiv_createEvaluatorInternal( all_face_varying_stencils[face_varying_channel] = table; } } -#endif // Create OpenSubdiv's CPU side evaluator. // TODO(sergey): Make it possible to use different evaluators. 
opensubdiv_capi::CpuEvalOutput *eval_output = new opensubdiv_capi::CpuEvalOutput( diff --git a/intern/opensubdiv/internal/opensubdiv_evaluator_internal.h b/intern/opensubdiv/internal/opensubdiv_evaluator_internal.h index 7c963227d17..392633944c6 100644 --- a/intern/opensubdiv/internal/opensubdiv_evaluator_internal.h +++ b/intern/opensubdiv/internal/opensubdiv_evaluator_internal.h @@ -26,6 +26,7 @@ #include <opensubdiv/far/patchMap.h> #include <opensubdiv/far/patchTable.h> +struct OpenSubdiv_PatchCoord; struct OpenSubdiv_TopologyRefiner; namespace opensubdiv_capi { @@ -114,6 +115,18 @@ class CpuEvalOutputAPI { float face_v, float face_varying[2]); + // Batched evaluation of multiple input coordinates. + + // Evaluate given ptex face at given bilinear coordinate. + // If derivatives are NULL, they will not be evaluated. + // + // NOTE: Output arrays must point to a memory of size float[3]*num_patch_coords. + void evaluatePatchesLimit(const OpenSubdiv_PatchCoord *patch_coords, + const int num_patch_coords, + float *P, + float *dPdu, + float *dPdv); + protected: CpuEvalOutput *implementation_; OpenSubdiv::Far::PatchMap *patch_map_; diff --git a/intern/opensubdiv/opensubdiv_capi_type.h b/intern/opensubdiv/opensubdiv_capi_type.h index 35eeb71dede..e759c5f43b0 100644 --- a/intern/opensubdiv/opensubdiv_capi_type.h +++ b/intern/opensubdiv/opensubdiv_capi_type.h @@ -58,6 +58,13 @@ typedef enum OpenSubdiv_FVarLinearInterpolation { OSD_FVAR_LINEAR_INTERPOLATION_ALL, } OpenSubdiv_FVarLinearInterpolation; +typedef struct OpenSubdiv_PatchCoord { + int ptex_face; + + // Parametric location on patch. 
+ float u, v; +} OpenSubdiv_PatchCoord; + #ifdef __cplusplus } #endif diff --git a/intern/opensubdiv/opensubdiv_evaluator_capi.h b/intern/opensubdiv/opensubdiv_evaluator_capi.h index ceb0c58feba..1572d01b851 100644 --- a/intern/opensubdiv/opensubdiv_evaluator_capi.h +++ b/intern/opensubdiv/opensubdiv_evaluator_capi.h @@ -24,6 +24,7 @@ extern "C" { #endif struct OpenSubdiv_EvaluatorInternal; +struct OpenSubdiv_PatchCoord; struct OpenSubdiv_TopologyRefiner; typedef struct OpenSubdiv_Evaluator { @@ -108,6 +109,19 @@ typedef struct OpenSubdiv_Evaluator { float face_v, float face_varying[2]); + // Batched evaluation of multiple input coordinates. + + // Evaluate limit surface. + // If derivatives are NULL, they will not be evaluated. + // + // NOTE: Output arrays must point to a memory of size float[3]*num_patch_coords. + void (*evaluatePatchesLimit)(struct OpenSubdiv_Evaluator *evaluator, + const struct OpenSubdiv_PatchCoord *patch_coords, + const int num_patch_coords, + float *P, + float *dPdu, + float *dPdv); + // Internal storage for the use in this module only. // // This is where actual OpenSubdiv's evaluator is living. |