diff options
author | Campbell Barton <ideasman42@gmail.com> | 2017-11-30 10:30:41 +0300 |
---|---|---|
committer | Campbell Barton <ideasman42@gmail.com> | 2017-11-30 10:30:41 +0300 |
commit | 03a5eccc945497da3cffafdf050c17fb4ccf999c (patch) | |
tree | b451841a8bb0e614184de8a530322f4e45736b51 | |
parent | 84d39ab97bba5d4885a062363c8b3f3d28cb7da7 (diff) | |
parent | 28d2148b09a2d975876f4418bd5ba15ab5edea8d (diff) |
Merge branch 'master' into blender2.8
32 files changed, 891 insertions, 446 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 40671071921..2f159d40dd1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -195,7 +195,7 @@ endif() # Options # First platform specific non-cached vars -if(UNIX AND NOT APPLE) +if(UNIX AND NOT (APPLE OR HAIKU)) set(WITH_X11 ON) endif() diff --git a/build_files/cmake/Modules/FindSDL2.cmake b/build_files/cmake/Modules/FindSDL2.cmake index 7c40a6ef6f3..e84c6845156 100644 --- a/build_files/cmake/Modules/FindSDL2.cmake +++ b/build_files/cmake/Modules/FindSDL2.cmake @@ -42,7 +42,7 @@ FIND_PATH(SDL2_INCLUDE_DIR HINTS ${_sdl2_SEARCH_DIRS} PATH_SUFFIXES - include/SDL2 include + include/SDL2 include SDL2 ) FIND_LIBRARY(SDL2_LIBRARY diff --git a/build_files/cmake/macros.cmake b/build_files/cmake/macros.cmake index cdc63ef8fba..2e16b9238b3 100644 --- a/build_files/cmake/macros.cmake +++ b/build_files/cmake/macros.cmake @@ -1519,6 +1519,7 @@ function(find_python_package PATH_SUFFIXES site-packages dist-packages + vendor-packages NO_DEFAULT_PATH ) @@ -1529,6 +1530,8 @@ function(find_python_package "'${PYTHON_LIBPATH}/python${_PY_VER_MAJOR}/site-packages/${package}', " "'${PYTHON_LIBPATH}/python${PYTHON_VERSION}/dist-packages/${package}', " "'${PYTHON_LIBPATH}/python${_PY_VER_MAJOR}/dist-packages/${package}', " + "'${PYTHON_LIBPATH}/python${PYTHON_VERSION}/vendor-packages/${package}', " + "'${PYTHON_LIBPATH}/python${_PY_VER_MAJOR}/vendor-packages/${package}', " "\n" "The 'WITH_PYTHON_INSTALL_${_upper_package}' option will be ignored when installing Python.\n" "The build will be usable, only add-ons that depend on this package won't be functional." diff --git a/build_files/cmake/platform/platform_unix.cmake b/build_files/cmake/platform/platform_unix.cmake index 3581fe64341..d5e29164696 100644 --- a/build_files/cmake/platform/platform_unix.cmake +++ b/build_files/cmake/platform/platform_unix.cmake @@ -358,7 +358,11 @@ if(WITH_OPENSUBDIV OR WITH_CYCLES_OPENSUBDIV) endif() # OpenSuse needs lutil, ArchLinux not, for now keep, can avoid by using --as-needed -list(APPEND PLATFORM_LINKLIBS -lutil -lc -lm) +if(HAIKU) + list(APPEND PLATFORM_LINKLIBS -lnetwork) +else() + list(APPEND PLATFORM_LINKLIBS -lutil -lc -lm) +endif() find_package(Threads REQUIRED) list(APPEND PLATFORM_LINKLIBS ${CMAKE_THREAD_LIBS_INIT}) diff --git a/extern/glog/src/config.h b/extern/glog/src/config.h index f5c9c0b0a7b..2703b7ba9dd 100644 --- a/extern/glog/src/config.h +++ b/extern/glog/src/config.h @@ -14,4 +14,6 @@ #include "windows/config.h" #elif defined(__GNU__) #include "config_hurd.h" +#elif defined(__HAIKU__) + #include "config_haiku.h" #endif diff --git a/extern/glog/src/config_haiku.h b/extern/glog/src/config_haiku.h new file mode 100644 index 00000000000..4e6703bc186 --- /dev/null +++ b/extern/glog/src/config_haiku.h @@ -0,0 +1,172 @@ +/* src/config.h. Generated from config.h.in by configure. */ +/* src/config.h.in. Generated from configure.ac by autoheader. */ + +/* Namespace for Google classes */ +#define GOOGLE_NAMESPACE google + +/* Define if you have the `dladdr' function */ +#define HAVE_DLADDR 1 + +/* Define to 1 if you have the <dlfcn.h> header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you have the <execinfo.h> header file. */ +#undef HAVE_EXECINFO_H + +/* Define if you have the `fcntl' function */ +#define HAVE_FCNTL 1 + +/* Define to 1 if you have the <glob.h> header file. */ +#define HAVE_GLOB_H 1 + +/* Define to 1 if you have the <inttypes.h> header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the `pthread' library (-lpthread). */ +#define HAVE_LIBPTHREAD 1 + +/* Define to 1 if you have the <libunwind.h> header file. */ +/* #undef HAVE_LIBUNWIND_H */ + +/* define if you have google gflags library */ +#define HAVE_LIB_GFLAGS 1 + +/* define if you have google gmock library */ +/* #undef HAVE_LIB_GMOCK */ + +/* define if you have google gtest library */ +/* #undef HAVE_LIB_GTEST */ + +/* define if you have libunwind */ +/* #undef HAVE_LIB_UNWIND */ + +/* Define to 1 if you have the <memory.h> header file. */ +#define HAVE_MEMORY_H 1 + +/* define if the compiler implements namespaces */ +#define HAVE_NAMESPACES 1 + +/* Define if you have the 'pread' function */ +#define HAVE_PREAD 1 + +/* Define if you have POSIX threads libraries and header files. */ +#define HAVE_PTHREAD 1 + +/* Define to 1 if you have the <pwd.h> header file. */ +#define HAVE_PWD_H 1 + +/* Define if you have the 'pwrite' function */ +#define HAVE_PWRITE 1 + +/* define if the compiler implements pthread_rwlock_* */ +#define HAVE_RWLOCK 1 + +/* Define if you have the `sigaltstack' function */ +#define HAVE_SIGALTSTACK 1 + +/* Define to 1 if you have the <stdint.h> header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the <stdlib.h> header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the <strings.h> header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the <string.h> header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the <syscall.h> header file. */ +/* #undef HAVE_SYSCALL_H */ + +/* Define to 1 if you have the <syslog.h> header file. */ +#define HAVE_SYSLOG_H 1 + +/* Define to 1 if you have the <sys/stat.h> header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the <sys/syscall.h> header file. */ +/* #undef HAVE_SYS_SYSCALL_H */ + +/* Define to 1 if you have the <sys/time.h> header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the <sys/types.h> header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the <sys/ucontext.h> header file. */ +/* #undef HAVE_SYS_UCONTEXT_H */ + +/* Define to 1 if you have the <sys/utsname.h> header file. */ +#define HAVE_SYS_UTSNAME_H 1 + +/* Define to 1 if you have the <ucontext.h> header file. */ +/* #undef HAVE_UCONTEXT_H */ + +/* Define to 1 if you have the <unistd.h> header file. */ +#define HAVE_UNISTD_H 1 + +/* define if the compiler supports using expression for operator */ +#define HAVE_USING_OPERATOR 1 + +/* define if your compiler has __attribute__ */ +#define HAVE___ATTRIBUTE__ 1 + +/* define if your compiler has __builtin_expect */ +#define HAVE___BUILTIN_EXPECT 1 + +/* define if your compiler has __sync_val_compare_and_swap */ +/* #undef HAVE___SYNC_VAL_COMPARE_AND_SWAP */ + +/* Name of package */ +#define PACKAGE "glog" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "opensource@google.com" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "glog" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "glog 0.3.4" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "glog" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "0.3.4" + +/* How to access the PC from a struct ucontext */ +/* #define PC_FROM_UCONTEXT uc_mcontext.gregs[REG_RIP] */ + +/* Define to necessary symbol if this constant uses a non-standard name on + your system. */ +/* #undef PTHREAD_CREATE_JOINABLE */ + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +/* Define to 1 if you have the ANSI C header files. */ +/* #undef STDC_HEADERS */ + +#define STDC_HEADERS 1 +/* the namespace where STL code like vector<> is defined */ +#define STL_NAMESPACE std + +/* location of source code */ +#define TEST_SRC_DIR "." + +/* Version number of package */ +#define VERSION "0.3.4" + +/* Stops putting the code inside the Google namespace */ +#define _END_GOOGLE_NAMESPACE_ } + +/* Puts following code inside the Google namespace */ +#define _START_GOOGLE_NAMESPACE_ namespace google { + +/* isn't getting defined by configure script when clang compilers are used + and cuases compilation errors in stactrace/unwind modules */ +#ifdef __clang__ +# define NO_FRAME_POINTER +#endif diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp index 999b9230d29..2d28ccd2b49 100644 --- a/intern/cycles/device/device_cpu.cpp +++ b/intern/cycles/device/device_cpu.cpp @@ -190,9 +190,9 @@ public: KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel; KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel; - KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; - KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel; - KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; + KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel; + KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel; + KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel; KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*, int, int, int, int, int, int, int, int, ccl_global int*, int, @@ -565,13 +565,13 @@ public: (float*) color_variance_ptr, difference, local_rect, - task->buffer.w, + task->buffer.stride, task->buffer.pass_stride, 1.0f, task->nlm_k_2); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); - filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4); - filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); + filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4); + filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4); filter_nlm_construct_gramian_kernel()(dx, dy, blurDifference, (float*) task->buffer.mem.device_pointer, @@ -580,9 +580,8 @@ public: (float*) task->storage.XtWX.device_pointer, (float3*) task->storage.XtWY.device_pointer, local_rect, - &task->reconstruction_state.filter_rect.x, - task->buffer.w, - task->buffer.h, + &task->reconstruction_state.filter_window.x, + task->buffer.stride, 4, task->buffer.pass_stride); } @@ -591,8 +590,6 @@ public: filter_finalize_kernel()(x, y, y*task->filter_area.z + x, - task->buffer.w, - task->buffer.h, (float*) output_ptr, (int*) task->storage.rank.device_pointer, (float*) task->storage.XtWX.device_pointer, diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp index ce0df36a76f..2e5f47002e8 100644 --- a/intern/cycles/device/device_cuda.cpp +++ b/intern/cycles/device/device_cuda.cpp @@ -1087,6 +1087,19 @@ public: threads, threads, 1, \ 0, 0, args, 0)); +/* Similar as above, but for 1-dimensional blocks. */ +#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \ + int threads_per_block; \ + cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \ + int xblocks = ((w) + threads_per_block - 1)/threads_per_block; \ + int yblocks = h; + +#define CUDA_LAUNCH_KERNEL_1D(func, args) \ + cuda_assert(cuLaunchKernel(func, \ + xblocks, yblocks, 1, \ + threads_per_block, 1, 1, \ + 0, 0, args, 0)); + bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, DenoisingTask *task) { @@ -1095,60 +1108,65 @@ public: CUDAContextScope scope(this); - int4 rect = task->rect; - int w = align_up(rect.z-rect.x, 4); - int h = rect.w-rect.y; + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; int r = task->nlm_state.r; int f = task->nlm_state.f; float a = task->nlm_state.a; float k_2 = task->nlm_state.k_2; - CUdeviceptr difference = task->nlm_state.temporary_1_ptr; - CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr; - CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr; + int shift_stride = stride*h; + int num_shifts = (2*r+1)*(2*r+1); + int mem_size = sizeof(float)*shift_stride*2*num_shifts; + int channel_offset = 0; - cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h)); - cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h)); + CUdeviceptr temporary_mem; + cuda_assert(cuMemAlloc(&temporary_mem, mem_size)); + CUdeviceptr difference = temporary_mem; + CUdeviceptr blurDifference = temporary_mem + sizeof(float)*shift_stride * num_shifts; - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); - cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr; + cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*shift_stride)); + cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*shift_stride)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + { + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output")); - CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1)); - int dx, dy; - int4 local_rect; - int channel_offset = 0; - void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2}; - void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f}; - void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f}; - void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f}; - - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - dy = i / (2*r+1) - r; - dx = i % (2*r+1) - r; - local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); - - CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); - CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); - CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); - CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args); - } - - local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y); - void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w}; - CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); - cuda_assert(cuCtxSynchronize()); + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts); + + void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &channel_offset, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f}; + void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &shift_stride, &r, &f}; + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args); + } + + cuMemFree(temporary_mem); + + { + CUfunction cuNLMNormalize; + cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize")); + cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1)); + void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride}; + CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h); + CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args); + cuda_assert(cuCtxSynchronize()); + } return !have_error(); } @@ -1194,91 +1212,81 @@ public: mem_zero(task->storage.XtWX); mem_zero(task->storage.XtWY); - CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize; - cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); - cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); - cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); - cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); - cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + int r = task->radius; + int f = 4; + float a = 1.0f; + float k_2 = task->nlm_k_2; - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); - cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); - cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; - CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); + int shift_stride = stride*h; + int num_shifts = (2*r+1)*(2*r+1); + int mem_size = sizeof(float)*shift_stride*num_shifts; - CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr; - CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr; + CUdeviceptr temporary_mem; + cuda_assert(cuMemAlloc(&temporary_mem, 2*mem_size)); + CUdeviceptr difference = temporary_mem; + CUdeviceptr blurDifference = temporary_mem + mem_size; - int r = task->radius; - int f = 4; - float a = 1.0f; - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - int dy = i / (2*r+1) - r; - int dx = i % (2*r+1) - r; - - int local_rect[4] = {max(0, -dx), max(0, -dy), - task->reconstruction_state.source_w - max(0, dx), - task->reconstruction_state.source_h - max(0, dy)}; - - void *calc_difference_args[] = {&dx, &dy, - &color_ptr, - &color_variance_ptr, - &difference, - &local_rect, - &task->buffer.w, - &task->buffer.pass_stride, - &a, - &task->nlm_k_2}; - CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args); - - void *blur_args[] = {&difference, - &blurDifference, - &local_rect, - &task->buffer.w, - &f}; - CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); - - void *calc_weight_args[] = {&blurDifference, - &difference, - &local_rect, - &task->buffer.w, - &f}; - CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args); - - /* Reuse previous arguments. */ - CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args); - - void *construct_gramian_args[] = {&dx, &dy, - &blurDifference, + { + CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian; + cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference")); + cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur")); + cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight")); + cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian")); + + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1)); + cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED)); + + CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, + task->reconstruction_state.source_w * task->reconstruction_state.source_h, + num_shifts); + + void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2}; + void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f}; + void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f}; + void *construct_gramian_args[] = {&blurDifference, &task->buffer.mem.device_pointer, &task->storage.transform.device_pointer, &task->storage.rank.device_pointer, &task->storage.XtWX.device_pointer, &task->storage.XtWY.device_pointer, - &local_rect, - &task->reconstruction_state.filter_rect, - &task->buffer.w, - &task->buffer.h, + &task->reconstruction_state.filter_window, + &w, &h, &stride, + &shift_stride, &r, &f, &task->buffer.pass_stride}; - CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args); - } - - void *finalize_args[] = {&task->buffer.w, - &task->buffer.h, - &output_ptr, - &task->storage.rank.device_pointer, - &task->storage.XtWX.device_pointer, - &task->storage.XtWY.device_pointer, - &task->filter_area, - &task->reconstruction_state.buffer_params.x, - &task->render_buffer.samples}; - CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args); + CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args); + } + + cuMemFree(temporary_mem); + + { + CUfunction cuFinalize; + cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize")); + cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1)); + void *finalize_args[] = {&output_ptr, + &task->storage.rank.device_pointer, + &task->storage.XtWX.device_pointer, + &task->storage.XtWY.device_pointer, + &task->filter_area, + &task->reconstruction_state.buffer_params.x, + &task->render_buffer.samples}; + CUDA_GET_BLOCKSIZE(cuFinalize, + task->reconstruction_state.source_w, + task->reconstruction_state.source_h); + CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args); + } + cuda_assert(cuCtxSynchronize()); return !have_error(); diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp index 69c43e4a8cf..1862deb9a61 100644 --- a/intern/cycles/device/device_denoising.cpp +++ b/intern/cycles/device/device_denoising.cpp @@ -57,10 +57,9 @@ void DenoisingTask::init_from_devicetask(const DeviceTask &task) render_buffer.denoising_clean_offset = task.pass_denoising_clean; /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */ - rect = make_int4(max(tiles->x[0], filter_area.x - radius), - max(tiles->y[0], filter_area.y - radius), - min(tiles->x[3], filter_area.x + filter_area.z + radius), - min(tiles->y[3], filter_area.y + filter_area.w + radius)); + rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w); + rect = rect_expand(rect, radius); + rect = rect_clip(rect, make_int4(tiles->x[0], tiles->y[0], tiles->x[3], tiles->y[3])); } void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles) @@ -93,9 +92,10 @@ bool DenoisingTask::run_denoising() { /* Allocate denoising buffer. */ buffer.passes = 14; - buffer.w = align_up(rect.z - rect.x, 4); + buffer.width = rect.z - rect.x; + buffer.stride = align_up(buffer.width, 4); buffer.h = rect.w - rect.y; - buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float))); + buffer.pass_stride = align_up(buffer.stride * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float))); buffer.mem.alloc_to_device(buffer.pass_stride * buffer.passes, false); device_ptr null_ptr = (device_ptr) 0; @@ -203,15 +203,17 @@ bool DenoisingTask::run_denoising() functions.construct_transform(); - storage.temporary_1.alloc_to_device(buffer.w*buffer.h, false); - storage.temporary_2.alloc_to_device(buffer.w*buffer.h, false); - reconstruction_state.temporary_1_ptr = storage.temporary_1.device_pointer; - reconstruction_state.temporary_2_ptr = storage.temporary_2.device_pointer; + device_only_memory<float> temporary_1(device, "Denoising NLM temporary 1"); + device_only_memory<float> temporary_2(device, "Denoising NLM temporary 2"); + temporary_1.alloc_to_device(buffer.pass_stride, false); + temporary_2.alloc_to_device(buffer.pass_stride, false); + reconstruction_state.temporary_1_ptr = temporary_1.device_pointer; + reconstruction_state.temporary_2_ptr = temporary_2.device_pointer; storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false); storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false); - reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); + reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h); int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x; reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset, render_buffer.stride, diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h index ec4e7933cdc..77a82d0ad04 100644 --- a/intern/cycles/device/device_denoising.h +++ b/intern/cycles/device/device_denoising.h @@ -94,7 +94,7 @@ public: device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */ device_ptr temporary_2_ptr; - int4 filter_rect; + int4 filter_window; int4 buffer_params; int source_w; @@ -148,8 +148,9 @@ public: struct DenoiseBuffers { int pass_stride; int passes; - int w; + int stride; int h; + int width; device_only_memory<float> mem; DenoiseBuffers(Device *device) diff --git a/intern/cycles/device/opencl/opencl.h b/intern/cycles/device/opencl/opencl.h index c02f8ffafe6..f38c2f65c1e 100644 --- a/intern/cycles/device/opencl/opencl.h +++ b/intern/cycles/device/opencl/opencl.h @@ -353,7 +353,9 @@ public: void tex_free(device_memory& mem); size_t global_size_round_up(int group_size, int global_size); - void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1); + void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, + bool x_workgroups = false, + size_t max_workgroup_size = -1); void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name); void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg); diff --git a/intern/cycles/device/opencl/opencl_base.cpp b/intern/cycles/device/opencl/opencl_base.cpp index f43177247ef..fe084edc90e 100644 --- a/intern/cycles/device/opencl/opencl_base.cpp +++ b/intern/cycles/device/opencl/opencl_base.cpp @@ -560,7 +560,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size) return global_size + ((r == 0)? 0: group_size - r); } -void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size) +void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size) { size_t workgroup_size, max_work_items[3]; @@ -574,8 +574,15 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size } /* Try to divide evenly over 2 dimensions. */ - size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); - size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size}; + size_t local_size[2]; + if(x_workgroups) { + local_size[0] = workgroup_size; + local_size[1] = 1; + } + else { + size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1); + local_size[0] = local_size[1] = sqrt_workgroup_size; + } /* Some implementations have max size 1 on 2nd dimension. */ if(local_size[1] > max_work_items[1]) { @@ -731,17 +738,25 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, device_ptr out_ptr, DenoisingTask *task) { - int4 rect = task->rect; - int w = rect.z-rect.x; - int h = rect.w-rect.y; + + int stride = task->buffer.stride; + int w = task->buffer.width; + int h = task->buffer.h; int r = task->nlm_state.r; int f = task->nlm_state.f; float a = task->nlm_state.a; float k_2 = task->nlm_state.k_2; - cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr); - cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr); - cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); + int shift_stride = stride*h; + int num_shifts = (2*r+1)*(2*r+1); + int mem_size = sizeof(float)*shift_stride*num_shifts; + + cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr); + + cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means"); + cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means"); cl_mem image_mem = CL_MEM_PTR(image_ptr); cl_mem guide_mem = CL_MEM_PTR(guide_ptr); @@ -757,31 +772,45 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr, cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output")); cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize")); - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - int dy = i / (2*r+1) - r; - int dx = i % (2*r+1) - r; - int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)); - kernel_set_args(ckNLMCalcDifference, 0, - dx, dy, guide_mem, variance_mem, - difference, local_rect, w, 0, a, k_2); - kernel_set_args(ckNLMBlur, 0, - difference, blurDifference, local_rect, w, f); - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference, difference, local_rect, w, f); - kernel_set_args(ckNLMUpdateOutput, 0, - dx, dy, blurDifference, image_mem, - out_mem, weightAccum, local_rect, w, f); - - enqueue_kernel(ckNLMCalcDifference, w, h); - enqueue_kernel(ckNLMBlur, w, h); - enqueue_kernel(ckNLMCalcWeight, w, h); - enqueue_kernel(ckNLMBlur, w, h); - enqueue_kernel(ckNLMUpdateOutput, w, h); - } + kernel_set_args(ckNLMCalcDifference, 0, + guide_mem, + variance_mem, + difference, + w, h, stride, + shift_stride, + r, 0, a, k_2); + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + w, h, stride, + shift_stride, + r, f); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + w, h, stride, + shift_stride, + r, f); + kernel_set_args(ckNLMUpdateOutput, 0, + blurDifference, + image_mem, + out_mem, + weightAccum, + w, h, stride, + shift_stride, + r, f); + + enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMUpdateOutput, w*h, num_shifts, true); + + opencl_assert(clReleaseMemObject(difference)); + opencl_assert(clReleaseMemObject(blurDifference)); - int4 local_rect = make_int4(0, 0, w, h); kernel_set_args(ckNLMNormalize, 0, - out_mem, weightAccum, local_rect, w); + out_mem, weightAccum, w, h, stride); enqueue_kernel(ckNLMNormalize, w, h); return true; @@ -837,81 +866,63 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian")); cl_kernel ckFinalize = denoising_program(ustring("filter_finalize")); - cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr); - cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr); - - int r = task->radius; - int f = 4; - float a = 1.0f; - for(int i = 0; i < (2*r+1)*(2*r+1); i++) { - int dy = i / (2*r+1) - r; - int dx = i % (2*r+1) - r; - - int local_rect[4] = {max(0, -dx), max(0, -dy), - task->reconstruction_state.source_w - max(0, dx), - task->reconstruction_state.source_h - max(0, dy)}; - - kernel_set_args(ckNLMCalcDifference, 0, - dx, dy, - color_mem, - color_variance_mem, - difference, - local_rect, - task->buffer.w, - task->buffer.pass_stride, - a, task->nlm_k_2); - enqueue_kernel(ckNLMCalcDifference, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - - kernel_set_args(ckNLMBlur, 0, - difference, - blurDifference, - local_rect, - task->buffer.w, - f); - enqueue_kernel(ckNLMBlur, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - - kernel_set_args(ckNLMCalcWeight, 0, - blurDifference, - difference, - local_rect, - task->buffer.w, - f); - enqueue_kernel(ckNLMCalcWeight, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - - /* Reuse previous arguments. */ - enqueue_kernel(ckNLMBlur, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); - - kernel_set_args(ckNLMConstructGramian, 0, - dx, dy, - blurDifference, - buffer_mem, - transform_mem, - rank_mem, - XtWX_mem, - XtWY_mem, - local_rect, - task->reconstruction_state.filter_rect, - task->buffer.w, - task->buffer.h, - f, - task->buffer.pass_stride); - enqueue_kernel(ckNLMConstructGramian, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h, - 256); - } + int w = task->reconstruction_state.source_w; + int h = task->reconstruction_state.source_h; + int stride = task->buffer.stride; + + int shift_stride = stride*h; + int num_shifts = (2*task->radius + 1)*(2*task->radius + 1); + int mem_size = sizeof(float)*shift_stride*num_shifts; + + cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); + cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr); + opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct"); + + kernel_set_args(ckNLMCalcDifference, 0, + color_mem, + color_variance_mem, + difference, + w, h, stride, + shift_stride, + task->radius, + task->buffer.pass_stride, + 1.0f, task->nlm_k_2); + kernel_set_args(ckNLMBlur, 0, + difference, + blurDifference, + w, h, stride, + shift_stride, + task->radius, 4); + kernel_set_args(ckNLMCalcWeight, 0, + blurDifference, + difference, + w, h, stride, + shift_stride, + task->radius, 4); + kernel_set_args(ckNLMConstructGramian, 0, + blurDifference, + buffer_mem, + transform_mem, + rank_mem, + XtWX_mem, + XtWY_mem, + task->reconstruction_state.filter_window, + w, h, stride, + shift_stride, + task->radius, 4, + task->buffer.pass_stride); + + enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true); + enqueue_kernel(ckNLMBlur, w*h, num_shifts, true); + enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256); + + opencl_assert(clReleaseMemObject(difference)); + opencl_assert(clReleaseMemObject(blurDifference)); kernel_set_args(ckFinalize, 0, - task->buffer.w, - task->buffer.h, output_mem, rank_mem, XtWX_mem, @@ -919,9 +930,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr, task->filter_area, task->reconstruction_state.buffer_params, task->render_buffer.samples); - enqueue_kernel(ckFinalize, - task->reconstruction_state.source_w, - task->reconstruction_state.source_h); + enqueue_kernel(ckFinalize, w, h); return true; } diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index de056ce97f0..5f10bdf2041 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -254,6 +254,7 @@ set(SRC_UTIL_HEADERS ../util/util_math_int3.h ../util/util_math_int4.h ../util/util_math_matrix.h + ../util/util_rect.h ../util/util_static_assert.h ../util/util_transform.h ../util/util_texture.h diff --git a/intern/cycles/kernel/filter/filter_nlm_cpu.h b/intern/cycles/kernel/filter/filter_nlm_cpu.h index 5e989331bc2..e2da0fd872b 100644 --- a/intern/cycles/kernel/filter/filter_nlm_cpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_cpu.h @@ -21,7 +21,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, const float *ccl_restrict variance_image, float *difference_image, int4 rect, - int w, + int stride, int channel_offset, float a, float k_2) @@ -31,15 +31,15 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float diff = 0.0f; int numChannels = channel_offset? 3 : 1; for(int c = 0; c < numChannels; c++) { - float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)]; - float pvar = variance_image[c*channel_offset + y*w+x]; - float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)]; + float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)]; + float pvar = variance_image[c*channel_offset + y*stride + x]; + float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)]; diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); } if(numChannels > 1) { diff *= 1.0f/numChannels; } - difference_image[y*w+x] = diff; + difference_image[y*stride + x] = diff; } } } @@ -47,7 +47,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, - int w, + int stride, int f) { int aligned_lowx = rect.x / 4; @@ -56,17 +56,17 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); for(int x = rect.x; x < rect.z; x++) { - out_image[y*w+x] = 0.0f; + out_image[y*stride + x] = 0.0f; } for(int y1 = low; y1 < high; y1++) { - float4* out_image4 = (float4*)(out_image + y*w); - float4* difference_image4 = (float4*)(difference_image + y1*w); + float4* out_image4 = (float4*)(out_image + y*stride); + float4* difference_image4 = (float4*)(difference_image + y1*stride); for(int x = aligned_lowx; x < aligned_highx; x++) { out_image4[x] += difference_image4[x]; } } for(int x = rect.x; x < rect.z; x++) { - out_image[y*w+x] *= 1.0f/(high - low); + out_image[y*stride + x] *= 1.0f/(high - low); } } } @@ -74,12 +74,12 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, - int w, + int stride, int f) { for(int y = rect.y; y < rect.w; y++) { for(int x = rect.x; x < rect.z; x++) { - out_image[y*w+x] = 0.0f; + out_image[y*stride + x] = 0.0f; } } for(int dx = -f; dx <= f; dx++) { @@ -87,7 +87,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d int neg_dx = min(0, dx); for(int y = rect.y; y < rect.w; y++) { for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) { - out_image[y*w+x] += difference_image[y*w+dx+x]; + out_image[y*stride + x] += difference_image[y*stride + x+dx]; } } } @@ -95,7 +95,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d for(int x = rect.x; x < rect.z; x++) { const int low = max(rect.x, x-f); const int high = min(rect.z, x+f+1); - out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f)); + out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f)); } } } @@ -106,7 +106,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float *out_image, float *accum_image, int4 rect, - int w, + int stride, int f) { for(int y = rect.y; y < rect.w; y++) { @@ -115,11 +115,11 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, const int high = min(rect.z, x+f+1); float sum = 0.0f; for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*w+x1]; + sum += difference_image[y*stride + x1]; } float weight = sum * (1.0f/(high - low)); - accum_image[y*w+x] += weight; - out_image[y*w+x] += weight*image[(y+dy)*w+(x+dx)]; + accum_image[y*stride + x] += weight; + out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)]; } } } @@ -132,31 +132,31 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy, float *XtWX, float3 *XtWY, int4 rect, - int4 filter_rect, - int w, int h, int f, + int4 filter_window, + int stride, int f, int pass_stride) { + int4 clip_area = rect_clip(rect, filter_window); /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */ - for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) { - int y = fy + filter_rect.y; - for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) { - int x = fx + filter_rect.x; + for(int y = clip_area.y; y < clip_area.w; y++) { + for(int x = clip_area.x; x < clip_area.z; x++) { const int low = max(rect.x, x-f); const int high = min(rect.z, x+f+1); float sum = 0.0f; for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*w+x1]; + sum += difference_image[y*stride + x1]; } float weight = sum * (1.0f/(high - low)); - int storage_ofs = fy*filter_rect.z + fx; + int storage_ofs = coord_to_local_index(filter_window, x, y); float *l_transform = transform + storage_ofs*TRANSFORM_SIZE; float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE; float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE; int *l_rank = rank + storage_ofs; kernel_filter_construct_gramian(x, y, 1, - dx, dy, w, h, + dx, dy, + stride, pass_stride, buffer, l_transform, l_rank, diff --git a/intern/cycles/kernel/filter/filter_nlm_gpu.h b/intern/cycles/kernel/filter/filter_nlm_gpu.h index 2c5ac807051..4ca49ea6733 100644 --- a/intern/cycles/kernel/filter/filter_nlm_gpu.h +++ b/intern/cycles/kernel/filter/filter_nlm_gpu.h @@ -16,57 +16,114 @@ CCL_NAMESPACE_BEGIN +/* Determines pixel coordinates and offset for the current thread. + * Returns whether the thread should do any work. + * + * All coordinates are relative to the denoising buffer! + * + * Window is the rect that should be processed. + * co is filled with (x, y, dx, dy). + */ +ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride, + int4 *rect, int4 *co, int *ofs, + int4 window) +{ + /* Determine the pixel offset that this thread should apply. */ + int s = 2*r+1; + int si = ccl_global_id(1); + int sx = si % s; + int sy = si / s; + if(sy >= s) { + return false; + } + co->z = sx-r; + co->w = sy-r; + + /* Pixels still need to lie inside the denoising buffer after applying the offset, + * so determine the area for which this is the case. */ + *rect = make_int4(max(0, -co->z), max(0, -co->w), + w - max(0, co->z), h - max(0, co->w)); + + /* Find the intersection of the area that we want to process (window) and the area + * that can be processed (rect) to get the final area for this offset. */ + int4 clip_area = rect_clip(window, *rect); + + /* If the radius is larger than one of the sides of the window, + * there will be shifts for which there is no usable pixel at all. */ + if(!rect_is_valid(clip_area)) { + return false; + } + + /* Map the linear thread index to pixels inside the clip area. */ + int x, y; + if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) { + return false; + } + co->x = x; + co->y = y; + + *ofs = (sy*s + sx) * stride; + + return true; +} + +ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride, + int4 *rect, int4 *co, int *ofs) +{ + return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h)); +} + ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y, int dx, int dy, const ccl_global float *ccl_restrict weight_image, const ccl_global float *ccl_restrict variance_image, ccl_global float *difference_image, - int4 rect, int w, + int4 rect, int stride, int channel_offset, float a, float k_2) { float diff = 0.0f; int numChannels = channel_offset? 3 : 1; for(int c = 0; c < numChannels; c++) { - float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)]; - float pvar = variance_image[c*channel_offset + y*w+x]; - float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)]; + float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)]; + float pvar = variance_image[c*channel_offset + y*stride + x]; + float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)]; diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar)); } if(numChannels > 1) { diff *= 1.0f/numChannels; } - difference_image[y*w+x] = diff; + difference_image[y*stride + x] = diff; } ccl_device_inline void kernel_filter_nlm_blur(int x, int y, const ccl_global float *ccl_restrict difference_image, ccl_global float *out_image, - int4 rect, int w, int f) + int4 rect, int stride, int f) { float sum = 0.0f; const int low = max(rect.y, y-f); const int high = min(rect.w, y+f+1); for(int y1 = low; y1 < high; y1++) { - sum += difference_image[y1*w+x]; + sum += difference_image[y1*stride + x]; } sum *= 1.0f/(high-low); - out_image[y*w+x] = sum; + out_image[y*stride + x] = sum; } ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y, const ccl_global float *ccl_restrict difference_image, ccl_global float *out_image, - int4 rect, int w, int f) + int4 rect, int stride, int f) { float sum = 0.0f; const int low = max(rect.x, x-f); const int high = min(rect.z, x+f+1); for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*w+x1]; + sum += difference_image[y*stride + x1]; } sum *= 1.0f/(high-low); - out_image[y*w+x] = fast_expf(-max(sum, 0.0f)); + out_image[y*stride + x] = fast_expf(-max(sum, 0.0f)); } ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, @@ -75,25 +132,25 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y, const ccl_global float *ccl_restrict image, ccl_global float *out_image, ccl_global float *accum_image, - int4 rect, int w, int f) + int4 rect, int stride, int f) { float sum = 0.0f; const int low = max(rect.x, x-f); const int high = min(rect.z, x+f+1); for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*w+x1]; + sum += difference_image[y*stride + x1]; } sum *= 1.0f/(high-low); if(out_image) { - accum_image[y*w+x] += sum; - out_image[y*w+x] += sum*image[(y+dy)*w+(x+dx)]; + atomic_add_and_fetch_float(accum_image + y*stride + x, sum); + atomic_add_and_fetch_float(out_image + y*stride + x, sum*image[(y+dy)*stride + (x+dx)]); } else { - accum_image[y*w+x] = sum; + accum_image[y*stride + x] = sum; } } -ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, +ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y, int dx, int dy, const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict buffer, @@ -102,30 +159,31 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, ccl_global float *XtWX, ccl_global float3 *XtWY, int4 rect, - int4 filter_rect, - int w, int h, int f, + int4 filter_window, + int stride, int f, int pass_stride, int localIdx) { - int y = fy + filter_rect.y; - int x = fx + filter_rect.x; const int low = max(rect.x, x-f); const int high = min(rect.z, x+f+1); float sum = 0.0f; for(int x1 = low; x1 < high; x1++) { - sum += difference_image[y*w+x1]; + sum += difference_image[y*stride + x1]; } float weight = sum * (1.0f/(high - low)); - int storage_ofs = fy*filter_rect.z + fx; + /* Reconstruction data is only stored for pixels inside the filter window, + * so compute the pixels's index in there. */ + int storage_ofs = coord_to_local_index(filter_window, x, y); transform += storage_ofs; rank += storage_ofs; XtWX += storage_ofs; XtWY += storage_ofs; kernel_filter_construct_gramian(x, y, - filter_rect.z*filter_rect.w, - dx, dy, w, h, + rect_size(filter_window), + dx, dy, + stride, pass_stride, buffer, transform, rank, @@ -136,9 +194,9 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy, ccl_device_inline void kernel_filter_nlm_normalize(int x, int y, ccl_global float *out_image, const ccl_global float *ccl_restrict accum_image, - int4 rect, int w) + int stride) { - out_image[y*w+x] /= accum_image[y*w+x]; + out_image[y*stride + x] /= accum_image[y*stride + x]; } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h index 25a3025056c..b7bf322f9ce 100644 --- a/intern/cycles/kernel/filter/filter_reconstruction.h +++ b/intern/cycles/kernel/filter/filter_reconstruction.h @@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN ccl_device_inline void kernel_filter_construct_gramian(int x, int y, int storage_stride, int dx, int dy, - int w, int h, + int buffer_stride, int pass_stride, const ccl_global float *ccl_restrict buffer, const ccl_global float *ccl_restrict transform, @@ -33,8 +33,8 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, return; } - int p_offset = y *w + x; - int q_offset = (y+dy)*w + (x+dx); + int p_offset = y * buffer_stride + x; + int q_offset = (y+dy) * buffer_stride + (x+dx); #ifdef __KERNEL_GPU__ const int stride = storage_stride; @@ -65,7 +65,7 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride); } -ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h, +ccl_device_inline void kernel_filter_finalize(int x, int y, ccl_global float *buffer, ccl_global int *rank, int storage_stride, diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu.h b/intern/cycles/kernel/kernels/cpu/filter_cpu.h index bf13ba62806..4231aba88d7 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu.h @@ -74,7 +74,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, float *variance, float *difference_image, int* rect, - int w, + int stride, int channel_offset, float a, float k_2); @@ -82,13 +82,13 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, float *out_image, int* rect, - int w, + int stride, int f); void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, float *out_image, int* rect, - int w, + int stride, int f); void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, @@ -98,7 +98,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, float *out_image, float *accum_image, int* rect, - int w, + int stride, int f); void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, @@ -110,22 +110,19 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, float *XtWX, float3 *XtWY, int *rect, - int *filter_rect, - int w, - int h, + int *filter_window, + int stride, int f, int pass_stride); void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, float *accum_image, int* rect, - int w); + int stride); void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, int y, int storage_ofs, - int w, - int h, float *buffer, int *rank, float *XtWX, diff --git a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h index 2fbb0ea2bdb..ab39260784b 100644 --- a/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h @@ -150,7 +150,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, float *variance, float *difference_image, int *rect, - int w, + int stride, int channel_offset, float a, float k_2) @@ -158,33 +158,33 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx, #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference); #else - kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), w, channel_offset, a, k_2); + kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), stride, channel_offset, a, k_2); #endif } void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image, float *out_image, int *rect, - int w, + int stride, int f) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur); #else - kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), w, f); + kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f); #endif } void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image, float *out_image, int *rect, - int w, + int stride, int f) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight); #else - kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), w, f); + kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f); #endif } @@ -195,13 +195,13 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx, float *out_image, float *accum_image, int *rect, - int w, + int stride, int f) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output); #else - kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), w, f); + kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f); #endif } @@ -214,36 +214,33 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx, float *XtWX, float3 *XtWY, int *rect, - int *filter_rect, - int w, - int h, + int *filter_window, + int stride, int f, int pass_stride) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian); #else - kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride); + kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_window), stride, f, pass_stride); #endif } void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image, float *accum_image, int *rect, - int w) + int stride) { #ifdef KERNEL_STUB STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize); #else - kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), w); + kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride); #endif } void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, int y, int storage_ofs, - int w, - int h, float *buffer, int *rank, float *XtWX, @@ -257,7 +254,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x, XtWX += storage_ofs*XTWX_SIZE; XtWY += storage_ofs*XTWY_SIZE; rank += storage_ofs; - kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample); + kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample); #endif } diff --git a/intern/cycles/kernel/kernels/cuda/filter.cu b/intern/cycles/kernel/kernels/cuda/filter.cu index c8172355a7f..035f0484488 100644 --- a/intern/cycles/kernel/kernels/cuda/filter.cu +++ b/intern/cycles/kernel/kernels/cuda/filter.cu @@ -134,95 +134,140 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_difference(int dx, int dy, - const float *ccl_restrict weight_image, +kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image, const float *ccl_restrict variance_image, float *difference_image, - int4 rect, int w, + int w, + int h, + int stride, + int shift_stride, + int r, int channel_offset, - float a, float k_2) + float a, + float k_2) { - int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; - int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2); + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, + weight_image, + variance_image, + difference_image + ofs, + rect, stride, + channel_offset, a, k_2); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, + float *out_image, + int w, + int h, + int stride, + int shift_stride, + int r, + int f) { - int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; - int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f); + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_blur(co.x, co.y, + difference_image + ofs, + out_image + ofs, + rect, stride, f); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f) +kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, + float *out_image, + int w, + int h, + int stride, + int shift_stride, + int r, + int f) { - int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; - int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f); + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_calc_weight(co.x, co.y, + difference_image + ofs, + out_image + ofs, + rect, stride, f); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_update_output(int dx, int dy, - const float *ccl_restrict difference_image, +kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image, const float *ccl_restrict image, - float *out_image, float *accum_image, - int4 rect, int w, + float *out_image, + float *accum_image, + int w, + int h, + int stride, + int shift_stride, + int r, int f) { - int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; - int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f); + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, + difference_image + ofs, + image, + out_image, + accum_image, + rect, stride, f); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w) +kernel_cuda_filter_nlm_normalize(float *out_image, + const float *ccl_restrict accum_image, + int w, + int h, + int stride) { - int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x; - int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w); + int x = blockDim.x*blockIdx.x + threadIdx.x; + int y = blockDim.y*blockIdx.y + threadIdx.y; + if(x < w && y < h) { + kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride); } } extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, - const float *ccl_restrict difference_image, +kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image, const float *ccl_restrict buffer, float const* __restrict__ transform, int *rank, float *XtWX, float3 *XtWY, - int4 rect, - int4 filter_rect, - int w, int h, int f, + int4 filter_window, + int w, + int h, + int stride, + int shift_stride, + int r, + int f, int pass_stride) { - int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x); - int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y); - if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { - kernel_filter_nlm_construct_gramian(x, y, - dx, dy, - difference_image, + int4 co, rect; + int ofs; + if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { + kernel_filter_nlm_construct_gramian(co.x, co.y, + co.z, co.w, + difference_image + ofs, buffer, transform, rank, XtWX, XtWY, - rect, filter_rect, - w, h, f, + rect, filter_window, + stride, f, pass_stride, threadIdx.y*blockDim.x + threadIdx.x); } @@ -230,10 +275,12 @@ kernel_cuda_filter_nlm_construct_gramian(int dx, int dy, extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) -kernel_cuda_filter_finalize(int w, int h, - float *buffer, int *rank, - float *XtWX, float3 *XtWY, - int4 filter_area, int4 buffer_params, +kernel_cuda_filter_finalize(float *buffer, + int *rank, + float *XtWX, + float3 *XtWY, + int4 filter_area, + int4 buffer_params, int sample) { int x = blockDim.x*blockIdx.x + threadIdx.x; @@ -243,7 +290,10 @@ kernel_cuda_filter_finalize(int w, int h, rank += storage_ofs; XtWX += storage_ofs; XtWY += storage_ofs; - kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + kernel_filter_finalize(x, y, buffer, rank, + filter_area.z*filter_area.w, + XtWX, XtWY, + buffer_params, sample); } } diff --git a/intern/cycles/kernel/kernels/opencl/filter.cl b/intern/cycles/kernel/kernels/opencl/filter.cl index 7a7b596a350..2b77807c38b 100644 --- a/intern/cycles/kernel/kernels/opencl/filter.cl +++ b/intern/cycles/kernel/kernels/opencl/filter.cl @@ -126,113 +126,136 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_ } } -__kernel void kernel_ocl_filter_nlm_calc_difference(int dx, - int dy, - const ccl_global float *ccl_restrict weight_image, +__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image, const ccl_global float *ccl_restrict variance_image, ccl_global float *difference_image, - int4 rect, int w, + int h, + int stride, + int shift_stride, + int r, int channel_offset, float a, float k_2) { - int x = get_global_id(0) + rect.x; - int y = get_global_id(1) + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2); + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w, + weight_image, + variance_image, + difference_image + ofs, + rect, stride, + channel_offset, a, k_2); } } __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image, ccl_global float *out_image, - int4 rect, int w, + int h, + int stride, + int shift_stride, + int r, int f) { - int x = get_global_id(0) + rect.x; - int y = get_global_id(1) + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f); + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_blur(co.x, co.y, + difference_image + ofs, + out_image + ofs, + rect, stride, f); } } __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image, ccl_global float *out_image, - int4 rect, int w, + int h, + int stride, + int shift_stride, + int r, int f) { - int x = get_global_id(0) + rect.x; - int y = get_global_id(1) + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f); + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_calc_weight(co.x, co.y, + difference_image + ofs, + out_image + ofs, + rect, stride, f); } } -__kernel void kernel_ocl_filter_nlm_update_output(int dx, - int dy, - const ccl_global float *ccl_restrict difference_image, +__kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict image, ccl_global float *out_image, ccl_global float *accum_image, - int4 rect, int w, + int h, + int stride, + int shift_stride, + int r, int f) { - int x = get_global_id(0) + rect.x; - int y = get_global_id(1) + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f); + int4 co, rect; + int ofs; + if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) { + kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w, + difference_image + ofs, + image, + out_image, + accum_image, + rect, stride, f); } } __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image, const ccl_global float *ccl_restrict accum_image, - int4 rect, - int w) + int w, + int h, + int stride) { - int x = get_global_id(0) + rect.x; - int y = get_global_id(1) + rect.y; - if(x < rect.z && y < rect.w) { - kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w); + int x = get_global_id(0); + int y = get_global_id(1); + if(x < w && y < h) { + kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride); } } -__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx, - int dy, - const ccl_global float *ccl_restrict difference_image, +__kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image, const ccl_global float *ccl_restrict buffer, const ccl_global float *ccl_restrict transform, ccl_global int *rank, ccl_global float *XtWX, ccl_global float3 *XtWY, - int4 rect, - int4 filter_rect, + int4 filter_window, int w, int h, + int stride, + int shift_stride, + int r, int f, int pass_stride) { - int x = get_global_id(0) + max(0, rect.x-filter_rect.x); - int y = get_global_id(1) + max(0, rect.y-filter_rect.y); - if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) { - kernel_filter_nlm_construct_gramian(x, y, - dx, dy, - difference_image, + int4 co, rect; + int ofs; + if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) { + kernel_filter_nlm_construct_gramian(co.x, co.y, + co.z, co.w, + difference_image + ofs, buffer, transform, rank, XtWX, XtWY, - rect, filter_rect, - w, h, f, + rect, filter_window, + stride, f, pass_stride, get_local_id(1)*get_local_size(0) + get_local_id(0)); } } -__kernel void kernel_ocl_filter_finalize(int w, - int h, - ccl_global float *buffer, +__kernel void kernel_ocl_filter_finalize(ccl_global float *buffer, ccl_global int *rank, ccl_global float *XtWX, ccl_global float3 *XtWY, @@ -247,7 +270,10 @@ __kernel void kernel_ocl_filter_finalize(int w, rank += storage_ofs; XtWX += storage_ofs; XtWY += storage_ofs; - kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample); + kernel_filter_finalize(x, y, buffer, rank, + filter_area.z*filter_area.w, + XtWX, XtWY, + buffer_params, sample); } } diff --git a/intern/cycles/util/CMakeLists.txt b/intern/cycles/util/CMakeLists.txt index 7f3747a0f58..bc9def7ca41 100644 --- a/intern/cycles/util/CMakeLists.txt +++ b/intern/cycles/util/CMakeLists.txt @@ -68,6 +68,7 @@ set(SRC_HEADERS util_path.h util_progress.h util_queue.h + util_rect.h util_set.h util_simd.h util_sky_model.cpp diff --git a/intern/cycles/util/util_math.h b/intern/cycles/util/util_math.h index 39ce6a93982..d0e91a2a1c9 100644 --- a/intern/cycles/util/util_math.h +++ b/intern/cycles/util/util_math.h @@ -320,6 +320,8 @@ CCL_NAMESPACE_END #include "util/util_math_float3.h" #include "util/util_math_float4.h" +#include "util/util_rect.h" + CCL_NAMESPACE_BEGIN #ifndef __KERNEL_OPENCL__ diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h index b31dbe4fc67..382dad64ea5 100644 --- a/intern/cycles/util/util_math_matrix.h +++ b/intern/cycles/util/util_math_matrix.h @@ -98,7 +98,10 @@ ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w) ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride) { for(int i = 0; i < n; i++) { - v[i*stride] += w*x[i]; + ccl_global float *elem = (ccl_global float*) (v + i*stride); + atomic_add_and_fetch_float(elem+0, w.x*x[i]); + atomic_add_and_fetch_float(elem+1, w.y*x[i]); + atomic_add_and_fetch_float(elem+2, w.z*x[i]); } } @@ -136,7 +139,7 @@ ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A, { for(int row = 0; row < n; row++) { for(int col = 0; col <= row; col++) { - MATHS(A, row, col, stride) += v[row]*v[col]*weight; + atomic_add_and_fetch_float(&MATHS(A, row, col, stride), v[row]*v[col]*weight); } } } diff --git a/intern/cycles/util/util_rect.h b/intern/cycles/util/util_rect.h new file mode 100644 index 00000000000..17a55a14d0b --- /dev/null +++ b/intern/cycles/util/util_rect.h @@ -0,0 +1,73 @@ +/* + * Copyright 2017 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __UTIL_RECT_H__ +#define __UTIL_RECT_H__ + +#include "util/util_types.h" + +CCL_NAMESPACE_BEGIN + +/* Rectangles are represented as a int4 containing the coordinates of the lower-left and + * upper-right corners in the order (x0, y0, x1, y1). */ + +ccl_device_inline int4 rect_from_shape(int x0, int y0, int w, int h) +{ + return make_int4(x0, y0, x0 + w, y0 + h); +} + +ccl_device_inline int4 rect_expand(int4 rect, int d) +{ + return make_int4(rect.x - d, rect.y - d, rect.z + d, rect.w + d); +} + +/* Returns the intersection of two rects. */ +ccl_device_inline int4 rect_clip(int4 a, int4 b) +{ + return make_int4(max(a.x, b.x), max(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); +} + +ccl_device_inline bool rect_is_valid(int4 rect) +{ + return (rect.z > rect.x) && (rect.w > rect.y); +} + +/* Returns the local row-major index of the pixel inside the rect. */ +ccl_device_inline int coord_to_local_index(int4 rect, int x, int y) +{ + int w = rect.z - rect.x; + return (y - rect.y) * w + (x - rect.x); +} + +/* Finds the coordinates of a pixel given by its row-major index in the rect, + * and returns whether the pixel is inside it. */ +ccl_device_inline bool local_index_to_coord(int4 rect, int idx, int *x, int *y) +{ + int w = rect.z - rect.x; + *x = (idx % w) + rect.x; + *y = (idx / w) + rect.y; + return (*y < rect.w); +} + +ccl_device_inline int rect_size(int4 rect) +{ + return (rect.z - rect.x) * (rect.w - rect.y); +} + +CCL_NAMESPACE_END + +#endif /* __UTIL_RECT_H__ */ + diff --git a/intern/guardedalloc/intern/mallocn_intern.h b/intern/guardedalloc/intern/mallocn_intern.h index a292a2eb5a0..9a5848c50ad 100644 --- a/intern/guardedalloc/intern/mallocn_intern.h +++ b/intern/guardedalloc/intern/mallocn_intern.h @@ -63,6 +63,9 @@ #elif defined(WIN32) # include <malloc.h> # define malloc_usable_size _msize +#elif defined(__HAIKU__) +# include <malloc.h> +size_t malloc_usable_size(void *ptr); #else # pragma message "We don't know how to use malloc_usable_size on your platform" # undef USE_MALLOC_USABLE_SIZE diff --git a/intern/libmv/libmv/numeric/numeric.h b/intern/libmv/libmv/numeric/numeric.h index 1a23d653676..6312b4eb1a6 100644 --- a/intern/libmv/libmv/numeric/numeric.h +++ b/intern/libmv/libmv/numeric/numeric.h @@ -35,7 +35,8 @@ #if !defined(__MINGW64__) # if defined(_WIN32) || defined(__APPLE__) || \ - defined(__FreeBSD__) || defined(__NetBSD__) + defined(__FreeBSD__) || defined(__NetBSD__) || \ + defined(__HAIKU__) inline void sincos(double x, double *sinx, double *cosx) { *sinx = sin(x); *cosx = cos(x); diff --git a/release/scripts/startup/bl_ui/properties_data_mesh.py b/release/scripts/startup/bl_ui/properties_data_mesh.py index d98e3f00e7d..9f927fe3368 100644 --- a/release/scripts/startup/bl_ui/properties_data_mesh.py +++ b/release/scripts/startup/bl_ui/properties_data_mesh.py @@ -217,7 +217,8 @@ class DATA_PT_vertex_groups(MeshButtonsPanel, Panel): col = row.column(align=True) col.operator("object.vertex_group_add", icon='ZOOMIN', text="") - col.operator("object.vertex_group_remove", icon='ZOOMOUT', text="").all = False + props = col.operator("object.vertex_group_remove", icon='ZOOMOUT', text="") + props.all_unlocked = props.all = False col.menu("MESH_MT_vertex_group_specials", icon='DOWNARROW_HLT', text="") if group: col.separator() diff --git a/source/blender/blenkernel/intern/texture.c b/source/blender/blenkernel/intern/texture.c index 50bb3a5f10d..122b605f160 100644 --- a/source/blender/blenkernel/intern/texture.c +++ b/source/blender/blenkernel/intern/texture.c @@ -1165,6 +1165,10 @@ void set_current_material_texture(Material *ma, Tex *newtex) ma->mtex[act] = BKE_texture_mtex_add(); /* Reset this slot's ON/OFF toggle, for materials, when slot was empty. */ ma->septex &= ~(1 << act); + /* For volumes the default UV texture coordinates are not available. */ + if (ma->material_type == MA_TYPE_VOLUME) { + ma->mtex[act]->texco = TEXCO_ORCO; + } } ma->mtex[act]->tex = newtex; diff --git a/source/blender/blenlib/BLI_sys_types.h b/source/blender/blenlib/BLI_sys_types.h index 9477f61713c..80ee50621ca 100644 --- a/source/blender/blenlib/BLI_sys_types.h +++ b/source/blender/blenlib/BLI_sys_types.h @@ -47,7 +47,9 @@ extern "C" { #endif -#if defined(__linux__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__FreeBSD_kernel__) || defined(__GNU__) +#if defined(__linux__) || defined(__GNU__) || \ + defined(__NetBSD__) || defined(__OpenBSD__) || defined(__FreeBSD_kernel__) || \ + defined(__HAIKU__) /* Linux-i386, Linux-Alpha, Linux-ppc */ #include <stdint.h> diff --git a/source/blender/blenlib/intern/fileops.c b/source/blender/blenlib/intern/fileops.c index 36a0d1c1641..ad53457f863 100644 --- a/source/blender/blenlib/intern/fileops.c +++ b/source/blender/blenlib/intern/fileops.c @@ -621,7 +621,21 @@ static int recursive_operation(const char *startfrom, const char *startto, if (to) join_dirfile_alloc(&to_path, &to_alloc_len, to, dirent->d_name); - if (dirent->d_type == DT_DIR) { + bool is_dir; + +#ifdef __HAIKU__ + { + struct stat st_dir; + char filename[FILE_MAX]; + BLI_path_join(filename, sizeof(filename), startfrom, dirent->d_name, NULL); + lstat(filename, &st_dir); + is_dir = S_ISDIR(st_dir.st_mode); + } +#else + is_dir = (dirent->d_type == DT_DIR); +#endif + + if (is_dir) { /* recursively dig into a subfolder */ ret = recursive_operation(from_path, to_path, callback_dir_pre, callback_file, callback_dir_post); } diff --git a/source/blender/blenlib/intern/storage.c b/source/blender/blenlib/intern/storage.c index e31659c35d9..c08329ef34f 100644 --- a/source/blender/blenlib/intern/storage.c +++ b/source/blender/blenlib/intern/storage.c @@ -37,7 +37,7 @@ #include <sys/stat.h> -#if defined(__NetBSD__) || defined(__DragonFly__) +#if defined(__NetBSD__) || defined(__DragonFly__) || defined(__HAIKU__) /* Other modern unix os's should probably use this also */ # include <sys/statvfs.h> # define USE_STATFS_STATVFS diff --git a/source/blender/editors/armature/armature_select.c b/source/blender/editors/armature/armature_select.c index e32324d25f9..b87942fed84 100644 --- a/source/blender/editors/armature/armature_select.c +++ b/source/blender/editors/armature/armature_select.c @@ -807,6 +807,7 @@ enum { SIMEDBONE_PREFIX, SIMEDBONE_SUFFIX, SIMEDBONE_LAYER, + SIMEDBONE_GROUP, SIMEDBONE_SHAPE, }; @@ -819,6 +820,7 @@ static const EnumPropertyItem prop_similar_types[] = { {SIMEDBONE_PREFIX, "PREFIX", 0, "Prefix", ""}, {SIMEDBONE_SUFFIX, "SUFFIX", 0, "Suffix", ""}, {SIMEDBONE_LAYER, "LAYER", 0, "Layer", ""}, + {SIMEDBONE_GROUP, "GROUP", 0, "Group", ""}, {SIMEDBONE_SHAPE, "SHAPE", 0, "Shape", ""}, {0, NULL, 0, NULL, NULL} }; @@ -1009,6 +1011,9 @@ static int armature_select_similar_exec(bContext *C, wmOperator *op) return OPERATOR_CANCELLED; } +#define STRUCT_SIZE_AND_OFFSET(_struct, _member) \ + sizeof(((_struct *)NULL)->_member), offsetof(_struct, _member) + switch (type) { case SIMEDBONE_CHILDREN: select_similar_children(arm, ebone_act); @@ -1034,13 +1039,20 @@ static int armature_select_similar_exec(bContext *C, wmOperator *op) case SIMEDBONE_LAYER: select_similar_layer(arm, ebone_act); break; + case SIMEDBONE_GROUP: + select_similar_data_pchan( + arm, obedit, ebone_act, + STRUCT_SIZE_AND_OFFSET(bPoseChannel, agrp_index)); + break; case SIMEDBONE_SHAPE: select_similar_data_pchan( arm, obedit, ebone_act, - sizeof(void *), offsetof(bPoseChannel, custom)); + STRUCT_SIZE_AND_OFFSET(bPoseChannel, custom)); break; } +#undef STRUCT_SIZE_AND_OFFSET + WM_event_add_notifier(C, NC_OBJECT | ND_BONE_SELECT, obedit); return OPERATOR_FINISHED; |