diff options
12 files changed, 362 insertions, 22 deletions
diff --git a/intern/guardedalloc/MEM_guardedalloc.h b/intern/guardedalloc/MEM_guardedalloc.h index 4fb68965338..8c5ad77b8b6 100644 --- a/intern/guardedalloc/MEM_guardedalloc.h +++ b/intern/guardedalloc/MEM_guardedalloc.h @@ -120,6 +120,12 @@ extern "C" { extern void *(*MEM_mallocN)(size_t len, const char *str) /* ATTR_MALLOC */ ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2); /** + * Allocate an aligned block of memory of size len, with tag name str. The + * name must be a static, because only a pointer to it is stored ! + * */ + extern void *(*MEM_mallocN_aligned)(size_t len, size_t alignment, const char *str) /* ATTR_MALLOC */ ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(3); + + /** * Same as callocN, clears memory and uses mmap (disk cached) if supported. * Can be free'd with MEM_freeN as usual. * */ diff --git a/intern/guardedalloc/intern/mallocn.c b/intern/guardedalloc/intern/mallocn.c index e85fba7a6d0..b0d252cca14 100644 --- a/intern/guardedalloc/intern/mallocn.c +++ b/intern/guardedalloc/intern/mallocn.c @@ -41,6 +41,7 @@ void *(*MEM_reallocN_id)(void *vmemh, size_t len, const char *str) = MEM_lockfre void *(*MEM_recallocN_id)(void *vmemh, size_t len, const char *str) = MEM_lockfree_recallocN_id; void *(*MEM_callocN)(size_t len, const char *str) = MEM_lockfree_callocN; void *(*MEM_mallocN)(size_t len, const char *str) = MEM_lockfree_mallocN; +void *(*MEM_mallocN_aligned)(size_t len, size_t alignment, const char *str) = MEM_lockfree_mallocN_aligned; void *(*MEM_mapallocN)(size_t len, const char *str) = MEM_lockfree_mapallocN; void (*MEM_printmemlist_pydict)(void) = MEM_lockfree_printmemlist_pydict; void (*MEM_printmemlist)(void) = MEM_lockfree_printmemlist; @@ -60,6 +61,40 @@ uintptr_t (*MEM_get_peak_memory)(void) = MEM_lockfree_get_peak_memory; const char *(*MEM_name_ptr)(void *vmemh) = MEM_lockfree_name_ptr; #endif +void *aligned_malloc(size_t size, size_t alignment) +{ +#ifdef _WIN32 + return _aligned_malloc(size, alignment); +#elif defined(__APPLE__) + /* On Mac OS X, both the heap and the stack are guaranteed 16-byte aligned so + * they work natively with SSE types with no further work. + */ + assert(alignment == 16); + return malloc(size); +#elif defined(__FreeBSD__) || defined(__NetBSD__) + void *result; + + if (posix_memalign(&result, alignment, size)) { + /* non-zero means allocation error + * either no allocation or bad alignment value + */ + return NULL; + } + return result; +#else /* This is for Linux. */ + return memalign(alignment, size); +#endif +} + +void aligned_free(void *ptr) +{ +#ifdef _WIN32 + _aligned_free(ptr); +#else + free(ptr); +#endif +} + void MEM_use_guarded_allocator(void) { MEM_allocN_len = MEM_guarded_allocN_len; @@ -69,6 +104,7 @@ void MEM_use_guarded_allocator(void) MEM_recallocN_id = MEM_guarded_recallocN_id; MEM_callocN = MEM_guarded_callocN; MEM_mallocN = MEM_guarded_mallocN; + MEM_mallocN_aligned = MEM_guarded_mallocN_aligned; MEM_mapallocN = MEM_guarded_mapallocN; MEM_printmemlist_pydict = MEM_guarded_printmemlist_pydict; MEM_printmemlist = MEM_guarded_printmemlist; diff --git a/intern/guardedalloc/intern/mallocn_guarded_impl.c b/intern/guardedalloc/intern/mallocn_guarded_impl.c index 172c79d50cd..206390e0710 100644 --- a/intern/guardedalloc/intern/mallocn_guarded_impl.c +++ b/intern/guardedalloc/intern/mallocn_guarded_impl.c @@ -113,7 +113,10 @@ typedef struct MemHead { const char *name; const char *nextname; int tag2; - int mmap; /* if true, memory was mmapped */ + short mmap; /* if true, memory was mmapped */ + short alignment; /* if non-zero aligned alloc was used + * and alignment is stored here. + */ #ifdef DEBUG_MEMCOUNTER int _count; #endif @@ -128,6 +131,8 @@ typedef struct MemHead { #endif } MemHead; +typedef MemHead MemHeadAligned; + /* for openmp threading asserts, saves time troubleshooting * we may need to extend this if blender code starts using MEM_ * functions inside OpenMP correctly with omp_set_lock() */ @@ -187,7 +192,7 @@ static const char *check_memlist(MemHead *memh); #define MEMNEXT(x) \ ((MemHead *)(((char *) x) - ((char *) &(((MemHead *)0)->next)))) - + /* --------------------------------------------------------------------- */ /* vars */ /* --------------------------------------------------------------------- */ @@ -325,10 +330,12 @@ void *MEM_guarded_dupallocN(const void *vmemh) memh--; #ifndef DEBUG_MEMDUPLINAME - if (memh->mmap) + if (UNLIKELY(memh->mmap)) + newp = MEM_guarded_mapallocN(memh->len, "dupli_mapalloc"); + else if (LIKELY(memh->alignment == 0)) newp = MEM_guarded_mapallocN(memh->len, "dupli_mapalloc"); else - newp = MEM_guarded_mallocN(memh->len, "dupli_alloc"); + newp = MEM_guarded_mallocN_aligned(memh->len, (size_t) memh->alignment, "dupli_alloc"); if (newp == NULL) return NULL; #else @@ -336,14 +343,18 @@ void *MEM_guarded_dupallocN(const void *vmemh) MemHead *nmemh; char *name = malloc(strlen(memh->name) + 24); - if (memh->mmap) { + if (UNLIKELY(memh->mmap)) { sprintf(name, "%s %s", "dupli_mapalloc", memh->name); newp = MEM_guarded_mapallocN(memh->len, name); } - else { + else if (LIKELY(memh->alignment == 0)) { sprintf(name, "%s %s", "dupli_alloc", memh->name); newp = MEM_guarded_mallocN(memh->len, name); } + else { + sprintf(name, "%s %s", "dupli_alloc", memh->name); + newp = MEM_guarded_mallocN_aligned(memh->len, (size_t) memh->alignment, name); + } if (newp == NULL) return NULL; @@ -368,7 +379,13 @@ void *MEM_guarded_reallocN_id(void *vmemh, size_t len, const char *str) MemHead *memh = vmemh; memh--; - newp = MEM_guarded_mallocN(len, memh->name); + if (LIKELY(memh->alignment == 0)) { + newp = MEM_guarded_mallocN(len, memh->name); + } + else { + newp = MEM_guarded_mallocN_aligned(len, (size_t) memh->alignment, memh->name); + } + if (newp) { if (len < memh->len) { /* shrink */ @@ -397,7 +414,13 @@ void *MEM_guarded_recallocN_id(void *vmemh, size_t len, const char *str) MemHead *memh = vmemh; memh--; - newp = MEM_guarded_mallocN(len, memh->name); + if (LIKELY(memh->alignment == 0)) { + newp = MEM_guarded_mallocN(len, memh->name); + } + else { + newp = MEM_guarded_mallocN_aligned(len, (size_t) memh->alignment, memh->name); + } + if (newp) { if (len < memh->len) { /* shrink */ @@ -464,6 +487,7 @@ static void make_memhead_header(MemHead *memh, size_t len, const char *str) memh->nextname = NULL; memh->len = len; memh->mmap = 0; + memh->alignment = 0; memh->tag2 = MEMTAG2; #ifdef DEBUG_MEMDUPLINAME @@ -514,6 +538,54 @@ void *MEM_guarded_mallocN(size_t len, const char *str) return NULL; } +void *MEM_guarded_mallocN_aligned(size_t len, size_t alignment, const char *str) +{ + MemHead *memh; + + /* It's possible that MemHead's size is not properly aligned, + * do extra padding to deal with this. + * + * We only support small alignments which fits into short in + * order to save some bits in MemHead structure. + */ + short extra_padding = (short)MEMHEAD_ALIGN_PADDING(alignment); + + /* Huge alignment values doesn't make sense and they + * wouldn't fit into 'short' used in the MemHead. + */ + assert(alignment < 1024); + + /* We only support alignment to a power of two. */ + assert(IS_POW2(alignment)); + + len = SIZET_ALIGN_4(len); + + memh = (MemHead *)aligned_malloc(len + (size_t)extra_padding + sizeof(MemHead) + sizeof(MemTail), alignment); + + if (LIKELY(memh)) { + /* We keep padding in the beginning of MemHead, + * this way it's always possible to get MemHead + * from the data pointer. + */ + memh = (MemHead *)((char *)memh + extra_padding); + + make_memhead_header(memh, len, str); + memh->alignment = (short) alignment; + if (UNLIKELY(malloc_debug_memset && len)) + memset(memh + 1, 255, len); + +#ifdef DEBUG_MEMCOUNTER + if (_mallocn_count == DEBUG_MEMCOUNTER_ERROR_VAL) + memcount_raise(__func__); + memh->_count = _mallocn_count++; +#endif + return (++memh); + } + print_error("aligned_malloc returns null: len=" SIZET_FORMAT " in %s, total %u\n", + SIZET_ARG(len), str, (unsigned int) mem_in_use); + return NULL; +} + void *MEM_guarded_callocN(size_t len, const char *str) { MemHead *memh; @@ -953,7 +1025,12 @@ static void rem_memblock(MemHead *memh) else { if (UNLIKELY(malloc_debug_memset && memh->len)) memset(memh + 1, 255, memh->len); - free(memh); + if (LIKELY(memh->alignment == 0)) { + free(memh); + } + else { + aligned_free(MEMHEAD_REAL_PTR(memh)); + } } } diff --git a/intern/guardedalloc/intern/mallocn_intern.h b/intern/guardedalloc/intern/mallocn_intern.h index 7c8922dd407..a69bcf3d27b 100644 --- a/intern/guardedalloc/intern/mallocn_intern.h +++ b/intern/guardedalloc/intern/mallocn_intern.h @@ -85,6 +85,35 @@ # define UNLIKELY(x) (x) #endif +#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__NetBSD__) +// Needed for memalign on Linux and _aligned_alloc on Windows. +# ifdef FREE_WINDOWS +/* make sure _aligned_malloc is included */ +# ifdef __MSVCRT_VERSION__ +# undef __MSVCRT_VERSION__ +# endif + +# define __MSVCRT_VERSION__ 0x0700 +# endif // FREE_WINDOWS + +# include <malloc.h> +#else +// Apple's malloc is 16-byte aligned, and does not have malloc.h, so include +// stdilb instead. +# include <cstdlib> +#endif + +#define IS_POW2(a) (((a) & ((a) - 1)) == 0) + +/* Extra padding which needs to be applied on MemHead to make it aligned. */ +#define MEMHEAD_ALIGN_PADDING(alignment) ((size_t)alignment - (sizeof(MemHeadAligned) % (size_t)alignment)) + +/* Real pointer returned by the malloc or aligned_alloc. */ +#define MEMHEAD_REAL_PTR(memh) ((char *)memh - MEMHEAD_ALIGN_PADDING(memh->alignment)) + +void *aligned_malloc(size_t size, size_t alignment); +void aligned_free(void *ptr); + /* Prototypes for counted allocator functions */ size_t MEM_lockfree_allocN_len(const void *vmemh) ATTR_WARN_UNUSED_RESULT; void MEM_lockfree_freeN(void *vmemh); @@ -93,6 +122,7 @@ void *MEM_lockfree_reallocN_id(void *vmemh, size_t len, const char *UNUSED(str)) void *MEM_lockfree_recallocN_id(void *vmemh, size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(2); void *MEM_lockfree_callocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2); void *MEM_lockfree_mallocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2); +void *MEM_lockfree_mallocN_aligned(size_t len, size_t alignment, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(3); void *MEM_lockfree_mapallocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2); void MEM_lockfree_printmemlist_pydict(void); void MEM_lockfree_printmemlist(void); @@ -119,6 +149,7 @@ void *MEM_guarded_reallocN_id(void *vmemh, size_t len, const char *UNUSED(str)) void *MEM_guarded_recallocN_id(void *vmemh, size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(2); void *MEM_guarded_callocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2); void *MEM_guarded_mallocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2); +void *MEM_guarded_mallocN_aligned(size_t len, size_t alignment, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(3); void *MEM_guarded_mapallocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2); void MEM_guarded_printmemlist_pydict(void); void MEM_guarded_printmemlist(void); diff --git a/intern/guardedalloc/intern/mallocn_lockfree_impl.c b/intern/guardedalloc/intern/mallocn_lockfree_impl.c index 6fc01807af3..c76caff0d74 100644 --- a/intern/guardedalloc/intern/mallocn_lockfree_impl.c +++ b/intern/guardedalloc/intern/mallocn_lockfree_impl.c @@ -46,6 +46,11 @@ typedef struct MemHead { size_t len; } MemHead; +typedef struct MemHeadAligned { + short alignment; + size_t len; +} MemHeadAligned; + static unsigned int totblock = 0; static size_t mem_in_use = 0, mmap_in_use = 0, peak_mem = 0; static bool malloc_debug_memset = false; @@ -54,9 +59,17 @@ static void (*error_callback)(const char *) = NULL; static void (*thread_lock_callback)(void) = NULL; static void (*thread_unlock_callback)(void) = NULL; +enum { + MEMHEAD_MMAP_FLAG = 1, + MEMHEAD_ALIGN_FLAG = 2, +}; + #define MEMHEAD_FROM_PTR(ptr) (((MemHead*) vmemh) - 1) #define PTR_FROM_MEMHEAD(memhead) (memhead + 1) -#define MEMHEAD_IS_MMAP(memhead) ((memhead)->len & (size_t) 1) +#define MEMHEAD_ALIGNED_FROM_PTR(ptr) (((MemHeadAligned*) vmemh) - 1) +#define PTR_FROM_MEMHEAD_ALIGNED(memhead) (memhead + 1) +#define MEMHEAD_IS_MMAP(memhead) ((memhead)->len & (size_t) MEMHEAD_MMAP_FLAG) +#define MEMHEAD_IS_ALIGNED(memhead) ((memhead)->len & (size_t) MEMHEAD_ALIGN_FLAG) #ifdef __GNUC__ __attribute__ ((format(printf, 1, 2))) @@ -93,7 +106,7 @@ static void mem_unlock_thread(void) size_t MEM_lockfree_allocN_len(const void *vmemh) { if (vmemh) { - return MEMHEAD_FROM_PTR(vmemh)->len & ~((size_t) 1); + return MEMHEAD_FROM_PTR(vmemh)->len & ~((size_t) (MEMHEAD_MMAP_FLAG | MEMHEAD_ALIGN_FLAG)); } else { return 0; @@ -124,7 +137,13 @@ void MEM_lockfree_freeN(void *vmemh) if (UNLIKELY(malloc_debug_memset && len)) { memset(memh + 1, 255, len); } - free(memh); + if (UNLIKELY(MEMHEAD_IS_ALIGNED(memh))) { + MemHeadAligned *memh_aligned = MEMHEAD_ALIGNED_FROM_PTR(vmemh); + aligned_free(MEMHEAD_REAL_PTR(memh_aligned)); + } + else { + free(memh); + } } } @@ -134,9 +153,16 @@ void *MEM_lockfree_dupallocN(const void *vmemh) if (vmemh) { MemHead *memh = MEMHEAD_FROM_PTR(vmemh); const size_t prev_size = MEM_allocN_len(vmemh); - if (MEMHEAD_IS_MMAP(memh)) { + if (UNLIKELY(MEMHEAD_IS_MMAP(memh))) { newp = MEM_lockfree_mapallocN(prev_size, "dupli_mapalloc"); } + else if (UNLIKELY(MEMHEAD_IS_ALIGNED(memh))) { + MemHeadAligned *memh_aligned = MEMHEAD_ALIGNED_FROM_PTR(vmemh); + newp = MEM_lockfree_mallocN_aligned( + prev_size, + (size_t)memh_aligned->alignment, + "dupli_malloc"); + } else { newp = MEM_lockfree_mallocN(prev_size, "dupli_malloc"); } @@ -150,9 +176,20 @@ void *MEM_lockfree_reallocN_id(void *vmemh, size_t len, const char *str) void *newp = NULL; if (vmemh) { + MemHead *memh = MEMHEAD_FROM_PTR(vmemh); size_t old_len = MEM_allocN_len(vmemh); - newp = MEM_lockfree_mallocN(len, "realloc"); + if (LIKELY(!MEMHEAD_IS_ALIGNED(memh))) { + newp = MEM_lockfree_mallocN(len, "realloc"); + } + else { + MemHeadAligned *memh_aligned = MEMHEAD_ALIGNED_FROM_PTR(vmemh); + newp = MEM_lockfree_mallocN_aligned( + old_len, + (size_t)memh_aligned->alignment, + "realloc"); + } + if (newp) { if (len < old_len) { /* shrink */ @@ -178,9 +215,19 @@ void *MEM_lockfree_recallocN_id(void *vmemh, size_t len, const char *str) void *newp = NULL; if (vmemh) { + MemHead *memh = MEMHEAD_FROM_PTR(vmemh); size_t old_len = MEM_allocN_len(vmemh); - newp = MEM_lockfree_mallocN(len, "recalloc"); + if (LIKELY(!MEMHEAD_IS_ALIGNED(memh))) { + newp = MEM_lockfree_mallocN(len, "recalloc"); + } + else { + MemHeadAligned *memh_aligned = MEMHEAD_ALIGNED_FROM_PTR(vmemh); + newp = MEM_lockfree_mallocN_aligned(old_len, + (size_t)memh_aligned->alignment, + "recalloc"); + } + if (newp) { if (len < old_len) { /* shrink */ @@ -256,6 +303,57 @@ void *MEM_lockfree_mallocN(size_t len, const char *str) return NULL; } +void *MEM_lockfree_mallocN_aligned(size_t len, size_t alignment, const char *str) +{ + MemHeadAligned *memh; + + /* It's possible that MemHead's size is not properly aligned, + * do extra padding to deal with this. + * + * We only support small alignments which fits into short in + * order to save some bits in MemHead structure. + */ + size_t extra_padding = MEMHEAD_ALIGN_PADDING(alignment); + + /* Huge alignment values doesn't make sense and they + * wouldn't fit into 'short' used in the MemHead. + */ + assert(alignment < 1024); + + /* We only support alignment to a power of two. */ + assert(IS_POW2(alignment)); + + len = SIZET_ALIGN_4(len); + + memh = (MemHeadAligned *)aligned_malloc( + len + extra_padding + sizeof(MemHeadAligned), alignment); + + if (LIKELY(memh)) { + /* We keep padding in the beginning of MemHead, + * this way it's always possible to get MemHead + * from the data pointer. + */ + memh = (MemHeadAligned *)((char *)memh + extra_padding); + + if (UNLIKELY(malloc_debug_memset && len)) { + memset(memh + 1, 255, len); + } + + memh->len = len | (size_t) MEMHEAD_ALIGN_FLAG; + memh->alignment = (short) alignment; + atomic_add_u(&totblock, 1); + atomic_add_z(&mem_in_use, len); + + /* TODO(sergey): Not strictly speaking thread-safe. */ + peak_mem = mem_in_use > peak_mem ? mem_in_use : peak_mem; + + return PTR_FROM_MEMHEAD(memh); + } + print_error("Malloc returns null: len=" SIZET_FORMAT " in %s, total %u\n", + SIZET_ARG(len), str, (unsigned int) mem_in_use); + return NULL; +} + void *MEM_lockfree_mapallocN(size_t len, const char *str) { MemHead *memh; @@ -279,7 +377,7 @@ void *MEM_lockfree_mapallocN(size_t len, const char *str) #endif if (memh != (MemHead *)-1) { - memh->len = len | (size_t) 1; + memh->len = len | (size_t) MEMHEAD_MMAP_FLAG; atomic_add_u(&totblock, 1); atomic_add_z(&mem_in_use, len); atomic_add_z(&mmap_in_use, len); diff --git a/source/blender/compositor/intern/COM_MemoryBuffer.cpp b/source/blender/compositor/intern/COM_MemoryBuffer.cpp index 04828bfe3f8..c1916f4a68f 100644 --- a/source/blender/compositor/intern/COM_MemoryBuffer.cpp +++ b/source/blender/compositor/intern/COM_MemoryBuffer.cpp @@ -46,7 +46,7 @@ MemoryBuffer::MemoryBuffer(MemoryProxy *memoryProxy, unsigned int chunkNumber, r BLI_rcti_init(&this->m_rect, rect->xmin, rect->xmax, rect->ymin, rect->ymax); this->m_memoryProxy = memoryProxy; this->m_chunkNumber = chunkNumber; - this->m_buffer = (float *)MEM_mallocN(sizeof(float) * determineBufferSize() * COM_NUMBER_OF_CHANNELS, "COM_MemoryBuffer"); + this->m_buffer = (float *)MEM_mallocN_aligned(sizeof(float) * determineBufferSize() * COM_NUMBER_OF_CHANNELS, 16, "COM_MemoryBuffer"); this->m_state = COM_MB_ALLOCATED; this->m_datatype = COM_DT_COLOR; this->m_chunkWidth = this->m_rect.xmax - this->m_rect.xmin; @@ -57,7 +57,7 @@ MemoryBuffer::MemoryBuffer(MemoryProxy *memoryProxy, rcti *rect) BLI_rcti_init(&this->m_rect, rect->xmin, rect->xmax, rect->ymin, rect->ymax); this->m_memoryProxy = memoryProxy; this->m_chunkNumber = -1; - this->m_buffer = (float *)MEM_mallocN(sizeof(float) * determineBufferSize() * COM_NUMBER_OF_CHANNELS, "COM_MemoryBuffer"); + this->m_buffer = (float *)MEM_mallocN_aligned(sizeof(float) * determineBufferSize() * COM_NUMBER_OF_CHANNELS, 16, "COM_MemoryBuffer"); this->m_state = COM_MB_TEMPORARILY; this->m_datatype = COM_DT_COLOR; this->m_chunkWidth = this->m_rect.xmax - this->m_rect.xmin; diff --git a/source/blender/compositor/operations/COM_BlurBaseOperation.cpp b/source/blender/compositor/operations/COM_BlurBaseOperation.cpp index e7af9319f88..d5aafc7c2ae 100644 --- a/source/blender/compositor/operations/COM_BlurBaseOperation.cpp +++ b/source/blender/compositor/operations/COM_BlurBaseOperation.cpp @@ -91,6 +91,18 @@ float *BlurBaseOperation::make_gausstab(float rad, int size) return gausstab; } +#ifdef __SSE2__ +__m128 *BlurBaseOperation::convert_gausstab_sse(const float *gausstab, float rad, int size) +{ + int n = 2 * size + 1; + __m128 *gausstab_sse = (__m128 *) MEM_mallocN_aligned(sizeof(__m128) * n, 16, "gausstab sse"); + for (int i = 0; i < n; ++i) { + gausstab_sse[i] = _mm_set1_ps(gausstab[i]); + } + return gausstab_sse; +} +#endif + /* normalized distance from the current (inverted so 1.0 is close and 0.0 is far) * 'ease' is applied after, looks nicer */ float *BlurBaseOperation::make_dist_fac_inverse(float rad, int size, int falloff) diff --git a/source/blender/compositor/operations/COM_BlurBaseOperation.h b/source/blender/compositor/operations/COM_BlurBaseOperation.h index 052a525ef2c..e97dd4d766d 100644 --- a/source/blender/compositor/operations/COM_BlurBaseOperation.h +++ b/source/blender/compositor/operations/COM_BlurBaseOperation.h @@ -27,6 +27,10 @@ #define MAX_GAUSSTAB_RADIUS 30000 +#ifdef __SSE2__ +# include <emmintrin.h> +#endif + class BlurBaseOperation : public NodeOperation, public QualityStepHelper { private: @@ -34,6 +38,9 @@ protected: BlurBaseOperation(DataType data_type); float *make_gausstab(float rad, int size); +#ifdef __SSE2__ + __m128 *convert_gausstab_sse(const float *gaustab, float rad, int size); +#endif float *make_dist_fac_inverse(float rad, int size, int falloff); void updateSize(); diff --git a/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp b/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp index d08924ca4ef..0aefba3bb7c 100644 --- a/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp +++ b/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp @@ -31,6 +31,9 @@ extern "C" { GaussianXBlurOperation::GaussianXBlurOperation() : BlurBaseOperation(COM_DT_COLOR) { this->m_gausstab = NULL; +#ifdef __SSE2__ + this->m_gausstab_sse = NULL; +#endif this->m_filtersize = 0; } @@ -54,8 +57,14 @@ void GaussianXBlurOperation::initExecution() if (this->m_sizeavailable) { float rad = max_ff(m_size * m_data.sizex, 0.0f); m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS); - + + /* TODO(sergey): De-duplicate with the case below and Y blur. */ this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize); +#ifdef __SSE2__ + this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab, + rad, + m_filtersize); +#endif } } @@ -65,8 +74,13 @@ void GaussianXBlurOperation::updateGauss() updateSize(); float rad = max_ff(m_size * m_data.sizex, 0.0f); m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS); - + this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize); +#ifdef __SSE2__ + this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab, + rad, + m_filtersize); +#endif } } @@ -88,12 +102,25 @@ void GaussianXBlurOperation::executePixel(float output[4], int x, int y, void *d int step = getStep(); int offsetadd = getOffsetAdd(); int bufferindex = ((xmin - bufferstartx) * 4) + ((ymin - bufferstarty) * 4 * bufferwidth); + +#ifdef __SSE2__ + __m128 accum_r = _mm_load_ps(color_accum); + for (int nx = xmin, index = (xmin - x) + this->m_filtersize; nx < xmax; nx += step, index += step) { + __m128 reg_a = _mm_load_ps(&buffer[bufferindex]); + reg_a = _mm_mul_ps(reg_a, this->m_gausstab_sse[index]); + accum_r = _mm_add_ps(accum_r, reg_a); + multiplier_accum += this->m_gausstab[index]; + bufferindex += offsetadd; + } + _mm_store_ps(color_accum, accum_r); +#else for (int nx = xmin, index = (xmin - x) + this->m_filtersize; nx < xmax; nx += step, index += step) { const float multiplier = this->m_gausstab[index]; madd_v4_v4fl(color_accum, &buffer[bufferindex], multiplier); multiplier_accum += multiplier; bufferindex += offsetadd; } +#endif mul_v4_v4fl(output, color_accum, 1.0f / multiplier_accum); } @@ -105,6 +132,12 @@ void GaussianXBlurOperation::deinitExecution() MEM_freeN(this->m_gausstab); this->m_gausstab = NULL; } +#ifdef __SSE2__ + if (this->m_gausstab_sse) { + MEM_freeN(this->m_gausstab_sse); + this->m_gausstab_sse = NULL; + } +#endif deinitMutex(); } diff --git a/source/blender/compositor/operations/COM_GaussianXBlurOperation.h b/source/blender/compositor/operations/COM_GaussianXBlurOperation.h index 6442f214138..e391320a007 100644 --- a/source/blender/compositor/operations/COM_GaussianXBlurOperation.h +++ b/source/blender/compositor/operations/COM_GaussianXBlurOperation.h @@ -28,6 +28,9 @@ class GaussianXBlurOperation : public BlurBaseOperation { private: float *m_gausstab; +#ifdef __SSE2__ + __m128 *m_gausstab_sse; +#endif int m_filtersize; void updateGauss(); public: diff --git a/source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp b/source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp index 8216b79372f..a05a1ab6a23 100644 --- a/source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp +++ b/source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp @@ -31,6 +31,9 @@ extern "C" { GaussianYBlurOperation::GaussianYBlurOperation() : BlurBaseOperation(COM_DT_COLOR) { this->m_gausstab = NULL; +#ifdef __SSE2__ + this->m_gausstab_sse = NULL; +#endif this->m_filtersize = 0; } @@ -54,8 +57,13 @@ void GaussianYBlurOperation::initExecution() if (this->m_sizeavailable) { float rad = max_ff(m_size * m_data.sizey, 0.0f); m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS); - + this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize); +#ifdef __SSE2__ + this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab, + rad, + m_filtersize); +#endif } } @@ -65,8 +73,13 @@ void GaussianYBlurOperation::updateGauss() updateSize(); float rad = max_ff(m_size * m_data.sizey, 0.0f); m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS); - + this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize); +#ifdef __SSE2__ + this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab, + rad, + m_filtersize); +#endif } } @@ -88,6 +101,20 @@ void GaussianYBlurOperation::executePixel(float output[4], int x, int y, void *d int index; int step = getStep(); const int bufferIndexx = ((xmin - bufferstartx) * 4); + +#ifdef __SSE2__ + __m128 accum_r = _mm_load_ps(color_accum); + for (int ny = ymin; ny < ymax; ny += step) { + index = (ny - y) + this->m_filtersize; + int bufferindex = bufferIndexx + ((ny - bufferstarty) * 4 * bufferwidth); + const float multiplier = this->m_gausstab[index]; + __m128 reg_a = _mm_load_ps(&buffer[bufferindex]); + reg_a = _mm_mul_ps(reg_a, this->m_gausstab_sse[index]); + accum_r = _mm_add_ps(accum_r, reg_a); + multiplier_accum += multiplier; + } + _mm_store_ps(color_accum, accum_r); +#else for (int ny = ymin; ny < ymax; ny += step) { index = (ny - y) + this->m_filtersize; int bufferindex = bufferIndexx + ((ny - bufferstarty) * 4 * bufferwidth); @@ -95,6 +122,7 @@ void GaussianYBlurOperation::executePixel(float output[4], int x, int y, void *d madd_v4_v4fl(color_accum, &buffer[bufferindex], multiplier); multiplier_accum += multiplier; } +#endif mul_v4_v4fl(output, color_accum, 1.0f / multiplier_accum); } @@ -106,6 +134,12 @@ void GaussianYBlurOperation::deinitExecution() MEM_freeN(this->m_gausstab); this->m_gausstab = NULL; } +#ifdef __SSE2__ + if (this->m_gausstab_sse) { + MEM_freeN(this->m_gausstab_sse); + this->m_gausstab_sse = NULL; + } +#endif deinitMutex(); } diff --git a/source/blender/compositor/operations/COM_GaussianYBlurOperation.h b/source/blender/compositor/operations/COM_GaussianYBlurOperation.h index 16503360de2..22b6562077d 100644 --- a/source/blender/compositor/operations/COM_GaussianYBlurOperation.h +++ b/source/blender/compositor/operations/COM_GaussianYBlurOperation.h @@ -28,6 +28,9 @@ class GaussianYBlurOperation : public BlurBaseOperation { private: float *m_gausstab; +#ifdef __SSE2__ + __m128 *m_gausstab_sse; +#endif int m_filtersize; void updateGauss(); public: |