Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--intern/guardedalloc/MEM_guardedalloc.h6
-rw-r--r--intern/guardedalloc/intern/mallocn.c36
-rw-r--r--intern/guardedalloc/intern/mallocn_guarded_impl.c95
-rw-r--r--intern/guardedalloc/intern/mallocn_intern.h31
-rw-r--r--intern/guardedalloc/intern/mallocn_lockfree_impl.c112
-rw-r--r--source/blender/compositor/intern/COM_MemoryBuffer.cpp4
-rw-r--r--source/blender/compositor/operations/COM_BlurBaseOperation.cpp12
-rw-r--r--source/blender/compositor/operations/COM_BlurBaseOperation.h7
-rw-r--r--source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp37
-rw-r--r--source/blender/compositor/operations/COM_GaussianXBlurOperation.h3
-rw-r--r--source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp38
-rw-r--r--source/blender/compositor/operations/COM_GaussianYBlurOperation.h3
12 files changed, 362 insertions, 22 deletions
diff --git a/intern/guardedalloc/MEM_guardedalloc.h b/intern/guardedalloc/MEM_guardedalloc.h
index 4fb68965338..8c5ad77b8b6 100644
--- a/intern/guardedalloc/MEM_guardedalloc.h
+++ b/intern/guardedalloc/MEM_guardedalloc.h
@@ -120,6 +120,12 @@ extern "C" {
extern void *(*MEM_mallocN)(size_t len, const char *str) /* ATTR_MALLOC */ ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
/**
+ * Allocate a block of memory of size len, aligned to alignment bytes, with tag name str.
+ * The name must be a static string, because only a pointer to it is stored!
+ * */
+ extern void *(*MEM_mallocN_aligned)(size_t len, size_t alignment, const char *str) /* ATTR_MALLOC */ ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(3);
+
+ /**
* Same as callocN, clears memory and uses mmap (disk cached) if supported.
* Can be free'd with MEM_freeN as usual.
* */
diff --git a/intern/guardedalloc/intern/mallocn.c b/intern/guardedalloc/intern/mallocn.c
index e85fba7a6d0..b0d252cca14 100644
--- a/intern/guardedalloc/intern/mallocn.c
+++ b/intern/guardedalloc/intern/mallocn.c
@@ -41,6 +41,7 @@ void *(*MEM_reallocN_id)(void *vmemh, size_t len, const char *str) = MEM_lockfre
void *(*MEM_recallocN_id)(void *vmemh, size_t len, const char *str) = MEM_lockfree_recallocN_id;
void *(*MEM_callocN)(size_t len, const char *str) = MEM_lockfree_callocN;
void *(*MEM_mallocN)(size_t len, const char *str) = MEM_lockfree_mallocN;
+void *(*MEM_mallocN_aligned)(size_t len, size_t alignment, const char *str) = MEM_lockfree_mallocN_aligned;
void *(*MEM_mapallocN)(size_t len, const char *str) = MEM_lockfree_mapallocN;
void (*MEM_printmemlist_pydict)(void) = MEM_lockfree_printmemlist_pydict;
void (*MEM_printmemlist)(void) = MEM_lockfree_printmemlist;
@@ -60,6 +61,40 @@ uintptr_t (*MEM_get_peak_memory)(void) = MEM_lockfree_get_peak_memory;
const char *(*MEM_name_ptr)(void *vmemh) = MEM_lockfree_name_ptr;
#endif
+void *aligned_malloc(size_t size, size_t alignment) /* Allocate `size` bytes aligned to `alignment` via the platform allocator; release the result with aligned_free(). */
+{
+#ifdef _WIN32
+ return _aligned_malloc(size, alignment); /* aligned_free() pairs this with _aligned_free(). */
+#elif defined(__APPLE__)
+ /* On Mac OS X, both the heap and the stack are guaranteed 16-byte aligned so
+ * they work natively with SSE types with no further work. Plain malloc() is used, so only alignment == 16 is accepted (asserted below).
+ */
+ assert(alignment == 16);
+ return malloc(size);
+#elif defined(__FreeBSD__) || defined(__NetBSD__)
+ void *result;
+
+ if (posix_memalign(&result, alignment, size)) {
+ /* A non-zero return means an allocation error:
+ * either nothing was allocated or the alignment value was bad.
+ */
+ return NULL;
+ }
+ return result;
+#else /* This is for Linux: memalign() comes from <malloc.h>. */
+ return memalign(alignment, size);
+#endif
+}
+
+void aligned_free(void *ptr) /* Release a pointer that was returned by aligned_malloc(). */
+{
+#ifdef _WIN32
+ _aligned_free(ptr); /* Counterpart of the _aligned_malloc() call made by aligned_malloc() on Windows. */
+#else
+ free(ptr); /* The malloc(), posix_memalign() and memalign() branches of aligned_malloc() are all released with free(). */
+#endif
+}
+
void MEM_use_guarded_allocator(void)
{
MEM_allocN_len = MEM_guarded_allocN_len;
@@ -69,6 +104,7 @@ void MEM_use_guarded_allocator(void)
MEM_recallocN_id = MEM_guarded_recallocN_id;
MEM_callocN = MEM_guarded_callocN;
MEM_mallocN = MEM_guarded_mallocN;
+ MEM_mallocN_aligned = MEM_guarded_mallocN_aligned;
MEM_mapallocN = MEM_guarded_mapallocN;
MEM_printmemlist_pydict = MEM_guarded_printmemlist_pydict;
MEM_printmemlist = MEM_guarded_printmemlist;
diff --git a/intern/guardedalloc/intern/mallocn_guarded_impl.c b/intern/guardedalloc/intern/mallocn_guarded_impl.c
index 172c79d50cd..206390e0710 100644
--- a/intern/guardedalloc/intern/mallocn_guarded_impl.c
+++ b/intern/guardedalloc/intern/mallocn_guarded_impl.c
@@ -113,7 +113,10 @@ typedef struct MemHead {
const char *name;
const char *nextname;
int tag2;
- int mmap; /* if true, memory was mmapped */
+ short mmap; /* if true, memory was mmapped */
+ short alignment; /* if non-zero aligned alloc was used
+ * and alignment is stored here.
+ */
#ifdef DEBUG_MEMCOUNTER
int _count;
#endif
@@ -128,6 +131,8 @@ typedef struct MemHead {
#endif
} MemHead;
+typedef MemHead MemHeadAligned;
+
/* for openmp threading asserts, saves time troubleshooting
* we may need to extend this if blender code starts using MEM_
* functions inside OpenMP correctly with omp_set_lock() */
@@ -187,7 +192,7 @@ static const char *check_memlist(MemHead *memh);
#define MEMNEXT(x) \
((MemHead *)(((char *) x) - ((char *) &(((MemHead *)0)->next))))
-
+
/* --------------------------------------------------------------------- */
/* vars */
/* --------------------------------------------------------------------- */
@@ -325,10 +330,12 @@ void *MEM_guarded_dupallocN(const void *vmemh)
memh--;
#ifndef DEBUG_MEMDUPLINAME
- if (memh->mmap)
+ if (UNLIKELY(memh->mmap))
+ newp = MEM_guarded_mapallocN(memh->len, "dupli_mapalloc");
+ else if (LIKELY(memh->alignment == 0))
newp = MEM_guarded_mapallocN(memh->len, "dupli_mapalloc");
else
- newp = MEM_guarded_mallocN(memh->len, "dupli_alloc");
+ newp = MEM_guarded_mallocN_aligned(memh->len, (size_t) memh->alignment, "dupli_alloc");
if (newp == NULL) return NULL;
#else
@@ -336,14 +343,18 @@ void *MEM_guarded_dupallocN(const void *vmemh)
MemHead *nmemh;
char *name = malloc(strlen(memh->name) + 24);
- if (memh->mmap) {
+ if (UNLIKELY(memh->mmap)) {
sprintf(name, "%s %s", "dupli_mapalloc", memh->name);
newp = MEM_guarded_mapallocN(memh->len, name);
}
- else {
+ else if (LIKELY(memh->alignment == 0)) {
sprintf(name, "%s %s", "dupli_alloc", memh->name);
newp = MEM_guarded_mallocN(memh->len, name);
}
+ else {
+ sprintf(name, "%s %s", "dupli_alloc", memh->name);
+ newp = MEM_guarded_mallocN_aligned(memh->len, (size_t) memh->alignment, name);
+ }
if (newp == NULL) return NULL;
@@ -368,7 +379,13 @@ void *MEM_guarded_reallocN_id(void *vmemh, size_t len, const char *str)
MemHead *memh = vmemh;
memh--;
- newp = MEM_guarded_mallocN(len, memh->name);
+ if (LIKELY(memh->alignment == 0)) {
+ newp = MEM_guarded_mallocN(len, memh->name);
+ }
+ else {
+ newp = MEM_guarded_mallocN_aligned(len, (size_t) memh->alignment, memh->name);
+ }
+
if (newp) {
if (len < memh->len) {
/* shrink */
@@ -397,7 +414,13 @@ void *MEM_guarded_recallocN_id(void *vmemh, size_t len, const char *str)
MemHead *memh = vmemh;
memh--;
- newp = MEM_guarded_mallocN(len, memh->name);
+ if (LIKELY(memh->alignment == 0)) {
+ newp = MEM_guarded_mallocN(len, memh->name);
+ }
+ else {
+ newp = MEM_guarded_mallocN_aligned(len, (size_t) memh->alignment, memh->name);
+ }
+
if (newp) {
if (len < memh->len) {
/* shrink */
@@ -464,6 +487,7 @@ static void make_memhead_header(MemHead *memh, size_t len, const char *str)
memh->nextname = NULL;
memh->len = len;
memh->mmap = 0;
+ memh->alignment = 0;
memh->tag2 = MEMTAG2;
#ifdef DEBUG_MEMDUPLINAME
@@ -514,6 +538,54 @@ void *MEM_guarded_mallocN(size_t len, const char *str)
return NULL;
}
+void *MEM_guarded_mallocN_aligned(size_t len, size_t alignment, const char *str) /* Guarded-allocator malloc whose data pointer is aligned to `alignment`; returns NULL after print_error() when aligned_malloc() fails. */
+{
+ MemHead *memh;
+
+ /* It's possible that MemHead's size is not properly aligned,
+ * so extra padding is added in front of the header to deal with this.
+ *
+ * We only support small alignments which fit into a short in
+ * order to save some bits in the MemHead structure.
+ */
+ short extra_padding = (short)MEMHEAD_ALIGN_PADDING(alignment);
+
+ /* Huge alignment values don't make sense and they
+ * wouldn't fit into the 'short' used in the MemHead.
+ */
+ assert(alignment < 1024);
+
+ /* We only support alignment to a power of two. */
+ assert(IS_POW2(alignment));
+
+ len = SIZET_ALIGN_4(len); /* NOTE(review): presumably rounds len up to a multiple of 4 -- confirm against SIZET_ALIGN_4. */
+
+ memh = (MemHead *)aligned_malloc(len + (size_t)extra_padding + sizeof(MemHead) + sizeof(MemTail), alignment); /* Layout: [padding][MemHead][len data bytes][MemTail]. */
+
+ if (LIKELY(memh)) {
+ /* We keep the padding in front of the MemHead;
+ * this way it's always possible to get the MemHead
+ * from the data pointer by stepping back one header.
+ */
+ memh = (MemHead *)((char *)memh + extra_padding);
+
+ make_memhead_header(memh, len, str);
+ memh->alignment = (short) alignment; /* make_memhead_header() reset this to 0; a non-zero value marks the block as aligned. */
+ if (UNLIKELY(malloc_debug_memset && len))
+ memset(memh + 1, 255, len);
+
+#ifdef DEBUG_MEMCOUNTER
+ if (_mallocn_count == DEBUG_MEMCOUNTER_ERROR_VAL)
+ memcount_raise(__func__);
+ memh->_count = _mallocn_count++;
+#endif
+ return (++memh); /* The data starts directly after the header. */
+ }
+ print_error("aligned_malloc returns null: len=" SIZET_FORMAT " in %s, total %u\n",
+ SIZET_ARG(len), str, (unsigned int) mem_in_use);
+ return NULL;
+}
+
void *MEM_guarded_callocN(size_t len, const char *str)
{
MemHead *memh;
@@ -953,7 +1025,12 @@ static void rem_memblock(MemHead *memh)
else {
if (UNLIKELY(malloc_debug_memset && memh->len))
memset(memh + 1, 255, memh->len);
- free(memh);
+ if (LIKELY(memh->alignment == 0)) {
+ free(memh);
+ }
+ else {
+ aligned_free(MEMHEAD_REAL_PTR(memh));
+ }
}
}
diff --git a/intern/guardedalloc/intern/mallocn_intern.h b/intern/guardedalloc/intern/mallocn_intern.h
index 7c8922dd407..a69bcf3d27b 100644
--- a/intern/guardedalloc/intern/mallocn_intern.h
+++ b/intern/guardedalloc/intern/mallocn_intern.h
@@ -85,6 +85,35 @@
# define UNLIKELY(x) (x)
#endif
+#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__NetBSD__)
+// Needed for memalign on Linux and _aligned_malloc on Windows.
+# ifdef FREE_WINDOWS
+/* make sure _aligned_malloc is included */
+# ifdef __MSVCRT_VERSION__
+# undef __MSVCRT_VERSION__
+# endif
+
+# define __MSVCRT_VERSION__ 0x0700
+# endif // FREE_WINDOWS
+
+# include <malloc.h>
+#else
+// Apple's malloc is 16-byte aligned, and does not have malloc.h, so include
+// stdlib.h instead (the C header, since the guardedalloc sources are C files).
+# include <stdlib.h>
+#endif
+
+#define IS_POW2(a) (((a) & ((a) - 1)) == 0) /* True when at most one bit of a is set; note that 0 also passes. */
+
+/* Extra padding which needs to be applied in front of MemHead to make the data after it aligned. The result is in [1, alignment], so padding + sizeof(MemHeadAligned) is a multiple of alignment. MemHeadAligned is provided by the including implementation file. The argument is not parenthesized: pass a plain identifier or member access. */
+#define MEMHEAD_ALIGN_PADDING(alignment) ((size_t)alignment - (sizeof(MemHeadAligned) % (size_t)alignment))
+
+/* Real pointer returned by malloc or aligned_malloc: steps back from the header over the padding derived from memh->alignment. The argument is not parenthesized: pass a plain identifier. */
+#define MEMHEAD_REAL_PTR(memh) ((char *)memh - MEMHEAD_ALIGN_PADDING(memh->alignment))
+
+void *aligned_malloc(size_t size, size_t alignment); /* Platform aligned allocation; see mallocn.c. */
+void aligned_free(void *ptr); /* Releases a pointer returned by aligned_malloc(). */
+
/* Prototypes for counted allocator functions */
size_t MEM_lockfree_allocN_len(const void *vmemh) ATTR_WARN_UNUSED_RESULT;
void MEM_lockfree_freeN(void *vmemh);
@@ -93,6 +122,7 @@ void *MEM_lockfree_reallocN_id(void *vmemh, size_t len, const char *UNUSED(str))
void *MEM_lockfree_recallocN_id(void *vmemh, size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(2);
void *MEM_lockfree_callocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
void *MEM_lockfree_mallocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
+void *MEM_lockfree_mallocN_aligned(size_t len, size_t alignment, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(3);
void *MEM_lockfree_mapallocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
void MEM_lockfree_printmemlist_pydict(void);
void MEM_lockfree_printmemlist(void);
@@ -119,6 +149,7 @@ void *MEM_guarded_reallocN_id(void *vmemh, size_t len, const char *UNUSED(str))
void *MEM_guarded_recallocN_id(void *vmemh, size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(2);
void *MEM_guarded_callocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
void *MEM_guarded_mallocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
+void *MEM_guarded_mallocN_aligned(size_t len, size_t alignment, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(3);
void *MEM_guarded_mapallocN(size_t len, const char *UNUSED(str)) ATTR_MALLOC ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
void MEM_guarded_printmemlist_pydict(void);
void MEM_guarded_printmemlist(void);
diff --git a/intern/guardedalloc/intern/mallocn_lockfree_impl.c b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
index 6fc01807af3..c76caff0d74 100644
--- a/intern/guardedalloc/intern/mallocn_lockfree_impl.c
+++ b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
@@ -46,6 +46,11 @@ typedef struct MemHead {
size_t len;
} MemHead;
+typedef struct MemHeadAligned { /* Header placed in front of blocks returned by MEM_lockfree_mallocN_aligned(). */
+ short alignment; /* Alignment the block was allocated with; read back by the free, dupalloc and realloc paths. */
+ size_t len; /* Block length with MEMHEAD_ALIGN_FLAG OR-ed in. Last member, so code holding a MemHead pointer reads the same field just before the data (NOTE(review): relies on no padding after len). */
+} MemHeadAligned;
+
static unsigned int totblock = 0;
static size_t mem_in_use = 0, mmap_in_use = 0, peak_mem = 0;
static bool malloc_debug_memset = false;
@@ -54,9 +59,17 @@ static void (*error_callback)(const char *) = NULL;
static void (*thread_lock_callback)(void) = NULL;
static void (*thread_unlock_callback)(void) = NULL;
+enum { /* Flag bits kept in the low bits of the header's len field. */
+ MEMHEAD_MMAP_FLAG = 1, /* Tested by MEMHEAD_IS_MMAP(); set by MEM_lockfree_mapallocN(). */
+ MEMHEAD_ALIGN_FLAG = 2, /* Tested by MEMHEAD_IS_ALIGNED(); set by MEM_lockfree_mallocN_aligned(). */
+};
+
#define MEMHEAD_FROM_PTR(ptr) (((MemHead*) vmemh) - 1)
#define PTR_FROM_MEMHEAD(memhead) (memhead + 1)
-#define MEMHEAD_IS_MMAP(memhead) ((memhead)->len & (size_t) 1)
+#define MEMHEAD_ALIGNED_FROM_PTR(ptr) (((MemHeadAligned*) vmemh) - 1)
+#define PTR_FROM_MEMHEAD_ALIGNED(memhead) (memhead + 1)
+#define MEMHEAD_IS_MMAP(memhead) ((memhead)->len & (size_t) MEMHEAD_MMAP_FLAG)
+#define MEMHEAD_IS_ALIGNED(memhead) ((memhead)->len & (size_t) MEMHEAD_ALIGN_FLAG)
#ifdef __GNUC__
__attribute__ ((format(printf, 1, 2)))
@@ -93,7 +106,7 @@ static void mem_unlock_thread(void)
size_t MEM_lockfree_allocN_len(const void *vmemh)
{
if (vmemh) {
- return MEMHEAD_FROM_PTR(vmemh)->len & ~((size_t) 1);
+ return MEMHEAD_FROM_PTR(vmemh)->len & ~((size_t) (MEMHEAD_MMAP_FLAG | MEMHEAD_ALIGN_FLAG));
}
else {
return 0;
@@ -124,7 +137,13 @@ void MEM_lockfree_freeN(void *vmemh)
if (UNLIKELY(malloc_debug_memset && len)) {
memset(memh + 1, 255, len);
}
- free(memh);
+ if (UNLIKELY(MEMHEAD_IS_ALIGNED(memh))) {
+ MemHeadAligned *memh_aligned = MEMHEAD_ALIGNED_FROM_PTR(vmemh);
+ aligned_free(MEMHEAD_REAL_PTR(memh_aligned));
+ }
+ else {
+ free(memh);
+ }
}
}
@@ -134,9 +153,16 @@ void *MEM_lockfree_dupallocN(const void *vmemh)
if (vmemh) {
MemHead *memh = MEMHEAD_FROM_PTR(vmemh);
const size_t prev_size = MEM_allocN_len(vmemh);
- if (MEMHEAD_IS_MMAP(memh)) {
+ if (UNLIKELY(MEMHEAD_IS_MMAP(memh))) {
newp = MEM_lockfree_mapallocN(prev_size, "dupli_mapalloc");
}
+ else if (UNLIKELY(MEMHEAD_IS_ALIGNED(memh))) {
+ MemHeadAligned *memh_aligned = MEMHEAD_ALIGNED_FROM_PTR(vmemh);
+ newp = MEM_lockfree_mallocN_aligned(
+ prev_size,
+ (size_t)memh_aligned->alignment,
+ "dupli_malloc");
+ }
else {
newp = MEM_lockfree_mallocN(prev_size, "dupli_malloc");
}
@@ -150,9 +176,20 @@ void *MEM_lockfree_reallocN_id(void *vmemh, size_t len, const char *str)
void *newp = NULL;
if (vmemh) {
+ MemHead *memh = MEMHEAD_FROM_PTR(vmemh);
size_t old_len = MEM_allocN_len(vmemh);
- newp = MEM_lockfree_mallocN(len, "realloc");
+ if (LIKELY(!MEMHEAD_IS_ALIGNED(memh))) {
+ newp = MEM_lockfree_mallocN(len, "realloc");
+ }
+ else {
+ MemHeadAligned *memh_aligned = MEMHEAD_ALIGNED_FROM_PTR(vmemh);
+ newp = MEM_lockfree_mallocN_aligned(
+ old_len,
+ (size_t)memh_aligned->alignment,
+ "realloc");
+ }
+
if (newp) {
if (len < old_len) {
/* shrink */
@@ -178,9 +215,19 @@ void *MEM_lockfree_recallocN_id(void *vmemh, size_t len, const char *str)
void *newp = NULL;
if (vmemh) {
+ MemHead *memh = MEMHEAD_FROM_PTR(vmemh);
size_t old_len = MEM_allocN_len(vmemh);
- newp = MEM_lockfree_mallocN(len, "recalloc");
+ if (LIKELY(!MEMHEAD_IS_ALIGNED(memh))) {
+ newp = MEM_lockfree_mallocN(len, "recalloc");
+ }
+ else {
+ MemHeadAligned *memh_aligned = MEMHEAD_ALIGNED_FROM_PTR(vmemh);
+ newp = MEM_lockfree_mallocN_aligned(old_len,
+ (size_t)memh_aligned->alignment,
+ "recalloc");
+ }
+
if (newp) {
if (len < old_len) {
/* shrink */
@@ -256,6 +303,57 @@ void *MEM_lockfree_mallocN(size_t len, const char *str)
return NULL;
}
+void *MEM_lockfree_mallocN_aligned(size_t len, size_t alignment, const char *str) /* Lock-free allocator malloc whose data pointer is aligned to `alignment`; returns NULL after print_error() when aligned_malloc() fails. */
+{
+ MemHeadAligned *memh;
+
+ /* It's possible that MemHead's size is not properly aligned,
+ * so extra padding is added in front of the header to deal with this.
+ *
+ * We only support small alignments which fit into a short in
+ * order to save some bits in the MemHead structure.
+ */
+ size_t extra_padding = MEMHEAD_ALIGN_PADDING(alignment);
+
+ /* Huge alignment values don't make sense and they
+ * wouldn't fit into the 'short' used in the MemHead.
+ */
+ assert(alignment < 1024);
+
+ /* We only support alignment to a power of two. */
+ assert(IS_POW2(alignment));
+
+ len = SIZET_ALIGN_4(len); /* NOTE(review): presumably rounds len up to a multiple of 4, which would keep the two flag bits of len free -- confirm against SIZET_ALIGN_4. */
+
+ memh = (MemHeadAligned *)aligned_malloc(
+ len + extra_padding + sizeof(MemHeadAligned), alignment); /* Layout: [padding][MemHeadAligned][len data bytes]. */
+
+ if (LIKELY(memh)) {
+ /* We keep the padding in front of the MemHead;
+ * this way it's always possible to get the MemHead
+ * from the data pointer by stepping back one header.
+ */
+ memh = (MemHeadAligned *)((char *)memh + extra_padding);
+
+ if (UNLIKELY(malloc_debug_memset && len)) {
+ memset(memh + 1, 255, len);
+ }
+
+ memh->len = len | (size_t) MEMHEAD_ALIGN_FLAG; /* The flag is what MEMHEAD_IS_ALIGNED() tests later. */
+ memh->alignment = (short) alignment; /* Needed by MEMHEAD_REAL_PTR() to recover the pointer to free. */
+ atomic_add_u(&totblock, 1);
+ atomic_add_z(&mem_in_use, len);
+
+ /* TODO(sergey): Not strictly speaking thread-safe. */
+ peak_mem = mem_in_use > peak_mem ? mem_in_use : peak_mem;
+
+ return PTR_FROM_MEMHEAD(memh); /* Expands to memh + 1, i.e. the data right after the aligned header. */
+ }
+ print_error("Malloc returns null: len=" SIZET_FORMAT " in %s, total %u\n",
+ SIZET_ARG(len), str, (unsigned int) mem_in_use);
+ return NULL;
+}
+
void *MEM_lockfree_mapallocN(size_t len, const char *str)
{
MemHead *memh;
@@ -279,7 +377,7 @@ void *MEM_lockfree_mapallocN(size_t len, const char *str)
#endif
if (memh != (MemHead *)-1) {
- memh->len = len | (size_t) 1;
+ memh->len = len | (size_t) MEMHEAD_MMAP_FLAG;
atomic_add_u(&totblock, 1);
atomic_add_z(&mem_in_use, len);
atomic_add_z(&mmap_in_use, len);
diff --git a/source/blender/compositor/intern/COM_MemoryBuffer.cpp b/source/blender/compositor/intern/COM_MemoryBuffer.cpp
index 04828bfe3f8..c1916f4a68f 100644
--- a/source/blender/compositor/intern/COM_MemoryBuffer.cpp
+++ b/source/blender/compositor/intern/COM_MemoryBuffer.cpp
@@ -46,7 +46,7 @@ MemoryBuffer::MemoryBuffer(MemoryProxy *memoryProxy, unsigned int chunkNumber, r
BLI_rcti_init(&this->m_rect, rect->xmin, rect->xmax, rect->ymin, rect->ymax);
this->m_memoryProxy = memoryProxy;
this->m_chunkNumber = chunkNumber;
- this->m_buffer = (float *)MEM_mallocN(sizeof(float) * determineBufferSize() * COM_NUMBER_OF_CHANNELS, "COM_MemoryBuffer");
+ this->m_buffer = (float *)MEM_mallocN_aligned(sizeof(float) * determineBufferSize() * COM_NUMBER_OF_CHANNELS, 16, "COM_MemoryBuffer");
this->m_state = COM_MB_ALLOCATED;
this->m_datatype = COM_DT_COLOR;
this->m_chunkWidth = this->m_rect.xmax - this->m_rect.xmin;
@@ -57,7 +57,7 @@ MemoryBuffer::MemoryBuffer(MemoryProxy *memoryProxy, rcti *rect)
BLI_rcti_init(&this->m_rect, rect->xmin, rect->xmax, rect->ymin, rect->ymax);
this->m_memoryProxy = memoryProxy;
this->m_chunkNumber = -1;
- this->m_buffer = (float *)MEM_mallocN(sizeof(float) * determineBufferSize() * COM_NUMBER_OF_CHANNELS, "COM_MemoryBuffer");
+ this->m_buffer = (float *)MEM_mallocN_aligned(sizeof(float) * determineBufferSize() * COM_NUMBER_OF_CHANNELS, 16, "COM_MemoryBuffer");
this->m_state = COM_MB_TEMPORARILY;
this->m_datatype = COM_DT_COLOR;
this->m_chunkWidth = this->m_rect.xmax - this->m_rect.xmin;
diff --git a/source/blender/compositor/operations/COM_BlurBaseOperation.cpp b/source/blender/compositor/operations/COM_BlurBaseOperation.cpp
index e7af9319f88..d5aafc7c2ae 100644
--- a/source/blender/compositor/operations/COM_BlurBaseOperation.cpp
+++ b/source/blender/compositor/operations/COM_BlurBaseOperation.cpp
@@ -91,6 +91,18 @@ float *BlurBaseOperation::make_gausstab(float rad, int size)
return gausstab;
}
+#ifdef __SSE2__
+__m128 *BlurBaseOperation::convert_gausstab_sse(const float *gausstab, float rad, int size) /* Build an SSE copy of a gauss table: one __m128 per weight, with the weight in all four lanes. `rad` is not used here. */
+{
+ int n = 2 * size + 1; /* Number of table entries read from gausstab. */
+ __m128 *gausstab_sse = (__m128 *) MEM_mallocN_aligned(sizeof(__m128) * n, 16, "gausstab sse"); /* 16-byte aligned storage; NOTE(review): the result is not NULL-checked before the loop writes to it. */
+ for (int i = 0; i < n; ++i) {
+ gausstab_sse[i] = _mm_set1_ps(gausstab[i]); /* Broadcast the scalar weight to every lane. */
+ }
+ return gausstab_sse; /* The blur operations in this patch release it with MEM_freeN() in deinitExecution(). */
+}
+#endif
+
/* normalized distance from the current (inverted so 1.0 is close and 0.0 is far)
* 'ease' is applied after, looks nicer */
float *BlurBaseOperation::make_dist_fac_inverse(float rad, int size, int falloff)
diff --git a/source/blender/compositor/operations/COM_BlurBaseOperation.h b/source/blender/compositor/operations/COM_BlurBaseOperation.h
index 052a525ef2c..e97dd4d766d 100644
--- a/source/blender/compositor/operations/COM_BlurBaseOperation.h
+++ b/source/blender/compositor/operations/COM_BlurBaseOperation.h
@@ -27,6 +27,10 @@
#define MAX_GAUSSTAB_RADIUS 30000
+#ifdef __SSE2__
+# include <emmintrin.h>
+#endif
+
class BlurBaseOperation : public NodeOperation, public QualityStepHelper {
private:
@@ -34,6 +38,9 @@ protected:
BlurBaseOperation(DataType data_type);
float *make_gausstab(float rad, int size);
+#ifdef __SSE2__
+ __m128 *convert_gausstab_sse(const float *gaustab, float rad, int size);
+#endif
float *make_dist_fac_inverse(float rad, int size, int falloff);
void updateSize();
diff --git a/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp b/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp
index d08924ca4ef..0aefba3bb7c 100644
--- a/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp
+++ b/source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp
@@ -31,6 +31,9 @@ extern "C" {
GaussianXBlurOperation::GaussianXBlurOperation() : BlurBaseOperation(COM_DT_COLOR)
{
this->m_gausstab = NULL;
+#ifdef __SSE2__
+ this->m_gausstab_sse = NULL;
+#endif
this->m_filtersize = 0;
}
@@ -54,8 +57,14 @@ void GaussianXBlurOperation::initExecution()
if (this->m_sizeavailable) {
float rad = max_ff(m_size * m_data.sizex, 0.0f);
m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS);
-
+
+ /* TODO(sergey): De-duplicate with the case below and Y blur. */
this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize);
+#ifdef __SSE2__
+ this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab,
+ rad,
+ m_filtersize);
+#endif
}
}
@@ -65,8 +74,13 @@ void GaussianXBlurOperation::updateGauss()
updateSize();
float rad = max_ff(m_size * m_data.sizex, 0.0f);
m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS);
-
+
this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize);
+#ifdef __SSE2__
+ this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab,
+ rad,
+ m_filtersize);
+#endif
}
}
@@ -88,12 +102,25 @@ void GaussianXBlurOperation::executePixel(float output[4], int x, int y, void *d
int step = getStep();
int offsetadd = getOffsetAdd();
int bufferindex = ((xmin - bufferstartx) * 4) + ((ymin - bufferstarty) * 4 * bufferwidth);
+
+#ifdef __SSE2__
+ __m128 accum_r = _mm_load_ps(color_accum);
+ for (int nx = xmin, index = (xmin - x) + this->m_filtersize; nx < xmax; nx += step, index += step) {
+ __m128 reg_a = _mm_load_ps(&buffer[bufferindex]);
+ reg_a = _mm_mul_ps(reg_a, this->m_gausstab_sse[index]);
+ accum_r = _mm_add_ps(accum_r, reg_a);
+ multiplier_accum += this->m_gausstab[index];
+ bufferindex += offsetadd;
+ }
+ _mm_store_ps(color_accum, accum_r);
+#else
for (int nx = xmin, index = (xmin - x) + this->m_filtersize; nx < xmax; nx += step, index += step) {
const float multiplier = this->m_gausstab[index];
madd_v4_v4fl(color_accum, &buffer[bufferindex], multiplier);
multiplier_accum += multiplier;
bufferindex += offsetadd;
}
+#endif
mul_v4_v4fl(output, color_accum, 1.0f / multiplier_accum);
}
@@ -105,6 +132,12 @@ void GaussianXBlurOperation::deinitExecution()
MEM_freeN(this->m_gausstab);
this->m_gausstab = NULL;
}
+#ifdef __SSE2__
+ if (this->m_gausstab_sse) {
+ MEM_freeN(this->m_gausstab_sse);
+ this->m_gausstab_sse = NULL;
+ }
+#endif
deinitMutex();
}
diff --git a/source/blender/compositor/operations/COM_GaussianXBlurOperation.h b/source/blender/compositor/operations/COM_GaussianXBlurOperation.h
index 6442f214138..e391320a007 100644
--- a/source/blender/compositor/operations/COM_GaussianXBlurOperation.h
+++ b/source/blender/compositor/operations/COM_GaussianXBlurOperation.h
@@ -28,6 +28,9 @@
class GaussianXBlurOperation : public BlurBaseOperation {
private:
float *m_gausstab;
+#ifdef __SSE2__
+ __m128 *m_gausstab_sse;
+#endif
int m_filtersize;
void updateGauss();
public:
diff --git a/source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp b/source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp
index 8216b79372f..a05a1ab6a23 100644
--- a/source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp
+++ b/source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp
@@ -31,6 +31,9 @@ extern "C" {
GaussianYBlurOperation::GaussianYBlurOperation() : BlurBaseOperation(COM_DT_COLOR)
{
this->m_gausstab = NULL;
+#ifdef __SSE2__
+ this->m_gausstab_sse = NULL;
+#endif
this->m_filtersize = 0;
}
@@ -54,8 +57,13 @@ void GaussianYBlurOperation::initExecution()
if (this->m_sizeavailable) {
float rad = max_ff(m_size * m_data.sizey, 0.0f);
m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS);
-
+
this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize);
+#ifdef __SSE2__
+ this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab,
+ rad,
+ m_filtersize);
+#endif
}
}
@@ -65,8 +73,13 @@ void GaussianYBlurOperation::updateGauss()
updateSize();
float rad = max_ff(m_size * m_data.sizey, 0.0f);
m_filtersize = min_ii(ceil(rad), MAX_GAUSSTAB_RADIUS);
-
+
this->m_gausstab = BlurBaseOperation::make_gausstab(rad, m_filtersize);
+#ifdef __SSE2__
+ this->m_gausstab_sse = BlurBaseOperation::convert_gausstab_sse(this->m_gausstab,
+ rad,
+ m_filtersize);
+#endif
}
}
@@ -88,6 +101,20 @@ void GaussianYBlurOperation::executePixel(float output[4], int x, int y, void *d
int index;
int step = getStep();
const int bufferIndexx = ((xmin - bufferstartx) * 4);
+
+#ifdef __SSE2__
+ __m128 accum_r = _mm_load_ps(color_accum);
+ for (int ny = ymin; ny < ymax; ny += step) {
+ index = (ny - y) + this->m_filtersize;
+ int bufferindex = bufferIndexx + ((ny - bufferstarty) * 4 * bufferwidth);
+ const float multiplier = this->m_gausstab[index];
+ __m128 reg_a = _mm_load_ps(&buffer[bufferindex]);
+ reg_a = _mm_mul_ps(reg_a, this->m_gausstab_sse[index]);
+ accum_r = _mm_add_ps(accum_r, reg_a);
+ multiplier_accum += multiplier;
+ }
+ _mm_store_ps(color_accum, accum_r);
+#else
for (int ny = ymin; ny < ymax; ny += step) {
index = (ny - y) + this->m_filtersize;
int bufferindex = bufferIndexx + ((ny - bufferstarty) * 4 * bufferwidth);
@@ -95,6 +122,7 @@ void GaussianYBlurOperation::executePixel(float output[4], int x, int y, void *d
madd_v4_v4fl(color_accum, &buffer[bufferindex], multiplier);
multiplier_accum += multiplier;
}
+#endif
mul_v4_v4fl(output, color_accum, 1.0f / multiplier_accum);
}
@@ -106,6 +134,12 @@ void GaussianYBlurOperation::deinitExecution()
MEM_freeN(this->m_gausstab);
this->m_gausstab = NULL;
}
+#ifdef __SSE2__
+ if (this->m_gausstab_sse) {
+ MEM_freeN(this->m_gausstab_sse);
+ this->m_gausstab_sse = NULL;
+ }
+#endif
deinitMutex();
}
diff --git a/source/blender/compositor/operations/COM_GaussianYBlurOperation.h b/source/blender/compositor/operations/COM_GaussianYBlurOperation.h
index 16503360de2..22b6562077d 100644
--- a/source/blender/compositor/operations/COM_GaussianYBlurOperation.h
+++ b/source/blender/compositor/operations/COM_GaussianYBlurOperation.h
@@ -28,6 +28,9 @@
class GaussianYBlurOperation : public BlurBaseOperation {
private:
float *m_gausstab;
+#ifdef __SSE2__
+ __m128 *m_gausstab_sse;
+#endif
int m_filtersize;
void updateGauss();
public: