diff options
author | Hendrik Leppkes <h.leppkes@gmail.com> | 2017-03-20 12:53:07 +0300 |
---|---|---|
committer | Hendrik Leppkes <h.leppkes@gmail.com> | 2017-03-20 12:53:07 +0300 |
commit | e623b064ebeb46fb9159ef21877367a20eec91c2 (patch) | |
tree | 1d4933d8c25ab6fe12818604631dfae4beed5100 /common | |
parent | a11dd6ed22b33e42ba639a72ed5e6a8a0205b941 (diff) |
gpu_memcpy: add a memory barrier to avoid compiler re-ordering
Diffstat (limited to 'common')
-rw-r--r-- | common/DSUtilLite/gpu_memcpy_sse4.h | 5 |
1 file changed, 4 insertions, 1 deletion
```diff
diff --git a/common/DSUtilLite/gpu_memcpy_sse4.h b/common/DSUtilLite/gpu_memcpy_sse4.h
index 76eec8e1..48aec0f1 100644
--- a/common/DSUtilLite/gpu_memcpy_sse4.h
+++ b/common/DSUtilLite/gpu_memcpy_sse4.h
@@ -76,7 +76,9 @@ inline void* gpu_memcpy(void* d, const void* s, size_t size)
 xmm14 = _mm_stream_load_si128(pSrc + 14);
 xmm15 = _mm_stream_load_si128(pSrc + 15);
 #endif
- pSrc += regsInLoop;
+
+ _ReadWriteBarrier();
+
 // _mm_store_si128 emit the SSE2 intruction MOVDQA (aligned store)
 _mm_store_si128(pTrg , xmm0);
 _mm_store_si128(pTrg + 1, xmm1);
@@ -96,6 +98,7 @@ inline void* gpu_memcpy(void* d, const void* s, size_t size)
 _mm_store_si128(pTrg + 14, xmm14);
 _mm_store_si128(pTrg + 15, xmm15);
 #endif
+ pSrc += regsInLoop;
 pTrg += regsInLoop;
 }
```