diff options
author | Kacper Michajłow <kasper93@gmail.com> | 2017-08-13 17:39:25 +0300 |
---|---|---|
committer | Kacper Michajłow <kasper93@gmail.com> | 2017-08-13 23:24:43 +0300 |
commit | fe9d00c292419dc67849bd063812d8bb44feeb6d (patch) | |
tree | 1ee920e2a7158cbcd3937126d884afae9ef3e362 | |
parent | 620d8e5bef3e071aa0a2e0d3d5b49f71f45ed712 (diff) |
Do not over-optimize memsetd.
For our usage, the low-overhead rep stosd is faster than SSE2 stores. On modern
CPUs rep stosd is quite fast. We don't need high throughput; low overhead
is more important for our use case.
-rw-r--r-- | src/DSUtil/DSUtil.cpp | 30 |
1 files changed, 1 insertions, 29 deletions
diff --git a/src/DSUtil/DSUtil.cpp b/src/DSUtil/DSUtil.cpp index be4ab09e4..b7501b94b 100644 --- a/src/DSUtil/DSUtil.cpp +++ b/src/DSUtil/DSUtil.cpp @@ -919,35 +919,7 @@ REFERENCE_TIME HMSF2RT(DVD_HMSF_TIMECODE hmsf, double fps /*= -1.0*/) void memsetd(void* dst, unsigned int c, size_t nbytes)
{
size_t n = nbytes / 4;
-
-#if defined(_M_IX86_FP) && _M_IX86_FP < 2
- if (!(g_cpuid.m_flags & g_cpuid.sse2)) { // No SSE2
- __stosd((unsigned long*)dst, c, n);
- return;
- }
-#endif
-
- size_t o = n - (n % 4);
-
- __m128i val = _mm_set1_epi32((int)c);
- if (((uintptr_t)dst & 0x0F) == 0) { // 16-byte aligned
- for (size_t i = 0; i < o; i += 4) {
- _mm_store_si128((__m128i*) & (((DWORD*)dst)[i]), val);
- }
- } else {
- for (size_t i = 0; i < o; i += 4) {
- _mm_storeu_si128((__m128i*) & (((DWORD*)dst)[i]), val);
- }
- }
-
- switch (n - o) {
- case 3:
- ((DWORD*)dst)[o + 2] = c;
- case 2:
- ((DWORD*)dst)[o + 1] = c;
- case 1:
- ((DWORD*)dst)[o + 0] = c;
- }
+ __stosd((unsigned long*)dst, c, n);
}
void memsetw(void* dst, unsigned short c, size_t nbytes)
|