github.com/llvm/llvm-project.git
author     Guillaume Chatelet <gchatelet@google.com>   2022-11-02 12:03:58 +0300
committer  Guillaume Chatelet <gchatelet@google.com>   2022-11-02 12:09:46 +0300
commit     67437dd0147e92fe601ef76d17dbbf171f638580 (patch)
tree       cbeed6f9ba8a592f7816432fd31d2de00b1ce0ed /libc
parent     17c9d4dfeeb99795d2c176035eb06cd75d70dda6 (diff)
[reland][libc] Switch to new implementation of mem* functions
The new framework makes it explicit which processor feature is being used
and allows for easier per platform customization:
 - ARM cpu now uses trivial implementations to reduce code size.
 - Memcmp, Bcmp and Memmove have been optimized for x86.
 - Bcmp has been optimized for aarch64.

This is a reland of https://reviews.llvm.org/D135134 (b3f1d58, 028414881381)

Reviewed By: courbet

Differential Revision: https://reviews.llvm.org/D136595
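A building block that recurs throughout these implementations is the head_tail strategy: for BlockSize <= count <= 2 * BlockSize, the code issues two fixed-size, possibly overlapping block operations instead of a loop. A minimal sketch of the idea, using a hypothetical helper name rather than the libc code itself:

#include <cstddef>
#include <cstring>

// Copies `count` bytes as a fixed-size head block plus a fixed-size tail
// block. The two blocks may overlap in the middle; that is harmless because
// the overlapping bytes are written with identical data.
template <size_t BlockSize>
void head_tail_copy(char *dst, const char *src, size_t count) {
  // Precondition: BlockSize <= count && count <= 2 * BlockSize.
  std::memcpy(dst, src, BlockSize);
  std::memcpy(dst + count - BlockSize, src + count - BlockSize, BlockSize);
}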
Diffstat (limited to 'libc')
-rw-r--r--  libc/src/stdio/printf_core/string_writer.cpp             2
-rw-r--r--  libc/src/string/bcmp.cpp                                 3
-rw-r--r--  libc/src/string/memcmp.cpp                               3
-rw-r--r--  libc/src/string/memcpy.cpp                               3
-rw-r--r--  libc/src/string/memmove.cpp                            104
-rw-r--r--  libc/src/string/memory_utils/bcmp_implementations.h    176
-rw-r--r--  libc/src/string/memory_utils/bzero_implementations.h     6
-rw-r--r--  libc/src/string/memory_utils/memcmp_implementations.h  182
-rw-r--r--  libc/src/string/memory_utils/memcpy_implementations.h  197
-rw-r--r--  libc/src/string/memory_utils/memset_implementations.h  160
-rw-r--r--  libc/src/string/memory_utils/op_x86.h                    2
-rw-r--r--  libc/src/string/mempcpy.cpp                              7
-rw-r--r--  libc/src/string/memset.cpp                               3
13 files changed, 525 insertions, 323 deletions
diff --git a/libc/src/stdio/printf_core/string_writer.cpp b/libc/src/stdio/printf_core/string_writer.cpp
index a80df32d40a0..472573d4a813 100644
--- a/libc/src/stdio/printf_core/string_writer.cpp
+++ b/libc/src/stdio/printf_core/string_writer.cpp
@@ -33,7 +33,7 @@ void StringWriter::write(char new_char, size_t len) {
len = available_capacity;
if (len > 0) {
- inline_memset(cur_buffer, new_char, len);
+ inline_memset(cur_buffer, static_cast<uint8_t>(new_char), len);
cur_buffer += len;
available_capacity -= len;
}
diff --git a/libc/src/string/bcmp.cpp b/libc/src/string/bcmp.cpp
index 963a7f5bce17..21991303b146 100644
--- a/libc/src/string/bcmp.cpp
+++ b/libc/src/string/bcmp.cpp
@@ -14,8 +14,7 @@ namespace __llvm_libc {
LLVM_LIBC_FUNCTION(int, bcmp,
(const void *lhs, const void *rhs, size_t count)) {
- return inline_bcmp(static_cast<const char *>(lhs),
- static_cast<const char *>(rhs), count);
+ return inline_bcmp(lhs, rhs, count);
}
} // namespace __llvm_libc
diff --git a/libc/src/string/memcmp.cpp b/libc/src/string/memcmp.cpp
index 292525e17dad..7cf6782dd0d5 100644
--- a/libc/src/string/memcmp.cpp
+++ b/libc/src/string/memcmp.cpp
@@ -15,8 +15,7 @@ namespace __llvm_libc {
LLVM_LIBC_FUNCTION(int, memcmp,
(const void *lhs, const void *rhs, size_t count)) {
- return inline_memcmp(static_cast<const char *>(lhs),
- static_cast<const char *>(rhs), count);
+ return inline_memcmp(lhs, rhs, count);
}
} // namespace __llvm_libc
diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
index ff990f48a20b..850400540037 100644
--- a/libc/src/string/memcpy.cpp
+++ b/libc/src/string/memcpy.cpp
@@ -15,8 +15,7 @@ namespace __llvm_libc {
LLVM_LIBC_FUNCTION(void *, memcpy,
(void *__restrict dst, const void *__restrict src,
size_t size)) {
- inline_memcpy(reinterpret_cast<char *>(dst),
- reinterpret_cast<const char *>(src), size);
+ inline_memcpy(dst, src, size);
return dst;
}
diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp
index f24257893b20..a42ced3fc36b 100644
--- a/libc/src/string/memmove.cpp
+++ b/libc/src/string/memmove.cpp
@@ -9,42 +9,110 @@
#include "src/string/memmove.h"
#include "src/__support/common.h"
-#include "src/__support/integer_operations.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
#include <stddef.h> // size_t, ptrdiff_t
+#include <stdio.h>
+
namespace __llvm_libc {
-static inline void inline_memmove(char *dst, const char *src, size_t count) {
- using namespace __llvm_libc::scalar;
+[[maybe_unused]] static inline void
+inline_memmove_embedded_tiny(Ptr dst, CPtr src, size_t count) {
+ if ((count == 0) || (dst == src))
+ return;
+ if (dst < src) {
+#pragma nounroll
+ for (size_t offset = 0; offset < count; ++offset)
+ builtin::Memcpy<1>::block(dst + offset, src + offset);
+ } else {
+#pragma nounroll
+ for (ptrdiff_t offset = count - 1; offset >= 0; --offset)
+ builtin::Memcpy<1>::block(dst + offset, src + offset);
+ }
+}
+
+template <size_t MaxSize>
+[[maybe_unused]] static inline void inline_memmove_generic(Ptr dst, CPtr src,
+ size_t count) {
if (count == 0)
return;
if (count == 1)
- return move<_1>(dst, src);
+ return generic::Memmove<1, MaxSize>::block(dst, src);
if (count <= 4)
- return move<HeadTail<_2>>(dst, src, count);
+ return generic::Memmove<2, MaxSize>::head_tail(dst, src, count);
if (count <= 8)
- return move<HeadTail<_4>>(dst, src, count);
+ return generic::Memmove<4, MaxSize>::head_tail(dst, src, count);
if (count <= 16)
- return move<HeadTail<_8>>(dst, src, count);
+ return generic::Memmove<8, MaxSize>::head_tail(dst, src, count);
if (count <= 32)
- return move<HeadTail<_16>>(dst, src, count);
+ return generic::Memmove<16, MaxSize>::head_tail(dst, src, count);
if (count <= 64)
- return move<HeadTail<_32>>(dst, src, count);
+ return generic::Memmove<32, MaxSize>::head_tail(dst, src, count);
if (count <= 128)
- return move<HeadTail<_64>>(dst, src, count);
+ return generic::Memmove<64, MaxSize>::head_tail(dst, src, count);
+ if (dst < src) {
+ generic::Memmove<32, MaxSize>::template align_forward<Arg::Src>(dst, src,
+ count);
+ return generic::Memmove<64, MaxSize>::loop_and_tail_forward(dst, src,
+ count);
+ } else {
+ generic::Memmove<32, MaxSize>::template align_backward<Arg::Src>(dst, src,
+ count);
+ return generic::Memmove<64, MaxSize>::loop_and_tail_backward(dst, src,
+ count);
+ }
+}
- using AlignedMoveLoop = Align<_16, Arg::Src>::Then<Loop<_64>>;
- if (dst < src)
- return move<AlignedMoveLoop>(dst, src, count);
- else if (dst > src)
- return move_backward<AlignedMoveLoop>(dst, src, count);
+static inline void inline_memmove(Ptr dst, CPtr src, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
+#if defined(LLVM_LIBC_ARCH_X86)
+ static constexpr size_t kMaxSize = x86::kAvx512F ? 64
+ : x86::kAvx ? 32
+ : x86::kSse2 ? 16
+ : 8;
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+ static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
+#endif
+ // return inline_memmove_generic<kMaxSize>(dst, src, count);
+ if (count == 0)
+ return;
+ if (count == 1)
+ return generic::Memmove<1, kMaxSize>::block(dst, src);
+ if (count <= 4)
+ return generic::Memmove<2, kMaxSize>::head_tail(dst, src, count);
+ if (count <= 8)
+ return generic::Memmove<4, kMaxSize>::head_tail(dst, src, count);
+ if (count <= 16)
+ return generic::Memmove<8, kMaxSize>::head_tail(dst, src, count);
+ if (count <= 32)
+ return generic::Memmove<16, kMaxSize>::head_tail(dst, src, count);
+ if (count <= 64)
+ return generic::Memmove<32, kMaxSize>::head_tail(dst, src, count);
+ if (count <= 128)
+ return generic::Memmove<64, kMaxSize>::head_tail(dst, src, count);
+ if (dst < src) {
+ generic::Memmove<32, kMaxSize>::align_forward<Arg::Src>(dst, src, count);
+ return generic::Memmove<64, kMaxSize>::loop_and_tail_forward(dst, src,
+ count);
+ } else {
+ generic::Memmove<32, kMaxSize>::align_backward<Arg::Src>(dst, src, count);
+ return generic::Memmove<64, kMaxSize>::loop_and_tail_backward(dst, src,
+ count);
+ }
+#elif defined(LLVM_LIBC_ARCH_ARM)
+ return inline_memmove_embedded_tiny(dst, src, count);
+#else
+#error "Unsupported platform"
+#endif
}
LLVM_LIBC_FUNCTION(void *, memmove,
(void *dst, const void *src, size_t count)) {
- inline_memmove(reinterpret_cast<char *>(dst),
- reinterpret_cast<const char *>(src), count);
+ inline_memmove(reinterpret_cast<Ptr>(dst), reinterpret_cast<CPtr>(src),
+ count);
return dst;
}
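The direction test in the new code (`dst < src` versus the backward path) is what makes memmove safe on overlapping buffers: copying forward when dst sits above src would overwrite source bytes before they are read. The embedded-tiny path in this diff is the byte-wise form of that rule; as a standalone sketch mirroring inline_memmove_embedded_tiny above:

#include <cstddef>

// Minimal overlap-safe move: copy forward when dst precedes src, backward
// otherwise.
void naive_memmove(char *dst, const char *src, size_t count) {
  if (count == 0 || dst == src)
    return;
  if (dst < src)
    for (size_t i = 0; i < count; ++i)
      dst[i] = src[i];
  else
    for (size_t i = count; i > 0; --i)
      dst[i - 1] = src[i - 1];
}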
diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h
index c26e38e51adf..2e18ee81aaf6 100644
--- a/libc/src/string/memory_utils/bcmp_implementations.h
+++ b/libc/src/string/memory_utils/bcmp_implementations.h
@@ -11,49 +11,169 @@
#include "src/__support/architectures.h"
#include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
#include <stddef.h> // size_t
namespace __llvm_libc {
-// Fixed-size difference between 'lhs' and 'rhs'.
-template <typename Element> bool differs(const char *lhs, const char *rhs) {
- return !Element::equals(lhs, rhs);
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
+#pragma nounroll
+ for (size_t offset = 0; offset < count; ++offset)
+ if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset))
+ return value;
+ return BcmpReturnType::ZERO();
}
-// Runtime-size difference between 'lhs' and 'rhs'.
-template <typename Element>
-bool differs(const char *lhs, const char *rhs, size_t size) {
- return !Element::equals(lhs, rhs, size);
+
+#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
+ if (count < 256)
+ return generic::Bcmp<16>::loop_and_tail(p1, p2, count);
+ if (auto value = generic::Bcmp<64>::block(p1, p2))
+ return value;
+ align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+ return generic::Bcmp<64>::loop_and_tail(p1, p2, count);
}
+#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
-static inline int inline_bcmp(const char *lhs, const char *rhs, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
- using namespace ::__llvm_libc::x86;
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
- using namespace ::__llvm_libc::aarch64;
-#else
- using namespace ::__llvm_libc::scalar;
-#endif
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
+ if (count <= 32)
+ return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+ if (count < 256)
+ return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count);
+ if (auto value = x86::sse2::Bcmp<16>::block(p1, p2))
+ return value;
+ align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+ return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
+ if (count <= 32)
+ return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+ if (count <= 64)
+ return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
+ if (count <= 128)
+ return x86::avx2::Bcmp<64>::head_tail(p1, p2, count);
+ if (unlikely(count >= 256)) {
+ if (auto value = x86::avx2::Bcmp<64>::block(p1, p2))
+ return value;
+ align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+ }
+ return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
+ if (count <= 32)
+ return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+ if (count <= 64)
+ return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
+ if (count <= 128)
+ return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count);
+ if (unlikely(count >= 256)) {
+ if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2))
+ return value;
+ align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+ }
+ return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2,
+ size_t count) {
if (count == 0)
- return 0;
+ return BcmpReturnType::ZERO();
if (count == 1)
- return differs<_1>(lhs, rhs);
+ return generic::Bcmp<1>::block(p1, p2);
if (count == 2)
- return differs<_2>(lhs, rhs);
- if (count == 3)
- return differs<_3>(lhs, rhs);
+ return generic::Bcmp<2>::block(p1, p2);
+ if (count <= 4)
+ return generic::Bcmp<2>::head_tail(p1, p2, count);
if (count <= 8)
- return differs<HeadTail<_4>>(lhs, rhs, count);
+ return generic::Bcmp<4>::head_tail(p1, p2, count);
if (count <= 16)
- return differs<HeadTail<_8>>(lhs, rhs, count);
- if (count <= 32)
- return differs<HeadTail<_16>>(lhs, rhs, count);
+ return generic::Bcmp<8>::head_tail(p1, p2, count);
+ if constexpr (x86::kAvx512BW)
+ return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
+ else if constexpr (x86::kAvx2)
+ return inline_bcmp_x86_avx2_gt16(p1, p2, count);
+ else if constexpr (x86::kSse2)
+ return inline_bcmp_x86_sse2_gt16(p1, p2, count);
+ else
+ return inline_bcmp_generic_gt16(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_aarch64(CPtr p1, CPtr p2, size_t count) {
+ if (likely(count <= 32)) {
+ if (unlikely(count >= 16)) {
+ return generic::Bcmp<16>::head_tail(p1, p2, count);
+ }
+ switch (count) {
+ case 0:
+ return BcmpReturnType::ZERO();
+ case 1:
+ return generic::Bcmp<1>::block(p1, p2);
+ case 2:
+ return generic::Bcmp<2>::block(p1, p2);
+ case 3:
+ return generic::Bcmp<2>::head_tail(p1, p2, count);
+ case 4:
+ return generic::Bcmp<4>::block(p1, p2);
+ case 5:
+ case 6:
+ case 7:
+ return generic::Bcmp<4>::head_tail(p1, p2, count);
+ case 8:
+ return generic::Bcmp<8>::block(p1, p2);
+ case 9:
+ case 10:
+ case 11:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ return generic::Bcmp<8>::head_tail(p1, p2, count);
+ }
+ }
+
if (count <= 64)
- return differs<HeadTail<_32>>(lhs, rhs, count);
- if (count <= 128)
- return differs<HeadTail<_64>>(lhs, rhs, count);
- return differs<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
+ return generic::Bcmp<32>::head_tail(p1, p2, count);
+
+ // Aligned loop if > 256, otherwise normal loop
+ if (count > 256) {
+ if (auto value = generic::Bcmp<32>::block(p1, p2))
+ return value;
+ align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+ }
+ return generic::Bcmp<32>::loop_and_tail(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)
+
+static inline BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86)
+ return inline_bcmp_x86(p1, p2, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+ return inline_bcmp_aarch64(p1, p2, count);
+#elif defined(LLVM_LIBC_ARCH_ARM)
+ return inline_bcmp_embedded_tiny(p1, p2, count);
+#else
+#error "Unsupported platform"
+#endif
+}
+
+static inline int inline_bcmp(const void *p1, const void *p2, size_t count) {
+ return static_cast<int>(inline_bcmp(reinterpret_cast<CPtr>(p1),
+ reinterpret_cast<CPtr>(p2), count));
}
} // namespace __llvm_libc
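Unlike memcmp, bcmp only has to report whether the buffers differ, not how they order, which is why the implementations above can return the raw result of a block comparison. A sketch of what a fixed-size block compare can reduce to (an assumption about the generic::Bcmp internals, not its actual code):

#include <cstdint>
#include <cstring>

// Compares two 4-byte blocks; nonzero iff they differ. memcpy keeps the
// loads alignment-safe and compiles down to plain loads.
uint32_t bcmp_block4(const char *p1, const char *p2) {
  uint32_t a, b;
  std::memcpy(&a, p1, sizeof(a));
  std::memcpy(&b, p2, sizeof(b));
  return a ^ b;
}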
diff --git a/libc/src/string/memory_utils/bzero_implementations.h b/libc/src/string/memory_utils/bzero_implementations.h
index 168fdd7e531d..550c910def88 100644
--- a/libc/src/string/memory_utils/bzero_implementations.h
+++ b/libc/src/string/memory_utils/bzero_implementations.h
@@ -15,10 +15,14 @@
namespace __llvm_libc {
-inline static void inline_bzero(char *dst, size_t count) {
+inline static void inline_bzero(Ptr dst, size_t count) {
inline_memset(dst, 0, count);
}
+inline static void inline_bzero(void *dst, size_t count) {
+ inline_bzero(reinterpret_cast<Ptr>(dst), count);
+}
+
} // namespace __llvm_libc
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H
diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h
index f2079468f2be..b3258b9607a0 100644
--- a/libc/src/string/memory_utils/memcmp_implementations.h
+++ b/libc/src/string/memory_utils/memcmp_implementations.h
@@ -11,93 +11,141 @@
#include "src/__support/architectures.h"
#include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/utils.h"
#include <stddef.h> // size_t
namespace __llvm_libc {
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
+#pragma nounroll
+ for (size_t offset = 0; offset < count; ++offset)
+ if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
+ return value;
+ return MemcmpReturnType::ZERO();
+}
+
+#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
+ if (unlikely(count >= 384)) {
+ if (auto value = generic::Memcmp<16>::block(p1, p2))
+ return value;
+ align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+ }
+ return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
-static inline int inline_memcmp(const char *lhs, const char *rhs,
- size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
- /////////////////////////////////////////////////////////////////////////////
- // LLVM_LIBC_ARCH_X86
- /////////////////////////////////////////////////////////////////////////////
- using namespace __llvm_libc::x86;
- if (count == 0)
- return 0;
- if (count == 1)
- return three_way_compare<_1>(lhs, rhs);
- if (count == 2)
- return three_way_compare<_2>(lhs, rhs);
- if (count == 3)
- return three_way_compare<_3>(lhs, rhs);
- if (count <= 8)
- return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
- if (count <= 16)
- return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
+ if (unlikely(count >= 384)) {
+ if (auto value = x86::sse2::Memcmp<16>::block(p1, p2))
+ return value;
+ align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+ }
+ return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
- return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
+ return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
if (count <= 64)
- return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
+ return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
- return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
- return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
- /////////////////////////////////////////////////////////////////////////////
- // LLVM_LIBC_ARCH_AARCH64
- /////////////////////////////////////////////////////////////////////////////
- using namespace ::__llvm_libc::aarch64;
- if (count == 0) // [0, 0]
- return 0;
- if (count == 1) // [1, 1]
- return three_way_compare<_1>(lhs, rhs);
- if (count == 2) // [2, 2]
- return three_way_compare<_2>(lhs, rhs);
- if (count == 3) // [3, 3]
- return three_way_compare<_3>(lhs, rhs);
- if (count < 8) // [4, 7]
- return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
- if (count < 16) // [8, 15]
- return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
- if (unlikely(count >= 128)) // [128, ∞]
- return three_way_compare<Align<_16>::Then<Loop<_32>>>(lhs, rhs, count);
- if (!equals<_16>(lhs, rhs)) // [16, 16]
- return three_way_compare<_16>(lhs, rhs);
+ return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);
+ if (unlikely(count >= 384)) {
+ if (auto value = x86::avx2::Memcmp<32>::block(p1, p2))
+ return value;
+ align_to_next_boundary<32, Arg::P1>(p1, p2, count);
+ }
+ return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
+ if (count <= 32)
+ return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
+ if (count <= 64)
+ return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
+ if (count <= 128)
+ return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);
+ if (unlikely(count >= 384)) {
+ if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2))
+ return value;
+ align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+ }
+ return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline MemcmpReturnType
+inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) {
+ if (unlikely(count >= 128)) { // [128, ∞]
+ if (auto value = generic::Memcmp<16>::block(p1, p2))
+ return value;
+ align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+ return generic::Memcmp<32>::loop_and_tail(p1, p2, count);
+ }
+ if (generic::Bcmp<16>::block(p1, p2)) // [16, 16]
+ return generic::Memcmp<16>::block(p1, p2);
if (count < 32) // [17, 31]
- return three_way_compare<Tail<_16>>(lhs, rhs, count);
- if (!equals<Skip<16>::Then<_16>>(lhs, rhs)) // [32, 32]
- return three_way_compare<Skip<16>::Then<_16>>(lhs, rhs);
+ return generic::Memcmp<16>::tail(p1, p2, count);
+ if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
+ return generic::Memcmp<16>::block(p1 + 16, p2 + 16);
if (count < 64) // [33, 63]
- return three_way_compare<Tail<_32>>(lhs, rhs, count);
+ return generic::Memcmp<32>::tail(p1, p2, count);
// [64, 127]
- return three_way_compare<Skip<32>::Then<Loop<_16>>>(lhs, rhs, count);
-#else
- /////////////////////////////////////////////////////////////////////////////
- // Default
- /////////////////////////////////////////////////////////////////////////////
- using namespace ::__llvm_libc::scalar;
+ return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
+}
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)
+static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
if (count == 0)
- return 0;
+ return MemcmpReturnType::ZERO();
if (count == 1)
- return three_way_compare<_1>(lhs, rhs);
+ return generic::Memcmp<1>::block(p1, p2);
if (count == 2)
- return three_way_compare<_2>(lhs, rhs);
+ return generic::Memcmp<2>::block(p1, p2);
if (count == 3)
- return three_way_compare<_3>(lhs, rhs);
+ return generic::Memcmp<3>::block(p1, p2);
if (count <= 8)
- return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
+ return generic::Memcmp<4>::head_tail(p1, p2, count);
if (count <= 16)
- return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
- if (count <= 32)
- return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
- if (count <= 64)
- return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
- if (count <= 128)
- return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
- return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
+ return generic::Memcmp<8>::head_tail(p1, p2, count);
+#if defined(LLVM_LIBC_ARCH_X86)
+ if constexpr (x86::kAvx512BW)
+ return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
+ else if constexpr (x86::kAvx2)
+ return inline_memcmp_x86_avx2_gt16(p1, p2, count);
+ else if constexpr (x86::kSse2)
+ return inline_memcmp_x86_sse2_gt16(p1, p2, count);
+ else
+ return inline_memcmp_generic_gt16(p1, p2, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+ if constexpr (aarch64::kNeon)
+ return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
+ else
+ return inline_memcmp_generic_gt16(p1, p2, count);
#endif
+#elif defined(LLVM_LIBC_ARCH_ARM)
+ return inline_memcmp_embedded_tiny(p1, p2, count);
+#else
+#error "Unsupported platform"
+#endif
+}
+
+static inline int inline_memcmp(const void *p1, const void *p2, size_t count) {
+ return static_cast<int>(inline_memcmp(reinterpret_cast<CPtr>(p1),
+ reinterpret_cast<CPtr>(p2), count));
}
} // namespace __llvm_libc
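memcmp, by contrast, must produce an ordered result, which is why the aarch64 path above first runs the cheaper Bcmp block to detect a differing block and only then re-runs the full Memcmp on it. The three-way contract itself, as a byte-wise sketch:

#include <cstddef>

// Returns <0, 0, or >0 according to the first differing byte, interpreted
// as unsigned char: the result memcmp is required to produce.
int memcmp_bytes(const unsigned char *p1, const unsigned char *p2,
                 size_t count) {
  for (size_t i = 0; i < count; ++i)
    if (p1[i] != p2[i])
      return static_cast<int>(p1[i]) - static_cast<int>(p2[i]);
  return 0;
}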
diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h
index 3385d40fbc56..cb9a82856f45 100644
--- a/libc/src/string/memory_utils/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/memcpy_implementations.h
@@ -11,145 +11,130 @@
#include "src/__support/architectures.h"
#include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h"
#include <stddef.h> // size_t
-// Design rationale
-// ================
-//
-// Using a profiler to observe size distributions for calls into libc
-// functions, it was found most operations act on a small number of bytes.
-// This makes it important to favor small sizes.
-//
-// The tests for `count` are in ascending order so the cost of branching is
-// proportional to the cost of copying.
-//
-// The function is written in C++ for several reasons:
-// - The compiler can __see__ the code, this is useful when performing Profile
-// Guided Optimization as the optimized code can take advantage of branching
-// probabilities.
-// - It also allows for easier customization and favors testing multiple
-// implementation parameters.
-// - As compilers and processors get better, the generated code is improved
-// with little change on the code side.
-
namespace __llvm_libc {
-static inline void inline_memcpy(char *__restrict dst,
- const char *__restrict src, size_t count) {
- using namespace __llvm_libc::builtin;
-#if defined(LLVM_LIBC_ARCH_X86)
- /////////////////////////////////////////////////////////////////////////////
- // LLVM_LIBC_ARCH_X86
- /////////////////////////////////////////////////////////////////////////////
-
- // Whether to use only rep;movsb.
- constexpr bool USE_ONLY_REP_MOVSB =
- LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
-
- // kRepMovsBSize == -1 : Only CopyAligned is used.
- // kRepMovsBSize == 0 : Only RepMovsb is used.
- // else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
- constexpr size_t REP_MOVS_B_SIZE =
-#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE)
- LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
-#else
- -1;
-#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-
- // Whether target supports AVX instructions.
- constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
-
-#if defined(__AVX__)
- using LoopBlockSize = _64;
-#else
- using LoopBlockSize = _32;
-#endif
-
- if (USE_ONLY_REP_MOVSB)
- return copy<x86::Accelerator>(dst, src, count);
+[[maybe_unused]] static inline void
+inline_memcpy_embedded_tiny(Ptr __restrict dst, CPtr __restrict src,
+ size_t count) {
+#pragma nounroll
+ for (size_t offset = 0; offset < count; ++offset)
+ builtin::Memcpy<1>::block(dst + offset, src + offset);
+}
+#if defined(LLVM_LIBC_ARCH_X86)
+[[maybe_unused]] static inline void
+inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
if (count == 0)
return;
if (count == 1)
- return copy<_1>(dst, src);
+ return builtin::Memcpy<1>::block(dst, src);
if (count == 2)
- return copy<_2>(dst, src);
+ return builtin::Memcpy<2>::block(dst, src);
if (count == 3)
- return copy<_3>(dst, src);
+ return builtin::Memcpy<3>::block(dst, src);
if (count == 4)
- return copy<_4>(dst, src);
+ return builtin::Memcpy<4>::block(dst, src);
if (count < 8)
- return copy<HeadTail<_4>>(dst, src, count);
+ return builtin::Memcpy<4>::head_tail(dst, src, count);
if (count < 16)
- return copy<HeadTail<_8>>(dst, src, count);
+ return builtin::Memcpy<8>::head_tail(dst, src, count);
if (count < 32)
- return copy<HeadTail<_16>>(dst, src, count);
+ return builtin::Memcpy<16>::head_tail(dst, src, count);
if (count < 64)
- return copy<HeadTail<_32>>(dst, src, count);
+ return builtin::Memcpy<32>::head_tail(dst, src, count);
if (count < 128)
- return copy<HeadTail<_64>>(dst, src, count);
- if (HAS_AVX && count < 256)
- return copy<HeadTail<_128>>(dst, src, count);
- if (count <= REP_MOVS_B_SIZE)
- return copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
- count);
- return copy<x86::Accelerator>(dst, src, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
- /////////////////////////////////////////////////////////////////////////////
- // LLVM_LIBC_ARCH_AARCH64
- /////////////////////////////////////////////////////////////////////////////
+ return builtin::Memcpy<64>::head_tail(dst, src, count);
+ if (x86::kAvx && count < 256)
+ return builtin::Memcpy<128>::head_tail(dst, src, count);
+ builtin::Memcpy<32>::block(dst, src);
+ align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+ static constexpr size_t kBlockSize = x86::kAvx ? 64 : 32;
+ return builtin::Memcpy<kBlockSize>::loop_and_tail(dst, src, count);
+}
+
+[[maybe_unused]] static inline void
+inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst,
+ CPtr __restrict src, size_t count) {
+ // Whether to use rep;movsb exclusively, not at all, or only above a certain
+ // threshold.
+ // TODO: Use only a single preprocessor definition to simplify the code.
+#ifndef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -1
+#endif
+
+ static constexpr bool kUseOnlyRepMovsb =
+ LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
+ static constexpr size_t kRepMovsbThreshold =
+ LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
+ if constexpr (kUseOnlyRepMovsb)
+ return x86::Memcpy::repmovsb(dst, src, count);
+ else if constexpr (kRepMovsbThreshold >= 0) {
+ if (unlikely(count >= kRepMovsbThreshold))
+ return x86::Memcpy::repmovsb(dst, src, count);
+ else
+ return inline_memcpy_x86(dst, src, count);
+ } else {
+ return inline_memcpy_x86(dst, src, count);
+ }
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline void
+inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) {
if (count == 0)
return;
if (count == 1)
- return copy<_1>(dst, src);
+ return builtin::Memcpy<1>::block(dst, src);
if (count == 2)
- return copy<_2>(dst, src);
+ return builtin::Memcpy<2>::block(dst, src);
if (count == 3)
- return copy<_3>(dst, src);
+ return builtin::Memcpy<3>::block(dst, src);
if (count == 4)
- return copy<_4>(dst, src);
+ return builtin::Memcpy<4>::block(dst, src);
if (count < 8)
- return copy<HeadTail<_4>>(dst, src, count);
+ return builtin::Memcpy<4>::head_tail(dst, src, count);
if (count < 16)
- return copy<HeadTail<_8>>(dst, src, count);
+ return builtin::Memcpy<8>::head_tail(dst, src, count);
if (count < 32)
- return copy<HeadTail<_16>>(dst, src, count);
+ return builtin::Memcpy<16>::head_tail(dst, src, count);
if (count < 64)
- return copy<HeadTail<_32>>(dst, src, count);
+ return builtin::Memcpy<32>::head_tail(dst, src, count);
if (count < 128)
- return copy<HeadTail<_64>>(dst, src, count);
- return copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
+ return builtin::Memcpy<64>::head_tail(dst, src, count);
+ builtin::Memcpy<16>::block(dst, src);
+ align_to_next_boundary<16, Arg::Src>(dst, src, count);
+ return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)
+
+static inline void inline_memcpy(Ptr __restrict dst, CPtr __restrict src,
+ size_t count) {
+ using namespace __llvm_libc::builtin;
+#if defined(LLVM_LIBC_ARCH_X86)
+ return inline_memcpy_x86_maybe_interpose_repmovsb(dst, src, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+ return inline_memcpy_aarch64(dst, src, count);
+#elif defined(LLVM_LIBC_ARCH_ARM)
+ return inline_memcpy_embedded_tiny(dst, src, count);
#else
- /////////////////////////////////////////////////////////////////////////////
- // Default
- /////////////////////////////////////////////////////////////////////////////
- if (count == 0)
- return;
- if (count == 1)
- return copy<_1>(dst, src);
- if (count == 2)
- return copy<_2>(dst, src);
- if (count == 3)
- return copy<_3>(dst, src);
- if (count == 4)
- return copy<_4>(dst, src);
- if (count < 8)
- return copy<HeadTail<_4>>(dst, src, count);
- if (count < 16)
- return copy<HeadTail<_8>>(dst, src, count);
- if (count < 32)
- return copy<HeadTail<_16>>(dst, src, count);
- if (count < 64)
- return copy<HeadTail<_32>>(dst, src, count);
- if (count < 128)
- return copy<HeadTail<_64>>(dst, src, count);
- return copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
+#error "Unsupported platform"
#endif
}
+static inline void inline_memcpy(void *__restrict dst,
+ const void *__restrict src, size_t count) {
+ inline_memcpy(reinterpret_cast<Ptr>(dst), reinterpret_cast<CPtr>(src), count);
+}
+
} // namespace __llvm_libc
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H
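The loop paths above first emit one unaligned block and then call align_to_next_boundary so the main loop runs on an aligned pointer; the leading block covers the bytes skipped by the alignment. A sketch of the pointer rounding involved (hypothetical helper; the real align_to_next_boundary also advances the other pointer and shrinks count accordingly):

#include <cstddef>
#include <cstdint>

// Rounds `ptr` up to the next Alignment-byte boundary. Alignment must be a
// power of two.
template <size_t Alignment>
char *align_up(char *ptr) {
  uintptr_t bits = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t aligned = (bits + Alignment - 1) & ~uintptr_t(Alignment - 1);
  return ptr + (aligned - bits);
}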
diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h
index d58ed3b70330..75ecf164a2b0 100644
--- a/libc/src/string/memory_utils/memset_implementations.h
+++ b/libc/src/string/memory_utils/memset_implementations.h
@@ -10,129 +10,111 @@
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
#include "src/__support/architectures.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h"
#include <stddef.h> // size_t
namespace __llvm_libc {
-// A general purpose implementation assuming cheap unaligned writes for sizes:
-// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32
-// or 64 Bytes at a time, the compiler will expand them as needed.
-//
-// This implementation is subject to change as we benchmark more processors. We
-// may also want to customize it for processors with specialized instructions
-// that performs better (e.g. `rep stosb`).
-//
-// A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes.
-// We want to balance two things here:
-// - The number of redundant writes (when using `SetBlockOverlap`),
-// - The number of conditionals for sizes <=128 (~90% of memset calls are for
-// such sizes).
-//
-// For the range 64-128:
-// - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes this
-// is wasteful near 65 but efficient toward 128.
-// - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
-// 96 or 128 Bytes.
-// - Another approach could be to use an hybrid approach copy<64>+Overlap<32>
-// for 65-96 and copy<96>+Overlap<32> for 97-128
-//
-// Benchmarks showed that redundant writes were cheap (for Intel X86) but
-// conditional were expensive, even on processor that do not support writing 64B
-// at a time (pre-AVX512F). We also want to favor short functions that allow
-// more hot code to fit in the iL1 cache.
-//
-// Above 128 we have to use conditionals since we don't know the upper bound in
-// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
-// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
-// superior for sizes that mattered.
-inline static void inline_memset(char *dst, unsigned char value, size_t count) {
+[[maybe_unused]] inline static void
+inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) {
+#pragma nounroll
+ for (size_t offset = 0; offset < count; ++offset)
+ generic::Memset<1, 1>::block(dst + offset, value);
+}
+
#if defined(LLVM_LIBC_ARCH_X86)
- /////////////////////////////////////////////////////////////////////////////
- // LLVM_LIBC_ARCH_X86
- /////////////////////////////////////////////////////////////////////////////
- using namespace __llvm_libc::x86;
+template <size_t MaxSize>
+[[maybe_unused]] inline static void inline_memset_x86(Ptr dst, uint8_t value,
+ size_t count) {
if (count == 0)
return;
if (count == 1)
- return splat_set<_1>(dst, value);
+ return generic::Memset<1, MaxSize>::block(dst, value);
if (count == 2)
- return splat_set<_2>(dst, value);
+ return generic::Memset<2, MaxSize>::block(dst, value);
if (count == 3)
- return splat_set<_3>(dst, value);
+ return generic::Memset<3, MaxSize>::block(dst, value);
if (count <= 8)
- return splat_set<HeadTail<_4>>(dst, value, count);
+ return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
if (count <= 16)
- return splat_set<HeadTail<_8>>(dst, value, count);
+ return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
if (count <= 32)
- return splat_set<HeadTail<_16>>(dst, value, count);
+ return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
if (count <= 64)
- return splat_set<HeadTail<_32>>(dst, value, count);
+ return generic::Memset<32, MaxSize>::head_tail(dst, value, count);
if (count <= 128)
- return splat_set<HeadTail<_64>>(dst, value, count);
- return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
- /////////////////////////////////////////////////////////////////////////////
- // LLVM_LIBC_ARCH_AARCH64
- /////////////////////////////////////////////////////////////////////////////
- using namespace __llvm_libc::aarch64_memset;
+ return generic::Memset<64, MaxSize>::head_tail(dst, value, count);
+ // Aligned loop
+ generic::Memset<32, MaxSize>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ return generic::Memset<32, MaxSize>::loop_and_tail(dst, value, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+template <size_t MaxSize>
+[[maybe_unused]] inline static void
+inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) {
if (count == 0)
return;
if (count <= 3) {
- splat_set<_1>(dst, value);
+ generic::Memset<1, MaxSize>::block(dst, value);
if (count > 1)
- splat_set<Tail<_2>>(dst, value, count);
+ generic::Memset<2, MaxSize>::tail(dst, value, count);
return;
}
if (count <= 8)
- return splat_set<HeadTail<_4>>(dst, value, count);
+ return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
if (count <= 16)
- return splat_set<HeadTail<_8>>(dst, value, count);
+ return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
if (count <= 32)
- return splat_set<HeadTail<_16>>(dst, value, count);
+ return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
if (count <= (32 + 64)) {
- splat_set<_32>(dst, value);
+ generic::Memset<32, MaxSize>::block(dst, value);
if (count <= 64)
- return splat_set<Tail<_32>>(dst, value, count);
- splat_set<Skip<32>::Then<_32>>(dst, value);
- splat_set<Tail<_32>>(dst, value, count);
+ return generic::Memset<32, MaxSize>::tail(dst, value, count);
+ generic::Memset<32, MaxSize>::block(dst + 32, value);
+ generic::Memset<32, MaxSize>::tail(dst, value, count);
return;
}
- if (count >= 448 && value == 0 && hasZva())
- return splat_set<Align<_64, Arg::P1>::Then<Loop<Zva64, _64>>>(dst, 0,
- count);
- else
- return splat_set<Align<_16, Arg::P1>::Then<Loop<_64>>>(dst, value, count);
-#else
- /////////////////////////////////////////////////////////////////////////////
- // Default
- /////////////////////////////////////////////////////////////////////////////
- using namespace ::__llvm_libc::scalar;
+ if (count >= 448 && value == 0 && aarch64::neon::hasZva()) {
+ generic::Memset<64, MaxSize>::block(dst, 0);
+ align_to_next_boundary<64>(dst, count);
+ return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count);
+ } else {
+ generic::Memset<16, MaxSize>::block(dst, value);
+ align_to_next_boundary<16>(dst, count);
+ return generic::Memset<64, MaxSize>::loop_and_tail(dst, value, count);
+ }
+}
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)
- if (count == 0)
- return;
- if (count == 1)
- return splat_set<_1>(dst, value);
- if (count == 2)
- return splat_set<_2>(dst, value);
- if (count == 3)
- return splat_set<_3>(dst, value);
- if (count <= 8)
- return splat_set<HeadTail<_4>>(dst, value, count);
- if (count <= 16)
- return splat_set<HeadTail<_8>>(dst, value, count);
- if (count <= 32)
- return splat_set<HeadTail<_16>>(dst, value, count);
- if (count <= 64)
- return splat_set<HeadTail<_32>>(dst, value, count);
- if (count <= 128)
- return splat_set<HeadTail<_64>>(dst, value, count);
- return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
+inline static void inline_memset(Ptr dst, uint8_t value, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86)
+ static constexpr size_t kMaxSize = x86::kAvx512F ? 64
+ : x86::kAvx ? 32
+ : x86::kSse2 ? 16
+ : 8;
+ return inline_memset_x86<kMaxSize>(dst, value, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+ static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
+ return inline_memset_aarch64<kMaxSize>(dst, value, count);
+#elif defined(LLVM_LIBC_ARCH_ARM)
+ return inline_memset_embedded_tiny(dst, value, count);
+#else
+#error "Unsupported platform"
#endif
}
+inline static void inline_memset(void *dst, uint8_t value, size_t count) {
+ inline_memset(reinterpret_cast<Ptr>(dst), value, count);
+}
+
} // namespace __llvm_libc
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
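The generic::Memset blocks rest on a "splat": the byte value is replicated across a wider word so a single store sets several bytes at once. An illustrative sketch for an 8-byte block (not the framework's code):

#include <cstdint>
#include <cstring>

// Sets 8 bytes at `dst` to `value` with one word-sized store; multiplying by
// 0x0101010101010101 repeats the byte across all eight lanes.
void memset_block8(char *dst, uint8_t value) {
  uint64_t splat = UINT64_C(0x0101010101010101) * value;
  std::memcpy(dst, &splat, sizeof(splat));
}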
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index a4b59a12b0b7..8e6432233ca3 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -42,7 +42,7 @@ static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
struct Memcpy {
- static void repmovsb(char *dst, const char *src, size_t count) {
+ static void repmovsb(void *dst, const void *src, size_t count) {
asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
}
};
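The constraints in that asm statement do the heavy lifting: "+D", "+S" and "+c" pin dst, src and count to RDI, RSI and RCX, which is exactly the implicit register contract of `rep movsb` (copy RCX bytes from [RSI] to [RDI], advancing both), and the "memory" clobber stops the compiler from caching bytes across the copy. Widening the parameters to `void *` lets callers pass untyped buffers without casts.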
diff --git a/libc/src/string/mempcpy.cpp b/libc/src/string/mempcpy.cpp
index f26bd64bee42..dd539eb3a2d8 100644
--- a/libc/src/string/mempcpy.cpp
+++ b/libc/src/string/mempcpy.cpp
@@ -15,11 +15,10 @@
namespace __llvm_libc {
LLVM_LIBC_FUNCTION(void *, mempcpy,
- (void *__restrict dest, const void *__restrict src,
+ (void *__restrict dst, const void *__restrict src,
size_t count)) {
- char *result = reinterpret_cast<char *>(dest);
- inline_memcpy(result, reinterpret_cast<const char *>(src), count);
- return result + count;
+ inline_memcpy(dst, src, count);
+ return reinterpret_cast<char *>(dst) + count;
}
} // namespace __llvm_libc
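mempcpy differs from memcpy only in its return value: one past the last byte written, which makes chained appends convenient. A usage sketch, assuming the GNU/glibc-style declaration in <string.h> (may require _GNU_SOURCE on some toolchains):

#include <string.h>

void build_greeting(char buf[16]) {
  char *p = buf;
  p = static_cast<char *>(mempcpy(p, "Hello, ", 7));
  p = static_cast<char *>(mempcpy(p, "world", 5));
  *p = '\0'; // buf now holds "Hello, world"
}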
diff --git a/libc/src/string/memset.cpp b/libc/src/string/memset.cpp
index 549c0742dec7..b80cfce87fca 100644
--- a/libc/src/string/memset.cpp
+++ b/libc/src/string/memset.cpp
@@ -13,8 +13,7 @@
namespace __llvm_libc {
LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) {
- inline_memset(reinterpret_cast<char *>(dst),
- static_cast<unsigned char>(value), count);
+ inline_memset(dst, static_cast<uint8_t>(value), count);
return dst;
}