perf(libc): use builtin implementations where possible

When conditions are right, eg a small memcpy of a known size and
alignment, the compiler may know of a sequence that is optimal for the
given constraints and inline it. If the compiler doesn't find one, it
will emit a call to the generic function (in the libc) which will
implement this in the most generic and unconstrained manner. That
generic function is rarely the most optimal when constraints are known.

So give the compiler a chance to do this. Replace calls to libc
functions that have builtins to the builtin and keep the generic
implementation if it decides to emit a call anyway.

And example of this in action is usage of FEAT_MOPS. When the compiler
is aware of the feature (-march=armv8.8-a) then it will emit the 3 MOPS
instructions instead of calls to our memcpy() and memset()
implementations.

Change-Id: I9860cfada1d941b613ebd4da068e9992c387952e
Signed-off-by: Boyan Karatotev <boyan.karatotev@arm.com>
diff --git a/include/lib/libc/string.h b/include/lib/libc/string.h
index bba9816..098fded 100644
--- a/include/lib/libc/string.h
+++ b/include/lib/libc/string.h
@@ -4,7 +4,7 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 /*
- * Portions copyright (c) 2018-2020, Arm Limited and Contributors.
+ * Portions copyright (c) 2018-2025, Arm Limited and Contributors.
  * Portions copyright (c) 2023, Intel Corporation. All rights reserved.
  * All rights reserved.
  */
@@ -14,19 +14,26 @@
 
 #include <stddef.h>
 
-void *memcpy(void *dst, const void *src, size_t len);
+/*
+ * When conditions are right, the compiler may have a baked-in call that can be
+ * inlined and that will be much more optimal than our generic implementation.
+ * When it doesn't, it will emit a call to the original function for which we
+ * provide an implementation.
+ */
+#define memcpy  __builtin_memcpy
+#define memset  __builtin_memset
+#define memcmp  __builtin_memcmp
+#define memchr  __builtin_memchr
+#define strcmp  __builtin_strcmp
+#define strncmp __builtin_strncmp
+#define strchr  __builtin_strchr
+#define strlen  __builtin_strlen
+#define strrchr __builtin_strrchr
+
 int memcpy_s(void *dst, size_t dsize, void *src, size_t ssize);
 void *memmove(void *dst, const void *src, size_t len);
-int memcmp(const void *s1, const void *s2, size_t len);
-int strcmp(const char *s1, const char *s2);
-int strncmp(const char *s1, const char *s2, size_t n);
-void *memchr(const void *src, int c, size_t len);
 void *memrchr(const void *src, int c, size_t len);
-char *strchr(const char *s, int c);
-void *memset(void *dst, int val, size_t count);
-size_t strlen(const char *s);
 size_t strnlen(const char *s, size_t maxlen);
-char *strrchr(const char *p, int ch);
 size_t strlcpy(char * dst, const char * src, size_t dsize);
 size_t strlcat(char * dst, const char * src, size_t dsize);
 char *strtok_r(char *s, const char *delim, char **last);
diff --git a/include/lib/libc/string_private.h b/include/lib/libc/string_private.h
new file mode 100644
index 0000000..da85fae
--- /dev/null
+++ b/include/lib/libc/string_private.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2025, Arm Limited and Contributors. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#ifndef STRING_PRIVATE_H
+#define STRING_PRIVATE_H
+
+/* Do not include outside of the libc. Use string.h instead. */
+
+#include <stddef.h>
+
+int memcmp(const void *s1, const void *s2, size_t len);
+int strcmp(const char *s1, const char *s2);
+int strncmp(const char *s1, const char *s2, size_t n);
+void *memchr(const void *src, int c, size_t len);
+char *strchr(const char *s, int c);
+void *memset(void *dst, int val, size_t count);
+size_t strlen(const char *s);
+char *strrchr(const char *p, int ch);
+
+#endif /* STRING_PRIVATE_H */