revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch
From 9e87dccaffb020117fee8fb7eeffff5a2387f16f Mon Sep 17 00:00:00 2001
From: liqingqing <liqingqing3@huawei.com>
Date: Sat, 12 Jul 2025 06:05:35 +0800
Subject: [PATCH] revert "aarch64: Use memcpy_simd as the default memcpy"
This revert commit e6f3fe362f1aab78b1448d69ecdbd9e3872636d3 cause
memcpy_simd has performance regression in many arm cores
https://sourceware.org/bugzilla/show_bug.cgi?id=27437
---
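Note (editorial, not applied by git am; text between the "---" separator and
the diffstat is ignored): the revert swaps the 128-bit SIMD (q-register)
loads and stores back to paired 64-bit general-purpose register operations.
For the 16-byte small-copy path, using the register #defines from this file,
the change is:

    SIMD version (removed)            GP-register version (restored)
    ldr  A_q, [src]                   ldp  A_l, A_h, [src]
    ldr  B_q, [srcend, -16]           ldp  D_l, D_h, [srcend, -16]
    str  A_q, [dstin]                 stp  A_l, A_h, [dstin]
    str  B_q, [dstend, -16]           stp  D_l, D_h, [dstend, -16]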
sysdeps/aarch64/memcpy.S | 192 ++++++++++++++++++++++-----------------
1 file changed, 111 insertions(+), 81 deletions(-)
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 304e7eda..0adc5246 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -1,5 +1,4 @@
-/* Generic optimized memcpy using SIMD.
- Copyright (C) 2012-2023 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -21,7 +20,7 @@
/* Assumptions:
*
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * ARMv8-a, AArch64, unaligned accesses.
*
*/
@@ -37,18 +36,21 @@
#define B_l x8
#define B_lw w8
#define B_h x9
+#define C_l x10
#define C_lw w10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend
#define tmp1 x14
-#define A_q q0
-#define B_q q1
-#define C_q q2
-#define D_q q3
-#define E_q q4
-#define F_q q5
-#define G_q q6
-#define H_q q7
-
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
@@ -67,9 +69,10 @@
Large copies use a software pipelined loop processing 64 bytes per
iteration. The destination pointer is 16-byte aligned to minimize
unaligned accesses. The loop tail is handled by always copying 64 bytes
- from the end. */
+ from the end.
+*/
-ENTRY (MEMCPY)
+ENTRY_ALIGN (MEMCPY, 6)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
@@ -84,10 +87,10 @@ ENTRY (MEMCPY)
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
- ldr A_q, [src]
- ldr B_q, [srcend, -16]
- str A_q, [dstin]
- str B_q, [dstend, -16]
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
ret
/* Copy 8-15 bytes. */
@@ -99,6 +102,7 @@ L(copy16):
str A_h, [dstend, -8]
ret
+ .p2align 3
/* Copy 4-7 bytes. */
L(copy8):
tbz count, 2, L(copy4)
@@ -124,69 +128,87 @@ L(copy0):
.p2align 4
/* Medium copies: 33..128 bytes. */
L(copy32_128):
- ldp A_q, B_q, [src]
- ldp C_q, D_q, [srcend, -32]
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
cmp count, 64
b.hi L(copy128)
- stp A_q, B_q, [dstin]
- stp C_q, D_q, [dstend, -32]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy 65..128 bytes. */
L(copy128):
- ldp E_q, F_q, [src, 32]
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
cmp count, 96
b.ls L(copy96)
- ldp G_q, H_q, [srcend, -64]
- stp G_q, H_q, [dstend, -64]
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
L(copy96):
- stp A_q, B_q, [dstin]
- stp E_q, F_q, [dstin, 32]
- stp C_q, D_q, [dstend, -32]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
ret
- /* Align loop64 below to 16 bytes. */
- nop
-
+ .p2align 4
/* Copy more than 128 bytes. */
L(copy_long):
- /* Copy 16 bytes and then align src to 16-byte alignment. */
- ldr D_q, [src]
- and tmp1, src, 15
- bic src, src, 15
- sub dst, dstin, tmp1
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+ ldp D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_q, B_q, [src, 16]
- str D_q, [dstin]
- ldp C_q, D_q, [src, 48]
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(copy64_from_end)
+
L(loop64):
- stp A_q, B_q, [dst, 16]
- ldp A_q, B_q, [src, 80]
- stp C_q, D_q, [dst, 48]
- ldp C_q, D_q, [src, 112]
- add src, src, 64
- add dst, dst, 64
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
subs count, count, 64
b.hi L(loop64)
/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
- ldp E_q, F_q, [srcend, -64]
- stp A_q, B_q, [dst, 16]
- ldp A_q, B_q, [srcend, -32]
- stp C_q, D_q, [dst, 48]
- stp E_q, F_q, [dstend, -64]
- stp A_q, B_q, [dstend, -32]
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
ret
END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)
-
-ENTRY (MEMMOVE)
+ENTRY_ALIGN (MEMMOVE, 4)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
@@ -198,56 +220,64 @@ ENTRY (MEMMOVE)
cmp count, 32
b.hi L(copy32_128)
- /* Small moves: 0..32 bytes. */
+ /* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
- ldr A_q, [src]
- ldr B_q, [srcend, -16]
- str A_q, [dstin]
- str B_q, [dstend, -16]
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
ret
+ .p2align 4
L(move_long):
/* Only use backward copy if there is an overlap. */
sub tmp1, dstin, src
- cbz tmp1, L(move0)
+ cbz tmp1, L(copy0)
cmp tmp1, count
b.hs L(copy_long)
/* Large backwards copy for overlapping copies.
- Copy 16 bytes and then align srcend to 16-byte alignment. */
-L(copy_long_backwards):
- ldr D_q, [srcend, -16]
- and tmp1, srcend, 15
- bic srcend, srcend, 15
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+ ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
sub count, count, tmp1
- ldp A_q, B_q, [srcend, -32]
- str D_q, [dstend, -16]
- ldp C_q, D_q, [srcend, -64]
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
sub dstend, dstend, tmp1
subs count, count, 128
b.ls L(copy64_from_start)
L(loop64_backwards):
- str B_q, [dstend, -16]
- str A_q, [dstend, -32]
- ldp A_q, B_q, [srcend, -96]
- str D_q, [dstend, -48]
- str C_q, [dstend, -64]!
- ldp C_q, D_q, [srcend, -128]
- sub srcend, srcend, 64
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
b.hi L(loop64_backwards)
/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
- ldp E_q, F_q, [src, 32]
- stp A_q, B_q, [dstend, -32]
- ldp A_q, B_q, [src]
- stp C_q, D_q, [dstend, -64]
- stp E_q, F_q, [dstin, 32]
- stp A_q, B_q, [dstin]
-L(move0):
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
ret
END (MEMMOVE)
--
2.33.0
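
For reviewers who want to sanity-check the regression claim on their own
hardware, below is a minimal C sketch of a memcpy throughput micro-benchmark.
It is not part of the patch; the file name, copy sizes, iteration count and
buffer setup are arbitrary choices for illustration, and results will vary by
core and by which memcpy variant glibc selects at run time.

/* Illustrative only: a rough memcpy throughput micro-benchmark.
   Build e.g. with "gcc -O2 bench-memcpy.c" and run on the target core.  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

static double
now_sec (void)
{
  struct timespec ts;
  clock_gettime (CLOCK_MONOTONIC, &ts);
  return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int
main (void)
{
  enum { ITERS = 200000 };
  static const size_t sizes[] = { 16, 32, 64, 96, 128, 256, 4096 };
  char *src = malloc (1 << 20);
  char *dst = malloc (1 << 20);
  if (src == NULL || dst == NULL)
    return 1;
  memset (src, 'x', 1 << 20);

  for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
    {
      size_t n = sizes[i];
      double t0 = now_sec ();
      for (int k = 0; k < ITERS; k++)
        {
          memcpy (dst, src, n);
          /* Compiler barrier so the copy is not hoisted or removed.  */
          __asm__ volatile ("" ::: "memory");
        }
      double t1 = now_sec ();
      printf ("%6zu bytes: %8.1f MB/s\n",
              n, (double) n * ITERS / (t1 - t0) / 1e6);
    }

  free (src);
  free (dst);
  return 0;
}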