revert-aarch64-Use-memcpy_simd-as-the-default-memcpy.patch
From 9e87dccaffb020117fee8fb7eeffff5a2387f16f Mon Sep 17 00:00:00 2001
From: liqingqing <liqingqing3@huawei.com>
Date: Sat, 12 Jul 2025 06:05:35 +0800
Subject: [PATCH] revert "aarch64: Use memcpy_simd as the default memcpy"
This revert commit e6f3fe362f1aab78b1448d69ecdbd9e3872636d3 cause
memcpy_simd has performance regression in many arm cores
https://sourceware.org/bugzilla/show_bug.cgi?id=27437
---
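Note (editorial, not applied by git am; text between the "---" separator and
the diffstat is ignored): the revert swaps the 128-bit SIMD (q-register)
loads and stores back to paired 64-bit general-purpose register operations.
For the 16-byte small-copy path, using the register #defines from this file,
the change is:

    SIMD version (removed)            GP-register version (restored)
    ldr  A_q, [src]                   ldp  A_l, A_h, [src]
    ldr  B_q, [srcend, -16]           ldp  D_l, D_h, [srcend, -16]
    str  A_q, [dstin]                 stp  A_l, A_h, [dstin]
    str  B_q, [dstend, -16]           stp  D_l, D_h, [dstend, -16]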
sysdeps/aarch64/memcpy.S | 192 ++++++++++++++++++++++-----------------
1 file changed, 111 insertions(+), 81 deletions(-)
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 304e7eda..0adc5246 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -1,5 +1,4 @@
-/* Generic optimized memcpy using SIMD.
- Copyright (C) 2012-2023 Free Software Foundation, Inc.
+/* Copyright (C) 2012-2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -21,7 +20,7 @@
/* Assumptions:
*
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ * ARMv8-a, AArch64, unaligned accesses.
*
*/
@@ -37,18 +36,21 @@
#define B_l x8
#define B_lw w8
#define B_h x9
+#define C_l x10
#define C_lw w10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend
#define tmp1 x14
-#define A_q q0
-#define B_q q1
-#define C_q q2
-#define D_q q3
-#define E_q q4
-#define F_q q5
-#define G_q q6
-#define H_q q7
-
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
@@ -67,9 +69,10 @@
Large copies use a software pipelined loop processing 64 bytes per
iteration. The destination pointer is 16-byte aligned to minimize
unaligned accesses. The loop tail is handled by always copying 64 bytes
- from the end. */
+ from the end.
+*/
-ENTRY (MEMCPY)
+ENTRY_ALIGN (MEMCPY, 6)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
@@ -84,10 +87,10 @@ ENTRY (MEMCPY)
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
- ldr A_q, [src]
- ldr B_q, [srcend, -16]
- str A_q, [dstin]
- str B_q, [dstend, -16]
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
ret
/* Copy 8-15 bytes. */
@@ -99,6 +102,7 @@ L(copy16):
str A_h, [dstend, -8]
ret
+ .p2align 3
/* Copy 4-7 bytes. */
L(copy8):
tbz count, 2, L(copy4)
@@ -124,69 +128,87 @@ L(copy0):
.p2align 4
/* Medium copies: 33..128 bytes. */
L(copy32_128):
- ldp A_q, B_q, [src]
- ldp C_q, D_q, [srcend, -32]
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
cmp count, 64
b.hi L(copy128)
- stp A_q, B_q, [dstin]
- stp C_q, D_q, [dstend, -32]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
ret
.p2align 4
/* Copy 65..128 bytes. */
L(copy128):
- ldp E_q, F_q, [src, 32]
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
cmp count, 96
b.ls L(copy96)
- ldp G_q, H_q, [srcend, -64]
- stp G_q, H_q, [dstend, -64]
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
L(copy96):
- stp A_q, B_q, [dstin]
- stp E_q, F_q, [dstin, 32]
- stp C_q, D_q, [dstend, -32]
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
ret
- /* Align loop64 below to 16 bytes. */
- nop
-
+ .p2align 4
/* Copy more than 128 bytes. */
L(copy_long):
- /* Copy 16 bytes and then align src to 16-byte alignment. */
- ldr D_q, [src]
- and tmp1, src, 15
- bic src, src, 15
- sub dst, dstin, tmp1
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+ ldp D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_q, B_q, [src, 16]
- str D_q, [dstin]
- ldp C_q, D_q, [src, 48]
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(copy64_from_end)
+
L(loop64):
- stp A_q, B_q, [dst, 16]
- ldp A_q, B_q, [src, 80]
- stp C_q, D_q, [dst, 48]
- ldp C_q, D_q, [src, 112]
- add src, src, 64
- add dst, dst, 64
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
subs count, count, 64
b.hi L(loop64)
/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
- ldp E_q, F_q, [srcend, -64]
- stp A_q, B_q, [dst, 16]
- ldp A_q, B_q, [srcend, -32]
- stp C_q, D_q, [dst, 48]
- stp E_q, F_q, [dstend, -64]
- stp A_q, B_q, [dstend, -32]
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
ret
END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)
-
-ENTRY (MEMMOVE)
+ENTRY_ALIGN (MEMMOVE, 4)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
@@ -198,56 +220,64 @@ ENTRY (MEMMOVE)
cmp count, 32
b.hi L(copy32_128)
- /* Small moves: 0..32 bytes. */
+ /* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
- ldr A_q, [src]
- ldr B_q, [srcend, -16]
- str A_q, [dstin]
- str B_q, [dstend, -16]
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
ret
+ .p2align 4
L(move_long):
/* Only use backward copy if there is an overlap. */
sub tmp1, dstin, src
- cbz tmp1, L(move0)
+ cbz tmp1, L(copy0)
cmp tmp1, count
b.hs L(copy_long)
/* Large backwards copy for overlapping copies.
- Copy 16 bytes and then align srcend to 16-byte alignment. */
-L(copy_long_backwards):
- ldr D_q, [srcend, -16]
- and tmp1, srcend, 15
- bic srcend, srcend, 15
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+ ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
sub count, count, tmp1
- ldp A_q, B_q, [srcend, -32]
- str D_q, [dstend, -16]
- ldp C_q, D_q, [srcend, -64]
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
sub dstend, dstend, tmp1
subs count, count, 128
b.ls L(copy64_from_start)
L(loop64_backwards):
- str B_q, [dstend, -16]
- str A_q, [dstend, -32]
- ldp A_q, B_q, [srcend, -96]
- str D_q, [dstend, -48]
- str C_q, [dstend, -64]!
- ldp C_q, D_q, [srcend, -128]
- sub srcend, srcend, 64
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
subs count, count, 64
b.hi L(loop64_backwards)
/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
- ldp E_q, F_q, [src, 32]
- stp A_q, B_q, [dstend, -32]
- ldp A_q, B_q, [src]
- stp C_q, D_q, [dstend, -64]
- stp E_q, F_q, [dstin, 32]
- stp A_q, B_q, [dstin]
-L(move0):
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
ret
END (MEMMOVE)
--
2.33.0
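
For reviewers who want to sanity-check the regression claim on their own
hardware, below is a minimal C sketch of a memcpy throughput micro-benchmark.
It is not part of the patch; the file name, copy sizes, iteration count and
buffer setup are arbitrary choices for illustration, and results will vary by
core and by which memcpy variant glibc selects at run time.

/* Illustrative only: a rough memcpy throughput micro-benchmark.
   Build e.g. with "gcc -O2 bench-memcpy.c" and run on the target core.  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

static double
now_sec (void)
{
  struct timespec ts;
  clock_gettime (CLOCK_MONOTONIC, &ts);
  return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int
main (void)
{
  enum { ITERS = 200000 };
  static const size_t sizes[] = { 16, 32, 64, 96, 128, 256, 4096 };
  char *src = malloc (1 << 20);
  char *dst = malloc (1 << 20);
  if (src == NULL || dst == NULL)
    return 1;
  memset (src, 'x', 1 << 20);

  for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
    {
      size_t n = sizes[i];
      double t0 = now_sec ();
      for (int k = 0; k < ITERS; k++)
        {
          memcpy (dst, src, n);
          /* Compiler barrier so the copy is not hoisted or removed.  */
          __asm__ volatile ("" ::: "memory");
        }
      double t1 = now_sec ();
      printf ("%6zu bytes: %8.1f MB/s\n",
              n, (double) n * ITERS / (t1 - t0) / 1e6);
    }

  free (src);
  free (dst);
  return 0;
}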