Ai
13 Star 20 Fork 181

src-openEuler/glibc

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
AArch64-Improve-generic-strlen.patch 2.47 KB
一键复制 编辑 原始数据 按行查看 历史
liqingqing_1229 提交于 2025-03-28 14:44 +08:00 . sync from glibc upstream 2.38 branch.
From 9ca74b8ad1968d935815bdc2f1f1c7e9f2e32f70 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wilco.dijkstra@arm.com>
Date: Wed, 7 Aug 2024 14:43:47 +0100
Subject: [PATCH] AArch64: Improve generic strlen

Improve performance by handling another 16 bytes before entering the loop.
Use ADDHN in the loop to avoid SHRN+FMOV when it terminates. Change final
size computation to avoid increasing latency. On Neoverse V1 performance
of the random strlen benchmark improves by 4.6%.

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
(cherry picked from commit 3dc426b642dcafdbc11a99f2767e081d086f5fc7)
---
sysdeps/aarch64/strlen.S | 39 +++++++++++++++++++++++++++------------
1 file changed, 27 insertions(+), 12 deletions(-)
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index 133ef93342..352fb40d3a 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+/* Generic optimized strlen using SIMD.
+ Copyright (C) 2012-2024 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -56,36 +57,50 @@ ENTRY (STRLEN)
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
- cbz synd, L(loop)
+ cbz synd, L(next16)
rbit synd, synd
clz result, synd
lsr result, result, 2
ret
+L(next16):
+ ldr data, [src, 16]
+ cmeq vhas_nul.16b, vdata.16b, 0
+ shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
+ fmov synd, dend
+ cbz synd, L(loop)
+ add src, src, 16
+#ifndef __AARCH64EB__
+ rbit synd, synd
+#endif
+ sub result, src, srcin
+ clz tmp, synd
+ add result, result, tmp, lsr 2
+ ret
+
.p2align 5
L(loop):
- ldr data, [src, 16]
+ ldr data, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbnz synd, L(loop_end)
- ldr data, [src, 32]!
+ ldr data, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
- umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
+ addhn vend.8b, vhas_nul.8h, vhas_nul.8h
fmov synd, dend
cbz synd, L(loop)
- sub src, src, 16
+ add src, src, 16
L(loop_end):
- shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
- sub result, src, srcin
- fmov synd, dend
+ sub result, shift, src, lsl 2 /* (srcin - src) << 2. */
#ifndef __AARCH64EB__
rbit synd, synd
+ sub result, result, 3
#endif
- add result, result, 16
clz tmp, synd
- add result, result, tmp, lsr 2
+ sub result, tmp, result
+ lsr result, result, 2
ret
END (STRLEN)
--
2.27.0
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/src-openeuler/glibc.git
git@gitee.com:src-openeuler/glibc.git
src-openeuler
glibc
glibc
master

搜索帮助