diff --git a/Feature-Optimize-CRC-calculation-for-the-RISC-V.patch b/Feature-Optimize-CRC-calculation-for-the-RISC-V.patch
new file mode 100644
index 0000000000000000000000000000000000000000..7328cc5aa39b8dd83682c29ecb365293997bbf23
--- /dev/null
+++ b/Feature-Optimize-CRC-calculation-for-the-RISC-V.patch
@@ -0,0 +1,2173 @@
+From 069f340f77240f62cdab8c8c9701e4191be5f1ad Mon Sep 17 00:00:00 2001
+From: dongji
+Date: Tue, 9 Sep 2025 20:23:50 +0800
+Subject: [PATCH] Optimize CRC calculation for the RISC-V64 architecture by replacing
+ time-consuming vredxor.vv instructions with a better-performing
+ combination of vslidexx.vi and vxor.vv instructions. Dependencies on
+ the Zvbb extension have been removed.
+
+ Testing
+ The verification method involves
+ running autogen.sh and configure to generate the Makefile, followed by
+ executing the make checks command. The verification results below were
+ obtained on a RISC-V64 OpenEuler 24.03 virtual machine on QEMU 9.0.1.
+
+ All tests successful.
+ make --no-print-directory check-TESTS
+ PASS: raid/pq_gen_test
+ PASS: crc/crc16_t10dif_test
+ PASS: crc/crc16_t10dif_copy_test
+ PASS: raid/xor_gen_test
+ PASS: crc/crc64_funcs_test
+ PASS: crc/crc32_funcs_test
+ PASS: igzip/igzip_wrapper_hdr_test
+ PASS: igzip/checksum32_funcs_test
+ PASS: erasure_code/gf_inverse_test
+ PASS: erasure_code/gf_vect_mul_test
+ PASS: raid/xor_check_test
+ PASS: raid/pq_check_test
+ PASS: igzip/igzip_rand_test
+ PASS: mem/mem_zero_detect_test
+ PASS: erasure_code/erasure_code_test
+ PASS: erasure_code/erasure_code_update_test
+ ============================================================================
+ Testsuite summary for libisal 2.30.0
+ ============================================================================
+ # TOTAL: 16
+ # PASS: 16
+ # SKIP: 0
+ # XFAIL: 0
+ # FAIL: 0
+ # XPASS: 0
+ # ERROR: 0
+ ============================================================================
+
+Signed-off-by: liuqingtao
+---
+ Makefile.am | 7 +-
+ configure.ac | 98 +++++++++--
+ crc/riscv64/Makefile.am | 1 -
+ crc/riscv64/crc16_t10dif_copy_vclmul.S | 175 +++++++++-----------
+ crc/riscv64/crc16_t10dif_vclmul.S | 157 +++++++++---------
+ crc/riscv64/crc32_gzip_refl_vclmul.S | 5 +-
+ crc/riscv64/crc32_gzip_refl_vclmul.h | 12 +-
+ crc/riscv64/crc32_ieee_norm_vclmul.S | 5 +-
+ crc/riscv64/crc32_ieee_norm_vclmul.h | 12 +-
+ crc/riscv64/crc32_iscsi_refl_vclmul.S | 11 +-
+ crc/riscv64/crc32_iscsi_refl_vclmul.h | 12 +-
+ crc/riscv64/crc32_norm_common_vclmul.h | 12 +-
+ crc/riscv64/crc32_refl_common_vclmul.h | 5 +-
+ crc/riscv64/crc64_ecma_norm_vclmul.S | 5 +-
+ crc/riscv64/crc64_ecma_norm_vclmul.h | 10 +-
+ crc/riscv64/crc64_ecma_refl_vclmul.S | 5 +-
+ crc/riscv64/crc64_ecma_refl_vclmul.h | 12 +-
+ crc/riscv64/crc64_iso_norm_vclmul.S | 5 +-
+ crc/riscv64/crc64_iso_norm_vclmul.h | 12 +-
+ crc/riscv64/crc64_iso_refl_vclmul.S | 5 +-
+ crc/riscv64/crc64_iso_refl_vclmul.h | 12 +-
+ crc/riscv64/crc64_jones_norm_vclmul.S | 5 +-
+ crc/riscv64/crc64_jones_norm_vclmul.h | 12 +-
+ crc/riscv64/crc64_jones_refl_vclmul.S | 5 +-
+ crc/riscv64/crc64_jones_refl_vclmul.h | 12 +-
+ crc/riscv64/crc64_norm_common_vclmul.h | 12 +-
+ crc/riscv64/crc64_refl_common_vclmul.h | 6 +-
+ crc/riscv64/crc_common_vclmul.h | 217 +++++++++++++------------
+ crc/riscv64/crc_multibinary_riscv.S | 3 +-
+ crc/riscv64/crc_riscv64_dispatcher.c | 164 ++++++++++---------
+ include/riscv64_multibinary.h | 186 ++++++++++-----------
+ 31 files changed, 637 insertions(+), 563 deletions(-)
+
+diff --git a/Makefile.am b/Makefile.am
+index 7602018..3f2e212 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -158,15 +158,10 @@ AM_CFLAGS = ${my_CFLAGS} ${INCLUDE} $(src_include) ${D} + if CPU_AARCH64 + AM_CCASFLAGS = ${AM_CFLAGS} + else +-if CPU_RISCV64 +-AM_CCASFLAGS = ${AM_CFLAGS} +-else + AM_CCASFLAGS = ${yasm_args} ${INCLUDE} ${src_include} ${DEFS} ${D} + endif +-endif +- + if CPU_RISCV64 +-AM_CFLAGS += -march=rv64gcv_zbc_zvbc_zvbb ++AM_CCASFLAGS = ${AM_CFLAGS} + endif + + .asm.s: +diff --git a/configure.ac b/configure.ac +index f69ae10..2208ad6 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -40,16 +40,10 @@ AM_CONDITIONAL([CPU_AARCH64], [test "$CPU" = "aarch64"]) + AM_CONDITIONAL([CPU_PPC64LE], [test "$CPU" = "ppc64le"]) + AM_CONDITIONAL([CPU_RISCV64], [test "$CPU" = "riscv64"]) + AM_CONDITIONAL([CPU_UNDEFINED], [test "x$CPU" = "x"]) +- +-if test "$CPU" = "x86_64"; then +- is_x86=yes +-else +- if test "$CPU" = "x86_32"; then +- is_x86=yes +- else +- is_x86=no +- fi +-fi ++AM_CONDITIONAL([HAVE_RVV], [false]) ++AM_CONDITIONAL([HAVE_ZBC], [false]) ++AM_CONDITIONAL([HAVE_ZVBC], [false]) ++AM_CONDITIONAL([HAVE_HWPROBE_H], [false]) + + # Check for programs + AC_PROG_CC_STDC +@@ -61,6 +55,90 @@ AC_PREFIX_DEFAULT([/usr]) + AC_PROG_SED + AC_PROG_MKDIR_P + ++case "${CPU}" in ++ ++ x86_64) ++ is_x86=yes ++ ;; ++ ++ x86_32) ++ is_x86=yes ++ ;; ++ ++ riscv64) ++ AC_CHECK_HEADER([asm/hwprobe.h], ++ [AC_DEFINE([HAVE_HWPROBE_H], [1], [Define if asm/hwprobe.h exists]) ++ AM_CONDITIONAL([HAVE_HWPROBE_H], [true]) hwprobe_h=yes], ++ [AC_DEFINE([HAVE_HWPROBE_H], [0], [Define if asm/hwprobe.h not exists]) ++ AM_CONDITIONAL([HAVE_HWPROBE_H], [false]) hwprobe_h=no] ++ ) ++ AC_MSG_CHECKING([RVV support]) ++ AC_COMPILE_IFELSE( ++ [AC_LANG_PROGRAM([], [ ++ __asm__ volatile( ++ ".option arch, +v\n" ++ "vsetivli zero, 0, e8, m1, ta, ma\n" ++ ); ++ ])], ++ [AC_DEFINE([HAVE_RVV], [1], [Enable RVV instructions]) ++ AM_CONDITIONAL([HAVE_RVV], [true]) rvv=yes], ++ [AC_DEFINE([HAVE_RVV], [0], [Disable RVV instructions]) ++ AM_CONDITIONAL([HAVE_RVV], [false]) rvv=no] ++ ) ++ AC_MSG_RESULT([$rvv]) ++ if test "x$hwprobe_h" = "xyes"; then ++ AC_MSG_CHECKING([ZBC support]) ++ AC_COMPILE_IFELSE( ++ [AC_LANG_PROGRAM([#include ], [ ++ int a = RISCV_HWPROBE_EXT_ZBC; ++ __asm__ volatile( ++ ".option arch, +zbc\n" ++ "clmul zero, zero, zero\n" ++ "clmulh zero, zero, zero\n" ++ ); ++ ])], ++ [AC_DEFINE([HAVE_ZBC], [1], [Enable ZBC instructions]) ++ AM_CONDITIONAL([HAVE_ZBC], [true]) zbc=yes], ++ [AC_DEFINE([HAVE_ZBC], [0], [Disable ZBC instructions]) ++ AM_CONDITIONAL([HAVE_ZBC], [false]) zbc=no] ++ ) ++ AC_MSG_RESULT([$zbc]) ++ AC_MSG_CHECKING([ZVBC support]) ++ AC_COMPILE_IFELSE( ++ [AC_LANG_PROGRAM([#include ], [ ++ int a = RISCV_HWPROBE_EXT_ZVBC; ++ __asm__ volatile( ++ ".option arch, +v, +zvbc\n" ++ "vsetivli zero, 2, e64, m1, ta, ma\n" ++ "vmv.s.x v0, zero\n" ++ "vclmul.vv v0, v0, v0\n" ++ "vclmulh.vv v0, v0, v0\n" ++ ); ++ ])], ++ [AC_DEFINE([HAVE_ZVBC], [1], [Enable ZVBC instructions]) ++ AM_CONDITIONAL([HAVE_ZVBC], [true]) zvbc=yes], ++ [AC_DEFINE([HAVE_ZVBC], [0], [Disable ZVBC instructions]) ++ AM_CONDITIONAL([HAVE_ZVBC], [false]) zvbc=no] ++ ) ++ AC_MSG_RESULT([$zvbc]) ++ fi ++ if test "x$rvv" = "xyes"; then ++ rvv_arch="rv64gcv" ++ AS_IF([test "x$hwprobe_h" = "xyes"], ++ [AS_IF([test "x$zbc" = "xyes" && test "x$zvbc" = "xyes"], ++ [rvv_arch="rv64gcv_zbc_zvbc"] ++ )] ++ ) ++ CFLAGS+=" -march=$rvv_arch" ++ CCASFLAGS+=" -march=$rvv_arch" ++ fi ++ ;; ++ ++ *) ++ is_x86=no ++ ++esac ++ + # Options + AC_ARG_ENABLE([debug], + 
AS_HELP_STRING([--enable-debug], [enable debug messages @<:@default=disabled@:>@]), +diff --git a/crc/riscv64/Makefile.am b/crc/riscv64/Makefile.am +index b78dbe1..5c6b134 100644 +--- a/crc/riscv64/Makefile.am ++++ b/crc/riscv64/Makefile.am +@@ -26,7 +26,6 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ######################################################################### +- + lsrc_riscv64 += \ + crc/riscv64/crc_multibinary_riscv.S \ + crc/riscv64/crc_riscv64_dispatcher.c +diff --git a/crc/riscv64/crc16_t10dif_copy_vclmul.S b/crc/riscv64/crc16_t10dif_copy_vclmul.S +index 331e9ce..d05eaf3 100644 +--- a/crc/riscv64/crc16_t10dif_copy_vclmul.S ++++ b/crc/riscv64/crc16_t10dif_copy_vclmul.S +@@ -1,5 +1,5 @@ + ######################################################################## +-# Copyright(c) 2025 ZTE Corporation All rights reserved. ++# Copyright (c) 2025 ZTE Corporation. + # + # Redistribution and use in source and binary forms, with or without + # modification, are permitted provided that the following conditions +@@ -26,7 +26,7 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ######################################################################### +- ++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC + .section .text + .align 2 + .global crc16_t10dif_copy_vclmul +@@ -74,47 +74,29 @@ crc16_t10dif_copy_vclmul: + .crc_fold: + # Initialize vector registers + vsetivli zero, 2, e64, m1, ta, ma +- vle64.v v6, 0(a2) +- addi a2, a2, 16 +- vle64.v v7, 0(a2) +- addi a2, a2, 16 +- vle64.v v8, 0(a2) +- addi a2, a2, 16 +- vle64.v v9, 0(a2) +- addi a2, a2, 16 ++ vl4re64.v v4, 0(a2) ++ addi a2, a2, 64 + addi a3, a3, -64 + +- vse64.v v6, (a1) +- addi a1, a1, 16 +- vse64.v v7, (a1) +- addi a1, a1, 16 +- vse64.v v8, (a1) +- addi a1, a1, 16 +- vse64.v v9, (a1) +- addi a1, a1, 16 ++ vs4r.v v4, (a1) ++ addi a1, a1, 64 + + # Prepare initial vector ++ la t0, .shuffle_data_mask ++ vsetivli zero, 16, e8, m1, ta, ma ++ vle8.v v13, 0(t0) + slli a0, a0, 32 +- vmv.s.x v4, zero +- vrev8.v v6, v6 +- vrev8.v v7, v7 +- vrev8.v v8, v8 +- vrev8.v v9, v9 +- vslidedown.vi v0, v6, 1 +- vslidedown.vi v1, v7, 1 +- vslidedown.vi v2, v8, 1 +- vslidedown.vi v3, v9, 1 +- vslideup.vi v0, v6, 1 +- vslideup.vi v1, v7, 1 +- vslideup.vi v2, v8, 1 +- vslideup.vi v3, v9, 1 ++ vrgather.vv v0, v4, v13 ++ vrgather.vv v1, v5, v13 ++ vrgather.vv v2, v6, v13 ++ vrgather.vv v3, v7, v13 ++ vsetivli zero, 2, e64, m1, ta, ma + + vmv.v.x v5, a0 ++ vmv.s.x v4, zero + vslideup.vi v4, v5, 1 + + vxor.vv v0, v0, v4 +- +- vmv.v.x v8, zero + bltu a3, t1, final_fold + + # Load constants +@@ -125,64 +107,55 @@ crc16_t10dif_copy_vclmul: + + # Main processing loop + loop_start: +- vle64.v v9, (a2) +- addi a2, a2, 16 +- vle64.v v10, (a2) +- addi a2, a2, 16 +- vle64.v v11, (a2) +- addi a2, a2, 16 +- vle64.v v12, (a2) +- addi a2, a2, 16 +- vse64.v v9, (a1) +- addi a1, a1, 16 +- vse64.v v10, (a1) +- addi a1, a1, 16 +- vse64.v v11, (a1) +- addi a1, a1, 16 +- vse64.v v12, (a1) +- addi a1, a1, 16 ++ vl4re64.v v8, (a2) ++ addi a2, a2, 64 ++ vs4r.v v8, (a1) ++ addi a1, a1, 64 + + vclmul.vv v4, v0, v5 + vclmulh.vv v0, v0, v5 +- vredxor.vs v0, v0, v8 +- vredxor.vs v4, v4, v8 +- vslideup.vi v4, v0, 1 +- vrev8.v v9, v9 +- vslidedown.vi v6, v9, 1 +- vslideup.vi v6, v9, 1 +- vxor.vv v0, v4, v6 ++ vslidedown.vi v15, v4, 1 ++ vslidedown.vi v14, v0, 1 ++ vxor.vv v15, v15, v4 ++ 
vxor.vv v14, v14, v0 ++ vslideup.vi v15, v14, 1 + + # Process v1-v3 (similar to v0) + + vclmul.vv v4, v1, v5 + vclmulh.vv v1, v1, v5 +- vredxor.vs v1, v1, v8 +- vredxor.vs v4, v4, v8 +- vslideup.vi v4, v1, 1 +- vrev8.v v10, v10 +- vslidedown.vi v6, v10, 1 +- vslideup.vi v6, v10, 1 +- vxor.vv v1, v4, v6 ++ vslidedown.vi v16, v4, 1 ++ vslidedown.vi v14, v1, 1 ++ vxor.vv v16, v16, v4 ++ vxor.vv v14, v14, v1 ++ vslideup.vi v16, v14, 1 + + vclmul.vv v4, v2, v5 + vclmulh.vv v2, v2, v5 +- vredxor.vs v2, v2, v8 +- vredxor.vs v4, v4, v8 +- vslideup.vi v4, v2, 1 +- vrev8.v v11, v11 +- vslidedown.vi v6, v11, 1 +- vslideup.vi v6, v11, 1 +- vxor.vv v2, v4, v6 ++ vslidedown.vi v17, v4, 1 ++ vslidedown.vi v14, v2, 1 ++ vxor.vv v17, v17, v4 ++ vxor.vv v14, v14, v2 ++ vslideup.vi v17, v14, 1 + + vclmul.vv v4, v3, v5 + vclmulh.vv v3, v3, v5 +- vredxor.vs v3, v3, v8 +- vredxor.vs v4, v4, v8 +- vslideup.vi v4, v3, 1 +- vrev8.v v12, v12 +- vslidedown.vi v6, v12, 1 +- vslideup.vi v6, v12, 1 +- vxor.vv v3, v4, v6 ++ vslidedown.vi v18, v4, 1 ++ vslidedown.vi v14, v3, 1 ++ vxor.vv v18, v18, v4 ++ vxor.vv v14, v14, v3 ++ vslideup.vi v18, v14, 1 ++ ++ vsetivli zero, 16, e8, m1, ta, ma ++ vrgather.vv v0, v8, v13 ++ vrgather.vv v1, v9, v13 ++ vrgather.vv v2, v10, v13 ++ vrgather.vv v3, v11, v13 ++ vsetivli zero, 2, e64, m1, ta, ma ++ vxor.vv v0, v0, v15 ++ vxor.vv v1, v1, v16 ++ vxor.vv v2, v2, v17 ++ vxor.vv v3, v3, v18 + + addi a3, a3, -64 + bge a3, t0, loop_start +@@ -192,22 +165,30 @@ final_fold: + vle64.v v5, 0(t0) + vclmul.vv v6, v0, v5 + vclmulh.vv v7, v0, v5 +- vredxor.vs v6, v6, v8 +- vredxor.vs v7, v7, v8 +- vslideup.vi v6, v7, 1 +- vxor.vv v0, v6, v1 ++ vslidedown.vi v8, v6, 1 ++ vslidedown.vi v9, v7, 1 ++ vxor.vv v8, v8, v6 ++ vxor.vv v9, v9, v7 ++ vslideup.vi v8, v9, 1 ++ vxor.vv v0, v8, v1 ++ + vclmul.vv v6, v0, v5 + vclmulh.vv v7, v0, v5 +- vredxor.vs v6, v6, v8 +- vredxor.vs v7, v7, v8 +- vslideup.vi v6, v7, 1 +- vxor.vv v0, v6, v2 ++ vslidedown.vi v8, v6, 1 ++ vslidedown.vi v9, v7, 1 ++ vxor.vv v8, v8, v6 ++ vxor.vv v9, v9, v7 ++ vslideup.vi v8, v9, 1 ++ vxor.vv v0, v8, v2 ++ + vclmul.vv v6, v0, v5 + vclmulh.vv v7, v0, v5 +- vredxor.vs v6, v6, v8 +- vredxor.vs v7, v7, v8 +- vslideup.vi v6, v7, 1 +- vxor.vv v0, v6, v3 ++ vslidedown.vi v8, v6, 1 ++ vslidedown.vi v9, v7, 1 ++ vxor.vv v8, v8, v6 ++ vxor.vv v9, v9, v7 ++ vslideup.vi v8, v9, 1 ++ vxor.vv v0, v8, v3 + + # Store result + addi sp, sp, -16 +@@ -229,10 +210,10 @@ final_fold: + + # Barrett reduction + srli a5, a4, 32 +- li t2, 0x1f65a57f9 # x_quo ++ li t2, 0x1f65a57f9 + clmul a5, t2, a5 + srli a5, a5, 32 +- li t3, 0x18bb70000 # x_poly ++ li t3, 0x18bb70000 + clmul a5, a5, t3 + xor a0, a5, a4 + addi sp, sp, 16 +@@ -241,7 +222,6 @@ tail_processing: + # Process remaining bytes + beqz a3, .end + +- # Call crc16_t10dif_generic equivalent for remaining bytes + jal x0, .crc_table_loop_pre + + .section .rodata +@@ -259,8 +239,8 @@ k_const2: + .quad 0x4c1a0000 + + .LANCHOR0 = . + 0 +- .type crc16tab, %object +- .size crc16tab, 1024 ++ .type crc16tab, %object ++ .size crc16tab, 1024 + crc16tab: + .word 0x00000000, 0x8bb70000, 0x9cd90000, 0x176e0000, 0xb2050000, 0x39b20000, 0x2edc0000, 0xa56b0000 + .word 0xefbd0000, 0x640a0000, 0x73640000, 0xf8d30000, 0x5db80000, 0xd60f0000, 0xc1610000, 0x4ad60000 +@@ -295,4 +275,11 @@ crc16tab: + .word 0x1f650000, 0x94d20000, 0x83bc0000, 0x080b0000, 0xad600000, 0x26d70000, 0x31b90000, 0xba0e0000 + .word 0xf0d80000, 0x7b6f0000, 0x6c010000, 0xe7b60000, 0x42dd0000, 0xc96a0000, 0xde040000, 0x55b30000 + ++.shuffle_data_mask = . 
+ 0 ++ .type shuffle_data, %object ++ .size shuffle_data, 16 ++shuffle_data: ++ .byte 15, 14, 13, 12, 11, 10, 9, 8 ++ .byte 7, 6, 5, 4, 3, 2, 1, 0 + ++#endif +\ No newline at end of file +diff --git a/crc/riscv64/crc16_t10dif_vclmul.S b/crc/riscv64/crc16_t10dif_vclmul.S +index 42a9f2f..c8eaef8 100644 +--- a/crc/riscv64/crc16_t10dif_vclmul.S ++++ b/crc/riscv64/crc16_t10dif_vclmul.S +@@ -1,5 +1,5 @@ + ######################################################################## +-# Copyright(c) 2025 ZTE Corporation All rights reserved. ++# Copyright (c) 2025 ZTE Corporation. + # + # Redistribution and use in source and binary forms, with or without + # modification, are permitted provided that the following conditions +@@ -26,7 +26,7 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ######################################################################### +- ++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC + .section .text + .align 2 + .global crc16_t10dif_vclmul +@@ -40,7 +40,7 @@ + + crc16_t10dif_vclmul: + # Initialize state +- slli a0, a0, 16 # state = crc << 16 ++ slli a0, a0, 16 + + # Check if length >= 64 + li t1, 64 +@@ -72,38 +72,27 @@ crc16_t10dif_vclmul: + .crc_fold: + # Initialize vector registers + vsetivli zero, 2, e64, m1, ta, ma +- vle64.v v6, 0(a1) +- addi a1, a1, 16 +- vle64.v v7, 0(a1) +- addi a1, a1, 16 +- vle64.v v8, 0(a1) +- addi a1, a1, 16 +- vle64.v v9, 0(a1) +- addi a1, a1, 16 ++ ++ vl4re64.v v4, (a1) ++ addi a1, a1, 64 + addi a2, a2, -64 + + # Prepare initial vector ++ la t0, .shuffle_data_mask ++ vsetivli zero, 16, e8, m1, ta, ma ++ vle8.v v13, 0(t0) + slli a0, a0, 32 +- vmv.s.x v4, zero +- vrev8.v v6, v6 +- vrev8.v v7, v7 +- vrev8.v v8, v8 +- vrev8.v v9, v9 +- vslidedown.vi v0, v6, 1 +- vslidedown.vi v1, v7, 1 +- vslidedown.vi v2, v8, 1 +- vslidedown.vi v3, v9, 1 +- vslideup.vi v0, v6, 1 +- vslideup.vi v1, v7, 1 +- vslideup.vi v2, v8, 1 +- vslideup.vi v3, v9, 1 ++ vrgather.vv v0, v4, v13 ++ vrgather.vv v1, v5, v13 ++ vrgather.vv v2, v6, v13 ++ vrgather.vv v3, v7, v13 ++ vsetivli zero, 2, e64, m1, ta, ma + + vmv.v.x v5, a0 ++ vmv.s.x v4, zero + vslideup.vi v4, v5, 1 + + vxor.vv v0, v0, v4 +- +- vmv.v.x v8, zero + bltu a2, t1, final_fold + + # Load constants +@@ -112,56 +101,54 @@ crc16_t10dif_vclmul: + + # Main processing loop + loop_start: +- vle64.v v9, (a1) +- addi a1, a1, 16 +- vle64.v v10, (a1) +- addi a1, a1, 16 +- vle64.v v11, (a1) +- addi a1, a1, 16 +- vle64.v v12, (a1) +- addi a1, a1, 16 ++ ++ vl4re64.v v8, (a1) ++ addi a1, a1, 64 + + vclmul.vv v4, v0, v5 + vclmulh.vv v0, v0, v5 +- vredxor.vs v0, v0, v8 +- vredxor.vs v4, v4, v8 +- vslideup.vi v4, v0, 1 +- vrev8.v v9, v9 +- vslidedown.vi v6, v9, 1 +- vslideup.vi v6, v9, 1 +- vxor.vv v0, v4, v6 ++ vslidedown.vi v15, v4, 1 ++ vslidedown.vi v14, v0, 1 ++ vxor.vv v15, v15, v4 ++ vxor.vv v14, v14, v0 ++ vslideup.vi v15, v14, 1 + + # Process v1-v3 (similar to v0) + + vclmul.vv v4, v1, v5 + vclmulh.vv v1, v1, v5 +- vredxor.vs v1, v1, v8 +- vredxor.vs v4, v4, v8 +- vslideup.vi v4, v1, 1 +- vrev8.v v10, v10 +- vslidedown.vi v6, v10, 1 +- vslideup.vi v6, v10, 1 +- vxor.vv v1, v4, v6 ++ vslidedown.vi v16, v4, 1 ++ vslidedown.vi v14, v1, 1 ++ vxor.vv v16, v16, v4 ++ vxor.vv v14, v14, v1 ++ vslideup.vi v16, v14, 1 + + vclmul.vv v4, v2, v5 + vclmulh.vv v2, v2, v5 +- vredxor.vs v2, v2, v8 +- vredxor.vs v4, v4, v8 +- vslideup.vi v4, v2, 1 +- vrev8.v v11, v11 +- vslidedown.vi v6, v11, 1 +- vslideup.vi v6, v11, 1 +- vxor.vv v2, v4, v6 ++ 
vslidedown.vi v17, v4, 1 ++ vslidedown.vi v14, v2, 1 ++ vxor.vv v17, v17, v4 ++ vxor.vv v14, v14, v2 ++ vslideup.vi v17, v14, 1 + + vclmul.vv v4, v3, v5 + vclmulh.vv v3, v3, v5 +- vredxor.vs v3, v3, v8 +- vredxor.vs v4, v4, v8 +- vslideup.vi v4, v3, 1 +- vrev8.v v12, v12 +- vslidedown.vi v6, v12, 1 +- vslideup.vi v6, v12, 1 +- vxor.vv v3, v4, v6 ++ vslidedown.vi v18, v4, 1 ++ vslidedown.vi v14, v3, 1 ++ vxor.vv v18, v18, v4 ++ vxor.vv v14, v14, v3 ++ vslideup.vi v18, v14, 1 ++ ++ vsetivli zero, 16, e8, m1, ta, ma ++ vrgather.vv v0, v8, v13 ++ vrgather.vv v1, v9, v13 ++ vrgather.vv v2, v10, v13 ++ vrgather.vv v3, v11, v13 ++ vsetivli zero, 2, e64, m1, ta, ma ++ vxor.vv v0, v0, v15 ++ vxor.vv v1, v1, v16 ++ vxor.vv v2, v2, v17 ++ vxor.vv v3, v3, v18 + + addi a2, a2, -64 + bge a2, t1, loop_start +@@ -171,24 +158,30 @@ final_fold: + vle64.v v5, 0(t0) + vclmul.vv v6, v0, v5 + vclmulh.vv v7, v0, v5 +- vredxor.vs v6, v6, v8 +- vredxor.vs v7, v7, v8 +- vslideup.vi v6, v7, 1 +- vxor.vv v0, v6, v1 ++ vslidedown.vi v8, v6, 1 ++ vslidedown.vi v9, v7, 1 ++ vxor.vv v8, v8, v6 ++ vxor.vv v9, v9, v7 ++ vslideup.vi v8, v9, 1 ++ vxor.vv v0, v8, v1 + + vclmul.vv v6, v0, v5 + vclmulh.vv v7, v0, v5 +- vredxor.vs v6, v6, v8 +- vredxor.vs v7, v7, v8 +- vslideup.vi v6, v7, 1 +- vxor.vv v0, v6, v2 ++ vslidedown.vi v8, v6, 1 ++ vslidedown.vi v9, v7, 1 ++ vxor.vv v8, v8, v6 ++ vxor.vv v9, v9, v7 ++ vslideup.vi v8, v9, 1 ++ vxor.vv v0, v8, v2 + + vclmul.vv v6, v0, v5 + vclmulh.vv v7, v0, v5 +- vredxor.vs v6, v6, v8 +- vredxor.vs v7, v7, v8 +- vslideup.vi v6, v7, 1 +- vxor.vv v0, v6, v3 ++ vslidedown.vi v8, v6, 1 ++ vslidedown.vi v9, v7, 1 ++ vxor.vv v8, v8, v6 ++ vxor.vv v9, v9, v7 ++ vslideup.vi v8, v9, 1 ++ vxor.vv v0, v8, v3 + + # Store result + addi sp, sp, -16 +@@ -211,10 +204,10 @@ final_fold: + + # Barrett reduction + srli a5, a4, 32 +- li t2, 0x1f65a57f9 # x_quo ++ li t2, 0x1f65a57f9 + clmul a5, t2, a5 + srli a5, a5, 32 +- li t4, 0x18bb70000 # x_poly ++ li t4, 0x18bb70000 + clmul a5, a5, t4 + xor a0, a5, a4 + +@@ -222,7 +215,6 @@ tail_processing: + # Process remaining bytes + beqz a2, .end + +- # Call crc16_t10dif_generic equivalent for remaining bytes + jal x0, .crc_table_loop_pre + + .section .rodata +@@ -240,8 +232,8 @@ k_const2: + .quad 0x4c1a0000 + + .LANCHOR0 = . + 0 +- .type crc16tab, %object +- .size crc16tab, 1024 ++ .type crc16tab, %object ++ .size crc16tab, 1024 + crc16tab: + .word 0x00000000, 0x8bb70000, 0x9cd90000, 0x176e0000, 0xb2050000, 0x39b20000, 0x2edc0000, 0xa56b0000 + .word 0xefbd0000, 0x640a0000, 0x73640000, 0xf8d30000, 0x5db80000, 0xd60f0000, 0xc1610000, 0x4ad60000 +@@ -276,4 +268,11 @@ crc16tab: + .word 0x1f650000, 0x94d20000, 0x83bc0000, 0x080b0000, 0xad600000, 0x26d70000, 0x31b90000, 0xba0e0000 + .word 0xf0d80000, 0x7b6f0000, 0x6c010000, 0xe7b60000, 0x42dd0000, 0xc96a0000, 0xde040000, 0x55b30000 + ++.shuffle_data_mask = . + 0 ++ .type shuffle_data, %object ++ .size shuffle_data, 16 ++shuffle_data: ++ .byte 15, 14, 13, 12, 11, 10, 9, 8 ++ .byte 7, 6, 5, 4, 3, 2, 1, 0 + ++#endif +\ No newline at end of file +diff --git a/crc/riscv64/crc32_gzip_refl_vclmul.S b/crc/riscv64/crc32_gzip_refl_vclmul.S +index 4a32b7c..c327816 100644 +--- a/crc/riscv64/crc32_gzip_refl_vclmul.S ++++ b/crc/riscv64/crc32_gzip_refl_vclmul.S +@@ -26,8 +26,9 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ ######################################################################### +- ++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC + #include "crc32_gzip_refl_vclmul.h" + #include "crc32_refl_common_vclmul.h" + +-crc32_refl_func crc32_gzip_refl_vclmul +\ No newline at end of file ++crc32_refl_func crc32_gzip_refl_vclmul ++#endif +\ No newline at end of file +diff --git a/crc/riscv64/crc32_gzip_refl_vclmul.h b/crc/riscv64/crc32_gzip_refl_vclmul.h +index 3fee53e..d91dab6 100644 +--- a/crc/riscv64/crc32_gzip_refl_vclmul.h ++++ b/crc/riscv64/crc32_gzip_refl_vclmul.h +@@ -35,7 +35,7 @@ + .section .rodata + .text + .align 3 +- .set .crc_loop_const,. + 0 ++ .set .crc_loop_const,. + 0 + .type const_2, %object + .size const_2, 16 + const_2: +@@ -45,10 +45,10 @@ const_2: + .quad 0xccaa009e + + .text +- .align 4 +- .set .lanchor_crc_tab,. + 0 +- .type crc32_table_gzip_refl, %object +- .size crc32_table_gzip_refl, 1024 ++ .align 4 ++ .set .lanchor_crc_tab,. + 0 ++ .type crc32_table_gzip_refl, %object ++ .size crc32_table_gzip_refl, 1024 + crc32_table_gzip_refl: + .word 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3 + .word 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91 +@@ -81,4 +81,4 @@ crc32_table_gzip_refl: + .word 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db + .word 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9 + .word 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf +- .word 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d ++ .word 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d +\ No newline at end of file +diff --git a/crc/riscv64/crc32_ieee_norm_vclmul.S b/crc/riscv64/crc32_ieee_norm_vclmul.S +index a909c7b..6005f04 100644 +--- a/crc/riscv64/crc32_ieee_norm_vclmul.S ++++ b/crc/riscv64/crc32_ieee_norm_vclmul.S +@@ -26,8 +26,9 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ######################################################################### +- ++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC + #include "crc32_ieee_norm_vclmul.h" + #include "crc32_norm_common_vclmul.h" + +-crc32_norm_func crc32_ieee_norm_vclmul +\ No newline at end of file ++crc32_norm_func crc32_ieee_norm_vclmul ++#endif +\ No newline at end of file +diff --git a/crc/riscv64/crc32_ieee_norm_vclmul.h b/crc/riscv64/crc32_ieee_norm_vclmul.h +index 93443bf..71a6058 100644 +--- a/crc/riscv64/crc32_ieee_norm_vclmul.h ++++ b/crc/riscv64/crc32_ieee_norm_vclmul.h +@@ -35,7 +35,7 @@ + .section .rodata + .text + .align 4 +- .set .crc_loop_const,. + 0 ++ .set .crc_loop_const,. + 0 + .type const_2, %object + .size const_2, 32 + const_2: +@@ -45,10 +45,10 @@ const_2: + .quad 0xc5b9cd4c + + .text +- .align 4 +- .set .lanchor_crc_tab,. + 0 +- .type crc32_table_ieee_norm, %object +- .size crc32_table_ieee_norm, 1024 ++ .align 4 ++ .set .lanchor_crc_tab,. 
+ 0 ++ .type crc32_table_ieee_norm, %object ++ .size crc32_table_ieee_norm, 1024 + crc32_table_ieee_norm: + .word 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005 + .word 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, 0x384fbdbd +@@ -81,4 +81,4 @@ crc32_table_ieee_norm: + .word 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c + .word 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4 + .word 0x89b8fd09, 0x8d79e0be, 0x803ac667, 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c +- .word 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4 ++ .word 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4 +\ No newline at end of file +diff --git a/crc/riscv64/crc32_iscsi_refl_vclmul.S b/crc/riscv64/crc32_iscsi_refl_vclmul.S +index 3b5b355..1534238 100644 +--- a/crc/riscv64/crc32_iscsi_refl_vclmul.S ++++ b/crc/riscv64/crc32_iscsi_refl_vclmul.S +@@ -26,16 +26,16 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ######################################################################### +- ++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC + #include "crc32_iscsi_refl_vclmul.h" + #include "crc32_refl_common_vclmul.h" + + crc32_refl_func crc32_iscsi_refl_vclmul_internal + + .text +- .align 3 +- .global crc32_iscsi_refl_vclmul +- .type crc32_iscsi_refl_vclmul, %function ++ .align 3 ++ .global crc32_iscsi_refl_vclmul ++ .type crc32_iscsi_refl_vclmul, %function + crc32_iscsi_refl_vclmul: + mv a7, a2 + sext.w a2, a1 +@@ -53,4 +53,5 @@ crc32_iscsi_refl_vclmul: + addi sp, sp, 8 + xori a0, a0, -1 + and a0, a0, t5 +- ret +\ No newline at end of file ++ ret ++#endif +\ No newline at end of file +diff --git a/crc/riscv64/crc32_iscsi_refl_vclmul.h b/crc/riscv64/crc32_iscsi_refl_vclmul.h +index df87fdb..54247e9 100644 +--- a/crc/riscv64/crc32_iscsi_refl_vclmul.h ++++ b/crc/riscv64/crc32_iscsi_refl_vclmul.h +@@ -35,7 +35,7 @@ + .section .rodata + .text + .align 3 +- .set .crc_loop_const,. + 0 ++ .set .crc_loop_const,. + 0 + .type const_2, %object + .size const_2, 16 + const_2: +@@ -45,10 +45,10 @@ const_2: + .quad 0x493c7d27 + + .text +- .align 4 +- .set .lanchor_crc_tab,. + 0 +- .type crc32_table_iscsi_refl, %object +- .size crc32_table_iscsi_refl, 1024 ++ .align 4 ++ .set .lanchor_crc_tab,. 
+ 0 ++ .type crc32_table_iscsi_refl, %object ++ .size crc32_table_iscsi_refl, 1024 + crc32_table_iscsi_refl: + .word 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB + .word 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24 +@@ -81,4 +81,4 @@ crc32_table_iscsi_refl: + .word 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1 + .word 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E + .word 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E +- .word 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351 ++ .word 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351 +\ No newline at end of file +diff --git a/crc/riscv64/crc32_norm_common_vclmul.h b/crc/riscv64/crc32_norm_common_vclmul.h +index fe021a7..07ed3d3 100644 +--- a/crc/riscv64/crc32_norm_common_vclmul.h ++++ b/crc/riscv64/crc32_norm_common_vclmul.h +@@ -31,7 +31,7 @@ + + .macro crc32_norm_func name + .text +- .align 3 ++ .align 3 + .type \name, @function + .global \name + \name: +@@ -67,11 +67,8 @@ + + .align 2 + .crc_clmul_pre: +- vsetivli zero, 2, e64, m1, ta, ma + slli seed, seed, 32 +- + crc_norm_load_first_block +- vmv.s.x vec_zero, zero + crc_load_p4 + addi tmp_0, len, -64 + bltu tmp_0, tmp_1, .clmul_loop_end +@@ -107,5 +104,10 @@ + xor seed, tmp_1, tmp_4 + + j .crc_tab_pre +- .size \name, .-\name ++ .size \name, .-\name ++ .section .rodata.cst16,"aM",@progbits,16 ++ .align 4 ++.shuffle_data: ++ .byte 15, 14, 13, 12, 11, 10, 9, 8 ++ .byte 7, 6, 5, 4, 3, 2, 1, 0 + .endm +\ No newline at end of file +diff --git a/crc/riscv64/crc32_refl_common_vclmul.h b/crc/riscv64/crc32_refl_common_vclmul.h +index 2e2461d..fd64a16 100644 +--- a/crc/riscv64/crc32_refl_common_vclmul.h ++++ b/crc/riscv64/crc32_refl_common_vclmul.h +@@ -31,7 +31,7 @@ + + .macro crc32_refl_func name + .text +- .align 3 ++ .align 3 + .type \name, @function + .global \name + \name: +@@ -71,7 +71,6 @@ + .crc_clmul_pre: + vsetivli zero, 2, e64, m1, ta, ma + crc_refl_load_first_block +- vmv.s.x vec_zero, zero + crc_load_p4 + addi tmp_0, len, -64 + bltu tmp_0, tmp_1, .clmul_loop_end +@@ -108,5 +107,5 @@ + srai seed, tmp_4, 0x20 + + j .crc_tab_pre +- .size \name, .-\name ++ .size \name, .-\name + .endm +\ No newline at end of file +diff --git a/crc/riscv64/crc64_ecma_norm_vclmul.S b/crc/riscv64/crc64_ecma_norm_vclmul.S +index b0ec7ce..a7c13d0 100644 +--- a/crc/riscv64/crc64_ecma_norm_vclmul.S ++++ b/crc/riscv64/crc64_ecma_norm_vclmul.S +@@ -26,8 +26,9 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ######################################################################### +- ++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC + #include "crc64_ecma_norm_vclmul.h" + #include "crc64_norm_common_vclmul.h" + +-crc64_norm_func crc64_ecma_norm_vclmul +\ No newline at end of file ++crc64_norm_func crc64_ecma_norm_vclmul ++#endif +\ No newline at end of file +diff --git a/crc/riscv64/crc64_ecma_norm_vclmul.h b/crc/riscv64/crc64_ecma_norm_vclmul.h +index 7e4fe07..579b1af 100644 +--- a/crc/riscv64/crc64_ecma_norm_vclmul.h ++++ b/crc/riscv64/crc64_ecma_norm_vclmul.h +@@ -34,7 +34,7 @@ + .section .rodata + .text + .align 4 +- .set .crc_loop_const,. + 0 ++ .set .crc_loop_const,. 
+ 0 + .type const_2, %object + .size const_2, 64 + const_2: +@@ -44,10 +44,10 @@ const_2: + .quad 0x4eb938a7d257740e + + .text +- .align 4 +- .set .lanchor_crc_tab,. + 0 +- .type crc64_table_ecma_norm, %object +- .size crc64_table_ecma_norm, 2048 ++ .align 4 ++ .set .lanchor_crc_tab,. + 0 ++ .type crc64_table_ecma_norm, %object ++ .size crc64_table_ecma_norm, 2048 + crc64_table_ecma_norm: + .dword 0x0000000000000000, 0x42f0e1eba9ea3693 + .dword 0x85e1c3d753d46d26, 0xc711223cfa3e5bb5 +diff --git a/crc/riscv64/crc64_ecma_refl_vclmul.S b/crc/riscv64/crc64_ecma_refl_vclmul.S +index 3dc92a5..7d352cc 100644 +--- a/crc/riscv64/crc64_ecma_refl_vclmul.S ++++ b/crc/riscv64/crc64_ecma_refl_vclmul.S +@@ -26,8 +26,9 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ######################################################################### +- ++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC + #include "crc64_ecma_refl_vclmul.h" + #include "crc64_refl_common_vclmul.h" + +-crc64_refl_func crc64_ecma_refl_vclmul +\ No newline at end of file ++crc64_refl_func crc64_ecma_refl_vclmul ++#endif +\ No newline at end of file +diff --git a/crc/riscv64/crc64_ecma_refl_vclmul.h b/crc/riscv64/crc64_ecma_refl_vclmul.h +index a3d7632..25a0a2a 100644 +--- a/crc/riscv64/crc64_ecma_refl_vclmul.h ++++ b/crc/riscv64/crc64_ecma_refl_vclmul.h +@@ -34,7 +34,7 @@ + .section .rodata + .text + .align 3 +- .set .crc_loop_const,. + 0 ++ .set .crc_loop_const,. + 0 + .type const_2, %object + .size const_2, 64 + const_2: +@@ -44,10 +44,10 @@ const_2: + .quad 0xdabe95afc7875f40 + + .text +- .align 4 +- .set .lanchor_crc_tab,. + 0 +- .type crc64_table_ecma_refl, %object +- .size crc64_table_ecma_refl, 2048 ++ .align 4 ++ .set .lanchor_crc_tab,. + 0 ++ .type crc64_table_ecma_refl, %object ++ .size crc64_table_ecma_refl, 2048 + crc64_table_ecma_refl: + .dword 0x0000000000000000, 0xb32e4cbe03a75f6f + .dword 0xf4843657a840a05b, 0x47aa7ae9abe7ff34 +@@ -176,4 +176,4 @@ crc64_table_ecma_refl: + .dword 0xdcd7181e300f9e5e, 0x6ff954a033a8c131 + .dword 0x28532e49984f3e05, 0x9b7d62f79be8616a + .dword 0xa707db9acf80c06d, 0x14299724cc279f02 +- .dword 0x5383edcd67c06036, 0xe0ada17364673f59 ++ .dword 0x5383edcd67c06036, 0xe0ada17364673f59 +\ No newline at end of file +diff --git a/crc/riscv64/crc64_iso_norm_vclmul.S b/crc/riscv64/crc64_iso_norm_vclmul.S +index 93f1290..4a53d44 100644 +--- a/crc/riscv64/crc64_iso_norm_vclmul.S ++++ b/crc/riscv64/crc64_iso_norm_vclmul.S +@@ -26,8 +26,9 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ######################################################################### +- ++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC + #include "crc64_iso_norm_vclmul.h" + #include "crc64_norm_common_vclmul.h" + +-crc64_norm_func crc64_iso_norm_vclmul +\ No newline at end of file ++crc64_norm_func crc64_iso_norm_vclmul ++#endif +\ No newline at end of file +diff --git a/crc/riscv64/crc64_iso_norm_vclmul.h b/crc/riscv64/crc64_iso_norm_vclmul.h +index 3fe52ef..eca31cd 100644 +--- a/crc/riscv64/crc64_iso_norm_vclmul.h ++++ b/crc/riscv64/crc64_iso_norm_vclmul.h +@@ -34,7 +34,7 @@ + .section .rodata + .text + .align 3 +- .set .crc_loop_const,. + 0 ++ .set .crc_loop_const,. + 0 + .type const_2, %object + .size const_2, 64 + const_2: +@@ -44,10 +44,10 @@ const_2: + .quad 0x0000000000001db7 + + .text +- .align 4 +- .set .lanchor_crc_tab,. 
+ 0 +- .type crc64_table_iso_norm, %object +- .size crc64_table_iso_norm, 2048 ++ .align 4 ++ .set .lanchor_crc_tab,. + 0 ++ .type crc64_table_iso_norm, %object ++ .size crc64_table_iso_norm, 2048 + crc64_table_iso_norm: + .dword 0x0000000000000000, 0x000000000000001b + .dword 0x0000000000000036, 0x000000000000002d +@@ -176,4 +176,4 @@ crc64_table_iso_norm: + .dword 0x0000000000000948, 0x0000000000000953 + .dword 0x000000000000097e, 0x0000000000000965 + .dword 0x0000000000000924, 0x000000000000093f +- .dword 0x0000000000000912, 0x0000000000000909 ++ .dword 0x0000000000000912, 0x0000000000000909 +\ No newline at end of file +diff --git a/crc/riscv64/crc64_iso_refl_vclmul.S b/crc/riscv64/crc64_iso_refl_vclmul.S +index 9e3a9b8..4f3c18f 100644 +--- a/crc/riscv64/crc64_iso_refl_vclmul.S ++++ b/crc/riscv64/crc64_iso_refl_vclmul.S +@@ -26,8 +26,9 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ######################################################################### +- ++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC + #include "crc64_iso_refl_vclmul.h" + #include "crc64_refl_common_vclmul.h" + +-crc64_refl_func crc64_iso_refl_vclmul +\ No newline at end of file ++crc64_refl_func crc64_iso_refl_vclmul ++#endif +\ No newline at end of file +diff --git a/crc/riscv64/crc64_iso_refl_vclmul.h b/crc/riscv64/crc64_iso_refl_vclmul.h +index 5b2ad8c..cae418e 100644 +--- a/crc/riscv64/crc64_iso_refl_vclmul.h ++++ b/crc/riscv64/crc64_iso_refl_vclmul.h +@@ -34,7 +34,7 @@ + .section .rodata + .text + .align 3 +- .set .crc_loop_const,. + 0 ++ .set .crc_loop_const,. + 0 + .type const_2, %object + .size const_2, 64 + const_2: +@@ -44,10 +44,10 @@ const_2: + .quad 0xf500000000000001 + + .text +- .align 4 +- .set .lanchor_crc_tab,. + 0 +- .type crc64_table_iso_refl, %object +- .size crc64_table_iso_refl, 2048 ++ .align 4 ++ .set .lanchor_crc_tab,. + 0 ++ .type crc64_table_iso_refl, %object ++ .size crc64_table_iso_refl, 2048 + crc64_table_iso_refl: + .dword 0x0000000000000000, 0x01b0000000000000 + .dword 0x0360000000000000, 0x02d0000000000000 +@@ -176,4 +176,4 @@ crc64_table_iso_refl: + .dword 0x9480000000000000, 0x9530000000000000 + .dword 0x97e0000000000000, 0x9650000000000000 + .dword 0x9240000000000000, 0x93f0000000000000 +- .dword 0x9120000000000000, 0x9090000000000000 ++ .dword 0x9120000000000000, 0x9090000000000000 +\ No newline at end of file +diff --git a/crc/riscv64/crc64_jones_norm_vclmul.S b/crc/riscv64/crc64_jones_norm_vclmul.S +index 17630f7..fdfd799 100644 +--- a/crc/riscv64/crc64_jones_norm_vclmul.S ++++ b/crc/riscv64/crc64_jones_norm_vclmul.S +@@ -26,8 +26,9 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ######################################################################### +- ++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC + #include "crc64_jones_norm_vclmul.h" + #include "crc64_norm_common_vclmul.h" + +-crc64_norm_func crc64_jones_norm_vclmul +\ No newline at end of file ++crc64_norm_func crc64_jones_norm_vclmul ++#endif +\ No newline at end of file +diff --git a/crc/riscv64/crc64_jones_norm_vclmul.h b/crc/riscv64/crc64_jones_norm_vclmul.h +index 9fe71bc..0e252c7 100644 +--- a/crc/riscv64/crc64_jones_norm_vclmul.h ++++ b/crc/riscv64/crc64_jones_norm_vclmul.h +@@ -34,7 +34,7 @@ + .section .rodata + .text + .align 4 +- .set .crc_loop_const,. + 0 ++ .set .crc_loop_const,. 
+ 0 + .type const_2, %object + .size const_2, 64 + const_2: +@@ -44,10 +44,10 @@ const_2: + .quad 0x698b74157cfbd736 + + .text +- .align 4 +- .set .lanchor_crc_tab,. + 0 +- .type crc64_table_jones_norm, %object +- .size crc64_table_jones_norm, 2048 ++ .align 4 ++ .set .lanchor_crc_tab,. + 0 ++ .type crc64_table_jones_norm, %object ++ .size crc64_table_jones_norm, 2048 + crc64_table_jones_norm: + .dword 0x0000000000000000, 0xad93d23594c935a9 + .dword 0xf6b4765ebd5b5efb, 0x5b27a46b29926b52 +@@ -176,4 +176,4 @@ crc64_table_jones_norm: + .dword 0x0f3dad1425e60e99, 0xa2ae7f21b12f3b30 + .dword 0xf989db4a98bd5062, 0x541a097f0c7465cb + .dword 0x4fc6939ccb9986c6, 0xe25541a95f50b36f +- .dword 0xb972e5c276c2d83d, 0x14e137f7e20bed94 ++ .dword 0xb972e5c276c2d83d, 0x14e137f7e20bed94 +\ No newline at end of file +diff --git a/crc/riscv64/crc64_jones_refl_vclmul.S b/crc/riscv64/crc64_jones_refl_vclmul.S +index aeb91cf..af4bcff 100644 +--- a/crc/riscv64/crc64_jones_refl_vclmul.S ++++ b/crc/riscv64/crc64_jones_refl_vclmul.S +@@ -26,8 +26,9 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + ######################################################################### +- ++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC + #include "crc64_jones_refl_vclmul.h" + #include "crc64_refl_common_vclmul.h" + +-crc64_refl_func crc64_jones_refl_vclmul +\ No newline at end of file ++crc64_refl_func crc64_jones_refl_vclmul ++#endif +\ No newline at end of file +diff --git a/crc/riscv64/crc64_jones_refl_vclmul.h b/crc/riscv64/crc64_jones_refl_vclmul.h +index feb949d..faea6a9 100644 +--- a/crc/riscv64/crc64_jones_refl_vclmul.h ++++ b/crc/riscv64/crc64_jones_refl_vclmul.h +@@ -34,7 +34,7 @@ + .section .rodata + .text + .align 3 +- .set .crc_loop_const,. + 0 ++ .set .crc_loop_const,. + 0 + .type const_2, %object + .size const_2, 64 + const_2: +@@ -44,10 +44,10 @@ const_2: + .quad 0x381d0015c96f4444 + + .text +- .align 4 +- .set .lanchor_crc_tab,. + 0 +- .type crc64_table_jones_refl, %object +- .size crc64_table_jones_refl, 2048 ++ .align 4 ++ .set .lanchor_crc_tab,. 
+ 0 ++ .type crc64_table_jones_refl, %object ++ .size crc64_table_jones_refl, 2048 + crc64_table_jones_refl: + .dword 0x0000000000000000, 0x7ad870c830358979 + .dword 0xf5b0e190606b12f2, 0x8f689158505e9b8b +@@ -176,4 +176,4 @@ crc64_table_jones_refl: + .dword 0x66e7a46c27f3aa2c, 0x1c3fd4a417c62355 + .dword 0x935745fc4798b8de, 0xe98f353477ad31a7 + .dword 0xa6df411fbfb21ca3, 0xdc0731d78f8795da +- .dword 0x536fa08fdfd90e51, 0x29b7d047efec8728 ++ .dword 0x536fa08fdfd90e51, 0x29b7d047efec8728 +\ No newline at end of file +diff --git a/crc/riscv64/crc64_norm_common_vclmul.h b/crc/riscv64/crc64_norm_common_vclmul.h +index 2e6e46e..cfc9338 100644 +--- a/crc/riscv64/crc64_norm_common_vclmul.h ++++ b/crc/riscv64/crc64_norm_common_vclmul.h +@@ -31,7 +31,7 @@ + + .macro crc64_norm_func name + .text +- .align 3 ++ .align 3 + .type \name, @function + .global \name + \name: +@@ -65,10 +65,7 @@ + + .align 2 + .crc_clmul_pre: +- vsetivli zero, 2, e64, m1, ta, ma +- vmv.s.x vec_zero, zero + crc_norm_load_first_block +- vmv.s.x vec_zero, zero + crc_load_p4 + addi tmp_0, len, -64 + bltu tmp_0, tmp_1, .clmul_loop_end +@@ -98,5 +95,10 @@ + xor seed, tmp_1, tmp_4 + + j .crc_tab_pre +- .size \name, .-\name ++ .size \name, .-\name ++ .section .rodata.cst16,"aM",@progbits,16 ++ .align 4 ++.shuffle_data: ++ .byte 15, 14, 13, 12, 11, 10, 9, 8 ++ .byte 7, 6, 5, 4, 3, 2, 1, 0 + .endm +\ No newline at end of file +diff --git a/crc/riscv64/crc64_refl_common_vclmul.h b/crc/riscv64/crc64_refl_common_vclmul.h +index 6664518..17b3472 100644 +--- a/crc/riscv64/crc64_refl_common_vclmul.h ++++ b/crc/riscv64/crc64_refl_common_vclmul.h +@@ -31,7 +31,7 @@ + + .macro crc64_refl_func name + .text +- .align 3 ++ .align 3 + .type \name, @function + .global \name + \name: +@@ -66,9 +66,7 @@ + .align 2 + .crc_clmul_pre: + vsetivli zero, 2, e64, m1, ta, ma +- vmv.s.x vec_zero, zero + crc_refl_load_first_block +- vmv.s.x vec_zero, zero + crc_load_p4 + addi tmp_0, len, -64 + bltu tmp_0, tmp_1, .clmul_loop_end +@@ -97,5 +95,5 @@ + xor seed, tmp_4, tmp_5 + + j .crc_tab_pre +- .size \name, .-\name ++ .size \name, .-\name + .endm +\ No newline at end of file +diff --git a/crc/riscv64/crc_common_vclmul.h b/crc/riscv64/crc_common_vclmul.h +index f0c8be7..cc7256d 100644 +--- a/crc/riscv64/crc_common_vclmul.h ++++ b/crc/riscv64/crc_common_vclmul.h +@@ -26,7 +26,6 @@ + # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ ######################################################################### +- + // parameters + #define seed a0 + #define buf a1 +@@ -56,20 +55,22 @@ + #define vec_5 v5 + #define vec_6 v6 + #define vec_7 v7 +-#define vec_zero v8 + #define vec_8 v8 + #define vec_9 v9 ++#define vec_10 v10 ++#define vec_11 v11 ++#define vec_12 v12 ++#define vec_shuffle v13 ++#define vec_14 v14 ++#define vec_15 v15 ++#define vec_16 v16 ++#define vec_17 v17 ++#define vec_18 v18 + + .macro crc_refl_load_first_block + mv buf_iter, buf +- vle64.v vec_0, 0(buf_iter) +- addi buf_iter, buf_iter, 16 +- vle64.v vec_1, 0(buf_iter) +- addi buf_iter, buf_iter, 16 +- vle64.v vec_2, 0(buf_iter) +- addi buf_iter, buf_iter, 16 +- vle64.v vec_3, 0(buf_iter) +- addi buf_iter, buf_iter, 16 ++ vl4re64.v vec_0, 0(buf_iter) ++ addi buf_iter, buf_iter, 64 + andi counter, len, ~63 + addi tmp_0, counter, -64 + +@@ -80,33 +81,25 @@ + .endm + + .macro crc_norm_load_first_block ++ la tmp_0, .shuffle_data + mv buf_iter, buf +- vle64.v vec_6, 0(buf_iter) +- addi buf_iter, buf_iter, 16 +- vle64.v vec_7, 0(buf_iter) +- addi buf_iter, buf_iter, 16 +- vle64.v vec_8, 0(buf_iter) +- addi buf_iter, buf_iter, 16 +- vle64.v vec_9, 0(buf_iter) +- addi buf_iter, buf_iter, 16 ++ vl4re64.v vec_4, (buf_iter) ++ ++ vsetivli zero, 16, e8, m1, ta, ma ++ vle8.v vec_shuffle, 0(tmp_0) ++ vrgather.vv vec_0, vec_4, vec_shuffle ++ vrgather.vv vec_1, vec_5, vec_shuffle ++ vrgather.vv vec_2, vec_6, vec_shuffle ++ vrgather.vv vec_3, vec_7, vec_shuffle ++ vsetivli zero, 2, e64, m1, ta, ma ++ ++ addi buf_iter, buf_iter, 64 + andi counter, len, ~63 + addi tmp_0, counter, -64 + + vmv.s.x vec_4, zero + vmv.s.x vec_5, seed + vslideup.vi vec_4, vec_5, 1 +- vrev8.v vec_6, vec_6 +- vrev8.v vec_7, vec_7 +- vrev8.v vec_8, vec_8 +- vrev8.v vec_9, vec_9 +- vslidedown.vi vec_0, vec_6, 1 +- vslidedown.vi vec_1, vec_7, 1 +- vslidedown.vi vec_2, vec_8, 1 +- vslidedown.vi vec_3, vec_9, 1 +- vslideup.vi vec_0, vec_6, 1 +- vslideup.vi vec_1, vec_7, 1 +- vslideup.vi vec_2, vec_8, 1 +- vslideup.vi vec_3, vec_9, 1 + vxor.vv vec_0, vec_0, vec_4 + .endm + +@@ -119,96 +112,98 @@ + .macro crc_refl_loop + .align 3 + .clmul_loop: +- vle64.v vec_7, 0(buf_iter) ++ vl4re64.v vec_8, (buf_iter) ++ + vclmul.vv vec_4, vec_0, vec_5 + vclmulh.vv vec_0, vec_0, vec_5 +- vredxor.vs vec_0, vec_0, vec_zero +- vredxor.vs vec_4, vec_4, vec_zero +- vslideup.vi vec_4, vec_0, 1 +- vxor.vv vec_0, vec_4, vec_7 ++ vslidedown.vi vec_15, vec_4, 1 ++ vslidedown.vi vec_14, vec_0, 1 ++ vxor.vv vec_15, vec_15, vec_4 ++ vxor.vv vec_14, vec_14, vec_0 ++ vslideup.vi vec_15, vec_14, 1 + +- addi buf_iter, buf_iter, 16 +- vle64.v vec_7, 0(buf_iter) + vclmul.vv vec_4, vec_1, vec_5 + vclmulh.vv vec_1, vec_1, vec_5 +- vredxor.vs vec_1, vec_1, vec_zero +- vredxor.vs vec_4, vec_4, vec_zero +- vslideup.vi vec_4, vec_1, 1 +- vxor.vv vec_1, vec_4, vec_7 ++ vslidedown.vi vec_16, vec_4, 1 ++ vslidedown.vi vec_14, vec_1, 1 ++ vxor.vv vec_16, vec_16, vec_4 ++ vxor.vv vec_14, vec_14, vec_1 ++ vslideup.vi vec_16, vec_14, 1 + +- addi buf_iter, buf_iter, 16 +- vle64.v vec_7, 0(buf_iter) + vclmul.vv vec_4, vec_2, vec_5 + vclmulh.vv vec_2, vec_2, vec_5 +- vredxor.vs vec_2, vec_2, vec_zero +- vredxor.vs vec_4, vec_4, vec_zero +- vslideup.vi vec_4, vec_2, 1 +- vxor.vv vec_2, vec_4, vec_7 ++ vslidedown.vi vec_17, vec_4, 1 ++ vslidedown.vi vec_14, vec_2, 1 ++ vxor.vv vec_17, vec_17, vec_4 ++ vxor.vv vec_14, vec_14, vec_2 ++ vslideup.vi vec_17, vec_14, 1 + +- addi buf_iter, buf_iter, 16 +- vle64.v vec_7, 0(buf_iter) + vclmul.vv vec_4, vec_3, vec_5 
+ vclmulh.vv vec_3, vec_3, vec_5 +- vredxor.vs vec_3, vec_3, vec_zero +- vredxor.vs vec_4, vec_4, vec_zero +- vslideup.vi vec_4, vec_3, 1 +- vxor.vv vec_3, vec_4, vec_7 +- +- addi buf_iter, buf_iter, 16 ++ vslidedown.vi vec_18, vec_4, 1 ++ vslidedown.vi vec_14, vec_3, 1 ++ vxor.vv vec_18, vec_18, vec_4 ++ vxor.vv vec_14, vec_14, vec_3 ++ vslideup.vi vec_18, vec_14, 1 ++ ++ vxor.vv vec_0, vec_8, vec_15 ++ vxor.vv vec_1, vec_9, vec_16 ++ vxor.vv vec_2, vec_10, vec_17 ++ vxor.vv vec_3, vec_11, vec_18 ++ ++ addi buf_iter, buf_iter, 64 + bne buf_iter, buf_end, .clmul_loop + .endm + + .macro crc_norm_loop +- .align 3 ++ .align 3 + .clmul_loop: +- vle64.v vec_7, 0(buf_iter) ++ vl4re64.v vec_8, (buf_iter) ++ + vclmul.vv vec_4, vec_0, vec_5 + vclmulh.vv vec_0, vec_0, vec_5 +- vredxor.vs vec_0, vec_0, vec_zero +- vredxor.vs vec_4, vec_4, vec_zero +- vslideup.vi vec_4, vec_0, 1 +- vrev8.v vec_7, vec_7 +- vslidedown.vi vec_6, vec_7, 1 +- vslideup.vi vec_6, vec_7, 1 +- vxor.vv vec_0, vec_4, vec_6 +- +- addi buf_iter, buf_iter, 16 +- vle64.v vec_7, 0(buf_iter) ++ vslidedown.vi vec_15, vec_4, 1 ++ vslidedown.vi vec_14, vec_0, 1 ++ vxor.vv vec_15, vec_15, vec_4 ++ vxor.vv vec_14, vec_14, vec_0 ++ vslideup.vi vec_15, vec_14, 1 ++ + vclmul.vv vec_4, vec_1, vec_5 + vclmulh.vv vec_1, vec_1, vec_5 +- vredxor.vs vec_1, vec_1, vec_zero +- vredxor.vs vec_4, vec_4, vec_zero +- vslideup.vi vec_4, vec_1, 1 +- vrev8.v vec_7, vec_7 +- vslidedown.vi vec_6, vec_7, 1 +- vslideup.vi vec_6, vec_7, 1 +- vxor.vv vec_1, vec_4, vec_6 +- +- addi buf_iter, buf_iter, 16 +- vle64.v vec_7, 0(buf_iter) ++ vslidedown.vi vec_16, vec_4, 1 ++ vslidedown.vi vec_14, vec_1, 1 ++ vxor.vv vec_16, vec_16, vec_4 ++ vxor.vv vec_14, vec_14, vec_1 ++ vslideup.vi vec_16, vec_14, 1 ++ + vclmul.vv vec_4, vec_2, vec_5 + vclmulh.vv vec_2, vec_2, vec_5 +- vredxor.vs vec_2, vec_2, vec_zero +- vredxor.vs vec_4, vec_4, vec_zero +- vslideup.vi vec_4, vec_2, 1 +- vrev8.v vec_7, vec_7 +- vslidedown.vi vec_6, vec_7, 1 +- vslideup.vi vec_6, vec_7, 1 +- vxor.vv vec_2, vec_4, vec_6 +- +- addi buf_iter, buf_iter, 16 +- vle64.v vec_7, 0(buf_iter) ++ vslidedown.vi vec_17, vec_4, 1 ++ vslidedown.vi vec_14, vec_2, 1 ++ vxor.vv vec_17, vec_17, vec_4 ++ vxor.vv vec_14, vec_14, vec_2 ++ vslideup.vi vec_17, vec_14, 1 ++ + vclmul.vv vec_4, vec_3, vec_5 + vclmulh.vv vec_3, vec_3, vec_5 +- vredxor.vs vec_3, vec_3, vec_zero +- vredxor.vs vec_4, vec_4, vec_zero +- vslideup.vi vec_4, vec_3, 1 +- vrev8.v vec_7, vec_7 +- vslidedown.vi vec_6, vec_7, 1 +- vslideup.vi vec_6, vec_7, 1 +- vxor.vv vec_3, vec_4, vec_6 +- +- addi buf_iter, buf_iter, 16 ++ vslidedown.vi vec_18, vec_4, 1 ++ vslidedown.vi vec_14, vec_3, 1 ++ vxor.vv vec_18, vec_18, vec_4 ++ vxor.vv vec_14, vec_14, vec_3 ++ vslideup.vi vec_18, vec_14, 1 ++ ++ vsetivli zero, 16, e8, m1, ta, ma ++ vrgather.vv vec_0, vec_8, vec_shuffle ++ vrgather.vv vec_1, vec_9, vec_shuffle ++ vrgather.vv vec_2, vec_10, vec_shuffle ++ vrgather.vv vec_3, vec_11, vec_shuffle ++ vsetivli zero, 2, e64, m1, ta, ma ++ vxor.vv vec_0, vec_0, vec_15 ++ vxor.vv vec_1, vec_1, vec_16 ++ vxor.vv vec_2, vec_2, vec_17 ++ vxor.vv vec_3, vec_3, vec_18 ++ ++ addi buf_iter, buf_iter, 64 + bne buf_iter, buf_end, .clmul_loop + .endm + +@@ -216,20 +211,26 @@ + vle64.v vec_5, 0(tmp_4) + vclmul.vv vec_6, vec_0, vec_5 + vclmulh.vv vec_7, vec_0, vec_5 +- vredxor.vs vec_6, vec_6, vec_zero +- vredxor.vs vec_7, vec_7, vec_zero +- vslideup.vi vec_6, vec_7, 1 +- vxor.vv vec_0, vec_6, vec_1 ++ vslidedown.vi vec_8, vec_6, 1 ++ vslidedown.vi vec_9, vec_7, 1 ++ vxor.vv vec_8, 
vec_8, vec_6 ++ vxor.vv vec_9, vec_9, vec_7 ++ vslideup.vi vec_8, vec_9, 1 ++ vxor.vv vec_0, vec_8, vec_1 + vclmul.vv vec_6, vec_0, vec_5 + vclmulh.vv vec_7, vec_0, vec_5 +- vredxor.vs vec_6, vec_6, vec_zero +- vredxor.vs vec_7, vec_7, vec_zero +- vslideup.vi vec_6, vec_7, 1 +- vxor.vv vec_0, vec_6, vec_2 ++ vslidedown.vi vec_8, vec_6, 1 ++ vslidedown.vi vec_9, vec_7, 1 ++ vxor.vv vec_8, vec_8, vec_6 ++ vxor.vv vec_9, vec_9, vec_7 ++ vslideup.vi vec_8, vec_9, 1 ++ vxor.vv vec_0, vec_8, vec_2 + vclmul.vv vec_6, vec_0, vec_5 + vclmulh.vv vec_7, vec_0, vec_5 +- vredxor.vs vec_6, vec_6, vec_zero +- vredxor.vs vec_7, vec_7, vec_zero +- vslideup.vi vec_6, vec_7, 1 +- vxor.vv vec_0, vec_6, vec_3 ++ vslidedown.vi vec_8, vec_6, 1 ++ vslidedown.vi vec_9, vec_7, 1 ++ vxor.vv vec_8, vec_8, vec_6 ++ vxor.vv vec_9, vec_9, vec_7 ++ vslideup.vi vec_8, vec_9, 1 ++ vxor.vv vec_0, vec_8, vec_3 + .endm +\ No newline at end of file +diff --git a/crc/riscv64/crc_multibinary_riscv.S b/crc/riscv64/crc_multibinary_riscv.S +index ef36681..eab6b85 100644 +--- a/crc/riscv64/crc_multibinary_riscv.S ++++ b/crc/riscv64/crc_multibinary_riscv.S +@@ -1,5 +1,5 @@ + ######################################################################## +-# Copyright(c) 2025 ZTE Corporation All rights reserved. ++# Copyright (c) 2025 ZTE Corporation. + # + # Redistribution and use in source and binary forms, with or without + # modification, are permitted provided that the following conditions +@@ -28,7 +28,6 @@ + ######################################################################### + #include + +- + mbin_interface crc32_iscsi + mbin_interface crc16_t10dif + mbin_interface crc16_t10dif_copy +diff --git a/crc/riscv64/crc_riscv64_dispatcher.c b/crc/riscv64/crc_riscv64_dispatcher.c +index eec3cd7..bb3b470 100644 +--- a/crc/riscv64/crc_riscv64_dispatcher.c ++++ b/crc/riscv64/crc_riscv64_dispatcher.c +@@ -27,143 +27,159 @@ + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ **********************************************************************/
+ #include
++#include "crc.h"
++#include "crc64.h"
++
++extern uint16_t
++crc16_t10dif_vclmul(uint16_t, uint8_t *, uint64_t);
++
++extern uint16_t
++crc16_t10dif_copy_vclmul(uint16_t, uint8_t *, uint8_t *, uint64_t);
++
++extern uint32_t
++crc32_ieee_norm_vclmul(uint32_t, uint8_t *, uint64_t);
++
++extern unsigned int
++crc32_iscsi_refl_vclmul(unsigned char *, int, unsigned int);
++
++extern uint32_t
++crc32_gzip_refl_vclmul(uint32_t, uint8_t *, uint64_t);
++
++extern uint64_t
++crc64_ecma_refl_vclmul(uint64_t, const unsigned char *, uint64_t);
++
++extern uint64_t
++crc64_ecma_norm_vclmul(uint64_t, const unsigned char *, uint64_t);
++
++extern uint64_t
++crc64_iso_refl_vclmul(uint64_t, const unsigned char *, uint64_t);
++
++extern uint64_t
++crc64_iso_norm_vclmul(uint64_t, const unsigned char *, uint64_t);
++
++extern uint64_t
++crc64_jones_refl_vclmul(uint64_t, const unsigned char *, uint64_t);
++
++extern uint64_t
++crc64_jones_norm_vclmul(uint64_t, const unsigned char *, uint64_t);
+
+ DEFINE_INTERFACE_DISPATCHER(crc16_t10dif)
+ {
++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC
+        unsigned long auxval = getauxval(AT_HWCAP);
+-       if (auxval & HWCAP_RV('V')) {
+-               if (has_riscv_ext("zvbc") && has_riscv_ext("zvbb") && has_riscv_ext("zbc"))
+-                       return PROVIDER_INFO(crc16_t10dif_vclmul);
++       if (auxval & HWCAP_RV('V') && CHECK_RISCV_EXTENSIONS("ZVBC", "ZBC")) {
++               return crc16_t10dif_vclmul;
+        }
+-
+-       return PROVIDER_BASIC(crc16_t10dif);
++#endif
++       return crc16_t10dif_base;
+ }
+
+ DEFINE_INTERFACE_DISPATCHER(crc16_t10dif_copy)
+ {
+-
++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC
+        unsigned long auxval = getauxval(AT_HWCAP);
+-       if (auxval & HWCAP_RV('V')) {
+-               if (has_riscv_ext("zvbc") && has_riscv_ext("zvbb") && has_riscv_ext("zbc"))
+-                       return PROVIDER_INFO(crc16_t10dif_copy_vclmul);
++       if (auxval & HWCAP_RV('V') && CHECK_RISCV_EXTENSIONS("ZVBC", "ZBC")) {
++               return crc16_t10dif_copy_vclmul;
+        }
+-
+-       return PROVIDER_BASIC(crc16_t10dif_copy);
++#endif
++       return crc16_t10dif_copy_base;
+ }
+
+ DEFINE_INTERFACE_DISPATCHER(crc32_ieee)
+ {
+-
++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC
+        unsigned long auxval = getauxval(AT_HWCAP);
+-       if (auxval & HWCAP_RV('V')) {
+-               if (has_riscv_ext("zvbc") && has_riscv_ext("zvbb") && has_riscv_ext("zbc"))
+-                       return PROVIDER_INFO(crc32_ieee_norm_vclmul);
+-
++       if (auxval & HWCAP_RV('V') && CHECK_RISCV_EXTENSIONS("ZVBC", "ZBC")) {
++               return crc32_ieee_norm_vclmul;
+        }
+-
+-       return PROVIDER_BASIC(crc32_ieee);
++#endif
++       return crc32_ieee_base;
+ }
+
+ DEFINE_INTERFACE_DISPATCHER(crc32_iscsi)
+ {
+-
++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC
+        unsigned long auxval = getauxval(AT_HWCAP);
+-       if (auxval & HWCAP_RV('V')) {
+-               if (has_riscv_ext("zvbc") && has_riscv_ext("zvbb") && has_riscv_ext("zbc"))
+-                       return PROVIDER_INFO(crc32_iscsi_refl_vclmul);
+-
++       if (auxval & HWCAP_RV('V') && CHECK_RISCV_EXTENSIONS("ZVBC", "ZBC")) {
++               return crc32_iscsi_refl_vclmul;
+        }
+-
+-       return PROVIDER_BASIC(crc32_iscsi);
++#endif
++       return crc32_iscsi_base;
+ }
+
+ DEFINE_INTERFACE_DISPATCHER(crc32_gzip_refl)
+ {
+-
++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC
+        unsigned long auxval = getauxval(AT_HWCAP);
+-       if (auxval & HWCAP_RV('V')) {
+-               if (has_riscv_ext("zvbc") && has_riscv_ext("zvbb") && has_riscv_ext("zbc"))
+-                       return PROVIDER_INFO(crc32_gzip_refl_vclmul);
+-
++       if (auxval & HWCAP_RV('V') && CHECK_RISCV_EXTENSIONS("ZVBC", "ZBC")) {
++               return crc32_gzip_refl_vclmul;
+        }
+-
+-       return PROVIDER_BASIC(crc32_gzip_refl);
++#endif
++       return crc32_gzip_refl_base;
+ }
+
+ DEFINE_INTERFACE_DISPATCHER(crc64_ecma_refl)
+ {
+-
++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC
+        unsigned long auxval = getauxval(AT_HWCAP);
+-       if (auxval & HWCAP_RV('V')) {
+-               if (has_riscv_ext("zvbc") && has_riscv_ext("zvbb") && has_riscv_ext("zbc"))
+-                       return PROVIDER_INFO(crc64_ecma_refl_vclmul);
+-
++       if (auxval & HWCAP_RV('V') && CHECK_RISCV_EXTENSIONS("ZVBC", "ZBC")) {
++               return crc64_ecma_refl_vclmul;
+        }
+-
+-       return PROVIDER_BASIC(crc64_ecma_refl);
++#endif
++       return crc64_ecma_refl_base;
+ }
+
+ DEFINE_INTERFACE_DISPATCHER(crc64_ecma_norm)
+ {
+-
++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC
+        unsigned long auxval = getauxval(AT_HWCAP);
+-       if (auxval & HWCAP_RV('V')) {
+-               if (has_riscv_ext("zvbc") && has_riscv_ext("zvbb") && has_riscv_ext("zbc"))
+-                       return PROVIDER_INFO(crc64_ecma_norm_vclmul);
+-
++       if (auxval & HWCAP_RV('V') && CHECK_RISCV_EXTENSIONS("ZVBC", "ZBC")) {
++               return crc64_ecma_norm_vclmul;
+        }
+-
+-       return PROVIDER_BASIC(crc64_ecma_norm);
++#endif
++       return crc64_ecma_norm_base;
+ }
+
+ DEFINE_INTERFACE_DISPATCHER(crc64_iso_refl)
+ {
+-
++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC
+        unsigned long auxval = getauxval(AT_HWCAP);
+-       if (auxval & HWCAP_RV('V')) {
+-               if (has_riscv_ext("zvbc") && has_riscv_ext("zvbb") && has_riscv_ext("zbc"))
+-                       return PROVIDER_INFO(crc64_iso_refl_vclmul);
+-
++       if (auxval & HWCAP_RV('V') && CHECK_RISCV_EXTENSIONS("ZVBC", "ZBC")) {
++               return crc64_iso_refl_vclmul;
+        }
+-
+-       return PROVIDER_BASIC(crc64_iso_refl);
++#endif
++       return crc64_iso_refl_base;
+ }
+
+ DEFINE_INTERFACE_DISPATCHER(crc64_iso_norm)
+ {
+-
++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC
+        unsigned long auxval = getauxval(AT_HWCAP);
+-       if (auxval & HWCAP_RV('V')) {
+-               if (has_riscv_ext("zvbc") && has_riscv_ext("zvbb") && has_riscv_ext("zbc"))
+-                       return PROVIDER_INFO(crc64_iso_norm_vclmul);
+-
++       if (auxval & HWCAP_RV('V') && CHECK_RISCV_EXTENSIONS("ZVBC", "ZBC")) {
++               return crc64_iso_norm_vclmul;
+        }
+-
+-       return PROVIDER_BASIC(crc64_iso_norm);
++#endif
++       return crc64_iso_norm_base;
+ }
+
+ DEFINE_INTERFACE_DISPATCHER(crc64_jones_refl)
+ {
+-
++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC
+        unsigned long auxval = getauxval(AT_HWCAP);
+-       if (auxval & HWCAP_RV('V')) {
+-               if (has_riscv_ext("zvbc") && has_riscv_ext("zvbb") && has_riscv_ext("zbc"))
+-                       return PROVIDER_INFO(crc64_jones_refl_vclmul);
+-
++       if (auxval & HWCAP_RV('V') && CHECK_RISCV_EXTENSIONS("ZVBC", "ZBC")) {
++               return crc64_jones_refl_vclmul;
+        }
+-
+-       return PROVIDER_BASIC(crc64_jones_refl);
++#endif
++       return crc64_jones_refl_base;
+ }
+
+ DEFINE_INTERFACE_DISPATCHER(crc64_jones_norm)
+ {
+-
++#if HAVE_RVV && HAVE_ZBC && HAVE_ZVBC
+        unsigned long auxval = getauxval(AT_HWCAP);
+-       if (auxval & HWCAP_RV('V')) {
+-               if (has_riscv_ext("zvbc") && has_riscv_ext("zvbb") && has_riscv_ext("zbc"))
+-                       return PROVIDER_INFO(crc64_jones_norm_vclmul);
+-
++       if (auxval & HWCAP_RV('V') && CHECK_RISCV_EXTENSIONS("ZVBC", "ZBC")) {
++               return crc64_jones_norm_vclmul;
+        }
+-
+-       return PROVIDER_BASIC(crc64_jones_norm);
++#endif
++       return crc64_jones_norm_base;
+ }
+\ No newline at end of file
+diff --git a/include/riscv64_multibinary.h b/include/riscv64_multibinary.h
+index ad66970..8222a94 100644
+--- a/include/riscv64_multibinary.h
++++ b/include/riscv64_multibinary.h
+@@ -1,5 +1,5 @@
+ /**********************************************************************
+-  Copyright(c) 2024 All rights reserved.
++  Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS).
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+@@ -10,6 +10,9 @@
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
++    * Neither the name of ISCAS nor the names of its
++      contributors may be used to endorse or promote products derived
++      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+@@ -23,12 +26,10 @@
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ **********************************************************************/
+-
+-#ifndef __RISCV64_MULTIBINARY_H__
+-#define __RISCV64_MULTIBINARY_H__
+-
++#ifndef __RISCV_MULTIBINARY_H__
++#define __RISCV_MULTIBINARY_H__
+ #ifndef __riscv
+-#error "This file is for RISC-V only"
++#error "This file is for riscv only"
+ #endif
+
+ #ifdef __ASSEMBLY__
+@@ -45,119 +46,108 @@
+ * 3. Prototype should be *"void * \name\()_dispatcher"*
+ * 4. The dispather should return the right function pointer , revision and a string information .
+ **/
+-        .macro mbin_interface name:req
+-        .section .data
+-        .align 3
+-        .global \name\()_dispatcher_info
+-        .type \name\()_dispatcher_info, @object
++.macro mbin_interface name:req
++        .section .data
++        .align 3
++        .global \name\()_dispatcher_info
++        .type \name\()_dispatcher_info, @object
+ \name\()_dispatcher_info:
+-        .quad \name\()_mbinit
+-        .section .text
+-        .global \name\()_mbinit
++        .quad \name\()_mbinit
++        .section .text
++        .global \name\()_mbinit
+ \name\()_mbinit:
+-        addi sp, sp, -56
+-        sd ra, 48(sp)
+-        sd a0, 0(sp)
+-        sd a1, 8(sp)
+-        sd a2, 16(sp)
+-        sd a3, 24(sp)
+-        sd a4, 32(sp)
+-        sd a5, 40(sp)
+-        call \name\()_dispatcher
+-        mv t2, a0
+-        la t0, \name\()_dispatcher_info
+-        sd a0, 0(t0)
+-        ld ra, 48(sp)
+-        ld a0, 0(sp)
+-        ld a1, 8(sp)
+-        ld a2, 16(sp)
+-        ld a3, 24(sp)
+-        ld a4, 32(sp)
+-        ld a5, 40(sp)
+-        addi sp, sp, 56
+-        jr t2
++        addi sp, sp, -56
++        sd ra, 48(sp)
++        sd a0, 0(sp)
++        sd a1, 8(sp)
++        sd a2, 16(sp)
++        sd a3, 24(sp)
++        sd a4, 32(sp)
++        sd a5, 40(sp)
++        call \name\()_dispatcher
++        mv t2, a0
++        la t0, \name\()_dispatcher_info
++        sd a0, 0(t0)
++        ld ra, 48(sp)
++        ld a0, 0(sp)
++        ld a1, 8(sp)
++        ld a2, 16(sp)
++        ld a3, 24(sp)
++        ld a4, 32(sp)
++        ld a5, 40(sp)
++        addi sp, sp, 56
++        jr t2
+ .global \name\()
+ .type \name,%function
+ \name\():
+-        la t0, \name\()_dispatcher_info
+-        ld t1, 0(t0)
+-        jr t1
++        la t0, \name\()_dispatcher_info
++        ld t1, 0(t0)
++        jr t1
+ .size \name,. - \name
+ .endm
+
+ /**
+-* mbin_interface_base is used for the interfaces which have only
+-* noarch implementation
+-*/
++ * mbin_interface_base is used for the interfaces which have only
++ * noarch implementation
++ */
+ .macro mbin_interface_base name:req, base:req
+-        .extern \base
+-        .data
+-        .align 3
+-        .global \name\()_dispatcher_info
+-        .type \name\()_dispatcher_info, @object
++        .extern \base
++        .data
++        .align 3
++        .global \name\()_dispatcher_info
++        .type \name\()_dispatcher_info, @object
+ \name\()_dispatcher_info:
+-        .dword \base
+-        .text
+-        .global \name
+-        .type \name, @function
++        .dword \base
++        .text
++        .global \name
++        .type \name, @function
+ \name:
+-        la t0, \name\()_dispatcher_info
+-        ld t0, (t0)
+-        jr t0
++        la t0, \name\()_dispatcher_info
++        ld t0, (t0)
++        jr t0
+ .endm
+-
+ #else /* __ASSEMBLY__ */
+ #include
+-#include
++#if HAVE_HWPROBE_H
++#include
++#endif
++#include
++#include
+ #include
+ #define HWCAP_RV(letter) (1ul << ((letter) - 'A'))
+
+-/* Define interface dispatcher macro */
+-#define DEFINE_INTERFACE_DISPATCHER(name) \
+-        void * name##_dispatcher(void)
+-
+-/* Define basic provider macro */
+-#define PROVIDER_BASIC(name) \
+-        PROVIDER_INFO(name##_base)
++#if HAVE_ZBC && HAVE_ZVBC
++#define EXT_CODE(ext) ( \
++        strcmp(ext, "ZBC") == 0 ? RISCV_HWPROBE_EXT_ZBC : \
++        strcmp(ext, "ZVBC") == 0 ? RISCV_HWPROBE_EXT_ZVBC : \
++        -1)
++#endif
+
+-#define DO_DIGNOSTIC(x) _Pragma GCC diagnostic ignored "-W"#x
+-#define DO_PRAGMA(x) _Pragma (#x)
+-#define DIGNOSTIC_IGNORE(x) DO_PRAGMA(GCC diagnostic ignored #x)
+-#define DIGNOSTIC_PUSH() DO_PRAGMA(GCC diagnostic push)
+-#define DIGNOSTIC_POP() DO_PRAGMA(GCC diagnostic pop)
++#define INIT_PROBE_STRUCT() \
++        (struct riscv_hwprobe){ \
++                .key = RISCV_HWPROBE_KEY_IMA_EXT_0 \
++        }
+
+-#define PROVIDER_INFO(_func_entry) \
+-        ({ DIGNOSTIC_PUSH() \
+-        DIGNOSTIC_IGNORE(-Wnested-externs) \
+-        extern void _func_entry(void); \
+-        DIGNOSTIC_POP() \
+-        _func_entry; \
+-        })
++#ifdef EXT_CODE
++static inline int check_riscv_extensions(const char **extensions, size_t count)
++{
++        struct riscv_hwprobe _probe = INIT_PROBE_STRUCT();
++        syscall(__NR_riscv_hwprobe, &_probe, 1, 0, NULL, 0);
++        for (size_t i = 0; i < count; i++) {
++                if (!(_probe.value & EXT_CODE(extensions[i]))) {
++                        return 0;
++                }
++        }
++        return 1;
++}
+
+-/* RISC-V extension detection */
+-static inline int has_riscv_ext(const char *ext) {
+-        FILE *fp = fopen("/proc/cpuinfo", "r");
+-        if (!fp) return 0;
++#define CHECK_RISCV_EXTENSIONS(...) \
++        check_riscv_extensions((const char*[]){ __VA_ARGS__ }, \
++                sizeof((const char*[]){ __VA_ARGS__ })/sizeof(const char*))
++#endif
+
+-        char line[1024];
+-        int found = 0;
+-        while (fgets(line, sizeof(line), fp)) {
+-                char *isa = strstr(line, "isa");
+-                if (isa) {
+-                        char *colon = strchr(isa, ':');
+-                        if (colon) {
+-                                char *isa_str = colon + 1;
+-                                while (*isa_str == ' ' || *isa_str == '\t') isa_str++;
+-                                if (strstr(isa_str, ext)) {
+-                                        found = 1;
+-                                        break;
+-                                }
+-                        }
+-                }
+-        }
+-        fclose(fp);
+-        return found;
+-}
++#define DEFINE_INTERFACE_DISPATCHER(name) \
++        void * name##_dispatcher(void)
+
+ #endif /* __ASSEMBLY__ */
+-#endif /* __RISCV64_MULTIBINARY_H__ */
+\ No newline at end of file
++#endif /* __RISCV_MULTIBINARY_H__ */
+\ No newline at end of file
+-- 
+2.27.0
+
diff --git a/isa-l.spec b/isa-l.spec
index 0ecbea6ef555751e63b1350c2bdf195bfb9e11f5..cdf176083f1a3609c399940d73c26cd4072176ea 100644
--- a/isa-l.spec
+++ b/isa-l.spec
@@ -2,13 +2,14 @@
 %define isal_devname libisa-l-devel
 Name: isa-l
 Version: 2.30.0
-Release: 3
+Release: 4
 Summary: Intelligent Storage Acceleration Library
 License: BSD-3-Clause
 URL: https://github.com/intel/isa-l
 Source0: https://github.com/intel/isa-l/archive/refs/tags/v%{version}.tar.gz
 
 Patch1: Feature-Add-Basic-RISC-V-And-CRC-Vector-support.patch
+Patch2: Feature-Optimize-CRC-calculation-for-the-RISC-V.patch
 
 BuildRequires: yasm gcc
 BuildRequires: autoconf automake libtool
@@ -70,7 +71,10 @@ find %{?buildroot} -name *.la -print0 | xargs -r0 rm -f
 %{_libdir}/pkgconfig/libisal.pc
 
 %changelog
-* Fri Jul 18 2025 liuqingtao - 2.30.0-3
+* Tue Sep 09 2025 liuqingtao - 2.30.0-4
+- Optimize CRC calculation for the RISC-V
+
+* Fri Jul 18 2025 liuqingtao - 2.30.0-3
 - Add basic RISC-V and CRC Vector support
 
 * Sat Feb 25 2023 yaoxin - 2.30.0-2