From 848164545253b2df1e3d9347e9ba31c55effc059 Mon Sep 17 00:00:00 2001 From: xxcui Date: Mon, 8 Dec 2025 10:18:47 +0800 Subject: [PATCH 1/2] add patch to support sm3 optimized by rvv cve --- ...-Anolis-SM3-performance-optimization.patch | 265 ++++++++++++++++++ openssl.spec | 6 +- 2 files changed, 270 insertions(+), 1 deletion(-) create mode 100644 3100-Anolis-SM3-performance-optimization.patch diff --git a/3100-Anolis-SM3-performance-optimization.patch b/3100-Anolis-SM3-performance-optimization.patch new file mode 100644 index 0000000..61447d0 --- /dev/null +++ b/3100-Anolis-SM3-performance-optimization.patch @@ -0,0 +1,265 @@ +From a89d29499b077e985e4a6010335541e7b24359f8 Mon Sep 17 00:00:00 2001 +From: xxcui +Date: Mon, 8 Dec 2025 10:03:31 +0800 +Subject: [PATCH 1/1] Optimization of SM3 algorithm using RISC-V cryptographic + Vector Extension. + +Signed-off-by: xxcui +Change-Id: Ib53e4d2dd7a9eacbde607ea7e67776900292b42b +--- + crypto/perlasm/riscv.pm | 19 +++ + crypto/sm3/asm/sm3-riscv64-zvksh.pl | 188 +++++++++++++++++++++++++++- + 2 files changed, 205 insertions(+), 2 deletions(-) + +diff --git a/crypto/perlasm/riscv.pm b/crypto/perlasm/riscv.pm +index 2402c052ca..cd27c89fd8 100644 +--- a/crypto/perlasm/riscv.pm ++++ b/crypto/perlasm/riscv.pm +@@ -537,6 +537,16 @@ sub vluxei8_v { + return ".word ".($template | ($vm << 25) | ($vs2 << 20) | ($rs1 << 15) | ($vd << 7)); + } + ++sub vluxei32_v { ++ # vluxei32.v vd, (rs1), vs2, vm ++ my $template = 0b000001_0_00000_00000_110_00000_0000111; ++ my $vd = read_vreg shift; ++ my $rs1 = read_reg shift; ++ my $vs2 = read_vreg shift; ++ my $vm = read_mask_vreg shift; ++ return ".word ".($template | ($vm << 25) | ($vs2 << 20) | ($rs1 << 15) | ($vd << 7)); ++} ++ + sub vmerge_vim { + # vmerge.vim vd, vs2, imm, v0 + my $template = 0b0101110_00000_00000_011_00000_1010111; +@@ -1050,4 +1060,13 @@ sub vsm3me_vv { + return ".word ".($template | ($vs2 << 20) | ($vs1 << 15 ) | ($vd << 7)); + } + ++sub vrgather_vv{ ++ # vrgather.vv 
vd, vs2, vs1 ++ my $template = 0b11001_00000_00000_000_00000_1010111; ++ my $vd = read_vreg shift; ++ my $vs2 = read_vreg shift; ++ my $vs1 = read_vreg shift; ++ return ".word ".($template | ($vs2 << 20) | ($vs1 << 15 ) | ($vd << 7)); ++} ++ + 1; +diff --git a/crypto/sm3/asm/sm3-riscv64-zvksh.pl b/crypto/sm3/asm/sm3-riscv64-zvksh.pl +index 1b8fd2eea0..69a433c517 100755 +--- a/crypto/sm3/asm/sm3-riscv64-zvksh.pl ++++ b/crypto/sm3/asm/sm3-riscv64-zvksh.pl +@@ -63,7 +63,7 @@ ___ + ################################################################################ + # ossl_hwsm3_block_data_order_zvksh(SM3_CTX *c, const void *p, size_t num); + { +-my ($CTX, $INPUT, $NUM) = ("a0", "a1", "a2"); ++my ($CTX, $INPUT, $NUM, $EVENNUM , $TMPADDR) = ("a0", "a1", "a2", "a6", "t0"); + my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7, + $V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15, + $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23, +@@ -76,8 +76,179 @@ $code .= <<___; + .globl ossl_hwsm3_block_data_order_zvksh + .type ossl_hwsm3_block_data_order_zvksh,\@function + ossl_hwsm3_block_data_order_zvksh: ++ # Obtain VLEN and select the corresponding branch ++ csrr t0, vlenb ++ addi t1, t0, -64 ++ beqz t1, ossl_hwsm3_block_data_order_zvksh_zvl512 ++ addi t1, t0, -32 ++ beqz t1, ossl_hwsm3_block_data_order_zvksh_zvl256 ++ j ossl_hwsm3_block_data_order_zvksh_zvl128 ++ossl_hwsm3_block_data_order_zvksh_zvl512: ++ @{[vsetivli "zero", 8, "e32", "m1", "tu", "mu"]} ++ @{[vle32_v $V26, $CTX]} ++ @{[vrev8_v $V26, $V26]} ++ @{[vsetivli "zero", 16, "e32", "m1", "ta", "ma"]} ++ la $TMPADDR, ORDER_BY_ZVL512_DATA ++ @{[vle32_v $V30, $TMPADDR]} ++ addi $TMPADDR, $TMPADDR, 64 ++ @{[vle32_v $V31, $TMPADDR]} ++ la $TMPADDR, ORDER_BY_ZVL512_EXP ++ @{[vle32_v $V29, $TMPADDR]} ++ addi $TMPADDR, $TMPADDR, 64 ++ @{[vle32_v $V28, $TMPADDR]} ++ srli $EVENNUM , $NUM, 1 ++ andi $NUM, $NUM, 1 ++ beqz $EVENNUM , ossl_hwsm3_block_data_order_zvksh_zvl256 ++L_sm3_loop_zvl512: ++ # Use indexed loads (ORDER_BY_RVV512_DATA) to load 
two blocks in the ++ # word order expected by the later vrgather/vsm3c stages. ++ @{[vluxei32_v $V0, $INPUT, $V30]} ++ @{[vluxei32_v $V1, $INPUT, $V31]} ++ @{[vrgather_vv $V9, $V0, $V29]} ++ @{[vrgather_vv $V10, $V9, $V29]} ++ @{[vrgather_vv $V11, $V1, $V28]} ++ @{[vor_vv $V10, $V10, $V11]} ++ @{[vrgather_vv $V11, $V10, $V29]} ++ @{[vrgather_vv $V12, $V1, $V29]} ++ @{[vrgather_vv $V13, $V12, $V29]} ++ @{[vsm3me_vv $V2, $V1, $V0]} ++ @{[vrgather_vv $V14, $V2, $V28]} ++ @{[vor_vv $V13, $V13, $V14]} ++ @{[vrgather_vv $V14, $V13, $V29]} ++ @{[vrgather_vv $V15, $V2, $V29]} ++ @{[vrgather_vv $V16, $V15, $V29]} ++ @{[vsm3me_vv $V3, $V2, $V1]} ++ @{[vrgather_vv $V17, $V3, $V28]} ++ @{[vor_vv $V16, $V16, $V17]} ++ @{[vrgather_vv $V17, $V16, $V29]} ++ @{[vrgather_vv $V18, $V3, $V29]} ++ @{[vrgather_vv $V19, $V18, $V29]} ++ @{[vsm3me_vv $V4, $V3, $V2]} ++ @{[vrgather_vv $V20, $V4, $V28]} ++ @{[vor_vv $V19, $V19, $V20]} ++ @{[vrgather_vv $V20, $V19, $V29]} ++ @{[vrgather_vv $V21, $V4, $V29]} ++ @{[vrgather_vv $V22, $V21, $V29]} ++ @{[vsm3me_vv $V5, $V4, $V3]} ++ @{[vrgather_vv $V23, $V5, $V28]} ++ @{[vor_vv $V22, $V22, $V23]} ++ @{[vrgather_vv $V23, $V22, $V29]} ++ @{[vrgather_vv $V24, $V5, $V29]} ++ @{[vrgather_vv $V25, $V24, $V29]} ++ @{[vsm3me_vv $V6, $V5, $V4]} ++ @{[vrgather_vv $V27, $V6, $V28]} ++ @{[vor_vv $V25, $V25, $V27]} ++ @{[vsm3me_vv $V7, $V6, $V5]} ++ @{[vsm3me_vv $V8, $V7, $V6]} ++ @{[vmv_v_v $V27, $V26]} ++ @{[vsetivli "zero", 8, "e32", "m1", "tu", "mu"]} ++ @{[vsm3c_vi $V26, $V0, 0]} ++ @{[vsm3c_vi $V26, $V9, 1]} ++ @{[vsm3c_vi $V26, $V10, 2]} ++ @{[vsm3c_vi $V26, $V11, 3]} ++ @{[vsm3c_vi $V26, $V1, 4]} ++ @{[vsm3c_vi $V26, $V12, 5]} ++ @{[vsm3c_vi $V26, $V13, 6]} ++ @{[vsm3c_vi $V26, $V14, 7]} ++ @{[vsm3c_vi $V26, $V2, 8]} ++ @{[vsm3c_vi $V26, $V15, 9]} ++ @{[vsm3c_vi $V26, $V16, 10]} ++ @{[vsm3c_vi $V26, $V17, 11]} ++ @{[vsm3c_vi $V26, $V3, 12]} ++ @{[vsm3c_vi $V26, $V18, 13]} ++ @{[vsm3c_vi $V26, $V19, 14]} ++ @{[vsm3c_vi $V26, $V20, 15]} ++ @{[vsm3c_vi 
$V26, $V4, 16]} ++ @{[vsm3c_vi $V26, $V21, 17]} ++ @{[vsm3c_vi $V26, $V22, 18]} ++ @{[vsm3c_vi $V26, $V23, 19]} ++ @{[vsm3c_vi $V26, $V5, 20]} ++ @{[vsm3c_vi $V26, $V24, 21]} ++ @{[vsm3c_vi $V26, $V25, 22]} ++ @{[vrgather_vv $V9, $V25, $V29]} ++ @{[vrgather_vv $V10, $V6, $V29]} ++ @{[vrgather_vv $V11, $V10, $V29]} ++ @{[vrgather_vv $V12, $V7, $V28]} ++ @{[vor_vv $V11, $V11, $V12]} ++ @{[vrgather_vv $V12, $V11, $V29]} ++ @{[vrgather_vv $V13, $V7, $V29]} ++ @{[vrgather_vv $V14, $V13, $V29]} ++ @{[vrgather_vv $V15, $V8, $V28]} ++ @{[vor_vv $V14, $V14, $V15]} ++ @{[vrgather_vv $V15, $V14, $V29]} ++ @{[vsm3c_vi $V26, $V9, 23]} ++ @{[vsm3c_vi $V26, $V6, 24]} ++ @{[vsm3c_vi $V26, $V10, 25]} ++ @{[vsm3c_vi $V26, $V11, 26]} ++ @{[vsm3c_vi $V26, $V12, 27]} ++ @{[vsm3c_vi $V26, $V7, 28]} ++ @{[vsm3c_vi $V26, $V13, 29]} ++ @{[vsm3c_vi $V26, $V14, 30]} ++ @{[vsm3c_vi $V26, $V15, 31]} ++ @{[vsetivli "zero", 16, "e32", "m1", "ta", "ma"]} ++ @{[vxor_vv $V26, $V26, $V27]} ++ @{[vslideup_vi $V27, $V26, 8]} ++ @{[vmv_v_v $V26, $V27]} ++ @{[vsm3c_vi $V26, $V0, 0]} ++ @{[vsm3c_vi $V26, $V9, 1]} ++ @{[vsm3c_vi $V26, $V10, 2]} ++ @{[vsm3c_vi $V26, $V11, 3]} ++ @{[vsm3c_vi $V26, $V1, 4]} ++ @{[vsm3c_vi $V26, $V12, 5]} ++ @{[vsm3c_vi $V26, $V13, 6]} ++ @{[vsm3c_vi $V26, $V14, 7]} ++ @{[vsm3c_vi $V26, $V2, 8]} ++ @{[vsm3c_vi $V26, $V15, 9]} ++ @{[vsm3c_vi $V26, $V16, 10]} ++ @{[vsm3c_vi $V26, $V17, 11]} ++ @{[vsm3c_vi $V26, $V3, 12]} ++ @{[vsm3c_vi $V26, $V18, 13]} ++ @{[vsm3c_vi $V26, $V19, 14]} ++ @{[vsm3c_vi $V26, $V20, 15]} ++ @{[vsm3c_vi $V26, $V4, 16]} ++ @{[vsm3c_vi $V26, $V21, 17]} ++ @{[vsm3c_vi $V26, $V22, 18]} ++ @{[vsm3c_vi $V26, $V23, 19]} ++ @{[vsm3c_vi $V26, $V5, 20]} ++ @{[vsm3c_vi $V26, $V24, 21]} ++ @{[vsm3c_vi $V26, $V25, 22]} ++ @{[vrgather_vv $V9, $V25, $V29]} ++ @{[vrgather_vv $V10, $V6, $V29]} ++ @{[vrgather_vv $V11, $V10, $V29]} ++ @{[vrgather_vv $V12, $V7, $V28]} ++ @{[vor_vv $V11, $V11, $V12]} ++ @{[vrgather_vv $V12, $V11, $V29]} ++ @{[vrgather_vv $V13, $V7, $V29]} 
++ @{[vrgather_vv $V14, $V13, $V29]} ++ @{[vrgather_vv $V15, $V8, $V28]} ++ @{[vor_vv $V14, $V14, $V15]} ++ @{[vrgather_vv $V15, $V14, $V29]} ++ @{[vsm3c_vi $V26, $V9, 23]} ++ @{[vsm3c_vi $V26, $V6, 24]} ++ @{[vsm3c_vi $V26, $V10, 25]} ++ @{[vsm3c_vi $V26, $V11, 26]} ++ @{[vsm3c_vi $V26, $V12, 27]} ++ @{[vsm3c_vi $V26, $V7, 28]} ++ @{[vsm3c_vi $V26, $V13, 29]} ++ @{[vsm3c_vi $V26, $V14, 30]} ++ @{[vsm3c_vi $V26, $V15, 31]} ++ @{[vxor_vv $V26, $V26, $V27]} ++ @{[vslidedown_vi $V27, $V26, 8]} ++ @{[vmv_v_v $V26, $V27]} ++ addi $EVENNUM , $EVENNUM , -1 ++ addi $INPUT, $INPUT, 128 ++ bnez $EVENNUM , L_sm3_loop_zvl512 ++ @{[vsetivli "zero", 8, "e32", "m1", "ta", "ma"]} ++ @{[vrev8_v $V26, $V26]} ++ @{[vse32_v $V26, $CTX]} ++ bnez $NUM, ossl_hwsm3_block_data_order_zvksh_zvl256 ++ ret ++ossl_hwsm3_block_data_order_zvksh_zvl256: ++ @{[vsetivli "zero", 8, "e32", "m1", "ta", "ma"]} ++ j ossl_hwsm3_block_data_order_zvksh_single ++ossl_hwsm3_block_data_order_zvksh_zvl128: + @{[vsetivli "zero", 8, "e32", "m2", "ta", "ma"]} +- ++ossl_hwsm3_block_data_order_zvksh_single: + # Load initial state of hash context (c->A-H). 
+ @{[vle32_v $V0, $CTX]} + @{[vrev8_v $V0, $V0]} +@@ -220,6 +391,19 @@ L_sm3_end: + ret + + .size ossl_hwsm3_block_data_order_zvksh,.-ossl_hwsm3_block_data_order_zvksh ++ ++.section .rodata ++.p2align 3 ++.type ORDER_BY_ZVL512_DATA,\@object ++ORDER_BY_ZVL512_DATA: ++ .word 0, 4, 8, 12, 16, 20, 24, 28, 64, 68, 72, 76, 80, 84, 88, 92, 32, 36, 40, 44, 48, 52, 56, 60, 96, 100, 104, 108, 112, 116, 120, 124 ++.size ORDER_BY_ZVL512_DATA, .-ORDER_BY_ZVL512_DATA ++ ++.p2align 3 ++.type ORDER_BY_ZVL512_EXP,\@object ++ORDER_BY_ZVL512_EXP: ++ .word 2, 3, 4, 5, 6, 7, 255, 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 0, 1, 2, 3, 255, 255, 255, 255, 8, 9, 10, 11 ++.size ORDER_BY_ZVL512_EXP, .-ORDER_BY_ZVL512_EXP + ___ + } + +-- +2.34.1 + diff --git a/openssl.spec b/openssl.spec index 23dc8a7..01462b1 100644 --- a/openssl.spec +++ b/openssl.spec @@ -1,4 +1,4 @@ -%define anolis_release 3 +%define anolis_release 4 %global soversion 3 %define srpmhash() %{lua: @@ -89,6 +89,7 @@ Patch: 0056-apps-speed.c-Disable-testing-of-composite-signature-.patch Patch: 0057-apps-speed.c-Support-more-signature-algorithms.patch Patch: 0058-Add-targets-to-skip-build-of-non-installable-program.patch Patch: 0059-RSA_encrypt-decrypt-with-padding-NONE-is-not-support.patch +Patch: 3100-Anolis-SM3-performance-optimization.patch BuildRequires: gcc git coreutils perl-interpreter sed zlib-devel /usr/bin/cmp BuildRequires: lksctp-tools-devel @@ -292,6 +293,9 @@ rm -f $RPM_BUILD_ROOT%{_sysconfdir}/pki/tls/{openssl.cnf.dist,ct_log_list.cnf.di %doc NEWS.md README.md %changelog +* Mon Dec 08 2025 xxcui 1:3.5.4-4 +- Add patch to support sm3 optimized with rvv cve. + * Fri Dec 05 2025 konglidong 1:3.5.4-3 - fix some failed test case in check phase -- Gitee From ccbfa41b17ff473cd19a0cab7e03c1e661d77c07 Mon Sep 17 00:00:00 2001 From: xxcui Date: Mon, 8 Dec 2025 11:31:57 +0800 Subject: [PATCH 2/2] Add sha512/sm4/aes performance optmized with rvv cve. 
--- ...olis-SHA512-performance-optimization.patch | 288 +++++++++++ ...256-ECB-CBC-performance-optimization.patch | 463 ++++++++++++++++++ ...lis-SM4-ECB-performance-optimization.patch | 284 +++++++++++ openssl.spec | 6 + 4 files changed, 1041 insertions(+) create mode 100644 3000-Anolis-SHA512-performance-optimization.patch create mode 100644 3200-Anolis-AES-128-192-256-ECB-CBC-performance-optimization.patch create mode 100644 3400-Anolis-SM4-ECB-performance-optimization.patch diff --git a/3000-Anolis-SHA512-performance-optimization.patch b/3000-Anolis-SHA512-performance-optimization.patch new file mode 100644 index 0000000..dcbf69e --- /dev/null +++ b/3000-Anolis-SHA512-performance-optimization.patch @@ -0,0 +1,288 @@ +From 6aa4ade2abe112d6c90a373bc0211bb8313e315a Mon Sep 17 00:00:00 2001 +From: xxcui +Date: Mon, 8 Dec 2025 10:10:30 +0800 +Subject: [PATCH 1/1] Optimization of SHA512 algorithm using RISC-V + cryptographic Vector Extension. + +Signed-off-by: xxcui +Change-Id: I8cd81ed459615c0914201e46f3fa1fc0b1e8d203 +--- + crypto/perlasm/riscv.pm | 8 + + crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl | 226 ++++++++++++++++++- + 2 files changed, 231 insertions(+), 3 deletions(-) + +diff --git a/crypto/perlasm/riscv.pm b/crypto/perlasm/riscv.pm +index cd27c89fd8..ec12961540 100644 +--- a/crypto/perlasm/riscv.pm ++++ b/crypto/perlasm/riscv.pm +@@ -592,6 +592,14 @@ sub vmv_v_i { + return ".word ".($template | ($imm << 15) | ($vd << 7)); + } + ++sub vmv1r_v { ++ # vmv1r.v vd, vs1 ++ my $template = 0b1001111_00000_00000_011_00000_1010111; ++ my $vd = read_vreg shift; ++ my $vs1 = read_vreg shift; ++ return ".word ".($template | ($vs1 << 20) | ($vd << 7)); ++} ++ + sub vmv_v_x { + # vmv.v.x vd, rs1 + my $template = 0b0101111_00000_00000_100_00000_1010111; +diff --git a/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl b/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl +index c5df987296..27aed10c99 100644 +--- a/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl ++++ 
b/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl +@@ -70,6 +70,7 @@ my $K512 = "K512"; + + # Function arguments + my ($H, $INP, $LEN, $KT, $H2, $INDEX_PATTERN) = ("a0", "a1", "a2", "a3", "t3", "t4"); ++my ($T0, $T1) = ("t0", "t1"); + + ################################################################################ + # void sha512_block_data_order_zvkb_zvknhb(void *c, const void *p, size_t len) +@@ -104,10 +105,229 @@ sha512_block_data_order_zvkb_zvknhb: + # The AVL is 4 in SHA, so we could use a single e8(8 element masking) for masking. + @{[vsetivli "zero", 1, "e8", "m1", "ta", "ma"]} + @{[vmv_v_i $V0, 0x01]} ++ csrr t0, vlenb ++ addi t1, t0, -64 ++ beqz t1, sha512_block_data_order_zvkb_zvknhb_zvl512 ++ addi t1, t0, -32 ++ beqz t1, sha512_block_data_order_zvkb_zvknhb_zvl256 ++ j sha512_block_data_order_zvkb_zvknhb_zvl128 ++sha512_block_data_order_zvkb_zvknhb_zvl512: ++sha512_block_data_order_zvkb_zvknhb_zvl256: ++ # When vlen=256 or 512, the round constants K512 can be loaded ++ # at once in vector register files. 
++ @{[vsetivli "zero", 4, "e64", "m1", "ta", "ma"]} ++ # Load round constants K512 ++ la $KT, $K512 ++ @{[vle64_v $V2, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V3, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V4, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V5, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V6, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V7, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V8, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V9, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V11, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V13, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V15, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V17, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V19, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V21, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V23, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V25, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V27, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V29, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V30, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V31, ($KT)]} + +- @{[vsetivli "zero", 4, "e64", "m2", "ta", "ma"]} ++L_round_loop_256_512: ++ # Decrement length by 1 ++ addi $LEN, $LEN, -1 ++ ++ # Keep the current state as we need it later: H' = H+{a',b',c',...,h'}. ++ @{[vmv1r_v $V26, $V22]} ++ @{[vmv1r_v $V28, $V24]} ++ ++ # Load the 1024-bits of the message block in v10, v12, v14, v16 ++ # and perform the endian swap. 
++ @{[vle64_v $V10, $INP]} ++ @{[vrev8_v $V10, $V10]} ++ addi $INP, $INP, 32 ++ @{[vle64_v $V12, $INP]} ++ @{[vrev8_v $V12, $V12]} ++ addi $INP, $INP, 32 ++ @{[vle64_v $V14, $INP]} ++ @{[vrev8_v $V14, $V14]} ++ addi $INP, $INP, 32 ++ @{[vle64_v $V16, $INP]} ++ @{[vrev8_v $V16, $V16]} ++ addi $INP, $INP, 32 ++ ++ # Quad-round 0 (+0, v10->v12->v14->v16) ++ @{[vadd_vv $V18, $V2, $V10]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V14, $V12, $V0]} ++ @{[vsha2ms_vv $V10, $V18, $V16]} ++ ++ # Quad-round 1 (+1, v12->v14->v16->v10) ++ @{[vadd_vv $V18, $V3, $V12]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V16, $V14, $V0]} ++ @{[vsha2ms_vv $V12, $V18, $V10]} ++ ++ # Quad-round 2 (+2, v14->v16->v10->v12) ++ @{[vadd_vv $V18, $V4, $V14]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V10, $V16, $V0]} ++ @{[vsha2ms_vv $V14, $V18, $V12]} ++ ++ # Quad-round 3 (+3, v16->v10->v12->v14) ++ @{[vadd_vv $V18, $V5, $V16]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V12, $V10, $V0]} ++ @{[vsha2ms_vv $V16, $V18, $V14]} ++ ++ # Quad-round 4 (+4, v10->v12->v14->v16) ++ @{[vadd_vv $V18, $V6, $V10]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V14, $V12, $V0]} ++ @{[vsha2ms_vv $V10, $V18, $V16]} ++ ++ # Quad-round 5 (+5, v12->v14->v16->v10) ++ @{[vadd_vv $V18, $V7, $V12]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V16, $V14, $V0]} ++ @{[vsha2ms_vv $V12, $V18, $V10]} ++ ++ # Quad-round 6 (+6, v14->v16->v10->v12) ++ @{[vadd_vv $V18, $V8, $V14]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V10, $V16, $V0]} ++ @{[vsha2ms_vv $V14, $V18, $V12]} ++ ++ # Quad-round 7 (+7, v16->v10->v12->v14) ++ @{[vadd_vv $V18, $V9, $V16]} ++ @{[vsha2cl_vv $V24, 
$V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V12, $V10, $V0]} ++ @{[vsha2ms_vv $V16, $V18, $V14]} ++ ++ # Quad-round 8 (+8, v10->v12->v14->v16) ++ @{[vadd_vv $V18, $V11, $V10]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V14, $V12, $V0]} ++ @{[vsha2ms_vv $V10, $V18, $V16]} ++ ++ # Quad-round 9 (+9, v12->v14->v16->v10) ++ @{[vadd_vv $V18, $V13, $V12]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V16, $V14, $V0]} ++ @{[vsha2ms_vv $V12, $V18, $V10]} ++ ++ # Quad-round 10 (+10, v14->v16->v10->v12) ++ @{[vadd_vv $V18, $V15, $V14]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V10, $V16, $V0]} ++ @{[vsha2ms_vv $V14, $V18, $V12]} ++ ++ # Quad-round 11 (+11, v16->v10->v12->v14) ++ @{[vadd_vv $V18, $V17, $V16]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V12, $V10, $V0]} ++ @{[vsha2ms_vv $V16, $V18, $V14]} ++ ++ # Quad-round 12 (+12, v10->v12->v14->v16) ++ @{[vadd_vv $V18, $V19, $V10]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V14, $V12, $V0]} ++ @{[vsha2ms_vv $V10, $V18, $V16]} ++ ++ # Quad-round 13 (+13, v12->v14->v16->v10) ++ @{[vadd_vv $V18, $V21, $V12]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V16, $V14, $V0]} ++ @{[vsha2ms_vv $V12, $V18, $V10]} + +-L_round_loop: ++ # Quad-round 14 (+14, v14->v16->v10->v12) ++ @{[vadd_vv $V18, $V23, $V14]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V10, $V16, $V0]} ++ @{[vsha2ms_vv $V14, $V18, $V12]} ++ ++ # Quad-round 15 (+15, v16->v10->v12->v14) ++ @{[vadd_vv $V18, $V25, $V16]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V12, $V10, $V0]} ++ @{[vsha2ms_vv $V16, $V18, $V14]} ++ ++ # Quad-round 16 
(+0, v10->v12->v14->v16) ++ # Note that we stop generating new message schedule words (Wt, v10-16) ++ # as we already generated all the words we end up consuming (i.e., W[79:76]). ++ @{[vadd_vv $V18, $V27, $V10]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ ++ # Quad-round 17 (+1, v12->v14->v16->v10) ++ @{[vadd_vv $V18, $V29, $V12]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ ++ # Quad-round 18 (+2, v14->v16->v10->v12) ++ @{[vadd_vv $V18, $V30, $V14]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ ++ # Quad-round 19 (+3, v16->v10->v12->v14) ++ @{[vadd_vv $V18, $V31, $V16]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ ++ # H' = H+{a',b',c',...,h'} ++ @{[vadd_vv $V22, $V26, $V22]} ++ @{[vadd_vv $V24, $V28, $V24]} ++ bnez $LEN, L_round_loop_256_512 ++ ++ # Store {f,e,b,a},{h,g,d,c} back to {a,b,c,d},{e,f,g,h}. ++ @{[vsuxei8_v $V22, ($H), $V1]} ++ @{[vsuxei8_v $V24, ($H2), $V1]} ++ ++ ret ++sha512_block_data_order_zvkb_zvknhb_zvl128: ++ @{[vsetivli $T0, 4, "e64", "m2", "ta", "ma"]} ++L_round_loop_128: + # Load round constants K512 + la $KT, $K512 + +@@ -204,7 +424,7 @@ L_round_loop: + # H' = H+{a',b',c',...,h'} + @{[vadd_vv $V22, $V26, $V22]} + @{[vadd_vv $V24, $V28, $V24]} +- bnez $LEN, L_round_loop ++ bnez $LEN, L_round_loop_128 + + # Store {f,e,b,a},{h,g,d,c} back to {a,b,c,d},{e,f,g,h}. 
+ @{[vsuxei8_v $V22, ($H), $V1]} +-- +2.34.1 + diff --git a/3200-Anolis-AES-128-192-256-ECB-CBC-performance-optimization.patch b/3200-Anolis-AES-128-192-256-ECB-CBC-performance-optimization.patch new file mode 100644 index 0000000..ba382d7 --- /dev/null +++ b/3200-Anolis-AES-128-192-256-ECB-CBC-performance-optimization.patch @@ -0,0 +1,463 @@ +From 97698489fc2b2c9942d923087be8d0f712c96e76 Mon Sep 17 00:00:00 2001 +From: xxcui +Date: Sun, 30 Nov 2025 11:27:02 +0800 +Subject: [PATCH 1/1] Optimization of AES-128/192/256-ECB/CBC algorithm using + RISC-V cryptographic + +Signed-off-by: xxcui +Change-Id: I22cef2624c8bc36c37cc6668b1eb55de24cad4bc +--- + crypto/aes/asm/aes-riscv64-zvkned.pl | 244 ++++++++++++++++++++------- + 1 file changed, 182 insertions(+), 62 deletions(-) + +diff --git a/crypto/aes/asm/aes-riscv64-zvkned.pl b/crypto/aes/asm/aes-riscv64-zvkned.pl +index 4c0292781d..9b28577f3c 100644 +--- a/crypto/aes/asm/aes-riscv64-zvkned.pl ++++ b/crypto/aes/asm/aes-riscv64-zvkned.pl +@@ -64,12 +64,21 @@ my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7, + $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31, + ) = map("v$_",(0..31)); + ++my ($VLENB,$VLENB_SUB) = ("t5", "t6"); ++ + # Load all 11 round keys to v1-v11 registers. 
+ sub aes_128_load_key { + my $KEYP = shift; + + my $code=<<___; ++ csrr $VLENB, vlenb ++ addi $VLENB_SUB, $VLENB, -16 ++ beqz $VLENB_SUB, 8f ++ @{[vsetivli "zero", 4, "e32", "mf2", "ta", "ma"]} ++ j 9f ++8: + @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]} ++9: + @{[vle32_v $V1, $KEYP]} + addi $KEYP, $KEYP, 16 + @{[vle32_v $V2, $KEYP]} +@@ -101,7 +110,14 @@ sub aes_192_load_key { + my $KEYP = shift; + + my $code=<<___; ++ csrr $VLENB, vlenb ++ addi $VLENB_SUB, $VLENB, -16 ++ beqz $VLENB_SUB, 8f ++ @{[vsetivli "zero", 4, "e32", "mf2", "ta", "ma"]} ++ j 9f ++8: + @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]} ++9: + @{[vle32_v $V1, $KEYP]} + addi $KEYP, $KEYP, 16 + @{[vle32_v $V2, $KEYP]} +@@ -137,7 +153,14 @@ sub aes_256_load_key { + my $KEYP = shift; + + my $code=<<___; ++ csrr $VLENB, vlenb ++ addi $VLENB_SUB, $VLENB, -16 ++ beqz $VLENB_SUB, 8f ++ @{[vsetivli "zero", 4, "e32", "mf2", "ta", "ma"]} ++ j 9f ++8: + @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]} ++9: + @{[vle32_v $V1, $KEYP]} + addi $KEYP, $KEYP, 16 + @{[vle32_v $V2, $KEYP]} +@@ -191,6 +214,25 @@ ___ + return $code; + } + ++# aes-128 encryption with round keys v1-v11 ++sub aes_128_encrypt_cbc { ++ my $code=<<___; ++ @{[vaesz_vs $V24, $V17]} # with round key w[ 0, 3] ++ @{[vaesem_vs $V24, $V2]} # with round key w[ 4, 7] ++ @{[vaesem_vs $V24, $V3]} # with round key w[ 8,11] ++ @{[vaesem_vs $V24, $V4]} # with round key w[12,15] ++ @{[vaesem_vs $V24, $V5]} # with round key w[16,19] ++ @{[vaesem_vs $V24, $V6]} # with round key w[20,23] ++ @{[vaesem_vs $V24, $V7]} # with round key w[24,27] ++ @{[vaesem_vs $V24, $V8]} # with round key w[28,31] ++ @{[vaesem_vs $V24, $V9]} # with round key w[32,35] ++ @{[vaesem_vs $V24, $V10]} # with round key w[36,39] ++ @{[vaesef_vs $V24, $V11]} # with round key w[40,43] ++___ ++ ++ return $code; ++} ++ + # aes-128 decryption with round keys v1-v11 + sub aes_128_decrypt { + my $code=<<___; +@@ -231,6 +273,27 @@ ___ + return $code; + } + ++# aes-192 encryption with round 
keys v1-v13 ++sub aes_192_encrypt_cbc { ++ my $code=<<___; ++ @{[vaesz_vs $V24, $V17]} # with round key w[ 0, 3] ++ @{[vaesem_vs $V24, $V2]} # with round key w[ 4, 7] ++ @{[vaesem_vs $V24, $V3]} # with round key w[ 8,11] ++ @{[vaesem_vs $V24, $V4]} # with round key w[12,15] ++ @{[vaesem_vs $V24, $V5]} # with round key w[16,19] ++ @{[vaesem_vs $V24, $V6]} # with round key w[20,23] ++ @{[vaesem_vs $V24, $V7]} # with round key w[24,27] ++ @{[vaesem_vs $V24, $V8]} # with round key w[28,31] ++ @{[vaesem_vs $V24, $V9]} # with round key w[32,35] ++ @{[vaesem_vs $V24, $V10]} # with round key w[36,39] ++ @{[vaesem_vs $V24, $V11]} # with round key w[40,43] ++ @{[vaesem_vs $V24, $V12]} # with round key w[44,47] ++ @{[vaesef_vs $V24, $V13]} # with round key w[48,51] ++___ ++ ++ return $code; ++} ++ + # aes-192 decryption with round keys v1-v13 + sub aes_192_decrypt { + my $code=<<___; +@@ -275,6 +338,29 @@ ___ + return $code; + } + ++# aes-256 encryption with round keys v1-v15 ++sub aes_256_encrypt_cbc { ++ my $code=<<___; ++ @{[vaesz_vs $V24, $V17]} # with round key w[ 0, 3] ++ @{[vaesem_vs $V24, $V2]} # with round key w[ 4, 7] ++ @{[vaesem_vs $V24, $V3]} # with round key w[ 8,11] ++ @{[vaesem_vs $V24, $V4]} # with round key w[12,15] ++ @{[vaesem_vs $V24, $V5]} # with round key w[16,19] ++ @{[vaesem_vs $V24, $V6]} # with round key w[20,23] ++ @{[vaesem_vs $V24, $V7]} # with round key w[24,27] ++ @{[vaesem_vs $V24, $V8]} # with round key w[28,31] ++ @{[vaesem_vs $V24, $V9]} # with round key w[32,35] ++ @{[vaesem_vs $V24, $V10]} # with round key w[36,39] ++ @{[vaesem_vs $V24, $V11]} # with round key w[40,43] ++ @{[vaesem_vs $V24, $V12]} # with round key w[44,47] ++ @{[vaesem_vs $V24, $V13]} # with round key w[48,51] ++ @{[vaesem_vs $V24, $V14]} # with round key w[52,55] ++ @{[vaesef_vs $V24, $V15]} # with round key w[56,59] ++___ ++ ++ return $code; ++} ++ + # aes-256 decryption with round keys v1-v15 + sub aes_256_decrypt { + my $code=<<___; +@@ -304,7 +390,7 @@ ___ + # size_t 
length, const AES_KEY *key, + # unsigned char *ivec, const int enc); + my ($INP, $OUTP, $LEN, $KEYP, $IVP, $ENC) = ("a0", "a1", "a2", "a3", "a4", "a5"); +-my ($T0, $T1, $ROUNDS) = ("t0", "t1", "t2"); ++my ($T0, $T1, $ROUNDS, $VL, $LEN32) = ("t0", "t1", "t2", "t3", "t4"); + + $code .= <<___; + .p2align 3 +@@ -345,13 +431,28 @@ L_cbc_enc_128: + + @{[vle32_v $V24, $INP]} + @{[vxor_vv $V24, $V24, $V16]} +- j 2f ++ j 3f + + 1: + @{[vle32_v $V17, $INP]} +- @{[vxor_vv $V24, $V24, $V17]} ++ @{[vxor_vv $V17, $V17, $V1]} + + 2: ++ # AES body ++ @{[aes_128_encrypt_cbc]} ++ ++ @{[vse32_v $V24, $OUTP]} ++ ++ addi $INP, $INP, 16 ++ addi $OUTP, $OUTP, 16 ++ addi $LEN, $LEN, -16 ++ ++ bnez $LEN, 1b ++ ++ @{[vse32_v $V24, $IVP]} ++ ++ ret ++3: + # AES body + @{[aes_128_encrypt]} + +@@ -380,13 +481,28 @@ L_cbc_enc_192: + + @{[vle32_v $V24, $INP]} + @{[vxor_vv $V24, $V24, $V16]} +- j 2f ++ j 3f + + 1: + @{[vle32_v $V17, $INP]} +- @{[vxor_vv $V24, $V24, $V17]} ++ @{[vxor_vv $V17, $V17, $V1]} + + 2: ++ # AES body ++ @{[aes_192_encrypt_cbc]} ++ ++ @{[vse32_v $V24, $OUTP]} ++ ++ addi $INP, $INP, 16 ++ addi $OUTP, $OUTP, 16 ++ addi $LEN, $LEN, -16 ++ ++ bnez $LEN, 1b ++ ++ @{[vse32_v $V24, $IVP]} ++ ++ ret ++3: + # AES body + @{[aes_192_encrypt]} + +@@ -415,13 +531,28 @@ L_cbc_enc_256: + + @{[vle32_v $V24, $INP]} + @{[vxor_vv $V24, $V24, $V16]} +- j 2f ++ j 3f + + 1: + @{[vle32_v $V17, $INP]} +- @{[vxor_vv $V24, $V24, $V17]} ++ @{[vxor_vv $V17, $V17, $V1]} + + 2: ++ # AES body ++ @{[aes_256_encrypt_cbc]} ++ ++ @{[vse32_v $V24, $OUTP]} ++ ++ addi $INP, $INP, 16 ++ addi $OUTP, $OUTP, 16 ++ addi $LEN, $LEN, -16 ++ ++ bnez $LEN, 1b ++ ++ @{[vse32_v $V24, $IVP]} ++ ++ ret ++3: + # AES body + @{[aes_256_encrypt]} + +@@ -457,7 +588,7 @@ rv64i_zvkned_cbc_decrypt: + + # Load number of rounds + lwu $ROUNDS, 240($KEYP) +- ++ srli $LEN32, $LEN, 2 + # Get proper routine for key size + li $T0, 10 + beq $ROUNDS, $T0, L_cbc_dec_128 +@@ -477,35 +608,32 @@ $code .= <<___; + L_cbc_dec_128: + # Load all 11 
round keys to v1-v11 registers. + @{[aes_128_load_key $KEYP]} +- + # Load IV. + @{[vle32_v $V16, $IVP]} + +- @{[vle32_v $V24, $INP]} +- @{[vmv_v_v $V17, $V24]} +- j 2f +- + 1: ++ @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ slli $T0, $VL, 2 ++ sub $LEN32, $LEN32, $VL ++ addi $VL, $VL, -4 + @{[vle32_v $V24, $INP]} +- @{[vmv_v_v $V17, $V24]} +- addi $OUTP, $OUTP, 16 ++ @{[vslideup_vi $V16, $V24, 4]} ++ @{[vslidedown_vx $V20, $V24, $VL]} + +-2: + # AES body + @{[aes_128_decrypt]} +- + @{[vxor_vv $V24, $V24, $V16]} ++ @{[vmv_v_v $V16, $V20]} + @{[vse32_v $V24, $OUTP]} +- @{[vmv_v_v $V16, $V17]} +- +- addi $LEN, $LEN, -16 +- addi $INP, $INP, 16 + +- bnez $LEN, 1b +- +- @{[vse32_v $V16, $IVP]} ++ add $INP, $INP, $T0 ++ add $OUTP, $OUTP, $T0 + ++ bnez $LEN32, 1b ++ @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]} ++ @{[vse32_v $V20, $IVP]} + ret ++ + .size L_cbc_dec_128,.-L_cbc_dec_128 + ___ + +@@ -514,34 +642,30 @@ $code .= <<___; + L_cbc_dec_192: + # Load all 13 round keys to v1-v13 registers. + @{[aes_192_load_key $KEYP]} +- + # Load IV. + @{[vle32_v $V16, $IVP]} + +- @{[vle32_v $V24, $INP]} +- @{[vmv_v_v $V17, $V24]} +- j 2f +- + 1: ++ @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ slli $T0, $VL, 2 ++ sub $LEN32, $LEN32, $VL ++ addi $VL, $VL, -4 + @{[vle32_v $V24, $INP]} +- @{[vmv_v_v $V17, $V24]} +- addi $OUTP, $OUTP, 16 ++ @{[vslideup_vi $V16, $V24, 4]} ++ @{[vslidedown_vx $V20, $V24, $VL]} + +-2: + # AES body + @{[aes_192_decrypt]} +- + @{[vxor_vv $V24, $V24, $V16]} ++ @{[vmv_v_v $V16, $V20]} + @{[vse32_v $V24, $OUTP]} +- @{[vmv_v_v $V16, $V17]} +- +- addi $LEN, $LEN, -16 +- addi $INP, $INP, 16 +- +- bnez $LEN, 1b + +- @{[vse32_v $V16, $IVP]} ++ add $INP, $INP, $T0 ++ add $OUTP, $OUTP, $T0 + ++ bnez $LEN32, 1b ++ @{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]} ++ @{[vse32_v $V20, $IVP]} + ret + .size L_cbc_dec_192,.-L_cbc_dec_192 + ___ +@@ -551,34 +675,30 @@ $code .= <<___; + L_cbc_dec_256: + # Load all 15 round keys to v1-v15 registers. 
+ @{[aes_256_load_key $KEYP]} +- + # Load IV. + @{[vle32_v $V16, $IVP]} + +- @{[vle32_v $V24, $INP]} +- @{[vmv_v_v $V17, $V24]} +- j 2f +- + 1: ++ @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ slli $T0, $VL, 2 ++ sub $LEN32, $LEN32, $VL ++ addi $VL, $VL, -4 + @{[vle32_v $V24, $INP]} +- @{[vmv_v_v $V17, $V24]} +- addi $OUTP, $OUTP, 16 ++ @{[vslideup_vi $V16, $V24, 4]} ++ @{[vslidedown_vx $V20, $V24, $VL]} + +-2: + # AES body + @{[aes_256_decrypt]} +- + @{[vxor_vv $V24, $V24, $V16]} ++ @{[vmv_v_v $V16, $V20]} + @{[vse32_v $V24, $OUTP]} +- @{[vmv_v_v $V16, $V17]} +- +- addi $LEN, $LEN, -16 +- addi $INP, $INP, 16 +- +- bnez $LEN, 1b + +- @{[vse32_v $V16, $IVP]} ++ add $INP, $INP, $T0 ++ add $OUTP, $OUTP, $T0 + ++ bnez $LEN32, 1b ++ @{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]} ++ @{[vse32_v $V20, $IVP]} + ret + .size L_cbc_dec_256,.-L_cbc_dec_256 + ___ +@@ -627,7 +747,7 @@ L_ecb_enc_128: + @{[aes_128_load_key $KEYP]} + + 1: +- @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ @{[vsetvli $VL, $LEN32, "e32", "m8", "ta", "ma"]} + slli $T0, $VL, 2 + sub $LEN32, $LEN32, $VL + +@@ -654,7 +774,7 @@ L_ecb_enc_192: + @{[aes_192_load_key $KEYP]} + + 1: +- @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ @{[vsetvli $VL, $LEN32, "e32", "m8", "ta", "ma"]} + slli $T0, $VL, 2 + sub $LEN32, $LEN32, $VL + +@@ -681,7 +801,7 @@ L_ecb_enc_256: + @{[aes_256_load_key $KEYP]} + + 1: +- @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ @{[vsetvli $VL, $LEN32, "e32", "m8", "ta", "ma"]} + slli $T0, $VL, 2 + sub $LEN32, $LEN32, $VL + +@@ -738,7 +858,7 @@ L_ecb_dec_128: + @{[aes_128_load_key $KEYP]} + + 1: +- @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ @{[vsetvli $VL, $LEN32, "e32", "m8", "ta", "ma"]} + slli $T0, $VL, 2 + sub $LEN32, $LEN32, $VL + +@@ -765,7 +885,7 @@ L_ecb_dec_192: + @{[aes_192_load_key $KEYP]} + + 1: +- @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ @{[vsetvli $VL, $LEN32, "e32", "m8", "ta", "ma"]} + slli $T0, $VL, 2 + sub $LEN32, $LEN32, $VL 
+ +@@ -792,7 +912,7 @@ L_ecb_dec_256: + @{[aes_256_load_key $KEYP]} + + 1: +- @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ @{[vsetvli $VL, $LEN32, "e32", "m8", "ta", "ma"]} + slli $T0, $VL, 2 + sub $LEN32, $LEN32, $VL + +-- +2.34.1 + diff --git a/3400-Anolis-SM4-ECB-performance-optimization.patch b/3400-Anolis-SM4-ECB-performance-optimization.patch new file mode 100644 index 0000000..cbfae38 --- /dev/null +++ b/3400-Anolis-SM4-ECB-performance-optimization.patch @@ -0,0 +1,284 @@ +From 0d04ec19941c8b0743a2ae031fd743e8c5de1af2 Mon Sep 17 00:00:00 2001 +From: xxcui +Date: Mon, 1 Dec 2025 10:42:11 +0800 +Subject: [PATCH 1/1] Optimization of SM4-ECB algorithm using RISC-V + cryptographic Vector Extension. + +Signed-off-by: xxcui +Change-Id: Ic94b673053cc06d54a04aeaf66d584a3c08c0771 +--- + crypto/sm4/asm/sm4-riscv64-zvksed.pl | 185 ++++++++++++++++++ + include/crypto/sm4_platform.h | 6 + + .../ciphers/cipher_sm4_hw_rv64i.inc | 29 ++- + 3 files changed, 211 insertions(+), 9 deletions(-) + +diff --git a/crypto/sm4/asm/sm4-riscv64-zvksed.pl b/crypto/sm4/asm/sm4-riscv64-zvksed.pl +index 0734e5fa4c..b0251bc89c 100644 +--- a/crypto/sm4/asm/sm4-riscv64-zvksed.pl ++++ b/crypto/sm4/asm/sm4-riscv64-zvksed.pl +@@ -66,6 +66,7 @@ ___ + { + my ($ukey,$keys,$fk)=("a0","a1","t0"); + my ($vukey,$vfk,$vk0,$vk1,$vk2,$vk3,$vk4,$vk5,$vk6,$vk7)=("v1","v2","v3","v4","v5","v6","v7","v8","v9","v10"); ++ + $code .= <<___; + .p2align 3 + .globl rv64i_zvksed_sm4_set_encrypt_key +@@ -285,12 +286,196 @@ rv64i_zvksed_sm4_decrypt: + ___ + } + ++# void rv64i_zvkned_sm4_ecb_encrypt(const unsigned char *in, unsigned char *out, ++# size_t length, const SM4_KEY *key, ++# const int enc); ++{ ++my ($a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$t4,$t5,$t6)=("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6"); ++my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7, ++ $V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15, ++ $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23, ++ $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31, ++) = 
map("v$_",(0..31)); ++$code .= <<___; ++.p2align 3 ++.globl rv64i_zvkned_sm4_ecb_encrypt ++.type rv64i_zvkned_sm4_ecb_encrypt,\@function ++rv64i_zvkned_sm4_ecb_encrypt: ++ @{[vsetivli__x0_4_e32_m1_tu_mu]} ++ # Order of elements was adjusted in set_encrypt_key() ++ @{[vle32_v $V2, $a3]} # rk[0:3] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V3, $a3]} # rk[4:7] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V4, $a3]} # rk[8:11] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V5, $a3]} # rk[12:15] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V6, $a3]} # rk[16:19] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V7, $a3]} # rk[20:23] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V8, $a3]} # rk[24:27] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V9, $a3]} # rk[28:31] ++ li $t3, 32 ++ @{[vsetvli "zero", $t3, "e32", "m4", "ta", "ma"]} ++ la $a3, ORDER ++ @{[vle32_v $V16, $a3]} ++ srli $t3, $a2, 2 ++ ++ # Load input data ++ @{[vsetvli $t1, $t3, "e32", "m4", "ta", "ma"]} ++ sub $t5, $t3, $t1 ++ @{[vle32_v $V24, $a0]} ++ @{[vrev8_v $V24, $V24]} ++1: ++ beqz $t5, .Lenc_last_block ++ # Encrypt with all keys ++ @{[vsm4r_vs $V24, $V2]} ++ # Simultaneously load the next round ++ slli $t6, $t1, 2 ++ add $a0, $a0, $t6 ++ @{[vsetvli $t2, $t5, "e32", "m4", "ta", "ma"]} ++ @{[vle32_v $V20, $a0]} ++ @{[vsetvli $t1, $t3, "e32", "m4", "ta", "ma"]} ++ @{[vsm4r_vs $V24, $V3]} ++ @{[vsm4r_vs $V24, $V4]} ++ @{[vsm4r_vs $V24, $V5]} ++ @{[vsm4r_vs $V24, $V6]} ++ @{[vsm4r_vs $V24, $V7]} ++ @{[vsm4r_vs $V24, $V8]} ++ @{[vsm4r_vs $V24, $V9]} ++ @{[vrev8_v $V20, $V20]} ++ @{[vrev8_v $V24, $V24]} ++ @{[vrgather_vv $V28, $V24, $V16]} ++ @{[vse32_v $V28, $a1]} ++ vmv4r.v $V24, $V20 ++ add $a1, $a1, $t6 ++ mv $t3, $t5 ++ sub $t5, $t5, $t2 ++ j 1b ++.Lenc_last_block: ++ @{[vsetvli $t1, $t3, "e32", "m4", "ta", "ma"]} ++ # Encrypt with all keys ++ @{[vsm4r_vs $V24, $V2]} ++ @{[vsm4r_vs $V24, $V3]} ++ @{[vsm4r_vs $V24, $V4]} ++ @{[vsm4r_vs $V24, $V5]} ++ @{[vsm4r_vs $V24, $V6]} ++ @{[vsm4r_vs $V24, $V7]} ++ @{[vsm4r_vs $V24, $V8]} ++ @{[vsm4r_vs $V24, $V9]} ++ @{[vrev8_v $V24, 
$V24]} ++ @{[vrgather_vv $V28, $V24, $V16]} ++ @{[vse32_v $V28, $a1]} ++ ret ++.size rv64i_zvkned_sm4_ecb_encrypt,.-rv64i_zvkned_sm4_ecb_encrypt ++ ++___ ++} ++ ++ ++############################################################################### ++# void rv64i_zvkned_sm4_ecb_decrypt(const unsigned char *in, unsigned char *out, ++# size_t length, const SM4_KEY *key); ++{ ++my ($a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$t4,$t5,$t6)=("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6"); ++my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7, ++ $V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15, ++ $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23, ++ $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31, ++) = map("v$_",(0..31)); ++$code .= <<___; ++.p2align 3 ++.globl rv64i_zvkned_sm4_ecb_decrypt ++.type rv64i_zvkned_sm4_ecb_decrypt,\@function ++rv64i_zvkned_sm4_ecb_decrypt: ++ @{[vsetivli__x0_4_e32_m1_tu_mu]} ++ # Order of elements was adjusted in set_encrypt_key() ++ @{[vle32_v $V9, $a3]} # rk[0:3] ++ addi a3, a3, 16 ++ @{[vle32_v $V8, $a3]} # rk[4:7] ++ addi a3, a3, 16 ++ @{[vle32_v $V7, $a3]} # rk[8:11] ++ addi a3, a3, 16 ++ @{[vle32_v $V6, $a3]} # rk[12:15] ++ addi a3, a3, 16 ++ @{[vle32_v $V5, $a3]} # rk[16:19] ++ addi a3, a3, 16 ++ @{[vle32_v $V4, $a3]} # rk[20:23] ++ addi a3, a3, 16 ++ @{[vle32_v $V3, $a3]} # rk[24:27] ++ addi a3, a3, 16 ++ @{[vle32_v $V2, $a3]} # rk[28:31] ++ li $t3, 32 ++ @{[vsetvli "zero", $t3, "e32", "m4", "ta", "ma"]} ++ la a3, ORDER ++ @{[vle32_v $V16, $a3]} ++ srli $t3, $a2, 2 ++ ++ # Load input data ++ @{[vsetvli $t1, $t3, "e32", "m4", "ta", "ma"]} ++ sub $t5, $t3, $t1 ++ @{[vle32_v $V24, $a0]} ++ @{[vrev8_v $V24, $V24]} ++1: ++ beqz $t5, .Ldec_last_block ++ # Decrypt with all keys ++ @{[vsm4r_vs $V24, $V9]} ++ # Simultaneously load the next round ++ slli $t6, $t1, 2 ++ add $a0, $a0, $t6 ++ @{[vsetvli $t2, $t5, "e32", "m4", "ta", "ma"]} ++ @{[vle32_v $V20, $a0]} ++ @{[vsetvli $t1, $t3, "e32", "m4", "ta", "ma"]} ++ @{[vsm4r_vs $V24, $V8]} ++ @{[vsm4r_vs $V24, $V7]} 
++ @{[vsm4r_vs $V24, $V6]} ++ @{[vsm4r_vs $V24, $V5]} ++ @{[vsm4r_vs $V24, $V4]} ++ @{[vsm4r_vs $V24, $V3]} ++ @{[vsm4r_vs $V24, $V2]} ++ @{[vrev8_v $V20, $V20]} ++ @{[vrev8_v $V24, $V24]} ++ @{[vrgather_vv $V28, $V24, $V16]} ++ @{[vse32_v $V28, $a1]} ++ vmv4r.v $V24, $V20 ++ add $a1, $a1, $t6 ++ mv $t3, $t5 ++ sub $t5, $t5, $t2 ++ j 1b ++.Ldec_last_block: ++ @{[vsetvli $t1, $t3, "e32", "m4", "ta", "ma"]} ++ # Decrypt with all keys ++ @{[vsm4r_vs $V24, $V9]} ++ @{[vle32_v $V20, $a0]} ++ @{[vsm4r_vs $V24, $V8]} ++ @{[vsm4r_vs $V24, $V7]} ++ @{[vsm4r_vs $V24, $V6]} ++ @{[vsm4r_vs $V24, $V5]} ++ @{[vsm4r_vs $V24, $V4]} ++ @{[vsm4r_vs $V24, $V3]} ++ @{[vsm4r_vs $V24, $V2]} ++ @{[vrev8_v $V24, $V24]} ++ @{[vrgather_vv $V28, $V24, $V16]} ++ @{[vse32_v $V28, $a1]} ++ ret ++.size rv64i_zvkned_sm4_ecb_decrypt,.-rv64i_zvkned_sm4_ecb_decrypt ++___ ++} ++ + $code .= <<___; + # Family Key (little-endian 32-bit chunks) + .p2align 3 + FK: + .word 0xA3B1BAC6, 0x56AA3350, 0x677D9197, 0xB27022DC + .size FK,.-FK ++ ++.p2align 3 ++ORDER: ++ .word 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28, 35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40, 47, 46, 45, 44, 51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60 ++.size ORDER,.-ORDER + ___ + + print $code; +diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h +index 3df1b4256d..4a9471185c 100644 +--- a/include/crypto/sm4_platform.h ++++ b/include/crypto/sm4_platform.h +@@ -50,6 +50,12 @@ void rv64i_zvksed_sm4_encrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); + void rv64i_zvksed_sm4_decrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); ++void rv64i_zvkned_sm4_ecb_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ const int enc); ++void rv64i_zvkned_sm4_ecb_decrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ 
const int enc); + # endif /* RV64 */ + # endif /* OPENSSL_CPUID_OBJ */ + +diff --git a/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc b/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc +index 763d9d09dd..2b80002bd2 100644 +--- a/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc ++++ b/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc +@@ -26,16 +26,27 @@ static int cipher_hw_rv64i_zvksed_sm4_initkey(PROV_CIPHER_CTX *ctx, + SM4_KEY *ks = &sctx->ks.ks; + + ctx->ks = ks; +- if (ctx->enc +- || (ctx->mode != EVP_CIPH_ECB_MODE +- && ctx->mode != EVP_CIPH_CBC_MODE)) { +- rv64i_zvksed_sm4_set_encrypt_key(key, ks); +- ctx->block = (block128_f) rv64i_zvksed_sm4_encrypt; +- ctx->stream.cbc = NULL; ++ if (ctx->mode == EVP_CIPH_ECB_MODE) { ++ if (ctx->enc) { ++ rv64i_zvksed_sm4_set_encrypt_key(key, ks); ++ ctx->block = NULL; ++ ctx->stream.ecb = (ecb128_f) rv64i_zvkned_sm4_ecb_encrypt; ++ } else { ++ rv64i_zvksed_sm4_set_decrypt_key(key, ks); ++ ctx->block = NULL; ++ ctx->stream.ecb = (ecb128_f) rv64i_zvkned_sm4_ecb_decrypt; ++ } + } else { +- rv64i_zvksed_sm4_set_decrypt_key(key, ks); +- ctx->block = (block128_f) rv64i_zvksed_sm4_decrypt; +- ctx->stream.cbc = NULL; ++ if (ctx->enc ++ || ctx->mode != EVP_CIPH_CBC_MODE) { ++ rv64i_zvksed_sm4_set_encrypt_key(key, ks); ++ ctx->block = (block128_f) rv64i_zvksed_sm4_encrypt; ++ ctx->stream.cbc = NULL; ++ } else { ++ rv64i_zvksed_sm4_set_decrypt_key(key, ks); ++ ctx->block = (block128_f) rv64i_zvksed_sm4_decrypt; ++ ctx->stream.cbc = NULL; ++ } + } + + return 1; +-- +2.34.1 + diff --git a/openssl.spec b/openssl.spec index 01462b1..2351fc9 100644 --- a/openssl.spec +++ b/openssl.spec @@ -90,6 +90,9 @@ Patch: 0057-apps-speed.c-Support-more-signature-algorithms.patch Patch: 0058-Add-targets-to-skip-build-of-non-installable-program.patch Patch: 0059-RSA_encrypt-decrypt-with-padding-NONE-is-not-support.patch Patch: 3100-Anolis-SM3-performance-optimization.patch +Patch: 
3000-Anolis-SHA512-performance-optimization.patch
+Patch: 3200-Anolis-AES-128-192-256-ECB-CBC-performance-optimization.patch
+Patch: 3400-Anolis-SM4-ECB-performance-optimization.patch
 
 BuildRequires: gcc git coreutils perl-interpreter sed zlib-devel /usr/bin/cmp
 BuildRequires: lksctp-tools-devel
@@ -295,6 +298,9 @@ rm -f $RPM_BUILD_ROOT%{_sysconfdir}/pki/tls/{openssl.cnf.dist,ct_log_list.cnf.di
 %changelog
 * Mon Dec 08 2025 xxcui 1:3.5.4-4
 - Add patch to support sm3 optimized with rvv cve.
+- Add patch to support sha512 optimized with rvv cve.
+- Add patch to support sm4-ecb optimized with rvv cve.
+- Add patch to support aes-128-192-256-ecb-cbc optimized with rvv cve.
 * Fri Dec 05 2025 konglidong 1:3.5.4-3
 - fix some failed test case in check phase
-- 
Gitee