diff --git a/1000-SHA512-Perf-Optimiazed-with-CVE-Inst.patch b/1000-SHA512-Perf-Optimiazed-with-CVE-Inst.patch new file mode 100644 index 0000000000000000000000000000000000000000..f09076395dfbd430f3e35b59376842ec5194392c --- /dev/null +++ b/1000-SHA512-Perf-Optimiazed-with-CVE-Inst.patch @@ -0,0 +1,359 @@ +diff --git a/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl b/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl +index c5df987296..508d0c9320 100644 +--- a/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl ++++ b/crypto/sha/asm/sha512-riscv64-zvkb-zvknhb.pl +@@ -70,6 +70,7 @@ my $K512 = "K512"; + + # Function arguments + my ($H, $INP, $LEN, $KT, $H2, $INDEX_PATTERN) = ("a0", "a1", "a2", "a3", "t3", "t4"); ++my ($T0, $T1) = ("t0", "t1"); + + ################################################################################ + # void sha512_block_data_order_zvkb_zvknhb(void *c, const void *p, size_t len) +@@ -104,10 +105,231 @@ sha512_block_data_order_zvkb_zvknhb: + # The AVL is 4 in SHA, so we could use a single e8(8 element masking) for masking. + @{[vsetivli "zero", 1, "e8", "m1", "ta", "ma"]} + @{[vmv_v_i $V0, 0x01]} ++ csrr t0, vlenb ++ addi t1, t0, -64 ++ beqz t1, sha512_block_data_order_zvkb_zvknhb_rvv512 ++ addi t1, t0, -32 ++ beqz t1, sha512_block_data_order_zvkb_zvknhb_rvv256 ++ j sha512_block_data_order_zvkb_zvknhb_rvv128 ++sha512_block_data_order_zvkb_zvknhb_rvv512: ++sha512_block_data_order_zvkb_zvknhb_rvv256: ++ @{[vsetivli "zero", 4, "e64", "m1", "ta", "ma"]} ++ # j L_round_loop ++ # Load round constants K512 ++ la $KT, $K512 ++ @{[vle64_v $V2, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V3, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V4, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V5, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V6, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V7, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V8, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V9, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V11, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V13, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V15, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V17, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V19, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V21, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V23, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V25, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V27, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V29, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V30, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vle64_v $V31, ($KT)]} + +- @{[vsetivli "zero", 4, "e64", "m2", "ta", "ma"]} ++L_round_loop_other: ++ # Decrement length by 1 ++ addi $LEN, $LEN, -1 ++ ++ # Keep the current state as we need it later: H' = H+{a',b',c',...,h'}. ++ @{[vmv1r_v $V26, $V22]} ++ @{[vmv1r_v $V28, $V24]} ++ ++ # Load the 1024-bits of the message block in v10-v16 and perform the endian ++ # swap. 
++ @{[vle64_v $V10, $INP]} ++ @{[vrev8_v $V10, $V10]} ++ addi $INP, $INP, 32 ++ @{[vle64_v $V12, $INP]} ++ @{[vrev8_v $V12, $V12]} ++ addi $INP, $INP, 32 ++ @{[vle64_v $V14, $INP]} ++ @{[vrev8_v $V14, $V14]} ++ addi $INP, $INP, 32 ++ @{[vle64_v $V16, $INP]} ++ @{[vrev8_v $V16, $V16]} ++ addi $INP, $INP, 32 ++ ++ # Quad-round 0 ++ @{[vadd_vv $V18, $V2, $V10]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V14, $V12, $V0]} ++ @{[vsha2ms_vv $V10, $V18, $V16]} ++ ++ # Quad-round 1 ++ @{[vadd_vv $V18, $V3, $V12]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V16, $V14, $V0]} ++ @{[vsha2ms_vv $V12, $V18, $V10]} ++ ++ # Quad-round 2 ++ @{[vadd_vv $V18, $V4, $V14]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V10, $V16, $V0]} ++ @{[vsha2ms_vv $V14, $V18, $V12]} ++ ++ # Quad-round 3 ++ @{[vadd_vv $V18, $V5, $V16]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V12, $V10, $V0]} ++ @{[vsha2ms_vv $V16, $V18, $V14]} ++ ++ # Quad-round 4 ++ @{[vadd_vv $V18, $V6, $V10]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V14, $V12, $V0]} ++ @{[vsha2ms_vv $V10, $V18, $V16]} ++ ++ # Quad-round 5 ++ @{[vadd_vv $V18, $V7, $V12]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V16, $V14, $V0]} ++ @{[vsha2ms_vv $V12, $V18, $V10]} ++ ++ # Quad-round 6 ++ @{[vadd_vv $V18, $V8, $V14]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V10, $V16, $V0]} ++ @{[vsha2ms_vv $V14, $V18, $V12]} ++ ++ # Quad-round 7 ++ @{[vadd_vv $V18, $V9, $V16]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V12, $V10, $V0]} ++ @{[vsha2ms_vv $V16, $V18, $V14]} ++ ++ # Quad-round 8 ++ @{[vadd_vv $V18, $V11, $V10]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V14, $V12, $V0]} ++ @{[vsha2ms_vv $V10, $V18, $V16]} ++ ++ # Quad-round 9 ++ @{[vadd_vv $V18, $V13, $V12]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V16, $V14, $V0]} ++ @{[vsha2ms_vv $V12, $V18, $V10]} ++ ++ # Quad-round 10 ++ @{[vadd_vv $V18, $V15, $V14]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V10, $V16, $V0]} ++ @{[vsha2ms_vv $V14, $V18, $V12]} ++ ++ # Quad-round 11 ++ @{[vadd_vv $V18, $V17, $V16]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V12, $V10, $V0]} ++ @{[vsha2ms_vv $V16, $V18, $V14]} ++ ++ # Quad-round 12 ++ @{[vadd_vv $V18, $V19, $V10]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V14, $V12, $V0]} ++ @{[vsha2ms_vv $V10, $V18, $V16]} ++ ++ # Quad-round 13 ++ @{[vadd_vv $V18, $V21, $V12]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V16, $V14, $V0]} ++ @{[vsha2ms_vv $V12, $V18, $V10]} ++ ++ # Quad-round 14 ++ @{[vadd_vv $V18, $V23, $V14]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V10, $V16, $V0]} ++ @{[vsha2ms_vv $V14, $V18, $V12]} ++ ++ # Quad-round 15 ++ @{[vadd_vv $V18, $V25, $V16]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V12, $V10, $V0]} ++ @{[vsha2ms_vv $V16, $V18, $V14]} 
++ ++ # Quad-round 16 ++ # Note that we stop generating new message schedule words (Wt, v10-16) ++ # as we already generated all the words we end up consuming (i.e., W[79:76]). ++ @{[vadd_vv $V18, $V27, $V10]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ ++ # Quad-round 17 (+1, v12->v14->v16->v10) ++ @{[vadd_vv $V18, $V29, $V12]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ ++ # Quad-round 18 (+2, v14->v16->v10->v12) ++ @{[vadd_vv $V18, $V30, $V14]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ ++ # Quad-round 19 (+3, v16->v10->v12->v14) ++ # No t1 increment needed. ++ @{[vadd_vv $V18, $V31, $V16]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} + +-L_round_loop: ++ # H' = H+{a',b',c',...,h'} ++ @{[vadd_vv $V22, $V26, $V22]} ++ @{[vadd_vv $V24, $V28, $V24]} ++ bnez $LEN, L_round_loop_other ++ ++ # Store {f,e,b,a},{h,g,d,c} back to {a,b,c,d},{e,f,g,h}. ++ @{[vsuxei8_v $V22, ($H), $V1]} ++ @{[vsuxei8_v $V24, ($H2), $V1]} ++ ++ ret ++sha512_block_data_order_zvkb_zvknhb_rvv128: ++ @{[vsetivli $T0, 4, "e64", "m2", "ta", "ma"]} ++ li $T1, 4 ++ bne $T0, $T1, L_round_loop_256_512 ++L_round_loop_128: + # Load round constants K512 + la $KT, $K512 + +@@ -204,7 +426,111 @@ L_round_loop: + # H' = H+{a',b',c',...,h'} + @{[vadd_vv $V22, $V26, $V22]} + @{[vadd_vv $V24, $V28, $V24]} +- bnez $LEN, L_round_loop ++ bnez $LEN, L_round_loop_128 ++ ++ # Store {f,e,b,a},{h,g,d,c} back to {a,b,c,d},{e,f,g,h}. ++ @{[vsuxei8_v $V22, ($H), $V1]} ++ @{[vsuxei8_v $V24, ($H2), $V1]} ++ ++ ret ++L_round_loop_256_512: ++ # Load round constants K512 ++ la $KT, $K512 ++ ++ # Decrement length by 1 ++ addi $LEN, $LEN, -1 ++ ++ # Keep the current state as we need it later: H' = H+{a',b',c',...,h'}. ++ @{[vmv1r_v $V26, $V22]} ++ @{[vmv1r_v $V28, $V24]} ++ ++ # Load the 1024-bits of the message block in v10-v16 and perform the endian ++ # swap. 
++ @{[vle64_v $V10, $INP]} ++ @{[vrev8_v $V10, $V10]} ++ addi $INP, $INP, 32 ++ @{[vle64_v $V12, $INP]} ++ @{[vrev8_v $V12, $V12]} ++ addi $INP, $INP, 32 ++ @{[vle64_v $V14, $INP]} ++ @{[vrev8_v $V14, $V14]} ++ addi $INP, $INP, 32 ++ @{[vle64_v $V16, $INP]} ++ @{[vrev8_v $V16, $V16]} ++ addi $INP, $INP, 32 ++ ++ .rept 4 ++ # Quad-round 0 (+0, v10->v12->v14->v16) ++ @{[vle64_v $V20, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vadd_vv $V18, $V20, $V10]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V14, $V12, $V0]} ++ @{[vsha2ms_vv $V10, $V18, $V16]} ++ ++ # Quad-round 1 (+1, v12->v14->v16->v10) ++ @{[vle64_v $V20, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vadd_vv $V18, $V20, $V12]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V16, $V14, $V0]} ++ @{[vsha2ms_vv $V12, $V18, $V10]} ++ ++ # Quad-round 2 (+2, v14->v16->v10->v12) ++ @{[vle64_v $V20, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vadd_vv $V18, $V20, $V14]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V10, $V16, $V0]} ++ @{[vsha2ms_vv $V14, $V18, $V12]} ++ ++ # Quad-round 3 (+3, v16->v10->v12->v14) ++ @{[vle64_v $V20, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vadd_vv $V18, $V20, $V16]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ @{[vmerge_vvm $V18, $V12, $V10, $V0]} ++ @{[vsha2ms_vv $V16, $V18, $V14]} ++ .endr ++ ++ # Quad-round 16 (+0, v10->v12->v14->v16) ++ # Note that we stop generating new message schedule words (Wt, v10-16) ++ # as we already generated all the words we end up consuming (i.e., W[79:76]). ++ @{[vle64_v $V20, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vadd_vv $V18, $V20, $V10]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ ++ # Quad-round 17 (+1, v12->v14->v16->v10) ++ @{[vle64_v $V20, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vadd_vv $V18, $V20, $V12]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ ++ # Quad-round 18 (+2, v14->v16->v10->v12) ++ @{[vle64_v $V20, ($KT)]} ++ addi $KT, $KT, 32 ++ @{[vadd_vv $V18, $V20, $V14]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ ++ # Quad-round 19 (+3, v16->v10->v12->v14) ++ @{[vle64_v $V20, ($KT)]} ++ # No t1 increment needed. ++ @{[vadd_vv $V18, $V20, $V16]} ++ @{[vsha2cl_vv $V24, $V22, $V18]} ++ @{[vsha2ch_vv $V22, $V24, $V18]} ++ ++ # H' = H+{a',b',c',...,h'} ++ @{[vadd_vv $V22, $V26, $V22]} ++ @{[vadd_vv $V24, $V28, $V24]} ++ bnez $LEN, L_round_loop_256_512 + + # Store {f,e,b,a},{h,g,d,c} back to {a,b,c,d},{e,f,g,h}. 
+ @{[vsuxei8_v $V22, ($H), $V1]} diff --git a/1001-SM3-Perf-Optimiazed-with-CVE-Inst.patch b/1001-SM3-Perf-Optimiazed-with-CVE-Inst.patch new file mode 100644 index 0000000000000000000000000000000000000000..a7019b46f81581a5d252c0538eab71ec56499d46 --- /dev/null +++ b/1001-SM3-Perf-Optimiazed-with-CVE-Inst.patch @@ -0,0 +1,206 @@ +diff --git a/crypto/sm3/asm/sm3-riscv64-zvksh.pl b/crypto/sm3/asm/sm3-riscv64-zvksh.pl +index 1b8fd2eea0..a38ee96a4d 100755 +--- a/crypto/sm3/asm/sm3-riscv64-zvksh.pl ++++ b/crypto/sm3/asm/sm3-riscv64-zvksh.pl +@@ -63,7 +63,7 @@ ___ + ################################################################################ + # ossl_hwsm3_block_data_order_zvksh(SM3_CTX *c, const void *p, size_t num); + { +-my ($CTX, $INPUT, $NUM) = ("a0", "a1", "a2"); ++my ($CTX, $INPUT, $NUM, $TMP1, $TMP2) = ("a0", "a1", "a2", "a6", "t0"); + my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7, + $V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15, + $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23, +@@ -72,12 +72,191 @@ my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7, + + $code .= <<___; + .text ++ ++.p2align 3 ++ORDER_BY_RVV512_DATA: ++ .word 0, 4, 8, 12, 16, 20, 24, 28, 64, 68, 72, 76, 80, 84, 88, 92, 32, 36, 40, 44, 48, 52, 56, 60, 96, 100, 104, 108, 112, 116, 120, 124 ++.size ORDER_BY_RVV512_DATA, .-ORDER_BY_RVV512_DATA ++ ++.p2align 3 ++ORDER_BY_RVV512_EXP: ++ .word 2, 3, 4, 5, 6, 7, 255, 255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 0, 1, 2, 3, 255, 255, 255, 255, 8, 9, 10, 11 ++.size ORDER_BY_RVV512_EXP, .-ORDER_BY_RVV512_EXP ++ + .p2align 3 + .globl ossl_hwsm3_block_data_order_zvksh + .type ossl_hwsm3_block_data_order_zvksh,\@function + ossl_hwsm3_block_data_order_zvksh: ++ csrr t0, vlenb ++ addi t1, t0, -64 ++ beqz t1, ossl_hwsm3_block_data_order_zvksh_rvv512 ++ addi t1, t0, -32 ++ beqz t1, ossl_hwsm3_block_data_order_zvksh_rvv256 ++ j sossl_hwsm3_block_data_order_zvksh_rvv128 ++ossl_hwsm3_block_data_order_zvksh_rvv512: ++ @{[vsetivli "zero", 8, "e32", "m1", "tu", "mu"]} ++ @{[vle32_v $V26, $CTX]} ++ @{[vrev8_v $V26, $V26]} ++ @{[vsetivli "zero", 16, "e32", "m1", "ta", "ma"]} ++ la $TMP2, ORDER_BY_RVV512_DATA ++ @{[vle32_v $V30, $TMP2]} ++ addi $TMP2, $TMP2, 64 ++ @{[vle32_v $V31, $TMP2]} ++ la $TMP2, ORDER_BY_RVV512_EXP ++ @{[vle32_v $V29, $TMP2]} ++ addi $TMP2, $TMP2, 64 ++ @{[vle32_v $V28, $TMP2]} ++ srli $TMP1, $NUM, 1 ++ andi $NUM, $NUM, 1 ++ beqz $TMP1, ossl_hwsm3_block_data_order_zvksh_rvv256 ++L_sm3_loop_rvv512: ++ @{[vluxei32_v $V0, $INPUT, $V30]} ++ @{[vluxei32_v $V1, $INPUT, $V31]} ++ @{[vrgather_vv $V9, $V0, $V29]} ++ @{[vrgather_vv $V10, $V9, $V29]} ++ @{[vrgather_vv $V11, $V1, $V28]} ++ @{[vor_vv $V10, $V10, $V11]} ++ @{[vrgather_vv $V11, $V10, $V29]} ++ @{[vrgather_vv $V12, $V1, $V29]} ++ @{[vrgather_vv $V13, $V12, $V29]} ++ @{[vsm3me_vv $V2, $V1, $V0]} ++ @{[vrgather_vv $V14, $V2, $V28]} ++ @{[vor_vv $V13, $V13, $V14]} ++ @{[vrgather_vv $V14, $V13, $V29]} ++ @{[vrgather_vv $V15, $V2, $V29]} ++ @{[vrgather_vv $V16, $V15, $V29]} ++ @{[vsm3me_vv $V3, $V2, $V1]} ++ @{[vrgather_vv $V17, $V3, $V28]} ++ @{[vor_vv $V16, $V16, $V17]} ++ @{[vrgather_vv $V17, $V16, $V29]} ++ @{[vrgather_vv $V18, $V3, $V29]} ++ @{[vrgather_vv $V19, $V18, $V29]} ++ @{[vsm3me_vv $V4, $V3, $V2]} ++ @{[vrgather_vv $V20, $V4, $V28]} ++ @{[vor_vv $V19, $V19, $V20]} ++ @{[vrgather_vv $V20, $V19, $V29]} ++ @{[vrgather_vv $V21, $V4, $V29]} ++ @{[vrgather_vv $V22, $V21, $V29]} ++ @{[vsm3me_vv $V5, $V4, $V3]} ++ @{[vrgather_vv $V23, $V5, $V28]} ++ @{[vor_vv $V22, $V22, $V23]} ++ @{[vrgather_vv $V23, $V22, $V29]} 
++ @{[vrgather_vv $V24, $V5, $V29]} ++ @{[vrgather_vv $V25, $V24, $V29]} ++ @{[vsm3me_vv $V6, $V5, $V4]} ++ @{[vrgather_vv $V27, $V6, $V28]} ++ @{[vor_vv $V25, $V25, $V27]} ++ @{[vsm3me_vv $V7, $V6, $V5]} ++ @{[vsm3me_vv $V8, $V7, $V6]} ++ @{[vmv_v_v $V27, $V26]} ++ @{[vsetivli "zero", 8, "e32", "m1", "tu", "mu"]} ++ @{[vsm3c_vi $V26, $V0, 0]} ++ @{[vsm3c_vi $V26, $V9, 1]} ++ @{[vsm3c_vi $V26, $V10, 2]} ++ @{[vsm3c_vi $V26, $V11, 3]} ++ @{[vsm3c_vi $V26, $V1, 4]} ++ @{[vsm3c_vi $V26, $V12, 5]} ++ @{[vsm3c_vi $V26, $V13, 6]} ++ @{[vsm3c_vi $V26, $V14, 7]} ++ @{[vsm3c_vi $V26, $V2, 8]} ++ @{[vsm3c_vi $V26, $V15, 9]} ++ @{[vsm3c_vi $V26, $V16, 10]} ++ @{[vsm3c_vi $V26, $V17, 11]} ++ @{[vsm3c_vi $V26, $V3, 12]} ++ @{[vsm3c_vi $V26, $V18, 13]} ++ @{[vsm3c_vi $V26, $V19, 14]} ++ @{[vsm3c_vi $V26, $V20, 15]} ++ @{[vsm3c_vi $V26, $V4, 16]} ++ @{[vsm3c_vi $V26, $V21, 17]} ++ @{[vsm3c_vi $V26, $V22, 18]} ++ @{[vsm3c_vi $V26, $V23, 19]} ++ @{[vsm3c_vi $V26, $V5, 20]} ++ @{[vsm3c_vi $V26, $V24, 21]} ++ @{[vsm3c_vi $V26, $V25, 22]} ++ @{[vrgather_vv $V9, $V25, $V29]} ++ @{[vrgather_vv $V10, $V6, $V29]} ++ @{[vrgather_vv $V11, $V10, $V29]} ++ @{[vrgather_vv $V12, $V7, $V28]} ++ @{[vor_vv $V11, $V11, $V12]} ++ @{[vrgather_vv $V12, $V11, $V29]} ++ @{[vrgather_vv $V13, $V7, $V29]} ++ @{[vrgather_vv $V14, $V13, $V29]} ++ @{[vrgather_vv $V15, $V8, $V28]} ++ @{[vor_vv $V14, $V14, $V15]} ++ @{[vrgather_vv $V15, $V14, $V29]} ++ @{[vsm3c_vi $V26, $V9, 23]} ++ @{[vsm3c_vi $V26, $V6, 24]} ++ @{[vsm3c_vi $V26, $V10, 25]} ++ @{[vsm3c_vi $V26, $V11, 26]} ++ @{[vsm3c_vi $V26, $V12, 27]} ++ @{[vsm3c_vi $V26, $V7, 28]} ++ @{[vsm3c_vi $V26, $V13, 29]} ++ @{[vsm3c_vi $V26, $V14, 30]} ++ @{[vsm3c_vi $V26, $V15, 31]} ++ @{[vsetivli "zero", 16, "e32", "m1", "ta", "ma"]} ++ @{[vxor_vv $V26, $V26, $V27]} ++ @{[vslideup_vi $V27, $V26, 8]} ++ @{[vmv_v_v $V26, $V27]} ++ @{[vsm3c_vi $V26, $V0, 0]} ++ @{[vsm3c_vi $V26, $V9, 1]} ++ @{[vsm3c_vi $V26, $V10, 2]} ++ @{[vsm3c_vi $V26, $V11, 3]} ++ @{[vsm3c_vi $V26, $V1, 4]} ++ @{[vsm3c_vi $V26, $V12, 5]} ++ @{[vsm3c_vi $V26, $V13, 6]} ++ @{[vsm3c_vi $V26, $V14, 7]} ++ @{[vsm3c_vi $V26, $V2, 8]} ++ @{[vsm3c_vi $V26, $V15, 9]} ++ @{[vsm3c_vi $V26, $V16, 10]} ++ @{[vsm3c_vi $V26, $V17, 11]} ++ @{[vsm3c_vi $V26, $V3, 12]} ++ @{[vsm3c_vi $V26, $V18, 13]} ++ @{[vsm3c_vi $V26, $V19, 14]} ++ @{[vsm3c_vi $V26, $V20, 15]} ++ @{[vsm3c_vi $V26, $V4, 16]} ++ @{[vsm3c_vi $V26, $V21, 17]} ++ @{[vsm3c_vi $V26, $V22, 18]} ++ @{[vsm3c_vi $V26, $V23, 19]} ++ @{[vsm3c_vi $V26, $V5, 20]} ++ @{[vsm3c_vi $V26, $V24, 21]} ++ @{[vsm3c_vi $V26, $V25, 22]} ++ @{[vrgather_vv $V9, $V25, $V29]} ++ @{[vrgather_vv $V10, $V6, $V29]} ++ @{[vrgather_vv $V11, $V10, $V29]} ++ @{[vrgather_vv $V12, $V7, $V28]} ++ @{[vor_vv $V11, $V11, $V12]} ++ @{[vrgather_vv $V12, $V11, $V29]} ++ @{[vrgather_vv $V13, $V7, $V29]} ++ @{[vrgather_vv $V14, $V13, $V29]} ++ @{[vrgather_vv $V15, $V8, $V28]} ++ @{[vor_vv $V14, $V14, $V15]} ++ @{[vrgather_vv $V15, $V14, $V29]} ++ @{[vsm3c_vi $V26, $V9, 23]} ++ @{[vsm3c_vi $V26, $V6, 24]} ++ @{[vsm3c_vi $V26, $V10, 25]} ++ @{[vsm3c_vi $V26, $V11, 26]} ++ @{[vsm3c_vi $V26, $V12, 27]} ++ @{[vsm3c_vi $V26, $V7, 28]} ++ @{[vsm3c_vi $V26, $V13, 29]} ++ @{[vsm3c_vi $V26, $V14, 30]} ++ @{[vsm3c_vi $V26, $V15, 31]} ++ @{[vxor_vv $V26, $V26, $V27]} ++ @{[vslidedown_vi $V27, $V26, 8]} ++ @{[vmv_v_v $V26, $V27]} ++ addi $TMP1, $TMP1, -1 ++ addi $INPUT, $INPUT, 128 ++ bnez $TMP1, L_sm3_loop_rvv512 ++ @{[vsetivli "zero", 8, "e32", "m1", "ta", "ma"]} ++ @{[vrev8_v $V26, $V26]} ++ @{[vse32_v $V26, $CTX]} ++ bnez 
$NUM, ossl_hwsm3_block_data_order_zvksh_rvv256 ++ ret ++ossl_hwsm3_block_data_order_zvksh_rvv256: ++ @{[vsetivli "zero", 8, "e32", "m1", "ta", "ma"]} ++ j ossl_hwsm3_block_data_order_zvksh_next ++sossl_hwsm3_block_data_order_zvksh_rvv128: + @{[vsetivli "zero", 8, "e32", "m2", "ta", "ma"]} +- ++ossl_hwsm3_block_data_order_zvksh_next: + # Load initial state of hash context (c->A-H). + @{[vle32_v $V0, $CTX]} + @{[vrev8_v $V0, $V0]} diff --git a/1002-AES-ECB-CBC-Perf-Optimiazed-with-CVE-Inst.patch b/1002-AES-ECB-CBC-Perf-Optimiazed-with-CVE-Inst.patch new file mode 100644 index 0000000000000000000000000000000000000000..825b0cf0eebb12494a3f6a3403c9b1341020fdea --- /dev/null +++ b/1002-AES-ECB-CBC-Perf-Optimiazed-with-CVE-Inst.patch @@ -0,0 +1,448 @@ +diff --git a/crypto/aes/asm/aes-riscv64-zvkned.pl b/crypto/aes/asm/aes-riscv64-zvkned.pl +index 4c0292781d..9b28577f3c 100644 +--- a/crypto/aes/asm/aes-riscv64-zvkned.pl ++++ b/crypto/aes/asm/aes-riscv64-zvkned.pl +@@ -64,12 +64,21 @@ my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7, + $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31, + ) = map("v$_",(0..31)); + ++my ($VLENB,$VLENB_SUB) = ("t5", "t6"); ++ + # Load all 11 round keys to v1-v11 registers. + sub aes_128_load_key { + my $KEYP = shift; + + my $code=<<___; ++ csrr $VLENB, vlenb ++ addi $VLENB_SUB, $VLENB, -16 ++ beqz $VLENB_SUB, 8f ++ @{[vsetivli "zero", 4, "e32", "mf2", "ta", "ma"]} ++ j 9f ++8: + @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]} ++9: + @{[vle32_v $V1, $KEYP]} + addi $KEYP, $KEYP, 16 + @{[vle32_v $V2, $KEYP]} +@@ -101,7 +110,14 @@ sub aes_192_load_key { + my $KEYP = shift; + + my $code=<<___; ++ csrr $VLENB, vlenb ++ addi $VLENB_SUB, $VLENB, -16 ++ beqz $VLENB_SUB, 8f ++ @{[vsetivli "zero", 4, "e32", "mf2", "ta", "ma"]} ++ j 9f ++8: + @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]} ++9: + @{[vle32_v $V1, $KEYP]} + addi $KEYP, $KEYP, 16 + @{[vle32_v $V2, $KEYP]} +@@ -137,7 +153,14 @@ sub aes_256_load_key { + my $KEYP = shift; + + my $code=<<___; ++ csrr $VLENB, vlenb ++ addi $VLENB_SUB, $VLENB, -16 ++ beqz $VLENB_SUB, 8f ++ @{[vsetivli "zero", 4, "e32", "mf2", "ta", "ma"]} ++ j 9f ++8: + @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]} ++9: + @{[vle32_v $V1, $KEYP]} + addi $KEYP, $KEYP, 16 + @{[vle32_v $V2, $KEYP]} +@@ -191,6 +214,25 @@ ___ + return $code; + } + ++# aes-128 encryption with round keys v1-v11 ++sub aes_128_encrypt_cbc { ++ my $code=<<___; ++ @{[vaesz_vs $V24, $V17]} # with round key w[ 0, 3] ++ @{[vaesem_vs $V24, $V2]} # with round key w[ 4, 7] ++ @{[vaesem_vs $V24, $V3]} # with round key w[ 8,11] ++ @{[vaesem_vs $V24, $V4]} # with round key w[12,15] ++ @{[vaesem_vs $V24, $V5]} # with round key w[16,19] ++ @{[vaesem_vs $V24, $V6]} # with round key w[20,23] ++ @{[vaesem_vs $V24, $V7]} # with round key w[24,27] ++ @{[vaesem_vs $V24, $V8]} # with round key w[28,31] ++ @{[vaesem_vs $V24, $V9]} # with round key w[32,35] ++ @{[vaesem_vs $V24, $V10]} # with round key w[36,39] ++ @{[vaesef_vs $V24, $V11]} # with round key w[40,43] ++___ ++ ++ return $code; ++} ++ + # aes-128 decryption with round keys v1-v11 + sub aes_128_decrypt { + my $code=<<___; +@@ -231,6 +273,27 @@ ___ + return $code; + } + ++# aes-192 encryption with round keys v1-v13 ++sub aes_192_encrypt_cbc { ++ my $code=<<___; ++ @{[vaesz_vs $V24, $V17]} # with round key w[ 0, 3] ++ @{[vaesem_vs $V24, $V2]} # with round key w[ 4, 7] ++ @{[vaesem_vs $V24, $V3]} # with round key w[ 8,11] ++ @{[vaesem_vs $V24, $V4]} # with round key w[12,15] ++ @{[vaesem_vs $V24, $V5]} # with round key w[16,19] ++ @{[vaesem_vs 
$V24, $V6]} # with round key w[20,23] ++ @{[vaesem_vs $V24, $V7]} # with round key w[24,27] ++ @{[vaesem_vs $V24, $V8]} # with round key w[28,31] ++ @{[vaesem_vs $V24, $V9]} # with round key w[32,35] ++ @{[vaesem_vs $V24, $V10]} # with round key w[36,39] ++ @{[vaesem_vs $V24, $V11]} # with round key w[40,43] ++ @{[vaesem_vs $V24, $V12]} # with round key w[44,47] ++ @{[vaesef_vs $V24, $V13]} # with round key w[48,51] ++___ ++ ++ return $code; ++} ++ + # aes-192 decryption with round keys v1-v13 + sub aes_192_decrypt { + my $code=<<___; +@@ -275,6 +338,29 @@ ___ + return $code; + } + ++# aes-256 encryption with round keys v1-v15 ++sub aes_256_encrypt_cbc { ++ my $code=<<___; ++ @{[vaesz_vs $V24, $V17]} # with round key w[ 0, 3] ++ @{[vaesem_vs $V24, $V2]} # with round key w[ 4, 7] ++ @{[vaesem_vs $V24, $V3]} # with round key w[ 8,11] ++ @{[vaesem_vs $V24, $V4]} # with round key w[12,15] ++ @{[vaesem_vs $V24, $V5]} # with round key w[16,19] ++ @{[vaesem_vs $V24, $V6]} # with round key w[20,23] ++ @{[vaesem_vs $V24, $V7]} # with round key w[24,27] ++ @{[vaesem_vs $V24, $V8]} # with round key w[28,31] ++ @{[vaesem_vs $V24, $V9]} # with round key w[32,35] ++ @{[vaesem_vs $V24, $V10]} # with round key w[36,39] ++ @{[vaesem_vs $V24, $V11]} # with round key w[40,43] ++ @{[vaesem_vs $V24, $V12]} # with round key w[44,47] ++ @{[vaesem_vs $V24, $V13]} # with round key w[48,51] ++ @{[vaesem_vs $V24, $V14]} # with round key w[52,55] ++ @{[vaesef_vs $V24, $V15]} # with round key w[56,59] ++___ ++ ++ return $code; ++} ++ + # aes-256 decryption with round keys v1-v15 + sub aes_256_decrypt { + my $code=<<___; +@@ -304,7 +390,7 @@ ___ + # size_t length, const AES_KEY *key, + # unsigned char *ivec, const int enc); + my ($INP, $OUTP, $LEN, $KEYP, $IVP, $ENC) = ("a0", "a1", "a2", "a3", "a4", "a5"); +-my ($T0, $T1, $ROUNDS) = ("t0", "t1", "t2"); ++my ($T0, $T1, $ROUNDS, $VL, $LEN32) = ("t0", "t1", "t2", "t3", "t4"); + + $code .= <<___; + .p2align 3 +@@ -345,13 +431,28 @@ L_cbc_enc_128: + + @{[vle32_v $V24, $INP]} + @{[vxor_vv $V24, $V24, $V16]} +- j 2f ++ j 3f + + 1: + @{[vle32_v $V17, $INP]} +- @{[vxor_vv $V24, $V24, $V17]} ++ @{[vxor_vv $V17, $V17, $V1]} + + 2: ++ # AES body ++ @{[aes_128_encrypt_cbc]} ++ ++ @{[vse32_v $V24, $OUTP]} ++ ++ addi $INP, $INP, 16 ++ addi $OUTP, $OUTP, 16 ++ addi $LEN, $LEN, -16 ++ ++ bnez $LEN, 1b ++ ++ @{[vse32_v $V24, $IVP]} ++ ++ ret ++3: + # AES body + @{[aes_128_encrypt]} + +@@ -380,13 +481,28 @@ L_cbc_enc_192: + + @{[vle32_v $V24, $INP]} + @{[vxor_vv $V24, $V24, $V16]} +- j 2f ++ j 3f + + 1: + @{[vle32_v $V17, $INP]} +- @{[vxor_vv $V24, $V24, $V17]} ++ @{[vxor_vv $V17, $V17, $V1]} + + 2: ++ # AES body ++ @{[aes_192_encrypt_cbc]} ++ ++ @{[vse32_v $V24, $OUTP]} ++ ++ addi $INP, $INP, 16 ++ addi $OUTP, $OUTP, 16 ++ addi $LEN, $LEN, -16 ++ ++ bnez $LEN, 1b ++ ++ @{[vse32_v $V24, $IVP]} ++ ++ ret ++3: + # AES body + @{[aes_192_encrypt]} + +@@ -415,13 +531,28 @@ L_cbc_enc_256: + + @{[vle32_v $V24, $INP]} + @{[vxor_vv $V24, $V24, $V16]} +- j 2f ++ j 3f + + 1: + @{[vle32_v $V17, $INP]} +- @{[vxor_vv $V24, $V24, $V17]} ++ @{[vxor_vv $V17, $V17, $V1]} + + 2: ++ # AES body ++ @{[aes_256_encrypt_cbc]} ++ ++ @{[vse32_v $V24, $OUTP]} ++ ++ addi $INP, $INP, 16 ++ addi $OUTP, $OUTP, 16 ++ addi $LEN, $LEN, -16 ++ ++ bnez $LEN, 1b ++ ++ @{[vse32_v $V24, $IVP]} ++ ++ ret ++3: + # AES body + @{[aes_256_encrypt]} + +@@ -457,7 +588,7 @@ rv64i_zvkned_cbc_decrypt: + + # Load number of rounds + lwu $ROUNDS, 240($KEYP) +- ++ srli $LEN32, $LEN, 2 + # Get proper routine for key size + li $T0, 10 + beq 
$ROUNDS, $T0, L_cbc_dec_128 +@@ -477,35 +608,32 @@ $code .= <<___; + L_cbc_dec_128: + # Load all 11 round keys to v1-v11 registers. + @{[aes_128_load_key $KEYP]} +- + # Load IV. + @{[vle32_v $V16, $IVP]} + +- @{[vle32_v $V24, $INP]} +- @{[vmv_v_v $V17, $V24]} +- j 2f +- + 1: ++ @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ slli $T0, $VL, 2 ++ sub $LEN32, $LEN32, $VL ++ addi $VL, $VL, -4 + @{[vle32_v $V24, $INP]} +- @{[vmv_v_v $V17, $V24]} +- addi $OUTP, $OUTP, 16 ++ @{[vslideup_vi $V16, $V24, 4]} ++ @{[vslidedown_vx $V20, $V24, $VL]} + +-2: + # AES body + @{[aes_128_decrypt]} +- + @{[vxor_vv $V24, $V24, $V16]} ++ @{[vmv_v_v $V16, $V20]} + @{[vse32_v $V24, $OUTP]} +- @{[vmv_v_v $V16, $V17]} +- +- addi $LEN, $LEN, -16 +- addi $INP, $INP, 16 + +- bnez $LEN, 1b +- +- @{[vse32_v $V16, $IVP]} ++ add $INP, $INP, $T0 ++ add $OUTP, $OUTP, $T0 + ++ bnez $LEN32, 1b ++ @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]} ++ @{[vse32_v $V20, $IVP]} + ret ++ + .size L_cbc_dec_128,.-L_cbc_dec_128 + ___ + +@@ -514,34 +642,30 @@ $code .= <<___; + L_cbc_dec_192: + # Load all 13 round keys to v1-v13 registers. + @{[aes_192_load_key $KEYP]} +- + # Load IV. + @{[vle32_v $V16, $IVP]} + +- @{[vle32_v $V24, $INP]} +- @{[vmv_v_v $V17, $V24]} +- j 2f +- + 1: ++ @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ slli $T0, $VL, 2 ++ sub $LEN32, $LEN32, $VL ++ addi $VL, $VL, -4 + @{[vle32_v $V24, $INP]} +- @{[vmv_v_v $V17, $V24]} +- addi $OUTP, $OUTP, 16 ++ @{[vslideup_vi $V16, $V24, 4]} ++ @{[vslidedown_vx $V20, $V24, $VL]} + +-2: + # AES body + @{[aes_192_decrypt]} +- + @{[vxor_vv $V24, $V24, $V16]} ++ @{[vmv_v_v $V16, $V20]} + @{[vse32_v $V24, $OUTP]} +- @{[vmv_v_v $V16, $V17]} +- +- addi $LEN, $LEN, -16 +- addi $INP, $INP, 16 +- +- bnez $LEN, 1b + +- @{[vse32_v $V16, $IVP]} ++ add $INP, $INP, $T0 ++ add $OUTP, $OUTP, $T0 + ++ bnez $LEN32, 1b ++ @{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]} ++ @{[vse32_v $V20, $IVP]} + ret + .size L_cbc_dec_192,.-L_cbc_dec_192 + ___ +@@ -551,34 +675,30 @@ $code .= <<___; + L_cbc_dec_256: + # Load all 15 round keys to v1-v15 registers. + @{[aes_256_load_key $KEYP]} +- + # Load IV. 
+ @{[vle32_v $V16, $IVP]} + +- @{[vle32_v $V24, $INP]} +- @{[vmv_v_v $V17, $V24]} +- j 2f +- + 1: ++ @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ slli $T0, $VL, 2 ++ sub $LEN32, $LEN32, $VL ++ addi $VL, $VL, -4 + @{[vle32_v $V24, $INP]} +- @{[vmv_v_v $V17, $V24]} +- addi $OUTP, $OUTP, 16 ++ @{[vslideup_vi $V16, $V24, 4]} ++ @{[vslidedown_vx $V20, $V24, $VL]} + +-2: + # AES body + @{[aes_256_decrypt]} +- + @{[vxor_vv $V24, $V24, $V16]} ++ @{[vmv_v_v $V16, $V20]} + @{[vse32_v $V24, $OUTP]} +- @{[vmv_v_v $V16, $V17]} +- +- addi $LEN, $LEN, -16 +- addi $INP, $INP, 16 +- +- bnez $LEN, 1b + +- @{[vse32_v $V16, $IVP]} ++ add $INP, $INP, $T0 ++ add $OUTP, $OUTP, $T0 + ++ bnez $LEN32, 1b ++ @{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]} ++ @{[vse32_v $V20, $IVP]} + ret + .size L_cbc_dec_256,.-L_cbc_dec_256 + ___ +@@ -627,7 +747,7 @@ L_ecb_enc_128: + @{[aes_128_load_key $KEYP]} + + 1: +- @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ @{[vsetvli $VL, $LEN32, "e32", "m8", "ta", "ma"]} + slli $T0, $VL, 2 + sub $LEN32, $LEN32, $VL + +@@ -654,7 +774,7 @@ L_ecb_enc_192: + @{[aes_192_load_key $KEYP]} + + 1: +- @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ @{[vsetvli $VL, $LEN32, "e32", "m8", "ta", "ma"]} + slli $T0, $VL, 2 + sub $LEN32, $LEN32, $VL + +@@ -681,7 +801,7 @@ L_ecb_enc_256: + @{[aes_256_load_key $KEYP]} + + 1: +- @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ @{[vsetvli $VL, $LEN32, "e32", "m8", "ta", "ma"]} + slli $T0, $VL, 2 + sub $LEN32, $LEN32, $VL + +@@ -738,7 +858,7 @@ L_ecb_dec_128: + @{[aes_128_load_key $KEYP]} + + 1: +- @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ @{[vsetvli $VL, $LEN32, "e32", "m8", "ta", "ma"]} + slli $T0, $VL, 2 + sub $LEN32, $LEN32, $VL + +@@ -765,7 +885,7 @@ L_ecb_dec_192: + @{[aes_192_load_key $KEYP]} + + 1: +- @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ @{[vsetvli $VL, $LEN32, "e32", "m8", "ta", "ma"]} + slli $T0, $VL, 2 + sub $LEN32, $LEN32, $VL + +@@ -792,7 +912,7 @@ L_ecb_dec_256: + @{[aes_256_load_key $KEYP]} + + 1: +- @{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]} ++ @{[vsetvli $VL, $LEN32, "e32", "m8", "ta", "ma"]} + slli $T0, $VL, 2 + sub $LEN32, $LEN32, $VL + diff --git a/1005-SM4-ECB-Perf-Optimiazed-with-CVE-Inst.patch b/1005-SM4-ECB-Perf-Optimiazed-with-CVE-Inst.patch new file mode 100644 index 0000000000000000000000000000000000000000..7fd34482a090c5a22d6aba1a8a51144ea0222a34 --- /dev/null +++ b/1005-SM4-ECB-Perf-Optimiazed-with-CVE-Inst.patch @@ -0,0 +1,285 @@ +diff --git a/crypto/perlasm/riscv.pm b/crypto/perlasm/riscv.pm +index 2402c052ca..c033a3feb2 100644 +--- a/crypto/perlasm/riscv.pm ++++ b/crypto/perlasm/riscv.pm +@@ -1050,4 +1050,13 @@ sub vsm3me_vv { + return ".word ".($template | ($vs2 << 20) | ($vs1 << 15 ) | ($vd << 7)); + } + ++sub vrgather_vv{ ++ # vrgather.vv vd, vs2, vs1 ++ my $template = 0b11001_00000_00000_000_00000_1010111; ++ my $vd = read_vreg shift; ++ my $vs2 = read_vreg shift; ++ my $vs1 = read_vreg shift; ++ return ".word ".($template | ($vs2 << 20) | ($vs1 << 15 ) | ($vd << 7)); ++} ++ + 1; +diff --git a/crypto/sm4/asm/sm4-riscv64-zvksed.pl b/crypto/sm4/asm/sm4-riscv64-zvksed.pl +index 0734e5fa4c..b0251bc89c 100644 +--- a/crypto/sm4/asm/sm4-riscv64-zvksed.pl ++++ b/crypto/sm4/asm/sm4-riscv64-zvksed.pl +@@ -66,6 +66,7 @@ ___ + { + my ($ukey,$keys,$fk)=("a0","a1","t0"); + my ($vukey,$vfk,$vk0,$vk1,$vk2,$vk3,$vk4,$vk5,$vk6,$vk7)=("v1","v2","v3","v4","v5","v6","v7","v8","v9","v10"); ++ + $code .= <<___; + .p2align 3 + .globl rv64i_zvksed_sm4_set_encrypt_key +@@ -285,12 +286,196 @@ 
rv64i_zvksed_sm4_decrypt: + ___ + } + ++# void rv64i_zvkned_sm4_ecb_encrypt(const unsigned char *in, unsigned char *out, ++# size_t length, const SM4_KEY *key, ++# const int enc); ++{ ++my ($a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$t4,$t5,$t6)=("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6"); ++my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7, ++ $V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15, ++ $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23, ++ $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31, ++) = map("v$_",(0..31)); ++$code .= <<___; ++.p2align 3 ++.globl rv64i_zvkned_sm4_ecb_encrypt ++.type rv64i_zvkned_sm4_ecb_encrypt,\@function ++rv64i_zvkned_sm4_ecb_encrypt: ++ @{[vsetivli__x0_4_e32_m1_tu_mu]} ++ # Order of elements was adjusted in set_encrypt_key() ++ @{[vle32_v $V2, $a3]} # rk[0:3] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V3, $a3]} # rk[4:7] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V4, $a3]} # rk[8:11] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V5, $a3]} # rk[12:15] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V6, $a3]} # rk[16:19] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V7, $a3]} # rk[20:23] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V8, $a3]} # rk[24:27] ++ addi $a3, $a3, 16 ++ @{[vle32_v $V9, $a3]} # rk[28:31] ++ li $t3, 32 ++ @{[vsetvli "zero", $t3, "e32", "m4", "ta", "ma"]} ++ la $a3, ORDER ++ @{[vle32_v $V16, $a3]} ++ srli $t3, $a2, 2 ++ ++ # Load input data ++ @{[vsetvli $t1, $t3, "e32", "m4", "ta", "ma"]} ++ sub $t5, $t3, $t1 ++ @{[vle32_v $V24, $a0]} ++ @{[vrev8_v $V24, $V24]} ++1: ++ beqz $t5, .Lenc_last_block ++ # Encrypt with all keys ++ @{[vsm4r_vs $V24, $V2]} ++ # Simultaneously load the next round ++ slli $t6, $t1, 2 ++ add $a0, $a0, $t6 ++ @{[vsetvli $t2, $t5, "e32", "m4", "ta", "ma"]} ++ @{[vle32_v $V20, $a0]} ++ @{[vsetvli $t1, $t3, "e32", "m4", "ta", "ma"]} ++ @{[vsm4r_vs $V24, $V3]} ++ @{[vsm4r_vs $V24, $V4]} ++ @{[vsm4r_vs $V24, $V5]} ++ @{[vsm4r_vs $V24, $V6]} ++ @{[vsm4r_vs $V24, $V7]} ++ @{[vsm4r_vs $V24, $V8]} ++ @{[vsm4r_vs $V24, $V9]} ++ @{[vrev8_v $V20, $V20]} ++ @{[vrev8_v $V24, $V24]} ++ @{[vrgather_vv $V28, $V24, $V16]} ++ @{[vse32_v $V28, $a1]} ++ vmv4r.v $V24, $V20 ++ add $a1, $a1, $t6 ++ mv $t3, $t5 ++ sub $t5, $t5, $t2 ++ j 1b ++.Lenc_last_block: ++ @{[vsetvli $t1, $t3, "e32", "m4", "ta", "ma"]} ++ # Encrypt with all keys ++ @{[vsm4r_vs $V24, $V2]} ++ @{[vsm4r_vs $V24, $V3]} ++ @{[vsm4r_vs $V24, $V4]} ++ @{[vsm4r_vs $V24, $V5]} ++ @{[vsm4r_vs $V24, $V6]} ++ @{[vsm4r_vs $V24, $V7]} ++ @{[vsm4r_vs $V24, $V8]} ++ @{[vsm4r_vs $V24, $V9]} ++ @{[vrev8_v $V24, $V24]} ++ @{[vrgather_vv $V28, $V24, $V16]} ++ @{[vse32_v $V28, $a1]} ++ ret ++.size rv64i_zvkned_sm4_ecb_encrypt,.-rv64i_zvkned_sm4_ecb_encrypt ++ ++___ ++} ++ ++ ++############################################################################### ++# void rv64i_zvkned_sm4_ecb_decrypt(const unsigned char *in, unsigned char *out, ++# size_t length, const SM4_KEY *key); ++{ ++my ($a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$t4,$t5,$t6)=("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6"); ++my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7, ++ $V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15, ++ $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23, ++ $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31, ++) = map("v$_",(0..31)); ++$code .= <<___; ++.p2align 3 ++.globl rv64i_zvkned_sm4_ecb_decrypt ++.type rv64i_zvkned_sm4_ecb_decrypt,\@function ++rv64i_zvkned_sm4_ecb_decrypt: ++ @{[vsetivli__x0_4_e32_m1_tu_mu]} ++ # Order of elements was adjusted in set_encrypt_key() ++ @{[vle32_v $V9, $a3]} # rk[0:3] ++ addi a3, a3, 16 ++ @{[vle32_v $V8, $a3]} # rk[4:7] ++ addi a3, a3, 16 ++ 
@{[vle32_v $V7, $a3]} # rk[8:11] ++ addi a3, a3, 16 ++ @{[vle32_v $V6, $a3]} # rk[12:15] ++ addi a3, a3, 16 ++ @{[vle32_v $V5, $a3]} # rk[16:19] ++ addi a3, a3, 16 ++ @{[vle32_v $V4, $a3]} # rk[20:23] ++ addi a3, a3, 16 ++ @{[vle32_v $V3, $a3]} # rk[24:27] ++ addi a3, a3, 16 ++ @{[vle32_v $V2, $a3]} # rk[28:31] ++ li $t3, 32 ++ @{[vsetvli "zero", $t3, "e32", "m4", "ta", "ma"]} ++ la a3, ORDER ++ @{[vle32_v $V16, $a3]} ++ srli $t3, $a2, 2 ++ ++ # Load input data ++ @{[vsetvli $t1, $t3, "e32", "m4", "ta", "ma"]} ++ sub $t5, $t3, $t1 ++ @{[vle32_v $V24, $a0]} ++ @{[vrev8_v $V24, $V24]} ++1: ++ beqz $t5, .Ldec_last_block ++ # Decrypt with all keys ++ @{[vsm4r_vs $V24, $V9]} ++ # Simultaneously load the next round ++ slli $t6, $t1, 2 ++ add $a0, $a0, $t6 ++ @{[vsetvli $t2, $t5, "e32", "m4", "ta", "ma"]} ++ @{[vle32_v $V20, $a0]} ++ @{[vsetvli $t1, $t3, "e32", "m4", "ta", "ma"]} ++ @{[vsm4r_vs $V24, $V8]} ++ @{[vsm4r_vs $V24, $V7]} ++ @{[vsm4r_vs $V24, $V6]} ++ @{[vsm4r_vs $V24, $V5]} ++ @{[vsm4r_vs $V24, $V4]} ++ @{[vsm4r_vs $V24, $V3]} ++ @{[vsm4r_vs $V24, $V2]} ++ @{[vrev8_v $V20, $V20]} ++ @{[vrev8_v $V24, $V24]} ++ @{[vrgather_vv $V28, $V24, $V16]} ++ @{[vse32_v $V28, $a1]} ++ vmv4r.v $V24, $V20 ++ add $a1, $a1, $t6 ++ mv $t3, $t5 ++ sub $t5, $t5, $t2 ++ j 1b ++.Ldec_last_block: ++ @{[vsetvli $t1, $t3, "e32", "m4", "ta", "ma"]} ++ # Decrypt with all keys ++ @{[vsm4r_vs $V24, $V9]} ++ @{[vle32_v $V20, $a0]} ++ @{[vsm4r_vs $V24, $V8]} ++ @{[vsm4r_vs $V24, $V7]} ++ @{[vsm4r_vs $V24, $V6]} ++ @{[vsm4r_vs $V24, $V5]} ++ @{[vsm4r_vs $V24, $V4]} ++ @{[vsm4r_vs $V24, $V3]} ++ @{[vsm4r_vs $V24, $V2]} ++ @{[vrev8_v $V24, $V24]} ++ @{[vrgather_vv $V28, $V24, $V16]} ++ @{[vse32_v $V28, $a1]} ++ ret ++.size rv64i_zvkned_sm4_ecb_decrypt,.-rv64i_zvkned_sm4_ecb_decrypt ++___ ++} ++ + $code .= <<___; + # Family Key (little-endian 32-bit chunks) + .p2align 3 + FK: + .word 0xA3B1BAC6, 0x56AA3350, 0x677D9197, 0xB27022DC + .size FK,.-FK ++ ++.p2align 3 ++ORDER: ++ .word 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28, 35, 34, 33, 32, 39, 38, 37, 36, 43, 42, 41, 40, 47, 46, 45, 44, 51, 50, 49, 48, 55, 54, 53, 52, 59, 58, 57, 56, 63, 62, 61, 60 ++.size ORDER,.-ORDER + ___ + + print $code; +diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h +index 3df1b4256d..4a9471185c 100644 +--- a/include/crypto/sm4_platform.h ++++ b/include/crypto/sm4_platform.h +@@ -50,6 +50,12 @@ void rv64i_zvksed_sm4_encrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); + void rv64i_zvksed_sm4_decrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); ++void rv64i_zvkned_sm4_ecb_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ const int enc); ++void rv64i_zvkned_sm4_ecb_decrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ const int enc); + # endif /* RV64 */ + # endif /* OPENSSL_CPUID_OBJ */ + +diff --git a/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc b/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc +index 763d9d09dd..2b80002bd2 100644 +--- a/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc ++++ b/providers/implementations/ciphers/cipher_sm4_hw_rv64i.inc +@@ -26,16 +26,27 @@ static int cipher_hw_rv64i_zvksed_sm4_initkey(PROV_CIPHER_CTX *ctx, + SM4_KEY *ks = &sctx->ks.ks; + + ctx->ks = ks; +- if (ctx->enc +- || (ctx->mode != EVP_CIPH_ECB_MODE +- && ctx->mode != EVP_CIPH_CBC_MODE)) { +- 
rv64i_zvksed_sm4_set_encrypt_key(key, ks);
+-        ctx->block = (block128_f) rv64i_zvksed_sm4_encrypt;
+-        ctx->stream.cbc = NULL;
++    if (ctx->mode == EVP_CIPH_ECB_MODE) {
++        if (ctx->enc) {
++            rv64i_zvksed_sm4_set_encrypt_key(key, ks);
++            ctx->block = NULL;
++            ctx->stream.ecb = (ecb128_f) rv64i_zvkned_sm4_ecb_encrypt;
++        } else {
++            rv64i_zvksed_sm4_set_decrypt_key(key, ks);
++            ctx->block = NULL;
++            ctx->stream.ecb = (ecb128_f) rv64i_zvkned_sm4_ecb_decrypt;
++        }
+     } else {
+-        rv64i_zvksed_sm4_set_decrypt_key(key, ks);
+-        ctx->block = (block128_f) rv64i_zvksed_sm4_decrypt;
+-        ctx->stream.cbc = NULL;
++        if (ctx->enc
++            || ctx->mode != EVP_CIPH_CBC_MODE) {
++            rv64i_zvksed_sm4_set_encrypt_key(key, ks);
++            ctx->block = (block128_f) rv64i_zvksed_sm4_encrypt;
++            ctx->stream.cbc = NULL;
++        } else {
++            rv64i_zvksed_sm4_set_decrypt_key(key, ks);
++            ctx->block = (block128_f) rv64i_zvksed_sm4_decrypt;
++            ctx->stream.cbc = NULL;
++        }
+     }
+ 
+     return 1;
+ }
diff --git a/openssl.spec b/openssl.spec
index a623f66e001bda55af829c0b21242a9cc2e55ca4..3bfd04ac2409480500e775794ff3c83916df830b 100644
--- a/openssl.spec
+++ b/openssl.spec
@@ -1,4 +1,4 @@
-%define anolis_release 1
+%define anolis_release 2
 %global soversion 3
 
 %define srpmhash() %{lua:
@@ -238,6 +238,9 @@ rm -f $RPM_BUILD_ROOT%{_sysconfdir}/pki/tls/{openssl.cnf.dist,ct_log_list.cnf.di
 %doc NEWS.md README.md
 
 %changelog
+* Thu Nov 27 2025 cxx194832 - 1:3.5.4-2
+- Optimize SHA512, SM3, AES ECB/CBC and SM4 ECB performance with RISC-V vector crypto instructions.
+
 * Wed Nov 26 2025 Weisson - 1:3.5.4-1
 - Update to 3.5.4 to support rva23.