diff --git a/0004-OpenBLAS-0.3.23-Add-opt-for-LoongArch64.patch b/0004-OpenBLAS-0.3.23-Add-opt-for-LoongArch64.patch deleted file mode 100644 index a13979238ba58fe187acae043f454045975ecf54..0000000000000000000000000000000000000000 --- a/0004-OpenBLAS-0.3.23-Add-opt-for-LoongArch64.patch +++ /dev/null @@ -1,18231 +0,0 @@ -From 642128b0e5f86a5bbb304350ff4826028ccd2e20 Mon Sep 17 00:00:00 2001 -From: gxw -Date: Fri, 11 Aug 2023 10:11:51 +0800 -Subject: [PATCH] OpenBLAS-0.3.23: Add opt for LoongArch64 - ---- - .github/workflows/loongarch64.yml | 110 + - Makefile.system | 10 +- - c_check | 35 + - c_check.pl | 45 + - common_loongarch64.h | 13 + - cpuid_loongarch64.c | 18 +- - kernel/loongarch64/KERNEL.LOONGSON3R5 | 31 +- - kernel/loongarch64/KERNEL.generic | 4 + - kernel/loongarch64/dgemm_kernel_16x4.S | 4058 +++++++---------- - kernel/loongarch64/dgemv_n_8_lasx.S | 554 +++ - kernel/loongarch64/dgemv_t_8_lasx.S | 481 ++ - .../loongarch64/dtrsm_kernel_LN_16x4_lasx.S | 1366 ++++++ - .../loongarch64/dtrsm_kernel_LT_16x4_lasx.S | 959 ++++ - .../loongarch64/dtrsm_kernel_RN_16x4_lasx.S | 882 ++++ - .../loongarch64/dtrsm_kernel_RT_16x4_lasx.S | 953 ++++ - kernel/loongarch64/dtrsm_kernel_macro.S | 2147 +++++++++ - kernel/loongarch64/loongarch64_asm.S | 430 ++ - kernel/loongarch64/sgemm_kernel_16x8_lasx.S | 2348 ++++++++++ - kernel/loongarch64/sgemm_ncopy_16_lasx.S | 463 ++ - kernel/loongarch64/sgemm_ncopy_8_lasx.S | 298 ++ - kernel/loongarch64/sgemm_tcopy_16_lasx.S | 526 +++ - kernel/loongarch64/sgemm_tcopy_8_lasx.S | 406 ++ - kernel/loongarch64/sgemv_n_8_lasx.S | 463 ++ - kernel/loongarch64/sgemv_t_8_lasx.S | 405 ++ - lapack/laswp/loongarch64/Makefile | 5 + - param.h | 18 +- - 26 files changed, 14611 insertions(+), 2417 deletions(-) - create mode 100644 .github/workflows/loongarch64.yml - create mode 100644 kernel/loongarch64/dgemv_n_8_lasx.S - create mode 100644 kernel/loongarch64/dgemv_t_8_lasx.S - create mode 100644 kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S - create mode 100644 kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S - create mode 100644 kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S - create mode 100644 kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S - create mode 100644 kernel/loongarch64/dtrsm_kernel_macro.S - create mode 100644 kernel/loongarch64/loongarch64_asm.S - create mode 100644 kernel/loongarch64/sgemm_kernel_16x8_lasx.S - create mode 100644 kernel/loongarch64/sgemm_ncopy_16_lasx.S - create mode 100644 kernel/loongarch64/sgemm_ncopy_8_lasx.S - create mode 100644 kernel/loongarch64/sgemm_tcopy_16_lasx.S - create mode 100644 kernel/loongarch64/sgemm_tcopy_8_lasx.S - create mode 100644 kernel/loongarch64/sgemv_n_8_lasx.S - create mode 100644 kernel/loongarch64/sgemv_t_8_lasx.S - -diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml -new file mode 100644 -index 000000000..5501e98e0 ---- /dev/null -+++ b/.github/workflows/loongarch64.yml -@@ -0,0 +1,110 @@ -+name: loongarch64 qemu test -+ -+on: [push, pull_request] -+ -+jobs: -+ TEST: -+ runs-on: ubuntu-latest -+ strategy: -+ fail-fast: false -+ matrix: -+ include: -+ - target: LOONGSONGENERIC -+ triple: loongarch64-unknown-linux-gnu -+ opts: NO_SHARED=1 TARGET=LOONGSONGENERIC -+ - target: LOONGSON3R5 -+ triple: loongarch64-unknown-linux-gnu -+ opts: NO_SHARED=1 TARGET=LOONGSON3R5 -+ - target: LOONGSON2K1000 -+ triple: loongarch64-unknown-linux-gnu -+ opts: NO_SHARED=1 TARGET=LOONGSON2K1000 -+ -+ steps: -+ - name: Checkout repository -+ uses: actions/checkout@v3 -+ -+ - name: Install APT deps -+ run: | 
-+ sudo add-apt-repository ppa:savoury1/virtualisation -+ sudo apt-get update -+ sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ -+ qemu-user-static -+ -+ - name: Download and install loongarch64-toolchain -+ run: | -+ wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -+ tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt -+ -+ - name: Set env -+ run: | -+ echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV -+ echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV -+ -+ - name: Compilation cache -+ uses: actions/cache@v3 -+ with: -+ path: ~/.ccache -+ key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} -+ restore-keys: | -+ ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} -+ ccache-${{ runner.os }}-${{ matrix.target }} -+ -+ - name: Configure ccache -+ run: | -+ test -d ~/.ccache || mkdir -p ~/.ccache -+ echo "max_size = 300M" > ~/.ccache/ccache.conf -+ echo "compression = true" >> ~/.ccache/ccache.conf -+ ccache -s -+ -+ - name: Disable utest dsdot:dsdot_n_1 -+ run: | -+ echo -n > utest/test_dsdot.c -+ echo "Due to the qemu versions 7.2 causing utest cases to fail," -+ echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." -+ -+ - name: Build OpenBLAS -+ run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) -+ -+ - name: Test -+ run: | -+ qemu-loongarch64-static ./utest/openblas_utest -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3 -+ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1 -+ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1 -+ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1 -+ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1 -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1 -+ rm -f ./test/?BLAT2.SUMM -+ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat -+ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat -+ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat -+ 
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat -+ rm -f ./test/?BLAT2.SUMM -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat -+ rm -f ./test/?BLAT3.SUMM -+ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat -+ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat -+ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat -+ OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat -+ rm -f ./test/?BLAT3.SUMM -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat -+ OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat -diff --git a/Makefile.system b/Makefile.system -index 343b94bb3..1eabff27d 100644 ---- a/Makefile.system -+++ b/Makefile.system -@@ -932,8 +932,12 @@ BINARY_DEFINED = 1 - endif - - ifeq ($(ARCH), loongarch64) --CCOMMON_OPT += -march=loongarch64 -mabi=lp64 --FCOMMON_OPT += -march=loongarch64 -mabi=lp64 -+LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d) -+ifneq ($(LA64_ABI), lp64d) -+LA64_ABI=lp64 -+endif -+CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) -+FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI) - endif - - endif -@@ -1763,6 +1767,8 @@ export TARGET_CORE - export NO_AVX512 - export NO_AVX2 - export BUILD_BFLOAT16 -+export NO_LSX -+export NO_LASX - - export SBGEMM_UNROLL_M - export SBGEMM_UNROLL_N -diff --git a/c_check b/c_check -index e8f90e18a..5a7163a63 100755 ---- a/c_check -+++ b/c_check -@@ -181,6 +181,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then - rm -rf "$tmpd" - fi - -+no_lsx=0 -+no_lasx=0 -+if [ "$architecture" = "loongarch64" ]; then -+ tmpd="$(mktemp -d)" -+ tmplsx="$tmpd/lsx.c" -+ codelsx='"vadd.b $vr0, $vr0, $vr0"' -+ lsx_flags='-march=loongarch64 -mlsx' -+ printf "#include \n\n" >> "$tmplsx" -+ printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx" -+ args="$lsx_flags -o $tmplsx.o $tmplsx" -+ { -+ $compiler_name $flags $args >/dev/null 2>&1 -+ } || { -+ no_lsx=1 -+ } -+ -+ tmplasx="$tmpd/lasx.c" -+ codelasx='"xvadd.b $xr0, $xr0, $xr0"' -+ lasx_flags='-march=loongarch64 -mlasx' -+ printf "#include \n\n" >> "$tmplasx" -+ printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx" -+ args="$lasx_flags -o $tmplasx.o $tmplasx" -+ { -+ $compiler_name $flags $args >/dev/null 2>&1 -+ } || { -+ no_lasx=1 -+ } -+ -+ rm -rf "$tmpd" -+fi -+ - case "$data" in - *ARCH_X86_64*) architecture=x86_64 ;; - *ARCH_X86*) architecture=x86 ;; -@@ -395,6 +426,8 @@ done - [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" - [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" - [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" -+ [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" -+ [ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n" - } >> "$makefile" - - os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ ` -@@ -410,6 +443,8 @@ 
compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' ` - [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu" - [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n" - [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n" -+ [ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n" -+ [ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n" - } >> "$config" - - -diff --git a/c_check.pl b/c_check.pl -index 6ce28e11b..7a860a211 100644 ---- a/c_check.pl -+++ b/c_check.pl -@@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { - } - } - -+$no_lsx = 0; -+$no_lasx = 0; -+if (($architecture eq "loongarch64")) { -+ eval "use File::Temp qw(tempfile)"; -+ if ($@){ -+ warn "could not load PERL module File::Temp, so could not check LSX and LASX capatibility"; -+ } else { -+ $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); -+ $codelsx = '"vadd.b $vr0, $vr0, $vr0"'; -+ $lsx_flags = "-march=loongarch64 -mlsx"; -+ print $tmplsx "#include \n\n"; -+ print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n"; -+ -+ $args = "$lsx_flags -o $tmplsx.o $tmplsx"; -+ my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); -+ system(@cmd) == 0; -+ if ($? != 0) { -+ $no_lsx = 1; -+ } else { -+ $no_lsx = 0; -+ } -+ unlink("$tmplsx.o"); -+ -+ $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); -+ $codelasx = '"xvadd.b $xr0, $xr0, $xr0"'; -+ $lasx_flags = "-march=loongarch64 -mlasx"; -+ print $tmplasx "#include \n\n"; -+ print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n"; -+ -+ $args = "$lasx_flags -o $tmplasx.o $tmplasx"; -+ my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); -+ system(@cmd) == 0; -+ if ($? != 0) { -+ $no_lasx = 1; -+ } else { -+ $no_lasx = 0; -+ } -+ unlink("$tmplasx.o"); -+ } -+} -+ - $architecture = x86 if ($data =~ /ARCH_X86/); - $architecture = x86_64 if ($data =~ /ARCH_X86_64/); - $architecture = e2k if ($data =~ /ARCH_E2K/); -@@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1; - print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; - print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; - print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; -+print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1; -+print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1; - - $os =~ tr/[a-z]/[A-Z]/; - $architecture =~ tr/[a-z]/[A-Z]/; -@@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; - print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; - print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; - print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; -+print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1; -+print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1; - - - if ($os eq "LINUX") { -diff --git a/common_loongarch64.h b/common_loongarch64.h -index e15539b5f..ce1fcf091 100644 ---- a/common_loongarch64.h -+++ b/common_loongarch64.h -@@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){ - return x / y; - } - -+#ifndef NO_AFFINITY -+static inline int WhereAmI(void){ -+ int ret = 0, counter = 0; -+ __asm__ volatile ( -+ "rdtimel.w %[counter], %[id]" -+ : [id]"=r"(ret), [counter]"=r"(counter) -+ : -+ : "memory" -+ ); -+ return ret; -+} -+#endif -+ - #ifdef DOUBLE - #define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") - #else -diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c -index ca07c7ffb..7c389db27 100644 ---- a/cpuid_loongarch64.c -+++ b/cpuid_loongarch64.c -@@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN 
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - **********************************************************************************/ - - #include -+#include - - /* If LASX extension instructions supported, - * using core LOONGSON3R5 -@@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #define CPU_LOONGSON3R5 1 - #define CPU_LOONGSON2K1000 2 - --#define LOONGARCH_CFG2 0x02 --#define LOONGARCH_LASX 1<<7 --#define LOONGARCH_LSX 1<<6 -+#define LA_HWCAP_LSX (1<<4) -+#define LA_HWCAP_LASX (1<<5) - - static char *cpuname[] = { - "LOONGSONGENERIC", -@@ -64,17 +64,11 @@ static char *cpuname_lower[] = { - - int detect(void) { - #ifdef __linux -- uint32_t reg = 0; -+ int flag = (int)getauxval(AT_HWCAP); - -- __asm__ volatile ( -- "cpucfg %0, %1 \n\t" -- : "+&r"(reg) -- : "r"(LOONGARCH_CFG2) -- ); -- -- if (reg & LOONGARCH_LASX) -+ if (flag & LA_HWCAP_LASX) - return CPU_LOONGSON3R5; -- else if (reg & LOONGARCH_LSX) -+ else if (flag & LA_HWCAP_LSX) - return CPU_LOONGSON2K1000; - else - return CPU_GENERIC; -diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 -index cda359040..011e8b89e 100644 ---- a/kernel/loongarch64/KERNEL.LOONGSON3R5 -+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 -@@ -1,3 +1,4 @@ -+ifndef NO_LASX - DGEMMKERNEL = dgemm_kernel_16x4.S - DGEMMINCOPY = dgemm_ncopy_16.S - DGEMMITCOPY = dgemm_tcopy_16.S -@@ -8,7 +9,29 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) - DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) - DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - --DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c --DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c --DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c --DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -+DGEMVNKERNEL = dgemv_n_8_lasx.S -+DGEMVTKERNEL = dgemv_t_8_lasx.S -+ -+SGEMMKERNEL = sgemm_kernel_16x8_lasx.S -+SGEMMINCOPY = sgemm_ncopy_16_lasx.S -+SGEMMITCOPY = sgemm_tcopy_16_lasx.S -+SGEMMONCOPY = sgemm_ncopy_8_lasx.S -+SGEMMOTCOPY = sgemm_tcopy_8_lasx.S -+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -+ -+SGEMVNKERNEL = sgemv_n_8_lasx.S -+SGEMVTKERNEL = sgemv_t_8_lasx.S -+ -+DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S -+DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S -+DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S -+DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S -+endif -+ -+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic -index b772a6f82..213add9ee 100644 ---- a/kernel/loongarch64/KERNEL.generic -+++ b/kernel/loongarch64/KERNEL.generic -@@ -132,12 +132,16 @@ CSWAPKERNEL = ../arm/zswap.c - ZSWAPKERNEL = ../arm/zswap.c - - SGEMVNKERNEL = ../arm/gemv_n.c -+ifndef DGEMVNKERNEL - DGEMVNKERNEL = ../arm/gemv_n.c -+endif - CGEMVNKERNEL = ../arm/zgemv_n.c - ZGEMVNKERNEL = ../arm/zgemv_n.c - - SGEMVTKERNEL = ../arm/gemv_t.c -+ifndef DGEMVTKERNEL - DGEMVTKERNEL = ../arm/gemv_t.c -+endif - CGEMVTKERNEL = ../arm/zgemv_t.c - ZGEMVTKERNEL = ../arm/zgemv_t.c - -diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S -index 13faa977e..f8e26fda2 100644 ---- a/kernel/loongarch64/dgemm_kernel_16x4.S -+++ b/kernel/loongarch64/dgemm_kernel_16x4.S -@@ -28,6 +28,31 @@ 
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - #include "common.h" - -+/********************************************************************* -+* 2023/06/28 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+* -+* -+* 2023/06/28 guxiwei -+* Parameter: -+* DGEMM_DEFAULT_UNROLL_N 4 -+* DGEMM_DEFAULT_UNROLL_M 16 -+* DGEMM_DEFAULT_P 32 -+* DGEMM_DEFAULT_Q 152 -+* DGEMM_DEFAULT_R 858 -+* A_PR1 1024 -+* B_PR1 256 -+* -+* -+* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000: -+* 1 thread: 36.0 GFLOPS -+* 2 threads: 71.6 GFLOPS -+* 3 threads: 101.5 GFLOPS -+* 4 threads: 132.8 GFLOPS -+*********************************************************************/ -+ - /* Function parameters */ - #define M $r4 // param 1: bm - #define N $r5 // param 2: bn -@@ -68,1290 +93,1331 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #define U4 $xr4 - #define U5 $xr5 - #define U6 $xr6 --#define D0 $xr7 --#define D1 $xr8 --#define D2 $xr9 --#define D3 $xr10 --#define D4 $xr11 --#define D5 $xr12 --#define D6 $xr13 --#define D7 $xr14 --#define D8 $xr15 --#define D9 $xr16 --#define D10 $xr17 --#define D11 $xr18 --#define D12 $xr19 --#define D13 $xr20 --#define D14 $xr21 --#define D15 $xr22 --#define VALPHA $xr23 -+#define U7 $xr7 -+#define U8 $xr8 -+#define U9 $xr9 -+#define U10 $xr10 -+#define U11 $xr11 -+#define U12 $xr12 -+#define U13 $xr13 -+#define U14 $xr14 -+#define U15 $xr15 -+#define D0 $xr16 -+#define D1 $xr17 -+#define D2 $xr18 -+#define D3 $xr19 -+#define D4 $xr20 -+#define D5 $xr21 -+#define D6 $xr22 -+#define D7 $xr23 -+#define D8 $xr24 -+#define D9 $xr25 -+#define D10 $xr26 -+#define D11 $xr27 -+#define D12 $xr28 -+#define D13 $xr29 -+#define D14 $xr30 -+#define D15 $xr31 -+#define VALPHA $xr15 - - /* Prefetch interval */ --#define A_PRE 0x200 -+#define A_PRE 0x400 - #define B_PRE 0x100 - -- PROLOGUE -- -- addi.d $sp, $sp, -56 -- /* Store regs */ -- SDARG $r23, $sp, 0 -- SDARG $r24, $sp, 8 -- SDARG $r25, $sp, 16 -- SDARG $r26, $sp, 24 -- SDARG $r27, $sp, 32 -- ST $f23, $sp, 40 -- ST ALPHA, $sp, 48 -- -- /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ -- xvld VALPHA, $sp, 48 -- xvreplve0.d VALPHA, VALPHA -- --#if defined (TRMMKERNEL) && !defined(LEFT) -- sub.d OFF, ZERO, OFFSET --#else -- xor OFF, OFF, OFF --#endif -- -- /* if (!(N >> 2)) goto L_N3 */ -- srai.d J, N, 2 /* J = bn >> 2 */ -- andi N, N, 0x03 -- beq ZERO, J, .L_N3 -- --.L_J1: /* J-- && This loop include Condition 1 */ -- --/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! 
************************* --* dgemm_core_16x4 */ -- move C0, C -- move A0, A -- slli.d T0, LDC, 3 -- add.d C1, C0, T0 -- addi.d J, J, -1 /* J-- */ -- add.d C2, C1, T0 -- add.d C3, C2, T0 -- --#if defined(TRMMKERNEL) && defined(LEFT) -- move OFF, OFFSET --#endif -- -- /* if (!(M >> 4)) goto L_M8 */ -- srai.d I, M, 4 /* I = bm >> 4 */ -- beq ZERO, I, .L_M8 -- --.L_I1: /* I-- */ --#if defined(TRMMKERNEL) --#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -- move B0, B --#else -- slli.d T0, OFF, 0x07 -- add.d A0, A0, T0 -- slli.d T0, OFF, 0x05 -- add.d B0, B, T0 --#endif -- --#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -- sub.d L, K, OFF --#elif defined(LEFT) -- /* number of values in A */ -- addi.d L, OFF, 16 --#else -- /* number of values in B */ -- addi.d L, OFF, 4 --#endif --#else // #if !defined(TRMMKERNEL) -- move B0, B -- move L, K /* L = bk */ --#endif -- /* Calculate the first set of D0~D15, -- * avoidig set 0 operation -- * Load 16 * 64 from A0 -- * U0 = {a3, a2, a1, a0} -- * U1 = {a7, a6, a5, a4} -- * U2 = {a11, a10, a9, a8} -- * U3 = {a15, a14, a13, a12} -- */ -+.macro KERNEL2x16x4 - xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- preld 0, C0, 0x00 -- /* line 1 */ -- xvfmul.d D0, U0, U4 -- xvfmul.d D1, U1, U4 -- preld 0, C0, 0x40 -- xvfmul.d D2, U2, U4 -- xvfmul.d D3, U3, U4 -- -- xvldrepl.d U4, B0, 0x08 -- preld 0, C1, 0x00 -- /* line 2 */ -- xvfmul.d D4, U0, U4 -- xvfmul.d D5, U1, U4 -- preld 0, C1, 0x40 -- xvfmul.d D6, U2, U4 -- xvfmul.d D7, U3, U4 -- -- xvldrepl.d U4, B0, 0x10 -- preld 0, C2, 0x00 -- /* line 3 */ -- xvfmul.d D8, U0, U4 -- xvfmul.d D9, U1, U4 -- preld 0, C2, 0x40 -- xvfmul.d D10, U2, U4 -- xvfmul.d D11, U3, U4 -- -- xvldrepl.d U4, B0, 0x18 -- preld 0, C3, 0x00 -- /* line 4 */ -- xvfmul.d D12, U0, U4 -- xvfmul.d D13, U1, U4 -- preld 0, C3, 0x40 -- xvfmul.d D14, U2, U4 -- xvfmul.d D15, U3, U4 -- -- /* Add stride for A0 and B0 */ -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x20 -- /* Reduce L */ -- addi.d L, L, -1 -- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -- /* if (TL < 1) goto L_L7 */ -- beq ZERO,TL, .L_L7 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 - -- /* Calculate 8 sets of D0~D15 */ --.L_TL1: /* TL-- */ -- /***8-1***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 - xvld U1, A0, 0x20 -+ xvfmadd.d D2, U10, U12, D2 -+ xvfmadd.d D3, U11, U12, D3 -+ - xvld U2, A0, 0x40 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 -+ - xvld U3, A0, 0x60 -+ xvfmadd.d D6, U10, U13, D6 -+ xvfmadd.d D7, U11, U13, D7 - -- /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -+ xvfmadd.d D8, U8, U14, D8 -+ xvfmadd.d D9, U9, U14, D9 -+ - preld 0, B0, B_PRE -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D10, U10, U14, D10 -+ xvfmadd.d D11, U11, U14, D11 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D12, U8, U15, D12 -+ xvfmadd.d D13, U9, U15, D13 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -- xvfmadd.d D10, U2, U4, D10 -- xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 -- -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -- xvfmadd.d D14, U2, U4, D14 -- xvfmadd.d D15, U3, U4, D15 -+ xvldrepl.d U7, 
B0, 0x18 -+ xvfmadd.d D14, U10, U15, D14 -+ xvfmadd.d D15, U11, U15, D15 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - -- /***8-2***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- /* Cumulative D0~D15 */ -- xvldrepl.d U4, B0, 0x00 -+ xvld U8, A0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 -+ -+ xvld U9, A0, 0x20 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 -+ -+ xvld U10, A0, 0x40 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ -+ xvld U11, A0, 0x60 -+ xvfmadd.d D6, U2, U5, D6 -+ xvfmadd.d D7, U3, U5, D7 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D9, U1, U6, D9 -+ - preld 0, B0, B_PRE -+ xvldrepl.d U13, B0, 0x08 -+ xvfmadd.d D10, U2, U6, D10 -+ xvfmadd.d D11, U3, U6, D11 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE -+ xvldrepl.d U14, B0, 0x10 -+ xvfmadd.d D12, U0, U7, D12 -+ xvfmadd.d D13, U1, U7, D13 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -- xvfmadd.d D10, U2, U4, D10 -- xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 -- -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -- xvfmadd.d D14, U2, U4, D14 -- xvfmadd.d D15, U3, U4, D15 -+ xvldrepl.d U15, B0, 0x18 -+ xvfmadd.d D14, U2, U7, D14 -+ xvfmadd.d D15, U3, U7, D15 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 -+.endm - -- /***8-3***/ -- /* Load 16 * 64 from A0 */ -+.macro KERNEL2x16x4_END - xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ - xvld U1, A0, 0x20 -+ xvfmadd.d D2, U10, U12, D2 -+ xvfmadd.d D3, U11, U12, D3 -+ - xvld U2, A0, 0x40 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 -+ - xvld U3, A0, 0x60 -+ xvfmadd.d D6, U10, U13, D6 -+ xvfmadd.d D7, U11, U13, D7 - -- /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -+ xvfmadd.d D8, U8, U14, D8 -+ xvfmadd.d D9, U9, U14, D9 -+ - preld 0, B0, B_PRE -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D10, U10, U14, D10 -+ xvfmadd.d D11, U11, U14, D11 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D12, U8, U15, D12 -+ xvfmadd.d D13, U9, U15, D13 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -- xvfmadd.d D10, U2, U4, D10 -- xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 -- -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -- xvfmadd.d D14, U2, U4, D14 -- xvfmadd.d D15, U3, U4, D15 -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D14, U10, U15, D14 -+ xvfmadd.d D15, U11, U15, D15 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - -- /***8-4***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- /* Cumulative D0~D15 */ -- xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 -+ - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 -+ -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ -+ xvfmadd.d D6, U2, U5, D6 -+ xvfmadd.d D7, U3, U5, D7 -+ -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D9, U1, U6, D9 -+ - preld 0, B0, B_PRE -+ xvfmadd.d D10, U2, U6, D10 -+ xvfmadd.d D11, U3, U6, D11 - -- 
xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE -+ xvfmadd.d D12, U0, U7, D12 -+ xvfmadd.d D13, U1, U7, D13 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -- xvfmadd.d D10, U2, U4, D10 -- xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 -+ xvfmadd.d D14, U2, U7, D14 -+ xvfmadd.d D15, U3, U7, D15 -+.endm - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -- xvfmadd.d D14, U2, U4, D14 -- xvfmadd.d D15, U3, U4, D15 -+.macro KERNEL8x16x4 -+.rept 4 -+ KERNEL2x16x4 -+.endr -+.endm - -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x20 -+.macro KERNEL8x16x4_END -+.rept 3 -+ KERNEL2x16x4 -+.endr -+ KERNEL2x16x4_END -+.endm - -- /***8-5***/ -- /* Load 16 * 64 from A0 */ -+.macro KERNEL2x8x4 - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 - -- /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- preld 0, B0, B_PRE -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -- preld 0, A0, A_PRE -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -- xvfmadd.d D10, U2, U4, D10 -- xvfmadd.d D11, U3, U4, D11 -- preld 0, A0, A_PRE + 0x40 -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D8, U8, U14, D8 -+ xvfmadd.d D9, U9, U14, D9 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -- xvfmadd.d D14, U2, U4, D14 -- xvfmadd.d D15, U3, U4, D15 -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D12, U8, U15, D12 -+ xvfmadd.d D13, U9, U15, D13 - -- addi.d A0, A0, 0x80 -+ addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - -- /***8-6***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 - -- /* Cumulative D0~D15 */ -- xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U12, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- preld 0, B0, B_PRE - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -- preld 0, A0, A_PRE -+ xvldrepl.d U13, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -- xvfmadd.d D10, U2, U4, D10 -- xvfmadd.d D11, U3, U4, D11 -- preld 0, A0, A_PRE + 0x40 -+ xvldrepl.d U14, B0, 0x10 -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D9, U1, U6, D9 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -- xvfmadd.d D14, U2, U4, D14 -- xvfmadd.d D15, U3, U4, D15 -+ xvldrepl.d U15, B0, 0x18 -+ xvfmadd.d D12, U0, U7, D12 -+ xvfmadd.d D13, U1, U7, D13 - -- addi.d A0, A0, 0x80 -+ addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 -+.endm - -- /***8-7***/ -- /* Load 16 * 64 from A0 */ -+.macro KERNEL2x8x4_END - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 - -- /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U8, U13, D4 
-+ xvfmadd.d D5, U9, U13, D5 -+ -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D8, U8, U14, D8 -+ xvfmadd.d D9, U9, U14, D9 -+ -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D12, U8, U15, D12 -+ xvfmadd.d D13, U9, U15, D13 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+ - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- preld 0, B0, B_PRE - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -- preld 0, A0, A_PRE -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -- xvfmadd.d D10, U2, U4, D10 -- xvfmadd.d D11, U3, U4, D11 -- preld 0, A0, A_PRE + 0x40 -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D9, U1, U6, D9 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -- xvfmadd.d D14, U2, U4, D14 -- xvfmadd.d D15, U3, U4, D15 -+ xvfmadd.d D12, U0, U7, D12 -+ xvfmadd.d D13, U1, U7, D13 -+.endm - -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x20 -+.macro KERNEL8x8x4 -+.rept 4 -+ KERNEL2x8x4 -+.endr -+.endm - -- /***8-8***/ -- /* Load 16 * 64 from A0 */ -+.macro KERNEL8x8x4_END -+.rept 3 -+ KERNEL2x8x4 -+.endr -+ KERNEL2x8x4_END -+.endm -+ -+.macro KERNEL2x4x4 - xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 - -- /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- preld 0, B0, B_PRE -+ xvfmadd.d D0, U8, U12, D0 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -- preld 0, A0, A_PRE -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U8, U13, D4 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -- xvfmadd.d D10, U2, U4, D10 -- xvfmadd.d D11, U3, U4, D11 -- preld 0, A0, A_PRE + 0x40 -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D8, U8, U14, D8 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -- xvfmadd.d D14, U2, U4, D14 -- xvfmadd.d D15, U3, U4, D15 -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D12, U8, U15, D12 - -- addi.d A0, A0, 0x80 -+ addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - -- addi.d TL, TL, -1 /* TL-- */ -- blt ZERO,TL, .L_TL1 -+ xvld U8, A0, 0x00 - -- /* Maybe we need calculate the last -- * 7 sets of D0~D15? 
-- */ --.L_L7: -- /* if (!(L & 7)) goto L_L0 */ -- andi TL, L, 7 -- beq TL, ZERO,.L_L0 -+ xvldrepl.d U12, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 - --.L_L71: -- /* Load 16 * 64 from A0 */ -+ xvldrepl.d U13, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ -+ xvldrepl.d U14, B0, 0x10 -+ xvfmadd.d D8, U0, U6, D8 -+ -+ xvldrepl.d U15, B0, 0x18 -+ xvfmadd.d D12, U0, U7, D12 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 -+.endm -+ -+.macro KERNEL2x4x4_END - xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 - -- /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -+ xvfmadd.d D0, U8, U12, D0 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U8, U13, D4 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -- xvfmadd.d D10, U2, U4, D10 -- xvfmadd.d D11, U3, U4, D11 -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D8, U8, U14, D8 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -- xvfmadd.d D14, U2, U4, D14 -- xvfmadd.d D15, U3, U4, D15 -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D12, U8, U15, D12 - -- /* Add stride for A0, B0 */ -- addi.d A0, A0, 0x80 -+ addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - -- addi.d TL, TL, -1 -- blt ZERO,TL, .L_L71 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D12, U0, U7, D12 -+.endm - --.L_L0: --#if defined(TRMMKERNEL) -- xvfmul.d D0, D0, VALPHA -- xvfmul.d D1, D1, VALPHA -- xvfmul.d D2, D2, VALPHA -- xvfmul.d D3, D3, VALPHA -- xvfmul.d D4, D4, VALPHA -- xvfmul.d D5, D5, VALPHA -- xvfmul.d D6, D6, VALPHA -- xvfmul.d D7, D7, VALPHA -- xvfmul.d D8, D8, VALPHA -- xvfmul.d D9, D9, VALPHA -- xvfmul.d D10, D10, VALPHA -- xvfmul.d D11, D11, VALPHA -- xvfmul.d D12, D12, VALPHA -- xvfmul.d D13, D13, VALPHA -- xvfmul.d D14, D14, VALPHA -- xvfmul.d D15, D15, VALPHA --#else -- /* Load C0 */ -- xvld U0, C0, 0x00 -- xvld U1, C0, 0x20 -- xvld U2, C0, 0x40 -- xvld U3, C0, 0x60 -- xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -- xvfmadd.d D1, D1, VALPHA, U1 -- xvfmadd.d D2, D2, VALPHA, U2 -- xvfmadd.d D3, D3, VALPHA, U3 -+.macro KERNEL8x4x4 -+.rept 4 -+ KERNEL2x4x4 -+.endr -+.endm - -- /* Load C1 */ -- xvld U0, C1, 0x00 -- xvld U1, C1, 0x20 -- xvld U2, C1, 0x40 -- xvld U3, C1, 0x60 -- xvfmadd.d D4, D4, VALPHA, U0 -- xvfmadd.d D5, D5, VALPHA, U1 -- xvfmadd.d D6, D6, VALPHA, U2 -- xvfmadd.d D7, D7, VALPHA, U3 -+.macro KERNEL8x4x4_END -+.rept 3 -+ KERNEL2x4x4 -+.endr -+ KERNEL2x4x4_END -+.endm - -- /* Load C2 */ -- xvld U0, C2, 0x00 -- xvld U1, C2, 0x20 -- xvld U2, C2, 0x40 -- xvld U3, C2, 0x60 -- xvfmadd.d D8, D8, VALPHA, U0 -- xvfmadd.d D9, D9, VALPHA, U1 -- xvfmadd.d D10, D10, VALPHA, U2 -- xvfmadd.d D11, D11, VALPHA, U3 -+.macro KERNEL2x2x4 -+ xvldrepl.d U0, A0, 0x00 -+ xvldrepl.d U1, A0, 0x08 - -- /* Load C3 */ -- xvld U0, C3, 0x00 -- xvld U1, C3, 0x20 -- xvld U2, C3, 0x40 -- xvld U3, C3, 0x60 -- xvfmadd.d D12, D12, VALPHA, U0 -- xvfmadd.d D13, D13, VALPHA, U1 -- xvfmadd.d D14, D14, VALPHA, U2 -- xvfmadd.d D15, D15, VALPHA, U3 --#endif // #if defined(TRMMKERNEL) -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 - -- /* Store C0 */ -- xvst D0, C0, 0x00 -- xvst D1, C0, 0x20 -- xvst D2, C0, 0x40 -- xvst D3, C0, 0x60 -- /* Store C1 */ -- xvst D4, C1, 0x00 -- xvst D5, C1, 0x20 -- xvst D6, C1, 0x40 
-- xvst D7, C1, 0x60 -- /* Store C2 */ -- xvst D8, C2, 0x00 -- xvst D9, C2, 0x20 -- xvst D10, C2, 0x40 -- xvst D11, C2, 0x60 -- /* Store C3 */ -- xvst D12, C3, 0x00 -- xvst D13, C3, 0x20 -- xvst D14, C3, 0x40 -- xvst D15, C3, 0x60 -+ xvld U4, B0, 0x00 -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 - -- /* Add stride for C */ -- addi.d C0, C0, 0x80 -- addi.d C1, C1, 0x80 -- addi.d C2, C2, 0x80 -- addi.d C3, C3, 0x80 -+ xvldrepl.d U8, A0, 0x00 -+ xvldrepl.d U9, A0, 0x08 - --#if defined(TRMMKERNEL) --#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -- sub.d L, K, OFF --#ifdef LEFT -- /* number of values in A */ -- addi.d L, L, -16 --#else -- /* number of values in B */ -- addi.d L, L, -4 --#endif -- slli.d T0, L, 0x07 -- add.d A0, A0, T0 -- slli.d T0, L, 0x05 -- add.d B0, B0, T0 --#endif -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 - --#ifdef LEFT -- addi.d OFF, OFF, 0x10 --#endif --#endif // #if defined(TRMMKERNEL) -+ xvld U12, B0, 0x00 -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+.endm - -- addi.d I, I, -1 /* I-- */ -- blt ZERO,I, .L_I1 -+.macro KERNEL2x2x4_END -+ xvldrepl.d U0, A0, 0x00 -+ xvldrepl.d U1, A0, 0x08 - --.L_M8: -- /* We have done M & 16, considering M=8/4/2/1 */ -- andi I, M, 15 -- beq ZERO,I, .L_M0 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 - -- andi I, M, 8 -- beq ZERO,I, .L_M4 -+ xvld U4, B0, 0x00 -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 - --#if defined(TRMMKERNEL) --#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -- move B0, B --#else -- slli.d T0, OFF, 0x06 -- add.d A0, A0, T0 -- slli.d T0, OFF, 0x05 -- add.d B0, B, T0 --#endif -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+.endm - --#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -- sub.d L, K, OFF --#elif defined(LEFT) -- /* number of values in A */ -- addi.d L, OFF, 8 --#else -- /* number of values in B */ -- addi.d L, OFF, 4 --#endif --#else // #if !defined(TRMMKERNEL) -- move B0, B -- move L, K /* L = bk */ --#endif // #if defined(TRMMKERNEL) -+.macro KERNEL8x2x4 -+.rept 4 -+ KERNEL2x2x4 -+.endr -+.endm - -- /* Load 8 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -+.macro KERNEL8x2x4_END -+.rept 3 -+ KERNEL2x2x4 -+.endr -+ KERNEL2x2x4_END -+.endm - -- xvldrepl.d U4, B0, 0x00 -- /* line 1 */ -- xvfmul.d D0, U0, U4 -- xvfmul.d D1, U1, U4 -+.macro KERNEL2x1x4 -+ xvldrepl.d U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvld U4, B0, 0x00 - -- xvldrepl.d U4, B0, 0x08 -- /* line 2 */ -- xvfmul.d D4, U0, U4 -- xvfmul.d D5, U1, U4 -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 - -- xvldrepl.d U4, B0, 0x10 -- /* line 3 */ -- xvfmul.d D8, U0, U4 -- xvfmul.d D9, U1, U4 -+ xvldrepl.d U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvld U12, B0, 0x00 - -- xvldrepl.d U4, B0, 0x18 -- /* line 4 */ -- xvfmul.d D12, U0, U4 -- xvfmul.d D13, U1, U4 -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+.endm - -- /* Add stride for A0 and B0 */ -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x20 -- /* Reduce L */ -- addi.d L, L, -1 -- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -- /* if (TL < 1) goto L_M8_L7 */ -- beq ZERO,TL, .L_M8_L7 -+.macro KERNEL2x1x4_END -+ xvldrepl.d U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvld U4, B0, 0x00 - --.L_M8_TL1: /* TL-- */ -- /***8-1***/ -- /* Load 16 * 64 from A0 */ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ -+ xvfmadd.d D0, U0, U4, D0 -+.endm -+ -+.macro KERNEL8x1x4 -+.rept 4 -+ KERNEL2x1x4 -+.endr -+.endm -+ -+.macro KERNEL8x1x4_END -+.rept 3 -+ KERNEL2x1x4 -+.endr -+ 
KERNEL2x1x4_END -+.endm -+ -+.macro KERNEL2x16x2 - xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ - xvld U1, A0, 0x20 -+ xvfmadd.d D2, U10, U12, D2 -+ xvfmadd.d D3, U11, U12, D3 -+ -+ xvld U2, A0, 0x40 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 -+ -+ xvld U3, A0, 0x60 -+ xvfmadd.d D6, U10, U13, D6 -+ xvfmadd.d D7, U11, U13, D7 - - xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ -+ xvld U8, A0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -+ xvld U9, A0, 0x20 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -+ xvld U10, A0, 0x40 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -+ xvld U11, A0, 0x60 -+ xvfmadd.d D6, U2, U5, D6 -+ xvfmadd.d D7, U3, U5, D7 - -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x20 -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+.endm - -- /***8-2***/ -+.macro KERNEL2x16x2_END - xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ - xvld U1, A0, 0x20 -+ xvfmadd.d D2, U10, U12, D2 -+ xvfmadd.d D3, U11, U12, D3 -+ -+ xvld U2, A0, 0x40 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 -+ -+ xvld U3, A0, 0x60 -+ xvfmadd.d D6, U10, U13, D6 -+ xvfmadd.d D7, U11, U13, D7 - - xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -+ xvfmadd.d D6, U2, U5, D6 -+ xvfmadd.d D7, U3, U5, D7 -+.endm - -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x20 -+.macro KERNEL8x16x2 -+.rept 4 -+ KERNEL2x16x2 -+.endr -+.endm -+ -+.macro KERNEL8x16x2_END -+.rept 3 -+ KERNEL2x16x2 -+.endr -+ KERNEL2x16x2_END -+.endm - -- /***8-3***/ -+.macro KERNEL2x8x2 - xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ - xvld U1, A0, 0x20 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 - - xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ -+ xvld U8, A0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -+ xvld U9, A0, 0x20 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 - - addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x20 -+ addi.d B0, B0, 0x10 -+.endm - -- /***8-4***/ -+.macro KERNEL2x8x2_END - xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ - xvld U1, A0, 0x20 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 - - xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ - xvfmadd.d D0, 
U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+.endm - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -+.macro KERNEL8x8x2 -+.rept 4 -+ KERNEL2x8x2 -+.endr -+.endm - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -+.macro KERNEL8x8x2_END -+.rept 3 -+ KERNEL2x8x2 -+ .endr -+ KERNEL2x8x2_END -+.endm - -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x20 -- -- /***8-5***/ -+.macro KERNEL2x4x2 - xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D4, U8, U13, D4 - - xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -+ xvldrepl.d U5, B0, 0x08 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D4, U0, U5, D4 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 - -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x20 -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+.endm - -- /***8-6***/ -+.macro KERNEL2x4x2_END - xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D4, U8, U13, D4 - - xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -+ xvldrepl.d U5, B0, 0x08 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D4, U0, U5, D4 -+.endm - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -+.macro KERNEL8x4x2 -+.rept 4 -+ KERNEL2x4x2 -+.endr -+.endm - -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x20 -+.macro KERNEL8x4x2_END -+.rept 3 -+ KERNEL2x4x2 -+.endr -+ KERNEL2x4x2_END -+.endm - -- /***8-7***/ -+.macro KERNEL2x2x2 - xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D4, U8, U13, D4 - - xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ -+ xvld U8, A0, 0x00 - xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D4, U0, U5, D4 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+.endm - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -+.macro KERNEL2x2x2_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D4, U8, U13, D4 - -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x20 -+ xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D4, U0, U5, D4 -+.endm - -- /***8-8***/ -+.macro KERNEL8x2x2 -+.rept 4 -+ KERNEL2x2x2 -+.endr -+.endm -+ -+.macro KERNEL8x2x2_END -+.rept 3 -+ KERNEL2x2x2 -+.endr -+ KERNEL2x2x2_END -+.endm -+ -+.macro KERNEL2x1x2 - xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D4, U8, U13, D4 - - xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ 
addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ -+ xvld U8, A0, 0x00 - xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D4, U0, U5, D4 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+.endm - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -+.macro KERNEL2x1x2_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D4, U8, U13, D4 - -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x20 -+ xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 - -- addi.d TL, TL, -1 /* TL-- */ -- blt ZERO,TL, .L_M8_TL1 -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 - --.L_M8_L7: -- /* if (!(L & 7)) goto L_M8_L0 */ -- andi TL, L, 7 -- beq TL, ZERO,.L_M8_L0 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D4, U0, U5, D4 -+.endm - --.L_M8_L71: -+.macro KERNEL8x1x2 -+.rept 4 -+ KERNEL2x1x2 -+.endr -+.endm -+ -+.macro KERNEL8x1x2_END -+.rept 3 -+ KERNEL2x1x2 -+.endr -+ KERNEL2x1x2_END -+.endm -+ -+.macro KERNEL2x16x1 - xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ - xvld U1, A0, 0x20 -+ xvfmadd.d D2, U10, U12, D2 -+ xvfmadd.d D3, U11, U12, D3 -+ -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+ -+ xvld U8, A0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -+ xvld U9, A0, 0x20 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- xvfmadd.d D9, U1, U4, D9 -+ xvld U10, A0, 0x40 -+ xvld U11, A0, 0x60 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- xvfmadd.d D13, U1, U4, D13 -+ xvldrepl.d U12, B0, 0x00 - -- /* Add stride for A0, B0 */ -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x20 -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+.endm - -- addi.d TL, TL, -1 -- blt ZERO,TL, .L_M8_L71 -+.macro KERNEL2x16x1_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 - --.L_M8_L0: --#if defined(TRMMKERNEL) -- xvfmul.d D0, D0, VALPHA -- xvfmul.d D1, D1, VALPHA -- xvfmul.d D4, D4, VALPHA -- xvfmul.d D5, D5, VALPHA -- xvfmul.d D8, D8, VALPHA -- xvfmul.d D9, D9, VALPHA -- xvfmul.d D12, D12, VALPHA -- xvfmul.d D13, D13, VALPHA --#else -- /* Load C0 */ -- xvld U0, C0, 0x00 -- xvld U1, C0, 0x20 -- xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -- xvfmadd.d D1, D1, VALPHA, U1 -+ xvld U1, A0, 0x20 -+ xvfmadd.d D2, U10, U12, D2 -+ xvfmadd.d D3, U11, U12, D3 - -- /* Load C1 */ -- xvld U0, C1, 0x00 -- xvld U1, C1, 0x20 -- xvfmadd.d D4, D4, VALPHA, U0 -- xvfmadd.d D5, D5, VALPHA, U1 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 - -- /* Load C2 */ -- xvld U0, C2, 0x00 -- xvld U1, C2, 0x20 -- xvfmadd.d D8, D8, VALPHA, U0 -- xvfmadd.d D9, D9, VALPHA, U1 -+ xvldrepl.d U4, B0, 0x00 - -- /* Load C3 */ -- xvld U0, C3, 0x00 -- xvld U1, C3, 0x20 -- xvfmadd.d D12, D12, VALPHA, U0 -- xvfmadd.d D13, D13, VALPHA, U1 --#endif // #if defined(TRMMKERNEL) -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 - -- /* Store C0 */ -- xvst D0, C0, 0x00 -- xvst D1, C0, 0x20 -- /* Store C1 */ -- xvst D4, C1, 0x00 -- xvst D5, C1, 0x20 -- /* Store C2 */ -- xvst D8, C2, 0x00 -- xvst D9, C2, 0x20 -- /* Store C3 */ -- xvst D12, C3, 0x00 -- xvst D13, C3, 0x20 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, 
U1, U4, D1 - -- /* Add stride for C */ -- addi.d C0, C0, 0x40 -- addi.d C1, C1, 0x40 -- addi.d C2, C2, 0x40 -- addi.d C3, C3, 0x40 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+.endm - --#if defined(TRMMKERNEL) --#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -- sub.d L, K, OFF --#ifdef LEFT -- /* number of values in A */ -- addi.d L, L, -8 --#else -- /* number of values in B */ -- addi.d L, L, -4 --#endif -- slli.d T0, L, 0x06 -- add.d A0, A0, T0 -- slli.d T0, L, 0x05 -- add.d B0, B0, T0 --#endif -+.macro KERNEL8x16x1 -+.rept 4 -+ KERNEL2x16x1 -+.endr -+.endm - --#ifdef LEFT -- /* number of values in A */ -- addi.d OFF, OFF, 0x08 --#endif --#endif // #if defined(TRMMKERNEL) -+.macro KERNEL8x16x1_END -+.rept 3 -+ KERNEL2x16x1 -+.endr -+ KERNEL2x16x1_END -+.endm - --/********LOOP (if(N >> 2 ) && (M & 8)) End************/ -+.macro KERNEL2x8x1 -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ xvld U1, A0, 0x20 -+ xvldrepl.d U4, B0, 0x00 - --.L_M4: -- andi I, M, 4 -- beq ZERO,I, .L_M2 -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 - --#if defined(TRMMKERNEL) --#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -- move B0, B --#else -- slli.d T0, OFF, 0x05 -- add.d A0, A0, T0 -- add.d B0, B, T0 --#endif -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvld U9, A0, 0x20 -+ xvldrepl.d U12, B0, 0x00 - --#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -- sub.d L, K, OFF --#elif defined(LEFT) -- /* number of values in A */ -- addi.d L, OFF, 4 --#else -- /* number of values in B */ -- addi.d L, OFF, 4 --#endif --#else // #if !defined(TRMMKERNEL) -- move B0, B -- move L, K /* L = bk */ --#endif -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+.endm - -- /* Load 4 * 64 from A0 */ -+.macro KERNEL2x8x1_END - xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ xvld U1, A0, 0x20 -+ xvldrepl.d U4, B0, 0x00 - -- xvldrepl.d U4, B0, 0x00 -- /* line 1 */ -- xvfmul.d D0, U0, U4 -- -- xvldrepl.d U4, B0, 0x08 -- /* line 2 */ -- xvfmul.d D4, U0, U4 -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 - -- xvldrepl.d U4, B0, 0x10 -- /* line 3 */ -- xvfmul.d D8, U0, U4 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+.endm - -- xvldrepl.d U4, B0, 0x18 -- /* line 4 */ -- xvfmul.d D12, U0, U4 -+.macro KERNEL8x8x1 -+.rept 4 -+ KERNEL2x8x1 -+.endr -+.endm - -- /* Add stride for A0 and B0 */ -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x20 -- /* Reduce L */ -- addi.d L, L, -1 -- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -- /* if (TL < 1) goto L_M4_L7 */ -- beq ZERO,TL, .L_M4_L7 -+.macro KERNEL8x8x1_END -+.rept 3 -+ KERNEL2x8x1 -+.endr -+ KERNEL2x8x1_END -+.endm - --.L_M4_TL1: /* TL-- */ -- /***8-1***/ -+.macro KERNEL2x4x1 - xvld U0, A0, 0x00 -- -+ xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvldrepl.d U12, B0, 0x00 - - addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x20 -+ addi.d B0, B0, 0x08 -+.endm - -- /***8-2***/ -+.macro KERNEL2x4x1_END - xvld U0, A0, 0x00 -- -+ xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 
- -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D0, U0, U4, D0 -+.endm - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+.macro KERNEL8x4x1 -+.rept 4 -+ KERNEL2x4x1 -+.endr -+.endm - -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x20 -+.macro KERNEL8x4x1_END -+.rept 3 -+ KERNEL2x4x1 -+.endr -+ KERNEL2x4x1_END -+.endm - -- /***8-3***/ -+.macro KERNEL2x2x1 - xvld U0, A0, 0x00 -- -+ xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvldrepl.d U12, B0, 0x00 - -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x20 -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+.endm - -- /***8-4***/ -+.macro KERNEL2x2x1_END - xvld U0, A0, 0x00 -- -+ xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+ xvfmadd.d D0, U0, U4, D0 -+.endm - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+.macro KERNEL8x2x1 -+.rept 4 -+ KERNEL2x2x1 -+.endr -+.endm - -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x20 -+.macro KERNEL8x2x1_END -+.rept 3 -+ KERNEL2x2x1 -+.endr -+ KERNEL2x2x1_END -+.endm - -- /***8-5***/ -+.macro KERNEL2x1x1 - xvld U0, A0, 0x00 -- -+ xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvldrepl.d U12, B0, 0x00 - -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x20 -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+.endm - -- /***8-6***/ -+.macro KERNEL2x1x1_END - xvld U0, A0, 0x00 -- -+ xvfmadd.d D0, U8, U12, D0 - xvldrepl.d U4, B0, 0x00 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ - xvfmadd.d D0, U0, U4, D0 -+.endm - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+.macro KERNEL8x1x1 -+.rept 4 -+ KERNEL2x1x1 -+.endr -+.endm - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+.macro KERNEL8x1x1_END -+.rept 3 -+ KERNEL2x1x1 -+.endr -+ KERNEL2x1x1_END -+.endm - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 - -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x20 -+ PROLOGUE - -- /***8-7***/ -- xvld U0, A0, 0x00 -+ addi.d $sp, $sp, -120 -+ /* Store regs */ -+ SDARG $r23, $sp, 0 -+ SDARG $r24, $sp, 8 -+ SDARG $r25, $sp, 16 -+ SDARG $r26, $sp, 24 -+ SDARG $r27, $sp, 32 -+ ST $f23, $sp, 40 -+ ST $f24, $sp, 48 -+ ST $f25, $sp, 56 -+ ST $f26, $sp, 64 -+ ST $f27, $sp, 72 -+ ST $f28, $sp, 80 -+ ST $f29, $sp, 88 -+ ST $f30, $sp, 96 -+ ST $f31, $sp, 104 -+ ST ALPHA, $sp, 112 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+#if defined (TRMMKERNEL) && !defined(LEFT) -+ sub.d OFF, ZERO, OFFSET -+#else -+ xor OFF, OFF, OFF -+#endif - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ /* if (!(N >> 2)) goto L_N3 */ -+ srai.d J, N, 2 /* J = bn >> 2 */ -+ andi N, N, 0x03 -+ xvldrepl.d VALPHA, $sp, 112 /* When N < 4, VALPHA will not changed */ -+ beq ZERO, J, .L_N3 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+.L_J1: /* J-- && This loop include Condition 1 */ - -- xvldrepl.d U4, 
B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! ************************* -+* dgemm_core_16x4 */ -+ move C0, C -+ move A0, A -+ slli.d T0, LDC, 3 -+ add.d C1, C0, T0 -+ addi.d J, J, -1 /* J-- */ -+ add.d C2, C1, T0 -+ add.d C3, C2, T0 - -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x20 -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move OFF, OFFSET -+#endif -+ -+ /* if (!(M >> 4)) goto L_M8 */ -+ srai.d I, M, 4 /* I = bm >> 4 */ -+ beq ZERO, I, .L_M8 - -- /***8-8***/ -+.L_I1: /* I-- */ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x07 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x05 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 16 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 4 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ /* Calculate the first set of D0~D15, -+ * avoidig set 0 operation -+ * Load 16 * 64 from A0 -+ * U0 = {a3, a2, a1, a0} -+ * U1 = {a7, a6, a5, a4} -+ * U2 = {a11, a10, a9, a8} -+ * U3 = {a15, a14, a13, a12} -+ */ - xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+ xvldrepl.d U4, B0, 0x00 -+ preld 0, C0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ preld 0, C0, 0x40 -+ xvfmul.d D2, U2, U4 -+ xvfmul.d D3, U3, U4 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ xvldrepl.d U5, B0, 0x08 -+ preld 0, C1, 0x00 -+ /* line 2 */ -+ xvfmul.d D4, U0, U5 -+ xvfmul.d D5, U1, U5 -+ preld 0, C1, 0x40 -+ xvfmul.d D6, U2, U5 -+ xvfmul.d D7, U3, U5 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+ xvldrepl.d U6, B0, 0x10 -+ preld 0, C2, 0x00 -+ /* line 3 */ -+ xvfmul.d D8, U0, U6 -+ xvfmul.d D9, U1, U6 -+ preld 0, C2, 0x40 -+ xvfmul.d D10, U2, U6 -+ xvfmul.d D11, U3, U6 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+ xvldrepl.d U7, B0, 0x18 -+ preld 0, C3, 0x00 -+ /* line 4 */ -+ xvfmul.d D12, U0, U7 -+ xvfmul.d D13, U1, U7 -+ preld 0, C3, 0x40 -+ xvfmul.d D14, U2, U7 -+ xvfmul.d D15, U3, U7 - -- addi.d A0, A0, 0x20 -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_L7 */ -+ beq ZERO,TL, .L_L7 -+ -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 -+ xvld U10, A0, 0x40 -+ xvld U11, A0, 0x60 -+ -+ addi.d TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ xvldrepl.d U14, B0, 0x10 -+ xvldrepl.d U15, B0, 0x18 -+ addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - -+ beq ZERO, TL, .L_TL1_END -+.L_TL1: /* TL-- */ -+ KERNEL8x16x4 - addi.d TL, TL, -1 /* TL-- */ -- blt ZERO,TL, .L_M4_TL1 -+ blt ZERO,TL, .L_TL1 - --.L_M4_L7: -- /* if (!(L & 7)) goto L_M4_L0 */ -+.L_TL1_END: -+ KERNEL8x16x4_END -+ -+ /* Maybe we need calculate the last -+ * 7 sets of D0~D15? 
-+ */ -+.L_L7: -+ /* if (!(L & 7)) goto L_L0 */ - andi TL, L, 7 -- beq TL, ZERO,.L_M4_L0 -+ beq TL, ZERO,.L_L0 - --.L_M4_L71: -+.L_L71: -+ /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 - -+ /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ xvfmadd.d D6, U2, U5, D6 -+ xvfmadd.d D7, U3, U5, D7 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D9, U1, U6, D9 -+ xvfmadd.d D10, U2, U6, D10 -+ xvfmadd.d D11, U3, U6, D11 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D12, U0, U7, D12 -+ xvfmadd.d D13, U1, U7, D13 -+ xvfmadd.d D14, U2, U7, D14 -+ xvfmadd.d D15, U3, U7, D15 - - /* Add stride for A0, B0 */ -- addi.d A0, A0, 0x20 -+ addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - - addi.d TL, TL, -1 -- blt ZERO,TL, .L_M4_L71 -+ blt ZERO,TL, .L_L71 - --.L_M4_L0: -+.L_L0: -+ xvldrepl.d VALPHA, $sp, 112 - #if defined(TRMMKERNEL) - xvfmul.d D0, D0, VALPHA -+ xvfmul.d D1, D1, VALPHA -+ xvfmul.d D2, D2, VALPHA -+ xvfmul.d D3, D3, VALPHA - xvfmul.d D4, D4, VALPHA -+ xvfmul.d D5, D5, VALPHA -+ xvfmul.d D6, D6, VALPHA -+ xvfmul.d D7, D7, VALPHA - xvfmul.d D8, D8, VALPHA -+ xvfmul.d D9, D9, VALPHA -+ xvfmul.d D10, D10, VALPHA -+ xvfmul.d D11, D11, VALPHA - xvfmul.d D12, D12, VALPHA -+ xvfmul.d D13, D13, VALPHA -+ xvfmul.d D14, D14, VALPHA -+ xvfmul.d D15, D15, VALPHA - #else - /* Load C0 */ - xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ xvfmadd.d D1, D1, VALPHA, U1 -+ xvfmadd.d D2, D2, VALPHA, U2 -+ xvfmadd.d D3, D3, VALPHA, U3 - - /* Load C1 */ -- xvld U0, C1, 0x00 -- xvfmadd.d D4, D4, VALPHA, U0 -+ xvld U4, C1, 0x00 -+ xvld U5, C1, 0x20 -+ xvld U6, C1, 0x40 -+ xvld U7, C1, 0x60 -+ xvfmadd.d D4, D4, VALPHA, U4 -+ xvfmadd.d D5, D5, VALPHA, U5 -+ xvfmadd.d D6, D6, VALPHA, U6 -+ xvfmadd.d D7, D7, VALPHA, U7 - - /* Load C2 */ -- xvld U0, C2, 0x00 -- xvfmadd.d D8, D8, VALPHA, U0 -+ xvld U8, C2, 0x00 -+ xvld U9, C2, 0x20 -+ xvld U10, C2, 0x40 -+ xvld U11, C2, 0x60 -+ xvfmadd.d D8, D8, VALPHA, U8 -+ xvfmadd.d D9, D9, VALPHA, U9 -+ xvfmadd.d D10, D10, VALPHA, U10 -+ xvfmadd.d D11, D11, VALPHA, U11 - - /* Load C3 */ - xvld U0, C3, 0x00 -+ xvld U1, C3, 0x20 -+ xvld U2, C3, 0x40 -+ xvld U3, C3, 0x60 - xvfmadd.d D12, D12, VALPHA, U0 --#endif // #if defined(TRMMKERNEL) -+ xvfmadd.d D13, D13, VALPHA, U1 -+ xvfmadd.d D14, D14, VALPHA, U2 -+ xvfmadd.d D15, D15, VALPHA, U3 -+#endif // #if defined(TRMMKERNEL) - - /* Store C0 */ - xvst D0, C0, 0x00 -+ xvst D1, C0, 0x20 -+ xvst D2, C0, 0x40 -+ xvst D3, C0, 0x60 - /* Store C1 */ - xvst D4, C1, 0x00 -+ xvst D5, C1, 0x20 -+ xvst D6, C1, 0x40 -+ xvst D7, C1, 0x60 - /* Store C2 */ - xvst D8, C2, 0x00 -+ xvst D9, C2, 0x20 -+ xvst D10, C2, 0x40 -+ xvst D11, C2, 0x60 - /* Store C3 */ - xvst D12, C3, 0x00 -+ xvst D13, C3, 0x20 -+ xvst D14, C3, 0x40 -+ xvst D15, C3, 0x60 - - /* Add stride for C */ -- addi.d C0, C0, 0x20 -- addi.d C1, C1, 0x20 -- addi.d C2, C2, 0x20 -- addi.d C3, C3, 0x20 -+ addi.d C0, C0, 0x80 -+ addi.d C1, C1, 0x80 -+ addi.d C2, C2, 0x80 -+ addi.d C3, C3, 0x80 - - #if defined(TRMMKERNEL) - #if (defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA)) - sub.d L, K, OFF - #ifdef LEFT -- /* number of values in A */ -- addi.d L, L, -4 -+ /* number of values in A */ -+ addi.d L, L, -16 - #else - /* number of values in B */ - addi.d L, L, -4 - #endif -- slli.d T0, L, 0x05 -+ slli.d T0, L, 0x07 - add.d A0, A0, T0 -+ slli.d T0, L, 0x05 - add.d B0, B0, T0 - #endif - - #ifdef LEFT -- /* number of values in A */ -- addi.d OFF, OFF, 0x04 -+ addi.d OFF, OFF, 0x10 - #endif - #endif // #if defined(TRMMKERNEL) - --/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ -+ addi.d I, I, -1 /* I-- */ -+ blt ZERO,I, .L_I1 - --.L_M2: -- andi I, M, 2 -- beq ZERO,I, .L_M1 -+.L_M8: -+ /* We have done M & 16, considering M=8/4/2/1 */ -+ andi I, M, 15 -+ beq ZERO,I, .L_M0 -+ -+ andi I, M, 8 -+ beq ZERO,I, .L_M4 - - #if defined(TRMMKERNEL) - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B - #else -- slli.d T0, OFF, 0x04 -+ slli.d T0, OFF, 0x06 - add.d A0, A0, T0 - slli.d T0, OFF, 0x05 - add.d B0, B, T0 -@@ -1361,7 +1427,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - sub.d L, K, OFF - #elif defined(LEFT) - /* number of values in A */ -- addi.d L, OFF, 2 -+ addi.d L, OFF, 8 - #else - /* number of values in B */ - addi.d L, OFF, 4 -@@ -1369,262 +1435,163 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #else // #if !defined(TRMMKERNEL) - move B0, B - move L, K /* L = bk */ --#endif -+#endif // #if defined(TRMMKERNEL) - -- /* Load 2 * 64 from A0 */ -+ /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 - -- xvldrepl.d U4, B0, 0x08 -+ xvldrepl.d U5, B0, 0x08 - /* line 2 */ -- xvfmul.d D4, U0, U4 -+ xvfmul.d D4, U0, U5 -+ xvfmul.d D5, U1, U5 - -- xvldrepl.d U4, B0, 0x10 -+ xvldrepl.d U6, B0, 0x10 - /* line 3 */ -- xvfmul.d D8, U0, U4 -+ xvfmul.d D8, U0, U6 -+ xvfmul.d D9, U1, U6 - -- xvldrepl.d U4, B0, 0x18 -+ xvldrepl.d U7, B0, 0x18 - /* line 4 */ -- xvfmul.d D12, U0, U4 -+ xvfmul.d D12, U0, U7 -+ xvfmul.d D13, U1, U7 - - /* Add stride for A0 and B0 */ -- addi.d A0, A0, 0x10 -+ addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -- /* if (TL < 1) goto L_M2_L7 */ -- beq ZERO,TL, .L_M2_L7 -- --.L_M2_TL1: /* TL-- */ -- /***8-1***/ -- /* Load 2 * 64 from A0 */ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x20 -- -- /***8-2***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x20 -- -- /***8-3***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x20 -- -- /***8-4***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- xvldrepl.d U4, 
B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x20 -- -- /***8-5***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x20 -- -- /***8-6***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x20 -- -- /***8-7***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ /* if (TL < 1) goto L_M8_L7 */ -+ beq ZERO,TL, .L_M8_L7 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+ addi.d TL, TL, -1 - -- addi.d A0, A0, 0x10 -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ xvldrepl.d U14, B0, 0x10 -+ xvldrepl.d U15, B0, 0x18 -+ addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - -- /***8-8***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+ beq ZERO, TL, .L_M8_TL1_END - -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x20 -+.L_M8_TL1: /* TL-- */ -+ KERNEL8x8x4 - - addi.d TL, TL, -1 /* TL-- */ -- blt ZERO,TL, .L_M2_TL1 -+ blt ZERO,TL, .L_M8_TL1 - --.L_M2_L7: -- /* if (!(L & 7)) goto L_M2_L0 */ -+.L_M8_TL1_END: -+ KERNEL8x8x4_END -+ -+.L_M8_L7: -+ /* if (!(L & 7)) goto L_M8_L0 */ - andi TL, L, 7 -- beq TL, ZERO,.L_M2_L0 -+ beq TL, ZERO,.L_M8_L0 - --.L_M2_L71: -+.L_M8_L71: - xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D9, U1, U6, D9 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D12, U0, U7, D12 -+ xvfmadd.d D13, U1, U7, D13 - - /* Add stride for A0, B0 */ -- addi.d A0, A0, 0x10 -+ addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - - addi.d TL, TL, -1 -- blt ZERO,TL, .L_M2_L71 -+ blt ZERO,TL, .L_M8_L71 - --.L_M2_L0: -+.L_M8_L0: -+ xvldrepl.d VALPHA, $sp, 112 - #if defined(TRMMKERNEL) - xvfmul.d D0, D0, VALPHA -+ xvfmul.d D1, D1, VALPHA - xvfmul.d D4, D4, VALPHA -+ xvfmul.d D5, D5, VALPHA - xvfmul.d D8, D8, VALPHA -+ xvfmul.d D9, D9, VALPHA - xvfmul.d D12, D12, VALPHA -+ xvfmul.d D13, D13, VALPHA - #else - /* Load C0 */ - xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ xvfmadd.d D1, D1, VALPHA, U1 - - /* Load C1 */ -- xvld U0, C1, 0x00 -- xvfmadd.d D4, D4, VALPHA, U0 -+ xvld U2, C1, 0x00 -+ xvld U3, C1, 0x20 -+ xvfmadd.d D4, D4, VALPHA, U2 -+ xvfmadd.d D5, D5, VALPHA, U3 - - /* Load C2 */ -- xvld U0, C2, 0x00 -- xvfmadd.d D8, D8, VALPHA, U0 -+ xvld U4, C2, 0x00 -+ xvld U5, C2, 0x20 -+ xvfmadd.d 
D8, D8, VALPHA, U4 -+ xvfmadd.d D9, D9, VALPHA, U5 - - /* Load C3 */ -- xvld U0, C3, 0x00 -- xvfmadd.d D12, D12, VALPHA, U0 -+ xvld U6, C3, 0x00 -+ xvld U7, C3, 0x20 -+ xvfmadd.d D12, D12, VALPHA, U6 -+ xvfmadd.d D13, D13, VALPHA, U7 - #endif // #if defined(TRMMKERNEL) - -- xvstelm.d D0, C0, 0x00, 0x00 -- xvstelm.d D4, C1, 0x00, 0x00 -- xvstelm.d D8, C2, 0x00, 0x00 -- xvstelm.d D12, C3, 0x00, 0x00 -- xvstelm.d D0, C0, 0x08, 0x01 -- xvstelm.d D4, C1, 0x08, 0x01 -- xvstelm.d D8, C2, 0x08, 0x01 -- xvstelm.d D12, C3, 0x08, 0x01 -+ /* Store C0 */ -+ xvst D0, C0, 0x00 -+ xvst D1, C0, 0x20 -+ /* Store C1 */ -+ xvst D4, C1, 0x00 -+ xvst D5, C1, 0x20 -+ /* Store C2 */ -+ xvst D8, C2, 0x00 -+ xvst D9, C2, 0x20 -+ /* Store C3 */ -+ xvst D12, C3, 0x00 -+ xvst D13, C3, 0x20 - - /* Add stride for C */ -- addi.d C0, C0, 0x10 -- addi.d C1, C1, 0x10 -- addi.d C2, C2, 0x10 -- addi.d C3, C3, 0x10 -+ addi.d C0, C0, 0x40 -+ addi.d C1, C1, 0x40 -+ addi.d C2, C2, 0x40 -+ addi.d C3, C3, 0x40 - - #if defined(TRMMKERNEL) - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - sub.d L, K, OFF - #ifdef LEFT - /* number of values in A */ -- addi.d L, L, -2 -+ addi.d L, L, -8 - #else - /* number of values in B */ - addi.d L, L, -4 - #endif -- slli.d T0, L, 0x04 -+ slli.d T0, L, 0x06 - add.d A0, A0, T0 - slli.d T0, L, 0x05 - add.d B0, B0, T0 -@@ -1632,23 +1599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - #ifdef LEFT - /* number of values in A */ -- addi.d OFF, OFF, 0x02 -+ addi.d OFF, OFF, 0x08 - #endif - #endif // #if defined(TRMMKERNEL) - --/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ -+/********LOOP (if(N >> 2 ) && (M & 8)) End************/ - --.L_M1: -- andi I, M, 1 -- beq ZERO,I, .L_M0 -+.L_M4: -+ andi I, M, 4 -+ beq ZERO,I, .L_M2 - - #if defined(TRMMKERNEL) - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - move B0, B - #else -- slli.d T0, OFF, 0x03 -- add.d A0, A0, T0 - slli.d T0, OFF, 0x05 -+ add.d A0, A0, T0 - add.d B0, B, T0 - #endif - -@@ -1656,7 +1622,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - sub.d L, K, OFF - #elif defined(LEFT) - /* number of values in A */ -- addi.d L, OFF, 1 -+ addi.d L, OFF, 4 - #else - /* number of values in B */ - addi.d L, OFF, 4 -@@ -1666,55 +1632,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- move L, K /* L = bk */ - #endif - -- /* Load 1 * 64 from A0 */ -+ /* Load 4 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - -- xvldrepl.d U4, B0, 0x08 -+ xvldrepl.d U5, B0, 0x08 - /* line 2 */ -- xvfmul.d D4, U0, U4 -+ xvfmul.d D4, U0, U5 - -- xvldrepl.d U4, B0, 0x10 -+ xvldrepl.d U6, B0, 0x10 - /* line 3 */ -- xvfmul.d D8, U0, U4 -+ xvfmul.d D8, U0, U6 - -- xvldrepl.d U4, B0, 0x18 -+ xvldrepl.d U7, B0, 0x18 - /* line 4 */ -- xvfmul.d D12, U0, U4 -+ xvfmul.d D12, U0, U7 - - /* Add stride for A0 and B0 */ -- addi.d A0, A0, 0x08 -+ addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - /* Reduce L */ - addi.d L, L, -1 - srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -- /* if (TL < 1) goto L_M1_L7 */ -- beq ZERO,TL, .L_M1_L7 -+ /* if (TL < 1) goto L_M4_L7 */ -+ beq ZERO,TL, .L_M4_L7 - --.L_M1_TL1: /* TL-- */ -- /***8-1***/ -- /* Load 1 * 64 from A0 */ -- xvld U0, A0, 0x00 -+ xvld U8, A0, 0x00 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+ addi.d TL, TL, -1 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ xvldrepl.d U14, B0, 0x10 -+ xvldrepl.d U15, B0, 0x18 -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+ beq ZERO, TL, .L_M4_TL1_END - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+.L_M4_TL1: /* TL-- */ -+ KERNEL8x4x4 - -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x20 -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_M4_TL1 -+ -+.L_M4_TL1_END: -+ KERNEL8x4x4_END -+ -+.L_M4_L7: -+ /* if (!(L & 7)) goto L_M4_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_M4_L0 - -- /***8-2***/ -+.L_M4_L71: - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 -@@ -1729,119 +1702,287 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - -- addi.d A0, A0, 0x08 -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - -- /***8-3***/ -- xvld U0, A0, 0x00 -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_M4_L71 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+.L_M4_L0: -+ xvldrepl.d VALPHA, $sp, 112 -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D4, D4, VALPHA -+ xvfmul.d D8, D8, VALPHA -+ xvfmul.d D12, D12, VALPHA -+#else -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ xvfmadd.d D4, D4, VALPHA, U1 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+ /* Load C2 */ -+ xvld U2, C2, 0x00 -+ xvfmadd.d D8, D8, VALPHA, U2 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+ /* Load C3 */ -+ xvld U3, C3, 0x00 -+ xvfmadd.d D12, D12, VALPHA, U3 -+#endif // #if defined(TRMMKERNEL) - -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x20 -+ /* Store C0 */ -+ xvst D0, C0, 0x00 -+ /* Store C1 */ -+ xvst D4, C1, 0x00 -+ /* Store C2 */ -+ xvst D8, C2, 0x00 -+ /* Store C3 */ -+ xvst D12, C3, 0x00 - -- /***8-4***/ -- xvld U0, A0, 0x00 -+ /* Add stride for C */ -+ addi.d C0, C0, 0x20 -+ addi.d C1, C1, 0x20 -+ addi.d C2, C2, 0x20 -+ addi.d C3, C3, 0x20 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ /* number of values in A */ -+ addi.d L, L, -4 -+#else -+ /* number of values in B */ -+ addi.d L, L, -4 -+#endif -+ slli.d T0, L, 0x05 -+ add.d A0, A0, T0 -+ add.d B0, B0, T0 -+#endif - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+#ifdef LEFT -+ /* number of values in A */ -+ addi.d OFF, OFF, 0x04 -+#endif -+#endif // #if defined(TRMMKERNEL) - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+.L_M2: -+ andi I, M, 2 -+ beq ZERO,I, .L_M1 - -- addi.d A0, A0, 0x08 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x04 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x05 -+ add.d B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 2 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 4 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ -+ /* Load 2 * 64 from A0 */ -+ xvldrepl.d U0, A0, 0x00 -+ xvldrepl.d U1, A0, 0x08 -+ -+ xvld U4, B0, 0x00 -+ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M2_L7 */ -+ beq ZERO,TL, .L_M2_L7 -+ -+ xvldrepl.d U8, A0, 0x00 -+ xvldrepl.d U9, A0, 0x08 -+ -+ addi.d TL, TL, -1 -+ -+ xvld U12, B0, 0x00 -+ addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - -- /***8-5***/ -- xvld U0, A0, 0x00 -+ beq ZERO, TL, .L_M2_TL1_END -+.L_M2_TL1: /* TL-- */ -+ KERNEL8x2x4 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_M2_TL1 -+.L_M2_TL1_END: -+ KERNEL8x2x4_END - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d 
D4, U0, U4, D4 -+.L_M2_L7: -+ /* if (!(L & 7)) goto L_M2_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_M2_L0 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+.L_M2_L71: -+ xvldrepl.d U0, A0, 0x00 -+ xvldrepl.d U1, A0, 0x08 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+ xvld U4, B0, 0x00 - -- addi.d A0, A0, 0x08 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - -- /***8-6***/ -- xvld U0, A0, 0x00 -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_M2_L71 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+.L_M2_L0: -+ xvldrepl.d VALPHA, $sp, 112 -+#if defined(TRMMKERNEL) -+ xvfmul.d D0, D0, VALPHA -+ xvfmul.d D1, D1, VALPHA - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ xvstelm.d D0, C0, 0x00, 0x00 -+ xvstelm.d D0, C1, 0x00, 0x01 -+ xvstelm.d D0, C2, 0x00, 0x02 -+ xvstelm.d D0, C3, 0x00, 0x03 -+ xvstelm.d D1, C0, 0x08, 0x00 -+ xvstelm.d D1, C1, 0x08, 0x01 -+ xvstelm.d D1, C2, 0x08, 0x02 -+ xvstelm.d D1, C3, 0x08, 0x03 -+#else -+ xvpackev.d D4, D1, D0 -+ xvpackod.d D5, D1, D0 -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ /* Load C2 */ -+ xvld U2, C2, 0x00 -+ /* Load C3 */ -+ xvld U3, C3, 0x00 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+ xvpermi.q U2, U0, 0x20 -+ xvpermi.q U3, U1, 0x20 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+ xvfmadd.d D0, D4, VALPHA, U2 -+ xvfmadd.d D1, D5, VALPHA, U3 - -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x20 -+ vst $vr16, C0, 0x00 -+ vst $vr17, C1, 0x00 -+ xvstelm.d D0, C2, 0x00, 0x02 -+ xvstelm.d D1, C3, 0x00, 0x02 -+ xvstelm.d D0, C2, 0x08, 0x03 -+ xvstelm.d D1, C3, 0x08, 0x03 -+#endif // #if defined(TRMMKERNEL) - -- /***8-7***/ -- xvld U0, A0, 0x00 -+ /* Add stride for C */ -+ addi.d C0, C0, 0x10 -+ addi.d C1, C1, 0x10 -+ addi.d C2, C2, 0x10 -+ addi.d C3, C3, 0x10 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ sub.d L, K, OFF -+#ifdef LEFT -+ /* number of values in A */ -+ addi.d L, L, -2 -+#else -+ /* number of values in B */ -+ addi.d L, L, -4 -+#endif -+ slli.d T0, L, 0x04 -+ add.d A0, A0, T0 -+ slli.d T0, L, 0x05 -+ add.d B0, B0, T0 -+#endif - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+#ifdef LEFT -+ /* number of values in A */ -+ addi.d OFF, OFF, 0x02 -+#endif -+#endif // #if defined(TRMMKERNEL) - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+.L_M1: -+ andi I, M, 1 -+ beq ZERO,I, .L_M0 - -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x20 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x03 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x05 -+ add.d B0, B, T0 -+#endif - -- /***8-8***/ -- xvld U0, A0, 0x00 -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 1 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 4 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+ xvldrepl.d U0, A0, 0x00 -+ xvld U4, B0, 0x00 -+ xvfmul.d D0, U0, U4 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ /* Add stride for A0 and B0 
*/ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M1_L7 */ -+ beq ZERO,TL, .L_M1_L7 - -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -+ xvldrepl.d U8, A0, 0x00 - -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+ addi.d TL, TL, -1 -+ xvld U12, B0, 0x00 -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 - -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x20 -+ beq ZERO, TL, .L_M1_TL1_END -+ -+.L_M1_TL1: /* TL-- */ -+ KERNEL8x1x4 - - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M1_TL1 -+.L_M1_TL1_END: -+ KERNEL8x1x4_END - - .L_M1_L7: - /* if (!(L & 7)) goto L_M1_L0 */ -@@ -1849,19 +1990,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - beq TL, ZERO,.L_M1_L0 - - .L_M1_L71: -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- xvldrepl.d U4, B0, 0x10 -- xvfmadd.d D8, U0, U4, D8 -- -- xvldrepl.d U4, B0, 0x18 -- xvfmadd.d D12, U0, U4, D12 -+ xvldrepl.d U0, A0, 0x00 -+ xvld U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 - - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x08 -@@ -1871,33 +2002,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - blt ZERO,TL, .L_M1_L71 - - .L_M1_L0: -+ xvldrepl.d VALPHA, $sp, 112 - #if defined(TRMMKERNEL) - xvfmul.d D0, D0, VALPHA -- xvfmul.d D4, D4, VALPHA -- xvfmul.d D8, D8, VALPHA -- xvfmul.d D12, D12, VALPHA -+ -+ xvstelm.d D0, C0, 0x00, 0x00 -+ xvstelm.d D0, C1, 0x00, 0x01 -+ xvstelm.d D0, C2, 0x00, 0x02 -+ xvstelm.d D0, C3, 0x00, 0x03 - #else - /* Load C0 */ -- xvld U0, C0, 0x00 -- xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ -+ xvldrepl.d U0, C0, 0x00 -+ xvfmadd.d D4, D0, VALPHA, U0 - - /* Load C1 */ -- xvld U0, C1, 0x00 -- xvfmadd.d D4, D4, VALPHA, U0 -+ xvldrepl.d U1, C1, 0x00 -+ xvfmadd.d D5, D0, VALPHA, U1 - - /* Load C2 */ -- xvld U0, C2, 0x00 -- xvfmadd.d D8, D8, VALPHA, U0 -+ xvldrepl.d U2, C2, 0x00 -+ xvfmadd.d D6, D0, VALPHA, U2 - - /* Load C3 */ -- xvld U0, C3, 0x00 -- xvfmadd.d D12, D12, VALPHA, U0 --#endif // #if defined(TRMMKERNEL) -+ xvldrepl.d U3, C3, 0x00 -+ xvfmadd.d D7, D0, VALPHA, U3 - -- xvstelm.d D0, C0, 0x00, 0x00 -- xvstelm.d D4, C1, 0x00, 0x00 -- xvstelm.d D8, C2, 0x00, 0x00 -- xvstelm.d D12, C3, 0x00, 0x00 -+ xvstelm.d D4, C0, 0x00, 0x00 -+ xvstelm.d D5, C1, 0x00, 0x01 -+ xvstelm.d D6, C2, 0x00, 0x02 -+ xvstelm.d D7, C3, 0x00, 0x03 -+#endif // #if defined(TRMMKERNEL) - - /* Add stride for C */ - addi.d C0, C0, 0x08 -@@ -1952,6 +2086,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - ///////////////////////////////////////////////// - /************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ - -+ xvldrepl.d VALPHA, $sp, 112 -+ - .L_N3: - andi J, N, 2 - beq ZERO, J, .L_N1 -@@ -1993,223 +2129,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- addi.d L, OFF, 2 - #endif - #else // #if !defined(TRMMKERNEL) -- move B0, B -- move L, K /* L = bk */ --#endif -- -- /* Load 16 * 64 from A0 -- * U0 = {a3, a2, a1, a0} -- * U1 = {a7, a6, a5, a4} -- * U2 = {a11, a10, a9, a8} -- * U3 = {a15, a14, a13, a12} -- */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- /* line 1 */ -- xvfmul.d D0, U0, U4 -- xvfmul.d D1, U1, U4 -- xvfmul.d D2, U2, U4 -- xvfmul.d D3, U3, U4 -- -- xvldrepl.d U4, B0, 0x08 -- /* line 2 */ -- xvfmul.d D4, U0, U4 -- xvfmul.d D5, U1, U4 -- xvfmul.d D6, U2, U4 -- xvfmul.d D7, U3, U4 -- -- /* Add stride for A0 and B0 */ -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x10 -- /* Reduce L */ -- addi.d L, L, -1 -- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -- /* if (TL < 1) goto L_N3_L7 */ -- beq ZERO,TL, .L_N3_L7 -- --.L_N3_TL1: /* TL-- */ -- /***8-1***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -- -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x10 -- -- /***8-2***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -- -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x10 -- -- /***8-3***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -- -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x10 -- -- /***8-4***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -- -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x10 -- -- /***8-5***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -- -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x10 -- -- /***8-6***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- 
xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -- -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x10 -+ move B0, B -+ move L, K /* L = bk */ -+#endif - -- /***8-7***/ -- /* Load 16 * 64 from A0 */ -+ /* Load 16 * 64 from A0 -+ * U0 = {a3, a2, a1, a0} -+ * U1 = {a7, a6, a5, a4} -+ * U2 = {a11, a10, a9, a8} -+ * U3 = {a15, a14, a13, a12} -+ */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ xvfmul.d D2, U2, U4 -+ xvfmul.d D3, U3, U4 - -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x10 -+ xvldrepl.d U5, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U5 -+ xvfmul.d D5, U1, U5 -+ xvfmul.d D6, U2, U5 -+ xvfmul.d D7, U3, U5 - -- /***8-8***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N3_L7 */ -+ beq ZERO,TL, .L_N3_L7 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 -+ xvld U10, A0, 0x40 -+ xvld U11, A0, 0x60 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -+ addi.d TL, TL, -1 - -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - -+ beq ZERO, TL, .L_N3_TL1_END -+ -+.L_N3_TL1: /* TL-- */ -+ KERNEL8x16x2 -+ - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_N3_TL1 -+.L_N3_TL1_END: -+ KERNEL8x16x2_END - - .L_N3_L7: - /* if (!(L & 7)) goto L_N3_L0 */ -@@ -2229,12 +2207,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- xvfmadd.d D6, U2, U4, D6 -- xvfmadd.d D7, U3, U4, D7 -- -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ xvfmadd.d D6, U2, U5, D6 -+ xvfmadd.d D7, U3, U5, D7 - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 -@@ -2264,14 +2241,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - xvfmadd.d D3, D3, VALPHA, U3 - - /* Load C1 */ -- xvld U0, C1, 0x00 -- xvld U1, C1, 0x20 -- xvld U2, C1, 0x40 -- xvld U3, C1, 0x60 -- xvfmadd.d D4, D4, VALPHA, U0 -- xvfmadd.d D5, D5, VALPHA, U1 -- xvfmadd.d D6, D6, VALPHA, U2 -- xvfmadd.d D7, D7, VALPHA, U3 -+ xvld U4, C1, 0x00 -+ xvld U5, C1, 0x20 -+ xvld U6, C1, 0x40 -+ xvld U7, C1, 0x60 -+ xvfmadd.d D4, D4, VALPHA, U4 -+ xvfmadd.d D5, D5, VALPHA, U5 -+ xvfmadd.d D6, D6, VALPHA, U6 -+ xvfmadd.d D7, D7, VALPHA, U7 - #endif // #if defined(TRMMKERNEL) - - /* Store C0 */ -@@ -2352,10 +2329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- xvfmul.d D0, U0, U4 - xvfmul.d D1, U1, U4 - -- xvldrepl.d U4, B0, 0x08 -+ xvldrepl.d U5, B0, 0x08 - /* line 2 */ -- xvfmul.d D4, U0, U4 -- xvfmul.d D5, U1, U4 -+ xvfmul.d D4, U0, U5 -+ xvfmul.d D5, U1, U5 - - /* Add stride for A0 and B0 */ - addi.d A0, A0, 0x40 -@@ -2366,131 +2343,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - /* if (TL < 1) goto L_N3_M8_L7 */ - beq ZERO,TL, .L_N3_M8_L7 - --.L_N3_M8_TL1: /* TL-- */ -- /***8-1***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x10 -- -- /***8-2***/ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x10 -- -- /***8-3***/ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x10 -- -- /***8-4***/ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x10 -- -- /***8-5***/ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- -- /* Cumulative D0~D15 */ -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x10 -- -- /***8-6***/ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -- -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x10 -- -- /***8-7***/ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -+ addi.d TL, TL, -1 - -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - -- /***8-8***/ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -+ beq ZERO, TL, .L_N3_M8_TL1_END - -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x10 -+.L_N3_M8_TL1: /* TL-- */ -+ KERNEL8x8x2 - - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_N3_M8_TL1 -+.L_N3_M8_TL1_END: -+ KERNEL8x8x2_END - - .L_N3_M8_L7: - /* if (!(L & 7)) goto L_N3_M8_L0 */ -@@ -2505,9 +2376,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- xvfmadd.d D5, U1, U4, D5 -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 - - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x40 -@@ -2530,10 +2401,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - xvfmadd.d D1, D1, VALPHA, U1 - - /* Load C1 */ -- xvld U0, C1, 0x00 -- xvld U1, C1, 0x20 -- xvfmadd.d D4, D4, VALPHA, U0 -- xvfmadd.d D5, D5, VALPHA, U1 -+ xvld U2, C1, 0x00 -+ xvld U3, C1, 0x20 -+ xvfmadd.d D4, D4, VALPHA, U2 -+ xvfmadd.d D5, D5, VALPHA, U3 - #endif // #if defined(TRMMKERNEL) - - /* Store C0 */ -@@ -2561,162 +2432,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - add.d B0, B0, T0 - #endif - --#ifdef LEFT -- addi.d OFF, OFF, 0x08 --#endif --#endif // #if defined(TRMMKERNEL) -- --/********LOOP (if(N & 2) && (M & 8) ) End************/ -- --.L_N3_M4: -- andi I, M, 4 -- beq ZERO,I, .L_N3_M2 -- --#if defined(TRMMKERNEL) --#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -- move B0, B --#else -- slli.d T0, OFF, 0x05 -- add.d A0, A0, T0 -- slli.d T0, OFF, 0x04 -- add.d B0, B, T0 --#endif -- --#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -- sub.d L, K, OFF --#elif defined(LEFT) -- /* number of values in A */ -- addi.d L, OFF, 4 --#else -- /* number of values in B */ -- addi.d L, OFF, 2 --#endif --#else // #if !defined(TRMMKERNEL) -- move B0, B -- move L, K /* L = bk */ --#endif -- -- /* Load 4 * 64 from A0 */ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- /* line 1 */ -- xvfmul.d D0, U0, U4 -- -- xvldrepl.d U4, B0, 0x08 -- /* line 2 */ -- xvfmul.d D4, U0, U4 -- -- /* Add stride for A0 and B0 */ -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x10 -- /* Reduce L */ -- addi.d L, L, -1 -- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -- /* if (TL < 1) goto L_N3_M4_L7 */ -- beq ZERO,TL, .L_N3_M4_L7 -- --.L_N3_M4_TL1: /* TL-- */ -- /***8-1***/ -- /* Load 8 * 64 from A0 */ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x10 -- -- /***8-2***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x10 -- -- /***8-3***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x10 -- -- /***8-4***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x10 -+#ifdef LEFT -+ addi.d OFF, OFF, 0x08 -+#endif -+#endif // #if defined(TRMMKERNEL) - -- /***8-5***/ -- xvld U0, A0, 0x00 -+/********LOOP (if(N & 2) && (M & 8) ) End************/ - -- /* Cumulative D0~D15 */ -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+.L_N3_M4: -+ andi I, M, 4 -+ beq ZERO,I, .L_N3_M2 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x05 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x04 -+ add.d B0, B, T0 -+#endif - -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x10 -+#if (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 4 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 2 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif - -- /***8-6***/ -+ /* Load 4 * 64 from A0 */ - xvld U0, A0, 0x00 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 - -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x10 -+ xvldrepl.d U5, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U5 - -- /***8-7***/ -- xvld U0, A0, 0x00 -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N3_M4_L7 */ -+ beq ZERO,TL, .L_N3_M4_L7 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+ xvld U8, A0, 0x00 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ addi.d TL, TL, -1 - -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - -- /***8-8***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ beq ZERO, TL, .L_N3_M4_TL1_END - -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x10 -+.L_N3_M4_TL1: /* TL-- */ -+ KERNEL8x4x2 - - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_N3_M4_TL1 -+.L_N3_M4_TL1_END: -+ KERNEL8x4x2_END - - .L_N3_M4_L7: - /* if (!(L & 7)) goto L_N3_M4_L0 */ -@@ -2729,8 +2517,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 - - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x20 -@@ -2749,8 +2537,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - - /* Load C1 */ -- xvld U0, C1, 0x00 -- xvfmadd.d D4, D4, VALPHA, U0 -+ xvld U1, C1, 0x00 -+ xvfmadd.d D4, D4, VALPHA, U1 - #endif // #if defined(TRMMKERNEL) - - /* Store C0 */ -@@ -2830,106 +2618,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- /* if (TL < 1) goto L_N3_M2_L7 */ - beq ZERO,TL, .L_N3_M2_L7 - --.L_N3_M2_TL1: /* TL-- */ -- /***8-1***/ -- /* Load 2 * 64 from A0 */ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x10 -- -- /***8-2***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x10 -- -- /***8-3***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x10 -- -- /***8-4***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x10 -- -- /***8-5***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x10 -- -- /***8-6***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x10 -+ xvld U8, A0, 0x00 - -- /***8-7***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ addi.d TL, TL, -1 - -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - -- /***8-8***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ beq ZERO, TL, .L_N3_M2_TL1_END - -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x10 -+.L_N3_M2_TL1: /* TL-- */ -+ KERNEL8x2x2 - - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_N3_M2_TL1 -+.L_N3_M2_TL1_END: -+ KERNEL8x2x2_END - - .L_N3_M2_L7: - /* if (!(L & 7)) goto L_N3_M2_L0 */ -@@ -2942,8 +2648,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 - - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x10 -@@ -2962,8 +2668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - - /* Load C1 */ -- xvld U0, C1, 0x00 -- xvfmadd.d D4, D4, VALPHA, U0 -+ xvld U1, C1, 0x00 -+ xvfmadd.d D4, D4, VALPHA, U1 - #endif // #if defined(TRMMKERNEL) - - xvstelm.d D0, C0, 0x00, 0x00 -@@ -3017,132 +2723,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- #else - /* number of values in B */ - addi.d L, OFF, 2 --#endif --#else // #if !defined(TRMMKERNEL) -- move B0, B -- move L, K /* L = bk */ --#endif -- -- /* Load 1 * 64 from A0 */ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- /* line 1 */ -- xvfmul.d D0, U0, U4 -- -- xvldrepl.d U4, B0, 0x08 -- /* line 2 */ -- xvfmul.d D4, U0, U4 -- -- /* Add stride for A0 and B0 */ -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x10 -- /* Reduce L */ -- addi.d L, L, -1 -- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -- /* if (TL < 1) goto L_N3_M1_L7 */ -- beq ZERO,TL, .L_N3_M1_L7 -- --.L_N3_M1_TL1: /* TL-- */ -- /***8-1***/ -- /* Load 1 * 64 from A0 */ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x10 -- -- /***8-2***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x10 -- -- /***8-3***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x10 -- -- /***8-4***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x10 -- -- /***8-5***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x10 -- -- /***8-6***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -- -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x10 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif - -- /***8-7***/ -+ /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 - -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x10 -+ xvldrepl.d U4, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U4 - -- /***8-8***/ -- xvld U0, A0, 0x00 -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N3_M1_L7 */ -+ beq ZERO,TL, .L_N3_M1_L7 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+ xvld U8, A0, 0x00 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ addi.d TL, TL, -1 - -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - -+ beq ZERO, TL, .L_N3_M1_TL1_END -+ -+.L_N3_M1_TL1: /* TL-- */ -+ KERNEL8x1x2 -+ - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_N3_M1_TL1 -+.L_N3_M1_TL1_END: -+ KERNEL8x1x2_END - - .L_N3_M1_L7: - /* if (!(L & 7)) goto L_N3_M1_L0 */ -@@ -3155,8 +2779,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - -- xvldrepl.d U4, B0, 0x08 -- xvfmadd.d D4, U0, U4, D4 -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 - - /* Add stride for A0, B0 */ - addi.d A0, A0, 0x08 -@@ -3175,8 +2799,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - - /* Load C1 */ -- xvld U0, C1, 0x00 -- xvfmadd.d D4, D4, VALPHA, U0 -+ xvld U1, C1, 0x00 -+ xvfmadd.d D4, D4, VALPHA, U1 - #endif // #if defined(TRMMKERNEL) - - xvstelm.d D0, C0, 0x00, 0x00 -@@ -3300,137 +2924,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - /* if (TL < 1) goto L_N1_L7 */ - beq ZERO,TL, .L_N1_L7 - --.L_N1_TL1: /* TL-- */ -- /***8-1***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x08 -- -- /***8-2***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x08 -- -- /***8-3***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x08 -- -- /***8-4***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x08 -- -- /***8-5***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x08 -- -- /***8-6***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x08 -- -- /***8-7***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 -+ xvld U10, A0, 0x40 -+ xvld U11, A0, 0x60 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -+ addi.d TL, TL, -1 - -+ xvldrepl.d U12, B0, 0x00 - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - -- /***8-8***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- xvld U2, A0, 0x40 -- xvld U3, A0, 0x60 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- xvfmadd.d D2, U2, U4, D2 -- xvfmadd.d D3, U3, U4, D3 -- -- addi.d A0, A0, 0x80 -- addi.d B0, B0, 0x08 -+ beq ZERO, TL, .L_N1_TL1_END -+.L_N1_TL1: /* TL-- */ -+ KERNEL8x16x1 - - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_N1_TL1 -+.L_N1_TL1_END: -+ KERNEL8x16x1_END - - .L_N1_L7: - /* if (!(L & 7)) goto L_N1_L0 */ -@@ -3494,161 +3006,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- #endif - slli.d T0, L, 0x07 - add.d A0, A0, T0 -- slli.d T0, L, 0x03 -- add.d B0, B0, T0 --#endif -- --#ifdef LEFT -- addi.d OFF, OFF, 0x10 --#endif --#endif // #if defined(TRMMKERNEL) -- -- addi.d I, I, -1 /* I-- */ -- blt ZERO,I, .L_N1_I1 -- --.L_N1_M8: -- /* We have done M & 16, considering M=8/4/2/1 */ -- andi I, M, 15 -- beq ZERO,I, .L_N1_M0 -- -- andi I, M, 8 -- beq ZERO,I, .L_N1_M4 -- --#if defined(TRMMKERNEL) --#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -- move B0, B --#else -- slli.d T0, OFF, 0x06 -- add.d A0, A0, T0 -- slli.d T0, OFF, 0x03 -- add.d B0, B, T0 --#endif -- --#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -- sub.d L, K, OFF --#elif defined(LEFT) -- /* number of values in A */ -- addi.d L, OFF, 8 --#else -- /* number of values in B */ -- addi.d L, OFF, 1 --#endif --#else // #if !defined(TRMMKERNEL) -- move B0, B -- move L, K /* L = bk */ --#endif -- -- /* Load 8 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- -- xvldrepl.d U4, B0, 0x00 -- /* line 1 */ -- xvfmul.d D0, U0, U4 -- xvfmul.d D1, U1, U4 -- -- /* Add stride for A0 and B0 */ -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x08 -- /* Reduce L */ -- addi.d L, L, -1 -- srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -- /* if (TL < 1) goto L_N1_M8_L7 */ -- beq ZERO,TL, .L_N1_M8_L7 -- --.L_N1_M8_TL1: /* TL-- */ -- /***8-1***/ -- /* Load 16 * 64 from A0 */ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x08 -- -- /***8-2***/ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x08 -- -- /***8-3***/ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x08 -- -- /***8-4***/ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -- -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x08 -+ slli.d T0, L, 0x03 -+ add.d B0, B0, T0 -+#endif - -- /***8-5***/ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -+#ifdef LEFT -+ addi.d OFF, OFF, 0x10 -+#endif -+#endif // #if defined(TRMMKERNEL) - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -+ addi.d I, I, -1 /* I-- */ -+ blt ZERO,I, .L_N1_I1 - -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x08 -+.L_N1_M8: -+ /* We have done M & 16, considering M=8/4/2/1 */ -+ andi I, M, 15 -+ beq ZERO,I, .L_N1_M0 - -- /***8-6***/ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -+ andi I, M, 8 -+ beq ZERO,I, .L_N1_M4 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ slli.d T0, OFF, 0x06 -+ add.d A0, A0, T0 -+ slli.d T0, OFF, 0x03 -+ add.d B0, B, T0 -+#endif - -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x08 -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ sub.d L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ addi.d L, OFF, 8 -+#else -+ /* number of values in B */ -+ addi.d L, OFF, 1 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif - -- /***8-7***/ -+ /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - -- xvldrepl.d U4, 
B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 - -- addi.d A0, A0, 0x40 -- addi.d B0, B0, 0x08 -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N1_M8_L7 */ -+ beq ZERO,TL, .L_N1_M8_L7 - -- /***8-8***/ -- xvld U0, A0, 0x00 -- xvld U1, A0, 0x20 -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- xvfmadd.d D1, U1, U4, D1 -+ addi.d TL, TL, -1 - -+ xvldrepl.d U12, B0, 0x00 - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - -+ beq ZERO, TL, .L_N1_M8_TL1_END -+.L_N1_M8_TL1: /* TL-- */ -+ KERNEL8x8x1 -+ - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_N1_M8_TL1 - -+.L_N1_M8_TL1_END: -+ KERNEL8x8x1_END -+ - .L_N1_M8_L7: - /* if (!(L & 7)) goto L_N1_M8_L0 */ - andi TL, L, 7 -@@ -3753,81 +3191,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - /* if (TL < 1) goto L_N1_M4_L7 */ - beq ZERO,TL, .L_N1_M4_L7 - --.L_N1_M4_TL1: /* TL-- */ -- /***8-1***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x08 -- -- /***8-2***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x08 -- -- /***8-3***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x08 -- -- /***8-4***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x08 -- -- /***8-5***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x08 -- -- /***8-6***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x08 -- -- /***8-7***/ -- xvld U0, A0, 0x00 -+ xvld U8, A0, 0x00 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+ addi.d TL, TL, -1 - -+ xvldrepl.d U12, B0, 0x00 - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - -- /***8-8***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+ beq ZERO, TL, .L_N1_M4_TL1_END - -- addi.d A0, A0, 0x20 -- addi.d B0, B0, 0x08 -+.L_N1_M4_TL1: /* TL-- */ -+ KERNEL8x4x1 - - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_N1_M4_TL1 -+.L_N1_M4_TL1_END: -+ KERNEL8x4x1_END - - .L_N1_M4_L7: - /* if (!(L & 7)) goto L_N1_M4_L0 */ -@@ -3927,82 +3307,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- /* if (TL < 1) goto L_N1_M2_L7 */ - beq ZERO,TL, .L_N1_M2_L7 - --.L_N1_M2_TL1: /* TL-- */ -- /***8-1***/ -- /* Load 2 * 64 from A0 */ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x08 -- -- /***8-2***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x08 -- -- /***8-3***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x08 -- -- /***8-4***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x08 -- -- /***8-5***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x08 -- -- /***8-6***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x08 -- -- /***8-7***/ -- xvld U0, A0, 0x00 -+ xvld U8, A0, 0x00 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+ addi.d TL, TL, -1 - -+ xvldrepl.d U12, B0, 0x00 - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - -- /***8-8***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+ beq ZERO, TL, .L_N1_M2_TL1_END - -- addi.d A0, A0, 0x10 -- addi.d B0, B0, 0x08 -+.L_N1_M2_TL1: /* TL-- */ -+ KERNEL8x2x1 - - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_N1_M2_TL1 -+.L_N1_M2_TL1_END: -+ KERNEL8x2x1_END - - .L_N1_M2_L7: - /* if (!(L & 7)) goto L_N1_M2_L0 */ -@@ -4101,82 +3422,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - /* if (TL < 1) goto L_N1_M1_L7 */ - beq ZERO,TL, .L_N1_M1_L7 - --.L_N1_M1_TL1: /* TL-- */ -- /***8-1***/ -- /* Load 1 * 64 from A0 */ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x08 -- -- /***8-2***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x08 -- -- /***8-3***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x08 -- -- /***8-4***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x08 -- -- /***8-5***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x08 -- -- /***8-6***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -- -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x08 -- -- /***8-7***/ -- xvld U0, A0, 0x00 -+ xvld U8, A0, 0x00 - -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+ addi.d TL, TL, -1 - -+ xvldrepl.d U12, B0, 0x00 - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - -- /***8-8***/ -- xvld U0, A0, 0x00 -- -- xvldrepl.d U4, B0, 0x00 -- xvfmadd.d D0, U0, U4, D0 -+ beq ZERO, TL, .L_N1_M1_TL1_END - -- addi.d A0, A0, 0x08 -- addi.d B0, B0, 0x08 -+.L_N1_M1_TL1: /* TL-- */ -+ KERNEL8x1x1 - - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_N1_M1_TL1 -+.L_N1_M1_TL1_END: -+ KERNEL8x1x1_END - - .L_N1_M1_L7: - /* if (!(L & 7)) goto L_N1_M1_L0 */ -@@ -4243,7 +3505,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- LDARG $r26, $sp, 24 - LDARG $r27, $sp, 32 - LD $f23, $sp, 40 -- addi.d $sp, $sp, 56 -+ LD $f24, $sp, 48 -+ LD $f25, $sp, 56 -+ LD $f26, $sp, 64 -+ LD $f27, $sp, 72 -+ LD $f28, $sp, 80 -+ LD $f29, $sp, 88 -+ LD $f30, $sp, 96 -+ LD $f31, $sp, 104 -+ addi.d $sp, $sp, 120 - - jirl $r0, $r1, 0x0 - -diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S -new file mode 100644 -index 000000000..a49bf9bb1 ---- /dev/null -+++ b/kernel/loongarch64/dgemv_n_8_lasx.S -@@ -0,0 +1,554 @@ -+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "loongarch64_asm.S" -+ -+/********************************************************************* -+* 2023/07/14 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+* -+* -+*********************************************************************/ -+ -+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, -+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -+ */ -+#define M $r4 -+#define N $r5 -+#define ALPHA $f0 -+#define A $r7 -+#define LDA $r8 -+#define X $r9 -+#define INC_X $r10 -+#define Y $r11 -+#define INC_Y $r6 -+ -+#define J $r12 -+#define I $r13 -+#define K $r14 -+#define Y_ORG $r15 -+#define OFFSET $r16 -+#define K_LDA $r17 -+#define M8 $r18 -+#define T0 $r19 -+#define PA0 $r20 -+#define PA1 $r23 -+#define PA2 $r24 -+#define PA3 $r25 -+#define PA4 $r26 -+#define PA5 $r27 -+#define PA6 $r28 -+#define PA7 $r29 -+ -+#define VALPHA $xr1 -+#define X0 $xr2 -+#define X1 $xr3 -+#define X2 $xr4 -+#define X3 $xr5 -+#define X4 $xr6 -+#define X5 $xr7 -+#define X6 $xr8 -+#define X7 $xr9 -+#define Y0 $xr10 -+#define Y1 $xr11 -+#define A0 $xr12 -+#define A1 $xr13 -+#define A2 $xr14 -+#define A3 $xr15 -+#define A4 $xr16 -+#define A5 $xr17 -+#define A6 $xr18 -+#define A7 $xr19 -+#define A8 $xr20 -+#define A9 $xr21 -+#define A10 $xr22 -+#define A11 $xr23 -+#define A12 $xr24 -+#define A13 $xr25 -+#define A14 $xr26 -+#define A15 $xr27 -+ -+.macro DLOAD_X_8 -+ GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ -+ X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 -+ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ -+ X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA -+.endm -+ -+.macro DLOAD_X_4 -+ GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 -+ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA -+.endm -+ -+.macro DLOAD_X_2 -+ GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08 -+ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA -+.endm -+ -+.macro DLOAD_X_1 -+ GLDREPL xv, d, X0, X, 0x00 -+ GMUL xvf, d, X0, X0, VALPHA -+.endm -+ -+.macro DLOAD_Y_8 -+ GLD xv, , Y0, Y, 0, Y1, Y, 0x20 -+.endm -+ -+.macro DLOAD_Y_4 -+ GLD xv, , Y0, Y, 0 -+.endm -+ -+.macro DLOAD_Y_1 -+ fld.d $f10, Y, 0 -+.endm -+ -+.macro DSTORE_Y_8 -+ GST xv, , Y0, Y, 0, Y1, Y, 0x20 -+.endm -+ -+.macro DSTORE_Y_4 -+ GST xv, , Y0, Y, 0 -+.endm -+ -+.macro DSTORE_Y_1 -+ fst.d $f10, Y, 0 -+.endm -+ -+// Unable to use vector load/store ins -+.macro DLOAD_Y_8_GAP -+ fld.d $f10, Y, 0 -+ fldx.d $f13, Y, INC_Y -+ PTR_ALSL T0, INC_Y, Y, 1 -+ fld.d $f14, T0, 0 -+ fldx.d $f15, T0, INC_Y -+ PTR_ALSL T0, INC_Y, Y, 2 -+ fld.d $f11, T0, 0 -+ fldx.d $f17, T0, INC_Y -+ PTR_ADD T0, T0, INC_Y -+ PTR_ADD T0, T0, INC_Y -+ fld.d $f18, T0, 0 -+ fldx.d $f19, T0, INC_Y -+ GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 -+.endm -+ -+.macro DLOAD_Y_4_GAP -+ fld.d $f10, Y, 0 -+ fldx.d $f13, Y, INC_Y -+ PTR_ALSL T0, INC_Y, Y, 1 -+ fld.d $f14, T0, 0 -+ fldx.d $f15, T0, INC_Y -+ GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3 -+.endm -+ -+.macro DSTORE_Y_8_GAP -+ xvstelm.d Y0, Y, 0, 0 -+ PTR_ADD T0, Y, INC_Y -+ xvstelm.d Y0, T0, 0, 1 -+ PTR_ADD T0, T0, INC_Y -+ xvstelm.d Y0, T0, 0, 2 -+ PTR_ADD T0, T0, INC_Y -+ xvstelm.d Y0, T0, 0, 3 -+ -+ PTR_ADD T0, T0, INC_Y -+ xvstelm.d Y1, T0, 0, 0 -+ PTR_ADD T0, T0, INC_Y -+ xvstelm.d Y1, T0, 0, 1 -+ PTR_ADD T0, T0, 
INC_Y -+ xvstelm.d Y1, T0, 0, 2 -+ PTR_ADD T0, T0, INC_Y -+ xvstelm.d Y1, T0, 0, 3 -+.endm -+ -+.macro DSTORE_Y_4_GAP -+ xvstelm.d Y0, Y, 0, 0 -+ PTR_ADD T0, Y, INC_Y -+ xvstelm.d Y0, T0, 0, 1 -+ PTR_ADD T0, T0, INC_Y -+ xvstelm.d Y0, T0, 0, 2 -+ PTR_ADD T0, T0, INC_Y -+ xvstelm.d Y0, T0, 0, 3 -+.endm -+ -+.macro DLOAD_X_8_GAP -+ xvldrepl.d X0, X, 0x00 -+ PTR_ADD T0, X, INC_X -+ xvldrepl.d X1, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.d X2, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.d X3, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.d X4, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.d X5, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.d X6, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.d X7, T0, 0x00 -+ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ -+ X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA -+.endm -+ -+.macro DLOAD_X_4_GAP -+ xvldrepl.d X0, X, 0x00 -+ PTR_ADD T0, X, INC_X -+ xvldrepl.d X1, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.d X2, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.d X3, T0, 0x00 -+ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA -+.endm -+ -+.macro DLOAD_X_2_GAP -+ xvldrepl.d X0, X, 0x00 -+ PTR_ADD T0, X, INC_X -+ xvldrepl.d X1, T0, 0x00 -+ GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA -+.endm -+ -+.macro DGEMV_N_8x8 -+ GLD_INC xv, , 0x20, \ -+ A0, PA0, 0, A1, PA0, 0, \ -+ A2, PA1, 0, A3, PA1, 0, \ -+ A4, PA2, 0, A5, PA2, 0, \ -+ A6, PA3, 0, A7, PA3, 0, \ -+ A8, PA4, 0, A9, PA4, 0, \ -+ A10, PA5, 0, A11, PA5, 0, \ -+ A12, PA6, 0, A13, PA6, 0, \ -+ A14, PA7, 0, A15, PA7, 0 -+ -+ GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ -+ Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ -+ Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ -+ Y0, A6, X3, Y0, Y1, A7, X3, Y1, \ -+ Y0, A8, X4, Y0, Y1, A9, X4, Y1, \ -+ Y0, A10, X5, Y0, Y1, A11, X5, Y1, \ -+ Y0, A12, X6, Y0, Y1, A13, X6, Y1, \ -+ Y0, A14, X7, Y0, Y1, A15, X7, Y1 -+.endm -+ -+.macro DGEMV_N_4x8 -+ GLD_INC xv, , 0x20, A0, PA0, 0, \ -+ A2, PA1, 0, \ -+ A4, PA2, 0, \ -+ A6, PA3, 0, \ -+ A8, PA4, 0, \ -+ A10, PA5, 0, \ -+ A12, PA6, 0, \ -+ A14, PA7, 0 -+ -+ GMADD xvf, d, Y0, A0, X0, Y0, \ -+ Y0, A2, X1, Y0, \ -+ Y0, A4, X2, Y0, \ -+ Y0, A6, X3, Y0, \ -+ Y0, A8, X4, Y0, \ -+ Y0, A10, X5, Y0, \ -+ Y0, A12, X6, Y0, \ -+ Y0, A14, X7, Y0 -+.endm -+ -+.macro DGEMV_N_1x8 -+ GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ -+ $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 -+ GMADD f, d, $f10, $f12, $f2, $f10, \ -+ $f10, $f14, $f3, $f10, \ -+ $f10, $f16, $f4, $f10, \ -+ $f10, $f18, $f5, $f10, \ -+ $f10, $f20, $f6, $f10, \ -+ $f10, $f22, $f7, $f10, \ -+ $f10, $f24, $f8, $f10, \ -+ $f10, $f26, $f9, $f10, -+.endm -+ -+.macro DGEMV_N_8x4 -+ GLD_INC xv, , 0x20, \ -+ A0, PA0, 0, A1, PA0, 0, \ -+ A2, PA1, 0, A3, PA1, 0, \ -+ A4, PA2, 0, A5, PA2, 0, \ -+ A6, PA3, 0, A7, PA3, 0 -+ -+ GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ -+ Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ -+ Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ -+ Y0, A6, X3, Y0, Y1, A7, X3, Y1 -+.endm -+ -+.macro DGEMV_N_4x4 -+ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 -+ -+ GMADD xvf, d, Y0, A0, X0, Y0, Y0, A2, X1, Y0, \ -+ Y0, A4, X2, Y0, Y0, A6, X3, Y0 -+.endm -+ -+.macro DGEMV_N_1x4 -+ GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 -+ GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \ -+ $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10 -+.endm -+ -+.macro DGEMV_N_8x2 -+ GLD_INC xv, , 0x20, \ -+ A0, PA0, 0, A1, PA0, 0, \ -+ A2, PA1, 0, A3, 
PA1, 0 -+ GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ -+ Y0, A2, X1, Y0, Y1, A3, X1, Y1 -+.endm -+ -+.macro DGEMV_N_4x2 -+ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 -+ GMADD xvf, d, Y0, A0, X0, Y0, \ -+ Y0, A2, X1, Y0 -+.endm -+ -+.macro DGEMV_N_1x2 -+ GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0 -+ GMADD f, d, $f10, $f12, $f2, $f10, \ -+ $f10, $f14, $f3, $f10 -+.endm -+ -+.macro DGEMV_N_1x1 -+ fld.d $f12, PA0, 0 -+ PTR_ADDI PA0, PA0, 0x08 -+ fmadd.d $f10, $f12, $f2, $f10 -+.endm -+ -+.macro DGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req -+ PTR_SRLI J, N, 3 -+ beqz J, .L_\XW\()_N_7 -+ PTR_SLLI K_LDA, LDA, 3 -+ PTR_SUB K_LDA, K_LDA, M8 -+.L_\XW\()_N_L8: -+ DLOAD_\X_8 -+ xor K, K, K -+ move Y, Y_ORG -+ PTR_SRLI I, M, 3 -+ beqz I, .L_\XW\()_M_7 -+.align 5 -+.L_\XW\()_M_L8: -+ DLOAD_\Y_8 -+ DGEMV_N_8x8 -+ DSTORE_\Y_8 -+ PTR_ADDI I, I, -1 -+ PTR_ALSL Y, INC_Y, Y, 3 -+ PTR_ADDI K, K, 8 -+ bnez I, .L_\XW\()_M_L8 -+.L_\XW\()_M_7: -+ andi I, M, 4 -+ beqz I, .L_\XW\()_M_3 -+ DLOAD_\Y_4 -+ DGEMV_N_4x8 -+ DSTORE_\Y_4 -+ PTR_ALSL Y, INC_Y, Y, 2 -+ PTR_ADDI K, K, 4 -+.L_\XW\()_M_3: -+ andi I, M, 3 -+ beqz I, .L_\XW\()_M_END -+.align 5 -+.L_\XW\()_M_L1: -+ DLOAD_\Y_1 -+ DGEMV_N_1x8 -+ DSTORE_\Y_1 -+ PTR_ADDI I, I, -1 -+ PTR_ADD Y, Y, INC_Y -+ PTR_ADDI K, K, 1 -+ bnez I, .L_\XW\()_M_L1 -+.L_\XW\()_M_END: -+ PTR_ADDI J, J, -1 -+#if __loongarch_grlen == 64 -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ -+ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -+#elif __loongarch_grlen == 32 -+ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ -+ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -+#else -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ -+ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -+#endif -+ PTR_ALSL X, INC_X, X, 3 -+ bnez J, .L_\XW\()_N_L8 -+.L_\XW\()_N_7: -+ andi J, N, 4 -+ beqz J, .L_\XW\()_N_3 -+ DLOAD_\X_4 -+ xor K, K, K -+ move Y, Y_ORG -+ -+ PTR_SRLI I, M, 3 -+ beqz I, .L_\XW\()_N_4_M_7 -+.align 5 -+.L_\XW\()_N_4_M_L8: -+ DLOAD_\Y_8 -+ DGEMV_N_8x4 -+ DSTORE_\Y_8 -+ PTR_ADDI I, I, -1 -+ PTR_ADDI K, K, 8 -+ PTR_ALSL Y, INC_Y, Y, 3 -+ bnez I, .L_\XW\()_N_4_M_L8 -+.L_\XW\()_N_4_M_7: -+ andi I, M, 4 -+ beqz I, .L_\XW\()_N_4_M_3 -+ DLOAD_\Y_4 -+ DGEMV_N_4x4 -+ DSTORE_\Y_4 -+ PTR_ALSL Y, INC_Y, Y, 2 -+ PTR_ADDI K, K, 4 -+.L_\XW\()_N_4_M_3: -+ andi I, M, 3 -+ beqz I, .L_\XW\()_N_4_M_END -+.align 5 -+.L_\XW\()_N_4_M_L1: -+ DLOAD_\Y_1 -+ DGEMV_N_1x4 -+ DSTORE_\Y_1 -+ PTR_ADDI I, I, -1 -+ PTR_ADD Y, Y, INC_Y -+ PTR_ADDI K, K, 1 -+ bnez I, .L_\XW\()_N_4_M_L1 -+.L_\XW\()_N_4_M_END: -+ PTR_SLLI K_LDA, LDA, 2 -+ PTR_SUB K_LDA, K_LDA, M8 -+#if __loongarch_grlen == 64 -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -+#elif __loongarch_grlen == 32 -+ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -+#else -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -+#endif -+ PTR_ALSL X, INC_X, X, 2 -+.L_\XW\()_N_3: -+ andi J, N, 2 -+ beqz J, .L_\XW\()_N_1 -+ DLOAD_\X_2 -+ xor K, K, K -+ move Y, Y_ORG -+ PTR_SRLI I, M, 3 -+ beqz I, .L_\XW\()_N_2_M_7 -+.align 5 -+.L_\XW\()_N_2_M_L8: -+ DLOAD_\Y_8 -+ DGEMV_N_8x2 -+ DSTORE_\Y_8 -+ PTR_ADDI I, I, -1 -+ PTR_ADDI K, K, 8 -+ PTR_ALSL Y, INC_Y, Y, 3 -+ bnez I, .L_\XW\()_N_2_M_L8 -+.L_\XW\()_N_2_M_7: -+ andi I, M, 4 -+ beqz I, .L_\XW\()_N_2_M_3 -+ DLOAD_\Y_4 -+ DGEMV_N_4x2 -+ DSTORE_\Y_4 
-+ PTR_ALSL Y, INC_Y, Y, 2 -+ PTR_ADDI K, K, 4 -+.L_\XW\()_N_2_M_3: -+ andi I, M, 3 -+ beqz I, .L_\XW\()_N_2_M_END -+.align 5 -+.L_\XW\()_N_2_M_L1: -+ DLOAD_\Y_1 -+ DGEMV_N_1x2 -+ DSTORE_\Y_1 -+ PTR_ADDI I, I, -1 -+ PTR_ADD Y, Y, INC_Y -+ PTR_ADDI K, K, 1 -+ bnez I, .L_\XW\()_N_2_M_L1 -+.L_\XW\()_N_2_M_END: -+ PTR_SLLI K_LDA, LDA, 1 -+ PTR_SUB K_LDA, K_LDA, M8 -+ PTR_ADD PA0, PA0, K_LDA -+ PTR_ADD PA1, PA1, K_LDA -+ PTR_ALSL X, INC_X, X, 1 -+.L_\XW\()_N_1: -+ andi J, N, 1 -+ beqz J, .L_END -+ DLOAD_\X_1 -+ xor K, K, K -+ move Y, Y_ORG -+ move I, M -+ beqz I, .L_END -+.align 5 -+.L_\XW\()_N_1_M_L1: -+ DLOAD_\Y_1 -+ DGEMV_N_1x1 -+ DSTORE_\Y_1 -+ PTR_ADDI I, I, -1 -+ PTR_ADD Y, Y, INC_Y -+ PTR_ADDI K, K, 1 -+ bnez I, .L_\XW\()_N_1_M_L1 -+ b .L_END -+.endm -+ -+ PROLOGUE -+ PTR_LD INC_Y, $sp, 0 -+ push_if_used 17 + 7, 24 + 4 -+ PTR_ADDI K, $r0, 0x01 -+ PTR_SUB I, INC_X, K -+ PTR_SUB J, INC_Y, K -+ maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ -+ maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ -+ PTR_ALSL I, I, J, 1 -+ GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 -+ xvreplve0.d VALPHA, $xr0 -+ move Y_ORG, Y -+ move PA0, A -+#if __loongarch_grlen == 64 -+ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ -+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -+#elif __loongarch_grlen == 32 -+ GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ -+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -+#else -+ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ -+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -+#endif -+ la.local T0, .L_GAP_TABLE -+ PTR_ALSL I, I, T0, 1 -+ ld.h K, I, 0 -+ PTR_ADD T0, T0, K -+ jirl $r0, T0, 0 -+.L_GAP_TABLE: -+ .hword .L_GAP_0_0 - .L_GAP_TABLE -+ .hword .L_GAP_0_1 - .L_GAP_TABLE -+ .hword .L_GAP_1_0 - .L_GAP_TABLE -+ .hword .L_GAP_1_1 - .L_GAP_TABLE -+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ -+ DGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 -+.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ -+ DGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 -+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ -+ DGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 -+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ -+ DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 -+.L_END: -+ pop_if_used 17 + 7, 24 + 4 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S -new file mode 100644 -index 000000000..71f942b0f ---- /dev/null -+++ b/kernel/loongarch64/dgemv_t_8_lasx.S -@@ -0,0 +1,481 @@ -+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "loongarch64_asm.S" -+ -+/********************************************************************* -+* 2023/07/17 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+* -+* -+*********************************************************************/ -+ -+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, -+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -+ */ -+#define M $r4 -+#define N $r5 -+#define ALPHA $f0 -+#define A $r7 -+#define LDA $r8 -+#define X $r9 -+#define INC_X $r10 -+#define Y $r11 -+#define INC_Y $r6 -+ -+#define J $r12 -+#define I $r13 -+#define K $r14 -+#define PY0 $r14 -+#define X_ORG $r15 -+#define PY1 $r16 -+#define K_LDA $r17 -+#define PY2 $r18 -+#define T0 $r19 -+#define PA0 $r20 -+#define PA1 $r23 -+#define PA2 $r24 -+#define PA3 $r25 -+#define PA4 $r26 -+#define PA5 $r27 -+#define PA6 $r28 -+#define PA7 $r29 -+#define M8 $r30 -+ -+#define VALPHA $xr0 -+#define X0 $xr1 -+#define X1 $xr2 -+#define A0 $xr3 -+#define A1 $xr4 -+#define A2 $xr5 -+#define A3 $xr6 -+#define A4 $xr7 -+#define A5 $xr8 -+#define A6 $xr9 -+#define A7 $xr10 -+#define A8 $xr11 -+#define A9 $xr12 -+#define A10 $xr13 -+#define A11 $xr14 -+#define A12 $xr15 -+#define A13 $xr16 -+#define A14 $xr17 -+#define A15 $xr18 -+#define TP0 $xr19 -+#define TP1 $xr20 -+#define TP2 $xr21 -+#define TP3 $xr22 -+#define TP4 $xr23 -+#define TP5 $xr24 -+#define TP6 $xr25 -+#define TP7 $xr26 -+#define Y0 $xr3 -+#define Y1 $xr4 -+#define Y2 $xr5 -+#define Y3 $xr6 -+#define Y4 $xr7 -+#define Y5 $xr8 -+#define Y6 $xr9 -+#define Y7 $xr10 -+ -+.macro ZERO_Y8 -+ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ -+ TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 -+.endm -+ -+.macro ZERO_Y4 -+ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 -+.endm -+ -+.macro ZERO_Y2 -+ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 -+.endm -+ -+.macro ZERO_Y1 -+ GXOR xv, v, TP0, TP0, TP0 -+.endm -+ -+.macro DLOAD_X8 -+ GLD xv, , X0, X, 0x00, X1, X, 0x20 -+.endm -+ -+.macro DLOAD_X4 -+ GLD xv, , X0, X, 0x00 -+.endm -+ -+.macro DLOAD_X8_GAP -+ fld.d $f1, X, 0x00 -+ fldx.d $f2, X, INC_X -+ PTR_ALSL T0, INC_X, X, 1 -+ fld.d $f3, T0, 0x00 -+ fldx.d $f4, T0, INC_X -+ GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 -+ PTR_ALSL T0, INC_X, X, 2 -+ fld.d $f2, T0, 0x00 -+ fldx.d $f3, T0, INC_X -+ PTR_ALSL T0, INC_X, T0, 1 -+ fld.d $f4, T0, 0x00 -+ fldx.d $f5, T0, INC_X -+ GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 -+.endm -+ -+.macro DLOAD_X4_GAP -+ fld.d $f1, X, 0x00 -+ fldx.d $f2, X, INC_X -+ PTR_ALSL T0, INC_X, X, 1 -+ fld.d $f3, T0, 0x00 -+ 
fldx.d $f4, T0, INC_X -+ GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 -+.endm -+ -+.macro DGEMV_T_8x8 -+ GLD_INC xv, , 0x20, \ -+ A0, PA0, 0, A1, PA0, 0, \ -+ A2, PA1, 0, A3, PA1, 0, \ -+ A4, PA2, 0, A5, PA2, 0, \ -+ A6, PA3, 0, A7, PA3, 0, \ -+ A8, PA4, 0, A9, PA4, 0, \ -+ A10, PA5, 0, A11, PA5, 0, \ -+ A12, PA6, 0, A13, PA6, 0, \ -+ A14, PA7, 0, A15, PA7, 0 -+ -+ GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ -+ TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ -+ TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ -+ TP3, A6, X0, TP3, TP3, A7, X1, TP3, \ -+ TP4, A8, X0, TP4, TP4, A9, X1, TP4, \ -+ TP5, A10, X0, TP5, TP5, A11, X1, TP5, \ -+ TP6, A12, X0, TP6, TP6, A13, X1, TP6, \ -+ TP7, A14, X0, TP7, TP7, A15, X1, TP7 -+.endm -+ -+.macro DGEMV_T_8x4 -+ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \ -+ A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0 -+ -+ GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ -+ TP2, A4, X0, TP2, TP3, A6, X0, TP3, \ -+ TP4, A8, X0, TP4, TP5, A10, X0, TP5, \ -+ TP6, A12, X0, TP6, TP7, A14, X0, TP7, -+.endm -+ -+.macro DGEMV_T_4x8 -+ GLD_INC xv, , 0x20, \ -+ A0, PA0, 0, A1, PA0, 0, \ -+ A2, PA1, 0, A3, PA1, 0, \ -+ A4, PA2, 0, A5, PA2, 0, \ -+ A6, PA3, 0, A7, PA3, 0 -+ -+ GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ -+ TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ -+ TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ -+ TP3, A6, X0, TP3, TP3, A7, X1, TP3 -+.endm -+ -+.macro DGEMV_T_4x4 -+ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 -+ -+ GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ -+ TP2, A4, X0, TP2, TP3, A6, X0, TP3 -+.endm -+ -+.macro DGEMV_T_2x8 -+ GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0, A2, PA1, 0, A3, PA1, 0 -+ -+ GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ -+ TP1, A2, X0, TP1, TP1, A3, X1, TP1 -+.endm -+ -+.macro DGEMV_T_2x4 -+ GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 -+ -+ GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1 -+.endm -+ -+.macro DGEMV_T_LASX XW:req X8:req, X4:req -+ PTR_SRLI J, N, 3 -+ beqz J, .L_\XW\()_N_7 -+ PTR_SLLI K_LDA, LDA, 3 -+ PTR_SUB K_LDA, K_LDA, M8 -+.L_\XW\()_N_L8: -+ ZERO_Y8 -+ move X, X_ORG -+ PTR_SRLI I, M, 3 -+ beqz I, .L_\XW\()_M_7 -+.align 5 -+.L_\XW\()_M_L8: -+ DLOAD_\X8 -+ DGEMV_T_8x8 -+ PTR_ADDI I, I, -1 -+ PTR_ALSL X, INC_X, X, 3 -+ bnez I, .L_\XW\()_M_L8 -+.L_\XW\()_M_7: -+ andi I, M, 4 -+ beqz I, .L_\XW\()_M_3 -+ DLOAD_\X4 -+ DGEMV_T_8x4 -+ PTR_ALSL X, INC_X, X, 2 -+.L_\XW\()_M_3: -+ // Accumulated -+ GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ -+ Y5, TP5, Y6, TP6, Y7, TP7 -+ andi I, M, 3 -+ beqz I, .L_\XW\()_M_END -+.align 5 -+.L_\XW\()_M_L1: -+ fld.d $f1, X, 0x00 -+ fld.d $f11, PA0, 0x00 -+ fld.d $f12, PA1, 0x00 -+ fld.d $f13, PA2, 0x00 -+ fld.d $f14, PA3, 0x00 -+ fld.d $f15, PA4, 0x00 -+ fld.d $f16, PA5, 0x00 -+ fld.d $f17, PA6, 0x00 -+ fld.d $f18, PA7, 0x00 -+#if __loongarch_grlen == 64 -+ GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ -+ PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 -+#elif __loongarch_grlen == 32 -+ GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ -+ PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 -+#else -+ GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ -+ PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 -+#endif -+ GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \ -+ $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10 -+ 
PTR_ADDI I, I, -1 -+ PTR_ADD X, X, INC_X -+ bnez I, .L_\XW\()_M_L1 -+.L_\XW\()_M_END: -+ fld.d $f11, Y, 0x00 -+ fldx.d $f12, Y, INC_Y -+ PTR_ALSL PY0, INC_Y, Y, 1 -+ fld.d $f13, PY0, 0x00 -+ fldx.d $f14, PY0, INC_Y -+ PTR_ALSL PY1, INC_Y, Y, 2 -+ fld.d $f15, PY1, 0x00 -+ fldx.d $f16, PY1, INC_Y -+ PTR_ALSL PY2, INC_Y, PY1, 1 -+ fld.d $f17, PY2, 0x00 -+ fldx.d $f18, PY2, INC_Y -+ -+ GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \ -+ $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18 -+ -+ PTR_ADDI J, J, -1 -+#if __loongarch_grlen == 64 -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ -+ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -+#elif __loongarch_grlen == 32 -+ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ -+ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -+#else -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ -+ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -+#endif -+ fst.d $f11, Y, 0x00 -+ fstx.d $f12, Y, INC_Y -+ fst.d $f13, PY0, 0x00 -+ fstx.d $f14, PY0, INC_Y -+ fst.d $f15, PY1, 0x00 -+ fstx.d $f16, PY1, INC_Y -+ fst.d $f17, PY2, 0x00 -+ fstx.d $f18, PY2, INC_Y -+ PTR_ALSL Y, INC_Y, Y, 3 -+ bnez J, .L_\XW\()_N_L8 -+.L_\XW\()_N_7: -+ andi J, N, 4 -+ beqz J, .L_\XW\()_N_3 -+ ZERO_Y4 -+ move X, X_ORG -+ PTR_SRLI I, M, 3 -+ beqz I, .L_\XW\()_N_4_M_7 -+.align 5 -+.L_\XW\()_N_4_M_L8: -+ DLOAD_\X8 -+ DGEMV_T_4x8 -+ PTR_ADDI I, I, -1 -+ PTR_ALSL X, INC_X, X, 3 -+ bnez I, .L_\XW\()_N_4_M_L8 -+.L_\XW\()_N_4_M_7: -+ andi I, M, 4 -+ beqz I, .L_\XW\()_N_4_M_3 -+ DLOAD_\X4 -+ DGEMV_T_4x4 -+ PTR_ALSL X, INC_X, X, 2 -+.L_\XW\()_N_4_M_3: -+ // Accumulated -+ GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 -+ andi I, M, 3 -+ beqz I, .L_\XW\()_N_4_M_END -+.align 5 -+.L_\XW\()_N_4_M_L1: -+ fld.d $f1, X, 0x00 -+ GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00 -+ GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6 -+ PTR_ADDI I, I, -1 -+ PTR_ADD X, X, INC_X -+ bnez I, .L_\XW\()_N_4_M_L1 -+.L_\XW\()_N_4_M_END: -+ fld.d $f11, Y, 0x00 -+ fldx.d $f12, Y, INC_Y -+ PTR_ALSL PY0, INC_Y, Y, 1 -+ fld.d $f13, PY0, 0x00 -+ fldx.d $f14, PY0, INC_Y -+ -+ GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14 -+ -+ PTR_SLLI K_LDA, LDA, 2 -+ PTR_SUB K_LDA, K_LDA, M8 -+ -+#if __loongarch_grlen == 64 -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -+#elif __loongarch_grlen == 32 -+ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -+#else -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -+#endif -+ fst.d $f11, Y, 0x00 -+ fstx.d $f12, Y, INC_Y -+ fst.d $f13, PY0, 0x00 -+ fstx.d $f14, PY0, INC_Y -+ PTR_ALSL Y, INC_Y, Y, 2 -+.L_\XW\()_N_3: -+ andi J, N, 2 -+ beqz J, .L_\XW\()_N_1 -+ ZERO_Y2 -+ move X, X_ORG -+ PTR_SRLI I, M, 3 -+ beqz I, .L_\XW\()_N_2_M_7 -+.align 5 -+.L_\XW\()_N_2_M_L8: -+ DLOAD_\X8 -+ DGEMV_T_2x8 -+ PTR_ADDI I, I, -1 -+ PTR_ALSL X, INC_X, X, 3 -+ bnez I, .L_\XW\()_N_2_M_L8 -+.L_\XW\()_N_2_M_7: -+ andi I, M, 4 -+ beqz I, .L_\XW\()_N_2_M_3 -+ DLOAD_\X4 -+ DGEMV_T_2x4 -+ PTR_ALSL X, INC_X, X, 2 -+.L_\XW\()_N_2_M_3: -+ // Accumulated -+ GACC xvf, d, Y0, TP0, Y1, TP1 -+ andi I, M, 3 -+ beqz I, .L_\XW\()_N_2_M_END 
-+.align 5 -+.L_\XW\()_N_2_M_L1: -+ fld.d $f1, X, 0x00 -+ GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00 -+ GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4 -+ PTR_ADDI I, I, -1 -+ PTR_ADD X, X, INC_X -+ bnez I, .L_\XW\()_N_2_M_L1 -+.L_\XW\()_N_2_M_END: -+ fld.d $f11, Y, 0x00 -+ fldx.d $f12, Y, INC_Y -+ -+ GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12 -+ -+ PTR_SLLI K_LDA, LDA, 1 -+ PTR_SUB K_LDA, K_LDA, M8 -+ -+#if __loongarch_grlen == 64 -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA -+#elif __loongarch_grlen == 32 -+ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA -+#else -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA -+#endif -+ fst.d $f11, Y, 0x00 -+ fstx.d $f12, Y, INC_Y -+ PTR_ALSL Y, INC_Y, Y, 1 -+.L_\XW\()_N_1: -+ andi J, N, 1 -+ beqz J, .L_END -+ ZERO_Y1 -+ move X, X_ORG -+ move I, M -+ beqz I, .L_END -+.align 5 -+.L_\XW\()_N_1_M_L1: -+ fld.d $f3, PA0, 0x00 -+ fld.d $f1, X, 0x00 -+ fmadd.d $f19, $f3, $f1, $f19 -+ PTR_ADDI I, I, -1 -+ PTR_ADD X, X, INC_X -+ PTR_ADDI PA0, PA0, 0x08 -+ bnez I, .L_\XW\()_N_1_M_L1 -+ fld.d $f3, Y, 0x00 -+ fmadd.d $f3, ALPHA, $f19, $f3 -+ fst.d $f3, Y, 0x00 -+ b .L_END -+.endm -+ -+ PROLOGUE -+ PTR_LD INC_Y, $sp, 0 -+ push_if_used 17 + 8, 24 + 3 -+ PTR_ADDI K, $r0, 0x01 -+ PTR_SUB I, INC_X, K -+ maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ -+ GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 -+ xvreplve0.d VALPHA, $xr0 -+ move X_ORG, X -+ move PA0, A -+#if __loongarch_grlen == 64 -+ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ -+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -+#elif __loongarch_grlen == 32 -+ GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ -+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -+#else -+ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ -+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -+#endif -+ la.local T0, .L_GAP_TABLE -+ PTR_ALSL I, I, T0, 1 -+ ld.h K, I, 0 -+ PTR_ADD T0, T0, K -+ jirl $r0, T0, 0 -+.L_GAP_TABLE: -+ .hword .L_GAP_0 - .L_GAP_TABLE -+ .hword .L_GAP_1 - .L_GAP_TABLE -+.L_GAP_0: /* if (incx == 1) */ -+ DGEMV_T_LASX GAP_0, X8, X4 -+.L_GAP_1: /* if (incx != 1) */ -+ DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP -+.L_END: -+ pop_if_used 17 + 8, 24 + 3 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S -new file mode 100644 -index 000000000..3315daccb ---- /dev/null -+++ b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S -@@ -0,0 +1,1366 @@ -+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "loongarch64_asm.S" -+ -+/********************************************************************* -+* 2023/07/26 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+* -+* -+*********************************************************************/ -+ -+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, -+ * FLOAT *c, BLASLONG ldc, BLASLONG offset) -+ */ -+#define M $r4 // param 1: bm -+#define N $r5 // param 2: bn -+#define K $r6 // param 3: bk -+#define A $r7 // param 5: ba -+#define B $r8 // param 6: bb -+#define C $r9 // param 7: bc -+#define LDC $r10 // param 8: ldc -+#define OFFSET $r11 // param 9: offset -+ -+/* Cycle control parameters */ -+#define I $r13 -+#define J $r14 -+#define L $r15 -+#define TL $r16 -+/* Matrix address */ -+#define A0 $r17 -+#define B0 $r18 -+#define C0 $r19 -+#define C1 $r20 -+#define C2 $r23 -+#define C3 $r24 -+#define T0 $r25 -+#define T1 $r26 -+#define T2 $r27 -+#define KK $r28 -+#define AA $r29 -+#define CC $r30 -+#undef ZERO -+#define ZERO $r0 -+ -+#define U0 $xr0 -+#define U1 $xr1 -+#define U2 $xr2 -+#define U3 $xr3 -+#define U4 $xr4 -+#define U5 $xr5 -+#define U6 $xr6 -+#define U7 $xr7 -+#define U8 $xr8 -+#define U9 $xr9 -+#define U10 $xr10 -+#define U11 $xr11 -+#define U12 $xr12 -+#define U13 $xr13 -+#define U14 $xr14 -+#define U15 $xr15 -+#define D0 $xr16 -+#define D1 $xr17 -+#define D2 $xr18 -+#define D3 $xr19 -+#define D4 $xr20 -+#define D5 $xr21 -+#define D6 $xr22 -+#define D7 $xr23 -+#define D8 $xr24 -+#define D9 $xr25 -+#define D10 $xr26 -+#define D11 $xr27 -+#define D12 $xr28 -+#define D13 $xr29 -+#define D14 $xr30 -+#define D15 $xr31 -+ -+/* Prefetch interval */ -+#define A_PRE 0x400 -+#define B_PRE 0x100 -+ -+#include "dtrsm_kernel_macro.S" -+ -+// By integrating the dgemm and dsolve processes, the following advantages can be obtained: -+// 1. Avoid the overhead of function calls (by not invoking dgemm_kernel) -+// 2. Reduce the storage and retrieval of C data -+// 3. Vectorization of dsolve -+// GEMM_UNROLL_M x DGEMM_UNROLL_N is 16x4, which is a fairly large size. -+// To achieve finer-grained optimization, 15 scenarios have been addressed: -+// 16x4, 16x2, 16x1, 8x4, 8x2, 8x1, 4x4, 4x2, 4x1, 2x4, 2x2, 2x1, 1x4, 1x2, 1x1. -+ -+.macro dsolve_16 N -+// if N = 4 the data layout of C is as follows: -+// U0 U1 U2 U3 -+// U4 U5 U6 U7 -+// U8 U9 U10 U11 -+// U12 U13 U14 U15 -+// if N = 2 the dat layout of C is as follows: -+// U0 U1 U2 U3 -+// U4 U5 U6 U7 -+// if N = 1 the dat layout of C is as follows: -+// U0 U1 U2 U3 -+// The matrix A has dimensions of 16x16, and -+// it will be divided into 4 segments for processing. 
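For orientation, the recurrence these dsolve macros vectorize is an ordinary triangular back-substitution over the packed 16x16 block, with the diagonal entries pre-inverted during packing so the solve can multiply instead of divide (the convention used by OpenBLAS's generic trsm kernels). Below is a minimal scalar sketch in C; the function name, the indexing of the packed block a, and the b/c layouts are illustrative assumptions rather than the kernel's exact packing.

    /* Scalar sketch of the LN solve that dsolve_16 performs with LASX vectors.
     * Assumptions (illustrative only): the packed 16x16 triangular block `a`
     * stores 1/a(i,i) on its diagonal, `c` is a 16 x n tile with leading
     * dimension ldc, and `b` receives the solved values in packed order for
     * later GEMM updates. */
    static void solve_ln_sketch(int n, const double *a, double *b, double *c, int ldc)
    {
        enum { M = 16 };
        for (int i = M - 1; i >= 0; i--) {                  /* bottom row first */
            for (int j = 0; j < n; j++) {
                double x = c[i + j * ldc] * a[i * M + i];   /* diagonal is pre-inverted */
                c[i + j * ldc] = x;
                b[i * n + j]   = x;                         /* write back packed B */
                for (int k = 0; k < i; k++)                 /* eliminate x from the rows above */
                    c[k + j * ldc] -= x * a[i * M + k];
            }
        }
    }

In the vector code, GTRANSPOSE4x4_D regroups the C tile four unknowns at a time, GLDREPL broadcasts the required a entries, and the xvfmul.d/GNMSUB pairs carry out the multiply and elimination steps above.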
-+ -+#define G12 U3 -+#define G13 U7 -+#define G14 U11 -+#define G15 U15 -+ GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1 -+ // A -+ // G12 G13 G14 G15 -+ // ----------------- -+ // 204 | D9 -+ // 220 221 | D8 D7 -+ // 236 237 238 | D6 D5 D4 -+ // 252 253 254 255 | D3 D2 D1 D0 -+ PTR_ADDI T0, A0, 252 * 8 -+ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 -+ PTR_ADDI T0, A0, 236 * 8 -+ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 -+ PTR_ADDI T0, A0, 220 * 8 -+ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 -+ PTR_ADDI T0, A0, 204 * 8 -+ GLDREPL xv, d, D9, T0, 0 -+ -+ xvfmul.d G15, G15, D0 -+ GNMSUB xvf, d, G14, G15, D1, G14 -+ xvfmul.d G14, G14, D4 -+ GNMSUB xvf, d, G13, G15, D2, G13, G13, G14, D5, G13 -+ xvfmul.d G13, G13, D7 -+ GNMSUB xvf, d, G12, G15, D3, G12, G12, G14, D6, G12, G12, G13, D8, G12 -+ xvfmul.d G12, G12, D9 -+ // Store B -+.if \N == 4 -+ // x x x x ... x x x x -+ // x x x x ... x x x x -+ // x x x x ... x x x x -+ // b48 b49 b50 b51 ... b60 b61 b62 b63 -+ GST xv, , G12, B0, 48 * 8, G13, B0, 52 * 8, G14, B0, 56 * 8, G15, B0, 60 * 8 -+.elseif \N == 2 -+ // x x x x ... x x x x -+ // x x x x ... x x x x -+ // x x x x ... x x x x -+ // b24 b25 b26 b27 b28 b29 b30 b31 -+ GST v, , $vr3, B0, 24 * 8, $vr7, B0, 26 * 8, $vr11, B0, 28 * 8, $vr15, B0, 30 * 8 -+.elseif \N == 1 -+ // x x x x -+ // x x x x -+ // x x x x -+ // b12 b13 b14 b15 -+ GST f, d, $f3, B0, 12 * 8, $f7, B0, 13 * 8, $f11, B0, 14 * 8, $f15, B0, 15 * 8 -+.endif -+ // Transpose G15 G14 G13 G12 -+ GTRANSPOSE4x4_D G12, G13, G14, G15, D0, D1, D2, D3, D4, D5 -+ // Store C -+.if \N == 4 -+ // x x x x ... c12 c13 c14 c15 -+ // x x x x ... c28 c29 c30 c31 -+ // x x x x ... c44 c45 c46 c47 -+ // x x x x ... c60 c61 c62 c63 -+ GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8, D2, C2, 12 * 8, D3, C3, 12 * 8 -+.elseif \N == 2 -+ // x x x x ... c12 c13 c14 c15 -+ // x x x x ... c28 c29 c30 c31 -+ GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8 -+.elseif \N == 1 -+ // Store C -+ // x x x x ... 
c12 c13 c14 c15 -+ GST xv, , D0, C0, 12 * 8 -+.endif -+ -+#define G8 U2 -+#define G9 U6 -+#define G10 U10 -+#define G11 U14 -+ GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1 -+ // A -+ // G8 G9 G10 G11 -+ // ----------------- -+ // 136 | D9 -+ // 152 153 | D8 D7 -+ // 168 169 170 | D6 D5 D4 -+ // 184 185 186 187 | D3 D2 D1 D0 -+ // 200 201 202 203 | D15 D14 D13 D12 -+ // 216 217 218 219 | D11 D10 D9 D8 -+ // 232 233 234 235 | D7 D6 D5 D4 -+ // 248 249 250 251 | D3 D2 D1 D0 -+ PTR_ADDI T0, A0, 248 * 8 -+ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 -+ PTR_ADDI T0, A0, 232 * 8 -+ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 -+ PTR_ADDI T0, A0, 216 * 8 -+ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 -+ PTR_ADDI T0, A0, 200 * 8 -+ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 -+ GNMSUB xvf, d, G11, G15, D0, G11, G10, G15, D1, G10, G9, G15, D2, G9, G8, G15, D3, G8, \ -+ G11, G14, D4, G11, G10, G14, D5, G10, G9, G14, D6, G9, G8, G14, D7, G8, \ -+ G11, G13, D8, G11, G10, G13, D9, G10, G9, G13, D10, G9, G8, G13, D11, G8, \ -+ G11, G12, D12, G11, G10, G12, D13, G10, G9, G12, D14, G9, G8, G12, D15, G8 -+ PTR_ADDI T0, A0, 184 * 8 -+ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 -+ PTR_ADDI T0, A0, 168 * 8 -+ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 -+ PTR_ADDI T0, A0, 152 * 8 -+ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 -+ PTR_ADDI T0, A0, 136 * 8 -+ GLDREPL xv, d, D9, T0, 0 -+ -+ xvfmul.d G11, G11, D0 -+ GNMSUB xvf, d, G10, G11, D1, G10, G9, G11, D2, G9, G8, G11, D3, G8 -+ xvfmul.d G10, G10, D4 -+ GNMSUB xvf, d, G9, G10, D5, G9, G8, G10, D6, G8 -+ xvfmul.d G9, G9, D7 -+ GNMSUB xvf, d, G8, G9, D8, G8 -+ xvfmul.d G8, G8, D9 -+ // Store B -+.if \N == 4 -+ // x x x x ... x x x x -+ // x x x x ... x x x x -+ // b32 b33 b34 b34 ... b44 b45 b46 b47 -+ // b48 b49 b50 b51 ... b60 b61 b62 b63 -+ GST xv, , G8, B0, 32 * 8, G9, B0, 36 * 8, G10, B0, 40 * 8, G11, B0, 44 * 8 -+.elseif \N == 2 -+ // x x x x ... x x x x -+ // x x x x ... x x x x -+ // b16 b17 b18 b19 b20 b21 b22 b23 -+ // b24 b25 b26 b27 b28 b29 b30 b31 -+ GST v, , $vr2, B0, 16 * 8, $vr6, B0, 18 * 8, $vr10, B0, 20 * 8, $vr14, B0, 22 * 8 -+.elseif \N == 1 -+ // x x x x -+ // x x x x -+ // b8 b9 b10 b11 -+ // b12 b13 b14 b15 -+ GST f, d, $f2, B0, 8 * 8, $f6, B0, 9 * 8, $f10, B0, 10 * 8, $f14, B0, 11 * 8 -+.endif -+ // Transpose G11 G10 G9 G8 -+ GTRANSPOSE4x4_D G8, G9, G10, G11, D0, D1, D2, D3, D4, D5 -+ // Store C -+.if \N == 4 -+ // x x x x ... c8 c9 c10 c11 c12 c13 c14 c15 -+ // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31 -+ // x x x x ... c40 c41 c42 c43 c44 c45 c46 c47 -+ // x x x x ... c56 c57 c58 c59 c60 c61 c62 c63 -+ GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8, D2, C2, 8 * 8, D3, C3, 8 * 8 -+.elseif \N == 2 -+ // x x x x ... c8 c9 c10 c11 c12 c13 c14 c15 -+ // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31 -+ GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8 -+.elseif \N == 1 -+ // x x x x ... 
c8 c9 c10 c11 c12 c13 c14 c15 -+ GST xv, , D0, C0, 8 * 8 -+.endif -+ -+#define G4 U1 -+#define G5 U5 -+#define G6 U9 -+#define G7 U13 -+ GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, D0, D1 -+ // A -+ // G4 G5 G6 G7 -+ // ------------------ -+ // 68 | D9 -+ // 84 85 | D8 D7 -+ // 100 101 102 | D6 D5 D4 -+ // 116 117 118 119 | D3 D2 D1 D0 -+ // 132 133 134 135 | D15 D14 D13 D12 -+ // 148 149 150 151 | D11 D10 D9 D8 -+ // 164 165 166 167 | D7 D6 D5 D4 -+ // 180 181 182 183 | D3 D2 D1 D0 -+ // 196 197 198 199 | D15 D14 D13 D12 -+ // 212 213 214 215 | D11 D10 D9 D8 -+ // 228 229 230 231 | D7 D6 D5 D4 -+ // 244 245 246 247 | D3 D2 D1 D0 -+ PTR_ADDI T0, A0, 244 * 8 -+ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 -+ PTR_ADDI T0, A0, 228 * 8 -+ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 -+ PTR_ADDI T0, A0, 212 * 8 -+ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 -+ PTR_ADDI T0, A0, 196 * 8 -+ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 -+ GNMSUB xvf, d, G7, G15, D0, G7, G6, G15, D1, G6, G5, G15, D2, G5, G4, G15, D3, G4, \ -+ G7, G14, D4, G7, G6, G14, D5, G6, G5, G14, D6, G5, G4, G14, D7, G4, \ -+ G7, G13, D8, G7, G6, G13, D9, G6, G5, G13, D10, G5, G4, G13, D11, G4, \ -+ G7, G12, D12, G7, G6, G12, D13, G6, G5, G12, D14, G5, G4, G12, D15, G4 -+ PTR_ADDI T0, A0, 180 * 8 -+ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 -+ PTR_ADDI T0, A0, 164 * 8 -+ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 -+ PTR_ADDI T0, A0, 148 * 8 -+ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 -+ PTR_ADDI T0, A0, 132 * 8 -+ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 -+ GNMSUB xvf, d, G7, G11, D0, G7, G6, G11, D1, G6, G5, G11, D2, G5, G4, G11, D3, G4, \ -+ G7, G10, D4, G7, G6, G10, D5, G6, G5, G10, D6, G5, G4, G10, D7, G4, \ -+ G7, G9, D8, G7, G6, G9, D9, G6, G5, G9, D10, G5, G4, G9, D11, G4, \ -+ G7, G8, D12, G7, G6, G8, D13, G6, G5, G8, D14, G5, G4, G8, D15, G4 -+ PTR_ADDI T0, A0, 116 * 8 -+ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 -+ PTR_ADDI T0, A0, 100 * 8 -+ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 -+ PTR_ADDI T0, A0, 84 * 8 -+ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 -+ PTR_ADDI T0, A0, 68 * 8 -+ GLDREPL xv, d, D9, T0, 0 -+ xvfmul.d G7, G7, D0 -+ GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4 -+ xvfmul.d G6, G6, D4 -+ GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4 -+ xvfmul.d G5, G5, D7 -+ GNMSUB xvf, d, G4, G5, D8, G4 -+ xvfmul.d G4, G4, D9 -+ // Store B -+.if \N == 4 -+ // x x x x ... x x x x -+ // b16 b17 b18 b19 ... b28 b29 b30 b31 -+ // b32 b33 b34 b34 ... b44 b45 b46 b47 -+ // b48 b49 b50 b51 ... b60 b61 b62 b63 -+ GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8 -+.elseif \N == 2 -+ // x x x x ... 
x x x x -+ // b8 b9 b10 b11 b12 b13 b14 b15 -+ // b16 b17 b18 b19 b20 b21 b22 b23 -+ // b24 b25 b26 b27 b28 b29 b30 b31 -+ GST v, , $vr1, B0, 8 * 8, $vr5, B0, 10 * 8, $vr9, B0, 12 * 8, $vr13, B0, 14 * 8 -+.elseif \N == 1 -+ // x x x x -+ // b4 b5 b6 b7 -+ // b8 b9 b10 b11 -+ // b12 b13 b14 b15 -+ GST f, d, $f1, B0, 4 * 8, $f5, B0, 5 * 8, $f9, B0, 6 * 8, $f13, B0, 7 * 8 -+.endif -+ // Transpose G7 G6 G5 G4 -+ GTRANSPOSE4x4_D G4, G5, G6, G7, D0, D1, D2, D3, D4, D5 -+ // Store C -+.if \N == 4 -+ // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 -+ // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31 -+ // x x x x c36 c37 c38 c39 c40 c41 c42 c43 c44 c45 c46 c47 -+ // x x x x c52 c53 c54 c55 c56 c57 c58 c59 c60 c61 c62 c63 -+ GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8, D2, C2, 4 * 8, D3, C3, 4 * 8 -+.elseif \N == 2 -+ // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 -+ // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31 -+ GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8 -+.elseif \N == 1 -+ // x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 -+ GST xv, , D0, C0, 4 * 8 -+.endif -+ -+#define G0 U0 -+#define G1 U4 -+#define G2 U8 -+#define G3 U12 -+ GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, D0, D1 -+ // A -+ // G0 G1 G2 G3 -+ // ------------------ -+ // 0 | D9 -+ // 16 17 | D8 D7 -+ // 32 33 34 | D6 D5 D4 -+ // 48 49 50 51 | D3 D2 D1 D0 -+ // 64 65 66 67 | D15 D14 D13 D12 -+ // 80 81 82 83 | D11 D10 D9 D8 -+ // 96 97 98 99 | D7 D6 D5 D4 -+ // 112 113 114 115 | D3 D2 D1 D0 -+ // 128 129 130 131 | D15 D14 D13 D12 -+ // 144 145 146 147 | D11 D10 D9 D8 -+ // 160 161 162 163 | D7 D6 D5 D4 -+ // 176 177 178 179 | D3 D2 D1 D0 -+ // 192 193 194 195 | D15 D14 D13 D12 -+ // 208 209 210 211 | D11 D10 D9 D8 -+ // 224 225 226 227 | D7 D6 D5 D4 -+ // 240 241 242 243 | D3 D2 D1 D0 -+ PTR_ADDI T0, A0, 240 * 8 -+ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 -+ PTR_ADDI T0, A0, 224 * 8 -+ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 -+ PTR_ADDI T0, A0, 208 * 8 -+ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 -+ PTR_ADDI T0, A0, 192 * 8 -+ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 -+ GNMSUB xvf, d, G3, G15, D0, G3, G2, G15, D1, G2, G1, G15, D2, G1, G0, G15, D3, G0, \ -+ G3, G14, D4, G3, G2, G14, D5, G2, G1, G14, D6, G1, G0, G14, D7, G0, \ -+ G3, G13, D8, G3, G2, G13, D9, G2, G1, G13, D10, G1, G0, G13, D11, G0, \ -+ G3, G12, D12, G3, G2, G12, D13, G2, G1, G12, D14, G1, G0, G12, D15, G0 -+ PTR_ADDI T0, A0, 176 * 8 -+ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 -+ PTR_ADDI T0, A0, 160 * 8 -+ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 -+ PTR_ADDI T0, A0, 144 * 8 -+ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 -+ PTR_ADDI T0, A0, 128 * 8 -+ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 -+ GNMSUB xvf, d, G3, G11, D0, G3, G2, G11, D1, G2, G1, G11, D2, G1, G0, G11, D3, G0, \ -+ G3, G10, D4, G3, G2, G10, D5, G2, G1, G10, D6, G1, G0, G10, D7, G0, \ -+ G3, G9, D8, G3, G2, G9, D9, G2, G1, G9, D10, G1, G0, G9, D11, G0, \ -+ G3, G8, D12, G3, G2, G8, D13, G2, G1, G8, D14, G1, G0, G8, D15, G0 -+ PTR_ADDI T0, A0, 112 * 8 -+ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 -+ PTR_ADDI T0, A0, 96 * 8 -+ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 -+ PTR_ADDI T0, A0, 80 * 8 -+ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 -+ PTR_ADDI T0, 
A0, 64 * 8 -+ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 -+ GNMSUB xvf, d, G3, G7, D0, G3, G2, G7, D1, G2, G1, G7, D2, G1, G0, G7, D3, G0, \ -+ G3, G6, D4, G3, G2, G6, D5, G2, G1, G6, D6, G1, G0, G6, D7, G0, \ -+ G3, G5, D8, G3, G2, G5, D9, G2, G1, G5, D10, G1, G0, G5, D11, G0, \ -+ G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0 -+ PTR_ADDI T0, A0, 48 * 8 -+ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 -+ PTR_ADDI T0, A0, 32 * 8 -+ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 -+ PTR_ADDI T0, A0, 16 * 8 -+ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 -+ PTR_ADDI T0, A0, 0 * 8 -+ GLDREPL xv, d, D9, T0, 0 -+ -+ xvfmul.d G3, G3, D0 -+ GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 -+ xvfmul.d G2, G2, D4 -+ GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 -+ xvfmul.d G1, G1, D7 -+ GNMSUB xvf, d, G0, G1, D8, G0 -+ xvfmul.d G0, G0, D9 -+ // Store B -+.if \N == 4 -+ // b0 b1 b2 b3 ... b12 b13 b14 b15 -+ // b16 b17 b18 b19 ... b28 b29 b30 b31 -+ // b32 b33 b34 b34 ... b44 b45 b46 b47 -+ // b48 b49 b50 b51 ... b60 b61 b62 b63 -+ GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 -+.elseif \N == 2 -+ // b0 b1 b2 b3 b4 b5 b6 b7 -+ // b8 b9 b10 b11 b12 b13 b14 b15 -+ // b16 b17 b18 b19 b20 b21 b22 b23 -+ // b24 b25 b26 b27 b28 b29 b30 b31 -+ GST v, , $vr0, B0, 0, $vr4, B0, 2 * 8, $vr8, B0, 4 * 8, $vr12, B0, 6 * 8 -+.elseif \N == 1 -+ // b0 b1 b2 b3 -+ // b4 b5 b6 b7 -+ // b8 b9 b10 b11 -+ // b12 b13 b14 b15 -+ GST f, d, $f0, B0, 0, $f4, B0, 1 * 8, $f8, B0, 2 * 8, $f12, B0, 3 * 8 -+.endif -+ // Transpose C3 C2 C1 C0 -+ GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 -+ // Store C -+.if \N == 4 -+ // c0 c1 c2 c3 ... c12 c13 c14 c15 -+ // c16 c17 c18 c19 ... c28 c29 c30 c31 -+ // c32 c33 c34 c34 ... c44 c45 c46 c47 -+ // c48 c49 c50 c51 ... c60 c61 c62 c63 -+ GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0 -+.elseif \N == 2 -+ // c0 c1 c2 c3 ... c12 c13 c14 c15 -+ // c16 c17 c18 c19 ... c28 c29 c30 c31 -+ GST xv, , D0, C0, 0, D1, C1, 0 -+.elseif \N == 1 -+ // c0 c1 c2 c3 ... c12 c13 c14 c15 -+ GST xv, , D0, C0, 0 -+.endif -+ -+#undef G0 -+#undef G1 -+#undef G2 -+#undef G3 -+#undef G4 -+#undef G5 -+#undef G6 -+#undef G7 -+#undef G8 -+#undef G9 -+#undef G10 -+#undef G11 -+#undef G12 -+#undef G13 -+#undef G14 -+#undef G15 -+.endm -+ -+.macro dsolve_8 N -+// if N = 4 the data layout of C is as follows: -+// U0 U1 -+// U2 U3 -+// U4 U5 -+// U6 U7 -+// if N = 2 the dat layout of C is as follows: -+// U0 U1 -+// U2 U3 -+// if N = 1 the dat layout of C is as follows: -+// U0 U1 -+// The matrix A has dimensions of 8x8, and -+// it will be divided into 2 segments for processing. 
-+ -+#define G4 U1 -+#define G5 U3 -+#define G6 U5 -+#define G7 U7 -+ // Transpose U7 U5 U3 U1 -+ GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, D0, D1 -+ // A -+ // G4 G5 G6 G7 -+ // --------------- -+ // 36 | D9 -+ // 44 45 | D8 D7 -+ // 52 53 54 | D6 D5 D4 -+ // 60 61 62 63 | D3 D2 D1 D0 -+ PTR_ADDI T0, A0, 60 * 8 -+ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 -+ PTR_ADDI T0, A0, 52 * 8 -+ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 -+ PTR_ADDI T0, A0, 44 * 8 -+ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 -+ PTR_ADDI T0, A0, 36 * 8 -+ GLDREPL xv, d, D9, T0, 0 -+ -+ xvfmul.d G7, G7, D0 -+ GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4 -+ xvfmul.d G6, G6, D4 -+ GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4 -+ xvfmul.d G5, G5, D7 -+ GNMSUB xvf, d, G4, G5, D8, G4 -+ xvfmul.d G4, G4, D9 -+ // Store B -+.if \N == 4 -+ GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8 -+.elseif \N == 2 -+ GST v, , $vr1, B0, 8 * 8, $vr3, B0, 10 * 8, $vr5, B0, 12 * 8, $vr7, B0, 14 * 8 -+.elseif \N == 1 -+ GST f, d, $f1, B0, 4 * 8, $f3, B0, 5 * 8, $f5, B0, 6 * 8, $f7, B0, 7 * 8 -+.endif -+ // Transpose -+ GTRANSPOSE4x4_D G4, G5, G6, G7, D4, D5, D6, D7, D8, D9 -+ // Store C -+.if \N == 4 -+ GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8, D6, C2, 4 * 8, D7, C3, 4 * 8 -+.elseif \N == 2 -+ GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8 -+.elseif \N == 1 -+ GST xv, , D4, C0, 4 * 8 -+.endif -+ -+#define G0 U0 -+#define G1 U2 -+#define G2 U4 -+#define G3 U6 -+ // Transpose U6 U4 U2 U0 -+ GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, D0, D1 -+ // A -+ // G0 G1 G2 G3 -+ //----------------- -+ // 0 | D9 -+ // 8 9 | D8 D7 -+ // 16 17 18 | D6 D5 D4 -+ // 24 25 26 27 | D3 D2 D1 D0 -+ // 32 33 34 35 | D15 D14 D13 D12 -+ // 40 41 42 43 | D11 D10 D9 D8 -+ // 48 49 50 51 | D7 D6 D5 D4 -+ // 56 57 58 59 | D3 D2 D1 D0 -+ PTR_ADDI T0, A0, 56 * 8 -+ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 -+ PTR_ADDI T0, A0, 48 * 8 -+ GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8 -+ PTR_ADDI T0, A0, 40 * 8 -+ GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8 -+ PTR_ADDI T0, A0, 32 * 8 -+ GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8 -+ GNMSUB xvf, d, G3, G7, D0, G3, G2, G7, D1, G2, G1, G7, D2, G1, G0, G7, D3, G0, \ -+ G3, G6, D4, G3, G2, G6, D5, G2, G1, G6, D6, G1, G0, G6, D7, G0, \ -+ G3, G5, D8, G3, G2, G5, D9, G2, G1, G5, D10, G1, G0, G5, D11, G0, \ -+ G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0 -+ PTR_ADDI T0, A0, 24 * 8 -+ GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8 -+ PTR_ADDI T0, A0, 16 * 8 -+ GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8 -+ PTR_ADDI T0, A0, 8 * 8 -+ GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8 -+ PTR_ADDI T0, A0, 0 * 8 -+ GLDREPL xv, d, D9, T0, 0 -+ -+ xvfmul.d G3, G3, D0 -+ GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 -+ xvfmul.d G2, G2, D4 -+ GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 -+ xvfmul.d G1, G1, D7 -+ GNMSUB xvf, d, G0, G1, D8, G0 -+ xvfmul.d G0, G0, D9 -+ // Store B -+.if \N == 4 -+ GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 -+.elseif \N == 2 -+ GST v, , $vr0, B0, 0, $vr2, B0, 2 * 8, $vr4, B0, 4 * 8, $vr6, B0, 6 * 8 -+.elseif \N == 1 -+ GST f, d, $f0, B0, 0, $f2, B0, 1 * 8, $f4, B0, 2 * 8, $f6, B0, 3 * 8 -+.endif -+ // Transpose -+ GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 -+ // Store C -+.if \N == 4 -+ GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, 
D3, C3, 0 -+.elseif \N == 2 -+ GST xv, , D0, C0, 0, D1, C1, 0 -+.elseif \N == 1 -+ GST xv, , D0, C0, 0 -+.endif -+ -+#undef G0 -+#undef G1 -+#undef G2 -+#undef G3 -+#undef G4 -+#undef G5 -+#undef G6 -+#undef G7 -+.endm -+ -+.macro dsolve_4 N -+// if N = 4 the data layout of C is as follows: -+// U0 -+// U1 -+// U2 -+// U3 -+// if N = 2 the dat layout of C is as follows: -+// U0 -+// U1 -+// if N = 1 the dat layout of C is as follows: -+// U0 -+// The matrix A has dimensions of 4x4, and -+// it will be divided into 1 segments for processing. -+ -+#define G0 U0 -+#define G1 U1 -+#define G2 U2 -+#define G3 U3 -+ // Transpose U3 U2 U1 U0 -+ GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, D0, D1 -+ // A -+ // G0 G1 G2 G3 -+ //------------- -+ // 0 | D9 -+ // 4 5 | D8 D7 -+ // 8 9 10 | D6 D5 D4 -+ // 12 13 14 15 | D3 D2 D1 D0 -+ GLDREPL xv, d, D3, A0, 12 * 8, D2, A0, 13 * 8, D1, A0, 14 * 8, D0, A0, 15 * 8, \ -+ D6, A0, 8 * 8, D5, A0, 9 * 8, D4, A0, 10 * 8, \ -+ D8, A0, 4 * 8, D7, A0, 5 * 8, \ -+ D9, A0, 0 * 8 -+ xvfmul.d G3, G3, D0 -+ GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0 -+ xvfmul.d G2, G2, D4 -+ GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0 -+ xvfmul.d G1, G1, D7 -+ GNMSUB xvf, d, G0, G1, D8, G0 -+ xvfmul.d G0, G0, D9 -+ // Store B -+.if \N == 4 -+ GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8 -+.elseif \N == 2 -+ GST v, , $vr0, B0, 0, $vr1, B0, 2 * 8, $vr2, B0, 4 * 8, $vr3, B0, 6 * 8 -+.elseif \N == 1 -+ GST f, d, $f0, B0, 0, $f1, B0, 1 * 8, $f2, B0, 2 * 8, $f3, B0, 3 * 8 -+.endif -+ // Transpose -+ GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5 -+ // Store C -+.if \N == 4 -+ GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0 -+.elseif \N == 2 -+ GST xv, , D0, C0, 0, D1, C1, 0 -+.elseif \N == 1 -+ GST xv, , D0, C0, 0 -+.endif -+ -+#undef G0 -+#undef G1 -+#undef G2 -+#undef G3 -+.endm -+ -+.macro dsolve_2 N -+#define G0 U2 -+#define G1 U3 -+ // Transpose -+ GSBUTTERFLY xv, d, G0, G1, U1, U0 -+ // A -+ // G0 G1 -+ // ------ -+ // 0 | D2 -+ // 2 3 | D1 D0 -+ GLDREPL xv, d, D2, A0, 0, D1, A0, 2 * 8, D0, A0, 3 * 8 -+ xvfmul.d G1, G1, D0 -+ GNMSUB xvf, d, G0, G1, D1, G0 -+ xvfmul.d G0, G0, D2 -+ // Store B -+.if \N == 4 -+ GST xv, , G0, B0, 0, G1, B0, 4 * 8 -+.elseif \N == 2 -+ GST v, , $vr2, B0, 0, $vr3, B0, 2 * 8 -+.elseif \N == 1 -+ GST f, d, $f2, B0, 0, $f3, B0, 8 -+.endif -+ // Transpose -+ GSBUTTERFLY xv, d, D0, D1, G1, G0 -+ // Store C -+.if \N == 4 -+ vst $vr16, C0, 0x00 -+ vst $vr17, C1, 0x00 -+ xvstelm.d D0, C2, 0x00, 0x02 -+ xvstelm.d D1, C3, 0x00, 0x02 -+ xvstelm.d D0, C2, 0x08, 0x03 -+ xvstelm.d D1, C3, 0x08, 0x03 -+.elseif \N == 2 -+ GST v, , $vr16, C0, 0, $vr17, C1, 0 -+.elseif \N == 1 -+ GST v, , $vr16, C0, 0 -+.endif -+ -+#undef G0 -+#undef G1 -+.endm -+ -+.macro dgemm_dsolve_16x4 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_16x4_load -+ dgemm_16x4 -+ b .L_dsolve_16x4 -+.L_dsolve_16x4_load: -+ // Load C -+ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 -+ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 -+ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 -+ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 -+/********************** solver ******************/ -+.L_dsolve_16x4: -+ PTR_ADDI A0, T1, -(16 * 8 * 8) -+ PTR_ADDI A0, A0, -(16 * 8 * 8) -+ PTR_ADDI B0, T2, -(16 * 4 * 8) -+ dsolve_16 4 -+.endm -+ -+.macro dgemm_dsolve_1x4 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_1x4_load -+ dgemm_1x4 -+ b .L_dsolve_1x4 
-+.L_dsolve_1x4_load: -+ // Load C -+ fld.d $f0, C0, 0x00 -+ fld.d $f1, C1, 0x00 -+ fld.d $f2, C2, 0x00 -+ fld.d $f3, C3, 0x00 -+ xvinsve0.d U0, U1, 0x01 -+ xvinsve0.d U0, U2, 0x02 -+ xvinsve0.d U0, U3, 0x03 -+.L_dsolve_1x4: -+ or A0, T1, T1 -+ or B0, T2, T2 -+ GLDREPL xv, d, D0, A0, -1 * 8 -+ GMUL xvf, d, U0, U0, D0 -+ // Store C -+ xvstelm.d U0, C0, 0x00, 0x00 -+ xvstelm.d U0, C1, 0x00, 0x01 -+ xvstelm.d U0, C2, 0x00, 0x02 -+ xvstelm.d U0, C3, 0x00, 0x03 -+ // Store B -+ xvst U0, B0, -32 -+.endm -+ -+.macro dgemm_dsolve_2x4 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_2x4_load -+ dgemm_2x4 -+ b .L_dsolve_2x4 -+.L_dsolve_2x4_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ /* Load C2 */ -+ xvld U2, C2, 0x00 -+ /* Load C3 */ -+ xvld U3, C3, 0x00 -+ -+ xvpermi.q U0, U2, 0x02 -+ xvpermi.q U1, U3, 0x02 -+/********************** solver ******************/ -+.L_dsolve_2x4: -+ PTR_ADDI A0, T1, -(2 * 2 * 8) -+ PTR_ADDI B0, T2, -(2 * 4 * 8) -+ dsolve_2 4 -+.endm -+ -+.macro dgemm_dsolve_4x4 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_4x4_load -+ dgemm_4x4 -+ b .L_dsolve_4x4 -+.L_dsolve_4x4_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ /* Load C2 */ -+ xvld U2, C2, 0x00 -+ /* Load C3 */ -+ xvld U3, C3, 0x00 -+/************** solver *****************/ -+.L_dsolve_4x4: -+ PTR_ADDI A0, T1, -(4 * 4 * 8) -+ PTR_ADDI B0, T2, -(4 * 4 * 8) -+ -+ dsolve_4 4 -+.endm -+ -+.macro dgemm_dsolve_8x4 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_8x4_load -+ dgemm_8x4 -+ b .L_dsolve_8x4 -+.L_dsolve_8x4_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ -+ /* Load C1 */ -+ xvld U2, C1, 0x00 -+ xvld U3, C1, 0x20 -+ -+ /* Load C2 */ -+ xvld U4, C2, 0x00 -+ xvld U5, C2, 0x20 -+ -+ /* Load C3 */ -+ xvld U6, C3, 0x00 -+ xvld U7, C3, 0x20 -+/********* solver *********/ -+.L_dsolve_8x4: -+ PTR_ADDI A0, T1, -(8 * 8 * 8) -+ PTR_ADDI B0, T2, -(8 * 4 * 8) -+ -+ dsolve_8 4 -+.endm -+ -+.macro dgemm_dsolve_4x2 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_4x2_load -+ dgemm_4x2 -+ b .L_dsolve_4x2 -+.L_dsolve_4x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+.L_dsolve_4x2: -+ PTR_ADDI A0, T1, -(4 * 4 * 8) -+ PTR_ADDI B0, T2, -(4 * 2 * 8) -+ -+ dsolve_4 2 -+.endm -+ -+.macro dgemm_dsolve_2x2 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_2x2_load -+ dgemm_2x2 -+ b .L_dsolve_2x2 -+.L_dsolve_2x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+.L_dsolve_2x2: -+ PTR_ADDI A0, T1, -(2 * 2 * 8) -+ PTR_ADDI B0, T2, -(2 * 2 * 8) -+ -+ dsolve_2 2 -+.endm -+ -+.macro dgemm_dsolve_8x2 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_8x2_load -+ dgemm_8x2 -+ b .L_dsolve_8x2 -+.L_dsolve_8x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ /* Load C1 */ -+ xvld U2, C1, 0x00 -+ xvld U3, C1, 0x20 -+.L_dsolve_8x2: -+ PTR_ADDI A0, T1, -(8 * 8 * 8) -+ PTR_ADDI B0, T2, -(8 * 2 * 8) -+ -+ dsolve_8 2 -+.endm -+ -+.macro dgemm_dsolve_16x2 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_16x2_load -+ dgemm_16x2 -+ b .L_dsolve_16x2 -+.L_dsolve_16x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 -+ /* Load C1 */ -+ xvld U4, C1, 0x00 -+ xvld U5, C1, 0x20 -+ xvld U6, C1, 0x40 -+ xvld U7, C1, 0x60 -+.L_dsolve_16x2: -+ PTR_ADDI A0, T1, -(16 * 8 * 8) -+ PTR_ADDI A0, A0, -(16 * 8 * 8) -+ PTR_ADDI B0, T2, -(16 * 2 * 8) -+ -+ 
dsolve_16 2 -+.endm -+ -+.macro dgemm_dsolve_2x1 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_2x1_load -+ dgemm_2x1 -+ b .L_dsolve_2x1 -+.L_dsolve_2x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+.L_dsolve_2x1: -+ PTR_ADDI A0, T1, -(2 * 2 * 8) -+ PTR_ADDI B0, T2, -(2 * 1 * 8) -+ -+ dsolve_2 1 -+.endm -+ -+.macro dgemm_dsolve_4x1 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_4x1_load -+ dgemm_4x1 -+ b .L_dsolve_4x1 -+.L_dsolve_4x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+.L_dsolve_4x1: -+ PTR_ADDI A0, T1, -(4 * 4 * 8) -+ PTR_ADDI B0, T2, -(4 * 1 * 8) -+ -+ dsolve_4 1 -+.endm -+ -+.macro dgemm_dsolve_8x1 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_8x1_load -+ dgemm_8x1 -+ b .L_dsolve_8x1 -+.L_dsolve_8x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+.L_dsolve_8x1: -+ PTR_ADDI A0, T1, -(8 * 8 * 8) -+ PTR_ADDI B0, T2, -(8 * 1 * 8) -+ -+ dsolve_8 1 -+.endm -+ -+.macro dgemm_dsolve_16x1 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_16x1_load -+ dgemm_16x1 -+ b .L_dsolve_16x1 -+.L_dsolve_16x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 -+.L_dsolve_16x1: -+ PTR_ADDI A0, T1, -(16 * 8 * 8) -+ PTR_ADDI A0, A0, -(16 * 8 * 8) -+ PTR_ADDI B0, T2, -(16 * 1 * 8) -+ -+ dsolve_16 1 -+.endm -+ -+ PROLOGUE -+ push_if_used 26, 32 -+ PTR_SLLI LDC, LDC, 3 -+ /* if (!(N >> 2)) goto L_N3 */ -+ PTR_SRAI J, N, 2 /* J = bn >> 2 */ -+ andi N, N, 0x03 -+ beq ZERO, J, .L_N3 -+.align 5 -+.L_J1: -+ PTR_ADDI J, J, -1 -+ PTR_ADD KK, M, OFFSET -+ -+ andi I, M, 15 -+ beq ZERO, I, .L_M16 -+ andi I, M, 1 -+ beqz I, .L_M2 -+.L_M1: -+ PTR_ADDI T0, M, -1 -+ PTR_SLLI T0, T0, 3 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, AA, A -+ PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ -+ PTR_ADD CC, T0, C /* c + (m - 1) */ -+ -+ PTR_SLLI T0, KK, 5 -+ PTR_ADD B0, B, T0 /* b + 4 * kk */ -+ PTR_SUB L, K, KK -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ dgemm_dsolve_1x4 -+ PTR_ADDI KK, KK, -1 -+.L_M2: -+ andi I, M, 2 -+ beqz I, .L_M4 -+ PTR_SRLI T0, M, 1 -+ PTR_SLLI T0, T0, 1 -+ PTR_ADDI T0, T0, -2 -+ PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ -+ PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ -+ PTR_SLLI T1, KK, 4 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, AA, A -+ PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ -+ PTR_SLLI T0, KK, 5 -+ PTR_ADD B0, B, T0 /* b + 4 * kk */ -+ PTR_SUB L, K, KK -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ dgemm_dsolve_2x4 -+ PTR_ADDI KK, KK, -2 -+.L_M4: -+ andi I, M, 4 -+ beqz I, .L_M8 -+ PTR_SRLI T0, M, 2 -+ PTR_SLLI T0, T0, 2 -+ PTR_ADDI T0, T0, -4 -+ PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ -+ PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ -+ PTR_SLLI T1, KK, 5 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, AA, A -+ PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ -+ PTR_SLLI T0, KK, 5 -+ PTR_ADD B0, B, T0 /* b + 4 * kk */ -+ PTR_SUB L, K, KK -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ dgemm_dsolve_4x4 -+ PTR_ADDI KK, KK, -4 -+.L_M8: -+ andi I, M, 8 -+ beqz I, .L_M16 -+ PTR_SRLI T0, M, 3 -+ PTR_SLLI T0, T0, 3 -+ PTR_ADDI T0, T0, -8 -+ PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ -+ PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ -+ PTR_SLLI T1, KK, 6 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, AA, A -+ PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ -+ PTR_SLLI T0, KK, 5 -+ PTR_ADD B0, B, T0 /* b + 4 * kk */ -+ PTR_SUB L, K, KK -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ dgemm_dsolve_8x4 -+ PTR_ADDI KK, 
KK, -8 -+.L_M16: -+ PTR_SRAI I, M, 4 /* I = bm >> 4 */ -+ beq ZERO, I, .L_M0 -+ -+ PTR_SRLI T0, M, 4 -+ PTR_SLLI T0, T0, 4 -+ PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ -+ PTR_SLLI T0, T0, 3 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, A, AA -+ PTR_ADD CC, C, T0 -+.align 5 -+.L_I1: -+ PTR_SLLI T0, KK, 5 -+ PTR_ADD B0, B, T0 -+ PTR_SUB L, K, KK -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ PTR_SLLI T0, KK, 7 -+ PTR_ADD A0, AA, T0 -+ dgemm_dsolve_16x4 -+ PTR_ADDI I, I, -1 -+ PTR_ADDI KK, KK, -16 -+ PTR_ADDI CC, CC, -(16 * 8) -+ PTR_SLLI T0, K, 7 -+ PTR_SUB AA, AA, T0 -+ blt ZERO, I, .L_I1 -+.L_M0: -+ PTR_SLLI T0, K, 3 -+ PTR_ALSL B, T0, B, 2 // b += 4 * k; -+ PTR_ALSL C, LDC, C, 2 // c += 4 * ldc -+ blt ZERO, J, .L_J1 -+.L_N3: -+ andi J, N, 2 -+ beq ZERO, J, .L_N1 -+ -+ PTR_ADD KK, M, OFFSET -+ andi I, M, 15 -+ beq ZERO, I, .L_N3_M16 -+ andi I, M, 1 -+ beqz I, .L_N3_M2 -+.L_N3_M1: -+ PTR_ADDI KK, KK, -1 -+ -+ PTR_ADDI T0, M, -1 -+ PTR_SLLI T0, T0, 3 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, AA, A -+ PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ -+ PTR_ADD CC, T0, C /* c + (m - 1) */ -+ -+ PTR_SLLI T0, KK, 4 -+ PTR_ADD B0, B, T0 /* b + 2 * kk */ -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ // dgemm_dsolve_1x2 -+ GLD f, d, $f0, A0, 0, $f1, C0, 0, $f2, C1, 0 -+ GMUL f, d, $f1, $f1, $f0, $f2, $f2, $f0 -+ GST f, d, $f1, C0, 0, $f2, C1, 0, $f1, B0, 0, $f2, B0, 8 -+.L_N3_M2: -+ andi I, M, 2 -+ beqz I, .L_N3_M4 -+ PTR_SRLI T0, M, 1 -+ PTR_SLLI T0, T0, 1 -+ PTR_ADDI T0, T0, -2 -+ PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ -+ PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ -+ PTR_SLLI T1, KK, 4 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, AA, A -+ PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ -+ PTR_SLLI T0, KK, 4 -+ PTR_ADD B0, B, T0 /* b + 2 * kk */ -+ PTR_SUB L, K, KK -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ dgemm_dsolve_2x2 -+ PTR_ADDI KK, KK, -2 -+.L_N3_M4: -+ andi I, M, 4 -+ beqz I, .L_N3_M8 -+ PTR_SRLI T0, M, 2 -+ PTR_SLLI T0, T0, 2 -+ PTR_ADDI T0, T0, -4 -+ PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ -+ PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ -+ PTR_SLLI T1, KK, 5 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, AA, A -+ PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ -+ PTR_SLLI T0, KK, 4 -+ PTR_ADD B0, B, T0 /* b + 2 * kk */ -+ PTR_SUB L, K, KK -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ dgemm_dsolve_4x2 -+ PTR_ADDI KK, KK, -4 -+.L_N3_M8: -+ andi I, M, 8 -+ beqz I, .L_N3_M16 -+ PTR_SRLI T0, M, 3 -+ PTR_SLLI T0, T0, 3 -+ PTR_ADDI T0, T0, -8 -+ PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ -+ PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ -+ PTR_SLLI T1, KK, 6 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, AA, A -+ PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ -+ PTR_SLLI T0, KK, 4 -+ PTR_ADD B0, B, T0 /* b + 2 * kk */ -+ PTR_SUB L, K, KK -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ dgemm_dsolve_8x2 -+ PTR_ADDI KK, KK, -8 -+.L_N3_M16: -+ PTR_SRAI I, M, 4 /* I = bm >> 4 */ -+ beq ZERO, I, .L_N3_M0 -+ -+ PTR_SRLI T0, M, 4 -+ PTR_SLLI T0, T0, 4 -+ PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ -+ PTR_SLLI T0, T0, 3 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, A, AA -+ PTR_ADD CC, C, T0 -+.align 5 -+.L_N3_I1: -+ PTR_SLLI T0, KK, 4 -+ PTR_ADD B0, B, T0 -+ PTR_SUB L, K, KK -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ PTR_SLLI T0, KK, 7 -+ PTR_ADD A0, AA, T0 -+ dgemm_dsolve_16x2 -+ PTR_ADDI I, I, -1 -+ PTR_ADDI KK, KK, -16 -+ PTR_ADDI CC, CC, -(16 * 8) -+ PTR_SLLI T0, K, 7 -+ PTR_SUB AA, AA, T0 -+ blt ZERO, I, .L_N3_I1 -+.L_N3_M0: -+ PTR_SLLI T0, K, 3 -+ PTR_ALSL B, T0, B, 1 // b += 2 * k; -+ PTR_ALSL C, LDC, C, 1 // c += 2 * ldc 
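-+// The single remaining column (N & 1) is handled below with the same
-+// traversal as the N = 4 and N = 2 cases: the 1/2/4/8-row tails of M are
-+// solved first, starting from the bottom of the panel, and then the 16-row
-+// blocks, with KK, CC and AA stepping backwards through A and C as each
-+// block is finished.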
-+.L_N1: -+ andi J, N, 1 -+ beq ZERO, J, .L_N0 -+ -+ PTR_ADD KK, M, OFFSET -+ andi I, M, 15 -+ beq ZERO, I, .L_N1_M16 -+ andi I, M, 1 -+ beqz I, .L_N1_M2 -+.L_N1_M1: -+ PTR_ADDI KK, KK, -1 -+ -+ PTR_ADDI T0, M, -1 -+ PTR_SLLI T0, T0, 3 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, AA, A -+ PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */ -+ PTR_ADD CC, T0, C /* c + (m - 1) */ -+ -+ PTR_SLLI T0, KK, 3 -+ PTR_ADD B0, B, T0 /* b + kk */ -+ GADD , d, C0, CC, ZERO -+ // dgemm_dsolve_1x1 -+ GLD f, d, $f0, A0, 0, $f1, C0, 0 -+ GMUL f, d, $f1, $f1, $f0 -+ GST f, d, $f1, C0, 0, $f1, B0, 0 -+.L_N1_M2: -+ andi I, M, 2 -+ beqz I, .L_N1_M4 -+ PTR_SRLI T0, M, 1 -+ PTR_SLLI T0, T0, 1 -+ PTR_ADDI T0, T0, -2 -+ PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */ -+ PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/ -+ PTR_SLLI T1, KK, 4 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, AA, A -+ PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */ -+ PTR_SLLI T0, KK, 3 -+ PTR_ADD B0, B, T0 /* b + kk */ -+ PTR_SUB L, K, KK -+ GADD , d, C0, CC, ZERO -+ dgemm_dsolve_2x1 -+ PTR_ADDI KK, KK, -2 -+.L_N1_M4: -+ andi I, M, 4 -+ beqz I, .L_N1_M8 -+ PTR_SRLI T0, M, 2 -+ PTR_SLLI T0, T0, 2 -+ PTR_ADDI T0, T0, -4 -+ PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */ -+ PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/ -+ PTR_SLLI T1, KK, 5 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, AA, A -+ PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */ -+ PTR_SLLI T0, KK, 3 -+ PTR_ADD B0, B, T0 /* b + kk */ -+ PTR_SUB L, K, KK -+ GADD , d, C0, CC, ZERO -+ dgemm_dsolve_4x1 -+ PTR_ADDI KK, KK, -4 -+.L_N1_M8: -+ andi I, M, 8 -+ beqz I, .L_N1_M16 -+ PTR_SRLI T0, M, 3 -+ PTR_SLLI T0, T0, 3 -+ PTR_ADDI T0, T0, -8 -+ PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */ -+ PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/ -+ PTR_SLLI T1, KK, 6 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, AA, A -+ PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */ -+ PTR_SLLI T0, KK, 3 -+ PTR_ADD B0, B, T0 /* b + kk */ -+ PTR_SUB L, K, KK -+ GADD , d, C0, CC, ZERO -+ dgemm_dsolve_8x1 -+ PTR_ADDI KK, KK, -8 -+.L_N1_M16: -+ PTR_SRAI I, M, 4 /* I = bm >> 4 */ -+ beq ZERO, I, .L_N1_M0 -+ -+ PTR_SRLI T0, M, 4 -+ PTR_SLLI T0, T0, 4 -+ PTR_ADDI T0, T0, -16 /* ((M & -16)) - 16) */ -+ PTR_SLLI T0, T0, 3 -+ PTR_MUL AA, T0, K -+ PTR_ADD AA, A, AA -+ PTR_ADD CC, C, T0 -+.align 5 -+.L_N1_I1: -+ PTR_SLLI T0, KK, 3 -+ PTR_ADD B0, B, T0 -+ PTR_SUB L, K, KK -+ GADD , d, C0, CC, ZERO -+ PTR_SLLI T0, KK, 7 -+ PTR_ADD A0, AA, T0 -+ dgemm_dsolve_16x1 -+ PTR_ADDI I, I, -1 -+ PTR_ADDI KK, KK, -16 -+ PTR_ADDI CC, CC, -(16 * 8) -+ PTR_SLLI T0, K, 7 -+ PTR_SUB AA, AA, T0 -+ blt ZERO, I, .L_N1_I1 -+.L_N1_M0: -+.L_N0: -+ pop_if_used 26, 32 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S -new file mode 100644 -index 000000000..0e2cacccf ---- /dev/null -+++ b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S -@@ -0,0 +1,959 @@ -+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. 
Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "loongarch64_asm.S" -+ -+/********************************************************************* -+* 2023/08/26 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+* -+* -+*********************************************************************/ -+ -+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, -+ * FLOAT *c, BLASLONG ldc, BLASLONG offset) -+ */ -+ -+#define M $r4 // param 1: bm -+#define N $r5 // param 2: bn -+#define K $r6 // param 3: bk -+#define A $r7 // param 5: ba -+#define B $r8 // param 6: bb -+#define C $r9 // param 7: bc -+#define LDC $r10 // param 8: ldc -+#define OFFSET $r11 // param 9: offset -+ -+/* Cycle control parameters */ -+#define I $r13 -+#define J $r14 -+#define L $r15 -+#define TL $r16 -+/* Matrix address */ -+#define A0 $r17 -+#define B0 $r18 -+#define C0 $r19 -+#define C1 $r20 -+#define C2 $r23 -+#define C3 $r24 -+#define T0 $r25 -+#define T1 $r26 -+#define T2 $r27 -+#define KK $r28 -+#define AA $r29 -+#define CC $r30 -+#define BB B0 -+#undef ZERO -+#define ZERO $r0 -+ -+#define U0 $xr0 -+#define U1 $xr1 -+#define U2 $xr2 -+#define U3 $xr3 -+#define U4 $xr4 -+#define U5 $xr5 -+#define U6 $xr6 -+#define U7 $xr7 -+#define U8 $xr8 -+#define U9 $xr9 -+#define U10 $xr10 -+#define U11 $xr11 -+#define U12 $xr12 -+#define U13 $xr13 -+#define U14 $xr14 -+#define U15 $xr15 -+#define D0 $xr16 -+#define D1 $xr17 -+#define D2 $xr18 -+#define D3 $xr19 -+#define D4 $xr20 -+#define D5 $xr21 -+#define D6 $xr22 -+#define D7 $xr23 -+#define D8 $xr24 -+#define D9 $xr25 -+#define D10 $xr26 -+#define D11 $xr27 -+#define D12 $xr28 -+#define D13 $xr29 -+#define D14 $xr30 -+#define D15 $xr31 -+#define G0 D0 -+#define G1 D1 -+#define G2 D2 -+#define G3 D3 -+#define G4 D4 -+#define G5 D5 -+#define G6 D6 -+#define G7 D7 -+#define G8 D8 -+#define G9 D9 -+#define G10 D10 -+#define G11 D11 -+#define G12 D12 -+#define G13 D13 -+#define G14 D14 -+#define G15 D15 -+ -+/* Prefetch interval */ -+#define A_PRE 0x400 -+#define B_PRE 0x100 -+ -+#include "dtrsm_kernel_macro.S" -+ -+.macro ldrepl_macro start, end, stride -+// Load Ux (x = 0...15) -+.if \start <= \end -+ GLDREPL xv, d, $xr\start, A0, \stride * 8 -+ ldrepl_macro %start + 1, \end, %stride + 1 -+.endif -+.endm -+.macro nmsub_macro start0, end0, start1, reg -+// Gx -= reg * Ux -+.if \start0 <= \end0 -+ xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 -+ nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg -+.endif -+.endm 
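-+// ldrepl_macro and nmsub_macro expand recursively (via %start + 1) into one
-+// GLDREPL / xvfnmsub.d per register in the given range, so each "Load i"
-+// step below reads row i of the packed A from its diagonal onwards and
-+// updates all later rows. A scalar sketch of dsolve_16, assuming (as the
-+// multiply suggests) that the packed diagonal holds the already-inverted
-+// pivot:
-+//
-+//   for (i = 0; i < 16; i++) {          // "Load i"
-+//       G[i] *= A0[0];                  // GMUL with the (inverted) diagonal
-+//       for (j = i + 1; j < 16; j++)
-+//           G[j] -= A0[j - i] * G[i];   // nmsub_macro
-+//       A0 += 17;                       // PTR_ADDI A0, A0, 17 * 8
-+//   }
-+//
-+// where each G[j] is an LASX register holding the N values of row j of the
-+// transposed 16-row block of C, so N = 4 solves four right-hand sides at once.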
-+.macro B_st_macro start, end, stride, N -+// Store Gx(x = 16...31) -+.if \start <= \end -+.if \N == 4 -+ xvst $xr\start, B0, \stride * 0x20 -+.elseif \N == 2 -+ vst $vr\start, B0, \stride * 0x10 -+.elseif \N == 1 -+ fst.d $f\start, B0, \stride * 0x08 -+.endif -+ B_st_macro %start + 1, \end, %stride + 1, \N -+.endif -+.endm -+ -+.macro dsolve_16 N -+// The data layout of C (4x16) is as follows (store 4 data in each register): -+// U0 U1 U2 U3 -+// U4 U5 U6 U7 -+// U8 U9 U10 U11 -+// U12 U13 U14 U15 -+// The first step is to transpose the result of C -+ GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1 -+ GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1 -+ GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, U3, U7 -+ GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, U3, U7 -+// Now we have the following memory layout of C: -+// 0 1 2 3 ... 15 -+// 0 | | | | | | | -+// 1 | G0 | G1 | G2 | G3 | ... | G15 | -+// 2 | | | | | | | -+// 3 | | | | | | | -+// Next we are going to process matrix A with a size of 16x16, -+// using only the upper triangular portion. The memory layout of -+// matrix A is as follows, quite large. -+//0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 -+// 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 -+// 34 35 36 37 38 39 40 41 42 43 44 45 46 47 -+// 51 52 53 54 55 56 57 58 59 60 61 62 63 -+// 68 69 70 71 72 73 74 75 76 77 78 79 -+// 85 86 87 88 89 90 91 92 93 94 95 -+// 102 103 104 105 106 107 108 109 110 111 -+// 119 120 121 122 123 124 125 126 127 -+// 136 137 138 139 140 141 142 143 -+// 153 154 155 156 157 158 159 -+// 170 171 172 173 174 175 -+// 187 188 189 190 191 -+// 204 205 206 207 -+// 221 222 223 -+// 238 239 -+// 255 -+// Sequentially extract data from A in row order -+// Load 0 -+ ldrepl_macro 0, 15, 0 -+ GMUL xvf, d, G0, G0, U0 -+ nmsub_macro 17, 31, 1, G0 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 1 -+ ldrepl_macro 1, 15, 0 -+ GMUL xvf, d, G1, G1, U1 -+ nmsub_macro 18, 31, 2, G1 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 2 -+ ldrepl_macro 2, 15, 0 -+ GMUL xvf, d, G2, G2, U2 -+ nmsub_macro 19, 31, 3, G2 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 3 -+ ldrepl_macro 3, 15, 0 -+ GMUL xvf, d, G3, G3, U3 -+ nmsub_macro 20, 31, 4, G3 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 4 -+ ldrepl_macro 4, 15, 0 -+ GMUL xvf, d, G4, G4, U4 -+ nmsub_macro 21, 31, 5, G4 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 5 -+ ldrepl_macro 5, 15, 0 -+ GMUL xvf, d, G5, G5, U5 -+ nmsub_macro 22, 31, 6, G5 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 6 -+ ldrepl_macro 6, 15, 0 -+ GMUL xvf, d, G6, G6, U6 -+ nmsub_macro 23, 31, 7, G6 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 7 -+ ldrepl_macro 7, 15, 0 -+ GMUL xvf, d, G7, G7, U7 -+ nmsub_macro 24, 31, 8, G7 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 8 -+ ldrepl_macro 8, 15, 0 -+ GMUL xvf, d, G8, G8, U8 -+ nmsub_macro 25, 31, 9, G8 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 9 -+ ldrepl_macro 9, 15, 0 -+ GMUL xvf, d, G9, G9, U9 -+ nmsub_macro 26, 31, 10, G9 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 10 -+ ldrepl_macro 10, 15, 0 -+ GMUL xvf, d, G10, G10, U10 -+ nmsub_macro 27, 31, 11, G10 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 11 -+ ldrepl_macro 11, 15, 0 -+ GMUL xvf, d, G11, G11, U11 -+ nmsub_macro 28, 31, 12, G11 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 12 -+ ldrepl_macro 12, 15, 0 -+ GMUL xvf, d, G12, G12, U12 -+ nmsub_macro 29, 31, 13, G12 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 13 -+ ldrepl_macro 13, 15, 0 -+ GMUL xvf, d, G13, G13, U13 -+ nmsub_macro 30, 31, 14, G13 -+ PTR_ADDI A0, A0, 17 * 8 -+// Load 14 -+ ldrepl_macro 14, 15, 0 -+ GMUL xvf, d, G14, G14, U14 -+ nmsub_macro 31, 31, 15, G14 -+ PTR_ADDI 
A0, A0, 17 * 8 -+// Load 15 -+ ldrepl_macro 15, 15, 0 -+ GMUL xvf, d, G15, G15, U15 -+// Finally, We can store the result. -+// For B, stored sequentially, and C, first transpose and then store -+ B_st_macro 16, 31, 0, \N -+ GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 -+ GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 -+ GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1 -+ GTRANSPOSE4x4_D G12, G13, G14, G15, G12, G13, G14, G15, U0, U1 -+.if \N == 4 -+ GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \ -+ G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60, \ -+ G2, C2, 0x00, G6, C2, 0x20, G10, C2, 0x40, G14, C2, 0x60, \ -+ G3, C3, 0x00, G7, C3, 0x20, G11, C3, 0x40, G15, C3, 0x60 -+.elseif \N == 2 -+ GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60, \ -+ G1, C1, 0x00, G5, C1, 0x20, G9, C1, 0x40, G13, C1, 0x60 -+.elseif \N == 1 -+ GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8, C0, 0x40, G12, C0, 0x60 -+.endif -+.endm -+ -+.macro dgemm_dsolve_16x4 -+ bge ZERO, KK, .L_dsolve_16x4_load -+ dgemm_16x4 -+ b .L_dsolve_16x4 -+.L_dsolve_16x4_load: -+ // Load C -+ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 -+ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 -+ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 -+ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 -+/********************** solver ******************/ -+.L_dsolve_16x4: -+ dsolve_16 4 -+.endm -+ -+.macro dsolve_8 N -+// The data layout of C (4x8) is as follows (store 4 data in each register): -+// U0 U1 -+// U2 U3 -+// U4 U5 -+// U6 U7 -+// The first step is to transpose the result of C -+ GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, G8, G9 -+ GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, G8, G9 -+// Now we have the following memory layout of C: -+// 0 1 2 3 ... 7 -+// 0 | | | | | | | -+// 1 | G0 | G1 | G2 | G3 | ... | G7 | -+// 2 | | | | | | | -+// 3 | | | | | | | -+// Next we are going to process matrix A with a size of 8x8, -+// using only the upper triangular portion. The memory layout of -+// matrix A is as follows: -+//0 1 2 3 4 5 6 7 -+// 9 10 11 12 13 14 15 -+// 18 19 20 21 22 23 -+// 27 28 29 30 31 -+// 36 37 38 39 -+// 45 46 47 -+// 54 55 -+// 63 -+// Sequentially extract data from A in row order -+// Load 0 -+ ldrepl_macro 0, 7, 0 -+ GMUL xvf, d, G0, G0, U0 -+ nmsub_macro 17, 23, 1, G0 -+ PTR_ADDI A0, A0, 9 * 8 -+// Load 1 -+ ldrepl_macro 1, 7, 0 -+ GMUL xvf, d, G1, G1, U1 -+ nmsub_macro 18, 23, 2, G1 -+ PTR_ADDI A0, A0, 9 * 8 -+// Load 2 -+ ldrepl_macro 2, 7, 0 -+ GMUL xvf, d, G2, G2, U2 -+ nmsub_macro 19, 23, 3, G2 -+ PTR_ADDI A0, A0, 9 * 8 -+// Load 3 -+ ldrepl_macro 3, 7, 0 -+ GMUL xvf, d, G3, G3, U3 -+ nmsub_macro 20, 23, 4, G3 -+ PTR_ADDI A0, A0, 9 * 8 -+// Load 4 -+ ldrepl_macro 4, 7, 0 -+ GMUL xvf, d, G4, G4, U4 -+ nmsub_macro 21, 23, 5, G4 -+ PTR_ADDI A0, A0, 9 * 8 -+// Load 5 -+ ldrepl_macro 5, 7, 0 -+ GMUL xvf, d, G5, G5, U5 -+ nmsub_macro 22, 23, 6, G5 -+ PTR_ADDI A0, A0, 9 * 8 -+// Load 6 -+ ldrepl_macro 6, 7, 0 -+ GMUL xvf, d, G6, G6, U6 -+ nmsub_macro 23, 23, 7, G6 -+ PTR_ADDI A0, A0, 9 * 8 -+// Load 7 -+ ldrepl_macro 7, 7, 0 -+ GMUL xvf, d, G7, G7, U7 -+// Finally, We can store the result. 
-+// For B, stored sequentially, and C, first transpose and then store -+ B_st_macro 16, 23, 0, \N -+ GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 -+ GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1 -+.if \N == 4 -+ GST xv, , G0, C0, 0x00, G4, C0, 0x20, \ -+ G1, C1, 0x00, G5, C1, 0x20, \ -+ G2, C2, 0x00, G6, C2, 0x20, \ -+ G3, C3, 0x00, G7, C3, 0x20 -+.elseif \N == 2 -+ GST xv, , G0, C0, 0x00, G4, C0, 0x20, \ -+ G1, C1, 0x00, G5, C1, 0x20 -+.elseif \N == 1 -+ GST xv, , G0, C0, 0x00, G4, C0, 0x20 -+.endif -+.endm -+ -+.macro dgemm_dsolve_8x4 -+ bge ZERO, L, .L_dsolve_8x4_load -+ dgemm_8x4 -+ b .L_dsolve_8x4 -+.L_dsolve_8x4_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ -+ /* Load C1 */ -+ xvld U2, C1, 0x00 -+ xvld U3, C1, 0x20 -+ -+ /* Load C2 */ -+ xvld U4, C2, 0x00 -+ xvld U5, C2, 0x20 -+ -+ /* Load C3 */ -+ xvld U6, C3, 0x00 -+ xvld U7, C3, 0x20 -+/********* solver *********/ -+.L_dsolve_8x4: -+ dsolve_8 4 -+.endm -+ -+.macro dsolve_4 N -+// The data layout of C (4x4) is as follows (store 4 data in each register): -+// U0 -+// U1 -+// U2 -+// U3 -+// The first step is to transpose the result of C -+ GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, G4, G5 -+// Now we have the following memory layout of C: -+// 0 1 2 3 -+// 0 | | | | | -+// 1 | G0 | G1 | G2 | G3 | -+// 2 | | | | | -+// 3 | | | | | -+// Next we are going to process matrix A with a size of 4x4, -+// using only the upper triangular portion. The memory layout of -+// matrix A is as follows: -+//0 1 2 3 -+// 5 6 7 -+// 10 11 -+// 15 -+// Sequentially extract data from A in row order -+// Load 0 -+ ldrepl_macro 0, 3, 0 -+ GMUL xvf, d, G0, G0, U0 -+ nmsub_macro 17, 19, 1, G0 -+ PTR_ADDI A0, A0, 5 * 8 -+// Load 1 -+ ldrepl_macro 1, 3, 0 -+ GMUL xvf, d, G1, G1, U1 -+ nmsub_macro 18, 19, 2, G1 -+ PTR_ADDI A0, A0, 5 * 8 -+// Load 2 -+ ldrepl_macro 2, 3, 0 -+ GMUL xvf, d, G2, G2, U2 -+ nmsub_macro 19, 19, 3, G2 -+ PTR_ADDI A0, A0, 5 * 8 -+// Load 3 -+ ldrepl_macro 3, 3, 0 -+ GMUL xvf, d, G3, G3, U3 -+// Finally, We can store the result. -+// For B, stored sequentially, and C, first transpose and then store -+ B_st_macro 16, 19, 0, \N -+ GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1 -+.if \N == 4 -+ GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00 -+.elseif \N == 2 -+ GST xv, , G0, C0, 0x00, G1, C1, 0x00 -+.elseif \N == 1 -+ GST xv, , G0, C0, 0x00 -+.endif -+.endm -+ -+.macro dgemm_dsolve_4x4 -+ bge ZERO, L, .L_dsolve_4x4_load -+ dgemm_4x4 -+ b .L_dsolve_4x4 -+.L_dsolve_4x4_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ /* Load C2 */ -+ xvld U2, C2, 0x00 -+ /* Load C3 */ -+ xvld U3, C3, 0x00 -+/************** solver *****************/ -+.L_dsolve_4x4: -+ dsolve_4 4 -+.endm -+ -+.macro dsolve_2 N -+// Transpose -+ GSBUTTERFLY xv, d, G0, G1, U1, U0 -+// Now we have the following memory layout of C: -+// 0 1 -+// 0 | | | -+// 1 | G0 | G1 | -+// 2 | | | -+// 3 | | | -+// Next we are going to process matrix A with a size of 2x2, -+// using only the upper triangular portion. The memory layout of -+// matrix A is as follows: -+//0 1 -+// 3 -+// Sequentially extract data from A in row order -+// Load 0 -+ ldrepl_macro 0, 1, 0 -+ GMUL xvf, d, G0, G0, U0 -+ nmsub_macro 17, 17, 1, G0 -+ PTR_ADDI A0, A0, 3 * 8 -+// Load 1 -+ ldrepl_macro 1, 1, 0 -+ GMUL xvf, d, G1, G1, U1 -+// Finally, We can store the result. 
-+// For B, stored sequentially, and C, first transpose and then store -+ B_st_macro 16, 17, 0, \N -+ GSBUTTERFLY xv, d, U0, U1, G1, G0 -+.if \N == 4 -+ vst $vr0, C0, 0x00 -+ vst $vr1, C1, 0x00 -+ xvstelm.d U0, C2, 0x00, 0x02 -+ xvstelm.d U1, C3, 0x00, 0x02 -+ xvstelm.d U0, C2, 0x08, 0x03 -+ xvstelm.d U1, C3, 0x08, 0x03 -+.elseif \N == 2 -+ vst $vr0, C0, 0x00 -+ vst $vr1, C1, 0x00 -+.elseif \N == 1 -+ vst $vr0, C0, 0x00 -+.endif -+.endm -+ -+.macro dgemm_dsolve_2x4 -+ bge ZERO, L, .L_dsolve_2x4_load -+ dgemm_2x4 -+ b .L_dsolve_2x4 -+.L_dsolve_2x4_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ /* Load C2 */ -+ xvld U2, C2, 0x00 -+ /* Load C3 */ -+ xvld U3, C3, 0x00 -+ -+ xvpermi.q U0, U2, 0x02 -+ xvpermi.q U1, U3, 0x02 -+/********************** solver ******************/ -+.L_dsolve_2x4: -+ dsolve_2 4 -+.endm -+ -+.macro dgemm_dsolve_1x4 -+ bge ZERO, L, .L_dsolve_1x4_load -+ dgemm_1x4 -+ b .L_dsolve_1x4 -+.L_dsolve_1x4_load: -+ // Load C -+ fld.d $f0, C0, 0x00 -+ fld.d $f1, C1, 0x00 -+ fld.d $f2, C2, 0x00 -+ fld.d $f3, C3, 0x00 -+ xvinsve0.d U0, U1, 0x01 -+ xvinsve0.d U0, U2, 0x02 -+ xvinsve0.d U0, U3, 0x03 -+.L_dsolve_1x4: -+ GLDREPL xv, d, D0, A0, 0x00 -+ GMUL xvf, d, U0, U0, D0 -+ // Store C -+ xvstelm.d U0, C0, 0x00, 0x00 -+ xvstelm.d U0, C1, 0x00, 0x01 -+ xvstelm.d U0, C2, 0x00, 0x02 -+ xvstelm.d U0, C3, 0x00, 0x03 -+ // Store B -+ xvst U0, B0, 0x00 -+.endm -+ -+.macro dgemm_dsolve_16x2 -+ bge ZERO, L, .L_dsolve_16x2_load -+ dgemm_16x2 -+ b .L_dsolve_16x2 -+.L_dsolve_16x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 -+ /* Load C1 */ -+ xvld U4, C1, 0x00 -+ xvld U5, C1, 0x20 -+ xvld U6, C1, 0x40 -+ xvld U7, C1, 0x60 -+.L_dsolve_16x2: -+ dsolve_16 2 -+.endm -+ -+.macro dgemm_dsolve_8x2 -+ bge ZERO, L, .L_dsolve_8x2_load -+ dgemm_8x2 -+ b .L_dsolve_8x2 -+.L_dsolve_8x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ /* Load C1 */ -+ xvld U2, C1, 0x00 -+ xvld U3, C1, 0x20 -+.L_dsolve_8x2: -+ dsolve_8 2 -+.endm -+ -+.macro dgemm_dsolve_4x2 -+ bge ZERO, L, .L_dsolve_4x2_load -+ dgemm_4x2 -+ b .L_dsolve_4x2 -+.L_dsolve_4x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+.L_dsolve_4x2: -+ dsolve_4 2 -+.endm -+ -+.macro dgemm_dsolve_1x2 -+ bge ZERO, L, .L_dsolve_1x2_load -+ dgemm_1x2 -+ b .L_dsolve_1x2 -+.L_dsolve_1x2_load: -+ // Load C -+ fld.d $f0, C0, 0x00 -+ fld.d $f1, C1, 0x00 -+ xvinsve0.d U0, U1, 0x01 -+.L_dsolve_1x2: -+ GLDREPL xv, d, D0, A0, 0x00 -+ GMUL xvf, d, U0, U0, D0 -+ // Store C -+ xvstelm.d U0, C0, 0x00, 0x00 -+ xvstelm.d U0, C1, 0x00, 0x01 -+ // Store B -+ vst $vr0, B0, 0x00 -+.endm -+ -+.macro dgemm_dsolve_2x2 -+ bge ZERO, L, .L_dsolve_2x2_load -+ dgemm_2x2 -+ b .L_dsolve_2x2 -+.L_dsolve_2x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+.L_dsolve_2x2: -+ dsolve_2 2 -+.endm -+ -+.macro dgemm_dsolve_16x1 -+ bge ZERO, L, .L_dsolve_16x1_load -+ dgemm_16x1 -+ b .L_dsolve_16x1 -+.L_dsolve_16x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 -+.L_dsolve_16x1: -+ dsolve_16 1 -+.endm -+ -+.macro dgemm_dsolve_8x1 -+ bge ZERO, L, .L_dsolve_8x1_load -+ dgemm_8x1 -+ b .L_dsolve_8x1 -+.L_dsolve_8x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+.L_dsolve_8x1: -+ dsolve_8 1 -+.endm -+ -+.macro dgemm_dsolve_4x1 -+ bge ZERO, L, .L_dsolve_4x1_load -+ dgemm_4x1 -+ b .L_dsolve_4x1 -+.L_dsolve_4x1_load: -+ /* Load C0 */ -+ xvld 
U0, C0, 0x00 -+.L_dsolve_4x1: -+ dsolve_4 1 -+.endm -+ -+.macro dgemm_dsolve_2x1 -+ bge ZERO, L, .L_dsolve_2x1_load -+ dgemm_2x1 -+ b .L_dsolve_2x1 -+.L_dsolve_2x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+.L_dsolve_2x1: -+ dsolve_2 1 -+.endm -+ -+.macro dgemm_dsolve_1x1 -+ bge ZERO, L, .L_dsolve_1x1_load -+ dgemm_1x1 -+ b .L_dsolve_1x1 -+.L_dsolve_1x1_load: -+ // Load C -+ fld.d $f0, C0, 0x00 -+.L_dsolve_1x1: -+ GLDREPL xv, d, D0, A0, 0x00 -+ GMUL xvf, d, U0, U0, D0 -+ // Store C -+ xvstelm.d U0, C0, 0x00, 0x00 -+ // Store B -+ xvstelm.d U0, B0, 0x00, 0x00 -+.endm -+ -+ PROLOGUE -+ push_if_used 26, 32 -+ PTR_SLLI LDC, LDC, 3 -+ /* if (!(N >> 2)) goto L_N3 */ -+ PTR_SRAI J, N, 2 /* J = bn >> 2 */ -+ andi N, N, 0x03 -+ beq ZERO, J, .L_N3 -+.align 5 -+.L_J1: -+ PTR_ADDI J, J, -1 -+ move KK, OFFSET -+ move AA, A -+ move CC, C -+ PTR_SRAI I, M, 4 // M >> 4 -+ beqz I, .L_M15 -+.align 4 -+.L_I1: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_16x4 -+ PTR_ADDI I, I, -1 -+ PTR_SLLI T0, K, 7 -+ PTR_ADDI CC, CC, 0x80 // cc += 16 -+ PTR_ADDI KK, KK, 0x10 // kk += 16 -+ PTR_ADD AA, AA, T0 // aa += 16 * k -+ bnez I, .L_I1 -+.L_M15: -+ andi I, M, 8 -+ beqz I, .L_M7 -+.L_M8: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_8x4 -+ PTR_SLLI T0, K, 6 -+ PTR_ADDI CC, CC, 0x40 // cc += 8 -+ PTR_ADDI KK, KK, 0x08 // kk += 8 -+ PTR_ADD AA, AA, T0 // aa += 8 * k -+.L_M7: -+ andi I, M, 4 -+ beqz I, .L_M3 -+.L_M4: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_4x4 -+ PTR_SLLI T0, K, 5 -+ PTR_ADDI CC, CC, 0x20 // cc += 4 -+ PTR_ADDI KK, KK, 0x04 // kk += 4 -+ PTR_ADD AA, AA, T0 // aa += 4 * k -+.L_M3: -+ andi I, M, 2 -+ beqz I, .L_M1 -+.L_M2: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_2x4 -+ PTR_SLLI T0, K, 4 -+ PTR_ADDI CC, CC, 0x10 // cc += 2 -+ PTR_ADDI KK, KK, 0x02 // kk += 2 -+ PTR_ADD AA, AA, T0 // aa += 2 * k -+.L_M1: -+ andi I, M, 1 -+ beqz I, .L_M0 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_1x4 -+ PTR_SLLI T0, K, 3 -+ PTR_ADDI CC, CC, 0x08 // cc += 1 -+ PTR_ADDI KK, KK, 0x01 // kk += 1 -+ PTR_ADD AA, AA, T0 // aa += 1 * k -+.L_M0: -+ PTR_SLLI T0, K, 5 -+ PTR_SLLI T1, LDC, 2 -+ PTR_ADD B, B, T0 // b += 4 * k -+ PTR_ADD C, C, T1 // c += 4 * ldc -+ bnez J, .L_J1 -+.L_N3: -+ andi J, N, 2 -+ beq ZERO, J, .L_N1 -+.L_N2: -+ move KK, OFFSET -+ move AA, A -+ move CC, C -+ PTR_SRAI I, M, 4 // M >> 4 -+ beqz I, .L_N2_M15 -+.align 4 -+.L_N2_I1: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_16x2 -+ PTR_ADDI I, I, -1 -+ PTR_SLLI T0, K, 7 -+ PTR_ADDI CC, CC, 0x80 // cc += 16 -+ PTR_ADDI KK, KK, 0x10 // kk += 16 -+ PTR_ADD AA, AA, T0 // aa += 16 * k -+ bnez I, .L_N2_I1 -+.L_N2_M15: -+ andi I, M, 8 -+ beqz I, .L_N2_M7 -+.L_N2_M8: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_8x2 -+ PTR_SLLI T0, K, 6 -+ PTR_ADDI CC, CC, 0x40 // cc += 8 -+ PTR_ADDI KK, KK, 0x08 // kk += 8 -+ PTR_ADD AA, AA, T0 // aa += 8 * k -+.L_N2_M7: -+ andi I, M, 4 -+ beqz I, .L_N2_M3 -+.L_N2_M4: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_4x2 -+ PTR_SLLI T0, K, 5 -+ PTR_ADDI CC, CC, 0x20 // cc += 4 -+ PTR_ADDI 
KK, KK, 0x04 // kk += 4 -+ PTR_ADD AA, AA, T0 // aa += 4 * k -+.L_N2_M3: -+ andi I, M, 2 -+ beqz I, .L_N2_M1 -+.L_N2_M2: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_2x2 -+ PTR_SLLI T0, K, 4 -+ PTR_ADDI CC, CC, 0x10 // cc += 2 -+ PTR_ADDI KK, KK, 0x02 // kk += 2 -+ PTR_ADD AA, AA, T0 // aa += 2 * k -+.L_N2_M1: -+ andi I, M, 1 -+ beqz I, .L_N2_M0 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_1x2 -+ PTR_SLLI T0, K, 3 -+ PTR_ADDI CC, CC, 0x08 // cc += 1 -+ PTR_ADDI KK, KK, 0x01 // kk += 1 -+ PTR_ADD AA, AA, T0 // aa += 1 * k -+.L_N2_M0: -+ PTR_SLLI T0, K, 4 -+ PTR_SLLI T1, LDC, 1 -+ PTR_ADD B, B, T0 // b += 2 * k -+ PTR_ADD C, C, T1 // c += 2 * ldc -+.L_N1: -+ andi J, N, 1 -+ beq ZERO, J, .L_N0 -+ -+ move KK, OFFSET -+ move AA, A -+ move CC, C -+ PTR_SRAI I, M, 4 // M >> 4 -+ beqz I, .L_N1_M15 -+.align 4 -+.L_N1_I1: -+ GADD , d, C0, CC, ZERO -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_16x1 -+ PTR_ADDI I, I, -1 -+ PTR_SLLI T0, K, 7 -+ PTR_ADDI CC, CC, 0x80 // cc += 16 -+ PTR_ADDI KK, KK, 0x10 // kk += 16 -+ PTR_ADD AA, AA, T0 // aa += 16 * k -+ bnez I, .L_N1_I1 -+.L_N1_M15: -+ andi I, M, 8 -+ beqz I, .L_N1_M7 -+.L_N1_M8: -+ GADD , d, C0, CC, ZERO -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_8x1 -+ PTR_SLLI T0, K, 6 -+ PTR_ADDI CC, CC, 0x40 // cc += 8 -+ PTR_ADDI KK, KK, 0x08 // kk += 8 -+ PTR_ADD AA, AA, T0 // aa += 8 * k -+.L_N1_M7: -+ andi I, M, 4 -+ beqz I, .L_N1_M3 -+.L_N1_M4: -+ GADD , d, C0, CC, ZERO -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_4x1 -+ PTR_SLLI T0, K, 5 -+ PTR_ADDI CC, CC, 0x20 // cc += 4 -+ PTR_ADDI KK, KK, 0x04 // kk += 4 -+ PTR_ADD AA, AA, T0 // aa += 4 * k -+.L_N1_M3: -+ andi I, M, 2 -+ beqz I, .L_N1_M1 -+.L_N1_M2: -+ GADD , d, C0, CC, ZERO -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_2x1 -+ PTR_SLLI T0, K, 4 -+ PTR_ADDI CC, CC, 0x10 // cc += 2 -+ PTR_ADDI KK, KK, 0x02 // kk += 2 -+ PTR_ADD AA, AA, T0 // aa += 2 * k -+.L_N1_M1: -+ andi I, M, 1 -+ beqz I, .L_N1_M0 -+ GADD , d, C0, CC, ZERO -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_1x1 -+ PTR_SLLI T0, K, 3 -+ PTR_ADDI CC, CC, 0x08 // cc += 1 -+ PTR_ADDI KK, KK, 0x01 // kk += 1 -+ PTR_ADD AA, AA, T0 // aa += 1 * k -+.L_N1_M0: -+.L_N0: -+ pop_if_used 26, 32 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S -new file mode 100644 -index 000000000..421339736 ---- /dev/null -+++ b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S -@@ -0,0 +1,882 @@ -+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "loongarch64_asm.S" -+ -+/********************************************************************* -+* 2023/09/26 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+* -+* -+*********************************************************************/ -+ -+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, -+ * FLOAT *c, BLASLONG ldc, BLASLONG offset) -+ */ -+ -+#define M $r4 // param 1: bm -+#define N $r5 // param 2: bn -+#define K $r6 // param 3: bk -+#define A $r7 // param 5: ba -+#define B $r8 // param 6: bb -+#define C $r9 // param 7: bc -+#define LDC $r10 // param 8: ldc -+#define OFFSET $r11 // param 9: offset -+ -+/* Cycle control parameters */ -+#define I $r13 -+#define J $r14 -+#define L $r15 -+#define TL $r16 -+/* Matrix address */ -+#define A0 $r17 -+#define B0 $r18 -+#define C0 $r19 -+#define C1 $r20 -+#define C2 $r23 -+#define C3 $r24 -+#define T0 $r25 -+#define T1 $r26 -+#define T2 $r27 -+#define KK $r28 -+#define AA $r29 -+#define CC $r30 -+#define BB B0 -+#undef ZERO -+#define ZERO $r0 -+ -+#define U0 $xr0 -+#define U1 $xr1 -+#define U2 $xr2 -+#define U3 $xr3 -+#define U4 $xr4 -+#define U5 $xr5 -+#define U6 $xr6 -+#define U7 $xr7 -+#define U8 $xr8 -+#define U9 $xr9 -+#define U10 $xr10 -+#define U11 $xr11 -+#define U12 $xr12 -+#define U13 $xr13 -+#define U14 $xr14 -+#define U15 $xr15 -+#define D0 $xr16 -+#define D1 $xr17 -+#define D2 $xr18 -+#define D3 $xr19 -+#define D4 $xr20 -+#define D5 $xr21 -+#define D6 $xr22 -+#define D7 $xr23 -+#define D8 $xr24 -+#define D9 $xr25 -+#define D10 $xr26 -+#define D11 $xr27 -+#define D12 $xr28 -+#define D13 $xr29 -+#define D14 $xr30 -+#define D15 $xr31 -+#define G0 D0 -+#define G1 D1 -+#define G2 D2 -+#define G3 D3 -+#define G4 D4 -+#define G5 D5 -+#define G6 D6 -+#define G7 D7 -+#define G8 D8 -+#define G9 D9 -+#define G10 D10 -+#define G11 D11 -+#define G12 D12 -+#define G13 D13 -+#define G14 D14 -+#define G15 D15 -+ -+/* Prefetch interval */ -+#define A_PRE 0x400 -+#define B_PRE 0x100 -+ -+#include "dtrsm_kernel_macro.S" -+ -+.macro ldrepl_macro start, end, stride -+// Load Ux (x = 0...15) -+.if \start <= \end -+ GLDREPL xv, d, $xr\start, B0, \stride * 8 -+ ldrepl_macro %start + 1, \end, %stride + 1 -+.endif -+.endm -+ -+.macro nmsub_macro start0, end0, start1, reg -+// Ux -= reg * Dx -+.if \start0 <= \end0 -+ xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 -+ nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg -+.endif -+.endm -+ -+.macro A_st_macro start, end, stride, N -+// Store Ux(x = 0...15) -+.if \start <= \end -+.if \N == 4 -+ xvst $xr\start, A0, \stride * 0x20 -+.elseif \N == 2 -+ vst $vr\start, A0, 
\stride * 0x10 -+.elseif \N == 1 -+ fst.d $f\start, A0, \stride * 0x08 -+.endif -+ A_st_macro %start + 1, \end, %stride + 1, \N -+.endif -+.endm -+ -+.macro dsolve_16x4 -+// We are going to process matrix B with a size of 4x4, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 1 2 3 -+// 5 6 7 -+// 10 11 -+// 15 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 19, 0 -+ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 -+ ldrepl_macro 20, 22, 5 -+ nmsub_macro 4, 7, 0, D1 -+ ldrepl_macro 23, 24, 10 -+ GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7 -+ ldrepl_macro 25, 25, 15 -+ nmsub_macro 8, 11, 0, D2 -+ nmsub_macro 8, 11, 4, D5 -+ GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11 -+ nmsub_macro 12, 15, 0, D3 -+ nmsub_macro 12, 15, 4, D6 -+ nmsub_macro 12, 15, 8, D8 -+ GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 -+// Store A -+ A_st_macro 0, 15, 0, 4 -+// Store C -+ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ -+ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ -+ U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \ -+ U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 -+.endm -+ -+.macro dsolve_16x2 -+// We are going to process matrix B with a size of 2x2, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 1 -+// 3 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 17, 0 -+ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 -+ ldrepl_macro 18, 18, 3 -+ nmsub_macro 4, 7, 0, D1 -+ GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 -+// Store A -+ A_st_macro 0, 7, 0, 4 -+// Store C -+ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ -+ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 -+.endm -+ -+.macro dsolve_8x4 -+// We are going to process matrix B with a size of 4x4, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 1 2 3 -+// 5 6 7 -+// 10 11 -+// 15 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 19, 0 -+ GMUL xvf, d, U0, D0, U0, U1, D0, U1 -+ ldrepl_macro 20, 22, 5 -+ nmsub_macro 2, 3, 0, D1 -+ ldrepl_macro 23, 24, 10 -+ GMUL xvf, d, U2, D4, U2, U3, D4, U3 -+ ldrepl_macro 25, 25, 15 -+ nmsub_macro 4, 5, 0, D2 -+ nmsub_macro 4, 5, 2, D5 -+ GMUL xvf, d, U4, D7, U4, U5, D7, U5 -+ nmsub_macro 6, 7, 0, D3 -+ nmsub_macro 6, 7, 2, D6 -+ nmsub_macro 6, 7, 4, D8 -+ GMUL xvf, d, U6, D9, U6, U7, D9, U7 -+// Store A -+ A_st_macro 0, 7, 0, 4 -+// Store C -+ GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ -+ U2, C1, 0x00, U3, C1, 0x20, \ -+ U4, C2, 0x00, U5, C2, 0x20, \ -+ U6, C3, 0x00, U7, C3, 0x20 -+.endm -+ -+.macro dsolve_8x2 -+// We are going to process matrix B with a size of 2x2, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 1 -+// 3 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 17, 0 -+ GMUL xvf, d, U0, D0, U0, U1, D0, U1 -+ ldrepl_macro 18, 18, 3 -+ nmsub_macro 2, 3, 0, D1 -+ GMUL xvf, d, U2, D2, U2, U3, D2, U3 -+// Store A -+ A_st_macro 0, 3, 0, 4 -+// Store C -+ GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ -+ U2, C1, 0x00, U3, C1, 0x20 -+.endm -+ -+.macro dsolve_4x4 -+// We are going to process matrix B with a size of 4x4, -+// using only the upper triangular portion. 
The memory layout of -+// matrix B is as follows: -+//0 1 2 3 -+// 5 6 7 -+// 10 11 -+// 15 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 19, 0 -+ GMUL xvf, d, U0, D0, U0 -+ ldrepl_macro 20, 22, 5 -+ nmsub_macro 1, 1, 0, D1 -+ ldrepl_macro 23, 24, 10 -+ GMUL xvf, d, U1, D4, U1 -+ ldrepl_macro 25, 25, 15 -+ nmsub_macro 2, 2, 0, D2 -+ nmsub_macro 2, 2, 1, D5 -+ GMUL xvf, d, U2, D7, U2 -+ nmsub_macro 3, 3, 0, D3 -+ nmsub_macro 3, 3, 1, D6 -+ nmsub_macro 3, 3, 2, D8 -+ GMUL xvf, d, U3, D9, U3 -+// Store A -+ A_st_macro 0, 3, 0, 4 -+// Store C -+ GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 -+.endm -+ -+.macro dsolve_4x2 -+// We are going to process matrix B with a size of 2x2, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 1 -+// 3 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 17, 0 -+ GMUL xvf, d, U0, D0, U0 -+ ldrepl_macro 18, 18, 3 -+ nmsub_macro 1, 1, 0, D1 -+ GMUL xvf, d, U1, D2, U1 -+// Store A -+ A_st_macro 0, 1, 0, 4 -+// Store C -+ GST xv, , U0, C0, 0x00, U1, C1, 0x00 -+.endm -+ -+.macro dsolve_2x4 -+// We are going to process matrix B with a size of 4x4, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 1 2 3 -+// 5 6 7 -+// 10 11 -+// 15 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 19, 0 -+ GMUL xvf, d, U0, D0, U0 -+ ldrepl_macro 20, 22, 5 -+ nmsub_macro 1, 1, 0, D1 -+ ldrepl_macro 23, 24, 10 -+ GMUL xvf, d, U1, D4, U1 -+ -+ ldrepl_macro 25, 25, 15 -+ nmsub_macro 2, 2, 0, D2 -+ nmsub_macro 2, 2, 1, D5 -+ GMUL xvf, d, U2, D7, U2 -+ nmsub_macro 3, 3, 0, D3 -+ nmsub_macro 3, 3, 1, D6 -+ nmsub_macro 3, 3, 2, D8 -+ GMUL xvf, d, U3, D9, U3 -+// Store A -+ A_st_macro 0, 3, 0, 2 -+// Store C -+ GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00, -+.endm -+ -+.macro dsolve_2x2 -+// We are going to process matrix B with a size of 2x2, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 1 -+// 3 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 17, 0 -+ GMUL xvf, d, U0, D0, U0 -+ ldrepl_macro 18, 18, 3 -+ nmsub_macro 1, 1, 0, D1 -+ GMUL xvf, d, U1, D2, U1 -+// Store A -+ A_st_macro 0, 1, 0, 2 -+// Store C -+ GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 -+.endm -+ -+.macro dsolve_1x4 -+// We are going to process matrix B with a size of 4x4, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 1 2 3 -+// 5 6 7 -+// 10 11 -+// 15 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 19, 0 -+ GMUL xvf, d, U0, D0, U0 -+ ldrepl_macro 20, 22, 5 -+ nmsub_macro 1, 1, 0, D1 -+ ldrepl_macro 23, 24, 10 -+ GMUL xvf, d, U1, D4, U1 -+ -+ ldrepl_macro 25, 25, 15 -+ nmsub_macro 2, 2, 0, D2 -+ nmsub_macro 2, 2, 1, D5 -+ GMUL xvf, d, U2, D7, U2 -+ nmsub_macro 3, 3, 0, D3 -+ nmsub_macro 3, 3, 1, D6 -+ nmsub_macro 3, 3, 2, D8 -+ GMUL xvf, d, U3, D9, U3 -+// Store A -+ A_st_macro 0, 3, 0, 1 -+// Store C -+ GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, -+.endm -+ -+.macro dsolve_1x2 -+// We are going to process matrix B with a size of 2x2, -+// using only the upper triangular portion. 
The memory layout of -+// matrix B is as follows: -+//0 1 -+// 3 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 17, 0 -+ GMUL xvf, d, U0, D0, U0 -+ ldrepl_macro 18, 18, 3 -+ nmsub_macro 1, 1, 0, D1 -+ GMUL xvf, d, U1, D2, U1 -+// Store A -+ A_st_macro 0, 1, 0, 1 -+// Store C -+ GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 -+.endm -+ -+.macro dgemm_dsolve_16x4 -+ bge ZERO, L, .L_dsolve_16x4_load -+ dgemm_16x4 -+ b .L_dsolve_16x4 -+.L_dsolve_16x4_load: -+ // Load C -+ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 -+ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 -+ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 -+ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 -+/********************** solver ******************/ -+.L_dsolve_16x4: -+ dsolve_16x4 -+.endm -+ -+.macro dgemm_dsolve_8x4 -+ bge ZERO, L, .L_dsolve_8x4_load -+ dgemm_8x4 -+ b .L_dsolve_8x4 -+.L_dsolve_8x4_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ -+ /* Load C1 */ -+ xvld U2, C1, 0x00 -+ xvld U3, C1, 0x20 -+ -+ /* Load C2 */ -+ xvld U4, C2, 0x00 -+ xvld U5, C2, 0x20 -+ -+ /* Load C3 */ -+ xvld U6, C3, 0x00 -+ xvld U7, C3, 0x20 -+/********* solver *********/ -+.L_dsolve_8x4: -+ dsolve_8x4 -+.endm -+ -+.macro dgemm_dsolve_4x4 -+ bge ZERO, L, .L_dsolve_4x4_load -+ dgemm_4x4 -+ b .L_dsolve_4x4 -+.L_dsolve_4x4_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ /* Load C2 */ -+ xvld U2, C2, 0x00 -+ /* Load C3 */ -+ xvld U3, C3, 0x00 -+/************** solver *****************/ -+.L_dsolve_4x4: -+ dsolve_4x4 -+.endm -+ -+.macro dgemm_dsolve_2x4 -+ bge ZERO, L, .L_dsolve_2x4_load -+ dgemm_2x4 -+ xvpermi.q U2, U0, 0x01 -+ xvpermi.q U3, U1, 0x01 -+ b .L_dsolve_2x4 -+.L_dsolve_2x4_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ /* Load C2 */ -+ xvld U2, C2, 0x00 -+ /* Load C3 */ -+ xvld U3, C3, 0x00 -+/********************** solver ******************/ -+.L_dsolve_2x4: -+ dsolve_2x4 -+.endm -+ -+.macro dgemm_dsolve_1x4 -+ bge ZERO, L, .L_dsolve_1x4_load -+ dgemm_1x4 -+ xvpackod.d U1, U0, U0 -+ xvpermi.q U2, U0, 0x01 -+ xvpermi.q U3, U1, 0x01 -+ b .L_dsolve_1x4 -+.L_dsolve_1x4_load: -+ // Load C -+ fld.d $f0, C0, 0x00 -+ fld.d $f1, C1, 0x00 -+ fld.d $f2, C2, 0x00 -+ fld.d $f3, C3, 0x00 -+.L_dsolve_1x4: -+ dsolve_1x4 -+.endm -+ -+.macro dgemm_dsolve_16x2 -+ bge ZERO, L, .L_dsolve_16x2_load -+ dgemm_16x2 -+ b .L_dsolve_16x2 -+.L_dsolve_16x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 -+ /* Load C1 */ -+ xvld U4, C1, 0x00 -+ xvld U5, C1, 0x20 -+ xvld U6, C1, 0x40 -+ xvld U7, C1, 0x60 -+.L_dsolve_16x2: -+ dsolve_16x2 -+.endm -+ -+.macro dgemm_dsolve_8x2 -+ bge ZERO, L, .L_dsolve_8x2_load -+ dgemm_8x2 -+ b .L_dsolve_8x2 -+.L_dsolve_8x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ /* Load C1 */ -+ xvld U2, C1, 0x00 -+ xvld U3, C1, 0x20 -+.L_dsolve_8x2: -+ dsolve_8x2 -+.endm -+ -+.macro dgemm_dsolve_4x2 -+ bge ZERO, L, .L_dsolve_4x2_load -+ dgemm_4x2 -+ b .L_dsolve_4x2 -+.L_dsolve_4x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+.L_dsolve_4x2: -+ dsolve_4x2 -+.endm -+ -+.macro dgemm_dsolve_2x2 -+ bge ZERO, L, .L_dsolve_2x2_load -+ dgemm_2x2 -+ b .L_dsolve_2x2 -+.L_dsolve_2x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+.L_dsolve_2x2: -+ dsolve_2x2 -+.endm -+ -+.macro dgemm_dsolve_1x2 -+ bge ZERO, L, 
.L_dsolve_1x2_load -+ dgemm_1x2 -+ xvpackod.d U1, U0, U0 -+ b .L_dsolve_1x2 -+.L_dsolve_1x2_load: -+ // Load C -+ fld.d $f0, C0, 0x00 -+ fld.d $f1, C1, 0x00 -+.L_dsolve_1x2: -+ dsolve_1x2 -+.endm -+ -+.macro dgemm_dsolve_16x1 -+ bge ZERO, L, .L_dsolve_16x1_load -+ dgemm_16x1 -+ b .L_dsolve_16x1 -+.L_dsolve_16x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 -+.L_dsolve_16x1: -+ ldrepl_macro 16, 16, 0 -+ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 -+ // Store A -+ A_st_macro 0, 3, 0, 4 -+ // Strore C -+ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 -+.endm -+ -+.macro dgemm_dsolve_8x1 -+ bge ZERO, L, .L_dsolve_8x1_load -+ dgemm_8x1 -+ b .L_dsolve_8x1 -+.L_dsolve_8x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+.L_dsolve_8x1: -+ ldrepl_macro 16, 16, 0 -+ GMUL xvf, d, U0, D0, U0, U1, D0, U1 -+ // Store A -+ A_st_macro 0, 1, 0, 4 -+ // Strore C -+ GST xv, , U0, C0, 0x00, U1, C0, 0x20 -+.endm -+ -+.macro dgemm_dsolve_4x1 -+ bge ZERO, L, .L_dsolve_4x1_load -+ dgemm_4x1 -+ b .L_dsolve_4x1 -+.L_dsolve_4x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+.L_dsolve_4x1: -+ ldrepl_macro 16, 16, 0 -+ GMUL xvf, d, U0, D0, U0 -+ // Store A -+ A_st_macro 0, 0, 0, 4 -+ // Strore C -+ GST xv, , U0, C0, 0x00 -+.endm -+ -+.macro dgemm_dsolve_2x1 -+ bge ZERO, L, .L_dsolve_2x1_load -+ dgemm_2x1 -+ b .L_dsolve_2x1 -+.L_dsolve_2x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+.L_dsolve_2x1: -+ ldrepl_macro 16, 16, 0 -+ GMUL xvf, d, U0, D0, U0 -+ // Store A -+ A_st_macro 0, 0, 0, 2 -+ // Strore C -+ GST v, , $vr0, C0, 0x00 -+.endm -+ -+.macro dgemm_dsolve_1x1 -+ bge ZERO, L, .L_dsolve_1x1_load -+ dgemm_1x1 -+ b .L_dsolve_1x1 -+.L_dsolve_1x1_load: -+ // Load C -+ fld.d $f0, C0, 0x00 -+.L_dsolve_1x1: -+ ldrepl_macro 16, 16, 0 -+ GMUL xvf, d, U0, D0, U0 -+ // Store A -+ A_st_macro 0, 0, 0, 1 -+ // Strore C -+ GST f, d, $f0, C0, 0x00 -+.endm -+ -+ PROLOGUE -+ push_if_used 26, 32 -+ PTR_SLLI LDC, LDC, 3 -+ PTR_SUB KK, ZERO, OFFSET -+ /* if (!(N >> 2)) goto L_N3 */ -+ PTR_SRAI J, N, 2 /* J = bn >> 2 */ -+ andi N, N, 0x03 -+ beq ZERO, J, .L_N3 -+.align 5 -+.L_J1: -+ PTR_ADDI J, J, -1 -+ move AA, A -+ move CC, C -+ PTR_SRAI I, M, 4 // M >> 4 -+ beqz I, .L_M15 -+.align 4 -+.L_I1: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_16x4 -+ PTR_ADDI I, I, -1 -+ PTR_SLLI T0, K, 7 -+ PTR_ADDI CC, CC, 0x80 // cc += 16 -+ PTR_ADD AA, AA, T0 // aa += 16 * k -+ bnez I, .L_I1 -+.L_M15: -+ andi I, M, 8 -+ beqz I, .L_M7 -+.L_M8: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_8x4 -+ PTR_SLLI T0, K, 6 -+ PTR_ADDI CC, CC, 0x40 // cc += 8 -+ PTR_ADD AA, AA, T0 // aa += 8 * k -+.L_M7: -+ andi I, M, 4 -+ beqz I, .L_M3 -+.L_M4: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_4x4 -+ PTR_SLLI T0, K, 5 -+ PTR_ADDI CC, CC, 0x20 // cc += 4 -+ PTR_ADD AA, AA, T0 // aa += 4 * k -+.L_M3: -+ andi I, M, 2 -+ beqz I, .L_M1 -+.L_M2: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_2x4 -+ PTR_SLLI T0, K, 4 -+ PTR_ADDI CC, CC, 0x10 // cc += 2 -+ PTR_ADD AA, AA, T0 // aa += 2 * k -+.L_M1: -+ andi I, M, 1 -+ beqz I, .L_M0 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_1x4 
-+ PTR_SLLI T0, K, 3 -+ PTR_ADDI CC, CC, 0x08 // cc += 1 -+ PTR_ADD AA, AA, T0 // aa += 1 * k -+.L_M0: -+ PTR_SLLI T0, K, 5 -+ PTR_SLLI T1, LDC, 2 -+ PTR_ADD B, B, T0 // b += 4 * k -+ PTR_ADD C, C, T1 // c += 4 * ldc -+ PTR_ADDI KK, KK, 4 // kk += 4 -+ bnez J, .L_J1 -+.L_N3: -+ andi J, N, 2 -+ beq ZERO, J, .L_N1 -+.L_N2: -+ move AA, A -+ move CC, C -+ PTR_SRAI I, M, 4 // M >> 4 -+ beqz I, .L_N2_M15 -+.align 4 -+.L_N2_I1: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_16x2 -+ PTR_ADDI I, I, -1 -+ PTR_SLLI T0, K, 7 -+ PTR_ADDI CC, CC, 0x80 // cc += 16 -+ PTR_ADD AA, AA, T0 // aa += 16 * k -+ bnez I, .L_N2_I1 -+.L_N2_M15: -+ andi I, M, 8 -+ beqz I, .L_N2_M7 -+.L_N2_M8: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_8x2 -+ PTR_SLLI T0, K, 6 -+ PTR_ADDI CC, CC, 0x40 // cc += 8 -+ PTR_ADD AA, AA, T0 // aa += 8 * k -+.L_N2_M7: -+ andi I, M, 4 -+ beqz I, .L_N2_M3 -+.L_N2_M4: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_4x2 -+ PTR_SLLI T0, K, 5 -+ PTR_ADDI CC, CC, 0x20 // cc += 4 -+ PTR_ADD AA, AA, T0 // aa += 4 * k -+.L_N2_M3: -+ andi I, M, 2 -+ beqz I, .L_N2_M1 -+.L_N2_M2: -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_2x2 -+ PTR_SLLI T0, K, 4 -+ PTR_ADDI CC, CC, 0x10 // cc += 2 -+ PTR_ADD AA, AA, T0 // aa += 2 * k -+.L_N2_M1: -+ andi I, M, 1 -+ beqz I, .L_N2_M0 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_1x2 -+ PTR_SLLI T0, K, 3 -+ PTR_ADDI CC, CC, 0x08 // cc += 1 -+ PTR_ADD AA, AA, T0 // aa += 1 * k -+.L_N2_M0: -+ PTR_SLLI T0, K, 4 -+ PTR_SLLI T1, LDC, 1 -+ PTR_ADD B, B, T0 // b += 2 * k -+ PTR_ADD C, C, T1 // c += 2 * ldc -+ PTR_ADDI KK, KK, 2 // kk += 2 -+.L_N1: -+ andi J, N, 1 -+ beq ZERO, J, .L_N0 -+ move AA, A -+ move CC, C -+ PTR_SRAI I, M, 4 // M >> 4 -+ beqz I, .L_N1_M15 -+.align 4 -+.L_N1_I1: -+ GADD , d, C0, CC, ZERO -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_16x1 -+ PTR_ADDI I, I, -1 -+ PTR_SLLI T0, K, 7 -+ PTR_ADDI CC, CC, 0x80 // cc += 16 -+ PTR_ADD AA, AA, T0 // aa += 16 * k -+ bnez I, .L_N1_I1 -+.L_N1_M15: -+ andi I, M, 8 -+ beqz I, .L_N1_M7 -+.L_N1_M8: -+ GADD , d, C0, CC, ZERO -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_8x1 -+ PTR_SLLI T0, K, 6 -+ PTR_ADDI CC, CC, 0x40 // cc += 8 -+ PTR_ADD AA, AA, T0 // aa += 8 * k -+.L_N1_M7: -+ andi I, M, 4 -+ beqz I, .L_N1_M3 -+.L_N1_M4: -+ GADD , d, C0, CC, ZERO -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_4x1 -+ PTR_SLLI T0, K, 5 -+ PTR_ADDI CC, CC, 0x20 // cc += 4 -+ PTR_ADD AA, AA, T0 // aa += 4 * k -+.L_N1_M3: -+ andi I, M, 2 -+ beqz I, .L_N1_M1 -+.L_N1_M2: -+ GADD , d, C0, CC, ZERO -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_2x1 -+ PTR_SLLI T0, K, 4 -+ PTR_ADDI CC, CC, 0x10 // cc += 2 -+ PTR_ADD AA, AA, T0 // aa += 2 * k -+.L_N1_M1: -+ andi I, M, 1 -+ beqz I, .L_N1_M0 -+ GADD , d, C0, CC, ZERO -+ move A0, AA -+ move B0, B -+ move L, KK -+ dgemm_dsolve_1x1 -+ PTR_SLLI T0, K, 3 -+ PTR_ADDI CC, CC, 0x08 // cc += 1 -+ PTR_ADD AA, AA, T0 // aa += 1 * k -+.L_N1_M0: -+.L_N0: -+ pop_if_used 26, 32 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S -new file mode 100644 -index 000000000..5f86d75b5 ---- /dev/null -+++ b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S -@@ -0,0 +1,953 @@ 
-+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "loongarch64_asm.S" -+ -+/********************************************************************* -+* 2023/09/26 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+* -+* -+*********************************************************************/ -+ -+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, -+ * FLOAT *c, BLASLONG ldc, BLASLONG offset) -+ */ -+#define M $r4 // param 1: bm -+#define N $r5 // param 2: bn -+#define K $r6 // param 3: bk -+#define A $r7 // param 5: ba -+#define B $r8 // param 6: bb -+#define C $r9 // param 7: bc -+#define LDC $r10 // param 8: ldc -+#define OFFSET $r11 // param 9: offset -+ -+/* Cycle control parameters */ -+#define I $r13 -+#define J $r14 -+#define L $r15 -+#define TL $r16 -+/* Matrix address */ -+#define A0 $r17 -+#define B0 $r18 -+#define C0 $r19 -+#define C1 $r20 -+#define C2 $r23 -+#define C3 $r24 -+#define T0 $r25 -+#define T1 $r26 -+#define T2 $r27 -+#define KK $r28 -+#define AA $r29 -+#define CC $r30 -+#define BB $r31 -+#undef ZERO -+#define ZERO $r0 -+ -+#define U0 $xr0 -+#define U1 $xr1 -+#define U2 $xr2 -+#define U3 $xr3 -+#define U4 $xr4 -+#define U5 $xr5 -+#define U6 $xr6 -+#define U7 $xr7 -+#define U8 $xr8 -+#define U9 $xr9 -+#define U10 $xr10 -+#define U11 $xr11 -+#define U12 $xr12 -+#define U13 $xr13 -+#define U14 $xr14 -+#define U15 $xr15 -+#define D0 $xr16 -+#define D1 $xr17 -+#define D2 $xr18 -+#define D3 $xr19 -+#define D4 $xr20 -+#define D5 $xr21 -+#define D6 $xr22 -+#define D7 $xr23 -+#define D8 $xr24 -+#define D9 $xr25 -+#define D10 $xr26 -+#define D11 $xr27 -+#define D12 $xr28 -+#define D13 $xr29 -+#define D14 $xr30 -+#define D15 $xr31 -+ -+/* Prefetch interval */ -+#define A_PRE 0x400 -+#define B_PRE 0x100 -+ -+#include "dtrsm_kernel_macro.S" -+ -+.macro ldrepl_macro start, 
end, stride -+// Load Ux (x = 0...15) -+.if \start <= \end -+ GLDREPL xv, d, $xr\start, B0, \stride * 8 -+ ldrepl_macro %start + 1, \end, %stride + 1 -+.endif -+.endm -+ -+.macro nmsub_macro start0, end0, start1, reg -+// Ux -= reg * Dx -+.if \start0 <= \end0 -+ xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0 -+ nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg -+.endif -+.endm -+ -+.macro A_st_macro start, end, stride, N -+// Store Ux(x = 0...15) -+.if \start <= \end -+.if \N == 4 -+ xvst $xr\start, A0, \stride * 0x20 -+.elseif \N == 2 -+ vst $vr\start, A0, \stride * 0x10 -+.elseif \N == 1 -+ fst.d $f\start, A0, \stride * 0x08 -+.endif -+ A_st_macro %start + 1, \end, %stride + 1, \N -+.endif -+.endm -+ -+.macro dsolve_16x2 -+// We are going to process matrix B with a size of 2x2, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 -+//2 3 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 16, 0 -+ ldrepl_macro 17, 18, 2 -+ GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 -+ nmsub_macro 0, 3, 4, D1 -+ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 -+// Store A -+ A_st_macro 0, 7, 0, 4 -+// Store C -+ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ -+ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 -+.endm -+ -+.macro dsolve_8x2 -+// We are going to process matrix B with a size of 2x2, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 -+//2 3 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 16, 0 -+ ldrepl_macro 17, 18, 2 -+ GMUL xvf, d, U2, D2, U2, U3, D2, U3 -+ nmsub_macro 0, 1, 2, D1 -+ GMUL xvf, d, U0, D0, U0, U1, D0, U1 -+// Store A -+ A_st_macro 0, 3, 0, 4 -+// Store C -+ GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ -+ U2, C1, 0x00, U3, C1, 0x20 -+.endm -+ -+.macro dsolve_4x2 -+// We are going to process matrix B with a size of 2x2, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 -+//2 3 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 16, 0 -+ ldrepl_macro 17, 18, 2 -+ GMUL xvf, d, U1, D2, U1 -+ nmsub_macro 0, 0, 1, D1 -+ GMUL xvf, d, U0, D0, U0 -+// Store A -+ A_st_macro 0, 1, 0, 4 -+// Store C -+ GST xv, , U0, C0, 0x00, U1, C1, 0x00 -+.endm -+ -+.macro dsolve_2x2 -+// We are going to process matrix B with a size of 2x2, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 -+//2 3 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 16, 0 -+ ldrepl_macro 17, 18, 2 -+ GMUL xvf, d, U1, D2, U1 -+ nmsub_macro 0, 0, 1, D1 -+ GMUL xvf, d, U0, D0, U0 -+// Store A -+ A_st_macro 0, 1, 0, 2 -+// Store C -+ GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00 -+.endm -+ -+.macro dsolve_1x2 -+// We are going to process matrix B with a size of 2x2, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 -+//2 3 -+// Sequentially extract data from B in row order -+ ldrepl_macro 16, 16, 0 -+ ldrepl_macro 17, 18, 2 -+ GMUL xvf, d, U1, D2, U1 -+ nmsub_macro 0, 0, 1, D1 -+ GMUL xvf, d, U0, D0, U0 -+// Store A -+ A_st_macro 0, 1, 0, 1 -+// Store C -+ GST f, d, $f0, C0, 0x00, $f1, C1, 0x00 -+.endm -+ -+.macro dsolve_16x4 -+// We are going to process matrix B with a size of 4x4, -+// using only the upper triangular portion. 
The memory layout of -+// matrix B is as follows: -+//0 -+//4 5 -+//8 9 10 -+//12 13 14 15 -+// Sequentially extract data from B in row order -+ ldrepl_macro 22, 25, 12 -+ GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15 -+ ldrepl_macro 19, 21, 8 -+ nmsub_macro 8, 11, 12, D8 -+ ldrepl_macro 17, 18, 4 -+ GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11 -+ ldrepl_macro 16, 16, 0 -+ nmsub_macro 4, 7, 12, D7 -+ nmsub_macro 4, 7, 8, D4 -+ GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7 -+ nmsub_macro 0, 3, 12, D6 -+ nmsub_macro 0, 3, 8, D3 -+ nmsub_macro 0, 3, 4, D1 -+ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 -+// Store A -+ A_st_macro 0, 15, 0, 4 -+// Store C -+ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \ -+ U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \ -+ U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \ -+ U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 -+.endm -+ -+.macro dsolve_8x4 -+// We are going to process matrix B with a size of 4x4, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 -+//4 5 -+//8 9 10 -+//12 13 14 15 -+// Sequentially extract data from B in row order -+ ldrepl_macro 22, 25, 12 -+ GMUL xvf, d, U6, D9, U6, U7, D9, U7 -+ ldrepl_macro 19, 21, 8 -+ nmsub_macro 4, 5, 6, D8 -+ ldrepl_macro 17, 18, 4 -+ GMUL xvf, d, U4, D5, U4, U5, D5, U5 -+ ldrepl_macro 16, 16, 0 -+ nmsub_macro 2, 3, 6, D7 -+ nmsub_macro 2, 3, 4, D4 -+ GMUL xvf, d, U2, D2, U2, U3, D2, U3 -+ nmsub_macro 0, 1, 6, D6 -+ nmsub_macro 0, 1, 4, D3 -+ nmsub_macro 0, 1, 2, D1 -+ GMUL xvf, d, U0, D0, U0, U1, D0, U1 -+// Store A -+ A_st_macro 0, 7, 0, 4 -+// Store C -+ GST xv, , U0, C0, 0x00, U1, C0, 0x20, \ -+ U2, C1, 0x00, U3, C1, 0x20, \ -+ U4, C2, 0x00, U5, C2, 0x20, \ -+ U6, C3, 0x00, U7, C3, 0x20 -+.endm -+ -+.macro dsolve_4x4 -+// We are going to process matrix B with a size of 4x4, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 -+//4 5 -+//8 9 10 -+//12 13 14 15 -+// Sequentially extract data from B in row order -+ ldrepl_macro 22, 25, 12 -+ GMUL xvf, d, U3, D9, U3 -+ ldrepl_macro 19, 21, 8 -+ nmsub_macro 2, 2, 3, D8 -+ ldrepl_macro 17, 18, 4 -+ GMUL xvf, d, U2, D5, U2 -+ ldrepl_macro 16, 16, 0 -+ nmsub_macro 1, 1, 3, D7 -+ nmsub_macro 1, 1, 2, D4 -+ GMUL xvf, d, U1, D2, U1 -+ nmsub_macro 0, 0, 3, D6 -+ nmsub_macro 0, 0, 2, D3 -+ nmsub_macro 0, 0, 1, D1 -+ GMUL xvf, d, U0, D0, U0 -+// Store A -+ A_st_macro 0, 3, 0, 4 -+// Store C -+ GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00 -+.endm -+ -+.macro dsolve_2x4 -+// We are going to process matrix B with a size of 4x4, -+// using only the upper triangular portion. 
The memory layout of -+// matrix B is as follows: -+//0 -+//4 5 -+//8 9 10 -+//12 13 14 15 -+// Sequentially extract data from B in row order -+ ldrepl_macro 22, 25, 12 -+ GMUL xvf, d, U3, D9, U3 -+ ldrepl_macro 19, 21, 8 -+ nmsub_macro 2, 2, 3, D8 -+ ldrepl_macro 17, 18, 4 -+ GMUL xvf, d, U2, D5, U2 -+ ldrepl_macro 16, 16, 0 -+ nmsub_macro 1, 1, 3, D7 -+ nmsub_macro 1, 1, 2, D4 -+ GMUL xvf, d, U1, D2, U1 -+ nmsub_macro 0, 0, 3, D6 -+ nmsub_macro 0, 0, 2, D3 -+ nmsub_macro 0, 0, 1, D1 -+ GMUL xvf, d, U0, D0, U0 -+// Store A -+ A_st_macro 0, 3, 0, 2 -+// Store C -+ GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00 -+.endm -+ -+.macro dsolve_1x4 -+// We are going to process matrix B with a size of 4x4, -+// using only the upper triangular portion. The memory layout of -+// matrix B is as follows: -+//0 -+//4 5 -+//8 9 10 -+//12 13 14 15 -+// Sequentially extract data from B in row order -+ ldrepl_macro 22, 25, 12 -+ GMUL xvf, d, U3, D9, U3 -+ ldrepl_macro 19, 21, 8 -+ nmsub_macro 2, 2, 3, D8 -+ ldrepl_macro 17, 18, 4 -+ GMUL xvf, d, U2, D5, U2 -+ ldrepl_macro 16, 16, 0 -+ nmsub_macro 1, 1, 3, D7 -+ nmsub_macro 1, 1, 2, D4 -+ GMUL xvf, d, U1, D2, U1 -+ nmsub_macro 0, 0, 3, D6 -+ nmsub_macro 0, 0, 2, D3 -+ nmsub_macro 0, 0, 1, D1 -+ GMUL xvf, d, U0, D0, U0 -+// Store A -+ A_st_macro 0, 3, 0, 1 -+// Store C -+ GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00, -+.endm -+ -+.macro dgemm_dsolve_16x1 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_16x1_load -+ dgemm_16x1 -+ b .L_dsolve_16x1 -+.L_dsolve_16x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 -+.L_dsolve_16x1: -+ PTR_ADDI A0, T1, -16 * 8 -+ PTR_ADDI B0, T2, -1 * 8 -+ ldrepl_macro 16, 16, 0 -+ GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3 -+ // Store A -+ A_st_macro 0, 3, 0, 4 -+ // Strore C -+ GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 -+.endm -+ -+.macro dgemm_dsolve_8x1 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_8x1_load -+ dgemm_8x1 -+ b .L_dsolve_8x1 -+.L_dsolve_8x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+.L_dsolve_8x1: -+ PTR_ADDI A0, T1, -8 * 8 -+ PTR_ADDI B0, T2, -1 * 8 -+ ldrepl_macro 16, 16, 0 -+ GMUL xvf, d, U0, D0, U0, U1, D0, U1 -+ // Store A -+ A_st_macro 0, 1, 0, 4 -+ // Strore C -+ GST xv, , U0, C0, 0x00, U1, C0, 0x20 -+.endm -+ -+.macro dgemm_dsolve_4x1 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_4x1_load -+ dgemm_4x1 -+ b .L_dsolve_4x1 -+.L_dsolve_4x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+.L_dsolve_4x1: -+ PTR_ADDI A0, T1, -4 * 8 -+ PTR_ADDI B0, T2, -1 * 8 -+ ldrepl_macro 16, 16, 0 -+ GMUL xvf, d, U0, D0, U0 -+ // Store A -+ A_st_macro 0, 0, 0, 4 -+ // Strore C -+ GST xv, , U0, C0, 0x00 -+.endm -+ -+.macro dgemm_dsolve_2x1 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_2x1_load -+ dgemm_2x1 -+ b .L_dsolve_2x1 -+.L_dsolve_2x1_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+.L_dsolve_2x1: -+ PTR_ADDI A0, T1, -2 * 8 -+ PTR_ADDI B0, T2, -1 * 8 -+ ldrepl_macro 16, 16, 0 -+ GMUL xvf, d, U0, D0, U0 -+ // Store A -+ A_st_macro 0, 0, 0, 2 -+ // Strore C -+ GST v, , $vr0, C0, 0x00 -+.endm -+ -+.macro dgemm_dsolve_1x1 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_1x1_load -+ dgemm_1x1 -+ b .L_dsolve_1x1 -+.L_dsolve_1x1_load: -+ // Load C -+ fld.d $f0, C0, 0x00 -+.L_dsolve_1x1: -+ PTR_ADDI A0, T1, -1 * 8 -+ PTR_ADDI B0, T2, -1 * 8 -+ ldrepl_macro 16, 16, 0 -+ GMUL xvf, d, U0, D0, U0 -+ // Store 
A -+ A_st_macro 0, 0, 0, 1 -+ // Strore C -+ GST f, d, $f0, C0, 0x00 -+.endm -+ -+.macro dgemm_dsolve_16x2 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_16x2_load -+ dgemm_16x2 -+ b .L_dsolve_16x2 -+.L_dsolve_16x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 -+ /* Load C1 */ -+ xvld U4, C1, 0x00 -+ xvld U5, C1, 0x20 -+ xvld U6, C1, 0x40 -+ xvld U7, C1, 0x60 -+.L_dsolve_16x2: -+ PTR_ADDI A0, T1, -(16 * 2) * 8 -+ PTR_ADDI B0, T2, -(2 * 2) * 8 -+ dsolve_16x2 -+.endm -+ -+.macro dgemm_dsolve_8x2 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_8x2_load -+ dgemm_8x2 -+ b .L_dsolve_8x2 -+.L_dsolve_8x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ /* Load C1 */ -+ xvld U2, C1, 0x00 -+ xvld U3, C1, 0x20 -+.L_dsolve_8x2: -+ PTR_ADDI A0, T1, -(8 * 2) * 8 -+ PTR_ADDI B0, T2, -(2 * 2) * 8 -+ dsolve_8x2 -+.endm -+ -+.macro dgemm_dsolve_4x2 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_4x2_load -+ dgemm_4x2 -+ b .L_dsolve_4x2 -+.L_dsolve_4x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+.L_dsolve_4x2: -+ PTR_ADDI A0, T1, -(4 * 2) * 8 -+ PTR_ADDI B0, T2, -(2 * 2) * 8 -+ dsolve_4x2 -+.endm -+ -+.macro dgemm_dsolve_2x2 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_2x2_load -+ dgemm_2x2 -+ b .L_dsolve_2x2 -+.L_dsolve_2x2_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+.L_dsolve_2x2: -+ PTR_ADDI A0, T1, -(2 * 2) * 8 -+ PTR_ADDI B0, T2, -(2 * 2) * 8 -+ dsolve_2x2 -+.endm -+ -+.macro dgemm_dsolve_1x2 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_1x2_load -+ dgemm_1x2 -+ xvpackod.d U1, U0, U0 -+ b .L_dsolve_1x2 -+.L_dsolve_1x2_load: -+ // Load C -+ fld.d $f0, C0, 0x00 -+ fld.d $f1, C1, 0x00 -+.L_dsolve_1x2: -+ PTR_ADDI A0, T1, -(1 * 2) * 8 -+ PTR_ADDI B0, T2, -(2 * 2) * 8 -+ dsolve_1x2 -+.endm -+ -+.macro dgemm_dsolve_16x4 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_16x4_load -+ dgemm_16x4 -+ b .L_dsolve_16x4 -+.L_dsolve_16x4_load: -+ // Load C -+ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 -+ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 -+ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 -+ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 -+/********************** solver ******************/ -+.L_dsolve_16x4: -+ PTR_ADDI A0, T1, -(16 * 4) * 8 -+ PTR_ADDI B0, T2, -(4 * 4) * 8 -+ dsolve_16x4 -+.endm -+ -+.macro dgemm_dsolve_8x4 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_8x4_load -+ dgemm_8x4 -+ b .L_dsolve_8x4 -+.L_dsolve_8x4_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ -+ /* Load C1 */ -+ xvld U2, C1, 0x00 -+ xvld U3, C1, 0x20 -+ -+ /* Load C2 */ -+ xvld U4, C2, 0x00 -+ xvld U5, C2, 0x20 -+ -+ /* Load C3 */ -+ xvld U6, C3, 0x00 -+ xvld U7, C3, 0x20 -+/********* solver *********/ -+.L_dsolve_8x4: -+ PTR_ADDI A0, T1, -(8 * 4) * 8 -+ PTR_ADDI B0, T2, -(4 * 4) * 8 -+ dsolve_8x4 -+.endm -+ -+.macro dgemm_dsolve_4x4 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_4x4_load -+ dgemm_4x4 -+ b .L_dsolve_4x4 -+.L_dsolve_4x4_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ /* Load C2 */ -+ xvld U2, C2, 0x00 -+ /* Load C3 */ -+ xvld U3, C3, 0x00 -+/************** solver *****************/ -+.L_dsolve_4x4: -+ PTR_ADDI A0, T1, -(4 * 4) * 8 -+ PTR_ADDI B0, T2, -(4 * 4) * 8 -+ dsolve_4x4 -+.endm -+ -+.macro dgemm_dsolve_2x4 -+ 
or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_2x4_load -+ dgemm_2x4 -+ xvpermi.q U2, U0, 0x01 -+ xvpermi.q U3, U1, 0x01 -+ b .L_dsolve_2x4 -+.L_dsolve_2x4_load: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ /* Load C2 */ -+ xvld U2, C2, 0x00 -+ /* Load C3 */ -+ xvld U3, C3, 0x00 -+/********************** solver ******************/ -+.L_dsolve_2x4: -+ PTR_ADDI A0, T1, -(2 * 4) * 8 -+ PTR_ADDI B0, T2, -(4 * 4) * 8 -+ dsolve_2x4 -+.endm -+ -+.macro dgemm_dsolve_1x4 -+ or T1, A0, A0 -+ or T2, B0, B0 -+ bge ZERO, L, .L_dsolve_1x4_load -+ dgemm_1x4 -+ xvpackod.d U1, U0, U0 -+ xvpermi.q U2, U0, 0x01 -+ xvpermi.q U3, U1, 0x01 -+ b .L_dsolve_1x4 -+.L_dsolve_1x4_load: -+ // Load C -+ fld.d $f0, C0, 0x00 -+ fld.d $f1, C1, 0x00 -+ fld.d $f2, C2, 0x00 -+ fld.d $f3, C3, 0x00 -+.L_dsolve_1x4: -+ PTR_ADDI A0, T1, -(1 * 4) * 8 -+ PTR_ADDI B0, T2, -(4 * 4) * 8 -+ dsolve_1x4 -+.endm -+ -+ PROLOGUE -+ push_if_used 26, 32 -+ PTR_SLLI LDC, LDC, 3 -+ PTR_SUB KK, N, OFFSET -+ PTR_MUL T0, N, LDC -+ PTR_MUL T1, N, K -+ PTR_ADD C, C, T0 // c += n * ldc -+ PTR_SLLI T1, T1, 3 -+ PTR_ADD B, B, T1 -+ -+ andi J, N, 1 -+ beqz J, .L_N2 -+.L_N1: -+ move AA, A -+ PTR_SUB C, C, LDC // c -= ldc -+ PTR_SLLI T0, K, 3 -+ PTR_SLLI T1, KK, 3 -+ PTR_SUB B, B, T0 // b -= k -+ PTR_ADD BB, B, T1 // bb = b + kk -+ move CC, C -+ -+ PTR_SRAI I, M, 4 // M >> 4 -+ beqz I, .L_N1_M15 -+.align 4 -+.L_N1_I1: -+ PTR_SLLI T1, KK, 7 -+ GADD , d, C0, CC, ZERO -+ PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_16x1 -+ PTR_ADDI I, I, -1 -+ PTR_SLLI T0, K, 7 -+ PTR_ADDI CC, CC, 0x80 // cc += 16 -+ PTR_ADD AA, AA, T0 // aa += 16 * k -+ bnez I, .L_N1_I1 -+.L_N1_M15: -+ andi I, M, 8 -+ beqz I, .L_N1_M7 -+.L_N1_M8: -+ PTR_SLLI T1, KK, 6 -+ GADD , d, C0, CC, ZERO -+ PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_8x1 -+ PTR_SLLI T0, K, 6 -+ PTR_ADDI CC, CC, 0x40 // cc += 8 -+ PTR_ADD AA, AA, T0 // aa += 8 * k -+.L_N1_M7: -+ andi I, M, 4 -+ beqz I, .L_N1_M3 -+.L_N1_M4: -+ PTR_SLLI T1, KK, 5 -+ GADD , d, C0, CC, ZERO -+ PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_4x1 -+ PTR_SLLI T0, K, 5 -+ PTR_ADDI CC, CC, 0x20 // cc += 4 -+ PTR_ADD AA, AA, T0 // aa += 4 * k -+.L_N1_M3: -+ andi I, M, 2 -+ beqz I, .L_N1_M1 -+.L_N1_M2: -+ PTR_SLLI T1, KK, 4 -+ GADD , d, C0, CC, ZERO -+ PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_2x1 -+ PTR_SLLI T0, K, 4 -+ PTR_ADDI CC, CC, 0x10 // cc += 2 -+ PTR_ADD AA, AA, T0 // aa += 2 * k -+.L_N1_M1: -+ andi I, M, 1 -+ beqz I, .L_N1_M0 -+ PTR_SLLI T1, KK, 3 -+ GADD , d, C0, CC, ZERO -+ PTR_ADD A0, AA, T1 // a0 = aa + kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_1x1 -+ PTR_SLLI T0, K, 3 -+ PTR_ADDI CC, CC, 0x08 // cc += 1 -+ PTR_ADD AA, AA, T0 // aa += 1 * k -+.L_N1_M0: -+ PTR_ADDI KK, KK, -1 -+.L_N2: -+ andi J, N, 2 -+ beq ZERO, J, .L_N4 -+ move AA, A -+ PTR_SLLI T0, LDC, 1 -+ PTR_SLLI T1, K, 4 -+ PTR_SLLI T2, KK, 4 -+ PTR_SUB B, B, T1 -+ PTR_SUB C, C, T0 -+ PTR_ADD BB, B, T2 -+ move CC, C -+ PTR_SRAI I, M, 4 // M >> 4 -+ beqz I, .L_N2_M15 -+.align 4 -+.L_N2_I1: -+ PTR_SLLI T1, KK, 7 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_16x2 -+ PTR_ADDI I, I, -1 -+ PTR_SLLI T0, K, 7 -+ PTR_ADDI CC, CC, 0x80 // cc += 16 -+ PTR_ADD AA, AA, T0 // aa += 16 * 
k -+ bnez I, .L_N2_I1 -+.L_N2_M15: -+ andi I, M, 8 -+ beqz I, .L_N2_M7 -+.L_N2_M8: -+ PTR_SLLI T1, KK, 6 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_8x2 -+ PTR_SLLI T0, K, 6 -+ PTR_ADDI CC, CC, 0x40 // cc += 8 -+ PTR_ADD AA, AA, T0 // aa += 8 * k -+.L_N2_M7: -+ andi I, M, 4 -+ beqz I, .L_N2_M3 -+.L_N2_M4: -+ PTR_SLLI T1, KK, 5 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_4x2 -+ PTR_SLLI T0, K, 5 -+ PTR_ADDI CC, CC, 0x20 // cc += 4 -+ PTR_ADD AA, AA, T0 // aa += 4 * k -+.L_N2_M3: -+ andi I, M, 2 -+ beqz I, .L_N2_M1 -+.L_N2_M2: -+ PTR_SLLI T1, KK, 4 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_2x2 -+ PTR_SLLI T0, K, 4 -+ PTR_ADDI CC, CC, 0x10 // cc += 2 -+ PTR_ADD AA, AA, T0 // aa += 2 * k -+.L_N2_M1: -+ andi I, M, 1 -+ beqz I, .L_N2_M0 -+ PTR_SLLI T1, KK, 3 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC -+ PTR_ADD A0, AA, T1 // a0 = aa + kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_1x2 -+ PTR_SLLI T0, K, 3 -+ PTR_ADDI CC, CC, 0x08 // cc += 1 -+ PTR_ADD AA, AA, T0 // aa += 1 * k -+.L_N2_M0: -+ PTR_ADDI KK, KK, -2 -+.L_N4: -+ PTR_SRAI J, N, 2 /* J = bn >> 2 */ -+ beq ZERO, J, .L_N0 -+.align 5 -+.L_J1: -+ PTR_ADDI J, J, -1 -+ move AA, A -+ PTR_SLLI T0, LDC, 2 -+ PTR_SLLI T1, K, 5 -+ PTR_SLLI T2, KK, 5 -+ PTR_SUB B, B, T1 -+ PTR_SUB C, C, T0 -+ PTR_ADD BB, B, T2 -+ move CC, C -+ PTR_SRAI I, M, 4 // M >> 4 -+ beqz I, .L_M15 -+.align 4 -+.L_I1: -+ PTR_SLLI T1, KK, 7 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_16x4 -+ PTR_ADDI I, I, -1 -+ PTR_SLLI T0, K, 7 -+ PTR_ADDI CC, CC, 0x80 // cc += 16 -+ PTR_ADD AA, AA, T0 // aa += 16 * k -+ bnez I, .L_I1 -+.L_M15: -+ andi I, M, 8 -+ beqz I, .L_M7 -+.L_M8: -+ PTR_SLLI T1, KK, 6 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_8x4 -+ PTR_SLLI T0, K, 6 -+ PTR_ADDI CC, CC, 0x40 // cc += 8 -+ PTR_ADD AA, AA, T0 // aa += 8 * k -+.L_M7: -+ andi I, M, 4 -+ beqz I, .L_M3 -+.L_M4: -+ PTR_SLLI T1, KK, 5 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_4x4 -+ PTR_SLLI T0, K, 5 -+ PTR_ADDI CC, CC, 0x20 // cc += 4 -+ PTR_ADD AA, AA, T0 // aa += 4 * k -+.L_M3: -+ andi I, M, 2 -+ beqz I, .L_M1 -+.L_M2: -+ PTR_SLLI T1, KK, 4 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_2x4 -+ PTR_SLLI T0, K, 4 -+ PTR_ADDI CC, CC, 0x10 // cc += 2 -+ PTR_ADD AA, AA, T0 // aa += 2 * k -+.L_M1: -+ andi I, M, 1 -+ beqz I, .L_M0 -+ PTR_SLLI T1, KK, 3 -+ GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC -+ PTR_ADD A0, AA, T1 // a0 = aa + kk -+ move B0, BB -+ PTR_SUB L, K, KK // L = K - KK -+ dgemm_dsolve_1x4 -+ PTR_SLLI T0, K, 3 -+ PTR_ADDI CC, CC, 0x08 // cc += 1 -+ PTR_ADD AA, AA, T0 // aa += 1 * k -+.L_M0: -+ PTR_ADDI KK, KK, -4 -+ bnez J, .L_J1 -+.L_N0: -+ pop_if_used 26, 32 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/kernel/loongarch64/dtrsm_kernel_macro.S 
b/kernel/loongarch64/dtrsm_kernel_macro.S -new file mode 100644 -index 000000000..88b7121d1 ---- /dev/null -+++ b/kernel/loongarch64/dtrsm_kernel_macro.S -@@ -0,0 +1,2147 @@ -+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*******************************************************************************/ -+ -+/************** Dgemm Kernel 16x4 ****************/ -+.macro KERNEL2x16x4 -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ -+ xvld U1, A0, 0x20 -+ xvfmadd.d D2, U10, U12, D2 -+ xvfmadd.d D3, U11, U12, D3 -+ -+ xvld U2, A0, 0x40 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 -+ -+ xvld U3, A0, 0x60 -+ xvfmadd.d D6, U10, U13, D6 -+ xvfmadd.d D7, U11, U13, D7 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D8, U8, U14, D8 -+ xvfmadd.d D9, U9, U14, D9 -+ -+ preld 0, B0, B_PRE -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D10, U10, U14, D10 -+ xvfmadd.d D11, U11, U14, D11 -+ -+ preld 0, A0, A_PRE -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D12, U8, U15, D12 -+ xvfmadd.d D13, U9, U15, D13 -+ -+ preld 0, A0, A_PRE + 0x40 -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D14, U10, U15, D14 -+ xvfmadd.d D15, U11, U15, D15 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+ -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvld U9, A0, 0x20 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvld U10, A0, 0x40 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ -+ xvld U11, A0, 0x60 -+ xvfmadd.d D6, U2, U5, D6 -+ xvfmadd.d D7, U3, U5, D7 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D9, U1, U6, D9 -+ -+ preld 0, B0, B_PRE -+ xvldrepl.d U13, B0, 0x08 -+ xvfmadd.d D10, U2, U6, D10 -+ xvfmadd.d D11, U3, U6, D11 -+ -+ preld 0, A0, A_PRE -+ xvldrepl.d U14, B0, 0x10 -+ xvfmadd.d D12, U0, U7, D12 -+ xvfmadd.d D13, U1, U7, D13 -+ -+ preld 0, A0, A_PRE + 0x40 -+ xvldrepl.d U15, B0, 0x18 -+ xvfmadd.d D14, U2, U7, D14 -+ xvfmadd.d D15, U3, U7, D15 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+.endm -+ -+.macro KERNEL2x16x4_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ -+ xvld U1, A0, 0x20 -+ xvfmadd.d D2, U10, U12, D2 -+ xvfmadd.d D3, U11, U12, D3 -+ -+ xvld U2, A0, 0x40 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 -+ -+ xvld U3, A0, 0x60 -+ xvfmadd.d D6, U10, U13, D6 -+ xvfmadd.d D7, U11, U13, D7 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D8, U8, U14, D8 -+ xvfmadd.d D9, U9, U14, D9 -+ -+ preld 0, B0, B_PRE -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D10, U10, U14, D10 -+ xvfmadd.d D11, U11, U14, D11 -+ -+ preld 0, A0, A_PRE -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D12, U8, U15, D12 -+ xvfmadd.d D13, U9, U15, D13 -+ -+ preld 0, A0, A_PRE + 0x40 -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D14, U10, U15, D14 -+ xvfmadd.d D15, U11, U15, D15 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x20 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ -+ xvfmadd.d D6, U2, U5, D6 -+ xvfmadd.d D7, U3, U5, D7 -+ -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D9, U1, U6, D9 -+ -+ preld 0, B0, B_PRE -+ xvfmadd.d D10, U2, U6, D10 -+ xvfmadd.d D11, U3, U6, D11 -+ -+ preld 0, A0, A_PRE -+ xvfmadd.d D12, U0, U7, D12 -+ xvfmadd.d D13, U1, U7, D13 -+ -+ preld 0, A0, A_PRE + 0x40 -+ xvfmadd.d D14, U2, U7, D14 -+ xvfmadd.d D15, U3, U7, D15 -+.endm -+ -+.macro KERNEL8x16x4 -+.rept 4 -+ KERNEL2x16x4 -+.endr -+.endm -+ -+.macro KERNEL8x16x4_END -+.rept 3 -+ KERNEL2x16x4 -+.endr -+ KERNEL2x16x4_END -+.endm -+ -+.macro KERNEL2x8x4 -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ -+ xvldrepl.d U5, B0, 0x08 -+ 
xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 -+ -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D8, U8, U14, D8 -+ xvfmadd.d D9, U9, U14, D9 -+ -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D12, U8, U15, D12 -+ xvfmadd.d D13, U9, U15, D13 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+ -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U13, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ -+ xvldrepl.d U14, B0, 0x10 -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D9, U1, U6, D9 -+ -+ xvldrepl.d U15, B0, 0x18 -+ xvfmadd.d D12, U0, U7, D12 -+ xvfmadd.d D13, U1, U7, D13 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+.endm -+ -+.macro KERNEL2x8x4_END -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 -+ -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D8, U8, U14, D8 -+ xvfmadd.d D9, U9, U14, D9 -+ -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D12, U8, U15, D12 -+ xvfmadd.d D13, U9, U15, D13 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x20 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D9, U1, U6, D9 -+ -+ xvfmadd.d D12, U0, U7, D12 -+ xvfmadd.d D13, U1, U7, D13 -+.endm -+ -+.macro KERNEL8x8x4 -+.rept 4 -+ KERNEL2x8x4 -+.endr -+.endm -+ -+.macro KERNEL8x8x4_END -+.rept 3 -+ KERNEL2x8x4 -+.endr -+ KERNEL2x8x4_END -+.endm -+ -+.macro KERNEL2x4x4 -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U8, U13, D4 -+ -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D8, U8, U14, D8 -+ -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D12, U8, U15, D12 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 -+ -+ xvld U8, A0, 0x00 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U13, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ -+ xvldrepl.d U14, B0, 0x10 -+ xvfmadd.d D8, U0, U6, D8 -+ -+ xvldrepl.d U15, B0, 0x18 -+ xvfmadd.d D12, U0, U7, D12 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 -+.endm -+ -+.macro KERNEL2x4x4_END -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U8, U13, D4 -+ -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D8, U8, U14, D8 -+ -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D12, U8, U15, D12 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x20 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D12, U0, U7, D12 -+.endm -+ -+.macro KERNEL8x4x4 -+.rept 4 -+ KERNEL2x4x4 -+.endr -+.endm -+ -+.macro KERNEL8x4x4_END -+.rept 3 -+ KERNEL2x4x4 -+.endr -+ KERNEL2x4x4_END -+.endm -+ -+.macro KERNEL2x2x4 -+ xvldrepl.d U0, A0, 0x00 -+ xvldrepl.d U1, A0, 0x08 -+ -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ -+ xvld U4, B0, 0x00 -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+ -+ xvldrepl.d U8, A0, 0x00 -+ xvldrepl.d U9, A0, 0x08 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvld U12, B0, 0x00 -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+.endm -+ -+.macro KERNEL2x2x4_END -+ xvldrepl.d U0, A0, 0x00 -+ xvldrepl.d U1, A0, 0x08 -+ -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ -+ xvld U4, B0, 0x00 -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x20 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 
-+.endm -+ -+.macro KERNEL8x2x4 -+.rept 4 -+ KERNEL2x2x4 -+.endr -+.endm -+ -+.macro KERNEL8x2x4_END -+.rept 3 -+ KERNEL2x2x4 -+.endr -+ KERNEL2x2x4_END -+.endm -+ -+.macro KERNEL2x1x4 -+ xvldrepl.d U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvld U4, B0, 0x00 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ -+ xvldrepl.d U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvld U12, B0, 0x00 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+.endm -+ -+.macro KERNEL2x1x4_END -+ xvldrepl.d U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvld U4, B0, 0x00 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x20 -+ -+ xvfmadd.d D0, U0, U4, D0 -+.endm -+ -+.macro KERNEL8x1x4 -+.rept 4 -+ KERNEL2x1x4 -+.endr -+.endm -+ -+.macro KERNEL8x1x4_END -+.rept 3 -+ KERNEL2x1x4 -+.endr -+ KERNEL2x1x4_END -+.endm -+ -+.macro KERNEL2x16x2 -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ -+ xvld U1, A0, 0x20 -+ xvfmadd.d D2, U10, U12, D2 -+ xvfmadd.d D3, U11, U12, D3 -+ -+ xvld U2, A0, 0x40 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 -+ -+ xvld U3, A0, 0x60 -+ xvfmadd.d D6, U10, U13, D6 -+ xvfmadd.d D7, U11, U13, D7 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvld U9, A0, 0x20 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvld U10, A0, 0x40 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ -+ xvld U11, A0, 0x60 -+ xvfmadd.d D6, U2, U5, D6 -+ xvfmadd.d D7, U3, U5, D7 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+.endm -+ -+.macro KERNEL2x16x2_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ -+ xvld U1, A0, 0x20 -+ xvfmadd.d D2, U10, U12, D2 -+ xvfmadd.d D3, U11, U12, D3 -+ -+ xvld U2, A0, 0x40 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 -+ -+ xvld U3, A0, 0x60 -+ xvfmadd.d D6, U10, U13, D6 -+ xvfmadd.d D7, U11, U13, D7 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x10 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ -+ xvfmadd.d D6, U2, U5, D6 -+ xvfmadd.d D7, U3, U5, D7 -+.endm -+ -+.macro KERNEL8x16x2 -+.rept 4 -+ KERNEL2x16x2 -+.endr -+.endm -+ -+.macro KERNEL8x16x2_END -+.rept 3 -+ KERNEL2x16x2 -+.endr -+ KERNEL2x16x2_END -+.endm -+ -+.macro KERNEL2x8x2 -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ -+ xvld U1, A0, 0x20 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvld U9, A0, 0x20 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+.endm -+ -+.macro KERNEL2x8x2_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ -+ xvld U1, A0, 0x20 -+ xvfmadd.d D4, U8, U13, D4 -+ xvfmadd.d D5, U9, U13, D5 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x10 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+.endm -+ 
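-+// Note on the KERNEL2x<M>x<N> / KERNEL8x<M>x<N> macros in this file: each
-+// KERNEL2 body unrolls the inner K loop by two and ping-pongs between two
-+// register groups, issuing the FMAs for the operands fetched in the previous
-+// half (the U8-U15 group) while the loads for the next half refill U0-U7,
-+// then swapping roles so FMA latency is hidden. The matching *_END variant is
-+// used for the last unrolled step and omits the loads of the next block of
-+// operands, since no further iteration follows. KERNEL8x<M>x<N> repeats the
-+// KERNEL2 body four times, and its *_END form repeats it three times followed
-+// by the KERNEL2 *_END body.
-+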
-+.macro KERNEL8x8x2 -+.rept 4 -+ KERNEL2x8x2 -+.endr -+.endm -+ -+.macro KERNEL8x8x2_END -+.rept 3 -+ KERNEL2x8x2 -+ .endr -+ KERNEL2x8x2_END -+.endm -+ -+.macro KERNEL2x4x2 -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D4, U8, U13, D4 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+ -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D4, U0, U5, D4 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+.endm -+ -+.macro KERNEL2x4x2_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D4, U8, U13, D4 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x10 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D4, U0, U5, D4 -+.endm -+ -+.macro KERNEL8x4x2 -+.rept 4 -+ KERNEL2x4x2 -+.endr -+.endm -+ -+.macro KERNEL8x4x2_END -+.rept 3 -+ KERNEL2x4x2 -+.endr -+ KERNEL2x4x2_END -+.endm -+ -+.macro KERNEL2x2x2 -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D4, U8, U13, D4 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D4, U0, U5, D4 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+.endm -+ -+.macro KERNEL2x2x2_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D4, U8, U13, D4 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x10 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D4, U0, U5, D4 -+.endm -+ -+.macro KERNEL8x2x2 -+.rept 4 -+ KERNEL2x2x2 -+.endr -+.endm -+ -+.macro KERNEL8x2x2_END -+.rept 3 -+ KERNEL2x2x2 -+.endr -+ KERNEL2x2x2_END -+.endm -+ -+.macro KERNEL2x1x2 -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D4, U8, U13, D4 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D4, U0, U5, D4 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+.endm -+ -+.macro KERNEL2x1x2_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D4, U8, U13, D4 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvldrepl.d U5, B0, 0x08 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D4, U0, U5, D4 -+.endm -+ -+.macro KERNEL8x1x2 -+.rept 4 -+ KERNEL2x1x2 -+.endr -+.endm -+ -+.macro KERNEL8x1x2_END -+.rept 3 -+ KERNEL2x1x2 -+.endr -+ KERNEL2x1x2_END -+.endm -+ -+.macro KERNEL2x16x1 -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ -+ xvld U1, A0, 0x20 -+ xvfmadd.d D2, U10, U12, D2 -+ xvfmadd.d D3, U11, U12, D3 -+ -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+ -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvld U9, A0, 0x20 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvld U10, A0, 0x40 -+ xvld U11, A0, 0x60 -+ -+ xvldrepl.d U12, B0, 0x00 -+ -+ addi.d A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+.endm -+ -+.macro KERNEL2x16x1_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ -+ xvld U1, A0, 0x20 -+ xvfmadd.d D2, U10, U12, D2 -+ xvfmadd.d D3, U11, U12, D3 -+ -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ -+ addi.d 
A0, A0, 0x80 -+ addi.d B0, B0, 0x08 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+.endm -+ -+.macro KERNEL8x16x1 -+.rept 4 -+ KERNEL2x16x1 -+.endr -+.endm -+ -+.macro KERNEL8x16x1_END -+.rept 3 -+ KERNEL2x16x1 -+.endr -+ KERNEL2x16x1_END -+.endm -+ -+.macro KERNEL2x8x1 -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ xvld U1, A0, 0x20 -+ xvldrepl.d U4, B0, 0x00 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+ -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvld U9, A0, 0x20 -+ xvldrepl.d U12, B0, 0x00 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+.endm -+ -+.macro KERNEL2x8x1_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvfmadd.d D1, U9, U12, D1 -+ xvld U1, A0, 0x20 -+ xvldrepl.d U4, B0, 0x00 -+ -+ addi.d A0, A0, 0x40 -+ addi.d B0, B0, 0x08 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+.endm -+ -+.macro KERNEL8x8x1 -+.rept 4 -+ KERNEL2x8x1 -+.endr -+.endm -+ -+.macro KERNEL8x8x1_END -+.rept 3 -+ KERNEL2x8x1 -+.endr -+ KERNEL2x8x1_END -+.endm -+ -+.macro KERNEL2x4x1 -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvldrepl.d U4, B0, 0x00 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 -+ -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvldrepl.d U12, B0, 0x00 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 -+.endm -+ -+.macro KERNEL2x4x1_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvldrepl.d U4, B0, 0x00 -+ -+ addi.d A0, A0, 0x20 -+ addi.d B0, B0, 0x08 -+ -+ xvfmadd.d D0, U0, U4, D0 -+.endm -+ -+.macro KERNEL8x4x1 -+.rept 4 -+ KERNEL2x4x1 -+.endr -+.endm -+ -+.macro KERNEL8x4x1_END -+.rept 3 -+ KERNEL2x4x1 -+.endr -+ KERNEL2x4x1_END -+.endm -+ -+.macro KERNEL2x2x1 -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvldrepl.d U4, B0, 0x00 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+ -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvldrepl.d U12, B0, 0x00 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+.endm -+ -+.macro KERNEL2x2x1_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvldrepl.d U4, B0, 0x00 -+ -+ addi.d A0, A0, 0x10 -+ addi.d B0, B0, 0x08 -+ -+ xvfmadd.d D0, U0, U4, D0 -+.endm -+ -+.macro KERNEL8x2x1 -+.rept 4 -+ KERNEL2x2x1 -+.endr -+.endm -+ -+.macro KERNEL8x2x1_END -+.rept 3 -+ KERNEL2x2x1 -+.endr -+ KERNEL2x2x1_END -+.endm -+ -+.macro KERNEL2x1x1 -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvldrepl.d U4, B0, 0x00 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ -+ xvld U8, A0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvldrepl.d U12, B0, 0x00 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+.endm -+ -+.macro KERNEL2x1x1_END -+ xvld U0, A0, 0x00 -+ xvfmadd.d D0, U8, U12, D0 -+ xvldrepl.d U4, B0, 0x00 -+ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ -+ xvfmadd.d D0, U0, U4, D0 -+.endm -+ -+.macro KERNEL8x1x1 -+.rept 4 -+ KERNEL2x1x1 -+.endr -+.endm -+ -+.macro KERNEL8x1x1_END -+.rept 3 -+ KERNEL2x1x1 -+.endr -+ KERNEL2x1x1_END -+.endm -+ -+.macro dgemm_16x4 -+.L_dgemm_16x4: // See dgemm_kernel_16x4.S -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ xvfmul.d D2, U2, U4 -+ xvfmul.d D3, U3, U4 -+ -+ xvldrepl.d U5, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U5 -+ xvfmul.d D5, U1, U5 -+ xvfmul.d D6, U2, U5 -+ xvfmul.d D7, U3, U5 -+ -+ xvldrepl.d U6, B0, 0x10 -+ /* line 3 */ -+ xvfmul.d D8, U0, U6 -+ xvfmul.d D9, U1, U6 -+ 
xvfmul.d D10, U2, U6 -+ xvfmul.d D11, U3, U6 -+ -+ xvldrepl.d U7, B0, 0x18 -+ /* line 4 */ -+ xvfmul.d D12, U0, U7 -+ xvfmul.d D13, U1, U7 -+ xvfmul.d D14, U2, U7 -+ xvfmul.d D15, U3, U7 -+ -+ /* Add stride for A0 and B0 */ -+ PTR_ADDI A0, A0, 0x80 -+ PTR_ADDI B0, B0, 0x20 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_L7 */ -+ beq ZERO,TL, .L_dgemm_16x4_L7 -+ -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 -+ xvld U10, A0, 0x40 -+ xvld U11, A0, 0x60 -+ -+ PTR_ADDI TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ xvldrepl.d U14, B0, 0x10 -+ xvldrepl.d U15, B0, 0x18 -+ PTR_ADDI A0, A0, 0x80 -+ PTR_ADDI B0, B0, 0x20 -+ -+ beq ZERO, TL, .L_dgemm_16x4_TL1_END -+.align 5 -+.L_dgemm_16x4_TL1: -+ KERNEL8x16x4 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO, TL, .L_dgemm_16x4_TL1 -+.L_dgemm_16x4_TL1_END: -+ KERNEL8x16x4_END -+.L_dgemm_16x4_L7: -+ andi TL, L, 7 -+ beq TL, ZERO, .L_dgemm_16x4_L0 -+.align 5 -+.L_dgemm_16x4_L71: -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ xvfmadd.d D6, U2, U5, D6 -+ xvfmadd.d D7, U3, U5, D7 -+ -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D9, U1, U6, D9 -+ xvfmadd.d D10, U2, U6, D10 -+ xvfmadd.d D11, U3, U6, D11 -+ -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D12, U0, U7, D12 -+ xvfmadd.d D13, U1, U7, D13 -+ xvfmadd.d D14, U2, U7, D14 -+ xvfmadd.d D15, U3, U7, D15 -+ -+ PTR_ADDI A0, A0, 0x80 -+ PTR_ADDI B0, B0, 0x20 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_16x4_L71 -+.L_dgemm_16x4_L0: -+ // Load C -+ GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60 -+ GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60 -+ GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60 -+ GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60 -+ GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \ -+ U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7, \ -+ U8, U8, D8, U9, U9, D9, U10, U10, D10, U11, U11, D11, \ -+ U12, U12, D12, U13, U13, D13, U14, U14, D14, U15, U15, D15 -+.endm -+ -+.macro dgemm_1x4 -+.L_dgemm_1x4: // See dgemm_kernel_16x4.S -+ xvldrepl.d U0, A0, 0x00 -+ xvld U4, B0, 0x00 -+ xvfmul.d D0, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ PTR_ADDI A0, A0, 0x08 -+ PTR_ADDI B0, B0, 0x20 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M1_L7 */ -+ beq ZERO,TL, .L_dgemm_1x4_M1_L7 -+ xvldrepl.d U8, A0, 0x00 -+ -+ PTR_ADDI TL, TL, -1 -+ xvld U12, B0, 0x00 -+ PTR_ADDI A0, A0, 0x08 -+ PTR_ADDI B0, B0, 0x20 -+ -+ beq ZERO, TL, .L_dgemm_1x4_M1_TL1_END -+.align 5 -+.L_dgemm_1x4_M1_TL1: -+ KERNEL8x1x4 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_1x4_M1_TL1 -+.L_dgemm_1x4_M1_TL1_END: -+ KERNEL8x1x4_END -+.L_dgemm_1x4_M1_L7: -+ /* if (!(L & 7)) goto L_M1_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_dgemm_1x4_M1_L0 -+.align 5 -+.L_dgemm_1x4_M1_L71: -+ xvldrepl.d U0, A0, 0x00 -+ xvld U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ /* Add stride for A0, B0 */ -+ PTR_ADDI A0, A0, 0x08 -+ PTR_ADDI B0, B0, 0x20 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_1x4_M1_L71 -+.L_dgemm_1x4_M1_L0: -+ // Load C -+ fld.d $f0, C0, 0x00 -+ fld.d $f1, C1, 0x00 -+ fld.d $f2, C2, 0x00 -+ fld.d $f3, C3, 0x00 -+ xvinsve0.d U0, U1, 0x01 -+ 
xvinsve0.d U0, U2, 0x02 -+ xvinsve0.d U0, U3, 0x03 -+ GSUB xvf, d, U0, U0, D0 -+.endm -+ -+.macro dgemm_2x4 -+.L_dgemm_2x4: -+ /* Load 2 * 64 from A0 */ -+ xvldrepl.d U0, A0, 0x00 -+ xvldrepl.d U1, A0, 0x08 -+ xvld U4, B0, 0x00 -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ /* Add stride for A0 and B0 */ -+ PTR_ADDI A0, A0, 0x10 -+ PTR_ADDI B0, B0, 0x20 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M2_L7 */ -+ beq ZERO,TL, .L_dgemm_2x4_M2_L7 -+ -+ xvldrepl.d U8, A0, 0x00 -+ xvldrepl.d U9, A0, 0x08 -+ -+ PTR_ADDI TL, TL, -1 -+ -+ xvld U12, B0, 0x00 -+ PTR_ADDI A0, A0, 0x10 -+ PTR_ADDI B0, B0, 0x20 -+ -+ beq ZERO, TL, .L_dgemm_2x4_M2_TL1_END -+.align 5 -+.L_dgemm_2x4_M2_TL1: -+ KERNEL8x2x4 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_dgemm_2x4_M2_TL1 -+.L_dgemm_2x4_M2_TL1_END: -+ KERNEL8x2x4_END -+ -+.L_dgemm_2x4_M2_L7: -+ /* if (!(L & 7)) goto L_M2_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_dgemm_2x4_M2_L0 -+.align 5 -+.L_dgemm_2x4_M2_L71: -+ xvldrepl.d U0, A0, 0x00 -+ xvldrepl.d U1, A0, 0x08 -+ -+ xvld U4, B0, 0x00 -+ -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ /* Add stride for A0, B0 */ -+ PTR_ADDI A0, A0, 0x10 -+ PTR_ADDI B0, B0, 0x20 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_2x4_M2_L71 -+.L_dgemm_2x4_M2_L0: -+ xvpackev.d D4, D1, D0 -+ xvpackod.d D5, D1, D0 -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ /* Load C2 */ -+ xvld U2, C2, 0x00 -+ /* Load C3 */ -+ xvld U3, C3, 0x00 -+ -+ xvpermi.q U0, U2, 0x02 -+ xvpermi.q U1, U3, 0x02 -+ -+ GSUB xvf, d, U0, U0, D4, U1, U1, D5 -+.endm -+ -+.macro dgemm_4x4 -+.L_dgemm_4x4: -+ /* Load 4 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ xvldrepl.d U5, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U5 -+ -+ xvldrepl.d U6, B0, 0x10 -+ /* line 3 */ -+ xvfmul.d D8, U0, U6 -+ -+ xvldrepl.d U7, B0, 0x18 -+ /* line 4 */ -+ xvfmul.d D12, U0, U7 -+ -+ /* Add stride for A0 and B0 */ -+ PTR_ADDI A0, A0, 0x20 -+ PTR_ADDI B0, B0, 0x20 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M4_L7 */ -+ beq ZERO,TL, .L_dgemm_4x4_M4_L7 -+ -+ xvld U8, A0, 0x00 -+ -+ PTR_ADDI TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ xvldrepl.d U14, B0, 0x10 -+ xvldrepl.d U15, B0, 0x18 -+ PTR_ADDI A0, A0, 0x20 -+ PTR_ADDI B0, B0, 0x20 -+ -+ beq ZERO, TL, .L_dgemm_4x4_M4_TL1_END -+.align 5 -+.L_dgemm_4x4_M4_TL1: /* TL-- */ -+ KERNEL8x4x4 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_4x4_M4_TL1 -+.L_dgemm_4x4_M4_TL1_END: -+ KERNEL8x4x4_END -+.L_dgemm_4x4_M4_L7: -+ /* if (!(L & 7)) goto L_M4_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_dgemm_4x4_M4_L0 -+.align 5 -+.L_dgemm_4x4_M4_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U4, B0, 0x08 -+ xvfmadd.d D4, U0, U4, D4 -+ -+ xvldrepl.d U4, B0, 0x10 -+ xvfmadd.d D8, U0, U4, D8 -+ -+ xvldrepl.d U4, B0, 0x18 -+ xvfmadd.d D12, U0, U4, D12 -+ -+ /* Add stride for A0, B0 */ -+ PTR_ADDI A0, A0, 0x20 -+ PTR_ADDI B0, B0, 0x20 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_4x4_M4_L71 -+ .L_dgemm_4x4_M4_L0: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ /* Load C2 */ -+ xvld U2, C2, 0x00 -+ /* Load C3 */ -+ xvld U3, C3, 0x00 -+ -+ GSUB xvf, d, U0, U0, D0, U1, U1, D4, U2, U2, D8, U3, U3, D12 -+.endm -+ -+.macro dgemm_8x4 -+.L_dgemm_8x4: -+ /* Load 8 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ 
xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ -+ xvldrepl.d U5, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U5 -+ xvfmul.d D5, U1, U5 -+ -+ xvldrepl.d U6, B0, 0x10 -+ /* line 3 */ -+ xvfmul.d D8, U0, U6 -+ xvfmul.d D9, U1, U6 -+ -+ xvldrepl.d U7, B0, 0x18 -+ /* line 4 */ -+ xvfmul.d D12, U0, U7 -+ xvfmul.d D13, U1, U7 -+ -+ /* Add stride for A0 and B0 */ -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x20 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M8_L7 */ -+ beq ZERO,TL, .L_dgemm_8x4_M8_L7 -+ -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 -+ -+ PTR_ADDI TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ xvldrepl.d U14, B0, 0x10 -+ xvldrepl.d U15, B0, 0x18 -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x20 -+ -+ beq ZERO, TL, .L_dgemm_8x4_M8_TL1_END -+.align 5 -+.L_dgemm_8x4_M8_TL1: /* TL-- */ -+ KERNEL8x8x4 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_dgemm_8x4_M8_TL1 -+ -+.L_dgemm_8x4_M8_TL1_END: -+ KERNEL8x8x4_END -+ -+.L_dgemm_8x4_M8_L7: -+ /* if (!(L & 7)) goto L_M8_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_dgemm_8x4_M8_L0 -+.align 5 -+.L_dgemm_8x4_M8_L71: -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ -+ xvldrepl.d U6, B0, 0x10 -+ xvfmadd.d D8, U0, U6, D8 -+ xvfmadd.d D9, U1, U6, D9 -+ -+ xvldrepl.d U7, B0, 0x18 -+ xvfmadd.d D12, U0, U7, D12 -+ xvfmadd.d D13, U1, U7, D13 -+ -+ /* Add stride for A0, B0 */ -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x20 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_8x4_M8_L71 -+.L_dgemm_8x4_M8_L0: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ -+ /* Load C1 */ -+ xvld U2, C1, 0x00 -+ xvld U3, C1, 0x20 -+ -+ /* Load C2 */ -+ xvld U4, C2, 0x00 -+ xvld U5, C2, 0x20 -+ -+ /* Load C3 */ -+ xvld U6, C3, 0x00 -+ xvld U7, C3, 0x20 -+ -+ GSUB xvf, d, U0, U0, D0, U1, U1, D1, \ -+ U2, U2, D4, U3, U3, D5, \ -+ U4, U4, D8, U5, U5, D9, \ -+ U6, U6, D12, U7, U7, D13 -+.endm -+ -+.macro dgemm_4x2 -+.L_dgemm_4x2: -+ /* Load 4 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ xvldrepl.d U5, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U5 -+ -+ /* Add stride for A0 and B0 */ -+ PTR_ADDI A0, A0, 0x20 -+ PTR_ADDI B0, B0, 0x10 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_dgemm_4x2_N3_M4_L7 */ -+ beq ZERO,TL, .L_dgemm_4x2_N3_M4_L7 -+ -+ xvld U8, A0, 0x00 -+ -+ PTR_ADDI TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ PTR_ADDI A0, A0, 0x20 -+ PTR_ADDI B0, B0, 0x10 -+ -+ beq ZERO, TL, .L_dgemm_4x2_N3_M4_TL1_END -+.align 5 -+.L_dgemm_4x2_N3_M4_TL1: /* TL-- */ -+ KERNEL8x4x2 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_dgemm_4x2_N3_M4_TL1 -+.L_dgemm_4x2_N3_M4_TL1_END: -+ KERNEL8x4x2_END -+ -+.L_dgemm_4x2_N3_M4_L7: -+ /* if (!(L & 7)) goto L_dgemm_4x2_N3_M4_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_dgemm_4x2_N3_M4_L0 -+.align 5 -+.L_dgemm_4x2_N3_M4_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ -+ /* Add stride for A0, B0 */ -+ PTR_ADDI A0, A0, 0x20 -+ PTR_ADDI B0, B0, 0x10 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_4x2_N3_M4_L71 -+ -+.L_dgemm_4x2_N3_M4_L0: -+ /* Load C0 */ -+ 
xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ GSUB xvf, d, U0, U0, D0, U1, U1, D4 -+.endm -+ -+.macro dgemm_2x2 -+.L_dgemm_2x2: -+ /* Load 2 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ PTR_ADDI A0, A0, 0x10 -+ PTR_ADDI B0, B0, 0x10 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_dgemm_2x2_N3_M2_L7 */ -+ beq ZERO,TL, .L_dgemm_2x2_N3_M2_L7 -+ -+ xvld U8, A0, 0x00 -+ -+ PTR_ADDI TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ PTR_ADDI A0, A0, 0x10 -+ PTR_ADDI B0, B0, 0x10 -+ -+ beq ZERO, TL, .L_dgemm_2x2_N3_M2_TL1_END -+.align 5 -+.L_dgemm_2x2_N3_M2_TL1: /* TL-- */ -+ KERNEL8x2x2 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_dgemm_2x2_N3_M2_TL1 -+.L_dgemm_2x2_N3_M2_TL1_END: -+ KERNEL8x2x2_END -+ -+.L_dgemm_2x2_N3_M2_L7: -+ /* if (!(L & 7)) goto L_dgemm_2x2_N3_M2_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_dgemm_2x2_N3_M2_L0 -+.align 5 -+.L_dgemm_2x2_N3_M2_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ -+ /* Add stride for A0, B0 */ -+ PTR_ADDI A0, A0, 0x10 -+ PTR_ADDI B0, B0, 0x10 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_2x2_N3_M2_L71 -+.L_dgemm_2x2_N3_M2_L0: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ /* Load C1 */ -+ xvld U1, C1, 0x00 -+ GSUB xvf, d, U0, U0, D0, U1, U1, D4 -+.endm -+ -+.macro dgemm_8x2 -+.L_dgemm_8x2: -+ /* Load 8 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ -+ xvldrepl.d U5, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U5 -+ xvfmul.d D5, U1, U5 -+ -+ /* Add stride for A0 and B0 */ -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x10 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_dgemm_8x2_N3_M8_L7 */ -+ beq ZERO,TL, .L_dgemm_8x2_N3_M8_L7 -+ -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 -+ -+ PTR_ADDI TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x10 -+ -+ beq ZERO, TL, .L_dgemm_8x2_N3_M8_TL1_END -+.align 5 -+.L_dgemm_8x2_N3_M8_TL1: /* TL-- */ -+ KERNEL8x8x2 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_dgemm_8x2_N3_M8_TL1 -+.L_dgemm_8x2_N3_M8_TL1_END: -+ KERNEL8x8x2_END -+ -+.L_dgemm_8x2_N3_M8_L7: -+ /* if (!(L & 7)) goto L_dgemm_8x2_N3_M8_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_dgemm_8x2_N3_M8_L0 -+.align 5 -+.L_dgemm_8x2_N3_M8_L71: -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ -+ /* Add stride for A0, B0 */ -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x10 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_8x2_N3_M8_L71 -+ -+.L_dgemm_8x2_N3_M8_L0: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ /* Load C1 */ -+ xvld U2, C1, 0x00 -+ xvld U3, C1, 0x20 -+ GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D4, U3, U3, D5 -+.endm -+ -+.macro dgemm_16x2 -+.L_dgemm_16x2: -+ /* Load 16 * 64 from A0 -+ * U0 = {a3, a2, a1, a0} -+ * U1 = {a7, a6, a5, a4} -+ * U2 = {a11, a10, a9, a8} -+ * U3 = {a15, a14, a13, a12} -+ */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, 
A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ xvfmul.d D2, U2, U4 -+ xvfmul.d D3, U3, U4 -+ -+ xvldrepl.d U5, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U5 -+ xvfmul.d D5, U1, U5 -+ xvfmul.d D6, U2, U5 -+ xvfmul.d D7, U3, U5 -+ -+ /* Add stride for A0 and B0 */ -+ PTR_ADDI A0, A0, 0x80 -+ PTR_ADDI B0, B0, 0x10 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N3_L7 */ -+ beq ZERO,TL, .L_dgemm_16x2_N3_L7 -+ -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 -+ xvld U10, A0, 0x40 -+ xvld U11, A0, 0x60 -+ -+ PTR_ADDI TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ PTR_ADDI A0, A0, 0x80 -+ PTR_ADDI B0, B0, 0x10 -+ -+ beq ZERO, TL, .L_dgemm_16x2_N3_TL1_END -+.align 5 -+.L_dgemm_16x2_N3_TL1: /* TL-- */ -+ KERNEL8x16x2 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_dgemm_16x2_N3_TL1 -+.L_dgemm_16x2_N3_TL1_END: -+ KERNEL8x16x2_END -+ -+.L_dgemm_16x2_N3_L7: -+ /* if (!(L & 7)) goto L_dgemm_16x2_N3_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_dgemm_16x2_N3_L0 -+.align 5 -+.L_dgemm_16x2_N3_L71: -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ xvfmadd.d D5, U1, U5, D5 -+ xvfmadd.d D6, U2, U5, D6 -+ xvfmadd.d D7, U3, U5, D7 -+ /* Add stride for A0, B0 */ -+ PTR_ADDI A0, A0, 0x80 -+ PTR_ADDI B0, B0, 0x10 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_16x2_N3_L71 -+ -+.L_dgemm_16x2_N3_L0: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 -+ /* Load C1 */ -+ xvld U4, C1, 0x00 -+ xvld U5, C1, 0x20 -+ xvld U6, C1, 0x40 -+ xvld U7, C1, 0x60 -+ GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \ -+ U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7 -+.endm -+ -+.macro dgemm_2x1 -+.L_dgemm_2x1: -+ /* Load 2 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ PTR_ADDI A0, A0, 0x10 -+ PTR_ADDI B0, B0, 0x08 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_dgemm_2x1_N1_M2_L7 */ -+ beq ZERO,TL, .L_dgemm_2x1_N1_M2_L7 -+ -+ xvld U8, A0, 0x00 -+ -+ PTR_ADDI TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ PTR_ADDI A0, A0, 0x10 -+ PTR_ADDI B0, B0, 0x08 -+ -+ beq ZERO, TL, .L_dgemm_2x1_N1_M2_TL1_END -+.align 5 -+.L_dgemm_2x1_N1_M2_TL1: /* TL-- */ -+ KERNEL8x2x1 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_dgemm_2x1_N1_M2_TL1 -+.L_dgemm_2x1_N1_M2_TL1_END: -+ KERNEL8x2x1_END -+ -+.L_dgemm_2x1_N1_M2_L7: -+ /* if (!(L & 7)) goto L_dgemm_2x1_N1_M2_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_dgemm_2x1_N1_M2_L0 -+.align 5 -+.L_dgemm_2x1_N1_M2_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ /* Add stride for A0, B0 */ -+ PTR_ADDI A0, A0, 0x10 -+ PTR_ADDI B0, B0, 0x08 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_2x1_N1_M2_L71 -+.L_dgemm_2x1_N1_M2_L0: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ GSUB xvf, d, U0, U0, D0 -+.endm -+ -+.macro dgemm_4x1 -+.L_dgemm_4x1: -+ /* Load 4 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ PTR_ADDI A0, A0, 0x20 -+ PTR_ADDI B0, B0, 0x08 -+ /* 
Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_dgemm_4x1_N1_M4_L7 */ -+ beq ZERO,TL, .L_dgemm_4x1_N1_M4_L7 -+ -+ xvld U8, A0, 0x00 -+ -+ PTR_ADDI TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ PTR_ADDI A0, A0, 0x20 -+ PTR_ADDI B0, B0, 0x08 -+ -+ beq ZERO, TL, .L_dgemm_4x1_N1_M4_TL1_END -+.align 5 -+.L_dgemm_4x1_N1_M4_TL1: /* TL-- */ -+ KERNEL8x4x1 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_dgemm_4x1_N1_M4_TL1 -+.L_dgemm_4x1_N1_M4_TL1_END: -+ KERNEL8x4x1_END -+ -+.L_dgemm_4x1_N1_M4_L7: -+ /* if (!(L & 7)) goto L_dgemm_4x1_N1_M4_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_dgemm_4x1_N1_M4_L0 -+.align 5 -+.L_dgemm_4x1_N1_M4_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ /* Add stride for A0, B0 */ -+ PTR_ADDI A0, A0, 0x20 -+ PTR_ADDI B0, B0, 0x08 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_4x1_N1_M4_L71 -+.L_dgemm_4x1_N1_M4_L0: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ GSUB xvf, d, U0, U0, D0 -+.endm -+ -+.macro dgemm_8x1 -+.L_dgemm_8x1: -+ /* Load 8 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ -+ /* Add stride for A0 and B0 */ -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x08 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_dgemm_8x1_N1_M8_L7 */ -+ beq ZERO,TL, .L_dgemm_8x1_N1_M8_L7 -+ -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 -+ -+ PTR_ADDI TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x08 -+ -+ beq ZERO, TL, .L_dgemm_8x1_N1_M8_TL1_END -+.align 5 -+.L_dgemm_8x1_N1_M8_TL1: /* TL-- */ -+ KERNEL8x8x1 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_dgemm_8x1_N1_M8_TL1 -+ -+.L_dgemm_8x1_N1_M8_TL1_END: -+ KERNEL8x8x1_END -+ -+.L_dgemm_8x1_N1_M8_L7: -+ /* if (!(L & 7)) goto L_dgemm_8x1_N1_M8_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_dgemm_8x1_N1_M8_L0 -+.align 5 -+.L_dgemm_8x1_N1_M8_L71: -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ -+ /* Add stride for A0, B0 */ -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x08 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_8x1_N1_M8_L71 -+.L_dgemm_8x1_N1_M8_L0: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ GSUB xvf, d, U0, U0, D0, U1, U1, D1 -+.endm -+ -+.macro dgemm_16x1 -+.L_dgemm_16x1: -+ /* Load 16 * 64 from A0 -+ * U0 = {a3, a2, a1, a0} -+ * U1 = {a7, a6, a5, a4} -+ * U2 = {a11, a10, a9, a8} -+ * U3 = {a15, a14, a13, a12} -+ */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ xvfmul.d D1, U1, U4 -+ xvfmul.d D2, U2, U4 -+ xvfmul.d D3, U3, U4 -+ -+ /* Add stride for A0 and B0 */ -+ PTR_ADDI A0, A0, 0x80 -+ PTR_ADDI B0, B0, 0x08 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_dgemm_16x1_N1_L7 */ -+ beq ZERO,TL, .L_dgemm_16x1_N1_L7 -+ -+ xvld U8, A0, 0x00 -+ xvld U9, A0, 0x20 -+ xvld U10, A0, 0x40 -+ xvld U11, A0, 0x60 -+ -+ PTR_ADDI TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ PTR_ADDI A0, A0, 0x80 -+ PTR_ADDI B0, B0, 0x08 -+ -+ beq ZERO, TL, .L_dgemm_16x1_N1_TL1_END -+.align 5 -+.L_dgemm_16x1_N1_TL1: /* TL-- */ -+ KERNEL8x16x1 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_dgemm_16x1_N1_TL1 -+.L_dgemm_16x1_N1_TL1_END: -+ KERNEL8x16x1_END -+ -+.L_dgemm_16x1_N1_L7: 
-+ /* if (!(L & 7)) goto L_dgemm_16x1_N1_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_dgemm_16x1_N1_L0 -+.align 5 -+.L_dgemm_16x1_N1_L71: -+ /* Load 16 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ xvld U1, A0, 0x20 -+ xvld U2, A0, 0x40 -+ xvld U3, A0, 0x60 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ xvfmadd.d D1, U1, U4, D1 -+ xvfmadd.d D2, U2, U4, D2 -+ xvfmadd.d D3, U3, U4, D3 -+ -+ /* Add stride for A0, B0 */ -+ PTR_ADDI A0, A0, 0x80 -+ PTR_ADDI B0, B0, 0x08 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_16x1_N1_L71 -+.L_dgemm_16x1_N1_L0: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ xvld U1, C0, 0x20 -+ xvld U2, C0, 0x40 -+ xvld U3, C0, 0x60 -+ GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3 -+.endm -+ -+.macro dgemm_1x2 -+.L_dgemm_1x2: // See dgemm_kernel_16x4.S -+ /* Load 1 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ xvldrepl.d U4, B0, 0x08 -+ /* line 2 */ -+ xvfmul.d D4, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N3_M1_L7 */ -+ beq ZERO,TL, .L_dgemm_1x2_N3_M1_L7 -+ -+ xvld U8, A0, 0x00 -+ -+ addi.d TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ xvldrepl.d U13, B0, 0x08 -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ beq ZERO, TL, .L_dgemm_1x2_N3_M1_TL1_END -+.L_dgemm_1x2_N3_M1_TL1: /* TL-- */ -+ KERNEL8x1x2 -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_dgemm_1x2_N3_M1_TL1 -+.L_dgemm_1x2_N3_M1_TL1_END: -+ KERNEL8x1x2_END -+.L_dgemm_1x2_N3_M1_L7: -+ /* if (!(L & 7)) goto L_dgemm_1x2_N3_M1_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_dgemm_1x2_N3_M1_L0 -+.L_dgemm_1x2_N3_M1_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ xvldrepl.d U5, B0, 0x08 -+ xvfmadd.d D4, U0, U5, D4 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x10 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_dgemm_1x2_N3_M1_L71 -+.L_dgemm_1x2_N3_M1_L0: -+ xvld U0, C0, 0x00 -+ xvld U1, C1, 0x00 -+ xvinsve0.d U0, U1, 0x01 -+ xvinsve0.d D0, D4, 0x01 -+ GSUB xvf, d, U0, U0, D0 -+.endm -+ -+.macro dgemm_1x1 -+.L_dgemm_1x1: -+ /* Load 1 * 64 from A0 */ -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ /* line 1 */ -+ xvfmul.d D0, U0, U4 -+ -+ /* Add stride for A0 and B0 */ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ /* Reduce L */ -+ addi.d L, L, -1 -+ srai.d TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N1_M1_L7 */ -+ beq ZERO,TL, .L_N1_M1_L7 -+ -+ xvld U8, A0, 0x00 -+ -+ addi.d TL, TL, -1 -+ -+ xvldrepl.d U12, B0, 0x00 -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ -+ beq ZERO, TL, .L_N1_M1_TL1_END -+.L_N1_M1_TL1: /* TL-- */ -+ KERNEL8x1x1 -+ addi.d TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N1_M1_TL1 -+.L_N1_M1_TL1_END: -+ KERNEL8x1x1_END -+.L_N1_M1_L7: -+ /* if (!(L & 7)) goto L_N1_M1_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N1_M1_L0 -+ -+.L_N1_M1_L71: -+ xvld U0, A0, 0x00 -+ -+ xvldrepl.d U4, B0, 0x00 -+ xvfmadd.d D0, U0, U4, D0 -+ -+ /* Add stride for A0, B0 */ -+ addi.d A0, A0, 0x08 -+ addi.d B0, B0, 0x08 -+ -+ addi.d TL, TL, -1 -+ blt ZERO,TL, .L_N1_M1_L71 -+.L_N1_M1_L0: -+ /* Load C0 */ -+ xvld U0, C0, 0x00 -+ GSUB xvf, d, U0, U0, D0 -+.endm -diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S -new file mode 100644 -index 000000000..694dcdaa9 ---- /dev/null -+++ b/kernel/loongarch64/loongarch64_asm.S -@@ -0,0 +1,430 @@ 
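Before the new loongarch64_asm.S file begins, the K-loop shape that every dgemm_MxN helper macro above repeats (these helpers feed the dtrsm kernels, hence the trailing GSUB, i.e. C := C - A*B) can be summarized in plain C. This is a rough sketch with illustrative names only, not code from the patch; it assumes k >= 1 and collapses the 8-way unrolled body plus tail into a single loop.

/*
 * Sketch of the loop structure shared by the dgemm_MxN helpers above:
 *   1. peel the first K iteration with plain multiplies (xvfmul),
 *   2. run the remaining k-1 iterations as an unrolled FMA body of
 *      (k-1) >> 3 passes plus a tail of (k-1) & 7 (xvfmadd),
 *   3. subtract the accumulator from the C tile (GSUB).
 */
static void dtrsm_gemm_block(int m, int n, int k,
                             const double *pa,   /* packed A: k panels of m doubles */
                             const double *pb,   /* packed B: k panels of n doubles */
                             double *c, int ldc) /* column-major C tile             */
{
    double acc[16 * 4] = {0};                    /* plays the role of D0..D15 */

    /* peeled first iteration (xvfmul) */
    for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++)
            acc[j * m + i] = pa[i] * pb[j];

    /* remaining k-1 iterations (xvfmadd); unrolling collapsed here */
    for (int l = 1; l < k; l++)
        for (int j = 0; j < n; j++)
            for (int i = 0; i < m; i++)
                acc[j * m + i] += pa[l * m + i] * pb[l * n + j];

    /* final GSUB step: C := C - acc */
    for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++)
            c[j * ldc + i] -= acc[j * m + i];
}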
-+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+ -+#if __loongarch_grlen == 64 -+#define LA_REG int64_t -+#define REG_SIZE 8 -+#define REG_LOG 3 -+#define PTR_ADDI addi.d -+#define PTR_ADD add.d -+#define PTR_SUB sub.d -+#define PTR_LD ld.d -+#define PTR_ST st.d -+#define PTR_SLLI slli.d -+#define PTR_SRLI srli.d -+#define PTR_SRAI srai.d -+#define PTR_MUL mul.d -+#define PTR_ALSL alsl.d -+#elif __loongarch_grlen == 32 -+#define LA_REG int32_t -+#define REG_SIZE 4 -+#define REG_LOG 2 -+#define PTR_ADDI addi.w -+#define PTR_ADD add.w -+#define PTR_SUB sub.w -+#define PTR_LD ld.w -+#define PTR_ST st.w -+#define PTR_SLLI slli.w -+#define PTR_SRLI srli.w -+#define PTR_SRAI srai.w -+#define PTR_MUL mul.w -+#define PTR_ALSL alsl.w -+#else -+// If neither of the above two conditions is supported, it means this is an early -+// internal toolchain. To ensure maximum compatibility, the following approach is taken: -+#define LA_REG int64_t -+#define REG_SIZE 8 -+#define REG_LOG 3 -+#define PTR_ADDI addi.d -+#define PTR_ADD add.d -+#define PTR_SUB sub.d -+#define PTR_LD ld.d -+#define PTR_ST st.d -+#define PTR_SLLI slli.d -+#define PTR_SRLI srli.d -+#define PTR_SRAI srai.d -+#define PTR_MUL mul.d -+#define PTR_ALSL alsl.d -+#endif -+ -+#if __loongarch_frlen == 64 -+#define FREG_SIZE 8 -+#define FREG_LOG 3 -+#define PTR_FLD fld.d -+#define PTR_FST fst.d -+#elif __loongarch_frlen == 32 -+#define FREG_SIZE 4 -+#define FREG_LOG 2 -+#define PTR_FLD fld.s -+#define PTR_FST fst.s -+#else -+// If neither of the above two conditions is supported, it means this is an early -+// internal toolchain. To ensure maximum compatibility, the following approach is taken: -+#define FREG_SIZE 8 -+#define FREG_LOG 3 -+#define PTR_FLD fld.d -+#define PTR_FST fst.d -+#endif -+ -+// The max registers available to the user which -+// do not need to be preserved across calls. 
-+// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html -+#define MAX_INT_CALLER_SAVED 17 -+#define MAX_FP_CALLER_SAVED 24 -+ -+.altmacro // Enable alternate macro mode -+ -+.macro push_if_used regs, fregs -+.if \regs > MAX_INT_CALLER_SAVED -+ PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) -+ push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 -+.endif -+.if \fregs > MAX_FP_CALLER_SAVED -+ PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) -+ push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 -+.endif -+.endm // End push_if_used -+.macro pop_if_used regs, fregs -+.if \fregs > MAX_FP_CALLER_SAVED -+ pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 -+ PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG -+.endif -+.if \regs > MAX_INT_CALLER_SAVED -+ pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 -+ PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG -+.endif -+.endm // End pop_if_used -+.macro push_regs from, to -+ PTR_ST $s\()\from, $sp, \from << REG_LOG -+.if \to - \from -+ push_regs %from + 1, \to -+.endif -+.endm // End push_regs -+.macro pop_regs from, to -+ PTR_LD $s\()\from, $sp, \from << REG_LOG -+.if \to - \from -+ pop_regs %from + 1, \to -+.endif -+.endm // End pop_regs -+.macro push_fregs from, to -+ PTR_FST $fs\()\from, $sp, \from << FREG_LOG -+.if \to - \from -+ push_fregs %from + 1, \to -+.endif -+.endm // End push_fregs -+.macro pop_fregs from, to -+ PTR_FLD $fs\()\from, $sp, \from << FREG_LOG -+.if \to - \from -+ pop_fregs %from + 1, \to -+.endif -+.endm // End pop_fregs -+ -+// -+// Instruction Related Macros -+// -+// GLD -+// -+.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg -+.ifeqs "\suf_op", "0" -+ \pre_op\()ld \out, \src, \offset -+.else -+ \pre_op\()ld.\suf_op \out, \src, \offset -+.endif -+.ifnb \more -+ GLD \pre_op, \suf_op, \more -+.endif -+.endm -+ -+// -+// GLD_INC -+// -+.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg -+.ifeqs "\suf_op", "0" -+ \pre_op\()ld \out, \src, \offset -+.else -+ \pre_op\()ld.\suf_op \out, \src, \offset -+.endif -+ PTR_ADDI \src, \src, \inc -+.ifnb \more -+ GLD_INC \pre_op, \suf_op, \inc, \more -+.endif -+.endm -+// -+// GLDX is same as GLD except the stride is a register -+// -+.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg -+.ifeqs "\suf_op", "0" -+ \pre_op\()ldx \out, \src, \offset -+.else -+ \pre_op\()ldx.\suf_op \out, \src, \offset -+.endif -+.ifnb \more -+ GLDX \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GLDREPL -+// -+.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg -+ \pre_op\()ldrepl.\suf_op \out, \src, \offset -+.ifnb \more -+ GLDREPL \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GST -+// -+.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg -+.ifeqs "\suf_op", "0" -+ \pre_op\()st \src, \dst, \offset -+.else -+ \pre_op\()st.\suf_op \src, \dst, \offset -+.endif -+.ifnb \more -+ GST \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GMUL -+// -+.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg -+ \pre_op\()mul.\suf_op \out, \in0, \in1 -+.ifnb \more -+ GMUL \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GMADD -+// -+.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg -+ \pre_op\()madd.\suf_op \out, \in0, \in1, \in2 -+.ifnb \more -+ GMADD \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GADD -+// -+.macro 
GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg -+ \pre_op\()add.\suf_op \out, \in0, \in1 -+.ifnb \more -+ GADD \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GADDI -+// -+.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg -+ \pre_op\()addi.\suf_op \out, \in0, \in1 -+.ifnb \more -+ GADDI \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GSUB -+// -+.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg -+ \pre_op\()sub.\suf_op \out, \in0, \in1 -+.ifnb \more -+ GSUB \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GSLLI -+// -+.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg -+ \pre_op\()slli.\suf_op \out, \in0, \in1 -+.ifnb \more -+ GSLLI \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GINSVE0 -+// -+.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg -+ \pre_op\()insve0.\suf_op \out, \in0, \in1 -+.ifnb \more -+ GINSVE0 \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GXOR -+// -+.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg -+ \pre_op\()xor.\suf_op \out, \in0, \in1 -+.ifnb \more -+ GXOR \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GPERMI -+// -+.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg -+ \pre_op\()permi.\suf_op \out, \in0, \in1 -+.ifnb \more -+ GPERMI \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GNMSUB -+// -+.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg -+ \pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2 -+.ifnb \more -+ GNMSUB \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GPRELD -+// -+.macro GPRELD in0:req, in1:req, in2:req, more:vararg -+ preld \in0, \in1, \in2 -+.ifnb \more -+ GPRELD \more -+.endif -+.endm -+ -+// -+// Compound instructions -+// -+// GACC: Accumulate the values of vector registers -+// -+.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg -+.ifeqs "\pre_op", "xvf" -+ xvpermi.q \out, \in, 0x01 -+ \pre_op\()add.\suf_op \in, \out, \in -+ xvpackod.d \out, \in, \in -+ \pre_op\()add.\suf_op \out, \out, \in -+.ifeqs "\suf_op", "s" -+ xvpackod.w \in, \out, \out -+ \pre_op\()add.\suf_op \out, \out, \in -+.endif -+.endif -+ -+.ifeqs "\pre_op", "vf" -+ vpackod.d \out, \in, \in -+ \pre_op\()add.\suf_op \out, \out, \in -+.ifeqs "\suf_op", "s" -+ vpackod.w \in, \out, \out -+ \pre_op\()add.\suf_op \out, \out, \in -+.endif -+.endif -+ -+.ifeqs "\pre_op", "xv" -+ xvpermi.q \out, \in, 0x01 -+ \pre_op\()add.\suf_op \in, \out, \in -+ xvpackod.d \out, \in, \in -+ \pre_op\()add.\suf_op \out, \out, \in -+.ifnc "\suf_op", "d" -+ xvpackod.w \in, \out, \out -+ \pre_op\()add.\suf_op \out, \out, \in -+.ifnc "\suf_op", "w" -+ xvpackod.h \in, \out, \out -+ \pre_op\()add.\suf_op \out, \out, \in -+.ifnc "\suf_op", "h" -+ xvpackod.b \in, \out, \out -+ \pre_op\()add.\suf_op \out, \out, \in -+.endif -+.endif -+.endif -+.endif -+ -+.ifeqs "\pre_op", "v" -+ vpackod.d \out, \in, \in -+ \pre_op\()add.\suf_op \out, \out, \in -+.ifnc "\suf_op", "d" -+ vpackod.w \in, \out, \out -+ \pre_op\()add.\suf_op \out, \out, \in -+.ifnc "\suf_op", "w" -+ vpackod.h \in, \out, \out -+ \pre_op\()add.\suf_op \out, \out, \in -+.ifnc "\suf_op", "h" -+ vpackod.b \in, \out, \out -+ \pre_op\()add.\suf_op \out, \out, \in -+.endif -+.endif -+.endif -+.endif -+ -+.ifnb \more -+ GACC \pre_op, \suf_op, \more -+.endif -+.endm -+// -+// GMOV -+// -+.macro GMOV pre_op:req, out:req, in:req, more:vararg -+ \pre_op\()or.v \out, \in, \in -+.ifnb \more -+ GMOV \pre_op, \more -+.endif -+.endm -+ 
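For reference, the effect of the GACC compound macro above on a 256-bit LASX register of floats (pre_op = "xvf", suf_op = "s") can be modeled in C: the macro folds the high 128-bit half onto the low half, then the odd 64-bit lanes onto the even ones, then the odd 32-bit lanes onto the even ones, so element 0 of the output ends up holding the sum of all eight lanes. The sketch below uses illustrative names and is not code from the patch.

static float gacc_xvf_s(const float v[8])   /* one 256-bit LASX register = 8 floats */
{
    float t[4];

    /* xvpermi.q + xvfadd.s: add the high 128-bit half onto the low half */
    for (int i = 0; i < 4; i++)
        t[i] = v[i] + v[i + 4];

    /* xvpackod.d + xvfadd.s: add the odd 64-bit lanes onto the even ones */
    t[0] += t[2];
    t[1] += t[3];

    /* xvpackod.w + xvfadd.s (single precision only): add odd 32-bit lanes */
    t[0] += t[1];

    return t[0];   /* element 0 now holds v[0] + ... + v[7] */
}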
-+// -+// Media Related Macros -+// -+.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1 -+ \pre_op\()ilvl.\suf_op \out0, \in0, \in1 -+ \pre_op\()ilvh.\suf_op \out1, \in0, \in1 -+.endm -+.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 -+ \pre_op\()pickev.\suf_op \out0, \in0, \in1 -+ \pre_op\()pickod.\suf_op \out1, \in0, \in1 -+.endm -+ -+// -+// TRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors, -+// has no pre_op param. 128-bit vector instructions are not supported. -+// -+.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ -+ vt0, vt1 -+ GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0 -+ GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2 -+ GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3 -+ GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02 -+.endm -+ -+.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \ -+ in0, in1, in2, in3, in4, in5, in6, in7, \ -+ tmp0, tmp1, tmp2, tmp3 -+ GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0 -+ GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1 -+ GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0 -+ GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2 -+ -+ GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4 -+ GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5 -+ GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0 -+ GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2 -+ -+ GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3 -+ -+ GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \ -+ \out2, \out6, 0x02, \out3, \out7, 0x02, \ -+ \out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \ -+ \out6, \tmp2, 0x31, \out7, \tmp3, 0x31 -+.endm -diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S -new file mode 100644 -index 000000000..bd609394e ---- /dev/null -+++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S -@@ -0,0 +1,2348 @@ -+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "loongarch64_asm.S" -+ -+/********************************************************************* -+* 2023/08/23 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+* -+* -+* 2023/08/23 guxiwei -+* Parameter: -+* SGEMM_DEFAULT_UNROLL_N 8 -+* SGEMM_DEFAULT_UNROLL_M 16 -+* SGEMM_DEFAULT_P 256 -+* SGEMM_DEFAULT_Q 256 -+* SGEMM_DEFAULT_R 1024 -+* A_PRE 1024 -+* B_PRE 256 // Enable prefetching for B results in a performance decrease, temporarily disabled. -+* -+* -+* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000: -+* 1 thread: 71.7 GFLOPS -+* 2 threads: 142.6 GFLOPS -+* 3 threads: 211.5 GFLOPS -+* 4 threads: 265.0 GFLOPS -+*********************************************************************/ -+ -+/* Function parameters */ -+#define M $r4 // param 1: bm -+#define N $r5 // param 2: bn -+#define K $r6 // param 3: bk -+#define ALPHA $f0 // param 4: alpha -+#define A $r7 // param 5: ba -+#define B $r8 // param 6: bb -+#define C $r9 // param 7: bc -+#define LDC $r10 // param 8: ldc -+ -+#ifdef TRMMKERNEL -+#define OFFSET $r11 // param 9: offset -+#endif -+#define OFF $r12 -+ -+/* Cycle control parameters */ -+#define I $r13 -+#define J $r14 -+#define L $r15 -+#define TL $r16 -+/* Matrix address */ -+#define A0 $r17 -+#define B0 $r18 -+#define C0 $r19 -+#define C1 $r20 -+#define C2 $r23 -+#define C3 $r24 -+#define C4 $r25 -+#define C5 $r26 -+#define C6 $r27 -+#define C7 $r28 -+#define T0 $r29 -+#define T1 $r30 -+#undef ZERO -+#define ZERO $r0 -+ -+/* LASX Vectors -+ * Store 16 sets of 32-bit data in A using UO and U1, with each register holding 8 data. -+ * Use X0 through X7 to store 8 sets of 32-bit data in B, with each register holding a broadcast value of a single data. -+ * Use D0 to D15 to store intermediate values of the computation. -+ * Use VALPHA to store the broadcast value of alpha -+ */ -+#define U0 $xr0 -+#define U1 $xr1 -+#define X0 $xr2 -+#define X1 $xr3 -+#define X2 $xr4 -+#define X3 $xr5 -+#define X4 $xr6 -+#define X5 $xr7 -+#define X6 $xr8 -+#define X7 $xr9 -+#define D0 $xr10 -+#define D1 $xr11 -+#define D2 $xr12 -+#define D3 $xr13 -+#define D4 $xr14 -+#define D5 $xr15 -+#define D6 $xr16 -+#define D7 $xr17 -+#define D8 $xr18 -+#define D9 $xr19 -+#define D10 $xr20 -+#define D11 $xr21 -+#define D12 $xr22 -+#define D13 $xr23 -+#define D14 $xr24 -+#define D15 $xr25 -+#define VALPHA $xr26 -+ -+/* Prefetch interval */ -+#define A_PRE 0x400 -+#define B_PRE 0x100 -+ -+// Loops outline: -+// .L_N8 <-------------------------------------------------------------------------------------------- /* if N >> 3 == 0, goto .L_N7; else, enter .L_N8. */ -+// | .L_M16 <--------------------- | /* if M >> 4 == 0, goto .L_M8; Otherwise, enter .L_M16. 
*/ -+// | | .L_M16_TL1 | | -+// | | .L_M16_L7 | The entire core loop of the function, KERNEK16x8 | -+// | | .L_M16_L71 | | -+// | | .L_M16_L0 ---------------- | -+// | .L_M8 | -+// | | .L_M8_TL1 | | -+// | | .L_M8_L7 | KERNEK8x8 | -+// | | .L_M8_L71 | | -+// | | .L_M8_L0 | | -+// | .L_M4 | -+// | | .L_M4_TL1 | | -+// | | .L_M4_L7 | KERNEK4x8 | -+// | | .L_M4_L71 | | -+// | | .L_M4_L0 | | -+// | .L_M2 | -+// | | .L_M2_TL1 | | -+// | | .L_M2_L7 | KERNEK2x8 | -+// | | .L_M2_L71 | | -+// | | .L_M2_L0 | | -+// | .L_M1 | -+// | | .L_M1_TL1 | | -+// | | .L_M1_L7 | KERNEK1x8 | -+// | | .L_M1_L71 | | -+// | | .L_M1_L0 | | -+// | .L_M0------------------------------------------------------------------------------------------ -+// .L_N7 /* if N & 7 == 0, goto .L_N0; else, enter .L_N4 */ -+// .L_N4 -+// | .L_N4_M16 <--------------------- -+// | | .L_N4_M16_TL1 | -+// | | .L_N4_M16_L7 | KERNEL16x4 -+// | | .L_N4_M16_L71 | -+// | | .L_N4_M16_L0 ---------------- -+// | .L_N4_M8 -+// | | .L_N4_M8_TL1 | -+// | | .L_N4_M8_L7 | KERNEL8x4 -+// | | .L_N4_M8_L71 | -+// | | .L_N4_M8_L0 | -+// | .L_N4_M4 -+// | | .L_N4_M4_TL1 | -+// | | .L_N4_M4_L7 | KERNEL4x4 -+// | | .L_N4_M4_L71 | -+// | | .L_N4_M4_L0 | -+// | .L_N4_M2 -+// | | .L_N4_M2_TL1 | -+// | | .L_N4_M2_L7 | KERNEL2x4 -+// | | .L_N4_M2_L71 | -+// | | .L_N4_M2_L0 | -+// | .L_N4_M1 -+// | | .L_N4_M1_TL1 | -+// | | .L_N4_M1_L7 | KERNEL1x4 -+// | | .L_N4_M1_L71 | -+// | | .L_N4_M1_L0 | -+// | .L_N4_M0 -+// .L_N3 /* if N & 2 == 0, goto .L_N1; else enter .L_N2 */ -+// .L_N2 -+// | .L_N2_M16 <--------------------- -+// | | .L_N2_M16_TL1 | -+// | | .L_N2_M16_L7 | KERNEL16x2 -+// | | .L_N2_M16_L71 | -+// | | .L_N2_M16_L0 ---------------- -+// | .L_N2_M8 -+// | | .L_N2_M8_TL1 | -+// | | .L_N2_M8_L7 | KERNEL8x2 -+// | | .L_N2_M8_L71 | -+// | | .L_N2_M8_L0 | -+// | .L_N2_M4 -+// | | .L_N2_M4_TL1 | -+// | | .L_N2_M4_L7 | KERNEL4x2 -+// | | .L_N2_M4_L71 | -+// | | .L_N2_M4_L0 | -+// | .L_N2_M2 -+// | | .L_N2_M2_TL1 | -+// | | .L_N2_M2_L7 | KERNEL2x2 -+// | | .L_N2_M2_L71 | -+// | | .L_N2_M2_L0 | -+// | .L_N2_M1 -+// | | .L_N2_M1_TL1 | -+// | | .L_N2_M1_L7 | KERNEL1x2 -+// | | .L_N2_M1_L71 | -+// | | .L_N2_M1_L0 | -+// | .L_N2_M0 -+// .L_N1 -+// | .L_N1_M16 <--------------------- -+// | | .L_N1_M16_TL1 | -+// | | .L_N1_M16_L7 | KERNEL16x1 -+// | | .L_N1_M16_L71 | -+// | | .L_N1_M16_L0 ---------------- -+// | .L_N1_M8 -+// | | .L_N1_M8_TL1 | -+// | | .L_N1_M8_L7 | KERNEL8x1 -+// | | .L_N1_M8_L71 | -+// | | .L_N1_M8_L0 | -+// | .L_N1_M4 -+// | | .L_N1_M4_TL1 | -+// | | .L_N1_M4_L7 | KERNEL4x1 -+// | | .L_N1_M4_L71 | -+// | | .L_N1_M4_L0 | -+// | .L_N1_M2 -+// | | .L_N1_M2_TL1 | -+// | | .L_N1_M2_L7 | KERNEL2x1 -+// | | .L_N1_M2_L71 | -+// | | .L_N1_M2_L0 | -+// | .L_N1_M1 -+// | | .L_N1_M1_TL1 | -+// | | .L_N1_M1_L7 | KERNEL1x1 -+// | | .L_N1_M1_L71 | -+// | | .L_N1_M1_L0 | -+// | .L_N1_M0 -+// .L_N0 -+ -+/*************** sgemm_kernel_macros ***************/ -+.macro KERNEL1x16x8_START -+ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 -+ -+ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C -+ GMUL xvf, s, D0, U0, X0, D1, U1, X0 -+ preld 0, C0, 0x00 -+ GMUL xvf, s, D2, U0, X1, D3, U1, X1 -+ preld 0, C1, 0x00 -+ GMUL xvf, s, D4, U0, X2, D5, U1, X2 -+ preld 0, C2, 0x00 -+ GMUL xvf, s, D6, U0, X3, D7, U1, X3 -+ preld 0, C3, 0x00 -+ GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C -+ GMUL xvf, s, D8, U0, X4, D9, U1, X4 -+ preld 0, C4, 0x00 -+ GMUL xvf, s, D10, U0, X5, D11, U1, X5 -+ preld 0, C5, 0x00 -+ GMUL xvf, s, D12, U0, X6, D13, U1, X6 -+ 
preld 0, C6, 0x00 -+ GMUL xvf, s, D14, U0, X7, D15, U1, X7 -+ preld 0, C7, 0x00 -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x20 -+.endm -+ -+.macro KERNEL1x16x8 -+ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 -+ -+ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C -+ GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ -+ D2, U0, X1, D2, D3, U1, X1, D3 -+ preld 0, A0, A_PRE -+ GMADD xvf, s, D4, U0, X2, D4, D5, U1, X2, D5, \ -+ D6, U0, X3, D6, D7, U1, X3 D7 -+ GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C -+ GMADD xvf, s, D8, U0, X4, D8, D9, U1, X4, D9, \ -+ D10, U0, X5, D10, D11, U1, X5, D11 -+ //preld 0, B0, B_PRE -+ GMADD xvf, s, D12, U0, X6, D12, D13, U1, X6, D13, \ -+ D14, U0, X7, D14, D15, U1, X7 D15 -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x20 -+.endm -+ -+.macro KERNEL8x16x8 -+.rept 8 -+ KERNEL1x16x8 -+.endr -+.endm -+ -+.macro SAVE16x8 -+#if defined(TRMMKERNEL) -+ GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ -+ D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \ -+ D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \ -+ D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA -+#else -+ /* Load C0 */ -+ GLD xv, , X0, C0, 0x00, X1, C0, 0x20 -+ GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 -+ /* Load C1 */ -+ GLD xv, , X2, C1, 0x00, X3, C1, 0x20 -+ GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 -+ /* Load C2 */ -+ GLD xv, , X4, C2, 0x00, X5, C2, 0x20 -+ GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 -+ /* Load C3 */ -+ GLD xv, , X6, C3, 0x00, X7, C3, 0x20 -+ GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 -+ /* Load C4 */ -+ GLD xv, , X0, C4, 0x00, X1, C4, 0x20 -+ GMADD xvf, s, D8, D8, VALPHA, X0, D9, D9, VALPHA, X1 -+ /* Load C5 */ -+ GLD xv, , X2, C5, 0x00, X3, C5, 0x20 -+ GMADD xvf, s, D10, D10, VALPHA, X2, D11, D11, VALPHA, X3 -+ /* Load C6 */ -+ GLD xv, , X4, C6, 0x00, X5, C6, 0x20 -+ GMADD xvf, s, D12, D12, VALPHA, X4, D13, D13, VALPHA, X5 -+ /* Load C7 */ -+ GLD xv, , X6, C7, 0x00, X7, C7, 0x20 -+ GMADD xvf, s, D14, D14, VALPHA, X6, D15, D15, VALPHA, X7 -+#endif // #if defined(TRMMKERNEL) -+ GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ -+ D2, C1, 0x00, D3, C1, 0x20, \ -+ D4, C2, 0x00, D5, C2, 0x20, \ -+ D6, C3, 0x00, D7, C3, 0x20, \ -+ D8, C4, 0x00, D9, C4, 0x20, \ -+ D10, C5, 0x00, D11, C5, 0x20, \ -+ D12, C6, 0x00, D13, C6, 0x20, \ -+ D14, C7, 0x00, D15, C7, 0x20 -+#if __loongarch_grlen == 64 -+ GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ -+ C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 -+#elif __loongarch_grlen == 32 -+ GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ -+ C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 -+#else -+ GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ -+ C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 -+#endif -+.endm -+ -+// m = 8, 4, 2, 1 -+// stride = 0x20, 0x10, 0x08, 0x04 -+.macro KERNEL1xMx8_START m, stride -+.if \m == 8 -+ GLD xv, , U0, A0, 0x00 -+.elseif \m == 4 -+ GLD v, , $vr0, A0, 0x00 -+.elseif \m ==2 -+ GLD f, d, $f0, A0, 0x00 -+.elseif \m ==1 -+ GLD f, s, $f0, A0, 0x00 -+.endif -+ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C -+ GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ -+ D4, U0, X2, D6, U0, X3 -+ GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C -+ GMUL xvf, s, D8, U0, X4, D10, U0, X5, \ -+ D12, U0, X6, D14, U0, X7 -+ PTR_ADDI A0, A0, \stride -+ PTR_ADDI B0, B0, 0x20 
-+.endm -+ -+.macro KERNEL1xMx8 m, stride -+.if \m == 8 -+ GLD xv, , U0, A0, 0x00 -+.elseif \m == 4 -+ GLD v, , $vr0, A0, 0x00 -+.elseif \m ==2 -+ GLD f, d, $f0, A0, 0x00 -+.elseif \m ==1 -+ GLD f, s, $f0, A0, 0x00 -+.endif -+ -+ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C -+ GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ -+ D4, U0, X2, D4, D6, U0, X3, D6 -+ GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C -+ GMADD xvf, s, D8, U0, X4, D8, D10, U0, X5, D10, \ -+ D12, U0, X6, D12, D14, U0, X7, D14 -+ PTR_ADDI A0, A0, \stride -+ PTR_ADDI B0, B0, 0x20 -+.endm -+ -+.macro KERNEL8xMx8 m, stride -+.rept 8 -+ KERNEL1xMx8 \m, \stride -+.endr -+.endm -+ -+.macro SAVEMx8 m, stride -+#if defined(TRMMKERNEL) -+ GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ -+ D4, D4, VALPHA, D6, D6, VALPHA, \ -+ D8, D8, VALPHA, D10, D10, VALPHA, \ -+ D12, D12, VALPHA, D14, D14, VALPHA -+#else -+ /* Load C0, C1, C2, C3, C4, C5, C6, C7 */ -+ .if \m == 8 -+ GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 -+ .elseif \m == 4 -+ GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 -+.elseif \m == 2 -+ GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 -+.elseif \m == 1 -+ GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 -+ .endif -+ GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ -+ D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 -+.if \m == 8 -+ GLD xv, , X0, C4, 0x00, X2, C5, 0x00, X4, C6, 0x00, X6, C7, 0x00 -+.elseif \m == 4 -+ GLD v, , $vr2, C4, 0x00, $vr4, C5, 0x00, $vr6, C6, 0x00, $vr8, C7, 0x00 -+.elseif \m == 2 -+ GLD f, d, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 -+.elseif \m == 1 -+ GLD f, s, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 -+.endif -+ GMADD xvf, s, D8, D8, VALPHA, X0, D10, D10, VALPHA, X2, \ -+ D12, D12, VALPHA, X4, D14, D14, VALPHA, X6 -+#endif // #if defined(TRMMKERNEL) -+.if \m == 8 -+ GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ -+ D4, C2, 0x00, D6, C3, 0x00, \ -+ D8, C4, 0x00, D10, C5, 0x00, \ -+ D12, C6, 0x00, D14, C7, 0x00 -+.elseif \m == 4 -+ GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ -+ $vr14, C2, 0x00, $vr16, C3, 0x00, \ -+ $vr18, C4, 0x00, $vr20, C5, 0x00, \ -+ $vr22, C6, 0x00, $vr24, C7, 0x00 -+.elseif \m == 2 -+ GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ -+ $f14, C2, 0x00, $f16, C3, 0x00, \ -+ $f18, C4, 0x00, $f20, C5, 0x00, \ -+ $f22, C6, 0x00, $f24, C7, 0x00 -+.elseif \m == 1 -+ GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ -+ $f14, C2, 0x00, $f16, C3, 0x00, \ -+ $f18, C4, 0x00, $f20, C5, 0x00, \ -+ $f22, C6, 0x00, $f24, C7, 0x00 -+.endif -+#if __loongarch_grlen == 64 -+ GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ -+ C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride -+#elif __loongarch_grlen == 32 -+ GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ -+ C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride -+#else -+ GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ -+ C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride -+#endif -+.endm -+ -+.macro KERNEL1x16x4_START -+ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 -+ -+ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C -+ GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ -+ D2, U0, X1, D3, U1, X1, \ -+ D4, U0, X2, D5, U1, X2, \ -+ D6, U0, X3, D7, U1, X3 -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x10 -+.endm -+ -+.macro KERNEL1x16x4 -+ GLD xv, , 
U0, A0, 0x00, U1, A0, 0x20 -+ -+ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C -+ GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ -+ D2, U0, X1, D2, D3, U1, X1, D3, \ -+ D4, U0, X2, D4, D5, U1, X2, D5, \ -+ D6, U0, X3, D6, D7, U1, X3 D7 -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x10 -+.endm -+ -+.macro KERNEL8x16x4 -+.rept 8 -+ KERNEL1x16x4 -+.endr -+.endm -+ -+.macro SAVE16x4 -+#if defined(TRMMKERNEL) -+ GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ -+ D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA -+#else -+ /* Load C0 */ -+ GLD xv, , X0, C0, 0x00, X1, C0, 0x20 -+ GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 -+ /* Load C1 */ -+ GLD xv, , X2, C1, 0x00, X3, C1, 0x20 -+ GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 -+ /* Load C2 */ -+ GLD xv, , X4, C2, 0x00, X5, C2, 0x20 -+ GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 -+ /* Load C3 */ -+ GLD xv, , X6, C3, 0x00, X7, C3, 0x20 -+ GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 -+#endif // #if defined(TRMMKERNEL) -+ GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ -+ D2, C1, 0x00, D3, C1, 0x20, \ -+ D4, C2, 0x00, D5, C2, 0x20, \ -+ D6, C3, 0x00, D7, C3, 0x20 -+#if __loongarch_grlen == 64 -+ GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 -+#elif __loongarch_grlen == 32 -+ GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 -+#else -+ GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 -+#endif -+.endm -+ -+// m = 8, 4, 2, 1 -+// stride = 0x20, 0x10, 0x08, 0x04 -+.macro KERNEL1xMx4_START m, stride -+.if \m == 8 -+ GLD xv, , U0, A0, 0x00 -+.elseif \m == 4 -+ GLD v, , $vr0, A0, 0x00 -+.elseif \m ==2 -+ GLD f, d, $f0, A0, 0x00 -+.elseif \m ==1 -+ GLD f, s, $f0, A0, 0x00 -+.endif -+ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C -+ GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ -+ D4, U0, X2, D6, U0, X3 -+ PTR_ADDI A0, A0, \stride -+ PTR_ADDI B0, B0, 0x10 -+.endm -+ -+.macro KERNEL1xMx4 m, stride -+.if \m == 8 -+ GLD xv, , U0, A0, 0x00 -+.elseif \m == 4 -+ GLD v, , $vr0, A0, 0x00 -+.elseif \m ==2 -+ GLD f, d, $f0, A0, 0x00 -+.elseif \m ==1 -+ GLD f, s, $f0, A0, 0x00 -+.endif -+ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C -+ GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ -+ D4, U0, X2, D4, D6, U0, X3, D6 -+ PTR_ADDI A0, A0, \stride -+ PTR_ADDI B0, B0, 0x10 -+.endm -+ -+.macro KERNEL8xMx4 m, stride -+.rept 8 -+ KERNEL1xMx4 \m, \stride -+.endr -+.endm -+ -+.macro SAVEMx4 m, stride -+#if defined(TRMMKERNEL) -+ GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ -+ D4, D4, VALPHA, D6, D6, VALPHA -+#else -+ /* Load C0, C1, C2, C3 */ -+ .if \m == 8 -+ GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 -+ .elseif \m == 4 -+ GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 -+.elseif \m == 2 -+ GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 -+.elseif \m == 1 -+ GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 -+ .endif -+ GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ -+ D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 -+#endif // #if defined(TRMMKERNEL) -+.if \m == 8 -+ GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ -+ D4, C2, 0x00, D6, C3, 0x00 -+.elseif \m == 4 -+ GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ -+ $vr14, C2, 0x00, $vr16, C3, 0x00 -+.elseif \m == 2 -+ GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ -+ $f14, C2, 0x00, $f16, C3, 0x00 -+.elseif \m == 1 -+ GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ -+ 
$f14, C2, 0x00, $f16, C3, 0x00 -+.endif -+#if __loongarch_grlen == 64 -+ GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride -+#elif __loongarch_grlen == 32 -+ GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride -+#else -+ GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride -+#endif -+.endm -+ -+.macro KERNEL1x16x2_START -+ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 -+ -+ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 -+ GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ -+ D2, U0, X1, D3, U1, X1 -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x08 -+.endm -+ -+.macro KERNEL1x16x2 -+ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 -+ -+ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 -+ GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ -+ D2, U0, X1, D2, D3, U1, X1, D3 -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x08 -+.endm -+ -+.macro KERNEL8x16x2 -+.rept 8 -+ KERNEL1x16x2 -+.endr -+.endm -+ -+.macro SAVE16x2 -+#if defined(TRMMKERNEL) -+ GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA -+#else -+ /* Load C0 */ -+ GLD xv, , X0, C0, 0x00, X1, C0, 0x20 -+ GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 -+ /* Load C1 */ -+ GLD xv, , X2, C1, 0x00, X3, C1, 0x20 -+ GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 -+#endif // #if defined(TRMMKERNEL) -+ GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ -+ D2, C1, 0x00, D3, C1, 0x20 -+#if __loongarch_grlen == 64 -+ GADDI , d, C0, C0, 0x40, C1, C1, 0x40 -+#elif __loongarch_grlen == 32 -+ GADDI , w, C0, C0, 0x40, C1, C1, 0x40 -+#else -+ GADDI , d, C0, C0, 0x40, C1, C1, 0x40 -+#endif -+.endm -+ -+// m = 8, 4, 2, 1 -+// stride = 0x20, 0x10, 0x08, 0x04 -+.macro KERNEL1xMx2_START m, stride -+.if \m == 8 -+ GLD xv, , U0, A0, 0x00 -+.elseif \m == 4 -+ GLD v, , $vr0, A0, 0x00 -+.elseif \m ==2 -+ GLD f, d, $f0, A0, 0x00 -+.elseif \m ==1 -+ GLD f, s, $f0, A0, 0x00 -+.endif -+ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 -+ GMUL xvf, s, D0, U0, X0, D2, U0, X1 -+ PTR_ADDI A0, A0, \stride -+ PTR_ADDI B0, B0, 0x08 -+.endm -+ -+.macro KERNEL1xMx2 m, stride -+.if \m == 8 -+ GLD xv, , U0, A0, 0x00 -+.elseif \m == 4 -+ GLD v, , $vr0, A0, 0x00 -+.elseif \m ==2 -+ GLD f, d, $f0, A0, 0x00 -+.elseif \m ==1 -+ GLD f, s, $f0, A0, 0x00 -+.endif -+ GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 -+ GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2 -+ PTR_ADDI A0, A0, \stride -+ PTR_ADDI B0, B0, 0x08 -+.endm -+ -+.macro KERNEL8xMx2 m, stride -+.rept 8 -+ KERNEL1xMx2 \m, \stride -+.endr -+.endm -+ -+.macro SAVEMx2 m, stride -+#if defined(TRMMKERNEL) -+ GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA -+#else -+ /* Load C0, C1 */ -+ .if \m == 8 -+ GLD xv, , X0, C0, 0x00, X2, C1, 0x00 -+ .elseif \m == 4 -+ GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00 -+.elseif \m == 2 -+ GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00 -+.elseif \m == 1 -+ GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00 -+ .endif -+ GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2 -+#endif // #if defined(TRMMKERNEL) -+.if \m == 8 -+ GST xv, , D0, C0, 0x00, D2, C1, 0x00 -+.elseif \m == 4 -+ GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00 -+.elseif \m == 2 -+ GST f, d, $f10, C0, 0x00, $f12, C1, 0x00 -+.elseif \m == 1 -+ GST f, s, $f10, C0, 0x00, $f12, C1, 0x00 -+.endif -+#if __loongarch_grlen == 64 -+ GADDI , d, C0, C0, \stride, C1, C1, \stride -+#elif __loongarch_grlen == 32 -+ GADDI , w, C0, C0, \stride, C1, C1, \stride -+#else -+ GADDI , d, C0, C0, \stride, C1, C1, \stride -+#endif -+.endm -+ -+.macro KERNEL1x16x1_START -+ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 -+ GLDREPL xv, w, X0, B0, 0x00 -+ 
GMUL xvf, s, D0, U0, X0, D1, U1, X0 -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x04 -+.endm -+ -+.macro KERNEL1x16x1 -+ GLD xv, , U0, A0, 0x00, U1, A0, 0x20 -+ GLDREPL xv, w, X0, B0, 0x00 -+ GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1 -+ PTR_ADDI A0, A0, 0x40 -+ PTR_ADDI B0, B0, 0x04 -+.endm -+ -+.macro KERNEL8x16x1 -+.rept 8 -+ KERNEL1x16x1 -+.endr -+.endm -+ -+.macro SAVE16x1 -+#if defined(TRMMKERNEL) -+ GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA -+#else -+ /* Load C0 */ -+ GLD xv, , X0, C0, 0x00, X1, C0, 0x20 -+ GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 -+#endif // #if defined(TRMMKERNEL) -+ GST xv, , D0, C0, 0x00, D1, C0, 0x20 -+#if __loongarch_grlen == 64 -+ GADDI , d, C0, C0, 0x40 -+#elif __loongarch_grlen == 32 -+ GADDI , w, C0, C0, 0x40 -+#else -+ GADDI , d, C0, C0, 0x40 -+#endif -+.endm -+ -+// m = 8, 4, 2, 1 -+// stride = 0x20, 0x10, 0x08, 0x04 -+.macro KERNEL1xMx1_START m, stride -+.if \m == 8 -+ GLD xv, , U0, A0, 0x00 -+.elseif \m == 4 -+ GLD v, , $vr0, A0, 0x00 -+.elseif \m ==2 -+ GLD f, d, $f0, A0, 0x00 -+.elseif \m ==1 -+ GLD f, s, $f0, A0, 0x00 -+.endif -+ GLDREPL xv, w, X0, B0, 0x00 -+ GMUL xvf, s, D0, U0, X0 -+ PTR_ADDI A0, A0, \stride -+ PTR_ADDI B0, B0, 0x04 -+.endm -+ -+.macro KERNEL1xMx1 m, stride -+.if \m == 8 -+ GLD xv, , U0, A0, 0x00 -+.elseif \m == 4 -+ GLD v, , $vr0, A0, 0x00 -+.elseif \m ==2 -+ GLD f, d, $f0, A0, 0x00 -+.elseif \m ==1 -+ GLD f, s, $f0, A0, 0x00 -+.endif -+ GLDREPL xv, w, X0, B0, 0x00 -+ GMADD xvf, s, D0, U0, X0, D0 -+ PTR_ADDI A0, A0, \stride -+ PTR_ADDI B0, B0, 0x04 -+.endm -+ -+.macro KERNEL8xMx1 m, stride -+.rept 8 -+ KERNEL1xMx1 \m, \stride -+.endr -+.endm -+ -+.macro SAVEMx1 m, stride -+#if defined(TRMMKERNEL) -+ GMUL xvf, s, D0, D0, VALPHA -+#else -+ /* Load C0, C1 */ -+ .if \m == 8 -+ GLD xv, , X0, C0, 0x00 -+ .elseif \m == 4 -+ GLD v, , $vr2, C0, 0x00 -+.elseif \m == 2 -+ GLD f, d, $f2, C0, 0x00 -+.elseif \m == 1 -+ GLD f, s, $f2, C0, 0x00 -+ .endif -+ GMADD xvf, s, D0, D0, VALPHA, X0 -+#endif // #if defined(TRMMKERNEL) -+.if \m == 8 -+ GST xv, , D0, C0, 0x00 -+.elseif \m == 4 -+ GST v, , $vr10, C0, 0x00 -+.elseif \m == 2 -+ GST f, d, $f10, C0, 0x00 -+.elseif \m == 1 -+ GST f, s, $f10, C0, 0x00 -+.endif -+#if __loongarch_grlen == 64 -+ GADDI , d, C0, C0, \stride -+#elif __loongarch_grlen == 32 -+ GADDI , w, C0, C0, \stride -+#else -+ GADDI , d, C0, C0, \stride -+#endif -+.endm -+ -+ PROLOGUE -+ push_if_used 26, 32 -+ xvreplve0.w VALPHA, $xr0 -+#if defined (TRMMKERNEL) && !defined(LEFT) -+ PTR_SUB OFF, ZERO, OFFSET -+#else -+ xor OFF, OFF, OFF -+#endif -+ /* if (!(N >> 3)) goto L_N7 */ -+ PTR_SRAI J, N, 3 /* J = bn >> 3 */ -+ andi N, N, 0x07 -+ beq ZERO, J, .L_N7 -+.L_N8: /* J -- */ -+ move C0, C -+ move A0, A -+ PTR_SLLI T0, LDC, 2 -+ PTR_ADDI J, J, -1 /* J-- */ -+#if __loongarch_grlen == 64 -+ GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ -+ C6, C5, T0, C7, C6, T0 -+#elif __loongarch_grlen == 32 -+ GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ -+ C6, C5, T0, C7, C6, T0 -+#else -+ GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ -+ C6, C5, T0, C7, C6, T0 -+#endif -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move OFF, OFFSET -+#endif -+ /* if (!(M >> 4)) goto L_M8 */ -+ PTR_SRAI I, M, 4 /* I = bm >> 4 */ -+ beq ZERO, I, .L_M8 -+.align 5 -+.L_M16: /* I-- */ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x06 -+ PTR_ADD A0, A0, T0 /* A0 += 16 * OFF 
*/ -+ PTR_SLLI T0, OFF, 0x05 -+ PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ -+#endif -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 16 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 8 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1x16x8_START -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M16_L7 */ -+ beq ZERO,TL, .L_M16_L7 -+.align 5 -+.L_M16_TL1: -+ KERNEL8x16x8 -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_M16_TL1 -+.L_M16_L7: -+ andi TL, L, 7 -+ beq TL, ZERO,.L_M16_L0 -+.align 5 -+.L_M16_L71: -+ KERNEL1x16x8 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_M16_L71 -+.L_M16_L0: -+ SAVE16x8 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ /* number of values in A */ -+ PTR_ADDI L, L, -16 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, L, -8 -+#endif -+ PTR_SLLI T0, L, 0x06 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x05 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x10 /* number of values in A */ -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+ PTR_ADDI I, I, -1 /* I-- */ -+ blt ZERO,I, .L_M16 -+.L_M8: -+ /* We have done M & 16, considering M=8/4/2/1 */ -+ andi I, M, 15 -+ beq ZERO,I, .L_M0 -+ -+ andi I, M, 8 -+ beq ZERO,I, .L_M4 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x05 -+ PTR_ADD A0, A0, T0 /* A0 += 8 * OFF */ -+ PTR_SLLI T0, OFF, 0x05 -+ PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ -+#endif -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 8 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 8 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif // #if defined(TRMMKERNEL) -+ KERNEL1xMx8_START 8, 0x20 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M8_L7 */ -+ beq ZERO,TL, .L_M8_L7 -+.align 5 -+.L_M8_TL1: -+ KERNEL8xMx8 8, 0x20 -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_M8_TL1 -+.L_M8_L7: -+ /* if (!(L & 7)) goto L_M8_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_M8_L0 -+.align 5 -+.L_M8_L71: -+ KERNEL1xMx8 8, 0x20 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_M8_L71 -+.L_M8_L0: -+ SAVEMx8 8, 0x20 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ /* number of values in A */ -+ PTR_ADDI L, L, -8 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, L, -8 -+#endif -+ PTR_SLLI T0, L, 0x05 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x05 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ /* number of values in A */ -+ PTR_ADDI OFF, OFF, 0x08 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_M4: -+ andi I, M, 4 -+ beq ZERO,I, .L_M2 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x04 -+ PTR_ADD A0, A0, T0 /* A0 += 4 * OFF */ -+ PTR_SLLI T0, OFF, 0x05 -+ PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
-+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 4 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 8 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx8_START 4, 0x10 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M4_L7 */ -+ beq ZERO,TL, .L_M4_L7 -+.align 5 -+.L_M4_TL1: -+ KERNEL8xMx8 4, 0x10 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_M4_TL1 -+.L_M4_L7: -+ /* if (!(L & 7)) goto L_M4_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_M4_L0 -+.L_M4_L71: -+ KERNEL1xMx8 4, 0x10 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_M4_L71 -+.L_M4_L0: -+ SAVEMx8 4, 0x10 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ /* number of values in A */ -+ PTR_ADDI L, L, -4 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, L, -8 -+#endif -+ PTR_SLLI T0, L, 0x04 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x05 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ /* number of values in A */ -+ PTR_ADDI OFF, OFF, 0x04 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_M2: -+ andi I, M, 2 -+ beq ZERO,I, .L_M1 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x03 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x05 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 2 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 8 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx8_START 2, 0x08 -+ -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_M2_L7 */ -+ beq ZERO,TL, .L_M2_L7 -+.align 5 -+.L_M2_TL1: -+ KERNEL8xMx8 2, 0x08 -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_M2_TL1 -+.L_M2_L7: -+ /* if (!(L & 7)) goto L_M2_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_M2_L0 -+.align 5 -+.L_M2_L71: -+ KERNEL1xMx8 2, 0x08 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_M2_L71 -+.L_M2_L0: -+ SAVEMx8 2, 0x08 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ /* number of values in A */ -+ PTR_ADDI L, L, -2 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, L, -8 -+#endif -+ PTR_SLLI T0, L, 0x03 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x05 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ /* number of values in A */ -+ PTR_ADDI OFF, OFF, 0x02 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_M1: -+ andi I, M, 1 -+ beq ZERO,I, .L_M0 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x02 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x05 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 1 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 8 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx8_START 1, 0x04 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) 
goto L_M1_L7 */ -+ beq ZERO,TL, .L_M1_L7 -+.align 5 -+.L_M1_TL1: -+ KERNEL8xMx8 1, 0x04 -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_M1_TL1 -+.L_M1_L7: -+ /* if (!(L & 7)) goto L_M1_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_M1_L0 -+.align 5 -+.L_M1_L71: -+ KERNEL1xMx8 1, 0x04 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_M1_L71 -+.L_M1_L0: -+ SAVEMx8 1, 0x04 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ /* number of values in A */ -+ PTR_ADDI L, L, -1 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, L, -8 -+#endif -+ PTR_SLLI T0, L, 0x02 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x05 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ /* number of values in A */ -+ PTR_ADDI OFF, OFF, 0x01 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+.L_M0: -+ /* Add stride for B and C -+ * B += (K * 32) -+ * C += (LDC * 32) -+ */ -+ PTR_SLLI T0, K, 5 -+ PTR_SLLI T1, LDC, 5 -+ PTR_ADD B, B, T0 -+ PTR_ADD C, C, T1 -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ PTR_ADDI OFF, OFF, 0x08 /* number of values in B */ -+#endif -+ blt ZERO, J, .L_N8 -+ -+.L_N7: -+ andi J, N, 4 -+ beq ZERO, J, .L_N3 -+.L_N4: -+ move C0, C -+ move A0, A -+ PTR_SLLI T0, LDC, 2 -+#if __loongarch_grlen == 64 -+ GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 -+#elif __loongarch_grlen == 32 -+ GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0 -+#else -+ GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 -+#endif -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move OFF, OFFSET -+#endif -+ -+ /* if (!(M >> 4)) goto L_N4_M8 */ -+ PTR_SRAI I, M, 4 /* I = bm >> 4 */ -+ beq ZERO, I, .L_N4_M8 -+.align 5 -+.L_N4_M16: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x06 -+ PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */ -+ PTR_SLLI T0, OFF, 0x04 -+ PTR_ADD B0, B, T0 /* B0 += 4 * OFF */ -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 16 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 4 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1x16x4_START -+ -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N4_L7 */ -+ beq ZERO,TL, .L_N4_M16_L7 -+.align 5 -+.L_N4_M16_TL1: /* TL-- */ -+ KERNEL8x16x4 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N4_M16_TL1 -+.L_N4_M16_L7: -+ /* if (!(L & 7)) goto L_N4_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N4_M16_L0 -+.align 5 -+.L_N4_M16_L71: -+ KERNEL1x16x4 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N4_M16_L71 -+.L_N4_M16_L0: -+ SAVE16x4 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -16 -+#else -+ PTR_ADDI L, L, -4 -+#endif -+ PTR_SLLI T0, L, 0x06 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x04 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x10 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+ PTR_ADDI I, I, -1 /* I-- */ -+ blt ZERO,I, .L_N4_M16 -+.L_N4_M8: -+ /* We have done M & 16, considering M=8/4/2/1 */ -+ andi I, M, 15 -+ beq ZERO,I, .L_N4_M0 -+ -+ andi I, M, 8 -+ beq ZERO,I, .L_N4_M4 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI 
T0, OFF, 0x05 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x04 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 8 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 4 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx4_START 8, 0x20 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N4_M8_L7 */ -+ beq ZERO,TL, .L_N4_M8_L7 -+.align 5 -+.L_N4_M8_TL1: /* TL-- */ -+ KERNEL8xMx4 8, 0x20 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N4_M8_TL1 -+.L_N4_M8_L7: -+ /* if (!(L & 7)) goto L_N4_M8_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N4_M8_L0 -+.align 5 -+.L_N4_M8_L71: -+ KERNEL1xMx4 8, 0x20 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N4_M8_L71 -+.L_N4_M8_L0: -+ SAVEMx4 8, 0x20 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -8 -+#else -+ PTR_ADDI L, L, -4 -+#endif -+ PTR_SLLI T0, L, 0x05 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x04 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x08 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_N4_M4: -+ andi I, M, 4 -+ beq ZERO,I, .L_N4_M2 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x04 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x04 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 4 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 4 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx4_START 4, 0x10 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N4_M4_L7 */ -+ beq ZERO,TL, .L_N4_M4_L7 -+.align 5 -+.L_N4_M4_TL1: /* TL-- */ -+ KERNEL8xMx4 4, 0x10 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N4_M4_TL1 -+.L_N4_M4_L7: -+ /* if (!(L & 7)) goto L_N4_M4_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N4_M4_L0 -+.align 5 -+.L_N4_M4_L71: -+ KERNEL1xMx4 4, 0x10 -+ -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N4_M4_L71 -+.L_N4_M4_L0: -+ SAVEMx4 4, 0x10 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -4 -+#else -+ PTR_ADDI L, L, -4 -+#endif -+ PTR_SLLI T0, L, 0x04 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x04 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x04 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_N4_M2: -+ andi I, M, 2 -+ beq ZERO,I, .L_N4_M1 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x03 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x04 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 2 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 4 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx4_START 
2, 0x08 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N4_M2_L7 */ -+ beq ZERO,TL, .L_N4_M2_L7 -+.align 5 -+.L_N4_M2_TL1: /* TL-- */ -+ KERNEL8xMx4 2, 0x08 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N4_M2_TL1 -+.L_N4_M2_L7: -+ /* if (!(L & 7)) goto L_N4_M2_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N4_M2_L0 -+.align 5 -+.L_N4_M2_L71: -+ KERNEL1xMx4 2, 0x08 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N4_M2_L71 -+.L_N4_M2_L0: -+ SAVEMx4 2, 0x08 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -2 -+#else -+ PTR_ADDI L, L, -4 -+#endif -+ PTR_SLLI T0, L, 0x03 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x04 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x02 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_N4_M1: -+ andi I, M, 1 -+ beq ZERO,I, .L_N4_M0 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x02 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x04 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 1 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 4 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx4_START 1, 0x04 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N4_M1_L7 */ -+ beq ZERO,TL, .L_N4_M1_L7 -+.align 5 -+.L_N4_M1_TL1: /* TL-- */ -+ KERNEL8xMx4 1, 0x04 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N4_M1_TL1 -+.L_N4_M1_L7: -+ /* if (!(L & 7)) goto L_N4_M1_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N4_M1_L0 -+.align 5 -+.L_N4_M1_L71: -+ KERNEL1xMx4 1, 0x04 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N4_M1_L71 -+.L_N4_M1_L0: -+ SAVEMx4 1, 0x04 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -1 -+#else -+ PTR_ADDI L, L, -4 -+#endif -+ PTR_SLLI T0, L, 0x02 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x04 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x01 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_N4_M0: -+ /* Add stride for B and C -+ * B += 4 * K -+ * C += 4 * LDC -+ */ -+ PTR_SLLI T0, K, 4 -+ PTR_SLLI T1, LDC, 4 -+ PTR_ADD B, B, T0 -+ PTR_ADD C, C, T1 -+ -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ PTR_ADDI OFF, OFF, 0x04 -+#endif -+ /* We must reinit I */ -+ PTR_SRAI I, M, 4 /* I = bm >> 4 */ -+.L_N3: -+ andi J, N, 2 -+ beq ZERO, J, .L_N1 -+ -+.L_N2: -+ move C0, C -+ move A0, A -+ PTR_SLLI T0, LDC, 2 -+ PTR_ADD C1, C0, T0 -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move OFF, OFFSET -+#endif -+ -+ /* if (!(M >> 4)) goto L_N2_M8 */ -+ PTR_SRAI I, M, 4 /* I = bm >> 4 */ -+ beq ZERO, I, .L_N2_M8 -+.align 5 -+.L_N2_M16: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x06 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x03 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 16 -+#else -+ /* number of values 
in B */ -+ PTR_ADDI L, OFF, 2 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1x16x2_START -+ -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N2_M16_L7 */ -+ beq ZERO,TL, .L_N2_M16_L7 -+.align 5 -+.L_N2_M16_TL1: /* TL-- */ -+ KERNEL8x16x2 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N2_M16_TL1 -+.L_N2_M16_L7: -+ /* if (!(L & 7)) goto L_N2_M16_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N2_M16_L0 -+.align 5 -+.L_N2_M16_L71: -+ KERNEL1x16x2 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N2_M16_L71 -+.L_N2_M16_L0: -+ SAVE16x2 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -16 -+#else -+ PTR_ADDI L, L, -2 -+#endif -+ PTR_SLLI T0, L, 0x06 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x03 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x10 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+ PTR_ADDI I, I, -1 /* I-- */ -+ blt ZERO,I, .L_N2_M16 -+.L_N2_M8: -+ /* We have done M & 16, considering M=8/4/2/1 */ -+ andi I, M, 15 -+ beq ZERO,I, .L_N2_M0 -+ -+ andi I, M, 8 -+ beq ZERO,I, .L_N2_M4 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x05 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x03 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 8 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 2 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx2_START 8, 0x20 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N2_M8_L7 */ -+ beq ZERO,TL, .L_N2_M8_L7 -+.align 5 -+.L_N2_M8_TL1: /* TL-- */ -+ KERNEL8xMx2 8, 0x20 -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N2_M8_TL1 -+.L_N2_M8_L7: -+ /* if (!(L & 7)) goto L_N2_M8_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N2_M8_L0 -+.align 5 -+.L_N2_M8_L71: -+ KERNEL1xMx2 8, 0x20 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N2_M8_L71 -+.L_N2_M8_L0: -+ SAVEMx2 8, 0x20 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -8 -+#else -+ PTR_ADDI L, L, -2 -+#endif -+ PTR_SLLI T0, L, 0x05 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x03 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x08 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_N2_M4: -+ andi I, M, 4 -+ beq ZERO,I, .L_N2_M2 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x04 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x03 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 4 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 2 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx2_START 4, 0x10 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N2_M4_L7 */ -+ beq ZERO,TL, .L_N2_M4_L7 -+.align 5 
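-+ /* Main K loop for the M=4, N=2 tail tile, unrolled by 8 */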
-+.L_N2_M4_TL1: /* TL-- */ -+ KERNEL8xMx2 4, 0x10 -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N2_M4_TL1 -+.L_N2_M4_L7: -+ /* if (!(L & 7)) goto L_N2_M4_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N2_M4_L0 -+.align 5 -+.L_N2_M4_L71: -+ KERNEL1xMx2 4, 0x10 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N2_M4_L71 -+.L_N2_M4_L0: -+ SAVEMx2 4, 0x10 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -4 -+#else -+ PTR_ADDI L, L, -2 -+#endif -+ PTR_SLLI T0, L, 0x04 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x03 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x04 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_N2_M2: -+ andi I, M, 2 -+ beq ZERO,I, .L_N2_M1 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x03 -+ PTR_ADD A0, A0, T0 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 2 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 2 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx2_START 2, 0x08 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N2_M2_L7 */ -+ beq ZERO,TL, .L_N2_M2_L7 -+.align 5 -+.L_N2_M2_TL1: /* TL-- */ -+ KERNEL8xMx2 2, 0x08 -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N2_M2_TL1 -+.L_N2_M2_L7: -+ /* if (!(L & 7)) goto L_N2_M2_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N2_M2_L0 -+.align 5 -+.L_N2_M2_L71: -+ KERNEL1xMx2 2, 0x08 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N2_M2_L71 -+.L_N2_M2_L0: -+ SAVEMx2 2, 0x08 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -2 -+#else -+ PTR_ADDI L, L, -2 -+#endif -+ PTR_SLLI T0, L, 0x03 -+ PTR_ADD A0, A0, T0 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x02 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_N2_M1: -+ andi I, M, 1 -+ beq ZERO,I, .L_N2_M0 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x02 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x03 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 1 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 2 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx2_START 1, 0x04 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N2_M1_L7 */ -+ beq ZERO,TL, .L_N2_M1_L7 -+.align 5 -+.L_N2_M1_TL1: /* TL-- */ -+ KERNEL8xMx2 1, 0x04 -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N2_M1_TL1 -+.L_N2_M1_L7: -+ /* if (!(L & 7)) goto L_N2_M1_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N2_M1_L0 -+.align 5 -+.L_N2_M1_L71: -+ KERNEL1xMx2 1, 0x04 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N2_M1_L71 -+.L_N2_M1_L0: -+ SAVEMx2 1, 0x04 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, 
L, -1 -+#else -+ PTR_ADDI L, L, -2 -+#endif -+ PTR_SLLI T0, L, 0x02 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x03 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x01 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_N2_M0: -+ /* Add stride for B and C -+ * B += 2 * K -+ * C += 2 * LDC -+ */ -+ PTR_SLLI T0, K, 3 -+ PTR_SLLI T1, LDC, 3 -+ PTR_ADD B, B, T0 -+ PTR_ADD C, C, T1 -+#if defined(TRMMKERNEL) && !defined(LEFT) -+ PTR_ADDI OFF, OFF, 0x02 -+#endif -+ /* We must reinit I */ -+ PTR_SRAI I, M, 4 /* I = bm >> 4 */ -+.L_N1: -+ andi J, N, 1 -+ beq ZERO, J, .L_N0 -+ move C0, C -+ move A0, A -+ -+#if defined(TRMMKERNEL) && defined(LEFT) -+ move OFF, OFFSET -+#endif -+ /* if (!(M >> 4)) goto L_N1_M8 */ -+ PTR_SRAI I, M, 4 /* I = bm >> 4 */ -+ beq ZERO, I, .L_N1_M8 -+.L_N1_M16: -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x06 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x02 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 16 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 1 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1x16x1_START -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N1_M16_L7 */ -+ beq ZERO,TL, .L_N1_M16_L7 -+.align 5 -+.L_N1_M16_TL1: /* TL-- */ -+ KERNEL8x16x1 -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N1_M16_TL1 -+.L_N1_M16_L7: -+ /* if (!(L & 7)) goto L_N1_M16_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N1_M16_L0 -+.align 5 -+.L_N1_M16_L71: -+ KERNEL1x16x1 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N1_M16_L71 -+.L_N1_M16_L0: -+ SAVE16x1 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -16 -+#else -+ PTR_ADDI L, L, -1 -+#endif -+ PTR_SLLI T0, L, 0x06 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x02 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x10 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+ PTR_ADDI I, I, -1 /* I-- */ -+ blt ZERO,I, .L_N1_M16 -+.L_N1_M8: -+ /* We have done M & 16, considering M=8/4/2/1 */ -+ andi I, M, 15 -+ beq ZERO,I, .L_N1_M0 -+ -+ andi I, M, 8 -+ beq ZERO,I, .L_N1_M4 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x05 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x02 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 8 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 1 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx1_START 8, 0x20 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N1_M8_L7 */ -+ beq ZERO,TL, .L_N1_M8_L7 -+.align 5 -+.L_N1_M8_TL1: /* TL-- */ -+ KERNEL8xMx1 8, 0x20 -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N1_M8_TL1 -+.L_N1_M8_L7: -+ /* if (!(L & 7)) goto L_N1_M8_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N1_M8_L0 -+.align 5 -+.L_N1_M8_L71: -+ KERNEL1xMx1 8, 0x20 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, 
.L_N1_M8_L71 -+.L_N1_M8_L0: -+ SAVEMx1 8, 0x20 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -8 -+#else -+ PTR_ADDI L, L, -1 -+#endif -+ PTR_SLLI T0, L, 0x05 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x02 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x08 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_N1_M4: -+ andi I, M, 4 -+ beq ZERO,I, .L_N1_M2 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x04 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x02 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 4 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 1 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx1_START 4, 0x10 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N1_M4_L7 */ -+ beq ZERO,TL, .L_N1_M4_L7 -+.align 5 -+.L_N1_M4_TL1: /* TL-- */ -+ KERNEL8xMx1 4, 0x10 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N1_M4_TL1 -+.L_N1_M4_L7: -+ /* if (!(L & 7)) goto L_N1_M4_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N1_M4_L0 -+.align 5 -+.L_N1_M4_L71: -+ KERNEL1xMx1 4, 0x10 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N1_M4_L71 -+.L_N1_M4_L0: -+ SAVEMx1 4, 0x10 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -4 -+#else -+ PTR_ADDI L, L, -1 -+#endif -+ PTR_SLLI T0, L, 0x04 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x02 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x04 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_N1_M2: -+ andi I, M, 2 -+ beq ZERO,I, .L_N1_M1 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x03 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, OFF, 0x02 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 2 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 1 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx1_START 2, 0x08 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N1_M2_L7 */ -+ beq ZERO,TL, .L_N1_M2_L7 -+.align 5 -+.L_N1_M2_TL1: /* TL-- */ -+ KERNEL8xMx1 2, 0x08 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N1_M2_TL1 -+.L_N1_M2_L7: -+ /* if (!(L & 7)) goto L_N1_M2_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N1_M2_L0 -+.align 5 -+.L_N1_M2_L71: -+ KERNEL1xMx1 2, 0x08 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N1_M2_L71 -+.L_N1_M2_L0: -+ SAVEMx1 2, 0x08 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -2 -+#else -+ PTR_ADDI L, L, -1 -+#endif -+ PTR_SLLI T0, L, 0x03 -+ PTR_ADD A0, A0, T0 -+ PTR_SLLI T0, L, 0x02 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x02 -+#endif -+#endif // #if defined(TRMMKERNEL) -+ -+.L_N1_M1: 
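-+ /* Last corner case: one remaining row of A times the single remaining column of B */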
-+ andi I, M, 1 -+ beq ZERO,I, .L_N1_M0 -+ -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ move B0, B -+#else -+ PTR_SLLI T0, OFF, 0x02 -+ PTR_ADD A0, A0, T0 -+ PTR_ADD B0, B, T0 -+#endif -+ -+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#elif defined(LEFT) -+ /* number of values in A */ -+ PTR_ADDI L, OFF, 1 -+#else -+ /* number of values in B */ -+ PTR_ADDI L, OFF, 1 -+#endif -+#else // #if !defined(TRMMKERNEL) -+ move B0, B -+ move L, K /* L = bk */ -+#endif -+ KERNEL1xMx1_START 1, 0x04 -+ /* Reduce L */ -+ PTR_ADDI L, L, -1 -+ PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ -+ /* if (TL < 1) goto L_N1_M1_L7 */ -+ beq ZERO,TL, .L_N1_M1_L7 -+.align 5 -+.L_N1_M1_TL1: /* TL-- */ -+ KERNEL8xMx1 1, 0x04 -+ -+ PTR_ADDI TL, TL, -1 /* TL-- */ -+ blt ZERO,TL, .L_N1_M1_TL1 -+.L_N1_M1_L7: -+ /* if (!(L & 7)) goto L_N1_M1_L0 */ -+ andi TL, L, 7 -+ beq TL, ZERO,.L_N1_M1_L0 -+.align 5 -+.L_N1_M1_L71: -+ KERNEL1xMx1 1, 0x04 -+ PTR_ADDI TL, TL, -1 -+ blt ZERO,TL, .L_N1_M1_L71 -+.L_N1_M1_L0: -+ SAVEMx1 1, 0x04 -+#if defined(TRMMKERNEL) -+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -+ PTR_SUB L, K, OFF -+#ifdef LEFT -+ PTR_ADDI L, L, -1 -+#else -+ PTR_ADDI L, L, -1 -+#endif -+ PTR_SLLI T0, L, 0x02 -+ PTR_ADD A0, A0, T0 -+ PTR_ADD B0, B0, T0 -+#endif -+ -+#ifdef LEFT -+ PTR_ADDI OFF, OFF, 0x01 -+#endif -+#endif // #if defined(TRMMKERNEL) -+.L_N1_M0: -+.L_N0: -+ pop_if_used 26, 32 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/kernel/loongarch64/sgemm_ncopy_16_lasx.S b/kernel/loongarch64/sgemm_ncopy_16_lasx.S -new file mode 100644 -index 000000000..266c07c5c ---- /dev/null -+++ b/kernel/loongarch64/sgemm_ncopy_16_lasx.S -@@ -0,0 +1,463 @@ -+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "loongarch64_asm.S" -+ -+/********************************************************************* -+* 2023/08/23 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+*********************************************************************/ -+ -+/* Function parameters */ -+#define M $r4 // param 1: m -+#define N $r5 // param 2: n -+#define SRC $r6 // param 3: src -+#define LDA $r7 // param 4: lda -+#define DST $r8 // param 5: dst -+ -+#define I $r9 -+#define J $r10 -+#define S1 $r12 -+#define S2 $r13 -+#define S3 $r14 -+#define S4 $r15 -+#define S5 $r16 -+#define S6 $r17 -+#define S7 $r18 -+#define S8 $r19 -+#define S9 $r20 -+#define S10 $r23 -+#define S11 $r24 -+#define S12 $r25 -+#define S13 $r26 -+#define S14 $r27 -+#define S15 $r28 -+#define S16 $r29 -+#define TD $r30 -+#define TS $r31 -+#define TL $r7 -+#define T0 $r6 -+#undef ZERO -+#define ZERO $r0 -+ -+#define F0 $f0 -+#define F1 $f1 -+#define F2 $f2 -+#define F3 $f3 -+#define F4 $f4 -+#define F5 $f5 -+#define F6 $f6 -+#define F7 $f7 -+/* LASX vectors */ -+#define U0 $xr0 -+#define U1 $xr1 -+#define U2 $xr2 -+#define U3 $xr3 -+#define U4 $xr4 -+#define U5 $xr5 -+#define U6 $xr6 -+#define U7 $xr7 -+#define U8 $xr8 -+#define U9 $xr9 -+#define U10 $xr10 -+#define U11 $xr11 -+#define U12 $xr12 -+#define U13 $xr13 -+#define U14 $xr14 -+#define U15 $xr15 -+#define D0 $xr16 -+#define D1 $xr17 -+#define D2 $xr18 -+#define D3 $xr19 -+#define D4 $xr20 -+#define D5 $xr21 -+#define D6 $xr22 -+#define D7 $xr23 -+#define D8 $xr24 -+#define D9 $xr25 -+#define D10 $xr26 -+#define D11 $xr27 -+#define D12 $xr28 -+#define D13 $xr29 -+#define D14 $xr30 -+#define D15 $xr31 -+ -+// Loops outline -+//.L_N16 <------------------- -+//| .L_M8: | -+//| .L_M7: | Main Loop -+//| .L_M1: | -+//| .L_M0: --------------- -+//.L_N15: -+//.L_N8: -+//| .L_N8_M8: -+//| .L_N8_M7: -+//| .L_N8_M1: -+//.L_N7: -+//.L_N4: -+//| .L_N4_M4: -+//| .L_N4_M3: -+//| .L_N4_M1: -+//.L_N3: -+//.L_N2: -+//| .L_N2_M2: -+//| .L_N2_M1: -+//.L_N1: -+//| .L_N1_M1: -+//.L_N0 -+ -+ PROLOGUE -+ push_if_used 26, 32 -+ -+ move TD, DST -+ move TS, SRC -+ PTR_SLLI TL, LDA, 0x02 -+ PTR_SLLI T0, TL, 0x01 -+ PTR_SRAI J, N, 0x04 -+ beq J, ZERO, .L_N15 -+.align 5 -+.L_N16: -+ move S1, TS -+ PTR_ADD S2, TS, TL -+ PTR_SRAI I, M, 0x03 -+ PTR_ADD S3, S2, TL -+ PTR_ADDI J, J, -1 -+ PTR_ADD S4, S3, TL -+ PTR_ADD S5, S3, T0 -+ PTR_ADD S6, S4, T0 -+ PTR_ADD S7, S5, T0 -+ PTR_ADD S8, S6, T0 -+ PTR_ADD S9, S7, T0 -+ PTR_ADD S10, S8, T0 -+ PTR_ADD S11, S9, T0 -+ PTR_ADD S12, S10, T0 -+ PTR_ADD S13, S11, T0 -+ PTR_ADD S14, S12, T0 -+ PTR_ADD S15, S13, T0 -+ PTR_ADD S16, S14, T0 -+ PTR_ADD TS, S15, T0 -+ beq I, ZERO, .L_M7 -+.align 5 -+.L_M8: -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ xvld U4, S5, 0x00 -+ xvld U5, S6, 0x00 -+ xvld U6, S7, 0x00 -+ xvld U7, S8, 0x00 -+ xvld U8, S9, 0x00 -+ xvld U9, S10, 0x00 -+ xvld U10, S11, 0x00 -+ xvld U11, S12, 0x00 -+ xvld U12, S13, 0x00 -+ xvld U13, S14, 0x00 -+ xvld U14, S15, 0x00 -+ xvld U15, S16, 0x00 -+ -+ GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ -+ U0, U1, U2, U3, U4, U5, U6, U7, \ -+ D1, D3, D5, D7 // As tmp -+ GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \ -+ U8, U9, U10, U11, U12, U13, U14, U15, \ -+ U0, U1, U2, U3 // As tmp -+ GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \ -+ D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0 -+ PTR_ADDI TD, TD, 0x100 
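-+ /* Store the remaining four packed rows (D8-D15) of this 8x16 tile */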
-+ GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \ -+ D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0 -+ PTR_ADDI TD, TD, 0x100 -+ PTR_ADDI S1, S1, 0x20 -+ PTR_ADDI S2, S2, 0x20 -+ PTR_ADDI S3, S3, 0x20 -+ PTR_ADDI S4, S4, 0x20 -+ PTR_ADDI S5, S5, 0x20 -+ PTR_ADDI S6, S6, 0x20 -+ PTR_ADDI S7, S7, 0x20 -+ PTR_ADDI S8, S8, 0x20 -+ PTR_ADDI S9, S9, 0x20 -+ PTR_ADDI S10, S10, 0x20 -+ PTR_ADDI S11, S11, 0x20 -+ PTR_ADDI S12, S12, 0x20 -+ PTR_ADDI S13, S13, 0x20 -+ PTR_ADDI S14, S14, 0x20 -+ PTR_ADDI S15, S15, 0x20 -+ PTR_ADDI S16, S16, 0x20 -+ -+ PTR_ADDI I, I, -1 -+ blt ZERO, I, .L_M8 -+.L_M7: -+ andi I, M, 0x07 -+ beq I, ZERO, .L_M0 -+.align 5 -+.L_M1: -+ fld.s F0, S1, 0x00 -+ fld.s F1, S2, 0x00 -+ fld.s F2, S3, 0x00 -+ fld.s F3, S4, 0x00 -+ fld.s F4, S5, 0x00 -+ fld.s F5, S6, 0x00 -+ fld.s F6, S7, 0x00 -+ fld.s F7, S8, 0x00 -+ -+ fst.s F0, TD, 0x00 -+ fst.s F1, TD, 0x04 -+ fst.s F2, TD, 0x08 -+ fst.s F3, TD, 0x0C -+ fst.s F4, TD, 0x10 -+ fst.s F5, TD, 0x14 -+ fst.s F6, TD, 0x18 -+ fst.s F7, TD, 0x1C -+ -+ PTR_ADDI S1, S1, 0x04 -+ PTR_ADDI S2, S2, 0x04 -+ PTR_ADDI S3, S3, 0x04 -+ PTR_ADDI S4, S4, 0x04 -+ PTR_ADDI S5, S5, 0x04 -+ PTR_ADDI S6, S6, 0x04 -+ PTR_ADDI S7, S7, 0x04 -+ PTR_ADDI S8, S8, 0x04 -+ PTR_ADDI TD, TD, 0x20 -+ -+ fld.s F0, S9, 0x00 -+ fld.s F1, S10, 0x00 -+ fld.s F2, S11, 0x00 -+ fld.s F3, S12, 0x00 -+ fld.s F4, S13, 0x00 -+ fld.s F5, S14, 0x00 -+ fld.s F6, S15, 0x00 -+ fld.s F7, S16, 0x00 -+ -+ fst.s F0, TD, 0x00 -+ fst.s F1, TD, 0x04 -+ fst.s F2, TD, 0x08 -+ fst.s F3, TD, 0x0C -+ fst.s F4, TD, 0x10 -+ fst.s F5, TD, 0x14 -+ fst.s F6, TD, 0x18 -+ fst.s F7, TD, 0x1C -+ -+ PTR_ADDI S9, S9, 0x04 -+ PTR_ADDI S10, S10, 0x04 -+ PTR_ADDI S11, S11, 0x04 -+ PTR_ADDI S12, S12, 0x04 -+ PTR_ADDI S13, S13, 0x04 -+ PTR_ADDI S14, S14, 0x04 -+ PTR_ADDI S15, S15, 0x04 -+ PTR_ADDI S16, S16, 0x04 -+ PTR_ADDI TD, TD, 0x20 -+ -+ PTR_ADDI I, I, -1 -+ blt ZERO, I, .L_M1 -+.L_M0: -+ blt ZERO, J, .L_N16 -+.L_N15: -+ andi J, N, 0x0f -+ beq ZERO, J, .L_N0 -+ -+ andi J, N, 0x08 -+ beq ZERO, J, .L_N7 -+.L_N8: -+ move S1, TS -+ PTR_ADD S2, TS, TL -+ PTR_SRAI I, M, 0x03 -+ PTR_ADD S3, S2, TL -+ PTR_ADD S4, S2, T0 -+ PTR_ADD S5, S3, T0 -+ PTR_ADD S6, S4, T0 -+ PTR_ADD S7, S5, T0 -+ PTR_ADD S8, S6, T0 -+ PTR_ADD TS, S7, T0 -+ beq I, ZERO, .L_N8_M7 -+.align 5 -+.L_N8_M8: -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ xvld U4, S5, 0x00 -+ xvld U5, S6, 0x00 -+ xvld U6, S7, 0x00 -+ xvld U7, S8, 0x00 -+ -+ GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ -+ U0, U1, U2, U3, U4, U5, U6, U7, \ -+ D1, D3, D5, D7 // As tmp -+ GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ -+ D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 -+ PTR_ADDI TD, TD, 0x100 -+ PTR_ADDI S1, S1, 0x20 -+ PTR_ADDI S2, S2, 0x20 -+ PTR_ADDI S3, S3, 0x20 -+ PTR_ADDI S4, S4, 0x20 -+ PTR_ADDI S5, S5, 0x20 -+ PTR_ADDI S6, S6, 0x20 -+ PTR_ADDI S7, S7, 0x20 -+ PTR_ADDI S8, S8, 0x20 -+ -+ PTR_ADDI I, I, -1 -+ blt ZERO, I, .L_N8_M8 -+.L_N8_M7: -+ andi I, M, 0x07 -+ beq I, ZERO, .L_N7 -+.align 5 -+.L_N8_M1: -+ fld.s F0, S1, 0x00 -+ fld.s F1, S2, 0x00 -+ fld.s F2, S3, 0x00 -+ fld.s F3, S4, 0x00 -+ fld.s F4, S5, 0x00 -+ fld.s F5, S6, 0x00 -+ fld.s F6, S7, 0x00 -+ fld.s F7, S8, 0x00 -+ -+ fst.s F0, TD, 0x00 -+ PTR_ADDI S1, S1, 0x04 -+ fst.s F1, TD, 0x04 -+ PTR_ADDI S2, S2, 0x04 -+ fst.s F2, TD, 0x08 -+ PTR_ADDI S3, S3, 0x04 -+ fst.s F3, TD, 0x0C -+ PTR_ADDI S4, S4, 0x04 -+ fst.s F4, TD, 0x10 -+ PTR_ADDI S5, S5, 0x04 -+ fst.s F5, TD, 0x14 -+ PTR_ADDI S6, 
S6, 0x04 -+ fst.s F6, TD, 0x18 -+ PTR_ADDI S7, S7, 0x04 -+ fst.s F7, TD, 0x1C -+ PTR_ADDI S8, S8, 0x04 -+ -+ PTR_ADDI TD, TD, 0x20 -+ PTR_ADDI I, I, -1 -+ blt ZERO, I, .L_N8_M1 -+.L_N7: -+ andi J, N, 0x07 -+ beq ZERO, J, .L_N0 -+ -+ andi J, N, 0x04 -+ beq ZERO, J, .L_N3 -+.L_N4: -+ move S1, TS -+ PTR_ADD S2, TS, TL -+ PTR_SRAI I, M, 0x02 -+ PTR_ADD S3, S2, TL -+ PTR_ADD S4, S2, T0 -+ PTR_ADD TS, S3, T0 -+ beq I, ZERO, .L_N4_M3 -+.align 5 -+.L_N4_M4: -+ GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 -+ GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 -+ GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 -+ GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 -+ GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 -+ GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 -+ PTR_ADDI S1, S1, 0x10 -+ PTR_ADDI S2, S2, 0x10 -+ PTR_ADDI S3, S3, 0x10 -+ PTR_ADDI S4, S4, 0x10 -+ PTR_ADDI TD, TD, 0x40 -+ PTR_ADDI I, I, -1 -+ blt ZERO, I, .L_N4_M4 -+.L_N4_M3: -+ andi I, M, 0x03 -+ beq I, ZERO, .L_N3 -+.align 5 -+.L_N4_M1: -+ fld.s F0, S1, 0x00 -+ fld.s F1, S2, 0x00 -+ fld.s F2, S3, 0x00 -+ fld.s F3, S4, 0x00 -+ -+ fst.s F0, TD, 0x00 -+ PTR_ADDI S1, S1, 0x04 -+ fst.s F1, TD, 0x04 -+ PTR_ADDI S2, S2, 0x04 -+ fst.s F2, TD, 0x08 -+ PTR_ADDI S3, S3, 0x04 -+ fst.s F3, TD, 0x0C -+ PTR_ADDI S4, S4, 0x04 -+ -+ PTR_ADDI TD, TD, 0x10 -+ PTR_ADDI I, I, -1 -+ blt ZERO, I, .L_N4_M1 -+.L_N3: -+ andi J, N, 0x03 -+ beq ZERO, J, .L_N0 -+ -+ andi J, N, 0x02 -+ beq ZERO, J, .L_N1 -+.L_N2: -+ move S1, TS -+ PTR_ADD S2, TS, TL -+ PTR_SRAI I, M, 0x01 -+ PTR_ADD TS, S2, TL -+ beq I, ZERO, .L_N2_M1 -+.align 5 -+.L_N2_M2: -+ GLD f, d, F0, S1, 0x00, F1, S2, 0x00 -+ vilvl.w $vr0, $vr1, $vr0 -+ GST v, , $vr0, TD, 0x00 -+ PTR_ADDI S1, S1, 0x08 -+ PTR_ADDI S2, S2, 0x08 -+ PTR_ADDI TD, TD, 0x10 -+ -+ PTR_ADDI I, I, -1 -+ blt ZERO, I, .L_N2_M2 -+.L_N2_M1: -+ andi I, M, 0x01 -+ beq I, ZERO, .L_N1 -+ -+ fld.s F0, S1, 0x00 -+ fld.s F1, S2, 0x00 -+ -+ fst.s F0, TD, 0x00 -+ PTR_ADDI S1, S1, 0x04 -+ fst.s F1, TD, 0x04 -+ PTR_ADDI S2, S2, 0x04 -+ PTR_ADDI TD, TD, 0x08 -+.align 5 -+.L_N1: -+ move S1, TS -+ beq ZERO, M, .L_N0 -+.L_N1_M1: -+ fld.s F0, S1, 0x00 -+ PTR_ADDI S1, S1, 0x04 -+ fst.s F0, TD, 0x00 -+ PTR_ADDI TD, TD, 0x04 -+ PTR_ADDI M, M, -1 -+ blt ZERO, M, .L_N1_M1 -+.L_N0: -+ pop_if_used 26, 32 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/kernel/loongarch64/sgemm_ncopy_8_lasx.S b/kernel/loongarch64/sgemm_ncopy_8_lasx.S -new file mode 100644 -index 000000000..5c173568b ---- /dev/null -+++ b/kernel/loongarch64/sgemm_ncopy_8_lasx.S -@@ -0,0 +1,298 @@ -+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. 
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "loongarch64_asm.S" -+ -+/********************************************************************* -+* 2023/08/23 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+*********************************************************************/ -+ -+/* Function parameters */ -+#define M $r4 // param 1: m -+#define N $r5 // param 2: n -+#define SRC $r6 // param 3: src -+#define LDA $r7 // param 4: lda -+#define DST $r8 // param 5: dst -+ -+#define I $r9 -+#define J $r10 -+#define S1 $r12 -+#define S2 $r13 -+#define S3 $r14 -+#define S4 $r15 -+#define S5 $r16 -+#define S6 $r17 -+#define S7 $r18 -+#define S8 $r19 -+#define TD $r20 -+#define TS $r11 -+#define TL $r7 -+#define T0 $r6 -+#undef ZERO -+#define ZERO $r0 -+ -+#define F0 $f0 -+#define F1 $f1 -+#define F2 $f2 -+#define F3 $f3 -+#define F4 $f4 -+#define F5 $f5 -+#define F6 $f6 -+#define F7 $f7 -+/* LASX vectors */ -+#define U0 $xr0 -+#define U1 $xr1 -+#define U2 $xr2 -+#define U3 $xr3 -+#define U4 $xr4 -+#define U5 $xr5 -+#define U6 $xr6 -+#define U7 $xr7 -+#define D0 $xr8 -+#define D1 $xr9 -+#define D2 $xr10 -+#define D3 $xr11 -+#define D4 $xr12 -+#define D5 $xr13 -+#define D6 $xr14 -+#define D7 $xr15 -+#define D8 $xr16 -+#define D10 $xr17 -+#define D12 $xr18 -+#define D14 $xr19 -+ -+// Loops outline -+//.L_N8: <---------------- -+//| .L_M8: | -+//| .L_M7: | Main Loop -+//| .L_M1: | -+//| .L_M0:-------------- -+//.L_N7: -+//.L_N4: -+//| .L_N4_M4: -+//| .L_N4_M3: -+//| .L_N4_M1: -+//.L_N3: -+//.L_N2: -+//| .L_N2_M2: -+//| .L_N2_M1: -+//.L_N1: -+//| .L_N1_M1: -+//.L_N0 -+ -+ PROLOGUE -+ push_if_used 17, 20 -+ -+ move TD, DST -+ move TS, SRC -+ PTR_SLLI TL, LDA, 0x02 -+ PTR_SLLI T0, TL, 0x01 -+ PTR_SRAI J, N, 0x03 -+ beq J, ZERO, .L_N7 -+.align 5 -+.L_N8: -+ move S1, TS -+ PTR_ADD S2, TS, TL -+ PTR_SRAI I, M, 0x03 -+ PTR_ADD S3, S2, TL -+ PTR_ADDI J, J, -1 -+ PTR_ADD S4, S2, T0 -+ PTR_ADD S5, S3, T0 -+ PTR_ADD S6, S4, T0 -+ PTR_ADD S7, S5, T0 -+ PTR_ADD S8, S6, T0 -+ PTR_ADD TS, S7, T0 -+ beq I, ZERO, .L_M7 -+.align 5 -+.L_M8: -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ xvld U4, S5, 0x00 -+ xvld U5, S6, 0x00 -+ xvld U6, S7, 0x00 -+ xvld U7, S8, 0x00 -+ -+ GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ -+ U0, U1, U2, U3, U4, U5, U6, U7, \ -+ D1, D3, D5, D7 // As tmp -+ GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ -+ D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 -+ PTR_ADDI TD, TD, 0x100 -+ PTR_ADDI S1, S1, 0x20 -+ PTR_ADDI S2, S2, 0x20 -+ PTR_ADDI S3, S3, 0x20 -+ PTR_ADDI S4, S4, 0x20 -+ PTR_ADDI S5, S5, 0x20 -+ PTR_ADDI S6, S6, 0x20 -+ PTR_ADDI S7, S7, 0x20 -+ 
PTR_ADDI S8, S8, 0x20 -+ PTR_ADDI I, I, -1 -+ blt ZERO, I, .L_M8 -+.L_M7: -+ andi I, M, 0x07 -+ beq I, ZERO, .L_M0 -+.align 5 -+.L_M1: -+ fld.s F0, S1, 0x00 -+ fld.s F1, S2, 0x00 -+ fld.s F2, S3, 0x00 -+ fld.s F3, S4, 0x00 -+ fld.s F4, S5, 0x00 -+ fld.s F5, S6, 0x00 -+ fld.s F6, S7, 0x00 -+ fld.s F7, S8, 0x00 -+ -+ fst.s F0, TD, 0x00 -+ PTR_ADDI S1, S1, 0x04 -+ fst.s F1, TD, 0x04 -+ PTR_ADDI S2, S2, 0x04 -+ fst.s F2, TD, 0x08 -+ PTR_ADDI S3, S3, 0x04 -+ fst.s F3, TD, 0x0C -+ PTR_ADDI S4, S4, 0x04 -+ fst.s F4, TD, 0x10 -+ PTR_ADDI S5, S5, 0x04 -+ fst.s F5, TD, 0x14 -+ PTR_ADDI S6, S6, 0x04 -+ fst.s F6, TD, 0x18 -+ PTR_ADDI S7, S7, 0x04 -+ fst.s F7, TD, 0x1C -+ PTR_ADDI S8, S8, 0x04 -+ -+ PTR_ADDI TD, TD, 0x20 -+ PTR_ADDI I, I, -1 -+ blt ZERO, I, .L_M1 -+.L_M0: -+ blt ZERO, J, .L_N8 -+.L_N7: -+ andi J, N, 0x07 -+ beq ZERO, J, .L_N0 -+ -+ andi J, N, 0x04 -+ beq ZERO, J, .L_N3 -+.L_N4: -+ move S1, TS -+ PTR_ADD S2, TS, TL -+ PTR_SRAI I, M, 0x02 -+ PTR_ADD S3, S2, TL -+ PTR_ADD S4, S2, T0 -+ PTR_ADD TS, S3, T0 -+ beq I, ZERO, .L_N4_M3 -+.align 5 -+.L_N4_M4: -+ GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 -+ GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 -+ GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 -+ GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 -+ GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 -+ GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 -+ PTR_ADDI S1, S1, 0x10 -+ PTR_ADDI S2, S2, 0x10 -+ PTR_ADDI S3, S3, 0x10 -+ PTR_ADDI S4, S4, 0x10 -+ PTR_ADDI TD, TD, 0x40 -+ PTR_ADDI I, I, -1 -+ blt ZERO, I, .L_N4_M4 -+.L_N4_M3: -+ andi I, M, 0x03 -+ beq I, ZERO, .L_N3 -+.align 5 -+.L_N4_M1: -+ fld.s F0, S1, 0x00 -+ fld.s F1, S2, 0x00 -+ fld.s F2, S3, 0x00 -+ fld.s F3, S4, 0x00 -+ -+ fst.s F0, TD, 0x00 -+ PTR_ADDI S1, S1, 0x04 -+ fst.s F1, TD, 0x04 -+ PTR_ADDI S2, S2, 0x04 -+ fst.s F2, TD, 0x08 -+ PTR_ADDI S3, S3, 0x04 -+ fst.s F3, TD, 0x0C -+ PTR_ADDI S4, S4, 0x04 -+ -+ PTR_ADDI TD, TD, 0x10 -+ PTR_ADDI I, I, -1 -+ blt ZERO, I, .L_N4_M1 -+.L_N3: -+ andi J, N, 0x03 -+ beq ZERO, J, .L_N0 -+ -+ andi J, N, 0x02 -+ beq ZERO, J, .L_N1 -+.L_N2: -+ move S1, TS -+ PTR_ADD S2, TS, TL -+ PTR_SRAI I, M, 0x01 -+ PTR_ADD TS, S2, TL -+ beq I, ZERO, .L_N2_M1 -+.align 5 -+.L_N2_M2: -+ GLD f, d, F0, S1, 0x00, F1, S2, 0x00 -+ vilvl.w $vr0, $vr1, $vr0 -+ GST v, , $vr0, TD, 0x00 -+ PTR_ADDI S1, S1, 0x08 -+ PTR_ADDI S2, S2, 0x08 -+ PTR_ADDI TD, TD, 0x10 -+ -+ PTR_ADDI I, I, -1 -+ blt ZERO, I, .L_N2_M2 -+.L_N2_M1: -+ andi I, M, 0x01 -+ beq I, ZERO, .L_N1 -+ -+ fld.s F0, S1, 0x00 -+ fld.s F1, S2, 0x00 -+ -+ fst.s F0, TD, 0x00 -+ PTR_ADDI S1, S1, 0x04 -+ fst.s F1, TD, 0x04 -+ PTR_ADDI S2, S2, 0x04 -+ PTR_ADDI TD, TD, 0x08 -+.align 5 -+.L_N1: -+ move S1, TS -+ beq ZERO, M, .L_N0 -+.L_N1_M1: -+ fld.s F0, S1, 0x00 -+ PTR_ADDI S1, S1, 0x04 -+ fst.s F0, TD, 0x00 -+ PTR_ADDI TD, TD, 0x04 -+ PTR_ADDI M, M, -1 -+ blt ZERO, M, .L_N1_M1 -+.L_N0: -+ pop_if_used 17, 20 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/kernel/loongarch64/sgemm_tcopy_16_lasx.S b/kernel/loongarch64/sgemm_tcopy_16_lasx.S -new file mode 100644 -index 000000000..d9789bdcd ---- /dev/null -+++ b/kernel/loongarch64/sgemm_tcopy_16_lasx.S -@@ -0,0 +1,526 @@ -+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. 
Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "loongarch64_asm.S" -+ -+/********************************************************************* -+* 2023/08/23 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+*********************************************************************/ -+ -+/* Function parameters */ -+#define M $r4 // param 1: m -+#define N $r5 // param 2: n -+#define SRC $r6 // param 3: src -+#define LDA $r7 // param 4: lda -+#define DST $r8 // param 5: dst -+ -+#define I $r9 -+#define J $r10 -+#define S0 $r11 -+#define S1 $r12 -+#define S2 $r13 -+#define S3 $r14 -+#define S4 $r15 -+#define S5 $r16 -+#define S6 $r17 -+#define S7 $r18 -+#define S8 $r19 -+#define P0 $r20 -+#define P1 $r23 -+#define P2 $r24 -+#define P3 $r25 -+#define P4 $r26 -+#define P5 $r27 -+#define T0 $r28 -+#define T1 $r29 -+#define TL $r7 -+#define ZERO $r0 -+ -+/* LASX vectors */ -+#define U0 $xr0 -+#define U1 $xr1 -+#define U2 $xr2 -+#define U3 $xr3 -+#define U4 $xr4 -+#define U5 $xr5 -+#define U6 $xr6 -+#define U7 $xr7 -+ -+// Loops outline -+//.L_M8 <------------------- -+//| .L_N16: | -+//| .L_N15: | -+//| .L_N8: | -+//| .L_N7: | Main Loop -+//| .L_N4: | -+//| .L_N3: | -+//| .L_N2: | -+//| .L_N1: | -+//| .L_N0: --------------- -+//.L_M7 -+//.L_M4 -+//| .L_M4_N16: -+//| .L_M4_N15: -+//| .L_M4_N8: -+//| .L_M4_N7: -+//| .L_M4_N4: -+//| .L_M4_N3: -+//| .L_M4_N2: -+//| .L_M4_N1: -+//.L_M3 -+//.L_M2 -+//| .L_M2_N16: -+//| .L_M2_N15: -+//| .L_M2_N8: -+//| .L_M2_N7: -+//| .L_M2_N4: -+//| .L_M2_N3: -+//| .L_M2_N2: -+//| .L_M2_N1: -+//.L_M1 -+//| .L_M1_N16: -+//| .L_M1_N15: -+//| .L_M1_N8: -+//| .L_M1_N7: -+//| .L_M1_N4: -+//| .L_M1_N3: -+//| .L_M1_N2: -+//| .L_M1_N1: -+//.L_M0 -+ -+ PROLOGUE -+ push_if_used 24, 8 -+ -+ move S0, SRC -+ move P0, DST -+ -+ PTR_SRAI T0, N, 0x04 -+ PTR_SRAI T1, N, 0x03 -+ PTR_SLLI T0, T0, 0x04 -+ PTR_SLLI T1, T1, 0x03 -+ -+ PTR_MUL P2, M, T0 -+ PTR_MUL P3, M, T1 -+ PTR_SLLI P2, P2, 0x02 -+ PTR_SLLI P3, P3, 0x02 -+ PTR_ADD P2, DST, P2 -+ PTR_ADD P3, DST, P3 -+ -+ PTR_SRAI T0, N, 0x02 -+ PTR_SRAI T1, N, 0x01 -+ PTR_SLLI T0, T0, 0x02 -+ PTR_SLLI T1, T1, 0x01 -+ PTR_MUL P4, M, T0 -+ PTR_MUL P5, M, T1 -+ 
PTR_SLLI P4, P4, 0x02 -+ PTR_SLLI P5, P5, 0x02 -+ PTR_ADD P4, DST, P4 -+ PTR_ADD P5, DST, P5 -+ -+ PTR_SLLI TL, LDA, 0x02 -+ PTR_SRAI J, M, 0x03 -+ PTR_SLLI T0, TL, 0x01 -+ PTR_SLLI T1, M, 0x06 -+ beq ZERO, J, .L_M7 -+.align 5 -+.L_M8: -+ move S1, S0 -+ PTR_ADD S2, S0, TL -+ PTR_ADD S3, S1, T0 -+ PTR_ADD S4, S2, T0 -+ PTR_ADD S5, S3, T0 -+ PTR_ADD S6, S4, T0 -+ PTR_ADD S7, S5, T0 -+ PTR_ADD S8, S6, T0 -+ PTR_ADD S0, S7, T0 -+ -+ move P1, P0 -+ PTR_ADDI P0, P0, 0x200 -+ -+ PTR_SRAI I, N, 0x04 -+ PTR_ADDI J, J, -1 -+ beq ZERO, I, .L_N15 -+.L_N16: -+ xvld U0, S1, 0x00 -+ xvld U1, S1, 0x20 -+ xvld U2, S2, 0x00 -+ xvld U3, S2, 0x20 -+ -+ xvst U0, P1, 0x00 -+ xvst U1, P1, 0x20 -+ xvst U2, P1, 0x40 -+ xvst U3, P1, 0x60 -+ -+ xvld U4, S3, 0x00 -+ xvld U5, S3, 0x20 -+ xvld U6, S4, 0x00 -+ xvld U7, S4, 0x20 -+ -+ xvst U4, P1, 0x80 -+ xvst U5, P1, 0xA0 -+ xvst U6, P1, 0xC0 -+ xvst U7, P1, 0xE0 -+ -+ xvld U0, S5, 0x00 -+ xvld U1, S5, 0x20 -+ xvld U2, S6, 0x00 -+ xvld U3, S6, 0x20 -+ -+ xvst U0, P1, 0x100 -+ xvst U1, P1, 0x120 -+ xvst U2, P1, 0x140 -+ xvst U3, P1, 0x160 -+ -+ xvld U4, S7, 0x00 -+ xvld U5, S7, 0x20 -+ xvld U6, S8, 0x00 -+ xvld U7, S8, 0x20 -+ -+ xvst U4, P1, 0x180 -+ xvst U5, P1, 0x1A0 -+ xvst U6, P1, 0x1C0 -+ xvst U7, P1, 0x1E0 -+ -+ PTR_ADDI S1, S1, 0x40 -+ PTR_ADDI S2, S2, 0x40 -+ PTR_ADDI S3, S3, 0x40 -+ PTR_ADDI S4, S4, 0x40 -+ PTR_ADDI S5, S5, 0x40 -+ PTR_ADDI S6, S6, 0x40 -+ PTR_ADDI S7, S7, 0x40 -+ PTR_ADDI S8, S8, 0x40 -+ -+ PTR_ADDI I, I, -1 -+ PTR_ADD P1, P1, T1 -+ blt ZERO, I, .L_N16 -+.L_N15: -+ andi I, N, 0x08 -+ beq ZERO, I, .L_N7 -+.L_N8: -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ xvld U4, S5, 0x00 -+ xvld U5, S6, 0x00 -+ xvld U6, S7, 0x00 -+ xvld U7, S8, 0x00 -+ -+ GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \ -+ U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0 -+ -+ PTR_ADDI S1, S1, 0x20 -+ PTR_ADDI S2, S2, 0x20 -+ PTR_ADDI S3, S3, 0x20 -+ PTR_ADDI S4, S4, 0x20 -+ PTR_ADDI S5, S5, 0x20 -+ PTR_ADDI S6, S6, 0x20 -+ PTR_ADDI S7, S7, 0x20 -+ PTR_ADDI S8, S8, 0x20 -+ PTR_ADDI P2, P2, 0x100 -+.L_N7: -+ andi I, N, 0x04 -+ beq ZERO, I, .L_N3 -+.L_N4: -+ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ -+ $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 -+ GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \ -+ $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70 -+ PTR_ADDI S1, S1, 0x10 -+ PTR_ADDI S2, S2, 0x10 -+ PTR_ADDI S3, S3, 0x10 -+ PTR_ADDI S4, S4, 0x10 -+ PTR_ADDI S5, S5, 0x10 -+ PTR_ADDI S6, S6, 0x10 -+ PTR_ADDI S7, S7, 0x10 -+ PTR_ADDI S8, S8, 0x10 -+ PTR_ADDI P3, P3, 0x80 -+.L_N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_N1 -+.L_N2: -+ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ -+ $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 -+ GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \ -+ $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38 -+ PTR_ADDI S1, S1, 0x08 -+ PTR_ADDI S2, S2, 0x08 -+ PTR_ADDI S3, S3, 0x08 -+ PTR_ADDI S4, S4, 0x08 -+ PTR_ADDI S5, S5, 0x08 -+ PTR_ADDI S6, S6, 0x08 -+ PTR_ADDI S7, S7, 0x08 -+ PTR_ADDI S8, S8, 0x08 -+ PTR_ADDI P4, P4, 0x40 -+.L_N1: -+ andi I, N, 0x01 -+ beq ZERO, I, .L_N0 -+ -+ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ -+ $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 -+ GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \ -+ $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 
0x18, $f7, P5, 0x1C -+ PTR_ADDI S1, S1, 0x04 -+ PTR_ADDI S2, S2, 0x04 -+ PTR_ADDI S3, S3, 0x04 -+ PTR_ADDI S4, S4, 0x04 -+ PTR_ADDI S5, S5, 0x04 -+ PTR_ADDI S6, S6, 0x04 -+ PTR_ADDI S7, S7, 0x04 -+ PTR_ADDI S8, S8, 0x04 -+ PTR_ADDI P5, P5, 0x20 -+.L_N0: -+ blt ZERO, J, .L_M8 -+.L_M7: -+ andi J, M, 0x04 -+ beq ZERO, J, .L_M3 -+.L_M4: -+ move S1, S0 -+ PTR_ADD S2, S0, TL -+ PTR_ADD S3, S1, T0 -+ PTR_ADD S4, S2, T0 -+ PTR_ADD S0, S3, T0 -+ -+ move P1, P0 -+ PTR_ADDI P0, P0, 0x100 -+ -+ PTR_SRAI I, N, 0x04 -+ beq ZERO, I, .L_M4_N15 -+.align 5 -+.L_M4_N16: -+ xvld U0, S1, 0x00 -+ xvld U1, S1, 0x20 -+ xvld U2, S2, 0x00 -+ xvld U3, S2, 0x20 -+ -+ xvst U0, P1, 0x00 -+ xvst U1, P1, 0x20 -+ xvst U2, P1, 0x40 -+ xvst U3, P1, 0x60 -+ -+ xvld U4, S3, 0x00 -+ xvld U5, S3, 0x20 -+ xvld U6, S4, 0x00 -+ xvld U7, S4, 0x20 -+ -+ xvst U4, P1, 0x80 -+ xvst U5, P1, 0xA0 -+ xvst U6, P1, 0xC0 -+ xvst U7, P1, 0xE0 -+ -+ PTR_ADDI S1, S1, 0x40 -+ PTR_ADDI S2, S2, 0x40 -+ PTR_ADDI S3, S3, 0x40 -+ PTR_ADDI S4, S4, 0x40 -+ PTR_ADDI I, I, -1 -+ PTR_ADD P1, P1, T1 -+ blt ZERO, I, .L_M4_N16 -+.L_M4_N15: -+ andi I, N, 0x08 -+ beq ZERO, I, .L_M4_N7 -+.L_M4_N8: -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ -+ GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60 -+ -+ PTR_ADDI S1, S1, 0x20 -+ PTR_ADDI S2, S2, 0x20 -+ PTR_ADDI S3, S3, 0x20 -+ PTR_ADDI S4, S4, 0x20 -+ PTR_ADDI P2, P2, 0x80 -+.L_M4_N7: -+ andi I, N, 0x04 -+ beq ZERO, I, .L_M4_N3 -+.L_M4_N4: -+ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 -+ GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30 -+ PTR_ADDI S1, S1, 0x10 -+ PTR_ADDI S2, S2, 0x10 -+ PTR_ADDI S3, S3, 0x10 -+ PTR_ADDI S4, S4, 0x10 -+ PTR_ADDI P3, P3, 0x40 -+.L_M4_N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_M4_N1 -+.L_M4_N2: -+ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 -+ GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18 -+ PTR_ADDI S1, S1, 0x08 -+ PTR_ADDI S2, S2, 0x08 -+ PTR_ADDI S3, S3, 0x08 -+ PTR_ADDI S4, S4, 0x08 -+ PTR_ADDI P4, P4, 0x20 -+.L_M4_N1: -+ andi I, N, 0x01 -+ beq ZERO, I, .L_M3 -+ -+ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 -+ GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C -+ PTR_ADDI S1, S1, 0x04 -+ PTR_ADDI S2, S2, 0x04 -+ PTR_ADDI S3, S3, 0x04 -+ PTR_ADDI S4, S4, 0x04 -+ PTR_ADDI P5, P5, 0x10 -+.L_M3: -+ andi J, M, 0x02 -+ beq ZERO, J, .L_M1 -+.L_M2: -+ move S1, S0 -+ PTR_ADD S2, S0, TL -+ PTR_ADD S0, S0, T0 -+ -+ move P1, P0 -+ PTR_ADDI P0, P0, 0x80 -+ -+ PTR_SRAI I, N, 0x04 -+ beq ZERO, I, .L_M2_N15 -+.align 5 -+.L_M2_N16: -+ xvld U0, S1, 0x00 -+ xvld U1, S1, 0x20 -+ xvld U2, S2, 0x00 -+ xvld U3, S2, 0x20 -+ -+ xvst U0, P1, 0x00 -+ xvst U1, P1, 0x20 -+ xvst U2, P1, 0x40 -+ xvst U3, P1, 0x60 -+ -+ PTR_ADDI S1, S1, 0x40 -+ PTR_ADDI S2, S2, 0x40 -+ PTR_ADDI I, I, -1 -+ PTR_ADD P1, P1, T1 -+ blt ZERO, I, .L_M2_N16 -+.L_M2_N15: -+ andi I, N, 0x08 -+ beq ZERO, I, .L_M2_N7 -+.L_M2_N8: -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ -+ GST xv, , U0, P2, 0x00, U1, P2, 0x20 -+ -+ PTR_ADDI S1, S1, 0x20 -+ PTR_ADDI S2, S2, 0x20 -+ PTR_ADDI P2, P2, 0x40 -+.L_M2_N7: -+ andi I, N, 0x04 -+ beq ZERO, I, .L_M2_N3 -+.L_M2_N4: -+ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 -+ GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10 -+ PTR_ADDI S1, S1, 0x10 -+ PTR_ADDI S2, S2, 0x10 -+ PTR_ADDI P3, P3, 0x20 -+.L_M2_N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_M2_N1 -+.L_M2_N2: -+ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 -+ GST f, d, $f0, P4, 
0x00, $f1, P4, 0x08 -+ PTR_ADDI S1, S1, 0x08 -+ PTR_ADDI S2, S2, 0x08 -+ PTR_ADDI P4, P4, 0x10 -+.L_M2_N1: -+ andi I, N, 0x01 -+ beq ZERO, I, .L_M1 -+ -+ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 -+ GST f, s, $f0, P5, 0x00, $f1, P5, 0x04 -+ PTR_ADDI S1, S1, 0x04 -+ PTR_ADDI S2, S2, 0x04 -+ PTR_ADDI P5, P5, 0x08 -+.L_M1: -+ andi J, M, 0x01 -+ beq ZERO, J, .L_M0 -+ -+ move S1, S0 -+ PTR_ADD S2, S0, TL -+ -+ move P1, P0 -+ PTR_ADDI P0, P0, 0x40 -+ -+ PTR_SRAI I, N, 0x04 -+ beq ZERO, I, .L_M1_N15 -+.align 5 -+.L_M1_N16: -+ xvld U0, S1, 0x00 -+ xvld U1, S1, 0x20 -+ -+ xvst U0, P1, 0x00 -+ xvst U1, P1, 0x20 -+ -+ PTR_ADDI S1, S1, 0x40 -+ PTR_ADDI I, I, -1 -+ PTR_ADD P1, P1, T1 -+ blt ZERO, I, .L_M1_N16 -+.L_M1_N15: -+ andi I, N, 0x08 -+ beq ZERO, I, .L_M1_N7 -+.L_M1_N8: -+ xvld U0, S1, 0x00 -+ -+ GST xv, , U0, P2, 0x00 -+ -+ PTR_ADDI S1, S1, 0x20 -+ PTR_ADDI P2, P2, 0x20 -+.L_M1_N7: -+ andi I, N, 0x04 -+ beq ZERO, I, .L_M1_N3 -+.L_M1_N4: -+ GLD v, , $vr0, S1, 0x00 -+ GST v, , $vr0, P3, 0x00 -+ PTR_ADDI S1, S1, 0x10 -+ PTR_ADDI P3, P3, 0x10 -+.L_M1_N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_M1_N1 -+.L_M1_N2: -+ GLD f, d, $f0, S1, 0x00 -+ GST f, d, $f0, P4, 0x00 -+ PTR_ADDI S1, S1, 0x08 -+ PTR_ADDI P4, P4, 0x08 -+.L_M1_N1: -+ andi I, N, 0x01 -+ beq ZERO, I, .L_M0 -+ -+ GLD f, s, $f0, S1, 0x00 -+ GST f, s, $f0, P5, 0x00 -+ PTR_ADDI S1, S1, 0x04 -+ PTR_ADDI P5, P5, 0x04 -+.L_M0: -+ pop_if_used 24, 8 -+ jirl $r0, $r1, 0x00 -+ EPILOGUE -diff --git a/kernel/loongarch64/sgemm_tcopy_8_lasx.S b/kernel/loongarch64/sgemm_tcopy_8_lasx.S -new file mode 100644 -index 000000000..725a47a60 ---- /dev/null -+++ b/kernel/loongarch64/sgemm_tcopy_8_lasx.S -@@ -0,0 +1,406 @@ -+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "loongarch64_asm.S" -+ -+/********************************************************************* -+* 2023/08/23 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+*********************************************************************/ -+ -+/* Function parameters */ -+#define M $r4 // param 1: m -+#define N $r5 // param 2: n -+#define SRC $r6 // param 3: src -+#define LDA $r7 // param 4: lda -+#define DST $r8 // param 5: dst -+ -+#define I $r9 -+#define J $r10 -+#define S0 $r11 -+#define S1 $r12 -+#define S2 $r13 -+#define S3 $r14 -+#define S4 $r15 -+#define S5 $r16 -+#define S6 $r17 -+#define S7 $r18 -+#define S8 $r19 -+#define P0 $r20 -+#define P1 $r23 -+#define P2 $r24 -+#define P3 $r25 -+#define P4 $r26 -+#define T0 $r27 -+#define T1 $r28 -+#define TL $r7 -+#undef ZERO -+#define ZERO $r0 -+ -+/* LASX vectors */ -+#define U0 $xr0 -+#define U1 $xr1 -+#define U2 $xr2 -+#define U3 $xr3 -+#define U4 $xr4 -+#define U5 $xr5 -+#define U6 $xr6 -+#define U7 $xr7 -+ -+// Loops outline -+//.L_M8 <------------------- -+//| .L_N8: | -+//| .L_N7: | Main Loop -+//| .L_N4: | -+//| .L_N3: | -+//| .L_N2: | -+//| .L_N1: | -+//| .L_N0: --------------- -+//.L_M7 -+//.L_M4 -+//| .L_M4_N8: -+//| .L_M4_N7: -+//| .L_M4_N4: -+//| .L_M4_N3: -+//| .L_M4_N2: -+//| .L_M4_N1: -+//.L_M3 -+//.L_M2 -+//| .L_M2_N8: -+//| .L_M2_N7: -+//| .L_M2_N4: -+//| .L_M2_N3: -+//| .L_M2_N2: -+//| .L_M2_N1: -+//.L_M1 -+//| .L_M1_N8: -+//| .L_M1_N7: -+//| .L_M1_N4: -+//| .L_M1_N3: -+//| .L_M1_N2: -+//| .L_M1_N1: -+//.L_M0 -+ -+ PROLOGUE -+ push_if_used 23, 8 -+ -+ move S0, SRC -+ move P0, DST -+ -+ PTR_SRAI T0, N, 0x04 -+ PTR_SRAI T1, N, 0x03 -+ PTR_SLLI T0, T0, 0x04 -+ PTR_SLLI T1, T1, 0x03 -+ -+ PTR_MUL P2, M, T1 -+ PTR_SLLI P2, P2, 0x02 -+ PTR_ADD P2, DST, P2 -+ PTR_SRAI T0, N, 0x02 -+ PTR_SRAI T1, N, 0x01 -+ PTR_SLLI T0, T0, 0x02 -+ PTR_SLLI T1, T1, 0x01 -+ PTR_MUL P3, M, T0 -+ PTR_MUL P4, M, T1 -+ PTR_SLLI P3, P3, 0x02 -+ PTR_SLLI P4, P4, 0x02 -+ PTR_ADD P3, DST, P3 -+ PTR_ADD P4, DST, P4 -+ -+ PTR_SLLI TL, LDA, 0x02 -+ PTR_SRAI J, M, 0x03 -+ PTR_SLLI T0, TL, 0x01 -+ PTR_SLLI T1, M, 0x05 -+ beq ZERO, J, .L_M7 -+.align 5 -+.L_M8: -+ move S1, S0 -+ PTR_ADD S2, S0, TL -+ PTR_ADD S3, S1, T0 -+ PTR_ADD S4, S2, T0 -+ PTR_ADD S5, S3, T0 -+ PTR_ADD S6, S4, T0 -+ PTR_ADD S7, S5, T0 -+ PTR_ADD S8, S6, T0 -+ PTR_ADD S0, S7, T0 -+ -+ move P1, P0 -+ PTR_ADDI P0, P0, 0x100 -+ -+ PTR_SRAI I, N, 0x03 -+ PTR_ADDI J, J, -1 -+ beq ZERO, I, .L_N7 -+.L_N8: -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ xvld U4, S5, 0x00 -+ xvld U5, S6, 0x00 -+ xvld U6, S7, 0x00 -+ xvld U7, S8, 0x00 -+ -+ GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \ -+ U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0 -+ -+ PTR_ADDI S1, S1, 0x20 -+ PTR_ADDI S2, S2, 0x20 -+ PTR_ADDI S3, S3, 0x20 -+ PTR_ADDI S4, S4, 0x20 -+ PTR_ADDI S5, S5, 0x20 -+ PTR_ADDI S6, S6, 0x20 -+ PTR_ADDI S7, S7, 0x20 -+ PTR_ADDI S8, S8, 0x20 -+ -+ PTR_ADDI I, I, -1 -+ PTR_ADD P1, P1, T1 -+ blt ZERO, I, .L_N8 -+.L_N7: -+ andi I, N, 0x04 -+ beq ZERO, I, .L_N3 -+.L_N4: -+ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ -+ $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 -+ GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \ -+ $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70 -+ PTR_ADDI S1, S1, 0x10 -+ PTR_ADDI S2, S2, 0x10 -+ 
PTR_ADDI S3, S3, 0x10 -+ PTR_ADDI S4, S4, 0x10 -+ PTR_ADDI S5, S5, 0x10 -+ PTR_ADDI S6, S6, 0x10 -+ PTR_ADDI S7, S7, 0x10 -+ PTR_ADDI S8, S8, 0x10 -+ PTR_ADDI P2, P2, 0x80 -+.L_N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_N1 -+.L_N2: -+ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ -+ $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 -+ GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \ -+ $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38 -+ PTR_ADDI S1, S1, 0x08 -+ PTR_ADDI S2, S2, 0x08 -+ PTR_ADDI S3, S3, 0x08 -+ PTR_ADDI S4, S4, 0x08 -+ PTR_ADDI S5, S5, 0x08 -+ PTR_ADDI S6, S6, 0x08 -+ PTR_ADDI S7, S7, 0x08 -+ PTR_ADDI S8, S8, 0x08 -+ PTR_ADDI P3, P3, 0x40 -+.L_N1: -+ andi I, N, 0x01 -+ beq ZERO, I, .L_N0 -+ -+ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ -+ $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 -+ GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \ -+ $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C -+ PTR_ADDI S1, S1, 0x04 -+ PTR_ADDI S2, S2, 0x04 -+ PTR_ADDI S3, S3, 0x04 -+ PTR_ADDI S4, S4, 0x04 -+ PTR_ADDI S5, S5, 0x04 -+ PTR_ADDI S6, S6, 0x04 -+ PTR_ADDI S7, S7, 0x04 -+ PTR_ADDI S8, S8, 0x04 -+ PTR_ADDI P4, P4, 0x20 -+.L_N0: -+ blt ZERO, J, .L_M8 -+ -+.L_M7: -+ andi J, M, 0x04 -+ beq ZERO, J, .L_M3 -+.L_M4: -+ move S1, S0 -+ PTR_ADD S2, S0, TL -+ PTR_ADD S3, S1, T0 -+ PTR_ADD S4, S2, T0 -+ PTR_ADD S0, S3, T0 -+ -+ move P1, P0 -+ PTR_ADDI P0, P0, 0x80 -+ -+ PTR_SRAI I, N, 0x03 -+ beq ZERO, I, .L_M4_N7 -+.align 5 -+.L_M4_N8: -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ xvld U2, S3, 0x00 -+ xvld U3, S4, 0x00 -+ -+ GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60 -+ -+ PTR_ADDI S1, S1, 0x20 -+ PTR_ADDI S2, S2, 0x20 -+ PTR_ADDI S3, S3, 0x20 -+ PTR_ADDI S4, S4, 0x20 -+ -+ PTR_ADDI I, I, -1 -+ PTR_ADD P1, P1, T1 -+ blt ZERO, I, .L_M4_N8 -+.L_M4_N7: -+ andi I, N, 0x04 -+ beq ZERO, I, .L_M4_N3 -+.L_M4_N4: -+ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 -+ GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30 -+ PTR_ADDI S1, S1, 0x10 -+ PTR_ADDI S2, S2, 0x10 -+ PTR_ADDI S3, S3, 0x10 -+ PTR_ADDI S4, S4, 0x10 -+ PTR_ADDI P2, P2, 0x40 -+.L_M4_N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_M4_N1 -+.L_M4_N2: -+ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 -+ GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18 -+ PTR_ADDI S1, S1, 0x08 -+ PTR_ADDI S2, S2, 0x08 -+ PTR_ADDI S3, S3, 0x08 -+ PTR_ADDI S4, S4, 0x08 -+ PTR_ADDI P3, P3, 0x20 -+.L_M4_N1: -+ andi I, N, 0x01 -+ beq ZERO, I, .L_M3 -+ -+ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 -+ GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C -+ PTR_ADDI S1, S1, 0x04 -+ PTR_ADDI S2, S2, 0x04 -+ PTR_ADDI S3, S3, 0x04 -+ PTR_ADDI S4, S4, 0x04 -+ PTR_ADDI P4, P4, 0x10 -+.L_M3: -+ andi J, M, 0x02 -+ beq ZERO, J, .L_M1 -+.L_M2: -+ move S1, S0 -+ PTR_ADD S2, S0, TL -+ PTR_ADD S0, S0, T0 -+ -+ move P1, P0 -+ PTR_ADDI P0, P0, 0x40 -+ -+ PTR_SRAI I, N, 0x03 -+ beq ZERO, I, .L_M2_N7 -+.align 5 -+.L_M2_N8: -+ xvld U0, S1, 0x00 -+ xvld U1, S2, 0x00 -+ -+ GST xv, , U0, P1, 0x00, U1, P1, 0x20 -+ -+ PTR_ADDI S1, S1, 0x20 -+ PTR_ADDI S2, S2, 0x20 -+ PTR_ADDI I, I, -1 -+ PTR_ADD P1, P1, T1 -+ blt ZERO, I, .L_M2_N8 -+.L_M2_N7: -+ andi I, N, 0x04 -+ beq ZERO, I, .L_M2_N3 -+.L_M2_N4: -+ GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 -+ GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10 -+ PTR_ADDI S1, S1, 0x10 -+ 
PTR_ADDI S2, S2, 0x10 -+ PTR_ADDI P2, P2, 0x20 -+.L_M2_N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_M2_N1 -+.L_M2_N2: -+ GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 -+ GST f, d, $f0, P3, 0x00, $f1, P3, 0x08 -+ PTR_ADDI S1, S1, 0x08 -+ PTR_ADDI S2, S2, 0x08 -+ PTR_ADDI P3, P3, 0x10 -+.L_M2_N1: -+ andi I, N, 0x01 -+ beq ZERO, I, .L_M1 -+ -+ GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 -+ GST f, s, $f0, P4, 0x00, $f1, P4, 0x04 -+ PTR_ADDI S1, S1, 0x04 -+ PTR_ADDI S2, S2, 0x04 -+ PTR_ADDI P4, P4, 0x08 -+.L_M1: -+ andi J, M, 0x01 -+ beq ZERO, J, .L_M0 -+ -+ move S1, S0 -+ PTR_ADD S2, S0, TL -+ -+ move P1, P0 -+ PTR_ADDI P0, P0, 0x20 -+ -+ PTR_SRAI I, N, 0x03 -+ beq ZERO, I, .L_M1_N7 -+.align 5 -+.L_M1_N8: -+ xvld U0, S1, 0x00 -+ -+ GST xv, , U0, P1, 0x00 -+ -+ PTR_ADDI S1, S1, 0x20 -+ -+ PTR_ADDI I, I, -1 -+ PTR_ADD P1, P1, T1 -+ blt ZERO, I, .L_M1_N8 -+.L_M1_N7: -+ andi I, N, 0x04 -+ beq ZERO, I, .L_M1_N3 -+.L_M1_N4: -+ GLD v, , $vr0, S1, 0x00 -+ GST v, , $vr0, P2, 0x00 -+ PTR_ADDI S1, S1, 0x10 -+ PTR_ADDI P2, P2, 0x10 -+.L_M1_N3: -+ andi I, N, 0x02 -+ beq ZERO, I, .L_M1_N1 -+.L_M1_N2: -+ GLD f, d, $f0, S1, 0x00 -+ GST f, d, $f0, P3, 0x00 -+ PTR_ADDI S1, S1, 0x08 -+ PTR_ADDI P3, P3, 0x08 -+.L_M1_N1: -+ andi I, N, 0x01 -+ beq ZERO, I, .L_M0 -+ -+ GLD f, s, $f0, S1, 0x00 -+ GST f, s, $f0, P4, 0x00 -+ PTR_ADDI S1, S1, 0x04 -+ PTR_ADDI P4, P4, 0x04 -+.L_M0: -+ pop_if_used 23, 8 -+ jirl $r0, $r1, 0x00 -+ EPILOGUE -diff --git a/kernel/loongarch64/sgemv_n_8_lasx.S b/kernel/loongarch64/sgemv_n_8_lasx.S -new file mode 100644 -index 000000000..52ffc320e ---- /dev/null -+++ b/kernel/loongarch64/sgemv_n_8_lasx.S -@@ -0,0 +1,463 @@ -+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "loongarch64_asm.S" -+ -+/********************************************************************* -+* 2023/08/30 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+* -+* -+*********************************************************************/ -+ -+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, -+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -+ */ -+#define M $r4 -+#define N $r5 -+#define ALPHA $f0 -+#define A $r7 -+#define LDA $r8 -+#define X $r9 -+#define INC_X $r10 -+#define Y $r11 -+#define INC_Y $r6 -+ -+#define J $r12 -+#define I $r13 -+#define K $r14 -+#define Y_ORG $r15 -+#define OFFSET $r16 -+#define K_LDA $r17 -+#define M4 $r18 -+#define T0 $r19 -+#define PA0 $r20 -+#define PA1 $r23 -+#define PA2 $r24 -+#define PA3 $r25 -+#define PA4 $r26 -+#define PA5 $r27 -+#define PA6 $r28 -+#define PA7 $r29 -+ -+#define VALPHA $xr1 -+#define X0 $xr2 -+#define X1 $xr3 -+#define X2 $xr4 -+#define X3 $xr5 -+#define X4 $xr6 -+#define X5 $xr7 -+#define X6 $xr8 -+#define X7 $xr9 -+#define Y0 $xr10 -+#define A0 $xr11 -+#define A1 $xr12 -+#define A2 $xr13 -+#define A3 $xr14 -+#define A4 $xr15 -+#define A5 $xr16 -+#define A6 $xr17 -+#define A7 $xr18 -+ -+#define X0_F $f2 -+#define X1_F $f3 -+#define X2_F $f4 -+#define X3_F $f5 -+#define X4_F $f6 -+#define X5_F $f7 -+#define X6_F $f8 -+#define X7_F $f9 -+#define Y0_F $f10 -+#define A0_F $f11 -+#define A1_F $f12 -+#define A2_F $f13 -+#define A3_F $f14 -+#define A4_F $f15 -+#define A5_F $f16 -+#define A6_F $f17 -+#define A7_F $f18 -+ -+.macro SLOAD_X_8 -+ GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C, \ -+ X4, X, 0x10, X5, X, 0x14, X6, X, 0x18, X7, X, 0x1C -+ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ -+ X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA -+.endm -+ -+.macro SLOAD_X_8_GAP -+ xvldrepl.w X0, X, 0x00 -+ PTR_ADD T0, X, INC_X -+ xvldrepl.w X1, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.w X2, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.w X3, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.w X4, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.w X5, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.w X6, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.w X7, T0, 0x00 -+ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ -+ X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA -+.endm -+ -+.macro SLOAD_X_4 -+ GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C -+ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA -+.endm -+ -+.macro SLOAD_X_4_GAP -+ xvldrepl.w X0, X, 0x00 -+ PTR_ADD T0, X, INC_X -+ xvldrepl.w X1, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.w X2, T0, 0x00 -+ PTR_ADD T0, T0, INC_X -+ xvldrepl.w X3, T0, 0x00 -+ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA -+.endm -+ -+.macro SLOAD_X_2 -+ GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04 -+ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA -+.endm -+ -+.macro SLOAD_X_2_GAP -+ xvldrepl.w X0, X, 0x00 -+ PTR_ADD T0, X, INC_X -+ xvldrepl.w X1, T0, 0x00 -+ GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA -+.endm -+ -+.macro SLOAD_X_1 -+ GLDREPL xv, w, X0, X, 0x00 -+ GMUL xvf, s, X0, X0, VALPHA -+.endm -+ -+.macro SLOAD_Y_8 -+ GLD xv, , Y0, Y, 0 -+.endm -+ -+.macro SLOAD_Y_8_GAP -+ fld.s Y0_F, Y, 0 -+ fldx.s A0_F, Y, INC_Y -+ PTR_ALSL T0, 
INC_Y, Y, 1 -+ fld.s A1_F, T0, 0 -+ fldx.s A2_F, T0, INC_Y -+ PTR_ALSL T0, INC_Y, Y, 2 -+ fld.s A3_F, T0, 0 -+ fldx.s A4_F, T0, INC_Y -+ PTR_ADD T0, T0, INC_Y -+ PTR_ADD T0, T0, INC_Y -+ fld.s A5_F, T0, 0 -+ fldx.s A6_F, T0, INC_Y -+ GINSVE0 xv, w, Y0, A0, 1, Y0, A1, 2, Y0, A2, 3, Y0, A3, 4, \ -+ Y0, A4, 5, Y0, A5, 6, Y0, A6, 7 -+.endm -+ -+.macro SLOAD_Y_1 -+ GLD f, s, Y0_F, Y, 0 -+.endm -+ -+.macro SGEMV_N_8x8 -+ GLD_INC xv, , 0x20, \ -+ A0, PA0, 0, A1, PA1, 0, \ -+ A2, PA2, 0, A3, PA3, 0, \ -+ A4, PA4, 0, A5, PA5, 0, \ -+ A6, PA6, 0, A7, PA7, 0 -+ GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \ -+ Y0, A2, X2, Y0, Y0, A3, X3, Y0, \ -+ Y0, A4, X4, Y0, Y0, A5, X5, Y0, \ -+ Y0, A6, X6, Y0, Y0, A7, X7, Y0 -+.endm -+ -+.macro SGEMV_N_1x8 -+ GLD_INC f, s, 0x04, \ -+ A0_F, PA0, 0, A1_F, PA1, 0, \ -+ A2_F, PA2, 0, A3_F, PA3, 0, \ -+ A4_F, PA4, 0, A5_F, PA5, 0, \ -+ A6_F, PA6, 0, A7_F, PA7, 0 -+ GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \ -+ Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F, \ -+ Y0_F, A4_F, X4_F, Y0_F, Y0_F, A5_F, X5_F, Y0_F, \ -+ Y0_F, A6_F, X6_F, Y0_F, Y0_F, A7_F, X7_F, Y0_F -+.endm -+ -+.macro SGEMV_N_8x4 -+ GLD_INC xv, , 0x20, \ -+ A0, PA0, 0, A1, PA1, 0, \ -+ A2, PA2, 0, A3, PA3, 0 -+ GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \ -+ Y0, A2, X2, Y0, Y0, A3, X3, Y0 -+.endm -+ -+.macro SGEMV_N_1x4 -+ GLD_INC f, s, 0x04, \ -+ A0_F, PA0, 0, A1_F, PA1, 0, \ -+ A2_F, PA2, 0, A3_F, PA3, 0 -+ GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \ -+ Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F -+.endm -+ -+.macro SGEMV_N_8x2 -+ GLD_INC xv, , 0x20, \ -+ A0, PA0, 0, A1, PA1, 0 -+ GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0 -+.endm -+ -+.macro SGEMV_N_1x2 -+ GLD_INC f, s, 0x04, \ -+ A0_F, PA0, 0, A1_F, PA1, 0 -+ GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F -+.endm -+ -+.macro SGEMV_N_1x1 -+ GLD_INC f, s, 0x04, A0_F, PA0, 0 -+ GMADD f, s, Y0_F, A0_F, X0_F, Y0_F -+.endm -+ -+.macro SSTORE_Y_8 -+ GST xv, , Y0, Y, 0 -+.endm -+ -+.macro SSTORE_Y_8_GAP -+ xvstelm.w Y0, Y, 0, 0 -+ PTR_ADD T0, Y, INC_Y -+ xvstelm.w Y0, T0, 0, 1 -+ PTR_ADD T0, T0, INC_Y -+ xvstelm.w Y0, T0, 0, 2 -+ PTR_ADD T0, T0, INC_Y -+ xvstelm.w Y0, T0, 0, 3 -+ -+ PTR_ADD T0, T0, INC_Y -+ xvstelm.w Y0, T0, 0, 4 -+ PTR_ADD T0, T0, INC_Y -+ xvstelm.w Y0, T0, 0, 5 -+ PTR_ADD T0, T0, INC_Y -+ xvstelm.w Y0, T0, 0, 6 -+ PTR_ADD T0, T0, INC_Y -+ xvstelm.w Y0, T0, 0, 7 -+.endm -+ -+.macro SSTORE_Y_1 -+ GST f, s, Y0_F, Y, 0 -+.endm -+ -+.macro SGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req -+ PTR_SRLI J, N, 3 -+ beqz J, .L_\XW\()_N_7 -+ PTR_SLLI K_LDA, LDA, 3 -+ PTR_SUB K_LDA, K_LDA, M4 -+.L_\XW\()_N_L8: -+ SLOAD_\X_8 -+ xor K, K, K -+ move Y, Y_ORG -+ PTR_SRLI I, M, 3 -+ beqz I, .L_\XW\()_M_7 -+.align 5 -+.L_\XW\()_M_L8: -+ SLOAD_\Y_8 -+ SGEMV_N_8x8 -+ SSTORE_\Y_8 -+ PTR_ADDI I, I, -1 -+ PTR_ALSL Y, INC_Y, Y, 3 -+ PTR_ADDI K, K, 8 -+ bnez I, .L_\XW\()_M_L8 -+.L_\XW\()_M_7: -+ andi I, M, 7 -+ beqz I, .L_\XW\()_M_END -+.align 5 -+.L_\XW\()_M_L1: -+ SLOAD_\Y_1 -+ SGEMV_N_1x8 -+ SSTORE_\Y_1 -+ PTR_ADDI I, I, -1 -+ PTR_ADD Y, Y, INC_Y -+ PTR_ADDI K, K, 1 -+ bnez I, .L_\XW\()_M_L1 -+.L_\XW\()_M_END: -+ PTR_ADDI J, J, -1 -+#if __loongarch_grlen == 64 -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ -+ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -+#elif __loongarch_grlen == 32 -+ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ -+ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, 
K_LDA, PA7, PA7, K_LDA -+#else -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ -+ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -+#endif -+ PTR_ALSL X, INC_X, X, 3 -+ bnez J, .L_\XW\()_N_L8 -+.L_\XW\()_N_7: -+ andi J, N, 4 -+ beqz J, .L_\XW\()_N_3 -+ SLOAD_\X_4 -+ xor K, K, K -+ move Y, Y_ORG -+ -+ PTR_SRLI I, M, 3 -+ beqz I, .L_\XW\()_N_4_M_7 -+.align 5 -+.L_\XW\()_N_4_M_L8: -+ SLOAD_\Y_8 -+ SGEMV_N_8x4 -+ SSTORE_\Y_8 -+ PTR_ADDI I, I, -1 -+ PTR_ADDI K, K, 8 -+ PTR_ALSL Y, INC_Y, Y, 3 -+ bnez I, .L_\XW\()_N_4_M_L8 -+.L_\XW\()_N_4_M_7: -+ andi I, M, 7 -+ beqz I, .L_\XW\()_N_4_M_END -+.align 5 -+.L_\XW\()_N_4_M_L1: -+ SLOAD_\Y_1 -+ SGEMV_N_1x4 -+ SSTORE_\Y_1 -+ PTR_ADDI I, I, -1 -+ PTR_ADD Y, Y, INC_Y -+ PTR_ADDI K, K, 1 -+ bnez I, .L_\XW\()_N_4_M_L1 -+.L_\XW\()_N_4_M_END: -+ PTR_SLLI K_LDA, LDA, 2 -+ PTR_SUB K_LDA, K_LDA, M4 -+#if __loongarch_grlen == 64 -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -+#elif __loongarch_grlen == 32 -+ GADD , w PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -+#else -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -+#endif -+ PTR_ALSL X, INC_X, X, 2 -+.L_\XW\()_N_3: -+ andi J, N, 2 -+ beqz J, .L_\XW\()_N_1 -+ SLOAD_\X_2 -+ xor K, K, K -+ move Y, Y_ORG -+ PTR_SRLI I, M, 3 -+ beqz I, .L_\XW\()_N_2_M_7 -+.align 5 -+.L_\XW\()_N_2_M_L8: -+ SLOAD_\Y_8 -+ SGEMV_N_8x2 -+ SSTORE_\Y_8 -+ PTR_ADDI I, I, -1 -+ PTR_ADDI K, K, 8 -+ PTR_ALSL Y, INC_Y, Y, 3 -+ bnez I, .L_\XW\()_N_2_M_L8 -+.L_\XW\()_N_2_M_7: -+ andi I, M, 7 -+ beqz I, .L_\XW\()_N_2_M_END -+.align 5 -+.L_\XW\()_N_2_M_L1: -+ SLOAD_\Y_1 -+ SGEMV_N_1x2 -+ SSTORE_\Y_1 -+ PTR_ADDI I, I, -1 -+ PTR_ADD Y, Y, INC_Y -+ PTR_ADDI K, K, 1 -+ bnez I, .L_\XW\()_N_2_M_L1 -+.L_\XW\()_N_2_M_END: -+ PTR_SLLI K_LDA, LDA, 1 -+ PTR_SUB K_LDA, K_LDA, M4 -+ PTR_ADD PA0, PA0, K_LDA -+ PTR_ADD PA1, PA1, K_LDA -+ PTR_ALSL X, INC_X, X, 1 -+.L_\XW\()_N_1: -+ andi J, N, 1 -+ beqz J, .L_END -+ SLOAD_\X_1 -+ xor K, K, K -+ move Y, Y_ORG -+ move I, M -+ beqz I, .L_END -+.align 5 -+.L_\XW\()_N_1_M_L1: -+ SLOAD_\Y_1 -+ SGEMV_N_1x1 -+ SSTORE_\Y_1 -+ PTR_ADDI I, I, -1 -+ PTR_ADD Y, Y, INC_Y -+ PTR_ADDI K, K, 1 -+ bnez I, .L_\XW\()_N_1_M_L1 -+ b .L_END -+.endm -+ -+ PROLOGUE -+ PTR_LD INC_Y, $sp, 0 -+ push_if_used 17 + 7, 19 -+ PTR_ADDI K, $r0, 0x01 -+ PTR_SUB I, INC_X, K -+ PTR_SUB J, INC_Y, K -+ maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ -+ maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ -+ PTR_ALSL I, I, J, 1 -+ GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2 -+ xvreplve0.w VALPHA, $xr0 -+ move Y_ORG, Y -+ move PA0, A -+#if __loongarch_grlen == 64 -+ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ -+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -+#elif __loongarch_grlen == 32 -+ GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ -+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -+#else -+ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ -+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -+#endif -+ la.local T0, .L_GAP_TABLE -+ PTR_ALSL I, I, T0, 1 -+ ld.h K, I, 0 -+ PTR_ADD T0, T0, K -+ jirl $r0, T0, 0 -+.L_GAP_TABLE: -+ .hword .L_GAP_0_0 - .L_GAP_TABLE -+ .hword .L_GAP_0_1 - .L_GAP_TABLE -+ .hword .L_GAP_1_0 - .L_GAP_TABLE -+ .hword .L_GAP_1_1 - .L_GAP_TABLE -+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ -+ SGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 -+.L_GAP_0_1: /* if (inc_x == 1) && 
(incy != 1) */ -+ SGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 -+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ -+ SGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 -+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ -+ SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 -+.L_END: -+ pop_if_used 17 + 7, 19 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/kernel/loongarch64/sgemv_t_8_lasx.S b/kernel/loongarch64/sgemv_t_8_lasx.S -new file mode 100644 -index 000000000..f4bfffb42 ---- /dev/null -+++ b/kernel/loongarch64/sgemv_t_8_lasx.S -@@ -0,0 +1,405 @@ -+/******************************************************************************* -+Copyright (c) 2023, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*******************************************************************************/ -+#define ASSEMBLER -+ -+#include "common.h" -+#include "loongarch64_asm.S" -+ -+/********************************************************************* -+* 2023/08/30 guxiwei -+* UTEST : OK -+* CTEST : OK -+* TEST : OK -+* -+* -+*********************************************************************/ -+ -+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, -+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -+ */ -+#define M $r4 -+#define N $r5 -+#define ALPHA $f0 -+#define A $r7 -+#define LDA $r8 -+#define X $r9 -+#define INC_X $r10 -+#define Y $r11 -+#define INC_Y $r6 -+ -+#define J $r12 -+#define I $r13 -+#define K $r14 -+#define PY0 $r14 -+#define X_ORG $r15 -+#define PY1 $r16 -+#define K_LDA $r17 -+#define PY2 $r18 -+#define T0 $r19 -+#define PA0 $r20 -+#define PA1 $r23 -+#define PA2 $r24 -+#define PA3 $r25 -+#define PA4 $r26 -+#define PA5 $r27 -+#define PA6 $r28 -+#define PA7 $r29 -+#define M4 $r30 -+ -+#define VALPHA $xr0 -+#define X0 $xr1 -+#define A0 $xr2 -+#define A1 $xr3 -+#define A2 $xr4 -+#define A3 $xr5 -+#define A4 $xr6 -+#define A5 $xr7 -+#define A6 $xr8 -+#define A7 $xr9 -+#define TP0 $xr10 -+#define TP1 $xr11 -+#define TP2 $xr12 -+#define TP3 $xr13 -+#define TP4 $xr14 -+#define TP5 $xr15 -+#define TP6 $xr16 -+#define TP7 $xr17 -+#define Y0 $xr2 -+#define Y1 $xr3 -+#define Y2 $xr4 -+#define Y3 $xr5 -+#define Y4 $xr6 -+#define Y5 $xr7 -+#define Y6 $xr8 -+#define Y7 $xr9 -+ -+.macro ZERO_Y8 -+ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ -+ TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 -+.endm -+ -+.macro ZERO_Y4 -+ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 -+.endm -+ -+.macro ZERO_Y2 -+ GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 -+.endm -+ -+.macro ZERO_Y1 -+ GXOR xv, v, TP0, TP0, TP0 -+.endm -+ -+.macro SLOAD_X8 -+ GLD xv, , X0, X, 0x00 -+.endm -+ -+.macro SLOAD_X8_GAP -+ fld.s $f1, X, 0x00 -+ fldx.s $f2, X, INC_X -+ PTR_ALSL T0, INC_X, X, 1 -+ fld.s $f3, T0, 0x00 -+ fldx.s $f4, T0, INC_X -+ GINSVE0 xv, w, X0, A0, 1, X0, A1, 2, X0, A2, 3 -+ PTR_ALSL T0, INC_X, X, 2 -+ fld.s $f2, T0, 0x00 -+ fldx.s $f3, T0, INC_X -+ PTR_ALSL T0, INC_X, T0, 1 -+ fld.s $f4, T0, 0x00 -+ fldx.s $f5, T0, INC_X -+ GINSVE0 xv, w, X0, A0, 4, X0, A1, 5, X0, A2, 6, X0, A3, 7 -+.endm -+ -+.macro SGEMV_T_8x8 -+ GLD_INC xv, , 0x20, \ -+ A0, PA0, 0, A1, PA1, 0, \ -+ A2, PA2, 0, A3, PA3, 0, \ -+ A4, PA4, 0, A5, PA5, 0, \ -+ A6, PA6, 0, A7, PA7, 0 -+ GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \ -+ TP2, A2, X0, TP2, TP3, A3, X0, TP3, \ -+ TP4, A4, X0, TP4, TP5, A5, X0, TP5, \ -+ TP6, A6, X0, TP6, TP7, A7, X0, TP7 -+.endm -+ -+.macro SGEMV_T_4x8 -+ GLD_INC xv, , 0x20, \ -+ A0, PA0, 0, A1, PA1, 0, \ -+ A2, PA2, 0, A3, PA3, 0 -+ GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \ -+ TP2, A2, X0, TP2, TP3, A3, X0, TP3 -+.endm -+ -+.macro SGEMV_T_2x8 -+ GLD_INC xv, , 0x20, \ -+ A0, PA0, 0, A1, PA1, 0 -+ GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1 -+.endm -+ -+.macro SGEMV_T_LASX XW:req X8:req, X4:req -+ PTR_SRLI J, N, 3 -+ beqz J, .L_\XW\()_N_7 -+ PTR_SLLI K_LDA, LDA, 3 -+ PTR_SUB K_LDA, K_LDA, M4 -+.L_\XW\()_N_L8: -+ ZERO_Y8 -+ move X, X_ORG -+ PTR_SRLI I, M, 3 -+ beqz I, .L_\XW\()_M_7 -+.align 5 -+.L_\XW\()_M_L8: -+ SLOAD_\X8 -+ SGEMV_T_8x8 -+ PTR_ADDI I, I, -1 -+ PTR_ALSL X, INC_X, X, 3 -+ bnez I, .L_\XW\()_M_L8 -+.L_\XW\()_M_7: -+ // Accumulated -+ GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, 
TP3, Y4, TP4, \ -+ Y5, TP5, Y6, TP6, Y7, TP7 -+ andi I, M, 7 -+ beqz I, .L_\XW\()_M_END -+.align 5 -+.L_\XW\()_M_L1: -+ fld.s $f1, X, 0x00 -+ fld.s $f10, PA0, 0x00 -+ fld.s $f11, PA1, 0x00 -+ fld.s $f12, PA2, 0x00 -+ fld.s $f13, PA3, 0x00 -+ fld.s $f14, PA4, 0x00 -+ fld.s $f15, PA5, 0x00 -+ fld.s $f16, PA6, 0x00 -+ fld.s $f17, PA7, 0x00 -+#if __loongarch_grlen == 64 -+ GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ -+ PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 -+#elif __loongarch_grlen == 32 -+ GADDI , w, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ -+ PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 -+#else -+ GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \ -+ PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04 -+#endif -+ GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, \ -+ $f6, $f14, $f1, $f6, $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, -+ PTR_ADDI I, I, -1 -+ PTR_ADD X, X, INC_X -+ bnez I, .L_\XW\()_M_L1 -+.L_\XW\()_M_END: -+ fld.s $f10, Y, 0x00 -+ fldx.s $f11, Y, INC_Y -+ PTR_ALSL PY0, INC_Y, Y, 1 -+ fld.s $f12, PY0, 0x00 -+ fldx.s $f13, PY0, INC_Y -+ PTR_ALSL PY1, INC_Y, Y, 2 -+ fld.s $f14, PY1, 0x00 -+ fldx.s $f15, PY1, INC_Y -+ PTR_ALSL PY2, INC_Y, PY1, 1 -+ fld.s $f16, PY2, 0x00 -+ fldx.s $f17, PY2, INC_Y -+ -+ GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, \ -+ $f14, ALPHA, $f6, $f14, $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17 -+ -+ PTR_ADDI J, J, -1 -+#if __loongarch_grlen == 64 -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ -+ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -+#elif __loongarch_grlen == 32 -+ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ -+ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -+#else -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ -+ PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA -+#endif -+ fst.s $f10, Y, 0x00 -+ fstx.s $f11, Y, INC_Y -+ fst.s $f12, PY0, 0x00 -+ fstx.s $f13, PY0, INC_Y -+ fst.s $f14, PY1, 0x00 -+ fstx.s $f15, PY1, INC_Y -+ fst.s $f16, PY2, 0x00 -+ fstx.s $f17, PY2, INC_Y -+ -+ PTR_ALSL Y, INC_Y, Y, 3 -+ bnez J, .L_\XW\()_N_L8 -+.L_\XW\()_N_7: -+ andi J, N, 4 -+ beqz J, .L_\XW\()_N_3 -+ ZERO_Y4 -+ move X, X_ORG -+ PTR_SRLI I, M, 3 -+ beqz I, .L_\XW\()_N_4_M_7 -+.align 5 -+.L_\XW\()_N_4_M_L8: -+ SLOAD_\X8 -+ SGEMV_T_4x8 -+ PTR_ADDI I, I, -1 -+ PTR_ALSL X, INC_X, X, 3 -+ bnez I, .L_\XW\()_N_4_M_L8 -+.L_\XW\()_N_4_M_7: -+ // Accumulated -+ GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 -+ andi I, M, 7 -+ beqz I, .L_\XW\()_N_4_M_END -+.align 5 -+.L_\XW\()_N_4_M_L1: -+ fld.s $f1, X, 0x00 -+ GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00, $f12, PA2, 0x00, $f13, PA3, 0x00 -+ GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5 -+ PTR_ADDI I, I, -1 -+ PTR_ADD X, X, INC_X -+ bnez I, .L_\XW\()_N_4_M_L1 -+.L_\XW\()_N_4_M_END: -+ fld.s $f10, Y, 0x00 -+ fldx.s $f11, Y, INC_Y -+ PTR_ALSL PY0, INC_Y, Y, 1 -+ fld.s $f12, PY0, 0x00 -+ fldx.s $f13, PY0, INC_Y -+ -+ GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13 -+ -+ PTR_SLLI K_LDA, LDA, 2 -+ PTR_SUB K_LDA, K_LDA, M4 -+ -+#if __loongarch_grlen == 64 -+ GADD , 
d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -+#elif __loongarch_grlen == 32 -+ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -+#else -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA -+#endif -+ fst.s $f10, Y, 0x00 -+ fstx.s $f11, Y, INC_Y -+ fst.s $f12, PY0, 0x00 -+ fstx.s $f13, PY0, INC_Y -+ PTR_ALSL Y, INC_Y, Y, 2 -+.L_\XW\()_N_3: -+ andi J, N, 2 -+ beqz J, .L_\XW\()_N_1 -+ ZERO_Y2 -+ move X, X_ORG -+ PTR_SRLI I, M, 3 -+ beqz I, .L_\XW\()_N_2_M_7 -+.align 5 -+.L_\XW\()_N_2_M_L8: -+ SLOAD_\X8 -+ SGEMV_T_2x8 -+ PTR_ADDI I, I, -1 -+ PTR_ALSL X, INC_X, X, 3 -+ bnez I, .L_\XW\()_N_2_M_L8 -+.L_\XW\()_N_2_M_7: -+ // Accumulated -+ GACC xvf, s, Y0, TP0, Y1, TP1 -+ andi I, M, 7 -+ beqz I, .L_\XW\()_N_2_M_END -+.align 5 -+.L_\XW\()_N_2_M_L1: -+ fld.s $f1, X, 0x00 -+ GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00 -+ GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3 -+ PTR_ADDI I, I, -1 -+ PTR_ADD X, X, INC_X -+ bnez I, .L_\XW\()_N_2_M_L1 -+.L_\XW\()_N_2_M_END: -+ fld.s $f10, Y, 0x00 -+ fldx.s $f11, Y, INC_Y -+ -+ GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11 -+ -+ PTR_SLLI K_LDA, LDA, 1 -+ PTR_SUB K_LDA, K_LDA, M4 -+ -+#if __loongarch_grlen == 64 -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA -+#elif __loongarch_grlen == 32 -+ GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA -+#else -+ GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA -+#endif -+ fst.s $f10, Y, 0x00 -+ fstx.s $f11, Y, INC_Y -+ PTR_ALSL Y, INC_Y, Y, 1 -+.L_\XW\()_N_1: -+ andi J, N, 1 -+ beqz J, .L_END -+ ZERO_Y1 -+ move X, X_ORG -+ move I, M -+ beqz I, .L_END -+.align 5 -+.L_\XW\()_N_1_M_L1: -+ fld.s $f2, PA0, 0x00 -+ fld.s $f1, X, 0x00 -+ fmadd.s $f10, $f2, $f1, $f10 -+ PTR_ADDI I, I, -1 -+ PTR_ADD X, X, INC_X -+ PTR_ADDI PA0, PA0, 0x04 -+ bnez I, .L_\XW\()_N_1_M_L1 -+ -+ fld.s $f2, Y, 0x00 -+ fmadd.s $f2, ALPHA, $f10, $f2 -+ fst.s $f2, Y, 0x00 -+ b .L_END -+.endm -+ -+ PROLOGUE -+ PTR_LD INC_Y, $sp, 0 -+ push_if_used 17 + 8, 18 -+ PTR_ADDI K, $r0, 0x01 -+ PTR_SUB I, INC_X, K -+ maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ -+ GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2 -+ xvreplve0.w VALPHA, $xr0 -+ move X_ORG, X -+ move PA0, A -+#if __loongarch_grlen == 64 -+ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ -+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -+#elif __loongarch_grlen == 32 -+ GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ -+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -+#else -+ GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ -+ PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA -+#endif -+ la.local T0, .L_GAP_TABLE -+ PTR_ALSL I, I, T0, 1 -+ ld.h K, I, 0 -+ PTR_ADD T0, T0, K -+ jirl $r0, T0, 0 -+.L_GAP_TABLE: -+ .hword .L_GAP_0 - .L_GAP_TABLE -+ .hword .L_GAP_1 - .L_GAP_TABLE -+.L_GAP_0: /* if (incx == 1) */ -+ SGEMV_T_LASX GAP_0, X8, X4 -+.L_GAP_1: /* if (incx != 1) */ -+ SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP -+.L_END: -+ pop_if_used 17 + 8, 18 -+ jirl $r0, $r1, 0x0 -+ EPILOGUE -diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile -index 71e5a87cb..1c85667ec 100644 ---- a/lapack/laswp/loongarch64/Makefile -+++ b/lapack/laswp/loongarch64/Makefile -@@ -1,6 +1,11 @@ - TOPDIR = ../../.. 
- include ../../../Makefile.system - -+ifeq ($(DYNAMIC_ARCH), 1) -+LASWP = ../generic/laswp_k_4.c -+ZLASWP = ../generic/zlaswp_k_4.c -+endif -+ - ifndef LASWP - LASWP = ../generic/laswp_k.c - endif -diff --git a/param.h b/param.h -index f1f5cbdad..a34e806c0 100644 ---- a/param.h -+++ b/param.h -@@ -2845,31 +2845,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #define GEMM_DEFAULT_OFFSET_B 0 - #define GEMM_DEFAULT_ALIGN 0x0ffffUL - -+#if defined(NO_LASX) -+#define DGEMM_DEFAULT_UNROLL_N 8 -+#define DGEMM_DEFAULT_UNROLL_M 2 - #define SGEMM_DEFAULT_UNROLL_N 8 -+#define SGEMM_DEFAULT_UNROLL_M 2 -+#else - #define DGEMM_DEFAULT_UNROLL_N 4 -+#define DGEMM_DEFAULT_UNROLL_M 16 -+#define SGEMM_DEFAULT_UNROLL_N 8 -+#define SGEMM_DEFAULT_UNROLL_M 16 -+#endif -+ - #define QGEMM_DEFAULT_UNROLL_N 2 - #define CGEMM_DEFAULT_UNROLL_N 4 - #define ZGEMM_DEFAULT_UNROLL_N 4 - #define XGEMM_DEFAULT_UNROLL_N 1 - --#define SGEMM_DEFAULT_UNROLL_M 2 --#define DGEMM_DEFAULT_UNROLL_M 16 - #define QGEMM_DEFAULT_UNROLL_M 2 - #define CGEMM_DEFAULT_UNROLL_M 1 - #define ZGEMM_DEFAULT_UNROLL_M 1 - #define XGEMM_DEFAULT_UNROLL_M 1 - --#define SGEMM_DEFAULT_P 512 -+#define SGEMM_DEFAULT_P 256 - #define DGEMM_DEFAULT_P 32 - #define CGEMM_DEFAULT_P 128 - #define ZGEMM_DEFAULT_P 128 - --#define SGEMM_DEFAULT_R 12288 -+#define SGEMM_DEFAULT_R 1024 - #define DGEMM_DEFAULT_R 858 - #define CGEMM_DEFAULT_R 4096 - #define ZGEMM_DEFAULT_R 4096 - --#define SGEMM_DEFAULT_Q 128 -+#define SGEMM_DEFAULT_Q 256 - #define DGEMM_DEFAULT_Q 152 - #define CGEMM_DEFAULT_Q 128 - #define ZGEMM_DEFAULT_Q 128 --- -2.20.1 - diff --git a/OpenBLAS-0.3.23.tar.gz b/OpenBLAS-0.3.26.tar.gz similarity index 69% rename from OpenBLAS-0.3.23.tar.gz rename to OpenBLAS-0.3.26.tar.gz index 31bec85b5297b69820ba189c120feebc730345dd..4bd0abb9ba92a5549485869e3b12bf15e93fe975 100644 Binary files a/OpenBLAS-0.3.23.tar.gz and b/OpenBLAS-0.3.26.tar.gz differ diff --git a/openblas.spec b/openblas.spec index 9cf7955d56e794e04eccb435bc02a76b9e801c48..5f316cc5476bf1932a9320503c1e7b87a32dd495 100644 --- a/openblas.spec +++ b/openblas.spec @@ -1,19 +1,18 @@ -%define anolis_release 3 +%define anolis_release 1 %bcond_with system_lapack -%global lapackver 3.9.1 +%global lapackver 3.11.0 Name: openblas Summary: An optimized BLAS library based on GotoBLAS2 -Version: 0.3.23 +Version: 0.3.26 Release: %{anolis_release}%{?dist} -License: BSD -URL: https://github.com/xianyi/OpenBLAS/ -Source0: https://github.com/xianyi/OpenBLAS/releases/download/v%{version}/OpenBLAS-%{version}.tar.gz +License: BSD-3-Clause +URL: https://github.com/OpenMathLib/OpenBLAS +Source0: https://github.com/OpenMathLib/OpenBLAS/releases/download/v%{version}/OpenBLAS-%{version}.tar.gz Patch0001: 0001-openblas-0.2.15-system_lapack.patch Patch0002: 0002-openblas-0.2.5-libname.patch Patch0003: 0003-openblas-0.3.11-tests.patch -Patch0004: 0004-OpenBLAS-0.3.23-Add-opt-for-LoongArch64.patch BuildRequires: make gcc gcc-c++ gcc-gfortran BuildRequires: perl-devel @@ -203,11 +202,10 @@ This package contains the static libraries. 
tar zxf %{SOURCE0} cd OpenBLAS-%{version} %if %{with system_lapack} -%patch0001 -p1 -b .system_lapack +%patch -P0001 -p1 -b .system_lapack %endif -%patch0002 -p1 -b .libname -%patch0003 -p1 -b .tests -%patch0004 -p1 -b .Add-opt-for-LoongArch64 +%patch -P0002 -p1 -b .libname +%patch -P0003 -p1 -b .tests find -name \*.f -exec chmod 644 {} \; @@ -527,6 +525,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Sun Feb 04 2024 Funda Wang - 0.3.26-1 +- New version 0.3.26 + * Wed Nov 8 2023 XiWei Gu - 0.3.23-3 - Enable DYNAMIC_ARCH for LoongArch64