diff --git a/0001-Sw64-Port-add-configure-support-for-sw64.patch b/0001-Sw64-Port-add-configure-support-for-sw64.patch new file mode 100644 index 0000000000000000000000000000000000000000..c553c4b11ee866199737520b73829c31db2299cc --- /dev/null +++ b/0001-Sw64-Port-add-configure-support-for-sw64.patch @@ -0,0 +1,592 @@ +From 8e8cbe70fbf759e3a4ebaa036ce782953507453b Mon Sep 17 00:00:00 2001 +From: swcompiler +Date: Tue, 22 Jul 2025 13:44:17 +0800 +Subject: [PATCH 1/3] Sw64-Port-add-configure-support-for-sw64 + +--- + config.guess | 36 ++++++++++++++++ + config.in | 4 ++ + configfsf.guess | 11 +++++ + configfsf.sub | 1 + + configure | 99 +++++++++++++++++++++++++++++++++++++++++- + configure.ac | 102 +++++++++++++++++++++++++++++++++++++++++++- + extract-dbl.c | 2 +- + gmp-impl.h | 9 +++- + longlong.h | 90 ++++++++++++++++++++++++++++++++++++++ + mpn/Makefile.am | 2 +- + mpn/Makefile.in | 2 +- + mpn/generic/get_d.c | 2 +- + tune/Makefile.am | 2 +- + tune/Makefile.in | 2 +- + 14 files changed, 354 insertions(+), 10 deletions(-) + +diff --git a/config.guess b/config.guess +index 6a7f141..6299029 100755 +--- a/config.guess ++++ b/config.guess +@@ -784,6 +784,42 @@ sparc-*-* | sparc64-*-*) + fi + ;; + ++sw_64-*-*) ++ eval $set_cc_for_build ++ cat <${dummy}0.s ++ .data ++Lformat: ++ .byte 37,100,45,37,120,10,0 # "%d-%x\n" ++ .text ++ .globl main ++ .align 4 ++ .ent main ++main: ++ .frame \$30,16,\$26,0 ++ ldgp \$29,0(\$27) ++ .prologue 1 ++ .long 0x47e03d91 # implver \$17 ++ ldi \$2,-1 ++ .long 0x47e20c21 # amask \$2,\$1 ++ ldi \$16,Lformat ++ not \$1,\$18 ++ call \$26,printf ++ ldgp \$29,0(\$26) ++ mov 0,\$16 ++ call \$26,exit ++ .end main ++EOF ++ $CC_FOR_BUILD ${dummy}0.s -o $dummy 2>/dev/null ++ if test "$?" = 0 ; then ++ case `$dummy` in ++ 0-0) exact_cpu=sw_64 ;; ++ 1-0) exact_cpu=sw_64sw6 ;; ++ 3-0) exact_cpu=sw_64sw8a ;; ++ 2-307) exact_cpu=sw_64sw6a ;; ++ 2-1307) exact_cpu=sw_64sw6b ;; ++ esac ++ fi ++ ;; + + # Recognise x86 processors using a tricky cpuid with 4 arguments, repeating + # arguments; for x86-64 we effectively pass the 1st in rdx and the 2nd in rcx. +diff --git a/config.in b/config.in +index ee1ef8c..076bed0 100644 +--- a/config.in ++++ b/config.in +@@ -146,6 +146,7 @@ see https://www.gnu.org/licenses/. + If your CPU is not in any of these families, leave all undefined. + For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */ + #undef HAVE_HOST_CPU_FAMILY_alpha ++#undef HAVE_HOST_CPU_FAMILY_sw_64 + #undef HAVE_HOST_CPU_FAMILY_m68k + #undef HAVE_HOST_CPU_FAMILY_power + #undef HAVE_HOST_CPU_FAMILY_powerpc +@@ -157,6 +158,9 @@ see https://www.gnu.org/licenses/. + #undef HAVE_HOST_CPU_alphaev67 + #undef HAVE_HOST_CPU_alphaev68 + #undef HAVE_HOST_CPU_alphaev7 ++#undef HAVE_HOST_CPU_sw_64sw6a ++#undef HAVE_HOST_CPU_sw_64sw6b ++#undef HAVE_HOST_CPU_sw_64sw8a + #undef HAVE_HOST_CPU_m68020 + #undef HAVE_HOST_CPU_m68030 + #undef HAVE_HOST_CPU_m68040 +diff --git a/configfsf.guess b/configfsf.guess +index 354a8cc..7d31475 100644 +--- a/configfsf.guess ++++ b/configfsf.guess +@@ -1149,6 +1149,17 @@ EOF + sparc:Linux:*:* | sparc64:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; ++ sw_64:Linux:*:*) ++ case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' /proc/cpuinfo 2>/dev/null` in ++ SW6) UNAME_MACHINE=sw_64sw6 ;; ++ SW6A) UNAME_MACHINE=sw_64sw6a ;; ++ SW6B) UNAME_MACHINE=sw_64sw6b ;; ++ SW8A) UNAME_MACHINE=sw_64sw8a ;; ++ esac ++ objdump --private-headers /bin/sh | grep -q ld.so.1 ++ if test "$?" 
= 0 ; then LIBC=gnulibc1 ; fi ++ echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" ++ exit ;; + tile*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; +diff --git a/configfsf.sub b/configfsf.sub +index 9865d6e..06ecbea 100644 +--- a/configfsf.sub ++++ b/configfsf.sub +@@ -1268,6 +1268,7 @@ case $cpu-$vendor in + | sparclite \ + | sparcv8 | sparcv9 | sparcv9b | sparcv9v | sv1 | sx* \ + | spu \ ++ | sw_64 | sw_64sw6 | sw_64sw6a | sw_64sw6b | sw_64sw8a \ + | tahoe \ + | thumbv7* \ + | tic30 | tic4x | tic54x | tic55x | tic6x | tic80 \ +diff --git a/configure b/configure +index 7910aa0..c943538 100755 +--- a/configure ++++ b/configure +@@ -4235,6 +4235,103 @@ echo "include_mpn(\`alpha/default.m4')" >> $gmp_tmpconfigm4i + esac + ;; + ++#__sw_64__ start ++ sw_64*-*-*) ++ $as_echo "#define HAVE_HOST_CPU_FAMILY_sw_64 1" >>confdefs.h ++ ++ case $host_cpu in ++ sw_64sw6a | sw_64sw6b | sw_64sw8a) ++ path="sw_64/sw6a sw_64/sw6b sw_64/sw8a sw_64" ;; ++ sw_64sw6) ++ path="sw_64/sw6 sw_64" ;; ++ *) ++ path="sw_64" ;; ++ esac ++ if test "$enable_assembly" = "yes" ; then ++ extra_functions="cntlz" ++ fi ++ gcc_cflags_optlist="asm cpu oldas" # need asm ahead of cpu, see below ++ gcc_cflags_maybe="-mieee" ++ gcc_cflags_oldas="-Wa,-oldas" # see GMP_GCC_WA_OLDAS. ++ ++ case $host_cpu in ++ sw_64) gcc_cflags_cpu="-mcpu=sw6" ;; ++ sw_64sw8a) gcc_cflags_cpu="-mcpu=sw8a" ;; ++ sw_64sw6) gcc_cflags_cpu="-mcpu=sw6 -mcpu=sw6a" ;; ++ sw_64sw6b | sw_64sw6a) ++ gcc_cflags_cpu="-mcpu=sw6a -mcpu=sw6b" ;; ++ esac ++ ++# gcc version "2.9-gnupro-99r1" on sw_64-dec-osf5.1 has been seen ++ # not putting the assembler in the right mode ++ # for what it produces. We need to do this for it, and need to do it ++ # before testing the -mcpu options. ++ # ++ # OSF `as' accepts sw_64. -arch only seems ++ # to affect insns like ldbu which are expanded as macros when necessary. ++ # Insns like ctlz which were never available as macros are always ++ # accepted and always generate their plain code. ++ # ++ case $host_cpu in ++ sw_64) gcc_cflags_asm="-Wa,-arch,sw6 -Wa,-msw6" ;; ++ sw_64sw6) gcc_cflags_asm="-Wa,-arch,sw6 -Wa,-msw6" ;; ++ sw_64sw8a) gcc_cflags_asm="-Wa,-msw8a -Wa,-arch,sw8a" ;; ++ sw_64sw6a | sw_64sw6b) ++ gcc_cflags_asm=" -Wa,-msw6a -Wa,-arch,sw6b" ;; ++ esac ++ ++ # It might be better to ask "cc" whether it's Cray C or DEC C, ++ # instead of relying on the OS part of $host. But it's hard to ++ # imagine either of those compilers anywhere except their native ++ # systems. ++ # ++ ++echo "include_mpn(\`sw_64/sw_64-defs.m4')" >> $gmp_tmpconfigm4i ++ ++ case $host in ++ *-cray-unicos*) ++ cc_cflags="-O" # no -g, it silently disables all optimizations ++ ++echo "include_mpn(\`sw_64/unicos.m4')" >> $gmp_tmpconfigm4i ++ ++ # Don't perform any assembly syntax tests on this beast. 
++ gmp_asm_syntax_testing=no ++ ;; ++ *-*-osf*) ++ ++echo "include_mpn(\`sw_64/default.m4')" >> $gmp_tmpconfigm4i ++ ++ cc_cflags="" ++ cc_cflags_optlist="opt cpu" ++ ++ # not sure if -fast works on old versions, so make it optional ++ cc_cflags_opt="-fast -O2" ++ ++ case $host_cpu in ++ sw_64) cc_cflags_cpu="-arch~sw6~-tune~sw6" ;; ++ sw_64sw6) cc_cflags_cpu="-arch~sw6~-tune~sw6" ;; ++ sw_64sw8a) cc_cflags_cpu="-arch~sw8a~-tune~sw8a" ;; ++ sw_64sw6a | sw_64sw6b) ++ cc_cflags_cpu="-arch~sw6a~-tune~sw6a -arch~sw6b~-tune~sw6b" ;; ++ esac ++ ;; ++ *) ++ ++echo "include_mpn(\`sw_64/default.m4')" >> $gmp_tmpconfigm4i ++ ++ ;; ++ esac ++ ++ case $host in ++ *-*-unicos*) ++ # tune/sw_64.asm assumes int==4bytes but unicos uses int==8bytes ++ ;; ++ *) ++ SPEED_CYCLECOUNTER_OBJ=sw_64.lo ++ cyclecounter_size=1 ;; ++ esac ++ ;; ++#__sw_64__ end + + # Cray vector machines. + # This must come after alpha* so that we can recognize present and future +@@ -7515,7 +7612,7 @@ fi + ;; + -Wa,-m*) + case $host in +- alpha*-*-*) ++ alpha*-*-* | sw_64*-*-*) + { $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler $cc $cflags $flag" >&5 + $as_echo_n "checking assembler $cc $cflags $flag... " >&6; } + result=no +diff --git a/configure.ac b/configure.ac +index 082077b..4de2e25 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -296,6 +296,7 @@ AH_VERBATIM([HAVE_HOST_CPU_1], + #undef HAVE_HOST_CPU_FAMILY_m68k + #undef HAVE_HOST_CPU_FAMILY_power + #undef HAVE_HOST_CPU_FAMILY_powerpc ++#undef HAVE_HOST_CPU_FAMILY_sw_64 + #undef HAVE_HOST_CPU_FAMILY_x86 + #undef HAVE_HOST_CPU_FAMILY_x86_64 + +@@ -351,6 +352,10 @@ AH_VERBATIM([HAVE_HOST_CPU_1], + #undef HAVE_HOST_CPU_s390_z13 + #undef HAVE_HOST_CPU_s390_z14 + #undef HAVE_HOST_CPU_s390_z15 ++#undef HAVE_HOST_CPU_sw_64sw6a ++#undef HAVE_HOST_CPU_sw_64sw6b ++#undef HAVE_HOST_CPU_sw_64sw8a ++#undef HAVE_HOST_CPU_sw_64sw6 + + /* Define to 1 iff we have a s390 with 64-bit registers. */ + #undef HAVE_HOST_CPU_s390_zarch]) +@@ -615,6 +620,101 @@ case $host in + path="cray" + ;; + ++#__sw_64__ start ++ sw_64*-*-*) ++ AC_DEFINE(HAVE_HOST_CPU_FAMILY_sw_64) ++ case $host_cpu in ++ sw_64sw6a | sw_64sw6b | sw_64sw8a | sw_64sw6*) ++ path="sw_64/sw6a sw_64/sw6b sw_64/sw8a sw_64" ;; ++ sw_64sw6) ++ path="sw_64/sw6 sw_64" ;; ++ *) ++ path="sw_64" ;; ++ esac ++ if test "$enable_assembly" = "yes" ; then ++ extra_functions="cntlz" ++ fi ++ gcc_cflags_optlist="asm cpu oldas" # need asm ahead of cpu, see below ++ gcc_cflags_maybe="-mieee" ++ gcc_cflags_oldas="-Wa,-oldas" # see GMP_GCC_WA_OLDAS. ++ ++ # compiler errors too easily and is rejected by GMP_PROG_CC_WORKS. Each ++ # -mcpu=sw6 below has a fallback to -mcpu=sw_64 for this reason. ++ # ++ case $host_cpu in ++ sw_64) gcc_cflags_cpu="-mcpu=sw6" ;; ++ sw_64sw6) gcc_cflags_cpu="-mcpu=sw6 " ;; ++ sw_64sw8a) gcc_cflags_cpu="-mcpu=sw8a" ;; ++ sw_64sw6a | sw_64sw6b) ++ gcc_cflags_cpu="-mcpu=sw6a -mcpu=sw6b" ;; ++ esac ++ ++# gcc version "2.9-gnupro-99r1" on sw_64 has been seen ++ # accepting -mcpu=sw6, but not putting the assembler in the right mode ++ # for what it produces. We need to do this for it, and need to do it ++ # before testing the -mcpu options. ++ # ++ # On old versions of gcc, which don't know -mcpu=, we believe an ++ # explicit etc will be necessary to put the assembler in ++ # the right mode for our .asm files and longlong.h asm blocks. ++ # ++ # On newer versions of gcc, when -mcpu= is known, we must give a -Wa ++ # which is at least as high as the code gcc will generate. 
gcc ++ # establishes what it needs with a ".arch" directive, our command line ++ # option seems to override that. ++ # ++ case $host_cpu in ++ sw_64) gcc_cflags_asm="-Wa,-arch,sw6 -Wa,-msw6" ;; ++ sw_64sw6) gcc_cflags_asm="-Wa,-arch,sw6 -Wa,-msw6" ;; ++ sw_64sw8a) gcc_cflags_asm="-Wa,-arch,sw8a -Wa,-msw8a" ;; ++ sw_64sw6a | sw_64sw6b) ++ gcc_cflags_asm="-Wa,-arch,sw6a -Wa,-msw6a -Wa,-arch,sw6b -Wa,-msw6b" ;; ++ esac ++ ++ # It might be better to ask "cc" whether it's Cray C or DEC C, ++ # instead of relying on the OS part of $host. But it's hard to ++ # imagine either of those compilers anywhere except their native ++ # systems. ++ # ++ GMP_INCLUDE_MPN(sw_64/sw_64-defs.m4) ++ case $host in ++ *-cray-unicos*) ++ cc_cflags="-O" # no -g, it silently disables all optimizations ++ GMP_INCLUDE_MPN(sw_64/unicos.m4) ++ # Don't perform any assembly syntax tests on this beast. ++ gmp_asm_syntax_testing=no ++ ;; ++ *-*-osf*) ++ GMP_INCLUDE_MPN(sw_64/default.m4) ++ cc_cflags="" ++ cc_cflags_optlist="opt cpu" ++ ++ # not sure if -fast works on old versions, so make it optional ++ cc_cflags_opt="-fast -O2" ++ ++ case $host_cpu in ++ sw_64) cc_cflags_cpu="-arch~sw6~-tune~sw6" ;; ++ sw_64sw6) cc_cflags_cpu="-arch~sw6~-tune~sw6" ;; ++ sw_64sw8a) cc_cflags_cpu="-arch~sw8a~-tune~sw8a" ;; ++ sw_64sw6a | sw_64sw6b) ++ cc_cflags_cpu="-arch~sw6a~-tune~sw6a -arch~sw6b~-tune~sw6b" ;; ++ esac ++ ;; ++ *) ++ GMP_INCLUDE_MPN(sw_64/default.m4) ++ ;; ++ esac ++ ++ case $host in ++ *-*-unicos*) ++ # tune/sw_64.asm assumes int==4bytes but unicos uses int==8bytes ++ ;; ++ *) ++ SPEED_CYCLECOUNTER_OBJ=sw_64.lo ++ cyclecounter_size=1 ;; ++ esac ++ ;; ++#__sw_64__ end + + arm*-*-* | aarch64*-*-* | [applem[1-9]-*-*]) + abilist="32" +@@ -2428,7 +2528,7 @@ if test $found_compiler = yes; then + ;; + -Wa,-m*) + case $host in +- alpha*-*-*) ++ alpha*-*-* | sw_64*-*-*) + GMP_GCC_WA_MCPU($cc $cflags, $flag, , [continue]) + ;; + esac +diff --git a/extract-dbl.c b/extract-dbl.c +index e44d6fa..434a7af 100644 +--- a/extract-dbl.c ++++ b/extract-dbl.c +@@ -71,7 +71,7 @@ __gmp_extract_double (mp_ptr rp, double d) + + #if _GMP_IEEE_FLOATS + { +-#if defined (__alpha) && __GNUC__ == 2 && __GNUC_MINOR__ == 8 ++#if (defined (__alpha) || defined (__sw_64)) && __GNUC__ == 2 && __GNUC_MINOR__ == 8 + /* Work around alpha-specific bug in GCC 2.8.x. */ + volatile + #endif +diff --git a/gmp-impl.h b/gmp-impl.h +index 2615af7..6e6682d 100644 +--- a/gmp-impl.h ++++ b/gmp-impl.h +@@ -320,6 +320,11 @@ typedef struct {mp_limb_t inv21, inv32, inv53;} gmp_pi2_t; + #define HAVE_HOST_CPU_alpha_CIX 1 + #endif + ++#if HAVE_HOST_CPU_sw_64sw6a || HAVE_HOST_CPU_sw_64sw6b || HAVE_HOST_CPU_sw_64sw8a \ ++ || HAVE_HOST_CPU_sw_64sw6 || HAVE_HOST_CPU_sw_64 ++#define HAVE_HOST_CPU_sw_64_CIX 1 ++#endif ++ + + #if defined (__cplusplus) + extern "C" { +@@ -3444,7 +3449,7 @@ __GMP_DECLSPEC extern const unsigned char binvert_limb_table[128]; + to 0 if there's an even number. "n" should be an unsigned long and "p" + an int. */ + +-#if defined (__GNUC__) && ! defined (NO_ASM) && HAVE_HOST_CPU_alpha_CIX ++#if defined (__GNUC__) && ! defined (NO_ASM) && (HAVE_HOST_CPU_alpha_CIX || HAVE_HOST_CPU_sw_64_CIX) + #define ULONG_PARITY(p, n) \ + do { \ + int __p; \ +@@ -3726,7 +3731,7 @@ __GMP_DECLSPEC extern const unsigned char binvert_limb_table[128]; + #endif + #endif + +-#if defined (__GNUC__) && ! defined (NO_ASM) && HAVE_HOST_CPU_alpha_CIX ++#if defined (__GNUC__) && ! 
defined (NO_ASM) && (HAVE_HOST_CPU_alpha_CIX || HAVE_HOST_CPU_sw_64_CIX) + #define popc_limb(result, input) \ + do { \ + __asm__ ("ctpop %1, %0" : "=r" (result) : "r" (input)); \ +diff --git a/longlong.h b/longlong.h +index be1c3cb..3287f0e 100644 +--- a/longlong.h ++++ b/longlong.h +@@ -270,6 +270,96 @@ long __MPN(count_leading_zeros) (UDItype); + #endif /* clz using mpn */ + #endif /* __alpha */ + ++//__sw_64 start ++#if defined (__sw_64) && W_TYPE_SIZE == 64 ++/* Most sw_64-based machines, except Cray systems. */ ++#if defined (__GNUC__) ++#if __GMP_GNUC_PREREQ (3,3) ++#define umul_ppmm(ph, pl, m0, m1) \ ++ do { \ ++ UDItype __m0 = (m0), __m1 = (m1); \ ++ (ph) = __builtin_sw_64_umulh (__m0, __m1); \ ++ (pl) = __m0 * __m1; \ ++ } while (0) ++#else ++#define umul_ppmm(ph, pl, m0, m1) \ ++ do { \ ++ UDItype __m0 = (m0), __m1 = (m1); \ ++ __asm__ ("umulh %r1,%2,%0" \ ++ : "=r" (ph) \ ++ : "%rJ" (__m0), "rI" (__m1)); \ ++ (pl) = __m0 * __m1; \ ++ } while (0) ++#endif ++#define UMUL_TIME 18 ++#else /* ! __GNUC__ */ ++#include ++#define umul_ppmm(ph, pl, m0, m1) \ ++ do { \ ++ UDItype __m0 = (m0), __m1 = (m1); \ ++ (ph) = __UMULH (__m0, __m1); \ ++ (pl) = __m0 * __m1; \ ++ } while (0) ++#endif ++#ifndef LONGLONG_STANDALONE ++#define udiv_qrnnd(q, r, n1, n0, d) \ ++ do { UWtype __di; \ ++ __di = __MPN(invert_limb) (d); \ ++ udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ ++ } while (0) ++#define UDIV_PREINV_ALWAYS 1 ++#define UDIV_NEEDS_NORMALIZATION 1 ++#endif /* LONGLONG_STANDALONE */ ++ ++/* clz_tab is required in all configurations, since mpn/sw_64/cntlz.asm ++ always goes into libgmp.so, even when not actually used. */ ++#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB ++ ++#if defined (__GNUC__) && HAVE_HOST_CPU_sw_64_CIX ++#define count_leading_zeros(COUNT,X) \ ++ __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X)) ++#define count_trailing_zeros(COUNT,X) \ ++ __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X)) ++#endif /* clz/ctz using cix */ ++ ++#if ! defined (count_leading_zeros) \ ++ && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE) ++/* SW_64_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0. ++ "$31" is written explicitly in the asm, since an "r" constraint won't ++ select reg 31. There seems no need to worry about "r31" syntax for cray, ++ since gcc itself (pre-release 3.4) emits just $31 in various places. */ ++#define SW_64_CMPBGE_0(dst, src) \ ++ do { asm ("cmpgeb $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0) ++/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts ++ them, locating the highest non-zero byte. A second __clz_tab lookup ++ counts the leading zero bits in that byte, giving the result. */ ++#define count_leading_zeros(count, x) \ ++ do { \ ++ UWtype __clz__b, __clz__c, __clz__x = (x); \ ++ SW_64_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \ ++ __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \ ++ __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \ ++ __clz__x >>= __clz__b; \ ++ __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \ ++ __clz__b = 65 - __clz__b; \ ++ (count) = __clz__b - __clz__c; \ ++ } while (0) ++#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB ++#endif /* clz using cmpbge */ ++ ++#if ! defined (count_leading_zeros) && ! 
defined (LONGLONG_STANDALONE) ++#if HAVE_ATTRIBUTE_CONST ++long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const)); ++#else ++long __MPN(count_leading_zeros) (UDItype); ++#endif ++#define count_leading_zeros(count, x) \ ++ ((count) = __MPN(count_leading_zeros) (x)) ++#endif /* clz using mpn */ ++#endif ++//__sw_64 end ++ ++ + #if defined (__AVR) && W_TYPE_SIZE == 8 + #define umul_ppmm(ph, pl, m0, m1) \ + do { \ +diff --git a/mpn/Makefile.am b/mpn/Makefile.am +index c61926d..acca2d7 100644 +--- a/mpn/Makefile.am ++++ b/mpn/Makefile.am +@@ -41,7 +41,7 @@ libmpn_la_DEPENDENCIES = $(OFILES) + + TARG_DIST = alpha arm arm64 cray generic ia64 lisp loongarch m68k m88k \ + minithres mips32 mips64 pa32 pa64 power powerpc32 powerpc64 \ +- riscv s390_32 s390_64 sh sparc32 sparc64 thumb vax x86 x86_64 ++ riscv s390_32 s390_64 sh sparc32 sparc64 sw_64 thumb vax x86 x86_64 + + EXTRA_DIST = asm-defs.m4 cpp-ccas m4-ccas $(TARG_DIST) + +diff --git a/mpn/Makefile.in b/mpn/Makefile.in +index b5df4e5..6641d1f 100644 +--- a/mpn/Makefile.in ++++ b/mpn/Makefile.in +@@ -395,7 +395,7 @@ libmpn_la_LIBADD = $(OFILES) + libmpn_la_DEPENDENCIES = $(OFILES) + TARG_DIST = alpha arm arm64 cray generic ia64 lisp loongarch m68k m88k \ + minithres mips32 mips64 pa32 pa64 power powerpc32 powerpc64 \ +- riscv s390_32 s390_64 sh sparc32 sparc64 thumb vax x86 x86_64 ++ riscv s390_32 s390_64 sh sparc32 sparc64 sw_64 thumb vax x86 x86_64 + + EXTRA_DIST = asm-defs.m4 cpp-ccas m4-ccas $(TARG_DIST) + +diff --git a/mpn/generic/get_d.c b/mpn/generic/get_d.c +index 8bef128..10f0dee 100644 +--- a/mpn/generic/get_d.c ++++ b/mpn/generic/get_d.c +@@ -61,7 +61,7 @@ see https://www.gnu.org/licenses/. */ + Bizarrely, this happens also with Cray cc on alphaev5-cray-unicosmk2.0.6.X, + and has the same solution. Don't know why or how. */ + +-#if HAVE_HOST_CPU_FAMILY_alpha \ ++#if HAVE_HOST_CPU_FAMILY_alpha || HAVE_HOST_CPU_FAMILY_sw_64 \ + && ((defined (__GNUC__) && ! 
__GMP_GNUC_PREREQ(3,4)) \ + || defined (_CRAY)) + static volatile const long CONST_1024 = 1024; +diff --git a/tune/Makefile.am b/tune/Makefile.am +index 0f564ed..d2852af 100644 +--- a/tune/Makefile.am ++++ b/tune/Makefile.am +@@ -33,7 +33,7 @@ AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/tests + AM_LDFLAGS = -no-install + + EXTRA_DIST = alpha.asm pentium.asm sparcv9.asm hppa.asm hppa2.asm hppa2w.asm \ +- ia64.asm powerpc.asm powerpc64.asm x86_64.asm many.pl ++ ia64.asm powerpc.asm powerpc64.asm sw_64.asm x86_64.asm many.pl + noinst_HEADERS = speed.h + + # Prefer -static on the speed and tune programs, since that can avoid +diff --git a/tune/Makefile.in b/tune/Makefile.in +index 7db531a..7e24a39 100644 +--- a/tune/Makefile.in ++++ b/tune/Makefile.in +@@ -460,7 +460,7 @@ top_srcdir = @top_srcdir@ + AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/tests + AM_LDFLAGS = -no-install + EXTRA_DIST = alpha.asm pentium.asm sparcv9.asm hppa.asm hppa2.asm hppa2w.asm \ +- ia64.asm powerpc.asm powerpc64.asm x86_64.asm many.pl ++ ia64.asm powerpc.asm powerpc64.asm sw_64.asm x86_64.asm many.pl + + noinst_HEADERS = speed.h + @ENABLE_STATIC_FALSE@STATIC = +-- +2.25.1 + diff --git a/0002-Sw64-Port-add-mpn-configure-support-for-sw64.patch b/0002-Sw64-Port-add-mpn-configure-support-for-sw64.patch new file mode 100644 index 0000000000000000000000000000000000000000..f5918661caae97f4fa4f2a0439b137f5b946aff1 --- /dev/null +++ b/0002-Sw64-Port-add-mpn-configure-support-for-sw64.patch @@ -0,0 +1,6399 @@ +From 51d4c7e1bf74232eac185b1e3465457397a1fe30 Mon Sep 17 00:00:00 2001 +From: swcompiler +Date: Tue, 22 Jul 2025 13:50:16 +0800 +Subject: [PATCH 2/3] Sw64-Port-add-mpn-configure-support-for-sw64 + +--- + mpn/sw_64/README | 97 ++++++ + mpn/sw_64/default.m4 | 127 ++++++++ + mpn/sw_64/gmp-mparam.h | 86 ++++++ + mpn/sw_64/sw6/add_n.asm | 281 +++++++++++++++++ + mpn/sw_64/sw6/aorslsh1_n.asm | 168 +++++++++++ + mpn/sw_64/sw6/aorsmul_1.asm | 396 ++++++++++++++++++++++++ + mpn/sw_64/sw6/gmp-mparam.h | 209 +++++++++++++ + mpn/sw_64/sw6/mod_1_4.asm | 333 +++++++++++++++++++++ + mpn/sw_64/sw6/mul_1.asm | 496 +++++++++++++++++++++++++++++++ + mpn/sw_64/sw6/nails/README | 65 ++++ + mpn/sw_64/sw6/nails/addmul_1.asm | 394 ++++++++++++++++++++++++ + mpn/sw_64/sw6/nails/addmul_2.asm | 146 +++++++++ + mpn/sw_64/sw6/nails/addmul_3.asm | 169 +++++++++++ + mpn/sw_64/sw6/nails/addmul_4.asm | 210 +++++++++++++ + mpn/sw_64/sw6/nails/aors_n.asm | 233 +++++++++++++++ + mpn/sw_64/sw6/nails/gmp-mparam.h | 72 +++++ + mpn/sw_64/sw6/nails/mul_1.asm | 362 ++++++++++++++++++++++ + mpn/sw_64/sw6/nails/submul_1.asm | 394 ++++++++++++++++++++++++ + mpn/sw_64/sw6/slot.pl | 318 ++++++++++++++++++++ + mpn/sw_64/sw6/sub_n.asm | 281 +++++++++++++++++ + mpn/sw_64/sw6a/gcd_1.asm | 145 +++++++++ + mpn/sw_64/sw6a/hamdist.asm | 111 +++++++ + mpn/sw_64/sw6a/popcount.asm | 101 +++++++ + mpn/sw_64/sw6b/gcd_1.asm | 145 +++++++++ + mpn/sw_64/sw6b/hamdist.asm | 111 +++++++ + mpn/sw_64/sw6b/popcount.asm | 101 +++++++ + mpn/sw_64/sw8a/gcd_1.asm | 145 +++++++++ + mpn/sw_64/sw8a/hamdist.asm | 111 +++++++ + mpn/sw_64/sw8a/popcount.asm | 101 +++++++ + mpn/sw_64/sw_64-defs.m4 | 101 +++++++ + mpn/sw_64/unicos.m4 | 131 ++++++++ + 31 files changed, 6140 insertions(+) + create mode 100644 mpn/sw_64/README + create mode 100644 mpn/sw_64/default.m4 + create mode 100644 mpn/sw_64/gmp-mparam.h + create mode 100644 mpn/sw_64/sw6/add_n.asm + create mode 100644 mpn/sw_64/sw6/aorslsh1_n.asm + create mode 100644 mpn/sw_64/sw6/aorsmul_1.asm + create mode 100644 mpn/sw_64/sw6/gmp-mparam.h + 
create mode 100644 mpn/sw_64/sw6/mod_1_4.asm + create mode 100644 mpn/sw_64/sw6/mul_1.asm + create mode 100644 mpn/sw_64/sw6/nails/README + create mode 100644 mpn/sw_64/sw6/nails/addmul_1.asm + create mode 100644 mpn/sw_64/sw6/nails/addmul_2.asm + create mode 100644 mpn/sw_64/sw6/nails/addmul_3.asm + create mode 100644 mpn/sw_64/sw6/nails/addmul_4.asm + create mode 100644 mpn/sw_64/sw6/nails/aors_n.asm + create mode 100644 mpn/sw_64/sw6/nails/gmp-mparam.h + create mode 100644 mpn/sw_64/sw6/nails/mul_1.asm + create mode 100644 mpn/sw_64/sw6/nails/submul_1.asm + create mode 100755 mpn/sw_64/sw6/slot.pl + create mode 100644 mpn/sw_64/sw6/sub_n.asm + create mode 100644 mpn/sw_64/sw6a/gcd_1.asm + create mode 100644 mpn/sw_64/sw6a/hamdist.asm + create mode 100644 mpn/sw_64/sw6a/popcount.asm + create mode 100644 mpn/sw_64/sw6b/gcd_1.asm + create mode 100644 mpn/sw_64/sw6b/hamdist.asm + create mode 100644 mpn/sw_64/sw6b/popcount.asm + create mode 100644 mpn/sw_64/sw8a/gcd_1.asm + create mode 100644 mpn/sw_64/sw8a/hamdist.asm + create mode 100644 mpn/sw_64/sw8a/popcount.asm + create mode 100644 mpn/sw_64/sw_64-defs.m4 + create mode 100644 mpn/sw_64/unicos.m4 + +diff --git a/mpn/sw_64/README b/mpn/sw_64/README +new file mode 100644 +index 0000000..5557835 +--- /dev/null ++++ b/mpn/sw_64/README +@@ -0,0 +1,97 @@ ++Copyright 1996, 1997, 1999-2005 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. ++ ++ ++ ++ ++ ++This directory contains mpn functions optimized for DEC Sw_64 processors. ++ ++SW_64 ASSEMBLY RULES AND REGULATIONS ++ ++The `.prologue N' pseudo op marks the end of instruction that needs special ++handling by unwinding. It also says whether $27 is really needed for computing ++the gp. The `.mask M' pseudo op says which registers are saved on the stack, ++and at what offset in the frame. ++ ++Cray T3 code is very very different... ++ ++"$6" / "$f6" etc is the usual syntax for registers, but on Unicos instead "r6" ++/ "f6" is required. We use the "r6" / "f6" forms, and have m4 defines expand ++them to "$6" or "$f6" where necessary. ++ ++"0x" introduces a hex constant in gas and DEC as, but on Unicos "^X" is ++required. The X() macro accommodates this difference. ++ ++"cvttqc" is required by DEC as, "cvttq/c" is required by Unicos, and gas will ++accept either. We use cvttqc and have an m4 define expand to cvttq/c where ++necessary. ++ ++"not" as an alias for "ornot r31, ..." is available in gas and DEC as, but not ++the Unicos assembler. The full "ornot" must be used. ++ ++"unop" is not available in Unicos. 
We make an m4 define to the usual "ldl_u ++r31,0(r30)", and in fact use that define on all systems since it comes out the ++same. ++ ++"!literal!123" etc explicit relocations as per Tru64 4.0 are apparently not ++available in older sw_64 assemblers (including gas prior to 2.12), according to ++the GCC manual, so the assembler macro forms must be used (eg. ldgp). ++ ++ ++ ++RELEVANT OPTIMIZATION ISSUES ++ ++Here we have a really parallel pipeline, capable of issuing up to 4 integer ++instructions per cycle. In actual practice, it is never possible to sustain ++more than 3.5 integer insns/cycle due to rename register shortage. One integer ++multiply instruction can issue each cycle. To get optimal speed, we need to ++pretend we are vectorizing the code, i.e., minimize the depth of recurrences. ++ ++There are two dependencies to watch out for. 1) Address arithmetic ++dependencies, and 2) carry propagation dependencies. ++ ++We can avoid serializing due to address arithmetic by unrolling loops, so that ++addresses don't depend heavily on an index variable. Avoiding serializing ++because of carry propagation is trickier; the ultimate performance of the code ++will be determined of the number of latency cycles it takes from accepting ++carry-in to a vector point until we can generate carry-out. ++ ++Most integer instructions can execute in either the L0, U0, L1, or U1 ++pipelines. Shifts only execute in U0 and U1, and multiply only in U1. ++ ++CMOV instructions split into two internal instructions, CMOV1 and CMOV2. CMOV ++split the mapping process (see pg 2-26 in cmpwrgd.pdf), suggesting the CMOV ++should always be placed as the last instruction of an aligned 4 instruction ++block, or perhaps simply avoided. ++ ++Perhaps the most important issue is the latency between the L0/U0 and L1/U1 ++clusters; a result obtained on either cluster has an extra cycle of latency for ++consumers in the opposite cluster. Because of the dynamic nature of the ++implementation, it is hard to predict where an instruction will execute. ++ +diff --git a/mpn/sw_64/default.m4 b/mpn/sw_64/default.m4 +new file mode 100644 +index 0000000..38c8bcb +--- /dev/null ++++ b/mpn/sw_64/default.m4 +@@ -0,0 +1,127 @@ ++divert(-1) ++ ++dnl m4 macros for sw_64 assembler (everywhere except unicos). ++ ++ ++dnl Copyright 2000, 2002-2004, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
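++
++dnl As a rough sketch of what these macros produce (ignoring GSYM_PREFIX
++dnl and ALIGN details) with the defreg/deflit register mapping defined
++dnl below: a hypothetical
++dnl
++dnl     PROLOGUE(mpn_foo)
++dnl             bis r31, r31, r0
++dnl     EPILOGUE()
++dnl
++dnl comes out of m4 approximately as
++dnl
++dnl             .text
++dnl             .globl mpn_foo
++dnl             .ent mpn_foo
++dnl     mpn_foo:
++dnl             .frame $30,0,$26,0
++dnl             .prologue 0
++dnl             bis $31, $31, $0
++dnl             .end mpn_foo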
++ ++ ++dnl Usage: ASM_START() ++define(`ASM_START', ++m4_assert_numargs(0) ++` .set noreorder ++ .set noat') ++ ++dnl Usage: X(value) ++define(`X', ++m4_assert_numargs(1) ++`0x$1') ++ ++dnl Usage: FLOAT64(label,value) ++define(`FLOAT64', ++m4_assert_numargs(2) ++` .align 3 ++$1: .t_floating $2') ++ ++ ++dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,gp|noalign]) ++dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) ++ ++define(`PROLOGUE_cpu', ++m4_assert_numargs_range(1,2) ++`ifelse(`$2',gp,, ++`ifelse(`$2',noalign,, ++`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter ++')')')')dnl ++ .text ++ifelse(`$2',noalign,,` ALIGN(16)') ++ .globl $1 ++ .ent $1 ++$1: ++ .frame r30,0,r26,0 ++ifelse(`$2',gp,` ldgp r29, 0(r27) ++`$'$1..ng:') ++ .prologue ifelse(`$2',gp,1,0)') ++ ++define(`EPILOGUE_cpu', ++m4_assert_numargs(1) ++` .end $1') ++ ++ ++dnl Usage: LDGP(dst,src) ++dnl ++dnl Emit an "ldgp dst,src", but only if the system uses a GOT. ++ ++define(LDGP, ++m4_assert_numargs(2) ++`ldgp `$1', `$2'') ++ ++ ++dnl Usage: EXTERN(variable_name) ++define(`EXTERN', ++m4_assert_numargs(1) ++) ++ ++dnl Usage: r0 ... r31 ++dnl f0 ... f31 ++dnl ++dnl Map register names r0 to $0, and f0 to $f0, etc. ++dnl This is needed on all systems but Unicos ++dnl ++dnl defreg() is used to protect the $ in $0 (otherwise it would represent a ++dnl macro argument). Double quoting is used to protect the f0 in $f0 ++dnl (otherwise it would be an infinite recursion). ++ ++forloop(i,0,31,`defreg(`r'i,$i)') ++forloop(i,0,31,`deflit(`f'i,``$f''i)') ++ ++ ++dnl Usage: DATASTART(name,align) or DATASTART(name) ++dnl DATAEND() ++ ++define(`DATASTART', ++m4_assert_numargs_range(1,2) ++` RODATA ++ ALIGN(ifelse($#,1,2,$2)) ++$1:') ++define(`DATAEND', ++m4_assert_numargs(0) ++) ++ ++dnl Load a symbolic address into a register ++define(`LEA', ++m4_assert_numargs(2) ++`ldi $1, $2') ++ ++dnl Usage: ASM_END() ++define(`ASM_END', ++m4_assert_numargs(0) ++) ++ ++divert +diff --git a/mpn/sw_64/gmp-mparam.h b/mpn/sw_64/gmp-mparam.h +new file mode 100644 +index 0000000..bf51ad2 +--- /dev/null ++++ b/mpn/sw_64/gmp-mparam.h +@@ -0,0 +1,86 @@ ++/* Sw_64 gmp-mparam.h -- Compiler/machine parameter header file. ++ ++Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2009 Free Software ++Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. 
*/ ++ ++#define GMP_LIMB_BITS 64 ++#define GMP_LIMB_BYTES 8 ++ ++ ++/* 175MHz 21064 */ ++ ++/* Generated by tuneup.c, 2009-01-15, gcc 3.2 */ ++ ++#define MUL_TOOM22_THRESHOLD 12 ++#define MUL_TOOM33_THRESHOLD 69 ++#define MUL_TOOM44_THRESHOLD 88 ++ ++#define SQR_BASECASE_THRESHOLD 4 ++#define SQR_TOOM2_THRESHOLD 20 ++#define SQR_TOOM3_THRESHOLD 62 ++#define SQR_TOOM4_THRESHOLD 155 ++ ++#define MULLO_BASECASE_THRESHOLD 0 /* always */ ++#define MULLO_DC_THRESHOLD 40 ++#define MULLO_MUL_N_THRESHOLD 202 ++ ++#define DIV_SB_PREINV_THRESHOLD 0 /* preinv always */ ++#define DIV_DC_THRESHOLD 38 ++#define POWM_THRESHOLD 60 ++ ++#define MATRIX22_STRASSEN_THRESHOLD 17 ++#define HGCD_THRESHOLD 80 ++#define GCD_DC_THRESHOLD 237 ++#define GCDEXT_DC_THRESHOLD 198 ++#define JACOBI_BASE_METHOD 2 ++ ++#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ ++#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ ++#define MOD_1_NORM_THRESHOLD 0 /* always */ ++#define MOD_1_UNNORM_THRESHOLD 0 /* always */ ++#define MOD_1_1_THRESHOLD 2 ++#define MOD_1_2_THRESHOLD 9 ++#define MOD_1_4_THRESHOLD 20 ++#define USE_PREINV_DIVREM_1 1 /* preinv always */ ++#define USE_PREINV_MOD_1 1 /* preinv always */ ++#define DIVEXACT_1_THRESHOLD 0 /* always */ ++#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ ++ ++#define GET_STR_DC_THRESHOLD 20 ++#define GET_STR_PRECOMPUTE_THRESHOLD 37 ++#define SET_STR_DC_THRESHOLD 746 ++#define SET_STR_PRECOMPUTE_THRESHOLD 1332 ++ ++#define MUL_FFT_TABLE { 240, 480, 1344, 2304, 5120, 20480, 49152, 0 } ++#define MUL_FFT_MODF_THRESHOLD 232 ++#define MUL_FFT_THRESHOLD 1664 ++ ++#define SQR_FFT_TABLE { 240, 480, 1216, 2304, 5120, 12288, 49152, 0 } ++#define SQR_FFT_MODF_THRESHOLD 232 ++#define SQR_FFT_THRESHOLD 1408 +diff --git a/mpn/sw_64/sw6/add_n.asm b/mpn/sw_64/sw6/add_n.asm +new file mode 100644 +index 0000000..4f0062c +--- /dev/null ++++ b/mpn/sw_64/sw6/add_n.asm +@@ -0,0 +1,281 @@ ++dnl Sw_64 sw6 mpn_add_n -- Add two limb vectors of the same length > 0 and ++dnl store sum in a third limb vector. ++ ++dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 2.125 ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C vp r18 ++C n r19 ++C cy r20 (for mpn_add_nc) ++ ++C TODO ++C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1) ++C Use multi-pronged feed-in. ++C Perform additional micro-tuning ++ ++C This code was written in cooperation with sw6 pipeline expert Steve Root. 
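++
++C For reference, the per-limb operation being scheduled here is the one
++C spelled out by the simple $Loop0 tail below; in C-like notation
++C (variable names ours):
++C
++C     s1 = u + v;        c1 = (s1 < v);    /* carry from main add */
++C     s2 = s1 + cy;      c2 = (s2 < s1);   /* carry from carry add */
++C     cy = c1 | c2;      /* at most one of c1/c2 is set */
++C
++C The unrolled loop drops the second compare: a one-bit carry-in can wrap
++C the sum only to exactly zero, so a beq on the sum catches that case
++C (and the rare genuinely zero sum) off the fast path, at the $fix labels.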
++ ++C Pair loads and stores where possible ++C Store pairs oct-aligned where possible (didn't need it here) ++C Stores are delayed every third cycle ++C Loads and stores are delayed by fills ++C U stays still, put code there where possible (note alternation of U1 and U0) ++C L moves because of loads and stores ++C Note dampers in L to limit damage ++ ++C This odd-looking optimization expects that were having random bits in our ++C data, so that a pure zero result is unlikely. so we penalize the unlikely ++C case to help the common case. ++ ++define(`u0', `r0') define(`u1', `r3') ++define(`v0', `r1') define(`v1', `r4') ++ ++define(`cy0', `r20') define(`cy1', `r21') ++ ++MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc) ++ ++ASM_START() ++PROLOGUE(mpn_add_nc) ++ br r31, $entry ++EPILOGUE() ++PROLOGUE(mpn_add_n) ++ bis r31, r31, cy0 C clear carry in ++$entry: cmpult r19, 5, r22 C L1 move counter ++ ldl u1, 0(r17) C L0 get next ones ++ ldl v1, 0(r18) C L1 ++ bne r22, $Lsmall ++ ++ ldl u0, 8(r17) C L0 get next ones ++ ldl v0, 8(r18) C L1 ++ addl u1, v1, r5 C U0 add two data ++ ++ cmpult r5, v1, r23 C U0 did it carry ++ ldl u1, 16(r17) C L0 get next ones ++ ldl v1, 16(r18) C L1 ++ ++ addl u0, v0, r8 C U1 add two data ++ addl r5, cy0, r5 C U0 carry in ++ ++ cmpult r8, v0, r22 C U1 did it carry ++ beq r5, $fix5f C U0 fix exact zero ++$ret5f: ldl u0, 24(r17) C L0 get next ones ++ ldl v0, 24(r18) C L1 ++ ++ addl r8, r23, r8 C U1 carry from last ++ addl u1, v1, r7 C U0 add two data ++ ++ beq r8, $fix6f C U1 fix exact zero ++$ret6f: cmpult r7, v1, r23 C U0 did it carry ++ ldl u1, 32(r17) C L0 get next ones ++ ldl v1, 32(r18) C L1 ++ ++ ldi r17, 40(r17) C L0 move pointer ++ ldi r18, 40(r18) C L1 move pointer ++ ++ ldi r16, -8(r16) ++ ldi r19, -13(r19) C L1 move counter ++ blt r19, $Lend C U1 loop control ++ ++ ++C Main loop. 8-way unrolled. ++ ALIGN(16) ++$Loop: addl u0, v0, r2 C U1 add two data ++ addl r7, r22, r7 C U0 add in carry ++ stl r5, 8(r16) C L0 put an answer ++ stl r8, 16(r16) C L1 pair ++ ++ cmpult r2, v0, cy1 C U1 did it carry ++ beq r7, $fix7 C U0 fix exact 0 ++$ret7: ldl u0, 0(r17) C L0 get next ones ++ ldl v0, 0(r18) C L1 ++ ++ bis r31, r31, r31 C L damp out ++ addl r2, r23, r2 C U1 carry from last ++ bis r31, r31, r31 C L moves in L ! ++ addl u1, v1, r5 C U0 add two data ++ ++ beq r2, $fix0 C U1 fix exact zero ++$ret0: cmpult r5, v1, cy0 C U0 did it carry ++ ldl u1, 8(r17) C L0 get next ones ++ ldl v1, 8(r18) C L1 ++ ++ addl u0, v0, r8 C U1 add two data ++ addl r5, cy1, r5 C U0 carry from last ++ stl r7, 24(r16) C L0 store pair ++ stl r2, 32(r16) C L1 ++ ++ cmpult r8, v0, r22 C U1 did it carry ++ beq r5, $fix1 C U0 fix exact zero ++$ret1: ldl u0, 16(r17) C L0 get next ones ++ ldl v0, 16(r18) C L1 ++ ++ ldi r16, 64(r16) C L0 move pointer ++ addl r8, cy0, r8 C U1 carry from last ++ ldi r19, -8(r19) C L1 move counter ++ addl u1, v1, r7 C U0 add two data ++ ++ beq r8, $fix2 C U1 fix exact zero ++$ret2: cmpult r7, v1, r23 C U0 did it carry ++ ldl u1, 24(r17) C L0 get next ones ++ ldl v1, 24(r18) C L1 ++ ++ addl u0, v0, r2 C U1 add two data ++ addl r7, r22, r7 C U0 add in carry ++ stl r5, -24(r16) C L0 put an answer ++ stl r8, -16(r16) C L1 pair ++ ++ cmpult r2, v0, cy1 C U1 did it carry ++ beq r7, $fix3 C U0 fix exact 0 ++$ret3: ldl u0, 32(r17) C L0 get next ones ++ ldl v0, 32(r18) C L1 ++ ++ bis r31, r31, r31 C L damp out ++ addl r2, r23, r2 C U1 carry from last ++ bis r31, r31, r31 C L moves in L ! 
++ addl u1, v1, r5 C U0 add two data ++ ++ beq r2, $fix4 C U1 fix exact zero ++$ret4: cmpult r5, v1, cy0 C U0 did it carry ++ ldl u1, 40(r17) C L0 get next ones ++ ldl v1, 40(r18) C L1 ++ ++ addl u0, v0, r8 C U1 add two data ++ addl r5, cy1, r5 C U0 carry from last ++ stl r7, -8(r16) C L0 store pair ++ stl r2, 0(r16) C L1 ++ ++ cmpult r8, v0, r22 C U1 did it carry ++ beq r5, $fix5 C U0 fix exact zero ++$ret5: ldl u0, 48(r17) C L0 get next ones ++ ldl v0, 48(r18) C L1 ++ ++ ldl r31, 256(r17) C L0 prefetch ++ addl r8, cy0, r8 C U1 carry from last ++ ldl r31, 256(r18) C L1 prefetch ++ addl u1, v1, r7 C U0 add two data ++ ++ beq r8, $fix6 C U1 fix exact zero ++$ret6: cmpult r7, v1, r23 C U0 did it carry ++ ldl u1, 56(r17) C L0 get next ones ++ ldl v1, 56(r18) C L1 ++ ++ ldi r17, 64(r17) C L0 move pointer ++ bis r31, r31, r31 C U ++ ldi r18, 64(r18) C L1 move pointer ++ bge r19, $Loop C U1 loop control ++C ==== main loop end ++ ++$Lend: addl u0, v0, r2 C U1 add two data ++ addl r7, r22, r7 C U0 add in carry ++ stl r5, 8(r16) C L0 put an answer ++ stl r8, 16(r16) C L1 pair ++ cmpult r2, v0, cy1 C U1 did it carry ++ beq r7, $fix7c C U0 fix exact 0 ++$ret7c: addl r2, r23, r2 C U1 carry from last ++ addl u1, v1, r5 C U0 add two data ++ beq r2, $fix0c C U1 fix exact zero ++$ret0c: cmpult r5, v1, cy0 C U0 did it carry ++ addl r5, cy1, r5 C U0 carry from last ++ stl r7, 24(r16) C L0 store pair ++ stl r2, 32(r16) C L1 ++ beq r5, $fix1c C U0 fix exact zero ++$ret1c: stl r5, 40(r16) C L0 put an answer ++ ldi r16, 48(r16) C L0 move pointer ++ ++ ldi r19, 8(r19) ++ beq r19, $Lret ++ ++ ldl u1, 0(r17) ++ ldl v1, 0(r18) ++$Lsmall: ++ ldi r19, -1(r19) ++ beq r19, $Lend0 ++ ++ ALIGN(8) ++$Loop0: addl u1, v1, r2 C main add ++ cmpult r2, v1, r8 C compute cy from last add ++ ldl u1, 8(r17) ++ ldl v1, 8(r18) ++ addl r2, cy0, r5 C carry add ++ ldi r17, 8(r17) ++ ldi r18, 8(r18) ++ stl r5, 0(r16) ++ cmpult r5, r2, cy0 C compute cy from last add ++ ldi r19, -1(r19) C decr loop cnt ++ bis r8, cy0, cy0 C combine cy from the two adds ++ ldi r16, 8(r16) ++ bne r19, $Loop0 ++$Lend0: addl u1, v1, r2 C main add ++ addl r2, cy0, r5 C carry add ++ cmpult r2, v1, r8 C compute cy from last add ++ cmpult r5, r2, cy0 C compute cy from last add ++ stl r5, 0(r16) ++ bis r8, cy0, r0 C combine cy from the two adds ++ ret r31,(r26),1 ++ ++ ALIGN(8) ++$Lret: ldi r0, 0(cy0) C copy carry into return register ++ ret r31,(r26),1 ++ ++$fix5f: bis r23, cy0, r23 C bring forward carry ++ br r31, $ret5f ++$fix6f: bis r22, r23, r22 C bring forward carry ++ br r31, $ret6f ++$fix0: bis cy1, r23, cy1 C bring forward carry ++ br r31, $ret0 ++$fix1: bis cy0, cy1, cy0 C bring forward carry ++ br r31, $ret1 ++$fix2: bis r22, cy0, r22 C bring forward carry ++ br r31, $ret2 ++$fix3: bis r23, r22, r23 C bring forward carry ++ br r31, $ret3 ++$fix4: bis cy1, r23, cy1 C bring forward carry ++ br r31, $ret4 ++$fix5: bis cy1, cy0, cy0 C bring forward carry ++ br r31, $ret5 ++$fix6: bis r22, cy0, r22 C bring forward carry ++ br r31, $ret6 ++$fix7: bis r23, r22, r23 C bring forward carry ++ br r31, $ret7 ++$fix0c: bis cy1, r23, cy1 C bring forward carry ++ br r31, $ret0c ++$fix1c: bis cy0, cy1, cy0 C bring forward carry ++ br r31, $ret1c ++$fix7c: bis r23, r22, r23 C bring forward carry ++ br r31, $ret7c ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/aorslsh1_n.asm b/mpn/sw_64/sw6/aorslsh1_n.asm +new file mode 100644 +index 0000000..c353301 +--- /dev/null ++++ b/mpn/sw_64/sw6/aorslsh1_n.asm +@@ -0,0 +1,168 @@ ++dnl Sw_64 mpn_addlsh1_n/mpn_sublsh1_n -- 
rp[] = up[] +- (vp[] << 1). ++ ++dnl Copyright 2003, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++C TODO ++ ++define(`rp',`r16') ++define(`up',`r17') ++define(`vp',`r18') ++define(`n', `r19') ++ ++define(`u0', `r8') ++define(`u1', `r1') ++define(`v0', `r4') ++define(`v1', `r5') ++ ++define(`cy0', `r0') ++define(`cy1', `r20') ++define(`cy', `r22') ++define(`rr', `r24') ++define(`ps', `r25') ++define(`sl', `r28') ++ ++ifdef(`OPERATION_addlsh1_n',` ++ define(ADDSUB, addl) ++ define(CARRY, `cmpult $1,$2,$3') ++ define(func, mpn_addlsh1_n) ++') ++ifdef(`OPERATION_sublsh1_n',` ++ define(ADDSUB, subl) ++ define(CARRY, `cmpult $2,$1,$3') ++ define(func, mpn_sublsh1_n) ++') ++ ++MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) ++ ++ASM_START() ++PROLOGUE(func) ++ and n, 2, cy0 ++ blbs n, L(bx1) ++L(bx0): ldl v1, 0(vp) ++ ldl u1, 0(up) ++ ldi r2, 0(r31) ++ bne cy0, L(b10) ++ ++L(b00): ldi vp, 48(vp) ++ ldi up, -16(up) ++ ldi rp, -8(rp) ++ ldi cy0, 0(r31) ++ br r31, L(lo0) ++ ++L(b10): ldi vp, 32(vp) ++ ldi rp, 8(rp) ++ ldi cy0, 0(r31) ++ br r31, L(lo2) ++ ++L(bx1): ldl v0, 0(vp) ++ ldl u0, 0(up) ++ ldi r3, 0(r31) ++ beq cy0, L(b01) ++ ++L(b11): ldi vp, 40(vp) ++ ldi up, -24(up) ++ ldi rp, 16(rp) ++ ldi cy1, 0(r31) ++ br r31, L(lo3) ++ ++L(b01): ldi n, -4(n) ++ ldi cy1, 0(r31) ++ ble n, L(end) ++ ldi vp, 24(vp) ++ ldi up, -8(up) ++ ++ ALIGN(16) ++L(top): addl v0, v0, r6 ++ ldl v1, -16(vp) ++ addl r6, r3, sl C combined vlimb ++ ldl u1, 16(up) ++ ADDSUB u0, sl, ps C ulimb + (vlimb << 1) ++ cmplt v0, r31, r2 C high v bits ++ ADDSUB ps, cy1, rr C consume carry from previous operation ++ CARRY( ps, u0, cy0) C carry out #2 ++ stl rr, 0(rp) ++ CARRY( rr, ps, cy) C carry out #3 ++ ldi vp, 32(vp) C bookkeeping ++ addl cy, cy0, cy0 C final carry out ++L(lo0): addl v1, v1, r7 ++ ldl v0, -40(vp) ++ addl r7, r2, sl ++ ldl u0, 24(up) ++ ADDSUB u1, sl, ps ++ cmplt v1, r31, r3 ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy1) ++ stl rr, 8(rp) ++ CARRY( rr, ps, cy) ++ ldi rp, 32(rp) C bookkeeping ++ addl cy, cy1, cy1 ++L(lo3): addl v0, v0, r6 ++ ldl v1, -32(vp) ++ addl r6, r3, sl ++ ldl u1, 32(up) ++ ADDSUB u0, sl, ps ++ cmplt v0, r31, r2 ++ ADDSUB ps, cy1, rr ++ CARRY( ps, u0, cy0) ++ stl rr, -16(rp) ++ CARRY( rr, ps, cy) ++ ldi up, 32(up) C bookkeeping ++ addl cy, cy0, cy0 ++L(lo2): addl v1, v1, r7 ++ ldl v0, -24(vp) ++ addl r7, r2, sl ++ ldl u0, 8(up) ++ ADDSUB u1, sl, ps ++ cmplt v1, r31, r3 ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy1) 
++ stl rr, -8(rp) ++ CARRY( rr, ps, cy) ++ ldi n, -4(n) C bookkeeping ++ addl cy, cy1, cy1 ++ bgt n, L(top) ++ ++L(end): addl v0, v0, r6 ++ addl r6, r3, sl ++ ADDSUB u0, sl, ps ++ cmplt v0, r31, r2 ++ ADDSUB ps, cy1, rr ++ CARRY( ps, u0, cy0) ++ stl rr, 0(rp) ++ CARRY( rr, ps, cy) ++ addl cy, cy0, cy0 ++ addl cy0, r2, r0 ++ ++ ret r31,(r26),1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/aorsmul_1.asm b/mpn/sw_64/sw6/aorsmul_1.asm +new file mode 100644 +index 0000000..78daf7b +--- /dev/null ++++ b/mpn/sw_64/sw6/aorsmul_1.asm +@@ -0,0 +1,396 @@ ++dnl Sw_64 sw6 mpn_addmul_1 and mpn_submul_1. ++ ++dnl Copyright 2000, 2003-2005, 2008 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 3.5 ++ ++C INPUT PARAMETERS ++define(`rp', `r16') ++define(`up', `r17') ++define(`n', `r18') ++define(`v0', `r19') ++ ++dnl This code was written in cooperation with sw6 pipeline expert Steve Root. ++ ++dnl The stores can issue a cycle late so we have paired no-op's to 'catch' ++dnl them, so that further disturbance to the schedule is damped. ++ ++dnl We couldn't pair the loads, because the entangled schedule of the carry's ++dnl has to happen on one side {0} of the machine. ++ ++dnl This is a great schedule for the d_cache, a poor schedule for the b_cache. ++dnl The lockup on U0 means that any stall can't be recovered from. Consider a ++dnl ldl in L1, say that load gets stalled because it collides with a fill from ++dnl the b_cache. On the next cycle, this load gets priority. If first looks ++dnl at L0, and goes there. The instruction we intended for L0 gets to look at ++dnl L1, which is NOT where we want it. It either stalls 1, because it can't ++dnl go in L0, or goes there, and causes a further instruction to stall. ++ ++dnl So for b_cache, we're likely going to want to put one or more cycles back ++dnl into the code! And, of course, put in lds prefetch for the rp[] operand. ++dnl At a place where we have an mt followed by a bookkeeping, put the ++dnl bookkeeping in upper, and the prefetch into lower. ++ ++dnl Note, the ldl's and stl's are at the end of the quadpacks. Note, we'd ++dnl like not to have an ldl or an stl to preceded a conditional branch in a ++dnl quadpack. The conditional branch moves the retire pointer one cycle ++dnl later. 
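++
++dnl For reference, a rough C sketch (variable names ours, umul_ppmm as in
++dnl longlong.h) of the per-limb step for addmul_1, with cy the running
++dnl carry limb:
++dnl
++dnl     umul_ppmm (p1, p0, up[i], v0);    /* p1:p0 = up[i] * v0 */
++dnl     t = rp[i] + p0;    c1 = (t < p0);
++dnl     r = t + cy;        c2 = (r < t);
++dnl     rp[i] = r;         cy = p1 + c1 + c2;
++dnl
++dnl For submul_1 the p0 and cy adds become subtracts and the carry
++dnl compares reverse, per the ADDSUB/CMPCY definitions below.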
++ ++ifdef(`OPERATION_addmul_1',` ++ define(`ADDSUB', `addl') ++ define(`CMPCY', `cmpult $2,$1') ++ define(`func', `mpn_addmul_1') ++') ++ifdef(`OPERATION_submul_1',` ++ define(`ADDSUB', `subl') ++ define(`CMPCY', `cmpult $1,$2') ++ define(`func', `mpn_submul_1') ++') ++ ++MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) ++ ++ASM_START() ++PROLOGUE(func) ++ ldl r3, 0(up) C ++ and r18, 7, r20 C ++ ldi r18, -9(r18) C ++ cmpeq r20, 1, r21 C ++ beq r21, $L1 C ++ ++$1mod8: ldl r5, 0(rp) C ++ mull v0, r3, r7 C ++ umulh v0, r3, r8 C ++ ADDSUB r5, r7, r23 C ++ CMPCY( r5, r23), r20 C ++ addl r8, r20, r0 C ++ stl r23, 0(rp) C ++ bge r18, $ent1 C ++ ret r31, (r26), 1 C ++ ++$L1: ldi r8, 0(r31) C zero carry reg ++ ldi r24, 0(r31) C zero carry reg ++ cmpeq r20, 2, r21 C ++ bne r21, $2mod8 C ++ cmpeq r20, 3, r21 C ++ bne r21, $3mod8 C ++ cmpeq r20, 4, r21 C ++ bne r21, $4mod8 C ++ cmpeq r20, 5, r21 C ++ bne r21, $5mod8 C ++ cmpeq r20, 6, r21 C ++ bne r21, $6mod8 C ++ cmpeq r20, 7, r21 C ++ beq r21, $0mod8 C ++ ++$7mod8: ldl r5, 0(rp) C ++ ldi up, 8(up) C ++ mull v0, r3, r7 C ++ umulh v0, r3, r24 C ++ ADDSUB r5, r7, r23 C ++ CMPCY( r5, r23), r20 C ++ addl r24, r20, r24 C ++ stl r23, 0(rp) C ++ ldi rp, 8(rp) C ++ ldl r3, 0(up) C ++$6mod8: ldl r1, 8(up) C ++ mull v0, r3, r25 C ++ umulh v0, r3, r3 C ++ mull v0, r1, r28 C ++ ldl r0, 16(up) C ++ ldl r4, 0(rp) C ++ umulh v0, r1, r8 C ++ ldl r1, 24(up) C ++ ldi up, 48(up) C L1 bookkeeping ++ mull v0, r0, r2 C ++ ldl r5, 8(rp) C ++ ldi rp, -32(rp) C L1 bookkeeping ++ umulh v0, r0, r6 C ++ ADDSUB r4, r25, r25 C lo + acc ++ mull v0, r1, r7 C ++ br r31, $ent6 C ++ ++$ent1: ldi up, 8(up) C ++ ldi rp, 8(rp) C ++ ldi r8, 0(r0) C ++ ldl r3, 0(up) C ++$0mod8: ldl r1, 8(up) C ++ mull v0, r3, r2 C ++ umulh v0, r3, r6 C ++ mull v0, r1, r7 C ++ ldl r0, 16(up) C ++ ldl r4, 0(rp) C ++ umulh v0, r1, r24 C ++ ldl r1, 24(up) C ++ mull v0, r0, r25 C ++ ldl r5, 8(rp) C ++ umulh v0, r0, r3 C ++ ADDSUB r4, r2, r2 C lo + acc ++ mull v0, r1, r28 C ++ ldi rp, -16(rp) C ++ br r31, $ent0 C ++ ++$3mod8: ldl r5, 0(rp) C ++ ldi up, 8(up) C ++ mull v0, r3, r7 C ++ umulh v0, r3, r8 C ++ ADDSUB r5, r7, r23 C ++ CMPCY( r5, r23), r20 C ++ addl r8, r20, r24 C ++ stl r23, 0(rp) C ++ ldi rp, 8(rp) C ++ ldl r3, 0(up) C ++$2mod8: ldl r1, 8(up) C ++ mull v0, r3, r25 C ++ umulh v0, r3, r3 C ++ mull v0, r1, r28 C ++ ble r18, $n23 C ++ ldl r0, 16(up) C ++ ldl r4, 0(rp) C ++ umulh v0, r1, r8 C ++ ldl r1, 24(up) C ++ ldi up, 16(up) C L1 bookkeeping ++ mull v0, r0, r2 C ++ ldl r5, 8(rp) C ++ ldi rp, 0(rp) C L1 bookkeeping ++ umulh v0, r0, r6 C ++ ADDSUB r4, r25, r25 C lo + acc ++ mull v0, r1, r7 C ++ br r31, $ent2 C ++ ++$5mod8: ldl r5, 0(rp) C ++ ldi up, 8(up) C ++ mull v0, r3, r7 C ++ umulh v0, r3, r24 C ++ ADDSUB r5, r7, r23 C ++ CMPCY( r5, r23), r20 C ++ addl r24, r20, r8 C ++ stl r23, 0(rp) C ++ ldi rp, 8(rp) C ++ ldl r3, 0(up) C ++$4mod8: ldl r1, 8(up) C ++ mull v0, r3, r2 C ++ umulh v0, r3, r6 C ++ mull v0, r1, r7 C ++ ldl r0, 16(up) C ++ ldl r4, 0(rp) C ++ umulh v0, r1, r24 C ++ ldl r1, 24(up) C ++ ldi up, 32(up) C L1 bookkeeping ++ mull v0, r0, r25 C ++ ldl r5, 8(rp) C ++ ldi rp, 16(rp) C L1 bookkeeping ++ umulh v0, r0, r3 C ++ ADDSUB r4, r2, r2 C lo + acc ++ mull v0, r1, r28 C ++ CMPCY( r4, r2), r20 C L0 lo add => carry ++ ADDSUB r2, r8, r22 C U0 hi add => answer ++ ble r18, $Lend C ++ ALIGN(16) ++$Loop: ++ bis r31, r31, r31 C U1 mt ++ CMPCY( r2, r22), r21 C L0 hi add => carry ++ addl r6, r20, r6 C U0 hi mul + carry ++ ldl r0, 0(up) C ++ ++ bis r31, r31, r31 C U1 mt ++ ADDSUB r5, r7, r7 C L0 lo + acc 
++ addl r6, r21, r6 C U0 hi mul + carry ++ ldl r4, 0(rp) C L1 ++ ++ umulh v0, r1, r8 C U1 ++ CMPCY( r5, r7), r20 C L0 lo add => carry ++ ADDSUB r7, r6, r23 C U0 hi add => answer ++ ldl r1, 8(up) C L1 ++ ++ mull v0, r0, r2 C U1 ++ CMPCY( r7, r23), r21 C L0 hi add => carry ++ addl r24, r20, r24 C U0 hi mul + carry ++ ldl r5, 8(rp) C L1 ++ ++ umulh v0, r0, r6 C U1 ++ ADDSUB r4, r25, r25 C U0 lo + acc ++ stl r22, -16(rp) C L0 ++ stl r23, -8(rp) C L1 ++ ++ bis r31, r31, r31 C L0 st slosh ++ mull v0, r1, r7 C U1 ++ bis r31, r31, r31 C L1 st slosh ++ addl r24, r21, r24 C U0 hi mul + carry ++$ent2: ++ CMPCY( r4, r25), r20 C L0 lo add => carry ++ bis r31, r31, r31 C U1 mt ++ ldi r18, -8(r18) C L1 bookkeeping ++ ADDSUB r25, r24, r22 C U0 hi add => answer ++ ++ bis r31, r31, r31 C U1 mt ++ CMPCY( r25, r22), r21 C L0 hi add => carry ++ addl r3, r20, r3 C U0 hi mul + carry ++ ldl r0, 16(up) C L1 ++ ++ bis r31, r31, r31 C U1 mt ++ ADDSUB r5, r28, r28 C L0 lo + acc ++ addl r3, r21, r3 C U0 hi mul + carry ++ ldl r4, 16(rp) C L1 ++ ++ umulh v0, r1, r24 C U1 ++ CMPCY( r5, r28), r20 C L0 lo add => carry ++ ADDSUB r28, r3, r23 C U0 hi add => answer ++ ldl r1, 24(up) C L1 ++ ++ mull v0, r0, r25 C U1 ++ CMPCY( r28, r23), r21 C L0 hi add => carry ++ addl r8, r20, r8 C U0 hi mul + carry ++ ldl r5, 24(rp) C L1 ++ ++ umulh v0, r0, r3 C U1 ++ ADDSUB r4, r2, r2 C U0 lo + acc ++ stl r22, 0(rp) C L0 ++ stl r23, 8(rp) C L1 ++ ++ bis r31, r31, r31 C L0 st slosh ++ mull v0, r1, r28 C U1 ++ bis r31, r31, r31 C L1 st slosh ++ addl r8, r21, r8 C U0 hi mul + carry ++$ent0: ++ CMPCY( r4, r2), r20 C L0 lo add => carry ++ bis r31, r31, r31 C U1 mt ++ ldi up, 64(up) C L1 bookkeeping ++ ADDSUB r2, r8, r22 C U0 hi add => answer ++ ++ bis r31, r31, r31 C U1 mt ++ CMPCY( r2, r22), r21 C L0 hi add => carry ++ addl r6, r20, r6 C U0 hi mul + carry ++ ldl r0, -32(up) C L1 ++ ++ bis r31, r31, r31 C U1 mt ++ ADDSUB r5, r7, r7 C L0 lo + acc ++ addl r6, r21, r6 C U0 hi mul + carry ++ ldl r4, 32(rp) C L1 ++ ++ umulh v0, r1, r8 C U1 ++ CMPCY( r5, r7), r20 C L0 lo add => carry ++ ADDSUB r7, r6, r23 C U0 hi add => answer ++ ldl r1, -24(up) C L1 ++ ++ mull v0, r0, r2 C U1 ++ CMPCY( r7, r23), r21 C L0 hi add => carry ++ addl r24, r20, r24 C U0 hi mul + carry ++ ldl r5, 40(rp) C L1 ++ ++ umulh v0, r0, r6 C U1 ++ ADDSUB r4, r25, r25 C U0 lo + acc ++ stl r22, 16(rp) C L0 ++ stl r23, 24(rp) C L1 ++ ++ bis r31, r31, r31 C L0 st slosh ++ mull v0, r1, r7 C U1 ++ bis r31, r31, r31 C L1 st slosh ++ addl r24, r21, r24 C U0 hi mul + carry ++$ent6: ++ CMPCY( r4, r25), r20 C L0 lo add => carry ++ bis r31, r31, r31 C U1 mt ++ ldi rp, 64(rp) C L1 bookkeeping ++ ADDSUB r25, r24, r22 C U0 hi add => answer ++ ++ bis r31, r31, r31 C U1 mt ++ CMPCY( r25, r22), r21 C L0 hi add => carry ++ addl r3, r20, r3 C U0 hi mul + carry ++ ldl r0, -16(up) C L1 ++ ++ bis r31, r31, r31 C U1 mt ++ ADDSUB r5, r28, r28 C L0 lo + acc ++ addl r3, r21, r3 C U0 hi mul + carry ++ ldl r4, -16(rp) C L1 ++ ++ umulh v0, r1, r24 C U1 ++ CMPCY( r5, r28), r20 C L0 lo add => carry ++ ADDSUB r28, r3, r23 C U0 hi add => answer ++ ldl r1, -8(up) C L1 ++ ++ mull v0, r0, r25 C U1 ++ CMPCY( r28, r23), r21 C L0 hi add => carry ++ addl r8, r20, r8 C U0 hi mul + carry ++ ldl r5, -8(rp) C L1 ++ ++ umulh v0, r0, r3 C U1 ++ ADDSUB r4, r2, r2 C U0 lo + acc ++ stl r22, -32(rp) C L0 ++ stl r23, -24(rp) C L1 ++ ++ bis r31, r31, r31 C L0 st slosh ++ mull v0, r1, r28 C U1 ++ bis r31, r31, r31 C L1 st slosh ++ addl r8, r21, r8 C U0 hi mul + carry ++ ++ CMPCY( r4, r2), r20 C L0 lo add => carry ++ ADDSUB r2, r8, r22 C 
U0 hi add => answer ++ ldl r31, 256(up) C prefetch up[] ++ bgt r18, $Loop C U1 bookkeeping ++ ++$Lend: CMPCY( r2, r22), r21 C ++ addl r6, r20, r6 C ++ ADDSUB r5, r7, r7 C ++ addl r6, r21, r6 C ++ ldl r4, 0(rp) C ++ umulh v0, r1, r8 C ++ CMPCY( r5, r7), r20 C ++ ADDSUB r7, r6, r23 C ++ CMPCY(r7, r23), r21 C ++ addl r24, r20, r24 C ++ ldl r5, 8(rp) C ++ ADDSUB r4, r25, r25 C ++ stl r22, -16(rp) C ++ stl r23, -8(rp) C ++ addl r24, r21, r24 C ++ br L(x) ++ ++ ALIGN(16) ++$n23: ldl r4, 0(rp) C ++ ldl r5, 8(rp) C ++ umulh v0, r1, r8 C ++ ADDSUB r4, r25, r25 C ++L(x): CMPCY( r4, r25), r20 C ++ ADDSUB r25, r24, r22 C ++ CMPCY( r25, r22), r21 C ++ addl r3, r20, r3 C ++ ADDSUB r5, r28, r28 C ++ addl r3, r21, r3 C ++ CMPCY( r5, r28), r20 C ++ ADDSUB r28, r3, r23 C ++ CMPCY( r28, r23), r21 C ++ addl r8, r20, r8 C ++ stl r22, 0(rp) C ++ stl r23, 8(rp) C ++ addl r8, r21, r0 C ++ ret r31, (r26), 1 C ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/gmp-mparam.h b/mpn/sw_64/sw6/gmp-mparam.h +new file mode 100644 +index 0000000..e51d6b0 +--- /dev/null ++++ b/mpn/sw_64/sw6/gmp-mparam.h +@@ -0,0 +1,209 @@ ++/* gmp-mparam.h -- Compiler/machine parameter header file. ++ ++Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2008-2010, 2014 Free ++Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. 
*/ ++ ++#define GMP_LIMB_BITS 64 ++#define GMP_LIMB_BYTES 8 ++ ++#define DIVEXACT_BY3_METHOD 0 /* override ../diveby3.asm */ ++ ++/* 500 MHz 21164 (agnesi.math.su.se) */ ++/* FFT tuning limit = 20000000 */ ++/* Generated by tuneup.c, 2014-03-14, gcc 3.3 */ ++ ++#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ ++#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ ++#define MOD_1_1P_METHOD 2 ++#define MOD_1_NORM_THRESHOLD 0 /* always */ ++#define MOD_1_UNNORM_THRESHOLD 0 /* always */ ++#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 ++#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 ++#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 ++#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21 ++#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 ++#define USE_PREINV_DIVREM_1 1 /* preinv always */ ++#define DIV_QR_1N_PI1_METHOD 2 ++#define DIV_QR_1_NORM_THRESHOLD 5 ++#define DIV_QR_1_UNNORM_THRESHOLD 1 ++#define DIV_QR_2_PI2_THRESHOLD 8 ++#define DIVEXACT_1_THRESHOLD 0 /* always */ ++#define BMOD_1_TO_MOD_1_THRESHOLD 20 ++ ++#define MUL_TOOM22_THRESHOLD 32 ++#define MUL_TOOM33_THRESHOLD 117 ++#define MUL_TOOM44_THRESHOLD 124 ++#define MUL_TOOM6H_THRESHOLD 230 ++#define MUL_TOOM8H_THRESHOLD 357 ++ ++#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 ++#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107 ++#define MUL_TOOM42_TO_TOOM53_THRESHOLD 88 ++#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105 ++#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136 ++ ++#define SQR_BASECASE_THRESHOLD 0 /* always */ ++#define SQR_TOOM2_THRESHOLD 59 ++#define SQR_TOOM3_THRESHOLD 123 ++#define SQR_TOOM4_THRESHOLD 163 ++#define SQR_TOOM6_THRESHOLD 333 ++#define SQR_TOOM8_THRESHOLD 0 /* always */ ++ ++#define MULMID_TOOM42_THRESHOLD 52 ++ ++#define MULMOD_BNM1_THRESHOLD 19 ++#define SQRMOD_BNM1_THRESHOLD 5 ++ ++#define MUL_FFT_MODF_THRESHOLD 468 /* k = 5 */ ++#define MUL_FFT_TABLE3 \ ++ { { 468, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ ++ { 11, 5}, { 23, 6}, { 19, 7}, { 10, 6}, \ ++ { 24, 7}, { 13, 6}, { 27, 7}, { 14, 6}, \ ++ { 29, 7}, { 17, 6}, { 35, 7}, { 29, 8}, \ ++ { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ ++ { 19, 7}, { 39, 8}, { 29, 9}, { 15, 8}, \ ++ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ ++ { 51, 9}, { 27, 8}, { 55, 9}, { 35, 8}, \ ++ { 71, 9}, { 39,10}, { 23, 9}, { 55,10}, \ ++ { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \ ++ { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \ ++ { 79,11}, { 47,10}, { 103,12}, { 31,11}, \ ++ { 63,10}, { 135,11}, { 79,10}, { 167,11}, \ ++ { 95,10}, { 199,11}, { 111,12}, { 63,11}, \ ++ { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \ ++ { 319,12}, { 95,11}, { 191,10}, { 383,11}, \ ++ { 207,13}, { 63,12}, { 127,11}, { 255,10}, \ ++ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ ++ { 575,12}, { 159,11}, { 319,10}, { 639,11}, \ ++ { 335,10}, { 671,11}, { 351,10}, { 703,12}, \ ++ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ ++ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ ++ { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ ++ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ ++ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ ++ { 415,11}, { 831,12}, { 447,14}, { 127,13}, \ ++ { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \ ++ { 319,12}, { 735,13}, { 383,12}, { 767,11}, \ ++ { 1535,12}, { 831,13}, { 447,12}, { 959,14}, \ ++ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ ++ { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \ ++ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ ++ { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ ++ { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \ ++ { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \ ++ { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ ++ { 131072,18}, { 
262144,19}, { 524288,20}, {1048576,21}, \ ++ {2097152,22}, {4194304,23}, {8388608,24} } ++#define MUL_FFT_TABLE3_SIZE 151 ++#define MUL_FFT_THRESHOLD 5760 ++ ++#define SQR_FFT_MODF_THRESHOLD 412 /* k = 5 */ ++#define SQR_FFT_TABLE3 \ ++ { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ ++ { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ ++ { 27, 7}, { 14, 6}, { 29, 7}, { 28, 8}, \ ++ { 15, 7}, { 31, 8}, { 17, 7}, { 36, 8}, \ ++ { 19, 7}, { 39, 8}, { 29, 9}, { 15, 8}, \ ++ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ ++ { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \ ++ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ ++ { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ ++ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ ++ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ ++ { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \ ++ { 319,10}, { 167,11}, { 95,10}, { 191, 9}, \ ++ { 383,11}, { 111,12}, { 63,11}, { 127,10}, \ ++ { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \ ++ { 303,11}, { 159,10}, { 319,12}, { 95,11}, \ ++ { 191,10}, { 383,11}, { 207,13}, { 63,12}, \ ++ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ ++ { 543,11}, { 287,10}, { 575,11}, { 303,12}, \ ++ { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ ++ { 671,11}, { 351,10}, { 703,11}, { 367,12}, \ ++ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ ++ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ ++ { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ ++ { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \ ++ { 671,12}, { 351,11}, { 703,13}, { 191,12}, \ ++ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ ++ { 447,11}, { 895,12}, { 479,14}, { 127,13}, \ ++ { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \ ++ { 319,12}, { 703,11}, { 1407,12}, { 735,13}, \ ++ { 383,12}, { 831,13}, { 447,12}, { 959,14}, \ ++ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ ++ { 1151,13}, { 639,12}, { 1279,13}, { 703,12}, \ ++ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ ++ { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ ++ { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \ ++ { 767,13}, { 1663,14}, { 895,13}, { 1791,15}, \ ++ { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ ++ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ ++ {2097152,22}, {4194304,23}, {8388608,24} } ++#define SQR_FFT_TABLE3_SIZE 159 ++#define SQR_FFT_THRESHOLD 5056 ++ ++#define MULLO_BASECASE_THRESHOLD 0 /* always */ ++#define MULLO_DC_THRESHOLD 100 ++#define MULLO_MUL_N_THRESHOLD 11355 ++ ++#define DC_DIV_QR_THRESHOLD 124 ++#define DC_DIVAPPR_Q_THRESHOLD 438 ++#define DC_BDIV_QR_THRESHOLD 153 ++#define DC_BDIV_Q_THRESHOLD 318 ++ ++#define INV_MULMOD_BNM1_THRESHOLD 62 ++#define INV_NEWTON_THRESHOLD 384 ++#define INV_APPR_THRESHOLD 402 ++ ++#define BINV_NEWTON_THRESHOLD 381 ++#define REDC_1_TO_REDC_N_THRESHOLD 110 ++ ++#define MU_DIV_QR_THRESHOLD 1752 ++#define MU_DIVAPPR_Q_THRESHOLD 1895 ++#define MUPI_DIV_QR_THRESHOLD 174 ++#define MU_BDIV_QR_THRESHOLD 1387 ++#define MU_BDIV_Q_THRESHOLD 1787 ++ ++#define POWM_SEC_TABLE 1,13,66,82,579 ++ ++#define MATRIX22_STRASSEN_THRESHOLD 15 ++#define HGCD_THRESHOLD 318 ++#define HGCD_APPR_THRESHOLD 363 ++#define HGCD_REDUCE_THRESHOLD 2384 ++#define GCD_DC_THRESHOLD 2504 ++#define GCDEXT_DC_THRESHOLD 671 ++#define JACOBI_BASE_METHOD 3 ++ ++#define GET_STR_DC_THRESHOLD 14 ++#define GET_STR_PRECOMPUTE_THRESHOLD 25 ++#define SET_STR_DC_THRESHOLD 3754 ++#define SET_STR_PRECOMPUTE_THRESHOLD 8097 ++ ++#define FAC_DSC_THRESHOLD 951 ++#define FAC_ODD_THRESHOLD 24 +diff --git a/mpn/sw_64/sw6/mod_1_4.asm b/mpn/sw_64/sw6/mod_1_4.asm +new file mode 100644 +index 0000000..ff4d655 +--- /dev/null ++++ b/mpn/sw_64/sw6/mod_1_4.asm 
+@@ -0,0 +1,333 @@ ++dnl Sw_64 mpn_mod_1s_4p ++ ++dnl Contributed to the GNU project by Torbjorn Granlund. ++ ++dnl Copyright 2009, 2010 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C TODO: ++C * Optimise. 2.75 c/l should be possible. ++C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated. ++C * Optimise feed-in code, starting the sw pipeline in switch code. ++C * Shorten software pipeline. The mul instructions are scheduled too far ++C from their users. Fixing this will allow us to use fewer registers. ++C * If we cannot reduce register usage, write perhaps small-n basecase. ++C * Does this work for PIC? ++ ++C cycles/limb ++ ++define(`ap', `r16') ++define(`n', `r17') ++define(`pl', `r24') ++define(`ph', `r25') ++define(`rl', `r6') ++define(`rh', `r7') ++define(`B1modb', `r1') ++define(`B2modb', `r2') ++define(`B3modb', `r3') ++define(`B4modb', `r4') ++define(`B5modb', `r5') ++ ++ASM_START() ++PROLOGUE(mpn_mod_1s_4p) ++ ldi r30, -64(r30) ++ stl r9, 8(r30) ++ ldl B1modb, 16(r19) ++ stl r10, 16(r30) ++ ldl B2modb, 24(r19) ++ stl r11, 24(r30) ++ ldl B3modb, 32(r19) ++ stl r12, 32(r30) ++ ldl B4modb, 40(r19) ++ stl r13, 40(r30) ++ ldl B5modb, 48(r19) ++ s8addl n, ap, ap C point ap at vector end ++ ++ and n, 3, r0 ++ ldi n, -4(n) ++ beq r0, L(b0) ++ ldi r6, -2(r0) ++ blt r6, L(b1) ++ beq r6, L(b2) ++ ++L(b3): ldl r21, -16(ap) ++ ldl r22, -8(ap) ++ ldl r20, -24(ap) ++ mull r21, B1modb, r8 ++ umulh r21, B1modb, r12 ++ mull r22, B2modb, r9 ++ umulh r22, B2modb, r13 ++ addl r8, r20, pl ++ cmpult pl, r8, r0 ++ addl r0, r12, ph ++ addl r9, pl, rl ++ cmpult rl, r9, r0 ++ addl r13, ph, ph ++ addl r0, ph, rh ++ ldi ap, -56(ap) ++ br L(com) ++ ++L(b0): ldl r21, -24(ap) ++ ldl r22, -16(ap) ++ ldl r23, -8(ap) ++ ldl r20, -32(ap) ++ mull r21, B1modb, r8 ++ umulh r21, B1modb, r12 ++ mull r22, B2modb, r9 ++ umulh r22, B2modb, r13 ++ mull r23, B3modb, r10 ++ umulh r23, B3modb, r27 ++ addl r8, r20, pl ++ cmpult pl, r8, r0 ++ addl r0, r12, ph ++ addl r9, pl, pl ++ cmpult pl, r9, r0 ++ addl r13, ph, ph ++ addl r0, ph, ph ++ addl r10, pl, rl ++ cmpult rl, r10, r0 ++ addl r27, ph, ph ++ addl r0, ph, rh ++ ldi ap, -64(ap) ++ br L(com) ++ ++L(b1): bis r31, r31, rh ++ ldl rl, -8(ap) ++ ldi ap, -40(ap) ++ br L(com) ++ ++L(b2): ldl rh, -8(ap) ++ ldl rl, -16(ap) ++ ldi ap, -48(ap) ++ ++L(com): ble n, L(ed3) ++ ldl r21, 8(ap) ++ ldl r22, 16(ap) ++ ldl r23, 24(ap) ++ ldl r20, 0(ap) ++ ldi n, -4(n) ++ ldi ap, -32(ap) ++ mull r21, B1modb, 
r8 ++ umulh r21, B1modb, r12 ++ mull r22, B2modb, r9 ++ umulh r22, B2modb, r13 ++ mull r23, B3modb, r10 ++ umulh r23, B3modb, r27 ++ mull rl, B4modb, r11 ++ umulh rl, B4modb, r28 ++ ble n, L(ed2) ++ ++ ALIGN(16) ++L(top): ldl r21, 8(ap) ++ mull rh, B5modb, rl ++ addl r8, r20, pl ++ ldl r22, 16(ap) ++ cmpult pl, r8, r0 ++ umulh rh, B5modb, rh ++ ldl r23, 24(ap) ++ addl r0, r12, ph ++ addl r9, pl, pl ++ mull r21, B1modb, r8 ++ cmpult pl, r9, r0 ++ addl r13, ph, ph ++ umulh r21, B1modb, r12 ++ ldi ap, -32(ap) ++ addl r0, ph, ph ++ addl r10, pl, pl ++ mull r22, B2modb, r9 ++ cmpult pl, r10, r0 ++ addl r27, ph, ph ++ addl r11, pl, pl ++ umulh r22, B2modb, r13 ++ addl r0, ph, ph ++ cmpult pl, r11, r0 ++ addl r28, ph, ph ++ mull r23, B3modb, r10 ++ ldl r20, 32(ap) ++ addl pl, rl, rl ++ umulh r23, B3modb, r27 ++ addl r0, ph, ph ++ cmpult rl, pl, r0 ++ mull rl, B4modb, r11 ++ addl ph, rh, rh ++ umulh rl, B4modb, r28 ++ addl r0, rh, rh ++ ldi n, -4(n) ++ bgt n, L(top) ++ ++L(ed2): mull rh, B5modb, rl ++ addl r8, r20, pl ++ umulh rh, B5modb, rh ++ cmpult pl, r8, r0 ++ addl r0, r12, ph ++ addl r9, pl, pl ++ cmpult pl, r9, r0 ++ addl r13, ph, ph ++ addl r0, ph, ph ++ addl r10, pl, pl ++ cmpult pl, r10, r0 ++ addl r27, ph, ph ++ addl r11, pl, pl ++ addl r0, ph, ph ++ cmpult pl, r11, r0 ++ addl r28, ph, ph ++ addl pl, rl, rl ++ addl r0, ph, ph ++ cmpult rl, pl, r0 ++ addl ph, rh, rh ++ addl r0, rh, rh ++ ++L(ed3): mull rh, B1modb, r8 ++ umulh rh, B1modb, rh ++ addl r8, rl, rl ++ cmpult rl, r8, r0 ++ addl r0, rh, rh ++ ++ ldl r24, 8(r19) C cnt ++ sll rh, r24, rh ++ subl r31, r24, r25 ++ srl rl, r25, r2 ++ sll rl, r24, rl ++ or r2, rh, rh ++ ++ ldl r23, 0(r19) C bi ++ mull rh, r23, r8 ++ umulh rh, r23, r9 ++ addl rh, 1, r7 ++ addl r8, rl, r8 C ql ++ cmpult r8, rl, r0 ++ addl r9, r7, r9 ++ addl r0, r9, r9 C qh ++ mull r9, r18, r21 C qh * b ++ subl rl, r21, rl ++ cmpult r8, rl, r0 C rl > ql ++ negl r0, r0 ++ and r0, r18, r0 ++ addl rl, r0, rl ++ cmpule r18, rl, r0 C rl >= b ++ negl r0, r0 ++ and r0, r18, r0 ++ subl rl, r0, rl ++ ++ srl rl, r24, r0 ++ ++ ldl r9, 8(r30) ++ ldl r10, 16(r30) ++ ldl r11, 24(r30) ++ ldl r12, 32(r30) ++ ldl r13, 40(r30) ++ ldi r30, 64(r30) ++ ret r31, (r26), 1 ++EPILOGUE() ++ ++PROLOGUE(mpn_mod_1s_4p_cps,gp) ++ ldi r30, -32(r30) ++ stl r26, 0(r30) ++ stl r9, 8(r30) ++ stl r10, 16(r30) ++ stl r11, 24(r30) ++ mov r16, r11 ++ LEA( r4, __clz_tab) ++ ldi r10, 65(r31) ++ cmpgeb r31, r17, r1 ++ srl r1, 1, r1 ++ xor r1, 127, r1 ++ addl r1, r4, r1 ++ ldl_u r2, 0(r1) ++ ext0b r2, r1, r2 ++ s8subl r2, 7, r2 ++ srl r17, r2, r3 ++ subl r10, r2, r10 ++ addl r3, r4, r3 ++ ldl_u r1, 0(r3) ++ ext0b r1, r3, r1 ++ subl r10, r1, r10 ++ sll r17, r10, r9 ++ mov r9, r16 ++ call r26, mpn_invert_limb ++ LDGP( r29, 0(r26)) ++ subl r31, r10, r2 ++ ldi r1, 1(r31) ++ sll r1, r10, r1 ++ subl r31, r9, r3 ++ srl r0, r2, r2 ++ ldl r26, 0(r30) ++ bis r2, r1, r2 ++ stl r0, 0(r11) ++ stl r10, 8(r11) ++ mull r2, r3, r2 ++ srl r2, r10, r3 ++ umulh r2, r0, r1 ++ stl r3, 16(r11) ++ mull r2, r0, r3 ++ ornot r31, r1, r1 ++ subl r1, r2, r1 ++ mull r1, r9, r1 ++ addl r1, r9, r2 ++ cmpule r1, r3, r3 ++ seleq r3, r2, r1 ++ srl r1, r10, r3 ++ umulh r1, r0, r2 ++ stl r3, 24(r11) ++ mull r1, r0, r3 ++ ornot r31, r2, r2 ++ subl r2, r1, r2 ++ mull r2, r9, r2 ++ addl r2, r9, r1 ++ cmpule r2, r3, r3 ++ seleq r3, r1, r2 ++ srl r2, r10, r1 ++ umulh r2, r0, r3 ++ stl r1, 32(r11) ++ mull r2, r0, r1 ++ ornot r31, r3, r3 ++ subl r3, r2, r3 ++ mull r3, r9, r3 ++ addl r3, r9, r2 ++ cmpule r3, r1, r1 ++ seleq r1, r2, r3 ++ srl r3, r10, r2 ++ 
	umulh	r3, r0, r1
++	stl	r2, 40(r11)
++	mull	r3, r0, r0
++	ornot	r31, r1, r1
++	subl	r1, r3, r1
++	mull	r1, r9, r1
++	addl	r1, r9, r9
++	cmpule	r1, r0, r0
++	seleq	r0, r9, r1
++	ldl	r9, 8(r30)
++	srl	r1, r10, r1
++	ldl	r10, 16(r30)
++	stl	r1, 48(r11)
++	ldl	r11, 24(r30)
++	ldi	r30, 32(r30)
++	ret	r31, (r26), 1
++EPILOGUE()
+diff --git a/mpn/sw_64/sw6/mul_1.asm b/mpn/sw_64/sw6/mul_1.asm
+new file mode 100644
+index 0000000..7d8dff3
+--- /dev/null
++++ b/mpn/sw_64/sw6/mul_1.asm
+@@ -0,0 +1,496 @@
++dnl Sw_64 sw6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
++dnl result in a second limb vector.
++
++dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl  * the GNU Lesser General Public License as published by the Free
++dnl    Software Foundation; either version 3 of the License, or (at your
++dnl    option) any later version.
++dnl
++dnl or
++dnl
++dnl  * the GNU General Public License as published by the Free Software
++dnl    Foundation; either version 2 of the License, or (at your option) any
++dnl    later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++C INPUT PARAMETERS
++C res_ptr	r16
++C s1_ptr	r17
++C size		r18
++C s2_limb	r19
++
++C This code runs at 2.25 cycles/limb on SW6.
++
++C This code was written in close cooperation with sw6 pipeline expert
++C Steve Root. Any errors are tege's fault, though.
++
++C Code structure:
++
++C       code for n < 8
++C       code for n > 8      code for (n mod 8)
++C                           code for (n div 8)      feed-in code
++C                                                   8-way unrolled loop
++C                           wind-down code
++
++C Some notes about unrolled loop:
++C
++C   r1-r8             multiplies and workup
++C   r21-r28           multiplies and workup
++C   r9-r12            loads
++C   r0                -1
++C   r20,r29,r13-r15   scramble
++C
++C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
++C put-the-carry-into-hi. The idea is that these branches are very rarely
++C taken, and since a non-taken branch consumes no resources, that is better
++C than an addl.
++C
++C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
++C add NEXT cycle #09 which feeds a store in NEXT cycle #02
++
++C The code could use some further work:
++C 1. Speed up really small multiplies. The default sw_64/mul_1.asm code is
++C    faster than this for size < 3.
++C 2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
++C    that is too costly.
++C 3. Consider using 4-way unrolling, even if that runs slower.
++C 4. Reduce register usage. In particular, try to avoid using r29.
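++C
++C As a reading aid, the br-fixup carry propagation described above can be
++C modelled in C roughly as below (a hedged sketch, not the shipped code;
++C s, x, cy and cy_next are sketch variables). Adding a 0/1 carry into a
++C partial sum can only overflow by wrapping to exactly 0, so the rare beq
++C paths merely OR the pending carry into the next stage:
++C
++C	s = x + cy;          /* cy is 0 or 1 */
++C	if (s == 0)          /* wrapped -- or x == cy == 0, where the */
++C	  cy_next |= cy;     /* "join carries" fixup is a no-op       */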
++ ++ASM_START() ++PROLOGUE(mpn_mul_1) ++ cmpult r18, 8, r1 ++ beq r1, $Large ++$Lsmall: ++ ldl r2,0(r17) C r2 = s1_limb ++ ldi r18,-1(r18) C size-- ++ mull r2,r19,r3 C r3 = prod_low ++ bic r31,r31,r4 C clear cy_limb ++ umulh r2,r19,r0 C r0 = prod_high ++ beq r18,$Le1a C jump if size was == 1 ++ ldl r2,8(r17) C r2 = s1_limb ++ ldi r18,-1(r18) C size-- ++ stl r3,0(r16) ++ beq r18,$Le2a C jump if size was == 2 ++ ALIGN(8) ++$Lopa: mull r2,r19,r3 C r3 = prod_low ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ ldi r18,-1(r18) C size-- ++ umulh r2,r19,r4 C r4 = cy_limb ++ ldl r2,16(r17) C r2 = s1_limb ++ ldi r17,8(r17) C s1_ptr++ ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ stl r3,8(r16) ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ ldi r16,8(r16) C res_ptr++ ++ bne r18,$Lopa ++ ++$Le2a: mull r2,r19,r3 C r3 = prod_low ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ umulh r2,r19,r4 C r4 = cy_limb ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ stl r3,8(r16) ++ addl r4,r0,r0 C cy_limb = prod_high + cy ++ ret r31,(r26),1 ++$Le1a: stl r3,0(r16) ++ ret r31,(r26),1 ++ ++$Large: ++ ldi r30, -224(r30) ++ stl r26, 0(r30) ++ stl r9, 8(r30) ++ stl r10, 16(r30) ++ stl r11, 24(r30) ++ stl r12, 32(r30) ++ stl r13, 40(r30) ++ stl r14, 48(r30) ++ stl r15, 56(r30) ++ stl r29, 64(r30) ++ ++ and r18, 7, r20 C count for the first loop, 0-7 ++ srl r18, 3, r18 C count for unrolled loop ++ bis r31, r31, r21 ++ beq r20, $L_8_or_more C skip first loop ++ ++$L_9_or_more: ++ ldl r2,0(r17) C r2 = s1_limb ++ ldi r17,8(r17) C s1_ptr++ ++ ldi r20,-1(r20) C size-- ++ mull r2,r19,r3 C r3 = prod_low ++ umulh r2,r19,r21 C r21 = prod_high ++ beq r20,$Le1b C jump if size was == 1 ++ bis r31, r31, r0 C FIXME: shouldn't need this ++ ldl r2,0(r17) C r2 = s1_limb ++ ldi r17,8(r17) C s1_ptr++ ++ ldi r20,-1(r20) C size-- ++ stl r3,0(r16) ++ ldi r16,8(r16) C res_ptr++ ++ beq r20,$Le2b C jump if size was == 2 ++ ALIGN(8) ++$Lopb: mull r2,r19,r3 C r3 = prod_low ++ addl r21,r0,r0 C cy_limb = cy_limb + 'cy' ++ ldi r20,-1(r20) C size-- ++ umulh r2,r19,r21 C r21 = prod_high ++ ldl r2,0(r17) C r2 = s1_limb ++ ldi r17,8(r17) C s1_ptr++ ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ stl r3,0(r16) ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ ldi r16,8(r16) C res_ptr++ ++ bne r20,$Lopb ++ ++$Le2b: mull r2,r19,r3 C r3 = prod_low ++ addl r21,r0,r0 C cy_limb = cy_limb + 'cy' ++ umulh r2,r19,r21 C r21 = prod_high ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ stl r3,0(r16) ++ ldi r16,8(r16) C res_ptr++ ++ addl r21,r0,r21 C cy_limb = prod_high + cy ++ br r31, $L_8_or_more ++$Le1b: stl r3,0(r16) ++ ldi r16,8(r16) C res_ptr++ ++ ++$L_8_or_more: ++ ldi r0, -1(r31) C put -1 in r0, for tricky loop control ++ ldi r17, -32(r17) C L1 bookkeeping ++ ldi r18, -1(r18) C decrement count ++ ++ ldl r9, 32(r17) C L1 ++ ldl r10, 40(r17) C L1 ++ mull r9, r19, r22 C U1 #07 ++ ldl r11, 48(r17) C L1 ++ umulh r9, r19, r23 C U1 #08 ++ ldl r12, 56(r17) C L1 ++ mull r10, r19, r24 C U1 #09 ++ ldl r9, 64(r17) C L1 ++ ++ ldi r17, 64(r17) C L1 bookkeeping ++ ++ umulh r10, r19, r25 C U1 #11 ++ mull r11, r19, r26 C U1 #12 ++ umulh r11, r19, r27 C U1 #13 ++ mull r12, r19, r28 C U1 #14 ++ ldl r10, 8(r17) C L1 ++ umulh r12, r19, r1 C U1 #15 ++ ldl r11, 16(r17) C L1 ++ mull r9, r19, r2 C U1 #16 ++ ldl r12, 24(r17) C L1 ++ umulh r9, r19, r3 C U1 #17 ++ addl r21, r22, r13 C L1 mov ++ mull r10, r19, r4 C U1 #18 ++ addl r23, r24, r22 C L0 sum 2 mul's 
++ cmpult r13, r21, r14 C L1 carry from sum ++ bgt r18, $L_16_or_more ++ ++ cmpult r22, r24, r24 C U0 carry from sum ++ umulh r10, r19, r5 C U1 #02 ++ addl r25, r26, r23 C U0 sum 2 mul's ++ mull r11, r19, r6 C U1 #03 ++ cmpult r23, r26, r25 C U0 carry from sum ++ umulh r11, r19, r7 C U1 #04 ++ addl r27, r28, r28 C U0 sum 2 mul's ++ mull r12, r19, r8 C U1 #05 ++ cmpult r28, r27, r15 C L0 carry from sum ++ ldi r16, 32(r16) C L1 bookkeeping ++ addl r13, r31, r13 C U0 start carry cascade ++ umulh r12, r19, r21 C U1 #06 ++ br r31, $ret0c ++ ++$L_16_or_more: ++C --------------------------------------------------------------- ++ subl r18,1,r18 ++ cmpult r22, r24, r24 C U0 carry from sum ++ ldl r9, 32(r17) C L1 ++ ++ umulh r10, r19, r5 C U1 #02 ++ addl r25, r26, r23 C U0 sum 2 mul's ++ mull r11, r19, r6 C U1 #03 ++ cmpult r23, r26, r25 C U0 carry from sum ++ umulh r11, r19, r7 C U1 #04 ++ addl r27, r28, r28 C U0 sum 2 mul's ++ mull r12, r19, r8 C U1 #05 ++ cmpult r28, r27, r15 C L0 carry from sum ++ ldi r16, 32(r16) C L1 bookkeeping ++ addl r13, r31, r13 C U0 start carry cascade ++ ++ umulh r12, r19, r21 C U1 #06 ++C beq r13, $fix0w C U0 ++$ret0w: addl r22, r14, r26 C L0 ++ ldl r10, 40(r17) C L1 ++ ++ mull r9, r19, r22 C U1 #07 ++ beq r26, $fix1w C U0 ++$ret1w: addl r23, r24, r27 C L0 ++ ldl r11, 48(r17) C L1 ++ ++ umulh r9, r19, r23 C U1 #08 ++ beq r27, $fix2w C U0 ++$ret2w: addl r28, r25, r28 C L0 ++ ldl r12, 56(r17) C L1 ++ ++ mull r10, r19, r24 C U1 #09 ++ beq r28, $fix3w C U0 ++$ret3w: addl r1, r2, r20 C L0 sum 2 mul's ++ ldl r9, 64(r17) C L1 ++ ++ addl r3, r4, r2 C L0 #10 2 mul's ++ ldi r17, 64(r17) C L1 bookkeeping ++ cmpult r20, r1, r29 C U0 carry from sum ++ ++ umulh r10, r19, r25 C U1 #11 ++ cmpult r2, r4, r4 C U0 carry from sum ++ stl r13, -32(r16) C L0 ++ stl r26, -24(r16) C L1 ++ ++ mull r11, r19, r26 C U1 #12 ++ addl r5, r6, r14 C U0 sum 2 mul's ++ stl r27, -16(r16) C L0 ++ stl r28, -8(r16) C L1 ++ ++ umulh r11, r19, r27 C U1 #13 ++ cmpult r14, r6, r3 C U0 carry from sum ++C could do cross-jumping here: ++C bra $L_middle_of_unrolled_loop ++ mull r12, r19, r28 C U1 #14 ++ addl r7, r3, r5 C L0 eat carry ++ addl r20, r15, r20 C U0 carry cascade ++ ldl r10, 8(r17) C L1 ++ ++ umulh r12, r19, r1 C U1 #15 ++ beq r20, $fix4 C U0 ++$ret4w: addl r2, r29, r6 C L0 ++ ldl r11, 16(r17) C L1 ++ ++ mull r9, r19, r2 C U1 #16 ++ beq r6, $fix5 C U0 ++$ret5w: addl r14, r4, r7 C L0 ++ ldl r12, 24(r17) C L1 ++ ++ umulh r9, r19, r3 C U1 #17 ++ beq r7, $fix6 C U0 ++$ret6w: addl r5, r8, r8 C L0 sum 2 ++ addl r21, r22, r13 C L1 sum 2 mul's ++ ++ mull r10, r19, r4 C U1 #18 ++ addl r23, r24, r22 C L0 sum 2 mul's ++ cmpult r13, r21, r14 C L1 carry from sum ++ ble r18, $Lend C U0 ++C --------------------------------------------------------------- ++ ALIGN(16) ++$Loop: ++ umulh r0, r18, r18 C U1 #01 decrement r18! 
++ cmpult r8, r5, r29 C L0 carry from last bunch ++ cmpult r22, r24, r24 C U0 carry from sum ++ ldl r9, 32(r17) C L1 ++ ++ umulh r10, r19, r5 C U1 #02 ++ addl r25, r26, r23 C U0 sum 2 mul's ++ stl r20, 0(r16) C L0 ++ stl r6, 8(r16) C L1 ++ ++ mull r11, r19, r6 C U1 #03 ++ cmpult r23, r26, r25 C U0 carry from sum ++ stl r7, 16(r16) C L0 ++ stl r8, 24(r16) C L1 ++ ++ umulh r11, r19, r7 C U1 #04 ++ bis r31, r31, r31 C L0 st slosh ++ bis r31, r31, r31 C L1 st slosh ++ addl r27, r28, r28 C U0 sum 2 mul's ++ ++ mull r12, r19, r8 C U1 #05 ++ cmpult r28, r27, r15 C L0 carry from sum ++ ldi r16, 64(r16) C L1 bookkeeping ++ addl r13, r29, r13 C U0 start carry cascade ++ ++ umulh r12, r19, r21 C U1 #06 ++ beq r13, $fix0 C U0 ++$ret0: addl r22, r14, r26 C L0 ++ ldl r10, 40(r17) C L1 ++ ++ mull r9, r19, r22 C U1 #07 ++ beq r26, $fix1 C U0 ++$ret1: addl r23, r24, r27 C L0 ++ ldl r11, 48(r17) C L1 ++ ++ umulh r9, r19, r23 C U1 #08 ++ beq r27, $fix2 C U0 ++$ret2: addl r28, r25, r28 C L0 ++ ldl r12, 56(r17) C L1 ++ ++ mull r10, r19, r24 C U1 #09 ++ beq r28, $fix3 C U0 ++$ret3: addl r1, r2, r20 C L0 sum 2 mul's ++ ldl r9, 64(r17) C L1 ++ ++ addl r3, r4, r2 C L0 #10 2 mul's ++ bis r31, r31, r31 C U1 mul hole ++ ldi r17, 64(r17) C L1 bookkeeping ++ cmpult r20, r1, r29 C U0 carry from sum ++ ++ umulh r10, r19, r25 C U1 #11 ++ cmpult r2, r4, r4 C U0 carry from sum ++ stl r13, -32(r16) C L0 ++ stl r26, -24(r16) C L1 ++ ++ mull r11, r19, r26 C U1 #12 ++ addl r5, r6, r14 C U0 sum 2 mul's ++ stl r27, -16(r16) C L0 ++ stl r28, -8(r16) C L1 ++ ++ umulh r11, r19, r27 C U1 #13 ++ bis r31, r31, r31 C L0 st slosh ++ bis r31, r31, r31 C L1 st slosh ++ cmpult r14, r6, r3 C U0 carry from sum ++$L_middle_of_unrolled_loop: ++ mull r12, r19, r28 C U1 #14 ++ addl r7, r3, r5 C L0 eat carry ++ addl r20, r15, r20 C U0 carry cascade ++ ldl r10, 8(r17) C L1 ++ ++ umulh r12, r19, r1 C U1 #15 ++ beq r20, $fix4 C U0 ++$ret4: addl r2, r29, r6 C L0 ++ ldl r11, 16(r17) C L1 ++ ++ mull r9, r19, r2 C U1 #16 ++ beq r6, $fix5 C U0 ++$ret5: addl r14, r4, r7 C L0 ++ ldl r12, 24(r17) C L1 ++ ++ umulh r9, r19, r3 C U1 #17 ++ beq r7, $fix6 C U0 ++$ret6: addl r5, r8, r8 C L0 sum 2 ++ addl r21, r22, r13 C L1 sum 2 mul's ++ ++ mull r10, r19, r4 C U1 #18 ++ addl r23, r24, r22 C L0 sum 2 mul's ++ cmpult r13, r21, r14 C L1 carry from sum ++ bgt r18, $Loop C U0 ++C --------------------------------------------------------------- ++$Lend: ++ cmpult r8, r5, r29 C L0 carry from last bunch ++ cmpult r22, r24, r24 C U0 carry from sum ++ ++ umulh r10, r19, r5 C U1 #02 ++ addl r25, r26, r23 C U0 sum 2 mul's ++ stl r20, 0(r16) C L0 ++ stl r6, 8(r16) C L1 ++ ++ mull r11, r19, r6 C U1 #03 ++ cmpult r23, r26, r25 C U0 carry from sum ++ stl r7, 16(r16) C L0 ++ stl r8, 24(r16) C L1 ++ ++ umulh r11, r19, r7 C U1 #04 ++ addl r27, r28, r28 C U0 sum 2 mul's ++ ++ mull r12, r19, r8 C U1 #05 ++ cmpult r28, r27, r15 C L0 carry from sum ++ ldi r16, 64(r16) C L1 bookkeeping ++ addl r13, r29, r13 C U0 start carry cascade ++ ++ umulh r12, r19, r21 C U1 #06 ++ beq r13, $fix0c C U0 ++$ret0c: addl r22, r14, r26 C L0 ++ beq r26, $fix1c C U0 ++$ret1c: addl r23, r24, r27 C L0 ++ beq r27, $fix2c C U0 ++$ret2c: addl r28, r25, r28 C L0 ++ beq r28, $fix3c C U0 ++$ret3c: addl r1, r2, r20 C L0 sum 2 mul's ++ addl r3, r4, r2 C L0 #10 2 mul's ++ ldi r17, 64(r17) C L1 bookkeeping ++ cmpult r20, r1, r29 C U0 carry from sum ++ cmpult r2, r4, r4 C U0 carry from sum ++ stl r13, -32(r16) C L0 ++ stl r26, -24(r16) C L1 ++ addl r5, r6, r14 C U0 sum 2 mul's ++ stl r27, -16(r16) C L0 ++ stl r28, 
	-8(r16)		C L1
++	cmpult	r14, r6, r3	C U0 carry from sum
++	addl	r7, r3, r5	C L0 eat carry
++	addl	r20, r15, r20	C U0 carry cascade
++	beq	r20, $fix4c	C U0
++$ret4c:	addl	r2, r29, r6	C L0
++	beq	r6, $fix5c	C U0
++$ret5c:	addl	r14, r4, r7	C L0
++	beq	r7, $fix6c	C U0
++$ret6c:	addl	r5, r8, r8	C L0 sum 2
++	cmpult	r8, r5, r29	C L0 carry from last bunch
++	stl	r20, 0(r16)	C L0
++	stl	r6, 8(r16)	C L1
++	stl	r7, 16(r16)	C L0
++	stl	r8, 24(r16)	C L1
++	addl	r29, r21, r0
++
++	ldl	r26, 0(r30)
++	ldl	r9, 8(r30)
++	ldl	r10, 16(r30)
++	ldl	r11, 24(r30)
++	ldl	r12, 32(r30)
++	ldl	r13, 40(r30)
++	ldl	r14, 48(r30)
++	ldl	r15, 56(r30)
++	ldl	r29, 64(r30)
++	ldi	r30, 224(r30)
++	ret	r31, (r26), 1
++
++C $fix0w:	bis	r14, r29, r14	C join carries
++C	br	r31, $ret0w
++$fix1w:	bis	r24, r14, r24	C join carries
++	br	r31, $ret1w
++$fix2w:	bis	r25, r24, r25	C join carries
++	br	r31, $ret2w
++$fix3w:	bis	r15, r25, r15	C join carries
++	br	r31, $ret3w
++$fix0:	bis	r14, r29, r14	C join carries
++	br	r31, $ret0
++$fix1:	bis	r24, r14, r24	C join carries
++	br	r31, $ret1
++$fix2:	bis	r25, r24, r25	C join carries
++	br	r31, $ret2
++$fix3:	bis	r15, r25, r15	C join carries
++	br	r31, $ret3
++$fix4:	bis	r29, r15, r29	C join carries
++	br	r31, $ret4
++$fix5:	bis	r4, r29, r4	C join carries
++	br	r31, $ret5
++$fix6:	addl	r5, r4, r5	C can't carry twice!
++	br	r31, $ret6
++$fix0c:	bis	r14, r29, r14	C join carries
++	br	r31, $ret0c
++$fix1c:	bis	r24, r14, r24	C join carries
++	br	r31, $ret1c
++$fix2c:	bis	r25, r24, r25	C join carries
++	br	r31, $ret2c
++$fix3c:	bis	r15, r25, r15	C join carries
++	br	r31, $ret3c
++$fix4c:	bis	r29, r15, r29	C join carries
++	br	r31, $ret4c
++$fix5c:	bis	r4, r29, r4	C join carries
++	br	r31, $ret5c
++$fix6c:	addl	r5, r4, r5	C can't carry twice!
++	br	r31, $ret6c
++
++EPILOGUE(mpn_mul_1)
++ASM_END()
+diff --git a/mpn/sw_64/sw6/nails/README b/mpn/sw_64/sw6/nails/README
+new file mode 100644
+index 0000000..b214ac5
+--- /dev/null
++++ b/mpn/sw_64/sw6/nails/README
+@@ -0,0 +1,65 @@
++Copyright 2002, 2005 Free Software Foundation, Inc.
++
++This file is part of the GNU MP Library.
++
++The GNU MP Library is free software; you can redistribute it and/or modify
++it under the terms of either:
++
++ * the GNU Lesser General Public License as published by the Free
++ Software Foundation; either version 3 of the License, or (at your
++ option) any later version.
++
++or
++
++ * the GNU General Public License as published by the Free Software
++ Foundation; either version 2 of the License, or (at your option) any
++ later version.
++
++or both in parallel, as here.
++
++The GNU MP Library is distributed in the hope that it will be useful, but
++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++for more details.
++
++You should have received copies of the GNU General Public License and the
++GNU Lesser General Public License along with the GNU MP Library. If not,
++see https://www.gnu.org/licenses/.
++
++
++
++
++
++This directory contains assembly code for nails-enabled 21264. The code is not
++very well optimized.
++
++For addmul_N, as N grows larger, we could make multiple loads together, then do
++about 3.3 i/c. 10 cycles after the last load, we can increase to 4 i/c. This
++would surely allow addmul_4 to run at 2 c/l, but the same should be possible
++also for addmul_3 and perhaps even addmul_2.
++
++            current          fair              best
++Routine    c/l  unroll    c/l  unroll       c/l   i/c
++mul_1      3.25           2.75              2.75  3.273
++addmul_1   4.0    4       3.5    4  14      3.25  3.385
++addmul_2   4.0    1       2.5    2  10      2.25  3.333
++addmul_3   3.0    1       2.33   2  14      2     3.333
++addmul_4   2.5    1       2.125  2  17      2     3.135
++
++addmul_5   2      1          10
++addmul_6   2      1          12
++addmul_7   2      1          14
++
++(The "best" column doesn't account for bookkeeping instructions and
++thereby assumes infinite unrolling.)
++
++Basecase usages:
++
++1 addmul_1
++2 addmul_2
++3 addmul_3
++4 addmul_4
++5 addmul_3 + addmul_2   2.3998
++6 addmul_4 + addmul_2
++7 addmul_4 + addmul_3
+diff --git a/mpn/sw_64/sw6/nails/addmul_1.asm b/mpn/sw_64/sw6/nails/addmul_1.asm
+new file mode 100644
+index 0000000..1108355
+--- /dev/null
++++ b/mpn/sw_64/sw6/nails/addmul_1.asm
+@@ -0,0 +1,394 @@
++dnl Sw_64 sw6 nails mpn_addmul_1.
++
++dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl  * the GNU Lesser General Public License as published by the Free
++dnl    Software Foundation; either version 3 of the License, or (at your
++dnl    option) any later version.
++dnl
++dnl or
++dnl
++dnl  * the GNU General Public License as published by the Free Software
++dnl    Foundation; either version 2 of the License, or (at your option) any
++dnl    later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++C      cycles/limb
++C SW6:     4
++
++C TODO
++C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
++C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
++C    umulh.
++C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
++C    and would work since the loop structure is really regular.
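++C
++C For orientation, one limb step of the nails scheme can be modelled in C.
++C This is a hedged sketch, not the shipped code: umulh_() is a stand-in for
++C the high 64 bits of a full 64x64 product (the umulh instruction), and
++C pend carries both the nail part and the previous high word.
++C
++C	vs  = v << GMP_NAIL_BITS;      /* pre-shifted multiplier (vl0) */
++C	plo = vs * u;                  /* mull:  low 64 product bits   */
++C	phi = umulh_ (vs, u);          /* umulh: high 64 product bits  */
++C	/* the pre-shift aligns the numb/nail split with the mull/umulh
++C	   boundary: plo >> GMP_NAIL_BITS == (v*u) mod 2^GMP_NUMB_BITS,
++C	   and phi == (v*u) >> GMP_NUMB_BITS */
++C	acc   = (plo >> GMP_NAIL_BITS) + rp[i] + pend; /* no overflow  */
++C	rp[i] = acc & GMP_NUMB_MASK;                   /* given >= 2   */
++C	pend  = (acc >> GMP_NUMB_BITS) + phi;          /* nail bits    */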
++ ++C INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`n', `r18') ++define(`vl0',`r19') ++ ++define(`numb_mask',`r6') ++ ++define(`m0a',`r0') ++define(`m0b',`r1') ++define(`m1a',`r2') ++define(`m1b',`r3') ++define(`m2a',`r20') ++define(`m2b',`r21') ++define(`m3a',`r22') ++define(`m3b',`r23') ++ ++define(`acc0',`r25') ++define(`acc1',`r27') ++ ++define(`ul0',`r4') ++define(`ul1',`r5') ++define(`ul2',`r4') ++define(`ul3',`r5') ++ ++define(`rl0',`r24') ++define(`rl1',`r24') ++define(`rl2',`r24') ++define(`rl3',`r24') ++ ++define(`t0',`r7') ++define(`t1',`r8') ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`NUMB_BITS',`GMP_NUMB_BITS') ++ ++dnl This declaration is munged by configure ++NAILS_SUPPORT(2-63) ++ ++ASM_START() ++PROLOGUE(mpn_addmul_1) ++ sll vl0, NAIL_BITS, vl0 ++ ldi numb_mask, -1(r31) ++ srl numb_mask, NAIL_BITS, numb_mask ++ ++ and n, 3, r25 ++ cmpeq r25, 1, r21 ++ bne r21, L(1m4) ++ cmpeq r25, 2, r21 ++ bne r21, L(2m4) ++ beq r25, L(0m4) ++ ++L(3m4): ldl ul3, 0(up) ++ ldi n, -4(n) ++ ldl ul0, 8(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl ul1, 16(up) ++ ldi up, 24(up) ++ ldi rp, -8(rp) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge3) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ addl t0, r31, acc1 ++ addl rl3, acc1, acc1 ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ addl t0, m3b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ br r31, L(ta3) ++ ++L(ge3): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ addl t0, r31, acc1 ++ umulh vl0, ul2, m2b ++ addl rl3, acc1, acc1 ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ addl t0, m3b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ br r31, L(el3) ++ ++L(0m4): ldi n, -8(n) ++ ldl ul2, 0(up) ++ ldl ul3, 8(up) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge4) ++ ++ ldl rl2, 0(rp) ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, r31, acc0 ++ umulh vl0, ul1, m1b ++ addl rl2, acc0, acc0 ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ addl t0, m2b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ br r31, L(ta4) ++ ++L(ge4): ldl rl2, 0(rp) ++ srl m2a,NAIL_BITS, t0 ++ ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ addl t0, r31, acc0 ++ umulh vl0, ul1, m1b ++ addl rl2, acc0, acc0 ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ addl t0, m2b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ br r31, L(el0) ++ ++L(2m4): ldi n, -4(n) ++ ldl ul0, 0(up) ++ ldl ul1, 8(up) ++ ldi up, 16(up) ++ ldi rp, -16(rp) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge2) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ addl t0, r31, acc0 ++ addl rl0, acc0, acc0 ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ br r31, L(ta2) ++ ++L(ge2): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ addl t0, r31, acc0 ++ umulh vl0, ul3, m3b ++ addl rl0, acc0, acc0 ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ ldi rp, 32(rp) ++ mull vl0, ul0, m0a ++ addl t0, m0b, acc1 ++ 
srl acc0,NUMB_BITS, t1 ++ bge n, L(el2) ++ ++ br r31, L(ta6) ++ ++L(1m4): ldi n, -4(n) ++ ldl ul1, 0(up) ++ ldi up, 8(up) ++ ldi rp, -24(rp) ++ bge n, L(ge1) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ addl rl1, t0, acc1 ++ and acc1,numb_mask, r28 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, 24(rp) ++ addl t1, m1b, r0 ++ ret r31, (r26), 1 ++ ++L(ge1): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ ldi rp, 32(rp) ++ mull vl0, ul0, m0a ++ addl t0, r31, acc1 ++ umulh vl0, ul0, m0b ++ addl rl1, acc1, acc1 ++ ldl rl2, 0(rp) ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, m1b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ blt n, L(ta5) ++ ++L(ge5): ldl ul2, 0(up) ++ br r31, L(el1) ++ ++ ALIGN(16) ++L(top): mull vl0, ul0, m0a C U1 ++ addl t0, m0b, acc1 C L0 ++ srl acc0,NUMB_BITS, t1 C U0 ++ stl r28, -24(rp) C L1 ++C ++L(el2): umulh vl0, ul0, m0b C U1 ++ and acc0,numb_mask, r28 C L0 ++ addl rl1, acc1, acc1 C U0 ++ ldl rl2, 0(rp) C L1 ++C ++ unop C U1 ++ addl t1, acc1, acc1 C L0 ++ srl m2a,NAIL_BITS, t0 C U0 ++ ldl ul2, 0(up) C L1 ++C ++ mull vl0, ul1, m1a C U1 ++ addl t0, m1b, acc0 C L0 ++ srl acc1,NUMB_BITS, t1 C U0 ++ stl r28, -16(rp) C L1 ++C ++L(el1): umulh vl0, ul1, m1b C U1 ++ and acc1,numb_mask, r28 C L0 ++ addl rl2, acc0, acc0 C U0 ++ ldl rl3, 8(rp) C L1 ++C ++ ldi n, -4(n) C L1 ++ addl t1, acc0, acc0 C L0 ++ srl m3a,NAIL_BITS, t0 C U0 ++ ldl ul3, 8(up) C L1 ++C ++ mull vl0, ul2, m2a C U1 ++ addl t0, m2b, acc1 C L0 ++ srl acc0,NUMB_BITS, t1 C U0 ++ stl r28, -8(rp) C L1 ++C ++L(el0): umulh vl0, ul2, m2b C U1 ++ and acc0,numb_mask, r28 C L0 ++ addl rl3, acc1, acc1 C U0 ++ ldl rl0, 16(rp) C L1 ++C ++ unop C U1 ++ addl t1, acc1, acc1 C L0 ++ srl m0a,NAIL_BITS, t0 C U0 ++ ldl ul0, 16(up) C L1 ++C ++ mull vl0, ul3, m3a C U1 ++ addl t0, m3b, acc0 C L0 ++ srl acc1,NUMB_BITS, t1 C U0 ++ stl r28, 0(rp) C L1 ++C ++L(el3): umulh vl0, ul3, m3b C U1 ++ and acc1,numb_mask, r28 C L0 ++ addl rl0, acc0, acc0 C U0 ++ ldl rl1, 24(rp) C L1 ++C ++ unop C U1 ++ addl t1, acc0, acc0 C L0 ++ srl m1a,NAIL_BITS, t0 C U0 ++ ldl ul1, 24(up) C L1 ++C ++ ldi up, 32(up) C L0 ++ unop C U1 ++ ldi rp, 32(rp) C L1 ++ bge n, L(top) C U0 ++ ++L(end): mull vl0, ul0, m0a ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ stl r28, -24(rp) ++L(ta6): umulh vl0, ul0, m0b ++ and acc0,numb_mask, r28 ++ addl rl1, acc1, acc1 ++ ldl rl2, 0(rp) ++ addl t1, acc1, acc1 ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, m1b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, -16(rp) ++L(ta5): umulh vl0, ul1, m1b ++ and acc1,numb_mask, r28 ++ addl rl2, acc0, acc0 ++ ldl rl3, 8(rp) ++ addl t1, acc0, acc0 ++ srl m3a,NAIL_BITS, t0 ++ addl t0, m2b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ stl r28, -8(rp) ++ unop ++ ALIGN(16) ++L(ta4): and acc0,numb_mask, r28 ++ addl rl3, acc1, acc1 ++ ldl rl0, 16(rp) ++ addl t1, acc1, acc1 ++ srl m0a,NAIL_BITS, t0 ++ addl t0, m3b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, 0(rp) ++ unop ++ ALIGN(16) ++L(ta3): and acc1,numb_mask, r28 ++ addl rl0, acc0, acc0 ++ ldl rl1, 24(rp) ++ addl t1, acc0, acc0 ++ srl m1a,NAIL_BITS, t0 ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ stl r28, 8(rp) ++ unop ++ ALIGN(16) ++L(ta2): and acc0,numb_mask, r28 ++ addl rl1, acc1, acc1 ++ addl t1, acc1, acc1 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, 16(rp) ++ and 
acc1,numb_mask, r28 ++ addl t1, m1b, r0 ++ stl r28, 24(rp) ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/nails/addmul_2.asm b/mpn/sw_64/sw6/nails/addmul_2.asm +new file mode 100644 +index 0000000..29154b1 +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/addmul_2.asm +@@ -0,0 +1,146 @@ ++dnl Sw_64 sw6 nails mpn_addmul_2. ++ ++dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C Runs at 4.0 cycles/limb. ++ ++C We could either go for 2-way unrolling over 11 cycles, or 2.75 c/l, ++C or 4-way unrolling over 20 cycles, for 2.5 c/l. ++ ++ ++C INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`n',`r18') ++define(`vp',`r19') ++ ++C Useful register aliases ++define(`numb_mask',`r24') ++define(`ulimb',`r25') ++define(`rlimb',`r27') ++ ++define(`m0a',`r0') ++define(`m0b',`r1') ++define(`m1a',`r2') ++define(`m1b',`r3') ++ ++define(`acc0',`r4') ++define(`acc1',`r5') ++ ++define(`v0',`r6') ++define(`v1',`r7') ++ ++C Used for temps: r8 r19 r28 ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`NUMB_BITS',`GMP_NUMB_BITS') ++ ++C This declaration is munged by configure ++NAILS_SUPPORT(3-63) ++ ++ASM_START() ++PROLOGUE(mpn_addmul_2) ++ ldi numb_mask,-1(r31) ++ srl numb_mask,NAIL_BITS,numb_mask ++ ++ ldl v0, 0(vp) ++ ldl v1, 8(vp) ++ ++ bis r31, r31, acc0 C zero acc0 ++ sll v0,NAIL_BITS, v0 ++ bis r31, r31, acc1 C zero acc1 ++ sll v1,NAIL_BITS, v1 ++ bis r31, r31, r19 ++ ++ ldl ulimb, 0(up) ++ ldi up, 8(up) ++ mull v0, ulimb, m0a C U1 ++ umulh v0, ulimb, m0b C U1 ++ mull v1, ulimb, m1a C U1 ++ umulh v1, ulimb, m1b C U1 ++ ldi n, -1(n) ++ beq n, L(end) C U0 ++ ++ ALIGN(16) ++L(top): bis r31, r31, r31 C U1 nop ++ addl r19, acc0, acc0 C U0 propagate nail ++ ldl rlimb, 0(rp) C L0 ++ ldl ulimb, 0(up) C L1 ++ ++ ldi rp, 8(rp) C L1 ++ srl m0a,NAIL_BITS, r8 C U0 ++ ldi up, 8(up) C L0 ++ mull v0, ulimb, m0a C U1 ++ ++ addl r8, acc0, r19 C U0 ++ addl m0b, acc1, acc0 C L1 ++ umulh v0, ulimb, m0b C U1 ++ bis r31, r31, r31 C L0 nop ++ ++ addl rlimb, r19, r19 C L1 FINAL PROD-SUM ++ srl m1a,NAIL_BITS, r8 C U0 ++ ldi n, -1(n) C L0 ++ mull v1, ulimb, m1a C U1 ++ ++ addl r8, acc0, acc0 C U0 ++ bis r31, m1b, acc1 C L1 ++ umulh v1, ulimb, m1b C U1 ++ and r19,numb_mask, r28 C L0 extract numb part ++ ++ unop ++ srl r19,NUMB_BITS, r19 C U1 extract nail part ++ stl r28, -8(rp) C L1 ++ bne n, L(top) C U0 ++ ++L(end): ldl rlimb, 0(rp) ++ addl r19, acc0, acc0 C propagate nail ++ ldi rp, 8(rp) ++ srl 
m0a,NAIL_BITS, r8 C U0 ++ addl r8, acc0, r19 ++ addl m0b, acc1, acc0 ++ addl rlimb, r19, r19 ++ srl m1a,NAIL_BITS, r8 C U0 ++ addl r8, acc0, acc0 ++ bis r31, m1b, acc1 ++ and r19,numb_mask, r28 C extract limb ++ ++ srl r19,NUMB_BITS, r19 C extract nail ++ stl r28, -8(rp) ++ ++ addl r19, acc0, acc0 C propagate nail ++ and acc0,numb_mask, r28 ++ stl r28, 0(rp) ++ srl acc0,NUMB_BITS, r19 ++ addl r19, acc1, r0 ++ ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/nails/addmul_3.asm b/mpn/sw_64/sw6/nails/addmul_3.asm +new file mode 100644 +index 0000000..d2fbd97 +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/addmul_3.asm +@@ -0,0 +1,169 @@ ++dnl Sw_64 sw6 nails mpn_addmul_3. ++ ++dnl Copyright 2002, 2006 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C Runs at 3.0 cycles/limb. ++ ++C With 2-way unrolling, we could probably reach 2.25 c/l (3.33 i/c). 
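++C
++C The three-accumulator rotation in the loop below can be summarised in C
++C (a hedged sketch only; mj_lo/mj_hi denote mull/umulh of the pre-shifted
++C vj with the current up[] limb, cf. the sketch in addmul_1.asm, and pend
++C is the forwarded nail part):
++C
++C	t     = acc0 + pend + (m0_lo >> GMP_NAIL_BITS) + rp[i];
++C	rp[i] = t & GMP_NUMB_MASK;
++C	pend  = t >> GMP_NUMB_BITS;
++C	acc0  = acc1 + m0_hi + (m1_lo >> GMP_NAIL_BITS);
++C	acc1  = acc2 + m1_hi + (m2_lo >> GMP_NAIL_BITS);
++C	acc2  = m2_hi;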
++ ++ ++C INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`n',`r18') ++define(`vp',`r19') ++ ++C Useful register aliases ++define(`numb_mask',`r24') ++define(`ulimb',`r25') ++define(`rlimb',`r27') ++ ++define(`m0a',`r0') ++define(`m0b',`r1') ++define(`m1a',`r2') ++define(`m1b',`r3') ++define(`m2a',`r20') ++define(`m2b',`r21') ++ ++define(`acc0',`r4') ++define(`acc1',`r5') ++define(`acc2',`r22') ++ ++define(`v0',`r6') ++define(`v1',`r7') ++define(`v2',`r23') ++ ++C Used for temps: r8 r19 r28 ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`NUMB_BITS',`GMP_NUMB_BITS') ++ ++C This declaration is munged by configure ++NAILS_SUPPORT(3-63) ++ ++ASM_START() ++PROLOGUE(mpn_addmul_3) ++ ldi numb_mask,-1(r31) ++ srl numb_mask,NAIL_BITS,numb_mask ++ ++ ldl v0, 0(vp) ++ ldl v1, 8(vp) ++ ldl v2, 16(vp) ++ ++ bis r31, r31, acc0 C zero acc0 ++ sll v0,NAIL_BITS, v0 ++ bis r31, r31, acc1 C zero acc1 ++ sll v1,NAIL_BITS, v1 ++ bis r31, r31, acc2 C zero acc2 ++ sll v2,NAIL_BITS, v2 ++ bis r31, r31, r19 ++ ++ ldl ulimb, 0(up) ++ ldi up, 8(up) ++ mull v0, ulimb, m0a C U1 ++ umulh v0, ulimb, m0b C U1 ++ mull v1, ulimb, m1a C U1 ++ umulh v1, ulimb, m1b C U1 ++ ldi n, -1(n) ++ mull v2, ulimb, m2a C U1 ++ umulh v2, ulimb, m2b C U1 ++ beq n, L(end) C U0 ++ ++ ALIGN(16) ++L(top): ldl rlimb, 0(rp) C L1 ++ ldl ulimb, 0(up) C L0 ++ bis r31, r31, r31 C U0 nop ++ addl r19, acc0, acc0 C U1 propagate nail ++ ++ ldi rp, 8(rp) C L1 ++ srl m0a,NAIL_BITS, r8 C U0 ++ ldi up, 8(up) C L0 ++ mull v0, ulimb, m0a C U1 ++ ++ addl r8, acc0, r19 C U0 ++ addl m0b, acc1, acc0 C L1 ++ umulh v0, ulimb, m0b C U1 ++ bis r31, r31, r31 C L0 nop ++ ++ addl rlimb, r19, r19 C L1 ++ srl m1a,NAIL_BITS, r8 C U0 ++ bis r31, r31, r31 C L0 nop ++ mull v1, ulimb, m1a C U1 ++ ++ addl r8, acc0, acc0 C U0 ++ addl m1b, acc2, acc1 C L1 ++ umulh v1, ulimb, m1b C U1 ++ and r19,numb_mask, r28 C L0 extract numb part ++ ++ bis r31, r31, r31 C L1 nop ++ srl m2a,NAIL_BITS, r8 C U0 ++ ldi n, -1(n) C L0 ++ mull v2, ulimb, m2a C U1 ++ ++ addl r8, acc1, acc1 C L0 ++ bis r31, m2b, acc2 C L1 ++ umulh v2, ulimb, m2b C U1 ++ srl r19,NUMB_BITS, r19 C U0 extract nail part ++ ++ stl r28, -8(rp) C L ++ bne n, L(top) C U0 ++ ++L(end): ldl rlimb, 0(rp) ++ addl r19, acc0, acc0 C propagate nail ++ ldi rp, 8(rp) ++ srl m0a,NAIL_BITS, r8 C U0 ++ addl r8, acc0, r19 ++ addl m0b, acc1, acc0 ++ addl rlimb, r19, r19 ++ srl m1a,NAIL_BITS, r8 C U0 ++ addl r8, acc0, acc0 ++ addl m1b, acc2, acc1 ++ and r19,numb_mask, r28 C extract limb ++ srl m2a,NAIL_BITS, r8 C U0 ++ addl r8, acc1, acc1 ++ bis r31, m2b, acc2 ++ srl r19,NUMB_BITS, r19 C extract nail ++ stl r28, -8(rp) ++ ++ addl r19, acc0, acc0 C propagate nail ++ and acc0,numb_mask, r28 ++ stl r28, 0(rp) ++ srl acc0,NUMB_BITS, r19 ++ addl r19, acc1, acc1 ++ ++ and acc1,numb_mask, r28 ++ stl r28, 8(rp) ++ srl acc1,NUMB_BITS, r19 ++ addl r19, acc2, m0a ++ ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/nails/addmul_4.asm b/mpn/sw_64/sw6/nails/addmul_4.asm +new file mode 100644 +index 0000000..f253da1 +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/addmul_4.asm +@@ -0,0 +1,210 @@ ++dnl Sw_64 sw6 nails mpn_addmul_4. ++ ++dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. 
++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C Runs at 2.5 cycles/limb. ++ ++C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding ++C to 3.24 insn/cycle. ++ ++ ++C INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`n',`r18') ++define(`vp',`r19') ++ ++C Useful register aliases ++define(`numb_mask',`r24') ++define(`ulimb',`r25') ++define(`rlimb',`r27') ++ ++define(`m0a',`r0') ++define(`m0b',`r1') ++define(`m1a',`r2') ++define(`m1b',`r3') ++define(`m2a',`r20') ++define(`m2b',`r21') ++define(`m3a',`r12') ++define(`m3b',`r13') ++ ++define(`acc0',`r4') ++define(`acc1',`r5') ++define(`acc2',`r22') ++define(`acc3',`r14') ++ ++define(`v0',`r6') ++define(`v1',`r7') ++define(`v2',`r23') ++define(`v3',`r15') ++ ++C Used for temps: r8 r19 r28 ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`NUMB_BITS',`GMP_NUMB_BITS') ++ ++C This declaration is munged by configure ++NAILS_SUPPORT(4-63) ++ ++ASM_START() ++PROLOGUE(mpn_addmul_4) ++ ldi r30, -240(r30) ++ stl r12, 32(r30) ++ stl r13, 40(r30) ++ stl r14, 48(r30) ++ stl r15, 56(r30) ++ ++ ldi numb_mask,-1(r31) ++ srl numb_mask,NAIL_BITS,numb_mask ++ ++ ldl v0, 0(vp) ++ ldl v1, 8(vp) ++ ldl v2, 16(vp) ++ ldl v3, 24(vp) ++ ++ bis r31, r31, acc0 C zero acc0 ++ sll v0,NAIL_BITS, v0 ++ bis r31, r31, acc1 C zero acc1 ++ sll v1,NAIL_BITS, v1 ++ bis r31, r31, acc2 C zero acc2 ++ sll v2,NAIL_BITS, v2 ++ bis r31, r31, acc3 C zero acc3 ++ sll v3,NAIL_BITS, v3 ++ bis r31, r31, r19 ++ ++ ldl ulimb, 0(up) ++ ldi up, 8(up) ++ mull v0, ulimb, m0a C U1 ++ umulh v0, ulimb, m0b C U1 ++ mull v1, ulimb, m1a C U1 ++ umulh v1, ulimb, m1b C U1 ++ ldi n, -1(n) ++ mull v2, ulimb, m2a C U1 ++ umulh v2, ulimb, m2b C U1 ++ mull v3, ulimb, m3a C U1 ++ umulh v3, ulimb, m3b C U1 ++ beq n, L(end) C U0 ++ ++ ALIGN(16) ++L(top): bis r31, r31, r31 C U1 nop ++ ldl rlimb, 0(rp) C L0 ++ ldl ulimb, 0(up) C L1 ++ addl r19, acc0, acc0 C U0 propagate nail ++ ++ bis r31, r31, r31 C L0 nop ++ bis r31, r31, r31 C U1 nop ++ bis r31, r31, r31 C L1 nop ++ bis r31, r31, r31 C U0 nop ++ ++ ldi rp, 8(rp) C L0 ++ srl m0a,NAIL_BITS, r8 C U0 ++ ldi up, 8(up) C L1 ++ mull v0, ulimb, m0a C U1 ++ ++ addl r8, acc0, r19 C U0 ++ addl m0b, acc1, acc0 C L0 ++ umulh v0, ulimb, m0b C U1 ++ bis r31, r31, r31 C L1 nop ++ ++ addl rlimb, r19, r19 C L0 ++ srl m1a,NAIL_BITS, r8 C U0 ++ bis r31, r31, r31 C L1 nop ++ mull v1, ulimb, m1a C U1 ++ ++ addl r8, acc0, acc0 C U0 ++ addl m1b, acc2, acc1 C L0 ++ umulh v1, ulimb, m1b C U1 ++ and r19,numb_mask, r28 C L1 extract numb part ++ ++ bis r31, r31, r31 C 
L0 nop ++ srl m2a,NAIL_BITS, r8 C U0 ++ ldi n, -1(n) C L1 ++ mull v2, ulimb, m2a C U1 ++ ++ addl r8, acc1, acc1 C L1 ++ addl m2b, acc3, acc2 C L0 ++ umulh v2, ulimb, m2b C U1 ++ srl r19,NUMB_BITS, r19 C U0 extract nail part ++ ++ bis r31, r31, r31 C L0 nop ++ srl m3a,NAIL_BITS, r8 C U0 ++ stl r28, -8(rp) C L1 ++ mull v3, ulimb, m3a C U1 ++ ++ addl r8, acc2, acc2 C L0 ++ bis r31, m3b, acc3 C L1 ++ umulh v3, ulimb, m3b C U1 ++ bne n, L(top) C U0 ++ ++L(end): ldl rlimb, 0(rp) ++ addl r19, acc0, acc0 C propagate nail ++ ldi rp, 8(rp) C FIXME: DELETE ++ srl m0a,NAIL_BITS, r8 C U0 ++ addl r8, acc0, r19 ++ addl m0b, acc1, acc0 ++ addl rlimb, r19, r19 ++ srl m1a,NAIL_BITS, r8 C U0 ++ addl r8, acc0, acc0 ++ addl m1b, acc2, acc1 ++ and r19,numb_mask, r28 C extract limb ++ srl m2a,NAIL_BITS, r8 C U0 ++ addl r8, acc1, acc1 ++ addl m2b, acc3, acc2 ++ srl r19,NUMB_BITS, r19 C extract nail ++ srl m3a,NAIL_BITS, r8 C U0 ++ stl r28, -8(rp) ++ addl r8, acc2, acc2 ++ bis r31, m3b, acc3 ++ ++ addl r19, acc0, acc0 C propagate nail ++ and acc0,numb_mask, r28 ++ stl r28, 0(rp) ++ srl acc0,NUMB_BITS, r19 ++ addl r19, acc1, acc1 ++ ++ and acc1,numb_mask, r28 ++ stl r28, 8(rp) ++ srl acc1,NUMB_BITS, r19 ++ addl r19, acc2, acc2 ++ ++ and acc2,numb_mask, r28 ++ stl r28, 16(rp) ++ srl acc2,NUMB_BITS, r19 ++ addl r19, acc3, r0 ++ ++ ldl r12, 32(r30) ++ ldl r13, 40(r30) ++ ldl r14, 48(r30) ++ ldl r15, 56(r30) ++ ldi r30, 240(r30) ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/nails/aors_n.asm b/mpn/sw_64/sw6/nails/aors_n.asm +new file mode 100644 +index 0000000..cc183d4 +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/aors_n.asm +@@ -0,0 +1,233 @@ ++dnl Sw_64 sw6 nails mpn_add_n and mpn_sub_n. ++ ++dnl Copyright 2002, 2006 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++ ++dnl Runs at 2.5 cycles/limb. It would be possible to reach 2.0 cycles/limb ++dnl with 8-way unrolling. 
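++dnl
++dnl With nails there is no compare-based carry: the carry (or borrow) of
++dnl each OP lands above the numb field and is extracted with a plain shift.
++dnl As a hedged C model (not the shipped code):
++dnl
++dnl	r     = u OP v OP cy;       /* OP is + for add_n, - for sub_n   */
++dnl	rp[i] = r & GMP_NUMB_MASK;
++dnl	cy    = r >> CYSH;          /* add_n: CYSH = GMP_NUMB_BITS, the
++dnl	                               carry sits in the nail field;
++dnl	                               sub_n: CYSH = 63, the borrow is
++dnl	                               the sign bit of the difference   */
++dnl
++dnl which is why CYSH below is GMP_NUMB_BITS for add_n but 63 for sub_n;
++dnl the final `and r20, 1, r0' reduces this to the 0/1 return value.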
++ ++include(`../config.m4') ++ ++dnl INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`vp',`r18') ++define(`n',`r19') ++ ++define(`rl0',`r0') ++define(`rl1',`r1') ++define(`rl2',`r2') ++define(`rl3',`r3') ++ ++define(`ul0',`r4') ++define(`ul1',`r5') ++define(`ul2',`r6') ++define(`ul3',`r7') ++ ++define(`vl0',`r22') ++define(`vl1',`r23') ++define(`vl2',`r24') ++define(`vl3',`r25') ++ ++define(`numb_mask',`r21') ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`CYSH',`GMP_NUMB_BITS') ++ ++dnl This declaration is munged by configure ++NAILS_SUPPORT(1-63) ++ ++ifdef(`OPERATION_add_n', ` ++ define(`OP', addl) ++ define(`CYSH',`GMP_NUMB_BITS') ++ define(`func', mpn_add_n)') ++ifdef(`OPERATION_sub_n', ` ++ define(`OP', subl) ++ define(`CYSH',63) ++ define(`func', mpn_sub_n)') ++ ++MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n) ++ ++ASM_START() ++PROLOGUE(func) ++ ldi numb_mask, -1(r31) ++ srl numb_mask, NAIL_BITS, numb_mask ++ bis r31, r31, r20 ++ ++ and n, 3, r25 ++ ldi n, -4(n) ++ beq r25, L(ge4) ++ ++L(lp0): ldl ul0, 0(up) ++ ldi up, 8(up) ++ ldl vl0, 0(vp) ++ ldi vp, 8(vp) ++ ldi rp, 8(rp) ++ ldi r25, -1(r25) ++ OP ul0, vl0, rl0 ++ OP rl0, r20, rl0 ++ and rl0, numb_mask, r28 ++ stl r28, -8(rp) ++ srl rl0, CYSH, r20 ++ bne r25, L(lp0) ++ ++ blt n, L(ret) ++ ++L(ge4): ldl ul0, 0(up) ++ ldl vl0, 0(vp) ++ ldl ul1, 8(up) ++ ldl vl1, 8(vp) ++ ldl ul2, 16(up) ++ ldl vl2, 16(vp) ++ ldl ul3, 24(up) ++ ldl vl3, 24(vp) ++ ldi up, 32(up) ++ ldi vp, 32(vp) ++ ldi n, -4(n) ++ bge n, L(ge8) ++ ++ OP ul0, vl0, rl0 C main-add 0 ++ OP rl0, r20, rl0 C cy-add 0 ++ OP ul1, vl1, rl1 C main-add 1 ++ srl rl0, CYSH, r20 C gen cy 0 ++ OP rl1, r20, rl1 C cy-add 1 ++ and rl0,numb_mask, r27 ++ br r31, L(cj0) ++ ++L(ge8): OP ul0, vl0, rl0 C main-add 0 ++ ldl ul0, 0(up) ++ ldl vl0, 0(vp) ++ OP rl0, r20, rl0 C cy-add 0 ++ OP ul1, vl1, rl1 C main-add 1 ++ srl rl0, CYSH, r20 C gen cy 0 ++ ldl ul1, 8(up) ++ ldl vl1, 8(vp) ++ OP rl1, r20, rl1 C cy-add 1 ++ and rl0,numb_mask, r27 ++ OP ul2, vl2, rl2 C main-add 2 ++ srl rl1, CYSH, r20 C gen cy 1 ++ ldl ul2, 16(up) ++ ldl vl2, 16(vp) ++ OP rl2, r20, rl2 C cy-add 2 ++ and rl1,numb_mask, r28 ++ stl r27, 0(rp) ++ OP ul3, vl3, rl3 C main-add 3 ++ srl rl2, CYSH, r20 C gen cy 2 ++ ldl ul3, 24(up) ++ ldl vl3, 24(vp) ++ OP rl3, r20, rl3 C cy-add 3 ++ and rl2,numb_mask, r27 ++ stl r28, 8(rp) ++ ldi rp, 32(rp) ++ ldi up, 32(up) ++ ldi vp, 32(vp) ++ ldi n, -4(n) ++ blt n, L(end) ++ ++ ALIGN(32) ++L(top): OP ul0, vl0, rl0 C main-add 0 ++ srl rl3, CYSH, r20 C gen cy 3 ++ ldl ul0, 0(up) ++ ldl vl0, 0(vp) ++ ++ OP rl0, r20, rl0 C cy-add 0 ++ and rl3,numb_mask, r28 ++ stl r27, -16(rp) ++ bis r31, r31, r31 ++ ++ OP ul1, vl1, rl1 C main-add 1 ++ srl rl0, CYSH, r20 C gen cy 0 ++ ldl ul1, 8(up) ++ ldl vl1, 8(vp) ++ ++ OP rl1, r20, rl1 C cy-add 1 ++ and rl0,numb_mask, r27 ++ stl r28, -8(rp) ++ bis r31, r31, r31 ++ ++ OP ul2, vl2, rl2 C main-add 2 ++ srl rl1, CYSH, r20 C gen cy 1 ++ ldl ul2, 16(up) ++ ldl vl2, 16(vp) ++ ++ OP rl2, r20, rl2 C cy-add 2 ++ and rl1,numb_mask, r28 ++ stl r27, 0(rp) ++ bis r31, r31, r31 ++ ++ OP ul3, vl3, rl3 C main-add 3 ++ srl rl2, CYSH, r20 C gen cy 2 ++ ldl ul3, 24(up) ++ ldl vl3, 24(vp) ++ ++ OP rl3, r20, rl3 C cy-add 3 ++ and rl2,numb_mask, r27 ++ stl r28, 8(rp) ++ bis r31, r31, r31 ++ ++ bis r31, r31, r31 ++ ldi n, -4(n) ++ ldi up, 32(up) ++ ldi vp, 32(vp) ++ ++ bis r31, r31, r31 ++ bis r31, r31, r31 ++ ldi rp, 32(rp) ++ bge n, L(top) ++ ++L(end): OP ul0, vl0, rl0 C main-add 0 ++ srl rl3, CYSH, r20 C gen cy 3 ++ OP rl0, r20, rl0 C cy-add 0 ++ and 
rl3,numb_mask, r28 ++ stl r27, -16(rp) ++ OP ul1, vl1, rl1 C main-add 1 ++ srl rl0, CYSH, r20 C gen cy 0 ++ OP rl1, r20, rl1 C cy-add 1 ++ and rl0,numb_mask, r27 ++ stl r28, -8(rp) ++L(cj0): OP ul2, vl2, rl2 C main-add 2 ++ srl rl1, CYSH, r20 C gen cy 1 ++ OP rl2, r20, rl2 C cy-add 2 ++ and rl1,numb_mask, r28 ++ stl r27, 0(rp) ++ OP ul3, vl3, rl3 C main-add 3 ++ srl rl2, CYSH, r20 C gen cy 2 ++ OP rl3, r20, rl3 C cy-add 3 ++ and rl2,numb_mask, r27 ++ stl r28, 8(rp) ++ ++ srl rl3, CYSH, r20 C gen cy 3 ++ and rl3,numb_mask, r28 ++ stl r27, 16(rp) ++ stl r28, 24(rp) ++ ++L(ret): and r20, 1, r0 ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/nails/gmp-mparam.h b/mpn/sw_64/sw6/nails/gmp-mparam.h +new file mode 100644 +index 0000000..7949fe8 +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/gmp-mparam.h +@@ -0,0 +1,72 @@ ++/* gmp-mparam.h -- Compiler/machine parameter header file. ++ ++Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. 
*/ ++ ++#define GMP_LIMB_BITS 64 ++#define GMP_LIMB_BYTES 8 ++ ++/* Generated by tuneup.c, 2004-02-07, gcc 3.3 */ ++ ++#define MUL_TOOM22_THRESHOLD 40 ++#define MUL_TOOM33_THRESHOLD 236 ++ ++#define SQR_BASECASE_THRESHOLD 7 /* karatsuba */ ++#define SQR_TOOM2_THRESHOLD 0 /* never sqr_basecase */ ++#define SQR_TOOM3_THRESHOLD 120 ++ ++#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ ++#define DIV_DC_THRESHOLD 48 ++#define POWM_THRESHOLD 113 ++ ++#define HGCD_THRESHOLD 78 ++#define GCD_ACCEL_THRESHOLD 3 ++#define GCD_DC_THRESHOLD 392 ++#define JACOBI_BASE_METHOD 1 ++ ++#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ ++#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ ++#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ ++#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ ++#define USE_PREINV_DIVREM_1 0 /* no preinv with nails */ ++#define USE_PREINV_MOD_1 0 /* no preinv with nails */ ++#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ ++#define DIVEXACT_1_THRESHOLD 0 /* always */ ++#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ ++ ++#define GET_STR_DC_THRESHOLD 15 ++#define GET_STR_PRECOMPUTE_THRESHOLD 24 ++#define SET_STR_THRESHOLD 6336 ++ ++#define MUL_FFT_TABLE { 688, 1440, 3648, 6400, 25600, 0 } ++#define MUL_FFT_MODF_THRESHOLD 488 ++#define MUL_FFT_THRESHOLD 3712 ++ ++#define SQR_FFT_TABLE { 432, 864, 3136, 6400, 25600, 0 } ++#define SQR_FFT_MODF_THRESHOLD 480 ++#define SQR_FFT_THRESHOLD 2976 +diff --git a/mpn/sw_64/sw6/nails/mul_1.asm b/mpn/sw_64/sw6/nails/mul_1.asm +new file mode 100644 +index 0000000..63c14ad +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/mul_1.asm +@@ -0,0 +1,362 @@ ++dnl Sw_64 sw6 nails mpn_mul_1. ++ ++dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 3.25 ++ ++C TODO ++C * Reroll loop for 3.0 c/l with current 4-way unrolling. ++C * The loop is overscheduled wrt loads and wrt multiplies, in particular ++C umulh. ++C * Use FP loop count and multiple exit points, that would simplify feed-in lp0 ++C and would work since the loop structure is really regular. 
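A hedged C model of the per-limb recurrence this file implements may help here: the multiplier is pre-shifted left by GMP_NAIL_BITS, so the mull/umulh pair splits the product exactly at the numb/nail boundary. The sketch assumes 64-bit limbs and a nails build, and uses a 128-bit product in place of mull/umulh; the function name is hypothetical:

    #include <gmp.h>

    mp_limb_t
    nails_mul_1_model (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
                       mp_limb_t v)
    {
      mp_limb_t cy = 0;
      v <<= GMP_NAIL_BITS;                      /* sll vl0, NAIL_BITS, vl0 */
      for (mp_size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) v * up[i];
          mp_limb_t lo = (mp_limb_t) p;         /* mull  */
          mp_limb_t hi = (mp_limb_t) (p >> 64); /* umulh */
          mp_limb_t acc = (lo >> GMP_NAIL_BITS) + cy;
          rp[i] = acc & GMP_NUMB_MASK;          /* extract limb */
          cy = hi + (acc >> GMP_NUMB_BITS);     /* high half plus nail carry */
        }
      return cy;
    }

The unrolled assembly below computes the same recurrence four limbs at a time, keeping the mull/umulh results of adjacent limbs in flight simultaneously.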
++ ++C INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`n', `r18') ++define(`vl0',`r19') ++ ++define(`numb_mask',`r6') ++ ++define(`m0a',`r0') ++define(`m0b',`r1') ++define(`m1a',`r2') ++define(`m1b',`r3') ++define(`m2a',`r20') ++define(`m2b',`r21') ++define(`m3a',`r22') ++define(`m3b',`r23') ++ ++define(`acc0',`r25') ++define(`acc1',`r27') ++ ++define(`ul0',`r4') ++define(`ul1',`r5') ++define(`ul2',`r4') ++define(`ul3',`r5') ++ ++define(`rl0',`r24') ++define(`rl1',`r24') ++define(`rl2',`r24') ++define(`rl3',`r24') ++ ++define(`t0',`r7') ++define(`t1',`r8') ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`NUMB_BITS',`GMP_NUMB_BITS') ++ ++dnl This declaration is munged by configure ++NAILS_SUPPORT(1-63) ++ ++ASM_START() ++PROLOGUE(mpn_mul_1) ++ sll vl0, NAIL_BITS, vl0 ++ ldi numb_mask, -1(r31) ++ srl numb_mask, NAIL_BITS, numb_mask ++ ++ and n, 3, r25 ++ cmpeq r25, 1, r21 ++ bne r21, L(1m4) ++ cmpeq r25, 2, r21 ++ bne r21, L(2m4) ++ beq r25, L(0m4) ++ ++L(3m4): ldl ul3, 0(up) ++ ldi n, -4(n) ++ ldl ul0, 8(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl ul1, 16(up) ++ ldi up, 24(up) ++ ldi rp, -8(rp) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge3) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ srl m3a,NAIL_BITS, t0 ++ addl t0, r31, acc1 ++ srl m0a,NAIL_BITS, t0 ++ addl t0, m3b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ br r31, L(ta3) ++ ++L(ge3): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ srl m3a,NAIL_BITS, t0 ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ addl t0, r31, acc1 ++ umulh vl0, ul2, m2b ++ srl m0a,NAIL_BITS, t0 ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ addl t0, m3b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ br r31, L(el3) ++ ++L(0m4): ldi n, -8(n) ++ ldl ul2, 0(up) ++ ldl ul3, 8(up) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge4) ++ ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, r31, acc0 ++ umulh vl0, ul1, m1b ++ srl m3a,NAIL_BITS, t0 ++ addl t0, m2b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ br r31, L(ta4) ++ ++L(ge4): srl m2a,NAIL_BITS, t0 ++ ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ addl t0, r31, acc0 ++ umulh vl0, ul1, m1b ++ srl m3a,NAIL_BITS, t0 ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ addl t0, m2b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ br r31, L(el0) ++ ++L(2m4): ldi n, -4(n) ++ ldl ul0, 0(up) ++ ldl ul1, 8(up) ++ ldi up, 16(up) ++ ldi rp, -16(rp) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge2) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ srl m0a,NAIL_BITS, t0 ++ addl t0, r31, acc0 ++ srl m1a,NAIL_BITS, t0 ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ br r31, L(ta2) ++ ++L(ge2): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ srl m0a,NAIL_BITS, t0 ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ addl t0, r31, acc0 ++ umulh vl0, ul3, m3b ++ srl m1a,NAIL_BITS, t0 ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ ldi rp, 32(rp) ++ mull vl0, ul0, m0a ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ bge n, L(el2) ++ ++ br r31, L(ta6) ++ ++L(1m4): ldi n, -4(n) ++ ldl ul1, 0(up) ++ ldi up, 8(up) ++ ldi rp, -24(rp) ++ bge n, L(ge1) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ srl m1a,NAIL_BITS, t0 ++ addl t0, r31, acc1 ++ and acc1,numb_mask, r28 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, 24(rp) ++ addl t1, m1b, r0 ++ ret r31, (r26), 1 ++ 
++L(ge1): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ srl m1a,NAIL_BITS, t0 ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ ldi rp, 32(rp) ++ mull vl0, ul0, m0a ++ addl t0, r31, acc1 ++ umulh vl0, ul0, m0b ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, m1b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ blt n, L(ta5) ++ ++L(ge5): ldl ul2, 0(up) ++ br r31, L(el1) ++ ++ ALIGN(16) ++L(top): mull vl0, ul0, m0a C U1 ++ addl t0, m0b, acc1 C L0 ++ srl acc0,NUMB_BITS, t1 C U0 ++ stl r28, -24(rp) C L1 ++C ++L(el2): umulh vl0, ul0, m0b C U1 ++ and acc0,numb_mask, r28 C L0 ++ unop C U0 ++ unop C L1 ++C ++ unop C U1 ++ addl t1, acc1, acc1 C L0 ++ srl m2a,NAIL_BITS, t0 C U0 ++ ldl ul2, 0(up) C L1 ++C ++ mull vl0, ul1, m1a C U1 ++ addl t0, m1b, acc0 C L0 ++ srl acc1,NUMB_BITS, t1 C U0 ++ stl r28, -16(rp) C L1 ++C ++L(el1): umulh vl0, ul1, m1b C U1 ++ and acc1,numb_mask, r28 C L0 ++ unop C U0 ++ ldi n, -4(n) C L1 ++C ++ unop C U1 ++ addl t1, acc0, acc0 C L0 ++ srl m3a,NAIL_BITS, t0 C U0 ++ ldl ul3, 8(up) C L1 ++C ++ mull vl0, ul2, m2a C U1 ++ addl t0, m2b, acc1 C L0 ++ srl acc0,NUMB_BITS, t1 C U0 ++ stl r28, -8(rp) C L1 ++C ++L(el0): umulh vl0, ul2, m2b C U1 ++ and acc0,numb_mask, r28 C L0 ++ unop C U0 ++ unop C L1 ++C ++ unop C U1 ++ addl t1, acc1, acc1 C L0 ++ srl m0a,NAIL_BITS, t0 C U0 ++ ldl ul0, 16(up) C L1 ++C ++ mull vl0, ul3, m3a C U1 ++ addl t0, m3b, acc0 C L0 ++ srl acc1,NUMB_BITS, t1 C U0 ++ stl r28, 0(rp) C L1 ++C ++L(el3): umulh vl0, ul3, m3b C U1 ++ and acc1,numb_mask, r28 C L0 ++ unop C U0 ++ unop C L1 ++C ++ unop C U1 ++ addl t1, acc0, acc0 C L0 ++ srl m1a,NAIL_BITS, t0 C U0 ++ ldl ul1, 24(up) C L1 ++C ++ ldi up, 32(up) C L0 ++ unop C U1 ++ ldi rp, 32(rp) C L1 ++ bge n, L(top) C U0 ++ ++L(end): mull vl0, ul0, m0a ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ stl r28, -24(rp) ++L(ta6): umulh vl0, ul0, m0b ++ and acc0,numb_mask, r28 ++ addl t1, acc1, acc1 ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, m1b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, -16(rp) ++L(ta5): umulh vl0, ul1, m1b ++ and acc1,numb_mask, r28 ++ addl t1, acc0, acc0 ++ srl m3a,NAIL_BITS, t0 ++ addl t0, m2b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ stl r28, -8(rp) ++ ALIGN(16) ++L(ta4): and acc0,numb_mask, r28 ++ addl t1, acc1, acc1 ++ srl m0a,NAIL_BITS, t0 ++ addl t0, m3b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, 0(rp) ++ unop ++ ALIGN(16) ++L(ta3): and acc1,numb_mask, r28 ++ addl t1, acc0, acc0 ++ srl m1a,NAIL_BITS, t0 ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ stl r28, 8(rp) ++ unop ++ ALIGN(16) ++L(ta2): and acc0,numb_mask, r28 ++ addl t1, acc1, acc1 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, 16(rp) ++ and acc1,numb_mask, r28 ++ addl t1, m1b, r0 ++ stl r28, 24(rp) ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/nails/submul_1.asm b/mpn/sw_64/sw6/nails/submul_1.asm +new file mode 100644 +index 0000000..3ac8f2e +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/submul_1.asm +@@ -0,0 +1,394 @@ ++dnl Sw_64 sw6 nails mpn_submul_1. ++ ++dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. 
++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 4 ++ ++C TODO ++C * Reroll loop for 3.75 c/l with current 4-way unrolling. ++C * The loop is overscheduled wrt loads and wrt multiplies, in particular ++C umulh. ++C * Use FP loop count and multiple exit points, that would simplify feed-in lp0 ++C and would work since the loop structure is really regular. ++ ++C INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`n', `r18') ++define(`vl0',`r19') ++ ++define(`numb_mask',`r6') ++ ++define(`m0a',`r0') ++define(`m0b',`r1') ++define(`m1a',`r2') ++define(`m1b',`r3') ++define(`m2a',`r20') ++define(`m2b',`r21') ++define(`m3a',`r22') ++define(`m3b',`r23') ++ ++define(`acc0',`r25') ++define(`acc1',`r27') ++ ++define(`ul0',`r4') ++define(`ul1',`r5') ++define(`ul2',`r4') ++define(`ul3',`r5') ++ ++define(`rl0',`r24') ++define(`rl1',`r24') ++define(`rl2',`r24') ++define(`rl3',`r24') ++ ++define(`t0',`r7') ++define(`t1',`r8') ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`NUMB_BITS',`GMP_NUMB_BITS') ++ ++dnl This declaration is munged by configure ++NAILS_SUPPORT(2-63) ++ ++ASM_START() ++PROLOGUE(mpn_submul_1) ++ sll vl0, NAIL_BITS, vl0 ++ ldi numb_mask, -1(r31) ++ srl numb_mask, NAIL_BITS, numb_mask ++ ++ and n, 3, r25 ++ cmpeq r25, 1, r21 ++ bne r21, L(1m4) ++ cmpeq r25, 2, r21 ++ bne r21, L(2m4) ++ beq r25, L(0m4) ++ ++L(3m4): ldl ul3, 0(up) ++ ldi n, -4(n) ++ ldl ul0, 8(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl ul1, 16(up) ++ ldi up, 24(up) ++ ldi rp, -8(rp) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge3) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ addl t0, r31, acc1 ++ subl rl3, acc1, acc1 ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ addl t0, m3b, acc0 ++ sra acc1,NUMB_BITS, t1 ++ br r31, L(ta3) ++ ++L(ge3): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ addl t0, r31, acc1 ++ umulh vl0, ul2, m2b ++ subl rl3, acc1, acc1 ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ addl t0, m3b, acc0 ++ sra acc1,NUMB_BITS, t1 ++ br r31, L(el3) ++ ++L(0m4): ldi n, -8(n) ++ ldl ul2, 0(up) ++ ldl ul3, 8(up) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge4) ++ ++ ldl rl2, 0(rp) ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, 
ul1, m1a ++ addl t0, r31, acc0 ++ umulh vl0, ul1, m1b ++ subl rl2, acc0, acc0 ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ addl t0, m2b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ br r31, L(ta4) ++ ++L(ge4): ldl rl2, 0(rp) ++ srl m2a,NAIL_BITS, t0 ++ ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ addl t0, r31, acc0 ++ umulh vl0, ul1, m1b ++ subl rl2, acc0, acc0 ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ addl t0, m2b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ br r31, L(el0) ++ ++L(2m4): ldi n, -4(n) ++ ldl ul0, 0(up) ++ ldl ul1, 8(up) ++ ldi up, 16(up) ++ ldi rp, -16(rp) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge2) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ addl t0, r31, acc0 ++ subl rl0, acc0, acc0 ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ addl t0, m0b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ br r31, L(ta2) ++ ++L(ge2): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ addl t0, r31, acc0 ++ umulh vl0, ul3, m3b ++ subl rl0, acc0, acc0 ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ ldi rp, 32(rp) ++ mull vl0, ul0, m0a ++ addl t0, m0b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ bge n, L(el2) ++ ++ br r31, L(ta6) ++ ++L(1m4): ldi n, -4(n) ++ ldl ul1, 0(up) ++ ldi up, 8(up) ++ ldi rp, -24(rp) ++ bge n, L(ge1) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ subl rl1, t0, acc1 ++ and acc1,numb_mask, r28 ++ sra acc1,NUMB_BITS, t1 ++ stl r28, 24(rp) ++ subl m1b, t1, r0 ++ ret r31, (r26), 1 ++ ++L(ge1): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ ldi rp, 32(rp) ++ mull vl0, ul0, m0a ++ addl t0, r31, acc1 ++ umulh vl0, ul0, m0b ++ subl rl1, acc1, acc1 ++ ldl rl2, 0(rp) ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, m1b, acc0 ++ sra acc1,NUMB_BITS, t1 ++ blt n, L(ta5) ++ ++L(ge5): ldl ul2, 0(up) ++ br r31, L(el1) ++ ++ ALIGN(16) ++L(top): mull vl0, ul0, m0a C U1 ++ addl t0, m0b, acc1 C L0 ++ sra acc0,NUMB_BITS, t1 C U0 ++ stl r28, -24(rp) C L1 ++C ++L(el2): umulh vl0, ul0, m0b C U1 ++ and acc0,numb_mask, r28 C L0 ++ subl rl1, acc1, acc1 C U0 ++ ldl rl2, 0(rp) C L1 ++C ++ unop C U1 ++ addl t1, acc1, acc1 C L0 ++ srl m2a,NAIL_BITS, t0 C U0 ++ ldl ul2, 0(up) C L1 ++C ++ mull vl0, ul1, m1a C U1 ++ addl t0, m1b, acc0 C L0 ++ sra acc1,NUMB_BITS, t1 C U0 ++ stl r28, -16(rp) C L1 ++C ++L(el1): umulh vl0, ul1, m1b C U1 ++ and acc1,numb_mask, r28 C L0 ++ subl rl2, acc0, acc0 C U0 ++ ldl rl3, 8(rp) C L1 ++C ++ ldi n, -4(n) C L1 ++ addl t1, acc0, acc0 C L0 ++ srl m3a,NAIL_BITS, t0 C U0 ++ ldl ul3, 8(up) C L1 ++C ++ mull vl0, ul2, m2a C U1 ++ addl t0, m2b, acc1 C L0 ++ sra acc0,NUMB_BITS, t1 C U0 ++ stl r28, -8(rp) C L1 ++C ++L(el0): umulh vl0, ul2, m2b C U1 ++ and acc0,numb_mask, r28 C L0 ++ subl rl3, acc1, acc1 C U0 ++ ldl rl0, 16(rp) C L1 ++C ++ unop C U1 ++ addl t1, acc1, acc1 C L0 ++ srl m0a,NAIL_BITS, t0 C U0 ++ ldl ul0, 16(up) C L1 ++C ++ mull vl0, ul3, m3a C U1 ++ addl t0, m3b, acc0 C L0 ++ sra acc1,NUMB_BITS, t1 C U0 ++ stl r28, 0(rp) C L1 ++C ++L(el3): umulh vl0, ul3, m3b C U1 ++ and acc1,numb_mask, r28 C 
L0 ++ subl rl0, acc0, acc0 C U0 ++ ldl rl1, 24(rp) C L1 ++C ++ unop C U1 ++ addl t1, acc0, acc0 C L0 ++ srl m1a,NAIL_BITS, t0 C U0 ++ ldl ul1, 24(up) C L1 ++C ++ ldi up, 32(up) C L0 ++ unop C U1 ++ ldi rp, 32(rp) C L1 ++ bge n, L(top) C U0 ++ ++L(end): mull vl0, ul0, m0a ++ addl t0, m0b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ stl r28, -24(rp) ++L(ta6): umulh vl0, ul0, m0b ++ and acc0,numb_mask, r28 ++ subl rl1, acc1, acc1 ++ ldl rl2, 0(rp) ++ addl t1, acc1, acc1 ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, m1b, acc0 ++ sra acc1,NUMB_BITS, t1 ++ stl r28, -16(rp) ++L(ta5): umulh vl0, ul1, m1b ++ and acc1,numb_mask, r28 ++ subl rl2, acc0, acc0 ++ ldl rl3, 8(rp) ++ addl t1, acc0, acc0 ++ srl m3a,NAIL_BITS, t0 ++ addl t0, m2b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ stl r28, -8(rp) ++ unop ++ ALIGN(16) ++L(ta4): and acc0,numb_mask, r28 ++ subl rl3, acc1, acc1 ++ ldl rl0, 16(rp) ++ addl t1, acc1, acc1 ++ srl m0a,NAIL_BITS, t0 ++ addl t0, m3b, acc0 ++ sra acc1,NUMB_BITS, t1 ++ stl r28, 0(rp) ++ unop ++ ALIGN(16) ++L(ta3): and acc1,numb_mask, r28 ++ subl rl0, acc0, acc0 ++ ldl rl1, 24(rp) ++ addl t1, acc0, acc0 ++ srl m1a,NAIL_BITS, t0 ++ addl t0, m0b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ stl r28, 8(rp) ++ unop ++ ALIGN(16) ++L(ta2): and acc0,numb_mask, r28 ++ subl rl1, acc1, acc1 ++ addl t1, acc1, acc1 ++ sra acc1,NUMB_BITS, t1 ++ stl r28, 16(rp) ++ and acc1,numb_mask, r28 ++ subl m1b, t1, r0 ++ stl r28, 24(rp) ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/slot.pl b/mpn/sw_64/sw6/slot.pl +new file mode 100755 +index 0000000..dbb40b4 +--- /dev/null ++++ b/mpn/sw_64/sw6/slot.pl +@@ -0,0 +1,318 @@ ++#!/usr/bin/perl -w ++ ++# Copyright 2000, 2001, 2003-2005, 2011 Free Software Foundation, Inc. ++# ++# This file is part of the GNU MP Library. ++# ++# The GNU MP Library is free software; you can redistribute it and/or modify ++# it under the terms of either: ++# ++# * the GNU Lesser General Public License as published by the Free ++# Software Foundation; either version 3 of the License, or (at your ++# option) any later version. ++# ++# or ++# ++# * the GNU General Public License as published by the Free Software ++# Foundation; either version 2 of the License, or (at your option) any ++# later version. ++# ++# or both in parallel, as here. ++# ++# The GNU MP Library is distributed in the hope that it will be useful, but ++# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++# for more details. ++# ++# You should have received copies of the GNU General Public License and the ++# GNU Lesser General Public License along with the GNU MP Library. If not, ++# see https://www.gnu.org/licenses/. ++ ++ ++# Usage: slot.pl [filename.o]... ++# ++# Run "objdump" to produce a disassembly of the given object file(s) and ++# annotate the output with "U" or "L" slotting which Sw_64 SW6 will use. ++# ++# When an instruction is E (ie. either U or L), an "eU" or "eL" is shown, as ++# a reminder that it wasn't a fixed requirement that gave the U or L, but ++# the octaword slotting rules. ++# ++# If an instruction is not recognised, that octaword does not get any U/L ++# shown, only lower-case "u", "l" or "e" for the instructions which are ++# known. Add any unknown instructions to %optable below. ++ ++ ++use strict; ++ ++# The U or L which various instructions demand, or E if either. 
++# ++my %optable = ++ ( ++ 'addl' => 'E', ++ 'and' => 'E', ++ 'andnot' => 'E', ++ 'beq' => 'U', ++ 'bge' => 'U', ++ 'bgt' => 'U', ++ 'bic' => 'E', ++ 'bis' => 'E', ++ 'blt' => 'U', ++ 'bne' => 'U', ++ 'br' => 'L', ++ 'clr' => 'E', ++ 'cmpule' => 'E', ++ 'cmpult' => 'E', ++ 'cmpeq' => 'E', ++ 'seleq' => 'E', ++ 'selne' => 'E', ++ 'ctpop' => 'U', ++ 'ctlz' => 'U', ++ 'cttz' => 'U', ++ 'ext0b' => 'U', ++ 'ext6b' => 'U', ++ 'ext2b' => 'U', ++ 'ext7b' => 'U', ++ 'ext3b' => 'U', ++ 'ext5b' => 'U', ++ 'ext1b' => 'U', ++ 'call' => 'L', ++ 'ldi' => 'E', ++ 'ldih' => 'E', ++ 'ldbu' => 'L', ++ 'ldw' => 'L', ++ 'ldl' => 'L', ++ 'fldd' => 'L', ++ 'ret' => 'L', ++ 'mov' => 'E', ++ 'mulw' => 'U', ++ 'mull' => 'U', ++ 'negl' => 'E', ++ 'nop' => 'E', ++ 'not' => 'E', ++ 's8addl' => 'E', ++ 's8subl' => 'E', ++ # 'sextb' => ? ++ # 'sextl' => ? ++ 'sll' => 'U', ++ 'srl' => 'U', ++ 'stl' => 'L', ++ 'subl' => 'E', ++ 'umulh' => 'U', ++ 'unop' => 'E', ++ 'xor' => 'E', ++ ); ++ ++# Slottings used for a given pattern of U/L/E in an octaword. This is as ++# per the "Ebox Slotting" section of the SW6 hardware reference manual. ++# ++my %slottable = ++ ( ++ 'EEEE' => 'ULUL', ++ 'EEEL' => 'ULUL', ++ 'EEEU' => 'ULLU', ++ 'EELE' => 'ULLU', ++ 'EELL' => 'UULL', ++ 'EELU' => 'ULLU', ++ 'EEUE' => 'ULUL', ++ 'EEUL' => 'ULUL', ++ 'EEUU' => 'LLUU', ++ 'ELEE' => 'ULUL', ++ 'ELEL' => 'ULUL', ++ 'ELEU' => 'ULLU', ++ 'ELLE' => 'ULLU', ++ 'ELLL' => 'ULLL', ++ 'ELLU' => 'ULLU', ++ 'ELUE' => 'ULUL', ++ 'ELUL' => 'ULUL', ++ ++ 'LLLL' => 'LLLL', ++ 'LLLU' => 'LLLU', ++ 'LLUE' => 'LLUU', ++ 'LLUL' => 'LLUL', ++ 'LLUU' => 'LLUU', ++ 'LUEE' => 'LULU', ++ 'LUEL' => 'LUUL', ++ 'LUEU' => 'LULU', ++ 'LULE' => 'LULU', ++ 'LULL' => 'LULL', ++ 'LULU' => 'LULU', ++ 'LUUE' => 'LUUL', ++ 'LUUL' => 'LUUL', ++ 'LUUU' => 'LUUU', ++ 'UEEE' => 'ULUL', ++ 'UEEL' => 'ULUL', ++ 'UEEU' => 'ULLU', ++ ++ 'ELUU' => 'LLUU', ++ 'EUEE' => 'LULU', ++ 'EUEL' => 'LUUL', ++ 'EUEU' => 'LULU', ++ 'EULE' => 'LULU', ++ 'EULL' => 'UULL', ++ 'EULU' => 'LULU', ++ 'EUUE' => 'LUUL', ++ 'EUUL' => 'LUUL', ++ 'EUUU' => 'LUUU', ++ 'LEEE' => 'LULU', ++ 'LEEL' => 'LUUL', ++ 'LEEU' => 'LULU', ++ 'LELE' => 'LULU', ++ 'LELL' => 'LULL', ++ 'LELU' => 'LULU', ++ 'LEUE' => 'LUUL', ++ 'LEUL' => 'LUUL', ++ 'LEUU' => 'LLUU', ++ 'LLEE' => 'LLUU', ++ 'LLEL' => 'LLUL', ++ 'LLEU' => 'LLUU', ++ 'LLLE' => 'LLLU', ++ ++ 'UELE' => 'ULLU', ++ 'UELL' => 'UULL', ++ 'UELU' => 'ULLU', ++ 'UEUE' => 'ULUL', ++ 'UEUL' => 'ULUL', ++ 'UEUU' => 'ULUU', ++ 'ULEE' => 'ULUL', ++ 'ULEL' => 'ULUL', ++ 'ULEU' => 'ULLU', ++ 'ULLE' => 'ULLU', ++ 'ULLL' => 'ULLL', ++ 'ULLU' => 'ULLU', ++ 'ULUE' => 'ULUL', ++ 'ULUL' => 'ULUL', ++ 'ULUU' => 'ULUU', ++ 'UUEE' => 'UULL', ++ 'UUEL' => 'UULL', ++ 'UUEU' => 'UULU', ++ 'UULE' => 'UULL', ++ 'UULL' => 'UULL', ++ 'UULU' => 'UULU', ++ 'UUUE' => 'UUUL', ++ 'UUUL' => 'UUUL', ++ 'UUUU' => 'UUUU', ++ ); ++ ++# Check all combinations of U/L/E are present in %slottable. ++sub coverage { ++ foreach my $a ('U', 'L', 'E') { ++ foreach my $b ('U', 'L', 'E') { ++ foreach my $c ('U', 'L', 'E') { ++ foreach my $d ('U', 'L', 'E') { ++ my $x = $a . $b . $c . $d; ++ if (! defined $slottable{$x}) { ++ print "slottable missing: $x\n" ++ } ++ } ++ } ++ } ++ } ++} ++ ++# Certain consistency checks for %slottable. 
++sub check {
++ foreach my $x (keys %slottable) {
++ my $a = substr($x,0,1);
++ my $b = substr($x,1,1);
++ my $c = substr($x,2,1);
++ my $d = substr($x,3,1);
++ my $es = ($a eq 'E') + ($b eq 'E') + ($c eq 'E') + ($d eq 'E');
++ my $ls = ($a eq 'L') + ($b eq 'L') + ($c eq 'L') + ($d eq 'L');
++ my $us = ($a eq 'U') + ($b eq 'U') + ($c eq 'U') + ($d eq 'U');
++
++ my $got = $slottable{$x};
++ my $want = $x;
++
++ if ($es == 0) {
++
++ } elsif ($es == 1) {
++ # when only one E, it's mapped to whichever of U or L is otherwise
++ # used the least
++ if ($ls > $us) {
++ $want =~ s/E/U/;
++ } else {
++ $want =~ s/E/L/;
++ }
++ } elsif ($es == 2) {
++ # when two E's and two U, then the E's map to L; vice versa for two E
++ # and two L
++ if ($ls == 2) {
++ $want =~ s/E/U/g;
++ } elsif ($us == 2) {
++ $want =~ s/E/L/g;
++ } else {
++ next;
++ }
++ } elsif ($es == 3) {
++ next;
++
++ } else { # $es == 4
++ next;
++ }
++
++ if ($want ne $got) {
++ print "slottable $x want $want got $got\n";
++ }
++ }
++}
++
++sub disassemble {
++ my ($file) = @_;
++
++ open (IN, "objdump -Srfh $file |") || die "Cannot open pipe from objdump\n";
++
++ my (%pre, %post, %type);
++ while (<IN>) {
++ my $line = $_ . "";
++
++ if ($line =~ /(^[ \t]*[0-9a-f]*([0-9a-f]):[ \t]*[0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] )\t(([a-z0-9]+).*)/) {
++ my ($this_pre, $addr, $this_post, $opcode) = ($1, $2, $3, $4);
++
++ my $this_type = $optable{$opcode};
++ if (! defined ($this_type)) { $this_type = ' '; }
++
++ $pre{$addr} = $this_pre;
++ $post{$addr} = $this_post;
++ $type{$addr} = $this_type;
++
++ if ($addr eq 'c') {
++ my %slot = ('0'=>' ', '4'=>' ', '8'=>' ', 'c'=>' ');
++
++ my $str = $type{'c'} . $type{'8'} . $type{'4'} . $type{'0'};
++ $str = $slottable{$str};
++ if (defined $str) {
++ $slot{'c'} = substr($str,0,1);
++ $slot{'8'} = substr($str,1,1);
++ $slot{'4'} = substr($str,2,1);
++ $slot{'0'} = substr($str,3,1);
++ }
++
++ foreach my $i ('0', '4', '8', 'c') {
++ if ($slot{$i} eq $type{$i}) { $type{$i} = ' '; }
++ print $pre{$i}, ' ', lc($type{$i}),$slot{$i}, ' ', $post{$i}, "\n";
++ }
++
++ %pre = ();
++ %type = ();
++ %post = ();
++ }
++ }
++ }
++
++ close IN || die "Error from objdump (or objdump not available)\n";
++}
++
++coverage();
++check();
++
++my @files;
++if ($#ARGV >= 0) {
++ @files = @ARGV;
++} else {
++ die "Usage: $0 [filename.o]...\n";
++}
++
++foreach (@files) {
++ disassemble($_);
++}
+diff --git a/mpn/sw_64/sw6/sub_n.asm b/mpn/sw_64/sw6/sub_n.asm
+new file mode 100644
+index 0000000..64b7711
+--- /dev/null
++++ b/mpn/sw_64/sw6/sub_n.asm
+@@ -0,0 +1,281 @@
++dnl Sw_64 sw6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
++dnl and store difference in a third limb vector.
++
++dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl * the GNU Lesser General Public License as published by the Free
++dnl Software Foundation; either version 3 of the License, or (at your
++dnl option) any later version.
++dnl
++dnl or
++dnl
++dnl * the GNU General Public License as published by the Free Software
++dnl Foundation; either version 2 of the License, or (at your option) any
++dnl later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++C cycles/limb
++C SW6: 2.125
++
++C INPUT PARAMETERS
++C rp r16
++C up r17
++C vp r18
++C n r19
++C cy r20 (for mpn_sub_nc)
++
++C TODO
++C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
++C Use multi-pronged feed-in.
++C Perform additional micro-tuning
++
++C This code was written in cooperation with sw6 pipeline expert Steve Root.
++
++C Pair loads and stores where possible
++C Store pairs oct-aligned where possible (didn't need it here)
++C Stores are delayed every third cycle
++C Loads and stores are delayed by fills
++C U stays still, put code there where possible (note alternation of U1 and U0)
++C L moves because of loads and stores
++C Note dampers in L to limit damage
++
++C This odd-looking optimization expects that we're working with random bits in
++C our data, so that a pure zero result is unlikely. So we penalize the unlikely
++C case to help the common case.
++
++define(`u0', `r0') define(`u1', `r3')
++define(`v0', `r1') define(`v1', `r4')
++
++define(`cy0', `r20') define(`cy1', `r21')
++
++MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
++
++ASM_START()
++PROLOGUE(mpn_sub_nc)
++ br r31, $entry
++EPILOGUE()
++PROLOGUE(mpn_sub_n)
++ bis r31, r31, cy0 C clear carry in
++$entry: cmpult r19, 5, r22 C L1 move counter
++ ldl u1, 0(r17) C L0 get next ones
++ ldl v1, 0(r18) C L1
++ bne r22, $Lsmall
++
++ ldl u0, 8(r17) C L0 get next ones
++ ldl v0, 8(r18) C L1
++ subl u1, v1, r5 C U0 sub two data
++
++ cmpult u1, v1, r23 C U0 did it borrow
++ ldl u1, 16(r17) C L0 get next ones
++ ldl v1, 16(r18) C L1
++
++ subl u0, v0, r8 C U1 sub two data
++ subl r5, cy0, r24 C U0 borrow in
++
++ cmpult u0, v0, r22 C U1 did it borrow
++ beq r5, $fix5f C U0 fix exact zero
++$ret5f: ldl u0, 24(r17) C L0 get next ones
++ ldl v0, 24(r18) C L1
++
++ subl r8, r23, r25 C U1 borrow from last
++ subl u1, v1, r7 C U0 sub two data
++
++ beq r8, $fix6f C U1 fix exact zero
++$ret6f: cmpult u1, v1, r23 C U0 did it borrow
++ ldl u1, 32(r17) C L0 get next ones
++ ldl v1, 32(r18) C L1
++
++ ldi r17, 40(r17) C L0 move pointer
++ ldi r18, 40(r18) C L1 move pointer
++
++ ldi r16, -8(r16)
++ ldi r19, -13(r19) C L1 move counter
++ blt r19, $Lend C U1 loop control
++
++
++C Main loop. 8-way unrolled.
++ ALIGN(16)
++$Loop: subl u0, v0, r2 C U1 sub two data
++ stl r24, 8(r16) C L0 put an answer
++ subl r7, r22, r24 C U0 borrow from last
++ stl r25, 16(r16) C L1 pair
++
++ cmpult u0, v0, cy1 C U1 did it borrow
++ beq r7, $fix7 C U0 fix exact 0
++$ret7: ldl u0, 0(r17) C L0 get next ones
++ ldl v0, 0(r18) C L1
++
++ bis r31, r31, r31 C L damp out
++ subl r2, r23, r25 C U1 borrow from last
++ bis r31, r31, r31 C L moves in L !
++ subl u1, v1, r5 C U0 sub two data ++ ++ beq r2, $fix0 C U1 fix exact zero ++$ret0: cmpult u1, v1, cy0 C U0 did it borrow ++ ldl u1, 8(r17) C L0 get next ones ++ ldl v1, 8(r18) C L1 ++ ++ subl u0, v0, r8 C U1 sub two data ++ stl r24, 24(r16) C L0 store pair ++ subl r5, cy1, r24 C U0 borrow from last ++ stl r25, 32(r16) C L1 ++ ++ cmpult u0, v0, r22 C U1 did it borrow ++ beq r5, $fix1 C U0 fix exact zero ++$ret1: ldl u0, 16(r17) C L0 get next ones ++ ldl v0, 16(r18) C L1 ++ ++ ldi r16, 64(r16) C L0 move pointer ++ subl r8, cy0, r25 C U1 borrow from last ++ ldi r19, -8(r19) C L1 move counter ++ subl u1, v1, r7 C U0 sub two data ++ ++ beq r8, $fix2 C U1 fix exact zero ++$ret2: cmpult u1, v1, r23 C U0 did it borrow ++ ldl u1, 24(r17) C L0 get next ones ++ ldl v1, 24(r18) C L1 ++ ++ subl u0, v0, r2 C U1 sub two data ++ stl r24, -24(r16) C L0 put an answer ++ subl r7, r22, r24 C U0 borrow from last ++ stl r25, -16(r16) C L1 pair ++ ++ cmpult u0, v0, cy1 C U1 did it borrow ++ beq r7, $fix3 C U0 fix exact 0 ++$ret3: ldl u0, 32(r17) C L0 get next ones ++ ldl v0, 32(r18) C L1 ++ ++ bis r31, r31, r31 C L damp out ++ subl r2, r23, r25 C U1 borrow from last ++ bis r31, r31, r31 C L moves in L ! ++ subl u1, v1, r5 C U0 sub two data ++ ++ beq r2, $fix4 C U1 fix exact zero ++$ret4: cmpult u1, v1, cy0 C U0 did it borrow ++ ldl u1, 40(r17) C L0 get next ones ++ ldl v1, 40(r18) C L1 ++ ++ subl u0, v0, r8 C U1 sub two data ++ stl r24, -8(r16) C L0 store pair ++ subl r5, cy1, r24 C U0 borrow from last ++ stl r25, 0(r16) C L1 ++ ++ cmpult u0, v0, r22 C U1 did it borrow ++ beq r5, $fix5 C U0 fix exact zero ++$ret5: ldl u0, 48(r17) C L0 get next ones ++ ldl v0, 48(r18) C L1 ++ ++ ldl r31, 256(r17) C L0 prefetch ++ subl r8, cy0, r25 C U1 borrow from last ++ ldl r31, 256(r18) C L1 prefetch ++ subl u1, v1, r7 C U0 sub two data ++ ++ beq r8, $fix6 C U1 fix exact zero ++$ret6: cmpult u1, v1, r23 C U0 did it borrow ++ ldl u1, 56(r17) C L0 get next ones ++ ldl v1, 56(r18) C L1 ++ ++ ldi r17, 64(r17) C L0 move pointer ++ bis r31, r31, r31 C U ++ ldi r18, 64(r18) C L1 move pointer ++ bge r19, $Loop C U1 loop control ++C ==== main loop end ++ ++$Lend: subl u0, v0, r2 C U1 sub two data ++ stl r24, 8(r16) C L0 put an answer ++ subl r7, r22, r24 C U0 borrow from last ++ stl r25, 16(r16) C L1 pair ++ cmpult u0, v0, cy1 C U1 did it borrow ++ beq r7, $fix7c C U0 fix exact 0 ++$ret7c: subl r2, r23, r25 C U1 borrow from last ++ subl u1, v1, r5 C U0 sub two data ++ beq r2, $fix0c C U1 fix exact zero ++$ret0c: cmpult u1, v1, cy0 C U0 did it borrow ++ stl r24, 24(r16) C L0 store pair ++ subl r5, cy1, r24 C U0 borrow from last ++ stl r25, 32(r16) C L1 ++ beq r5, $fix1c C U0 fix exact zero ++$ret1c: stl r24, 40(r16) C L0 put an answer ++ ldi r16, 48(r16) C L0 move pointer ++ ++ ldi r19, 8(r19) ++ beq r19, $Lret ++ ++ ldl u1, 0(r17) ++ ldl v1, 0(r18) ++$Lsmall: ++ ldi r19, -1(r19) ++ beq r19, $Lend0 ++ ++ ALIGN(8) ++$Loop0: subl u1, v1, r2 C main sub ++ cmpult u1, v1, r8 C compute bw from last sub ++ ldl u1, 8(r17) ++ ldl v1, 8(r18) ++ subl r2, cy0, r5 C borrow sub ++ ldi r17, 8(r17) ++ ldi r18, 8(r18) ++ stl r5, 0(r16) ++ cmpult r2, cy0, cy0 C compute bw from last sub ++ ldi r19, -1(r19) C decr loop cnt ++ bis r8, cy0, cy0 C combine bw from the two subs ++ ldi r16, 8(r16) ++ bne r19, $Loop0 ++$Lend0: subl u1, v1, r2 C main sub ++ subl r2, cy0, r5 C borrow sub ++ cmpult u1, v1, r8 C compute bw from last sub ++ cmpult r2, cy0, cy0 C compute bw from last sub ++ stl r5, 0(r16) ++ bis r8, cy0, r0 C combine bw from the two subs ++ ret 
r31,(r26),1 ++ ++ ALIGN(8) ++$Lret: ldi r0, 0(cy0) C copy borrow into return register ++ ret r31,(r26),1 ++ ++$fix5f: bis r23, cy0, r23 C bring forward borrow ++ br r31, $ret5f ++$fix6f: bis r22, r23, r22 C bring forward borrow ++ br r31, $ret6f ++$fix0: bis cy1, r23, cy1 C bring forward borrow ++ br r31, $ret0 ++$fix1: bis cy0, cy1, cy0 C bring forward borrow ++ br r31, $ret1 ++$fix2: bis r22, cy0, r22 C bring forward borrow ++ br r31, $ret2 ++$fix3: bis r23, r22, r23 C bring forward borrow ++ br r31, $ret3 ++$fix4: bis cy1, r23, cy1 C bring forward borrow ++ br r31, $ret4 ++$fix5: bis cy1, cy0, cy0 C bring forward borrow ++ br r31, $ret5 ++$fix6: bis r22, cy0, r22 C bring forward borrow ++ br r31, $ret6 ++$fix7: bis r23, r22, r23 C bring forward borrow ++ br r31, $ret7 ++$fix0c: bis cy1, r23, cy1 C bring forward borrow ++ br r31, $ret0c ++$fix1c: bis cy0, cy1, cy0 C bring forward borrow ++ br r31, $ret1c ++$fix7c: bis r23, r22, r23 C bring forward borrow ++ br r31, $ret7c ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6a/gcd_1.asm b/mpn/sw_64/sw6a/gcd_1.asm +new file mode 100644 +index 0000000..ce75dc5 +--- /dev/null ++++ b/mpn/sw_64/sw6a/gcd_1.asm +@@ -0,0 +1,145 @@ ++dnl Sw_64 sw6a mpn_gcd_1 -- Nx1 greatest common divisor. ++ ++dnl Copyright 2003, 2004 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C sw6a: 3.4 cycles/bitpair for 1x1 part ++ ++ ++C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y); ++C ++C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and ++C strip trailing zeros from abs(x-y) to maintain x and y both odd. ++C ++C The trailing zeros are calculated from just x-y, since in twos-complement ++C there's the same number of trailing zeros on d or -d. This means the cttz ++C runs in parallel with abs(x-y). ++C ++C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit ++C operands with this algorithm gives the measured 3.4 c/l. ++C ++C The slottings shown are for SVR4 style systems, Unicos differs in the ++C initial gp setup and the LEA. ++C ++C Enhancement: ++C ++C On the call, !lituse_call! (when available) would allow the linker to relax ++C it to a bsr, but probably only in a static binary. Plain "call foo" gives ++C the right object code for relaxation, and ought to be available ++C everywhere, but we prefer to schedule the GOT ldl (LEA) back earlier, for ++C the usual case of running in a shared library. 
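The 1x1 reduction that the comments above describe is easier to see in C. Below is a sketch under the assumption that x and y have already been made odd (the common power of two is restored by the caller, as in the asm's final sll); the name is hypothetical and __builtin_ctzll stands in for cttz:

    #include <gmp.h>

    mp_limb_t
    gcd_1x1_model (mp_limb_t x, mp_limb_t y)   /* x, y odd and nonzero */
    {
      while (x != y)
        {
          mp_limb_t d = x - y;             /* wraps to -(y-x) when x < y    */
          int twos = __builtin_ctzll (d);  /* d and -d: same trailing zeros */
          if (x < y)
            {
              d = y - x;                   /* d = abs(x - y) */
              y = x;                       /* y = min(x, y)  */
            }
          x = d >> twos;                   /* strip twos; x stays odd */
        }
      return y;
    }

Since the trailing-zero count only needs x - y, the cttz can issue alongside the subtract/compare pair, which is what holds the loop to its 5 cycles.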
++C
++C bsr could perhaps be used explicitly anyway. We should be able to assume
++C modexact is in the same module as us (ie. shared library or mainline).
++C Would there be any worries about the size of the displacement? Could
++C always put modexact and gcd_1 in the same .o to be certain.
++
++ASM_START()
++PROLOGUE(mpn_gcd_1, gp)
++
++ C r16 xp
++ C r17 size
++ C r18 y
++
++ C ldih C l
++ C ldi C u
++
++ ldl r0, 0(r16) C L x = xp[0]
++ ldi r30, -32(r30) C u alloc stack
++
++ LEA( r27, mpn_modexact_1c_odd) C L modexact addr, ldl (gp)
++ stl r10, 16(r30) C L save r10
++ cttz r18, r10 C U0 y twos
++ cmpeq r17, 1, r5 C u test size==1
++
++ stl r9, 8(r30) C L save r9
++ clr r19 C u zero c for modexact
++ unop
++ unop
++
++ cttz r0, r6 C U0 x twos
++ stl r26, 0(r30) C L save ra
++
++ srl r18, r10, r18 C U y odd
++
++ mov r18, r9 C l hold y across call
++
++ cmpult r6, r10, r2 C u test x_twos < y_twos
++
++ selne r2, r6, r10 C l common_twos = min(x_twos,y_twos)
++ bne r5, L(one) C U no modexact if size==1
++ call r26, (r27), mpn_modexact_1c_odd C L0
++
++ LDGP( r29, 0(r26)) C u,l ldih,ldi
++ cttz r0, r6 C U0 new x twos
++ ldl r26, 0(r30) C L restore ra
++
++L(one):
++ mov r9, r1 C u y
++ ldl r9, 8(r30) C L restore r9
++ mov r10, r2 C u common twos
++ ldl r10, 16(r30) C L restore r10
++
++ ldi r30, 32(r30) C l free stack
++ beq r0, L(done) C U return y if x%y==0
++
++ srl r0, r6, r0 C U x odd
++ unop
++
++ ALIGN(16)
++L(top):
++ C r0 x
++ C r1 y
++ C r2 common twos, for use at end
++
++ subl r0, r1, r7 C l0 d = x - y
++ cmpult r0, r1, r16 C u0 test x >= y
++
++ subl r1, r0, r4 C l0 new_x = y - x
++ cttz r7, r8 C U0 d twos
++
++ seleq r16, r7, r4 C l0 new_x = d if x>=y
++ selne r16, r0, r1 C u0 y = x if x<y
++
++ srl r4, r8, r0 C U0 x = new_x >> twos
++ bne r7, L(top) C U1 stop when d==0
++
++
++L(done):
++ sll r1, r2, r0 C U0 return y << common_twos
++ ret r31, (r26), 1 C L0
++
++EPILOGUE()
++ASM_END()
+diff --git a/mpn/sw_64/sw6a/hamdist.asm b/mpn/sw_64/sw6a/hamdist.asm
+new file mode 100644
+index 0000000..d005eb0
+--- /dev/null
++++ b/mpn/sw_64/sw6a/hamdist.asm
+@@ -0,0 +1,111 @@
++dnl Sw_64 sw6a mpn_hamdist -- mpn hamming distance.
++
++dnl Copyright 2003, 2005 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl * the GNU Lesser General Public License as published by the Free
++dnl Software Foundation; either version 3 of the License, or (at your
++dnl option) any later version.
++dnl
++dnl or
++dnl
++dnl * the GNU General Public License as published by the Free Software
++dnl Foundation; either version 2 of the License, or (at your option) any
++dnl later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++
++C sw6a: 2.5 cycles/limb
++
++
++C unsigned long mpn_hamdist (mp_srcptr xp, mp_srcptr yp, mp_size_t size);
++C
++C The hope was for 2.0 c/l here, but that isn't achieved. We're limited by
++C renaming register shortage.
Since we need 5 instructions per limb, further ++C unrolling could approach 1.5 c/l. ++C ++C The main loop processes two limbs from each operand on each iteration. An ++C odd size is handled by processing xp[0]^yp[0] at the start. If the size ++C is even that result is discarded, and is repeated by the main loop. ++C ++ ++ASM_START() ++PROLOGUE(mpn_hamdist) ++ ++ C r16 xp ++ C r17 yp ++ C r18 size ++ ++ ldl r1, 0(r16) C L0 xp[0] ++ ldl r2, 0(r17) C L1 yp[0] ++ and r18, 1, r8 C U1 1 if size odd ++ srl r18, 1, r18 C U0 size, limb pairs ++ ++ clr r0 C L0 initial total ++ s8addl r8, r17, r17 C U1 yp++ if size odd ++ s8addl r8, r16, r16 C L1 xp++ if size odd ++ clr r6 C U0 dummy initial xor 1 ++ ++ xor r1, r2, r5 C L initial xor 0 ++ beq r18, L(one) C U if size==1 ++ ++ seleq r8, r31, r5 C L discard first limb if size even ++ unop C U ++ ++ ++ ALIGN(16) ++L(top): ++ C r0 total accumulating ++ C r7 xor 0 ++ C r8 xor 1 ++ C r16 xp, incrementing ++ C r17 yp, incrementing ++ C r18 size, limb pairs, decrementing ++ ++ ldl r1, 0(r16) C L ++ ldl r2, 0(r17) C L ++ ctpop r5, r7 C U0 ++ ldi r16, 16(r16) C U ++ ++ ldl r3, -8(r16) C L ++ ldl r4, 8(r17) C L ++ ctpop r6, r8 C U0 ++ ldi r17, 16(r17) C U ++ ++ ldl r31, 256(r16) C L prefetch ++ ldl r31, 256(r17) C L prefetch ++ xor r1, r2, r5 C U ++ ldi r18, -1(r18) C U ++ ++ xor r3, r4, r6 C U ++ addl r0, r7, r0 C L ++ addl r0, r8, r0 C L ++ bne r18, L(top) C U ++ ++ ++ ctpop r6, r8 C U0 ++ addl r0, r8, r0 C L ++L(one): ++ ctpop r5, r7 C U0 ++ addl r0, r7, r0 C L ++ ++ ret r31, (r26), 1 C L0 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6a/popcount.asm b/mpn/sw_64/sw6a/popcount.asm +new file mode 100644 +index 0000000..388d062 +--- /dev/null ++++ b/mpn/sw_64/sw6a/popcount.asm +@@ -0,0 +1,101 @@ ++dnl Sw_64 sw6a mpn_popcount -- mpn bit population count. ++ ++dnl Copyright 2003, 2005 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C sw6a: 1.5 cycles/limb ++ ++ ++C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); ++C ++C This schedule seems necessary for the full 1.5 c/l, the IQ can't quite hide ++C all latencies, the addl's must be deferred to the next iteration. ++C ++C Since we need just 3 instructions per limb, further unrolling could approach ++C 1.0 c/l. ++C ++C The main loop processes two limbs at a time. An odd size is handled by ++C processing src[0] at the start. If the size is even that result is ++C discarded, and src[0] is repeated by the main loop. 
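A short C model of that structure, with __builtin_popcountll standing in for ctpop (hypothetical name; 64-bit limbs assumed):

    #include <gmp.h>

    unsigned long
    popcount_model (const mp_limb_t *src, mp_size_t size)
    {
      unsigned long total = 0;
      if (size & 1)                        /* odd size: fold in src[0] first */
        total = __builtin_popcountll (*src++);
      for (mp_size_t i = 0; i < size >> 1; i++)  /* two limbs per iteration */
        total += __builtin_popcountll (src[2 * i])
               + __builtin_popcountll (src[2 * i + 1]);
      return total;
    }

The assembly gets the same effect without a branch by always counting src[0] and discarding the result when the size is even.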
++C ++ ++ASM_START() ++PROLOGUE(mpn_popcount) ++ ++ C r16 src ++ C r17 size ++ ++ ldl r0, 0(r16) C L0 src[0] ++ and r17, 1, r8 C U1 1 if size odd ++ srl r17, 1, r17 C U0 size, limb pairs ++ ++ s8addl r8, r16, r16 C L1 src++ if size odd ++ ctpop r0, r0 C U0 ++ beq r17, L(one) C U1 if size==1 ++ ++ seleq r8, r31, r0 C L discard first limb if size even ++ clr r3 C L ++ ++ clr r4 C L ++ unop C U ++ unop C L ++ unop C U ++ ++ ++ ALIGN(16) ++L(top): ++ C r0 total accumulating ++ C r3 pop 0 ++ C r4 pop 1 ++ C r16 src, incrementing ++ C r17 size, decrementing ++ ++ ldl r1, 0(r16) C L ++ ldl r2, 8(r16) C L ++ ldi r16, 16(r16) C U ++ ldi r17, -1(r17) C U ++ ++ addl r0, r3, r0 C L ++ addl r0, r4, r0 C L ++ ctpop r1, r3 C U0 ++ ctpop r2, r4 C U0 ++ ++ ldl r31, 512(r16) C L prefetch ++ bne r17, L(top) C U ++ ++ ++ addl r0, r3, r0 C L ++ addl r0, r4, r0 C U ++L(one): ++ ret r31, (r26), 1 C L0 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6b/gcd_1.asm b/mpn/sw_64/sw6b/gcd_1.asm +new file mode 100644 +index 0000000..984ac92 +--- /dev/null ++++ b/mpn/sw_64/sw6b/gcd_1.asm +@@ -0,0 +1,145 @@ ++dnl Sw_64 sw6b mpn_gcd_1 -- Nx1 greatest common divisor. ++ ++dnl Copyright 2003, 2004 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C sw6b: 3.4 cycles/bitpair for 1x1 part ++ ++ ++C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y); ++C ++C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and ++C strip trailing zeros from abs(x-y) to maintain x and y both odd. ++C ++C The trailing zeros are calculated from just x-y, since in twos-complement ++C there's the same number of trailing zeros on d or -d. This means the cttz ++C runs in parallel with abs(x-y). ++C ++C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit ++C operands with this algorithm gives the measured 3.4 c/l. ++C ++C The slottings shown are for SVR4 style systems, Unicos differs in the ++C initial gp setup and the LEA. ++C ++C Enhancement: ++C ++C On the call, !lituse_call! (when available) would allow the linker to relax ++C it to a bsr, but probably only in a static binary. Plain "call foo" gives ++C the right object code for relaxation, and ought to be available ++C everywhere, but we prefer to schedule the GOT ldl (LEA) back earlier, for ++C the usual case of running in a shared library. ++C ++C bsr could perhaps be used explicitly anyway. We should be able to assume ++C modexact is in the same module as us (ie. 
shared library or mainline).
++C Would there be any worries about the size of the displacement? Could
++C always put modexact and gcd_1 in the same .o to be certain.
++
++ASM_START()
++PROLOGUE(mpn_gcd_1, gp)
++
++ C r16 xp
++ C r17 size
++ C r18 y
++
++ C ldih C l
++ C ldi C u
++
++ ldl r0, 0(r16) C L x = xp[0]
++ ldi r30, -32(r30) C u alloc stack
++
++ LEA( r27, mpn_modexact_1c_odd) C L modexact addr, ldl (gp)
++ stl r10, 16(r30) C L save r10
++ cttz r18, r10 C U0 y twos
++ cmpeq r17, 1, r5 C u test size==1
++
++ stl r9, 8(r30) C L save r9
++ clr r19 C u zero c for modexact
++ unop
++ unop
++
++ cttz r0, r6 C U0 x twos
++ stl r26, 0(r30) C L save ra
++
++ srl r18, r10, r18 C U y odd
++
++ mov r18, r9 C l hold y across call
++
++ cmpult r6, r10, r2 C u test x_twos < y_twos
++
++ selne r2, r6, r10 C l common_twos = min(x_twos,y_twos)
++ bne r5, L(one) C U no modexact if size==1
++ call r26, (r27), mpn_modexact_1c_odd C L0
++
++ LDGP( r29, 0(r26)) C u,l ldih,ldi
++ cttz r0, r6 C U0 new x twos
++ ldl r26, 0(r30) C L restore ra
++
++L(one):
++ mov r9, r1 C u y
++ ldl r9, 8(r30) C L restore r9
++ mov r10, r2 C u common twos
++ ldl r10, 16(r30) C L restore r10
++
++ ldi r30, 32(r30) C l free stack
++ beq r0, L(done) C U return y if x%y==0
++
++ srl r0, r6, r0 C U x odd
++ unop
++
++ ALIGN(16)
++L(top):
++ C r0 x
++ C r1 y
++ C r2 common twos, for use at end
++
++ subl r0, r1, r7 C l0 d = x - y
++ cmpult r0, r1, r16 C u0 test x >= y
++
++ subl r1, r0, r4 C l0 new_x = y - x
++ cttz r7, r8 C U0 d twos
++
++ seleq r16, r7, r4 C l0 new_x = d if x>=y
++ selne r16, r0, r1 C u0 y = x if x<y
++
++ srl r4, r8, r0 C U0 x = new_x >> twos
++ bne r7, L(top) C U1 stop when d==0
++
++
++L(done):
++ sll r1, r2, r0 C U0 return y << common_twos
++ ret r31, (r26), 1 C L0
++
++EPILOGUE()
++ASM_END()
+diff --git a/mpn/sw_64/sw6b/hamdist.asm b/mpn/sw_64/sw6b/hamdist.asm
+new file mode 100644
+index 0000000..f42ee54
+--- /dev/null
++++ b/mpn/sw_64/sw6b/hamdist.asm
+@@ -0,0 +1,111 @@
++dnl Sw_64 sw6b mpn_hamdist -- mpn hamming distance.
++
++dnl Copyright 2003, 2005 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl * the GNU Lesser General Public License as published by the Free
++dnl Software Foundation; either version 3 of the License, or (at your
++dnl option) any later version.
++dnl
++dnl or
++dnl
++dnl * the GNU General Public License as published by the Free Software
++dnl Foundation; either version 2 of the License, or (at your option) any
++dnl later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++
++C sw6b: 2.5 cycles/limb
++
++
++C unsigned long mpn_hamdist (mp_srcptr xp, mp_srcptr yp, mp_size_t size);
++C
++C The hope was for 2.0 c/l here, but that isn't achieved. We're limited by
++C renaming register shortage. Since we need 5 instructions per limb, further
++C unrolling could approach 1.5 c/l.
++C
++C The main loop processes two limbs from each operand on each iteration.
An ++C odd size is handled by processing xp[0]^yp[0] at the start. If the size ++C is even that result is discarded, and is repeated by the main loop. ++C ++ ++ASM_START() ++PROLOGUE(mpn_hamdist) ++ ++ C r16 xp ++ C r17 yp ++ C r18 size ++ ++ ldl r1, 0(r16) C L0 xp[0] ++ ldl r2, 0(r17) C L1 yp[0] ++ and r18, 1, r8 C U1 1 if size odd ++ srl r18, 1, r18 C U0 size, limb pairs ++ ++ clr r0 C L0 initial total ++ s8addl r8, r17, r17 C U1 yp++ if size odd ++ s8addl r8, r16, r16 C L1 xp++ if size odd ++ clr r6 C U0 dummy initial xor 1 ++ ++ xor r1, r2, r5 C L initial xor 0 ++ beq r18, L(one) C U if size==1 ++ ++ seleq r8, r31, r5 C L discard first limb if size even ++ unop C U ++ ++ ++ ALIGN(16) ++L(top): ++ C r0 total accumulating ++ C r7 xor 0 ++ C r8 xor 1 ++ C r16 xp, incrementing ++ C r17 yp, incrementing ++ C r18 size, limb pairs, decrementing ++ ++ ldl r1, 0(r16) C L ++ ldl r2, 0(r17) C L ++ ctpop r5, r7 C U0 ++ ldi r16, 16(r16) C U ++ ++ ldl r3, -8(r16) C L ++ ldl r4, 8(r17) C L ++ ctpop r6, r8 C U0 ++ ldi r17, 16(r17) C U ++ ++ ldl r31, 256(r16) C L prefetch ++ ldl r31, 256(r17) C L prefetch ++ xor r1, r2, r5 C U ++ ldi r18, -1(r18) C U ++ ++ xor r3, r4, r6 C U ++ addl r0, r7, r0 C L ++ addl r0, r8, r0 C L ++ bne r18, L(top) C U ++ ++ ++ ctpop r6, r8 C U0 ++ addl r0, r8, r0 C L ++L(one): ++ ctpop r5, r7 C U0 ++ addl r0, r7, r0 C L ++ ++ ret r31, (r26), 1 C L0 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6b/popcount.asm b/mpn/sw_64/sw6b/popcount.asm +new file mode 100644 +index 0000000..b766557 +--- /dev/null ++++ b/mpn/sw_64/sw6b/popcount.asm +@@ -0,0 +1,101 @@ ++dnl Sw_64 sw6b mpn_popcount -- mpn bit population count. ++ ++dnl Copyright 2003, 2005 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C sw6b: 1.5 cycles/limb ++ ++ ++C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); ++C ++C This schedule seems necessary for the full 1.5 c/l, the IQ can't quite hide ++C all latencies, the addl's must be deferred to the next iteration. ++C ++C Since we need just 3 instructions per limb, further unrolling could approach ++C 1.0 c/l. ++C ++C The main loop processes two limbs at a time. An odd size is handled by ++C processing src[0] at the start. If the size is even that result is ++C discarded, and src[0] is repeated by the main loop. 
++C ++ ++ASM_START() ++PROLOGUE(mpn_popcount) ++ ++ C r16 src ++ C r17 size ++ ++ ldl r0, 0(r16) C L0 src[0] ++ and r17, 1, r8 C U1 1 if size odd ++ srl r17, 1, r17 C U0 size, limb pairs ++ ++ s8addl r8, r16, r16 C L1 src++ if size odd ++ ctpop r0, r0 C U0 ++ beq r17, L(one) C U1 if size==1 ++ ++ seleq r8, r31, r0 C L discard first limb if size even ++ clr r3 C L ++ ++ clr r4 C L ++ unop C U ++ unop C L ++ unop C U ++ ++ ++ ALIGN(16) ++L(top): ++ C r0 total accumulating ++ C r3 pop 0 ++ C r4 pop 1 ++ C r16 src, incrementing ++ C r17 size, decrementing ++ ++ ldl r1, 0(r16) C L ++ ldl r2, 8(r16) C L ++ ldi r16, 16(r16) C U ++ ldi r17, -1(r17) C U ++ ++ addl r0, r3, r0 C L ++ addl r0, r4, r0 C L ++ ctpop r1, r3 C U0 ++ ctpop r2, r4 C U0 ++ ++ ldl r31, 512(r16) C L prefetch ++ bne r17, L(top) C U ++ ++ ++ addl r0, r3, r0 C L ++ addl r0, r4, r0 C U ++L(one): ++ ret r31, (r26), 1 C L0 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw8a/gcd_1.asm b/mpn/sw_64/sw8a/gcd_1.asm +new file mode 100644 +index 0000000..73fc103 +--- /dev/null ++++ b/mpn/sw_64/sw8a/gcd_1.asm +@@ -0,0 +1,145 @@ ++dnl Sw_64 sw8a mpn_gcd_1 -- Nx1 greatest common divisor. ++ ++dnl Copyright 2003, 2004 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C sw8a: 3.4 cycles/bitpair for 1x1 part ++ ++ ++C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y); ++C ++C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and ++C strip trailing zeros from abs(x-y) to maintain x and y both odd. ++C ++C The trailing zeros are calculated from just x-y, since in twos-complement ++C there's the same number of trailing zeros on d or -d. This means the cttz ++C runs in parallel with abs(x-y). ++C ++C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit ++C operands with this algorithm gives the measured 3.4 c/l. ++C ++C The slottings shown are for SVR4 style systems, Unicos differs in the ++C initial gp setup and the LEA. ++C ++C Enhancement: ++C ++C On the call, !lituse_call! (when available) would allow the linker to relax ++C it to a bsr, but probably only in a static binary. Plain "call foo" gives ++C the right object code for relaxation, and ought to be available ++C everywhere, but we prefer to schedule the GOT ldl (LEA) back earlier, for ++C the usual case of running in a shared library. ++C ++C bsr could perhaps be used explicitly anyway. We should be able to assume ++C modexact is in the same module as us (ie. 
shared library or mainline).
++C Would there be any worries about the size of the displacement? Could
++C always put modexact and gcd_1 in the same .o to be certain.
++
++ASM_START()
++PROLOGUE(mpn_gcd_1, gp)
++
++ C r16 xp
++ C r17 size
++ C r18 y
++
++ C ldih C l
++ C ldi C u
++
++ ldl r0, 0(r16) C L x = xp[0]
++ ldi r30, -32(r30) C u alloc stack
++
++ LEA( r27, mpn_modexact_1c_odd) C L modexact addr, ldl (gp)
++ stl r10, 16(r30) C L save r10
++ cttz r18, r10 C U0 y twos
++ cmpeq r17, 1, r5 C u test size==1
++
++ stl r9, 8(r30) C L save r9
++ clr r19 C u zero c for modexact
++ unop
++ unop
++
++ cttz r0, r6 C U0 x twos
++ stl r26, 0(r30) C L save ra
++
++ srl r18, r10, r18 C U y odd
++
++ mov r18, r9 C l hold y across call
++
++ cmpult r6, r10, r2 C u test x_twos < y_twos
++
++ selne r2, r6, r10 C l common_twos = min(x_twos,y_twos)
++ bne r5, L(one) C U no modexact if size==1
++ call r26, (r27), mpn_modexact_1c_odd C L0
++
++ LDGP( r29, 0(r26)) C u,l ldih,ldi
++ cttz r0, r6 C U0 new x twos
++ ldl r26, 0(r30) C L restore ra
++
++L(one):
++ mov r9, r1 C u y
++ ldl r9, 8(r30) C L restore r9
++ mov r10, r2 C u common twos
++ ldl r10, 16(r30) C L restore r10
++
++ ldi r30, 32(r30) C l free stack
++ beq r0, L(done) C U return y if x%y==0
++
++ srl r0, r6, r0 C U x odd
++ unop
++
++ ALIGN(16)
++L(top):
++ C r0 x
++ C r1 y
++ C r2 common twos, for use at end
++
++ subl r0, r1, r7 C l0 d = x - y
++ cmpult r0, r1, r16 C u0 test x >= y
++
++ subl r1, r0, r4 C l0 new_x = y - x
++ cttz r7, r8 C U0 d twos
++
++ seleq r16, r7, r4 C l0 new_x = d if x>=y
++ selne r16, r0, r1 C u0 y = x if x<y
++
++ srl r4, r8, r0 C u0 x = new_x >> twos
++ bne r7, L(top) C U1 stop when d==0
++
++
++L(done):
++ sll r1, r2, r0 C U0 return y << common_twos
++ ret r31, (r26), 1 C L0
++
++EPILOGUE()
++ASM_END()
+diff --git a/mpn/sw_64/sw8a/hamdist.asm b/mpn/sw_64/sw8a/hamdist.asm
+new file mode 100644
+index 0000000..2d3957f
+--- /dev/null
++++ b/mpn/sw_64/sw8a/hamdist.asm
+@@ -0,0 +1,111 @@
++dnl Sw_64 sw8a mpn_hamdist -- mpn hamming distance.
++
++dnl Copyright 2003, 2005 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl * the GNU Lesser General Public License as published by the Free
++dnl Software Foundation; either version 3 of the License, or (at your
++dnl option) any later version.
++dnl
++dnl or
++dnl
++dnl * the GNU General Public License as published by the Free Software
++dnl Foundation; either version 2 of the License, or (at your option) any
++dnl later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++
++C sw8a: 2.5 cycles/limb
++
++
++C unsigned long mpn_hamdist (mp_srcptr xp, mp_srcptr yp, mp_size_t size);
++C
++C The hope was for 2.0 c/l here, but that isn't achieved. We're limited by
++C renaming register shortage. Since we need 5 instructions per limb, further
++C unrolling could approach 1.5 c/l.
++C
++C The main loop processes two limbs from each operand on each iteration.
An ++C odd size is handled by processing xp[0]^yp[0] at the start. If the size ++C is even that result is discarded, and is repeated by the main loop. ++C ++ ++ASM_START() ++PROLOGUE(mpn_hamdist) ++ ++ C r16 xp ++ C r17 yp ++ C r18 size ++ ++ ldl r1, 0(r16) C L0 xp[0] ++ ldl r2, 0(r17) C L1 yp[0] ++ and r18, 1, r8 C U1 1 if size odd ++ srl r18, 1, r18 C U0 size, limb pairs ++ ++ clr r0 C L0 initial total ++ s8addl r8, r17, r17 C U1 yp++ if size odd ++ s8addl r8, r16, r16 C L1 xp++ if size odd ++ clr r6 C U0 dummy initial xor 1 ++ ++ xor r1, r2, r5 C L initial xor 0 ++ beq r18, L(one) C U if size==1 ++ ++ seleq r8, r31, r5 C L discard first limb if size even ++ unop C U ++ ++ ++ ALIGN(16) ++L(top): ++ C r0 total accumulating ++ C r7 xor 0 ++ C r8 xor 1 ++ C r16 xp, incrementing ++ C r17 yp, incrementing ++ C r18 size, limb pairs, decrementing ++ ++ ldl r1, 0(r16) C L ++ ldl r2, 0(r17) C L ++ ctpop r5, r7 C U0 ++ ldi r16, 16(r16) C U ++ ++ ldl r3, -8(r16) C L ++ ldl r4, 8(r17) C L ++ ctpop r6, r8 C U0 ++ ldi r17, 16(r17) C U ++ ++ ldl r31, 256(r16) C L prefetch ++ ldl r31, 256(r17) C L prefetch ++ xor r1, r2, r5 C U ++ ldi r18, -1(r18) C U ++ ++ xor r3, r4, r6 C U ++ addl r0, r7, r0 C L ++ addl r0, r8, r0 C L ++ bne r18, L(top) C U ++ ++ ++ ctpop r6, r8 C U0 ++ addl r0, r8, r0 C L ++L(one): ++ ctpop r5, r7 C U0 ++ addl r0, r7, r0 C L ++ ++ ret r31, (r26), 1 C L0 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw8a/popcount.asm b/mpn/sw_64/sw8a/popcount.asm +new file mode 100644 +index 0000000..dc793f4 +--- /dev/null ++++ b/mpn/sw_64/sw8a/popcount.asm +@@ -0,0 +1,101 @@ ++dnl Sw_64 sw8a mpn_popcount -- mpn bit population count. ++ ++dnl Copyright 2003, 2005 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C sw8a: 1.5 cycles/limb ++ ++ ++C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); ++C ++C This schedule seems necessary for the full 1.5 c/l, the IQ can't quite hide ++C all latencies, the addl's must be deferred to the next iteration. ++C ++C Since we need just 3 instructions per limb, further unrolling could approach ++C 1.0 c/l. ++C ++C The main loop processes two limbs at a time. An odd size is handled by ++C processing src[0] at the start. If the size is even that result is ++C discarded, and src[0] is repeated by the main loop. 
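mpn_hamdist above (both the sw6b and sw8a copies) is the same unrolled pattern with an xor feeding each ctpop. Functionally, in C (an editorial sketch with an invented name):

    #include <stddef.h>
    #include <stdint.h>

    unsigned long hamdist_ref (const uint64_t *xp, const uint64_t *yp,
                               size_t size)
    {
      unsigned long total = 0;
      for (size_t i = 0; i < size; i++)
        total += __builtin_popcountll (xp[i] ^ yp[i]); /* xor + ctpop */
      return total;
    }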
++C ++ ++ASM_START() ++PROLOGUE(mpn_popcount) ++ ++ C r16 src ++ C r17 size ++ ++ ldl r0, 0(r16) C L0 src[0] ++ and r17, 1, r8 C U1 1 if size odd ++ srl r17, 1, r17 C U0 size, limb pairs ++ ++ s8addl r8, r16, r16 C L1 src++ if size odd ++ ctpop r0, r0 C U0 ++ beq r17, L(one) C U1 if size==1 ++ ++ seleq r8, r31, r0 C L discard first limb if size even ++ clr r3 C L ++ ++ clr r4 C L ++ unop C U ++ unop C L ++ unop C U ++ ++ ++ ALIGN(16) ++L(top): ++ C r0 total accumulating ++ C r3 pop 0 ++ C r4 pop 1 ++ C r16 src, incrementing ++ C r17 size, decrementing ++ ++ ldl r1, 0(r16) C L ++ ldl r2, 8(r16) C L ++ ldi r16, 16(r16) C U ++ ldi r17, -1(r17) C U ++ ++ addl r0, r3, r0 C L ++ addl r0, r4, r0 C L ++ ctpop r1, r3 C U0 ++ ctpop r2, r4 C U0 ++ ++ ldl r31, 512(r16) C L prefetch ++ bne r17, L(top) C U ++ ++ ++ addl r0, r3, r0 C L ++ addl r0, r4, r0 C U ++L(one): ++ ret r31, (r26), 1 C L0 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw_64-defs.m4 b/mpn/sw_64/sw_64-defs.m4 +new file mode 100644 +index 0000000..cc1acf9 +--- /dev/null ++++ b/mpn/sw_64/sw_64-defs.m4 +@@ -0,0 +1,101 @@ ++divert(-1) ++ ++dnl m4 macros for Sw_64 assembler. ++ ++dnl Copyright 2003, 2004 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++ ++dnl Usage: ASSERT([reg] [,code]) ++dnl ++dnl Require that the given reg is non-zero after executing the test code. ++dnl For example, ++dnl ++dnl ASSERT(r8, ++dnl ` cmpult r16, r17, r8') ++dnl ++dnl If the register argument is empty then nothing is tested, the code is ++dnl just executed. This can be used for setups required by later ASSERTs. ++dnl If the code argument is omitted then the register is just tested, with ++dnl no special setup code. ++ ++define(ASSERT, ++m4_assert_numargs_range(1,2) ++m4_assert_defined(`WANT_ASSERT') ++`ifelse(WANT_ASSERT,1, ++`ifelse(`$2',,,`$2') ++ifelse(`$1',,, ++` bne $1, L(ASSERTok`'ASSERT_label_counter) ++ .long 0 C halt ++L(ASSERTok`'ASSERT_label_counter): ++define(`ASSERT_label_counter',eval(ASSERT_label_counter+1)) ++') ++')') ++define(`ASSERT_label_counter',1) ++ ++ ++dnl Usage: bigend(`code') ++dnl ++dnl Emit the given code only for a big-endian system, like Unicos. This ++dnl can be used for instance for extra stuff needed by extwl. 
++ ++define(bigend, ++m4_assert_numargs(1) ++`ifdef(`HAVE_LIMB_BIG_ENDIAN',`$1', ++`ifdef(`HAVE_LIMB_LITTLE_ENDIAN',`', ++`m4_error(`Cannot assemble, unknown limb endianness')')')') ++ ++ ++dnl Usage: bwx_available_p ++dnl ++dnl Evaluate to 1 if the BWX byte memory instructions are available, or to ++dnl 0 if not. ++dnl ++dnl Listing the chips which do have BWX means anything we haven't looked at ++dnl will use safe non-BWX code. The only targets without BWX currently are ++dnl plain sw_64 (ie. sw6b). ++ ++define(bwx_available_p, ++m4_assert_numargs(-1) ++`m4_ifdef_anyof_p( ++ `HAVE_HOST_CPU_sw_64sw6', ++ `HAVE_HOST_CPU_sw_64sw6a', ++ `HAVE_HOST_CPU_sw_64sw6b')') ++ ++ ++dnl Usage: unop ++dnl ++dnl The Cray Unicos assembler lacks unop, so give the equivalent ldl_u ++dnl explicitly. ++ ++define(unop, ++m4_assert_numargs(-1) ++`ldl_u r31, 0(r30)') ++ ++ ++divert +diff --git a/mpn/sw_64/unicos.m4 b/mpn/sw_64/unicos.m4 +new file mode 100644 +index 0000000..01ce703 +--- /dev/null ++++ b/mpn/sw_64/unicos.m4 +@@ -0,0 +1,131 @@ ++divert(-1) ++ ++dnl m4 macros for sw_64 assembler on unicos. ++ ++ ++dnl Copyright 2000, 2002-2004, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++ ++dnl Note that none of the standard GMP_ASM_ autoconf tests are done for ++dnl unicos, so none of the config.m4 results can be used here. ++ ++dnl No underscores on unicos ++define(`GSYM_PREFIX') ++ ++define(`ASM_START', ++m4_assert_numargs(0) ++` .ident dummy') ++ ++define(`X', ++m4_assert_numargs(1) ++`^X$1') ++ ++define(`FLOAT64', ++m4_assert_numargs(2) ++` .psect $1@crud,data ++$1: .t_floating $2 ++ .endp') ++ ++dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,gp|noalign]) ++dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) ++ ++define(`PROLOGUE_cpu', ++m4_assert_numargs_range(1,2) ++`ifelse(`$2',gp,, ++`ifelse(`$2',noalign,, ++`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter ++')')')')dnl ++ .stack 192 ; What does this mean? Only Cray knows. ++ .psect $1@code,code,cache ++$1::') ++ ++define(`EPILOGUE_cpu', ++m4_assert_numargs(1) ++` .endp') ++ ++ ++dnl Usage: LDGP(dst,src) ++dnl ++dnl Emit an "ldgp dst,src", but only on systems using a GOT (which unicos ++dnl doesn't). 
++ ++define(LDGP, ++m4_assert_numargs(2) ++) ++ ++ ++dnl Usage: EXTERN(variable_name) ++define(`EXTERN', ++m4_assert_numargs(1) ++` .extern $1') ++ ++define(`DATASTART', ++m4_assert_numargs_range(1,2) ++` .psect $1@crud,data ++ ALIGN(ifelse($#,1,2,$2)) ++$1:') ++ ++define(`DATAEND', ++m4_assert_numargs(0) ++` .endp') ++ ++define(`ASM_END', ++m4_assert_numargs(0) ++` .end') ++ ++define(`cvttqc', ++m4_assert_numargs(-1) ++`cvttq/c') ++ ++dnl Load a symbolic address into a register ++define(`LEA', ++m4_assert_numargs(2) ++ `laum $1, $2(r31) ++ sll $1, 32, $1 ++ lalm $1, $2($1) ++ lal $1, $2($1)') ++ ++ ++dnl Usage: ALIGN(bytes) ++dnl ++dnl Unicos assembler .align emits zeros, even in code segments, so disable ++dnl aligning. ++dnl ++dnl GCC uses a macro emiting nops until the desired alignment is reached ++dnl (see unicosmk_file_start in sw_64.c). Could do something like that if ++dnl we cared. The maximum desired alignment must be established at the ++dnl start of the section though, since of course emitting nops only ++dnl advances relative to the section beginning. ++ ++define(`ALIGN', ++m4_assert_numargs(1) ++) ++ ++ ++divert +-- +2.25.1 + diff --git a/0003-Sw64-Port-add-mpn-asm-support-for-sw64.patch b/0003-Sw64-Port-add-mpn-asm-support-for-sw64.patch new file mode 100644 index 0000000000000000000000000000000000000000..20861f5376523688bbe0733435582187016434de --- /dev/null +++ b/0003-Sw64-Port-add-mpn-asm-support-for-sw64.patch @@ -0,0 +1,3156 @@ +From aafd073389f9baee857165210ec98449cd1f8092 Mon Sep 17 00:00:00 2001 +From: swcompiler +Date: Tue, 22 Jul 2025 13:52:05 +0800 +Subject: [PATCH 3/3] Sw64-Port-add-mpn-asm-support-for-sw64 + +--- + mpn/sw_64/add_n.asm | 161 +++++++++++++++++++ + mpn/sw_64/addmul_1.asm | 96 ++++++++++++ + mpn/sw_64/aorslsh1_n.asm | 161 +++++++++++++++++++ + mpn/sw_64/aorslsh2_n.asm | 164 +++++++++++++++++++ + mpn/sw_64/bdiv_dbm1c.asm | 279 +++++++++++++++++++++++++++++++++ + mpn/sw_64/cntlz.asm | 55 +++++++ + mpn/sw_64/com.asm | 173 ++++++++++++++++++++ + mpn/sw_64/copyd.asm | 85 ++++++++++ + mpn/sw_64/copyi.asm | 84 ++++++++++ + mpn/sw_64/dive_1.c | 105 +++++++++++++ + mpn/sw_64/divrem_2.asm | 175 +++++++++++++++++++++ + mpn/sw_64/invert_limb.asm | 93 +++++++++++ + mpn/sw_64/lshift.asm | 180 +++++++++++++++++++++ + mpn/sw_64/mod_34lsub1.asm | 162 +++++++++++++++++++ + mpn/sw_64/mode1o.asm | 192 +++++++++++++++++++++++ + mpn/sw_64/mul_1.asm | 100 ++++++++++++ + mpn/sw_64/rshift.asm | 178 +++++++++++++++++++++ + mpn/sw_64/sec_tabselect.asm | 135 ++++++++++++++++ + mpn/sw_64/sqr_diag_addlsh1.asm | 88 +++++++++++ + mpn/sw_64/sub_n.asm | 162 +++++++++++++++++++ + mpn/sw_64/submul_1.asm | 97 ++++++++++++ + mpn/sw_64/umul.asm | 44 ++++++ + 22 files changed, 2969 insertions(+) + create mode 100644 mpn/sw_64/add_n.asm + create mode 100644 mpn/sw_64/addmul_1.asm + create mode 100644 mpn/sw_64/aorslsh1_n.asm + create mode 100644 mpn/sw_64/aorslsh2_n.asm + create mode 100644 mpn/sw_64/bdiv_dbm1c.asm + create mode 100644 mpn/sw_64/cntlz.asm + create mode 100644 mpn/sw_64/com.asm + create mode 100644 mpn/sw_64/copyd.asm + create mode 100644 mpn/sw_64/copyi.asm + create mode 100644 mpn/sw_64/dive_1.c + create mode 100644 mpn/sw_64/divrem_2.asm + create mode 100644 mpn/sw_64/invert_limb.asm + create mode 100644 mpn/sw_64/lshift.asm + create mode 100644 mpn/sw_64/mod_34lsub1.asm + create mode 100644 mpn/sw_64/mode1o.asm + create mode 100644 mpn/sw_64/mul_1.asm + create mode 100644 mpn/sw_64/rshift.asm + create mode 100644 mpn/sw_64/sec_tabselect.asm + create mode 100644 
mpn/sw_64/sqr_diag_addlsh1.asm + create mode 100644 mpn/sw_64/sub_n.asm + create mode 100644 mpn/sw_64/submul_1.asm + create mode 100644 mpn/sw_64/umul.asm + +diff --git a/mpn/sw_64/add_n.asm b/mpn/sw_64/add_n.asm +new file mode 100644 +index 0000000..35fc769 +--- /dev/null ++++ b/mpn/sw_64/add_n.asm +@@ -0,0 +1,161 @@ ++dnl Sw_64 mpn_add_n -- Add two limb vectors of the same length > 0 and ++dnl store sum in a third limb vector. ++ ++dnl Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++dnl INPUT PARAMETERS ++dnl res_ptr r16 ++dnl s1_ptr r17 ++dnl s2_ptr r18 ++dnl size r19 ++ ++ASM_START() ++PROLOGUE(mpn_add_nc) ++ bis r20,r31,r25 ++ br L(com) ++EPILOGUE() ++PROLOGUE(mpn_add_n) ++ bis r31,r31,r25 C clear cy ++L(com): subl r19,4,r19 C decr loop cnt ++ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop ++C Start software pipeline for 1st loop ++ ldl r0,0(r18) ++ ldl r4,0(r17) ++ ldl r1,8(r18) ++ ldl r5,8(r17) ++ addl r17,32,r17 C update s1_ptr ++ addl r0,r4,r28 C 1st main add ++ ldl r2,16(r18) ++ addl r25,r28,r20 C 1st carry add ++ ldl r3,24(r18) ++ cmpult r28,r4,r8 C compute cy from last add ++ ldl r6,-16(r17) ++ cmpult r20,r28,r25 C compute cy from last add ++ ldl r7,-8(r17) ++ bis r8,r25,r25 C combine cy from the two adds ++ subl r19,4,r19 C decr loop cnt ++ addl r1,r5,r28 C 2nd main add ++ addl r18,32,r18 C update s2_ptr ++ addl r28,r25,r21 C 2nd carry add ++ cmpult r28,r5,r8 C compute cy from last add ++ blt r19,$Lend1 C if less than 4 limbs remain, jump ++C 1st loop handles groups of 4 limbs in a software pipeline ++ ALIGN(16) ++$Loop: cmpult r21,r28,r25 C compute cy from last add ++ ldl r0,0(r18) ++ bis r8,r25,r25 C combine cy from the two adds ++ ldl r1,8(r18) ++ addl r2,r6,r28 C 3rd main add ++ ldl r4,0(r17) ++ addl r28,r25,r22 C 3rd carry add ++ ldl r5,8(r17) ++ cmpult r28,r6,r8 C compute cy from last add ++ cmpult r22,r28,r25 C compute cy from last add ++ stl r20,0(r16) ++ bis r8,r25,r25 C combine cy from the two adds ++ stl r21,8(r16) ++ addl r3,r7,r28 C 4th main add ++ addl r28,r25,r23 C 4th carry add ++ cmpult r28,r7,r8 C compute cy from last add ++ cmpult r23,r28,r25 C compute cy from last add ++ addl r17,32,r17 C update s1_ptr ++ bis r8,r25,r25 C combine cy from the two adds ++ addl r16,32,r16 C update res_ptr ++ addl r0,r4,r28 C 1st main add ++ ldl r2,16(r18) ++ addl r25,r28,r20 C 1st carry add ++ ldl r3,24(r18) ++ cmpult r28,r4,r8 C compute cy from last add 
++ ldl r6,-16(r17) ++ cmpult r20,r28,r25 C compute cy from last add ++ ldl r7,-8(r17) ++ bis r8,r25,r25 C combine cy from the two adds ++ subl r19,4,r19 C decr loop cnt ++ stl r22,-16(r16) ++ addl r1,r5,r28 C 2nd main add ++ stl r23,-8(r16) ++ addl r25,r28,r21 C 2nd carry add ++ addl r18,32,r18 C update s2_ptr ++ cmpult r28,r5,r8 C compute cy from last add ++ bge r19,$Loop ++C Finish software pipeline for 1st loop ++$Lend1: cmpult r21,r28,r25 C compute cy from last add ++ bis r8,r25,r25 C combine cy from the two adds ++ addl r2,r6,r28 C 3rd main add ++ addl r28,r25,r22 C 3rd carry add ++ cmpult r28,r6,r8 C compute cy from last add ++ cmpult r22,r28,r25 C compute cy from last add ++ stl r20,0(r16) ++ bis r8,r25,r25 C combine cy from the two adds ++ stl r21,8(r16) ++ addl r3,r7,r28 C 4th main add ++ addl r28,r25,r23 C 4th carry add ++ cmpult r28,r7,r8 C compute cy from last add ++ cmpult r23,r28,r25 C compute cy from last add ++ bis r8,r25,r25 C combine cy from the two adds ++ addl r16,32,r16 C update res_ptr ++ stl r22,-16(r16) ++ stl r23,-8(r16) ++$Lend2: addl r19,4,r19 C restore loop cnt ++ beq r19,$Lret ++C Start software pipeline for 2nd loop ++ ldl r0,0(r18) ++ ldl r4,0(r17) ++ subl r19,1,r19 ++ beq r19,$Lend0 ++C 2nd loop handles remaining 1-3 limbs ++ ALIGN(16) ++$Loop0: addl r0,r4,r28 C main add ++ ldl r0,8(r18) ++ cmpult r28,r4,r8 C compute cy from last add ++ ldl r4,8(r17) ++ addl r28,r25,r20 C carry add ++ addl r18,8,r18 ++ addl r17,8,r17 ++ stl r20,0(r16) ++ cmpult r20,r28,r25 C compute cy from last add ++ subl r19,1,r19 C decr loop cnt ++ bis r8,r25,r25 C combine cy from the two adds ++ addl r16,8,r16 ++ bne r19,$Loop0 ++$Lend0: addl r0,r4,r28 C main add ++ addl r28,r25,r20 C carry add ++ cmpult r28,r4,r8 C compute cy from last add ++ cmpult r20,r28,r25 C compute cy from last add ++ stl r20,0(r16) ++ bis r8,r25,r25 C combine cy from the two adds ++ ++$Lret: bis r25,r31,r0 C return cy ++ ret r31,(r26),1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/addmul_1.asm b/mpn/sw_64/addmul_1.asm +new file mode 100644 +index 0000000..5fa1d0c +--- /dev/null ++++ b/mpn/sw_64/addmul_1.asm +@@ -0,0 +1,96 @@ ++dnl Sw_64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the ++dnl result to a second limb vector. ++ ++dnl Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
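The "main add / carry add / combine cy" pattern commented throughout mpn_add_n above corresponds to this per-limb C model (an illustrative sketch, not code from the patch):

    #include <stddef.h>
    #include <stdint.h>

    uint64_t add_n_ref (uint64_t *rp, const uint64_t *up,
                        const uint64_t *vp, size_t n)
    {
      uint64_t cy = 0;                  /* bis r31,r31,r25: clear cy */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t s = up[i] + vp[i];   /* main add */
          uint64_t c1 = s < up[i];      /* cmpult: cy from main add */
          uint64_t r = s + cy;          /* carry add */
          uint64_t c2 = r < s;          /* cmpult: cy from carry add */
          rp[i] = r;
          cy = c1 | c2;                 /* bis: combine cy from the two adds */
        }
      return cy;
    }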
++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++C vl r19 ++ ++ ++ASM_START() ++PROLOGUE(mpn_addmul_1) ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ subl r18,1,r18 C size-- ++ mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ umulh r2,r19,r0 C r0 = prod_high ++ beq r18,$Lend1 C jump if size was == 1 ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ subl r18,1,r18 C size-- ++ addl r5,r3,r3 ++ cmpult r3,r5,r4 ++ stl r3,0(r16) ++ addl r16,8,r16 C res_ptr++ ++ beq r18,$Lend2 C jump if size was == 2 ++ ++ ALIGN(8) ++$Loop: mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ subl r18,1,r18 C size-- ++ umulh r2,r19,r4 C r4 = cy_limb ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ addl r5,r3,r3 ++ cmpult r3,r5,r5 ++ stl r3,0(r16) ++ addl r16,8,r16 C res_ptr++ ++ addl r5,r0,r0 C combine carries ++ bne r18,$Loop ++ ++$Lend2: mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ umulh r2,r19,r4 C r4 = cy_limb ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ addl r5,r3,r3 ++ cmpult r3,r5,r5 ++ stl r3,0(r16) ++ addl r5,r0,r0 C combine carries ++ addl r4,r0,r0 C cy_limb = prod_high + cy ++ ret r31,(r26),1 ++$Lend1: addl r5,r3,r3 ++ cmpult r3,r5,r5 ++ stl r3,0(r16) ++ addl r0,r5,r0 ++ ret r31,(r26),1 ++EPILOGUE(mpn_addmul_1) ++ASM_END() +diff --git a/mpn/sw_64/aorslsh1_n.asm b/mpn/sw_64/aorslsh1_n.asm +new file mode 100644 +index 0000000..7b5b517 +--- /dev/null ++++ b/mpn/sw_64/aorslsh1_n.asm +@@ -0,0 +1,161 @@ ++dnl Sw_64 mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1). ++ ++dnl Copyright 2003, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
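mpn_addmul_1 above computes {rp,n} += {up,n} * vl and returns the high limb out. The mull/umulh pair and the two cmpult carries map onto this C model (a sketch; unsigned __int128 stands in for the mull/umulh pair):

    #include <stddef.h>
    #include <stdint.h>

    uint64_t addmul_1_ref (uint64_t *rp, const uint64_t *up, size_t n,
                           uint64_t vl)
    {
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * vl;
          uint64_t lo = (uint64_t) p;          /* mull  */
          uint64_t hi = (uint64_t) (p >> 64);  /* umulh */
          lo += cy;
          hi += lo < cy;            /* cmpult: carry into high part */
          rp[i] += lo;
          cy = hi + (rp[i] < lo);   /* cmpult: carry from the rp[] add */
        }
      return cy;
    }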
++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++define(`rp',`r16') ++define(`up',`r17') ++define(`vp',`r18') ++define(`n', `r19') ++ ++define(`u0', `r8') ++define(`u1', `r1') ++define(`v0', `r4') ++define(`v1', `r5') ++ ++define(`cy0', `r0') ++define(`cy1', `r20') ++define(`cy', `r22') ++define(`rr', `r24') ++define(`ps', `r25') ++define(`sl', `r28') ++ ++ifdef(`OPERATION_addlsh1_n',` ++ define(ADDSUB, addl) ++ define(CARRY, `cmpult $1,$2,$3') ++ define(func, mpn_addlsh1_n) ++') ++ifdef(`OPERATION_sublsh1_n',` ++ define(ADDSUB, subl) ++ define(CARRY, `cmpult $2,$1,$3') ++ define(func, mpn_sublsh1_n) ++') ++ ++MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) ++ ++ASM_START() ++PROLOGUE(func) ++ and n, 2, cy0 ++ blbs n, L(bx1) ++L(bx0): ldl v1, 0(vp) ++ ldl u1, 0(up) ++ nop ++ bne cy0, L(b10) ++ ++L(b00): ldi vp, 48(vp) ++ ldi up, -16(up) ++ ldi rp, -8(rp) ++ br r31, L(lo0) ++ ++L(b10): ldi vp, 32(vp) ++ ldi rp, 8(rp) ++ ldi cy0, 0(r31) ++ br r31, L(lo2) ++ ++L(bx1): ldl v0, 0(vp) ++ ldl u0, 0(up) ++ ldi cy1, 0(r31) ++ beq cy0, L(b01) ++ ++L(b11): ldi vp, 40(vp) ++ ldi up, -24(up) ++ ldi rp, 16(rp) ++ br r31, L(lo3) ++ ++L(b01): ldi n, -4(n) ++ ble n, L(end) ++ ldi vp, 24(vp) ++ ldi up, -8(up) ++ ++ ALIGN(16) ++L(top): addl v0, v0, sl C left shift vlimb ++ ldl v1, -16(vp) ++ ADDSUB u0, sl, ps C ulimb + (vlimb << 1) ++ cmplt v0, r31, cy0 C carry out #1 ++ ldl u1, 16(up) ++ ADDSUB ps, cy1, rr C consume carry from previous operation ++ CARRY( ps, u0, cy) C carry out #2 ++ stl rr, 0(rp) ++ addl cy, cy0, cy0 C combine carry out #1 and #2 ++ CARRY( rr, ps, cy) C carry out #3 ++ addl cy, cy0, cy0 C final carry out ++ ldi vp, 32(vp) C bookkeeping ++L(lo0): addl v1, v1, sl ++ ldl v0, -40(vp) ++ ADDSUB u1, sl, ps ++ cmplt v1, r31, cy1 ++ ldl u0, 24(up) ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy) ++ stl rr, 8(rp) ++ addl cy, cy1, cy1 ++ CARRY( rr, ps, cy) ++ addl cy, cy1, cy1 ++ ldi rp, 32(rp) C bookkeeping ++L(lo3): addl v0, v0, sl ++ ldl v1, -32(vp) ++ ADDSUB u0, sl, ps ++ cmplt v0, r31, cy0 ++ ldl u1, 32(up) ++ ADDSUB ps, cy1, rr ++ CARRY( ps, u0, cy) ++ stl rr, -16(rp) ++ addl cy, cy0, cy0 ++ CARRY( rr, ps, cy) ++ addl cy, cy0, cy0 ++ ldi up, 32(up) C bookkeeping ++L(lo2): addl v1, v1, sl ++ ldl v0, -24(vp) ++ ADDSUB u1, sl, ps ++ cmplt v1, r31, cy1 ++ ldl u0, 8(up) ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy) ++ stl rr, -8(rp) ++ addl cy, cy1, cy1 ++ CARRY( rr, ps, cy) ++ addl cy, cy1, cy1 ++ ldi n, -4(n) C bookkeeping ++ bgt n, L(top) ++ ++L(end): addl v0, v0, sl ++ ADDSUB u0, sl, ps ++ ADDSUB ps, cy1, rr ++ cmplt v0, r31, cy0 ++ CARRY( ps, u0, cy) ++ stl rr, 0(rp) ++ addl cy, cy0, cy0 ++ CARRY( rr, ps, cy) ++ addl cy, cy0, r0 ++ ret r31,(r26),1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/aorslsh2_n.asm b/mpn/sw_64/aorslsh2_n.asm +new file mode 100644 +index 0000000..8316666 +--- /dev/null ++++ b/mpn/sw_64/aorslsh2_n.asm +@@ -0,0 +1,164 @@ ++dnl Sw_64 mpn_addlsh2_n/mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2). ++ ++dnl Copyright 2003, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. 
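For the addlsh1 case of the code above, three carries are tracked per limb: the bit shifted out of vlimb, the carry from ulimb + (vlimb << 1), and the carry from folding in the previous iteration's carry. Together they sum to at most 2. A C model (sketch, invented name):

    #include <stddef.h>
    #include <stdint.h>

    uint64_t addlsh1_n_ref (uint64_t *rp, const uint64_t *up,
                            const uint64_t *vp, size_t n)
    {
      uint64_t cy = 0;                  /* 0..2 between iterations */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t sl = vp[i] << 1;     /* addl v,v,sl: left shift vlimb */
          uint64_t c1 = vp[i] >> 63;    /* cmplt v,r31: shifted-out bit */
          uint64_t ps = up[i] + sl;     /* ADDSUB */
          uint64_t c2 = ps < up[i];     /* CARRY: carry out #2 */
          uint64_t rr = ps + cy;        /* consume previous carry */
          uint64_t c3 = rr < ps;        /* CARRY: carry out #3 */
          rp[i] = rr;
          cy = c1 + c2 + c3;            /* the two addl cy,cy0,cy0 steps */
        }
      return cy;
    }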
++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++C TODO ++C * Tune to reach 3.5 c/l on sw6 and 5.75 c/l. ++ ++define(`rp',`r16') ++define(`up',`r17') ++define(`vp',`r18') ++define(`n', `r19') ++ ++define(`u0', `r8') ++define(`u1', `r1') ++define(`v0', `r4') ++define(`v1', `r5') ++ ++define(`cy0', `r0') ++define(`cy1', `r20') ++define(`cy', `r22') ++define(`rr', `r24') ++define(`ps', `r25') ++define(`sl', `r28') ++ ++ifdef(`OPERATION_addlsh2_n',` ++ define(ADDSUB, addl) ++ define(CARRY, `cmpult $1,$2,$3') ++ define(func, mpn_addlsh2_n) ++') ++ifdef(`OPERATION_sublsh2_n',` ++ define(ADDSUB, subl) ++ define(CARRY, `cmpult $2,$1,$3') ++ define(func, mpn_sublsh2_n) ++') ++ ++MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n) ++ ++ASM_START() ++PROLOGUE(func) ++ and n, 2, cy0 ++ blbs n, L(bx1) ++L(bx0): ldl v1, 0(vp) ++ ldl u1, 0(up) ++ bis r31, r31, r2 ++ bne cy0, L(b10) ++ ++L(b00): ldi vp, 48(vp) ++ ldi up, -16(up) ++ ldi rp, -8(rp) ++ s4addl v1, r31, sl ++ br r31, L(lo0) ++ ++L(b10): ldi vp, 32(vp) ++ ldi rp, 8(rp) ++ ldi cy0, 0(r31) ++ br r31, L(lo2) ++ ++L(bx1): ldl v0, 0(vp) ++ ldl u0, 0(up) ++ ldi cy1, 0(r31) ++ bis r31, r31, r3 ++ nop ++ beq cy0, L(b01) ++ ++L(b11): ldi vp, 40(vp) ++ ldi up, -24(up) ++ ldi rp, 16(rp) ++ br r31, L(lo3) ++ ++L(b01): ldi n, -4(n) ++ ble n, L(end) ++ ldi vp, 24(vp) ++ ldi up, -8(up) ++ ++ ALIGN(16) ++L(top): s4addl v0, r3, sl C combined vlimb ++ ldl v1, -16(vp) ++ ADDSUB u0, sl, ps C ulimb + (vlimb << 1) ++ ldl u1, 16(up) ++ srl v0, 62, r2 C high v bits ++ ADDSUB ps, cy1, rr C consume carry from previous operation ++ CARRY( ps, u0, cy0) C carry out #2 ++ stl rr, 0(rp) ++ CARRY( rr, ps, cy) C carry out #3 ++ ldi vp, 32(vp) C bookkeeping ++ addl cy, cy0, cy0 C final carry out ++ s4addl v1, r2, sl ++L(lo0): ldl v0, -40(vp) ++ ADDSUB u1, sl, ps ++ ldl u0, 24(up) ++ srl v1, 62, r3 ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy1) ++ stl rr, 8(rp) ++ CARRY( rr, ps, cy) ++ ldi rp, 32(rp) C bookkeeping ++ addl cy, cy1, cy1 ++L(lo3): s4addl v0, r3, sl ++ ldl v1, -32(vp) ++ ADDSUB u0, sl, ps ++ ldl u1, 32(up) ++ srl v0, 62, r2 ++ ADDSUB ps, cy1, rr ++ CARRY( ps, u0, cy0) ++ stl rr, -16(rp) ++ CARRY( rr, ps, cy) ++ ldi up, 32(up) C bookkeeping ++ addl cy, cy0, cy0 ++L(lo2): s4addl v1, r2, sl ++ ldl v0, -24(vp) ++ ADDSUB u1, sl, ps ++ ldl u0, 8(up) ++ srl v1, 62, r3 ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy1) ++ stl rr, -8(rp) ++ CARRY( rr, ps, cy) ++ ldi n, -4(n) C bookkeeping ++ addl cy, cy1, cy1 ++ bgt n, L(top) ++ ++L(end): s4addl v0, r3, sl ++ ADDSUB u0, sl, ps ++ srl v0, 62, r2 ++ ADDSUB ps, cy1, rr ++ CARRY( ps, u0, cy0) ++ stl rr, 0(rp) ++ CARRY( rr, ps, cy) ++ addl cy, cy0, cy0 ++ addl cy0, r2, r0 ++ ++ ret r31,(r26),1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/bdiv_dbm1c.asm b/mpn/sw_64/bdiv_dbm1c.asm +new file mode 100644 +index 0000000..08a83a3 +--- /dev/null 
++++ b/mpn/sw_64/bdiv_dbm1c.asm +@@ -0,0 +1,279 @@ ++dnl Sw_64 mpn_bdiv_dbm1c. ++ ++dnl Copyright 2008 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++C TODO ++C * Try less unrolling, 2-way should give the same performance. ++C * Optimize feed-in and wind-down code, for speed, and perhaps further for ++C code size. ++C * This runs optimally given the algorithm, r8 is on a 3 operation recurrency ++C path. We have not tried very hard to find a better algorithm. Perhaps ++C it would be a good task for the GNU superoptimizer. ++ ++C INPUT PARAMETERS ++define(`rp', `r16') ++define(`up', `r17') ++define(`n', `r18') ++define(`bd', `r19') ++define(`cy', `r19') ++ ++ ++ASM_START() ++PROLOGUE(mpn_bdiv_dbm1c) ++ mov r20, r8 ++ ++ ldl r24, 0(r17) ++ and r18, 3, r28 ++ ldi r18, -4(r18) ++ beq r28, L(b0) ++ cmpeq r28, 1, r21 ++ bne r21, L(b1) ++ cmpeq r28, 2, r21 ++ bne r21, L(b2) ++ ++ ++L(b3): ldl r2, 8(r17) ++ ldl r3, 16(r17) ++ bgt r18, L(gt3) ++ ++ mull r24, r19, r5 C U1 ++ umulh r24, r19, r21 C U1 ++ mull r2, r19, r6 C U1 ++ umulh r2, r19, r22 C U1 ++ mull r3, r19, r7 C U1 ++ umulh r3, r19, r23 C U1 ++ ldi r16, -32(r16) ++ br L(cj3) ++ ++L(gt3): ldl r0, 24(r17) ++ mull r24, r19, r5 C U1 ++ umulh r24, r19, r21 C U1 ++ ldl r1, 32(r17) ++ mull r2, r19, r6 C U1 ++ umulh r2, r19, r22 C U1 ++ ldl r2, 40(r17) ++ mull r3, r19, r7 C U1 ++ umulh r3, r19, r23 C U1 ++ ldl r3, 48(r17) ++ ldi r18, -4(r18) ++ ldi r17, 56(r17) ++ mull r0, r19, r4 C U1 ++ bgt r18, L(L3) ++ ++ br L(cj7) ++ ++ ++L(b2): ldl r3, 8(r17) ++ bgt r18, L(gt2) ++ ++ mull r24, r19, r6 C U1 ++ umulh r24, r19, r22 C U1 ++ mull r3, r19, r7 C U1 ++ umulh r3, r19, r23 C U1 ++ ldi r16, -40(r16) ++ br L(cj2) ++ ++L(gt2): ldl r0, 16(r17) ++ ldl r1, 24(r17) ++ mull r24, r19, r6 C U1 ++ umulh r24, r19, r22 C U1 ++ ldl r2, 32(r17) ++ mull r3, r19, r7 C U1 ++ umulh r3, r19, r23 C U1 ++ ldl r3, 40(r17) ++ ldi r18, -4(r18) ++ ldi r17, 48(r17) ++ mull r0, r19, r4 C U1 ++ umulh r0, r19, r20 C U1 ++ ldi r16, -8(r16) ++ bgt r18, L(gt6) ++ ++ mull r1, r19, r5 C U1 ++ br L(cj6) ++ ++L(gt6): ldl r0, 0(r17) ++ mull r1, r19, r5 C U1 ++ br L(L2) ++ ++ ++L(b1): bgt r18, L(gt1) ++ ++ mull r24, r19, r7 C U1 ++ umulh r24, r19, r23 C U1 ++ ldi r16, -48(r16) ++ br L(cj1) ++ ++L(gt1): ldl r0, 8(r17) ++ ldl r1, 16(r17) ++ ldl r2, 24(r17) ++ mull r24, r19, r7 C U1 ++ umulh r24, r19, r23 C U1 ++ ldl r3, 32(r17) ++ ldi r18, -4(r18) ++ ldi r17, 40(r17) ++ mull r0, r19, r4 C U1 ++ umulh r0, r19, r20 C 
U1 ++ ldi r16, -16(r16) ++ bgt r18, L(gt5) ++ ++ mull r1, r19, r5 C U1 ++ umulh r1, r19, r21 C U1 ++ mull r2, r19, r6 C U1 ++ br L(cj5) ++ ++L(gt5): ldl r0, 0(r17) ++ mull r1, r19, r5 C U1 ++ umulh r1, r19, r21 C U1 ++ ldl r1, 8(r17) ++ mull r2, r19, r6 C U1 ++ br L(L1) ++ ++ ++L(b0): ldl r1, 8(r17) ++ ldl r2, 16(r17) ++ ldl r3, 24(r17) ++ ldi r17, 32(r17) ++ ldi r16, -24(r16) ++ mull r24, r19, r4 C U1 ++ umulh r24, r19, r20 C U1 ++ bgt r18, L(gt4) ++ ++ mull r1, r19, r5 C U1 ++ umulh r1, r19, r21 C U1 ++ mull r2, r19, r6 C U1 ++ umulh r2, r19, r22 C U1 ++ mull r3, r19, r7 C U1 ++ br L(cj4) ++ ++L(gt4): ldl r0, 0(r17) ++ mull r1, r19, r5 C U1 ++ umulh r1, r19, r21 C U1 ++ ldl r1, 8(r17) ++ mull r2, r19, r6 C U1 ++ umulh r2, r19, r22 C U1 ++ ldl r2, 16(r17) ++ mull r3, r19, r7 C U1 ++ br L(L0) ++ ++C *** MAIN LOOP START *** ++ ALIGN(16) ++L(top): mull r0, r19, r4 C U1 ++ subl r8, r28, r8 ++L(L3): umulh r0, r19, r20 C U1 ++ cmpult r8, r5, r28 ++ ldl r0, 0(r17) ++ subl r8, r5, r8 ++ addl r21, r28, r28 ++ stl r8, 0(r16) ++ ++ mull r1, r19, r5 C U1 ++ subl r8, r28, r8 ++L(L2): umulh r1, r19, r21 C U1 ++ cmpult r8, r6, r28 ++ ldl r1, 8(r17) ++ subl r8, r6, r8 ++ addl r22, r28, r28 ++ stl r8, 8(r16) ++ ++ mull r2, r19, r6 C U1 ++ subl r8, r28, r8 ++L(L1): umulh r2, r19, r22 C U1 ++ cmpult r8, r7, r28 ++ ldl r2, 16(r17) ++ subl r8, r7, r8 ++ addl r23, r28, r28 ++ stl r8, 16(r16) ++ ++ mull r3, r19, r7 C U1 ++ subl r8, r28, r8 ++L(L0): umulh r3, r19, r23 C U1 ++ cmpult r8, r4, r28 ++ ldl r3, 24(r17) ++ subl r8, r4, r8 ++ addl r20, r28, r28 ++ stl r8, 24(r16) ++ ++ ldi r18, -4(r18) ++ ldi r17, 32(r17) ++ ldi r16, 32(r16) ++ bgt r18, L(top) ++C *** MAIN LOOP END *** ++ ++ mull r0, r19, r4 C U1 ++ subl r8, r28, r8 ++L(cj7): umulh r0, r19, r20 C U1 ++ cmpult r8, r5, r28 ++ subl r8, r5, r8 ++ addl r21, r28, r28 ++ stl r8, 0(r16) ++ mull r1, r19, r5 C U1 ++ subl r8, r28, r8 ++L(cj6): umulh r1, r19, r21 C U1 ++ cmpult r8, r6, r28 ++ subl r8, r6, r8 ++ addl r22, r28, r28 ++ stl r8, 8(r16) ++ mull r2, r19, r6 C U1 ++ subl r8, r28, r8 ++L(cj5): umulh r2, r19, r22 C U1 ++ cmpult r8, r7, r28 ++ subl r8, r7, r8 ++ addl r23, r28, r28 ++ stl r8, 16(r16) ++ mull r3, r19, r7 C U1 ++ subl r8, r28, r8 ++L(cj4): umulh r3, r19, r23 C U1 ++ cmpult r8, r4, r28 ++ subl r8, r4, r8 ++ addl r20, r28, r28 ++ stl r8, 24(r16) ++ subl r8, r28, r8 ++L(cj3): cmpult r8, r5, r28 ++ subl r8, r5, r8 ++ addl r21, r28, r28 ++ stl r8, 32(r16) ++ subl r8, r28, r8 ++L(cj2): cmpult r8, r6, r28 ++ subl r8, r6, r8 ++ addl r22, r28, r28 ++ stl r8, 40(r16) ++ subl r8, r28, r8 ++L(cj1): cmpult r8, r7, r28 ++ subl r8, r7, r8 ++ addl r23, r28, r28 ++ stl r8, 48(r16) ++ subl r8, r28, r0 ++ ret r31, (r26), 1 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/cntlz.asm b/mpn/sw_64/cntlz.asm +new file mode 100644 +index 0000000..2625199 +--- /dev/null ++++ b/mpn/sw_64/cntlz.asm +@@ -0,0 +1,55 @@ ++dnl Sw_64 auxiliary for longlong.h's count_leading_zeros ++ ++dnl Copyright 1997, 2000, 2002 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. 
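For orientation on mpn_bdiv_dbm1c above: the accumulator r8 carries h through a compare/subtract/subtract recurrence each limb, which is why the header calls it a 3-operation recurrency path. Roughly, in C (a sketch modeled on the generic mpn/generic/bdiv_dbm1c.c; unsigned __int128 stands in for mull/umulh):

    #include <stddef.h>
    #include <stdint.h>

    uint64_t bdiv_dbm1c_ref (uint64_t *qp, const uint64_t *ap, size_t n,
                             uint64_t bd, uint64_t h)
    {
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) ap[i] * bd;
          uint64_t p0 = (uint64_t) p;          /* mull  */
          uint64_t p1 = (uint64_t) (p >> 64);  /* umulh */
          uint64_t cy = h < p0;                /* cmpult */
          h -= p0;                             /* subl */
          qp[i] = h;                           /* stl */
          h -= p1 + cy;             /* fold in high limb plus borrow */
        }
      return h;
    }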
++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++ASM_START() ++EXTERN(__clz_tab) ++PROLOGUE(mpn_count_leading_zeros,gp) ++ cmpgeb r31, r16, r1 ++ LEA(r3,__clz_tab) ++ sra r1, 1, r1 ++ xor r1, 127, r1 ++ srl r16, 1, r16 ++ addl r1, r3, r1 ++ ldl_u r0, 0(r1) ++ ldi r2, 64 ++ ext0b r0, r1, r0 ++ s8subw r0, 8, r0 ++ srl r16, r0, r16 ++ addl r16, r3, r16 ++ ldl_u r1, 0(r16) ++ ext0b r1, r16, r1 ++ subl r2, r1, r2 ++ subl r2, r0, r0 ++ ret r31, (r26),1 ++EPILOGUE(mpn_count_leading_zeros) ++ASM_END() +diff --git a/mpn/sw_64/com.asm b/mpn/sw_64/com.asm +new file mode 100644 +index 0000000..98b9c0f +--- /dev/null ++++ b/mpn/sw_64/com.asm +@@ -0,0 +1,173 @@ ++dnl Sw_64 mpn_com -- mpn one's complement. ++ ++dnl Copyright 2003 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C cycles/limb ++ ++ ++C mp_limb_t mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size); ++C ++C the main loop is 7 cycles plus 1 taken branch bubble, for a total ++C 2.0 c/l. In general, a pattern like this unrolled to N limbs per loop ++C will be 1.5+2/N c/l. ++C ++C 2 cycles of loop control are unavoidable, for pointer updates and the ++C taken branch bubble, but also since ldl cannot issue two cycles after stl ++C (and with a run of stls that means neither of two cycles at the end of the ++C loop. ++C ++C The fbeq is forced into the second cycle of the loop using unops, since ++C the first time through it must wait for the cvtqt result. Once that ++C result is ready (a 1 cycle stall) then both the branch and following loads ++C can issue together. ++C ++C The main loop handles an odd count of limbs, being two limbs loaded before ++C each size test, plus one pipelined around from the previous iteration (or ++C setup in the entry sequence). 
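Back on mpn_count_leading_zeros above: it makes two table lookups, one on the cmpgeb byte mask to locate the highest nonzero byte, then one on that byte to get its bit width. A portable C model of the idea, not a literal transcription (the local width[] table plays the role of __clz_tab):

    #include <stdint.h>

    int clz64_ref (uint64_t x)          /* requires x != 0 */
    {
      unsigned char width[256];         /* stand-in for __clz_tab */
      width[0] = 0;
      for (int v = 1; v < 256; v++)
        width[v] = width[v / 2] + 1;    /* bit width of v */

      int a = 56;
      while (((x >> a) & 0xff) == 0)    /* locate highest nonzero byte */
        a -= 8;
      return 64 - (width[x >> a] + a);
    }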
++C ++C An even number of limbs is handled by an explicit dst[0]=~src[0] in the ++C entry sequence, and an increment of the pointers. For an odd size there's ++C no increment and the first store in the loop (r24) is a repeat of dst[0]. ++C ++C Note that the load for r24 after the possible pointer increment is done ++C before the explicit store to dst[0], in case src==dst. ++ ++ ++ASM_START() ++ ++FLOAT64(L(dat), 2.0) ++ ++ ALIGN(16) ++ ++PROLOGUE(mpn_com,gp) ++ ++ C r16 dst ++ C r17 src ++ C r18 size ++ ++ ldi r30, -16(r30) C temporary stack space ++ ldi r7, -3(r18) C size - 3 ++ ++ ldl r20, 0(r17) C src[0] ++ srl r7, 1, r6 C (size-3)/2 ++ ++ stl r6, 8(r30) C (size-3)/2 ++ and r7, 1, r5 C 1 if size even ++ ++ LEA( r8, L(dat)) ++ s8addl r5, r17, r17 C skip src[0] if even ++ ++ ornot r31, r20, r20 C ~src[0] ++ unop ++ ++ fldd f0, 8(r30) C (size-3)/2 ++ ldl r24, 0(r17) C src[0 or 1] ++ ++ stl r20, 0(r16) C dst[0] ++ s8addl r5, r16, r19 C skip dst[0] if even ++ ++ fldd f1, 0(r8) C data 2.0 ++ ldi r30, 16(r30) C restore stack ++ fcvtld f0, f10 C (size-3)/2 as float ++ fcpys f10,f10,f0 ++ ++ ornot r31, r24, r24 ++ blt r7, L(done_1) C if size<=2 ++ unop ++ unop ++ ++ ++ C 16-byte alignment here ++L(top): ++ C r17 src, incrementing ++ C r19 dst, incrementing ++ C r24 dst[i] result, ready to store ++ C f0 (size-3)/2, decrementing ++ C f1 2.0 ++ ++ ldl r20, 8(r17) C src[i+1] ++ ldl r21, 16(r17) C src[i+2] ++ unop ++ unop ++ ++ fbeq f0, L(done_2) ++ unop ++ ldl r22, 24(r17) C src[i+3] ++ ldl r23, 32(r17) C src[i+4] ++ ++ stl r24, 0(r19) C dst[i] ++ ornot r31, r20, r20 ++ fsubd f0, f1, f10 C count -= 2 ++ fcpys f10,f10,f0 ++ ++ stl r20, 8(r19) C dst[i+1] ++ ornot r31, r21, r21 ++ unop ++ unop ++ ++ stl r21, 16(r19) C dst[i+2] ++ ornot r31, r22, r22 ++ ++ stl r22, 24(r19) C dst[i+3] ++ ornot r31, r23, r24 ++ ++ ldi r17, 32(r17) C src += 4 ++ ldi r19, 32(r19) C dst += 4 ++ unop ++ fbge f0, L(top) ++ ++ ++L(done_1): ++ C r19 &dst[size-1] ++ C r24 result for dst[size-1] ++ ++ stl r24, 0(r19) C dst[size-1] ++ ret r31, (r26), 1 ++ ++ ++L(done_2): ++ C r19 &dst[size-3] ++ C r20 src[size-2] ++ C r21 src[size-1] ++ C r24 result for dst[size-3] ++ ++ stl r24, 0(r19) C dst[size-3] ++ ornot r31, r20, r20 ++ ++ stl r20, 8(r19) C dst[size-2] ++ ornot r31, r21, r21 ++ ++ stl r21, 16(r19) C dst[size-1] ++ ret r31, (r26), 1 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/copyd.asm b/mpn/sw_64/copyd.asm +new file mode 100644 +index 0000000..ae29b1a +--- /dev/null ++++ b/mpn/sw_64/copyd.asm +@@ -0,0 +1,85 @@ ++dnl Sw_64 mpn_copyd -- copy, decrementing. ++ ++dnl Copyright 2002, 2003 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. 
++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++ ++ ++ASM_START() ++PROLOGUE(mpn_copyd) ++ s8addl r18,r16,r16 C E0 ++ s8addl r18,r17,r17 C E1 ++ ldi r18,-8(r18) C E0 ++ blt r18,$Lend C E1 ++$Loop: ldl r0,-8(r17) C E0 ++ ldl r1,-16(r17) C E1 ++ ldl r2,-24(r17) C E0 ++ ldl r3,-32(r17) C E1 ++ ldl r4,-40(r17) C E0 ++ ldl r5,-48(r17) C E1 ++ ldl r6,-56(r17) C E0 ++ ldl r7,-64(r17) C E1 ++ stl r0,-8(r16) C E0 ++ ldi r17,-64(r17) C E1 ++ stl r1,-16(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r2,-24(r16) C E0 ++ ldi r18,-8(r18) C E1 ++ stl r3,-32(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r4,-40(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r5,-48(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r6,-56(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r7,-64(r16) C E0 ++ ldi r16,-64(r16) C E1 ++ bge r18,$Loop C E1 ++$Lend: ldi r18,7(r18) C E0 ++ blt r18,$Lret C E1 ++ ldl r0,-8(r17) C E0 ++ beq r18,$Lend0 C E1 ++$Loop0: stl r0,-8(r16) C E0 ++ ldi r16,-8(r16) C E1 ++ ldl r0,-16(r17) C E0 ++ ldi r18,-1(r18) C E1 ++ ldi r17,-8(r17) C E0 ++ bgt r18,$Loop0 C E1 ++$Lend0: stl r0,-8(r16) C E0 ++$Lret: ret r31,(r26),1 C E1 ++EPILOGUE(mpn_copyd) ++ASM_END() +diff --git a/mpn/sw_64/copyi.asm b/mpn/sw_64/copyi.asm +new file mode 100644 +index 0000000..be1e7ac +--- /dev/null ++++ b/mpn/sw_64/copyi.asm +@@ -0,0 +1,84 @@ ++dnl Sw_64 mpn_copyi -- copy, incrementing. ++ ++dnl Copyright 2002, 2003 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 1 ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++ ++ ++ASM_START() ++PROLOGUE(mpn_copyi) ++ ldi r18,-8(r18) C E0 ++ blt r18,$Lend C E1 ++$Loop: ldl r0,0(r17) C E0 ++ ldl r1,8(r17) C E1 ++ ldl r2,16(r17) C E0 ++ ldl r3,24(r17) C E1 ++ ldl r4,32(r17) C E0 ++ ldl r5,40(r17) C E1 ++ ldl r6,48(r17) C E0 ++ ldl r7,56(r17) C E1 ++ stl r0,0(r16) C E0 ++ ldi r17,64(r17) C E1 ++ stl r1,8(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r2,16(r16) C E0 ++ ldi r18,-8(r18) C E1 ++ stl r3,24(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r4,32(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r5,40(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r6,48(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r7,56(r16) C E0 ++ ldi r16,64(r16) C E1 ++ bge r18,$Loop C E1 ++$Lend: ldi r18,7(r18) C E0 ++ blt r18,$Lret C E1 ++ ldl r0,0(r17) C E0 ++ beq r18,$Lend0 C E1 ++$Loop0: stl r0,0(r16) C E0 ++ ldi r16,8(r16) C E1 ++ ldl r0,8(r17) C E0 ++ ldi r18,-1(r18) C E1 ++ ldi r17,8(r17) C E0 ++ bgt r18,$Loop0 C E1 ++$Lend0: stl r0,0(r16) C E0 ++$Lret: ret r31,(r26),1 C E1 ++EPILOGUE(mpn_copyi) ++ASM_END() +diff --git a/mpn/sw_64/dive_1.c b/mpn/sw_64/dive_1.c +new file mode 100644 +index 0000000..b00eeca +--- /dev/null ++++ b/mpn/sw_64/dive_1.c +@@ -0,0 +1,105 @@ ++/* Sw_64 mpn_divexact_1 -- mpn by limb exact division. ++ ++ THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST ++ CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN ++ FUTURE GNU MP RELEASES. ++ ++Copyright 2000-2003 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#include "gmp-impl.h" ++#include "longlong.h" ++ ++ ++/* cycles/limb ++ SW6: 15.0 ++*/ ++ ++ ++/* The dependent chain is as follows (the same as modexact), and this is ++ what the code runs as. ++ ++ The time to load src[i+1] and establish x hides under the umulh latency. 
*/ ++ ++void ++mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor) ++{ ++ mp_limb_t inverse, lshift_mask, s, sr, s_next, c, h, x, y, q, dummy; ++ unsigned rshift, lshift; ++ ++ ASSERT (size >= 1); ++ ASSERT (divisor != 0); ++ ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size)); ++ ASSERT_MPN (src, size); ++ ASSERT_LIMB (divisor); ++ ++ s_next = *src++; /* src[0] */ ++ ++ rshift = 0; ++ lshift_mask = 0; ++ if ((divisor & 1) == 0) ++ { ++ count_trailing_zeros (rshift, divisor); ++ lshift_mask = MP_LIMB_T_MAX; ++ divisor >>= rshift; ++ } ++ ++ binvert_limb (inverse, divisor); ++ lshift = 64 - rshift; ++ ++ c = 0; ++ h = 0; ++ sr = s_next >> rshift; ++ ++ size--; ++ if (LIKELY (size != 0)) ++ { ++ do ++ { ++ s_next = *src++; /* src[i+1] */ ++ s = sr | ((s_next << lshift) & lshift_mask); ++ x = s - c; ++ c = s < c; ++ sr = s_next >> rshift; ++ ++ y = x - h; ++ c += (x < h); ++ q = y * inverse; ++ *dst++ = q; ++ umul_ppmm (h, dummy, q, divisor); ++ ++ size--; ++ } ++ while (size != 0); ++ } ++ ++ x = sr - c; ++ y = x - h; ++ q = y * inverse; ++ *dst = q; /* dst[size-1] */ ++} +diff --git a/mpn/sw_64/divrem_2.asm b/mpn/sw_64/divrem_2.asm +new file mode 100644 +index 0000000..0b22bee +--- /dev/null ++++ b/mpn/sw_64/divrem_2.asm +@@ -0,0 +1,175 @@ ++dnl Sw_64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. ++ ++dnl Copyright 2007, 2008, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C norm frac ++C sw6 29 29 ++ ++C TODO ++C * Perhaps inline mpn_invert_limb, that would allow us to not save/restore ++C any registers (thus save ~10 cycles per call). ++C * Use negated d1 and/or d0 to speed carry propagation. Might save a cycle ++C or two. ++C * Check cluster delays (for sw6). We very likely could save some cycles. ++C * Use branch-free code for computing di. ++C * CAVEAT: We rely on r19 not being clobbered by mpn_invert_limb call. 
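Why q = y * inverse in mpn_divexact_1 above needs no adjustment step: for odd d, inverse is d^-1 mod 2^64, so an exact quotient comes back from a single multiply. A hypothetical standalone check of that identity (names invented, not library code):

    #include <assert.h>
    #include <stdint.h>

    int main (void)
    {
      uint64_t d = 0x1234567890abcdefULL | 1;  /* any odd divisor */

      /* binvert_limb idea: Newton iteration for d^-1 mod 2^64.
         inv = d is correct to the low 3 bits; each step doubles that,
         so 5 steps give all 64.  */
      uint64_t inv = d;
      for (int i = 0; i < 5; i++)
        inv *= 2 - d * inv;

      uint64_t q = 0xdeadbeef12345ULL;
      uint64_t nn = q * d;              /* an exact multiple (mod 2^64) */
      assert (nn * inv == q);           /* quotient back with one mull */
      return 0;
    }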
++ ++C INPUT PARAMETERS ++define(`qp', `r16') ++define(`fn', `r17') ++define(`up_param', `r18') ++define(`un_param', `r19') ++define(`dp', `r20') ++ ++ASM_START() ++PROLOGUE(mpn_divrem_2,gp) ++ ldi r30, -80(r30) ++ stl r26, 0(r30) ++ stl r9, 8(r30) ++ stl r10, 16(r30) ++ stl r11, 24(r30) ++ stl r12, 32(r30) ++ stl r13, 40(r30) ++C stl r14, 48(r30) ++ stl r15, 56(r30) ++ .prologue 1 ++ stl r16, 64(r30) ++ bis r31, r17, r15 ++ s8addl r19, r18, r13 ++ ldi r13, -24(r13) ++ ldl r12, 8(r20) ++ ldl r10, 0(r20) ++ ldl r11, 16(r13) ++ ldl r9, 8(r13) ++ ++ bis r31, r31, r3 C most_significant_q_limb = 0 ++ cmpult r11, r12, r1 ++ bne r1, L(L8) ++ cmpule r11, r12, r1 ++ cmpult r9, r10, r2 ++ and r1, r2, r1 ++ bne r1, L(L8) ++ subl r11, r12, r11 ++ subl r11, r2, r11 ++ subl r9, r10, r9 ++ ldi r3, 1(r31) C most_significant_q_limb = 1 ++L(L8): stl r3, 72(r30) ++ ++ addl r15, r19, r19 ++ ldi r19, -3(r19) ++ blt r19, L(L10) ++ bis r31, r12, r16 ++ call r26, mpn_invert_limb ++ LDGP( r29, 0(r26)) ++ mull r0, r12, r4 C t0 = LO(di * d1) ++ umulh r0, r10, r2 C s1 = HI(di * d0) ++ addl r4, r10, r4 C t0 += d0 ++ cmpule r10, r4, r7 C (t0 < d0) ++ addl r4, r2, r4 C t0 += s1 ++ cmpult r4, r2, r1 ++ subl r1, r7, r7 C t1 (-1, 0, or 1) ++ blt r7, L(L42) ++L(L22): ++ ldi r0, -1(r0) C di-- ++ cmpult r4, r12, r1 C cy for: t0 -= d1 (below) ++ subl r7, r1, r7 C t1 -= cy ++ subl r4, r12, r4 C t0 -= d1 ++ bge r7, L(L22) ++L(L42): ++ ldl r16, 64(r30) ++ s8addl r19, r16, r16 ++ ALIGN(16) ++L(loop): ++ mull r11, r0, r5 C q0 (early) ++ umulh r11, r0, r6 C q (early) ++ addl r5, r9, r8 C q0 += n1 ++ addl r6, r11, r6 C q += n2 ++ cmpult r8, r5, r1 C cy for: q0 += n1 ++ addl r6, r1, r6 C q += cy ++ unop ++ mull r12, r6, r1 C LO(d1 * q) ++ umulh r10, r6, r7 C t1 = HI(d0 * q) ++ subl r9, r1, r9 C n1 -= LO(d1 * q) ++ mull r10, r6, r4 C t0 = LO(d0 * q) ++ unop ++ cmple r15, r19, r5 C condition and n0... ++ beq r5, L(L31) ++ ldl r5, 0(r13) ++ ldi r13, -8(r13) ++L(L31): subl r9, r12, r9 C n1 -= d1 ++ cmpult r5, r10, r1 C ++ subl r9, r1, r9 C ++ subl r5, r10, r5 C n0 -= d0 ++ subl r9, r7, r9 C n1 -= t0 ++ cmpult r5, r4, r1 C ++ subl r9, r1, r2 C ++ subl r5, r4, r5 C n0 -= t1 ++ cmpult r2, r8, r1 C (n1 < q0) ++ addl r6, r1, r6 C q += cond ++ ldi r1, -1(r1) C -(n1 >= q0) ++ and r1, r10, r4 C ++ addl r5, r4, r9 C n0 += mask & d0 ++ and r1, r12, r1 C ++ cmpult r9, r5, r11 C cy for: n0 += mask & d0 ++ addl r2, r1, r1 C n1 += mask & d1 ++ addl r1, r11, r11 C n1 += cy ++ cmpult r11, r12, r1 C ++ beq r1, L(fix) C ++L(bck): stl r6, 0(r16) ++ ldi r16, -8(r16) ++ ldi r19, -1(r19) ++ bge r19, L(loop) ++ ++L(L10): stl r9, 8(r13) ++ stl r11, 16(r13) ++ ldl r0, 72(r30) ++ ldl r26, 0(r30) ++ ldl r9, 8(r30) ++ ldl r10, 16(r30) ++ ldl r11, 24(r30) ++ ldl r12, 32(r30) ++ ldl r13, 40(r30) ++C ldl r14, 48(r30) ++ ldl r15, 56(r30) ++ ldi r30, 80(r30) ++ ret r31, (r26), 1 ++ ++L(fix): cmpule r11, r12, r1 ++ cmpult r9, r10, r2 ++ and r1, r2, r1 ++ bne r1, L(bck) ++ subl r11, r12, r11 ++ subl r11, r2, r11 ++ subl r9, r10, r9 ++ ldi r6, 1(r6) ++ br L(bck) ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/invert_limb.asm b/mpn/sw_64/invert_limb.asm +new file mode 100644 +index 0000000..e1d1b88 +--- /dev/null ++++ b/mpn/sw_64/invert_limb.asm +@@ -0,0 +1,93 @@ ++dnl Sw_64 mpn_invert_limb -- Invert a normalized limb. ++ ++dnl Copyright 1996, 2000-2003, 2007, 2011, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. 
++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 71/72 ++ ++C This was compiler generated, with minimal manual edits. Surely several ++C cycles could be cut with some thought. ++ ++ASM_START() ++PROLOGUE(mpn_invert_limb,gp) ++ LEA( r2, approx_tab) ++ srl r16, 54, r1 ++ srl r16, 24, r4 ++ and r16, 1, r5 ++ bic r1, 1, r7 ++ ldi r4, 1(r4) ++ srl r16, 1, r3 ++ addl r7, r2, r1 ++ifelse(bwx_available_p,1,` ++ ldhu r0, -512(r1) ++',` ++ ldl_u r0, -512(r1) ++ ext1b r0, r7, r0 ++') ++ addl r3, r5, r3 ++ mulw r0, r0, r1 ++ sll r0, 11, r0 ++ mull r1, r4, r1 ++ srl r1, 40, r1 ++ subl r0, r1, r0 ++ ldi r0, -1(r0) ++ mull r0, r0, r2 ++ sll r0, 60, r1 ++ sll r0, 13, r0 ++ mull r2, r4, r2 ++ subl r1, r2, r1 ++ srl r1, 47, r1 ++ addl r0, r1, r0 ++ mull r0, r3, r3 ++ srl r0, 1, r1 ++ seleq r5, 0, r1,r1 ++ subl r1, r3, r1 ++ umulh r1, r0, r3 ++ sll r0, 31, r0 ++ srl r3, 1, r1 ++ addl r0, r1, r0 ++ mull r0, r16, r2 ++ umulh r0, r16, r3 ++ addl r2, r16, r1 ++ addl r3, r16, r16 ++ cmpult r1, r2, r1 ++ addl r16, r1, r3 ++ subl r0, r3, r0 ++ ret r31, (r26), 1 ++EPILOGUE() ++DATASTART(approx_tab,8) ++forloop(i,256,512-1,dnl ++` .word eval(0x7fd00/i) ++')dnl ++ SIZE(approx_tab, 512) ++ TYPE(approx_tab, object) ++DATAEND() ++ASM_END() +diff --git a/mpn/sw_64/lshift.asm b/mpn/sw_64/lshift.asm +new file mode 100644 +index 0000000..03dad97 +--- /dev/null ++++ b/mpn/sw_64/lshift.asm +@@ -0,0 +1,180 @@ ++dnl Sw_64 mpn_lshift -- Shift a number left. ++ ++dnl Copyright 1994, 1995, 2000, 2003, 2009 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 1.75 ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++C cnt r19 ++ ++ ++ASM_START() ++PROLOGUE(mpn_lshift) ++ s8addl r18,r17,r17 C make r17 point at end of s1 ++ ldl r4,-8(r17) C load first limb ++ subl r31,r19,r20 ++ s8addl r18,r16,r16 C make r16 point at end of RES ++ subl r18,1,r18 ++ and r18,4-1,r28 C number of limbs in first loop ++ srl r4,r20,r0 C compute function result ++ ++ beq r28,L(L0) ++ subl r18,r28,r18 ++ ++ ALIGN(8) ++L(top0): ++ ldl r3,-16(r17) ++ subl r16,8,r16 ++ sll r4,r19,r5 ++ subl r17,8,r17 ++ subl r28,1,r28 ++ srl r3,r20,r6 ++ bis r3,r3,r4 ++ bis r5,r6,r8 ++ stl r8,0(r16) ++ bne r28,L(top0) ++ ++L(L0): sll r4,r19,r24 ++ beq r18,L(end) ++C warm up phase 1 ++ ldl r1,-16(r17) ++ subl r18,4,r18 ++ ldl r2,-24(r17) ++ ldl r3,-32(r17) ++ ldl r4,-40(r17) ++C warm up phase 2 ++ srl r1,r20,r7 ++ sll r1,r19,r21 ++ srl r2,r20,r8 ++ beq r18,L(end1) ++ ldl r1,-48(r17) ++ sll r2,r19,r22 ++ ldl r2,-56(r17) ++ srl r3,r20,r5 ++ bis r7,r24,r7 ++ sll r3,r19,r23 ++ bis r8,r21,r8 ++ srl r4,r20,r6 ++ ldl r3,-64(r17) ++ sll r4,r19,r24 ++ ldl r4,-72(r17) ++ subl r18,4,r18 ++ beq r18,L(end2) ++ ALIGN(16) ++C main loop ++L(top): stl r7,-8(r16) ++ bis r5,r22,r5 ++ stl r8,-16(r16) ++ bis r6,r23,r6 ++ ++ srl r1,r20,r7 ++ subl r18,4,r18 ++ sll r1,r19,r21 ++ unop C ldl r31,-96(r17) ++ ++ srl r2,r20,r8 ++ ldl r1,-80(r17) ++ sll r2,r19,r22 ++ ldl r2,-88(r17) ++ ++ stl r5,-24(r16) ++ bis r7,r24,r7 ++ stl r6,-32(r16) ++ bis r8,r21,r8 ++ ++ srl r3,r20,r5 ++ unop C ldl r31,-96(r17) ++ sll r3,r19,r23 ++ subl r16,32,r16 ++ ++ srl r4,r20,r6 ++ ldl r3,-96(r17) ++ sll r4,r19,r24 ++ ldl r4,-104(r17) ++ ++ subl r17,32,r17 ++ bne r18,L(top) ++C cool down phase 2/1 ++L(end2): ++ stl r7,-8(r16) ++ bis r5,r22,r5 ++ stl r8,-16(r16) ++ bis r6,r23,r6 ++ srl r1,r20,r7 ++ sll r1,r19,r21 ++ srl r2,r20,r8 ++ sll r2,r19,r22 ++ stl r5,-24(r16) ++ bis r7,r24,r7 ++ stl r6,-32(r16) ++ bis r8,r21,r8 ++ srl r3,r20,r5 ++ sll r3,r19,r23 ++ srl r4,r20,r6 ++ sll r4,r19,r24 ++C cool down phase 2/2 ++ stl r7,-40(r16) ++ bis r5,r22,r5 ++ stl r8,-48(r16) ++ bis r6,r23,r6 ++ stl r5,-56(r16) ++ stl r6,-64(r16) ++C cool down phase 2/3 ++ stl r24,-72(r16) ++ ret r31,(r26),1 ++ ++C cool down phase 1/1 ++L(end1): ++ sll r2,r19,r22 ++ srl r3,r20,r5 ++ bis r7,r24,r7 ++ sll r3,r19,r23 ++ bis r8,r21,r8 ++ srl r4,r20,r6 ++ sll r4,r19,r24 ++C cool down phase 1/2 ++ stl r7,-8(r16) ++ bis r5,r22,r5 ++ stl r8,-16(r16) ++ bis r6,r23,r6 ++ stl r5,-24(r16) ++ stl r6,-32(r16) ++ stl r24,-40(r16) ++ ret r31,(r26),1 ++ ++L(end): stl r24,-8(r16) ++ ret r31,(r26),1 ++EPILOGUE(mpn_lshift) ++ASM_END() +diff --git a/mpn/sw_64/mod_34lsub1.asm b/mpn/sw_64/mod_34lsub1.asm +new file mode 100644 +index 0000000..8f7b049 +--- /dev/null ++++ b/mpn/sw_64/mod_34lsub1.asm +@@ -0,0 +1,162 @@ ++dnl Sw_64 mpn_mod_34lsub1. ++ ++dnl Copyright 2002 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. 
++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 1.67 ++ ++ ++dnl INPUT PARAMETERS ++dnl up r16 ++dnl n r17 ++ ++define(`l0',`r18') ++define(`l1',`r19') ++define(`l2',`r20') ++define(`a0',`r21') ++define(`a1',`r22') ++define(`a2',`r23') ++define(`c0',`r24') ++define(`c1',`r5') ++define(`c2',`r6') ++ ++ASM_START() ++PROLOGUE(mpn_mod_34lsub1) ++ bis r31, r31, c0 ++ bis r31, r31, c1 ++ bis r31, r31, c2 ++ ++ ldi r17, -3(r17) ++ bge r17, $L_3_or_more ++ bis r31, r31, a0 ++ bis r31, r31, a1 ++ bis r31, r31, a2 ++ br r31, $L_012 ++ ++$L_3_or_more: ++ ldl a0, 0(r16) ++ ldl a1, 8(r16) ++ ldl a2, 16(r16) ++ ldi r16, 24(r16) ++ ldi r17, -3(r17) ++ blt r17, $L_012 ++ ++$L_6_or_more: ++ ldl l0, 0(r16) ++ ldl l1, 8(r16) ++ ldl l2, 16(r16) ++ addl l0, a0, a0 ++ ++ ldi r16, 24(r16) ++ ldi r17, -3(r17) ++ blt r17, $L_end ++ ++ ALIGN(16) ++C Main loop ++$L_9_or_more: ++$Loop: cmpult a0, l0, r0 ++ ldl l0, 0(r16) ++ addl r0, c0, c0 ++ addl l1, a1, a1 ++ cmpult a1, l1, r0 ++ ldl l1, 8(r16) ++ addl r0, c1, c1 ++ addl l2, a2, a2 ++ cmpult a2, l2, r0 ++ ldl l2, 16(r16) ++ addl r0, c2, c2 ++ addl l0, a0, a0 ++ ldi r16, 24(r16) ++ ldi r17, -3(r17) ++ bge r17, $Loop ++ ++$L_end: cmpult a0, l0, r0 ++ addl r0, c0, c0 ++ addl l1, a1, a1 ++ cmpult a1, l1, r0 ++ addl r0, c1, c1 ++ addl l2, a2, a2 ++ cmpult a2, l2, r0 ++ addl r0, c2, c2 ++ ++C Handle the last (n mod 3) limbs ++$L_012: ldi r17, 2(r17) ++ blt r17, $L_0 ++ ldl l0, 0(r16) ++ addl l0, a0, a0 ++ cmpult a0, l0, r0 ++ addl r0, c0, c0 ++ beq r17, $L_0 ++ ldl l1, 8(r16) ++ addl l1, a1, a1 ++ cmpult a1, l1, r0 ++ addl r0, c1, c1 ++ ++C Align and sum our 3 main accumulators and 3 carry accumulators ++$L_0: srl a0, 48, r2 ++ srl a1, 32, r4 ++ifdef(`HAVE_LIMB_LITTLE_ENDIAN', ++` ins2b a1, 2, r1', C (a1 & 0xffffffff) << 16 ++` zapnot a1, 15, r25 ++ sll r25, 16, r1') ++ zapnot a0, 63, r0 C a0 & 0xffffffffffff ++ srl a2, 16, a1 ++ifdef(`HAVE_LIMB_LITTLE_ENDIAN', ++` ins1b a2, 4, r3', C (a2 & 0xffff) << 32 ++` zapnot a2, 3, r25 ++ sll r25, 32, r3') ++ addl r1, r4, r1 ++ addl r0, r2, r0 ++ srl c0, 32, a2 ++ifdef(`HAVE_LIMB_LITTLE_ENDIAN', ++` ins2b c0, 2, r4', C (c0 & 0xffffffff) << 16 ++` zapnot c0, 15, r25 ++ sll r25, 16, r4') ++ addl r0, r1, r0 ++ addl r3, a1, r3 ++ addl r0, r3, r0 ++ srl c1, 16, c0 ++ifdef(`HAVE_LIMB_LITTLE_ENDIAN', ++` ins1b c1, 4, r2', C (c1 & 0xffff) << 32 ++` zapnot c1, 3, r25 ++ sll r25, 32, r2') ++ addl r4, a2, r4 ++C srl c2, 48, r3 C This will be 0 in practise ++ zapnot c2, 63, r1 C r1 = c2 & 0xffffffffffff ++ addl r0, r4, r0 ++ addl r2, c0, r2 ++ addl r0, r2, r0 ++C addl r1, r3, r1 ++ addl r0, r1, r0 ++ ++ ret r31, (r26), 1 ++EPILOGUE(mpn_mod_34lsub1) ++ASM_END() +diff --git a/mpn/sw_64/mode1o.asm b/mpn/sw_64/mode1o.asm +new file mode 100644 +index 0000000..d5ed23f +--- /dev/null ++++ b/mpn/sw_64/mode1o.asm +@@ -0,0 +1,192 @@ ++dnl Sw_64 mpn_modexact_1c_odd -- mpn exact remainder ++ ++dnl Copyright 2003, 2004 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. 
++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C cycles/limb ++C SW6: 15 ++ ++ ++C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, ++C mp_limb_t c) ++C In each case, the load latency, loop control, and extra carry bit handling ++C hide under the multiply latencies. Those latencies are long enough that ++C we don't need to worry about alignment or pairing to squeeze out ++C performance. ++C ++C For the first limb, some of the loop code is broken out and scheduled back ++C since it can be done earlier. ++C ++C - The first ldl src[0] is near the start of the routine, for maximum ++C time from memory. ++C ++C - The subl y=x-climb can be done without waiting for the inverse. ++C ++C - The mull y*inverse is replicated after the final subl for the inverse, ++C instead of branching to the mull in the main loop. 
++C
++C For the last limb, high<d is tested, and in that case the final umulh
++C can be replaced by a subtract and addback, shortening the chain.
++
++ASM_START()
++PROLOGUE(mpn_modexact_1c_odd,gp)
++ LEA( r0, binvert_limb_table)
++ srl r18, 1, r20 C d >> 1
++
++ and r20, 127, r20 C idx = d>>1 & 0x7F
++
++ addl r0, r20, r21 C table + idx
++
++ifelse(bwx_available_p,1,
++` ldbu r20, 0(r21) C table[idx], inverse 8 bits
++',`
++ ldl_u r20, 0(r21) C table[idx] qword
++ ext0b r20, r21, r20 C table[idx], inverse 8 bits
++')
++
++ mulw r20, r20, r7 C i*i
++ addl r20, r20, r20 C 2*i
++
++ ldl r2, 0(r16) C x = s = src[0]
++ ldi r17, -1(r17) C size--
++ clr r0 C initial cbit=0
++
++ mulw r7, r18, r7 C i*i*d
++
++ subl r20, r7, r20 C 2*i-i*i*d, inverse 16 bits
++
++ mulw r20, r20, r7 C i*i
++ addl r20, r20, r20 C 2*i
++
++ mulw r7, r18, r7 C i*i*d
++
++ subl r20, r7, r20 C 2*i-i*i*d, inverse 32 bits
++
++ mull r20, r20, r7 C i*i
++ addl r20, r20, r20 C 2*i
++
++ mull r7, r18, r7 C i*i*d
++ subl r2, r19, r3 C y = x - climb
++
++ subl r20, r7, r20 C inv = 2*i-i*i*d, inverse 64 bits
++
++ASSERT(r7, C should have d*inv==1 mod 2^64
++` mull r18, r20, r7
++ cmpeq r7, 1, r7')
++
++ mull r3, r20, r4 C first q = y * inv
++
++ beq r17, L(one) C if size==1
++ br L(entry)
++
++
++L(top):
++ C r0 cbit
++ C r16 src, incrementing
++ C r17 size, decrementing
++ C r18 d
++ C r19 climb
++ C r20 inv
++
++ ldl r1, 0(r16) C s = src[i]
++ subl r1, r0, r2 C x = s - cbit
++ cmpult r1, r0, r0 C new cbit = s < cbit
++
++ subl r2, r19, r3 C y = x - climb
++
++ mull r3, r20, r4 C q = y * inv
++L(entry):
++ cmpult r2, r19, r5 C cbit2 = x < climb
++ addl r5, r0, r0 C cbit += cbit2
++ ldi r16, 8(r16) C src++
++ ldi r17, -1(r17) C size--
++
++ umulh r4, r18, r19 C climb = high (q * d)
++ bne r17, L(top) C while 2 or more limbs left
++
++
++
++ C r0 cbit
++ C r18 d
++ C r19 climb
++ C r20 inv
++
++ ldl r1, 0(r16) C s = src[size-1] high limb
++
++ cmpult r1, r18, r2 C test high<d
++EPILOGUE()
++ASM_END()
+diff --git a/mpn/sw_64/sub_n.asm b/mpn/sw_64/sub_n.asm
+new file mode 100644
+--- /dev/null
++++ b/mpn/sw_64/sub_n.asm
++dnl Sw_64 mpn_sub_n -- Subtract two limb vectors of the same length > 0
++dnl and store difference in a third limb vector.
++
++dnl Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl * the GNU Lesser General Public License as published by the Free
++dnl Software Foundation; either version 3 of the License, or (at your
++dnl option) any later version.
++dnl
++dnl or
++dnl
++dnl * the GNU General Public License as published by the Free Software
++dnl Foundation; either version 2 of the License, or (at your option) any
++dnl later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
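++
++dnl Each limb takes two subtracts (the main subtract and the borrow
++dnl propagation) and two compares; sketched in hypothetical C,
++dnl illustrative only and not part of this file:
++dnl
++dnl    t  = s1 - s2;     /* main subtract */
++dnl    c1 = s1 < s2;     /* cy from main subtract */
++dnl    r  = t - cy;      /* carry subtract */
++dnl    c2 = t < cy;      /* cy from carry subtract */
++dnl    cy = c1 | c2;     /* bis suffices: c2 needs t == 0 and cy == 1,
++dnl                         and t == 0 forces c1 == 0 */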
++
++include(`../config.m4')
++
++C cycles/limb
++C SW6: 3
++
++dnl INPUT PARAMETERS
++dnl res_ptr r16
++dnl s1_ptr r17
++dnl s2_ptr r18
++dnl size r19
++
++ASM_START()
++PROLOGUE(mpn_sub_nc)
++ bis r31,r20,r25
++ br L(com)
++EPILOGUE()
++PROLOGUE(mpn_sub_n)
++ bis r31,r31,r25 C clear cy
++L(com): subl r19,4,r19 C decr loop cnt
++ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
++C Start software pipeline for 1st loop
++ ldl r0,0(r18)
++ ldl r4,0(r17)
++ ldl r1,8(r18)
++ ldl r5,8(r17)
++ addl r17,32,r17 C update s1_ptr
++ subl r4,r0,r28 C 1st main subtract
++ ldl r2,16(r18)
++ subl r28,r25,r20 C 1st carry subtract
++ ldl r3,24(r18)
++ cmpult r4,r0,r8 C compute cy from last subtract
++ ldl r6,-16(r17)
++ cmpult r28,r25,r25 C compute cy from last subtract
++ ldl r7,-8(r17)
++ bis r8,r25,r25 C combine cy from the two subtracts
++ subl r19,4,r19 C decr loop cnt
++ subl r5,r1,r28 C 2nd main subtract
++ addl r18,32,r18 C update s2_ptr
++ subl r28,r25,r21 C 2nd carry subtract
++ cmpult r5,r1,r8 C compute cy from last subtract
++ blt r19,$Lend1 C if less than 4 limbs remain, jump
++C 1st loop handles groups of 4 limbs in a software pipeline
++ ALIGN(16)
++$Loop: cmpult r28,r25,r25 C compute cy from last subtract
++ ldl r0,0(r18)
++ bis r8,r25,r25 C combine cy from the two subtracts
++ ldl r1,8(r18)
++ subl r6,r2,r28 C 3rd main subtract
++ ldl r4,0(r17)
++ subl r28,r25,r22 C 3rd carry subtract
++ ldl r5,8(r17)
++ cmpult r6,r2,r8 C compute cy from last subtract
++ cmpult r28,r25,r25 C compute cy from last subtract
++ stl r20,0(r16)
++ bis r8,r25,r25 C combine cy from the two subtracts
++ stl r21,8(r16)
++ subl r7,r3,r28 C 4th main subtract
++ subl r28,r25,r23 C 4th carry subtract
++ cmpult r7,r3,r8 C compute cy from last subtract
++ cmpult r28,r25,r25 C compute cy from last subtract
++ addl r17,32,r17 C update s1_ptr
++ bis r8,r25,r25 C combine cy from the two subtracts
++ addl r16,32,r16 C update res_ptr
++ subl r4,r0,r28 C 1st main subtract
++ ldl r2,16(r18)
++ subl r28,r25,r20 C 1st carry subtract
++ ldl r3,24(r18)
++ cmpult r4,r0,r8 C compute cy from last subtract
++ ldl r6,-16(r17)
++ cmpult r28,r25,r25 C compute cy from last subtract
++ ldl r7,-8(r17)
++ bis r8,r25,r25 C combine cy from the two subtracts
++ subl r19,4,r19 C decr loop cnt
++ stl r22,-16(r16)
++ subl r5,r1,r28 C 2nd main subtract
++ stl r23,-8(r16)
++ subl r28,r25,r21 C 2nd carry subtract
++ addl r18,32,r18 C update s2_ptr
++ cmpult r5,r1,r8 C compute cy from last subtract
++ bge r19,$Loop
++C Finish software pipeline for 1st loop
++$Lend1: cmpult r28,r25,r25 C compute cy from last subtract
++ bis r8,r25,r25 C combine cy from the two subtracts
++ subl r6,r2,r28 C 3rd main subtract
++ subl r28,r25,r22 C 3rd carry subtract
++ cmpult r6,r2,r8 C compute cy from last subtract
++ cmpult r28,r25,r25 C compute cy from last subtract
++ stl r20,0(r16)
++ bis r8,r25,r25 C combine cy from the two subtracts
++ stl r21,8(r16)
++ subl r7,r3,r28 C 4th main subtract
++ subl r28,r25,r23 C 4th carry subtract
++ cmpult r7,r3,r8 C compute cy from last subtract
++ cmpult r28,r25,r25 C compute cy from last subtract
++ bis r8,r25,r25 C combine cy from the two subtracts
++ addl r16,32,r16 C update res_ptr
++ stl r22,-16(r16)
++ stl r23,-8(r16)
++$Lend2: addl r19,4,r19 C restore loop cnt
++ beq r19,$Lret
++C Start software pipeline for 2nd loop
++ ldl r0,0(r18)
++ ldl r4,0(r17)
++ subl r19,1,r19
++ beq r19,$Lend0
++C 2nd loop handles remaining 1-3 limbs
++ ALIGN(16)
++$Loop0: subl r4,r0,r28 C main subtract
++ cmpult r4,r0,r8 C compute cy from last subtract
++ ldl r0,8(r18)
++ ldl r4,8(r17) ++ subl r28,r25,r20 C carry subtract ++ addl r18,8,r18 ++ addl r17,8,r17 ++ stl r20,0(r16) ++ cmpult r28,r25,r25 C compute cy from last subtract ++ subl r19,1,r19 C decr loop cnt ++ bis r8,r25,r25 C combine cy from the two subtracts ++ addl r16,8,r16 ++ bne r19,$Loop0 ++$Lend0: subl r4,r0,r28 C main subtract ++ subl r28,r25,r20 C carry subtract ++ cmpult r4,r0,r8 C compute cy from last subtract ++ cmpult r28,r25,r25 C compute cy from last subtract ++ stl r20,0(r16) ++ bis r8,r25,r25 C combine cy from the two subtracts ++ ++$Lret: bis r25,r31,r0 C return cy ++ ret r31,(r26),1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/submul_1.asm b/mpn/sw_64/submul_1.asm +new file mode 100644 +index 0000000..8558ed0 +--- /dev/null ++++ b/mpn/sw_64/submul_1.asm +@@ -0,0 +1,97 @@ ++dnl Sw_64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract ++dnl the result from a second limb vector. ++ ++dnl Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
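++
++dnl One limb of the loop, sketched in hypothetical C (illustrative only,
++dnl not part of this file; umul_ppmm stands for the mull/umulh pair and
++dnl v for the multiplier limb in r19):
++dnl
++dnl    umul_ppmm (prod_high, prod_low, s1_limb, v);
++dnl    prod_low += cy_limb;            /* add carry-in */
++dnl    c1 = prod_low < cy_limb;        /* carry out of that */
++dnl    r = *res_ptr;
++dnl    *res_ptr = r - prod_low;        /* subtract from res limb */
++dnl    c2 = r < prod_low;              /* borrow out of that */
++dnl    cy_limb = prod_high + c1 + c2;  /* next carry; fits in a limb */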
++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 7 ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++C limb r19 ++ ++ ++ASM_START() ++PROLOGUE(mpn_submul_1) ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ subl r18,1,r18 C size-- ++ mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ umulh r2,r19,r0 C r0 = prod_high ++ beq r18,$Lend1 C jump if size was == 1 ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ subl r18,1,r18 C size-- ++ subl r5,r3,r3 ++ cmpult r5,r3,r4 ++ stl r3,0(r16) ++ addl r16,8,r16 C res_ptr++ ++ beq r18,$Lend2 C jump if size was == 2 ++ ++ ALIGN(8) ++$Loop: mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ subl r18,1,r18 C size-- ++ umulh r2,r19,r4 C r4 = cy_limb ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ subl r5,r3,r3 ++ cmpult r5,r3,r5 ++ stl r3,0(r16) ++ addl r16,8,r16 C res_ptr++ ++ addl r5,r0,r0 C combine carries ++ bne r18,$Loop ++ ++$Lend2: mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ umulh r2,r19,r4 C r4 = cy_limb ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ subl r5,r3,r3 ++ cmpult r5,r3,r5 ++ stl r3,0(r16) ++ addl r5,r0,r0 C combine carries ++ addl r4,r0,r0 C cy_limb = prod_high + cy ++ ret r31,(r26),1 ++$Lend1: subl r5,r3,r3 ++ cmpult r5,r3,r5 ++ stl r3,0(r16) ++ addl r0,r5,r0 ++ ret r31,(r26),1 ++EPILOGUE(mpn_submul_1) ++ASM_END() +diff --git a/mpn/sw_64/umul.asm b/mpn/sw_64/umul.asm +new file mode 100644 +index 0000000..bb6d56c +--- /dev/null ++++ b/mpn/sw_64/umul.asm +@@ -0,0 +1,44 @@ ++dnl mpn_umul_ppmm -- 1x1->2 limb multiplication ++ ++dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
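++
++dnl Usage sketch in C (illustrative only): the function returns the high
++dnl half of the double-limb product and stores the low half through the
++dnl pointer:
++dnl
++dnl    mp_limb_t lo, hi;
++dnl    hi = mpn_umul_ppmm (&lo, m1, m2);   /* hi:lo = m1 * m2 */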
++ ++include(`../config.m4') ++ ++ ++C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2); ++C ++ ++ASM_START() ++PROLOGUE(mpn_umul_ppmm) ++ mull r17, r18, r1 ++ umulh r17, r18, r0 ++ stl r1, 0(r16) ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +-- +2.25.1 + diff --git a/gmp.spec b/gmp.spec index 1d672b0633c3bbda614fb4939b85a5146871e967..1de570a58c1e2fcbae3439b6a4b9c2631d8f8d54 100644 --- a/gmp.spec +++ b/gmp.spec @@ -1,6 +1,6 @@ Name: gmp Version: 6.3.0 -Release: 4 +Release: 5 Epoch: 1 URL: https://gmplib.org Source0: https://gmplib.org/download/gmp/gmp-%{version}.tar.xz @@ -8,6 +8,11 @@ License: (LGPL-3.0-or-later OR GPL-2.0-or-later OR (LGPL-3.0-or-later AND GPL-2. BuildRequires: gcc gcc-c++ make m4 Summary: A GNU multiple precision arithmetic library +# Sw64 Port +Patch0001: 0001-Sw64-Port-add-configure-support-for-sw64.patch +Patch0002: 0002-Sw64-Port-add-mpn-configure-support-for-sw64.patch +Patch0003: 0003-Sw64-Port-add-mpn-asm-support-for-sw64.patch + %description GMP is a portable library written in C for arbitrary precision arithmetic on integers, rational numbers, and floating-point numbers. It aims to provide @@ -83,6 +88,9 @@ export LD_LIBRARY_PATH=`pwd`/.libs %{_libdir}/libgmpxx.so.* %changelog +* Tue Jul 22 2025 swcompiler - 1:6.3.0-5 +- Sw64 Port gmp + * Tue Jul 01 2025 wangxiao - 1:6.3.0-4 - delete macros in changelog