diff --git a/gmp-6.2.1-sw_64.patch b/gmp-6.2.1-sw_64.patch
new file mode 100644
index 0000000000000000000000000000000000000000..b2b50c8f8fd111f063168912fd0a5cb859cdbb48
--- /dev/null
+++ b/gmp-6.2.1-sw_64.patch
@@ -0,0 +1,9668 @@
+diff -Naru gmp-6.2.1/config.in gmp-6.2.1-sw/config.in
+--- gmp-6.2.1/config.in 2020-11-14 18:45:15.000000000 +0000
++++ gmp-6.2.1-sw/config.in 2022-08-26 07:24:34.611601453 +0000
+@@ -146,6 +146,7 @@
+ If your CPU is not in any of these families, leave all undefined.
+ For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */
+ #undef HAVE_HOST_CPU_FAMILY_alpha
++#undef HAVE_HOST_CPU_FAMILY_sw_64
+ #undef HAVE_HOST_CPU_FAMILY_m68k
+ #undef HAVE_HOST_CPU_FAMILY_power
+ #undef HAVE_HOST_CPU_FAMILY_powerpc
+@@ -157,6 +158,9 @@
+ #undef HAVE_HOST_CPU_alphaev67
+ #undef HAVE_HOST_CPU_alphaev68
+ #undef HAVE_HOST_CPU_alphaev7
++#undef HAVE_HOST_CPU_sw_64sw6a
++#undef HAVE_HOST_CPU_sw_64sw6b
++#undef HAVE_HOST_CPU_sw_64sw6
+ #undef HAVE_HOST_CPU_m68020
+ #undef HAVE_HOST_CPU_m68030
+ #undef HAVE_HOST_CPU_m68040
+diff -Naru gmp-6.2.1/configure gmp-6.2.1-sw/configure
+--- gmp-6.2.1/configure 2020-11-14 18:45:15.000000000 +0000
++++ gmp-6.2.1-sw/configure 2022-08-26 07:32:21.872463477 +0000
+@@ -4235,6 +4235,119 @@
+ esac
+ ;;
+
++#__sw_64__ start
++ sw_64*-*-*)
++ $as_echo "#define HAVE_HOST_CPU_FAMILY_sw_64 1" >>confdefs.h
++
++ case $host_cpu in
++ sw_64sw6)
++ path="sw_64/sw6 sw_64" ;;
++ sw_64sw6a | sw_64sw6b | sw_64sw6*)
++ path="sw_64/sw6a sw_64/sw6b sw_64" ;;
++ *)
++ path="sw_64" ;;
++ esac
++ if test "$enable_assembly" = "yes" ; then
++ extra_functions="cntlz"
++ fi
++ gcc_cflags_optlist="asm cpu oldas" # need asm ahead of cpu, see below
++ gcc_cflags_maybe="-mieee"
++ gcc_cflags_oldas="-Wa,-oldas" # see GMP_GCC_WA_OLDAS.
++
++ # Some gcc versions error out too easily with higher -mcpu= settings and
++ # are rejected by GMP_PROG_CC_WORKS, so plain -mcpu=sw6 is the default below.
++ #
++ case $host_cpu in
++ sw_64) gcc_cflags_cpu="-mcpu=sw6" ;;
++ sw_64sw6) gcc_cflags_cpu="-mcpu=sw6" ;;
++ sw_64sw6a | sw_64sw6b)
++ gcc_cflags_cpu="-mcpu=sw6a -mcpu=sw6b" ;;
++ esac
++
++ # gcc version "2.9-gnupro-99r1" on sw_64sw68-dec-osf5.1 has been seen
++ # accepting -mcpu=sw6, but not putting the assembler in the right mode
++ # for what it produces. We need to do this for it, and need to do it
++ # before testing the -mcpu options.
++ #
++ # On old versions of gcc, which don't know -mcpu=, we believe an
++ # explicit -Wa,-mev5 etc will be necessary to put the assembler in
++ # the right mode for our .asm files and longlong.h asm blocks.
++ #
++ # On newer versions of gcc, when -mcpu= is known, we must give a -Wa
++ # which is at least as high as the code gcc will generate. gcc
++ # establishes what it needs with a ".arch" directive, our command line
++ # option seems to override that.
++ #
++ # gas prior to 2.14 doesn't accept -msw67, but -msw6 seems enough for
++ # ctlz and cttz (in 2.10.0 at least).
++ #
++ # OSF `as' accepts sw68 but stupidly treats it as ev4. -arch only seems
++ # to affect insns like ldbu which are expanded as macros when necessary.
++ # Insns like ctlz which were never available as macros are always
++ # accepted and always generate their plain code.
++ #
++ case $host_cpu in
++ sw_64) gcc_cflags_asm="-Wa,-arch,sw6 -Wa,-msw6" ;;
++ sw_64sw6) gcc_cflags_asm="-Wa,-arch,sw6 -Wa,-msw6" ;;
++ sw_64sw6a | sw_64sw6b)
++ gcc_cflags_asm="-Wa,-arch,sw6a -Wa,-msw6a -Wa,-arch,sw6b -Wa,-msw6b" ;;
++ esac
++
++ # It might be better to ask "cc" whether it's Cray C or DEC C,
++ # instead of relying on the OS part of $host. But it's hard to
++ # imagine either of those compilers anywhere except their native
++ # systems.
++ #
++
++echo "include_mpn(\`sw_64/sw_64-defs.m4')" >> $gmp_tmpconfigm4i
++
++ case $host in
++ *-cray-unicos*)
++ cc_cflags="-O" # no -g, it silently disables all optimizations
++
++echo "include_mpn(\`sw_64/unicos.m4')" >> $gmp_tmpconfigm4i
++
++ # Don't perform any assembly syntax tests on this beast.
++
++gmp_asm_syntax_testing=no
++ ;;
++ *-*-osf*)
++
++echo "include_mpn(\`sw_64/default.m4')" >> $gmp_tmpconfigm4i
++
++ cc_cflags=""
++ cc_cflags_optlist="opt cpu"
++
++ # not sure if -fast works on old versions, so make it optional
++ cc_cflags_opt="-fast -O2"
++
++ # DEC C V5.9-005 knows ev4, ev5, ev56, pca56, sw6.
++ # Compaq C V6.3-029 adds sw67.
++ #
++ case $host_cpu in
++ sw_64) cc_cflags_cpu="-arch~sw6~-tune~sw6" ;;
++ sw_64sw6) cc_cflags_cpu="-arch~sw6~-tune~sw6" ;;
++ sw_64sw6a | sw_64sw6b)
++ cc_cflags_cpu="-arch~sw6a~-tune~sw6a -arch~sw6b~-tune~sw6b" ;;
++ esac
++ ;;
++ *)
++
++echo "include_mpn(\`sw_64/default.m4')" >> $gmp_tmpconfigm4i
++
++ ;;
++ esac
++
++ case $host in
++ *-*-unicos*)
++ # tune/sw_64.asm assumes int==4bytes but unicos uses int==8bytes
++ ;;
++ *)
++ SPEED_CYCLECOUNTER_OBJ=sw_64.lo
++ cyclecounter_size=1 ;;
++ esac
++ ;;
++#__sw_64__ end
+
+ # Cray vector machines.
+ # This must come after alpha* so that we can recognize present and future
+diff -Naru gmp-6.2.1/configure.ac gmp-6.2.1-sw/configure.ac
+--- gmp-6.2.1/configure.ac 2020-11-14 18:45:09.000000000 +0000
++++ gmp-6.2.1-sw/configure.ac 2022-08-26 07:37:01.582987131 +0000
+@@ -293,6 +293,7 @@
+ If your CPU is not in any of these families, leave all undefined.
+ For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */
+ #undef HAVE_HOST_CPU_FAMILY_alpha
++#undef HAVE_HOST_CPU_FAMILY_sw_64
+ #undef HAVE_HOST_CPU_FAMILY_m68k
+ #undef HAVE_HOST_CPU_FAMILY_power
+ #undef HAVE_HOST_CPU_FAMILY_powerpc
+@@ -304,6 +305,9 @@
+ #undef HAVE_HOST_CPU_alphaev67
+ #undef HAVE_HOST_CPU_alphaev68
+ #undef HAVE_HOST_CPU_alphaev7
++#undef HAVE_HOST_CPU_sw_64sw6a
++#undef HAVE_HOST_CPU_sw_64sw6b
++#undef HAVE_HOST_CPU_sw_64sw6
+ #undef HAVE_HOST_CPU_m68020
+ #undef HAVE_HOST_CPU_m68030
+ #undef HAVE_HOST_CPU_m68040
+@@ -597,6 +601,110 @@
+ esac
+ ;;
+
++#__sw_64__ start
++ sw_64*-*-*)
++ AC_DEFINE(HAVE_HOST_CPU_FAMILY_sw_64)
++ case $host_cpu in
++ sw_64sw6)
++ path="sw_64/sw6 sw_64" ;;
++ sw_64sw6a | sw_64sw6b | sw_64sw6*)
++ path="sw_64/sw6a sw_64/sw6b sw_64" ;;
++ *)
++ path="sw_64" ;;
++ esac
++ if test "$enable_assembly" = "yes" ; then
++ extra_functions="cntlz"
++ fi
++ gcc_cflags_optlist="asm cpu oldas" # need asm ahead of cpu, see below
++ gcc_cflags_maybe="-mieee"
++ gcc_cflags_oldas="-Wa,-oldas" # see GMP_GCC_WA_OLDAS.
++
++ # Some gcc versions error out too easily with higher -mcpu= settings and
++ # are rejected by GMP_PROG_CC_WORKS, so plain -mcpu=sw6 is the default below.
++ #
++ case $host_cpu in
++ sw_64) gcc_cflags_cpu="-mcpu=sw6" ;;
++ sw_64sw6) gcc_cflags_cpu="-mcpu=sw6" ;;
++ sw_64sw6a | sw_64sw6b)
++ gcc_cflags_cpu="-mcpu=sw6a -mcpu=sw6b" ;;
++ esac
++
++ # gcc version "2.9-gnupro-99r1" on sw_64sw68-dec-osf5.1 has been seen
++ # accepting -mcpu=sw6, but not putting the assembler in the right mode
++ # for what it produces. We need to do this for it, and need to do it
++ # before testing the -mcpu options.
++ #
++ # On old versions of gcc, which don't know -mcpu=, we believe an
++ # explicit -Wa,-mev5 etc will be necessary to put the assembler in
++ # the right mode for our .asm files and longlong.h asm blocks.
++ #
++ # On newer versions of gcc, when -mcpu= is known, we must give a -Wa
++ # which is at least as high as the code gcc will generate. gcc
++ # establishes what it needs with a ".arch" directive, our command line
++ # option seems to override that.
++ #
++ # gas prior to 2.14 doesn't accept -msw67, but -msw6 seems enough for
++ # ctlz and cttz (in 2.10.0 at least).
++ #
++ # OSF `as' accepts sw68 but stupidly treats it as ev4. -arch only seems
++ # to affect insns like ldbu which are expanded as macros when necessary.
++ # Insns like ctlz which were never available as macros are always
++ # accepted and always generate their plain code.
++ #
++ case $host_cpu in
++ sw_64) gcc_cflags_asm="-Wa,-arch,sw6 -Wa,-msw6" ;;
++ sw_64sw6) gcc_cflags_asm="-Wa,-arch,sw6 -Wa,-msw6" ;;
++ sw_64sw6a | sw_64sw6b)
++ gcc_cflags_asm="-Wa,-arch,sw6a -Wa,-msw6a -Wa,-arch,sw6b -Wa,-msw6b" ;;
++ esac
++
++ # It might be better to ask "cc" whether it's Cray C or DEC C,
++ # instead of relying on the OS part of $host. But it's hard to
++ # imagine either of those compilers anywhere except their native
++ # systems.
++ #
++ GMP_INCLUDE_MPN(sw_64/sw_64-defs.m4)
++ case $host in
++ *-cray-unicos*)
++ cc_cflags="-O" # no -g, it silently disables all optimizations
++ GMP_INCLUDE_MPN(sw_64/unicos.m4)
++ # Don't perform any assembly syntax tests on this beast.
++ gmp_asm_syntax_testing=no
++ ;;
++ *-*-osf*)
++ GMP_INCLUDE_MPN(sw_64/default.m4)
++ cc_cflags=""
++ cc_cflags_optlist="opt cpu"
++
++ # not sure if -fast works on old versions, so make it optional
++ cc_cflags_opt="-fast -O2"
++
++ # DEC C V5.9-005 knows ev4, ev5, ev56, pca56, sw6.
++ # Compaq C V6.3-029 adds sw67.
++ #
++ case $host_cpu in
++ sw_64) cc_cflags_cpu="-arch~sw6~-tune~sw6" ;;
++ sw_64sw6) cc_cflags_cpu="-arch~sw6~-tune~sw6" ;;
++ sw_64sw6a | sw_64sw6b)
++ cc_cflags_cpu="-arch~sw6a~-tune~sw6a -arch~sw6b~-tune~sw6b" ;;
++ esac
++ ;;
++ *)
++ GMP_INCLUDE_MPN(sw_64/default.m4)
++ ;;
++ esac
++
++ case $host in
++ *-*-unicos*)
++ # tune/sw_64.asm assumes int==4bytes but unicos uses int==8bytes
++ ;;
++ *)
++ SPEED_CYCLECOUNTER_OBJ=sw_64.lo
++ cyclecounter_size=1 ;;
++ esac
++ ;;
++#__sw_64__ end
++
+
+ # Cray vector machines.
+ # This must come after alpha* so that we can recognize present and future
+@@ -2366,7 +2474,7 @@
+ ;;
+ -Wa,-m*)
+ case $host in
+- alpha*-*-*)
++ alpha*-*-* | sw_64*-*-*)
+ GMP_GCC_WA_MCPU($cc $cflags, $flag, , [continue])
+ ;;
+ esac
+diff -Naru gmp-6.2.1/extract-dbl.c gmp-6.2.1-sw/extract-dbl.c
+--- gmp-6.2.1/extract-dbl.c 2020-11-14 18:45:09.000000000 +0000
++++ gmp-6.2.1-sw/extract-dbl.c 2022-08-26 07:37:40.403060309 +0000
+@@ -71,7 +71,7 @@
+
+ #if _GMP_IEEE_FLOATS
+ {
+-#if defined (__alpha) && __GNUC__ == 2 && __GNUC_MINOR__ == 8
++#if (defined (__alpha) || defined (__sw_64)) && __GNUC__ == 2 && __GNUC_MINOR__ == 8
+ /* Work around alpha-specific bug in GCC 2.8.x. */
+ volatile
+ #endif
+diff -Naru gmp-6.2.1/gmp-impl.h gmp-6.2.1-sw/gmp-impl.h
+--- gmp-6.2.1/gmp-impl.h 2020-11-14 18:45:09.000000000 +0000
++++ gmp-6.2.1-sw/gmp-impl.h 2022-08-26 07:39:24.323256204 +0000
+@@ -318,6 +318,10 @@
+ #define HAVE_HOST_CPU_alpha_CIX 1
+ #endif
+
++#if HAVE_HOST_CPU_sw_64sw6a || HAVE_HOST_CPU_sw_64sw6b \
++ || HAVE_HOST_CPU_sw_64sw6 || HAVE_HOST_CPU_sw_64
++#define HAVE_HOST_CPU_sw_64_CIX 1
++#endif
+
+ #if defined (__cplusplus)
+ extern "C" {
+@@ -3378,7 +3382,7 @@
+ to 0 if there's an even number. "n" should be an unsigned long and "p"
+ an int. */
+
+-#if defined (__GNUC__) && ! defined (NO_ASM) && HAVE_HOST_CPU_alpha_CIX
++#if defined (__GNUC__) && ! defined (NO_ASM) && (HAVE_HOST_CPU_alpha_CIX || HAVE_HOST_CPU_sw_64_CIX)
+ #define ULONG_PARITY(p, n) \
+ do { \
+ int __p; \
+@@ -3660,7 +3664,7 @@
+ #endif
+ #endif
+
+-#if defined (__GNUC__) && ! defined (NO_ASM) && HAVE_HOST_CPU_alpha_CIX
++#if defined (__GNUC__) && ! defined (NO_ASM) && (HAVE_HOST_CPU_alpha_CIX || HAVE_HOST_CPU_sw_64_CIX)
+ #define popc_limb(result, input) \
+ do { \
+ __asm__ ("ctpop %1, %0" : "=r" (result) : "r" (input)); \
+diff -Naru gmp-6.2.1/longlong.h gmp-6.2.1-sw/longlong.h
+--- gmp-6.2.1/longlong.h 2020-11-14 18:45:09.000000000 +0000
++++ gmp-6.2.1-sw/longlong.h 2022-08-26 07:45:12.673912863 +0000
+@@ -270,6 +270,96 @@
+ #endif /* clz using mpn */
+ #endif /* __alpha */
+
++//__sw_64 start
++#if defined (__sw_64) && W_TYPE_SIZE == 64
++/* Most sw_64-based machines, except Cray systems. */
++#if defined (__GNUC__)
++#if __GMP_GNUC_PREREQ (3,3)
++#define umul_ppmm(ph, pl, m0, m1) \
++ do { \
++ UDItype __m0 = (m0), __m1 = (m1); \
++ (ph) = __builtin_sw_64_umulh (__m0, __m1); \
++ (pl) = __m0 * __m1; \
++ } while (0)
++#else
++#define umul_ppmm(ph, pl, m0, m1) \
++ do { \
++ UDItype __m0 = (m0), __m1 = (m1); \
++ __asm__ ("umulh %r1,%2,%0" \
++ : "=r" (ph) \
++ : "%rJ" (__m0), "rI" (__m1)); \
++ (pl) = __m0 * __m1; \
++ } while (0)
++#endif
++#define UMUL_TIME 18
++#else /* ! __GNUC__ */
++#include <machine/builtins.h>
++#define umul_ppmm(ph, pl, m0, m1) \
++ do { \
++ UDItype __m0 = (m0), __m1 = (m1); \
++ (ph) = __UMULH (__m0, __m1); \
++ (pl) = __m0 * __m1; \
++ } while (0)
++#endif
++#ifndef LONGLONG_STANDALONE
++#define udiv_qrnnd(q, r, n1, n0, d) \
++ do { UWtype __di; \
++ __di = __MPN(invert_limb) (d); \
++ udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
++ } while (0)
++#define UDIV_PREINV_ALWAYS 1
++#define UDIV_NEEDS_NORMALIZATION 1
++#define UDIV_TIME 220
++#endif /* LONGLONG_STANDALONE */
++
++/* clz_tab is required in all configurations, since mpn/sw_64/cntlz.asm
++ always goes into libgmp.so, even when not actually used. */
++#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
++
++#if defined (__GNUC__) && HAVE_HOST_CPU_sw_64_CIX
++#define count_leading_zeros(COUNT,X) \
++ __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
++#define count_trailing_zeros(COUNT,X) \
++ __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
++#endif /* clz/ctz using cix */
++
++#if ! defined (count_leading_zeros) \
++ && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
++/* SW_64_CMPBGE_0 gives "cmpgeb $31,src,dst", ie. test src bytes == 0.
++ "$31" is written explicitly in the asm, since an "r" constraint won't
++ select reg 31. There seems no need to worry about "r31" syntax for cray,
++ since gcc itself (pre-release 3.4) emits just $31 in various places.
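++
++ A worked instance of the scheme below (an illustrative example, not from
++ the original source, assuming GMP's standard __clz_tab where tab[i] is
++ floor(log2(i)) + 2): for x = 0x0000123456789ABC the cmpgeb mask is 0xC0
++ (bytes 6 and 7 are zero), (0xC0 >> 1) ^ 0x7F = 0x1F and __clz_tab[0x1F] = 6,
++ so the shift is 6*8 - 7 = 41; then x >> 41 = 0x9, __clz_tab[9] = 5, and the
++ count is (65 - 41) - 5 = 19, correct for a 45-bit value.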
*/ ++#define SW_64_CMPBGE_0(dst, src) \ ++ do { asm ("cmpgeb $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0) ++/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts ++ them, locating the highest non-zero byte. A second __clz_tab lookup ++ counts the leading zero bits in that byte, giving the result. */ ++#define count_leading_zeros(count, x) \ ++ do { \ ++ UWtype __clz__b, __clz__c, __clz__x = (x); \ ++ SW_64_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \ ++ __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \ ++ __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \ ++ __clz__x >>= __clz__b; \ ++ __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \ ++ __clz__b = 65 - __clz__b; \ ++ (count) = __clz__b - __clz__c; \ ++ } while (0) ++#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB ++#endif /* clz using cmpbge */ ++ ++#if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE) ++#if HAVE_ATTRIBUTE_CONST ++long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const)); ++#else ++long __MPN(count_leading_zeros) (UDItype); ++#endif ++#define count_leading_zeros(count, x) \ ++ ((count) = __MPN(count_leading_zeros) (x)) ++#endif /* clz using mpn */ ++#endif ++//__sw_64 end ++ + #if defined (__AVR) && W_TYPE_SIZE == 8 + #define umul_ppmm(ph, pl, m0, m1) \ + do { \ +diff -Naru gmp-6.2.1/mpn/sw_64/addmul_1.asm gmp-6.2.1-sw/mpn/sw_64/addmul_1.asm +--- gmp-6.2.1/mpn/sw_64/addmul_1.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/addmul_1.asm 2022-08-26 07:41:33.233499207 +0000 +@@ -0,0 +1,99 @@ ++dnl Sw_64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the ++dnl result to a second limb vector. ++ ++dnl Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
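++dnl  For reference, a portable C model of the operation (an illustrative
++dnl  sketch, not part of the build; assumes the gcc unsigned __int128
++dnl  extension):
++dnl
++dnl    mp_limb_t addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
++dnl    {
++dnl      mp_limb_t cy = 0;
++dnl      for (mp_size_t i = 0; i < n; i++)
++dnl        {
++dnl          unsigned __int128 t = (unsigned __int128) up[i] * vl + rp[i] + cy;
++dnl          rp[i] = (mp_limb_t) t;       /* mull plus cmpult-recovered carries */
++dnl          cy = (mp_limb_t) (t >> 64);  /* umulh plus carry propagation */
++dnl        }
++dnl      return cy;
++dnl    }
++dnl
++dnl  The loop below computes the same recurrence, software-pipelined across
++dnl  iterations.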
++ ++include(`../config.m4') ++ ++C cycles/limb ++C EV4: 42 ++C EV5: 18 ++C SW6: 7 ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++C vl r19 ++ ++ ++ASM_START() ++PROLOGUE(mpn_addmul_1) ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ subl r18,1,r18 C size-- ++ mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ umulh r2,r19,r0 C r0 = prod_high ++ beq r18,$Lend1 C jump if size was == 1 ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ subl r18,1,r18 C size-- ++ addl r5,r3,r3 ++ cmpult r3,r5,r4 ++ stl r3,0(r16) ++ addl r16,8,r16 C res_ptr++ ++ beq r18,$Lend2 C jump if size was == 2 ++ ++ ALIGN(8) ++$Loop: mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ subl r18,1,r18 C size-- ++ umulh r2,r19,r4 C r4 = cy_limb ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ addl r5,r3,r3 ++ cmpult r3,r5,r5 ++ stl r3,0(r16) ++ addl r16,8,r16 C res_ptr++ ++ addl r5,r0,r0 C combine carries ++ bne r18,$Loop ++ ++$Lend2: mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ umulh r2,r19,r4 C r4 = cy_limb ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ addl r5,r3,r3 ++ cmpult r3,r5,r5 ++ stl r3,0(r16) ++ addl r5,r0,r0 C combine carries ++ addl r4,r0,r0 C cy_limb = prod_high + cy ++ ret r31,(r26),1 ++$Lend1: addl r5,r3,r3 ++ cmpult r3,r5,r5 ++ stl r3,0(r16) ++ addl r0,r5,r0 ++ ret r31,(r26),1 ++EPILOGUE(mpn_addmul_1) ++ASM_END() +diff -Naru gmp-6.2.1/mpn/sw_64/add_n.asm gmp-6.2.1-sw/mpn/sw_64/add_n.asm +--- gmp-6.2.1/mpn/sw_64/add_n.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/add_n.asm 2022-08-26 07:41:33.273499282 +0000 +@@ -0,0 +1,164 @@ ++dnl Sw_64 mpn_add_n -- Add two limb vectors of the same length > 0 and ++dnl store sum in a third limb vector. ++ ++dnl Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C EV4: ? 
++C EV5: 4.75 ++C EV6: 3 ++ ++dnl INPUT PARAMETERS ++dnl res_ptr r16 ++dnl s1_ptr r17 ++dnl s2_ptr r18 ++dnl size r19 ++ ++ASM_START() ++PROLOGUE(mpn_add_nc) ++ bis r20,r31,r25 ++ br L(com) ++EPILOGUE() ++PROLOGUE(mpn_add_n) ++ bis r31,r31,r25 C clear cy ++L(com): subl r19,4,r19 C decr loop cnt ++ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop ++C Start software pipeline for 1st loop ++ ldl r0,0(r18) ++ ldl r4,0(r17) ++ ldl r1,8(r18) ++ ldl r5,8(r17) ++ addl r17,32,r17 C update s1_ptr ++ addl r0,r4,r28 C 1st main add ++ ldl r2,16(r18) ++ addl r25,r28,r20 C 1st carry add ++ ldl r3,24(r18) ++ cmpult r28,r4,r8 C compute cy from last add ++ ldl r6,-16(r17) ++ cmpult r20,r28,r25 C compute cy from last add ++ ldl r7,-8(r17) ++ bis r8,r25,r25 C combine cy from the two adds ++ subl r19,4,r19 C decr loop cnt ++ addl r1,r5,r28 C 2nd main add ++ addl r18,32,r18 C update s2_ptr ++ addl r28,r25,r21 C 2nd carry add ++ cmpult r28,r5,r8 C compute cy from last add ++ blt r19,$Lend1 C if less than 4 limbs remain, jump ++C 1st loop handles groups of 4 limbs in a software pipeline ++ ALIGN(16) ++$Loop: cmpult r21,r28,r25 C compute cy from last add ++ ldl r0,0(r18) ++ bis r8,r25,r25 C combine cy from the two adds ++ ldl r1,8(r18) ++ addl r2,r6,r28 C 3rd main add ++ ldl r4,0(r17) ++ addl r28,r25,r22 C 3rd carry add ++ ldl r5,8(r17) ++ cmpult r28,r6,r8 C compute cy from last add ++ cmpult r22,r28,r25 C compute cy from last add ++ stl r20,0(r16) ++ bis r8,r25,r25 C combine cy from the two adds ++ stl r21,8(r16) ++ addl r3,r7,r28 C 4th main add ++ addl r28,r25,r23 C 4th carry add ++ cmpult r28,r7,r8 C compute cy from last add ++ cmpult r23,r28,r25 C compute cy from last add ++ addl r17,32,r17 C update s1_ptr ++ bis r8,r25,r25 C combine cy from the two adds ++ addl r16,32,r16 C update res_ptr ++ addl r0,r4,r28 C 1st main add ++ ldl r2,16(r18) ++ addl r25,r28,r20 C 1st carry add ++ ldl r3,24(r18) ++ cmpult r28,r4,r8 C compute cy from last add ++ ldl r6,-16(r17) ++ cmpult r20,r28,r25 C compute cy from last add ++ ldl r7,-8(r17) ++ bis r8,r25,r25 C combine cy from the two adds ++ subl r19,4,r19 C decr loop cnt ++ stl r22,-16(r16) ++ addl r1,r5,r28 C 2nd main add ++ stl r23,-8(r16) ++ addl r25,r28,r21 C 2nd carry add ++ addl r18,32,r18 C update s2_ptr ++ cmpult r28,r5,r8 C compute cy from last add ++ bge r19,$Loop ++C Finish software pipeline for 1st loop ++$Lend1: cmpult r21,r28,r25 C compute cy from last add ++ bis r8,r25,r25 C combine cy from the two adds ++ addl r2,r6,r28 C 3rd main add ++ addl r28,r25,r22 C 3rd carry add ++ cmpult r28,r6,r8 C compute cy from last add ++ cmpult r22,r28,r25 C compute cy from last add ++ stl r20,0(r16) ++ bis r8,r25,r25 C combine cy from the two adds ++ stl r21,8(r16) ++ addl r3,r7,r28 C 4th main add ++ addl r28,r25,r23 C 4th carry add ++ cmpult r28,r7,r8 C compute cy from last add ++ cmpult r23,r28,r25 C compute cy from last add ++ bis r8,r25,r25 C combine cy from the two adds ++ addl r16,32,r16 C update res_ptr ++ stl r22,-16(r16) ++ stl r23,-8(r16) ++$Lend2: addl r19,4,r19 C restore loop cnt ++ beq r19,$Lret ++C Start software pipeline for 2nd loop ++ ldl r0,0(r18) ++ ldl r4,0(r17) ++ subl r19,1,r19 ++ beq r19,$Lend0 ++C 2nd loop handles remaining 1-3 limbs ++ ALIGN(16) ++$Loop0: addl r0,r4,r28 C main add ++ ldl r0,8(r18) ++ cmpult r28,r4,r8 C compute cy from last add ++ ldl r4,8(r17) ++ addl r28,r25,r20 C carry add ++ addl r18,8,r18 ++ addl r17,8,r17 ++ stl r20,0(r16) ++ cmpult r20,r28,r25 C compute cy from last add ++ subl r19,1,r19 C decr loop cnt ++ bis r8,r25,r25 C 
combine cy from the two adds ++ addl r16,8,r16 ++ bne r19,$Loop0 ++$Lend0: addl r0,r4,r28 C main add ++ addl r28,r25,r20 C carry add ++ cmpult r28,r4,r8 C compute cy from last add ++ cmpult r20,r28,r25 C compute cy from last add ++ stl r20,0(r16) ++ bis r8,r25,r25 C combine cy from the two adds ++ ++$Lret: bis r25,r31,r0 C return cy ++ ret r31,(r26),1 ++EPILOGUE() ++ASM_END() +diff -Naru gmp-6.2.1/mpn/sw_64/aorslsh1_n.asm gmp-6.2.1-sw/mpn/sw_64/aorslsh1_n.asm +--- gmp-6.2.1/mpn/sw_64/aorslsh1_n.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/aorslsh1_n.asm 2022-08-26 07:41:33.283499301 +0000 +@@ -0,0 +1,164 @@ ++dnl Sw_64 mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1). ++ ++dnl Copyright 2003, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C EV4: ? 
++C EV5: 6.25 ++C SW6: 4.5 ++ ++define(`rp',`r16') ++define(`up',`r17') ++define(`vp',`r18') ++define(`n', `r19') ++ ++define(`u0', `r8') ++define(`u1', `r1') ++define(`v0', `r4') ++define(`v1', `r5') ++ ++define(`cy0', `r0') ++define(`cy1', `r20') ++define(`cy', `r22') ++define(`rr', `r24') ++define(`ps', `r25') ++define(`sl', `r28') ++ ++ifdef(`OPERATION_addlsh1_n',` ++ define(ADDSUB, addl) ++ define(CARRY, `cmpult $1,$2,$3') ++ define(func, mpn_addlsh1_n) ++') ++ifdef(`OPERATION_sublsh1_n',` ++ define(ADDSUB, subl) ++ define(CARRY, `cmpult $2,$1,$3') ++ define(func, mpn_sublsh1_n) ++') ++ ++MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) ++ ++ASM_START() ++PROLOGUE(func) ++ and n, 2, cy0 ++ blbs n, L(bx1) ++L(bx0): ldl v1, 0(vp) ++ ldl u1, 0(up) ++ nop ++ bne cy0, L(b10) ++ ++L(b00): ldi vp, 48(vp) ++ ldi up, -16(up) ++ ldi rp, -8(rp) ++ br r31, L(lo0) ++ ++L(b10): ldi vp, 32(vp) ++ ldi rp, 8(rp) ++ ldi cy0, 0(r31) ++ br r31, L(lo2) ++ ++L(bx1): ldl v0, 0(vp) ++ ldl u0, 0(up) ++ ldi cy1, 0(r31) ++ beq cy0, L(b01) ++ ++L(b11): ldi vp, 40(vp) ++ ldi up, -24(up) ++ ldi rp, 16(rp) ++ br r31, L(lo3) ++ ++L(b01): ldi n, -4(n) ++ ble n, L(end) ++ ldi vp, 24(vp) ++ ldi up, -8(up) ++ ++ ALIGN(16) ++L(top): addl v0, v0, sl C left shift vlimb ++ ldl v1, -16(vp) ++ ADDSUB u0, sl, ps C ulimb + (vlimb << 1) ++ cmplt v0, r31, cy0 C carry out #1 ++ ldl u1, 16(up) ++ ADDSUB ps, cy1, rr C consume carry from previous operation ++ CARRY( ps, u0, cy) C carry out #2 ++ stl rr, 0(rp) ++ addl cy, cy0, cy0 C combine carry out #1 and #2 ++ CARRY( rr, ps, cy) C carry out #3 ++ addl cy, cy0, cy0 C final carry out ++ ldi vp, 32(vp) C bookkeeping ++L(lo0): addl v1, v1, sl ++ ldl v0, -40(vp) ++ ADDSUB u1, sl, ps ++ cmplt v1, r31, cy1 ++ ldl u0, 24(up) ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy) ++ stl rr, 8(rp) ++ addl cy, cy1, cy1 ++ CARRY( rr, ps, cy) ++ addl cy, cy1, cy1 ++ ldi rp, 32(rp) C bookkeeping ++L(lo3): addl v0, v0, sl ++ ldl v1, -32(vp) ++ ADDSUB u0, sl, ps ++ cmplt v0, r31, cy0 ++ ldl u1, 32(up) ++ ADDSUB ps, cy1, rr ++ CARRY( ps, u0, cy) ++ stl rr, -16(rp) ++ addl cy, cy0, cy0 ++ CARRY( rr, ps, cy) ++ addl cy, cy0, cy0 ++ ldi up, 32(up) C bookkeeping ++L(lo2): addl v1, v1, sl ++ ldl v0, -24(vp) ++ ADDSUB u1, sl, ps ++ cmplt v1, r31, cy1 ++ ldl u0, 8(up) ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy) ++ stl rr, -8(rp) ++ addl cy, cy1, cy1 ++ CARRY( rr, ps, cy) ++ addl cy, cy1, cy1 ++ ldi n, -4(n) C bookkeeping ++ bgt n, L(top) ++ ++L(end): addl v0, v0, sl ++ ADDSUB u0, sl, ps ++ ADDSUB ps, cy1, rr ++ cmplt v0, r31, cy0 ++ CARRY( ps, u0, cy) ++ stl rr, 0(rp) ++ addl cy, cy0, cy0 ++ CARRY( rr, ps, cy) ++ addl cy, cy0, r0 ++ ret r31,(r26),1 ++EPILOGUE() ++ASM_END() +diff -Naru gmp-6.2.1/mpn/sw_64/aorslsh2_n.asm gmp-6.2.1-sw/mpn/sw_64/aorslsh2_n.asm +--- gmp-6.2.1/mpn/sw_64/aorslsh2_n.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/aorslsh2_n.asm 2022-08-26 07:41:33.283499301 +0000 +@@ -0,0 +1,167 @@ ++dnl Sw_64 mpn_addlsh2_n/mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2). ++ ++dnl Copyright 2003, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. 
++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C EV4: ? ++C EV5: 6 ++C SW6: 3.75 ++ ++C TODO ++C * Tune to reach 3.5 c/l on sw6 and 5.75 c/l on ev5. ++ ++define(`rp',`r16') ++define(`up',`r17') ++define(`vp',`r18') ++define(`n', `r19') ++ ++define(`u0', `r8') ++define(`u1', `r1') ++define(`v0', `r4') ++define(`v1', `r5') ++ ++define(`cy0', `r0') ++define(`cy1', `r20') ++define(`cy', `r22') ++define(`rr', `r24') ++define(`ps', `r25') ++define(`sl', `r28') ++ ++ifdef(`OPERATION_addlsh2_n',` ++ define(ADDSUB, addl) ++ define(CARRY, `cmpult $1,$2,$3') ++ define(func, mpn_addlsh2_n) ++') ++ifdef(`OPERATION_sublsh2_n',` ++ define(ADDSUB, subl) ++ define(CARRY, `cmpult $2,$1,$3') ++ define(func, mpn_sublsh2_n) ++') ++ ++MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n) ++ ++ASM_START() ++PROLOGUE(func) ++ and n, 2, cy0 ++ blbs n, L(bx1) ++L(bx0): ldl v1, 0(vp) ++ ldl u1, 0(up) ++ bis r31, r31, r2 ++ bne cy0, L(b10) ++ ++L(b00): ldi vp, 48(vp) ++ ldi up, -16(up) ++ ldi rp, -8(rp) ++ s4addl v1, r31, sl ++ br r31, L(lo0) ++ ++L(b10): ldi vp, 32(vp) ++ ldi rp, 8(rp) ++ ldi cy0, 0(r31) ++ br r31, L(lo2) ++ ++L(bx1): ldl v0, 0(vp) ++ ldl u0, 0(up) ++ ldi cy1, 0(r31) ++ bis r31, r31, r3 ++ nop ++ beq cy0, L(b01) ++ ++L(b11): ldi vp, 40(vp) ++ ldi up, -24(up) ++ ldi rp, 16(rp) ++ br r31, L(lo3) ++ ++L(b01): ldi n, -4(n) ++ ble n, L(end) ++ ldi vp, 24(vp) ++ ldi up, -8(up) ++ ++ ALIGN(16) ++L(top): s4addl v0, r3, sl C combined vlimb ++ ldl v1, -16(vp) ++ ADDSUB u0, sl, ps C ulimb + (vlimb << 1) ++ ldl u1, 16(up) ++ srl v0, 62, r2 C high v bits ++ ADDSUB ps, cy1, rr C consume carry from previous operation ++ CARRY( ps, u0, cy0) C carry out #2 ++ stl rr, 0(rp) ++ CARRY( rr, ps, cy) C carry out #3 ++ ldi vp, 32(vp) C bookkeeping ++ addl cy, cy0, cy0 C final carry out ++ s4addl v1, r2, sl ++L(lo0): ldl v0, -40(vp) ++ ADDSUB u1, sl, ps ++ ldl u0, 24(up) ++ srl v1, 62, r3 ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy1) ++ stl rr, 8(rp) ++ CARRY( rr, ps, cy) ++ ldi rp, 32(rp) C bookkeeping ++ addl cy, cy1, cy1 ++L(lo3): s4addl v0, r3, sl ++ ldl v1, -32(vp) ++ ADDSUB u0, sl, ps ++ ldl u1, 32(up) ++ srl v0, 62, r2 ++ ADDSUB ps, cy1, rr ++ CARRY( ps, u0, cy0) ++ stl rr, -16(rp) ++ CARRY( rr, ps, cy) ++ ldi up, 32(up) C bookkeeping ++ addl cy, cy0, cy0 ++L(lo2): s4addl v1, r2, sl ++ ldl v0, -24(vp) ++ ADDSUB u1, sl, ps ++ ldl u0, 8(up) ++ srl v1, 62, r3 ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy1) ++ stl rr, -8(rp) ++ CARRY( rr, ps, cy) ++ ldi n, -4(n) C bookkeeping ++ addl cy, cy1, cy1 ++ bgt n, L(top) ++ ++L(end): s4addl v0, r3, sl ++ ADDSUB u0, sl, ps ++ srl v0, 62, r2 ++ ADDSUB ps, cy1, rr ++ CARRY( ps, u0, cy0) ++ stl rr, 0(rp) ++ CARRY( rr, ps, cy) ++ addl cy, cy0, cy0 ++ addl cy0, r2, r0 ++ ++ ret r31,(r26),1 ++EPILOGUE() ++ASM_END() +diff -Naru gmp-6.2.1/mpn/sw_64/bdiv_dbm1c.asm gmp-6.2.1-sw/mpn/sw_64/bdiv_dbm1c.asm 
+--- gmp-6.2.1/mpn/sw_64/bdiv_dbm1c.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/bdiv_dbm1c.asm 2022-08-26 07:41:33.293499320 +0000 +@@ -0,0 +1,282 @@ ++dnl Sw_64 mpn_bdiv_dbm1c. ++ ++dnl Copyright 2008 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C EV4: 42 ++C EV5: 18 ++C SW6: 3 ++ ++C TODO ++C * Try less unrolling, 2-way should give the same performance. ++C * Optimize feed-in and wind-down code, for speed, and perhaps further for ++C code size. ++C * This runs optimally given the algorithm, r8 is on a 3 operation recurrency ++C path. We have not tried very hard to find a better algorithm. Perhaps ++C it would be a good task for the GNU superoptimizer. 
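++C
++C For reference, the recurrence this file implements, as a C sketch (an
++C illustration mirroring GMP's generic mpn_bdiv_dbm1c, not part of the
++C patch; assumes the gcc unsigned __int128 extension): a running
++C h -= up[i] * bd across the limbs, storing each intermediate low word.
++C
++C   mp_limb_t bdiv_dbm1c (mp_ptr rp, mp_srcptr up, mp_size_t n,
++C                         mp_limb_t bd, mp_limb_t h)
++C   {
++C     for (mp_size_t i = 0; i < n; i++)
++C       {
++C         unsigned __int128 p = (unsigned __int128) up[i] * bd;
++C         mp_limb_t p0 = (mp_limb_t) p, p1 = (mp_limb_t) (p >> 64);
++C         mp_limb_t cy = h < p0;          /* cmpult */
++C         h -= p0;                        /* subl */
++C         rp[i] = h;                      /* stl */
++C         h -= p1 + cy;
++C       }
++C     return h;
++C   }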
++ ++C INPUT PARAMETERS ++define(`rp', `r16') ++define(`up', `r17') ++define(`n', `r18') ++define(`bd', `r19') ++define(`cy', `r19') ++ ++ ++ASM_START() ++PROLOGUE(mpn_bdiv_dbm1c) ++ mov r20, r8 ++ ++ ldl r24, 0(r17) ++ and r18, 3, r28 ++ ldi r18, -4(r18) ++ beq r28, L(b0) ++ cmpeq r28, 1, r21 ++ bne r21, L(b1) ++ cmpeq r28, 2, r21 ++ bne r21, L(b2) ++ ++ ++L(b3): ldl r2, 8(r17) ++ ldl r3, 16(r17) ++ bgt r18, L(gt3) ++ ++ mull r24, r19, r5 C U1 ++ umulh r24, r19, r21 C U1 ++ mull r2, r19, r6 C U1 ++ umulh r2, r19, r22 C U1 ++ mull r3, r19, r7 C U1 ++ umulh r3, r19, r23 C U1 ++ ldi r16, -32(r16) ++ br L(cj3) ++ ++L(gt3): ldl r0, 24(r17) ++ mull r24, r19, r5 C U1 ++ umulh r24, r19, r21 C U1 ++ ldl r1, 32(r17) ++ mull r2, r19, r6 C U1 ++ umulh r2, r19, r22 C U1 ++ ldl r2, 40(r17) ++ mull r3, r19, r7 C U1 ++ umulh r3, r19, r23 C U1 ++ ldl r3, 48(r17) ++ ldi r18, -4(r18) ++ ldi r17, 56(r17) ++ mull r0, r19, r4 C U1 ++ bgt r18, L(L3) ++ ++ br L(cj7) ++ ++ ++L(b2): ldl r3, 8(r17) ++ bgt r18, L(gt2) ++ ++ mull r24, r19, r6 C U1 ++ umulh r24, r19, r22 C U1 ++ mull r3, r19, r7 C U1 ++ umulh r3, r19, r23 C U1 ++ ldi r16, -40(r16) ++ br L(cj2) ++ ++L(gt2): ldl r0, 16(r17) ++ ldl r1, 24(r17) ++ mull r24, r19, r6 C U1 ++ umulh r24, r19, r22 C U1 ++ ldl r2, 32(r17) ++ mull r3, r19, r7 C U1 ++ umulh r3, r19, r23 C U1 ++ ldl r3, 40(r17) ++ ldi r18, -4(r18) ++ ldi r17, 48(r17) ++ mull r0, r19, r4 C U1 ++ umulh r0, r19, r20 C U1 ++ ldi r16, -8(r16) ++ bgt r18, L(gt6) ++ ++ mull r1, r19, r5 C U1 ++ br L(cj6) ++ ++L(gt6): ldl r0, 0(r17) ++ mull r1, r19, r5 C U1 ++ br L(L2) ++ ++ ++L(b1): bgt r18, L(gt1) ++ ++ mull r24, r19, r7 C U1 ++ umulh r24, r19, r23 C U1 ++ ldi r16, -48(r16) ++ br L(cj1) ++ ++L(gt1): ldl r0, 8(r17) ++ ldl r1, 16(r17) ++ ldl r2, 24(r17) ++ mull r24, r19, r7 C U1 ++ umulh r24, r19, r23 C U1 ++ ldl r3, 32(r17) ++ ldi r18, -4(r18) ++ ldi r17, 40(r17) ++ mull r0, r19, r4 C U1 ++ umulh r0, r19, r20 C U1 ++ ldi r16, -16(r16) ++ bgt r18, L(gt5) ++ ++ mull r1, r19, r5 C U1 ++ umulh r1, r19, r21 C U1 ++ mull r2, r19, r6 C U1 ++ br L(cj5) ++ ++L(gt5): ldl r0, 0(r17) ++ mull r1, r19, r5 C U1 ++ umulh r1, r19, r21 C U1 ++ ldl r1, 8(r17) ++ mull r2, r19, r6 C U1 ++ br L(L1) ++ ++ ++L(b0): ldl r1, 8(r17) ++ ldl r2, 16(r17) ++ ldl r3, 24(r17) ++ ldi r17, 32(r17) ++ ldi r16, -24(r16) ++ mull r24, r19, r4 C U1 ++ umulh r24, r19, r20 C U1 ++ bgt r18, L(gt4) ++ ++ mull r1, r19, r5 C U1 ++ umulh r1, r19, r21 C U1 ++ mull r2, r19, r6 C U1 ++ umulh r2, r19, r22 C U1 ++ mull r3, r19, r7 C U1 ++ br L(cj4) ++ ++L(gt4): ldl r0, 0(r17) ++ mull r1, r19, r5 C U1 ++ umulh r1, r19, r21 C U1 ++ ldl r1, 8(r17) ++ mull r2, r19, r6 C U1 ++ umulh r2, r19, r22 C U1 ++ ldl r2, 16(r17) ++ mull r3, r19, r7 C U1 ++ br L(L0) ++ ++C *** MAIN LOOP START *** ++ ALIGN(16) ++L(top): mull r0, r19, r4 C U1 ++ subl r8, r28, r8 ++L(L3): umulh r0, r19, r20 C U1 ++ cmpult r8, r5, r28 ++ ldl r0, 0(r17) ++ subl r8, r5, r8 ++ addl r21, r28, r28 ++ stl r8, 0(r16) ++ ++ mull r1, r19, r5 C U1 ++ subl r8, r28, r8 ++L(L2): umulh r1, r19, r21 C U1 ++ cmpult r8, r6, r28 ++ ldl r1, 8(r17) ++ subl r8, r6, r8 ++ addl r22, r28, r28 ++ stl r8, 8(r16) ++ ++ mull r2, r19, r6 C U1 ++ subl r8, r28, r8 ++L(L1): umulh r2, r19, r22 C U1 ++ cmpult r8, r7, r28 ++ ldl r2, 16(r17) ++ subl r8, r7, r8 ++ addl r23, r28, r28 ++ stl r8, 16(r16) ++ ++ mull r3, r19, r7 C U1 ++ subl r8, r28, r8 ++L(L0): umulh r3, r19, r23 C U1 ++ cmpult r8, r4, r28 ++ ldl r3, 24(r17) ++ subl r8, r4, r8 ++ addl r20, r28, r28 ++ stl r8, 24(r16) ++ ++ ldi r18, -4(r18) ++ ldi r17, 32(r17) ++ ldi 
r16, 32(r16) ++ bgt r18, L(top) ++C *** MAIN LOOP END *** ++ ++ mull r0, r19, r4 C U1 ++ subl r8, r28, r8 ++L(cj7): umulh r0, r19, r20 C U1 ++ cmpult r8, r5, r28 ++ subl r8, r5, r8 ++ addl r21, r28, r28 ++ stl r8, 0(r16) ++ mull r1, r19, r5 C U1 ++ subl r8, r28, r8 ++L(cj6): umulh r1, r19, r21 C U1 ++ cmpult r8, r6, r28 ++ subl r8, r6, r8 ++ addl r22, r28, r28 ++ stl r8, 8(r16) ++ mull r2, r19, r6 C U1 ++ subl r8, r28, r8 ++L(cj5): umulh r2, r19, r22 C U1 ++ cmpult r8, r7, r28 ++ subl r8, r7, r8 ++ addl r23, r28, r28 ++ stl r8, 16(r16) ++ mull r3, r19, r7 C U1 ++ subl r8, r28, r8 ++L(cj4): umulh r3, r19, r23 C U1 ++ cmpult r8, r4, r28 ++ subl r8, r4, r8 ++ addl r20, r28, r28 ++ stl r8, 24(r16) ++ subl r8, r28, r8 ++L(cj3): cmpult r8, r5, r28 ++ subl r8, r5, r8 ++ addl r21, r28, r28 ++ stl r8, 32(r16) ++ subl r8, r28, r8 ++L(cj2): cmpult r8, r6, r28 ++ subl r8, r6, r8 ++ addl r22, r28, r28 ++ stl r8, 40(r16) ++ subl r8, r28, r8 ++L(cj1): cmpult r8, r7, r28 ++ subl r8, r7, r8 ++ addl r23, r28, r28 ++ stl r8, 48(r16) ++ subl r8, r28, r0 ++ ret r31, (r26), 1 ++ ++EPILOGUE() ++ASM_END() +diff -Naru gmp-6.2.1/mpn/sw_64/cntlz.asm gmp-6.2.1-sw/mpn/sw_64/cntlz.asm +--- gmp-6.2.1/mpn/sw_64/cntlz.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/cntlz.asm 2022-08-26 07:41:33.293499320 +0000 +@@ -0,0 +1,55 @@ ++dnl Sw_64 auxiliary for longlong.h's count_leading_zeros ++ ++dnl Copyright 1997, 2000, 2002 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++ASM_START() ++EXTERN(__clz_tab) ++PROLOGUE(mpn_count_leading_zeros,gp) ++ cmpgeb r31, r16, r1 ++ LEA(r3,__clz_tab) ++ sra r1, 1, r1 ++ xor r1, 127, r1 ++ srl r16, 1, r16 ++ addl r1, r3, r1 ++ ldl_u r0, 0(r1) ++ ldi r2, 64 ++ ext0b r0, r1, r0 ++ s8subw r0, 8, r0 ++ srl r16, r0, r16 ++ addl r16, r3, r16 ++ ldl_u r1, 0(r16) ++ ext0b r1, r16, r1 ++ subl r2, r1, r2 ++ subl r2, r0, r0 ++ ret r31, (r26),1 ++EPILOGUE(mpn_count_leading_zeros) ++ASM_END() +diff -Naru gmp-6.2.1/mpn/sw_64/com.asm gmp-6.2.1-sw/mpn/sw_64/com.asm +--- gmp-6.2.1/mpn/sw_64/com.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/com.asm 2022-08-26 07:41:33.303499339 +0000 +@@ -0,0 +1,176 @@ ++dnl Sw_64 mpn_com -- mpn one's complement. ++ ++dnl Copyright 2003 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. 
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl * the GNU Lesser General Public License as published by the Free
++dnl Software Foundation; either version 3 of the License, or (at your
++dnl option) any later version.
++dnl
++dnl or
++dnl
++dnl * the GNU General Public License as published by the Free Software
++dnl Foundation; either version 2 of the License, or (at your option) any
++dnl later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++
++C cycles/limb
++C EV4: 4.75
++C EV5: 2.0
++C SW6: 1.5
++
++
++C mp_limb_t mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
++C
++C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
++C 2.0 c/l. In general, a pattern like this unrolled to N limbs per loop
++C will be 1.5+2/N c/l.
++C
++C 2 cycles of loop control are unavoidable, for pointer updates and the
++C taken branch bubble, but also since ldl cannot issue two cycles after stl
++C (and with a run of stls that means neither of two cycles at the end of the
++C loop).
++C
++C The fbeq is forced into the second cycle of the loop using unops, since
++C the first time through it must wait for the fcvtld result. Once that
++C result is ready (a 1 cycle stall) then both the branch and following loads
++C can issue together.
++C
++C The main loop handles an odd count of limbs, being two limbs loaded before
++C each size test, plus one pipelined around from the previous iteration (or
++C setup in the entry sequence).
++C
++C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
++C entry sequence, and an increment of the pointers. For an odd size there's
++C no increment and the first store in the loop (r24) is a repeat of dst[0].
++C
++C Note that the load for r24 after the possible pointer increment is done
++C before the explicit store to dst[0], in case src==dst.
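++C
++C As a plain C model, the operation is just the following (an illustrative
++C sketch, not part of the build); everything above is scheduling:
++C
++C   void com (mp_ptr dst, mp_srcptr src, mp_size_t size)
++C   {
++C     for (mp_size_t i = 0; i < size; i++)
++C       dst[i] = ~src[i];   /* per limb, what "ornot r31,src,dst" computes */
++C   }
++C
++C The floating-point counter (fcvtld/fsubd/fbeq) is a scheduling device
++C only; the copy's arithmetic is entirely the ornot.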
++ ++ ++ASM_START() ++ ++FLOAT64(L(dat), 2.0) ++ ++ ALIGN(16) ++ ++PROLOGUE(mpn_com,gp) ++ ++ C r16 dst ++ C r17 src ++ C r18 size ++ ++ ldi r30, -16(r30) C temporary stack space ++ ldi r7, -3(r18) C size - 3 ++ ++ ldl r20, 0(r17) C src[0] ++ srl r7, 1, r6 C (size-3)/2 ++ ++ stl r6, 8(r30) C (size-3)/2 ++ and r7, 1, r5 C 1 if size even ++ ++ LEA( r8, L(dat)) ++ s8addl r5, r17, r17 C skip src[0] if even ++ ++ ornot r31, r20, r20 C ~src[0] ++ unop ++ ++ fldd f0, 8(r30) C (size-3)/2 ++ ldl r24, 0(r17) C src[0 or 1] ++ ++ stl r20, 0(r16) C dst[0] ++ s8addl r5, r16, r19 C skip dst[0] if even ++ ++ fldd f1, 0(r8) C data 2.0 ++ ldi r30, 16(r30) C restore stack ++ fcvtld f0, f10 C (size-3)/2 as float ++ fcpys f10,f10,f0 ++ ++ ornot r31, r24, r24 ++ blt r7, L(done_1) C if size<=2 ++ unop ++ unop ++ ++ ++ C 16-byte alignment here ++L(top): ++ C r17 src, incrementing ++ C r19 dst, incrementing ++ C r24 dst[i] result, ready to store ++ C f0 (size-3)/2, decrementing ++ C f1 2.0 ++ ++ ldl r20, 8(r17) C src[i+1] ++ ldl r21, 16(r17) C src[i+2] ++ unop ++ unop ++ ++ fbeq f0, L(done_2) ++ unop ++ ldl r22, 24(r17) C src[i+3] ++ ldl r23, 32(r17) C src[i+4] ++ ++ stl r24, 0(r19) C dst[i] ++ ornot r31, r20, r20 ++ fsubd f0, f1, f10 C count -= 2 ++ fcpys f10,f10,f0 ++ ++ stl r20, 8(r19) C dst[i+1] ++ ornot r31, r21, r21 ++ unop ++ unop ++ ++ stl r21, 16(r19) C dst[i+2] ++ ornot r31, r22, r22 ++ ++ stl r22, 24(r19) C dst[i+3] ++ ornot r31, r23, r24 ++ ++ ldi r17, 32(r17) C src += 4 ++ ldi r19, 32(r19) C dst += 4 ++ unop ++ fbge f0, L(top) ++ ++ ++L(done_1): ++ C r19 &dst[size-1] ++ C r24 result for dst[size-1] ++ ++ stl r24, 0(r19) C dst[size-1] ++ ret r31, (r26), 1 ++ ++ ++L(done_2): ++ C r19 &dst[size-3] ++ C r20 src[size-2] ++ C r21 src[size-1] ++ C r24 result for dst[size-3] ++ ++ stl r24, 0(r19) C dst[size-3] ++ ornot r31, r20, r20 ++ ++ stl r20, 8(r19) C dst[size-2] ++ ornot r31, r21, r21 ++ ++ stl r21, 16(r19) C dst[size-1] ++ ret r31, (r26), 1 ++ ++EPILOGUE() ++ASM_END() +diff -Naru gmp-6.2.1/mpn/sw_64/copyd.asm gmp-6.2.1-sw/mpn/sw_64/copyd.asm +--- gmp-6.2.1/mpn/sw_64/copyd.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/copyd.asm 2022-08-26 07:41:33.303499339 +0000 +@@ -0,0 +1,88 @@ ++dnl Sw_64 mpn_copyd -- copy, decrementing. ++ ++dnl Copyright 2002, 2003 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
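++dnl  For reference, a C model of the decrementing copy (an illustrative
++dnl  sketch, not part of the build); copying from the top down makes
++dnl  overlapping moves with dst >= src safe:
++dnl
++dnl    void copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
++dnl    {
++dnl      for (mp_size_t i = n - 1; i >= 0; i--)
++dnl        rp[i] = up[i];
++dnl    }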
++ ++include(`../config.m4') ++ ++C cycles/limb ++C EV4: 4 ++C EV5: 1.75 ++C SW6: 1 ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++ ++ ++ASM_START() ++PROLOGUE(mpn_copyd) ++ s8addl r18,r16,r16 C E0 ++ s8addl r18,r17,r17 C E1 ++ ldi r18,-8(r18) C E0 ++ blt r18,$Lend C E1 ++$Loop: ldl r0,-8(r17) C E0 ++ ldl r1,-16(r17) C E1 ++ ldl r2,-24(r17) C E0 ++ ldl r3,-32(r17) C E1 ++ ldl r4,-40(r17) C E0 ++ ldl r5,-48(r17) C E1 ++ ldl r6,-56(r17) C E0 ++ ldl r7,-64(r17) C E1 ++ stl r0,-8(r16) C E0 ++ ldi r17,-64(r17) C E1 ++ stl r1,-16(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r2,-24(r16) C E0 ++ ldi r18,-8(r18) C E1 ++ stl r3,-32(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r4,-40(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r5,-48(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r6,-56(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r7,-64(r16) C E0 ++ ldi r16,-64(r16) C E1 ++ bge r18,$Loop C E1 ++$Lend: ldi r18,7(r18) C E0 ++ blt r18,$Lret C E1 ++ ldl r0,-8(r17) C E0 ++ beq r18,$Lend0 C E1 ++$Loop0: stl r0,-8(r16) C E0 ++ ldi r16,-8(r16) C E1 ++ ldl r0,-16(r17) C E0 ++ ldi r18,-1(r18) C E1 ++ ldi r17,-8(r17) C E0 ++ bgt r18,$Loop0 C E1 ++$Lend0: stl r0,-8(r16) C E0 ++$Lret: ret r31,(r26),1 C E1 ++EPILOGUE(mpn_copyd) ++ASM_END() +diff -Naru gmp-6.2.1/mpn/sw_64/copyi.asm gmp-6.2.1-sw/mpn/sw_64/copyi.asm +--- gmp-6.2.1/mpn/sw_64/copyi.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/copyi.asm 2022-08-26 07:41:33.313499357 +0000 +@@ -0,0 +1,86 @@ ++dnl Sw_64 mpn_copyi -- copy, incrementing. ++ ++dnl Copyright 2002, 2003 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
++ ++include(`../config.m4') ++ ++C cycles/limb ++C EV4: 4 ++C EV5: 1.75 ++C SW6: 1 ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++ ++ ++ASM_START() ++PROLOGUE(mpn_copyi) ++ ldi r18,-8(r18) C E0 ++ blt r18,$Lend C E1 ++$Loop: ldl r0,0(r17) C E0 ++ ldl r1,8(r17) C E1 ++ ldl r2,16(r17) C E0 ++ ldl r3,24(r17) C E1 ++ ldl r4,32(r17) C E0 ++ ldl r5,40(r17) C E1 ++ ldl r6,48(r17) C E0 ++ ldl r7,56(r17) C E1 ++ stl r0,0(r16) C E0 ++ ldi r17,64(r17) C E1 ++ stl r1,8(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r2,16(r16) C E0 ++ ldi r18,-8(r18) C E1 ++ stl r3,24(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r4,32(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r5,40(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r6,48(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r7,56(r16) C E0 ++ ldi r16,64(r16) C E1 ++ bge r18,$Loop C E1 ++$Lend: ldi r18,7(r18) C E0 ++ blt r18,$Lret C E1 ++ ldl r0,0(r17) C E0 ++ beq r18,$Lend0 C E1 ++$Loop0: stl r0,0(r16) C E0 ++ ldi r16,8(r16) C E1 ++ ldl r0,8(r17) C E0 ++ ldi r18,-1(r18) C E1 ++ ldi r17,8(r17) C E0 ++ bgt r18,$Loop0 C E1 ++$Lend0: stl r0,0(r16) C E0 ++$Lret: ret r31,(r26),1 C E1 ++EPILOGUE(mpn_copyi) ++ASM_END() +diff -Naru gmp-6.2.1/mpn/sw_64/default.m4 gmp-6.2.1-sw/mpn/sw_64/default.m4 +--- gmp-6.2.1/mpn/sw_64/default.m4 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/default.m4 2022-08-26 07:41:33.313499357 +0000 +@@ -0,0 +1,127 @@ ++divert(-1) ++ ++dnl m4 macros for sw_64 assembler (everywhere except unicos). ++ ++ ++dnl Copyright 2000, 2002-2004, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++ ++dnl Usage: ASM_START() ++define(`ASM_START', ++m4_assert_numargs(0) ++` .set noreorder ++ .set noat') ++ ++dnl Usage: X(value) ++define(`X', ++m4_assert_numargs(1) ++`0x$1') ++ ++dnl Usage: FLOAT64(label,value) ++define(`FLOAT64', ++m4_assert_numargs(2) ++` .align 3 ++$1: .t_floating $2') ++ ++ ++dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,gp|noalign]) ++dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) ++ ++define(`PROLOGUE_cpu', ++m4_assert_numargs_range(1,2) ++`ifelse(`$2',gp,, ++`ifelse(`$2',noalign,, ++`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter ++')')')')dnl ++ .text ++ifelse(`$2',noalign,,` ALIGN(16)') ++ .globl $1 ++ .ent $1 ++$1: ++ .frame r30,0,r26,0 ++ifelse(`$2',gp,` ldgp r29, 0(r27) ++`$'$1..ng:') ++ .prologue ifelse(`$2',gp,1,0)') ++ ++define(`EPILOGUE_cpu', ++m4_assert_numargs(1) ++` .end $1') ++ ++ ++dnl Usage: LDGP(dst,src) ++dnl ++dnl Emit an "ldgp dst,src", but only if the system uses a GOT. 
++ ++define(LDGP, ++m4_assert_numargs(2) ++`ldgp `$1', `$2'') ++ ++ ++dnl Usage: EXTERN(variable_name) ++define(`EXTERN', ++m4_assert_numargs(1) ++) ++ ++dnl Usage: r0 ... r31 ++dnl f0 ... f31 ++dnl ++dnl Map register names r0 to $0, and f0 to $f0, etc. ++dnl This is needed on all systems but Unicos ++dnl ++dnl defreg() is used to protect the $ in $0 (otherwise it would represent a ++dnl macro argument). Double quoting is used to protect the f0 in $f0 ++dnl (otherwise it would be an infinite recursion). ++ ++forloop(i,0,31,`defreg(`r'i,$i)') ++forloop(i,0,31,`deflit(`f'i,``$f''i)') ++ ++ ++dnl Usage: DATASTART(name,align) or DATASTART(name) ++dnl DATAEND() ++ ++define(`DATASTART', ++m4_assert_numargs_range(1,2) ++` RODATA ++ ALIGN(ifelse($#,1,2,$2)) ++$1:') ++define(`DATAEND', ++m4_assert_numargs(0) ++) ++ ++dnl Load a symbolic address into a register ++define(`LEA', ++m4_assert_numargs(2) ++`ldi $1, $2') ++ ++dnl Usage: ASM_END() ++define(`ASM_END', ++m4_assert_numargs(0) ++) ++ ++divert +diff -Naru gmp-6.2.1/mpn/sw_64/dive_1.c gmp-6.2.1-sw/mpn/sw_64/dive_1.c +--- gmp-6.2.1/mpn/sw_64/dive_1.c 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/dive_1.c 2022-08-26 07:41:33.323499376 +0000 +@@ -0,0 +1,113 @@ ++/* Sw_64 mpn_divexact_1 -- mpn by limb exact division. ++ ++ THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST ++ CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN ++ FUTURE GNU MP RELEASES. ++ ++Copyright 2000-2003 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#include "gmp.h" ++#include "gmp-impl.h" ++#include "longlong.h" ++ ++ ++/* cycles/limb ++ SW6: 15.0 ++*/ ++ ++ ++/* The dependent chain is as follows (the same as modexact), and this is ++ what the code runs as. ++ ++ ev4 ev5 sw6 ++ 1 1 1 sub y = x - h ++ 23 13 7 mull q = y * inverse ++ 23 15 7 umulh h = high (q * d) ++ -- -- -- ++ 47 30 15 ++ ++ The time to load src[i+1] and establish x hides under the umulh latency. 
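++
++   As a concrete instance of the inverse trick (an illustrative example,
++   not from the original source): for divisor 3, binvert_limb gives
++   inverse = 0xAAAAAAAAAAAAAAAB, since 3 * 0xAAAAAAAAAAAAAAAB = 2^65 + 1,
++   which is congruent to 1 mod 2^64.  The exact quotient 21/3 is then
++   computed as 21 * 0xAAAAAAAAAAAAAAAB mod 2^64 = 7, with no division
++   instruction needed.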
*/ ++ ++void ++mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor) ++{ ++ mp_limb_t inverse, lshift_mask, s, sr, s_next, c, h, x, y, q, dummy; ++ unsigned rshift, lshift; ++ ++ ASSERT (size >= 1); ++ ASSERT (divisor != 0); ++ ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size)); ++ ASSERT_MPN (src, size); ++ ASSERT_LIMB (divisor); ++ ++ s_next = *src++; /* src[0] */ ++ ++ rshift = 0; ++ lshift_mask = 0; ++ if ((divisor & 1) == 0) ++ { ++ count_trailing_zeros (rshift, divisor); ++ lshift_mask = MP_LIMB_T_MAX; ++ divisor >>= rshift; ++ } ++ ++ binvert_limb (inverse, divisor); ++ lshift = 64 - rshift; ++ ++ c = 0; ++ h = 0; ++ sr = s_next >> rshift; ++ ++ size--; ++ if (LIKELY (size != 0)) ++ { ++ do ++ { ++ s_next = *src++; /* src[i+1] */ ++ s = sr | ((s_next << lshift) & lshift_mask); ++ x = s - c; ++ c = s < c; ++ sr = s_next >> rshift; ++ ++ y = x - h; ++ c += (x < h); ++ q = y * inverse; ++ *dst++ = q; ++ umul_ppmm (h, dummy, q, divisor); ++ ++ size--; ++ } ++ while (size != 0); ++ } ++ ++ x = sr - c; ++ y = x - h; ++ q = y * inverse; ++ *dst = q; /* dst[size-1] */ ++} +diff -Naru gmp-6.2.1/mpn/sw_64/divrem_2.asm gmp-6.2.1-sw/mpn/sw_64/divrem_2.asm +--- gmp-6.2.1/mpn/sw_64/divrem_2.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/divrem_2.asm 2022-08-26 07:41:33.323499376 +0000 +@@ -0,0 +1,177 @@ ++dnl Sw_64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. ++ ++dnl Copyright 2007, 2008, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C norm frac ++C ev4 ++C ev5 70 70 ++C sw6 29 29 ++ ++C TODO ++C * Perhaps inline mpn_invert_limb, that would allow us to not save/restore ++C any registers (thus save ~10 cycles per call). ++C * Use negated d1 and/or d0 to speed carry propagation. Might save a cycle ++C or two. ++C * Check cluster delays (for sw6). We very likely could save some cycles. ++C * Use branch-free code for computing di. ++C * CAVEAT: We rely on r19 not being clobbered by mpn_invert_limb call. 
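++C
++C A reference model of the di that mpn_invert_limb supplies (a sketch for
++C illustration, not part of the patch; assumes the gcc unsigned __int128
++C extension): for a normalized divisor d (top bit set) and B = 2^64,
++C di = floor((B^2 - 1)/d) - B, i.e. in C:
++C
++C   uint64_t invert_limb_model (uint64_t d)
++C   {
++C     return (uint64_t) ((((unsigned __int128) ~d << 64) | ~(uint64_t) 0) / d);
++C   }
++C
++C The quotient estimate in the loop below multiplies the high limbs by this
++C di and then corrects with a few conditional adjustments (see L(fix)).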
++ ++C INPUT PARAMETERS ++define(`qp', `r16') ++define(`fn', `r17') ++define(`up_param', `r18') ++define(`un_param', `r19') ++define(`dp', `r20') ++ ++ASM_START() ++PROLOGUE(mpn_divrem_2,gp) ++ ldi r30, -80(r30) ++ stl r26, 0(r30) ++ stl r9, 8(r30) ++ stl r10, 16(r30) ++ stl r11, 24(r30) ++ stl r12, 32(r30) ++ stl r13, 40(r30) ++C stl r14, 48(r30) ++ stl r15, 56(r30) ++ .prologue 1 ++ stl r16, 64(r30) ++ bis r31, r17, r15 ++ s8addl r19, r18, r13 ++ ldi r13, -24(r13) ++ ldl r12, 8(r20) ++ ldl r10, 0(r20) ++ ldl r11, 16(r13) ++ ldl r9, 8(r13) ++ ++ bis r31, r31, r3 C most_significant_q_limb = 0 ++ cmpult r11, r12, r1 ++ bne r1, L(L8) ++ cmpule r11, r12, r1 ++ cmpult r9, r10, r2 ++ and r1, r2, r1 ++ bne r1, L(L8) ++ subl r11, r12, r11 ++ subl r11, r2, r11 ++ subl r9, r10, r9 ++ ldi r3, 1(r31) C most_significant_q_limb = 1 ++L(L8): stl r3, 72(r30) ++ ++ addl r15, r19, r19 ++ ldi r19, -3(r19) ++ blt r19, L(L10) ++ bis r31, r12, r16 ++ call r26, mpn_invert_limb ++ LDGP( r29, 0(r26)) ++ mull r0, r12, r4 C t0 = LO(di * d1) ++ umulh r0, r10, r2 C s1 = HI(di * d0) ++ addl r4, r10, r4 C t0 += d0 ++ cmpule r10, r4, r7 C (t0 < d0) ++ addl r4, r2, r4 C t0 += s1 ++ cmpult r4, r2, r1 ++ subl r1, r7, r7 C t1 (-1, 0, or 1) ++ blt r7, L(L42) ++L(L22): ++ ldi r0, -1(r0) C di-- ++ cmpult r4, r12, r1 C cy for: t0 -= d1 (below) ++ subl r7, r1, r7 C t1 -= cy ++ subl r4, r12, r4 C t0 -= d1 ++ bge r7, L(L22) ++L(L42): ++ ldl r16, 64(r30) ++ s8addl r19, r16, r16 ++ ALIGN(16) ++L(loop): ++ mull r11, r0, r5 C q0 (early) ++ umulh r11, r0, r6 C q (early) ++ addl r5, r9, r8 C q0 += n1 ++ addl r6, r11, r6 C q += n2 ++ cmpult r8, r5, r1 C cy for: q0 += n1 ++ addl r6, r1, r6 C q += cy ++ unop ++ mull r12, r6, r1 C LO(d1 * q) ++ umulh r10, r6, r7 C t1 = HI(d0 * q) ++ subl r9, r1, r9 C n1 -= LO(d1 * q) ++ mull r10, r6, r4 C t0 = LO(d0 * q) ++ unop ++ cmple r15, r19, r5 C condition and n0... ++ beq r5, L(L31) ++ ldl r5, 0(r13) ++ ldi r13, -8(r13) ++L(L31): subl r9, r12, r9 C n1 -= d1 ++ cmpult r5, r10, r1 C ++ subl r9, r1, r9 C ++ subl r5, r10, r5 C n0 -= d0 ++ subl r9, r7, r9 C n1 -= t0 ++ cmpult r5, r4, r1 C ++ subl r9, r1, r2 C ++ subl r5, r4, r5 C n0 -= t1 ++ cmpult r2, r8, r1 C (n1 < q0) ++ addl r6, r1, r6 C q += cond ++ ldi r1, -1(r1) C -(n1 >= q0) ++ and r1, r10, r4 C ++ addl r5, r4, r9 C n0 += mask & d0 ++ and r1, r12, r1 C ++ cmpult r9, r5, r11 C cy for: n0 += mask & d0 ++ addl r2, r1, r1 C n1 += mask & d1 ++ addl r1, r11, r11 C n1 += cy ++ cmpult r11, r12, r1 C ++ beq r1, L(fix) C ++L(bck): stl r6, 0(r16) ++ ldi r16, -8(r16) ++ ldi r19, -1(r19) ++ bge r19, L(loop) ++ ++L(L10): stl r9, 8(r13) ++ stl r11, 16(r13) ++ ldl r0, 72(r30) ++ ldl r26, 0(r30) ++ ldl r9, 8(r30) ++ ldl r10, 16(r30) ++ ldl r11, 24(r30) ++ ldl r12, 32(r30) ++ ldl r13, 40(r30) ++C ldl r14, 48(r30) ++ ldl r15, 56(r30) ++ ldi r30, 80(r30) ++ ret r31, (r26), 1 ++ ++L(fix): cmpule r11, r12, r1 ++ cmpult r9, r10, r2 ++ and r1, r2, r1 ++ bne r1, L(bck) ++ subl r11, r12, r11 ++ subl r11, r2, r11 ++ subl r9, r10, r9 ++ ldi r6, 1(r6) ++ br L(bck) ++EPILOGUE() ++ASM_END() +diff -Naru gmp-6.2.1/mpn/sw_64/gmp-mparam.h gmp-6.2.1-sw/mpn/sw_64/gmp-mparam.h +--- gmp-6.2.1/mpn/sw_64/gmp-mparam.h 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/gmp-mparam.h 2022-08-26 07:41:33.333499395 +0000 +@@ -0,0 +1,86 @@ ++/* Sw_64 EV4 gmp-mparam.h -- Compiler/machine parameter header file. ++ ++Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2009 Free Software ++Foundation, Inc. ++ ++This file is part of the GNU MP Library. 
++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#define GMP_LIMB_BITS 64 ++#define GMP_LIMB_BYTES 8 ++ ++ ++/* 175MHz 21064 */ ++ ++/* Generated by tuneup.c, 2009-01-15, gcc 3.2 */ ++ ++#define MUL_TOOM22_THRESHOLD 12 ++#define MUL_TOOM33_THRESHOLD 69 ++#define MUL_TOOM44_THRESHOLD 88 ++ ++#define SQR_BASECASE_THRESHOLD 4 ++#define SQR_TOOM2_THRESHOLD 20 ++#define SQR_TOOM3_THRESHOLD 62 ++#define SQR_TOOM4_THRESHOLD 155 ++ ++#define MULLO_BASECASE_THRESHOLD 0 /* always */ ++#define MULLO_DC_THRESHOLD 40 ++#define MULLO_MUL_N_THRESHOLD 202 ++ ++#define DIV_SB_PREINV_THRESHOLD 0 /* preinv always */ ++#define DIV_DC_THRESHOLD 38 ++#define POWM_THRESHOLD 60 ++ ++#define MATRIX22_STRASSEN_THRESHOLD 17 ++#define HGCD_THRESHOLD 80 ++#define GCD_DC_THRESHOLD 237 ++#define GCDEXT_DC_THRESHOLD 198 ++#define JACOBI_BASE_METHOD 2 ++ ++#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ ++#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ ++#define MOD_1_NORM_THRESHOLD 0 /* always */ ++#define MOD_1_UNNORM_THRESHOLD 0 /* always */ ++#define MOD_1_1_THRESHOLD 2 ++#define MOD_1_2_THRESHOLD 9 ++#define MOD_1_4_THRESHOLD 20 ++#define USE_PREINV_DIVREM_1 1 /* preinv always */ ++#define USE_PREINV_MOD_1 1 /* preinv always */ ++#define DIVEXACT_1_THRESHOLD 0 /* always */ ++#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ ++ ++#define GET_STR_DC_THRESHOLD 20 ++#define GET_STR_PRECOMPUTE_THRESHOLD 37 ++#define SET_STR_DC_THRESHOLD 746 ++#define SET_STR_PRECOMPUTE_THRESHOLD 1332 ++ ++#define MUL_FFT_TABLE { 240, 480, 1344, 2304, 5120, 20480, 49152, 0 } ++#define MUL_FFT_MODF_THRESHOLD 232 ++#define MUL_FFT_THRESHOLD 1664 ++ ++#define SQR_FFT_TABLE { 240, 480, 1216, 2304, 5120, 12288, 49152, 0 } ++#define SQR_FFT_MODF_THRESHOLD 232 ++#define SQR_FFT_THRESHOLD 1408 +diff -Naru gmp-6.2.1/mpn/sw_64/invert_limb.asm gmp-6.2.1-sw/mpn/sw_64/invert_limb.asm +--- gmp-6.2.1/mpn/sw_64/invert_limb.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/invert_limb.asm 2022-08-26 07:41:33.333499395 +0000 +@@ -0,0 +1,95 @@ ++dnl Sw_64 mpn_invert_limb -- Invert a normalized limb. ++ ++dnl Copyright 1996, 2000-2003, 2007, 2011, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. 
++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C EV4: ? ++C EV5: 137/140 (with BWX/without BWX) ++C SW6: 71/72 (with BWX/without BWX) ++ ++C This was compiler generated, with minimal manual edits. Surely several ++C cycles could be cut with some thought. ++ ++ASM_START() ++PROLOGUE(mpn_invert_limb,gp) ++ LEA( r2, approx_tab) ++ srl r16, 54, r1 ++ srl r16, 24, r4 ++ and r16, 1, r5 ++ bic r1, 1, r7 ++ ldi r4, 1(r4) ++ srl r16, 1, r3 ++ addl r7, r2, r1 ++ifelse(bwx_available_p,1,` ++ ldhu r0, -512(r1) ++',` ++ ldl_u r0, -512(r1) ++ ext1b r0, r7, r0 ++') ++ addl r3, r5, r3 ++ mulw r0, r0, r1 ++ sll r0, 11, r0 ++ mull r1, r4, r1 ++ srl r1, 40, r1 ++ subl r0, r1, r0 ++ ldi r0, -1(r0) ++ mull r0, r0, r2 ++ sll r0, 60, r1 ++ sll r0, 13, r0 ++ mull r2, r4, r2 ++ subl r1, r2, r1 ++ srl r1, 47, r1 ++ addl r0, r1, r0 ++ mull r0, r3, r3 ++ srl r0, 1, r1 ++ seleq r5, 0, r1,r1 ++ subl r1, r3, r1 ++ umulh r1, r0, r3 ++ sll r0, 31, r0 ++ srl r3, 1, r1 ++ addl r0, r1, r0 ++ mull r0, r16, r2 ++ umulh r0, r16, r3 ++ addl r2, r16, r1 ++ addl r3, r16, r16 ++ cmpult r1, r2, r1 ++ addl r16, r1, r3 ++ subl r0, r3, r0 ++ ret r31, (r26), 1 ++EPILOGUE() ++DATASTART(approx_tab,8) ++forloop(i,256,512-1,dnl ++` .word eval(0x7fd00/i) ++')dnl ++ SIZE(approx_tab, 512) ++ TYPE(approx_tab, object) ++DATAEND() ++ASM_END() +diff -Naru gmp-6.2.1/mpn/sw_64/lshift.asm gmp-6.2.1-sw/mpn/sw_64/lshift.asm +--- gmp-6.2.1/mpn/sw_64/lshift.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/lshift.asm 2022-08-26 07:41:33.343499414 +0000 +@@ -0,0 +1,182 @@ ++dnl Sw_64 mpn_lshift -- Shift a number left. ++ ++dnl Copyright 1994, 1995, 2000, 2003, 2009 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C EV4: ? 
++C EV5: 3.25 ++C SW6: 1.75 ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++C cnt r19 ++ ++ ++ASM_START() ++PROLOGUE(mpn_lshift) ++ s8addl r18,r17,r17 C make r17 point at end of s1 ++ ldl r4,-8(r17) C load first limb ++ subl r31,r19,r20 ++ s8addl r18,r16,r16 C make r16 point at end of RES ++ subl r18,1,r18 ++ and r18,4-1,r28 C number of limbs in first loop ++ srl r4,r20,r0 C compute function result ++ ++ beq r28,L(L0) ++ subl r18,r28,r18 ++ ++ ALIGN(8) ++L(top0): ++ ldl r3,-16(r17) ++ subl r16,8,r16 ++ sll r4,r19,r5 ++ subl r17,8,r17 ++ subl r28,1,r28 ++ srl r3,r20,r6 ++ bis r3,r3,r4 ++ bis r5,r6,r8 ++ stl r8,0(r16) ++ bne r28,L(top0) ++ ++L(L0): sll r4,r19,r24 ++ beq r18,L(end) ++C warm up phase 1 ++ ldl r1,-16(r17) ++ subl r18,4,r18 ++ ldl r2,-24(r17) ++ ldl r3,-32(r17) ++ ldl r4,-40(r17) ++C warm up phase 2 ++ srl r1,r20,r7 ++ sll r1,r19,r21 ++ srl r2,r20,r8 ++ beq r18,L(end1) ++ ldl r1,-48(r17) ++ sll r2,r19,r22 ++ ldl r2,-56(r17) ++ srl r3,r20,r5 ++ bis r7,r24,r7 ++ sll r3,r19,r23 ++ bis r8,r21,r8 ++ srl r4,r20,r6 ++ ldl r3,-64(r17) ++ sll r4,r19,r24 ++ ldl r4,-72(r17) ++ subl r18,4,r18 ++ beq r18,L(end2) ++ ALIGN(16) ++C main loop ++L(top): stl r7,-8(r16) ++ bis r5,r22,r5 ++ stl r8,-16(r16) ++ bis r6,r23,r6 ++ ++ srl r1,r20,r7 ++ subl r18,4,r18 ++ sll r1,r19,r21 ++ unop C ldl r31,-96(r17) ++ ++ srl r2,r20,r8 ++ ldl r1,-80(r17) ++ sll r2,r19,r22 ++ ldl r2,-88(r17) ++ ++ stl r5,-24(r16) ++ bis r7,r24,r7 ++ stl r6,-32(r16) ++ bis r8,r21,r8 ++ ++ srl r3,r20,r5 ++ unop C ldl r31,-96(r17) ++ sll r3,r19,r23 ++ subl r16,32,r16 ++ ++ srl r4,r20,r6 ++ ldl r3,-96(r17) ++ sll r4,r19,r24 ++ ldl r4,-104(r17) ++ ++ subl r17,32,r17 ++ bne r18,L(top) ++C cool down phase 2/1 ++L(end2): ++ stl r7,-8(r16) ++ bis r5,r22,r5 ++ stl r8,-16(r16) ++ bis r6,r23,r6 ++ srl r1,r20,r7 ++ sll r1,r19,r21 ++ srl r2,r20,r8 ++ sll r2,r19,r22 ++ stl r5,-24(r16) ++ bis r7,r24,r7 ++ stl r6,-32(r16) ++ bis r8,r21,r8 ++ srl r3,r20,r5 ++ sll r3,r19,r23 ++ srl r4,r20,r6 ++ sll r4,r19,r24 ++C cool down phase 2/2 ++ stl r7,-40(r16) ++ bis r5,r22,r5 ++ stl r8,-48(r16) ++ bis r6,r23,r6 ++ stl r5,-56(r16) ++ stl r6,-64(r16) ++C cool down phase 2/3 ++ stl r24,-72(r16) ++ ret r31,(r26),1 ++ ++C cool down phase 1/1 ++L(end1): ++ sll r2,r19,r22 ++ srl r3,r20,r5 ++ bis r7,r24,r7 ++ sll r3,r19,r23 ++ bis r8,r21,r8 ++ srl r4,r20,r6 ++ sll r4,r19,r24 ++C cool down phase 1/2 ++ stl r7,-8(r16) ++ bis r5,r22,r5 ++ stl r8,-16(r16) ++ bis r6,r23,r6 ++ stl r5,-24(r16) ++ stl r6,-32(r16) ++ stl r24,-40(r16) ++ ret r31,(r26),1 ++ ++L(end): stl r24,-8(r16) ++ ret r31,(r26),1 ++EPILOGUE(mpn_lshift) ++ASM_END() +diff -Naru gmp-6.2.1/mpn/sw_64/mod_34lsub1.asm gmp-6.2.1-sw/mpn/sw_64/mod_34lsub1.asm +--- gmp-6.2.1/mpn/sw_64/mod_34lsub1.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/mod_34lsub1.asm 2022-08-26 07:41:33.343499414 +0000 +@@ -0,0 +1,164 @@ ++dnl Sw_64 mpn_mod_34lsub1. ++ ++dnl Copyright 2002 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. 
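
Stepping back to the mpn_lshift code above: beneath the unrolling and the
warm-up/cool-down phases, the routine computes
rp[i] = (up[i] << cnt) | (up[i-1] >> (64-cnt)) from the top limb down and
returns the bits shifted out of the top.  A plain C reference of those
semantics, a sketch under GMP's entry conditions (1 <= cnt <= 63; the name
ref_lshift is illustrative):

#include <stdint.h>
#include <stddef.h>

/* Shift {up, n} left by cnt bits, store n limbs at rp, and return the
   out-shifted high bits.  High-to-low order, so overlapping rp >= up is
   fine, matching the assembly.  */
static uint64_t
ref_lshift (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;             /* the assembly keeps -cnt in r20;
                                          shifts only use it mod 64 */
  uint64_t retval = up[n - 1] >> tnc;  /* function result, r0 in the code */

  for (size_t i = n - 1; i > 0; i--)
    rp[i] = (up[i] << cnt) | (up[i - 1] >> tnc);
  rp[0] = up[0] << cnt;
  return retval;
}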
++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C EV4: 4 (?) ++C EV5: 2.67 ++C SW6: 1.67 ++ ++ ++dnl INPUT PARAMETERS ++dnl up r16 ++dnl n r17 ++ ++define(`l0',`r18') ++define(`l1',`r19') ++define(`l2',`r20') ++define(`a0',`r21') ++define(`a1',`r22') ++define(`a2',`r23') ++define(`c0',`r24') ++define(`c1',`r5') ++define(`c2',`r6') ++ ++ASM_START() ++PROLOGUE(mpn_mod_34lsub1) ++ bis r31, r31, c0 ++ bis r31, r31, c1 ++ bis r31, r31, c2 ++ ++ ldi r17, -3(r17) ++ bge r17, $L_3_or_more ++ bis r31, r31, a0 ++ bis r31, r31, a1 ++ bis r31, r31, a2 ++ br r31, $L_012 ++ ++$L_3_or_more: ++ ldl a0, 0(r16) ++ ldl a1, 8(r16) ++ ldl a2, 16(r16) ++ ldi r16, 24(r16) ++ ldi r17, -3(r17) ++ blt r17, $L_012 ++ ++$L_6_or_more: ++ ldl l0, 0(r16) ++ ldl l1, 8(r16) ++ ldl l2, 16(r16) ++ addl l0, a0, a0 ++ ++ ldi r16, 24(r16) ++ ldi r17, -3(r17) ++ blt r17, $L_end ++ ++ ALIGN(16) ++C Main loop ++$L_9_or_more: ++$Loop: cmpult a0, l0, r0 ++ ldl l0, 0(r16) ++ addl r0, c0, c0 ++ addl l1, a1, a1 ++ cmpult a1, l1, r0 ++ ldl l1, 8(r16) ++ addl r0, c1, c1 ++ addl l2, a2, a2 ++ cmpult a2, l2, r0 ++ ldl l2, 16(r16) ++ addl r0, c2, c2 ++ addl l0, a0, a0 ++ ldi r16, 24(r16) ++ ldi r17, -3(r17) ++ bge r17, $Loop ++ ++$L_end: cmpult a0, l0, r0 ++ addl r0, c0, c0 ++ addl l1, a1, a1 ++ cmpult a1, l1, r0 ++ addl r0, c1, c1 ++ addl l2, a2, a2 ++ cmpult a2, l2, r0 ++ addl r0, c2, c2 ++ ++C Handle the last (n mod 3) limbs ++$L_012: ldi r17, 2(r17) ++ blt r17, $L_0 ++ ldl l0, 0(r16) ++ addl l0, a0, a0 ++ cmpult a0, l0, r0 ++ addl r0, c0, c0 ++ beq r17, $L_0 ++ ldl l1, 8(r16) ++ addl l1, a1, a1 ++ cmpult a1, l1, r0 ++ addl r0, c1, c1 ++ ++C Align and sum our 3 main accumulators and 3 carry accumulators ++$L_0: srl a0, 48, r2 ++ srl a1, 32, r4 ++ifdef(`HAVE_LIMB_LITTLE_ENDIAN', ++` ins2b a1, 2, r1', C (a1 & 0xffffffff) << 16 ++` zapnot a1, 15, r25 ++ sll r25, 16, r1') ++ zapnot a0, 63, r0 C a0 & 0xffffffffffff ++ srl a2, 16, a1 ++ifdef(`HAVE_LIMB_LITTLE_ENDIAN', ++` ins1b a2, 4, r3', C (a2 & 0xffff) << 32 ++` zapnot a2, 3, r25 ++ sll r25, 32, r3') ++ addl r1, r4, r1 ++ addl r0, r2, r0 ++ srl c0, 32, a2 ++ifdef(`HAVE_LIMB_LITTLE_ENDIAN', ++` ins2b c0, 2, r4', C (c0 & 0xffffffff) << 16 ++` zapnot c0, 15, r25 ++ sll r25, 16, r4') ++ addl r0, r1, r0 ++ addl r3, a1, r3 ++ addl r0, r3, r0 ++ srl c1, 16, c0 ++ifdef(`HAVE_LIMB_LITTLE_ENDIAN', ++` ins1b c1, 4, r2', C (c1 & 0xffff) << 32 ++` zapnot c1, 3, r25 ++ sll r25, 32, r2') ++ addl r4, a2, r4 ++C srl c2, 48, r3 C This will be 0 in practise ++ zapnot c2, 63, r1 C r1 = c2 & 0xffffffffffff ++ addl r0, r4, r0 ++ addl r2, c0, r2 ++ addl r0, r2, r0 ++C addl r1, r3, r1 ++ addl r0, r1, r0 ++ ++ ret r31, (r26), 1 ++EPILOGUE(mpn_mod_34lsub1) ++ASM_END() +diff -Naru gmp-6.2.1/mpn/sw_64/mode1o.asm gmp-6.2.1-sw/mpn/sw_64/mode1o.asm +--- gmp-6.2.1/mpn/sw_64/mode1o.asm 1970-01-01 00:00:00.000000000 +0000 ++++ gmp-6.2.1-sw/mpn/sw_64/mode1o.asm 2022-08-26 07:41:33.353499433 +0000 +@@ -0,0 +1,209 @@ ++dnl Sw_64 mpn_modexact_1c_odd -- mpn exact remainder ++ ++dnl Copyright 2003, 2004 Free Software Foundation, Inc. 
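
A note on what mpn_mod_34lsub1 above computes: a value congruent to {up, n}
modulo 2^48 - 1.  Since 2^48 == 1 and hence 2^64 == 2^16 (mod 2^48 - 1),
limb weights cycle through 2^0, 2^16, 2^32 with period three, which is why
the assembly runs three accumulators plus three carry counters.  A compact C
statement of the same congruence, a sketch only (the unrolling and the
separate carry accumulators that protect very long operands from overflow
are omitted; ref_mod_34lsub1 is an illustrative name):

#include <stdint.h>
#include <stddef.h>

/* Return a value congruent to {up, n} mod 2^48 - 1, not fully reduced.  */
static uint64_t
ref_mod_34lsub1 (const uint64_t *up, size_t n)
{
  uint64_t acc = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned w = 16 * (i % 3);                 /* this limb's weight 2^w */
      acc += (up[i] << w) & 0xFFFFFFFFFFFFULL;   /* low 48-bit field */
      acc += up[i] >> (48 - w);                  /* wrapped-around high field */
    }
  return acc;
}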
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl * the GNU Lesser General Public License as published by the Free
++dnl Software Foundation; either version 3 of the License, or (at your
++dnl option) any later version.
++dnl
++dnl or
++dnl
++dnl * the GNU General Public License as published by the Free Software
++dnl Foundation; either version 2 of the License, or (at your option) any
++dnl later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++
++C cycles/limb
++C EV4: 47
++C EV5: 30
++C SW6: 15
++
++
++C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
++C mp_limb_t c)
++C
++C This code follows the "alternate" code in mpn/generic/mode1o.c,
++C eliminating cbit+climb from the dependent chain. This leaves,
++C
++C ev4 ev5 sw6
++C 1 3 1 subl y = x - h
++C 23 13 7 mull q = y * inverse
++C 23 14 7 umulh h = high (q * d)
++C -- -- --
++C 47 30 15
++C
++C In each case, the load latency, loop control, and extra carry bit handling
++C hide under the multiply latencies. Those latencies are long enough that
++C we don't need to worry about alignment or pairing to squeeze out
++C performance.
++C
++C For the first limb, some of the loop code is broken out and scheduled back
++C since it can be done earlier.
++C
++C - The first ldl src[0] is near the start of the routine, for maximum
++C time from memory.
++C
++C - The subl y=x-climb can be done without waiting for the inverse.
++C
++C - The mull y*inverse is replicated after the final subl for the inverse,
++C instead of branching to the mull in the main loop. On ev4 a branch
++C there would cost cycles, but we can hide them under the mull latency.
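
Before the remaining notes, a plain C statement of the loop being scheduled
here, following the straightforward (non-"alternate") formulation in
mpn/generic/mode1o.c; the assembly splits the carry bit out of this chain as
described above.  inv is the binvert_limb-style inverse, d * inv == 1 mod 2^64
for odd d, and __int128 stands in for umulh (a sketch only, with illustrative
names):

#include <stdint.h>
#include <stddef.h>

/* A zero result means {src, size} == c (mod d); nonzero results are an
   exact-division-style residue, not the ordinary remainder.  */
static uint64_t
ref_modexact_1c_odd (const uint64_t *src, size_t size, uint64_t d,
                     uint64_t inv, uint64_t c)
{
  for (size_t i = 0; i < size; i++)
    {
      uint64_t s = src[i];
      uint64_t l = s - c;        /* the subl of the dependent chain ... */
      uint64_t cy = s < c;       /* ... and its borrow */
      uint64_t q = l * inv;      /* mull */
      c = (uint64_t) (((unsigned __int128) q * d) >> 64) + cy;  /* umulh */
    }
  return c;
}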
++C
++C For the last limb, high<d is tested, and when that holds the high limb
++C can be folded into the final remainder without another umulh.
++
++ASM_START()
++PROLOGUE(mpn_modexact_1c_odd,gp)
++
++ LEA( r0, binvert_limb_table)
++ srl r18, 1, r20 C d >> 1
++
++ and r20, 127, r20 C idx = d>>1 & 0x7F
++
++ addl r0, r20, r21 C table + idx
++
++ifelse(bwx_available_p,1,
++` ldbu r20, 0(r21) C table[idx], inverse 8 bits
++',`
++ ldl_u r20, 0(r21) C table[idx] qword
++ ext0b r20, r21, r20 C table[idx], inverse 8 bits
++')
++
++ mulw r20, r20, r7 C i*i
++ addl r20, r20, r20 C 2*i
++
++ ldl r2, 0(r16) C x = s = src[0]
++ ldi r17, -1(r17) C size--
++ clr r0 C initial cbit=0
++
++ mulw r7, r18, r7 C i*i*d
++
++ subl r20, r7, r20 C 2*i-i*i*d, inverse 16 bits
++
++ mulw r20, r20, r7 C i*i
++ addl r20, r20, r20 C 2*i
++
++ mulw r7, r18, r7 C i*i*d
++
++ subl r20, r7, r20 C 2*i-i*i*d, inverse 32 bits
++
++ mull r20, r20, r7 C i*i
++ addl r20, r20, r20 C 2*i
++
++ mull r7, r18, r7 C i*i*d
++ subl r2, r19, r3 C y = x - climb
++
++ subl r20, r7, r20 C inv = 2*i-i*i*d, inverse 64 bits
++
++ASSERT(r7, C should have d*inv==1 mod 2^64
++` mull r18, r20, r7
++ cmpeq r7, 1, r7')
++
++ mull r3, r20, r4 C first q = y * inv
++
++ beq r17, L(one) C if size==1
++ br L(entry)
++
++
++L(top):
++ C r0 cbit
++ C r16 src, incrementing
++ C r17 size, decrementing
++ C r18 d
++ C r19 climb
++ C r20 inv
++
++ ldl r1, 0(r16) C s = src[i]
++ subl r1, r0, r2 C x = s - cbit
++ cmpult r1, r0, r0 C new cbit = s < cbit
++
++ subl r2, r19, r3 C y = x - climb
++
++ mull r3, r20, r4 C q = y * inv
++L(entry):
++ cmpult r2, r19, r5 C cbit2 = x < climb
++ addl r5, r0, r0 C cbit += cbit2
++ ldi r16, 8(r16) C src++
++ ldi r17, -1(r17) C size--
++
++ umulh r4, r18, r19 C climb = q * d
++ bne r17, L(top) C while 2 or more limbs left
++
++
++
++ C r0 cbit
++ C r18 d
++ C r19 climb
++ C r20 inv
++
++ ldl r1, 0(r16) C s = src[size-1] high limb
++
++ cmpult r1, r18, r2 C test high