diff --git a/0001-Sw64-Port-add-configure-support-for-sw64.patch b/0001-Sw64-Port-add-configure-support-for-sw64.patch new file mode 100644 index 0000000000000000000000000000000000000000..c553c4b11ee866199737520b73829c31db2299cc --- /dev/null +++ b/0001-Sw64-Port-add-configure-support-for-sw64.patch @@ -0,0 +1,592 @@ +From 8e8cbe70fbf759e3a4ebaa036ce782953507453b Mon Sep 17 00:00:00 2001 +From: swcompiler +Date: Tue, 22 Jul 2025 13:44:17 +0800 +Subject: [PATCH 1/3] Sw64-Port-add-configure-support-for-sw64 + +--- + config.guess | 36 ++++++++++++++++ + config.in | 4 ++ + configfsf.guess | 11 +++++ + configfsf.sub | 1 + + configure | 99 +++++++++++++++++++++++++++++++++++++++++- + configure.ac | 102 +++++++++++++++++++++++++++++++++++++++++++- + extract-dbl.c | 2 +- + gmp-impl.h | 9 +++- + longlong.h | 90 ++++++++++++++++++++++++++++++++++++++ + mpn/Makefile.am | 2 +- + mpn/Makefile.in | 2 +- + mpn/generic/get_d.c | 2 +- + tune/Makefile.am | 2 +- + tune/Makefile.in | 2 +- + 14 files changed, 354 insertions(+), 10 deletions(-) + +diff --git a/config.guess b/config.guess +index 6a7f141..6299029 100755 +--- a/config.guess ++++ b/config.guess +@@ -784,6 +784,42 @@ sparc-*-* | sparc64-*-*) + fi + ;; + ++sw_64-*-*) ++ eval $set_cc_for_build ++ cat <${dummy}0.s ++ .data ++Lformat: ++ .byte 37,100,45,37,120,10,0 # "%d-%x\n" ++ .text ++ .globl main ++ .align 4 ++ .ent main ++main: ++ .frame \$30,16,\$26,0 ++ ldgp \$29,0(\$27) ++ .prologue 1 ++ .long 0x47e03d91 # implver \$17 ++ ldi \$2,-1 ++ .long 0x47e20c21 # amask \$2,\$1 ++ ldi \$16,Lformat ++ not \$1,\$18 ++ call \$26,printf ++ ldgp \$29,0(\$26) ++ mov 0,\$16 ++ call \$26,exit ++ .end main ++EOF ++ $CC_FOR_BUILD ${dummy}0.s -o $dummy 2>/dev/null ++ if test "$?" = 0 ; then ++ case `$dummy` in ++ 0-0) exact_cpu=sw_64 ;; ++ 1-0) exact_cpu=sw_64sw6 ;; ++ 3-0) exact_cpu=sw_64sw8a ;; ++ 2-307) exact_cpu=sw_64sw6a ;; ++ 2-1307) exact_cpu=sw_64sw6b ;; ++ esac ++ fi ++ ;; + + # Recognise x86 processors using a tricky cpuid with 4 arguments, repeating + # arguments; for x86-64 we effectively pass the 1st in rdx and the 2nd in rcx. +diff --git a/config.in b/config.in +index ee1ef8c..076bed0 100644 +--- a/config.in ++++ b/config.in +@@ -146,6 +146,7 @@ see https://www.gnu.org/licenses/. + If your CPU is not in any of these families, leave all undefined. + For an AMD64 chip, define "x86" in ABI=32, but not in ABI=64. */ + #undef HAVE_HOST_CPU_FAMILY_alpha ++#undef HAVE_HOST_CPU_FAMILY_sw_64 + #undef HAVE_HOST_CPU_FAMILY_m68k + #undef HAVE_HOST_CPU_FAMILY_power + #undef HAVE_HOST_CPU_FAMILY_powerpc +@@ -157,6 +158,9 @@ see https://www.gnu.org/licenses/. + #undef HAVE_HOST_CPU_alphaev67 + #undef HAVE_HOST_CPU_alphaev68 + #undef HAVE_HOST_CPU_alphaev7 ++#undef HAVE_HOST_CPU_sw_64sw6a ++#undef HAVE_HOST_CPU_sw_64sw6b ++#undef HAVE_HOST_CPU_sw_64sw8a + #undef HAVE_HOST_CPU_m68020 + #undef HAVE_HOST_CPU_m68030 + #undef HAVE_HOST_CPU_m68040 +diff --git a/configfsf.guess b/configfsf.guess +index 354a8cc..7d31475 100644 +--- a/configfsf.guess ++++ b/configfsf.guess +@@ -1149,6 +1149,17 @@ EOF + sparc:Linux:*:* | sparc64:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; ++ sw_64:Linux:*:*) ++ case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' /proc/cpuinfo 2>/dev/null` in ++ SW6) UNAME_MACHINE=sw_64sw6 ;; ++ SW6A) UNAME_MACHINE=sw_64sw6a ;; ++ SW6B) UNAME_MACHINE=sw_64sw6b ;; ++ SW8A) UNAME_MACHINE=sw_64sw8a ;; ++ esac ++ objdump --private-headers /bin/sh | grep -q ld.so.1 ++ if test "$?" 
= 0 ; then LIBC=gnulibc1 ; fi ++ echo "$UNAME_MACHINE"-unknown-linux-"$LIBC" ++ exit ;; + tile*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; +diff --git a/configfsf.sub b/configfsf.sub +index 9865d6e..06ecbea 100644 +--- a/configfsf.sub ++++ b/configfsf.sub +@@ -1268,6 +1268,7 @@ case $cpu-$vendor in + | sparclite \ + | sparcv8 | sparcv9 | sparcv9b | sparcv9v | sv1 | sx* \ + | spu \ ++ | sw_64 | sw_64sw6 | sw_64sw6a | sw_64sw6b | sw_64sw8a \ + | tahoe \ + | thumbv7* \ + | tic30 | tic4x | tic54x | tic55x | tic6x | tic80 \ +diff --git a/configure b/configure +index 7910aa0..c943538 100755 +--- a/configure ++++ b/configure +@@ -4235,6 +4235,103 @@ echo "include_mpn(\`alpha/default.m4')" >> $gmp_tmpconfigm4i + esac + ;; + ++#__sw_64__ start ++ sw_64*-*-*) ++ $as_echo "#define HAVE_HOST_CPU_FAMILY_sw_64 1" >>confdefs.h ++ ++ case $host_cpu in ++ sw_64sw6a | sw_64sw6b | sw_64sw8a) ++ path="sw_64/sw6a sw_64/sw6b sw_64/sw8a sw_64" ;; ++ sw_64sw6) ++ path="sw_64/sw6 sw_64" ;; ++ *) ++ path="sw_64" ;; ++ esac ++ if test "$enable_assembly" = "yes" ; then ++ extra_functions="cntlz" ++ fi ++ gcc_cflags_optlist="asm cpu oldas" # need asm ahead of cpu, see below ++ gcc_cflags_maybe="-mieee" ++ gcc_cflags_oldas="-Wa,-oldas" # see GMP_GCC_WA_OLDAS. ++ ++ case $host_cpu in ++ sw_64) gcc_cflags_cpu="-mcpu=sw6" ;; ++ sw_64sw8a) gcc_cflags_cpu="-mcpu=sw8a" ;; ++ sw_64sw6) gcc_cflags_cpu="-mcpu=sw6 -mcpu=sw6a" ;; ++ sw_64sw6b | sw_64sw6a) ++ gcc_cflags_cpu="-mcpu=sw6a -mcpu=sw6b" ;; ++ esac ++ ++# gcc version "2.9-gnupro-99r1" on sw_64-dec-osf5.1 has been seen ++ # not putting the assembler in the right mode ++ # for what it produces. We need to do this for it, and need to do it ++ # before testing the -mcpu options. ++ # ++ # OSF `as' accepts sw_64. -arch only seems ++ # to affect insns like ldbu which are expanded as macros when necessary. ++ # Insns like ctlz which were never available as macros are always ++ # accepted and always generate their plain code. ++ # ++ case $host_cpu in ++ sw_64) gcc_cflags_asm="-Wa,-arch,sw6 -Wa,-msw6" ;; ++ sw_64sw6) gcc_cflags_asm="-Wa,-arch,sw6 -Wa,-msw6" ;; ++ sw_64sw8a) gcc_cflags_asm="-Wa,-msw8a -Wa,-arch,sw8a" ;; ++ sw_64sw6a | sw_64sw6b) ++ gcc_cflags_asm=" -Wa,-msw6a -Wa,-arch,sw6b" ;; ++ esac ++ ++ # It might be better to ask "cc" whether it's Cray C or DEC C, ++ # instead of relying on the OS part of $host. But it's hard to ++ # imagine either of those compilers anywhere except their native ++ # systems. ++ # ++ ++echo "include_mpn(\`sw_64/sw_64-defs.m4')" >> $gmp_tmpconfigm4i ++ ++ case $host in ++ *-cray-unicos*) ++ cc_cflags="-O" # no -g, it silently disables all optimizations ++ ++echo "include_mpn(\`sw_64/unicos.m4')" >> $gmp_tmpconfigm4i ++ ++ # Don't perform any assembly syntax tests on this beast. 
++ gmp_asm_syntax_testing=no ++ ;; ++ *-*-osf*) ++ ++echo "include_mpn(\`sw_64/default.m4')" >> $gmp_tmpconfigm4i ++ ++ cc_cflags="" ++ cc_cflags_optlist="opt cpu" ++ ++ # not sure if -fast works on old versions, so make it optional ++ cc_cflags_opt="-fast -O2" ++ ++ case $host_cpu in ++ sw_64) cc_cflags_cpu="-arch~sw6~-tune~sw6" ;; ++ sw_64sw6) cc_cflags_cpu="-arch~sw6~-tune~sw6" ;; ++ sw_64sw8a) cc_cflags_cpu="-arch~sw8a~-tune~sw8a" ;; ++ sw_64sw6a | sw_64sw6b) ++ cc_cflags_cpu="-arch~sw6a~-tune~sw6a -arch~sw6b~-tune~sw6b" ;; ++ esac ++ ;; ++ *) ++ ++echo "include_mpn(\`sw_64/default.m4')" >> $gmp_tmpconfigm4i ++ ++ ;; ++ esac ++ ++ case $host in ++ *-*-unicos*) ++ # tune/sw_64.asm assumes int==4bytes but unicos uses int==8bytes ++ ;; ++ *) ++ SPEED_CYCLECOUNTER_OBJ=sw_64.lo ++ cyclecounter_size=1 ;; ++ esac ++ ;; ++#__sw_64__ end + + # Cray vector machines. + # This must come after alpha* so that we can recognize present and future +@@ -7515,7 +7612,7 @@ fi + ;; + -Wa,-m*) + case $host in +- alpha*-*-*) ++ alpha*-*-* | sw_64*-*-*) + { $as_echo "$as_me:${as_lineno-$LINENO}: checking assembler $cc $cflags $flag" >&5 + $as_echo_n "checking assembler $cc $cflags $flag... " >&6; } + result=no +diff --git a/configure.ac b/configure.ac +index 082077b..4de2e25 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -296,6 +296,7 @@ AH_VERBATIM([HAVE_HOST_CPU_1], + #undef HAVE_HOST_CPU_FAMILY_m68k + #undef HAVE_HOST_CPU_FAMILY_power + #undef HAVE_HOST_CPU_FAMILY_powerpc ++#undef HAVE_HOST_CPU_FAMILY_sw_64 + #undef HAVE_HOST_CPU_FAMILY_x86 + #undef HAVE_HOST_CPU_FAMILY_x86_64 + +@@ -351,6 +352,10 @@ AH_VERBATIM([HAVE_HOST_CPU_1], + #undef HAVE_HOST_CPU_s390_z13 + #undef HAVE_HOST_CPU_s390_z14 + #undef HAVE_HOST_CPU_s390_z15 ++#undef HAVE_HOST_CPU_sw_64sw6a ++#undef HAVE_HOST_CPU_sw_64sw6b ++#undef HAVE_HOST_CPU_sw_64sw8a ++#undef HAVE_HOST_CPU_sw_64sw6 + + /* Define to 1 iff we have a s390 with 64-bit registers. */ + #undef HAVE_HOST_CPU_s390_zarch]) +@@ -615,6 +620,101 @@ case $host in + path="cray" + ;; + ++#__sw_64__ start ++ sw_64*-*-*) ++ AC_DEFINE(HAVE_HOST_CPU_FAMILY_sw_64) ++ case $host_cpu in ++ sw_64sw6a | sw_64sw6b | sw_64sw8a | sw_64sw6*) ++ path="sw_64/sw6a sw_64/sw6b sw_64/sw8a sw_64" ;; ++ sw_64sw6) ++ path="sw_64/sw6 sw_64" ;; ++ *) ++ path="sw_64" ;; ++ esac ++ if test "$enable_assembly" = "yes" ; then ++ extra_functions="cntlz" ++ fi ++ gcc_cflags_optlist="asm cpu oldas" # need asm ahead of cpu, see below ++ gcc_cflags_maybe="-mieee" ++ gcc_cflags_oldas="-Wa,-oldas" # see GMP_GCC_WA_OLDAS. ++ ++ # compiler errors too easily and is rejected by GMP_PROG_CC_WORKS. Each ++ # -mcpu=sw6 below has a fallback to -mcpu=sw_64 for this reason. ++ # ++ case $host_cpu in ++ sw_64) gcc_cflags_cpu="-mcpu=sw6" ;; ++ sw_64sw6) gcc_cflags_cpu="-mcpu=sw6 " ;; ++ sw_64sw8a) gcc_cflags_cpu="-mcpu=sw8a" ;; ++ sw_64sw6a | sw_64sw6b) ++ gcc_cflags_cpu="-mcpu=sw6a -mcpu=sw6b" ;; ++ esac ++ ++# gcc version "2.9-gnupro-99r1" on sw_64 has been seen ++ # accepting -mcpu=sw6, but not putting the assembler in the right mode ++ # for what it produces. We need to do this for it, and need to do it ++ # before testing the -mcpu options. ++ # ++ # On old versions of gcc, which don't know -mcpu=, we believe an ++ # explicit etc will be necessary to put the assembler in ++ # the right mode for our .asm files and longlong.h asm blocks. ++ # ++ # On newer versions of gcc, when -mcpu= is known, we must give a -Wa ++ # which is at least as high as the code gcc will generate. 
gcc ++ # establishes what it needs with a ".arch" directive, our command line ++ # option seems to override that. ++ # ++ case $host_cpu in ++ sw_64) gcc_cflags_asm="-Wa,-arch,sw6 -Wa,-msw6" ;; ++ sw_64sw6) gcc_cflags_asm="-Wa,-arch,sw6 -Wa,-msw6" ;; ++ sw_64sw8a) gcc_cflags_asm="-Wa,-arch,sw8a -Wa,-msw8a" ;; ++ sw_64sw6a | sw_64sw6b) ++ gcc_cflags_asm="-Wa,-arch,sw6a -Wa,-msw6a -Wa,-arch,sw6b -Wa,-msw6b" ;; ++ esac ++ ++ # It might be better to ask "cc" whether it's Cray C or DEC C, ++ # instead of relying on the OS part of $host. But it's hard to ++ # imagine either of those compilers anywhere except their native ++ # systems. ++ # ++ GMP_INCLUDE_MPN(sw_64/sw_64-defs.m4) ++ case $host in ++ *-cray-unicos*) ++ cc_cflags="-O" # no -g, it silently disables all optimizations ++ GMP_INCLUDE_MPN(sw_64/unicos.m4) ++ # Don't perform any assembly syntax tests on this beast. ++ gmp_asm_syntax_testing=no ++ ;; ++ *-*-osf*) ++ GMP_INCLUDE_MPN(sw_64/default.m4) ++ cc_cflags="" ++ cc_cflags_optlist="opt cpu" ++ ++ # not sure if -fast works on old versions, so make it optional ++ cc_cflags_opt="-fast -O2" ++ ++ case $host_cpu in ++ sw_64) cc_cflags_cpu="-arch~sw6~-tune~sw6" ;; ++ sw_64sw6) cc_cflags_cpu="-arch~sw6~-tune~sw6" ;; ++ sw_64sw8a) cc_cflags_cpu="-arch~sw8a~-tune~sw8a" ;; ++ sw_64sw6a | sw_64sw6b) ++ cc_cflags_cpu="-arch~sw6a~-tune~sw6a -arch~sw6b~-tune~sw6b" ;; ++ esac ++ ;; ++ *) ++ GMP_INCLUDE_MPN(sw_64/default.m4) ++ ;; ++ esac ++ ++ case $host in ++ *-*-unicos*) ++ # tune/sw_64.asm assumes int==4bytes but unicos uses int==8bytes ++ ;; ++ *) ++ SPEED_CYCLECOUNTER_OBJ=sw_64.lo ++ cyclecounter_size=1 ;; ++ esac ++ ;; ++#__sw_64__ end + + arm*-*-* | aarch64*-*-* | [applem[1-9]-*-*]) + abilist="32" +@@ -2428,7 +2528,7 @@ if test $found_compiler = yes; then + ;; + -Wa,-m*) + case $host in +- alpha*-*-*) ++ alpha*-*-* | sw_64*-*-*) + GMP_GCC_WA_MCPU($cc $cflags, $flag, , [continue]) + ;; + esac +diff --git a/extract-dbl.c b/extract-dbl.c +index e44d6fa..434a7af 100644 +--- a/extract-dbl.c ++++ b/extract-dbl.c +@@ -71,7 +71,7 @@ __gmp_extract_double (mp_ptr rp, double d) + + #if _GMP_IEEE_FLOATS + { +-#if defined (__alpha) && __GNUC__ == 2 && __GNUC_MINOR__ == 8 ++#if (defined (__alpha) || defined (__sw_64)) && __GNUC__ == 2 && __GNUC_MINOR__ == 8 + /* Work around alpha-specific bug in GCC 2.8.x. */ + volatile + #endif +diff --git a/gmp-impl.h b/gmp-impl.h +index 2615af7..6e6682d 100644 +--- a/gmp-impl.h ++++ b/gmp-impl.h +@@ -320,6 +320,11 @@ typedef struct {mp_limb_t inv21, inv32, inv53;} gmp_pi2_t; + #define HAVE_HOST_CPU_alpha_CIX 1 + #endif + ++#if HAVE_HOST_CPU_sw_64sw6a || HAVE_HOST_CPU_sw_64sw6b || HAVE_HOST_CPU_sw_64sw8a \ ++ || HAVE_HOST_CPU_sw_64sw6 || HAVE_HOST_CPU_sw_64 ++#define HAVE_HOST_CPU_sw_64_CIX 1 ++#endif ++ + + #if defined (__cplusplus) + extern "C" { +@@ -3444,7 +3449,7 @@ __GMP_DECLSPEC extern const unsigned char binvert_limb_table[128]; + to 0 if there's an even number. "n" should be an unsigned long and "p" + an int. */ + +-#if defined (__GNUC__) && ! defined (NO_ASM) && HAVE_HOST_CPU_alpha_CIX ++#if defined (__GNUC__) && ! defined (NO_ASM) && (HAVE_HOST_CPU_alpha_CIX || HAVE_HOST_CPU_sw_64_CIX) + #define ULONG_PARITY(p, n) \ + do { \ + int __p; \ +@@ -3726,7 +3731,7 @@ __GMP_DECLSPEC extern const unsigned char binvert_limb_table[128]; + #endif + #endif + +-#if defined (__GNUC__) && ! defined (NO_ASM) && HAVE_HOST_CPU_alpha_CIX ++#if defined (__GNUC__) && ! 
defined (NO_ASM) && (HAVE_HOST_CPU_alpha_CIX || HAVE_HOST_CPU_sw_64_CIX) + #define popc_limb(result, input) \ + do { \ + __asm__ ("ctpop %1, %0" : "=r" (result) : "r" (input)); \ +diff --git a/longlong.h b/longlong.h +index be1c3cb..3287f0e 100644 +--- a/longlong.h ++++ b/longlong.h +@@ -270,6 +270,96 @@ long __MPN(count_leading_zeros) (UDItype); + #endif /* clz using mpn */ + #endif /* __alpha */ + ++//__sw_64 start ++#if defined (__sw_64) && W_TYPE_SIZE == 64 ++/* Most sw_64-based machines, except Cray systems. */ ++#if defined (__GNUC__) ++#if __GMP_GNUC_PREREQ (3,3) ++#define umul_ppmm(ph, pl, m0, m1) \ ++ do { \ ++ UDItype __m0 = (m0), __m1 = (m1); \ ++ (ph) = __builtin_sw_64_umulh (__m0, __m1); \ ++ (pl) = __m0 * __m1; \ ++ } while (0) ++#else ++#define umul_ppmm(ph, pl, m0, m1) \ ++ do { \ ++ UDItype __m0 = (m0), __m1 = (m1); \ ++ __asm__ ("umulh %r1,%2,%0" \ ++ : "=r" (ph) \ ++ : "%rJ" (__m0), "rI" (__m1)); \ ++ (pl) = __m0 * __m1; \ ++ } while (0) ++#endif ++#define UMUL_TIME 18 ++#else /* ! __GNUC__ */ ++#include ++#define umul_ppmm(ph, pl, m0, m1) \ ++ do { \ ++ UDItype __m0 = (m0), __m1 = (m1); \ ++ (ph) = __UMULH (__m0, __m1); \ ++ (pl) = __m0 * __m1; \ ++ } while (0) ++#endif ++#ifndef LONGLONG_STANDALONE ++#define udiv_qrnnd(q, r, n1, n0, d) \ ++ do { UWtype __di; \ ++ __di = __MPN(invert_limb) (d); \ ++ udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \ ++ } while (0) ++#define UDIV_PREINV_ALWAYS 1 ++#define UDIV_NEEDS_NORMALIZATION 1 ++#endif /* LONGLONG_STANDALONE */ ++ ++/* clz_tab is required in all configurations, since mpn/sw_64/cntlz.asm ++ always goes into libgmp.so, even when not actually used. */ ++#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB ++ ++#if defined (__GNUC__) && HAVE_HOST_CPU_sw_64_CIX ++#define count_leading_zeros(COUNT,X) \ ++ __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X)) ++#define count_trailing_zeros(COUNT,X) \ ++ __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X)) ++#endif /* clz/ctz using cix */ ++ ++#if ! defined (count_leading_zeros) \ ++ && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE) ++/* SW_64_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0. ++ "$31" is written explicitly in the asm, since an "r" constraint won't ++ select reg 31. There seems no need to worry about "r31" syntax for cray, ++ since gcc itself (pre-release 3.4) emits just $31 in various places. */ ++#define SW_64_CMPBGE_0(dst, src) \ ++ do { asm ("cmpgeb $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0) ++/* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts ++ them, locating the highest non-zero byte. A second __clz_tab lookup ++ counts the leading zero bits in that byte, giving the result. */ ++#define count_leading_zeros(count, x) \ ++ do { \ ++ UWtype __clz__b, __clz__c, __clz__x = (x); \ ++ SW_64_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \ ++ __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \ ++ __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \ ++ __clz__x >>= __clz__b; \ ++ __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \ ++ __clz__b = 65 - __clz__b; \ ++ (count) = __clz__b - __clz__c; \ ++ } while (0) ++#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB ++#endif /* clz using cmpbge */ ++ ++#if ! defined (count_leading_zeros) && ! 
defined (LONGLONG_STANDALONE) ++#if HAVE_ATTRIBUTE_CONST ++long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const)); ++#else ++long __MPN(count_leading_zeros) (UDItype); ++#endif ++#define count_leading_zeros(count, x) \ ++ ((count) = __MPN(count_leading_zeros) (x)) ++#endif /* clz using mpn */ ++#endif ++//__sw_64 end ++ ++ + #if defined (__AVR) && W_TYPE_SIZE == 8 + #define umul_ppmm(ph, pl, m0, m1) \ + do { \ +diff --git a/mpn/Makefile.am b/mpn/Makefile.am +index c61926d..acca2d7 100644 +--- a/mpn/Makefile.am ++++ b/mpn/Makefile.am +@@ -41,7 +41,7 @@ libmpn_la_DEPENDENCIES = $(OFILES) + + TARG_DIST = alpha arm arm64 cray generic ia64 lisp loongarch m68k m88k \ + minithres mips32 mips64 pa32 pa64 power powerpc32 powerpc64 \ +- riscv s390_32 s390_64 sh sparc32 sparc64 thumb vax x86 x86_64 ++ riscv s390_32 s390_64 sh sparc32 sparc64 sw_64 thumb vax x86 x86_64 + + EXTRA_DIST = asm-defs.m4 cpp-ccas m4-ccas $(TARG_DIST) + +diff --git a/mpn/Makefile.in b/mpn/Makefile.in +index b5df4e5..6641d1f 100644 +--- a/mpn/Makefile.in ++++ b/mpn/Makefile.in +@@ -395,7 +395,7 @@ libmpn_la_LIBADD = $(OFILES) + libmpn_la_DEPENDENCIES = $(OFILES) + TARG_DIST = alpha arm arm64 cray generic ia64 lisp loongarch m68k m88k \ + minithres mips32 mips64 pa32 pa64 power powerpc32 powerpc64 \ +- riscv s390_32 s390_64 sh sparc32 sparc64 thumb vax x86 x86_64 ++ riscv s390_32 s390_64 sh sparc32 sparc64 sw_64 thumb vax x86 x86_64 + + EXTRA_DIST = asm-defs.m4 cpp-ccas m4-ccas $(TARG_DIST) + +diff --git a/mpn/generic/get_d.c b/mpn/generic/get_d.c +index 8bef128..10f0dee 100644 +--- a/mpn/generic/get_d.c ++++ b/mpn/generic/get_d.c +@@ -61,7 +61,7 @@ see https://www.gnu.org/licenses/. */ + Bizarrely, this happens also with Cray cc on alphaev5-cray-unicosmk2.0.6.X, + and has the same solution. Don't know why or how. */ + +-#if HAVE_HOST_CPU_FAMILY_alpha \ ++#if HAVE_HOST_CPU_FAMILY_alpha || HAVE_HOST_CPU_FAMILY_sw_64 \ + && ((defined (__GNUC__) && ! 
__GMP_GNUC_PREREQ(3,4)) \ + || defined (_CRAY)) + static volatile const long CONST_1024 = 1024; +diff --git a/tune/Makefile.am b/tune/Makefile.am +index 0f564ed..d2852af 100644 +--- a/tune/Makefile.am ++++ b/tune/Makefile.am +@@ -33,7 +33,7 @@ AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/tests + AM_LDFLAGS = -no-install + + EXTRA_DIST = alpha.asm pentium.asm sparcv9.asm hppa.asm hppa2.asm hppa2w.asm \ +- ia64.asm powerpc.asm powerpc64.asm x86_64.asm many.pl ++ ia64.asm powerpc.asm powerpc64.asm sw_64.asm x86_64.asm many.pl + noinst_HEADERS = speed.h + + # Prefer -static on the speed and tune programs, since that can avoid +diff --git a/tune/Makefile.in b/tune/Makefile.in +index 7db531a..7e24a39 100644 +--- a/tune/Makefile.in ++++ b/tune/Makefile.in +@@ -460,7 +460,7 @@ top_srcdir = @top_srcdir@ + AM_CPPFLAGS = -I$(top_srcdir) -I$(top_srcdir)/tests + AM_LDFLAGS = -no-install + EXTRA_DIST = alpha.asm pentium.asm sparcv9.asm hppa.asm hppa2.asm hppa2w.asm \ +- ia64.asm powerpc.asm powerpc64.asm x86_64.asm many.pl ++ ia64.asm powerpc.asm powerpc64.asm sw_64.asm x86_64.asm many.pl + + noinst_HEADERS = speed.h + @ENABLE_STATIC_FALSE@STATIC = +-- +2.25.1 + diff --git a/0002-Sw64-Port-add-mpn-configure-support-for-sw64.patch b/0002-Sw64-Port-add-mpn-configure-support-for-sw64.patch new file mode 100644 index 0000000000000000000000000000000000000000..f5918661caae97f4fa4f2a0439b137f5b946aff1 --- /dev/null +++ b/0002-Sw64-Port-add-mpn-configure-support-for-sw64.patch @@ -0,0 +1,6399 @@ +From 51d4c7e1bf74232eac185b1e3465457397a1fe30 Mon Sep 17 00:00:00 2001 +From: swcompiler +Date: Tue, 22 Jul 2025 13:50:16 +0800 +Subject: [PATCH 2/3] Sw64-Port-add-mpn-configure-support-for-sw64 + +--- + mpn/sw_64/README | 97 ++++++ + mpn/sw_64/default.m4 | 127 ++++++++ + mpn/sw_64/gmp-mparam.h | 86 ++++++ + mpn/sw_64/sw6/add_n.asm | 281 +++++++++++++++++ + mpn/sw_64/sw6/aorslsh1_n.asm | 168 +++++++++++ + mpn/sw_64/sw6/aorsmul_1.asm | 396 ++++++++++++++++++++++++ + mpn/sw_64/sw6/gmp-mparam.h | 209 +++++++++++++ + mpn/sw_64/sw6/mod_1_4.asm | 333 +++++++++++++++++++++ + mpn/sw_64/sw6/mul_1.asm | 496 +++++++++++++++++++++++++++++++ + mpn/sw_64/sw6/nails/README | 65 ++++ + mpn/sw_64/sw6/nails/addmul_1.asm | 394 ++++++++++++++++++++++++ + mpn/sw_64/sw6/nails/addmul_2.asm | 146 +++++++++ + mpn/sw_64/sw6/nails/addmul_3.asm | 169 +++++++++++ + mpn/sw_64/sw6/nails/addmul_4.asm | 210 +++++++++++++ + mpn/sw_64/sw6/nails/aors_n.asm | 233 +++++++++++++++ + mpn/sw_64/sw6/nails/gmp-mparam.h | 72 +++++ + mpn/sw_64/sw6/nails/mul_1.asm | 362 ++++++++++++++++++++++ + mpn/sw_64/sw6/nails/submul_1.asm | 394 ++++++++++++++++++++++++ + mpn/sw_64/sw6/slot.pl | 318 ++++++++++++++++++++ + mpn/sw_64/sw6/sub_n.asm | 281 +++++++++++++++++ + mpn/sw_64/sw6a/gcd_1.asm | 145 +++++++++ + mpn/sw_64/sw6a/hamdist.asm | 111 +++++++ + mpn/sw_64/sw6a/popcount.asm | 101 +++++++ + mpn/sw_64/sw6b/gcd_1.asm | 145 +++++++++ + mpn/sw_64/sw6b/hamdist.asm | 111 +++++++ + mpn/sw_64/sw6b/popcount.asm | 101 +++++++ + mpn/sw_64/sw8a/gcd_1.asm | 145 +++++++++ + mpn/sw_64/sw8a/hamdist.asm | 111 +++++++ + mpn/sw_64/sw8a/popcount.asm | 101 +++++++ + mpn/sw_64/sw_64-defs.m4 | 101 +++++++ + mpn/sw_64/unicos.m4 | 131 ++++++++ + 31 files changed, 6140 insertions(+) + create mode 100644 mpn/sw_64/README + create mode 100644 mpn/sw_64/default.m4 + create mode 100644 mpn/sw_64/gmp-mparam.h + create mode 100644 mpn/sw_64/sw6/add_n.asm + create mode 100644 mpn/sw_64/sw6/aorslsh1_n.asm + create mode 100644 mpn/sw_64/sw6/aorsmul_1.asm + create mode 100644 mpn/sw_64/sw6/gmp-mparam.h + 
create mode 100644 mpn/sw_64/sw6/mod_1_4.asm + create mode 100644 mpn/sw_64/sw6/mul_1.asm + create mode 100644 mpn/sw_64/sw6/nails/README + create mode 100644 mpn/sw_64/sw6/nails/addmul_1.asm + create mode 100644 mpn/sw_64/sw6/nails/addmul_2.asm + create mode 100644 mpn/sw_64/sw6/nails/addmul_3.asm + create mode 100644 mpn/sw_64/sw6/nails/addmul_4.asm + create mode 100644 mpn/sw_64/sw6/nails/aors_n.asm + create mode 100644 mpn/sw_64/sw6/nails/gmp-mparam.h + create mode 100644 mpn/sw_64/sw6/nails/mul_1.asm + create mode 100644 mpn/sw_64/sw6/nails/submul_1.asm + create mode 100755 mpn/sw_64/sw6/slot.pl + create mode 100644 mpn/sw_64/sw6/sub_n.asm + create mode 100644 mpn/sw_64/sw6a/gcd_1.asm + create mode 100644 mpn/sw_64/sw6a/hamdist.asm + create mode 100644 mpn/sw_64/sw6a/popcount.asm + create mode 100644 mpn/sw_64/sw6b/gcd_1.asm + create mode 100644 mpn/sw_64/sw6b/hamdist.asm + create mode 100644 mpn/sw_64/sw6b/popcount.asm + create mode 100644 mpn/sw_64/sw8a/gcd_1.asm + create mode 100644 mpn/sw_64/sw8a/hamdist.asm + create mode 100644 mpn/sw_64/sw8a/popcount.asm + create mode 100644 mpn/sw_64/sw_64-defs.m4 + create mode 100644 mpn/sw_64/unicos.m4 + +diff --git a/mpn/sw_64/README b/mpn/sw_64/README +new file mode 100644 +index 0000000..5557835 +--- /dev/null ++++ b/mpn/sw_64/README +@@ -0,0 +1,97 @@ ++Copyright 1996, 1997, 1999-2005 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. ++ ++ ++ ++ ++ ++This directory contains mpn functions optimized for DEC Sw_64 processors. ++ ++SW_64 ASSEMBLY RULES AND REGULATIONS ++ ++The `.prologue N' pseudo op marks the end of instruction that needs special ++handling by unwinding. It also says whether $27 is really needed for computing ++the gp. The `.mask M' pseudo op says which registers are saved on the stack, ++and at what offset in the frame. ++ ++Cray T3 code is very very different... ++ ++"$6" / "$f6" etc is the usual syntax for registers, but on Unicos instead "r6" ++/ "f6" is required. We use the "r6" / "f6" forms, and have m4 defines expand ++them to "$6" or "$f6" where necessary. ++ ++"0x" introduces a hex constant in gas and DEC as, but on Unicos "^X" is ++required. The X() macro accommodates this difference. ++ ++"cvttqc" is required by DEC as, "cvttq/c" is required by Unicos, and gas will ++accept either. We use cvttqc and have an m4 define expand to cvttq/c where ++necessary. ++ ++"not" as an alias for "ornot r31, ..." is available in gas and DEC as, but not ++the Unicos assembler. The full "ornot" must be used. ++ ++"unop" is not available in Unicos. 
We make an m4 define to the usual "ldl_u ++r31,0(r30)", and in fact use that define on all systems since it comes out the ++same. ++ ++"!literal!123" etc explicit relocations as per Tru64 4.0 are apparently not ++available in older sw_64 assemblers (including gas prior to 2.12), according to ++the GCC manual, so the assembler macro forms must be used (eg. ldgp). ++ ++ ++ ++RELEVANT OPTIMIZATION ISSUES ++ ++Here we have a really parallel pipeline, capable of issuing up to 4 integer ++instructions per cycle. In actual practice, it is never possible to sustain ++more than 3.5 integer insns/cycle due to rename register shortage. One integer ++multiply instruction can issue each cycle. To get optimal speed, we need to ++pretend we are vectorizing the code, i.e., minimize the depth of recurrences. ++ ++There are two dependencies to watch out for. 1) Address arithmetic ++dependencies, and 2) carry propagation dependencies. ++ ++We can avoid serializing due to address arithmetic by unrolling loops, so that ++addresses don't depend heavily on an index variable. Avoiding serializing ++because of carry propagation is trickier; the ultimate performance of the code ++will be determined of the number of latency cycles it takes from accepting ++carry-in to a vector point until we can generate carry-out. ++ ++Most integer instructions can execute in either the L0, U0, L1, or U1 ++pipelines. Shifts only execute in U0 and U1, and multiply only in U1. ++ ++CMOV instructions split into two internal instructions, CMOV1 and CMOV2. CMOV ++split the mapping process (see pg 2-26 in cmpwrgd.pdf), suggesting the CMOV ++should always be placed as the last instruction of an aligned 4 instruction ++block, or perhaps simply avoided. ++ ++Perhaps the most important issue is the latency between the L0/U0 and L1/U1 ++clusters; a result obtained on either cluster has an extra cycle of latency for ++consumers in the opposite cluster. Because of the dynamic nature of the ++implementation, it is hard to predict where an instruction will execute. ++ +diff --git a/mpn/sw_64/default.m4 b/mpn/sw_64/default.m4 +new file mode 100644 +index 0000000..38c8bcb +--- /dev/null ++++ b/mpn/sw_64/default.m4 +@@ -0,0 +1,127 @@ ++divert(-1) ++ ++dnl m4 macros for sw_64 assembler (everywhere except unicos). ++ ++ ++dnl Copyright 2000, 2002-2004, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
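++
++dnl As a rough sketch of what these macros produce (ignoring GSYM_PREFIX
++dnl and ALIGN details) with the defreg/deflit register mapping defined
++dnl below: a hypothetical
++dnl
++dnl     PROLOGUE(mpn_foo)
++dnl             bis r31, r31, r0
++dnl     EPILOGUE()
++dnl
++dnl comes out of m4 approximately as
++dnl
++dnl             .text
++dnl             .globl mpn_foo
++dnl             .ent mpn_foo
++dnl     mpn_foo:
++dnl             .frame $30,0,$26,0
++dnl             .prologue 0
++dnl             bis $31, $31, $0
++dnl             .end mpn_foo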
++ ++ ++dnl Usage: ASM_START() ++define(`ASM_START', ++m4_assert_numargs(0) ++` .set noreorder ++ .set noat') ++ ++dnl Usage: X(value) ++define(`X', ++m4_assert_numargs(1) ++`0x$1') ++ ++dnl Usage: FLOAT64(label,value) ++define(`FLOAT64', ++m4_assert_numargs(2) ++` .align 3 ++$1: .t_floating $2') ++ ++ ++dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,gp|noalign]) ++dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) ++ ++define(`PROLOGUE_cpu', ++m4_assert_numargs_range(1,2) ++`ifelse(`$2',gp,, ++`ifelse(`$2',noalign,, ++`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter ++')')')')dnl ++ .text ++ifelse(`$2',noalign,,` ALIGN(16)') ++ .globl $1 ++ .ent $1 ++$1: ++ .frame r30,0,r26,0 ++ifelse(`$2',gp,` ldgp r29, 0(r27) ++`$'$1..ng:') ++ .prologue ifelse(`$2',gp,1,0)') ++ ++define(`EPILOGUE_cpu', ++m4_assert_numargs(1) ++` .end $1') ++ ++ ++dnl Usage: LDGP(dst,src) ++dnl ++dnl Emit an "ldgp dst,src", but only if the system uses a GOT. ++ ++define(LDGP, ++m4_assert_numargs(2) ++`ldgp `$1', `$2'') ++ ++ ++dnl Usage: EXTERN(variable_name) ++define(`EXTERN', ++m4_assert_numargs(1) ++) ++ ++dnl Usage: r0 ... r31 ++dnl f0 ... f31 ++dnl ++dnl Map register names r0 to $0, and f0 to $f0, etc. ++dnl This is needed on all systems but Unicos ++dnl ++dnl defreg() is used to protect the $ in $0 (otherwise it would represent a ++dnl macro argument). Double quoting is used to protect the f0 in $f0 ++dnl (otherwise it would be an infinite recursion). ++ ++forloop(i,0,31,`defreg(`r'i,$i)') ++forloop(i,0,31,`deflit(`f'i,``$f''i)') ++ ++ ++dnl Usage: DATASTART(name,align) or DATASTART(name) ++dnl DATAEND() ++ ++define(`DATASTART', ++m4_assert_numargs_range(1,2) ++` RODATA ++ ALIGN(ifelse($#,1,2,$2)) ++$1:') ++define(`DATAEND', ++m4_assert_numargs(0) ++) ++ ++dnl Load a symbolic address into a register ++define(`LEA', ++m4_assert_numargs(2) ++`ldi $1, $2') ++ ++dnl Usage: ASM_END() ++define(`ASM_END', ++m4_assert_numargs(0) ++) ++ ++divert +diff --git a/mpn/sw_64/gmp-mparam.h b/mpn/sw_64/gmp-mparam.h +new file mode 100644 +index 0000000..bf51ad2 +--- /dev/null ++++ b/mpn/sw_64/gmp-mparam.h +@@ -0,0 +1,86 @@ ++/* Sw_64 gmp-mparam.h -- Compiler/machine parameter header file. ++ ++Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2009 Free Software ++Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. 
*/ ++ ++#define GMP_LIMB_BITS 64 ++#define GMP_LIMB_BYTES 8 ++ ++ ++/* 175MHz 21064 */ ++ ++/* Generated by tuneup.c, 2009-01-15, gcc 3.2 */ ++ ++#define MUL_TOOM22_THRESHOLD 12 ++#define MUL_TOOM33_THRESHOLD 69 ++#define MUL_TOOM44_THRESHOLD 88 ++ ++#define SQR_BASECASE_THRESHOLD 4 ++#define SQR_TOOM2_THRESHOLD 20 ++#define SQR_TOOM3_THRESHOLD 62 ++#define SQR_TOOM4_THRESHOLD 155 ++ ++#define MULLO_BASECASE_THRESHOLD 0 /* always */ ++#define MULLO_DC_THRESHOLD 40 ++#define MULLO_MUL_N_THRESHOLD 202 ++ ++#define DIV_SB_PREINV_THRESHOLD 0 /* preinv always */ ++#define DIV_DC_THRESHOLD 38 ++#define POWM_THRESHOLD 60 ++ ++#define MATRIX22_STRASSEN_THRESHOLD 17 ++#define HGCD_THRESHOLD 80 ++#define GCD_DC_THRESHOLD 237 ++#define GCDEXT_DC_THRESHOLD 198 ++#define JACOBI_BASE_METHOD 2 ++ ++#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ ++#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ ++#define MOD_1_NORM_THRESHOLD 0 /* always */ ++#define MOD_1_UNNORM_THRESHOLD 0 /* always */ ++#define MOD_1_1_THRESHOLD 2 ++#define MOD_1_2_THRESHOLD 9 ++#define MOD_1_4_THRESHOLD 20 ++#define USE_PREINV_DIVREM_1 1 /* preinv always */ ++#define USE_PREINV_MOD_1 1 /* preinv always */ ++#define DIVEXACT_1_THRESHOLD 0 /* always */ ++#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ ++ ++#define GET_STR_DC_THRESHOLD 20 ++#define GET_STR_PRECOMPUTE_THRESHOLD 37 ++#define SET_STR_DC_THRESHOLD 746 ++#define SET_STR_PRECOMPUTE_THRESHOLD 1332 ++ ++#define MUL_FFT_TABLE { 240, 480, 1344, 2304, 5120, 20480, 49152, 0 } ++#define MUL_FFT_MODF_THRESHOLD 232 ++#define MUL_FFT_THRESHOLD 1664 ++ ++#define SQR_FFT_TABLE { 240, 480, 1216, 2304, 5120, 12288, 49152, 0 } ++#define SQR_FFT_MODF_THRESHOLD 232 ++#define SQR_FFT_THRESHOLD 1408 +diff --git a/mpn/sw_64/sw6/add_n.asm b/mpn/sw_64/sw6/add_n.asm +new file mode 100644 +index 0000000..4f0062c +--- /dev/null ++++ b/mpn/sw_64/sw6/add_n.asm +@@ -0,0 +1,281 @@ ++dnl Sw_64 sw6 mpn_add_n -- Add two limb vectors of the same length > 0 and ++dnl store sum in a third limb vector. ++ ++dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 2.125 ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C vp r18 ++C n r19 ++C cy r20 (for mpn_add_nc) ++ ++C TODO ++C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1) ++C Use multi-pronged feed-in. ++C Perform additional micro-tuning ++ ++C This code was written in cooperation with sw6 pipeline expert Steve Root. 
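++
++C For reference, the per-limb operation being scheduled here is the one
++C spelled out by the simple $Loop0 tail below; in C-like notation
++C (variable names ours):
++C
++C     s1 = u + v;        c1 = (s1 < v);    /* carry from main add */
++C     s2 = s1 + cy;      c2 = (s2 < s1);   /* carry from carry add */
++C     cy = c1 | c2;      /* at most one of c1/c2 is set */
++C
++C The unrolled loop drops the second compare: a one-bit carry-in can wrap
++C the sum only to exactly zero, so a beq on the sum catches that case
++C (and the rare genuinely zero sum) off the fast path, at the $fix labels.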
++ ++C Pair loads and stores where possible ++C Store pairs oct-aligned where possible (didn't need it here) ++C Stores are delayed every third cycle ++C Loads and stores are delayed by fills ++C U stays still, put code there where possible (note alternation of U1 and U0) ++C L moves because of loads and stores ++C Note dampers in L to limit damage ++ ++C This odd-looking optimization expects that were having random bits in our ++C data, so that a pure zero result is unlikely. so we penalize the unlikely ++C case to help the common case. ++ ++define(`u0', `r0') define(`u1', `r3') ++define(`v0', `r1') define(`v1', `r4') ++ ++define(`cy0', `r20') define(`cy1', `r21') ++ ++MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc) ++ ++ASM_START() ++PROLOGUE(mpn_add_nc) ++ br r31, $entry ++EPILOGUE() ++PROLOGUE(mpn_add_n) ++ bis r31, r31, cy0 C clear carry in ++$entry: cmpult r19, 5, r22 C L1 move counter ++ ldl u1, 0(r17) C L0 get next ones ++ ldl v1, 0(r18) C L1 ++ bne r22, $Lsmall ++ ++ ldl u0, 8(r17) C L0 get next ones ++ ldl v0, 8(r18) C L1 ++ addl u1, v1, r5 C U0 add two data ++ ++ cmpult r5, v1, r23 C U0 did it carry ++ ldl u1, 16(r17) C L0 get next ones ++ ldl v1, 16(r18) C L1 ++ ++ addl u0, v0, r8 C U1 add two data ++ addl r5, cy0, r5 C U0 carry in ++ ++ cmpult r8, v0, r22 C U1 did it carry ++ beq r5, $fix5f C U0 fix exact zero ++$ret5f: ldl u0, 24(r17) C L0 get next ones ++ ldl v0, 24(r18) C L1 ++ ++ addl r8, r23, r8 C U1 carry from last ++ addl u1, v1, r7 C U0 add two data ++ ++ beq r8, $fix6f C U1 fix exact zero ++$ret6f: cmpult r7, v1, r23 C U0 did it carry ++ ldl u1, 32(r17) C L0 get next ones ++ ldl v1, 32(r18) C L1 ++ ++ ldi r17, 40(r17) C L0 move pointer ++ ldi r18, 40(r18) C L1 move pointer ++ ++ ldi r16, -8(r16) ++ ldi r19, -13(r19) C L1 move counter ++ blt r19, $Lend C U1 loop control ++ ++ ++C Main loop. 8-way unrolled. ++ ALIGN(16) ++$Loop: addl u0, v0, r2 C U1 add two data ++ addl r7, r22, r7 C U0 add in carry ++ stl r5, 8(r16) C L0 put an answer ++ stl r8, 16(r16) C L1 pair ++ ++ cmpult r2, v0, cy1 C U1 did it carry ++ beq r7, $fix7 C U0 fix exact 0 ++$ret7: ldl u0, 0(r17) C L0 get next ones ++ ldl v0, 0(r18) C L1 ++ ++ bis r31, r31, r31 C L damp out ++ addl r2, r23, r2 C U1 carry from last ++ bis r31, r31, r31 C L moves in L ! ++ addl u1, v1, r5 C U0 add two data ++ ++ beq r2, $fix0 C U1 fix exact zero ++$ret0: cmpult r5, v1, cy0 C U0 did it carry ++ ldl u1, 8(r17) C L0 get next ones ++ ldl v1, 8(r18) C L1 ++ ++ addl u0, v0, r8 C U1 add two data ++ addl r5, cy1, r5 C U0 carry from last ++ stl r7, 24(r16) C L0 store pair ++ stl r2, 32(r16) C L1 ++ ++ cmpult r8, v0, r22 C U1 did it carry ++ beq r5, $fix1 C U0 fix exact zero ++$ret1: ldl u0, 16(r17) C L0 get next ones ++ ldl v0, 16(r18) C L1 ++ ++ ldi r16, 64(r16) C L0 move pointer ++ addl r8, cy0, r8 C U1 carry from last ++ ldi r19, -8(r19) C L1 move counter ++ addl u1, v1, r7 C U0 add two data ++ ++ beq r8, $fix2 C U1 fix exact zero ++$ret2: cmpult r7, v1, r23 C U0 did it carry ++ ldl u1, 24(r17) C L0 get next ones ++ ldl v1, 24(r18) C L1 ++ ++ addl u0, v0, r2 C U1 add two data ++ addl r7, r22, r7 C U0 add in carry ++ stl r5, -24(r16) C L0 put an answer ++ stl r8, -16(r16) C L1 pair ++ ++ cmpult r2, v0, cy1 C U1 did it carry ++ beq r7, $fix3 C U0 fix exact 0 ++$ret3: ldl u0, 32(r17) C L0 get next ones ++ ldl v0, 32(r18) C L1 ++ ++ bis r31, r31, r31 C L damp out ++ addl r2, r23, r2 C U1 carry from last ++ bis r31, r31, r31 C L moves in L ! 
++ addl u1, v1, r5 C U0 add two data ++ ++ beq r2, $fix4 C U1 fix exact zero ++$ret4: cmpult r5, v1, cy0 C U0 did it carry ++ ldl u1, 40(r17) C L0 get next ones ++ ldl v1, 40(r18) C L1 ++ ++ addl u0, v0, r8 C U1 add two data ++ addl r5, cy1, r5 C U0 carry from last ++ stl r7, -8(r16) C L0 store pair ++ stl r2, 0(r16) C L1 ++ ++ cmpult r8, v0, r22 C U1 did it carry ++ beq r5, $fix5 C U0 fix exact zero ++$ret5: ldl u0, 48(r17) C L0 get next ones ++ ldl v0, 48(r18) C L1 ++ ++ ldl r31, 256(r17) C L0 prefetch ++ addl r8, cy0, r8 C U1 carry from last ++ ldl r31, 256(r18) C L1 prefetch ++ addl u1, v1, r7 C U0 add two data ++ ++ beq r8, $fix6 C U1 fix exact zero ++$ret6: cmpult r7, v1, r23 C U0 did it carry ++ ldl u1, 56(r17) C L0 get next ones ++ ldl v1, 56(r18) C L1 ++ ++ ldi r17, 64(r17) C L0 move pointer ++ bis r31, r31, r31 C U ++ ldi r18, 64(r18) C L1 move pointer ++ bge r19, $Loop C U1 loop control ++C ==== main loop end ++ ++$Lend: addl u0, v0, r2 C U1 add two data ++ addl r7, r22, r7 C U0 add in carry ++ stl r5, 8(r16) C L0 put an answer ++ stl r8, 16(r16) C L1 pair ++ cmpult r2, v0, cy1 C U1 did it carry ++ beq r7, $fix7c C U0 fix exact 0 ++$ret7c: addl r2, r23, r2 C U1 carry from last ++ addl u1, v1, r5 C U0 add two data ++ beq r2, $fix0c C U1 fix exact zero ++$ret0c: cmpult r5, v1, cy0 C U0 did it carry ++ addl r5, cy1, r5 C U0 carry from last ++ stl r7, 24(r16) C L0 store pair ++ stl r2, 32(r16) C L1 ++ beq r5, $fix1c C U0 fix exact zero ++$ret1c: stl r5, 40(r16) C L0 put an answer ++ ldi r16, 48(r16) C L0 move pointer ++ ++ ldi r19, 8(r19) ++ beq r19, $Lret ++ ++ ldl u1, 0(r17) ++ ldl v1, 0(r18) ++$Lsmall: ++ ldi r19, -1(r19) ++ beq r19, $Lend0 ++ ++ ALIGN(8) ++$Loop0: addl u1, v1, r2 C main add ++ cmpult r2, v1, r8 C compute cy from last add ++ ldl u1, 8(r17) ++ ldl v1, 8(r18) ++ addl r2, cy0, r5 C carry add ++ ldi r17, 8(r17) ++ ldi r18, 8(r18) ++ stl r5, 0(r16) ++ cmpult r5, r2, cy0 C compute cy from last add ++ ldi r19, -1(r19) C decr loop cnt ++ bis r8, cy0, cy0 C combine cy from the two adds ++ ldi r16, 8(r16) ++ bne r19, $Loop0 ++$Lend0: addl u1, v1, r2 C main add ++ addl r2, cy0, r5 C carry add ++ cmpult r2, v1, r8 C compute cy from last add ++ cmpult r5, r2, cy0 C compute cy from last add ++ stl r5, 0(r16) ++ bis r8, cy0, r0 C combine cy from the two adds ++ ret r31,(r26),1 ++ ++ ALIGN(8) ++$Lret: ldi r0, 0(cy0) C copy carry into return register ++ ret r31,(r26),1 ++ ++$fix5f: bis r23, cy0, r23 C bring forward carry ++ br r31, $ret5f ++$fix6f: bis r22, r23, r22 C bring forward carry ++ br r31, $ret6f ++$fix0: bis cy1, r23, cy1 C bring forward carry ++ br r31, $ret0 ++$fix1: bis cy0, cy1, cy0 C bring forward carry ++ br r31, $ret1 ++$fix2: bis r22, cy0, r22 C bring forward carry ++ br r31, $ret2 ++$fix3: bis r23, r22, r23 C bring forward carry ++ br r31, $ret3 ++$fix4: bis cy1, r23, cy1 C bring forward carry ++ br r31, $ret4 ++$fix5: bis cy1, cy0, cy0 C bring forward carry ++ br r31, $ret5 ++$fix6: bis r22, cy0, r22 C bring forward carry ++ br r31, $ret6 ++$fix7: bis r23, r22, r23 C bring forward carry ++ br r31, $ret7 ++$fix0c: bis cy1, r23, cy1 C bring forward carry ++ br r31, $ret0c ++$fix1c: bis cy0, cy1, cy0 C bring forward carry ++ br r31, $ret1c ++$fix7c: bis r23, r22, r23 C bring forward carry ++ br r31, $ret7c ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/aorslsh1_n.asm b/mpn/sw_64/sw6/aorslsh1_n.asm +new file mode 100644 +index 0000000..c353301 +--- /dev/null ++++ b/mpn/sw_64/sw6/aorslsh1_n.asm +@@ -0,0 +1,168 @@ ++dnl Sw_64 mpn_addlsh1_n/mpn_sublsh1_n -- 
rp[] = up[] +- (vp[] << 1). ++ ++dnl Copyright 2003, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++C TODO ++ ++define(`rp',`r16') ++define(`up',`r17') ++define(`vp',`r18') ++define(`n', `r19') ++ ++define(`u0', `r8') ++define(`u1', `r1') ++define(`v0', `r4') ++define(`v1', `r5') ++ ++define(`cy0', `r0') ++define(`cy1', `r20') ++define(`cy', `r22') ++define(`rr', `r24') ++define(`ps', `r25') ++define(`sl', `r28') ++ ++ifdef(`OPERATION_addlsh1_n',` ++ define(ADDSUB, addl) ++ define(CARRY, `cmpult $1,$2,$3') ++ define(func, mpn_addlsh1_n) ++') ++ifdef(`OPERATION_sublsh1_n',` ++ define(ADDSUB, subl) ++ define(CARRY, `cmpult $2,$1,$3') ++ define(func, mpn_sublsh1_n) ++') ++ ++MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) ++ ++ASM_START() ++PROLOGUE(func) ++ and n, 2, cy0 ++ blbs n, L(bx1) ++L(bx0): ldl v1, 0(vp) ++ ldl u1, 0(up) ++ ldi r2, 0(r31) ++ bne cy0, L(b10) ++ ++L(b00): ldi vp, 48(vp) ++ ldi up, -16(up) ++ ldi rp, -8(rp) ++ ldi cy0, 0(r31) ++ br r31, L(lo0) ++ ++L(b10): ldi vp, 32(vp) ++ ldi rp, 8(rp) ++ ldi cy0, 0(r31) ++ br r31, L(lo2) ++ ++L(bx1): ldl v0, 0(vp) ++ ldl u0, 0(up) ++ ldi r3, 0(r31) ++ beq cy0, L(b01) ++ ++L(b11): ldi vp, 40(vp) ++ ldi up, -24(up) ++ ldi rp, 16(rp) ++ ldi cy1, 0(r31) ++ br r31, L(lo3) ++ ++L(b01): ldi n, -4(n) ++ ldi cy1, 0(r31) ++ ble n, L(end) ++ ldi vp, 24(vp) ++ ldi up, -8(up) ++ ++ ALIGN(16) ++L(top): addl v0, v0, r6 ++ ldl v1, -16(vp) ++ addl r6, r3, sl C combined vlimb ++ ldl u1, 16(up) ++ ADDSUB u0, sl, ps C ulimb + (vlimb << 1) ++ cmplt v0, r31, r2 C high v bits ++ ADDSUB ps, cy1, rr C consume carry from previous operation ++ CARRY( ps, u0, cy0) C carry out #2 ++ stl rr, 0(rp) ++ CARRY( rr, ps, cy) C carry out #3 ++ ldi vp, 32(vp) C bookkeeping ++ addl cy, cy0, cy0 C final carry out ++L(lo0): addl v1, v1, r7 ++ ldl v0, -40(vp) ++ addl r7, r2, sl ++ ldl u0, 24(up) ++ ADDSUB u1, sl, ps ++ cmplt v1, r31, r3 ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy1) ++ stl rr, 8(rp) ++ CARRY( rr, ps, cy) ++ ldi rp, 32(rp) C bookkeeping ++ addl cy, cy1, cy1 ++L(lo3): addl v0, v0, r6 ++ ldl v1, -32(vp) ++ addl r6, r3, sl ++ ldl u1, 32(up) ++ ADDSUB u0, sl, ps ++ cmplt v0, r31, r2 ++ ADDSUB ps, cy1, rr ++ CARRY( ps, u0, cy0) ++ stl rr, -16(rp) ++ CARRY( rr, ps, cy) ++ ldi up, 32(up) C bookkeeping ++ addl cy, cy0, cy0 ++L(lo2): addl v1, v1, r7 ++ ldl v0, -24(vp) ++ addl r7, r2, sl ++ ldl u0, 8(up) ++ ADDSUB u1, sl, ps ++ cmplt v1, r31, r3 ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy1) 
++ stl rr, -8(rp) ++ CARRY( rr, ps, cy) ++ ldi n, -4(n) C bookkeeping ++ addl cy, cy1, cy1 ++ bgt n, L(top) ++ ++L(end): addl v0, v0, r6 ++ addl r6, r3, sl ++ ADDSUB u0, sl, ps ++ cmplt v0, r31, r2 ++ ADDSUB ps, cy1, rr ++ CARRY( ps, u0, cy0) ++ stl rr, 0(rp) ++ CARRY( rr, ps, cy) ++ addl cy, cy0, cy0 ++ addl cy0, r2, r0 ++ ++ ret r31,(r26),1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/aorsmul_1.asm b/mpn/sw_64/sw6/aorsmul_1.asm +new file mode 100644 +index 0000000..78daf7b +--- /dev/null ++++ b/mpn/sw_64/sw6/aorsmul_1.asm +@@ -0,0 +1,396 @@ ++dnl Sw_64 sw6 mpn_addmul_1 and mpn_submul_1. ++ ++dnl Copyright 2000, 2003-2005, 2008 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 3.5 ++ ++C INPUT PARAMETERS ++define(`rp', `r16') ++define(`up', `r17') ++define(`n', `r18') ++define(`v0', `r19') ++ ++dnl This code was written in cooperation with sw6 pipeline expert Steve Root. ++ ++dnl The stores can issue a cycle late so we have paired no-op's to 'catch' ++dnl them, so that further disturbance to the schedule is damped. ++ ++dnl We couldn't pair the loads, because the entangled schedule of the carry's ++dnl has to happen on one side {0} of the machine. ++ ++dnl This is a great schedule for the d_cache, a poor schedule for the b_cache. ++dnl The lockup on U0 means that any stall can't be recovered from. Consider a ++dnl ldl in L1, say that load gets stalled because it collides with a fill from ++dnl the b_cache. On the next cycle, this load gets priority. If first looks ++dnl at L0, and goes there. The instruction we intended for L0 gets to look at ++dnl L1, which is NOT where we want it. It either stalls 1, because it can't ++dnl go in L0, or goes there, and causes a further instruction to stall. ++ ++dnl So for b_cache, we're likely going to want to put one or more cycles back ++dnl into the code! And, of course, put in lds prefetch for the rp[] operand. ++dnl At a place where we have an mt followed by a bookkeeping, put the ++dnl bookkeeping in upper, and the prefetch into lower. ++ ++dnl Note, the ldl's and stl's are at the end of the quadpacks. Note, we'd ++dnl like not to have an ldl or an stl to preceded a conditional branch in a ++dnl quadpack. The conditional branch moves the retire pointer one cycle ++dnl later. 
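++
++dnl For reference, a rough C sketch (variable names ours, umul_ppmm as in
++dnl longlong.h) of the per-limb step for addmul_1, with cy the running
++dnl carry limb:
++dnl
++dnl     umul_ppmm (p1, p0, up[i], v0);    /* p1:p0 = up[i] * v0 */
++dnl     t = rp[i] + p0;    c1 = (t < p0);
++dnl     r = t + cy;        c2 = (r < t);
++dnl     rp[i] = r;         cy = p1 + c1 + c2;
++dnl
++dnl For submul_1 the p0 and cy adds become subtracts and the carry
++dnl compares reverse, per the ADDSUB/CMPCY definitions below.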
++ ++ifdef(`OPERATION_addmul_1',` ++ define(`ADDSUB', `addl') ++ define(`CMPCY', `cmpult $2,$1') ++ define(`func', `mpn_addmul_1') ++') ++ifdef(`OPERATION_submul_1',` ++ define(`ADDSUB', `subl') ++ define(`CMPCY', `cmpult $1,$2') ++ define(`func', `mpn_submul_1') ++') ++ ++MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) ++ ++ASM_START() ++PROLOGUE(func) ++ ldl r3, 0(up) C ++ and r18, 7, r20 C ++ ldi r18, -9(r18) C ++ cmpeq r20, 1, r21 C ++ beq r21, $L1 C ++ ++$1mod8: ldl r5, 0(rp) C ++ mull v0, r3, r7 C ++ umulh v0, r3, r8 C ++ ADDSUB r5, r7, r23 C ++ CMPCY( r5, r23), r20 C ++ addl r8, r20, r0 C ++ stl r23, 0(rp) C ++ bge r18, $ent1 C ++ ret r31, (r26), 1 C ++ ++$L1: ldi r8, 0(r31) C zero carry reg ++ ldi r24, 0(r31) C zero carry reg ++ cmpeq r20, 2, r21 C ++ bne r21, $2mod8 C ++ cmpeq r20, 3, r21 C ++ bne r21, $3mod8 C ++ cmpeq r20, 4, r21 C ++ bne r21, $4mod8 C ++ cmpeq r20, 5, r21 C ++ bne r21, $5mod8 C ++ cmpeq r20, 6, r21 C ++ bne r21, $6mod8 C ++ cmpeq r20, 7, r21 C ++ beq r21, $0mod8 C ++ ++$7mod8: ldl r5, 0(rp) C ++ ldi up, 8(up) C ++ mull v0, r3, r7 C ++ umulh v0, r3, r24 C ++ ADDSUB r5, r7, r23 C ++ CMPCY( r5, r23), r20 C ++ addl r24, r20, r24 C ++ stl r23, 0(rp) C ++ ldi rp, 8(rp) C ++ ldl r3, 0(up) C ++$6mod8: ldl r1, 8(up) C ++ mull v0, r3, r25 C ++ umulh v0, r3, r3 C ++ mull v0, r1, r28 C ++ ldl r0, 16(up) C ++ ldl r4, 0(rp) C ++ umulh v0, r1, r8 C ++ ldl r1, 24(up) C ++ ldi up, 48(up) C L1 bookkeeping ++ mull v0, r0, r2 C ++ ldl r5, 8(rp) C ++ ldi rp, -32(rp) C L1 bookkeeping ++ umulh v0, r0, r6 C ++ ADDSUB r4, r25, r25 C lo + acc ++ mull v0, r1, r7 C ++ br r31, $ent6 C ++ ++$ent1: ldi up, 8(up) C ++ ldi rp, 8(rp) C ++ ldi r8, 0(r0) C ++ ldl r3, 0(up) C ++$0mod8: ldl r1, 8(up) C ++ mull v0, r3, r2 C ++ umulh v0, r3, r6 C ++ mull v0, r1, r7 C ++ ldl r0, 16(up) C ++ ldl r4, 0(rp) C ++ umulh v0, r1, r24 C ++ ldl r1, 24(up) C ++ mull v0, r0, r25 C ++ ldl r5, 8(rp) C ++ umulh v0, r0, r3 C ++ ADDSUB r4, r2, r2 C lo + acc ++ mull v0, r1, r28 C ++ ldi rp, -16(rp) C ++ br r31, $ent0 C ++ ++$3mod8: ldl r5, 0(rp) C ++ ldi up, 8(up) C ++ mull v0, r3, r7 C ++ umulh v0, r3, r8 C ++ ADDSUB r5, r7, r23 C ++ CMPCY( r5, r23), r20 C ++ addl r8, r20, r24 C ++ stl r23, 0(rp) C ++ ldi rp, 8(rp) C ++ ldl r3, 0(up) C ++$2mod8: ldl r1, 8(up) C ++ mull v0, r3, r25 C ++ umulh v0, r3, r3 C ++ mull v0, r1, r28 C ++ ble r18, $n23 C ++ ldl r0, 16(up) C ++ ldl r4, 0(rp) C ++ umulh v0, r1, r8 C ++ ldl r1, 24(up) C ++ ldi up, 16(up) C L1 bookkeeping ++ mull v0, r0, r2 C ++ ldl r5, 8(rp) C ++ ldi rp, 0(rp) C L1 bookkeeping ++ umulh v0, r0, r6 C ++ ADDSUB r4, r25, r25 C lo + acc ++ mull v0, r1, r7 C ++ br r31, $ent2 C ++ ++$5mod8: ldl r5, 0(rp) C ++ ldi up, 8(up) C ++ mull v0, r3, r7 C ++ umulh v0, r3, r24 C ++ ADDSUB r5, r7, r23 C ++ CMPCY( r5, r23), r20 C ++ addl r24, r20, r8 C ++ stl r23, 0(rp) C ++ ldi rp, 8(rp) C ++ ldl r3, 0(up) C ++$4mod8: ldl r1, 8(up) C ++ mull v0, r3, r2 C ++ umulh v0, r3, r6 C ++ mull v0, r1, r7 C ++ ldl r0, 16(up) C ++ ldl r4, 0(rp) C ++ umulh v0, r1, r24 C ++ ldl r1, 24(up) C ++ ldi up, 32(up) C L1 bookkeeping ++ mull v0, r0, r25 C ++ ldl r5, 8(rp) C ++ ldi rp, 16(rp) C L1 bookkeeping ++ umulh v0, r0, r3 C ++ ADDSUB r4, r2, r2 C lo + acc ++ mull v0, r1, r28 C ++ CMPCY( r4, r2), r20 C L0 lo add => carry ++ ADDSUB r2, r8, r22 C U0 hi add => answer ++ ble r18, $Lend C ++ ALIGN(16) ++$Loop: ++ bis r31, r31, r31 C U1 mt ++ CMPCY( r2, r22), r21 C L0 hi add => carry ++ addl r6, r20, r6 C U0 hi mul + carry ++ ldl r0, 0(up) C ++ ++ bis r31, r31, r31 C U1 mt ++ ADDSUB r5, r7, r7 C L0 lo + acc 
++ addl r6, r21, r6 C U0 hi mul + carry ++ ldl r4, 0(rp) C L1 ++ ++ umulh v0, r1, r8 C U1 ++ CMPCY( r5, r7), r20 C L0 lo add => carry ++ ADDSUB r7, r6, r23 C U0 hi add => answer ++ ldl r1, 8(up) C L1 ++ ++ mull v0, r0, r2 C U1 ++ CMPCY( r7, r23), r21 C L0 hi add => carry ++ addl r24, r20, r24 C U0 hi mul + carry ++ ldl r5, 8(rp) C L1 ++ ++ umulh v0, r0, r6 C U1 ++ ADDSUB r4, r25, r25 C U0 lo + acc ++ stl r22, -16(rp) C L0 ++ stl r23, -8(rp) C L1 ++ ++ bis r31, r31, r31 C L0 st slosh ++ mull v0, r1, r7 C U1 ++ bis r31, r31, r31 C L1 st slosh ++ addl r24, r21, r24 C U0 hi mul + carry ++$ent2: ++ CMPCY( r4, r25), r20 C L0 lo add => carry ++ bis r31, r31, r31 C U1 mt ++ ldi r18, -8(r18) C L1 bookkeeping ++ ADDSUB r25, r24, r22 C U0 hi add => answer ++ ++ bis r31, r31, r31 C U1 mt ++ CMPCY( r25, r22), r21 C L0 hi add => carry ++ addl r3, r20, r3 C U0 hi mul + carry ++ ldl r0, 16(up) C L1 ++ ++ bis r31, r31, r31 C U1 mt ++ ADDSUB r5, r28, r28 C L0 lo + acc ++ addl r3, r21, r3 C U0 hi mul + carry ++ ldl r4, 16(rp) C L1 ++ ++ umulh v0, r1, r24 C U1 ++ CMPCY( r5, r28), r20 C L0 lo add => carry ++ ADDSUB r28, r3, r23 C U0 hi add => answer ++ ldl r1, 24(up) C L1 ++ ++ mull v0, r0, r25 C U1 ++ CMPCY( r28, r23), r21 C L0 hi add => carry ++ addl r8, r20, r8 C U0 hi mul + carry ++ ldl r5, 24(rp) C L1 ++ ++ umulh v0, r0, r3 C U1 ++ ADDSUB r4, r2, r2 C U0 lo + acc ++ stl r22, 0(rp) C L0 ++ stl r23, 8(rp) C L1 ++ ++ bis r31, r31, r31 C L0 st slosh ++ mull v0, r1, r28 C U1 ++ bis r31, r31, r31 C L1 st slosh ++ addl r8, r21, r8 C U0 hi mul + carry ++$ent0: ++ CMPCY( r4, r2), r20 C L0 lo add => carry ++ bis r31, r31, r31 C U1 mt ++ ldi up, 64(up) C L1 bookkeeping ++ ADDSUB r2, r8, r22 C U0 hi add => answer ++ ++ bis r31, r31, r31 C U1 mt ++ CMPCY( r2, r22), r21 C L0 hi add => carry ++ addl r6, r20, r6 C U0 hi mul + carry ++ ldl r0, -32(up) C L1 ++ ++ bis r31, r31, r31 C U1 mt ++ ADDSUB r5, r7, r7 C L0 lo + acc ++ addl r6, r21, r6 C U0 hi mul + carry ++ ldl r4, 32(rp) C L1 ++ ++ umulh v0, r1, r8 C U1 ++ CMPCY( r5, r7), r20 C L0 lo add => carry ++ ADDSUB r7, r6, r23 C U0 hi add => answer ++ ldl r1, -24(up) C L1 ++ ++ mull v0, r0, r2 C U1 ++ CMPCY( r7, r23), r21 C L0 hi add => carry ++ addl r24, r20, r24 C U0 hi mul + carry ++ ldl r5, 40(rp) C L1 ++ ++ umulh v0, r0, r6 C U1 ++ ADDSUB r4, r25, r25 C U0 lo + acc ++ stl r22, 16(rp) C L0 ++ stl r23, 24(rp) C L1 ++ ++ bis r31, r31, r31 C L0 st slosh ++ mull v0, r1, r7 C U1 ++ bis r31, r31, r31 C L1 st slosh ++ addl r24, r21, r24 C U0 hi mul + carry ++$ent6: ++ CMPCY( r4, r25), r20 C L0 lo add => carry ++ bis r31, r31, r31 C U1 mt ++ ldi rp, 64(rp) C L1 bookkeeping ++ ADDSUB r25, r24, r22 C U0 hi add => answer ++ ++ bis r31, r31, r31 C U1 mt ++ CMPCY( r25, r22), r21 C L0 hi add => carry ++ addl r3, r20, r3 C U0 hi mul + carry ++ ldl r0, -16(up) C L1 ++ ++ bis r31, r31, r31 C U1 mt ++ ADDSUB r5, r28, r28 C L0 lo + acc ++ addl r3, r21, r3 C U0 hi mul + carry ++ ldl r4, -16(rp) C L1 ++ ++ umulh v0, r1, r24 C U1 ++ CMPCY( r5, r28), r20 C L0 lo add => carry ++ ADDSUB r28, r3, r23 C U0 hi add => answer ++ ldl r1, -8(up) C L1 ++ ++ mull v0, r0, r25 C U1 ++ CMPCY( r28, r23), r21 C L0 hi add => carry ++ addl r8, r20, r8 C U0 hi mul + carry ++ ldl r5, -8(rp) C L1 ++ ++ umulh v0, r0, r3 C U1 ++ ADDSUB r4, r2, r2 C U0 lo + acc ++ stl r22, -32(rp) C L0 ++ stl r23, -24(rp) C L1 ++ ++ bis r31, r31, r31 C L0 st slosh ++ mull v0, r1, r28 C U1 ++ bis r31, r31, r31 C L1 st slosh ++ addl r8, r21, r8 C U0 hi mul + carry ++ ++ CMPCY( r4, r2), r20 C L0 lo add => carry ++ ADDSUB r2, r8, r22 C 
U0 hi add => answer ++ ldl r31, 256(up) C prefetch up[] ++ bgt r18, $Loop C U1 bookkeeping ++ ++$Lend: CMPCY( r2, r22), r21 C ++ addl r6, r20, r6 C ++ ADDSUB r5, r7, r7 C ++ addl r6, r21, r6 C ++ ldl r4, 0(rp) C ++ umulh v0, r1, r8 C ++ CMPCY( r5, r7), r20 C ++ ADDSUB r7, r6, r23 C ++ CMPCY(r7, r23), r21 C ++ addl r24, r20, r24 C ++ ldl r5, 8(rp) C ++ ADDSUB r4, r25, r25 C ++ stl r22, -16(rp) C ++ stl r23, -8(rp) C ++ addl r24, r21, r24 C ++ br L(x) ++ ++ ALIGN(16) ++$n23: ldl r4, 0(rp) C ++ ldl r5, 8(rp) C ++ umulh v0, r1, r8 C ++ ADDSUB r4, r25, r25 C ++L(x): CMPCY( r4, r25), r20 C ++ ADDSUB r25, r24, r22 C ++ CMPCY( r25, r22), r21 C ++ addl r3, r20, r3 C ++ ADDSUB r5, r28, r28 C ++ addl r3, r21, r3 C ++ CMPCY( r5, r28), r20 C ++ ADDSUB r28, r3, r23 C ++ CMPCY( r28, r23), r21 C ++ addl r8, r20, r8 C ++ stl r22, 0(rp) C ++ stl r23, 8(rp) C ++ addl r8, r21, r0 C ++ ret r31, (r26), 1 C ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/gmp-mparam.h b/mpn/sw_64/sw6/gmp-mparam.h +new file mode 100644 +index 0000000..e51d6b0 +--- /dev/null ++++ b/mpn/sw_64/sw6/gmp-mparam.h +@@ -0,0 +1,209 @@ ++/* gmp-mparam.h -- Compiler/machine parameter header file. ++ ++Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2008-2010, 2014 Free ++Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. 
*/ ++ ++#define GMP_LIMB_BITS 64 ++#define GMP_LIMB_BYTES 8 ++ ++#define DIVEXACT_BY3_METHOD 0 /* override ../diveby3.asm */ ++ ++/* 500 MHz 21164 (agnesi.math.su.se) */ ++/* FFT tuning limit = 20000000 */ ++/* Generated by tuneup.c, 2014-03-14, gcc 3.3 */ ++ ++#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ ++#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ ++#define MOD_1_1P_METHOD 2 ++#define MOD_1_NORM_THRESHOLD 0 /* always */ ++#define MOD_1_UNNORM_THRESHOLD 0 /* always */ ++#define MOD_1N_TO_MOD_1_1_THRESHOLD 4 ++#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 ++#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 ++#define MOD_1_2_TO_MOD_1_4_THRESHOLD 21 ++#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7 ++#define USE_PREINV_DIVREM_1 1 /* preinv always */ ++#define DIV_QR_1N_PI1_METHOD 2 ++#define DIV_QR_1_NORM_THRESHOLD 5 ++#define DIV_QR_1_UNNORM_THRESHOLD 1 ++#define DIV_QR_2_PI2_THRESHOLD 8 ++#define DIVEXACT_1_THRESHOLD 0 /* always */ ++#define BMOD_1_TO_MOD_1_THRESHOLD 20 ++ ++#define MUL_TOOM22_THRESHOLD 32 ++#define MUL_TOOM33_THRESHOLD 117 ++#define MUL_TOOM44_THRESHOLD 124 ++#define MUL_TOOM6H_THRESHOLD 230 ++#define MUL_TOOM8H_THRESHOLD 357 ++ ++#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 ++#define MUL_TOOM32_TO_TOOM53_THRESHOLD 107 ++#define MUL_TOOM42_TO_TOOM53_THRESHOLD 88 ++#define MUL_TOOM42_TO_TOOM63_THRESHOLD 105 ++#define MUL_TOOM43_TO_TOOM54_THRESHOLD 136 ++ ++#define SQR_BASECASE_THRESHOLD 0 /* always */ ++#define SQR_TOOM2_THRESHOLD 59 ++#define SQR_TOOM3_THRESHOLD 123 ++#define SQR_TOOM4_THRESHOLD 163 ++#define SQR_TOOM6_THRESHOLD 333 ++#define SQR_TOOM8_THRESHOLD 0 /* always */ ++ ++#define MULMID_TOOM42_THRESHOLD 52 ++ ++#define MULMOD_BNM1_THRESHOLD 19 ++#define SQRMOD_BNM1_THRESHOLD 5 ++ ++#define MUL_FFT_MODF_THRESHOLD 468 /* k = 5 */ ++#define MUL_FFT_TABLE3 \ ++ { { 468, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ ++ { 11, 5}, { 23, 6}, { 19, 7}, { 10, 6}, \ ++ { 24, 7}, { 13, 6}, { 27, 7}, { 14, 6}, \ ++ { 29, 7}, { 17, 6}, { 35, 7}, { 29, 8}, \ ++ { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ ++ { 19, 7}, { 39, 8}, { 29, 9}, { 15, 8}, \ ++ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ ++ { 51, 9}, { 27, 8}, { 55, 9}, { 35, 8}, \ ++ { 71, 9}, { 39,10}, { 23, 9}, { 55,10}, \ ++ { 31, 9}, { 67,10}, { 39, 9}, { 79,10}, \ ++ { 47, 9}, { 95,10}, { 55,11}, { 31,10}, \ ++ { 79,11}, { 47,10}, { 103,12}, { 31,11}, \ ++ { 63,10}, { 135,11}, { 79,10}, { 167,11}, \ ++ { 95,10}, { 199,11}, { 111,12}, { 63,11}, \ ++ { 143,10}, { 287, 9}, { 575,11}, { 159,10}, \ ++ { 319,12}, { 95,11}, { 191,10}, { 383,11}, \ ++ { 207,13}, { 63,12}, { 127,11}, { 255,10}, \ ++ { 511,11}, { 271,10}, { 543,11}, { 287,10}, \ ++ { 575,12}, { 159,11}, { 319,10}, { 639,11}, \ ++ { 335,10}, { 671,11}, { 351,10}, { 703,12}, \ ++ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ ++ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ ++ { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ ++ { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ ++ { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ ++ { 415,11}, { 831,12}, { 447,14}, { 127,13}, \ ++ { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \ ++ { 319,12}, { 735,13}, { 383,12}, { 767,11}, \ ++ { 1535,12}, { 831,13}, { 447,12}, { 959,14}, \ ++ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ ++ { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \ ++ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ ++ { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ ++ { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \ ++ { 767,13}, { 1663,14}, { 895,13}, { 1855,15}, \ ++ { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ ++ { 131072,18}, { 
262144,19}, { 524288,20}, {1048576,21}, \ ++ {2097152,22}, {4194304,23}, {8388608,24} } ++#define MUL_FFT_TABLE3_SIZE 151 ++#define MUL_FFT_THRESHOLD 5760 ++ ++#define SQR_FFT_MODF_THRESHOLD 412 /* k = 5 */ ++#define SQR_FFT_TABLE3 \ ++ { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ ++ { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ ++ { 27, 7}, { 14, 6}, { 29, 7}, { 28, 8}, \ ++ { 15, 7}, { 31, 8}, { 17, 7}, { 36, 8}, \ ++ { 19, 7}, { 39, 8}, { 29, 9}, { 15, 8}, \ ++ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ ++ { 49, 9}, { 27,10}, { 15, 9}, { 39,10}, \ ++ { 23, 9}, { 51,11}, { 15,10}, { 31, 9}, \ ++ { 67,10}, { 39, 9}, { 79,10}, { 47, 9}, \ ++ { 95,10}, { 55,11}, { 31,10}, { 79,11}, \ ++ { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ ++ { 127, 9}, { 255,11}, { 79,10}, { 159, 9}, \ ++ { 319,10}, { 167,11}, { 95,10}, { 191, 9}, \ ++ { 383,11}, { 111,12}, { 63,11}, { 127,10}, \ ++ { 271,11}, { 143,10}, { 287, 9}, { 575,10}, \ ++ { 303,11}, { 159,10}, { 319,12}, { 95,11}, \ ++ { 191,10}, { 383,11}, { 207,13}, { 63,12}, \ ++ { 127,11}, { 255,10}, { 511,11}, { 271,10}, \ ++ { 543,11}, { 287,10}, { 575,11}, { 303,12}, \ ++ { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ ++ { 671,11}, { 351,10}, { 703,11}, { 367,12}, \ ++ { 191,11}, { 383,10}, { 767,11}, { 415,12}, \ ++ { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ ++ { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ ++ { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \ ++ { 671,12}, { 351,11}, { 703,13}, { 191,12}, \ ++ { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ ++ { 447,11}, { 895,12}, { 479,14}, { 127,13}, \ ++ { 255,12}, { 575,11}, { 1151,12}, { 607,13}, \ ++ { 319,12}, { 703,11}, { 1407,12}, { 735,13}, \ ++ { 383,12}, { 831,13}, { 447,12}, { 959,14}, \ ++ { 255,13}, { 511,12}, { 1087,13}, { 575,12}, \ ++ { 1151,13}, { 639,12}, { 1279,13}, { 703,12}, \ ++ { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ ++ { 831,12}, { 1663,13}, { 959,15}, { 255,14}, \ ++ { 511,13}, { 1215,14}, { 639,13}, { 1407,14}, \ ++ { 767,13}, { 1663,14}, { 895,13}, { 1791,15}, \ ++ { 511,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ ++ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ ++ {2097152,22}, {4194304,23}, {8388608,24} } ++#define SQR_FFT_TABLE3_SIZE 159 ++#define SQR_FFT_THRESHOLD 5056 ++ ++#define MULLO_BASECASE_THRESHOLD 0 /* always */ ++#define MULLO_DC_THRESHOLD 100 ++#define MULLO_MUL_N_THRESHOLD 11355 ++ ++#define DC_DIV_QR_THRESHOLD 124 ++#define DC_DIVAPPR_Q_THRESHOLD 438 ++#define DC_BDIV_QR_THRESHOLD 153 ++#define DC_BDIV_Q_THRESHOLD 318 ++ ++#define INV_MULMOD_BNM1_THRESHOLD 62 ++#define INV_NEWTON_THRESHOLD 384 ++#define INV_APPR_THRESHOLD 402 ++ ++#define BINV_NEWTON_THRESHOLD 381 ++#define REDC_1_TO_REDC_N_THRESHOLD 110 ++ ++#define MU_DIV_QR_THRESHOLD 1752 ++#define MU_DIVAPPR_Q_THRESHOLD 1895 ++#define MUPI_DIV_QR_THRESHOLD 174 ++#define MU_BDIV_QR_THRESHOLD 1387 ++#define MU_BDIV_Q_THRESHOLD 1787 ++ ++#define POWM_SEC_TABLE 1,13,66,82,579 ++ ++#define MATRIX22_STRASSEN_THRESHOLD 15 ++#define HGCD_THRESHOLD 318 ++#define HGCD_APPR_THRESHOLD 363 ++#define HGCD_REDUCE_THRESHOLD 2384 ++#define GCD_DC_THRESHOLD 2504 ++#define GCDEXT_DC_THRESHOLD 671 ++#define JACOBI_BASE_METHOD 3 ++ ++#define GET_STR_DC_THRESHOLD 14 ++#define GET_STR_PRECOMPUTE_THRESHOLD 25 ++#define SET_STR_DC_THRESHOLD 3754 ++#define SET_STR_PRECOMPUTE_THRESHOLD 8097 ++ ++#define FAC_DSC_THRESHOLD 951 ++#define FAC_ODD_THRESHOLD 24 +diff --git a/mpn/sw_64/sw6/mod_1_4.asm b/mpn/sw_64/sw6/mod_1_4.asm +new file mode 100644 +index 0000000..ff4d655 +--- /dev/null ++++ b/mpn/sw_64/sw6/mod_1_4.asm 
+@@ -0,0 +1,333 @@ ++dnl Sw_64 mpn_mod_1s_4p ++ ++dnl Contributed to the GNU project by Torbjorn Granlund. ++ ++dnl Copyright 2009, 2010 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C TODO: ++C * Optimise. 2.75 c/l should be possible. ++C * Write a proper mpn_mod_1s_4p_cps. The code below was compiler generated. ++C * Optimise feed-in code, starting the sw pipeline in switch code. ++C * Shorten software pipeline. The mul instructions are scheduled too far ++C from their users. Fixing this will allow us to use fewer registers. ++C * If we cannot reduce register usage, write perhaps small-n basecase. ++C * Does this work for PIC? ++ ++C cycles/limb ++ ++define(`ap', `r16') ++define(`n', `r17') ++define(`pl', `r24') ++define(`ph', `r25') ++define(`rl', `r6') ++define(`rh', `r7') ++define(`B1modb', `r1') ++define(`B2modb', `r2') ++define(`B3modb', `r3') ++define(`B4modb', `r4') ++define(`B5modb', `r5') ++ ++ASM_START() ++PROLOGUE(mpn_mod_1s_4p) ++ ldi r30, -64(r30) ++ stl r9, 8(r30) ++ ldl B1modb, 16(r19) ++ stl r10, 16(r30) ++ ldl B2modb, 24(r19) ++ stl r11, 24(r30) ++ ldl B3modb, 32(r19) ++ stl r12, 32(r30) ++ ldl B4modb, 40(r19) ++ stl r13, 40(r30) ++ ldl B5modb, 48(r19) ++ s8addl n, ap, ap C point ap at vector end ++ ++ and n, 3, r0 ++ ldi n, -4(n) ++ beq r0, L(b0) ++ ldi r6, -2(r0) ++ blt r6, L(b1) ++ beq r6, L(b2) ++ ++L(b3): ldl r21, -16(ap) ++ ldl r22, -8(ap) ++ ldl r20, -24(ap) ++ mull r21, B1modb, r8 ++ umulh r21, B1modb, r12 ++ mull r22, B2modb, r9 ++ umulh r22, B2modb, r13 ++ addl r8, r20, pl ++ cmpult pl, r8, r0 ++ addl r0, r12, ph ++ addl r9, pl, rl ++ cmpult rl, r9, r0 ++ addl r13, ph, ph ++ addl r0, ph, rh ++ ldi ap, -56(ap) ++ br L(com) ++ ++L(b0): ldl r21, -24(ap) ++ ldl r22, -16(ap) ++ ldl r23, -8(ap) ++ ldl r20, -32(ap) ++ mull r21, B1modb, r8 ++ umulh r21, B1modb, r12 ++ mull r22, B2modb, r9 ++ umulh r22, B2modb, r13 ++ mull r23, B3modb, r10 ++ umulh r23, B3modb, r27 ++ addl r8, r20, pl ++ cmpult pl, r8, r0 ++ addl r0, r12, ph ++ addl r9, pl, pl ++ cmpult pl, r9, r0 ++ addl r13, ph, ph ++ addl r0, ph, ph ++ addl r10, pl, rl ++ cmpult rl, r10, r0 ++ addl r27, ph, ph ++ addl r0, ph, rh ++ ldi ap, -64(ap) ++ br L(com) ++ ++L(b1): bis r31, r31, rh ++ ldl rl, -8(ap) ++ ldi ap, -40(ap) ++ br L(com) ++ ++L(b2): ldl rh, -8(ap) ++ ldl rl, -16(ap) ++ ldi ap, -48(ap) ++ ++L(com): ble n, L(ed3) ++ ldl r21, 8(ap) ++ ldl r22, 16(ap) ++ ldl r23, 24(ap) ++ ldl r20, 0(ap) ++ ldi n, -4(n) ++ ldi ap, -32(ap) ++ mull r21, B1modb, 
r8 ++ umulh r21, B1modb, r12 ++ mull r22, B2modb, r9 ++ umulh r22, B2modb, r13 ++ mull r23, B3modb, r10 ++ umulh r23, B3modb, r27 ++ mull rl, B4modb, r11 ++ umulh rl, B4modb, r28 ++ ble n, L(ed2) ++ ++ ALIGN(16) ++L(top): ldl r21, 8(ap) ++ mull rh, B5modb, rl ++ addl r8, r20, pl ++ ldl r22, 16(ap) ++ cmpult pl, r8, r0 ++ umulh rh, B5modb, rh ++ ldl r23, 24(ap) ++ addl r0, r12, ph ++ addl r9, pl, pl ++ mull r21, B1modb, r8 ++ cmpult pl, r9, r0 ++ addl r13, ph, ph ++ umulh r21, B1modb, r12 ++ ldi ap, -32(ap) ++ addl r0, ph, ph ++ addl r10, pl, pl ++ mull r22, B2modb, r9 ++ cmpult pl, r10, r0 ++ addl r27, ph, ph ++ addl r11, pl, pl ++ umulh r22, B2modb, r13 ++ addl r0, ph, ph ++ cmpult pl, r11, r0 ++ addl r28, ph, ph ++ mull r23, B3modb, r10 ++ ldl r20, 32(ap) ++ addl pl, rl, rl ++ umulh r23, B3modb, r27 ++ addl r0, ph, ph ++ cmpult rl, pl, r0 ++ mull rl, B4modb, r11 ++ addl ph, rh, rh ++ umulh rl, B4modb, r28 ++ addl r0, rh, rh ++ ldi n, -4(n) ++ bgt n, L(top) ++ ++L(ed2): mull rh, B5modb, rl ++ addl r8, r20, pl ++ umulh rh, B5modb, rh ++ cmpult pl, r8, r0 ++ addl r0, r12, ph ++ addl r9, pl, pl ++ cmpult pl, r9, r0 ++ addl r13, ph, ph ++ addl r0, ph, ph ++ addl r10, pl, pl ++ cmpult pl, r10, r0 ++ addl r27, ph, ph ++ addl r11, pl, pl ++ addl r0, ph, ph ++ cmpult pl, r11, r0 ++ addl r28, ph, ph ++ addl pl, rl, rl ++ addl r0, ph, ph ++ cmpult rl, pl, r0 ++ addl ph, rh, rh ++ addl r0, rh, rh ++ ++L(ed3): mull rh, B1modb, r8 ++ umulh rh, B1modb, rh ++ addl r8, rl, rl ++ cmpult rl, r8, r0 ++ addl r0, rh, rh ++ ++ ldl r24, 8(r19) C cnt ++ sll rh, r24, rh ++ subl r31, r24, r25 ++ srl rl, r25, r2 ++ sll rl, r24, rl ++ or r2, rh, rh ++ ++ ldl r23, 0(r19) C bi ++ mull rh, r23, r8 ++ umulh rh, r23, r9 ++ addl rh, 1, r7 ++ addl r8, rl, r8 C ql ++ cmpult r8, rl, r0 ++ addl r9, r7, r9 ++ addl r0, r9, r9 C qh ++ mull r9, r18, r21 C qh * b ++ subl rl, r21, rl ++ cmpult r8, rl, r0 C rl > ql ++ negl r0, r0 ++ and r0, r18, r0 ++ addl rl, r0, rl ++ cmpule r18, rl, r0 C rl >= b ++ negl r0, r0 ++ and r0, r18, r0 ++ subl rl, r0, rl ++ ++ srl rl, r24, r0 ++ ++ ldl r9, 8(r30) ++ ldl r10, 16(r30) ++ ldl r11, 24(r30) ++ ldl r12, 32(r30) ++ ldl r13, 40(r30) ++ ldi r30, 64(r30) ++ ret r31, (r26), 1 ++EPILOGUE() ++ ++PROLOGUE(mpn_mod_1s_4p_cps,gp) ++ ldi r30, -32(r30) ++ stl r26, 0(r30) ++ stl r9, 8(r30) ++ stl r10, 16(r30) ++ stl r11, 24(r30) ++ mov r16, r11 ++ LEA( r4, __clz_tab) ++ ldi r10, 65(r31) ++ cmpgeb r31, r17, r1 ++ srl r1, 1, r1 ++ xor r1, 127, r1 ++ addl r1, r4, r1 ++ ldl_u r2, 0(r1) ++ ext0b r2, r1, r2 ++ s8subl r2, 7, r2 ++ srl r17, r2, r3 ++ subl r10, r2, r10 ++ addl r3, r4, r3 ++ ldl_u r1, 0(r3) ++ ext0b r1, r3, r1 ++ subl r10, r1, r10 ++ sll r17, r10, r9 ++ mov r9, r16 ++ call r26, mpn_invert_limb ++ LDGP( r29, 0(r26)) ++ subl r31, r10, r2 ++ ldi r1, 1(r31) ++ sll r1, r10, r1 ++ subl r31, r9, r3 ++ srl r0, r2, r2 ++ ldl r26, 0(r30) ++ bis r2, r1, r2 ++ stl r0, 0(r11) ++ stl r10, 8(r11) ++ mull r2, r3, r2 ++ srl r2, r10, r3 ++ umulh r2, r0, r1 ++ stl r3, 16(r11) ++ mull r2, r0, r3 ++ ornot r31, r1, r1 ++ subl r1, r2, r1 ++ mull r1, r9, r1 ++ addl r1, r9, r2 ++ cmpule r1, r3, r3 ++ seleq r3, r2, r1 ++ srl r1, r10, r3 ++ umulh r1, r0, r2 ++ stl r3, 24(r11) ++ mull r1, r0, r3 ++ ornot r31, r2, r2 ++ subl r2, r1, r2 ++ mull r2, r9, r2 ++ addl r2, r9, r1 ++ cmpule r2, r3, r3 ++ seleq r3, r1, r2 ++ srl r2, r10, r1 ++ umulh r2, r0, r3 ++ stl r1, 32(r11) ++ mull r2, r0, r1 ++ ornot r31, r3, r3 ++ subl r3, r2, r3 ++ mull r3, r9, r3 ++ addl r3, r9, r2 ++ cmpule r3, r1, r1 ++ seleq r1, r2, r3 ++ srl r3, r10, r2 ++ 
	umulh	r3, r0, r1
++	stl	r2, 40(r11)
++	mull	r3, r0, r0
++	ornot	r31, r1, r1
++	subl	r1, r3, r1
++	mull	r1, r9, r1
++	addl	r1, r9, r9
++	cmpule	r1, r0, r0
++	seleq	r0, r9, r1
++	ldl	r9, 8(r30)
++	srl	r1, r10, r1
++	ldl	r10, 16(r30)
++	stl	r1, 48(r11)
++	ldl	r11, 24(r30)
++	ldi	r30, 32(r30)
++	ret	r31, (r26), 1
++EPILOGUE()
+diff --git a/mpn/sw_64/sw6/mul_1.asm b/mpn/sw_64/sw6/mul_1.asm
+new file mode 100644
+index 0000000..7d8dff3
+--- /dev/null
++++ b/mpn/sw_64/sw6/mul_1.asm
+@@ -0,0 +1,496 @@
++dnl Sw_64 sw6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
++dnl result in a second limb vector.
++
++dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl  * the GNU Lesser General Public License as published by the Free
++dnl    Software Foundation; either version 3 of the License, or (at your
++dnl    option) any later version.
++dnl
++dnl or
++dnl
++dnl  * the GNU General Public License as published by the Free Software
++dnl    Foundation; either version 2 of the License, or (at your option) any
++dnl    later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++C INPUT PARAMETERS
++C res_ptr	r16
++C s1_ptr	r17
++C size		r18
++C s2_limb	r19
++
++C This code runs at 2.25 cycles/limb on SW6.
++
++C This code was written in close cooperation with sw6 pipeline expert
++C Steve Root. Any errors are tege's fault, though.
++
++C Code structure:
++
++C       code for n < 8
++C       code for n > 8      code for (n mod 8)
++C                           code for (n div 8)      feed-in code
++C                                                   8-way unrolled loop
++C                           wind-down code
++
++C Some notes about unrolled loop:
++C
++C   r1-r8             multiplies and workup
++C   r21-r28           multiplies and workup
++C   r9-r12            loads
++C   r0                -1
++C   r20,r29,r13-r15   scramble
++C
++C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
++C put-the-carry-into-hi. The idea is that these branches are very rarely
++C taken, and since a non-taken branch consumes no resources, that is better
++C than an addl.
++C
++C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
++C add NEXT cycle #09 which feeds a store in NEXT cycle #02
++
++C The code could use some further work:
++C 1. Speed up really small multiplies. The default sw_64/mul_1.asm code is
++C    faster than this for size < 3.
++C 2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
++C    that is too costly.
++C 3. Consider using 4-way unrolling, even if that runs slower.
++C 4. Reduce register usage. In particular, try to avoid using r29.
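++C
++C As a reading aid, the br-fixup carry propagation described above can be
++C modelled in C roughly as below (a hedged sketch, not the shipped code;
++C s, x, cy and cy_next are sketch variables). Adding a 0/1 carry into a
++C partial sum can only overflow by wrapping to exactly 0, so the rare beq
++C paths merely OR the pending carry into the next stage:
++C
++C	s = x + cy;          /* cy is 0 or 1 */
++C	if (s == 0)          /* wrapped -- or x == cy == 0, where the */
++C	  cy_next |= cy;     /* "join carries" fixup is a no-op       */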
++ ++ASM_START() ++PROLOGUE(mpn_mul_1) ++ cmpult r18, 8, r1 ++ beq r1, $Large ++$Lsmall: ++ ldl r2,0(r17) C r2 = s1_limb ++ ldi r18,-1(r18) C size-- ++ mull r2,r19,r3 C r3 = prod_low ++ bic r31,r31,r4 C clear cy_limb ++ umulh r2,r19,r0 C r0 = prod_high ++ beq r18,$Le1a C jump if size was == 1 ++ ldl r2,8(r17) C r2 = s1_limb ++ ldi r18,-1(r18) C size-- ++ stl r3,0(r16) ++ beq r18,$Le2a C jump if size was == 2 ++ ALIGN(8) ++$Lopa: mull r2,r19,r3 C r3 = prod_low ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ ldi r18,-1(r18) C size-- ++ umulh r2,r19,r4 C r4 = cy_limb ++ ldl r2,16(r17) C r2 = s1_limb ++ ldi r17,8(r17) C s1_ptr++ ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ stl r3,8(r16) ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ ldi r16,8(r16) C res_ptr++ ++ bne r18,$Lopa ++ ++$Le2a: mull r2,r19,r3 C r3 = prod_low ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ umulh r2,r19,r4 C r4 = cy_limb ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ stl r3,8(r16) ++ addl r4,r0,r0 C cy_limb = prod_high + cy ++ ret r31,(r26),1 ++$Le1a: stl r3,0(r16) ++ ret r31,(r26),1 ++ ++$Large: ++ ldi r30, -224(r30) ++ stl r26, 0(r30) ++ stl r9, 8(r30) ++ stl r10, 16(r30) ++ stl r11, 24(r30) ++ stl r12, 32(r30) ++ stl r13, 40(r30) ++ stl r14, 48(r30) ++ stl r15, 56(r30) ++ stl r29, 64(r30) ++ ++ and r18, 7, r20 C count for the first loop, 0-7 ++ srl r18, 3, r18 C count for unrolled loop ++ bis r31, r31, r21 ++ beq r20, $L_8_or_more C skip first loop ++ ++$L_9_or_more: ++ ldl r2,0(r17) C r2 = s1_limb ++ ldi r17,8(r17) C s1_ptr++ ++ ldi r20,-1(r20) C size-- ++ mull r2,r19,r3 C r3 = prod_low ++ umulh r2,r19,r21 C r21 = prod_high ++ beq r20,$Le1b C jump if size was == 1 ++ bis r31, r31, r0 C FIXME: shouldn't need this ++ ldl r2,0(r17) C r2 = s1_limb ++ ldi r17,8(r17) C s1_ptr++ ++ ldi r20,-1(r20) C size-- ++ stl r3,0(r16) ++ ldi r16,8(r16) C res_ptr++ ++ beq r20,$Le2b C jump if size was == 2 ++ ALIGN(8) ++$Lopb: mull r2,r19,r3 C r3 = prod_low ++ addl r21,r0,r0 C cy_limb = cy_limb + 'cy' ++ ldi r20,-1(r20) C size-- ++ umulh r2,r19,r21 C r21 = prod_high ++ ldl r2,0(r17) C r2 = s1_limb ++ ldi r17,8(r17) C s1_ptr++ ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ stl r3,0(r16) ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ ldi r16,8(r16) C res_ptr++ ++ bne r20,$Lopb ++ ++$Le2b: mull r2,r19,r3 C r3 = prod_low ++ addl r21,r0,r0 C cy_limb = cy_limb + 'cy' ++ umulh r2,r19,r21 C r21 = prod_high ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ stl r3,0(r16) ++ ldi r16,8(r16) C res_ptr++ ++ addl r21,r0,r21 C cy_limb = prod_high + cy ++ br r31, $L_8_or_more ++$Le1b: stl r3,0(r16) ++ ldi r16,8(r16) C res_ptr++ ++ ++$L_8_or_more: ++ ldi r0, -1(r31) C put -1 in r0, for tricky loop control ++ ldi r17, -32(r17) C L1 bookkeeping ++ ldi r18, -1(r18) C decrement count ++ ++ ldl r9, 32(r17) C L1 ++ ldl r10, 40(r17) C L1 ++ mull r9, r19, r22 C U1 #07 ++ ldl r11, 48(r17) C L1 ++ umulh r9, r19, r23 C U1 #08 ++ ldl r12, 56(r17) C L1 ++ mull r10, r19, r24 C U1 #09 ++ ldl r9, 64(r17) C L1 ++ ++ ldi r17, 64(r17) C L1 bookkeeping ++ ++ umulh r10, r19, r25 C U1 #11 ++ mull r11, r19, r26 C U1 #12 ++ umulh r11, r19, r27 C U1 #13 ++ mull r12, r19, r28 C U1 #14 ++ ldl r10, 8(r17) C L1 ++ umulh r12, r19, r1 C U1 #15 ++ ldl r11, 16(r17) C L1 ++ mull r9, r19, r2 C U1 #16 ++ ldl r12, 24(r17) C L1 ++ umulh r9, r19, r3 C U1 #17 ++ addl r21, r22, r13 C L1 mov ++ mull r10, r19, r4 C U1 #18 ++ addl r23, r24, r22 C L0 sum 2 mul's 
++ cmpult r13, r21, r14 C L1 carry from sum ++ bgt r18, $L_16_or_more ++ ++ cmpult r22, r24, r24 C U0 carry from sum ++ umulh r10, r19, r5 C U1 #02 ++ addl r25, r26, r23 C U0 sum 2 mul's ++ mull r11, r19, r6 C U1 #03 ++ cmpult r23, r26, r25 C U0 carry from sum ++ umulh r11, r19, r7 C U1 #04 ++ addl r27, r28, r28 C U0 sum 2 mul's ++ mull r12, r19, r8 C U1 #05 ++ cmpult r28, r27, r15 C L0 carry from sum ++ ldi r16, 32(r16) C L1 bookkeeping ++ addl r13, r31, r13 C U0 start carry cascade ++ umulh r12, r19, r21 C U1 #06 ++ br r31, $ret0c ++ ++$L_16_or_more: ++C --------------------------------------------------------------- ++ subl r18,1,r18 ++ cmpult r22, r24, r24 C U0 carry from sum ++ ldl r9, 32(r17) C L1 ++ ++ umulh r10, r19, r5 C U1 #02 ++ addl r25, r26, r23 C U0 sum 2 mul's ++ mull r11, r19, r6 C U1 #03 ++ cmpult r23, r26, r25 C U0 carry from sum ++ umulh r11, r19, r7 C U1 #04 ++ addl r27, r28, r28 C U0 sum 2 mul's ++ mull r12, r19, r8 C U1 #05 ++ cmpult r28, r27, r15 C L0 carry from sum ++ ldi r16, 32(r16) C L1 bookkeeping ++ addl r13, r31, r13 C U0 start carry cascade ++ ++ umulh r12, r19, r21 C U1 #06 ++C beq r13, $fix0w C U0 ++$ret0w: addl r22, r14, r26 C L0 ++ ldl r10, 40(r17) C L1 ++ ++ mull r9, r19, r22 C U1 #07 ++ beq r26, $fix1w C U0 ++$ret1w: addl r23, r24, r27 C L0 ++ ldl r11, 48(r17) C L1 ++ ++ umulh r9, r19, r23 C U1 #08 ++ beq r27, $fix2w C U0 ++$ret2w: addl r28, r25, r28 C L0 ++ ldl r12, 56(r17) C L1 ++ ++ mull r10, r19, r24 C U1 #09 ++ beq r28, $fix3w C U0 ++$ret3w: addl r1, r2, r20 C L0 sum 2 mul's ++ ldl r9, 64(r17) C L1 ++ ++ addl r3, r4, r2 C L0 #10 2 mul's ++ ldi r17, 64(r17) C L1 bookkeeping ++ cmpult r20, r1, r29 C U0 carry from sum ++ ++ umulh r10, r19, r25 C U1 #11 ++ cmpult r2, r4, r4 C U0 carry from sum ++ stl r13, -32(r16) C L0 ++ stl r26, -24(r16) C L1 ++ ++ mull r11, r19, r26 C U1 #12 ++ addl r5, r6, r14 C U0 sum 2 mul's ++ stl r27, -16(r16) C L0 ++ stl r28, -8(r16) C L1 ++ ++ umulh r11, r19, r27 C U1 #13 ++ cmpult r14, r6, r3 C U0 carry from sum ++C could do cross-jumping here: ++C bra $L_middle_of_unrolled_loop ++ mull r12, r19, r28 C U1 #14 ++ addl r7, r3, r5 C L0 eat carry ++ addl r20, r15, r20 C U0 carry cascade ++ ldl r10, 8(r17) C L1 ++ ++ umulh r12, r19, r1 C U1 #15 ++ beq r20, $fix4 C U0 ++$ret4w: addl r2, r29, r6 C L0 ++ ldl r11, 16(r17) C L1 ++ ++ mull r9, r19, r2 C U1 #16 ++ beq r6, $fix5 C U0 ++$ret5w: addl r14, r4, r7 C L0 ++ ldl r12, 24(r17) C L1 ++ ++ umulh r9, r19, r3 C U1 #17 ++ beq r7, $fix6 C U0 ++$ret6w: addl r5, r8, r8 C L0 sum 2 ++ addl r21, r22, r13 C L1 sum 2 mul's ++ ++ mull r10, r19, r4 C U1 #18 ++ addl r23, r24, r22 C L0 sum 2 mul's ++ cmpult r13, r21, r14 C L1 carry from sum ++ ble r18, $Lend C U0 ++C --------------------------------------------------------------- ++ ALIGN(16) ++$Loop: ++ umulh r0, r18, r18 C U1 #01 decrement r18! 
++ cmpult r8, r5, r29 C L0 carry from last bunch ++ cmpult r22, r24, r24 C U0 carry from sum ++ ldl r9, 32(r17) C L1 ++ ++ umulh r10, r19, r5 C U1 #02 ++ addl r25, r26, r23 C U0 sum 2 mul's ++ stl r20, 0(r16) C L0 ++ stl r6, 8(r16) C L1 ++ ++ mull r11, r19, r6 C U1 #03 ++ cmpult r23, r26, r25 C U0 carry from sum ++ stl r7, 16(r16) C L0 ++ stl r8, 24(r16) C L1 ++ ++ umulh r11, r19, r7 C U1 #04 ++ bis r31, r31, r31 C L0 st slosh ++ bis r31, r31, r31 C L1 st slosh ++ addl r27, r28, r28 C U0 sum 2 mul's ++ ++ mull r12, r19, r8 C U1 #05 ++ cmpult r28, r27, r15 C L0 carry from sum ++ ldi r16, 64(r16) C L1 bookkeeping ++ addl r13, r29, r13 C U0 start carry cascade ++ ++ umulh r12, r19, r21 C U1 #06 ++ beq r13, $fix0 C U0 ++$ret0: addl r22, r14, r26 C L0 ++ ldl r10, 40(r17) C L1 ++ ++ mull r9, r19, r22 C U1 #07 ++ beq r26, $fix1 C U0 ++$ret1: addl r23, r24, r27 C L0 ++ ldl r11, 48(r17) C L1 ++ ++ umulh r9, r19, r23 C U1 #08 ++ beq r27, $fix2 C U0 ++$ret2: addl r28, r25, r28 C L0 ++ ldl r12, 56(r17) C L1 ++ ++ mull r10, r19, r24 C U1 #09 ++ beq r28, $fix3 C U0 ++$ret3: addl r1, r2, r20 C L0 sum 2 mul's ++ ldl r9, 64(r17) C L1 ++ ++ addl r3, r4, r2 C L0 #10 2 mul's ++ bis r31, r31, r31 C U1 mul hole ++ ldi r17, 64(r17) C L1 bookkeeping ++ cmpult r20, r1, r29 C U0 carry from sum ++ ++ umulh r10, r19, r25 C U1 #11 ++ cmpult r2, r4, r4 C U0 carry from sum ++ stl r13, -32(r16) C L0 ++ stl r26, -24(r16) C L1 ++ ++ mull r11, r19, r26 C U1 #12 ++ addl r5, r6, r14 C U0 sum 2 mul's ++ stl r27, -16(r16) C L0 ++ stl r28, -8(r16) C L1 ++ ++ umulh r11, r19, r27 C U1 #13 ++ bis r31, r31, r31 C L0 st slosh ++ bis r31, r31, r31 C L1 st slosh ++ cmpult r14, r6, r3 C U0 carry from sum ++$L_middle_of_unrolled_loop: ++ mull r12, r19, r28 C U1 #14 ++ addl r7, r3, r5 C L0 eat carry ++ addl r20, r15, r20 C U0 carry cascade ++ ldl r10, 8(r17) C L1 ++ ++ umulh r12, r19, r1 C U1 #15 ++ beq r20, $fix4 C U0 ++$ret4: addl r2, r29, r6 C L0 ++ ldl r11, 16(r17) C L1 ++ ++ mull r9, r19, r2 C U1 #16 ++ beq r6, $fix5 C U0 ++$ret5: addl r14, r4, r7 C L0 ++ ldl r12, 24(r17) C L1 ++ ++ umulh r9, r19, r3 C U1 #17 ++ beq r7, $fix6 C U0 ++$ret6: addl r5, r8, r8 C L0 sum 2 ++ addl r21, r22, r13 C L1 sum 2 mul's ++ ++ mull r10, r19, r4 C U1 #18 ++ addl r23, r24, r22 C L0 sum 2 mul's ++ cmpult r13, r21, r14 C L1 carry from sum ++ bgt r18, $Loop C U0 ++C --------------------------------------------------------------- ++$Lend: ++ cmpult r8, r5, r29 C L0 carry from last bunch ++ cmpult r22, r24, r24 C U0 carry from sum ++ ++ umulh r10, r19, r5 C U1 #02 ++ addl r25, r26, r23 C U0 sum 2 mul's ++ stl r20, 0(r16) C L0 ++ stl r6, 8(r16) C L1 ++ ++ mull r11, r19, r6 C U1 #03 ++ cmpult r23, r26, r25 C U0 carry from sum ++ stl r7, 16(r16) C L0 ++ stl r8, 24(r16) C L1 ++ ++ umulh r11, r19, r7 C U1 #04 ++ addl r27, r28, r28 C U0 sum 2 mul's ++ ++ mull r12, r19, r8 C U1 #05 ++ cmpult r28, r27, r15 C L0 carry from sum ++ ldi r16, 64(r16) C L1 bookkeeping ++ addl r13, r29, r13 C U0 start carry cascade ++ ++ umulh r12, r19, r21 C U1 #06 ++ beq r13, $fix0c C U0 ++$ret0c: addl r22, r14, r26 C L0 ++ beq r26, $fix1c C U0 ++$ret1c: addl r23, r24, r27 C L0 ++ beq r27, $fix2c C U0 ++$ret2c: addl r28, r25, r28 C L0 ++ beq r28, $fix3c C U0 ++$ret3c: addl r1, r2, r20 C L0 sum 2 mul's ++ addl r3, r4, r2 C L0 #10 2 mul's ++ ldi r17, 64(r17) C L1 bookkeeping ++ cmpult r20, r1, r29 C U0 carry from sum ++ cmpult r2, r4, r4 C U0 carry from sum ++ stl r13, -32(r16) C L0 ++ stl r26, -24(r16) C L1 ++ addl r5, r6, r14 C U0 sum 2 mul's ++ stl r27, -16(r16) C L0 ++ stl r28, 
	-8(r16)		C L1
++	cmpult	r14, r6, r3	C U0 carry from sum
++	addl	r7, r3, r5	C L0 eat carry
++	addl	r20, r15, r20	C U0 carry cascade
++	beq	r20, $fix4c	C U0
++$ret4c:	addl	r2, r29, r6	C L0
++	beq	r6, $fix5c	C U0
++$ret5c:	addl	r14, r4, r7	C L0
++	beq	r7, $fix6c	C U0
++$ret6c:	addl	r5, r8, r8	C L0 sum 2
++	cmpult	r8, r5, r29	C L0 carry from last bunch
++	stl	r20, 0(r16)	C L0
++	stl	r6, 8(r16)	C L1
++	stl	r7, 16(r16)	C L0
++	stl	r8, 24(r16)	C L1
++	addl	r29, r21, r0
++
++	ldl	r26, 0(r30)
++	ldl	r9, 8(r30)
++	ldl	r10, 16(r30)
++	ldl	r11, 24(r30)
++	ldl	r12, 32(r30)
++	ldl	r13, 40(r30)
++	ldl	r14, 48(r30)
++	ldl	r15, 56(r30)
++	ldl	r29, 64(r30)
++	ldi	r30, 224(r30)
++	ret	r31, (r26), 1
++
++C $fix0w:	bis	r14, r29, r14	C join carries
++C	br	r31, $ret0w
++$fix1w:	bis	r24, r14, r24	C join carries
++	br	r31, $ret1w
++$fix2w:	bis	r25, r24, r25	C join carries
++	br	r31, $ret2w
++$fix3w:	bis	r15, r25, r15	C join carries
++	br	r31, $ret3w
++$fix0:	bis	r14, r29, r14	C join carries
++	br	r31, $ret0
++$fix1:	bis	r24, r14, r24	C join carries
++	br	r31, $ret1
++$fix2:	bis	r25, r24, r25	C join carries
++	br	r31, $ret2
++$fix3:	bis	r15, r25, r15	C join carries
++	br	r31, $ret3
++$fix4:	bis	r29, r15, r29	C join carries
++	br	r31, $ret4
++$fix5:	bis	r4, r29, r4	C join carries
++	br	r31, $ret5
++$fix6:	addl	r5, r4, r5	C can't carry twice!
++	br	r31, $ret6
++$fix0c:	bis	r14, r29, r14	C join carries
++	br	r31, $ret0c
++$fix1c:	bis	r24, r14, r24	C join carries
++	br	r31, $ret1c
++$fix2c:	bis	r25, r24, r25	C join carries
++	br	r31, $ret2c
++$fix3c:	bis	r15, r25, r15	C join carries
++	br	r31, $ret3c
++$fix4c:	bis	r29, r15, r29	C join carries
++	br	r31, $ret4c
++$fix5c:	bis	r4, r29, r4	C join carries
++	br	r31, $ret5c
++$fix6c:	addl	r5, r4, r5	C can't carry twice!
++	br	r31, $ret6c
++
++EPILOGUE(mpn_mul_1)
++ASM_END()
+diff --git a/mpn/sw_64/sw6/nails/README b/mpn/sw_64/sw6/nails/README
+new file mode 100644
+index 0000000..b214ac5
+--- /dev/null
++++ b/mpn/sw_64/sw6/nails/README
+@@ -0,0 +1,65 @@
++Copyright 2002, 2005 Free Software Foundation, Inc.
++
++This file is part of the GNU MP Library.
++
++The GNU MP Library is free software; you can redistribute it and/or modify
++it under the terms of either:
++
++ * the GNU Lesser General Public License as published by the Free
++ Software Foundation; either version 3 of the License, or (at your
++ option) any later version.
++
++or
++
++ * the GNU General Public License as published by the Free Software
++ Foundation; either version 2 of the License, or (at your option) any
++ later version.
++
++or both in parallel, as here.
++
++The GNU MP Library is distributed in the hope that it will be useful, but
++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++for more details.
++
++You should have received copies of the GNU General Public License and the
++GNU Lesser General Public License along with the GNU MP Library. If not,
++see https://www.gnu.org/licenses/.
++
++
++
++
++
++This directory contains assembly code for nails-enabled 21264. The code is not
++very well optimized.
++
++For addmul_N, as N grows larger, we could make multiple loads together, then do
++about 3.3 i/c. 10 cycles after the last load, we can increase to 4 i/c. This
++would surely allow addmul_4 to run at 2 c/l, but the same should be possible
++also for addmul_3 and perhaps even addmul_2.
++
++            current          fair              best
++Routine    c/l  unroll    c/l  unroll       c/l   i/c
++mul_1      3.25           2.75              2.75  3.273
++addmul_1   4.0    4       3.5    4  14      3.25  3.385
++addmul_2   4.0    1       2.5    2  10      2.25  3.333
++addmul_3   3.0    1       2.33   2  14      2     3.333
++addmul_4   2.5    1       2.125  2  17      2     3.135
++
++addmul_5   2      1          10
++addmul_6   2      1          12
++addmul_7   2      1          14
++
++(The "best" column doesn't account for bookkeeping instructions and
++thereby assumes infinite unrolling.)
++
++Basecase usages:
++
++1 addmul_1
++2 addmul_2
++3 addmul_3
++4 addmul_4
++5 addmul_3 + addmul_2   2.3998
++6 addmul_4 + addmul_2
++7 addmul_4 + addmul_3
+diff --git a/mpn/sw_64/sw6/nails/addmul_1.asm b/mpn/sw_64/sw6/nails/addmul_1.asm
+new file mode 100644
+index 0000000..1108355
+--- /dev/null
++++ b/mpn/sw_64/sw6/nails/addmul_1.asm
+@@ -0,0 +1,394 @@
++dnl Sw_64 sw6 nails mpn_addmul_1.
++
++dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl  * the GNU Lesser General Public License as published by the Free
++dnl    Software Foundation; either version 3 of the License, or (at your
++dnl    option) any later version.
++dnl
++dnl or
++dnl
++dnl  * the GNU General Public License as published by the Free Software
++dnl    Foundation; either version 2 of the License, or (at your option) any
++dnl    later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++C      cycles/limb
++C SW6:     4
++
++C TODO
++C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
++C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
++C    umulh.
++C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
++C    and would work since the loop structure is really regular.
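++C
++C For orientation, one limb step of the nails scheme can be modelled in C.
++C This is a hedged sketch, not the shipped code: umulh_() is a stand-in for
++C the high 64 bits of a full 64x64 product (the umulh instruction), and
++C pend carries both the nail part and the previous high word.
++C
++C	vs  = v << GMP_NAIL_BITS;      /* pre-shifted multiplier (vl0) */
++C	plo = vs * u;                  /* mull:  low 64 product bits   */
++C	phi = umulh_ (vs, u);          /* umulh: high 64 product bits  */
++C	/* the pre-shift aligns the numb/nail split with the mull/umulh
++C	   boundary: plo >> GMP_NAIL_BITS == (v*u) mod 2^GMP_NUMB_BITS,
++C	   and phi == (v*u) >> GMP_NUMB_BITS */
++C	acc   = (plo >> GMP_NAIL_BITS) + rp[i] + pend; /* no overflow  */
++C	rp[i] = acc & GMP_NUMB_MASK;                   /* given >= 2   */
++C	pend  = (acc >> GMP_NUMB_BITS) + phi;          /* nail bits    */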
++ ++C INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`n', `r18') ++define(`vl0',`r19') ++ ++define(`numb_mask',`r6') ++ ++define(`m0a',`r0') ++define(`m0b',`r1') ++define(`m1a',`r2') ++define(`m1b',`r3') ++define(`m2a',`r20') ++define(`m2b',`r21') ++define(`m3a',`r22') ++define(`m3b',`r23') ++ ++define(`acc0',`r25') ++define(`acc1',`r27') ++ ++define(`ul0',`r4') ++define(`ul1',`r5') ++define(`ul2',`r4') ++define(`ul3',`r5') ++ ++define(`rl0',`r24') ++define(`rl1',`r24') ++define(`rl2',`r24') ++define(`rl3',`r24') ++ ++define(`t0',`r7') ++define(`t1',`r8') ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`NUMB_BITS',`GMP_NUMB_BITS') ++ ++dnl This declaration is munged by configure ++NAILS_SUPPORT(2-63) ++ ++ASM_START() ++PROLOGUE(mpn_addmul_1) ++ sll vl0, NAIL_BITS, vl0 ++ ldi numb_mask, -1(r31) ++ srl numb_mask, NAIL_BITS, numb_mask ++ ++ and n, 3, r25 ++ cmpeq r25, 1, r21 ++ bne r21, L(1m4) ++ cmpeq r25, 2, r21 ++ bne r21, L(2m4) ++ beq r25, L(0m4) ++ ++L(3m4): ldl ul3, 0(up) ++ ldi n, -4(n) ++ ldl ul0, 8(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl ul1, 16(up) ++ ldi up, 24(up) ++ ldi rp, -8(rp) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge3) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ addl t0, r31, acc1 ++ addl rl3, acc1, acc1 ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ addl t0, m3b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ br r31, L(ta3) ++ ++L(ge3): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ addl t0, r31, acc1 ++ umulh vl0, ul2, m2b ++ addl rl3, acc1, acc1 ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ addl t0, m3b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ br r31, L(el3) ++ ++L(0m4): ldi n, -8(n) ++ ldl ul2, 0(up) ++ ldl ul3, 8(up) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge4) ++ ++ ldl rl2, 0(rp) ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, r31, acc0 ++ umulh vl0, ul1, m1b ++ addl rl2, acc0, acc0 ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ addl t0, m2b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ br r31, L(ta4) ++ ++L(ge4): ldl rl2, 0(rp) ++ srl m2a,NAIL_BITS, t0 ++ ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ addl t0, r31, acc0 ++ umulh vl0, ul1, m1b ++ addl rl2, acc0, acc0 ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ addl t0, m2b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ br r31, L(el0) ++ ++L(2m4): ldi n, -4(n) ++ ldl ul0, 0(up) ++ ldl ul1, 8(up) ++ ldi up, 16(up) ++ ldi rp, -16(rp) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge2) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ addl t0, r31, acc0 ++ addl rl0, acc0, acc0 ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ br r31, L(ta2) ++ ++L(ge2): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ addl t0, r31, acc0 ++ umulh vl0, ul3, m3b ++ addl rl0, acc0, acc0 ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ ldi rp, 32(rp) ++ mull vl0, ul0, m0a ++ addl t0, m0b, acc1 ++ 
srl acc0,NUMB_BITS, t1 ++ bge n, L(el2) ++ ++ br r31, L(ta6) ++ ++L(1m4): ldi n, -4(n) ++ ldl ul1, 0(up) ++ ldi up, 8(up) ++ ldi rp, -24(rp) ++ bge n, L(ge1) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ addl rl1, t0, acc1 ++ and acc1,numb_mask, r28 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, 24(rp) ++ addl t1, m1b, r0 ++ ret r31, (r26), 1 ++ ++L(ge1): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ ldi rp, 32(rp) ++ mull vl0, ul0, m0a ++ addl t0, r31, acc1 ++ umulh vl0, ul0, m0b ++ addl rl1, acc1, acc1 ++ ldl rl2, 0(rp) ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, m1b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ blt n, L(ta5) ++ ++L(ge5): ldl ul2, 0(up) ++ br r31, L(el1) ++ ++ ALIGN(16) ++L(top): mull vl0, ul0, m0a C U1 ++ addl t0, m0b, acc1 C L0 ++ srl acc0,NUMB_BITS, t1 C U0 ++ stl r28, -24(rp) C L1 ++C ++L(el2): umulh vl0, ul0, m0b C U1 ++ and acc0,numb_mask, r28 C L0 ++ addl rl1, acc1, acc1 C U0 ++ ldl rl2, 0(rp) C L1 ++C ++ unop C U1 ++ addl t1, acc1, acc1 C L0 ++ srl m2a,NAIL_BITS, t0 C U0 ++ ldl ul2, 0(up) C L1 ++C ++ mull vl0, ul1, m1a C U1 ++ addl t0, m1b, acc0 C L0 ++ srl acc1,NUMB_BITS, t1 C U0 ++ stl r28, -16(rp) C L1 ++C ++L(el1): umulh vl0, ul1, m1b C U1 ++ and acc1,numb_mask, r28 C L0 ++ addl rl2, acc0, acc0 C U0 ++ ldl rl3, 8(rp) C L1 ++C ++ ldi n, -4(n) C L1 ++ addl t1, acc0, acc0 C L0 ++ srl m3a,NAIL_BITS, t0 C U0 ++ ldl ul3, 8(up) C L1 ++C ++ mull vl0, ul2, m2a C U1 ++ addl t0, m2b, acc1 C L0 ++ srl acc0,NUMB_BITS, t1 C U0 ++ stl r28, -8(rp) C L1 ++C ++L(el0): umulh vl0, ul2, m2b C U1 ++ and acc0,numb_mask, r28 C L0 ++ addl rl3, acc1, acc1 C U0 ++ ldl rl0, 16(rp) C L1 ++C ++ unop C U1 ++ addl t1, acc1, acc1 C L0 ++ srl m0a,NAIL_BITS, t0 C U0 ++ ldl ul0, 16(up) C L1 ++C ++ mull vl0, ul3, m3a C U1 ++ addl t0, m3b, acc0 C L0 ++ srl acc1,NUMB_BITS, t1 C U0 ++ stl r28, 0(rp) C L1 ++C ++L(el3): umulh vl0, ul3, m3b C U1 ++ and acc1,numb_mask, r28 C L0 ++ addl rl0, acc0, acc0 C U0 ++ ldl rl1, 24(rp) C L1 ++C ++ unop C U1 ++ addl t1, acc0, acc0 C L0 ++ srl m1a,NAIL_BITS, t0 C U0 ++ ldl ul1, 24(up) C L1 ++C ++ ldi up, 32(up) C L0 ++ unop C U1 ++ ldi rp, 32(rp) C L1 ++ bge n, L(top) C U0 ++ ++L(end): mull vl0, ul0, m0a ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ stl r28, -24(rp) ++L(ta6): umulh vl0, ul0, m0b ++ and acc0,numb_mask, r28 ++ addl rl1, acc1, acc1 ++ ldl rl2, 0(rp) ++ addl t1, acc1, acc1 ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, m1b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, -16(rp) ++L(ta5): umulh vl0, ul1, m1b ++ and acc1,numb_mask, r28 ++ addl rl2, acc0, acc0 ++ ldl rl3, 8(rp) ++ addl t1, acc0, acc0 ++ srl m3a,NAIL_BITS, t0 ++ addl t0, m2b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ stl r28, -8(rp) ++ unop ++ ALIGN(16) ++L(ta4): and acc0,numb_mask, r28 ++ addl rl3, acc1, acc1 ++ ldl rl0, 16(rp) ++ addl t1, acc1, acc1 ++ srl m0a,NAIL_BITS, t0 ++ addl t0, m3b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, 0(rp) ++ unop ++ ALIGN(16) ++L(ta3): and acc1,numb_mask, r28 ++ addl rl0, acc0, acc0 ++ ldl rl1, 24(rp) ++ addl t1, acc0, acc0 ++ srl m1a,NAIL_BITS, t0 ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ stl r28, 8(rp) ++ unop ++ ALIGN(16) ++L(ta2): and acc0,numb_mask, r28 ++ addl rl1, acc1, acc1 ++ addl t1, acc1, acc1 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, 16(rp) ++ and 
acc1,numb_mask, r28 ++ addl t1, m1b, r0 ++ stl r28, 24(rp) ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/nails/addmul_2.asm b/mpn/sw_64/sw6/nails/addmul_2.asm +new file mode 100644 +index 0000000..29154b1 +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/addmul_2.asm +@@ -0,0 +1,146 @@ ++dnl Sw_64 sw6 nails mpn_addmul_2. ++ ++dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C Runs at 4.0 cycles/limb. ++ ++C We could either go for 2-way unrolling over 11 cycles, or 2.75 c/l, ++C or 4-way unrolling over 20 cycles, for 2.5 c/l. ++ ++ ++C INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`n',`r18') ++define(`vp',`r19') ++ ++C Useful register aliases ++define(`numb_mask',`r24') ++define(`ulimb',`r25') ++define(`rlimb',`r27') ++ ++define(`m0a',`r0') ++define(`m0b',`r1') ++define(`m1a',`r2') ++define(`m1b',`r3') ++ ++define(`acc0',`r4') ++define(`acc1',`r5') ++ ++define(`v0',`r6') ++define(`v1',`r7') ++ ++C Used for temps: r8 r19 r28 ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`NUMB_BITS',`GMP_NUMB_BITS') ++ ++C This declaration is munged by configure ++NAILS_SUPPORT(3-63) ++ ++ASM_START() ++PROLOGUE(mpn_addmul_2) ++ ldi numb_mask,-1(r31) ++ srl numb_mask,NAIL_BITS,numb_mask ++ ++ ldl v0, 0(vp) ++ ldl v1, 8(vp) ++ ++ bis r31, r31, acc0 C zero acc0 ++ sll v0,NAIL_BITS, v0 ++ bis r31, r31, acc1 C zero acc1 ++ sll v1,NAIL_BITS, v1 ++ bis r31, r31, r19 ++ ++ ldl ulimb, 0(up) ++ ldi up, 8(up) ++ mull v0, ulimb, m0a C U1 ++ umulh v0, ulimb, m0b C U1 ++ mull v1, ulimb, m1a C U1 ++ umulh v1, ulimb, m1b C U1 ++ ldi n, -1(n) ++ beq n, L(end) C U0 ++ ++ ALIGN(16) ++L(top): bis r31, r31, r31 C U1 nop ++ addl r19, acc0, acc0 C U0 propagate nail ++ ldl rlimb, 0(rp) C L0 ++ ldl ulimb, 0(up) C L1 ++ ++ ldi rp, 8(rp) C L1 ++ srl m0a,NAIL_BITS, r8 C U0 ++ ldi up, 8(up) C L0 ++ mull v0, ulimb, m0a C U1 ++ ++ addl r8, acc0, r19 C U0 ++ addl m0b, acc1, acc0 C L1 ++ umulh v0, ulimb, m0b C U1 ++ bis r31, r31, r31 C L0 nop ++ ++ addl rlimb, r19, r19 C L1 FINAL PROD-SUM ++ srl m1a,NAIL_BITS, r8 C U0 ++ ldi n, -1(n) C L0 ++ mull v1, ulimb, m1a C U1 ++ ++ addl r8, acc0, acc0 C U0 ++ bis r31, m1b, acc1 C L1 ++ umulh v1, ulimb, m1b C U1 ++ and r19,numb_mask, r28 C L0 extract numb part ++ ++ unop ++ srl r19,NUMB_BITS, r19 C U1 extract nail part ++ stl r28, -8(rp) C L1 ++ bne n, L(top) C U0 ++ ++L(end): ldl rlimb, 0(rp) ++ addl r19, acc0, acc0 C propagate nail ++ ldi rp, 8(rp) ++ srl 
m0a,NAIL_BITS, r8 C U0 ++ addl r8, acc0, r19 ++ addl m0b, acc1, acc0 ++ addl rlimb, r19, r19 ++ srl m1a,NAIL_BITS, r8 C U0 ++ addl r8, acc0, acc0 ++ bis r31, m1b, acc1 ++ and r19,numb_mask, r28 C extract limb ++ ++ srl r19,NUMB_BITS, r19 C extract nail ++ stl r28, -8(rp) ++ ++ addl r19, acc0, acc0 C propagate nail ++ and acc0,numb_mask, r28 ++ stl r28, 0(rp) ++ srl acc0,NUMB_BITS, r19 ++ addl r19, acc1, r0 ++ ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/nails/addmul_3.asm b/mpn/sw_64/sw6/nails/addmul_3.asm +new file mode 100644 +index 0000000..d2fbd97 +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/addmul_3.asm +@@ -0,0 +1,169 @@ ++dnl Sw_64 sw6 nails mpn_addmul_3. ++ ++dnl Copyright 2002, 2006 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C Runs at 3.0 cycles/limb. ++ ++C With 2-way unrolling, we could probably reach 2.25 c/l (3.33 i/c). 
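++C
++C The three-accumulator rotation in the loop below can be summarised in C
++C (a hedged sketch only; mj_lo/mj_hi denote mull/umulh of the pre-shifted
++C vj with the current up[] limb, cf. the sketch in addmul_1.asm, and pend
++C is the forwarded nail part):
++C
++C	t     = acc0 + pend + (m0_lo >> GMP_NAIL_BITS) + rp[i];
++C	rp[i] = t & GMP_NUMB_MASK;
++C	pend  = t >> GMP_NUMB_BITS;
++C	acc0  = acc1 + m0_hi + (m1_lo >> GMP_NAIL_BITS);
++C	acc1  = acc2 + m1_hi + (m2_lo >> GMP_NAIL_BITS);
++C	acc2  = m2_hi;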
++ ++ ++C INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`n',`r18') ++define(`vp',`r19') ++ ++C Useful register aliases ++define(`numb_mask',`r24') ++define(`ulimb',`r25') ++define(`rlimb',`r27') ++ ++define(`m0a',`r0') ++define(`m0b',`r1') ++define(`m1a',`r2') ++define(`m1b',`r3') ++define(`m2a',`r20') ++define(`m2b',`r21') ++ ++define(`acc0',`r4') ++define(`acc1',`r5') ++define(`acc2',`r22') ++ ++define(`v0',`r6') ++define(`v1',`r7') ++define(`v2',`r23') ++ ++C Used for temps: r8 r19 r28 ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`NUMB_BITS',`GMP_NUMB_BITS') ++ ++C This declaration is munged by configure ++NAILS_SUPPORT(3-63) ++ ++ASM_START() ++PROLOGUE(mpn_addmul_3) ++ ldi numb_mask,-1(r31) ++ srl numb_mask,NAIL_BITS,numb_mask ++ ++ ldl v0, 0(vp) ++ ldl v1, 8(vp) ++ ldl v2, 16(vp) ++ ++ bis r31, r31, acc0 C zero acc0 ++ sll v0,NAIL_BITS, v0 ++ bis r31, r31, acc1 C zero acc1 ++ sll v1,NAIL_BITS, v1 ++ bis r31, r31, acc2 C zero acc2 ++ sll v2,NAIL_BITS, v2 ++ bis r31, r31, r19 ++ ++ ldl ulimb, 0(up) ++ ldi up, 8(up) ++ mull v0, ulimb, m0a C U1 ++ umulh v0, ulimb, m0b C U1 ++ mull v1, ulimb, m1a C U1 ++ umulh v1, ulimb, m1b C U1 ++ ldi n, -1(n) ++ mull v2, ulimb, m2a C U1 ++ umulh v2, ulimb, m2b C U1 ++ beq n, L(end) C U0 ++ ++ ALIGN(16) ++L(top): ldl rlimb, 0(rp) C L1 ++ ldl ulimb, 0(up) C L0 ++ bis r31, r31, r31 C U0 nop ++ addl r19, acc0, acc0 C U1 propagate nail ++ ++ ldi rp, 8(rp) C L1 ++ srl m0a,NAIL_BITS, r8 C U0 ++ ldi up, 8(up) C L0 ++ mull v0, ulimb, m0a C U1 ++ ++ addl r8, acc0, r19 C U0 ++ addl m0b, acc1, acc0 C L1 ++ umulh v0, ulimb, m0b C U1 ++ bis r31, r31, r31 C L0 nop ++ ++ addl rlimb, r19, r19 C L1 ++ srl m1a,NAIL_BITS, r8 C U0 ++ bis r31, r31, r31 C L0 nop ++ mull v1, ulimb, m1a C U1 ++ ++ addl r8, acc0, acc0 C U0 ++ addl m1b, acc2, acc1 C L1 ++ umulh v1, ulimb, m1b C U1 ++ and r19,numb_mask, r28 C L0 extract numb part ++ ++ bis r31, r31, r31 C L1 nop ++ srl m2a,NAIL_BITS, r8 C U0 ++ ldi n, -1(n) C L0 ++ mull v2, ulimb, m2a C U1 ++ ++ addl r8, acc1, acc1 C L0 ++ bis r31, m2b, acc2 C L1 ++ umulh v2, ulimb, m2b C U1 ++ srl r19,NUMB_BITS, r19 C U0 extract nail part ++ ++ stl r28, -8(rp) C L ++ bne n, L(top) C U0 ++ ++L(end): ldl rlimb, 0(rp) ++ addl r19, acc0, acc0 C propagate nail ++ ldi rp, 8(rp) ++ srl m0a,NAIL_BITS, r8 C U0 ++ addl r8, acc0, r19 ++ addl m0b, acc1, acc0 ++ addl rlimb, r19, r19 ++ srl m1a,NAIL_BITS, r8 C U0 ++ addl r8, acc0, acc0 ++ addl m1b, acc2, acc1 ++ and r19,numb_mask, r28 C extract limb ++ srl m2a,NAIL_BITS, r8 C U0 ++ addl r8, acc1, acc1 ++ bis r31, m2b, acc2 ++ srl r19,NUMB_BITS, r19 C extract nail ++ stl r28, -8(rp) ++ ++ addl r19, acc0, acc0 C propagate nail ++ and acc0,numb_mask, r28 ++ stl r28, 0(rp) ++ srl acc0,NUMB_BITS, r19 ++ addl r19, acc1, acc1 ++ ++ and acc1,numb_mask, r28 ++ stl r28, 8(rp) ++ srl acc1,NUMB_BITS, r19 ++ addl r19, acc2, m0a ++ ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/nails/addmul_4.asm b/mpn/sw_64/sw6/nails/addmul_4.asm +new file mode 100644 +index 0000000..f253da1 +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/addmul_4.asm +@@ -0,0 +1,210 @@ ++dnl Sw_64 sw6 nails mpn_addmul_4. ++ ++dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. 
++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C Runs at 2.5 cycles/limb. ++ ++C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding ++C to 3.24 insn/cycle. ++ ++ ++C INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`n',`r18') ++define(`vp',`r19') ++ ++C Useful register aliases ++define(`numb_mask',`r24') ++define(`ulimb',`r25') ++define(`rlimb',`r27') ++ ++define(`m0a',`r0') ++define(`m0b',`r1') ++define(`m1a',`r2') ++define(`m1b',`r3') ++define(`m2a',`r20') ++define(`m2b',`r21') ++define(`m3a',`r12') ++define(`m3b',`r13') ++ ++define(`acc0',`r4') ++define(`acc1',`r5') ++define(`acc2',`r22') ++define(`acc3',`r14') ++ ++define(`v0',`r6') ++define(`v1',`r7') ++define(`v2',`r23') ++define(`v3',`r15') ++ ++C Used for temps: r8 r19 r28 ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`NUMB_BITS',`GMP_NUMB_BITS') ++ ++C This declaration is munged by configure ++NAILS_SUPPORT(4-63) ++ ++ASM_START() ++PROLOGUE(mpn_addmul_4) ++ ldi r30, -240(r30) ++ stl r12, 32(r30) ++ stl r13, 40(r30) ++ stl r14, 48(r30) ++ stl r15, 56(r30) ++ ++ ldi numb_mask,-1(r31) ++ srl numb_mask,NAIL_BITS,numb_mask ++ ++ ldl v0, 0(vp) ++ ldl v1, 8(vp) ++ ldl v2, 16(vp) ++ ldl v3, 24(vp) ++ ++ bis r31, r31, acc0 C zero acc0 ++ sll v0,NAIL_BITS, v0 ++ bis r31, r31, acc1 C zero acc1 ++ sll v1,NAIL_BITS, v1 ++ bis r31, r31, acc2 C zero acc2 ++ sll v2,NAIL_BITS, v2 ++ bis r31, r31, acc3 C zero acc3 ++ sll v3,NAIL_BITS, v3 ++ bis r31, r31, r19 ++ ++ ldl ulimb, 0(up) ++ ldi up, 8(up) ++ mull v0, ulimb, m0a C U1 ++ umulh v0, ulimb, m0b C U1 ++ mull v1, ulimb, m1a C U1 ++ umulh v1, ulimb, m1b C U1 ++ ldi n, -1(n) ++ mull v2, ulimb, m2a C U1 ++ umulh v2, ulimb, m2b C U1 ++ mull v3, ulimb, m3a C U1 ++ umulh v3, ulimb, m3b C U1 ++ beq n, L(end) C U0 ++ ++ ALIGN(16) ++L(top): bis r31, r31, r31 C U1 nop ++ ldl rlimb, 0(rp) C L0 ++ ldl ulimb, 0(up) C L1 ++ addl r19, acc0, acc0 C U0 propagate nail ++ ++ bis r31, r31, r31 C L0 nop ++ bis r31, r31, r31 C U1 nop ++ bis r31, r31, r31 C L1 nop ++ bis r31, r31, r31 C U0 nop ++ ++ ldi rp, 8(rp) C L0 ++ srl m0a,NAIL_BITS, r8 C U0 ++ ldi up, 8(up) C L1 ++ mull v0, ulimb, m0a C U1 ++ ++ addl r8, acc0, r19 C U0 ++ addl m0b, acc1, acc0 C L0 ++ umulh v0, ulimb, m0b C U1 ++ bis r31, r31, r31 C L1 nop ++ ++ addl rlimb, r19, r19 C L0 ++ srl m1a,NAIL_BITS, r8 C U0 ++ bis r31, r31, r31 C L1 nop ++ mull v1, ulimb, m1a C U1 ++ ++ addl r8, acc0, acc0 C U0 ++ addl m1b, acc2, acc1 C L0 ++ umulh v1, ulimb, m1b C U1 ++ and r19,numb_mask, r28 C L1 extract numb part ++ ++ bis r31, r31, r31 C 
L0 nop ++ srl m2a,NAIL_BITS, r8 C U0 ++ ldi n, -1(n) C L1 ++ mull v2, ulimb, m2a C U1 ++ ++ addl r8, acc1, acc1 C L1 ++ addl m2b, acc3, acc2 C L0 ++ umulh v2, ulimb, m2b C U1 ++ srl r19,NUMB_BITS, r19 C U0 extract nail part ++ ++ bis r31, r31, r31 C L0 nop ++ srl m3a,NAIL_BITS, r8 C U0 ++ stl r28, -8(rp) C L1 ++ mull v3, ulimb, m3a C U1 ++ ++ addl r8, acc2, acc2 C L0 ++ bis r31, m3b, acc3 C L1 ++ umulh v3, ulimb, m3b C U1 ++ bne n, L(top) C U0 ++ ++L(end): ldl rlimb, 0(rp) ++ addl r19, acc0, acc0 C propagate nail ++ ldi rp, 8(rp) C FIXME: DELETE ++ srl m0a,NAIL_BITS, r8 C U0 ++ addl r8, acc0, r19 ++ addl m0b, acc1, acc0 ++ addl rlimb, r19, r19 ++ srl m1a,NAIL_BITS, r8 C U0 ++ addl r8, acc0, acc0 ++ addl m1b, acc2, acc1 ++ and r19,numb_mask, r28 C extract limb ++ srl m2a,NAIL_BITS, r8 C U0 ++ addl r8, acc1, acc1 ++ addl m2b, acc3, acc2 ++ srl r19,NUMB_BITS, r19 C extract nail ++ srl m3a,NAIL_BITS, r8 C U0 ++ stl r28, -8(rp) ++ addl r8, acc2, acc2 ++ bis r31, m3b, acc3 ++ ++ addl r19, acc0, acc0 C propagate nail ++ and acc0,numb_mask, r28 ++ stl r28, 0(rp) ++ srl acc0,NUMB_BITS, r19 ++ addl r19, acc1, acc1 ++ ++ and acc1,numb_mask, r28 ++ stl r28, 8(rp) ++ srl acc1,NUMB_BITS, r19 ++ addl r19, acc2, acc2 ++ ++ and acc2,numb_mask, r28 ++ stl r28, 16(rp) ++ srl acc2,NUMB_BITS, r19 ++ addl r19, acc3, r0 ++ ++ ldl r12, 32(r30) ++ ldl r13, 40(r30) ++ ldl r14, 48(r30) ++ ldl r15, 56(r30) ++ ldi r30, 240(r30) ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/nails/aors_n.asm b/mpn/sw_64/sw6/nails/aors_n.asm +new file mode 100644 +index 0000000..cc183d4 +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/aors_n.asm +@@ -0,0 +1,233 @@ ++dnl Sw_64 sw6 nails mpn_add_n and mpn_sub_n. ++ ++dnl Copyright 2002, 2006 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++ ++dnl Runs at 2.5 cycles/limb. It would be possible to reach 2.0 cycles/limb ++dnl with 8-way unrolling. 
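++dnl
++dnl With nails there is no compare-based carry: the carry (or borrow) of
++dnl each OP lands above the numb field and is extracted with a plain shift.
++dnl As a hedged C model (not the shipped code):
++dnl
++dnl	r     = u OP v OP cy;       /* OP is + for add_n, - for sub_n   */
++dnl	rp[i] = r & GMP_NUMB_MASK;
++dnl	cy    = r >> CYSH;          /* add_n: CYSH = GMP_NUMB_BITS, the
++dnl	                               carry sits in the nail field;
++dnl	                               sub_n: CYSH = 63, the borrow is
++dnl	                               the sign bit of the difference   */
++dnl
++dnl which is why CYSH below is GMP_NUMB_BITS for add_n but 63 for sub_n;
++dnl the final `and r20, 1, r0' reduces this to the 0/1 return value.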
++ ++include(`../config.m4') ++ ++dnl INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`vp',`r18') ++define(`n',`r19') ++ ++define(`rl0',`r0') ++define(`rl1',`r1') ++define(`rl2',`r2') ++define(`rl3',`r3') ++ ++define(`ul0',`r4') ++define(`ul1',`r5') ++define(`ul2',`r6') ++define(`ul3',`r7') ++ ++define(`vl0',`r22') ++define(`vl1',`r23') ++define(`vl2',`r24') ++define(`vl3',`r25') ++ ++define(`numb_mask',`r21') ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`CYSH',`GMP_NUMB_BITS') ++ ++dnl This declaration is munged by configure ++NAILS_SUPPORT(1-63) ++ ++ifdef(`OPERATION_add_n', ` ++ define(`OP', addl) ++ define(`CYSH',`GMP_NUMB_BITS') ++ define(`func', mpn_add_n)') ++ifdef(`OPERATION_sub_n', ` ++ define(`OP', subl) ++ define(`CYSH',63) ++ define(`func', mpn_sub_n)') ++ ++MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n) ++ ++ASM_START() ++PROLOGUE(func) ++ ldi numb_mask, -1(r31) ++ srl numb_mask, NAIL_BITS, numb_mask ++ bis r31, r31, r20 ++ ++ and n, 3, r25 ++ ldi n, -4(n) ++ beq r25, L(ge4) ++ ++L(lp0): ldl ul0, 0(up) ++ ldi up, 8(up) ++ ldl vl0, 0(vp) ++ ldi vp, 8(vp) ++ ldi rp, 8(rp) ++ ldi r25, -1(r25) ++ OP ul0, vl0, rl0 ++ OP rl0, r20, rl0 ++ and rl0, numb_mask, r28 ++ stl r28, -8(rp) ++ srl rl0, CYSH, r20 ++ bne r25, L(lp0) ++ ++ blt n, L(ret) ++ ++L(ge4): ldl ul0, 0(up) ++ ldl vl0, 0(vp) ++ ldl ul1, 8(up) ++ ldl vl1, 8(vp) ++ ldl ul2, 16(up) ++ ldl vl2, 16(vp) ++ ldl ul3, 24(up) ++ ldl vl3, 24(vp) ++ ldi up, 32(up) ++ ldi vp, 32(vp) ++ ldi n, -4(n) ++ bge n, L(ge8) ++ ++ OP ul0, vl0, rl0 C main-add 0 ++ OP rl0, r20, rl0 C cy-add 0 ++ OP ul1, vl1, rl1 C main-add 1 ++ srl rl0, CYSH, r20 C gen cy 0 ++ OP rl1, r20, rl1 C cy-add 1 ++ and rl0,numb_mask, r27 ++ br r31, L(cj0) ++ ++L(ge8): OP ul0, vl0, rl0 C main-add 0 ++ ldl ul0, 0(up) ++ ldl vl0, 0(vp) ++ OP rl0, r20, rl0 C cy-add 0 ++ OP ul1, vl1, rl1 C main-add 1 ++ srl rl0, CYSH, r20 C gen cy 0 ++ ldl ul1, 8(up) ++ ldl vl1, 8(vp) ++ OP rl1, r20, rl1 C cy-add 1 ++ and rl0,numb_mask, r27 ++ OP ul2, vl2, rl2 C main-add 2 ++ srl rl1, CYSH, r20 C gen cy 1 ++ ldl ul2, 16(up) ++ ldl vl2, 16(vp) ++ OP rl2, r20, rl2 C cy-add 2 ++ and rl1,numb_mask, r28 ++ stl r27, 0(rp) ++ OP ul3, vl3, rl3 C main-add 3 ++ srl rl2, CYSH, r20 C gen cy 2 ++ ldl ul3, 24(up) ++ ldl vl3, 24(vp) ++ OP rl3, r20, rl3 C cy-add 3 ++ and rl2,numb_mask, r27 ++ stl r28, 8(rp) ++ ldi rp, 32(rp) ++ ldi up, 32(up) ++ ldi vp, 32(vp) ++ ldi n, -4(n) ++ blt n, L(end) ++ ++ ALIGN(32) ++L(top): OP ul0, vl0, rl0 C main-add 0 ++ srl rl3, CYSH, r20 C gen cy 3 ++ ldl ul0, 0(up) ++ ldl vl0, 0(vp) ++ ++ OP rl0, r20, rl0 C cy-add 0 ++ and rl3,numb_mask, r28 ++ stl r27, -16(rp) ++ bis r31, r31, r31 ++ ++ OP ul1, vl1, rl1 C main-add 1 ++ srl rl0, CYSH, r20 C gen cy 0 ++ ldl ul1, 8(up) ++ ldl vl1, 8(vp) ++ ++ OP rl1, r20, rl1 C cy-add 1 ++ and rl0,numb_mask, r27 ++ stl r28, -8(rp) ++ bis r31, r31, r31 ++ ++ OP ul2, vl2, rl2 C main-add 2 ++ srl rl1, CYSH, r20 C gen cy 1 ++ ldl ul2, 16(up) ++ ldl vl2, 16(vp) ++ ++ OP rl2, r20, rl2 C cy-add 2 ++ and rl1,numb_mask, r28 ++ stl r27, 0(rp) ++ bis r31, r31, r31 ++ ++ OP ul3, vl3, rl3 C main-add 3 ++ srl rl2, CYSH, r20 C gen cy 2 ++ ldl ul3, 24(up) ++ ldl vl3, 24(vp) ++ ++ OP rl3, r20, rl3 C cy-add 3 ++ and rl2,numb_mask, r27 ++ stl r28, 8(rp) ++ bis r31, r31, r31 ++ ++ bis r31, r31, r31 ++ ldi n, -4(n) ++ ldi up, 32(up) ++ ldi vp, 32(vp) ++ ++ bis r31, r31, r31 ++ bis r31, r31, r31 ++ ldi rp, 32(rp) ++ bge n, L(top) ++ ++L(end): OP ul0, vl0, rl0 C main-add 0 ++ srl rl3, CYSH, r20 C gen cy 3 ++ OP rl0, r20, rl0 C cy-add 0 ++ and 
rl3,numb_mask, r28 ++ stl r27, -16(rp) ++ OP ul1, vl1, rl1 C main-add 1 ++ srl rl0, CYSH, r20 C gen cy 0 ++ OP rl1, r20, rl1 C cy-add 1 ++ and rl0,numb_mask, r27 ++ stl r28, -8(rp) ++L(cj0): OP ul2, vl2, rl2 C main-add 2 ++ srl rl1, CYSH, r20 C gen cy 1 ++ OP rl2, r20, rl2 C cy-add 2 ++ and rl1,numb_mask, r28 ++ stl r27, 0(rp) ++ OP ul3, vl3, rl3 C main-add 3 ++ srl rl2, CYSH, r20 C gen cy 2 ++ OP rl3, r20, rl3 C cy-add 3 ++ and rl2,numb_mask, r27 ++ stl r28, 8(rp) ++ ++ srl rl3, CYSH, r20 C gen cy 3 ++ and rl3,numb_mask, r28 ++ stl r27, 16(rp) ++ stl r28, 24(rp) ++ ++L(ret): and r20, 1, r0 ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/nails/gmp-mparam.h b/mpn/sw_64/sw6/nails/gmp-mparam.h +new file mode 100644 +index 0000000..7949fe8 +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/gmp-mparam.h +@@ -0,0 +1,72 @@ ++/* gmp-mparam.h -- Compiler/machine parameter header file. ++ ++Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. 
*/ ++ ++#define GMP_LIMB_BITS 64 ++#define GMP_LIMB_BYTES 8 ++ ++/* Generated by tuneup.c, 2004-02-07, gcc 3.3 */ ++ ++#define MUL_TOOM22_THRESHOLD 40 ++#define MUL_TOOM33_THRESHOLD 236 ++ ++#define SQR_BASECASE_THRESHOLD 7 /* karatsuba */ ++#define SQR_TOOM2_THRESHOLD 0 /* never sqr_basecase */ ++#define SQR_TOOM3_THRESHOLD 120 ++ ++#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ ++#define DIV_DC_THRESHOLD 48 ++#define POWM_THRESHOLD 113 ++ ++#define HGCD_THRESHOLD 78 ++#define GCD_ACCEL_THRESHOLD 3 ++#define GCD_DC_THRESHOLD 392 ++#define JACOBI_BASE_METHOD 1 ++ ++#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ ++#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ ++#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ ++#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ ++#define USE_PREINV_DIVREM_1 0 /* no preinv with nails */ ++#define USE_PREINV_MOD_1 0 /* no preinv with nails */ ++#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* no preinv with nails */ ++#define DIVEXACT_1_THRESHOLD 0 /* always */ ++#define MODEXACT_1_ODD_THRESHOLD 0 /* always */ ++ ++#define GET_STR_DC_THRESHOLD 15 ++#define GET_STR_PRECOMPUTE_THRESHOLD 24 ++#define SET_STR_THRESHOLD 6336 ++ ++#define MUL_FFT_TABLE { 688, 1440, 3648, 6400, 25600, 0 } ++#define MUL_FFT_MODF_THRESHOLD 488 ++#define MUL_FFT_THRESHOLD 3712 ++ ++#define SQR_FFT_TABLE { 432, 864, 3136, 6400, 25600, 0 } ++#define SQR_FFT_MODF_THRESHOLD 480 ++#define SQR_FFT_THRESHOLD 2976 +diff --git a/mpn/sw_64/sw6/nails/mul_1.asm b/mpn/sw_64/sw6/nails/mul_1.asm +new file mode 100644 +index 0000000..63c14ad +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/mul_1.asm +@@ -0,0 +1,362 @@ ++dnl Sw_64 sw6 nails mpn_mul_1. ++ ++dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 3.25 ++ ++C TODO ++C * Reroll loop for 3.0 c/l with current 4-way unrolling. ++C * The loop is overscheduled wrt loads and wrt multiplies, in particular ++C umulh. ++C * Use FP loop count and multiple exit points, that would simplify feed-in lp0 ++C and would work since the loop structure is really regular. 
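A hedged C model of the per-limb recurrence this file implements may help here: the multiplier is pre-shifted left by GMP_NAIL_BITS, so the mull/umulh pair splits the product exactly at the numb/nail boundary. The sketch assumes 64-bit limbs and a nails build, and uses a 128-bit product in place of mull/umulh; the function name is hypothetical:

    #include <gmp.h>

    mp_limb_t
    nails_mul_1_model (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
                       mp_limb_t v)
    {
      mp_limb_t cy = 0;
      v <<= GMP_NAIL_BITS;                      /* sll vl0, NAIL_BITS, vl0 */
      for (mp_size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) v * up[i];
          mp_limb_t lo = (mp_limb_t) p;         /* mull  */
          mp_limb_t hi = (mp_limb_t) (p >> 64); /* umulh */
          mp_limb_t acc = (lo >> GMP_NAIL_BITS) + cy;
          rp[i] = acc & GMP_NUMB_MASK;          /* extract limb */
          cy = hi + (acc >> GMP_NUMB_BITS);     /* high half plus nail carry */
        }
      return cy;
    }

The unrolled assembly below computes the same recurrence four limbs at a time, keeping the mull/umulh results of adjacent limbs in flight simultaneously.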
++ ++C INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`n', `r18') ++define(`vl0',`r19') ++ ++define(`numb_mask',`r6') ++ ++define(`m0a',`r0') ++define(`m0b',`r1') ++define(`m1a',`r2') ++define(`m1b',`r3') ++define(`m2a',`r20') ++define(`m2b',`r21') ++define(`m3a',`r22') ++define(`m3b',`r23') ++ ++define(`acc0',`r25') ++define(`acc1',`r27') ++ ++define(`ul0',`r4') ++define(`ul1',`r5') ++define(`ul2',`r4') ++define(`ul3',`r5') ++ ++define(`rl0',`r24') ++define(`rl1',`r24') ++define(`rl2',`r24') ++define(`rl3',`r24') ++ ++define(`t0',`r7') ++define(`t1',`r8') ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`NUMB_BITS',`GMP_NUMB_BITS') ++ ++dnl This declaration is munged by configure ++NAILS_SUPPORT(1-63) ++ ++ASM_START() ++PROLOGUE(mpn_mul_1) ++ sll vl0, NAIL_BITS, vl0 ++ ldi numb_mask, -1(r31) ++ srl numb_mask, NAIL_BITS, numb_mask ++ ++ and n, 3, r25 ++ cmpeq r25, 1, r21 ++ bne r21, L(1m4) ++ cmpeq r25, 2, r21 ++ bne r21, L(2m4) ++ beq r25, L(0m4) ++ ++L(3m4): ldl ul3, 0(up) ++ ldi n, -4(n) ++ ldl ul0, 8(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl ul1, 16(up) ++ ldi up, 24(up) ++ ldi rp, -8(rp) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge3) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ srl m3a,NAIL_BITS, t0 ++ addl t0, r31, acc1 ++ srl m0a,NAIL_BITS, t0 ++ addl t0, m3b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ br r31, L(ta3) ++ ++L(ge3): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ srl m3a,NAIL_BITS, t0 ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ addl t0, r31, acc1 ++ umulh vl0, ul2, m2b ++ srl m0a,NAIL_BITS, t0 ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ addl t0, m3b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ br r31, L(el3) ++ ++L(0m4): ldi n, -8(n) ++ ldl ul2, 0(up) ++ ldl ul3, 8(up) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge4) ++ ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, r31, acc0 ++ umulh vl0, ul1, m1b ++ srl m3a,NAIL_BITS, t0 ++ addl t0, m2b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ br r31, L(ta4) ++ ++L(ge4): srl m2a,NAIL_BITS, t0 ++ ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ addl t0, r31, acc0 ++ umulh vl0, ul1, m1b ++ srl m3a,NAIL_BITS, t0 ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ addl t0, m2b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ br r31, L(el0) ++ ++L(2m4): ldi n, -4(n) ++ ldl ul0, 0(up) ++ ldl ul1, 8(up) ++ ldi up, 16(up) ++ ldi rp, -16(rp) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge2) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ srl m0a,NAIL_BITS, t0 ++ addl t0, r31, acc0 ++ srl m1a,NAIL_BITS, t0 ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ br r31, L(ta2) ++ ++L(ge2): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ srl m0a,NAIL_BITS, t0 ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ addl t0, r31, acc0 ++ umulh vl0, ul3, m3b ++ srl m1a,NAIL_BITS, t0 ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ ldi rp, 32(rp) ++ mull vl0, ul0, m0a ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ bge n, L(el2) ++ ++ br r31, L(ta6) ++ ++L(1m4): ldi n, -4(n) ++ ldl ul1, 0(up) ++ ldi up, 8(up) ++ ldi rp, -24(rp) ++ bge n, L(ge1) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ srl m1a,NAIL_BITS, t0 ++ addl t0, r31, acc1 ++ and acc1,numb_mask, r28 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, 24(rp) ++ addl t1, m1b, r0 ++ ret r31, (r26), 1 ++ 
++L(ge1): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ srl m1a,NAIL_BITS, t0 ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ ldi rp, 32(rp) ++ mull vl0, ul0, m0a ++ addl t0, r31, acc1 ++ umulh vl0, ul0, m0b ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, m1b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ blt n, L(ta5) ++ ++L(ge5): ldl ul2, 0(up) ++ br r31, L(el1) ++ ++ ALIGN(16) ++L(top): mull vl0, ul0, m0a C U1 ++ addl t0, m0b, acc1 C L0 ++ srl acc0,NUMB_BITS, t1 C U0 ++ stl r28, -24(rp) C L1 ++C ++L(el2): umulh vl0, ul0, m0b C U1 ++ and acc0,numb_mask, r28 C L0 ++ unop C U0 ++ unop C L1 ++C ++ unop C U1 ++ addl t1, acc1, acc1 C L0 ++ srl m2a,NAIL_BITS, t0 C U0 ++ ldl ul2, 0(up) C L1 ++C ++ mull vl0, ul1, m1a C U1 ++ addl t0, m1b, acc0 C L0 ++ srl acc1,NUMB_BITS, t1 C U0 ++ stl r28, -16(rp) C L1 ++C ++L(el1): umulh vl0, ul1, m1b C U1 ++ and acc1,numb_mask, r28 C L0 ++ unop C U0 ++ ldi n, -4(n) C L1 ++C ++ unop C U1 ++ addl t1, acc0, acc0 C L0 ++ srl m3a,NAIL_BITS, t0 C U0 ++ ldl ul3, 8(up) C L1 ++C ++ mull vl0, ul2, m2a C U1 ++ addl t0, m2b, acc1 C L0 ++ srl acc0,NUMB_BITS, t1 C U0 ++ stl r28, -8(rp) C L1 ++C ++L(el0): umulh vl0, ul2, m2b C U1 ++ and acc0,numb_mask, r28 C L0 ++ unop C U0 ++ unop C L1 ++C ++ unop C U1 ++ addl t1, acc1, acc1 C L0 ++ srl m0a,NAIL_BITS, t0 C U0 ++ ldl ul0, 16(up) C L1 ++C ++ mull vl0, ul3, m3a C U1 ++ addl t0, m3b, acc0 C L0 ++ srl acc1,NUMB_BITS, t1 C U0 ++ stl r28, 0(rp) C L1 ++C ++L(el3): umulh vl0, ul3, m3b C U1 ++ and acc1,numb_mask, r28 C L0 ++ unop C U0 ++ unop C L1 ++C ++ unop C U1 ++ addl t1, acc0, acc0 C L0 ++ srl m1a,NAIL_BITS, t0 C U0 ++ ldl ul1, 24(up) C L1 ++C ++ ldi up, 32(up) C L0 ++ unop C U1 ++ ldi rp, 32(rp) C L1 ++ bge n, L(top) C U0 ++ ++L(end): mull vl0, ul0, m0a ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ stl r28, -24(rp) ++L(ta6): umulh vl0, ul0, m0b ++ and acc0,numb_mask, r28 ++ addl t1, acc1, acc1 ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, m1b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, -16(rp) ++L(ta5): umulh vl0, ul1, m1b ++ and acc1,numb_mask, r28 ++ addl t1, acc0, acc0 ++ srl m3a,NAIL_BITS, t0 ++ addl t0, m2b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ stl r28, -8(rp) ++ ALIGN(16) ++L(ta4): and acc0,numb_mask, r28 ++ addl t1, acc1, acc1 ++ srl m0a,NAIL_BITS, t0 ++ addl t0, m3b, acc0 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, 0(rp) ++ unop ++ ALIGN(16) ++L(ta3): and acc1,numb_mask, r28 ++ addl t1, acc0, acc0 ++ srl m1a,NAIL_BITS, t0 ++ addl t0, m0b, acc1 ++ srl acc0,NUMB_BITS, t1 ++ stl r28, 8(rp) ++ unop ++ ALIGN(16) ++L(ta2): and acc0,numb_mask, r28 ++ addl t1, acc1, acc1 ++ srl acc1,NUMB_BITS, t1 ++ stl r28, 16(rp) ++ and acc1,numb_mask, r28 ++ addl t1, m1b, r0 ++ stl r28, 24(rp) ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/nails/submul_1.asm b/mpn/sw_64/sw6/nails/submul_1.asm +new file mode 100644 +index 0000000..3ac8f2e +--- /dev/null ++++ b/mpn/sw_64/sw6/nails/submul_1.asm +@@ -0,0 +1,394 @@ ++dnl Sw_64 sw6 nails mpn_submul_1. ++ ++dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. 
++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 4 ++ ++C TODO ++C * Reroll loop for 3.75 c/l with current 4-way unrolling. ++C * The loop is overscheduled wrt loads and wrt multiplies, in particular ++C umulh. ++C * Use FP loop count and multiple exit points, that would simplify feed-in lp0 ++C and would work since the loop structure is really regular. ++ ++C INPUT PARAMETERS ++define(`rp',`r16') ++define(`up',`r17') ++define(`n', `r18') ++define(`vl0',`r19') ++ ++define(`numb_mask',`r6') ++ ++define(`m0a',`r0') ++define(`m0b',`r1') ++define(`m1a',`r2') ++define(`m1b',`r3') ++define(`m2a',`r20') ++define(`m2b',`r21') ++define(`m3a',`r22') ++define(`m3b',`r23') ++ ++define(`acc0',`r25') ++define(`acc1',`r27') ++ ++define(`ul0',`r4') ++define(`ul1',`r5') ++define(`ul2',`r4') ++define(`ul3',`r5') ++ ++define(`rl0',`r24') ++define(`rl1',`r24') ++define(`rl2',`r24') ++define(`rl3',`r24') ++ ++define(`t0',`r7') ++define(`t1',`r8') ++ ++define(`NAIL_BITS',`GMP_NAIL_BITS') ++define(`NUMB_BITS',`GMP_NUMB_BITS') ++ ++dnl This declaration is munged by configure ++NAILS_SUPPORT(2-63) ++ ++ASM_START() ++PROLOGUE(mpn_submul_1) ++ sll vl0, NAIL_BITS, vl0 ++ ldi numb_mask, -1(r31) ++ srl numb_mask, NAIL_BITS, numb_mask ++ ++ and n, 3, r25 ++ cmpeq r25, 1, r21 ++ bne r21, L(1m4) ++ cmpeq r25, 2, r21 ++ bne r21, L(2m4) ++ beq r25, L(0m4) ++ ++L(3m4): ldl ul3, 0(up) ++ ldi n, -4(n) ++ ldl ul0, 8(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl ul1, 16(up) ++ ldi up, 24(up) ++ ldi rp, -8(rp) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge3) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ addl t0, r31, acc1 ++ subl rl3, acc1, acc1 ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ addl t0, m3b, acc0 ++ sra acc1,NUMB_BITS, t1 ++ br r31, L(ta3) ++ ++L(ge3): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ addl t0, r31, acc1 ++ umulh vl0, ul2, m2b ++ subl rl3, acc1, acc1 ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ addl t0, m3b, acc0 ++ sra acc1,NUMB_BITS, t1 ++ br r31, L(el3) ++ ++L(0m4): ldi n, -8(n) ++ ldl ul2, 0(up) ++ ldl ul3, 8(up) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge4) ++ ++ ldl rl2, 0(rp) ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, 
ul1, m1a ++ addl t0, r31, acc0 ++ umulh vl0, ul1, m1b ++ subl rl2, acc0, acc0 ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ addl t0, m2b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ br r31, L(ta4) ++ ++L(ge4): ldl rl2, 0(rp) ++ srl m2a,NAIL_BITS, t0 ++ ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ addl t0, r31, acc0 ++ umulh vl0, ul1, m1b ++ subl rl2, acc0, acc0 ++ ldl rl3, 8(rp) ++ srl m3a,NAIL_BITS, t0 ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ addl t0, m2b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ br r31, L(el0) ++ ++L(2m4): ldi n, -4(n) ++ ldl ul0, 0(up) ++ ldl ul1, 8(up) ++ ldi up, 16(up) ++ ldi rp, -16(rp) ++ mull vl0, ul0, m0a ++ umulh vl0, ul0, m0b ++ bge n, L(ge2) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ addl t0, r31, acc0 ++ subl rl0, acc0, acc0 ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ addl t0, m0b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ br r31, L(ta2) ++ ++L(ge2): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl rl0, 16(rp) ++ srl m0a,NAIL_BITS, t0 ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ addl t0, r31, acc0 ++ umulh vl0, ul3, m3b ++ subl rl0, acc0, acc0 ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ ldi rp, 32(rp) ++ mull vl0, ul0, m0a ++ addl t0, m0b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ bge n, L(el2) ++ ++ br r31, L(ta6) ++ ++L(1m4): ldi n, -4(n) ++ ldl ul1, 0(up) ++ ldi up, 8(up) ++ ldi rp, -24(rp) ++ bge n, L(ge1) ++ ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ subl rl1, t0, acc1 ++ and acc1,numb_mask, r28 ++ sra acc1,NUMB_BITS, t1 ++ stl r28, 24(rp) ++ subl m1b, t1, r0 ++ ret r31, (r26), 1 ++ ++L(ge1): ldl ul2, 0(up) ++ mull vl0, ul1, m1a ++ umulh vl0, ul1, m1b ++ ldl ul3, 8(up) ++ ldi n, -4(n) ++ mull vl0, ul2, m2a ++ umulh vl0, ul2, m2b ++ ldl ul0, 16(up) ++ mull vl0, ul3, m3a ++ umulh vl0, ul3, m3b ++ ldl rl1, 24(rp) ++ srl m1a,NAIL_BITS, t0 ++ ldl ul1, 24(up) ++ ldi up, 32(up) ++ ldi rp, 32(rp) ++ mull vl0, ul0, m0a ++ addl t0, r31, acc1 ++ umulh vl0, ul0, m0b ++ subl rl1, acc1, acc1 ++ ldl rl2, 0(rp) ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, m1b, acc0 ++ sra acc1,NUMB_BITS, t1 ++ blt n, L(ta5) ++ ++L(ge5): ldl ul2, 0(up) ++ br r31, L(el1) ++ ++ ALIGN(16) ++L(top): mull vl0, ul0, m0a C U1 ++ addl t0, m0b, acc1 C L0 ++ sra acc0,NUMB_BITS, t1 C U0 ++ stl r28, -24(rp) C L1 ++C ++L(el2): umulh vl0, ul0, m0b C U1 ++ and acc0,numb_mask, r28 C L0 ++ subl rl1, acc1, acc1 C U0 ++ ldl rl2, 0(rp) C L1 ++C ++ unop C U1 ++ addl t1, acc1, acc1 C L0 ++ srl m2a,NAIL_BITS, t0 C U0 ++ ldl ul2, 0(up) C L1 ++C ++ mull vl0, ul1, m1a C U1 ++ addl t0, m1b, acc0 C L0 ++ sra acc1,NUMB_BITS, t1 C U0 ++ stl r28, -16(rp) C L1 ++C ++L(el1): umulh vl0, ul1, m1b C U1 ++ and acc1,numb_mask, r28 C L0 ++ subl rl2, acc0, acc0 C U0 ++ ldl rl3, 8(rp) C L1 ++C ++ ldi n, -4(n) C L1 ++ addl t1, acc0, acc0 C L0 ++ srl m3a,NAIL_BITS, t0 C U0 ++ ldl ul3, 8(up) C L1 ++C ++ mull vl0, ul2, m2a C U1 ++ addl t0, m2b, acc1 C L0 ++ sra acc0,NUMB_BITS, t1 C U0 ++ stl r28, -8(rp) C L1 ++C ++L(el0): umulh vl0, ul2, m2b C U1 ++ and acc0,numb_mask, r28 C L0 ++ subl rl3, acc1, acc1 C U0 ++ ldl rl0, 16(rp) C L1 ++C ++ unop C U1 ++ addl t1, acc1, acc1 C L0 ++ srl m0a,NAIL_BITS, t0 C U0 ++ ldl ul0, 16(up) C L1 ++C ++ mull vl0, ul3, m3a C U1 ++ addl t0, m3b, acc0 C L0 ++ sra acc1,NUMB_BITS, t1 C U0 ++ stl r28, 0(rp) C L1 ++C ++L(el3): umulh vl0, ul3, m3b C U1 ++ and acc1,numb_mask, r28 C 
L0 ++ subl rl0, acc0, acc0 C U0 ++ ldl rl1, 24(rp) C L1 ++C ++ unop C U1 ++ addl t1, acc0, acc0 C L0 ++ srl m1a,NAIL_BITS, t0 C U0 ++ ldl ul1, 24(up) C L1 ++C ++ ldi up, 32(up) C L0 ++ unop C U1 ++ ldi rp, 32(rp) C L1 ++ bge n, L(top) C U0 ++ ++L(end): mull vl0, ul0, m0a ++ addl t0, m0b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ stl r28, -24(rp) ++L(ta6): umulh vl0, ul0, m0b ++ and acc0,numb_mask, r28 ++ subl rl1, acc1, acc1 ++ ldl rl2, 0(rp) ++ addl t1, acc1, acc1 ++ srl m2a,NAIL_BITS, t0 ++ mull vl0, ul1, m1a ++ addl t0, m1b, acc0 ++ sra acc1,NUMB_BITS, t1 ++ stl r28, -16(rp) ++L(ta5): umulh vl0, ul1, m1b ++ and acc1,numb_mask, r28 ++ subl rl2, acc0, acc0 ++ ldl rl3, 8(rp) ++ addl t1, acc0, acc0 ++ srl m3a,NAIL_BITS, t0 ++ addl t0, m2b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ stl r28, -8(rp) ++ unop ++ ALIGN(16) ++L(ta4): and acc0,numb_mask, r28 ++ subl rl3, acc1, acc1 ++ ldl rl0, 16(rp) ++ addl t1, acc1, acc1 ++ srl m0a,NAIL_BITS, t0 ++ addl t0, m3b, acc0 ++ sra acc1,NUMB_BITS, t1 ++ stl r28, 0(rp) ++ unop ++ ALIGN(16) ++L(ta3): and acc1,numb_mask, r28 ++ subl rl0, acc0, acc0 ++ ldl rl1, 24(rp) ++ addl t1, acc0, acc0 ++ srl m1a,NAIL_BITS, t0 ++ addl t0, m0b, acc1 ++ sra acc0,NUMB_BITS, t1 ++ stl r28, 8(rp) ++ unop ++ ALIGN(16) ++L(ta2): and acc0,numb_mask, r28 ++ subl rl1, acc1, acc1 ++ addl t1, acc1, acc1 ++ sra acc1,NUMB_BITS, t1 ++ stl r28, 16(rp) ++ and acc1,numb_mask, r28 ++ subl m1b, t1, r0 ++ stl r28, 24(rp) ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6/slot.pl b/mpn/sw_64/sw6/slot.pl +new file mode 100755 +index 0000000..dbb40b4 +--- /dev/null ++++ b/mpn/sw_64/sw6/slot.pl +@@ -0,0 +1,318 @@ ++#!/usr/bin/perl -w ++ ++# Copyright 2000, 2001, 2003-2005, 2011 Free Software Foundation, Inc. ++# ++# This file is part of the GNU MP Library. ++# ++# The GNU MP Library is free software; you can redistribute it and/or modify ++# it under the terms of either: ++# ++# * the GNU Lesser General Public License as published by the Free ++# Software Foundation; either version 3 of the License, or (at your ++# option) any later version. ++# ++# or ++# ++# * the GNU General Public License as published by the Free Software ++# Foundation; either version 2 of the License, or (at your option) any ++# later version. ++# ++# or both in parallel, as here. ++# ++# The GNU MP Library is distributed in the hope that it will be useful, but ++# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++# for more details. ++# ++# You should have received copies of the GNU General Public License and the ++# GNU Lesser General Public License along with the GNU MP Library. If not, ++# see https://www.gnu.org/licenses/. ++ ++ ++# Usage: slot.pl [filename.o]... ++# ++# Run "objdump" to produce a disassembly of the given object file(s) and ++# annotate the output with "U" or "L" slotting which Sw_64 SW6 will use. ++# ++# When an instruction is E (ie. either U or L), an "eU" or "eL" is shown, as ++# a reminder that it wasn't a fixed requirement that gave the U or L, but ++# the octaword slotting rules. ++# ++# If an instruction is not recognised, that octaword does not get any U/L ++# shown, only lower-case "u", "l" or "e" for the instructions which are ++# known. Add any unknown instructions to %optable below. ++ ++ ++use strict; ++ ++# The U or L which various instructions demand, or E if either. 
++# ++my %optable = ++ ( ++ 'addl' => 'E', ++ 'and' => 'E', ++ 'andnot' => 'E', ++ 'beq' => 'U', ++ 'bge' => 'U', ++ 'bgt' => 'U', ++ 'bic' => 'E', ++ 'bis' => 'E', ++ 'blt' => 'U', ++ 'bne' => 'U', ++ 'br' => 'L', ++ 'clr' => 'E', ++ 'cmpule' => 'E', ++ 'cmpult' => 'E', ++ 'cmpeq' => 'E', ++ 'seleq' => 'E', ++ 'selne' => 'E', ++ 'ctpop' => 'U', ++ 'ctlz' => 'U', ++ 'cttz' => 'U', ++ 'ext0b' => 'U', ++ 'ext6b' => 'U', ++ 'ext2b' => 'U', ++ 'ext7b' => 'U', ++ 'ext3b' => 'U', ++ 'ext5b' => 'U', ++ 'ext1b' => 'U', ++ 'call' => 'L', ++ 'ldi' => 'E', ++ 'ldih' => 'E', ++ 'ldbu' => 'L', ++ 'ldw' => 'L', ++ 'ldl' => 'L', ++ 'fldd' => 'L', ++ 'ret' => 'L', ++ 'mov' => 'E', ++ 'mulw' => 'U', ++ 'mull' => 'U', ++ 'negl' => 'E', ++ 'nop' => 'E', ++ 'not' => 'E', ++ 's8addl' => 'E', ++ 's8subl' => 'E', ++ # 'sextb' => ? ++ # 'sextl' => ? ++ 'sll' => 'U', ++ 'srl' => 'U', ++ 'stl' => 'L', ++ 'subl' => 'E', ++ 'umulh' => 'U', ++ 'unop' => 'E', ++ 'xor' => 'E', ++ ); ++ ++# Slottings used for a given pattern of U/L/E in an octaword. This is as ++# per the "Ebox Slotting" section of the SW6 hardware reference manual. ++# ++my %slottable = ++ ( ++ 'EEEE' => 'ULUL', ++ 'EEEL' => 'ULUL', ++ 'EEEU' => 'ULLU', ++ 'EELE' => 'ULLU', ++ 'EELL' => 'UULL', ++ 'EELU' => 'ULLU', ++ 'EEUE' => 'ULUL', ++ 'EEUL' => 'ULUL', ++ 'EEUU' => 'LLUU', ++ 'ELEE' => 'ULUL', ++ 'ELEL' => 'ULUL', ++ 'ELEU' => 'ULLU', ++ 'ELLE' => 'ULLU', ++ 'ELLL' => 'ULLL', ++ 'ELLU' => 'ULLU', ++ 'ELUE' => 'ULUL', ++ 'ELUL' => 'ULUL', ++ ++ 'LLLL' => 'LLLL', ++ 'LLLU' => 'LLLU', ++ 'LLUE' => 'LLUU', ++ 'LLUL' => 'LLUL', ++ 'LLUU' => 'LLUU', ++ 'LUEE' => 'LULU', ++ 'LUEL' => 'LUUL', ++ 'LUEU' => 'LULU', ++ 'LULE' => 'LULU', ++ 'LULL' => 'LULL', ++ 'LULU' => 'LULU', ++ 'LUUE' => 'LUUL', ++ 'LUUL' => 'LUUL', ++ 'LUUU' => 'LUUU', ++ 'UEEE' => 'ULUL', ++ 'UEEL' => 'ULUL', ++ 'UEEU' => 'ULLU', ++ ++ 'ELUU' => 'LLUU', ++ 'EUEE' => 'LULU', ++ 'EUEL' => 'LUUL', ++ 'EUEU' => 'LULU', ++ 'EULE' => 'LULU', ++ 'EULL' => 'UULL', ++ 'EULU' => 'LULU', ++ 'EUUE' => 'LUUL', ++ 'EUUL' => 'LUUL', ++ 'EUUU' => 'LUUU', ++ 'LEEE' => 'LULU', ++ 'LEEL' => 'LUUL', ++ 'LEEU' => 'LULU', ++ 'LELE' => 'LULU', ++ 'LELL' => 'LULL', ++ 'LELU' => 'LULU', ++ 'LEUE' => 'LUUL', ++ 'LEUL' => 'LUUL', ++ 'LEUU' => 'LLUU', ++ 'LLEE' => 'LLUU', ++ 'LLEL' => 'LLUL', ++ 'LLEU' => 'LLUU', ++ 'LLLE' => 'LLLU', ++ ++ 'UELE' => 'ULLU', ++ 'UELL' => 'UULL', ++ 'UELU' => 'ULLU', ++ 'UEUE' => 'ULUL', ++ 'UEUL' => 'ULUL', ++ 'UEUU' => 'ULUU', ++ 'ULEE' => 'ULUL', ++ 'ULEL' => 'ULUL', ++ 'ULEU' => 'ULLU', ++ 'ULLE' => 'ULLU', ++ 'ULLL' => 'ULLL', ++ 'ULLU' => 'ULLU', ++ 'ULUE' => 'ULUL', ++ 'ULUL' => 'ULUL', ++ 'ULUU' => 'ULUU', ++ 'UUEE' => 'UULL', ++ 'UUEL' => 'UULL', ++ 'UUEU' => 'UULU', ++ 'UULE' => 'UULL', ++ 'UULL' => 'UULL', ++ 'UULU' => 'UULU', ++ 'UUUE' => 'UUUL', ++ 'UUUL' => 'UUUL', ++ 'UUUU' => 'UUUU', ++ ); ++ ++# Check all combinations of U/L/E are present in %slottable. ++sub coverage { ++ foreach my $a ('U', 'L', 'E') { ++ foreach my $b ('U', 'L', 'E') { ++ foreach my $c ('U', 'L', 'E') { ++ foreach my $d ('U', 'L', 'E') { ++ my $x = $a . $b . $c . $d; ++ if (! defined $slottable{$x}) { ++ print "slottable missing: $x\n" ++ } ++ } ++ } ++ } ++ } ++} ++ ++# Certain consistency checks for %slottable. 
++sub check {
++ foreach my $x (keys %slottable) {
++ my $a = substr($x,0,1);
++ my $b = substr($x,1,1);
++ my $c = substr($x,2,1);
++ my $d = substr($x,3,1);
++ my $es = ($a eq 'E') + ($b eq 'E') + ($c eq 'E') + ($d eq 'E');
++ my $ls = ($a eq 'L') + ($b eq 'L') + ($c eq 'L') + ($d eq 'L');
++ my $us = ($a eq 'U') + ($b eq 'U') + ($c eq 'U') + ($d eq 'U');
++
++ my $got = $slottable{$x};
++ my $want = $x;
++
++ if ($es == 0) {
++
++ } elsif ($es == 1) {
++ # when only one E, it's mapped to whichever of U or L is otherwise
++ # used the least
++ if ($ls > $us) {
++ $want =~ s/E/U/;
++ } else {
++ $want =~ s/E/L/;
++ }
++ } elsif ($es == 2) {
++ # when two E's and two U, then the E's map to L; vice versa for two E
++ # and two L
++ if ($ls == 2) {
++ $want =~ s/E/U/g;
++ } elsif ($us == 2) {
++ $want =~ s/E/L/g;
++ } else {
++ next;
++ }
++ } elsif ($es == 3) {
++ next;
++
++ } else { # $es == 4
++ next;
++ }
++
++ if ($want ne $got) {
++ print "slottable $x want $want got $got\n";
++ }
++ }
++}
++
++sub disassemble {
++ my ($file) = @_;
++
++ open (IN, "objdump -Srfh $file |") || die "Cannot open pipe from objdump\n";
++
++ my (%pre, %post, %type);
++ while (<IN>) {
++ my $line = $_ . "";
++
++ if ($line =~ /(^[ \t]*[0-9a-f]*([0-9a-f]):[ \t]*[0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] )\t(([a-z0-9]+).*)/) {
++ my ($this_pre, $addr, $this_post, $opcode) = ($1, $2, $3, $4);
++
++ my $this_type = $optable{$opcode};
++ if (! defined ($this_type)) { $this_type = ' '; }
++
++ $pre{$addr} = $this_pre;
++ $post{$addr} = $this_post;
++ $type{$addr} = $this_type;
++
++ if ($addr eq 'c') {
++ my %slot = ('0'=>' ', '4'=>' ', '8'=>' ', 'c'=>' ');
++
++ my $str = $type{'c'} . $type{'8'} . $type{'4'} . $type{'0'};
++ $str = $slottable{$str};
++ if (defined $str) {
++ $slot{'c'} = substr($str,0,1);
++ $slot{'8'} = substr($str,1,1);
++ $slot{'4'} = substr($str,2,1);
++ $slot{'0'} = substr($str,3,1);
++ }
++
++ foreach my $i ('0', '4', '8', 'c') {
++ if ($slot{$i} eq $type{$i}) { $type{$i} = ' '; }
++ print $pre{$i}, ' ', lc($type{$i}),$slot{$i}, ' ', $post{$i}, "\n";
++ }
++
++ %pre = ();
++ %type = ();
++ %post = ();
++ }
++ }
++ }
++
++ close IN || die "Error from objdump (or objdump not available)\n";
++}
++
++coverage();
++check();
++
++my @files;
++if ($#ARGV >= 0) {
++ @files = @ARGV;
++} else {
++ die "Usage: $0 [filename.o]...\n";
++}
++
++foreach (@files) {
++ disassemble($_);
++}
+diff --git a/mpn/sw_64/sw6/sub_n.asm b/mpn/sw_64/sw6/sub_n.asm
+new file mode 100644
+index 0000000..64b7711
+--- /dev/null
++++ b/mpn/sw_64/sw6/sub_n.asm
+@@ -0,0 +1,281 @@
++dnl Sw_64 sw6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
++dnl and store difference in a third limb vector.
++
++dnl Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl * the GNU Lesser General Public License as published by the Free
++dnl Software Foundation; either version 3 of the License, or (at your
++dnl option) any later version.
++dnl
++dnl or
++dnl
++dnl * the GNU General Public License as published by the Free Software
++dnl Foundation; either version 2 of the License, or (at your option) any
++dnl later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++C cycles/limb
++C SW6: 2.125
++
++C INPUT PARAMETERS
++C rp r16
++C up r17
++C vp r18
++C n r19
++C cy r20 (for mpn_sub_nc)
++
++C TODO
++C Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
++C Use multi-pronged feed-in.
++C Perform additional micro-tuning
++
++C This code was written in cooperation with sw6 pipeline expert Steve Root.
++
++C Pair loads and stores where possible
++C Store pairs oct-aligned where possible (didn't need it here)
++C Stores are delayed every third cycle
++C Loads and stores are delayed by fills
++C U stays still, put code there where possible (note alternation of U1 and U0)
++C L moves because of loads and stores
++C Note dampers in L to limit damage
++
++C This odd-looking optimization expects that we're working with random bits in
++C our data, so that a pure zero result is unlikely. So we penalize the unlikely
++C case to help the common case.
++
++define(`u0', `r0') define(`u1', `r3')
++define(`v0', `r1') define(`v1', `r4')
++
++define(`cy0', `r20') define(`cy1', `r21')
++
++MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
++
++ASM_START()
++PROLOGUE(mpn_sub_nc)
++ br r31, $entry
++EPILOGUE()
++PROLOGUE(mpn_sub_n)
++ bis r31, r31, cy0 C clear carry in
++$entry: cmpult r19, 5, r22 C L1 move counter
++ ldl u1, 0(r17) C L0 get next ones
++ ldl v1, 0(r18) C L1
++ bne r22, $Lsmall
++
++ ldl u0, 8(r17) C L0 get next ones
++ ldl v0, 8(r18) C L1
++ subl u1, v1, r5 C U0 sub two data
++
++ cmpult u1, v1, r23 C U0 did it borrow
++ ldl u1, 16(r17) C L0 get next ones
++ ldl v1, 16(r18) C L1
++
++ subl u0, v0, r8 C U1 sub two data
++ subl r5, cy0, r24 C U0 borrow in
++
++ cmpult u0, v0, r22 C U1 did it borrow
++ beq r5, $fix5f C U0 fix exact zero
++$ret5f: ldl u0, 24(r17) C L0 get next ones
++ ldl v0, 24(r18) C L1
++
++ subl r8, r23, r25 C U1 borrow from last
++ subl u1, v1, r7 C U0 sub two data
++
++ beq r8, $fix6f C U1 fix exact zero
++$ret6f: cmpult u1, v1, r23 C U0 did it borrow
++ ldl u1, 32(r17) C L0 get next ones
++ ldl v1, 32(r18) C L1
++
++ ldi r17, 40(r17) C L0 move pointer
++ ldi r18, 40(r18) C L1 move pointer
++
++ ldi r16, -8(r16)
++ ldi r19, -13(r19) C L1 move counter
++ blt r19, $Lend C U1 loop control
++
++
++C Main loop. 8-way unrolled.
++ ALIGN(16)
++$Loop: subl u0, v0, r2 C U1 sub two data
++ stl r24, 8(r16) C L0 put an answer
++ subl r7, r22, r24 C U0 borrow from last
++ stl r25, 16(r16) C L1 pair
++
++ cmpult u0, v0, cy1 C U1 did it borrow
++ beq r7, $fix7 C U0 fix exact 0
++$ret7: ldl u0, 0(r17) C L0 get next ones
++ ldl v0, 0(r18) C L1
++
++ bis r31, r31, r31 C L damp out
++ subl r2, r23, r25 C U1 borrow from last
++ bis r31, r31, r31 C L moves in L !
++ subl u1, v1, r5 C U0 sub two data ++ ++ beq r2, $fix0 C U1 fix exact zero ++$ret0: cmpult u1, v1, cy0 C U0 did it borrow ++ ldl u1, 8(r17) C L0 get next ones ++ ldl v1, 8(r18) C L1 ++ ++ subl u0, v0, r8 C U1 sub two data ++ stl r24, 24(r16) C L0 store pair ++ subl r5, cy1, r24 C U0 borrow from last ++ stl r25, 32(r16) C L1 ++ ++ cmpult u0, v0, r22 C U1 did it borrow ++ beq r5, $fix1 C U0 fix exact zero ++$ret1: ldl u0, 16(r17) C L0 get next ones ++ ldl v0, 16(r18) C L1 ++ ++ ldi r16, 64(r16) C L0 move pointer ++ subl r8, cy0, r25 C U1 borrow from last ++ ldi r19, -8(r19) C L1 move counter ++ subl u1, v1, r7 C U0 sub two data ++ ++ beq r8, $fix2 C U1 fix exact zero ++$ret2: cmpult u1, v1, r23 C U0 did it borrow ++ ldl u1, 24(r17) C L0 get next ones ++ ldl v1, 24(r18) C L1 ++ ++ subl u0, v0, r2 C U1 sub two data ++ stl r24, -24(r16) C L0 put an answer ++ subl r7, r22, r24 C U0 borrow from last ++ stl r25, -16(r16) C L1 pair ++ ++ cmpult u0, v0, cy1 C U1 did it borrow ++ beq r7, $fix3 C U0 fix exact 0 ++$ret3: ldl u0, 32(r17) C L0 get next ones ++ ldl v0, 32(r18) C L1 ++ ++ bis r31, r31, r31 C L damp out ++ subl r2, r23, r25 C U1 borrow from last ++ bis r31, r31, r31 C L moves in L ! ++ subl u1, v1, r5 C U0 sub two data ++ ++ beq r2, $fix4 C U1 fix exact zero ++$ret4: cmpult u1, v1, cy0 C U0 did it borrow ++ ldl u1, 40(r17) C L0 get next ones ++ ldl v1, 40(r18) C L1 ++ ++ subl u0, v0, r8 C U1 sub two data ++ stl r24, -8(r16) C L0 store pair ++ subl r5, cy1, r24 C U0 borrow from last ++ stl r25, 0(r16) C L1 ++ ++ cmpult u0, v0, r22 C U1 did it borrow ++ beq r5, $fix5 C U0 fix exact zero ++$ret5: ldl u0, 48(r17) C L0 get next ones ++ ldl v0, 48(r18) C L1 ++ ++ ldl r31, 256(r17) C L0 prefetch ++ subl r8, cy0, r25 C U1 borrow from last ++ ldl r31, 256(r18) C L1 prefetch ++ subl u1, v1, r7 C U0 sub two data ++ ++ beq r8, $fix6 C U1 fix exact zero ++$ret6: cmpult u1, v1, r23 C U0 did it borrow ++ ldl u1, 56(r17) C L0 get next ones ++ ldl v1, 56(r18) C L1 ++ ++ ldi r17, 64(r17) C L0 move pointer ++ bis r31, r31, r31 C U ++ ldi r18, 64(r18) C L1 move pointer ++ bge r19, $Loop C U1 loop control ++C ==== main loop end ++ ++$Lend: subl u0, v0, r2 C U1 sub two data ++ stl r24, 8(r16) C L0 put an answer ++ subl r7, r22, r24 C U0 borrow from last ++ stl r25, 16(r16) C L1 pair ++ cmpult u0, v0, cy1 C U1 did it borrow ++ beq r7, $fix7c C U0 fix exact 0 ++$ret7c: subl r2, r23, r25 C U1 borrow from last ++ subl u1, v1, r5 C U0 sub two data ++ beq r2, $fix0c C U1 fix exact zero ++$ret0c: cmpult u1, v1, cy0 C U0 did it borrow ++ stl r24, 24(r16) C L0 store pair ++ subl r5, cy1, r24 C U0 borrow from last ++ stl r25, 32(r16) C L1 ++ beq r5, $fix1c C U0 fix exact zero ++$ret1c: stl r24, 40(r16) C L0 put an answer ++ ldi r16, 48(r16) C L0 move pointer ++ ++ ldi r19, 8(r19) ++ beq r19, $Lret ++ ++ ldl u1, 0(r17) ++ ldl v1, 0(r18) ++$Lsmall: ++ ldi r19, -1(r19) ++ beq r19, $Lend0 ++ ++ ALIGN(8) ++$Loop0: subl u1, v1, r2 C main sub ++ cmpult u1, v1, r8 C compute bw from last sub ++ ldl u1, 8(r17) ++ ldl v1, 8(r18) ++ subl r2, cy0, r5 C borrow sub ++ ldi r17, 8(r17) ++ ldi r18, 8(r18) ++ stl r5, 0(r16) ++ cmpult r2, cy0, cy0 C compute bw from last sub ++ ldi r19, -1(r19) C decr loop cnt ++ bis r8, cy0, cy0 C combine bw from the two subs ++ ldi r16, 8(r16) ++ bne r19, $Loop0 ++$Lend0: subl u1, v1, r2 C main sub ++ subl r2, cy0, r5 C borrow sub ++ cmpult u1, v1, r8 C compute bw from last sub ++ cmpult r2, cy0, cy0 C compute bw from last sub ++ stl r5, 0(r16) ++ bis r8, cy0, r0 C combine bw from the two subs ++ ret 
r31,(r26),1 ++ ++ ALIGN(8) ++$Lret: ldi r0, 0(cy0) C copy borrow into return register ++ ret r31,(r26),1 ++ ++$fix5f: bis r23, cy0, r23 C bring forward borrow ++ br r31, $ret5f ++$fix6f: bis r22, r23, r22 C bring forward borrow ++ br r31, $ret6f ++$fix0: bis cy1, r23, cy1 C bring forward borrow ++ br r31, $ret0 ++$fix1: bis cy0, cy1, cy0 C bring forward borrow ++ br r31, $ret1 ++$fix2: bis r22, cy0, r22 C bring forward borrow ++ br r31, $ret2 ++$fix3: bis r23, r22, r23 C bring forward borrow ++ br r31, $ret3 ++$fix4: bis cy1, r23, cy1 C bring forward borrow ++ br r31, $ret4 ++$fix5: bis cy1, cy0, cy0 C bring forward borrow ++ br r31, $ret5 ++$fix6: bis r22, cy0, r22 C bring forward borrow ++ br r31, $ret6 ++$fix7: bis r23, r22, r23 C bring forward borrow ++ br r31, $ret7 ++$fix0c: bis cy1, r23, cy1 C bring forward borrow ++ br r31, $ret0c ++$fix1c: bis cy0, cy1, cy0 C bring forward borrow ++ br r31, $ret1c ++$fix7c: bis r23, r22, r23 C bring forward borrow ++ br r31, $ret7c ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6a/gcd_1.asm b/mpn/sw_64/sw6a/gcd_1.asm +new file mode 100644 +index 0000000..ce75dc5 +--- /dev/null ++++ b/mpn/sw_64/sw6a/gcd_1.asm +@@ -0,0 +1,145 @@ ++dnl Sw_64 sw6a mpn_gcd_1 -- Nx1 greatest common divisor. ++ ++dnl Copyright 2003, 2004 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C sw6a: 3.4 cycles/bitpair for 1x1 part ++ ++ ++C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y); ++C ++C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and ++C strip trailing zeros from abs(x-y) to maintain x and y both odd. ++C ++C The trailing zeros are calculated from just x-y, since in twos-complement ++C there's the same number of trailing zeros on d or -d. This means the cttz ++C runs in parallel with abs(x-y). ++C ++C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit ++C operands with this algorithm gives the measured 3.4 c/l. ++C ++C The slottings shown are for SVR4 style systems, Unicos differs in the ++C initial gp setup and the LEA. ++C ++C Enhancement: ++C ++C On the call, !lituse_call! (when available) would allow the linker to relax ++C it to a bsr, but probably only in a static binary. Plain "call foo" gives ++C the right object code for relaxation, and ought to be available ++C everywhere, but we prefer to schedule the GOT ldl (LEA) back earlier, for ++C the usual case of running in a shared library. 
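The 1x1 reduction that the comments above describe is easier to see in C. Below is a sketch under the assumption that x and y have already been made odd (the common power of two is restored by the caller, as in the asm's final sll); the name is hypothetical and __builtin_ctzll stands in for cttz:

    #include <gmp.h>

    mp_limb_t
    gcd_1x1_model (mp_limb_t x, mp_limb_t y)   /* x, y odd and nonzero */
    {
      while (x != y)
        {
          mp_limb_t d = x - y;             /* wraps to -(y-x) when x < y    */
          int twos = __builtin_ctzll (d);  /* d and -d: same trailing zeros */
          if (x < y)
            {
              d = y - x;                   /* d = abs(x - y) */
              y = x;                       /* y = min(x, y)  */
            }
          x = d >> twos;                   /* strip twos; x stays odd */
        }
      return y;
    }

Since the trailing-zero count only needs x - y, the cttz can issue alongside the subtract/compare pair, which is what holds the loop to its 5 cycles.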
++C
++C bsr could perhaps be used explicitly anyway. We should be able to assume
++C modexact is in the same module as us (ie. shared library or mainline).
++C Would there be any worries about the size of the displacement? Could
++C always put modexact and gcd_1 in the same .o to be certain.
++
++ASM_START()
++PROLOGUE(mpn_gcd_1, gp)
++
++ C r16 xp
++ C r17 size
++ C r18 y
++
++ C ldih C l
++ C ldi C u
++
++ ldl r0, 0(r16) C L x = xp[0]
++ ldi r30, -32(r30) C u alloc stack
++
++ LEA( r27, mpn_modexact_1c_odd) C L modexact addr, ldl (gp)
++ stl r10, 16(r30) C L save r10
++ cttz r18, r10 C U0 y twos
++ cmpeq r17, 1, r5 C u test size==1
++
++ stl r9, 8(r30) C L save r9
++ clr r19 C u zero c for modexact
++ unop
++ unop
++
++ cttz r0, r6 C U0 x twos
++ stl r26, 0(r30) C L save ra
++
++ srl r18, r10, r18 C U y odd
++
++ mov r18, r9 C l hold y across call
++
++ cmpult r6, r10, r2 C u test x_twos < y_twos
++
++ selne r2, r6, r10 C l common_twos = min(x_twos,y_twos)
++ bne r5, L(one) C U no modexact if size==1
++ call r26, (r27), mpn_modexact_1c_odd C L0
++
++ LDGP( r29, 0(r26)) C u,l ldih,ldi
++ cttz r0, r6 C U0 new x twos
++ ldl r26, 0(r30) C L restore ra
++
++L(one):
++ mov r9, r1 C u y
++ ldl r9, 8(r30) C L restore r9
++ mov r10, r2 C u common twos
++ ldl r10, 16(r30) C L restore r10
++
++ ldi r30, 32(r30) C l free stack
++ beq r0, L(done) C U return y if x%y==0
++
++ srl r0, r6, r0 C U x odd
++ unop
++
++ ALIGN(16)
++L(top):
++ C r0 x
++ C r1 y
++ C r2 common twos, for use at end
++
++ subl r0, r1, r7 C l0 d = x - y
++ cmpult r0, r1, r16 C u0 test x >= y
++
++ subl r1, r0, r4 C l0 new_x = y - x
++ cttz r7, r8 C U0 d twos
++
++ seleq r16, r7, r4 C l0 new_x = d if x>=y
++ selne r16, r0, r1 C u0 y = x if x<y
++
++ srl r4, r8, r0 C U0 x = new_x >> twos
++ bne r7, L(top) C U1 stop when d==0
++
++
++L(done):
++ sll r1, r2, r0 C U0 return y << common_twos
++ ret r31, (r26), 1 C L0
++
++EPILOGUE()
++ASM_END()
+diff --git a/mpn/sw_64/sw6a/hamdist.asm b/mpn/sw_64/sw6a/hamdist.asm
+new file mode 100644
+index 0000000..d005eb0
+--- /dev/null
++++ b/mpn/sw_64/sw6a/hamdist.asm
+@@ -0,0 +1,111 @@
++dnl Sw_64 sw6a mpn_hamdist -- mpn hamming distance.
++
++dnl Copyright 2003, 2005 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl * the GNU Lesser General Public License as published by the Free
++dnl Software Foundation; either version 3 of the License, or (at your
++dnl option) any later version.
++dnl
++dnl or
++dnl
++dnl * the GNU General Public License as published by the Free Software
++dnl Foundation; either version 2 of the License, or (at your option) any
++dnl later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++
++C sw6a: 2.5 cycles/limb
++
++
++C unsigned long mpn_hamdist (mp_srcptr xp, mp_srcptr yp, mp_size_t size);
++C
++C The hope was for 2.0 c/l here, but that isn't achieved. We're limited by
++C renaming register shortage.
Since we need 5 instructions per limb, further ++C unrolling could approach 1.5 c/l. ++C ++C The main loop processes two limbs from each operand on each iteration. An ++C odd size is handled by processing xp[0]^yp[0] at the start. If the size ++C is even that result is discarded, and is repeated by the main loop. ++C ++ ++ASM_START() ++PROLOGUE(mpn_hamdist) ++ ++ C r16 xp ++ C r17 yp ++ C r18 size ++ ++ ldl r1, 0(r16) C L0 xp[0] ++ ldl r2, 0(r17) C L1 yp[0] ++ and r18, 1, r8 C U1 1 if size odd ++ srl r18, 1, r18 C U0 size, limb pairs ++ ++ clr r0 C L0 initial total ++ s8addl r8, r17, r17 C U1 yp++ if size odd ++ s8addl r8, r16, r16 C L1 xp++ if size odd ++ clr r6 C U0 dummy initial xor 1 ++ ++ xor r1, r2, r5 C L initial xor 0 ++ beq r18, L(one) C U if size==1 ++ ++ seleq r8, r31, r5 C L discard first limb if size even ++ unop C U ++ ++ ++ ALIGN(16) ++L(top): ++ C r0 total accumulating ++ C r7 xor 0 ++ C r8 xor 1 ++ C r16 xp, incrementing ++ C r17 yp, incrementing ++ C r18 size, limb pairs, decrementing ++ ++ ldl r1, 0(r16) C L ++ ldl r2, 0(r17) C L ++ ctpop r5, r7 C U0 ++ ldi r16, 16(r16) C U ++ ++ ldl r3, -8(r16) C L ++ ldl r4, 8(r17) C L ++ ctpop r6, r8 C U0 ++ ldi r17, 16(r17) C U ++ ++ ldl r31, 256(r16) C L prefetch ++ ldl r31, 256(r17) C L prefetch ++ xor r1, r2, r5 C U ++ ldi r18, -1(r18) C U ++ ++ xor r3, r4, r6 C U ++ addl r0, r7, r0 C L ++ addl r0, r8, r0 C L ++ bne r18, L(top) C U ++ ++ ++ ctpop r6, r8 C U0 ++ addl r0, r8, r0 C L ++L(one): ++ ctpop r5, r7 C U0 ++ addl r0, r7, r0 C L ++ ++ ret r31, (r26), 1 C L0 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6a/popcount.asm b/mpn/sw_64/sw6a/popcount.asm +new file mode 100644 +index 0000000..388d062 +--- /dev/null ++++ b/mpn/sw_64/sw6a/popcount.asm +@@ -0,0 +1,101 @@ ++dnl Sw_64 sw6a mpn_popcount -- mpn bit population count. ++ ++dnl Copyright 2003, 2005 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C sw6a: 1.5 cycles/limb ++ ++ ++C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); ++C ++C This schedule seems necessary for the full 1.5 c/l, the IQ can't quite hide ++C all latencies, the addl's must be deferred to the next iteration. ++C ++C Since we need just 3 instructions per limb, further unrolling could approach ++C 1.0 c/l. ++C ++C The main loop processes two limbs at a time. An odd size is handled by ++C processing src[0] at the start. If the size is even that result is ++C discarded, and src[0] is repeated by the main loop. 
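A short C model of that structure, with __builtin_popcountll standing in for ctpop (hypothetical name; 64-bit limbs assumed):

    #include <gmp.h>

    unsigned long
    popcount_model (const mp_limb_t *src, mp_size_t size)
    {
      unsigned long total = 0;
      if (size & 1)                        /* odd size: fold in src[0] first */
        total = __builtin_popcountll (*src++);
      for (mp_size_t i = 0; i < size >> 1; i++)  /* two limbs per iteration */
        total += __builtin_popcountll (src[2 * i])
               + __builtin_popcountll (src[2 * i + 1]);
      return total;
    }

The assembly gets the same effect without a branch by always counting src[0] and discarding the result when the size is even.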
++C ++ ++ASM_START() ++PROLOGUE(mpn_popcount) ++ ++ C r16 src ++ C r17 size ++ ++ ldl r0, 0(r16) C L0 src[0] ++ and r17, 1, r8 C U1 1 if size odd ++ srl r17, 1, r17 C U0 size, limb pairs ++ ++ s8addl r8, r16, r16 C L1 src++ if size odd ++ ctpop r0, r0 C U0 ++ beq r17, L(one) C U1 if size==1 ++ ++ seleq r8, r31, r0 C L discard first limb if size even ++ clr r3 C L ++ ++ clr r4 C L ++ unop C U ++ unop C L ++ unop C U ++ ++ ++ ALIGN(16) ++L(top): ++ C r0 total accumulating ++ C r3 pop 0 ++ C r4 pop 1 ++ C r16 src, incrementing ++ C r17 size, decrementing ++ ++ ldl r1, 0(r16) C L ++ ldl r2, 8(r16) C L ++ ldi r16, 16(r16) C U ++ ldi r17, -1(r17) C U ++ ++ addl r0, r3, r0 C L ++ addl r0, r4, r0 C L ++ ctpop r1, r3 C U0 ++ ctpop r2, r4 C U0 ++ ++ ldl r31, 512(r16) C L prefetch ++ bne r17, L(top) C U ++ ++ ++ addl r0, r3, r0 C L ++ addl r0, r4, r0 C U ++L(one): ++ ret r31, (r26), 1 C L0 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6b/gcd_1.asm b/mpn/sw_64/sw6b/gcd_1.asm +new file mode 100644 +index 0000000..984ac92 +--- /dev/null ++++ b/mpn/sw_64/sw6b/gcd_1.asm +@@ -0,0 +1,145 @@ ++dnl Sw_64 sw6b mpn_gcd_1 -- Nx1 greatest common divisor. ++ ++dnl Copyright 2003, 2004 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C sw6b: 3.4 cycles/bitpair for 1x1 part ++ ++ ++C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y); ++C ++C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and ++C strip trailing zeros from abs(x-y) to maintain x and y both odd. ++C ++C The trailing zeros are calculated from just x-y, since in twos-complement ++C there's the same number of trailing zeros on d or -d. This means the cttz ++C runs in parallel with abs(x-y). ++C ++C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit ++C operands with this algorithm gives the measured 3.4 c/l. ++C ++C The slottings shown are for SVR4 style systems, Unicos differs in the ++C initial gp setup and the LEA. ++C ++C Enhancement: ++C ++C On the call, !lituse_call! (when available) would allow the linker to relax ++C it to a bsr, but probably only in a static binary. Plain "call foo" gives ++C the right object code for relaxation, and ought to be available ++C everywhere, but we prefer to schedule the GOT ldl (LEA) back earlier, for ++C the usual case of running in a shared library. ++C ++C bsr could perhaps be used explicitly anyway. We should be able to assume ++C modexact is in the same module as us (ie. 
shared library or mainline).
++C Would there be any worries about the size of the displacement? Could
++C always put modexact and gcd_1 in the same .o to be certain.
++
++ASM_START()
++PROLOGUE(mpn_gcd_1, gp)
++
++ C r16 xp
++ C r17 size
++ C r18 y
++
++ C ldih C l
++ C ldi C u
++
++ ldl r0, 0(r16) C L x = xp[0]
++ ldi r30, -32(r30) C u alloc stack
++
++ LEA( r27, mpn_modexact_1c_odd) C L modexact addr, ldl (gp)
++ stl r10, 16(r30) C L save r10
++ cttz r18, r10 C U0 y twos
++ cmpeq r17, 1, r5 C u test size==1
++
++ stl r9, 8(r30) C L save r9
++ clr r19 C u zero c for modexact
++ unop
++ unop
++
++ cttz r0, r6 C U0 x twos
++ stl r26, 0(r30) C L save ra
++
++ srl r18, r10, r18 C U y odd
++
++ mov r18, r9 C l hold y across call
++
++ cmpult r6, r10, r2 C u test x_twos < y_twos
++
++ selne r2, r6, r10 C l common_twos = min(x_twos,y_twos)
++ bne r5, L(one) C U no modexact if size==1
++ call r26, (r27), mpn_modexact_1c_odd C L0
++
++ LDGP( r29, 0(r26)) C u,l ldih,ldi
++ cttz r0, r6 C U0 new x twos
++ ldl r26, 0(r30) C L restore ra
++
++L(one):
++ mov r9, r1 C u y
++ ldl r9, 8(r30) C L restore r9
++ mov r10, r2 C u common twos
++ ldl r10, 16(r30) C L restore r10
++
++ ldi r30, 32(r30) C l free stack
++ beq r0, L(done) C U return y if x%y==0
++
++ srl r0, r6, r0 C U x odd
++ unop
++
++ ALIGN(16)
++L(top):
++ C r0 x
++ C r1 y
++ C r2 common twos, for use at end
++
++ subl r0, r1, r7 C l0 d = x - y
++ cmpult r0, r1, r16 C u0 test x >= y
++
++ subl r1, r0, r4 C l0 new_x = y - x
++ cttz r7, r8 C U0 d twos
++
++ seleq r16, r7, r4 C l0 new_x = d if x>=y
++ selne r16, r0, r1 C u0 y = x if x<y
++
++ srl r4, r8, r0 C U0 x = new_x >> twos
++ bne r7, L(top) C U1 stop when d==0
++
++
++L(done):
++ sll r1, r2, r0 C U0 return y << common_twos
++ ret r31, (r26), 1 C L0
++
++EPILOGUE()
++ASM_END()
+diff --git a/mpn/sw_64/sw6b/hamdist.asm b/mpn/sw_64/sw6b/hamdist.asm
+new file mode 100644
+index 0000000..f42ee54
+--- /dev/null
++++ b/mpn/sw_64/sw6b/hamdist.asm
+@@ -0,0 +1,111 @@
++dnl Sw_64 sw6b mpn_hamdist -- mpn hamming distance.
++
++dnl Copyright 2003, 2005 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl * the GNU Lesser General Public License as published by the Free
++dnl Software Foundation; either version 3 of the License, or (at your
++dnl option) any later version.
++dnl
++dnl or
++dnl
++dnl * the GNU General Public License as published by the Free Software
++dnl Foundation; either version 2 of the License, or (at your option) any
++dnl later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++
++C sw6b: 2.5 cycles/limb
++
++
++C unsigned long mpn_hamdist (mp_srcptr xp, mp_srcptr yp, mp_size_t size);
++C
++C The hope was for 2.0 c/l here, but that isn't achieved. We're limited by
++C renaming register shortage. Since we need 5 instructions per limb, further
++C unrolling could approach 1.5 c/l.
++C
++C The main loop processes two limbs from each operand on each iteration.
An ++C odd size is handled by processing xp[0]^yp[0] at the start. If the size ++C is even that result is discarded, and is repeated by the main loop. ++C ++ ++ASM_START() ++PROLOGUE(mpn_hamdist) ++ ++ C r16 xp ++ C r17 yp ++ C r18 size ++ ++ ldl r1, 0(r16) C L0 xp[0] ++ ldl r2, 0(r17) C L1 yp[0] ++ and r18, 1, r8 C U1 1 if size odd ++ srl r18, 1, r18 C U0 size, limb pairs ++ ++ clr r0 C L0 initial total ++ s8addl r8, r17, r17 C U1 yp++ if size odd ++ s8addl r8, r16, r16 C L1 xp++ if size odd ++ clr r6 C U0 dummy initial xor 1 ++ ++ xor r1, r2, r5 C L initial xor 0 ++ beq r18, L(one) C U if size==1 ++ ++ seleq r8, r31, r5 C L discard first limb if size even ++ unop C U ++ ++ ++ ALIGN(16) ++L(top): ++ C r0 total accumulating ++ C r7 xor 0 ++ C r8 xor 1 ++ C r16 xp, incrementing ++ C r17 yp, incrementing ++ C r18 size, limb pairs, decrementing ++ ++ ldl r1, 0(r16) C L ++ ldl r2, 0(r17) C L ++ ctpop r5, r7 C U0 ++ ldi r16, 16(r16) C U ++ ++ ldl r3, -8(r16) C L ++ ldl r4, 8(r17) C L ++ ctpop r6, r8 C U0 ++ ldi r17, 16(r17) C U ++ ++ ldl r31, 256(r16) C L prefetch ++ ldl r31, 256(r17) C L prefetch ++ xor r1, r2, r5 C U ++ ldi r18, -1(r18) C U ++ ++ xor r3, r4, r6 C U ++ addl r0, r7, r0 C L ++ addl r0, r8, r0 C L ++ bne r18, L(top) C U ++ ++ ++ ctpop r6, r8 C U0 ++ addl r0, r8, r0 C L ++L(one): ++ ctpop r5, r7 C U0 ++ addl r0, r7, r0 C L ++ ++ ret r31, (r26), 1 C L0 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw6b/popcount.asm b/mpn/sw_64/sw6b/popcount.asm +new file mode 100644 +index 0000000..b766557 +--- /dev/null ++++ b/mpn/sw_64/sw6b/popcount.asm +@@ -0,0 +1,101 @@ ++dnl Sw_64 sw6b mpn_popcount -- mpn bit population count. ++ ++dnl Copyright 2003, 2005 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C sw6b: 1.5 cycles/limb ++ ++ ++C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); ++C ++C This schedule seems necessary for the full 1.5 c/l, the IQ can't quite hide ++C all latencies, the addl's must be deferred to the next iteration. ++C ++C Since we need just 3 instructions per limb, further unrolling could approach ++C 1.0 c/l. ++C ++C The main loop processes two limbs at a time. An odd size is handled by ++C processing src[0] at the start. If the size is even that result is ++C discarded, and src[0] is repeated by the main loop. 
++C ++ ++ASM_START() ++PROLOGUE(mpn_popcount) ++ ++ C r16 src ++ C r17 size ++ ++ ldl r0, 0(r16) C L0 src[0] ++ and r17, 1, r8 C U1 1 if size odd ++ srl r17, 1, r17 C U0 size, limb pairs ++ ++ s8addl r8, r16, r16 C L1 src++ if size odd ++ ctpop r0, r0 C U0 ++ beq r17, L(one) C U1 if size==1 ++ ++ seleq r8, r31, r0 C L discard first limb if size even ++ clr r3 C L ++ ++ clr r4 C L ++ unop C U ++ unop C L ++ unop C U ++ ++ ++ ALIGN(16) ++L(top): ++ C r0 total accumulating ++ C r3 pop 0 ++ C r4 pop 1 ++ C r16 src, incrementing ++ C r17 size, decrementing ++ ++ ldl r1, 0(r16) C L ++ ldl r2, 8(r16) C L ++ ldi r16, 16(r16) C U ++ ldi r17, -1(r17) C U ++ ++ addl r0, r3, r0 C L ++ addl r0, r4, r0 C L ++ ctpop r1, r3 C U0 ++ ctpop r2, r4 C U0 ++ ++ ldl r31, 512(r16) C L prefetch ++ bne r17, L(top) C U ++ ++ ++ addl r0, r3, r0 C L ++ addl r0, r4, r0 C U ++L(one): ++ ret r31, (r26), 1 C L0 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw8a/gcd_1.asm b/mpn/sw_64/sw8a/gcd_1.asm +new file mode 100644 +index 0000000..73fc103 +--- /dev/null ++++ b/mpn/sw_64/sw8a/gcd_1.asm +@@ -0,0 +1,145 @@ ++dnl Sw_64 sw8a mpn_gcd_1 -- Nx1 greatest common divisor. ++ ++dnl Copyright 2003, 2004 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C sw8a: 3.4 cycles/bitpair for 1x1 part ++ ++ ++C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y); ++C ++C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and ++C strip trailing zeros from abs(x-y) to maintain x and y both odd. ++C ++C The trailing zeros are calculated from just x-y, since in twos-complement ++C there's the same number of trailing zeros on d or -d. This means the cttz ++C runs in parallel with abs(x-y). ++C ++C The loop takes 5 cycles, and at 0.68 iterations per bit for two N-bit ++C operands with this algorithm gives the measured 3.4 c/l. ++C ++C The slottings shown are for SVR4 style systems, Unicos differs in the ++C initial gp setup and the LEA. ++C ++C Enhancement: ++C ++C On the call, !lituse_call! (when available) would allow the linker to relax ++C it to a bsr, but probably only in a static binary. Plain "call foo" gives ++C the right object code for relaxation, and ought to be available ++C everywhere, but we prefer to schedule the GOT ldl (LEA) back earlier, for ++C the usual case of running in a shared library. ++C ++C bsr could perhaps be used explicitly anyway. We should be able to assume ++C modexact is in the same module as us (ie. 
shared library or mainline).
++C Would there be any worries about the size of the displacement? Could
++C always put modexact and gcd_1 in the same .o to be certain.
++
++ASM_START()
++PROLOGUE(mpn_gcd_1, gp)
++
++ C r16 xp
++ C r17 size
++ C r18 y
++
++ C ldih C l
++ C ldi C u
++
++ ldl r0, 0(r16) C L x = xp[0]
++ ldi r30, -32(r30) C u alloc stack
++
++ LEA( r27, mpn_modexact_1c_odd) C L modexact addr, ldl (gp)
++ stl r10, 16(r30) C L save r10
++ cttz r18, r10 C U0 y twos
++ cmpeq r17, 1, r5 C u test size==1
++
++ stl r9, 8(r30) C L save r9
++ clr r19 C u zero c for modexact
++ unop
++ unop
++
++ cttz r0, r6 C U0 x twos
++ stl r26, 0(r30) C L save ra
++
++ srl r18, r10, r18 C U y odd
++
++ mov r18, r9 C l hold y across call
++
++ cmpult r6, r10, r2 C u test x_twos < y_twos
++
++ selne r2, r6, r10 C l common_twos = min(x_twos,y_twos)
++ bne r5, L(one) C U no modexact if size==1
++ call r26, (r27), mpn_modexact_1c_odd C L0
++
++ LDGP( r29, 0(r26)) C u,l ldih,ldi
++ cttz r0, r6 C U0 new x twos
++ ldl r26, 0(r30) C L restore ra
++
++L(one):
++ mov r9, r1 C u y
++ ldl r9, 8(r30) C L restore r9
++ mov r10, r2 C u common twos
++ ldl r10, 16(r30) C L restore r10
++
++ ldi r30, 32(r30) C l free stack
++ beq r0, L(done) C U return y if x%y==0
++
++ srl r0, r6, r0 C U x odd
++ unop
++
++ ALIGN(16)
++L(top):
++ C r0 x
++ C r1 y
++ C r2 common twos, for use at end
++
++ subl r0, r1, r7 C l0 d = x - y
++ cmpult r0, r1, r16 C u0 test x >= y
++
++ subl r1, r0, r4 C l0 new_x = y - x
++ cttz r7, r8 C U0 d twos
++
++ seleq r16, r7, r4 C l0 new_x = d if x>=y
++ selne r16, r0, r1 C u0 y = x if x<y
++
++ srl r4, r8, r0 C u0 x = new_x >> twos
++ bne r7, L(top) C U1 stop when d==0
++
++
++L(done):
++ sll r1, r2, r0 C U0 return y << common_twos
++ ret r31, (r26), 1 C L0
++
++EPILOGUE()
++ASM_END()
+diff --git a/mpn/sw_64/sw8a/hamdist.asm b/mpn/sw_64/sw8a/hamdist.asm
+new file mode 100644
+index 0000000..2d3957f
+--- /dev/null
++++ b/mpn/sw_64/sw8a/hamdist.asm
+@@ -0,0 +1,111 @@
++dnl Sw_64 sw8a mpn_hamdist -- mpn hamming distance.
++
++dnl Copyright 2003, 2005 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl * the GNU Lesser General Public License as published by the Free
++dnl Software Foundation; either version 3 of the License, or (at your
++dnl option) any later version.
++dnl
++dnl or
++dnl
++dnl * the GNU General Public License as published by the Free Software
++dnl Foundation; either version 2 of the License, or (at your option) any
++dnl later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
++
++include(`../config.m4')
++
++
++C sw8a: 2.5 cycles/limb
++
++
++C unsigned long mpn_hamdist (mp_srcptr xp, mp_srcptr yp, mp_size_t size);
++C
++C The hope was for 2.0 c/l here, but that isn't achieved. We're limited by
++C renaming register shortage. Since we need 5 instructions per limb, further
++C unrolling could approach 1.5 c/l.
++C
++C The main loop processes two limbs from each operand on each iteration.
An ++C odd size is handled by processing xp[0]^yp[0] at the start. If the size ++C is even that result is discarded, and is repeated by the main loop. ++C ++ ++ASM_START() ++PROLOGUE(mpn_hamdist) ++ ++ C r16 xp ++ C r17 yp ++ C r18 size ++ ++ ldl r1, 0(r16) C L0 xp[0] ++ ldl r2, 0(r17) C L1 yp[0] ++ and r18, 1, r8 C U1 1 if size odd ++ srl r18, 1, r18 C U0 size, limb pairs ++ ++ clr r0 C L0 initial total ++ s8addl r8, r17, r17 C U1 yp++ if size odd ++ s8addl r8, r16, r16 C L1 xp++ if size odd ++ clr r6 C U0 dummy initial xor 1 ++ ++ xor r1, r2, r5 C L initial xor 0 ++ beq r18, L(one) C U if size==1 ++ ++ seleq r8, r31, r5 C L discard first limb if size even ++ unop C U ++ ++ ++ ALIGN(16) ++L(top): ++ C r0 total accumulating ++ C r7 xor 0 ++ C r8 xor 1 ++ C r16 xp, incrementing ++ C r17 yp, incrementing ++ C r18 size, limb pairs, decrementing ++ ++ ldl r1, 0(r16) C L ++ ldl r2, 0(r17) C L ++ ctpop r5, r7 C U0 ++ ldi r16, 16(r16) C U ++ ++ ldl r3, -8(r16) C L ++ ldl r4, 8(r17) C L ++ ctpop r6, r8 C U0 ++ ldi r17, 16(r17) C U ++ ++ ldl r31, 256(r16) C L prefetch ++ ldl r31, 256(r17) C L prefetch ++ xor r1, r2, r5 C U ++ ldi r18, -1(r18) C U ++ ++ xor r3, r4, r6 C U ++ addl r0, r7, r0 C L ++ addl r0, r8, r0 C L ++ bne r18, L(top) C U ++ ++ ++ ctpop r6, r8 C U0 ++ addl r0, r8, r0 C L ++L(one): ++ ctpop r5, r7 C U0 ++ addl r0, r7, r0 C L ++ ++ ret r31, (r26), 1 C L0 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw8a/popcount.asm b/mpn/sw_64/sw8a/popcount.asm +new file mode 100644 +index 0000000..dc793f4 +--- /dev/null ++++ b/mpn/sw_64/sw8a/popcount.asm +@@ -0,0 +1,101 @@ ++dnl Sw_64 sw8a mpn_popcount -- mpn bit population count. ++ ++dnl Copyright 2003, 2005 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C sw8a: 1.5 cycles/limb ++ ++ ++C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); ++C ++C This schedule seems necessary for the full 1.5 c/l, the IQ can't quite hide ++C all latencies, the addl's must be deferred to the next iteration. ++C ++C Since we need just 3 instructions per limb, further unrolling could approach ++C 1.0 c/l. ++C ++C The main loop processes two limbs at a time. An odd size is handled by ++C processing src[0] at the start. If the size is even that result is ++C discarded, and src[0] is repeated by the main loop. 
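mpn_hamdist above (both the sw6b and sw8a copies) is the same unrolled pattern with an xor feeding each ctpop. Functionally, in C (an editorial sketch with an invented name):

    #include <stddef.h>
    #include <stdint.h>

    unsigned long hamdist_ref (const uint64_t *xp, const uint64_t *yp,
                               size_t size)
    {
      unsigned long total = 0;
      for (size_t i = 0; i < size; i++)
        total += __builtin_popcountll (xp[i] ^ yp[i]); /* xor + ctpop */
      return total;
    }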
++C ++ ++ASM_START() ++PROLOGUE(mpn_popcount) ++ ++ C r16 src ++ C r17 size ++ ++ ldl r0, 0(r16) C L0 src[0] ++ and r17, 1, r8 C U1 1 if size odd ++ srl r17, 1, r17 C U0 size, limb pairs ++ ++ s8addl r8, r16, r16 C L1 src++ if size odd ++ ctpop r0, r0 C U0 ++ beq r17, L(one) C U1 if size==1 ++ ++ seleq r8, r31, r0 C L discard first limb if size even ++ clr r3 C L ++ ++ clr r4 C L ++ unop C U ++ unop C L ++ unop C U ++ ++ ++ ALIGN(16) ++L(top): ++ C r0 total accumulating ++ C r3 pop 0 ++ C r4 pop 1 ++ C r16 src, incrementing ++ C r17 size, decrementing ++ ++ ldl r1, 0(r16) C L ++ ldl r2, 8(r16) C L ++ ldi r16, 16(r16) C U ++ ldi r17, -1(r17) C U ++ ++ addl r0, r3, r0 C L ++ addl r0, r4, r0 C L ++ ctpop r1, r3 C U0 ++ ctpop r2, r4 C U0 ++ ++ ldl r31, 512(r16) C L prefetch ++ bne r17, L(top) C U ++ ++ ++ addl r0, r3, r0 C L ++ addl r0, r4, r0 C U ++L(one): ++ ret r31, (r26), 1 C L0 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/sw_64-defs.m4 b/mpn/sw_64/sw_64-defs.m4 +new file mode 100644 +index 0000000..cc1acf9 +--- /dev/null ++++ b/mpn/sw_64/sw_64-defs.m4 +@@ -0,0 +1,101 @@ ++divert(-1) ++ ++dnl m4 macros for Sw_64 assembler. ++ ++dnl Copyright 2003, 2004 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++ ++dnl Usage: ASSERT([reg] [,code]) ++dnl ++dnl Require that the given reg is non-zero after executing the test code. ++dnl For example, ++dnl ++dnl ASSERT(r8, ++dnl ` cmpult r16, r17, r8') ++dnl ++dnl If the register argument is empty then nothing is tested, the code is ++dnl just executed. This can be used for setups required by later ASSERTs. ++dnl If the code argument is omitted then the register is just tested, with ++dnl no special setup code. ++ ++define(ASSERT, ++m4_assert_numargs_range(1,2) ++m4_assert_defined(`WANT_ASSERT') ++`ifelse(WANT_ASSERT,1, ++`ifelse(`$2',,,`$2') ++ifelse(`$1',,, ++` bne $1, L(ASSERTok`'ASSERT_label_counter) ++ .long 0 C halt ++L(ASSERTok`'ASSERT_label_counter): ++define(`ASSERT_label_counter',eval(ASSERT_label_counter+1)) ++') ++')') ++define(`ASSERT_label_counter',1) ++ ++ ++dnl Usage: bigend(`code') ++dnl ++dnl Emit the given code only for a big-endian system, like Unicos. This ++dnl can be used for instance for extra stuff needed by extwl. 
++ ++define(bigend, ++m4_assert_numargs(1) ++`ifdef(`HAVE_LIMB_BIG_ENDIAN',`$1', ++`ifdef(`HAVE_LIMB_LITTLE_ENDIAN',`', ++`m4_error(`Cannot assemble, unknown limb endianness')')')') ++ ++ ++dnl Usage: bwx_available_p ++dnl ++dnl Evaluate to 1 if the BWX byte memory instructions are available, or to ++dnl 0 if not. ++dnl ++dnl Listing the chips which do have BWX means anything we haven't looked at ++dnl will use safe non-BWX code. The only targets without BWX currently are ++dnl plain sw_64 (ie. sw6b). ++ ++define(bwx_available_p, ++m4_assert_numargs(-1) ++`m4_ifdef_anyof_p( ++ `HAVE_HOST_CPU_sw_64sw6', ++ `HAVE_HOST_CPU_sw_64sw6a', ++ `HAVE_HOST_CPU_sw_64sw6b')') ++ ++ ++dnl Usage: unop ++dnl ++dnl The Cray Unicos assembler lacks unop, so give the equivalent ldl_u ++dnl explicitly. ++ ++define(unop, ++m4_assert_numargs(-1) ++`ldl_u r31, 0(r30)') ++ ++ ++divert +diff --git a/mpn/sw_64/unicos.m4 b/mpn/sw_64/unicos.m4 +new file mode 100644 +index 0000000..01ce703 +--- /dev/null ++++ b/mpn/sw_64/unicos.m4 +@@ -0,0 +1,131 @@ ++divert(-1) ++ ++dnl m4 macros for sw_64 assembler on unicos. ++ ++ ++dnl Copyright 2000, 2002-2004, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++ ++dnl Note that none of the standard GMP_ASM_ autoconf tests are done for ++dnl unicos, so none of the config.m4 results can be used here. ++ ++dnl No underscores on unicos ++define(`GSYM_PREFIX') ++ ++define(`ASM_START', ++m4_assert_numargs(0) ++` .ident dummy') ++ ++define(`X', ++m4_assert_numargs(1) ++`^X$1') ++ ++define(`FLOAT64', ++m4_assert_numargs(2) ++` .psect $1@crud,data ++$1: .t_floating $2 ++ .endp') ++ ++dnl Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,gp|noalign]) ++dnl EPILOGUE_cpu(GSYM_PREFIX`'foo) ++ ++define(`PROLOGUE_cpu', ++m4_assert_numargs_range(1,2) ++`ifelse(`$2',gp,, ++`ifelse(`$2',noalign,, ++`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter ++')')')')dnl ++ .stack 192 ; What does this mean? Only Cray knows. ++ .psect $1@code,code,cache ++$1::') ++ ++define(`EPILOGUE_cpu', ++m4_assert_numargs(1) ++` .endp') ++ ++ ++dnl Usage: LDGP(dst,src) ++dnl ++dnl Emit an "ldgp dst,src", but only on systems using a GOT (which unicos ++dnl doesn't). 
++ ++define(LDGP, ++m4_assert_numargs(2) ++) ++ ++ ++dnl Usage: EXTERN(variable_name) ++define(`EXTERN', ++m4_assert_numargs(1) ++` .extern $1') ++ ++define(`DATASTART', ++m4_assert_numargs_range(1,2) ++` .psect $1@crud,data ++ ALIGN(ifelse($#,1,2,$2)) ++$1:') ++ ++define(`DATAEND', ++m4_assert_numargs(0) ++` .endp') ++ ++define(`ASM_END', ++m4_assert_numargs(0) ++` .end') ++ ++define(`cvttqc', ++m4_assert_numargs(-1) ++`cvttq/c') ++ ++dnl Load a symbolic address into a register ++define(`LEA', ++m4_assert_numargs(2) ++ `laum $1, $2(r31) ++ sll $1, 32, $1 ++ lalm $1, $2($1) ++ lal $1, $2($1)') ++ ++ ++dnl Usage: ALIGN(bytes) ++dnl ++dnl Unicos assembler .align emits zeros, even in code segments, so disable ++dnl aligning. ++dnl ++dnl GCC uses a macro emiting nops until the desired alignment is reached ++dnl (see unicosmk_file_start in sw_64.c). Could do something like that if ++dnl we cared. The maximum desired alignment must be established at the ++dnl start of the section though, since of course emitting nops only ++dnl advances relative to the section beginning. ++ ++define(`ALIGN', ++m4_assert_numargs(1) ++) ++ ++ ++divert +-- +2.25.1 + diff --git a/0003-Sw64-Port-add-mpn-asm-support-for-sw64.patch b/0003-Sw64-Port-add-mpn-asm-support-for-sw64.patch new file mode 100644 index 0000000000000000000000000000000000000000..20861f5376523688bbe0733435582187016434de --- /dev/null +++ b/0003-Sw64-Port-add-mpn-asm-support-for-sw64.patch @@ -0,0 +1,3156 @@ +From aafd073389f9baee857165210ec98449cd1f8092 Mon Sep 17 00:00:00 2001 +From: swcompiler +Date: Tue, 22 Jul 2025 13:52:05 +0800 +Subject: [PATCH 3/3] Sw64-Port-add-mpn-asm-support-for-sw64 + +--- + mpn/sw_64/add_n.asm | 161 +++++++++++++++++++ + mpn/sw_64/addmul_1.asm | 96 ++++++++++++ + mpn/sw_64/aorslsh1_n.asm | 161 +++++++++++++++++++ + mpn/sw_64/aorslsh2_n.asm | 164 +++++++++++++++++++ + mpn/sw_64/bdiv_dbm1c.asm | 279 +++++++++++++++++++++++++++++++++ + mpn/sw_64/cntlz.asm | 55 +++++++ + mpn/sw_64/com.asm | 173 ++++++++++++++++++++ + mpn/sw_64/copyd.asm | 85 ++++++++++ + mpn/sw_64/copyi.asm | 84 ++++++++++ + mpn/sw_64/dive_1.c | 105 +++++++++++++ + mpn/sw_64/divrem_2.asm | 175 +++++++++++++++++++++ + mpn/sw_64/invert_limb.asm | 93 +++++++++++ + mpn/sw_64/lshift.asm | 180 +++++++++++++++++++++ + mpn/sw_64/mod_34lsub1.asm | 162 +++++++++++++++++++ + mpn/sw_64/mode1o.asm | 192 +++++++++++++++++++++++ + mpn/sw_64/mul_1.asm | 100 ++++++++++++ + mpn/sw_64/rshift.asm | 178 +++++++++++++++++++++ + mpn/sw_64/sec_tabselect.asm | 135 ++++++++++++++++ + mpn/sw_64/sqr_diag_addlsh1.asm | 88 +++++++++++ + mpn/sw_64/sub_n.asm | 162 +++++++++++++++++++ + mpn/sw_64/submul_1.asm | 97 ++++++++++++ + mpn/sw_64/umul.asm | 44 ++++++ + 22 files changed, 2969 insertions(+) + create mode 100644 mpn/sw_64/add_n.asm + create mode 100644 mpn/sw_64/addmul_1.asm + create mode 100644 mpn/sw_64/aorslsh1_n.asm + create mode 100644 mpn/sw_64/aorslsh2_n.asm + create mode 100644 mpn/sw_64/bdiv_dbm1c.asm + create mode 100644 mpn/sw_64/cntlz.asm + create mode 100644 mpn/sw_64/com.asm + create mode 100644 mpn/sw_64/copyd.asm + create mode 100644 mpn/sw_64/copyi.asm + create mode 100644 mpn/sw_64/dive_1.c + create mode 100644 mpn/sw_64/divrem_2.asm + create mode 100644 mpn/sw_64/invert_limb.asm + create mode 100644 mpn/sw_64/lshift.asm + create mode 100644 mpn/sw_64/mod_34lsub1.asm + create mode 100644 mpn/sw_64/mode1o.asm + create mode 100644 mpn/sw_64/mul_1.asm + create mode 100644 mpn/sw_64/rshift.asm + create mode 100644 mpn/sw_64/sec_tabselect.asm + create mode 100644 
mpn/sw_64/sqr_diag_addlsh1.asm + create mode 100644 mpn/sw_64/sub_n.asm + create mode 100644 mpn/sw_64/submul_1.asm + create mode 100644 mpn/sw_64/umul.asm + +diff --git a/mpn/sw_64/add_n.asm b/mpn/sw_64/add_n.asm +new file mode 100644 +index 0000000..35fc769 +--- /dev/null ++++ b/mpn/sw_64/add_n.asm +@@ -0,0 +1,161 @@ ++dnl Sw_64 mpn_add_n -- Add two limb vectors of the same length > 0 and ++dnl store sum in a third limb vector. ++ ++dnl Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++dnl INPUT PARAMETERS ++dnl res_ptr r16 ++dnl s1_ptr r17 ++dnl s2_ptr r18 ++dnl size r19 ++ ++ASM_START() ++PROLOGUE(mpn_add_nc) ++ bis r20,r31,r25 ++ br L(com) ++EPILOGUE() ++PROLOGUE(mpn_add_n) ++ bis r31,r31,r25 C clear cy ++L(com): subl r19,4,r19 C decr loop cnt ++ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop ++C Start software pipeline for 1st loop ++ ldl r0,0(r18) ++ ldl r4,0(r17) ++ ldl r1,8(r18) ++ ldl r5,8(r17) ++ addl r17,32,r17 C update s1_ptr ++ addl r0,r4,r28 C 1st main add ++ ldl r2,16(r18) ++ addl r25,r28,r20 C 1st carry add ++ ldl r3,24(r18) ++ cmpult r28,r4,r8 C compute cy from last add ++ ldl r6,-16(r17) ++ cmpult r20,r28,r25 C compute cy from last add ++ ldl r7,-8(r17) ++ bis r8,r25,r25 C combine cy from the two adds ++ subl r19,4,r19 C decr loop cnt ++ addl r1,r5,r28 C 2nd main add ++ addl r18,32,r18 C update s2_ptr ++ addl r28,r25,r21 C 2nd carry add ++ cmpult r28,r5,r8 C compute cy from last add ++ blt r19,$Lend1 C if less than 4 limbs remain, jump ++C 1st loop handles groups of 4 limbs in a software pipeline ++ ALIGN(16) ++$Loop: cmpult r21,r28,r25 C compute cy from last add ++ ldl r0,0(r18) ++ bis r8,r25,r25 C combine cy from the two adds ++ ldl r1,8(r18) ++ addl r2,r6,r28 C 3rd main add ++ ldl r4,0(r17) ++ addl r28,r25,r22 C 3rd carry add ++ ldl r5,8(r17) ++ cmpult r28,r6,r8 C compute cy from last add ++ cmpult r22,r28,r25 C compute cy from last add ++ stl r20,0(r16) ++ bis r8,r25,r25 C combine cy from the two adds ++ stl r21,8(r16) ++ addl r3,r7,r28 C 4th main add ++ addl r28,r25,r23 C 4th carry add ++ cmpult r28,r7,r8 C compute cy from last add ++ cmpult r23,r28,r25 C compute cy from last add ++ addl r17,32,r17 C update s1_ptr ++ bis r8,r25,r25 C combine cy from the two adds ++ addl r16,32,r16 C update res_ptr ++ addl r0,r4,r28 C 1st main add ++ ldl r2,16(r18) ++ addl r25,r28,r20 C 1st carry add ++ ldl r3,24(r18) ++ cmpult r28,r4,r8 C compute cy from last add 
++ ldl r6,-16(r17) ++ cmpult r20,r28,r25 C compute cy from last add ++ ldl r7,-8(r17) ++ bis r8,r25,r25 C combine cy from the two adds ++ subl r19,4,r19 C decr loop cnt ++ stl r22,-16(r16) ++ addl r1,r5,r28 C 2nd main add ++ stl r23,-8(r16) ++ addl r25,r28,r21 C 2nd carry add ++ addl r18,32,r18 C update s2_ptr ++ cmpult r28,r5,r8 C compute cy from last add ++ bge r19,$Loop ++C Finish software pipeline for 1st loop ++$Lend1: cmpult r21,r28,r25 C compute cy from last add ++ bis r8,r25,r25 C combine cy from the two adds ++ addl r2,r6,r28 C 3rd main add ++ addl r28,r25,r22 C 3rd carry add ++ cmpult r28,r6,r8 C compute cy from last add ++ cmpult r22,r28,r25 C compute cy from last add ++ stl r20,0(r16) ++ bis r8,r25,r25 C combine cy from the two adds ++ stl r21,8(r16) ++ addl r3,r7,r28 C 4th main add ++ addl r28,r25,r23 C 4th carry add ++ cmpult r28,r7,r8 C compute cy from last add ++ cmpult r23,r28,r25 C compute cy from last add ++ bis r8,r25,r25 C combine cy from the two adds ++ addl r16,32,r16 C update res_ptr ++ stl r22,-16(r16) ++ stl r23,-8(r16) ++$Lend2: addl r19,4,r19 C restore loop cnt ++ beq r19,$Lret ++C Start software pipeline for 2nd loop ++ ldl r0,0(r18) ++ ldl r4,0(r17) ++ subl r19,1,r19 ++ beq r19,$Lend0 ++C 2nd loop handles remaining 1-3 limbs ++ ALIGN(16) ++$Loop0: addl r0,r4,r28 C main add ++ ldl r0,8(r18) ++ cmpult r28,r4,r8 C compute cy from last add ++ ldl r4,8(r17) ++ addl r28,r25,r20 C carry add ++ addl r18,8,r18 ++ addl r17,8,r17 ++ stl r20,0(r16) ++ cmpult r20,r28,r25 C compute cy from last add ++ subl r19,1,r19 C decr loop cnt ++ bis r8,r25,r25 C combine cy from the two adds ++ addl r16,8,r16 ++ bne r19,$Loop0 ++$Lend0: addl r0,r4,r28 C main add ++ addl r28,r25,r20 C carry add ++ cmpult r28,r4,r8 C compute cy from last add ++ cmpult r20,r28,r25 C compute cy from last add ++ stl r20,0(r16) ++ bis r8,r25,r25 C combine cy from the two adds ++ ++$Lret: bis r25,r31,r0 C return cy ++ ret r31,(r26),1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/addmul_1.asm b/mpn/sw_64/addmul_1.asm +new file mode 100644 +index 0000000..5fa1d0c +--- /dev/null ++++ b/mpn/sw_64/addmul_1.asm +@@ -0,0 +1,96 @@ ++dnl Sw_64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the ++dnl result to a second limb vector. ++ ++dnl Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
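The "main add / carry add / combine cy" pattern commented throughout mpn_add_n above corresponds to this per-limb C model (an illustrative sketch, not code from the patch):

    #include <stddef.h>
    #include <stdint.h>

    uint64_t add_n_ref (uint64_t *rp, const uint64_t *up,
                        const uint64_t *vp, size_t n)
    {
      uint64_t cy = 0;                  /* bis r31,r31,r25: clear cy */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t s = up[i] + vp[i];   /* main add */
          uint64_t c1 = s < up[i];      /* cmpult: cy from main add */
          uint64_t r = s + cy;          /* carry add */
          uint64_t c2 = r < s;          /* cmpult: cy from carry add */
          rp[i] = r;
          cy = c1 | c2;                 /* bis: combine cy from the two adds */
        }
      return cy;
    }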
++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++C vl r19 ++ ++ ++ASM_START() ++PROLOGUE(mpn_addmul_1) ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ subl r18,1,r18 C size-- ++ mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ umulh r2,r19,r0 C r0 = prod_high ++ beq r18,$Lend1 C jump if size was == 1 ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ subl r18,1,r18 C size-- ++ addl r5,r3,r3 ++ cmpult r3,r5,r4 ++ stl r3,0(r16) ++ addl r16,8,r16 C res_ptr++ ++ beq r18,$Lend2 C jump if size was == 2 ++ ++ ALIGN(8) ++$Loop: mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ subl r18,1,r18 C size-- ++ umulh r2,r19,r4 C r4 = cy_limb ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ addl r5,r3,r3 ++ cmpult r3,r5,r5 ++ stl r3,0(r16) ++ addl r16,8,r16 C res_ptr++ ++ addl r5,r0,r0 C combine carries ++ bne r18,$Loop ++ ++$Lend2: mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ umulh r2,r19,r4 C r4 = cy_limb ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ addl r5,r3,r3 ++ cmpult r3,r5,r5 ++ stl r3,0(r16) ++ addl r5,r0,r0 C combine carries ++ addl r4,r0,r0 C cy_limb = prod_high + cy ++ ret r31,(r26),1 ++$Lend1: addl r5,r3,r3 ++ cmpult r3,r5,r5 ++ stl r3,0(r16) ++ addl r0,r5,r0 ++ ret r31,(r26),1 ++EPILOGUE(mpn_addmul_1) ++ASM_END() +diff --git a/mpn/sw_64/aorslsh1_n.asm b/mpn/sw_64/aorslsh1_n.asm +new file mode 100644 +index 0000000..7b5b517 +--- /dev/null ++++ b/mpn/sw_64/aorslsh1_n.asm +@@ -0,0 +1,161 @@ ++dnl Sw_64 mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1). ++ ++dnl Copyright 2003, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
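mpn_addmul_1 above computes {rp,n} += {up,n} * vl and returns the high limb out. The mull/umulh pair and the two cmpult carries map onto this C model (a sketch; unsigned __int128 stands in for the mull/umulh pair):

    #include <stddef.h>
    #include <stdint.h>

    uint64_t addmul_1_ref (uint64_t *rp, const uint64_t *up, size_t n,
                           uint64_t vl)
    {
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * vl;
          uint64_t lo = (uint64_t) p;          /* mull  */
          uint64_t hi = (uint64_t) (p >> 64);  /* umulh */
          lo += cy;
          hi += lo < cy;            /* cmpult: carry into high part */
          rp[i] += lo;
          cy = hi + (rp[i] < lo);   /* cmpult: carry from the rp[] add */
        }
      return cy;
    }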
++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++define(`rp',`r16') ++define(`up',`r17') ++define(`vp',`r18') ++define(`n', `r19') ++ ++define(`u0', `r8') ++define(`u1', `r1') ++define(`v0', `r4') ++define(`v1', `r5') ++ ++define(`cy0', `r0') ++define(`cy1', `r20') ++define(`cy', `r22') ++define(`rr', `r24') ++define(`ps', `r25') ++define(`sl', `r28') ++ ++ifdef(`OPERATION_addlsh1_n',` ++ define(ADDSUB, addl) ++ define(CARRY, `cmpult $1,$2,$3') ++ define(func, mpn_addlsh1_n) ++') ++ifdef(`OPERATION_sublsh1_n',` ++ define(ADDSUB, subl) ++ define(CARRY, `cmpult $2,$1,$3') ++ define(func, mpn_sublsh1_n) ++') ++ ++MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) ++ ++ASM_START() ++PROLOGUE(func) ++ and n, 2, cy0 ++ blbs n, L(bx1) ++L(bx0): ldl v1, 0(vp) ++ ldl u1, 0(up) ++ nop ++ bne cy0, L(b10) ++ ++L(b00): ldi vp, 48(vp) ++ ldi up, -16(up) ++ ldi rp, -8(rp) ++ br r31, L(lo0) ++ ++L(b10): ldi vp, 32(vp) ++ ldi rp, 8(rp) ++ ldi cy0, 0(r31) ++ br r31, L(lo2) ++ ++L(bx1): ldl v0, 0(vp) ++ ldl u0, 0(up) ++ ldi cy1, 0(r31) ++ beq cy0, L(b01) ++ ++L(b11): ldi vp, 40(vp) ++ ldi up, -24(up) ++ ldi rp, 16(rp) ++ br r31, L(lo3) ++ ++L(b01): ldi n, -4(n) ++ ble n, L(end) ++ ldi vp, 24(vp) ++ ldi up, -8(up) ++ ++ ALIGN(16) ++L(top): addl v0, v0, sl C left shift vlimb ++ ldl v1, -16(vp) ++ ADDSUB u0, sl, ps C ulimb + (vlimb << 1) ++ cmplt v0, r31, cy0 C carry out #1 ++ ldl u1, 16(up) ++ ADDSUB ps, cy1, rr C consume carry from previous operation ++ CARRY( ps, u0, cy) C carry out #2 ++ stl rr, 0(rp) ++ addl cy, cy0, cy0 C combine carry out #1 and #2 ++ CARRY( rr, ps, cy) C carry out #3 ++ addl cy, cy0, cy0 C final carry out ++ ldi vp, 32(vp) C bookkeeping ++L(lo0): addl v1, v1, sl ++ ldl v0, -40(vp) ++ ADDSUB u1, sl, ps ++ cmplt v1, r31, cy1 ++ ldl u0, 24(up) ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy) ++ stl rr, 8(rp) ++ addl cy, cy1, cy1 ++ CARRY( rr, ps, cy) ++ addl cy, cy1, cy1 ++ ldi rp, 32(rp) C bookkeeping ++L(lo3): addl v0, v0, sl ++ ldl v1, -32(vp) ++ ADDSUB u0, sl, ps ++ cmplt v0, r31, cy0 ++ ldl u1, 32(up) ++ ADDSUB ps, cy1, rr ++ CARRY( ps, u0, cy) ++ stl rr, -16(rp) ++ addl cy, cy0, cy0 ++ CARRY( rr, ps, cy) ++ addl cy, cy0, cy0 ++ ldi up, 32(up) C bookkeeping ++L(lo2): addl v1, v1, sl ++ ldl v0, -24(vp) ++ ADDSUB u1, sl, ps ++ cmplt v1, r31, cy1 ++ ldl u0, 8(up) ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy) ++ stl rr, -8(rp) ++ addl cy, cy1, cy1 ++ CARRY( rr, ps, cy) ++ addl cy, cy1, cy1 ++ ldi n, -4(n) C bookkeeping ++ bgt n, L(top) ++ ++L(end): addl v0, v0, sl ++ ADDSUB u0, sl, ps ++ ADDSUB ps, cy1, rr ++ cmplt v0, r31, cy0 ++ CARRY( ps, u0, cy) ++ stl rr, 0(rp) ++ addl cy, cy0, cy0 ++ CARRY( rr, ps, cy) ++ addl cy, cy0, r0 ++ ret r31,(r26),1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/aorslsh2_n.asm b/mpn/sw_64/aorslsh2_n.asm +new file mode 100644 +index 0000000..8316666 +--- /dev/null ++++ b/mpn/sw_64/aorslsh2_n.asm +@@ -0,0 +1,164 @@ ++dnl Sw_64 mpn_addlsh2_n/mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2). ++ ++dnl Copyright 2003, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. 
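For the addlsh1 case of the code above, three carries are tracked per limb: the bit shifted out of vlimb, the carry from ulimb + (vlimb << 1), and the carry from folding in the previous iteration's carry. Together they sum to at most 2. A C model (sketch, invented name):

    #include <stddef.h>
    #include <stdint.h>

    uint64_t addlsh1_n_ref (uint64_t *rp, const uint64_t *up,
                            const uint64_t *vp, size_t n)
    {
      uint64_t cy = 0;                  /* 0..2 between iterations */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t sl = vp[i] << 1;     /* addl v,v,sl: left shift vlimb */
          uint64_t c1 = vp[i] >> 63;    /* cmplt v,r31: shifted-out bit */
          uint64_t ps = up[i] + sl;     /* ADDSUB */
          uint64_t c2 = ps < up[i];     /* CARRY: carry out #2 */
          uint64_t rr = ps + cy;        /* consume previous carry */
          uint64_t c3 = rr < ps;        /* CARRY: carry out #3 */
          rp[i] = rr;
          cy = c1 + c2 + c3;            /* the two addl cy,cy0,cy0 steps */
        }
      return cy;
    }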
++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++C TODO ++C * Tune to reach 3.5 c/l on sw6 and 5.75 c/l. ++ ++define(`rp',`r16') ++define(`up',`r17') ++define(`vp',`r18') ++define(`n', `r19') ++ ++define(`u0', `r8') ++define(`u1', `r1') ++define(`v0', `r4') ++define(`v1', `r5') ++ ++define(`cy0', `r0') ++define(`cy1', `r20') ++define(`cy', `r22') ++define(`rr', `r24') ++define(`ps', `r25') ++define(`sl', `r28') ++ ++ifdef(`OPERATION_addlsh2_n',` ++ define(ADDSUB, addl) ++ define(CARRY, `cmpult $1,$2,$3') ++ define(func, mpn_addlsh2_n) ++') ++ifdef(`OPERATION_sublsh2_n',` ++ define(ADDSUB, subl) ++ define(CARRY, `cmpult $2,$1,$3') ++ define(func, mpn_sublsh2_n) ++') ++ ++MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n) ++ ++ASM_START() ++PROLOGUE(func) ++ and n, 2, cy0 ++ blbs n, L(bx1) ++L(bx0): ldl v1, 0(vp) ++ ldl u1, 0(up) ++ bis r31, r31, r2 ++ bne cy0, L(b10) ++ ++L(b00): ldi vp, 48(vp) ++ ldi up, -16(up) ++ ldi rp, -8(rp) ++ s4addl v1, r31, sl ++ br r31, L(lo0) ++ ++L(b10): ldi vp, 32(vp) ++ ldi rp, 8(rp) ++ ldi cy0, 0(r31) ++ br r31, L(lo2) ++ ++L(bx1): ldl v0, 0(vp) ++ ldl u0, 0(up) ++ ldi cy1, 0(r31) ++ bis r31, r31, r3 ++ nop ++ beq cy0, L(b01) ++ ++L(b11): ldi vp, 40(vp) ++ ldi up, -24(up) ++ ldi rp, 16(rp) ++ br r31, L(lo3) ++ ++L(b01): ldi n, -4(n) ++ ble n, L(end) ++ ldi vp, 24(vp) ++ ldi up, -8(up) ++ ++ ALIGN(16) ++L(top): s4addl v0, r3, sl C combined vlimb ++ ldl v1, -16(vp) ++ ADDSUB u0, sl, ps C ulimb + (vlimb << 1) ++ ldl u1, 16(up) ++ srl v0, 62, r2 C high v bits ++ ADDSUB ps, cy1, rr C consume carry from previous operation ++ CARRY( ps, u0, cy0) C carry out #2 ++ stl rr, 0(rp) ++ CARRY( rr, ps, cy) C carry out #3 ++ ldi vp, 32(vp) C bookkeeping ++ addl cy, cy0, cy0 C final carry out ++ s4addl v1, r2, sl ++L(lo0): ldl v0, -40(vp) ++ ADDSUB u1, sl, ps ++ ldl u0, 24(up) ++ srl v1, 62, r3 ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy1) ++ stl rr, 8(rp) ++ CARRY( rr, ps, cy) ++ ldi rp, 32(rp) C bookkeeping ++ addl cy, cy1, cy1 ++L(lo3): s4addl v0, r3, sl ++ ldl v1, -32(vp) ++ ADDSUB u0, sl, ps ++ ldl u1, 32(up) ++ srl v0, 62, r2 ++ ADDSUB ps, cy1, rr ++ CARRY( ps, u0, cy0) ++ stl rr, -16(rp) ++ CARRY( rr, ps, cy) ++ ldi up, 32(up) C bookkeeping ++ addl cy, cy0, cy0 ++L(lo2): s4addl v1, r2, sl ++ ldl v0, -24(vp) ++ ADDSUB u1, sl, ps ++ ldl u0, 8(up) ++ srl v1, 62, r3 ++ ADDSUB ps, cy0, rr ++ CARRY( ps, u1, cy1) ++ stl rr, -8(rp) ++ CARRY( rr, ps, cy) ++ ldi n, -4(n) C bookkeeping ++ addl cy, cy1, cy1 ++ bgt n, L(top) ++ ++L(end): s4addl v0, r3, sl ++ ADDSUB u0, sl, ps ++ srl v0, 62, r2 ++ ADDSUB ps, cy1, rr ++ CARRY( ps, u0, cy0) ++ stl rr, 0(rp) ++ CARRY( rr, ps, cy) ++ addl cy, cy0, cy0 ++ addl cy0, r2, r0 ++ ++ ret r31,(r26),1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/bdiv_dbm1c.asm b/mpn/sw_64/bdiv_dbm1c.asm +new file mode 100644 +index 0000000..08a83a3 +--- /dev/null 
++++ b/mpn/sw_64/bdiv_dbm1c.asm +@@ -0,0 +1,279 @@ ++dnl Sw_64 mpn_bdiv_dbm1c. ++ ++dnl Copyright 2008 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++C TODO ++C * Try less unrolling, 2-way should give the same performance. ++C * Optimize feed-in and wind-down code, for speed, and perhaps further for ++C code size. ++C * This runs optimally given the algorithm, r8 is on a 3 operation recurrency ++C path. We have not tried very hard to find a better algorithm. Perhaps ++C it would be a good task for the GNU superoptimizer. ++ ++C INPUT PARAMETERS ++define(`rp', `r16') ++define(`up', `r17') ++define(`n', `r18') ++define(`bd', `r19') ++define(`cy', `r19') ++ ++ ++ASM_START() ++PROLOGUE(mpn_bdiv_dbm1c) ++ mov r20, r8 ++ ++ ldl r24, 0(r17) ++ and r18, 3, r28 ++ ldi r18, -4(r18) ++ beq r28, L(b0) ++ cmpeq r28, 1, r21 ++ bne r21, L(b1) ++ cmpeq r28, 2, r21 ++ bne r21, L(b2) ++ ++ ++L(b3): ldl r2, 8(r17) ++ ldl r3, 16(r17) ++ bgt r18, L(gt3) ++ ++ mull r24, r19, r5 C U1 ++ umulh r24, r19, r21 C U1 ++ mull r2, r19, r6 C U1 ++ umulh r2, r19, r22 C U1 ++ mull r3, r19, r7 C U1 ++ umulh r3, r19, r23 C U1 ++ ldi r16, -32(r16) ++ br L(cj3) ++ ++L(gt3): ldl r0, 24(r17) ++ mull r24, r19, r5 C U1 ++ umulh r24, r19, r21 C U1 ++ ldl r1, 32(r17) ++ mull r2, r19, r6 C U1 ++ umulh r2, r19, r22 C U1 ++ ldl r2, 40(r17) ++ mull r3, r19, r7 C U1 ++ umulh r3, r19, r23 C U1 ++ ldl r3, 48(r17) ++ ldi r18, -4(r18) ++ ldi r17, 56(r17) ++ mull r0, r19, r4 C U1 ++ bgt r18, L(L3) ++ ++ br L(cj7) ++ ++ ++L(b2): ldl r3, 8(r17) ++ bgt r18, L(gt2) ++ ++ mull r24, r19, r6 C U1 ++ umulh r24, r19, r22 C U1 ++ mull r3, r19, r7 C U1 ++ umulh r3, r19, r23 C U1 ++ ldi r16, -40(r16) ++ br L(cj2) ++ ++L(gt2): ldl r0, 16(r17) ++ ldl r1, 24(r17) ++ mull r24, r19, r6 C U1 ++ umulh r24, r19, r22 C U1 ++ ldl r2, 32(r17) ++ mull r3, r19, r7 C U1 ++ umulh r3, r19, r23 C U1 ++ ldl r3, 40(r17) ++ ldi r18, -4(r18) ++ ldi r17, 48(r17) ++ mull r0, r19, r4 C U1 ++ umulh r0, r19, r20 C U1 ++ ldi r16, -8(r16) ++ bgt r18, L(gt6) ++ ++ mull r1, r19, r5 C U1 ++ br L(cj6) ++ ++L(gt6): ldl r0, 0(r17) ++ mull r1, r19, r5 C U1 ++ br L(L2) ++ ++ ++L(b1): bgt r18, L(gt1) ++ ++ mull r24, r19, r7 C U1 ++ umulh r24, r19, r23 C U1 ++ ldi r16, -48(r16) ++ br L(cj1) ++ ++L(gt1): ldl r0, 8(r17) ++ ldl r1, 16(r17) ++ ldl r2, 24(r17) ++ mull r24, r19, r7 C U1 ++ umulh r24, r19, r23 C U1 ++ ldl r3, 32(r17) ++ ldi r18, -4(r18) ++ ldi r17, 40(r17) ++ mull r0, r19, r4 C U1 ++ umulh r0, r19, r20 C 
U1 ++ ldi r16, -16(r16) ++ bgt r18, L(gt5) ++ ++ mull r1, r19, r5 C U1 ++ umulh r1, r19, r21 C U1 ++ mull r2, r19, r6 C U1 ++ br L(cj5) ++ ++L(gt5): ldl r0, 0(r17) ++ mull r1, r19, r5 C U1 ++ umulh r1, r19, r21 C U1 ++ ldl r1, 8(r17) ++ mull r2, r19, r6 C U1 ++ br L(L1) ++ ++ ++L(b0): ldl r1, 8(r17) ++ ldl r2, 16(r17) ++ ldl r3, 24(r17) ++ ldi r17, 32(r17) ++ ldi r16, -24(r16) ++ mull r24, r19, r4 C U1 ++ umulh r24, r19, r20 C U1 ++ bgt r18, L(gt4) ++ ++ mull r1, r19, r5 C U1 ++ umulh r1, r19, r21 C U1 ++ mull r2, r19, r6 C U1 ++ umulh r2, r19, r22 C U1 ++ mull r3, r19, r7 C U1 ++ br L(cj4) ++ ++L(gt4): ldl r0, 0(r17) ++ mull r1, r19, r5 C U1 ++ umulh r1, r19, r21 C U1 ++ ldl r1, 8(r17) ++ mull r2, r19, r6 C U1 ++ umulh r2, r19, r22 C U1 ++ ldl r2, 16(r17) ++ mull r3, r19, r7 C U1 ++ br L(L0) ++ ++C *** MAIN LOOP START *** ++ ALIGN(16) ++L(top): mull r0, r19, r4 C U1 ++ subl r8, r28, r8 ++L(L3): umulh r0, r19, r20 C U1 ++ cmpult r8, r5, r28 ++ ldl r0, 0(r17) ++ subl r8, r5, r8 ++ addl r21, r28, r28 ++ stl r8, 0(r16) ++ ++ mull r1, r19, r5 C U1 ++ subl r8, r28, r8 ++L(L2): umulh r1, r19, r21 C U1 ++ cmpult r8, r6, r28 ++ ldl r1, 8(r17) ++ subl r8, r6, r8 ++ addl r22, r28, r28 ++ stl r8, 8(r16) ++ ++ mull r2, r19, r6 C U1 ++ subl r8, r28, r8 ++L(L1): umulh r2, r19, r22 C U1 ++ cmpult r8, r7, r28 ++ ldl r2, 16(r17) ++ subl r8, r7, r8 ++ addl r23, r28, r28 ++ stl r8, 16(r16) ++ ++ mull r3, r19, r7 C U1 ++ subl r8, r28, r8 ++L(L0): umulh r3, r19, r23 C U1 ++ cmpult r8, r4, r28 ++ ldl r3, 24(r17) ++ subl r8, r4, r8 ++ addl r20, r28, r28 ++ stl r8, 24(r16) ++ ++ ldi r18, -4(r18) ++ ldi r17, 32(r17) ++ ldi r16, 32(r16) ++ bgt r18, L(top) ++C *** MAIN LOOP END *** ++ ++ mull r0, r19, r4 C U1 ++ subl r8, r28, r8 ++L(cj7): umulh r0, r19, r20 C U1 ++ cmpult r8, r5, r28 ++ subl r8, r5, r8 ++ addl r21, r28, r28 ++ stl r8, 0(r16) ++ mull r1, r19, r5 C U1 ++ subl r8, r28, r8 ++L(cj6): umulh r1, r19, r21 C U1 ++ cmpult r8, r6, r28 ++ subl r8, r6, r8 ++ addl r22, r28, r28 ++ stl r8, 8(r16) ++ mull r2, r19, r6 C U1 ++ subl r8, r28, r8 ++L(cj5): umulh r2, r19, r22 C U1 ++ cmpult r8, r7, r28 ++ subl r8, r7, r8 ++ addl r23, r28, r28 ++ stl r8, 16(r16) ++ mull r3, r19, r7 C U1 ++ subl r8, r28, r8 ++L(cj4): umulh r3, r19, r23 C U1 ++ cmpult r8, r4, r28 ++ subl r8, r4, r8 ++ addl r20, r28, r28 ++ stl r8, 24(r16) ++ subl r8, r28, r8 ++L(cj3): cmpult r8, r5, r28 ++ subl r8, r5, r8 ++ addl r21, r28, r28 ++ stl r8, 32(r16) ++ subl r8, r28, r8 ++L(cj2): cmpult r8, r6, r28 ++ subl r8, r6, r8 ++ addl r22, r28, r28 ++ stl r8, 40(r16) ++ subl r8, r28, r8 ++L(cj1): cmpult r8, r7, r28 ++ subl r8, r7, r8 ++ addl r23, r28, r28 ++ stl r8, 48(r16) ++ subl r8, r28, r0 ++ ret r31, (r26), 1 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/cntlz.asm b/mpn/sw_64/cntlz.asm +new file mode 100644 +index 0000000..2625199 +--- /dev/null ++++ b/mpn/sw_64/cntlz.asm +@@ -0,0 +1,55 @@ ++dnl Sw_64 auxiliary for longlong.h's count_leading_zeros ++ ++dnl Copyright 1997, 2000, 2002 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. 
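For orientation on mpn_bdiv_dbm1c above: the accumulator r8 carries h through a compare/subtract/subtract recurrence each limb, which is why the header calls it a 3-operation recurrency path. Roughly, in C (a sketch modeled on the generic mpn/generic/bdiv_dbm1c.c; unsigned __int128 stands in for mull/umulh):

    #include <stddef.h>
    #include <stdint.h>

    uint64_t bdiv_dbm1c_ref (uint64_t *qp, const uint64_t *ap, size_t n,
                             uint64_t bd, uint64_t h)
    {
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) ap[i] * bd;
          uint64_t p0 = (uint64_t) p;          /* mull  */
          uint64_t p1 = (uint64_t) (p >> 64);  /* umulh */
          uint64_t cy = h < p0;                /* cmpult */
          h -= p0;                             /* subl */
          qp[i] = h;                           /* stl */
          h -= p1 + cy;             /* fold in high limb plus borrow */
        }
      return h;
    }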
++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++ASM_START() ++EXTERN(__clz_tab) ++PROLOGUE(mpn_count_leading_zeros,gp) ++ cmpgeb r31, r16, r1 ++ LEA(r3,__clz_tab) ++ sra r1, 1, r1 ++ xor r1, 127, r1 ++ srl r16, 1, r16 ++ addl r1, r3, r1 ++ ldl_u r0, 0(r1) ++ ldi r2, 64 ++ ext0b r0, r1, r0 ++ s8subw r0, 8, r0 ++ srl r16, r0, r16 ++ addl r16, r3, r16 ++ ldl_u r1, 0(r16) ++ ext0b r1, r16, r1 ++ subl r2, r1, r2 ++ subl r2, r0, r0 ++ ret r31, (r26),1 ++EPILOGUE(mpn_count_leading_zeros) ++ASM_END() +diff --git a/mpn/sw_64/com.asm b/mpn/sw_64/com.asm +new file mode 100644 +index 0000000..98b9c0f +--- /dev/null ++++ b/mpn/sw_64/com.asm +@@ -0,0 +1,173 @@ ++dnl Sw_64 mpn_com -- mpn one's complement. ++ ++dnl Copyright 2003 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C cycles/limb ++ ++ ++C mp_limb_t mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size); ++C ++C the main loop is 7 cycles plus 1 taken branch bubble, for a total ++C 2.0 c/l. In general, a pattern like this unrolled to N limbs per loop ++C will be 1.5+2/N c/l. ++C ++C 2 cycles of loop control are unavoidable, for pointer updates and the ++C taken branch bubble, but also since ldl cannot issue two cycles after stl ++C (and with a run of stls that means neither of two cycles at the end of the ++C loop. ++C ++C The fbeq is forced into the second cycle of the loop using unops, since ++C the first time through it must wait for the cvtqt result. Once that ++C result is ready (a 1 cycle stall) then both the branch and following loads ++C can issue together. ++C ++C The main loop handles an odd count of limbs, being two limbs loaded before ++C each size test, plus one pipelined around from the previous iteration (or ++C setup in the entry sequence). 
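Back on mpn_count_leading_zeros above: it makes two table lookups, one on the cmpgeb byte mask to locate the highest nonzero byte, then one on that byte to get its bit width. A portable C model of the idea, not a literal transcription (the local width[] table plays the role of __clz_tab):

    #include <stdint.h>

    int clz64_ref (uint64_t x)          /* requires x != 0 */
    {
      unsigned char width[256];         /* stand-in for __clz_tab */
      width[0] = 0;
      for (int v = 1; v < 256; v++)
        width[v] = width[v / 2] + 1;    /* bit width of v */

      int a = 56;
      while (((x >> a) & 0xff) == 0)    /* locate highest nonzero byte */
        a -= 8;
      return 64 - (width[x >> a] + a);
    }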
++C ++C An even number of limbs is handled by an explicit dst[0]=~src[0] in the ++C entry sequence, and an increment of the pointers. For an odd size there's ++C no increment and the first store in the loop (r24) is a repeat of dst[0]. ++C ++C Note that the load for r24 after the possible pointer increment is done ++C before the explicit store to dst[0], in case src==dst. ++ ++ ++ASM_START() ++ ++FLOAT64(L(dat), 2.0) ++ ++ ALIGN(16) ++ ++PROLOGUE(mpn_com,gp) ++ ++ C r16 dst ++ C r17 src ++ C r18 size ++ ++ ldi r30, -16(r30) C temporary stack space ++ ldi r7, -3(r18) C size - 3 ++ ++ ldl r20, 0(r17) C src[0] ++ srl r7, 1, r6 C (size-3)/2 ++ ++ stl r6, 8(r30) C (size-3)/2 ++ and r7, 1, r5 C 1 if size even ++ ++ LEA( r8, L(dat)) ++ s8addl r5, r17, r17 C skip src[0] if even ++ ++ ornot r31, r20, r20 C ~src[0] ++ unop ++ ++ fldd f0, 8(r30) C (size-3)/2 ++ ldl r24, 0(r17) C src[0 or 1] ++ ++ stl r20, 0(r16) C dst[0] ++ s8addl r5, r16, r19 C skip dst[0] if even ++ ++ fldd f1, 0(r8) C data 2.0 ++ ldi r30, 16(r30) C restore stack ++ fcvtld f0, f10 C (size-3)/2 as float ++ fcpys f10,f10,f0 ++ ++ ornot r31, r24, r24 ++ blt r7, L(done_1) C if size<=2 ++ unop ++ unop ++ ++ ++ C 16-byte alignment here ++L(top): ++ C r17 src, incrementing ++ C r19 dst, incrementing ++ C r24 dst[i] result, ready to store ++ C f0 (size-3)/2, decrementing ++ C f1 2.0 ++ ++ ldl r20, 8(r17) C src[i+1] ++ ldl r21, 16(r17) C src[i+2] ++ unop ++ unop ++ ++ fbeq f0, L(done_2) ++ unop ++ ldl r22, 24(r17) C src[i+3] ++ ldl r23, 32(r17) C src[i+4] ++ ++ stl r24, 0(r19) C dst[i] ++ ornot r31, r20, r20 ++ fsubd f0, f1, f10 C count -= 2 ++ fcpys f10,f10,f0 ++ ++ stl r20, 8(r19) C dst[i+1] ++ ornot r31, r21, r21 ++ unop ++ unop ++ ++ stl r21, 16(r19) C dst[i+2] ++ ornot r31, r22, r22 ++ ++ stl r22, 24(r19) C dst[i+3] ++ ornot r31, r23, r24 ++ ++ ldi r17, 32(r17) C src += 4 ++ ldi r19, 32(r19) C dst += 4 ++ unop ++ fbge f0, L(top) ++ ++ ++L(done_1): ++ C r19 &dst[size-1] ++ C r24 result for dst[size-1] ++ ++ stl r24, 0(r19) C dst[size-1] ++ ret r31, (r26), 1 ++ ++ ++L(done_2): ++ C r19 &dst[size-3] ++ C r20 src[size-2] ++ C r21 src[size-1] ++ C r24 result for dst[size-3] ++ ++ stl r24, 0(r19) C dst[size-3] ++ ornot r31, r20, r20 ++ ++ stl r20, 8(r19) C dst[size-2] ++ ornot r31, r21, r21 ++ ++ stl r21, 16(r19) C dst[size-1] ++ ret r31, (r26), 1 ++ ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/copyd.asm b/mpn/sw_64/copyd.asm +new file mode 100644 +index 0000000..ae29b1a +--- /dev/null ++++ b/mpn/sw_64/copyd.asm +@@ -0,0 +1,85 @@ ++dnl Sw_64 mpn_copyd -- copy, decrementing. ++ ++dnl Copyright 2002, 2003 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. 
++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++ ++ ++ASM_START() ++PROLOGUE(mpn_copyd) ++ s8addl r18,r16,r16 C E0 ++ s8addl r18,r17,r17 C E1 ++ ldi r18,-8(r18) C E0 ++ blt r18,$Lend C E1 ++$Loop: ldl r0,-8(r17) C E0 ++ ldl r1,-16(r17) C E1 ++ ldl r2,-24(r17) C E0 ++ ldl r3,-32(r17) C E1 ++ ldl r4,-40(r17) C E0 ++ ldl r5,-48(r17) C E1 ++ ldl r6,-56(r17) C E0 ++ ldl r7,-64(r17) C E1 ++ stl r0,-8(r16) C E0 ++ ldi r17,-64(r17) C E1 ++ stl r1,-16(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r2,-24(r16) C E0 ++ ldi r18,-8(r18) C E1 ++ stl r3,-32(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r4,-40(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r5,-48(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r6,-56(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r7,-64(r16) C E0 ++ ldi r16,-64(r16) C E1 ++ bge r18,$Loop C E1 ++$Lend: ldi r18,7(r18) C E0 ++ blt r18,$Lret C E1 ++ ldl r0,-8(r17) C E0 ++ beq r18,$Lend0 C E1 ++$Loop0: stl r0,-8(r16) C E0 ++ ldi r16,-8(r16) C E1 ++ ldl r0,-16(r17) C E0 ++ ldi r18,-1(r18) C E1 ++ ldi r17,-8(r17) C E0 ++ bgt r18,$Loop0 C E1 ++$Lend0: stl r0,-8(r16) C E0 ++$Lret: ret r31,(r26),1 C E1 ++EPILOGUE(mpn_copyd) ++ASM_END() +diff --git a/mpn/sw_64/copyi.asm b/mpn/sw_64/copyi.asm +new file mode 100644 +index 0000000..be1e7ac +--- /dev/null ++++ b/mpn/sw_64/copyi.asm +@@ -0,0 +1,84 @@ ++dnl Sw_64 mpn_copyi -- copy, incrementing. ++ ++dnl Copyright 2002, 2003 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 1 ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++ ++ ++ASM_START() ++PROLOGUE(mpn_copyi) ++ ldi r18,-8(r18) C E0 ++ blt r18,$Lend C E1 ++$Loop: ldl r0,0(r17) C E0 ++ ldl r1,8(r17) C E1 ++ ldl r2,16(r17) C E0 ++ ldl r3,24(r17) C E1 ++ ldl r4,32(r17) C E0 ++ ldl r5,40(r17) C E1 ++ ldl r6,48(r17) C E0 ++ ldl r7,56(r17) C E1 ++ stl r0,0(r16) C E0 ++ ldi r17,64(r17) C E1 ++ stl r1,8(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r2,16(r16) C E0 ++ ldi r18,-8(r18) C E1 ++ stl r3,24(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r4,32(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r5,40(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r6,48(r16) C E0 ++ bis r31, r31, r31 C E1 ++ stl r7,56(r16) C E0 ++ ldi r16,64(r16) C E1 ++ bge r18,$Loop C E1 ++$Lend: ldi r18,7(r18) C E0 ++ blt r18,$Lret C E1 ++ ldl r0,0(r17) C E0 ++ beq r18,$Lend0 C E1 ++$Loop0: stl r0,0(r16) C E0 ++ ldi r16,8(r16) C E1 ++ ldl r0,8(r17) C E0 ++ ldi r18,-1(r18) C E1 ++ ldi r17,8(r17) C E0 ++ bgt r18,$Loop0 C E1 ++$Lend0: stl r0,0(r16) C E0 ++$Lret: ret r31,(r26),1 C E1 ++EPILOGUE(mpn_copyi) ++ASM_END() +diff --git a/mpn/sw_64/dive_1.c b/mpn/sw_64/dive_1.c +new file mode 100644 +index 0000000..b00eeca +--- /dev/null ++++ b/mpn/sw_64/dive_1.c +@@ -0,0 +1,105 @@ ++/* Sw_64 mpn_divexact_1 -- mpn by limb exact division. ++ ++ THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST ++ CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN ++ FUTURE GNU MP RELEASES. ++ ++Copyright 2000-2003 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#include "gmp-impl.h" ++#include "longlong.h" ++ ++ ++/* cycles/limb ++ SW6: 15.0 ++*/ ++ ++ ++/* The dependent chain is as follows (the same as modexact), and this is ++ what the code runs as. ++ ++ The time to load src[i+1] and establish x hides under the umulh latency. 
*/ ++ ++void ++mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor) ++{ ++ mp_limb_t inverse, lshift_mask, s, sr, s_next, c, h, x, y, q, dummy; ++ unsigned rshift, lshift; ++ ++ ASSERT (size >= 1); ++ ASSERT (divisor != 0); ++ ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size)); ++ ASSERT_MPN (src, size); ++ ASSERT_LIMB (divisor); ++ ++ s_next = *src++; /* src[0] */ ++ ++ rshift = 0; ++ lshift_mask = 0; ++ if ((divisor & 1) == 0) ++ { ++ count_trailing_zeros (rshift, divisor); ++ lshift_mask = MP_LIMB_T_MAX; ++ divisor >>= rshift; ++ } ++ ++ binvert_limb (inverse, divisor); ++ lshift = 64 - rshift; ++ ++ c = 0; ++ h = 0; ++ sr = s_next >> rshift; ++ ++ size--; ++ if (LIKELY (size != 0)) ++ { ++ do ++ { ++ s_next = *src++; /* src[i+1] */ ++ s = sr | ((s_next << lshift) & lshift_mask); ++ x = s - c; ++ c = s < c; ++ sr = s_next >> rshift; ++ ++ y = x - h; ++ c += (x < h); ++ q = y * inverse; ++ *dst++ = q; ++ umul_ppmm (h, dummy, q, divisor); ++ ++ size--; ++ } ++ while (size != 0); ++ } ++ ++ x = sr - c; ++ y = x - h; ++ q = y * inverse; ++ *dst = q; /* dst[size-1] */ ++} +diff --git a/mpn/sw_64/divrem_2.asm b/mpn/sw_64/divrem_2.asm +new file mode 100644 +index 0000000..0b22bee +--- /dev/null ++++ b/mpn/sw_64/divrem_2.asm +@@ -0,0 +1,175 @@ ++dnl Sw_64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. ++ ++dnl Copyright 2007, 2008, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C norm frac ++C sw6 29 29 ++ ++C TODO ++C * Perhaps inline mpn_invert_limb, that would allow us to not save/restore ++C any registers (thus save ~10 cycles per call). ++C * Use negated d1 and/or d0 to speed carry propagation. Might save a cycle ++C or two. ++C * Check cluster delays (for sw6). We very likely could save some cycles. ++C * Use branch-free code for computing di. ++C * CAVEAT: We rely on r19 not being clobbered by mpn_invert_limb call. 
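Why q = y * inverse in mpn_divexact_1 above needs no adjustment step: for odd d, inverse is d^-1 mod 2^64, so an exact quotient comes back from a single multiply. A hypothetical standalone check of that identity (names invented, not library code):

    #include <assert.h>
    #include <stdint.h>

    int main (void)
    {
      uint64_t d = 0x1234567890abcdefULL | 1;  /* any odd divisor */

      /* binvert_limb idea: Newton iteration for d^-1 mod 2^64.
         inv = d is correct to the low 3 bits; each step doubles that,
         so 5 steps give all 64.  */
      uint64_t inv = d;
      for (int i = 0; i < 5; i++)
        inv *= 2 - d * inv;

      uint64_t q = 0xdeadbeef12345ULL;
      uint64_t nn = q * d;              /* an exact multiple (mod 2^64) */
      assert (nn * inv == q);           /* quotient back with one mull */
      return 0;
    }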
++ ++C INPUT PARAMETERS ++define(`qp', `r16') ++define(`fn', `r17') ++define(`up_param', `r18') ++define(`un_param', `r19') ++define(`dp', `r20') ++ ++ASM_START() ++PROLOGUE(mpn_divrem_2,gp) ++ ldi r30, -80(r30) ++ stl r26, 0(r30) ++ stl r9, 8(r30) ++ stl r10, 16(r30) ++ stl r11, 24(r30) ++ stl r12, 32(r30) ++ stl r13, 40(r30) ++C stl r14, 48(r30) ++ stl r15, 56(r30) ++ .prologue 1 ++ stl r16, 64(r30) ++ bis r31, r17, r15 ++ s8addl r19, r18, r13 ++ ldi r13, -24(r13) ++ ldl r12, 8(r20) ++ ldl r10, 0(r20) ++ ldl r11, 16(r13) ++ ldl r9, 8(r13) ++ ++ bis r31, r31, r3 C most_significant_q_limb = 0 ++ cmpult r11, r12, r1 ++ bne r1, L(L8) ++ cmpule r11, r12, r1 ++ cmpult r9, r10, r2 ++ and r1, r2, r1 ++ bne r1, L(L8) ++ subl r11, r12, r11 ++ subl r11, r2, r11 ++ subl r9, r10, r9 ++ ldi r3, 1(r31) C most_significant_q_limb = 1 ++L(L8): stl r3, 72(r30) ++ ++ addl r15, r19, r19 ++ ldi r19, -3(r19) ++ blt r19, L(L10) ++ bis r31, r12, r16 ++ call r26, mpn_invert_limb ++ LDGP( r29, 0(r26)) ++ mull r0, r12, r4 C t0 = LO(di * d1) ++ umulh r0, r10, r2 C s1 = HI(di * d0) ++ addl r4, r10, r4 C t0 += d0 ++ cmpule r10, r4, r7 C (t0 < d0) ++ addl r4, r2, r4 C t0 += s1 ++ cmpult r4, r2, r1 ++ subl r1, r7, r7 C t1 (-1, 0, or 1) ++ blt r7, L(L42) ++L(L22): ++ ldi r0, -1(r0) C di-- ++ cmpult r4, r12, r1 C cy for: t0 -= d1 (below) ++ subl r7, r1, r7 C t1 -= cy ++ subl r4, r12, r4 C t0 -= d1 ++ bge r7, L(L22) ++L(L42): ++ ldl r16, 64(r30) ++ s8addl r19, r16, r16 ++ ALIGN(16) ++L(loop): ++ mull r11, r0, r5 C q0 (early) ++ umulh r11, r0, r6 C q (early) ++ addl r5, r9, r8 C q0 += n1 ++ addl r6, r11, r6 C q += n2 ++ cmpult r8, r5, r1 C cy for: q0 += n1 ++ addl r6, r1, r6 C q += cy ++ unop ++ mull r12, r6, r1 C LO(d1 * q) ++ umulh r10, r6, r7 C t1 = HI(d0 * q) ++ subl r9, r1, r9 C n1 -= LO(d1 * q) ++ mull r10, r6, r4 C t0 = LO(d0 * q) ++ unop ++ cmple r15, r19, r5 C condition and n0... ++ beq r5, L(L31) ++ ldl r5, 0(r13) ++ ldi r13, -8(r13) ++L(L31): subl r9, r12, r9 C n1 -= d1 ++ cmpult r5, r10, r1 C ++ subl r9, r1, r9 C ++ subl r5, r10, r5 C n0 -= d0 ++ subl r9, r7, r9 C n1 -= t0 ++ cmpult r5, r4, r1 C ++ subl r9, r1, r2 C ++ subl r5, r4, r5 C n0 -= t1 ++ cmpult r2, r8, r1 C (n1 < q0) ++ addl r6, r1, r6 C q += cond ++ ldi r1, -1(r1) C -(n1 >= q0) ++ and r1, r10, r4 C ++ addl r5, r4, r9 C n0 += mask & d0 ++ and r1, r12, r1 C ++ cmpult r9, r5, r11 C cy for: n0 += mask & d0 ++ addl r2, r1, r1 C n1 += mask & d1 ++ addl r1, r11, r11 C n1 += cy ++ cmpult r11, r12, r1 C ++ beq r1, L(fix) C ++L(bck): stl r6, 0(r16) ++ ldi r16, -8(r16) ++ ldi r19, -1(r19) ++ bge r19, L(loop) ++ ++L(L10): stl r9, 8(r13) ++ stl r11, 16(r13) ++ ldl r0, 72(r30) ++ ldl r26, 0(r30) ++ ldl r9, 8(r30) ++ ldl r10, 16(r30) ++ ldl r11, 24(r30) ++ ldl r12, 32(r30) ++ ldl r13, 40(r30) ++C ldl r14, 48(r30) ++ ldl r15, 56(r30) ++ ldi r30, 80(r30) ++ ret r31, (r26), 1 ++ ++L(fix): cmpule r11, r12, r1 ++ cmpult r9, r10, r2 ++ and r1, r2, r1 ++ bne r1, L(bck) ++ subl r11, r12, r11 ++ subl r11, r2, r11 ++ subl r9, r10, r9 ++ ldi r6, 1(r6) ++ br L(bck) ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/invert_limb.asm b/mpn/sw_64/invert_limb.asm +new file mode 100644 +index 0000000..e1d1b88 +--- /dev/null ++++ b/mpn/sw_64/invert_limb.asm +@@ -0,0 +1,93 @@ ++dnl Sw_64 mpn_invert_limb -- Invert a normalized limb. ++ ++dnl Copyright 1996, 2000-2003, 2007, 2011, 2013 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. 
++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 71/72 ++ ++C This was compiler generated, with minimal manual edits. Surely several ++C cycles could be cut with some thought. ++ ++ASM_START() ++PROLOGUE(mpn_invert_limb,gp) ++ LEA( r2, approx_tab) ++ srl r16, 54, r1 ++ srl r16, 24, r4 ++ and r16, 1, r5 ++ bic r1, 1, r7 ++ ldi r4, 1(r4) ++ srl r16, 1, r3 ++ addl r7, r2, r1 ++ifelse(bwx_available_p,1,` ++ ldhu r0, -512(r1) ++',` ++ ldl_u r0, -512(r1) ++ ext1b r0, r7, r0 ++') ++ addl r3, r5, r3 ++ mulw r0, r0, r1 ++ sll r0, 11, r0 ++ mull r1, r4, r1 ++ srl r1, 40, r1 ++ subl r0, r1, r0 ++ ldi r0, -1(r0) ++ mull r0, r0, r2 ++ sll r0, 60, r1 ++ sll r0, 13, r0 ++ mull r2, r4, r2 ++ subl r1, r2, r1 ++ srl r1, 47, r1 ++ addl r0, r1, r0 ++ mull r0, r3, r3 ++ srl r0, 1, r1 ++ seleq r5, 0, r1,r1 ++ subl r1, r3, r1 ++ umulh r1, r0, r3 ++ sll r0, 31, r0 ++ srl r3, 1, r1 ++ addl r0, r1, r0 ++ mull r0, r16, r2 ++ umulh r0, r16, r3 ++ addl r2, r16, r1 ++ addl r3, r16, r16 ++ cmpult r1, r2, r1 ++ addl r16, r1, r3 ++ subl r0, r3, r0 ++ ret r31, (r26), 1 ++EPILOGUE() ++DATASTART(approx_tab,8) ++forloop(i,256,512-1,dnl ++` .word eval(0x7fd00/i) ++')dnl ++ SIZE(approx_tab, 512) ++ TYPE(approx_tab, object) ++DATAEND() ++ASM_END() +diff --git a/mpn/sw_64/lshift.asm b/mpn/sw_64/lshift.asm +new file mode 100644 +index 0000000..03dad97 +--- /dev/null ++++ b/mpn/sw_64/lshift.asm +@@ -0,0 +1,180 @@ ++dnl Sw_64 mpn_lshift -- Shift a number left. ++ ++dnl Copyright 1994, 1995, 2000, 2003, 2009 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 1.75 ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++C cnt r19 ++ ++ ++ASM_START() ++PROLOGUE(mpn_lshift) ++ s8addl r18,r17,r17 C make r17 point at end of s1 ++ ldl r4,-8(r17) C load first limb ++ subl r31,r19,r20 ++ s8addl r18,r16,r16 C make r16 point at end of RES ++ subl r18,1,r18 ++ and r18,4-1,r28 C number of limbs in first loop ++ srl r4,r20,r0 C compute function result ++ ++ beq r28,L(L0) ++ subl r18,r28,r18 ++ ++ ALIGN(8) ++L(top0): ++ ldl r3,-16(r17) ++ subl r16,8,r16 ++ sll r4,r19,r5 ++ subl r17,8,r17 ++ subl r28,1,r28 ++ srl r3,r20,r6 ++ bis r3,r3,r4 ++ bis r5,r6,r8 ++ stl r8,0(r16) ++ bne r28,L(top0) ++ ++L(L0): sll r4,r19,r24 ++ beq r18,L(end) ++C warm up phase 1 ++ ldl r1,-16(r17) ++ subl r18,4,r18 ++ ldl r2,-24(r17) ++ ldl r3,-32(r17) ++ ldl r4,-40(r17) ++C warm up phase 2 ++ srl r1,r20,r7 ++ sll r1,r19,r21 ++ srl r2,r20,r8 ++ beq r18,L(end1) ++ ldl r1,-48(r17) ++ sll r2,r19,r22 ++ ldl r2,-56(r17) ++ srl r3,r20,r5 ++ bis r7,r24,r7 ++ sll r3,r19,r23 ++ bis r8,r21,r8 ++ srl r4,r20,r6 ++ ldl r3,-64(r17) ++ sll r4,r19,r24 ++ ldl r4,-72(r17) ++ subl r18,4,r18 ++ beq r18,L(end2) ++ ALIGN(16) ++C main loop ++L(top): stl r7,-8(r16) ++ bis r5,r22,r5 ++ stl r8,-16(r16) ++ bis r6,r23,r6 ++ ++ srl r1,r20,r7 ++ subl r18,4,r18 ++ sll r1,r19,r21 ++ unop C ldl r31,-96(r17) ++ ++ srl r2,r20,r8 ++ ldl r1,-80(r17) ++ sll r2,r19,r22 ++ ldl r2,-88(r17) ++ ++ stl r5,-24(r16) ++ bis r7,r24,r7 ++ stl r6,-32(r16) ++ bis r8,r21,r8 ++ ++ srl r3,r20,r5 ++ unop C ldl r31,-96(r17) ++ sll r3,r19,r23 ++ subl r16,32,r16 ++ ++ srl r4,r20,r6 ++ ldl r3,-96(r17) ++ sll r4,r19,r24 ++ ldl r4,-104(r17) ++ ++ subl r17,32,r17 ++ bne r18,L(top) ++C cool down phase 2/1 ++L(end2): ++ stl r7,-8(r16) ++ bis r5,r22,r5 ++ stl r8,-16(r16) ++ bis r6,r23,r6 ++ srl r1,r20,r7 ++ sll r1,r19,r21 ++ srl r2,r20,r8 ++ sll r2,r19,r22 ++ stl r5,-24(r16) ++ bis r7,r24,r7 ++ stl r6,-32(r16) ++ bis r8,r21,r8 ++ srl r3,r20,r5 ++ sll r3,r19,r23 ++ srl r4,r20,r6 ++ sll r4,r19,r24 ++C cool down phase 2/2 ++ stl r7,-40(r16) ++ bis r5,r22,r5 ++ stl r8,-48(r16) ++ bis r6,r23,r6 ++ stl r5,-56(r16) ++ stl r6,-64(r16) ++C cool down phase 2/3 ++ stl r24,-72(r16) ++ ret r31,(r26),1 ++ ++C cool down phase 1/1 ++L(end1): ++ sll r2,r19,r22 ++ srl r3,r20,r5 ++ bis r7,r24,r7 ++ sll r3,r19,r23 ++ bis r8,r21,r8 ++ srl r4,r20,r6 ++ sll r4,r19,r24 ++C cool down phase 1/2 ++ stl r7,-8(r16) ++ bis r5,r22,r5 ++ stl r8,-16(r16) ++ bis r6,r23,r6 ++ stl r5,-24(r16) ++ stl r6,-32(r16) ++ stl r24,-40(r16) ++ ret r31,(r26),1 ++ ++L(end): stl r24,-8(r16) ++ ret r31,(r26),1 ++EPILOGUE(mpn_lshift) ++ASM_END() +diff --git a/mpn/sw_64/mod_34lsub1.asm b/mpn/sw_64/mod_34lsub1.asm +new file mode 100644 +index 0000000..8f7b049 +--- /dev/null ++++ b/mpn/sw_64/mod_34lsub1.asm +@@ -0,0 +1,162 @@ ++dnl Sw_64 mpn_mod_34lsub1. ++ ++dnl Copyright 2002 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. 
++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 1.67 ++ ++ ++dnl INPUT PARAMETERS ++dnl up r16 ++dnl n r17 ++ ++define(`l0',`r18') ++define(`l1',`r19') ++define(`l2',`r20') ++define(`a0',`r21') ++define(`a1',`r22') ++define(`a2',`r23') ++define(`c0',`r24') ++define(`c1',`r5') ++define(`c2',`r6') ++ ++ASM_START() ++PROLOGUE(mpn_mod_34lsub1) ++ bis r31, r31, c0 ++ bis r31, r31, c1 ++ bis r31, r31, c2 ++ ++ ldi r17, -3(r17) ++ bge r17, $L_3_or_more ++ bis r31, r31, a0 ++ bis r31, r31, a1 ++ bis r31, r31, a2 ++ br r31, $L_012 ++ ++$L_3_or_more: ++ ldl a0, 0(r16) ++ ldl a1, 8(r16) ++ ldl a2, 16(r16) ++ ldi r16, 24(r16) ++ ldi r17, -3(r17) ++ blt r17, $L_012 ++ ++$L_6_or_more: ++ ldl l0, 0(r16) ++ ldl l1, 8(r16) ++ ldl l2, 16(r16) ++ addl l0, a0, a0 ++ ++ ldi r16, 24(r16) ++ ldi r17, -3(r17) ++ blt r17, $L_end ++ ++ ALIGN(16) ++C Main loop ++$L_9_or_more: ++$Loop: cmpult a0, l0, r0 ++ ldl l0, 0(r16) ++ addl r0, c0, c0 ++ addl l1, a1, a1 ++ cmpult a1, l1, r0 ++ ldl l1, 8(r16) ++ addl r0, c1, c1 ++ addl l2, a2, a2 ++ cmpult a2, l2, r0 ++ ldl l2, 16(r16) ++ addl r0, c2, c2 ++ addl l0, a0, a0 ++ ldi r16, 24(r16) ++ ldi r17, -3(r17) ++ bge r17, $Loop ++ ++$L_end: cmpult a0, l0, r0 ++ addl r0, c0, c0 ++ addl l1, a1, a1 ++ cmpult a1, l1, r0 ++ addl r0, c1, c1 ++ addl l2, a2, a2 ++ cmpult a2, l2, r0 ++ addl r0, c2, c2 ++ ++C Handle the last (n mod 3) limbs ++$L_012: ldi r17, 2(r17) ++ blt r17, $L_0 ++ ldl l0, 0(r16) ++ addl l0, a0, a0 ++ cmpult a0, l0, r0 ++ addl r0, c0, c0 ++ beq r17, $L_0 ++ ldl l1, 8(r16) ++ addl l1, a1, a1 ++ cmpult a1, l1, r0 ++ addl r0, c1, c1 ++ ++C Align and sum our 3 main accumulators and 3 carry accumulators ++$L_0: srl a0, 48, r2 ++ srl a1, 32, r4 ++ifdef(`HAVE_LIMB_LITTLE_ENDIAN', ++` ins2b a1, 2, r1', C (a1 & 0xffffffff) << 16 ++` zapnot a1, 15, r25 ++ sll r25, 16, r1') ++ zapnot a0, 63, r0 C a0 & 0xffffffffffff ++ srl a2, 16, a1 ++ifdef(`HAVE_LIMB_LITTLE_ENDIAN', ++` ins1b a2, 4, r3', C (a2 & 0xffff) << 32 ++` zapnot a2, 3, r25 ++ sll r25, 32, r3') ++ addl r1, r4, r1 ++ addl r0, r2, r0 ++ srl c0, 32, a2 ++ifdef(`HAVE_LIMB_LITTLE_ENDIAN', ++` ins2b c0, 2, r4', C (c0 & 0xffffffff) << 16 ++` zapnot c0, 15, r25 ++ sll r25, 16, r4') ++ addl r0, r1, r0 ++ addl r3, a1, r3 ++ addl r0, r3, r0 ++ srl c1, 16, c0 ++ifdef(`HAVE_LIMB_LITTLE_ENDIAN', ++` ins1b c1, 4, r2', C (c1 & 0xffff) << 32 ++` zapnot c1, 3, r25 ++ sll r25, 32, r2') ++ addl r4, a2, r4 ++C srl c2, 48, r3 C This will be 0 in practise ++ zapnot c2, 63, r1 C r1 = c2 & 0xffffffffffff ++ addl r0, r4, r0 ++ addl r2, c0, r2 ++ addl r0, r2, r0 ++C addl r1, r3, r1 ++ addl r0, r1, r0 ++ ++ ret r31, (r26), 1 ++EPILOGUE(mpn_mod_34lsub1) ++ASM_END() +diff --git a/mpn/sw_64/mode1o.asm b/mpn/sw_64/mode1o.asm +new file mode 100644 +index 0000000..d5ed23f +--- /dev/null ++++ b/mpn/sw_64/mode1o.asm +@@ -0,0 +1,192 @@ ++dnl Sw_64 mpn_modexact_1c_odd -- mpn exact remainder ++ ++dnl Copyright 2003, 2004 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. 
++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. ++ ++include(`../config.m4') ++ ++ ++C cycles/limb ++C SW6: 15 ++ ++ ++C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, ++C mp_limb_t c) ++C In each case, the load latency, loop control, and extra carry bit handling ++C hide under the multiply latencies. Those latencies are long enough that ++C we don't need to worry about alignment or pairing to squeeze out ++C performance. ++C ++C For the first limb, some of the loop code is broken out and scheduled back ++C since it can be done earlier. ++C ++C - The first ldl src[0] is near the start of the routine, for maximum ++C time from memory. ++C ++C - The subl y=x-climb can be done without waiting for the inverse. ++C ++C - The mull y*inverse is replicated after the final subl for the inverse, ++C instead of branching to the mull in the main loop. 
++C
++C For the last limb, high<d is tested, and in that case the final umulh
++C can be replaced by a subtract and addback, shortening the chain.
++
++ASM_START()
++PROLOGUE(mpn_modexact_1c_odd,gp)
++ LEA( r0, binvert_limb_table)
++ srl r18, 1, r20 C d >> 1
++
++ and r20, 127, r20 C idx = d>>1 & 0x7F
++
++ addl r0, r20, r21 C table + idx
++
++ifelse(bwx_available_p,1,
++` ldbu r20, 0(r21) C table[idx], inverse 8 bits
++',`
++ ldl_u r20, 0(r21) C table[idx] qword
++ ext0b r20, r21, r20 C table[idx], inverse 8 bits
++')
++
++ mulw r20, r20, r7 C i*i
++ addl r20, r20, r20 C 2*i
++
++ ldl r2, 0(r16) C x = s = src[0]
++ ldi r17, -1(r17) C size--
++ clr r0 C initial cbit=0
++
++ mulw r7, r18, r7 C i*i*d
++
++ subl r20, r7, r20 C 2*i-i*i*d, inverse 16 bits
++
++ mulw r20, r20, r7 C i*i
++ addl r20, r20, r20 C 2*i
++
++ mulw r7, r18, r7 C i*i*d
++
++ subl r20, r7, r20 C 2*i-i*i*d, inverse 32 bits
++
++ mull r20, r20, r7 C i*i
++ addl r20, r20, r20 C 2*i
++
++ mull r7, r18, r7 C i*i*d
++ subl r2, r19, r3 C y = x - climb
++
++ subl r20, r7, r20 C inv = 2*i-i*i*d, inverse 64 bits
++
++ASSERT(r7, C should have d*inv==1 mod 2^64
++` mull r18, r20, r7
++ cmpeq r7, 1, r7')
++
++ mull r3, r20, r4 C first q = y * inv
++
++ beq r17, L(one) C if size==1
++ br L(entry)
++
++
++L(top):
++ C r0 cbit
++ C r16 src, incrementing
++ C r17 size, decrementing
++ C r18 d
++ C r19 climb
++ C r20 inv
++
++ ldl r1, 0(r16) C s = src[i]
++ subl r1, r0, r2 C x = s - cbit
++ cmpult r1, r0, r0 C new cbit = s < cbit
++
++ subl r2, r19, r3 C y = x - climb
++
++ mull r3, r20, r4 C q = y * inv
++L(entry):
++ cmpult r2, r19, r5 C cbit2 = x < climb
++ addl r5, r0, r0 C cbit += cbit2
++ ldi r16, 8(r16) C src++
++ ldi r17, -1(r17) C size--
++
++ umulh r4, r18, r19 C climb = high (q * d)
++ bne r17, L(top) C while 2 or more limbs left
++
++
++
++ C r0 cbit
++ C r18 d
++ C r19 climb
++ C r20 inv
++
++ ldl r1, 0(r16) C s = src[size-1] high limb
++
++ cmpult r1, r18, r2 C test high<d
++EPILOGUE()
++ASM_END()
+diff --git a/mpn/sw_64/sub_n.asm b/mpn/sw_64/sub_n.asm
+new file mode 100644
+--- /dev/null
++++ b/mpn/sw_64/sub_n.asm
++dnl Sw_64 mpn_sub_n -- Subtract two limb vectors of the same length > 0
++dnl and store difference in a third limb vector.
++
++dnl Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc.
++
++dnl This file is part of the GNU MP Library.
++dnl
++dnl The GNU MP Library is free software; you can redistribute it and/or modify
++dnl it under the terms of either:
++dnl
++dnl * the GNU Lesser General Public License as published by the Free
++dnl Software Foundation; either version 3 of the License, or (at your
++dnl option) any later version.
++dnl
++dnl or
++dnl
++dnl * the GNU General Public License as published by the Free Software
++dnl Foundation; either version 2 of the License, or (at your option) any
++dnl later version.
++dnl
++dnl or both in parallel, as here.
++dnl
++dnl The GNU MP Library is distributed in the hope that it will be useful, but
++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
++dnl for more details.
++dnl
++dnl You should have received copies of the GNU General Public License and the
++dnl GNU Lesser General Public License along with the GNU MP Library. If not,
++dnl see https://www.gnu.org/licenses/.
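++
++dnl Each limb takes two subtracts (the main subtract and the borrow
++dnl propagation) and two compares; sketched in hypothetical C,
++dnl illustrative only and not part of this file:
++dnl
++dnl    t  = s1 - s2;     /* main subtract */
++dnl    c1 = s1 < s2;     /* cy from main subtract */
++dnl    r  = t - cy;      /* carry subtract */
++dnl    c2 = t < cy;      /* cy from carry subtract */
++dnl    cy = c1 | c2;     /* bis suffices: c2 needs t == 0 and cy == 1,
++dnl                         and t == 0 forces c1 == 0 */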
++
++include(`../config.m4')
++
++C cycles/limb
++C SW6: 3
++
++dnl INPUT PARAMETERS
++dnl res_ptr r16
++dnl s1_ptr r17
++dnl s2_ptr r18
++dnl size r19
++
++ASM_START()
++PROLOGUE(mpn_sub_nc)
++ bis r31,r20,r25
++ br L(com)
++EPILOGUE()
++PROLOGUE(mpn_sub_n)
++ bis r31,r31,r25 C clear cy
++L(com): subl r19,4,r19 C decr loop cnt
++ blt r19,$Lend2 C if less than 4 limbs, goto 2nd loop
++C Start software pipeline for 1st loop
++ ldl r0,0(r18)
++ ldl r4,0(r17)
++ ldl r1,8(r18)
++ ldl r5,8(r17)
++ addl r17,32,r17 C update s1_ptr
++ subl r4,r0,r28 C 1st main subtract
++ ldl r2,16(r18)
++ subl r28,r25,r20 C 1st carry subtract
++ ldl r3,24(r18)
++ cmpult r4,r0,r8 C compute cy from last subtract
++ ldl r6,-16(r17)
++ cmpult r28,r25,r25 C compute cy from last subtract
++ ldl r7,-8(r17)
++ bis r8,r25,r25 C combine cy from the two subtracts
++ subl r19,4,r19 C decr loop cnt
++ subl r5,r1,r28 C 2nd main subtract
++ addl r18,32,r18 C update s2_ptr
++ subl r28,r25,r21 C 2nd carry subtract
++ cmpult r5,r1,r8 C compute cy from last subtract
++ blt r19,$Lend1 C if less than 4 limbs remain, jump
++C 1st loop handles groups of 4 limbs in a software pipeline
++ ALIGN(16)
++$Loop: cmpult r28,r25,r25 C compute cy from last subtract
++ ldl r0,0(r18)
++ bis r8,r25,r25 C combine cy from the two subtracts
++ ldl r1,8(r18)
++ subl r6,r2,r28 C 3rd main subtract
++ ldl r4,0(r17)
++ subl r28,r25,r22 C 3rd carry subtract
++ ldl r5,8(r17)
++ cmpult r6,r2,r8 C compute cy from last subtract
++ cmpult r28,r25,r25 C compute cy from last subtract
++ stl r20,0(r16)
++ bis r8,r25,r25 C combine cy from the two subtracts
++ stl r21,8(r16)
++ subl r7,r3,r28 C 4th main subtract
++ subl r28,r25,r23 C 4th carry subtract
++ cmpult r7,r3,r8 C compute cy from last subtract
++ cmpult r28,r25,r25 C compute cy from last subtract
++ addl r17,32,r17 C update s1_ptr
++ bis r8,r25,r25 C combine cy from the two subtracts
++ addl r16,32,r16 C update res_ptr
++ subl r4,r0,r28 C 1st main subtract
++ ldl r2,16(r18)
++ subl r28,r25,r20 C 1st carry subtract
++ ldl r3,24(r18)
++ cmpult r4,r0,r8 C compute cy from last subtract
++ ldl r6,-16(r17)
++ cmpult r28,r25,r25 C compute cy from last subtract
++ ldl r7,-8(r17)
++ bis r8,r25,r25 C combine cy from the two subtracts
++ subl r19,4,r19 C decr loop cnt
++ stl r22,-16(r16)
++ subl r5,r1,r28 C 2nd main subtract
++ stl r23,-8(r16)
++ subl r28,r25,r21 C 2nd carry subtract
++ addl r18,32,r18 C update s2_ptr
++ cmpult r5,r1,r8 C compute cy from last subtract
++ bge r19,$Loop
++C Finish software pipeline for 1st loop
++$Lend1: cmpult r28,r25,r25 C compute cy from last subtract
++ bis r8,r25,r25 C combine cy from the two subtracts
++ subl r6,r2,r28 C 3rd main subtract
++ subl r28,r25,r22 C 3rd carry subtract
++ cmpult r6,r2,r8 C compute cy from last subtract
++ cmpult r28,r25,r25 C compute cy from last subtract
++ stl r20,0(r16)
++ bis r8,r25,r25 C combine cy from the two subtracts
++ stl r21,8(r16)
++ subl r7,r3,r28 C 4th main subtract
++ subl r28,r25,r23 C 4th carry subtract
++ cmpult r7,r3,r8 C compute cy from last subtract
++ cmpult r28,r25,r25 C compute cy from last subtract
++ bis r8,r25,r25 C combine cy from the two subtracts
++ addl r16,32,r16 C update res_ptr
++ stl r22,-16(r16)
++ stl r23,-8(r16)
++$Lend2: addl r19,4,r19 C restore loop cnt
++ beq r19,$Lret
++C Start software pipeline for 2nd loop
++ ldl r0,0(r18)
++ ldl r4,0(r17)
++ subl r19,1,r19
++ beq r19,$Lend0
++C 2nd loop handles remaining 1-3 limbs
++ ALIGN(16)
++$Loop0: subl r4,r0,r28 C main subtract
++ cmpult r4,r0,r8 C compute cy from last subtract
++ ldl r0,8(r18)
++ ldl r4,8(r17) ++ subl r28,r25,r20 C carry subtract ++ addl r18,8,r18 ++ addl r17,8,r17 ++ stl r20,0(r16) ++ cmpult r28,r25,r25 C compute cy from last subtract ++ subl r19,1,r19 C decr loop cnt ++ bis r8,r25,r25 C combine cy from the two subtracts ++ addl r16,8,r16 ++ bne r19,$Loop0 ++$Lend0: subl r4,r0,r28 C main subtract ++ subl r28,r25,r20 C carry subtract ++ cmpult r4,r0,r8 C compute cy from last subtract ++ cmpult r28,r25,r25 C compute cy from last subtract ++ stl r20,0(r16) ++ bis r8,r25,r25 C combine cy from the two subtracts ++ ++$Lret: bis r25,r31,r0 C return cy ++ ret r31,(r26),1 ++EPILOGUE() ++ASM_END() +diff --git a/mpn/sw_64/submul_1.asm b/mpn/sw_64/submul_1.asm +new file mode 100644 +index 0000000..8558ed0 +--- /dev/null ++++ b/mpn/sw_64/submul_1.asm +@@ -0,0 +1,97 @@ ++dnl Sw_64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract ++dnl the result from a second limb vector. ++ ++dnl Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
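++
++dnl One limb of the loop, sketched in hypothetical C (illustrative only,
++dnl not part of this file; umul_ppmm stands for the mull/umulh pair and
++dnl v for the multiplier limb in r19):
++dnl
++dnl    umul_ppmm (prod_high, prod_low, s1_limb, v);
++dnl    prod_low += cy_limb;            /* add carry-in */
++dnl    c1 = prod_low < cy_limb;        /* carry out of that */
++dnl    r = *res_ptr;
++dnl    *res_ptr = r - prod_low;        /* subtract from res limb */
++dnl    c2 = r < prod_low;              /* borrow out of that */
++dnl    cy_limb = prod_high + c1 + c2;  /* next carry; fits in a limb */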
++ ++include(`../config.m4') ++ ++C cycles/limb ++C SW6: 7 ++ ++C INPUT PARAMETERS ++C rp r16 ++C up r17 ++C n r18 ++C limb r19 ++ ++ ++ASM_START() ++PROLOGUE(mpn_submul_1) ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ subl r18,1,r18 C size-- ++ mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ umulh r2,r19,r0 C r0 = prod_high ++ beq r18,$Lend1 C jump if size was == 1 ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ subl r18,1,r18 C size-- ++ subl r5,r3,r3 ++ cmpult r5,r3,r4 ++ stl r3,0(r16) ++ addl r16,8,r16 C res_ptr++ ++ beq r18,$Lend2 C jump if size was == 2 ++ ++ ALIGN(8) ++$Loop: mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ subl r18,1,r18 C size-- ++ umulh r2,r19,r4 C r4 = cy_limb ++ ldl r2,0(r17) C r2 = s1_limb ++ addl r17,8,r17 C s1_ptr++ ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ subl r5,r3,r3 ++ cmpult r5,r3,r5 ++ stl r3,0(r16) ++ addl r16,8,r16 C res_ptr++ ++ addl r5,r0,r0 C combine carries ++ bne r18,$Loop ++ ++$Lend2: mull r2,r19,r3 C r3 = prod_low ++ ldl r5,0(r16) C r5 = *res_ptr ++ addl r4,r0,r0 C cy_limb = cy_limb + 'cy' ++ umulh r2,r19,r4 C r4 = cy_limb ++ addl r3,r0,r3 C r3 = cy_limb + prod_low ++ cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) ++ subl r5,r3,r3 ++ cmpult r5,r3,r5 ++ stl r3,0(r16) ++ addl r5,r0,r0 C combine carries ++ addl r4,r0,r0 C cy_limb = prod_high + cy ++ ret r31,(r26),1 ++$Lend1: subl r5,r3,r3 ++ cmpult r5,r3,r5 ++ stl r3,0(r16) ++ addl r0,r5,r0 ++ ret r31,(r26),1 ++EPILOGUE(mpn_submul_1) ++ASM_END() +diff --git a/mpn/sw_64/umul.asm b/mpn/sw_64/umul.asm +new file mode 100644 +index 0000000..bb6d56c +--- /dev/null ++++ b/mpn/sw_64/umul.asm +@@ -0,0 +1,44 @@ ++dnl mpn_umul_ppmm -- 1x1->2 limb multiplication ++ ++dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc. ++ ++dnl This file is part of the GNU MP Library. ++dnl ++dnl The GNU MP Library is free software; you can redistribute it and/or modify ++dnl it under the terms of either: ++dnl ++dnl * the GNU Lesser General Public License as published by the Free ++dnl Software Foundation; either version 3 of the License, or (at your ++dnl option) any later version. ++dnl ++dnl or ++dnl ++dnl * the GNU General Public License as published by the Free Software ++dnl Foundation; either version 2 of the License, or (at your option) any ++dnl later version. ++dnl ++dnl or both in parallel, as here. ++dnl ++dnl The GNU MP Library is distributed in the hope that it will be useful, but ++dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++dnl for more details. ++dnl ++dnl You should have received copies of the GNU General Public License and the ++dnl GNU Lesser General Public License along with the GNU MP Library. If not, ++dnl see https://www.gnu.org/licenses/. 
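++
++dnl Usage sketch in C (illustrative only): the function returns the high
++dnl half of the double-limb product and stores the low half through the
++dnl pointer:
++dnl
++dnl    mp_limb_t lo, hi;
++dnl    hi = mpn_umul_ppmm (&lo, m1, m2);   /* hi:lo = m1 * m2 */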
++ ++include(`../config.m4') ++ ++ ++C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2); ++C ++ ++ASM_START() ++PROLOGUE(mpn_umul_ppmm) ++ mull r17, r18, r1 ++ umulh r17, r18, r0 ++ stl r1, 0(r16) ++ ret r31, (r26), 1 ++EPILOGUE() ++ASM_END() +-- +2.25.1 + diff --git a/gmp.spec b/gmp.spec index 1d672b0633c3bbda614fb4939b85a5146871e967..1de570a58c1e2fcbae3439b6a4b9c2631d8f8d54 100644 --- a/gmp.spec +++ b/gmp.spec @@ -1,6 +1,6 @@ Name: gmp Version: 6.3.0 -Release: 4 +Release: 5 Epoch: 1 URL: https://gmplib.org Source0: https://gmplib.org/download/gmp/gmp-%{version}.tar.xz @@ -8,6 +8,11 @@ License: (LGPL-3.0-or-later OR GPL-2.0-or-later OR (LGPL-3.0-or-later AND GPL-2. BuildRequires: gcc gcc-c++ make m4 Summary: A GNU multiple precision arithmetic library +# Sw64 Port +Patch0001: 0001-Sw64-Port-add-configure-support-for-sw64.patch +Patch0002: 0002-Sw64-Port-add-mpn-configure-support-for-sw64.patch +Patch0003: 0003-Sw64-Port-add-mpn-asm-support-for-sw64.patch + %description GMP is a portable library written in C for arbitrary precision arithmetic on integers, rational numbers, and floating-point numbers. It aims to provide @@ -83,6 +88,9 @@ export LD_LIBRARY_PATH=`pwd`/.libs %{_libdir}/libgmpxx.so.* %changelog +* Tue Jul 22 2025 swcompiler - 1:6.3.0-5 +- Sw64 Port gmp + * Tue Jul 01 2025 wangxiao - 1:6.3.0-4 - delete macros in changelog