diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 45471ee7e91900ba2e551f5a25eb56ef8de04885..0bb3257cba42998eab6b8ad76b9bd1d3cc7340c0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -17,6 +17,7 @@
 #include <linux/bio.h>
 #include <linux/sched/signal.h>
 #include <linux/migrate.h>
+#include <linux/math64.h>
 #include "trace.h"
 
 #include "../internal.h"
@@ -1044,11 +1045,10 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
 EXPORT_SYMBOL_GPL(iomap_zero_range);
 
 int
-iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-		const struct iomap_ops *ops)
+iomap_truncate_page(struct inode *inode, loff_t pos, unsigned int blocksize,
+		bool *did_zero, const struct iomap_ops *ops)
 {
-	unsigned int blocksize = i_blocksize(inode);
-	unsigned int off = pos & (blocksize - 1);
+	unsigned int off = rem_u64(pos, blocksize);
 
 	/* Block boundary? Nothing to do */
 	if (!off)
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 95bfe89651add0e260964d756fb295f9a9872d04..23c0e666d2f49389266fb9c0adf4c5a9466b7cb2 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -408,20 +408,18 @@ xfs_alloc_compute_diff(
  * Fix up the length, based on mod and prod.
  * len should be k * prod + mod for some k.
  * If len is too small it is returned unchanged.
- * If len hits maxlen it is left alone.
  */
-STATIC void
+static void
 xfs_alloc_fix_len(
-	xfs_alloc_arg_t	*args)	/* allocation argument structure */
+	struct xfs_alloc_arg	*args)
 {
-	xfs_extlen_t	k;
-	xfs_extlen_t	rlen;
+	xfs_extlen_t	k;
+	xfs_extlen_t	rlen = args->len;
 
 	ASSERT(args->mod < args->prod);
-	rlen = args->len;
 	ASSERT(rlen >= args->minlen);
 	ASSERT(rlen <= args->maxlen);
-	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
+	if (args->prod <= 1 || rlen < args->mod ||
 	    (args->mod == 0 && rlen < args->prod))
 		return;
 	k = rlen % args->prod;
@@ -2385,14 +2383,23 @@ xfs_alloc_space_available(
 	if (available < (int)max(args->total, alloc_len))
 		return false;
 
+	if (flags & XFS_ALLOC_FLAG_CHECK)
+		return true;
+
 	/*
-	 * Clamp maxlen to the amount of free space available for the actual
-	 * extent allocation.
+	 * If we can't do a maxlen allocation, then we must reduce the size of
+	 * the allocation to match the available free space. We know how big
+	 * the largest contiguous free space we can allocate is, so that's our
+	 * upper bound. However, we don't exactly know what alignment/size
+	 * constraints have been placed on the allocation, so we can't
+	 * arbitrarily select some new max size. Hence make this a minlen
+	 * allocation as we know that will definitely succeed and match the
+	 * caller's alignment constraints.
 	 */
-	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
-		args->maxlen = available;
+	alloc_len = args->maxlen + (args->alignment - 1) + args->minalignslop;
+	if (longest < alloc_len) {
+		args->maxlen = args->minlen;
 		ASSERT(args->maxlen > 0);
-		ASSERT(args->maxlen >= args->minlen);
 	}
 
 	return true;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 7682dfe2f7010b73bf3e6fa4d3f65b9892b2c339..1323259192d65825596c21faf6358e5a8d78fba0 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3253,32 +3253,51 @@ xfs_bmap_longest_free_extent(
 	return error;
 }
 
-static void
+static int
 xfs_bmap_select_minlen(
 	struct xfs_bmalloca	*ap,
 	struct xfs_alloc_arg	*args,
 	xfs_extlen_t		*blen,
 	int			notinit)
 {
+	xfs_extlen_t		nlen = 0;
+
+	/* Adjust best length for extent start alignment. */
+	if (*blen > args->alignment)
+		*blen -= args->alignment;
+
 	if (notinit || *blen < ap->minlen) {
 		/*
 		 * Since we did a BUF_TRYLOCK above, it is possible that
 		 * there is space for this request.
 		 */
-		args->minlen = ap->minlen;
+		nlen = ap->minlen;
 	} else if (*blen < args->maxlen) {
 		/*
 		 * If the best seen length is less than the request length,
 		 * use the best as the minimum.
 		 */
-		args->minlen = *blen;
+
+		nlen = *blen;
 	} else {
 		/*
 		 * Otherwise we've seen an extent as big as maxlen, use that
 		 * as the minimum.
 		 */
-		args->minlen = args->maxlen;
+		nlen = args->maxlen;
 	}
+
+	if (args->alignment > 1) {
+		nlen = rounddown(nlen, args->alignment);
+		if (nlen < ap->minlen) {
+			if (xfs_inode_forcealign(ap->ip) &&
+			    (ap->datatype & XFS_ALLOC_USERDATA))
+				return -ENOSPC;
+			nlen = ap->minlen;
+		}
+	}
+	args->minlen = nlen;
+	return 0;
 }
 
 STATIC int
@@ -3311,8 +3330,8 @@ xfs_bmap_btalloc_nullfb(
 		break;
 	}
 
-	xfs_bmap_select_minlen(ap, args, blen, notinit);
-	return 0;
+	error = xfs_bmap_select_minlen(ap, args, blen, notinit);
+	return error;
 }
 
 STATIC int
@@ -3349,7 +3368,9 @@ xfs_bmap_btalloc_filestreams(
 
 	}
 
-	xfs_bmap_select_minlen(ap, args, blen, notinit);
+	error = xfs_bmap_select_minlen(ap, args, blen, notinit);
+	if (error)
+		return error;
 
 	/*
 	 * Set the failure fallback case to look in the selected AG as stream
@@ -3419,9 +3440,8 @@ xfs_bmap_btalloc(
 	xfs_fileoff_t	orig_offset;
 	xfs_extlen_t	orig_length;
 	xfs_extlen_t	blen;
-	xfs_extlen_t	nextminlen = 0;
+	xfs_extlen_t	alignment;
 	int		nullfb;		/* true if ap->firstblock isn't set */
-	int		isaligned;
 	int		tryagain;
 	int		error;
 	int		stripe_align;
@@ -3480,7 +3500,7 @@ xfs_bmap_btalloc(
 	/*
 	 * Normal allocation, done through xfs_alloc_vextent.
 	 */
-	tryagain = isaligned = 0;
+	tryagain = 0;
 	memset(&args, 0, sizeof(args));
 	args.tp = ap->tp;
 	args.mp = mp;
@@ -3491,13 +3511,12 @@ xfs_bmap_btalloc(
 	 * xfs_get_cowextsz_hint() returns extsz_hint for when forcealign is
 	 * set as forcealign and cowextsz_hint are mutually exclusive
 	 */
-	if (xfs_inode_forcealign(ap->ip) && align) {
+	if (xfs_inode_forcealign(ap->ip))
 		args.alignment = align;
-		if (stripe_align == 0 || stripe_align % align)
-			stripe_align = align;
-	} else {
+	else if (stripe_align)
+		args.alignment = stripe_align;
+	else
 		args.alignment = 1;
-	}
 
 	/* Trim the allocation back to the maximum an AG can fit. */
 	args.maxlen = min(ap->length, mp->m_ag_max_usable);
@@ -3548,47 +3567,27 @@ xfs_bmap_btalloc(
 	 * is only set if the allocation length is >= the stripe unit and the
 	 * allocation offset is at the end of file.
 	 */
-	if (!(ap->tp->t_flags & XFS_TRANS_LOWMODE) && ap->aeof) {
-		if (!ap->offset) {
-			args.alignment = stripe_align;
-			atype = args.type;
-			isaligned = 1;
-			/*
-			 * Adjust minlen to try and preserve alignment if we
-			 * can't guarantee an aligned maxlen extent.
-			 */
-			if (blen > args.alignment &&
-			    blen <= args.maxlen + args.alignment)
-				args.minlen = blen - args.alignment;
-			args.minalignslop = 0;
-		} else {
-			/*
-			 * First try an exact bno allocation.
-			 * If it fails then do a near or start bno
-			 * allocation with alignment turned on.
-			 */
-			atype = args.type;
-			tryagain = 1;
-			args.type = XFS_ALLOCTYPE_THIS_BNO;
-			/*
-			 * Compute the minlen+alignment for the
-			 * next case.  Set slop so that the value
-			 * of minlen+alignment+slop doesn't go up
-			 * between the calls.
- */ - if (blen > stripe_align && blen <= args.maxlen) - nextminlen = blen - stripe_align; - else - nextminlen = args.minlen; - if (nextminlen + stripe_align > args.minlen + 1) - args.minalignslop = - nextminlen + stripe_align - - args.minlen - 1; - else - args.minalignslop = 0; + args.minalignslop = 0; + if (ap->tp->t_flags & XFS_TRANS_LOWMODE) { + if (args.alignment > 1 && xfs_inode_forcealign(ap->ip)) { + args.fsbno = NULLFSBLOCK; + goto alloc_out; } - } else { - args.minalignslop = 0; + args.alignment = 1; + } else if (ap->aeof && ap->offset) { + /* + * First try an exact bno allocation. + * If it fails then do a near or start bno + * allocation with alignment turned on. + */ + alignment = args.alignment; + atype = args.type; + tryagain = 1; + args.type = XFS_ALLOCTYPE_THIS_BNO; + args.fsbno = ap->blkno; + + args.alignment = 1; + args.minalignslop = alignment - args.alignment; } args.postallocs = 1; args.minleft = ap->minleft; @@ -3607,21 +3606,26 @@ xfs_bmap_btalloc( */ args.type = atype; args.fsbno = ap->blkno; - args.alignment = stripe_align; - args.minlen = nextminlen; + args.alignment = alignment; args.minalignslop = 0; - isaligned = 1; if ((error = xfs_alloc_vextent(&args))) return error; } - if (isaligned && args.fsbno == NULLFSBLOCK && - (args.alignment <= 1 || !xfs_inode_forcealign(ap->ip))) { + if (args.fsbno == NULLFSBLOCK && args.alignment > 1 && + xfs_inode_forcealign(ap->ip)) { + /* + * Don't attempting non-aligned fallbacks alloc + * for forcealign + */ + goto alloc_out; + } + + if (args.alignment > 1 && args.fsbno == NULLFSBLOCK) { /* * allocation failed, so turn off alignment and * try again. */ - args.type = atype; args.fsbno = ap->blkno; args.alignment = 0; if ((error = xfs_alloc_vextent(&args))) @@ -3643,6 +3647,8 @@ xfs_bmap_btalloc( return error; ap->tp->t_flags |= XFS_TRANS_LOWMODE; } + +alloc_out: if (args.fsbno != NULLFSBLOCK) { /* * check the allocation happened at the same or higher AG than @@ -3669,10 +3675,12 @@ xfs_bmap_btalloc( * very fragmented so we're unlikely to be able to satisfy the * hints anyway. 
 		 */
-		if (ap->length <= orig_length)
-			ap->offset = orig_offset;
-		else if (ap->offset + ap->length < orig_offset + orig_length)
-			ap->offset = orig_offset + orig_length - ap->length;
+		if (!(xfs_inode_forcealign(ap->ip) && align)) {
+			if (ap->length <= orig_length)
+				ap->offset = orig_offset;
+			else if (ap->offset + ap->length < orig_offset + orig_length)
+				ap->offset = orig_offset + orig_length - ap->length;
+		}
 		xfs_bmap_btalloc_accounting(ap, &args);
 	} else {
 		ap->blkno = NULLFSBLOCK;
@@ -5289,7 +5297,7 @@ __xfs_bunmapi(
 	isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
 	end = start + len;
 	if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1
-	    && S_ISREG(VFS_I(ip)->i_mode)) {
+	    && S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) {
 		start = roundup_64(start, ip->i_d.di_extsize);
 		end = rounddown_64(end, ip->i_d.di_extsize);
 		len = end - start;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a527a544a684170dd3ce57c1ddf09ddb0b0dfcc5..30dc960951cac30f0f20389621fa1d69c5317202 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -769,6 +769,8 @@ xfs_setattr_size(
 	int			error;
 	uint			lock_flags = 0;
 	bool			did_zeroing = false;
+	bool			write_back = false;
+	unsigned int		blocksize = 0;
 
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
@@ -776,6 +778,11 @@ xfs_setattr_size(
 	ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
 		ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
 
+	if (xfs_inode_forcealign(ip) && ip->i_d.di_extsize > 1)
+		blocksize = ip->i_d.di_extsize << inode->i_blkbits;
+	else
+		blocksize = i_blocksize(inode);
+
 	oldsize = inode->i_size;
 	newsize = iattr->ia_size;
 
@@ -805,21 +812,8 @@ xfs_setattr_size(
 	 */
 	inode_dio_wait(inode);
 
-	/*
-	 * File data changes must be complete before we start the transaction to
-	 * modify the inode. This needs to be done before joining the inode to
-	 * the transaction because the inode cannot be unlocked once it is a
-	 * part of the transaction.
-	 *
-	 * Start with zeroing any data beyond EOF that we may expose on file
-	 * extension, or zeroing out the rest of the block on a downward
-	 * truncate.
-	 */
-	if (newsize > oldsize) {
-		trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
-		error = iomap_zero_range(inode, oldsize, newsize - oldsize,
-				&did_zeroing, &xfs_buffered_write_iomap_ops);
-	} else {
+	write_back = newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size;
+	if (newsize < oldsize) {
 		/*
 		 * iomap won't detect a dirty page over an unwritten block (or a
 		 * cow block over a hole) and subsequently skips zeroing the
@@ -827,53 +821,69 @@ xfs_setattr_size(
 		 * convert the block before the pagecache truncate.
 		 */
 		error = filemap_write_and_wait_range(inode->i_mapping, newsize,
-				newsize);
+				roundup_64(newsize, blocksize) - 1);
 		if (error)
 			return error;
-		error = iomap_truncate_page(inode, newsize, &did_zeroing,
-				&xfs_buffered_write_iomap_ops);
-	}
-	if (error)
-		return error;
-
-	/*
-	 * We've already locked out new page faults, so now we can safely remove
-	 * pages from the page cache knowing they won't get refaulted until we
-	 * drop the XFS_MMAP_EXCL lock after the extent manipulations are
-	 * complete. The truncate_setsize() call also cleans partial EOF page
-	 * PTEs on extending truncates and hence ensures sub-page block size
-	 * filesystems are correctly handled, too.
-	 *
-	 * We have to do all the page cache truncate work outside the
-	 * transaction context as the "lock" order is page lock->log space
-	 * reservation as defined by extent allocation in the writeback path.
-	 * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but
-	 * having already truncated the in-memory version of the file (i.e. made
-	 * user visible changes). There's not much we can do about this, except
-	 * to hope that the caller sees ENOMEM and retries the truncate
-	 * operation.
-	 *
-	 * And we update in-core i_size and truncate page cache beyond newsize
-	 * before writeback the [di_size, newsize] range, so we're guaranteed
-	 * not to write stale data past the new EOF on truncate down.
-	 */
-	truncate_setsize(inode, newsize);
-
-	/*
-	 * We are going to log the inode size change in this transaction so
-	 * any previous writes that are beyond the on disk EOF and the new
-	 * EOF that have not been written out need to be written here. If we
-	 * do not write the data out, we expose ourselves to the null files
-	 * problem. Note that this includes any block zeroing we did above;
-	 * otherwise those blocks may not be zeroed after a crash.
-	 */
-	if (did_zeroing ||
-	    (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
-		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-				ip->i_d.di_size, newsize - 1);
+		error = iomap_truncate_page(inode, newsize, blocksize,
+				&did_zeroing, &xfs_buffered_write_iomap_ops);
 		if (error)
 			return error;
+		/*
+		 * We are going to log the inode size change in this transaction
+		 * so any previous writes that are beyond the on disk EOF and
+		 * the new EOF that have not been written out need to be written
+		 * here. If we do not write the data out, we expose ourselves
+		 * to the null files problem. Note that this includes any block
+		 * zeroing we did above; otherwise those blocks may not be
+		 * zeroed after a crash.
+		 */
+		if (did_zeroing || write_back) {
+			error = filemap_write_and_wait_range(inode->i_mapping,
+					min_t(loff_t, ip->i_d.di_size, newsize),
+					roundup_64(newsize, blocksize) - 1);
+			if (error)
+				return error;
+		}
+
+		/*
+		 * Update i_size only after writing back, to make sure the
+		 * zeroed blocks have been written out, then drop all the page
+		 * cache beyond the blocksize-aligned new EOF block.
+		 *
+		 * We've already locked out new page faults, so now we can
+		 * safely remove pages from the page cache knowing they won't
+		 * get refaulted until we drop the XFS_MMAP_EXCL lock after the
+		 * extent manipulations are complete.
+		 */
+		i_size_write(inode, newsize);
+		truncate_pagecache(inode, roundup_64(newsize, blocksize));
+	} else {
+		/*
+		 * Start with zeroing any data beyond EOF that we may expose on
+		 * file extension.
+		 */
+		if (newsize > oldsize) {
+			trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
+			error = iomap_zero_range(inode, oldsize, newsize - oldsize,
+					&did_zeroing, &xfs_buffered_write_iomap_ops);
+			if (error)
+				return error;
+		}
+
+		/*
+		 * The truncate_setsize() call also cleans partial EOF page
+		 * PTEs on extending truncates and hence ensures sub-page block
+		 * size filesystems are correctly handled, too.
+ */ + truncate_setsize(inode, newsize); + + if (did_zeroing || write_back) { + error = filemap_write_and_wait_range(inode->i_mapping, + ip->i_d.di_size, newsize - 1); + if (error) + return error; + } } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d43f76a4b99a433f17697cf44ed2507cef076d35..f2ff547e760c0b6e1ec547236bf5786c67987f50 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1658,10 +1658,19 @@ xfs_fc_fill_super( } } - if (xfs_has_forcealign(mp)) + if (xfs_has_forcealign(mp)) { xfs_warn(mp, "EXPERIMENTAL forced data extent alignment feature in use. Use at your own risk!"); + if (xfs_has_realtime(mp)) { + xfs_alert(mp, + "forcealign not supported for realtime device!"); + error = -EINVAL; + goto out_filestream_unmount; + } + + } + if (xfs_has_atomicwrites(mp)) xfs_warn(mp, "EXPERIMENTAL atomicwrites feature in use. Use at your own risk!"); @@ -1674,6 +1683,14 @@ xfs_fc_fill_super( goto out_filestream_unmount; } + if (xfs_has_forcealign(mp)) { + xfs_alert(mp, + "reflink not compatible with forcealign!"); + error = -EINVAL; + goto out_filestream_unmount; + } + + if (xfs_globals.always_cow) { xfs_info(mp, "using DEBUG-only always_cow mode."); mp->m_always_cow = true; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index d14a729d40ce3de937e8b41e9a083416bb5391a3..1b6e22741d43002e8ecbad1bc980da64101eb858 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -207,8 +207,8 @@ int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); -int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops); +int iomap_truncate_page(struct inode *inode, loff_t pos, unsigned int blocksize, + bool *did_zero, const struct iomap_ops *ops); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/include/linux/math64.h b/include/linux/math64.h index 66deb1fdc2ef641d1c4a917c212585be80d94751..b5c4d1df08e522248bca845f84f00c039f09ab4e 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -3,6 +3,7 @@ #define _LINUX_MATH64_H #include +#include #include #include @@ -11,6 +12,20 @@ #define div64_long(x, y) div64_s64((x), (y)) #define div64_ul(x, y) div64_u64((x), (y)) +/** + * rem_u64 - remainder of unsigned 64bit divide with 32bit divisor + * @dividend: unsigned 64bit dividend + * @divisor: unsigned 32bit divisor + * + * Return: dividend % divisor + */ +static inline u32 rem_u64(u64 dividend, u32 divisor) +{ + if (is_power_of_2(divisor)) + return dividend & (divisor - 1); + return dividend % divisor; +} + /** * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder * @dividend: unsigned 64bit dividend @@ -85,6 +100,15 @@ static inline s64 div64_s64(s64 dividend, s64 divisor) #define div64_long(x, y) div_s64((x), (y)) #define div64_ul(x, y) div_u64((x), (y)) +#ifndef rem_u64 +static inline u32 rem_u64(u64 dividend, u32 divisor) +{ + if (is_power_of_2(divisor)) + return dividend & (divisor - 1); + return do_div(dividend, divisor); +} +#endif + #ifndef div_u64_rem static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) {