diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 7bbebfa5b71039f4b6716656102418b80bc7cb33..7a872c77c03acce7057c92b10f6ff4e2ddee9af4 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -65,33 +65,4 @@ alternative_else_nop_endif
 9999: x; \
 _asm_extable_uaccess 9999b, l
-/*
- * Generate the assembly for LDTR/STTR with exception table entries.
- * This is complicated as there is no post-increment or pair versions of the
- * unprivileged instructions, and USER() only works for single instructions.
- */
- .macro user_ldp l, reg1, reg2, addr, post_inc
-8888: ldtr \reg1, [\addr];
-8889: ldtr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
-
- _asm_extable_uaccess 8888b, \l;
- _asm_extable_uaccess 8889b, \l;
- .endm
-
- .macro user_stp l, reg1, reg2, addr, post_inc
-8888: sttr \reg1, [\addr];
-8889: sttr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
-
- _asm_extable_uaccess 8888b,\l;
- _asm_extable_uaccess 8889b,\l;
- .endm
-
- .macro user_ldst l, inst, reg, addr, post_inc
-8888: \inst \reg, [\addr];
- add \addr, \addr, \post_inc;
-
- _asm_extable_uaccess 8888b, \l;
- .endm
 #endif
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 34e3179075244932422dc708508cd2d097b498fe..a80b8679c4b5894f8adbcd56d2b1f5db950c97a1 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -20,54 +20,219 @@
  * x0 - bytes not copied
  */
- .macro ldrb1 reg, ptr, val
- user_ldst 9998f, ldtrb, \reg, \ptr, \val
- .endm
+dstin .req x0
+end .req x5
+src .req x1
+srcin .req x15
+count .req x2
+tmp1 .req x3
+tmp1w .req w3
+tmp2 .req x4
+tmp2w .req w4
+dst .req x6
- .macro strb1 reg, ptr, val
- strb \reg, [\ptr], \val
- .endm
+A_l .req x7
+A_h .req x8
+B_l .req x9
+B_h .req x10
+C_l .req x11
+C_h .req x12
+D_l .req x13
+D_h .req x14
- .macro ldrh1 reg, ptr, val
- user_ldst 9997f, ldtrh, \reg, \ptr, \val
- .endm
+#define USER_OFF(off, x...) USER(fixup_offset_##off, x)
+#define FIXUP_OFFSET(n) \
+fixup_offset_##n: \
+ sub x0, end, dst; \
+ sub x0, x0, n; \
+ ret
- .macro strh1 reg, ptr, val
- strh \reg, [\ptr], \val
- .endm
+FIXUP_OFFSET(0)
+FIXUP_OFFSET(8)
+FIXUP_OFFSET(16)
+FIXUP_OFFSET(24)
+FIXUP_OFFSET(32)
+FIXUP_OFFSET(40)
+FIXUP_OFFSET(48)
+FIXUP_OFFSET(56)
- .macro ldr1 reg, ptr, val
- user_ldst 9997f, ldtr, \reg, \ptr, \val
- .endm
+SYM_FUNC_START(__arch_copy_from_user)
+ add end, x0, x2
+ mov srcin, x1
- .macro str1 reg, ptr, val
- str \reg, [\ptr], \val
- .endm
+ mov dst, dstin
+ cmp count, #16
+ /* When the length is less than 16 bytes, the accesses are not aligned. */
+ b.lo .Ltiny15
- .macro ldp1 reg1, reg2, ptr, val
- user_ldp 9997f, \reg1, \reg2, \ptr, \val
- .endm
+ neg tmp2, src
+ ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
+ b.eq .LSrcAligned
+ sub count, count, tmp2
+ /*
+ * Copy the leading memory data from src to dst in increasing
+ * address order. This eliminates the risk of overwriting source
+ * data when the distance between src and dst is less than 16.
+ * The memory accesses here are aligned.
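+ *
+ * tmp2 = (-src) & 15 is the number of bytes needed to bring src up
+ * to 16-byte alignment; its bits 0/1/2/3 select the 1-, 2-, 4- and
+ * 8-byte head copies below. For example, if src & 15 == 11 then
+ * tmp2 == 5 (0b0101), so one byte and then four bytes are copied
+ * before reaching .LSrcAligned.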
+ */
+ tbz tmp2, #0, 1f
+USER_OFF(0, ldtrb tmp1w, [src, #0])
+ strb tmp1w, [dst], #1
+ add src, src, #1
+1:
+ tbz tmp2, #1, 2f
+USER_OFF(0, ldtrh tmp1w, [src, #0])
+ strh tmp1w, [dst], #2
+ add src, src, #2
+2:
+ tbz tmp2, #2, 3f
+USER_OFF(0, ldtr tmp1w, [src, #0])
+ str tmp1w, [dst], #4
+ add src, src, #4
+3:
+ tbz tmp2, #3, .LSrcAligned
+USER_OFF(0, ldtr tmp1, [src, #0])
+ str tmp1, [dst], #8
+ add src, src, #8
- .macro stp1 reg1, reg2, ptr, val
- stp \reg1, \reg2, [\ptr], \val
- .endm
+.LSrcAligned:
+ cmp count, #64
+ b.ge .Lcpy_over64
+ /*
+ * Deal with small copies quickly by dropping straight into the
+ * exit block.
+ */
+.Ltail63:
+ /*
+ * Copy up to 48 bytes of data. At this point we only need the
+ * bottom 6 bits of count to be accurate.
+ */
+ ands tmp1, count, #0x30
+ b.eq .Ltiny15
+ USER_OFF(0, ldtr A_l, [src, #0])
+ USER_OFF(8, ldtr A_h, [src, #8])
+ cmp tmp1w, #0x20
+ b.eq 1f
+ b.lt 2f
+ stp A_l, A_h, [dst], #16
+ add src, src, #16
+ USER_OFF(0, ldtr A_l, [src, #0])
+ USER_OFF(8, ldtr A_h, [src, #8])
+1:
+ stp A_l, A_h, [dst], #16
+ add src, src, #16
+ USER_OFF(0, ldtr A_l, [src, #0])
+ USER_OFF(8, ldtr A_h, [src, #8])
+2:
+ stp A_l, A_h, [dst], #16
+ add src, src, #16
+.Ltiny15:
+ /*
+ * Prefer to break one ldp/stp into several loads/stores that access
+ * memory in increasing address order, rather than loading/storing 16
+ * bytes from (src-16) to (dst-16) and winding src back to an aligned
+ * address, as the original Cortex memcpy does. If that scheme were
+ * kept, memmove would need to guarantee that src is at least 16
+ * bytes above dst, otherwise some source data would be overwritten
+ * when memmove calls memcpy directly. To keep memmove simple and
+ * decouple memcpy from memmove, that sequence was dropped.
+ */
+ tbz count, #3, 1f
+USER_OFF(0, ldtr tmp1, [src, #0])
+ str tmp1, [dst], #8
+ add src, src, #8
+1:
+ tbz count, #2, 2f
+USER_OFF(0, ldtr tmp1w, [src, #0])
+ str tmp1w, [dst], #4
+ add src, src, #4
+2:
+ tbz count, #1, 3f
+USER_OFF(0, ldtrh tmp1w, [src, #0])
+ strh tmp1w, [dst], #2
+ add src, src, #2
+3:
+ tbz count, #0, .Lexitfunc
+USER_OFF(0, ldtrb tmp1w, [src, #0])
+ strb tmp1w, [dst], #1
+ add src, src, #1
-end .req x5
-srcin .req x15
-SYM_FUNC_START(__arch_copy_from_user)
- add end, x0, x2
- mov srcin, x1
-#include "copy_template.S"
+ b .Lexitfunc
+
+.Lcpy_over64:
+ .p2align L1_CACHE_SHIFT
+ USER_OFF(0, ldtr A_l, [src, #0])
+ USER_OFF(8, ldtr A_h, [src, #8])
+ subs count, count, #128
+ b.ge .Lcpy_body_large
+ /*
+ * Less than 128 bytes to copy, so handle 64 here and then jump
+ * to the tail.
+ */
+ stp A_l, A_h, [dst, #0]
+USER_OFF(16, ldtr B_l, [src, #16])
+USER_OFF(24, ldtr B_h, [src, #24])
+USER_OFF(32, ldtr C_l, [src, #32])
+USER_OFF(40, ldtr C_h, [src, #40])
+ stp B_l, B_h, [dst, #16]
+ stp C_l, C_h, [dst, #32]
+USER_OFF(48, ldtr D_l, [src, #48])
+USER_OFF(56, ldtr D_h, [src, #56])
+ add src, src, #64
+ stp D_l, D_h, [dst, #48]
+ add dst, dst, #64
+
+ tst count, #0x3f
+ b.ne .Ltail63
+ b .Lexitfunc
+
+ /*
+ * Critical loop. Start at a new cache line boundary. Assuming
+ * 64 bytes per line this ensures the entire loop is in one line.
+ */
+.Lcpy_body_large:
+ /*
+ * Pre-load the first 64 bytes of data.
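+ * A_l/A_h were already loaded at .Lcpy_over64; B, C and D are
+ * loaded here so that the loop below can store one 64-byte block
+ * while the next block is being loaded.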
+ */
+USER_OFF(16, ldtr B_l, [src, #16])
+USER_OFF(24, ldtr B_h, [src, #24])
+USER_OFF(32, ldtr C_l, [src, #32])
+USER_OFF(40, ldtr C_h, [src, #40])
+USER_OFF(48, ldtr D_l, [src, #48])
+USER_OFF(56, ldtr D_h, [src, #56])
+ add src, src, #64
+
+1:
+ /*
+ * Interleave the load of the next 64-byte block with the store of
+ * the previously loaded 64 bytes.
+ */
+ stp A_l, A_h, [dst, #0]
+USER_OFF(0, ldtr A_l, [src, #0])
+USER_OFF(8, ldtr A_h, [src, #8])
+ stp B_l, B_h, [dst, #16]
+USER_OFF(16, ldtr B_l, [src, #16])
+USER_OFF(24, ldtr B_h, [src, #24])
+ stp C_l, C_h, [dst, #32]
+USER_OFF(32, ldtr C_l, [src, #32])
+USER_OFF(40, ldtr C_h, [src, #40])
+ stp D_l, D_h, [dst, #48]
+USER_OFF(48, ldtr D_l, [src, #48])
+ add dst, dst, #64
+USER_OFF(56, ldtr D_h, [src, #56])
+ add src, src, #64
+ subs count, count, #64
+ b.ge 1b
+ stp A_l, A_h, [dst, #0]
+ stp B_l, B_h, [dst, #16]
+ stp C_l, C_h, [dst, #32]
+ stp D_l, D_h, [dst, #48]
+ add dst, dst, #64
+
+ tst count, #0x3f
+ b.ne .Ltail63
+.Lexitfunc:
 mov x0, #0 // Nothing to copy
 ret
- // Exception fixups
-9997: cmp dst, dstin
- b.ne 9998f
- // Before being absolutely sure we couldn't copy anything, try harder
-USER(9998f, ldtrb tmp1w, [srcin])
- strb tmp1w, [dst], #1
-9998: sub x0, end, dst // bytes not copied
- ret
 SYM_FUNC_END(__arch_copy_from_user)
 EXPORT_SYMBOL(__arch_copy_from_user)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 2ac716c0d6d8cba5df698f7825dc19a75e53db78..7b69dece56f6d5c2996ecf2c67fd29782085200c 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -19,55 +19,219 @@
  * Returns:
  * x0 - bytes not copied
  */
- .macro ldrb1 reg, ptr, val
- KERNEL_ME_SAFE(9998f, ldrb \reg, [\ptr], \val)
- .endm
- .macro strb1 reg, ptr, val
- user_ldst 9998f, sttrb, \reg, \ptr, \val
- .endm
-
- .macro ldrh1 reg, ptr, val
- KERNEL_ME_SAFE(9998f, ldrh \reg, [\ptr], \val)
- .endm
-
- .macro strh1 reg, ptr, val
- user_ldst 9997f, sttrh, \reg, \ptr, \val
- .endm
-
- .macro ldr1 reg, ptr, val
- KERNEL_ME_SAFE(9998f, ldr \reg, [\ptr], \val)
- .endm
+dstin .req x0
+src .req x1
+end .req x5
+srcin .req x15
+count .req x2
+tmp1 .req x3
+tmp1w .req w3
+tmp2 .req x4
+tmp2w .req w4
+dst .req x6
- .macro str1 reg, ptr, val
- user_ldst 9997f, sttr, \reg, \ptr, \val
- .endm
+A_l .req x7
+A_h .req x8
+B_l .req x9
+B_h .req x10
+C_l .req x11
+C_h .req x12
+D_l .req x13
+D_h .req x14
- .macro ldp1 reg1, reg2, ptr, val
- KERNEL_ME_SAFE(9998f, ldp \reg1, \reg2, [\ptr], \val)
- .endm
+#define USER_OFF(off, x...) USER(fixup_offset_##off, x)
+#define FIXUP_OFFSET(n) \
+fixup_offset_##n: \
+ sub x0, end, dst; \
+ sub x0, x0, n; \
+ ret
- .macro stp1 reg1, reg2, ptr, val
- user_stp 9997f, \reg1, \reg2, \ptr, \val
- .endm
+FIXUP_OFFSET(0)
+FIXUP_OFFSET(8)
+FIXUP_OFFSET(16)
+FIXUP_OFFSET(24)
+FIXUP_OFFSET(32)
+FIXUP_OFFSET(40)
+FIXUP_OFFSET(48)
+FIXUP_OFFSET(56)
-end .req x5
-srcin .req x15
 SYM_FUNC_START(__arch_copy_to_user)
 add end, x0, x2
 mov srcin, x1
-#include "copy_template.S"
+ mov dst, dstin
+ cmp count, #16
+ /* When the length is less than 16 bytes, the accesses are not aligned. */
+ b.lo .Ltiny15
+
+ neg tmp2, src
+ ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
+ b.eq .LSrcAligned
+ sub count, count, tmp2
+ /*
+ * Copy the leading memory data from src to dst in increasing
+ * address order. This eliminates the risk of overwriting source
+ * data when the distance between src and dst is less than 16.
+ * The memory accesses here are aligned.
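+ *
+ * Unlike __arch_copy_from_user, the kernel buffer is the source
+ * here, so the loads use plain ldrb/ldrh/ldr, while the stores to
+ * the user buffer use the unprivileged sttr* forms wrapped in
+ * USER_OFF() so that a faulting store is routed to the matching
+ * fixup_offset_n handler.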
+ */
+ tbz tmp2, #0, 1f
+ ldrb tmp1w, [src], #1
+USER_OFF(0, sttrb tmp1w, [dst, #0])
+ add dst, dst, #1
+1:
+ tbz tmp2, #1, 2f
+ ldrh tmp1w, [src], #2
+USER_OFF(0, sttrh tmp1w, [dst, #0])
+ add dst, dst, #2
+2:
+ tbz tmp2, #2, 3f
+ ldr tmp1w, [src], #4
+USER_OFF(0, sttr tmp1w, [dst, #0])
+ add dst, dst, #4
+3:
+ tbz tmp2, #3, .LSrcAligned
+ ldr tmp1, [src], #8
+USER_OFF(0, sttr tmp1, [dst, #0])
+ add dst, dst, #8
+
+.LSrcAligned:
+ cmp count, #64
+ b.ge .Lcpy_over64
+ /*
+ * Deal with small copies quickly by dropping straight into the
+ * exit block.
+ */
+.Ltail63:
+ /*
+ * Copy up to 48 bytes of data. At this point we only need the
+ * bottom 6 bits of count to be accurate.
+ */
+ ands tmp1, count, #0x30
+ b.eq .Ltiny15
+ ldp A_l, A_h, [src], #16
+ cmp tmp1w, #0x20
+ b.eq 1f
+ b.lt 2f
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+ ldp A_l, A_h, [src], #16
+ add dst, dst, #16
+1:
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+ ldp A_l, A_h, [src], #16
+ add dst, dst, #16
+2:
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+ add dst, dst, #16
+.Ltiny15:
+ /*
+ * Prefer to break one ldp/stp into several loads/stores that access
+ * memory in increasing address order, rather than loading/storing 16
+ * bytes from (src-16) to (dst-16) and winding src back to an aligned
+ * address, as the original Cortex memcpy does. If that scheme were
+ * kept, memmove would need to guarantee that src is at least 16
+ * bytes above dst, otherwise some source data would be overwritten
+ * when memmove calls memcpy directly. To keep memmove simple and
+ * decouple memcpy from memmove, that sequence was dropped.
+ */
+ tbz count, #3, 1f
+ ldr tmp1, [src], #8
+USER_OFF(0, sttr tmp1, [dst, #0])
+ add dst, dst, #8
+1:
+ tbz count, #2, 2f
+ ldr tmp1w, [src], #4
+USER_OFF(0, sttr tmp1w, [dst, #0])
+ add dst, dst, #4
+2:
+ tbz count, #1, 3f
+ ldrh tmp1w, [src], #2
+USER_OFF(0, sttrh tmp1w, [dst, #0])
+ add dst, dst, #2
+3:
+ tbz count, #0, .Lexitfunc
+ ldrb tmp1w, [src], #1
+USER_OFF(0, sttrb tmp1w, [dst, #0])
+ add dst, dst, #1
+
+ b .Lexitfunc
+
+.Lcpy_over64:
+ .p2align L1_CACHE_SHIFT
+ ldp A_l, A_h, [src, #0]
+ subs count, count, #128
+ b.ge .Lcpy_body_large
+ /*
+ * Less than 128 bytes to copy, so handle 64 here and then jump
+ * to the tail.
+ */
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+USER_OFF(16, sttr B_l, [dst, #16])
+USER_OFF(24, sttr B_h, [dst, #24])
+USER_OFF(32, sttr C_l, [dst, #32])
+USER_OFF(40, sttr C_h, [dst, #40])
+ ldp D_l, D_h, [src, #48]
+ add src, src, #64
+USER_OFF(48, sttr D_l, [dst, #48])
+USER_OFF(56, sttr D_h, [dst, #56])
+ add dst, dst, #64
+
+ tst count, #0x3f
+ b.ne .Ltail63
+ b .Lexitfunc
+
+ /*
+ * Critical loop. Start at a new cache line boundary. Assuming
+ * 64 bytes per line this ensures the entire loop is in one line.
+ */
+.Lcpy_body_large:
+ /* Pre-load the first 64 bytes of data (A_l/A_h were loaded above). */
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+ ldp D_l, D_h, [src, #48]
+ add src, src, #64
+1:
+ /*
+ * Interleave the load of the next 64-byte block with the store of
+ * the previously loaded 64 bytes.
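+ *
+ * Each sttr in the loop is tagged with its offset from dst, so if
+ * a store faults, the corresponding fixup_offset_n handler reports
+ * end - (dst + n) bytes as not copied.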
+ */ +USER_OFF(0, sttr A_l, [dst, #0]) +USER_OFF(8, sttr A_h, [dst, #8]) + ldp A_l, A_h, [src, #0] +USER_OFF(16, sttr B_l, [dst, #16]) +USER_OFF(24, sttr B_h, [dst, #24]) + ldp B_l, B_h, [src, #16] +USER_OFF(32, sttr C_l, [dst, #32]) +USER_OFF(40, sttr C_h, [dst, #40]) + ldp C_l, C_h, [src, #32] +USER_OFF(48, sttr D_l, [dst, #48]) +USER_OFF(56, sttr D_h, [dst, #56]) + add dst, dst, #64 + ldp D_l, D_h, [src, #48] + add src, src, #64 + subs count, count, #64 + b.ge 1b +USER_OFF(0, sttr A_l, [dst, #0]) +USER_OFF(8, sttr A_h, [dst, #8]) +USER_OFF(16, sttr B_l, [dst, #16]) +USER_OFF(24, sttr B_h, [dst, #24]) +USER_OFF(32, sttr C_l, [dst, #32]) +USER_OFF(40, sttr C_h, [dst, #40]) +USER_OFF(48, sttr D_l, [dst, #48]) +USER_OFF(56, sttr D_h, [dst, #56]) + add dst, dst, #64 + + tst count, #0x3f + b.ne .Ltail63 +.Lexitfunc: mov x0, #0 ret - // Exception fixups -9997: cmp dst, dstin - b.ne 9998f - // Before being absolutely sure we couldn't copy anything, try harder -KERNEL_ME_SAFE(9998f, ldrb tmp1w, [srcin]) -USER(9998f, sttrb tmp1w, [dst]) - add dst, dst, #1 -9998: sub x0, end, dst // bytes not copied - ret SYM_FUNC_END(__arch_copy_to_user) EXPORT_SYMBOL(__arch_copy_to_user)