diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 7bbebfa5b71039f4b6716656102418b80bc7cb33..7a872c77c03acce7057c92b10f6ff4e2ddee9af4 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -65,33 +65,4 @@ alternative_else_nop_endif
 9999: x; \
 _asm_extable_uaccess 9999b, l
-/*
- * Generate the assembly for LDTR/STTR with exception table entries.
- * This is complicated as there is no post-increment or pair versions of the
- * unprivileged instructions, and USER() only works for single instructions.
- */
- .macro user_ldp l, reg1, reg2, addr, post_inc
-8888: ldtr \reg1, [\addr];
-8889: ldtr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
-
- _asm_extable_uaccess 8888b, \l;
- _asm_extable_uaccess 8889b, \l;
- .endm
-
- .macro user_stp l, reg1, reg2, addr, post_inc
-8888: sttr \reg1, [\addr];
-8889: sttr \reg2, [\addr, #8];
- add \addr, \addr, \post_inc;
-
- _asm_extable_uaccess 8888b,\l;
- _asm_extable_uaccess 8889b,\l;
- .endm
-
- .macro user_ldst l, inst, reg, addr, post_inc
-8888: \inst \reg, [\addr];
- add \addr, \addr, \post_inc;
-
- _asm_extable_uaccess 8888b, \l;
- .endm
 #endif
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 34e3179075244932422dc708508cd2d097b498fe..a80b8679c4b5894f8adbcd56d2b1f5db950c97a1 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -20,54 +20,219 @@
  * x0 - bytes not copied
  */
- .macro ldrb1 reg, ptr, val
- user_ldst 9998f, ldtrb, \reg, \ptr, \val
- .endm
+dstin .req x0
+end .req x5
+src .req x1
+srcin .req x15
+count .req x2
+tmp1 .req x3
+tmp1w .req w3
+tmp2 .req x4
+tmp2w .req w4
+dst .req x6
- .macro strb1 reg, ptr, val
- strb \reg, [\ptr], \val
- .endm
+A_l .req x7
+A_h .req x8
+B_l .req x9
+B_h .req x10
+C_l .req x11
+C_h .req x12
+D_l .req x13
+D_h .req x14
- .macro ldrh1 reg, ptr, val
- user_ldst 9997f, ldtrh, \reg, \ptr, \val
- .endm
+#define USER_OFF(off, x...) USER(fixup_offset_##off, x)
+#define FIXUP_OFFSET(n) \
+fixup_offset_##n: \
+ sub x0, end, dst; \
+ sub x0, x0, n; \
+ ret
- .macro strh1 reg, ptr, val
- strh \reg, [\ptr], \val
- .endm
+FIXUP_OFFSET(0)
+FIXUP_OFFSET(8)
+FIXUP_OFFSET(16)
+FIXUP_OFFSET(24)
+FIXUP_OFFSET(32)
+FIXUP_OFFSET(40)
+FIXUP_OFFSET(48)
+FIXUP_OFFSET(56)
- .macro ldr1 reg, ptr, val
- user_ldst 9997f, ldtr, \reg, \ptr, \val
- .endm
+SYM_FUNC_START(__arch_copy_from_user)
+ add end, x0, x2
+ mov srcin, x1
- .macro str1 reg, ptr, val
- str \reg, [\ptr], \val
- .endm
+ mov dst, dstin
+ cmp count, #16
+ /* When the length is less than 16 bytes, the accesses are not aligned. */
+ b.lo .Ltiny15
- .macro ldp1 reg1, reg2, ptr, val
- user_ldp 9997f, \reg1, \reg2, \ptr, \val
- .endm
+ neg tmp2, src
+ ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
+ b.eq .LSrcAligned
+ sub count, count, tmp2
+ /*
+ * Copy the leading memory data from src to dst in increasing
+ * address order. This eliminates the risk of overwriting source
+ * data when the distance between src and dst is less than 16.
+ * The memory accesses here are aligned.
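+ *
+ * tmp2 = (-src) & 15 is the number of bytes needed to bring src up
+ * to 16-byte alignment; its bits 0/1/2/3 select the 1-, 2-, 4- and
+ * 8-byte head copies below. For example, if src & 15 == 11 then
+ * tmp2 == 5 (0b0101), so one byte and then four bytes are copied
+ * before reaching .LSrcAligned.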
+ */
+ tbz tmp2, #0, 1f
+USER_OFF(0, ldtrb tmp1w, [src, #0])
+ strb tmp1w, [dst], #1
+ add src, src, #1
+1:
+ tbz tmp2, #1, 2f
+USER_OFF(0, ldtrh tmp1w, [src, #0])
+ strh tmp1w, [dst], #2
+ add src, src, #2
+2:
+ tbz tmp2, #2, 3f
+USER_OFF(0, ldtr tmp1w, [src, #0])
+ str tmp1w, [dst], #4
+ add src, src, #4
+3:
+ tbz tmp2, #3, .LSrcAligned
+USER_OFF(0, ldtr tmp1, [src, #0])
+ str tmp1, [dst], #8
+ add src, src, #8
- .macro stp1 reg1, reg2, ptr, val
- stp \reg1, \reg2, [\ptr], \val
- .endm
+.LSrcAligned:
+ cmp count, #64
+ b.ge .Lcpy_over64
+ /*
+ * Deal with small copies quickly by dropping straight into the
+ * exit block.
+ */
+.Ltail63:
+ /*
+ * Copy up to 48 bytes of data. At this point we only need the
+ * bottom 6 bits of count to be accurate.
+ */
+ ands tmp1, count, #0x30
+ b.eq .Ltiny15
+ USER_OFF(0, ldtr A_l, [src, #0])
+ USER_OFF(8, ldtr A_h, [src, #8])
+ cmp tmp1w, #0x20
+ b.eq 1f
+ b.lt 2f
+ stp A_l, A_h, [dst], #16
+ add src, src, #16
+ USER_OFF(0, ldtr A_l, [src, #0])
+ USER_OFF(8, ldtr A_h, [src, #8])
+1:
+ stp A_l, A_h, [dst], #16
+ add src, src, #16
+ USER_OFF(0, ldtr A_l, [src, #0])
+ USER_OFF(8, ldtr A_h, [src, #8])
+2:
+ stp A_l, A_h, [dst], #16
+ add src, src, #16
+.Ltiny15:
+ /*
+ * Prefer to break one ldp/stp into several loads/stores that access
+ * memory in increasing address order, rather than loading/storing 16
+ * bytes from (src-16) to (dst-16) and winding src back to an aligned
+ * address, as the original Cortex memcpy does. If that scheme were
+ * kept, memmove would need to guarantee that src is at least 16
+ * bytes above dst, otherwise some source data would be overwritten
+ * when memmove calls memcpy directly. To keep memmove simple and
+ * decouple memcpy from memmove, that sequence was dropped.
+ */
+ tbz count, #3, 1f
+USER_OFF(0, ldtr tmp1, [src, #0])
+ str tmp1, [dst], #8
+ add src, src, #8
+1:
+ tbz count, #2, 2f
+USER_OFF(0, ldtr tmp1w, [src, #0])
+ str tmp1w, [dst], #4
+ add src, src, #4
+2:
+ tbz count, #1, 3f
+USER_OFF(0, ldtrh tmp1w, [src, #0])
+ strh tmp1w, [dst], #2
+ add src, src, #2
+3:
+ tbz count, #0, .Lexitfunc
+USER_OFF(0, ldtrb tmp1w, [src, #0])
+ strb tmp1w, [dst], #1
+ add src, src, #1
-end .req x5
-srcin .req x15
-SYM_FUNC_START(__arch_copy_from_user)
- add end, x0, x2
- mov srcin, x1
-#include "copy_template.S"
+ b .Lexitfunc
+
+.Lcpy_over64:
+ .p2align L1_CACHE_SHIFT
+ USER_OFF(0, ldtr A_l, [src, #0])
+ USER_OFF(8, ldtr A_h, [src, #8])
+ subs count, count, #128
+ b.ge .Lcpy_body_large
+ /*
+ * Less than 128 bytes to copy, so handle 64 here and then jump
+ * to the tail.
+ */
+ stp A_l, A_h, [dst, #0]
+USER_OFF(16, ldtr B_l, [src, #16])
+USER_OFF(24, ldtr B_h, [src, #24])
+USER_OFF(32, ldtr C_l, [src, #32])
+USER_OFF(40, ldtr C_h, [src, #40])
+ stp B_l, B_h, [dst, #16]
+ stp C_l, C_h, [dst, #32]
+USER_OFF(48, ldtr D_l, [src, #48])
+USER_OFF(56, ldtr D_h, [src, #56])
+ add src, src, #64
+ stp D_l, D_h, [dst, #48]
+ add dst, dst, #64
+
+ tst count, #0x3f
+ b.ne .Ltail63
+ b .Lexitfunc
+
+ /*
+ * Critical loop. Start at a new cache line boundary. Assuming
+ * 64 bytes per line this ensures the entire loop is in one line.
+ */
+.Lcpy_body_large:
+ /*
+ * Pre-load the first 64 bytes of data.
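+ * A_l/A_h were already loaded at .Lcpy_over64; B, C and D are
+ * loaded here so that the loop below can store one 64-byte block
+ * while the next block is being loaded.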
+ */
+USER_OFF(16, ldtr B_l, [src, #16])
+USER_OFF(24, ldtr B_h, [src, #24])
+USER_OFF(32, ldtr C_l, [src, #32])
+USER_OFF(40, ldtr C_h, [src, #40])
+USER_OFF(48, ldtr D_l, [src, #48])
+USER_OFF(56, ldtr D_h, [src, #56])
+ add src, src, #64
+
+1:
+ /*
+ * Interleave the load of the next 64-byte block with the store of
+ * the previously loaded 64 bytes.
+ */
+ stp A_l, A_h, [dst, #0]
+USER_OFF(0, ldtr A_l, [src, #0])
+USER_OFF(8, ldtr A_h, [src, #8])
+ stp B_l, B_h, [dst, #16]
+USER_OFF(16, ldtr B_l, [src, #16])
+USER_OFF(24, ldtr B_h, [src, #24])
+ stp C_l, C_h, [dst, #32]
+USER_OFF(32, ldtr C_l, [src, #32])
+USER_OFF(40, ldtr C_h, [src, #40])
+ stp D_l, D_h, [dst, #48]
+USER_OFF(48, ldtr D_l, [src, #48])
+ add dst, dst, #64
+USER_OFF(56, ldtr D_h, [src, #56])
+ add src, src, #64
+ subs count, count, #64
+ b.ge 1b
+ stp A_l, A_h, [dst, #0]
+ stp B_l, B_h, [dst, #16]
+ stp C_l, C_h, [dst, #32]
+ stp D_l, D_h, [dst, #48]
+ add dst, dst, #64
+
+ tst count, #0x3f
+ b.ne .Ltail63
+.Lexitfunc:
 mov x0, #0 // Nothing to copy
 ret
- // Exception fixups
-9997: cmp dst, dstin
- b.ne 9998f
- // Before being absolutely sure we couldn't copy anything, try harder
-USER(9998f, ldtrb tmp1w, [srcin])
- strb tmp1w, [dst], #1
-9998: sub x0, end, dst // bytes not copied
- ret
 SYM_FUNC_END(__arch_copy_from_user)
 EXPORT_SYMBOL(__arch_copy_from_user)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 2ac716c0d6d8cba5df698f7825dc19a75e53db78..7b69dece56f6d5c2996ecf2c67fd29782085200c 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -19,55 +19,219 @@
  * Returns:
  * x0 - bytes not copied
  */
- .macro ldrb1 reg, ptr, val
- KERNEL_ME_SAFE(9998f, ldrb \reg, [\ptr], \val)
- .endm
- .macro strb1 reg, ptr, val
- user_ldst 9998f, sttrb, \reg, \ptr, \val
- .endm
-
- .macro ldrh1 reg, ptr, val
- KERNEL_ME_SAFE(9998f, ldrh \reg, [\ptr], \val)
- .endm
-
- .macro strh1 reg, ptr, val
- user_ldst 9997f, sttrh, \reg, \ptr, \val
- .endm
-
- .macro ldr1 reg, ptr, val
- KERNEL_ME_SAFE(9998f, ldr \reg, [\ptr], \val)
- .endm
+dstin .req x0
+src .req x1
+end .req x5
+srcin .req x15
+count .req x2
+tmp1 .req x3
+tmp1w .req w3
+tmp2 .req x4
+tmp2w .req w4
+dst .req x6
- .macro str1 reg, ptr, val
- user_ldst 9997f, sttr, \reg, \ptr, \val
- .endm
+A_l .req x7
+A_h .req x8
+B_l .req x9
+B_h .req x10
+C_l .req x11
+C_h .req x12
+D_l .req x13
+D_h .req x14
- .macro ldp1 reg1, reg2, ptr, val
- KERNEL_ME_SAFE(9998f, ldp \reg1, \reg2, [\ptr], \val)
- .endm
+#define USER_OFF(off, x...) USER(fixup_offset_##off, x)
+#define FIXUP_OFFSET(n) \
+fixup_offset_##n: \
+ sub x0, end, dst; \
+ sub x0, x0, n; \
+ ret
- .macro stp1 reg1, reg2, ptr, val
- user_stp 9997f, \reg1, \reg2, \ptr, \val
- .endm
+FIXUP_OFFSET(0)
+FIXUP_OFFSET(8)
+FIXUP_OFFSET(16)
+FIXUP_OFFSET(24)
+FIXUP_OFFSET(32)
+FIXUP_OFFSET(40)
+FIXUP_OFFSET(48)
+FIXUP_OFFSET(56)
-end .req x5
-srcin .req x15
 SYM_FUNC_START(__arch_copy_to_user)
 add end, x0, x2
 mov srcin, x1
-#include "copy_template.S"
+ mov dst, dstin
+ cmp count, #16
+ /* When the length is less than 16 bytes, the accesses are not aligned. */
+ b.lo .Ltiny15
+
+ neg tmp2, src
+ ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
+ b.eq .LSrcAligned
+ sub count, count, tmp2
+ /*
+ * Copy the leading memory data from src to dst in increasing
+ * address order. This eliminates the risk of overwriting source
+ * data when the distance between src and dst is less than 16.
+ * The memory accesses here are aligned.
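+ *
+ * Unlike __arch_copy_from_user, the kernel buffer is the source
+ * here, so the loads use plain ldrb/ldrh/ldr, while the stores to
+ * the user buffer use the unprivileged sttr* forms wrapped in
+ * USER_OFF() so that a faulting store is routed to the matching
+ * fixup_offset_n handler.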
+ */
+ tbz tmp2, #0, 1f
+ ldrb tmp1w, [src], #1
+USER_OFF(0, sttrb tmp1w, [dst, #0])
+ add dst, dst, #1
+1:
+ tbz tmp2, #1, 2f
+ ldrh tmp1w, [src], #2
+USER_OFF(0, sttrh tmp1w, [dst, #0])
+ add dst, dst, #2
+2:
+ tbz tmp2, #2, 3f
+ ldr tmp1w, [src], #4
+USER_OFF(0, sttr tmp1w, [dst, #0])
+ add dst, dst, #4
+3:
+ tbz tmp2, #3, .LSrcAligned
+ ldr tmp1, [src], #8
+USER_OFF(0, sttr tmp1, [dst, #0])
+ add dst, dst, #8
+
+.LSrcAligned:
+ cmp count, #64
+ b.ge .Lcpy_over64
+ /*
+ * Deal with small copies quickly by dropping straight into the
+ * exit block.
+ */
+.Ltail63:
+ /*
+ * Copy up to 48 bytes of data. At this point we only need the
+ * bottom 6 bits of count to be accurate.
+ */
+ ands tmp1, count, #0x30
+ b.eq .Ltiny15
+ ldp A_l, A_h, [src], #16
+ cmp tmp1w, #0x20
+ b.eq 1f
+ b.lt 2f
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+ ldp A_l, A_h, [src], #16
+ add dst, dst, #16
+1:
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+ ldp A_l, A_h, [src], #16
+ add dst, dst, #16
+2:
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+ add dst, dst, #16
+.Ltiny15:
+ /*
+ * Prefer to break one ldp/stp into several loads/stores that access
+ * memory in increasing address order, rather than loading/storing 16
+ * bytes from (src-16) to (dst-16) and winding src back to an aligned
+ * address, as the original Cortex memcpy does. If that scheme were
+ * kept, memmove would need to guarantee that src is at least 16
+ * bytes above dst, otherwise some source data would be overwritten
+ * when memmove calls memcpy directly. To keep memmove simple and
+ * decouple memcpy from memmove, that sequence was dropped.
+ */
+ tbz count, #3, 1f
+ ldr tmp1, [src], #8
+USER_OFF(0, sttr tmp1, [dst, #0])
+ add dst, dst, #8
+1:
+ tbz count, #2, 2f
+ ldr tmp1w, [src], #4
+USER_OFF(0, sttr tmp1w, [dst, #0])
+ add dst, dst, #4
+2:
+ tbz count, #1, 3f
+ ldrh tmp1w, [src], #2
+USER_OFF(0, sttrh tmp1w, [dst, #0])
+ add dst, dst, #2
+3:
+ tbz count, #0, .Lexitfunc
+ ldrb tmp1w, [src], #1
+USER_OFF(0, sttrb tmp1w, [dst, #0])
+ add dst, dst, #1
+
+ b .Lexitfunc
+
+.Lcpy_over64:
+ .p2align L1_CACHE_SHIFT
+ ldp A_l, A_h, [src, #0]
+ subs count, count, #128
+ b.ge .Lcpy_body_large
+ /*
+ * Less than 128 bytes to copy, so handle 64 here and then jump
+ * to the tail.
+ */
+USER_OFF(0, sttr A_l, [dst, #0])
+USER_OFF(8, sttr A_h, [dst, #8])
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+USER_OFF(16, sttr B_l, [dst, #16])
+USER_OFF(24, sttr B_h, [dst, #24])
+USER_OFF(32, sttr C_l, [dst, #32])
+USER_OFF(40, sttr C_h, [dst, #40])
+ ldp D_l, D_h, [src, #48]
+ add src, src, #64
+USER_OFF(48, sttr D_l, [dst, #48])
+USER_OFF(56, sttr D_h, [dst, #56])
+ add dst, dst, #64
+
+ tst count, #0x3f
+ b.ne .Ltail63
+ b .Lexitfunc
+
+ /*
+ * Critical loop. Start at a new cache line boundary. Assuming
+ * 64 bytes per line this ensures the entire loop is in one line.
+ */
+.Lcpy_body_large:
+ /* Pre-load the first 64 bytes of data (A_l/A_h were loaded above). */
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+ ldp D_l, D_h, [src, #48]
+ add src, src, #64
+1:
+ /*
+ * Interleave the load of the next 64-byte block with the store of
+ * the previously loaded 64 bytes.
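+ *
+ * Each sttr in the loop is tagged with its offset from dst, so if
+ * a store faults, the corresponding fixup_offset_n handler reports
+ * end - (dst + n) bytes as not copied.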
+ */ +USER_OFF(0, sttr A_l, [dst, #0]) +USER_OFF(8, sttr A_h, [dst, #8]) + ldp A_l, A_h, [src, #0] +USER_OFF(16, sttr B_l, [dst, #16]) +USER_OFF(24, sttr B_h, [dst, #24]) + ldp B_l, B_h, [src, #16] +USER_OFF(32, sttr C_l, [dst, #32]) +USER_OFF(40, sttr C_h, [dst, #40]) + ldp C_l, C_h, [src, #32] +USER_OFF(48, sttr D_l, [dst, #48]) +USER_OFF(56, sttr D_h, [dst, #56]) + add dst, dst, #64 + ldp D_l, D_h, [src, #48] + add src, src, #64 + subs count, count, #64 + b.ge 1b +USER_OFF(0, sttr A_l, [dst, #0]) +USER_OFF(8, sttr A_h, [dst, #8]) +USER_OFF(16, sttr B_l, [dst, #16]) +USER_OFF(24, sttr B_h, [dst, #24]) +USER_OFF(32, sttr C_l, [dst, #32]) +USER_OFF(40, sttr C_h, [dst, #40]) +USER_OFF(48, sttr D_l, [dst, #48]) +USER_OFF(56, sttr D_h, [dst, #56]) + add dst, dst, #64 + + tst count, #0x3f + b.ne .Ltail63 +.Lexitfunc: mov x0, #0 ret - // Exception fixups -9997: cmp dst, dstin - b.ne 9998f - // Before being absolutely sure we couldn't copy anything, try harder -KERNEL_ME_SAFE(9998f, ldrb tmp1w, [srcin]) -USER(9998f, sttrb tmp1w, [dst]) - add dst, dst, #1 -9998: sub x0, end, dst // bytes not copied - ret SYM_FUNC_END(__arch_copy_to_user) EXPORT_SYMBOL(__arch_copy_to_user)