From 88f631ff461cfb3364b9a13ef8d2a87b3206c176 Mon Sep 17 00:00:00 2001
From: Zhiteng Qiu
Date: Tue, 16 Jul 2024 14:28:31 +0800
Subject: [PATCH] anolis: arch/x86/lib: Enhance Hygon processor's large memory
 copy performance

ANBZ: #9470

The following methods are used to improve large memory copy performance
between kernel and user mode on Hygon processors:

Prefetch is a technique for reading blocks of data from memory at very
high rates: the data is pulled into the cache ahead of use, operated on
there, and the results are written back out to memory, all with high
efficiency.

The copy code also employs non-temporal (NT) streaming store
instructions to write data to memory. These instructions bypass the
on-chip cache and send the data directly into a write-combining buffer.
Because an NT store lets the CPU avoid first reading the old data at the
destination address, it effectively improves write bandwidth. Similar
non-temporal optimizations are applied when reading data from memory.

A large copy may be interrupted, which can trigger a task switch. The
current vector register (XMM/YMM) context therefore has to be saved so
that the copy can continue when the task is switched back in.

Signed-off-by: Zhiteng Qiu
---
 arch/x86/Kconfig                    |   1 +
 arch/x86/Kconfig.fpu                |  26 +++
 arch/x86/include/asm/fpu/api.h      |  38 ++++
 arch/x86/include/asm/fpu/internal.h |  60 ++++++
 arch/x86/include/asm/fpu/types.h    |   5 +
 arch/x86/include/asm/thread_info.h  |   1 +
 arch/x86/include/asm/uaccess_64.h   |  76 ++++++-
 arch/x86/kernel/cpu/hygon.c         | 100 +++++++++
 arch/x86/kernel/fpu/core.c          |  79 +++++++
 arch/x86/kernel/process_64.c        |   5 +
 arch/x86/lib/Makefile               |   2 +
 arch/x86/lib/copy_user_avx2.S       | 323 ++++++++++++++++++++++++++++
 arch/x86/lib/copy_user_sse2.S       | 245 +++++++++++++++++++++
 13 files changed, 959 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/Kconfig.fpu
 create mode 100644 arch/x86/lib/copy_user_avx2.S
 create mode 100644 arch/x86/lib/copy_user_sse2.S

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7b4c48b92c47..0da214e4f320 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -889,6 +889,7 @@ config INTEL_TDX_GUEST
 endif #HYPERVISOR_GUEST
 
 source "arch/x86/Kconfig.cpu"
+source "arch/x86/Kconfig.fpu"
 
 config HPET_TIMER
 	def_bool X86_64
diff --git a/arch/x86/Kconfig.fpu b/arch/x86/Kconfig.fpu
new file mode 100644
index 000000000000..5410feda1bc7
--- /dev/null
+++ b/arch/x86/Kconfig.fpu
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menuconfig USING_FPU_IN_KERNEL_NONATOMIC
+	bool "Hygon large memory copy support"
+
+if USING_FPU_IN_KERNEL_NONATOMIC
+
+choice
+	prompt "X86_HYGON_LMC"
+	depends on X86_64 && CPU_SUP_HYGON
+	default X86_HYGON_LMC_SSE2_ON
+
+config X86_HYGON_LMC_SSE2_ON
+	bool "Use SSE2 NT copy for large memory copy"
+	help
+	  When this option is enabled, copy_user_sse2_opt_string is used
+	  for large memory copies.
+
+config X86_HYGON_LMC_AVX2_ON
+	bool "Use AVX2 NT copy for large memory copy"
+	help
+	  When this option is enabled, copy_user_avx2_pf64_nt_string is used
+	  for large memory copies.
+ +endchoice +endif diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 7c8f544f71f5..4cffcba00e54 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -37,6 +37,44 @@ static inline void kernel_fpu_begin(void) kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR); } +#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \ + defined(CONFIG_X86_HYGON_LMC_AVX2_ON) +extern int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask); +extern void kernel_fpu_end_nonatomic(void); + +/* Code that is unaware of kernel_fpu_begin_nonatomic_mask() can use this */ +static inline int kernel_fpu_begin_nonatomic(void) +{ +#ifdef CONFIG_X86_64 + /* + * Any 64-bit code that uses 387 instructions must explicitly request + * KFPU_387. + */ + return kernel_fpu_begin_nonatomic_mask(KFPU_MXCSR); +#else + /* + * 32-bit kernel code may use 387 operations as well as SSE2, etc, + * as long as it checks that the CPU has the required capability. + */ + return kernel_fpu_begin_nonatomic_mask(KFPU_387 | KFPU_MXCSR); +#endif +} + +/* + * It means we call kernel_fpu_end after kernel_fpu_begin_nonatomic + * func, but before kernel_fpu_end_nonatomic + */ +static inline void check_using_kernel_fpu(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + WARN_ON_ONCE(test_thread_flag(TIF_USING_FPU_NONATOMIC)); +} + +#else +static inline void check_using_kernel_fpu(void) { } + +#endif + /* * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate. * A context switch will (and softirq might) save CPU's FPU registers to diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index e69de29bb2d1..727d667d1c19 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Kernel FPU state switching for scheduling. + * + * This is a two-stage process: + * + * - switch_kernel_fpu_prepare() saves the old kernel fpu state. + * This is done within the context of the old process. + * + * - switch_kernel_fpu_finish() restore new kernel fpu state. + * + * The kernel FPU context is only stored/restored for a user task in kernel + * mode and PF_KTHREAD is used to distinguish between kernel and user threads. + */ +#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \ + defined(CONFIG_X86_HYGON_LMC_AVX2_ON) +extern void save_fpregs_to_fpkernelstate(struct fpu *kfpu); +static inline void switch_kernel_fpu_prepare(struct task_struct *prev, int cpu) +{ + struct fpu *old_fpu = &prev->thread.fpu; + + if ((boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) || + !test_thread_flag(TIF_USING_FPU_NONATOMIC)) + return; + + if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD)) + save_fpregs_to_fpkernelstate(old_fpu); +} + +/* Internal helper for switch_kernel_fpu_finish() and signal frame setup */ +static inline void fpregs_restore_kernelregs(struct fpu *kfpu) +{ + kernel_fpu_states_restore(NULL, &kfpu->fpstate->kernel_state, + sizeof(kfpu->fpstate->kernel_state)); +} + +/* Loading of the complete FPU state immediately. 
*/ +static inline void switch_kernel_fpu_finish(struct task_struct *next) +{ + struct fpu *new_fpu = &next->thread.fpu; + + if ((next->flags & PF_KTHREAD) || + (boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)) + return; + + if (cpu_feature_enabled(X86_FEATURE_FPU) && + test_ti_thread_flag((struct thread_info *)next, + TIF_USING_FPU_NONATOMIC)) + fpregs_restore_kernelregs(new_fpu); +} +#else +static inline void switch_kernel_fpu_prepare(struct task_struct *prev, int cpu) +{ +} +static inline void switch_kernel_fpu_finish(struct task_struct *next) +{ +} + +#endif diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 8f70c5eb3a58..7678fef7e21d 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -381,6 +381,11 @@ struct fpstate { /* @is_guest: Indicator for guest state (KVM) */ unsigned int is_guest : 1; +#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \ + defined(CONFIG_X86_HYGON_LMC_AVX2_ON) + /* @kernel_state: Kernel FPU registers state saving */ + union fpregs_state kernel_state; +#endif /* * @is_confidential: Indicator for KVM confidential mode. * The FPU registers are restored by the diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f9bdfa4da93a..e66aeba703ac 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -107,6 +107,7 @@ struct thread_info { #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define TIF_X32 30 /* 32-bit native x86-64 binary */ +#define TIF_USING_FPU_NONATOMIC 31 /* using fpu in kernel non-atomic context */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index e7265a552f4f..ea7cc9dab376 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -11,6 +11,10 @@ #include #include #include +#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \ + defined(CONFIG_X86_HYGON_LMC_AVX2_ON) +#include +#endif /* * Copy To/From Userspace @@ -24,13 +28,81 @@ copy_user_generic_string(void *to, const void *from, unsigned len); __must_check unsigned long copy_user_generic_unrolled(void *to, const void *from, unsigned len); +#ifdef CONFIG_X86_HYGON_LMC_SSE2_ON +void fpu_save_xmm0_3(void *to, const void *from, unsigned len); +void fpu_restore_xmm0_3(void *to, const void *from, unsigned len); + +#define kernel_fpu_states_save fpu_save_xmm0_3 +#define kernel_fpu_states_restore fpu_restore_xmm0_3 + +__must_check unsigned long copy_user_sse2_opt_string(void *to, const void *from, + unsigned len); + +#define copy_user_large_memory_generic_string copy_user_sse2_opt_string + +#endif + +#ifdef CONFIG_X86_HYGON_LMC_AVX2_ON +void fpu_save_ymm0_7(void *to, const void *from, unsigned len); +void fpu_restore_ymm0_7(void *to, const void *from, unsigned len); + +#define kernel_fpu_states_save fpu_save_ymm0_7 +#define kernel_fpu_states_restore fpu_restore_ymm0_7 + +__must_check unsigned long +copy_user_avx2_pf64_nt_string(void *to, const void *from, unsigned len); + +#define copy_user_large_memory_generic_string copy_user_avx2_pf64_nt_string +#endif + +#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \ + defined(CONFIG_X86_HYGON_LMC_AVX2_ON) +unsigned int get_nt_block_copy_mini_len(void); +static inline bool Hygon_LMC_check(unsigned len) +{ + unsigned int nt_blk_cpy_mini_len = get_nt_block_copy_mini_len(); + + if 
((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && + ((nt_blk_cpy_mini_len) && (nt_blk_cpy_mini_len <= len) && + (system_state == SYSTEM_RUNNING) && + (!kernel_fpu_begin_nonatomic()))) + return true; + else + return false; +} +static inline unsigned long +copy_large_memory_generic_string(void *to, const void *from, unsigned len) +{ + unsigned ret; + + ret = copy_user_large_memory_generic_string(to, from, len); + kernel_fpu_end_nonatomic(); + return ret; +} +#else +static inline bool Hygon_LMC_check(unsigned len) +{ + return false; +} +static inline unsigned long +copy_large_memory_generic_string(void *to, const void *from, unsigned len) +{ + return 0; +} +#endif + static __always_inline __must_check unsigned long copy_user_generic(void *to, const void *from, unsigned len) { unsigned ret; - /* - * If CPU has ERMS feature, use copy_user_enhanced_fast_string. + /* Check if Hygon large memory copy support enabled. */ + if (Hygon_LMC_check(len)) { + ret = copy_large_memory_generic_string(to, from, len); + return ret; + } + + /* If CPU has ERMS feature, use copy_user_enhanced_fast_string. * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. * Otherwise, use copy_user_generic_unrolled. */ diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index ebe35ba90357..e2dccacc966f 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #ifdef CONFIG_X86_64 # include #endif @@ -480,3 +483,100 @@ static const struct cpu_dev hygon_cpu_dev = { }; cpu_dev_register(hygon_cpu_dev); + +#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \ + defined(CONFIG_X86_HYGON_LMC_AVX2_ON) +struct hygon_c86_info { + unsigned int nt_cpy_mini_len; +}; + +static struct hygon_c86_info hygon_c86_data = { + .nt_cpy_mini_len = 0 +}; + +void set_c86_features_para_invalid(void) +{ + memset((void *)&hygon_c86_data, 0, sizeof(struct hygon_c86_info)); +} + +unsigned int get_nt_block_copy_mini_len(void) +{ + return hygon_c86_data.nt_cpy_mini_len; +} +EXPORT_SYMBOL_GPL(get_nt_block_copy_mini_len); + +static ssize_t show_nt_cpy_mini_len(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_mini_len); +} + +static ssize_t store_nt_cpy_mini_len(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + hygon_c86_data.nt_cpy_mini_len = val; + + return count; +} + +static struct kobj_attribute nt_cpy_mini_len_attribute = + __ATTR(nt_cpy_mini_len, 0600, show_nt_cpy_mini_len, + store_nt_cpy_mini_len); + +static struct attribute *c86_default_attrs[] = { + &nt_cpy_mini_len_attribute.attr, + NULL +}; + +const struct attribute_group hygon_c86_attr_group = { + .attrs = c86_default_attrs, + .name = "hygon_c86", +}; + +static struct kobject *c86_features_kobj; +static int __init kobject_hygon_c86_init(void) +{ + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + goto err_out; + + c86_features_kobj = kobject_create_and_add("c86_features", NULL); + + if (c86_features_kobj) { + ret = sysfs_create_group(c86_features_kobj, + &hygon_c86_attr_group); + if (ret) + goto err_out; + } + + return 0; +err_out: + set_c86_features_para_invalid(); + if (c86_features_kobj) { + sysfs_remove_group(c86_features_kobj, &hygon_c86_attr_group); + kobject_del(c86_features_kobj); + } + + return -1; +} +module_init(kobject_hygon_c86_init); + 
+static void __exit kobject_hygon_c86_exit(void) +{ + if (c86_features_kobj) { + sysfs_remove_group(c86_features_kobj, &hygon_c86_attr_group); + kobject_del(c86_features_kobj); + } +} +module_exit(kobject_hygon_c86_exit); + +#endif diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 7948f1499138..e2f550d020c8 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -425,6 +425,8 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); + check_using_kernel_fpu(); + WARN_ON_FPU(!irq_fpu_usable()); WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); @@ -448,6 +450,8 @@ EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { + check_using_kernel_fpu(); + WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, false); @@ -455,6 +459,81 @@ void kernel_fpu_end(void) } EXPORT_SYMBOL_GPL(kernel_fpu_end); +#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \ + defined(CONFIG_X86_HYGON_LMC_AVX2_ON) +/* + * We can call kernel_fpu_begin_nonatomic in non-atomic task context. + */ +int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask) +{ + preempt_disable(); + /* we not support Nested call */ + if (test_thread_flag(TIF_USING_FPU_NONATOMIC)) + goto err; + + /* + * This means we call kernel_fpu_begin_nonatomic after kernel_fpu_begin, + * but before kernel_fpu_end. + */ + if (this_cpu_read(in_kernel_fpu)) + goto err; + + if (in_interrupt()) + goto err; + + if (current->flags & PF_KTHREAD) + goto err; + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + set_thread_flag(TIF_NEED_FPU_LOAD); + save_fpregs_to_fpstate(¤t->thread.fpu); + } + /* Set thread flag: TIC_USING_FPU_NONATOMIC */ + set_thread_flag(TIF_USING_FPU_NONATOMIC); + + __cpu_invalidate_fpregs_state(); + + /* Put sane initial values into the control registers. */ + if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) + ldmxcsr(MXCSR_DEFAULT); + + if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) + asm volatile ("fninit"); + + preempt_enable(); + + return 0; + +err: + preempt_enable(); + + return -1; +} +EXPORT_SYMBOL_GPL(kernel_fpu_begin_nonatomic_mask); + +void kernel_fpu_end_nonatomic(void) +{ + preempt_disable(); + /* + * This means we call kernel_fpu_end_nonatomic after kernel_fpu_begin, + * but before kernel_fpu_end. + */ + WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); + + WARN_ON_FPU(!test_thread_flag(TIF_USING_FPU_NONATOMIC)); + + clear_thread_flag(TIF_USING_FPU_NONATOMIC); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_end_nonatomic); + +void save_fpregs_to_fpkernelstate(struct fpu *kfpu) +{ + kernel_fpu_states_save(&kfpu->fpstate->kernel_state, NULL, + sizeof(kfpu->fpstate->kernel_state)); +} +#endif + /* * Sync the FPU register state to current's memory register state when the * current task owns the FPU. The hardware register state is preserved. diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 77cf9d87ad45..860bac711a17 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -55,6 +55,7 @@ #include #include #include +#include #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include @@ -568,6 +569,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (!test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_fpu, cpu); + switch_kernel_fpu_prepare(prev_p, cpu); + /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). 
* @@ -622,6 +625,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) switch_fpu_finish(); + switch_kernel_fpu_finish(next_p); + /* Reload sp0. */ update_task_stack(next_p); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index bad4dee4f0e4..3b93f64ffa0e 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -69,5 +69,7 @@ else lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o + lib-$(CONFIG_X86_HYGON_LMC_SSE2_ON) += copy_user_sse2.o + lib-$(CONFIG_X86_HYGON_LMC_AVX2_ON) += copy_user_avx2.o lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/copy_user_avx2.S b/arch/x86/lib/copy_user_avx2.S new file mode 100644 index 000000000000..2e2c5caeceb9 --- /dev/null +++ b/arch/x86/lib/copy_user_avx2.S @@ -0,0 +1,323 @@ +/* + * Copyright © 2011 Siarhei Siamashka + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PREFETCH_DISTANCE 64 + +#define PREFETCH(addr) prefetchnta addr + +.macro ALIGN_DESTINATION_32 + /* check for bad alignment of destination, there is 32Bytes, for we will use vmovntdq */ + /* if <32Bytes, jb 302f */ + cmpl $32, %edx + jb 302f + + movl %edi, %ecx + andl $31, %ecx + jz 302f /* already aligned */ + + subl $32, %ecx + negl %ecx + subl %ecx, %edx + +300: + movb (%rsi), %al +301: + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 300b +302: + +.section .fixup,"ax" +303: + addl %ecx,%edx/* ecx is zerorest also */ + jmp .Lavx2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_CPY(300b, 303b) + _ASM_EXTABLE_CPY(301b, 303b) +.endm + +/* + * large block copy, use avx2 nt & prefetchnta + */ +SYM_FUNC_START(copy_user_avx2_pf64_nt_string) + ASM_STAC + ALIGN_DESTINATION_32 + + /* len >= 256 . 
*/ + cmpl $256, %edx + jb .Lless_than_256_bytes_cpy + + movl %esi, %ecx /* check if src is aligned */ + andl $31, %ecx + jnz large_block_nt_unaligned_cpy + +large_block_nt_aligned_cpy: + PREFETCH(PREFETCH_DISTANCE(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 64)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 128)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 192)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 256)(%rsi)) + +32: + vmovdqa 0(%rsi), %ymm0 +33: + vmovdqa 32(%rsi), %ymm1 +34: + vmovdqa 64(%rsi), %ymm2 +35: + vmovdqa 96(%rsi), %ymm3 +36: + vmovdqa 128(%rsi), %ymm4 +37: + vmovdqa 160(%rsi), %ymm5 +38: + vmovdqa 192(%rsi), %ymm6 +39: + vmovdqa 224(%rsi), %ymm7 + +40: + vmovntdq %ymm0, 0(%rdi) +41: + vmovntdq %ymm1, 32(%rdi) +42: + vmovntdq %ymm2, 64(%rdi) +43: + vmovntdq %ymm3, 96(%rdi) +44: + vmovntdq %ymm4, 128(%rdi) +45: + vmovntdq %ymm5, 160(%rdi) +46: + vmovntdq %ymm6, 192(%rdi) +47: + vmovntdq %ymm7, 224(%rdi) + + add $256, %rsi + add $256, %rdi + subl $256, %edx + cmpl $256, %edx + jg large_block_nt_aligned_cpy + + vzeroupper + sfence + jmp .Lless_than_256_bytes_cpy + +large_block_nt_unaligned_cpy: + PREFETCH(PREFETCH_DISTANCE(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 64)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 128)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 192)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 256)(%rsi)) + +48: + vmovdqu 0(%rsi), %ymm0 +49: + vmovdqu 32(%rsi), %ymm1 +50: + vmovdqu 64(%rsi), %ymm2 +51: + vmovdqu 96(%rsi), %ymm3 +52: + vmovdqu 128(%rsi), %ymm4 +53: + vmovdqu 160(%rsi), %ymm5 +54: + vmovdqu 192(%rsi), %ymm6 +55: + vmovdqu 224(%rsi), %ymm7 + +56: + vmovntdq %ymm0, 0(%rdi) +57: + vmovntdq %ymm1, 32(%rdi) +58: + vmovntdq %ymm2, 64(%rdi) +59: + vmovntdq %ymm3, 96(%rdi) +60: + vmovntdq %ymm4, 128(%rdi) +61: + vmovntdq %ymm5, 160(%rdi) +62: + vmovntdq %ymm6, 192(%rdi) +63: + vmovntdq %ymm7, 224(%rdi) + + add $256, %rsi + add $256, %rdi + subl $256, %edx + cmpl $256, %edx + jg large_block_nt_unaligned_cpy + + vzeroupper + sfence + jmp .Lless_than_256_bytes_cpy + + .section .fixup,"ax" + +88: + vzeroupper + jmp .Lavx2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_CPY(32b, 88b) + _ASM_EXTABLE_CPY(33b, 88b) + _ASM_EXTABLE_CPY(34b, 88b) + _ASM_EXTABLE_CPY(35b, 88b) + _ASM_EXTABLE_CPY(36b, 88b) + _ASM_EXTABLE_CPY(37b, 88b) + _ASM_EXTABLE_CPY(38b, 88b) + _ASM_EXTABLE_CPY(39b, 88b) + + _ASM_EXTABLE_CPY(40b, 88b) + _ASM_EXTABLE_CPY(41b, 88b) + _ASM_EXTABLE_CPY(42b, 88b) + _ASM_EXTABLE_CPY(43b, 88b) + _ASM_EXTABLE_CPY(44b, 88b) + _ASM_EXTABLE_CPY(45b, 88b) + _ASM_EXTABLE_CPY(46b, 88b) + _ASM_EXTABLE_CPY(47b, 88b) + _ASM_EXTABLE_CPY(48b, 88b) + _ASM_EXTABLE_CPY(49b, 88b) + + _ASM_EXTABLE_CPY(50b, 88b) + _ASM_EXTABLE_CPY(51b, 88b) + _ASM_EXTABLE_CPY(52b, 88b) + _ASM_EXTABLE_CPY(53b, 88b) + _ASM_EXTABLE_CPY(54b, 88b) + _ASM_EXTABLE_CPY(55b, 88b) + _ASM_EXTABLE_CPY(56b, 88b) + _ASM_EXTABLE_CPY(57b, 88b) + _ASM_EXTABLE_CPY(58b, 88b) + _ASM_EXTABLE_CPY(59b, 88b) + + _ASM_EXTABLE_CPY(60b, 88b) + _ASM_EXTABLE_CPY(61b, 88b) + _ASM_EXTABLE_CPY(62b, 88b) + _ASM_EXTABLE_CPY(63b, 88b) +SYM_FUNC_END(copy_user_avx2_pf64_nt_string) +EXPORT_SYMBOL(copy_user_avx2_pf64_nt_string) + +/* + * If len < 256 bytes, then we use rep mov directly. + */ +SYM_CODE_START_LOCAL(.Lless_than_256_bytes_cpy) + movl %edx, %ecx +90: + rep movsb + + xorl %eax,%eax + ASM_CLAC + RET + + .section .fixup,"ax" +99: + mov %ecx,%eax + + ASM_CLAC + RET + .previous + + _ASM_EXTABLE_CPY(90b, 99b) +SYM_CODE_END(.Lless_than_256_bytes_cpy) + +/* + * Try to copy last bytes and clear the rest if needed. 
+ * Since protection fault in copy_from/to_user is not a normal situation, + * it is not necessary to optimize tail handling. + * Don't try to copy the tail if machine check happened + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ + +SYM_CODE_START_LOCAL(.Lavx2_copy_user_handle_tail) + movl %edx,%ecx + cmp $X86_TRAP_MC,%eax /* check if X86_TRAP_MC */ + je 3f + +1: rep movsb +2: mov %ecx,%eax + + ASM_CLAC + RET + +3: xorl %eax,%eax + ASM_CLAC + RET + + _ASM_EXTABLE_CPY(1b, 2b) +SYM_CODE_END(.Lavx2_copy_user_handle_tail) + +/* + * Called when task schedule. we call fpu_save_%ymm0_7 to save old + * task's fpu states and we call fpu_restore_%ymm0_7 to restore new + * task's fpu states. + */ +SYM_FUNC_START(fpu_restore_ymm0_7) + vmovdqu 0(%rsi), %ymm0 + vmovdqu 32(%rsi), %ymm1 + vmovdqu 64(%rsi), %ymm2 + vmovdqu 96(%rsi), %ymm3 + vmovdqu 128(%rsi), %ymm4 + vmovdqu 160(%rsi), %ymm5 + vmovdqu 192(%rsi), %ymm6 + vmovdqu 224(%rsi), %ymm7 + + xorl %eax,%eax + RET//ret +SYM_FUNC_END(fpu_restore_ymm0_7) +EXPORT_SYMBOL(fpu_restore_ymm0_7) + +SYM_FUNC_START(fpu_save_ymm0_7) + vmovdqu %ymm0, 0(%rdi) + vmovdqu %ymm1, 32(%rdi) + vmovdqu %ymm2, 64(%rdi) + vmovdqu %ymm3, 96(%rdi) + vmovdqu %ymm4, 128(%rdi) + vmovdqu %ymm5, 160(%rdi) + vmovdqu %ymm6, 192(%rdi) + vmovdqu %ymm7, 224(%rdi) + + xorl %eax,%eax + RET +SYM_FUNC_END(fpu_save_ymm0_7) +EXPORT_SYMBOL(fpu_save_ymm0_7) diff --git a/arch/x86/lib/copy_user_sse2.S b/arch/x86/lib/copy_user_sse2.S new file mode 100644 index 000000000000..65f4a5a24303 --- /dev/null +++ b/arch/x86/lib/copy_user_sse2.S @@ -0,0 +1,245 @@ +/* + * Copyright © 2011 Siarhei Siamashka + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PREFETCH_DISTANCE 256 + +.macro ALIGN_DESTINATION_16 + /* check for bad alignment of destination, there is 16Bytes, for we will use movdqa */ + /* if len<16Bytes, jb 202f */ + cmpl $16,%edx + jb 202f + + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $15,%ecx + jz 202f /* already aligned */ + + subl $16,%ecx + negl %ecx + subl %ecx,%edx +200: + movb (%rsi),%al +201: + movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 200b +202: + + .section .fixup,"ax" +203: + addl %ecx,%edx/* ecx is zerorest also */ + jmp .Lsse2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_CPY(200b, 203b) + _ASM_EXTABLE_CPY(201b, 203b) +.endm +/*****************************************************************************/ +SYM_FUNC_START(copy_user_sse2_opt_string) + ASM_STAC + ALIGN_DESTINATION_16 + + cmpl $64,%edx + jb 70f /* less then 64 bytes, avoid the costly 'rep' */ + + movl %esi,%ecx /* check if src is aligned */ + andl $15,%ecx + jnz 20f + +10: + prefetchnta PREFETCH_DISTANCE(%rsi) +11: + prefetchnta (PREFETCH_DISTANCE + 32)(%rsi) +12: + movdqa (%rsi),%xmm0 +13: + movdqa 16(%rsi),%xmm1 +14: + movdqa 32(%rsi),%xmm2 +15: + movdqa 48(%rsi),%xmm3 +16: + movntdq %xmm0,0(%rdi) +17: + movntdq %xmm1,16(%rdi) +18: + movntdq %xmm2,32(%rdi) +19: + movntdq %xmm3,48(%rdi) + add $64,%rsi + add $64,%rdi + subl $64,%edx + cmpl $64,%edx + jg 10b + sfence + jmp 70f + +20: + prefetchnta PREFETCH_DISTANCE(%rsi) +21: + prefetchnta (PREFETCH_DISTANCE + 32)(%rsi) +22: + movdqu (%rsi),%xmm0 +23: + movdqu 16(%rsi),%xmm1 +24: + movdqu 32(%rsi),%xmm2 +25: + movdqu 48(%rsi),%xmm3 +26: + movntdq %xmm0,0(%rdi) +27: + movntdq %xmm1,16(%rdi) +28: + movntdq %xmm2,32(%rdi) +29: + movntdq %xmm3,48(%rdi) + add $64,%rsi + add $64,%rdi + subl $64,%edx + cmpl $64,%edx + jg 20b + sfence + +70: + movl %edx,%ecx +80: + rep + movsb + + xorl %eax,%eax + ASM_CLAC + RET//ret + + .section .fixup,"ax" +99: + movl %ecx,%edx /* ecx is zerorest also */ +100: + sfence + jmp .Lsse2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_CPY(10b, 100b) + _ASM_EXTABLE_CPY(11b, 100b) + _ASM_EXTABLE_CPY(12b, 100b) + _ASM_EXTABLE_CPY(13b, 100b) + _ASM_EXTABLE_CPY(14b, 100b) + _ASM_EXTABLE_CPY(15b, 100b) + _ASM_EXTABLE_CPY(16b, 100b) + _ASM_EXTABLE_CPY(17b, 100b) + _ASM_EXTABLE_CPY(18b, 100b) + _ASM_EXTABLE_CPY(19b, 100b) + + _ASM_EXTABLE_CPY(20b, 100b) + _ASM_EXTABLE_CPY(21b, 100b) + _ASM_EXTABLE_CPY(22b, 100b) + _ASM_EXTABLE_CPY(23b, 100b) + _ASM_EXTABLE_CPY(24b, 100b) + _ASM_EXTABLE_CPY(25b, 100b) + _ASM_EXTABLE_CPY(26b, 100b) + _ASM_EXTABLE_CPY(27b, 100b) + _ASM_EXTABLE_CPY(28b, 100b) + _ASM_EXTABLE_CPY(29b, 100b) + + _ASM_EXTABLE_CPY(80b, 99b) +SYM_FUNC_END(copy_user_sse2_opt_string) +EXPORT_SYMBOL(copy_user_sse2_opt_string) + +SYM_FUNC_START(fpu_restore_xmm0_3) + ASM_STAC + movdqu (%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + + xorl %eax,%eax + ASM_CLAC + RET//ret +SYM_FUNC_END(fpu_restore_xmm0_3) +EXPORT_SYMBOL(fpu_restore_xmm0_3) + +SYM_FUNC_START(fpu_save_xmm0_3) + ASM_STAC + + movdqu %xmm0,(%rdi) + movdqu %xmm1,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + xorl %eax,%eax + ASM_CLAC + RET//ret +SYM_FUNC_END(fpu_save_xmm0_3) +EXPORT_SYMBOL(fpu_save_xmm0_3) + +/* + * Try to copy last bytes and clear the rest if needed. + * Since protection fault in copy_from/to_user is not a normal situation, + * it is not necessary to optimize tail handling. 
+ * Don't try to copy the tail if machine check happened + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +SYM_CODE_START_LOCAL(.Lsse2_copy_user_handle_tail) + movl %edx,%ecx + cmp $X86_TRAP_MC,%eax /* check if X86_TRAP_MC */ + je 3f +1: rep movsb +2: mov %ecx,%eax + ASM_CLAC + RET + + /* + * Return zero to pretend that this copy succeeded. This + * is counter-intuitive, but needed to prevent the code + * in lib/iov_iter.c from retrying and running back into + * the poison cache line again. The machine check handler + * will ensure that a SIGBUS is sent to the task. + */ +3: xorl %eax,%eax + ASM_CLAC + RET + + _ASM_EXTABLE_CPY(1b, 2b) +SYM_CODE_END(.Lsse2_copy_user_handle_tail) + +/*****************************************************************************/ -- Gitee
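
The commit message describes the prefetchnta + non-temporal store pattern that
copy_user_sse2_opt_string implements in assembly. For readers unfamiliar with
it, below is a minimal userspace sketch of the same idea using SSE2 intrinsics.
It is an illustration only -- the function name, prefetch distance and tail
handling are chosen here for clarity, and it omits the destination-alignment
and fault-recovery work the kernel assembly has to provide.

/*
 * Illustrative userspace sketch, not the kernel implementation: prefetch
 * well ahead of the read pointer with a non-temporal hint, move 64-byte
 * blocks through XMM registers, write them with streaming (movntdq)
 * stores, and fence before the data is consumed.
 */
#include <emmintrin.h>		/* SSE2: _mm_loadu_si128, _mm_stream_si128 */
#include <xmmintrin.h>		/* _mm_prefetch, _mm_sfence */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define PREFETCH_DISTANCE 256

/* dst must be 16-byte aligned for the streaming stores. */
static void nt_copy_sketch(void *dst, const void *src, size_t len)
{
	uint8_t *d = dst;
	const uint8_t *s = src;

	while (len >= 64) {
		_mm_prefetch((const char *)(s + PREFETCH_DISTANCE), _MM_HINT_NTA);
		_mm_prefetch((const char *)(s + PREFETCH_DISTANCE + 32), _MM_HINT_NTA);

		__m128i x0 = _mm_loadu_si128((const __m128i *)(s + 0));
		__m128i x1 = _mm_loadu_si128((const __m128i *)(s + 16));
		__m128i x2 = _mm_loadu_si128((const __m128i *)(s + 32));
		__m128i x3 = _mm_loadu_si128((const __m128i *)(s + 48));

		/* Streaming stores bypass the cache and avoid read-for-ownership. */
		_mm_stream_si128((__m128i *)(d + 0),  x0);
		_mm_stream_si128((__m128i *)(d + 16), x1);
		_mm_stream_si128((__m128i *)(d + 32), x2);
		_mm_stream_si128((__m128i *)(d + 48), x3);

		s += 64;
		d += 64;
		len -= 64;
	}

	_mm_sfence();		/* order the NT stores before any later loads */
	memcpy(d, s, len);	/* short tail, analogous to the "rep movsb" path */
}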
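
The hygon.c hunk adds a single runtime tunable, nt_cpy_mini_len: while it is 0
(the default) the NT copy path is never taken, and otherwise only copies of at
least that many bytes use it (see get_nt_block_copy_mini_len() in
Hygon_LMC_check()). A small sketch of setting the threshold from userspace
follows; the sysfs path is an assumption derived from
kobject_create_and_add("c86_features", NULL) plus the "hygon_c86" group name,
and the attribute is root-writable (mode 0600), so verify both on the target
system.

#include <stdio.h>

/* Assumed sysfs location of the attribute added by this patch. */
#define NT_CPY_MINI_LEN_PATH "/sys/c86_features/hygon_c86/nt_cpy_mini_len"

int main(void)
{
	FILE *f = fopen(NT_CPY_MINI_LEN_PATH, "w");

	if (!f) {
		perror(NT_CPY_MINI_LEN_PATH);
		return 1;
	}
	/* Example value: route copies of 1 MiB and larger through the NT path. */
	fprintf(f, "%u\n", 1024u * 1024u);
	fclose(f);
	return 0;
}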