From 7004a577f8a744b0397598c1ffaddf92a336653f Mon Sep 17 00:00:00 2001
From: Zhiteng Qiu
Date: Wed, 3 Jul 2024 21:07:00 +0800
Subject: [PATCH] Add large memory copy optimization for Hygon C86 processors

Signed-off-by: Zhiteng Qiu
---
 arch/x86/Kconfig                    |  29 +++
 arch/x86/Kconfig.fpu                |   9 +
 arch/x86/include/asm/fpu/api.h      |  25 +++
 arch/x86/include/asm/fpu/internal.h |  31 +++
 arch/x86/include/asm/fpu/types.h    |   3 +
 arch/x86/include/asm/thread_info.h  |   1 +
 arch/x86/include/asm/uaccess_64.h   | 129 ++++++++++-
 arch/x86/kernel/cpu/hygon.c         | 173 +++++++++++++++
 arch/x86/kernel/fpu/core.c          |  83 ++++++-
 arch/x86/kernel/process_64.c        |  10 +
 arch/x86/lib/Makefile               |   2 +
 arch/x86/lib/copy_user_avx2.S       | 329 ++++++++++++++++++++++++++++
 arch/x86/lib/copy_user_sse2.S       | 245 +++++++++++++++++++++
 13 files changed, 1061 insertions(+), 8 deletions(-)
 create mode 100644 arch/x86/Kconfig.fpu
 create mode 100644 arch/x86/lib/copy_user_avx2.S
 create mode 100644 arch/x86/lib/copy_user_sse2.S

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7b4c48b92c47..d608a67c2813 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -889,6 +889,35 @@ config INTEL_TDX_GUEST
 endif #HYPERVISOR_GUEST

 source "arch/x86/Kconfig.cpu"
+source "arch/x86/Kconfig.fpu"
+
+choice
+	prompt "Large memory copy optimization"
+	default X86_Hygon_LMC_MODE_OFF
+	help
+	  Hygon's LMC (large memory copy) feature accelerates large data
+	  copies between kernel and user space by using vectorized
+	  non-temporal instructions, which can significantly improve copy
+	  performance.
+
+config X86_Hygon_LMC_MODE_OFF
+	bool "No optimization for large memory copy"
+	help
+	  Do not optimize large memory copies.
+
+config X86_Hygon_LMC_SSE2_ON
+	bool "Use SSE2 non-temporal copy for large memory copy"
+	depends on USING_FPU_IN_KERNEL_NONATOMIC
+	help
+	  When this option is enabled, copy_user_sse2_opt_string() is used
+	  for large memory copies.
+
+config X86_Hygon_LMC_AVX2_ON
+	bool "Use AVX2 non-temporal copy for large memory copy"
+	depends on USING_FPU_IN_KERNEL_NONATOMIC
+	help
+	  When this option is enabled, copy_user_avx2_pf64_nt_string() is
+	  used for large memory copies.
+endchoice

 config HPET_TIMER
 	def_bool X86_64
diff --git a/arch/x86/Kconfig.fpu b/arch/x86/Kconfig.fpu
new file mode 100644
index 000000000000..41a093a72f43
--- /dev/null
+++ b/arch/x86/Kconfig.fpu
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config USING_FPU_IN_KERNEL_NONATOMIC
+	default n
+	bool "Support using FPU instructions in kernel non-atomic context"
+	depends on X86_64 && CPU_SUP_HYGON
+	help
+	  When this option is enabled, FPU instructions may be used in
+	  kernel code that runs in non-atomic (preemptible) context.
\ No newline at end of file
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 7c8f544f71f5..fdb1ce71d85e 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -25,6 +25,7 @@
 /* Kernel FPU states to initialize in kernel_fpu_begin_mask() */
 #define KFPU_387	_BITUL(0)	/* 387 state will be initialized */
 #define KFPU_MXCSR	_BITUL(1)	/* MXCSR will be initialized */
+#define KFPU_USER	_BITUL(2)	/* non-atomic (preemptible) kernel FPU use */

 extern void kernel_fpu_begin_mask(unsigned int kfpu_mask);
 extern void kernel_fpu_end(void);
@@ -32,11 +33,35 @@ extern bool irq_fpu_usable(void);
 extern void fpregs_mark_activate(void);

 /* Code that is unaware of kernel_fpu_begin_mask() can use this */
+/* This is the helper that initializes the 387 floating point state */
 static inline void kernel_fpu_begin(void)
 {
 	kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
 }

+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+extern int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask);
+//extern void kernel_fpu_end_nonatomic(void);
+
+/* Code that is unaware of kernel_fpu_begin_nonatomic_mask() can use this */
+static inline int kernel_fpu_begin_nonatomic(void)
+{
+#ifdef CONFIG_X86_64
+	/*
+	 * Any 64-bit code that uses 387 instructions must explicitly request
+	 * KFPU_387.
+	 */
+	return kernel_fpu_begin_nonatomic_mask(KFPU_MXCSR | KFPU_USER);
+#else
+	/*
+	 * 32-bit kernel code may use 387 operations as well as SSE2, etc,
+	 * as long as it checks that the CPU has the required capability.
+	 */
+	return kernel_fpu_begin_nonatomic_mask(KFPU_387 | KFPU_MXCSR | KFPU_USER);
+#endif
+}
+#endif //CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+
 /*
  * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate.
  * A context switch will (and softirq might) save CPU's FPU registers to
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index e69de29bb2d1..20c5c90d352d 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+extern void save_fpregs_to_fpkernelstate(struct fpu *kfpu);
+
+static inline void switch_kernel_fpu_prepare(struct task_struct *prev, int cpu)
+{
+	struct fpu *old_fpu = &prev->thread.fpu;
+
+	if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD))
+		save_fpregs_to_fpkernelstate(old_fpu);
+}
+
+/* Internal helper for switch_kernel_fpu_finish() */
+static inline void fpregs_restore_kernelregs(struct fpu *kfpu)
+{
+	kernel_fpu_states_restore(NULL, &kfpu->fpstate->kernel_state,
+				  sizeof(kfpu->fpstate->kernel_state));
+}
+
+static inline void switch_kernel_fpu_finish(struct task_struct *next)
+{
+	struct fpu *new_fpu = &next->thread.fpu;
+
+	if (next->flags & PF_KTHREAD)
+		return;
+
+	if (cpu_feature_enabled(X86_FEATURE_FPU) &&
+	    test_ti_thread_flag((struct thread_info *)next,
+				TIF_USING_FPU_NONATOMIC))
+		fpregs_restore_kernelregs(new_fpu);
+}
+#endif
\ No newline at end of file
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 8f70c5eb3a58..0f6561f74800 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -381,6 +381,9 @@ struct fpstate {
 	/* @is_guest:		Indicator for guest state (KVM) */
 	unsigned int		is_guest	: 1;

+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+	union fpregs_state	kernel_state;
+#endif
 	/*
 	 * @is_confidential: Indicator for KVM confidential mode.
 	 * The FPU registers are restored by the
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index f9bdfa4da93a..e66aeba703ac 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -107,6 +107,7 @@ struct thread_info {
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 #define TIF_ADDR32		29	/* 32-bit address space on 64 bits */
 #define TIF_X32			30	/* 32-bit native x86-64 binary */
+#define TIF_USING_FPU_NONATOMIC	31	/* using FPU in kernel non-atomic context */

 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index e7265a552f4f..4e938ec8be72 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -11,6 +11,9 @@
 #include
 #include
 #include
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+#include
+#endif

 /*
  * Copy To/From Userspace
@@ -24,13 +27,91 @@ copy_user_generic_string(void *to, const void *from, unsigned len);
 __must_check unsigned long
 copy_user_generic_unrolled(void *to, const void *from, unsigned len);

+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+#ifdef CONFIG_X86_Hygon_LMC_SSE2_ON
+void fpu_save_xmm0_3(void *to, const void *from, unsigned len);
+void fpu_restore_xmm0_3(void *to, const void *from, unsigned len);
+
+#define kernel_fpu_states_save		fpu_save_xmm0_3
+#define kernel_fpu_states_restore	fpu_restore_xmm0_3
+
+__must_check unsigned long
+copy_user_sse2_opt_string(void *to, const void *from, unsigned len);
+
+#define copy_user_large_memory_generic_string	copy_user_sse2_opt_string
+
+#endif //CONFIG_X86_Hygon_LMC_SSE2_ON
+
+#ifdef CONFIG_X86_Hygon_LMC_AVX2_ON
+void fpu_save_ymm0_7(void *to, const void *from, unsigned len);
+void fpu_restore_ymm0_7(void *to, const void *from, unsigned len);
+
+#define kernel_fpu_states_save		fpu_save_ymm0_7
+#define kernel_fpu_states_restore	fpu_restore_ymm0_7
+
+__must_check unsigned long
+copy_user_avx2_pf64_nt_string(void *to, const void *from, unsigned len);
+
+#define copy_user_large_memory_generic_string	copy_user_avx2_pf64_nt_string
+#endif //CONFIG_X86_Hygon_LMC_AVX2_ON
+
+#if defined(CONFIG_X86_Hygon_LMC_SSE2_ON) || defined(CONFIG_X86_Hygon_LMC_AVX2_ON)
+unsigned int get_nt_block_copy_mini_len(void);
+unsigned int get_nt_block_copy_to_user_mini_nr_pages(void);
+unsigned int get_nt_block_copy_from_user_mini_nr_pages(void);
+
+static __always_inline __must_check unsigned long
+copy_user_block_data_generic(void *to, const void *from, unsigned len)
+{
+	unsigned ret;
+	unsigned int nt_blk_cpy_mini_len = get_nt_block_copy_mini_len();
+
+	if (nt_blk_cpy_mini_len && (nt_blk_cpy_mini_len <= len) &&
+	    (system_state == SYSTEM_RUNNING) &&
+	    (!kernel_fpu_begin_nonatomic())) {
+		ret = copy_user_large_memory_generic_string(to, from, len);
+		kernel_fpu_end();
+
+		return ret;
+	}
+
+	/* If CPU has ERMS feature, use copy_user_enhanced_fast_string.
+	 * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
+	 * Otherwise, use copy_user_generic_unrolled.
+ */ + alternative_call_2(copy_user_generic_unrolled, + copy_user_generic_string, + X86_FEATURE_REP_GOOD, + copy_user_enhanced_fast_string, + X86_FEATURE_ERMS, + ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), + "=d" (len)), + "1" (to), "2" (from), "3" (len) + : "memory", "rcx", "r8", "r9", "r10", "r11"); + return ret; +} +#endif //CONFIG_X86_Hygon_LMC_SSE2_ON || CONFIG_X86_Hygon_LMC_AVX2_ON +#endif + static __always_inline __must_check unsigned long copy_user_generic(void *to, const void *from, unsigned len) { unsigned ret; - /* - * If CPU has ERMS feature, use copy_user_enhanced_fast_string. +#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC +#if defined (CONFIG_X86_Hygon_LMC_SSE2_ON) || defined (CONFIG_X86_Hygon_LMC_AVX2_ON) + unsigned int nt_blk_cpy_mini_len = get_nt_block_copy_mini_len(); + if (((nt_blk_cpy_mini_len) && (nt_blk_cpy_mini_len <= len) + && (system_state == SYSTEM_RUNNING) + && (!kernel_fpu_begin_nonatomic()))) + { + ret = copy_user_large_memory_generic_string(to, from, len); + kernel_fpu_end(); + return ret; + } +#endif +#endif + /* If CPU has ERMS feature, use copy_user_enhanced_fast_string. * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. * Otherwise, use copy_user_generic_unrolled. */ @@ -46,6 +127,44 @@ copy_user_generic(void *to, const void *from, unsigned len) return ret; } +#if defined (CONFIG_X86_Hygon_LMC_SSE2_ON) || defined (CONFIG_X86_Hygon_LMC_AVX2_ON) +static __always_inline __must_check unsigned long +raw_copy_block_data_from_user(void *dst, const void __user *src, unsigned long size, unsigned long pages_nr) +{ + unsigned int mini_nr_pages = get_nt_block_copy_from_user_mini_nr_pages(); + if (mini_nr_pages && pages_nr >= mini_nr_pages) + return copy_user_block_data_generic(dst, (__force void *)src, size); + else + return copy_user_generic(dst, (__force void *)src, size); +} + +static __always_inline __must_check unsigned long +raw_copy_block_data_to_user(void __user *dst, const void *src, unsigned long size, unsigned long pages_nr) +{ + unsigned int mini_nr_pages = get_nt_block_copy_to_user_mini_nr_pages(); + if (mini_nr_pages && pages_nr >= mini_nr_pages) + return copy_user_block_data_generic((__force void *)dst, src, size); + else + return copy_user_generic((__force void *)dst, src, size); +} +#else +static __always_inline __must_check unsigned long +raw_copy_block_data_from_user(void *dst, const void __user *src, unsigned long size, + unsigned long pages_nr) +{ + pages_nr = pages_nr; + return copy_user_generic(dst, (__force void *)src, size); +} + +static __always_inline __must_check unsigned long +raw_copy_block_data_to_user(void __user *dst, const void *src, unsigned long size, + unsigned long pages_nr) +{ + pages_nr = pages_nr; + return copy_user_generic((__force void *)dst, src, size); +} +#endif //CONFIG_X86_Hygon_LMC_SSE2_ON || CONFIG_X86_Hygon_LMC_AVX2_ON + static __always_inline __must_check unsigned long raw_copy_from_user(void *dst, const void __user *src, unsigned long size) { @@ -60,9 +179,9 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size) static __always_inline __must_check unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size) -{ - return copy_user_generic((__force void *)dst, - (__force void *)src, size); + { + return copy_user_generic((__force void *)dst, + (__force void *)src, size); } extern long __copy_user_nocache(void *dst, const void __user *src, diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index ebe35ba90357..094a95cdf18e 
100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #ifdef CONFIG_X86_64 # include #endif @@ -480,3 +483,173 @@ static const struct cpu_dev hygon_cpu_dev = { }; cpu_dev_register(hygon_cpu_dev); + +#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC +#if defined (CONFIG_X86_Hygon_LMC_SSE2_ON) || defined (CONFIG_X86_Hygon_LMC_AVX2_ON) +struct hygon_c86_info { + unsigned int nt_cpy_mini_len; + unsigned int nt_cpy_to_user_mini_nr_pages; + unsigned int nt_cpy_from_user_mini_nr_pages; +}; + +static struct hygon_c86_info hygon_c86_data = { + .nt_cpy_mini_len = PAGE_SIZE, + .nt_cpy_to_user_mini_nr_pages = 3, + .nt_cpy_from_user_mini_nr_pages = 2 +}; + +void set_c86_features_para_invaild(void) +{ + memset((void *)&hygon_c86_data, 0, sizeof(struct hygon_c86_info)); +} + +unsigned int get_nt_block_copy_mini_len(void) +{ + return hygon_c86_data.nt_cpy_mini_len; +} +EXPORT_SYMBOL_GPL(get_nt_block_copy_mini_len); + +unsigned int get_nt_block_copy_to_user_mini_nr_pages(void) +{ + return hygon_c86_data.nt_cpy_to_user_mini_nr_pages; +} +EXPORT_SYMBOL_GPL(get_nt_block_copy_to_user_mini_nr_pages); + +unsigned int get_nt_block_copy_from_user_mini_nr_pages(void) +{ + return hygon_c86_data.nt_cpy_from_user_mini_nr_pages; +} +EXPORT_SYMBOL_GPL(get_nt_block_copy_from_user_mini_nr_pages); + +static ssize_t show_nt_cpy_mini_len(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_mini_len); +} + +static ssize_t store_nt_cpy_mini_len(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + hygon_c86_data.nt_cpy_mini_len = val; + + return count; +} + +static ssize_t show_nt_cpy_to_user_mini_nr_pages(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_to_user_mini_nr_pages); +} + +static ssize_t store_nt_cpy_to_user_mini_nr_pages(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + hygon_c86_data.nt_cpy_to_user_mini_nr_pages = val; + + return count; +} + +static ssize_t show_nt_cpy_from_user_mini_nr_pages(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_from_user_mini_nr_pages); +} + +static ssize_t store_nt_cpy_from_user_mini_nr_pages(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + hygon_c86_data.nt_cpy_from_user_mini_nr_pages = val; + + return count; +} + +static struct kobj_attribute nt_cpy_mini_len_attribute = + __ATTR(nt_cpy_mini_len, S_IRUSR | S_IWUSR, + show_nt_cpy_mini_len, + store_nt_cpy_mini_len); +static struct kobj_attribute nt_cpy_to_user_mini_nr_pages_attribute = + __ATTR(nt_cpy_to_user_mini_nr_pages, S_IRUSR | S_IWUSR, + show_nt_cpy_to_user_mini_nr_pages, + store_nt_cpy_to_user_mini_nr_pages); +static struct kobj_attribute nt_cpy_from_user_mini_nr_pages_attribute = + __ATTR(nt_cpy_from_user_mini_nr_pages, S_IRUSR | S_IWUSR, + show_nt_cpy_from_user_mini_nr_pages, + store_nt_cpy_from_user_mini_nr_pages); + +static struct attribute *c86_default_attrs[] = { + &nt_cpy_mini_len_attribute.attr, + 
	&nt_cpy_to_user_mini_nr_pages_attribute.attr,
+	&nt_cpy_from_user_mini_nr_pages_attribute.attr,
+	NULL
+};
+
+const struct attribute_group hygon_c86_attr_group = {
+	.attrs = c86_default_attrs,
+	.name = "hygon_c86",
+};
+
+static struct kobject *c86_features_kobj;
+
+static int __init kobject_hygon_c86_init(void)
+{
+	int ret;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+		goto err_out;
+
+	c86_features_kobj = kobject_create_and_add("c86_features", NULL);
+	if (c86_features_kobj) {
+		ret = sysfs_create_group(c86_features_kobj, &hygon_c86_attr_group);
+		if (ret)
+			goto err_out;
+	}
+
+	return 0;
+
+err_out:
+	set_c86_features_para_invaild();
+	if (c86_features_kobj) {
+		sysfs_remove_group(c86_features_kobj, &hygon_c86_attr_group);
+		kobject_del(c86_features_kobj);
+	}
+
+	return -1;
+}
+module_init(kobject_hygon_c86_init);
+
+static void __exit kobject_hygon_c86_exit(void)
+{
+	if (c86_features_kobj) {
+		sysfs_remove_group(c86_features_kobj, &hygon_c86_attr_group);
+		kobject_del(c86_features_kobj);
+	}
+}
+module_exit(kobject_hygon_c86_exit);
+#endif
+#endif
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 7948f1499138..0f38ce832424 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -423,9 +423,20 @@ EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);

 void kernel_fpu_begin_mask(unsigned int kfpu_mask)
 {
-	preempt_disable();
+	if (likely(!(kfpu_mask & KFPU_USER)))
+		preempt_disable();
+
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+	/*
+	 * Catch calls to kernel_fpu_begin() made after
+	 * kernel_fpu_begin_nonatomic() but before the non-atomic FPU
+	 * section has been ended.
+	 */
+	WARN_ON_FPU(test_thread_flag(TIF_USING_FPU_NONATOMIC));
+#endif

-	WARN_ON_FPU(!irq_fpu_usable());
+	WARN_ON_FPU(!irq_fpu_usable());
 	WARN_ON_FPU(this_cpu_read(in_kernel_fpu));

 	this_cpu_write(in_kernel_fpu, true);
@@ -443,18 +454,84 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)

 	if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
 		asm volatile ("fninit");
+
+	if (unlikely(kfpu_mask & KFPU_USER)) {
+		set_thread_flag(TIF_USING_FPU_NONATOMIC);
+		preempt_enable();
+	}
 }
 EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);

 void kernel_fpu_end(void)
 {
+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+	/*
+	 * Catch calls to kernel_fpu_end() made after
+	 * kernel_fpu_begin_nonatomic() but before the non-atomic FPU
+	 * section has been ended.
+	 */
+	WARN_ON_FPU(test_thread_flag(TIF_USING_FPU_NONATOMIC));
+#endif
 	WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));

 	this_cpu_write(in_kernel_fpu, false);
-	preempt_enable();
+
+	if (test_thread_flag(TIF_USING_FPU_NONATOMIC))
+		clear_thread_flag(TIF_USING_FPU_NONATOMIC);
+	else
+		preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kernel_fpu_end);

+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+/*
+ * kernel_fpu_begin_nonatomic_mask() may only be called from non-atomic
+ * task context.
+ */
+int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask)
+{
+	preempt_disable();
+	/* Nested calls are not supported */
+	if (test_thread_flag(TIF_USING_FPU_NONATOMIC))
+		goto nested_err;
+
+	/*
+	 * Catch calls to kernel_fpu_begin_nonatomic() made after
+	 * kernel_fpu_begin() but before the matching kernel_fpu_end().
+	 */
+	if (this_cpu_read(in_kernel_fpu))
+		goto nested_err;
+
+	if (in_interrupt())
+		goto irq_err;
+
+	if (current->flags & PF_KTHREAD)
+		goto err;
+
+	kernel_fpu_begin_mask(kfpu_mask);
+
+	return 0;
+
+nested_err:
+irq_err:
+err:
+	preempt_enable();
+
+	return -1;
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin_nonatomic_mask);
+
+void save_fpregs_to_fpkernelstate(struct fpu *kfpu)
+{
+	kernel_fpu_states_save(&kfpu->fpstate->kernel_state, NULL,
+			       sizeof(kfpu->fpstate->kernel_state));
+}
+#endif //CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+
 /*
  * Sync the FPU register state to current's memory register state when the
  * current task owns the FPU. The hardware register state is preserved.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 77cf9d87ad45..1c5d5c60f789 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -55,6 +55,7 @@
 #include
 #include
 #include
+#include
 #ifdef CONFIG_IA32_EMULATION
 /* Not included via unistd.h */
 #include
@@ -568,6 +569,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
 		switch_fpu_prepare(prev_fpu, cpu);

+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+	if (test_thread_flag(TIF_USING_FPU_NONATOMIC))
+		switch_kernel_fpu_prepare(prev_p, cpu);
+#endif
+
 	/* We must save %fs and %gs before load_TLS() because
 	 * %fs and %gs may be cleared by load_TLS().
 	 *
@@ -622,6 +628,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)

 	switch_fpu_finish();

+#ifdef CONFIG_USING_FPU_IN_KERNEL_NONATOMIC
+	switch_kernel_fpu_finish(next_p);
+#endif
+
 	/* Reload sp0. */
 	update_task_stack(next_p);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index bad4dee4f0e4..c4bbf9b576ce 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -69,5 +69,7 @@ else
         lib-y += clear_page_64.o copy_page_64.o
         lib-y += memmove_64.o memset_64.o
         lib-y += copy_user_64.o
+        lib-$(CONFIG_X86_Hygon_LMC_SSE2_ON) += copy_user_sse2.o
+        lib-$(CONFIG_X86_Hygon_LMC_AVX2_ON) += copy_user_avx2.o
         lib-y += cmpxchg16b_emu.o
 endif
diff --git a/arch/x86/lib/copy_user_avx2.S b/arch/x86/lib/copy_user_avx2.S
new file mode 100644
index 000000000000..20e00f8e775e
--- /dev/null
+++ b/arch/x86/lib/copy_user_avx2.S
@@ -0,0 +1,329 @@
+/*
+ * Copyright © 2011 Siarhei Siamashka
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PREFETCH_DISTANCE 64 +//#define PREFETCH_DISTANCE 128 +//#define PREFETCH_DISTANCE 192 +//#define PREFETCH_DISTANCE 256 + +#define X86_NON_TEMPORAL_THRESHOLD 4095 +//#define X86_NON_TEMPORAL_THRESHOLD 1000000 + +#define PREFETCH(addr) prefetchnta addr + +.macro ALIGN_DESTINATION_32 + /* check for bad alignment of destination, there is 32Bytes, for we will use vmovntdq */ + /* if <32Bytes, jb 302f */ + cmpl $32, %edx + jb 302f + + movl %edi, %ecx + andl $31, %ecx + jz 302f /* already aligned */ + + subl $32, %ecx + negl %ecx + subl %ecx, %edx + +300: + movb (%rsi), %al +301: + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 300b +302: + +.section .fixup,"ax" +303: + addl %ecx,%edx/* ecx is zerorest also */ + jmp .Lavx2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_CPY(300b, 303b) + _ASM_EXTABLE_CPY(301b, 303b) +.endm + +/* + * large block copy, use avx2 nt & prefetchnta + */ +SYM_FUNC_START(copy_user_avx2_pf64_nt_string) + ASM_STAC + ALIGN_DESTINATION_32 + + /* len >= 256 . */ + cmpl $256, %edx + jb .Lless_than_256_bytes_cpy + + movl %esi, %ecx /* check if src is aligned */ + andl $31, %ecx + jnz large_block_nt_unaligned_cpy + +large_block_nt_aligned_cpy: + PREFETCH(PREFETCH_DISTANCE(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 64)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 128)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 192)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 256)(%rsi)) + +32: + vmovdqa 0(%rsi), %ymm0 +33: + vmovdqa 32(%rsi), %ymm1 +34: + vmovdqa 64(%rsi), %ymm2 +35: + vmovdqa 96(%rsi), %ymm3 +36: + vmovdqa 128(%rsi), %ymm4 +37: + vmovdqa 160(%rsi), %ymm5 +38: + vmovdqa 192(%rsi), %ymm6 +39: + vmovdqa 224(%rsi), %ymm7 + +40: + vmovntdq %ymm0, 0(%rdi) +41: + vmovntdq %ymm1, 32(%rdi) +42: + vmovntdq %ymm2, 64(%rdi) +43: + vmovntdq %ymm3, 96(%rdi) +44: + vmovntdq %ymm4, 128(%rdi) +45: + vmovntdq %ymm5, 160(%rdi) +46: + vmovntdq %ymm6, 192(%rdi) +47: + vmovntdq %ymm7, 224(%rdi) + + add $256, %rsi + add $256, %rdi + subl $256, %edx + cmpl $256, %edx + jg large_block_nt_aligned_cpy + + vzeroupper + sfence + jmp .Lless_than_256_bytes_cpy + +large_block_nt_unaligned_cpy: + PREFETCH(PREFETCH_DISTANCE(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 64)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 128)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 192)(%rsi)) + PREFETCH((PREFETCH_DISTANCE + 256)(%rsi)) + +48: + vmovdqu 0(%rsi), %ymm0 +49: + vmovdqu 32(%rsi), %ymm1 +50: + vmovdqu 64(%rsi), %ymm2 +51: + vmovdqu 96(%rsi), %ymm3 +52: + vmovdqu 128(%rsi), %ymm4 +53: + vmovdqu 160(%rsi), %ymm5 +54: + vmovdqu 192(%rsi), %ymm6 +55: + vmovdqu 224(%rsi), %ymm7 + +56: + vmovntdq %ymm0, 0(%rdi) +57: + vmovntdq %ymm1, 32(%rdi) +58: + vmovntdq %ymm2, 64(%rdi) +59: + vmovntdq %ymm3, 96(%rdi) +60: + vmovntdq %ymm4, 128(%rdi) +61: + vmovntdq %ymm5, 160(%rdi) +62: + vmovntdq %ymm6, 192(%rdi) +63: + vmovntdq %ymm7, 224(%rdi) + + add $256, %rsi + add $256, %rdi + subl $256, %edx + cmpl $256, %edx + jg large_block_nt_unaligned_cpy + + vzeroupper + sfence + jmp .Lless_than_256_bytes_cpy + + .section .fixup,"ax" + +88: + vzeroupper + jmp .Lavx2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_CPY(32b, 88b) + _ASM_EXTABLE_CPY(33b, 88b) + 
	_ASM_EXTABLE_CPY(34b, 88b)
+	_ASM_EXTABLE_CPY(35b, 88b)
+	_ASM_EXTABLE_CPY(36b, 88b)
+	_ASM_EXTABLE_CPY(37b, 88b)
+	_ASM_EXTABLE_CPY(38b, 88b)
+	_ASM_EXTABLE_CPY(39b, 88b)
+
+	_ASM_EXTABLE_CPY(40b, 88b)
+	_ASM_EXTABLE_CPY(41b, 88b)
+	_ASM_EXTABLE_CPY(42b, 88b)
+	_ASM_EXTABLE_CPY(43b, 88b)
+	_ASM_EXTABLE_CPY(44b, 88b)
+	_ASM_EXTABLE_CPY(45b, 88b)
+	_ASM_EXTABLE_CPY(46b, 88b)
+	_ASM_EXTABLE_CPY(47b, 88b)
+	_ASM_EXTABLE_CPY(48b, 88b)
+	_ASM_EXTABLE_CPY(49b, 88b)
+
+	_ASM_EXTABLE_CPY(50b, 88b)
+	_ASM_EXTABLE_CPY(51b, 88b)
+	_ASM_EXTABLE_CPY(52b, 88b)
+	_ASM_EXTABLE_CPY(53b, 88b)
+	_ASM_EXTABLE_CPY(54b, 88b)
+	_ASM_EXTABLE_CPY(55b, 88b)
+	_ASM_EXTABLE_CPY(56b, 88b)
+	_ASM_EXTABLE_CPY(57b, 88b)
+	_ASM_EXTABLE_CPY(58b, 88b)
+	_ASM_EXTABLE_CPY(59b, 88b)
+
+	_ASM_EXTABLE_CPY(60b, 88b)
+	_ASM_EXTABLE_CPY(61b, 88b)
+	_ASM_EXTABLE_CPY(62b, 88b)
+	_ASM_EXTABLE_CPY(63b, 88b)
+SYM_FUNC_END(copy_user_avx2_pf64_nt_string)
+EXPORT_SYMBOL(copy_user_avx2_pf64_nt_string)
+
+/*
+ * If len < 256 bytes, use rep movsb directly.
+ */
+SYM_CODE_START_LOCAL(.Lless_than_256_bytes_cpy)
+	movl %edx, %ecx
+90:
+	rep movsb
+
+	xorl %eax,%eax
+	ASM_CLAC
+	RET
+
+	.section .fixup,"ax"
+99:
+	mov %ecx,%eax
+
+	ASM_CLAC
+	RET
+	.previous
+
+	_ASM_EXTABLE_CPY(90b, 99b)
+SYM_CODE_END(.Lless_than_256_bytes_cpy)
+
+/*
+ * Try to copy last bytes and clear the rest if needed.
+ * Since a protection fault in copy_from/to_user is not a normal situation,
+ * it is not necessary to optimize tail handling.
+ * Don't try to copy the tail if a machine check happened.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+SYM_CODE_START_LOCAL(.Lavx2_copy_user_handle_tail)
+	movl %edx,%ecx
+	cmp $X86_TRAP_MC,%eax		/* check if X86_TRAP_MC */
+	je 3f
+
+1:	rep movsb
+2:	mov %ecx,%eax
+
+	ASM_CLAC
+	RET
+
+3:	xorl %eax,%eax
+	ASM_CLAC
+	RET
+
+	_ASM_EXTABLE_CPY(1b, 2b)
+SYM_CODE_END(.Lavx2_copy_user_handle_tail)
+
+/*
+ * Called on context switch: fpu_save_ymm0_7 saves the old task's
+ * YMM0-7 registers and fpu_restore_ymm0_7 restores the new task's
+ * YMM0-7 registers.
+ */
+SYM_FUNC_START(fpu_restore_ymm0_7)
+	vmovdqu 0(%rsi), %ymm0
+	vmovdqu 32(%rsi), %ymm1
+	vmovdqu 64(%rsi), %ymm2
+	vmovdqu 96(%rsi), %ymm3
+	vmovdqu 128(%rsi), %ymm4
+	vmovdqu 160(%rsi), %ymm5
+	vmovdqu 192(%rsi), %ymm6
+	vmovdqu 224(%rsi), %ymm7
+
+	xorl %eax,%eax
+	RET
+SYM_FUNC_END(fpu_restore_ymm0_7)
+EXPORT_SYMBOL(fpu_restore_ymm0_7)
+
+SYM_FUNC_START(fpu_save_ymm0_7)
+	vmovdqu %ymm0, 0(%rdi)
+	vmovdqu %ymm1, 32(%rdi)
+	vmovdqu %ymm2, 64(%rdi)
+	vmovdqu %ymm3, 96(%rdi)
+	vmovdqu %ymm4, 128(%rdi)
+	vmovdqu %ymm5, 160(%rdi)
+	vmovdqu %ymm6, 192(%rdi)
+	vmovdqu %ymm7, 224(%rdi)
+
+	xorl %eax,%eax
+	RET
+SYM_FUNC_END(fpu_save_ymm0_7)
+EXPORT_SYMBOL(fpu_save_ymm0_7)
diff --git a/arch/x86/lib/copy_user_sse2.S b/arch/x86/lib/copy_user_sse2.S
new file mode 100644
index 000000000000..65f4a5a24303
--- /dev/null
+++ b/arch/x86/lib/copy_user_sse2.S
@@ -0,0 +1,245 @@
+/*
+ * Copyright © 2011 Siarhei Siamashka
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PREFETCH_DISTANCE 256 + +.macro ALIGN_DESTINATION_16 + /* check for bad alignment of destination, there is 16Bytes, for we will use movdqa */ + /* if len<16Bytes, jb 202f */ + cmpl $16,%edx + jb 202f + + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $15,%ecx + jz 202f /* already aligned */ + + subl $16,%ecx + negl %ecx + subl %ecx,%edx +200: + movb (%rsi),%al +201: + movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 200b +202: + + .section .fixup,"ax" +203: + addl %ecx,%edx/* ecx is zerorest also */ + jmp .Lsse2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_CPY(200b, 203b) + _ASM_EXTABLE_CPY(201b, 203b) +.endm +/*****************************************************************************/ +SYM_FUNC_START(copy_user_sse2_opt_string) + ASM_STAC + ALIGN_DESTINATION_16 + + cmpl $64,%edx + jb 70f /* less then 64 bytes, avoid the costly 'rep' */ + + movl %esi,%ecx /* check if src is aligned */ + andl $15,%ecx + jnz 20f + +10: + prefetchnta PREFETCH_DISTANCE(%rsi) +11: + prefetchnta (PREFETCH_DISTANCE + 32)(%rsi) +12: + movdqa (%rsi),%xmm0 +13: + movdqa 16(%rsi),%xmm1 +14: + movdqa 32(%rsi),%xmm2 +15: + movdqa 48(%rsi),%xmm3 +16: + movntdq %xmm0,0(%rdi) +17: + movntdq %xmm1,16(%rdi) +18: + movntdq %xmm2,32(%rdi) +19: + movntdq %xmm3,48(%rdi) + add $64,%rsi + add $64,%rdi + subl $64,%edx + cmpl $64,%edx + jg 10b + sfence + jmp 70f + +20: + prefetchnta PREFETCH_DISTANCE(%rsi) +21: + prefetchnta (PREFETCH_DISTANCE + 32)(%rsi) +22: + movdqu (%rsi),%xmm0 +23: + movdqu 16(%rsi),%xmm1 +24: + movdqu 32(%rsi),%xmm2 +25: + movdqu 48(%rsi),%xmm3 +26: + movntdq %xmm0,0(%rdi) +27: + movntdq %xmm1,16(%rdi) +28: + movntdq %xmm2,32(%rdi) +29: + movntdq %xmm3,48(%rdi) + add $64,%rsi + add $64,%rdi + subl $64,%edx + cmpl $64,%edx + jg 20b + sfence + +70: + movl %edx,%ecx +80: + rep + movsb + + xorl %eax,%eax + ASM_CLAC + RET//ret + + .section .fixup,"ax" +99: + movl %ecx,%edx /* ecx is zerorest also */ +100: + sfence + jmp .Lsse2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_CPY(10b, 100b) + _ASM_EXTABLE_CPY(11b, 100b) + _ASM_EXTABLE_CPY(12b, 100b) + _ASM_EXTABLE_CPY(13b, 100b) + _ASM_EXTABLE_CPY(14b, 100b) + _ASM_EXTABLE_CPY(15b, 100b) + _ASM_EXTABLE_CPY(16b, 100b) + _ASM_EXTABLE_CPY(17b, 100b) + _ASM_EXTABLE_CPY(18b, 100b) + _ASM_EXTABLE_CPY(19b, 100b) + + _ASM_EXTABLE_CPY(20b, 100b) + _ASM_EXTABLE_CPY(21b, 100b) + _ASM_EXTABLE_CPY(22b, 100b) + _ASM_EXTABLE_CPY(23b, 100b) + _ASM_EXTABLE_CPY(24b, 100b) + _ASM_EXTABLE_CPY(25b, 100b) + _ASM_EXTABLE_CPY(26b, 100b) + _ASM_EXTABLE_CPY(27b, 100b) + _ASM_EXTABLE_CPY(28b, 100b) + _ASM_EXTABLE_CPY(29b, 100b) + + _ASM_EXTABLE_CPY(80b, 99b) +SYM_FUNC_END(copy_user_sse2_opt_string) +EXPORT_SYMBOL(copy_user_sse2_opt_string) + +SYM_FUNC_START(fpu_restore_xmm0_3) + ASM_STAC + movdqu (%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + + xorl %eax,%eax + ASM_CLAC + RET//ret +SYM_FUNC_END(fpu_restore_xmm0_3) +EXPORT_SYMBOL(fpu_restore_xmm0_3) + +SYM_FUNC_START(fpu_save_xmm0_3) + ASM_STAC + + movdqu %xmm0,(%rdi) + movdqu %xmm1,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + xorl %eax,%eax + ASM_CLAC + RET//ret +SYM_FUNC_END(fpu_save_xmm0_3) +EXPORT_SYMBOL(fpu_save_xmm0_3) + +/* + * Try to copy last bytes and clear the rest if needed. + * Since protection fault in copy_from/to_user is not a normal situation, + * it is not necessary to optimize tail handling. 
+ * Don't try to copy the tail if machine check happened + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +SYM_CODE_START_LOCAL(.Lsse2_copy_user_handle_tail) + movl %edx,%ecx + cmp $X86_TRAP_MC,%eax /* check if X86_TRAP_MC */ + je 3f +1: rep movsb +2: mov %ecx,%eax + ASM_CLAC + RET + + /* + * Return zero to pretend that this copy succeeded. This + * is counter-intuitive, but needed to prevent the code + * in lib/iov_iter.c from retrying and running back into + * the poison cache line again. The machine check handler + * will ensure that a SIGBUS is sent to the task. + */ +3: xorl %eax,%eax + ASM_CLAC + RET + + _ASM_EXTABLE_CPY(1b, 2b) +SYM_CODE_END(.Lsse2_copy_user_handle_tail) + +/*****************************************************************************/ -- Gitee
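For reference, the user-space sketch below (not part of the patch) shows one way the sysfs tunables added in arch/x86/kernel/cpu/hygon.c could be adjusted at runtime. It assumes the "c86_features" kobject, which is created with a NULL parent, appears directly under /sys, so the attributes live in /sys/c86_features/hygon_c86/; the values written are only example numbers, not recommendations.

/*
 * tune_c86_lmc.c - illustrative only; the sysfs path and the values are
 * assumptions derived from the patch, not verified defaults.
 */
#include <stdio.h>
#include <stdlib.h>

static int write_tunable(const char *name, unsigned int value)
{
	char path[256];
	FILE *f;

	/* Attributes registered via hygon_c86_attr_group (group "hygon_c86"). */
	snprintf(path, sizeof(path), "/sys/c86_features/hygon_c86/%s", name);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%u\n", value);
	return fclose(f);
}

int main(void)
{
	/* Take the non-temporal copy path only for copies of at least 8 KiB. */
	if (write_tunable("nt_cpy_mini_len", 8192))
		return EXIT_FAILURE;
	/* Minimum page counts before the NT path is used for to/from user. */
	if (write_tunable("nt_cpy_to_user_mini_nr_pages", 3))
		return EXIT_FAILURE;
	if (write_tunable("nt_cpy_from_user_mini_nr_pages", 2))
		return EXIT_FAILURE;
	return EXIT_SUCCESS;
}

Writing 0 to nt_cpy_mini_len disables the non-temporal path entirely, since copy_user_generic() only takes that branch when get_nt_block_copy_mini_len() returns a non-zero value that does not exceed the copy length.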