diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7b4c48b92c47b37f77bc21a7200d7d12ccddfc51..0da214e4f32073f13106463f80ca98a6e44b854c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -889,6 +889,7 @@ config INTEL_TDX_GUEST
 endif #HYPERVISOR_GUEST
 
 source "arch/x86/Kconfig.cpu"
+source "arch/x86/Kconfig.fpu"
 
 config HPET_TIMER
 	def_bool X86_64
diff --git a/arch/x86/Kconfig.fpu b/arch/x86/Kconfig.fpu
new file mode 100644
index 0000000000000000000000000000000000000000..5410feda1bc770e12e6de371cce05a846fcc249b
--- /dev/null
+++ b/arch/x86/Kconfig.fpu
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menuconfig USING_FPU_IN_KERNEL_NONATOMIC
+	bool "Hygon large memory copy support"
+
+if USING_FPU_IN_KERNEL_NONATOMIC
+
+choice
+	prompt "X86_HYGON_LMC"
+	depends on X86_64 && CPU_SUP_HYGON
+	default X86_HYGON_LMC_SSE2_ON
+
+config X86_HYGON_LMC_SSE2_ON
+	bool "Use SSE2 non-temporal copy for large memory copies"
+	help
+	  When this option is enabled, copy_user_sse2_opt_string() is used
+	  for large memory copies.
+
+config X86_HYGON_LMC_AVX2_ON
+	bool "Use AVX2 non-temporal copy for large memory copies"
+	help
+	  When this option is enabled, copy_user_avx2_pf64_nt_string() is used
+	  for large memory copies.
+
+endchoice
+endif
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 7c8f544f71f5175dbd0c33584415200f341e97a6..4cffcba00e540869d1a6277bff9b7449d19dd03b 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -37,6 +37,44 @@ static inline void kernel_fpu_begin(void)
 	kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
 }
 
+#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \
+	defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
+extern int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask);
+extern void kernel_fpu_end_nonatomic(void);
+
+/* Code that is unaware of kernel_fpu_begin_nonatomic_mask() can use this */
+static inline int kernel_fpu_begin_nonatomic(void)
+{
+#ifdef CONFIG_X86_64
+	/*
+	 * Any 64-bit code that uses 387 instructions must explicitly request
+	 * KFPU_387.
+	 */
+	return kernel_fpu_begin_nonatomic_mask(KFPU_MXCSR);
+#else
+	/*
+	 * 32-bit kernel code may use 387 operations as well as SSE2, etc,
+	 * as long as it checks that the CPU has the required capability.
+	 */
+	return kernel_fpu_begin_nonatomic_mask(KFPU_387 | KFPU_MXCSR);
+#endif
+}
+
+/*
+ * Warn if kernel_fpu_begin()/kernel_fpu_end() is called between
+ * kernel_fpu_begin_nonatomic() and kernel_fpu_end_nonatomic().
+ */
+static inline void check_using_kernel_fpu(void)
+{
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+		WARN_ON_ONCE(test_thread_flag(TIF_USING_FPU_NONATOMIC));
+}
+
+#else
+static inline void check_using_kernel_fpu(void) { }
+
+#endif
+
 /*
  * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate.
  * A context switch will (and softirq might) save CPU's FPU registers to
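For illustration only (not part of the patch): a process-context caller is expected to treat a non-zero return from kernel_fpu_begin_nonatomic() as "FPU not available" and fall back to a plain integer-register copy. The helpers my_bulk_copy_fpu() and my_bulk_copy_plain() below are hypothetical.

	/* Illustrative sketch of the intended calling pattern. */
	static void copy_block(void *dst, const void *src, unsigned int len)
	{
		/* Fails in atomic/interrupt context, for kthreads, or when nested. */
		if (!kernel_fpu_begin_nonatomic()) {
			my_bulk_copy_fpu(dst, src, len);	/* may clobber XMM/YMM registers */
			kernel_fpu_end_nonatomic();		/* must pair with the begin */
		} else {
			my_bulk_copy_plain(dst, src, len);	/* fallback without FPU state */
		}
	}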
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..727d667d1c19410c4017d80cfeb2da97d5f8e3c5 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Kernel FPU state switching for scheduling.
+ *
+ * This is a two-stage process:
+ *
+ *  - switch_kernel_fpu_prepare() saves the old task's kernel FPU state.
+ *    This is done within the context of the old process.
+ *
+ *  - switch_kernel_fpu_finish() restores the new task's kernel FPU state.
+ *
+ * The kernel FPU context is only stored/restored for a user task in kernel
+ * mode and PF_KTHREAD is used to distinguish between kernel and user threads.
+ */
+#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \
+	defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
+extern void save_fpregs_to_fpkernelstate(struct fpu *kfpu);
+static inline void switch_kernel_fpu_prepare(struct task_struct *prev, int cpu)
+{
+	struct fpu *old_fpu = &prev->thread.fpu;
+
+	if ((boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) ||
+	    !test_thread_flag(TIF_USING_FPU_NONATOMIC))
+		return;
+
+	if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD))
+		save_fpregs_to_fpkernelstate(old_fpu);
+}
+
+/* Internal helper for switch_kernel_fpu_finish() and signal frame setup */
+static inline void fpregs_restore_kernelregs(struct fpu *kfpu)
+{
+	kernel_fpu_states_restore(NULL, &kfpu->fpstate->kernel_state,
+				  sizeof(kfpu->fpstate->kernel_state));
+}
+
+/* Restore the new task's kernel FPU state immediately. */
+static inline void switch_kernel_fpu_finish(struct task_struct *next)
+{
+	struct fpu *new_fpu = &next->thread.fpu;
+
+	if ((next->flags & PF_KTHREAD) ||
+	    (boot_cpu_data.x86_vendor != X86_VENDOR_HYGON))
+		return;
+
+	if (cpu_feature_enabled(X86_FEATURE_FPU) &&
+	    test_ti_thread_flag((struct thread_info *)next,
+				TIF_USING_FPU_NONATOMIC))
+		fpregs_restore_kernelregs(new_fpu);
+}
+#else
+static inline void switch_kernel_fpu_prepare(struct task_struct *prev, int cpu)
+{
+}
+static inline void switch_kernel_fpu_finish(struct task_struct *next)
+{
+}
+
+#endif
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 8f70c5eb3a5870c50014fd894c48ee84c3a8d7d2..7678fef7e21d72614d4abd3b1d1d1dbb2c9fcc9e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -381,6 +381,11 @@ struct fpstate {
 	/* @is_guest:		Indicator for guest state (KVM) */
 	unsigned int		is_guest	: 1;
 
+#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \
+	defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
+	/* @kernel_state:	Kernel FPU register state save area */
+	union fpregs_state	kernel_state;
+#endif
 	/*
 	 * @is_confidential:	Indicator for KVM confidential mode.
 	 *			The FPU registers are restored by the
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index f9bdfa4da93ada41709184ddf36fe8afbc4ed8d8..e66aeba703ac3eed60962f37f0d6041b0a989787 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -107,6 +107,7 @@ struct thread_info {
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 #define TIF_ADDR32		29	/* 32-bit address space on 64 bits */
 #define TIF_X32			30	/* 32-bit native x86-64 binary */
+#define TIF_USING_FPU_NONATOMIC	31	/* using FPU in kernel non-atomic context */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index e7265a552f4f0cb47dbfb2a624b7c17b97fdca36..ea7cc9dab376264c8e27af073a5e59c58e3ff557 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -11,6 +11,10 @@
 #include
 #include
 #include
+#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \
+	defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
+#include
+#endif
 
 /*
  * Copy To/From Userspace
@@ -24,13 +28,81 @@ copy_user_generic_string(void *to, const void *from, unsigned len);
 __must_check unsigned long
 copy_user_generic_unrolled(void *to, const void *from, unsigned len);
 
+#ifdef CONFIG_X86_HYGON_LMC_SSE2_ON
+void fpu_save_xmm0_3(void *to, const void *from, unsigned len);
+void fpu_restore_xmm0_3(void *to, const void *from, unsigned len);
+
+#define kernel_fpu_states_save		fpu_save_xmm0_3
+#define kernel_fpu_states_restore	fpu_restore_xmm0_3
+
+__must_check unsigned long copy_user_sse2_opt_string(void *to, const void *from,
+						      unsigned len);
+
+#define copy_user_large_memory_generic_string	copy_user_sse2_opt_string
+
+#endif
+
+#ifdef CONFIG_X86_HYGON_LMC_AVX2_ON
+void fpu_save_ymm0_7(void *to, const void *from, unsigned len);
+void fpu_restore_ymm0_7(void *to, const void *from, unsigned len);
+
+#define kernel_fpu_states_save		fpu_save_ymm0_7
+#define kernel_fpu_states_restore	fpu_restore_ymm0_7
+
+__must_check unsigned long
+copy_user_avx2_pf64_nt_string(void *to, const void *from, unsigned len);
+
+#define copy_user_large_memory_generic_string	copy_user_avx2_pf64_nt_string
+#endif
+
+#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \
+	defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
+unsigned int get_nt_block_copy_mini_len(void);
+static inline bool Hygon_LMC_check(unsigned len)
+{
+	unsigned int nt_blk_cpy_mini_len = get_nt_block_copy_mini_len();
+
+	if ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) &&
+	    ((nt_blk_cpy_mini_len) && (nt_blk_cpy_mini_len <= len) &&
+	     (system_state == SYSTEM_RUNNING) &&
+	     (!kernel_fpu_begin_nonatomic())))
+		return true;
+	else
+		return false;
+}
+static inline unsigned long
+copy_large_memory_generic_string(void *to, const void *from, unsigned len)
+{
+	unsigned ret;
+
+	ret = copy_user_large_memory_generic_string(to, from, len);
+	kernel_fpu_end_nonatomic();
+	return ret;
+}
+#else
+static inline bool Hygon_LMC_check(unsigned len)
+{
+	return false;
+}
+static inline unsigned long
+copy_large_memory_generic_string(void *to, const void *from, unsigned len)
+{
+	return 0;
+}
+#endif
+
 static __always_inline __must_check unsigned long
 copy_user_generic(void *to, const void *from, unsigned len)
 {
 	unsigned ret;
 
-	/*
-	 * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
+	/* Check if Hygon large memory copy support is enabled. */
+	if (Hygon_LMC_check(len)) {
+		ret = copy_large_memory_generic_string(to, from, len);
+		return ret;
+	}
+
+	/* If CPU has ERMS feature, use copy_user_enhanced_fast_string.
 	 * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
 	 * Otherwise, use copy_user_generic_unrolled.
 	 */
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index ebe35ba90357b617360a741a08534ddbd8b54146..e2dccacc966fbabf5e3953ce4775be6b35d5a59a 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -14,6 +14,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 #ifdef CONFIG_X86_64
 # include
 #endif
@@ -480,3 +483,100 @@ static const struct cpu_dev hygon_cpu_dev = {
 };
 
 cpu_dev_register(hygon_cpu_dev);
+
+#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \
+	defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
+struct hygon_c86_info {
+	unsigned int nt_cpy_mini_len;
+};
+
+static struct hygon_c86_info hygon_c86_data = {
+	.nt_cpy_mini_len = 0
+};
+
+void set_c86_features_para_invalid(void)
+{
+	memset((void *)&hygon_c86_data, 0, sizeof(struct hygon_c86_info));
+}
+
+unsigned int get_nt_block_copy_mini_len(void)
+{
+	return hygon_c86_data.nt_cpy_mini_len;
+}
+EXPORT_SYMBOL_GPL(get_nt_block_copy_mini_len);
+
+static ssize_t show_nt_cpy_mini_len(struct kobject *kobj,
+				    struct kobj_attribute *attr, char *buf)
+{
+	return snprintf(buf, 40, "%d\n", hygon_c86_data.nt_cpy_mini_len);
+}
+
+static ssize_t store_nt_cpy_mini_len(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	unsigned long val;
+	ssize_t ret;
+
+	ret = kstrtoul(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	hygon_c86_data.nt_cpy_mini_len = val;
+
+	return count;
+}
+
+static struct kobj_attribute nt_cpy_mini_len_attribute =
+	__ATTR(nt_cpy_mini_len, 0600, show_nt_cpy_mini_len,
+	       store_nt_cpy_mini_len);
+
+static struct attribute *c86_default_attrs[] = {
+	&nt_cpy_mini_len_attribute.attr,
+	NULL
+};
+
+const struct attribute_group hygon_c86_attr_group = {
+	.attrs = c86_default_attrs,
+	.name = "hygon_c86",
+};
+
+static struct kobject *c86_features_kobj;
+static int __init kobject_hygon_c86_init(void)
+{
+	int ret;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+		goto err_out;
+
+	c86_features_kobj = kobject_create_and_add("c86_features", NULL);
+
+	if (c86_features_kobj) {
+		ret = sysfs_create_group(c86_features_kobj,
+					 &hygon_c86_attr_group);
+		if (ret)
+			goto err_out;
+	}
+
+	return 0;
+err_out:
+	set_c86_features_para_invalid();
+	if (c86_features_kobj) {
+		sysfs_remove_group(c86_features_kobj, &hygon_c86_attr_group);
+		kobject_del(c86_features_kobj);
+	}
+
+	return -1;
+}
+module_init(kobject_hygon_c86_init);
+
+static void __exit kobject_hygon_c86_exit(void)
+{
+	if (c86_features_kobj) {
+		sysfs_remove_group(c86_features_kobj, &hygon_c86_attr_group);
+		kobject_del(c86_features_kobj);
+	}
+}
+module_exit(kobject_hygon_c86_exit);
+
+#endif
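For illustration only (not part of the patch): the new attribute can be tuned from user space once the group is registered. The path below assumes the kobject lands at the sysfs root, which is what kobject_create_and_add("c86_features", NULL) produces, and writing requires root because the attribute mode is 0600. With the default nt_cpy_mini_len of 0 the non-temporal copy path stays disabled.

	/* Illustrative sketch: enable the LMC path for copies of 1 MiB and larger. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/c86_features/hygon_c86/nt_cpy_mini_len", "w");

		if (!f) {
			perror("nt_cpy_mini_len");
			return 1;
		}
		fprintf(f, "%u\n", 1u << 20);	/* threshold in bytes */
		fclose(f);
		return 0;
	}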
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 7948f1499138d0fc5d9edd531eb43f80bd5ce667..e2f550d020c83eba04f7ac9e088a10b063be6b6b 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -425,6 +425,8 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
 {
 	preempt_disable();
 
+	check_using_kernel_fpu();
+
 	WARN_ON_FPU(!irq_fpu_usable());
 	WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
 
@@ -448,6 +450,8 @@ EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);
 
 void kernel_fpu_end(void)
 {
+	check_using_kernel_fpu();
+
 	WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
 
 	this_cpu_write(in_kernel_fpu, false);
@@ -455,6 +459,81 @@ void kernel_fpu_end(void)
 }
 EXPORT_SYMBOL_GPL(kernel_fpu_end);
 
+#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \
+	defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
+/*
+ * kernel_fpu_begin_nonatomic_mask() may only be called from non-atomic task context.
+ */
+int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask)
+{
+	preempt_disable();
+	/* Nested calls are not supported */
+	if (test_thread_flag(TIF_USING_FPU_NONATOMIC))
+		goto err;
+
+	/*
+	 * Catch kernel_fpu_begin_nonatomic() being called between
+	 * kernel_fpu_begin() and kernel_fpu_end().
+	 */
+	if (this_cpu_read(in_kernel_fpu))
+		goto err;
+
+	if (in_interrupt())
+		goto err;
+
+	if (current->flags & PF_KTHREAD)
+		goto err;
+
+	if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
+		set_thread_flag(TIF_NEED_FPU_LOAD);
+		save_fpregs_to_fpstate(&current->thread.fpu);
+	}
+	/* Set thread flag: TIF_USING_FPU_NONATOMIC */
+	set_thread_flag(TIF_USING_FPU_NONATOMIC);
+
+	__cpu_invalidate_fpregs_state();
+
+	/* Put sane initial values into the control registers. */
+	if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
+		ldmxcsr(MXCSR_DEFAULT);
+
+	if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
+		asm volatile ("fninit");
+
+	preempt_enable();
+
+	return 0;
+
+err:
+	preempt_enable();
+
+	return -1;
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin_nonatomic_mask);
+
+void kernel_fpu_end_nonatomic(void)
+{
+	preempt_disable();
+	/*
+	 * Catch kernel_fpu_end_nonatomic() being called between
+	 * kernel_fpu_begin() and kernel_fpu_end().
+	 */
+	WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
+
+	WARN_ON_FPU(!test_thread_flag(TIF_USING_FPU_NONATOMIC));
+
+	clear_thread_flag(TIF_USING_FPU_NONATOMIC);
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end_nonatomic);
+
+void save_fpregs_to_fpkernelstate(struct fpu *kfpu)
+{
+	kernel_fpu_states_save(&kfpu->fpstate->kernel_state, NULL,
+			       sizeof(kfpu->fpstate->kernel_state));
+}
+#endif
+
 /*
  * Sync the FPU register state to current's memory register state when the
  * current task owns the FPU. The hardware register state is preserved.
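For illustration only (not part of the patch): the check_using_kernel_fpu() calls added to kernel_fpu_begin_mask() and kernel_fpu_end() catch mixing of the atomic and non-atomic kernel-FPU sections on Hygon CPUs, e.g.:

	/* Illustrative sketch of the misuse the new WARN_ON_ONCE() reports. */
	if (!kernel_fpu_begin_nonatomic()) {
		kernel_fpu_begin();		/* warns: atomic section opened inside a non-atomic one */
		kernel_fpu_end();		/* warns for the same reason */
		kernel_fpu_end_nonatomic();
	}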
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 77cf9d87ad4518bb91b2412c1bf606bef158ea40..860bac711a173b5030e7e0f913c0ba972c6db686 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -55,6 +55,7 @@
 #include
 #include
 #include
+#include
 #ifdef CONFIG_IA32_EMULATION
 /* Not included via unistd.h */
 #include
@@ -568,6 +569,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
 		switch_fpu_prepare(prev_fpu, cpu);
 
+	switch_kernel_fpu_prepare(prev_p, cpu);
+
 	/* We must save %fs and %gs before load_TLS() because
 	 * %fs and %gs may be cleared by load_TLS().
 	 *
@@ -622,6 +625,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	switch_fpu_finish();
 
+	switch_kernel_fpu_finish(next_p);
+
 	/* Reload sp0. */
 	update_task_stack(next_p);
 
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index bad4dee4f0e4274d00c60145db3ee01a010847d2..3b93f64ffa0eb2d909793b006fa1fd8603ab4e87 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -69,5 +69,7 @@ else
 	lib-y += clear_page_64.o copy_page_64.o
 	lib-y += memmove_64.o memset_64.o
 	lib-y += copy_user_64.o
+	lib-$(CONFIG_X86_HYGON_LMC_SSE2_ON) += copy_user_sse2.o
+	lib-$(CONFIG_X86_HYGON_LMC_AVX2_ON) += copy_user_avx2.o
 	lib-y += cmpxchg16b_emu.o
 endif
diff --git a/arch/x86/lib/copy_user_avx2.S b/arch/x86/lib/copy_user_avx2.S
new file mode 100644
index 0000000000000000000000000000000000000000..2e2c5caeceb9ffe95dc28da60499126f461c5959
--- /dev/null
+++ b/arch/x86/lib/copy_user_avx2.S
@@ -0,0 +1,323 @@
+/*
+ * Copyright © 2011 Siarhei Siamashka
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define PREFETCH_DISTANCE 64
+
+#define PREFETCH(addr) prefetchnta addr
+
+.macro ALIGN_DESTINATION_32
+	/* Align the destination to 32 bytes, as required for vmovntdq. */
+	/* If there are fewer than 32 bytes, skip alignment (jb 302f). */
+	cmpl $32, %edx
+	jb 302f
+
+	movl %edi, %ecx
+	andl $31, %ecx
+	jz 302f				/* already aligned */
+
+	subl $32, %ecx
+	negl %ecx
+	subl %ecx, %edx
+
+300:
+	movb (%rsi), %al
+301:
+	movb %al, (%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 300b
+302:
+
+.section .fixup,"ax"
+303:
+	addl %ecx, %edx			/* ecx is zerorest also */
+	jmp .Lavx2_copy_user_handle_tail
+	.previous
+
+	_ASM_EXTABLE_CPY(300b, 303b)
+	_ASM_EXTABLE_CPY(301b, 303b)
+.endm
+
+/*
+ * Large block copy using AVX2 non-temporal stores and prefetchnta.
+ */
+SYM_FUNC_START(copy_user_avx2_pf64_nt_string)
+	ASM_STAC
+	ALIGN_DESTINATION_32
+
+	/* len >= 256. */
+	cmpl $256, %edx
+	jb .Lless_than_256_bytes_cpy
+
+	movl %esi, %ecx			/* check if src is aligned */
+	andl $31, %ecx
+	jnz large_block_nt_unaligned_cpy
+
+large_block_nt_aligned_cpy:
+	PREFETCH(PREFETCH_DISTANCE(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 64)(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 128)(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 192)(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 256)(%rsi))
+
+32:
+	vmovdqa 0(%rsi), %ymm0
+33:
+	vmovdqa 32(%rsi), %ymm1
+34:
+	vmovdqa 64(%rsi), %ymm2
+35:
+	vmovdqa 96(%rsi), %ymm3
+36:
+	vmovdqa 128(%rsi), %ymm4
+37:
+	vmovdqa 160(%rsi), %ymm5
+38:
+	vmovdqa 192(%rsi), %ymm6
+39:
+	vmovdqa 224(%rsi), %ymm7
+
+40:
+	vmovntdq %ymm0, 0(%rdi)
+41:
+	vmovntdq %ymm1, 32(%rdi)
+42:
+	vmovntdq %ymm2, 64(%rdi)
+43:
+	vmovntdq %ymm3, 96(%rdi)
+44:
+	vmovntdq %ymm4, 128(%rdi)
+45:
+	vmovntdq %ymm5, 160(%rdi)
+46:
+	vmovntdq %ymm6, 192(%rdi)
+47:
+	vmovntdq %ymm7, 224(%rdi)
+
+	add $256, %rsi
+	add $256, %rdi
+	subl $256, %edx
+	cmpl $256, %edx
+	jg large_block_nt_aligned_cpy
+
+	vzeroupper
+	sfence
+	jmp .Lless_than_256_bytes_cpy
+
+large_block_nt_unaligned_cpy:
+	PREFETCH(PREFETCH_DISTANCE(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 64)(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 128)(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 192)(%rsi))
+	PREFETCH((PREFETCH_DISTANCE + 256)(%rsi))
+
+48:
+	vmovdqu 0(%rsi), %ymm0
+49:
+	vmovdqu 32(%rsi), %ymm1
+50:
+	vmovdqu 64(%rsi), %ymm2
+51:
+	vmovdqu 96(%rsi), %ymm3
+52:
+	vmovdqu 128(%rsi), %ymm4
+53:
+	vmovdqu 160(%rsi), %ymm5
+54:
+	vmovdqu 192(%rsi), %ymm6
+55:
+	vmovdqu 224(%rsi), %ymm7
+
+56:
+	vmovntdq %ymm0, 0(%rdi)
+57:
+	vmovntdq %ymm1, 32(%rdi)
+58:
+	vmovntdq %ymm2, 64(%rdi)
+59:
+	vmovntdq %ymm3, 96(%rdi)
+60:
+	vmovntdq %ymm4, 128(%rdi)
+61:
+	vmovntdq %ymm5, 160(%rdi)
+62:
+	vmovntdq %ymm6, 192(%rdi)
+63:
+	vmovntdq %ymm7, 224(%rdi)
+
+	add $256, %rsi
+	add $256, %rdi
+	subl $256, %edx
+	cmpl $256, %edx
+	jg large_block_nt_unaligned_cpy
+
+	vzeroupper
+	sfence
+	jmp .Lless_than_256_bytes_cpy
+
+	.section .fixup,"ax"
+
+88:
+	vzeroupper
+	jmp .Lavx2_copy_user_handle_tail
+	.previous
+
+	_ASM_EXTABLE_CPY(32b, 88b)
+	_ASM_EXTABLE_CPY(33b, 88b)
+	_ASM_EXTABLE_CPY(34b, 88b)
+	_ASM_EXTABLE_CPY(35b, 88b)
+	_ASM_EXTABLE_CPY(36b, 88b)
+	_ASM_EXTABLE_CPY(37b, 88b)
+	_ASM_EXTABLE_CPY(38b, 88b)
+	_ASM_EXTABLE_CPY(39b, 88b)
+
+	_ASM_EXTABLE_CPY(40b, 88b)
+	_ASM_EXTABLE_CPY(41b, 88b)
+	_ASM_EXTABLE_CPY(42b, 88b)
+	_ASM_EXTABLE_CPY(43b, 88b)
+	_ASM_EXTABLE_CPY(44b, 88b)
+	_ASM_EXTABLE_CPY(45b, 88b)
+	_ASM_EXTABLE_CPY(46b, 88b)
+	_ASM_EXTABLE_CPY(47b, 88b)
+	_ASM_EXTABLE_CPY(48b, 88b)
+	_ASM_EXTABLE_CPY(49b, 88b)
+
+	_ASM_EXTABLE_CPY(50b, 88b)
+	_ASM_EXTABLE_CPY(51b, 88b)
+	_ASM_EXTABLE_CPY(52b, 88b)
+	_ASM_EXTABLE_CPY(53b, 88b)
+	_ASM_EXTABLE_CPY(54b, 88b)
+	_ASM_EXTABLE_CPY(55b, 88b)
+	_ASM_EXTABLE_CPY(56b, 88b)
+	_ASM_EXTABLE_CPY(57b, 88b)
+	_ASM_EXTABLE_CPY(58b, 88b)
+	_ASM_EXTABLE_CPY(59b, 88b)
+
+	_ASM_EXTABLE_CPY(60b, 88b)
+	_ASM_EXTABLE_CPY(61b, 88b)
+	_ASM_EXTABLE_CPY(62b, 88b)
+	_ASM_EXTABLE_CPY(63b, 88b)
+SYM_FUNC_END(copy_user_avx2_pf64_nt_string)
+EXPORT_SYMBOL(copy_user_avx2_pf64_nt_string)
+
+/*
+ * If len < 256 bytes, use rep movsb directly.
+ */
+SYM_CODE_START_LOCAL(.Lless_than_256_bytes_cpy)
+	movl %edx, %ecx
+90:
+	rep movsb
+
+	xorl %eax, %eax
+	ASM_CLAC
+	RET
+
+	.section .fixup,"ax"
+99:
+	mov %ecx, %eax
+
+	ASM_CLAC
+	RET
+	.previous
+
+	_ASM_EXTABLE_CPY(90b, 99b)
+SYM_CODE_END(.Lless_than_256_bytes_cpy)
+
+/*
+ * Try to copy the last bytes and clear the rest if needed.
+ * Since a protection fault in copy_from/to_user is not a normal situation,
+ * it is not necessary to optimize tail handling.
+ * Don't try to copy the tail if a machine check happened.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+
+SYM_CODE_START_LOCAL(.Lavx2_copy_user_handle_tail)
+	movl %edx, %ecx
+	cmp $X86_TRAP_MC, %eax		/* check if X86_TRAP_MC */
+	je 3f
+
+1:	rep movsb
+2:	mov %ecx, %eax
+
+	ASM_CLAC
+	RET
+
+3:	xorl %eax, %eax
+	ASM_CLAC
+	RET
+
+	_ASM_EXTABLE_CPY(1b, 2b)
+SYM_CODE_END(.Lavx2_copy_user_handle_tail)
+
+/*
+ * Called on task switch: fpu_save_ymm0_7 saves the old task's kernel FPU
+ * state and fpu_restore_ymm0_7 restores the new task's kernel FPU state.
+ */
+SYM_FUNC_START(fpu_restore_ymm0_7)
+	vmovdqu 0(%rsi), %ymm0
+	vmovdqu 32(%rsi), %ymm1
+	vmovdqu 64(%rsi), %ymm2
+	vmovdqu 96(%rsi), %ymm3
+	vmovdqu 128(%rsi), %ymm4
+	vmovdqu 160(%rsi), %ymm5
+	vmovdqu 192(%rsi), %ymm6
+	vmovdqu 224(%rsi), %ymm7
+
+	xorl %eax, %eax
+	RET
+SYM_FUNC_END(fpu_restore_ymm0_7)
+EXPORT_SYMBOL(fpu_restore_ymm0_7)
+
+SYM_FUNC_START(fpu_save_ymm0_7)
+	vmovdqu %ymm0, 0(%rdi)
+	vmovdqu %ymm1, 32(%rdi)
+	vmovdqu %ymm2, 64(%rdi)
+	vmovdqu %ymm3, 96(%rdi)
+	vmovdqu %ymm4, 128(%rdi)
+	vmovdqu %ymm5, 160(%rdi)
+	vmovdqu %ymm6, 192(%rdi)
+	vmovdqu %ymm7, 224(%rdi)
+
+	xorl %eax, %eax
+	RET
+SYM_FUNC_END(fpu_save_ymm0_7)
+EXPORT_SYMBOL(fpu_save_ymm0_7)
diff --git a/arch/x86/lib/copy_user_sse2.S b/arch/x86/lib/copy_user_sse2.S
new file mode 100644
index 0000000000000000000000000000000000000000..65f4a5a243036758bc85e35a849aabc95bef58ba
--- /dev/null
+++ b/arch/x86/lib/copy_user_sse2.S
@@ -0,0 +1,245 @@
+/*
+ * Copyright © 2011 Siarhei Siamashka
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PREFETCH_DISTANCE 256 + +.macro ALIGN_DESTINATION_16 + /* check for bad alignment of destination, there is 16Bytes, for we will use movdqa */ + /* if len<16Bytes, jb 202f */ + cmpl $16,%edx + jb 202f + + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $15,%ecx + jz 202f /* already aligned */ + + subl $16,%ecx + negl %ecx + subl %ecx,%edx +200: + movb (%rsi),%al +201: + movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 200b +202: + + .section .fixup,"ax" +203: + addl %ecx,%edx/* ecx is zerorest also */ + jmp .Lsse2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_CPY(200b, 203b) + _ASM_EXTABLE_CPY(201b, 203b) +.endm +/*****************************************************************************/ +SYM_FUNC_START(copy_user_sse2_opt_string) + ASM_STAC + ALIGN_DESTINATION_16 + + cmpl $64,%edx + jb 70f /* less then 64 bytes, avoid the costly 'rep' */ + + movl %esi,%ecx /* check if src is aligned */ + andl $15,%ecx + jnz 20f + +10: + prefetchnta PREFETCH_DISTANCE(%rsi) +11: + prefetchnta (PREFETCH_DISTANCE + 32)(%rsi) +12: + movdqa (%rsi),%xmm0 +13: + movdqa 16(%rsi),%xmm1 +14: + movdqa 32(%rsi),%xmm2 +15: + movdqa 48(%rsi),%xmm3 +16: + movntdq %xmm0,0(%rdi) +17: + movntdq %xmm1,16(%rdi) +18: + movntdq %xmm2,32(%rdi) +19: + movntdq %xmm3,48(%rdi) + add $64,%rsi + add $64,%rdi + subl $64,%edx + cmpl $64,%edx + jg 10b + sfence + jmp 70f + +20: + prefetchnta PREFETCH_DISTANCE(%rsi) +21: + prefetchnta (PREFETCH_DISTANCE + 32)(%rsi) +22: + movdqu (%rsi),%xmm0 +23: + movdqu 16(%rsi),%xmm1 +24: + movdqu 32(%rsi),%xmm2 +25: + movdqu 48(%rsi),%xmm3 +26: + movntdq %xmm0,0(%rdi) +27: + movntdq %xmm1,16(%rdi) +28: + movntdq %xmm2,32(%rdi) +29: + movntdq %xmm3,48(%rdi) + add $64,%rsi + add $64,%rdi + subl $64,%edx + cmpl $64,%edx + jg 20b + sfence + +70: + movl %edx,%ecx +80: + rep + movsb + + xorl %eax,%eax + ASM_CLAC + RET//ret + + .section .fixup,"ax" +99: + movl %ecx,%edx /* ecx is zerorest also */ +100: + sfence + jmp .Lsse2_copy_user_handle_tail + .previous + + _ASM_EXTABLE_CPY(10b, 100b) + _ASM_EXTABLE_CPY(11b, 100b) + _ASM_EXTABLE_CPY(12b, 100b) + _ASM_EXTABLE_CPY(13b, 100b) + _ASM_EXTABLE_CPY(14b, 100b) + _ASM_EXTABLE_CPY(15b, 100b) + _ASM_EXTABLE_CPY(16b, 100b) + _ASM_EXTABLE_CPY(17b, 100b) + _ASM_EXTABLE_CPY(18b, 100b) + _ASM_EXTABLE_CPY(19b, 100b) + + _ASM_EXTABLE_CPY(20b, 100b) + _ASM_EXTABLE_CPY(21b, 100b) + _ASM_EXTABLE_CPY(22b, 100b) + _ASM_EXTABLE_CPY(23b, 100b) + _ASM_EXTABLE_CPY(24b, 100b) + _ASM_EXTABLE_CPY(25b, 100b) + _ASM_EXTABLE_CPY(26b, 100b) + _ASM_EXTABLE_CPY(27b, 100b) + _ASM_EXTABLE_CPY(28b, 100b) + _ASM_EXTABLE_CPY(29b, 100b) + + _ASM_EXTABLE_CPY(80b, 99b) +SYM_FUNC_END(copy_user_sse2_opt_string) +EXPORT_SYMBOL(copy_user_sse2_opt_string) + +SYM_FUNC_START(fpu_restore_xmm0_3) + ASM_STAC + movdqu (%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + + xorl %eax,%eax + ASM_CLAC + RET//ret +SYM_FUNC_END(fpu_restore_xmm0_3) +EXPORT_SYMBOL(fpu_restore_xmm0_3) + +SYM_FUNC_START(fpu_save_xmm0_3) + ASM_STAC + + movdqu %xmm0,(%rdi) + movdqu %xmm1,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + xorl %eax,%eax + ASM_CLAC + RET//ret +SYM_FUNC_END(fpu_save_xmm0_3) +EXPORT_SYMBOL(fpu_save_xmm0_3) + +/* + * Try to copy last bytes and clear the rest if needed. + * Since protection fault in copy_from/to_user is not a normal situation, + * it is not necessary to optimize tail handling. 
+/*
+ * Try to copy the last bytes and clear the rest if needed.
+ * Since a protection fault in copy_from/to_user is not a normal situation,
+ * it is not necessary to optimize tail handling.
+ * Don't try to copy the tail if a machine check happened.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+SYM_CODE_START_LOCAL(.Lsse2_copy_user_handle_tail)
+	movl %edx, %ecx
+	cmp $X86_TRAP_MC, %eax		/* check if X86_TRAP_MC */
+	je 3f
+1:	rep movsb
+2:	mov %ecx, %eax
+	ASM_CLAC
+	RET
+
+	/*
+	 * Return zero to pretend that this copy succeeded. This
+	 * is counter-intuitive, but needed to prevent the code
+	 * in lib/iov_iter.c from retrying and running back into
+	 * the poison cache line again. The machine check handler
+	 * will ensure that a SIGBUS is sent to the task.
+	 */
+3:	xorl %eax, %eax
+	ASM_CLAC
+	RET
+
+	_ASM_EXTABLE_CPY(1b, 2b)
+SYM_CODE_END(.Lsse2_copy_user_handle_tail)
+
+/*****************************************************************************/
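As a usage note (illustration only, not part of the patch): both new copy routines keep the existing copy_user_* convention of returning the number of bytes left uncopied, so callers that reach them through copy_to_user()/copy_from_user() need no changes. A minimal kernel-side sketch, with hypothetical ubuf/kbuf/len:

	/* Illustrative only: the LMC path is transparent to ordinary callers. */
	if (copy_to_user(ubuf, kbuf, len))	/* non-zero means some bytes were not copied */
		return -EFAULT;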