diff --git a/Documentation/virt/kvm/arm/pvsched.rst b/Documentation/virt/kvm/arm/pvsched.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8f7112a8a9cd91a5cd69dc0ff6d47c946f0960b4
--- /dev/null
+++ b/Documentation/virt/kvm/arm/pvsched.rst
@@ -0,0 +1,58 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Paravirtualized sched support for arm64
+=======================================
+
+KVM/arm64 provides hypervisor service calls that expose vCPU preemption
+state to the guest (paravirtualized sched support).
+
+Some SMCCC compatible hypercalls are defined:
+
+* PV_SCHED_FEATURES:    0xC5000090
+* PV_SCHED_IPA_INIT:    0xC5000091
+* PV_SCHED_IPA_RELEASE: 0xC5000092
+
+The existence of the PV_SCHED hypercalls should be probed using the SMCCC 1.1
+ARCH_FEATURES mechanism before calling them.
+
+PV_SCHED_FEATURES
+    =============    ========    ==========
+    Function ID:     (uint32)    0xC5000090
+    PV_call_id:      (uint32)    The function to query for support.
+    Return value:    (int64)     NOT_SUPPORTED (-1) or SUCCESS (0) if the
+                                 relevant PV-sched feature is supported by
+                                 the hypervisor.
+    =============    ========    ==========
+
+PV_SCHED_IPA_INIT
+    =============    ========    ==========
+    Function ID:     (uint32)    0xC5000091
+    Return value:    (int64)     NOT_SUPPORTED (-1) or SUCCESS (0) if the IPA
+                                 of this vCPU's PV data structure is shared
+                                 with the hypervisor.
+    =============    ========    ==========
+
+PV_SCHED_IPA_RELEASE
+    =============    ========    ==========
+    Function ID:     (uint32)    0xC5000092
+    Return value:    (int64)     NOT_SUPPORTED (-1) or SUCCESS (0) if the IPA
+                                 of this vCPU's PV data structure is released.
+    =============    ========    ==========
+
+PV sched state
+--------------
+
+The structure whose IPA is passed via the PV_SCHED_IPA_INIT hypercall is as follows:
+
++-----------+-------------+-------------+-----------------------------------+
+| Field     | Byte Length | Byte Offset | Description                       |
++===========+=============+=============+===================================+
+| preempted |      4      |      0      | Indicates whether the vCPU that   |
|           |             |             | owns this struct was preempted.   |
|           |             |             | Non-zero values mean the vCPU has |
|           |             |             | been preempted. Zero means the    |
|           |             |             | vCPU is not preempted.            |
++-----------+-------------+-------------+-----------------------------------+
+
+The preempted field will be updated to 0 by the hypervisor prior to scheduling
+a vCPU. When the vCPU is scheduled out, the preempted field will be updated
+to 1 by the hypervisor.
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 67bc2ad13453c1fcbe2ed2837249bcc62388bae1..425114ca1cc707b9cfadebd33d0b259911f38192 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1549,6 +1549,17 @@ config PARAVIRT
       under a hypervisor, potentially improving performance significantly
       over full virtualization.
 
+config PARAVIRT_SCHED
+    bool "Paravirtualization layer for sched"
+    depends on PARAVIRT
+    help
+      This enables the vCPU preemption check, which improves lock
+      performance on overcommitted hosts (more runnable vCPUs than
+      physical CPUs in the system): busy-waiting on a lock held by a
+      preempted vCPU hurts performance far more than yielding early.
+
+      If you are unsure how to answer this question, answer Y.
+
 config PARAVIRT_TIME_ACCOUNTING
    bool "Paravirtual steal time accounting"
    select PARAVIRT
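Note: for illustration only (not part of the patch), a minimal standalone
guest-side reader of the ABI documented above could look like the sketch
below. pvsched_vcpu_preempted() is a hypothetical helper name, and a plain
volatile load stands in for the kernel's READ_ONCE().

/* Standalone sketch; assumes the 64-byte layout documented above. */
#include <stdint.h>
#include <stdio.h>

struct pvsched_vcpu_state {
    uint32_t preempted;     /* little-endian; non-zero means preempted */
    uint8_t  padding[60];   /* pad the structure to 64 bytes */
};

/* Hypothetical helper: has the vCPU owning @st been scheduled out? */
static int pvsched_vcpu_preempted(const struct pvsched_vcpu_state *st)
{
    /* Volatile load stands in for the kernel's READ_ONCE(). */
    return *(const volatile uint32_t *)&st->preempted != 0;
}

int main(void)
{
    struct pvsched_vcpu_state st = { .preempted = 1 };

    printf("preempted: %d\n", pvsched_vcpu_preempted(&st));
    return 0;
}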
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 17246843f65751f4e755209ec57f5ce71f7da762..3f87b679c6a0e1ac067dcabb93cc79dd110aa66e 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -469,6 +469,7 @@ CONFIG_SCHED_HRTICK=y
 CONFIG_ARCH_SPARSEMEM_ENABLE=y
 CONFIG_HW_PERF_EVENTS=y
 CONFIG_PARAVIRT=y
+CONFIG_PARAVIRT_SCHED=y
 CONFIG_PARAVIRT_TIME_ACCOUNTING=y
 CONFIG_ARCH_SUPPORTS_KEXEC=y
 CONFIG_ARCH_SUPPORTS_KEXEC_FILE=y
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 993b60d7219d76799dbe9f82251b1aadf58bd848..f4d25426aef34827da69cffb3aeebad948345351 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -596,6 +596,13 @@ struct kvm_vcpu_arch {
        gpa_t base;
    } steal;
 
+#ifdef CONFIG_PARAVIRT_SCHED
+    /* Guest PV sched state */
+    struct {
+        gpa_t base;
+    } pvsched;
+#endif
+
    /* Per-vcpu CCSIDR override or NULL */
    u32 *ccsidr;
 
@@ -1051,6 +1058,33 @@ static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
    return (vcpu_arch->steal.base != INVALID_GPA);
 }
 
+#ifdef CONFIG_PARAVIRT_SCHED
+long kvm_hypercall_pvsched_features(struct kvm_vcpu *vcpu);
+void kvm_update_pvsched_preempted(struct kvm_vcpu *vcpu, u32 preempted);
+
+static inline void kvm_arm_pvsched_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
+{
+    vcpu_arch->pvsched.base = INVALID_GPA;
+}
+
+static inline bool kvm_arm_is_pvsched_enabled(struct kvm_vcpu_arch *vcpu_arch)
+{
+    return (vcpu_arch->pvsched.base != INVALID_GPA);
+}
+#else
+static inline long kvm_hypercall_pvsched_features(struct kvm_vcpu *vcpu)
+{
+    return 0;
+}
+static inline void kvm_update_pvsched_preempted(struct kvm_vcpu *vcpu,
+                        u32 preempted) {}
+static inline void kvm_arm_pvsched_vcpu_init(struct kvm_vcpu_arch *vcpu_arch) {}
+static inline bool kvm_arm_is_pvsched_enabled(struct kvm_vcpu_arch *vcpu_arch)
+{
+    return false;
+}
+#endif
+
 void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
 
 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
diff --git a/arch/arm64/include/asm/paravirt.h b/arch/arm64/include/asm/paravirt.h
index 9aa193e0e8f28d9309bc18013230e714152f4f93..5ccead71bf8786fd9301c6a911c719fe5076b3d5 100644
--- a/arch/arm64/include/asm/paravirt.h
+++ b/arch/arm64/include/asm/paravirt.h
@@ -20,9 +20,22 @@ static inline u64 paravirt_steal_clock(int cpu)
 
 int __init pv_time_init(void);
 
+#ifdef CONFIG_PARAVIRT_SCHED
+int __init pv_sched_init(void);
+
+__visible bool __native_vcpu_is_preempted(int cpu);
+DECLARE_STATIC_CALL(pv_vcpu_preempted, __native_vcpu_is_preempted);
+
+static inline bool pv_vcpu_is_preempted(int cpu)
+{
+    return static_call(pv_vcpu_preempted)(cpu);
+}
+#endif /* CONFIG_PARAVIRT_SCHED */
+
 #else
 
 #define pv_time_init() do {} while (0)
+#define pv_sched_init() do {} while (0)
 
 #endif // CONFIG_PARAVIRT
 
diff --git a/arch/arm64/include/asm/pvsched-abi.h b/arch/arm64/include/asm/pvsched-abi.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc8e27dd266aeddb30cb51b7df0779083f9ebdca
--- /dev/null
+++ b/arch/arm64/include/asm/pvsched-abi.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright(c) 2019 Huawei Technologies Co., Ltd
+ * Author: Zengruan Ye
+ */
+
+#ifndef __ASM_PVSCHED_ABI_H
+#define __ASM_PVSCHED_ABI_H
+
+#ifdef CONFIG_PARAVIRT_SCHED
+struct pvsched_vcpu_state {
+    __le32 preempted;
+    /* Structure must be 64-byte aligned, pad to that size */
+    u8 padding[60];
+} __packed;
+#endif /* CONFIG_PARAVIRT_SCHED */
+
+#endif
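Note: because the hypervisor writes this region directly into guest memory,
sizeof and offsetof are ABI. A standalone compile-time check of the documented
layout is sketched below (illustrative only; uint32_t/uint8_t stand in for
the kernel's __le32/u8, and in-kernel code would use BUILD_BUG_ON instead of
static_assert).

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct pvsched_vcpu_state {
    uint32_t preempted;
    uint8_t  padding[60];
};

/* The hypervisor-visible region is exactly one 64-byte cache line. */
static_assert(sizeof(struct pvsched_vcpu_state) == 64,
          "PV sched state must be exactly 64 bytes");
/* The documentation places the preempted word at byte offset 0. */
static_assert(offsetof(struct pvsched_vcpu_state, preempted) == 0,
          "preempted must sit at byte offset 0");

int main(void) { return 0; }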
diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h
index 0525c0b089edf7b1551bc1e4a47f167c969727e9..378aea19e840896b1e186a5fcc984536e5a02c27 100644
--- a/arch/arm64/include/asm/spinlock.h
+++ b/arch/arm64/include/asm/spinlock.h
@@ -7,6 +7,7 @@
 
 #include <asm/qspinlock.h>
 #include <asm/qrwlock.h>
+#include <asm/paravirt.h>
 
 /* See include/linux/spinlock.h */
 #define smp_mb__after_spinlock()	smp_mb()
@@ -19,9 +20,16 @@
  * https://lore.kernel.org/lkml/20200110100612.GC2827@hirez.programming.kicks-ass.net
  */
 #define vcpu_is_preempted	vcpu_is_preempted
+#if defined(CONFIG_PARAVIRT) && defined(CONFIG_PARAVIRT_SCHED)
+static inline bool vcpu_is_preempted(int cpu)
+{
+    return pv_vcpu_is_preempted(cpu);
+}
+#else
 static inline bool vcpu_is_preempted(int cpu)
 {
    return false;
 }
+#endif /* CONFIG_PARAVIRT && CONFIG_PARAVIRT_SCHED */
 
 #endif /* __ASM_SPINLOCK_H */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index c3b93910fa4d57b3b01853169f248184b980286a..21ef9c21a40076d1d0e69292e65fd160dd487111 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -60,7 +60,7 @@ obj-$(CONFIG_ARMV8_DEPRECATED)	+= armv8_deprecated.o
 obj-$(CONFIG_ACPI)			+= acpi.o
 obj-$(CONFIG_ACPI_NUMA)		+= acpi_numa.o
 obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL)	+= acpi_parking_protocol.o
-obj-$(CONFIG_PARAVIRT)			+= paravirt.o
+obj-$(CONFIG_PARAVIRT)			+= paravirt.o paravirt-spinlocks.o
 obj-$(CONFIG_RANDOMIZE_BASE)		+= kaslr.o pi/
 obj-$(CONFIG_HIBERNATION)		+= hibernate.o hibernate-asm.o
 obj-$(CONFIG_ELF_CORE)		+= elfcore.o
diff --git a/arch/arm64/kernel/paravirt-spinlocks.c b/arch/arm64/kernel/paravirt-spinlocks.c
new file mode 100644
index 0000000000000000000000000000000000000000..f402e7e6c30186a3c93f7d6d707b219b3792b85d
--- /dev/null
+++ b/arch/arm64/kernel/paravirt-spinlocks.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright(c) 2019 Huawei Technologies Co., Ltd
+ * Author: Zengruan Ye
+ */
+
+#ifdef CONFIG_PARAVIRT_SCHED
+#include <linux/static_call.h>
+#include <linux/spinlock.h>
+#include <asm/paravirt.h>
+
+__visible bool __native_vcpu_is_preempted(int cpu)
+{
+    return false;
+}
+
+DEFINE_STATIC_CALL(pv_vcpu_preempted, __native_vcpu_is_preempted);
+#endif /* CONFIG_PARAVIRT_SCHED */
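Note: with the static call wired up, vcpu_is_preempted() gives any
spin-and-wait path a cheap way to tell a running lock holder from a
preempted one. The loop below sketches the intended use; spin_on_owner()
is hypothetical, not patch code (the real in-tree consumers are the
generic owner-spinning and scheduler paths).

/*
 * Spin while the owner can actually make progress; once its vCPU has
 * been preempted by the host, report failure so the caller can block
 * or yield instead of burning its own timeslice.
 */
static bool spin_on_owner(int owner_cpu, atomic_t *lock_released)
{
    while (!atomic_read(lock_released)) {
        if (vcpu_is_preempted(owner_cpu))
            return false;   /* owner is off-CPU: stop spinning */
        cpu_relax();
    }
    return true;            /* lock was released while we spun */
}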
diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c
index aa718d6a9274ab06e78b6dc28f08de97a517a3ec..53bb6fa7636689d24e9657b54653c769796c5071 100644
--- a/arch/arm64/kernel/paravirt.c
+++ b/arch/arm64/kernel/paravirt.c
@@ -22,6 +22,7 @@
 
 #include <asm/paravirt.h>
 #include <asm/pvclock-abi.h>
+#include <asm/pvsched-abi.h>
 #include <asm/smp_plat.h>
 
 struct static_key paravirt_steal_enabled;
@@ -174,3 +175,110 @@ int __init pv_time_init(void)
 
    return 0;
 }
+
+#ifdef CONFIG_PARAVIRT_SCHED
+DEFINE_PER_CPU(struct pvsched_vcpu_state, pvsched_vcpu_region) __aligned(64);
+EXPORT_PER_CPU_SYMBOL(pvsched_vcpu_region);
+
+static bool kvm_vcpu_is_preempted(int cpu)
+{
+    struct pvsched_vcpu_state *reg;
+    u32 preempted;
+
+    reg = &per_cpu(pvsched_vcpu_region, cpu);
+    if (!reg) {
+        pr_warn_once("PV sched enabled but not configured for cpu %d\n",
+                 cpu);
+        return false;
+    }
+
+    preempted = le32_to_cpu(READ_ONCE(reg->preempted));
+
+    return !!preempted;
+}
+
+static int pvsched_vcpu_state_dying_cpu(unsigned int cpu)
+{
+    struct pvsched_vcpu_state *reg;
+    struct arm_smccc_res res;
+
+    reg = this_cpu_ptr(&pvsched_vcpu_region);
+    if (!reg)
+        return -EFAULT;
+
+    arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_SCHED_IPA_RELEASE, &res);
+    memset(reg, 0, sizeof(*reg));
+
+    return 0;
+}
+
+static int init_pvsched_vcpu_state(unsigned int cpu)
+{
+    struct pvsched_vcpu_state *reg;
+    struct arm_smccc_res res;
+
+    reg = this_cpu_ptr(&pvsched_vcpu_region);
+    if (!reg)
+        return -EFAULT;
+
+    /* Pass the memory address to host via hypercall */
+    arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_SCHED_IPA_INIT,
+                 virt_to_phys(reg), &res);
+
+    return 0;
+}
+
+static int kvm_arm_init_pvsched(void)
+{
+    int ret;
+
+    ret = cpuhp_setup_state(CPUHP_AP_ARM_KVM_PVSCHED_STARTING,
+                "hypervisor/arm/pvsched:starting",
+                init_pvsched_vcpu_state,
+                pvsched_vcpu_state_dying_cpu);
+
+    if (ret < 0) {
+        pr_warn("PV sched init failed\n");
+        return ret;
+    }
+
+    return 0;
+}
+
+static bool has_kvm_pvsched(void)
+{
+    struct arm_smccc_res res;
+
+    /* To detect the presence of PV sched support we require SMCCC 1.1+ */
+    if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE)
+        return false;
+
+    arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+                 ARM_SMCCC_HV_PV_SCHED_FEATURES, &res);
+
+    return (res.a0 == SMCCC_RET_SUCCESS);
+}
+
+int __init pv_sched_init(void)
+{
+    int ret;
+
+    if (is_hyp_mode_available())
+        return 0;
+
+    if (!has_kvm_pvsched()) {
+        pr_warn("PV sched is not available\n");
+        return 0;
+    }
+
+    ret = kvm_arm_init_pvsched();
+    if (ret)
+        return ret;
+
+    static_call_update(pv_vcpu_preempted, kvm_vcpu_is_preempted);
+    pr_info("using PV sched preempted\n");
+
+    return 0;
+}
+early_initcall(pv_sched_init);
+#endif /* CONFIG_PARAVIRT_SCHED */
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 826a05d072d7a6ce1369ef60ca27989666fdc799..fe63a91a4c547960c5c5005a17cfdb4ca4c99c28 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -10,7 +10,7 @@ include $(srctree)/virt/kvm/Makefile.kvm
 obj-$(CONFIG_KVM) += kvm.o
 obj-$(CONFIG_KVM) += hyp/
 
-kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
+kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o pvsched.o \
	 inject_fault.o va_layout.o handle_exit.o \
	 guest.o debug.o reset.o sys_regs.o stacktrace.o \
	 vgic-sys-reg-v3.o fpsimd.o pkvm.o \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index e7e35b95821f873c1432e9fae919cfb899121467..a6435fa2461177ca1a9d66f5f7ebdf3f8fed5a6b 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -408,6 +408,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
    kvm_arm_pvtime_vcpu_init(&vcpu->arch);
 
+   kvm_arm_pvsched_vcpu_init(&vcpu->arch);
+
    vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
 
    err = kvm_vgic_vcpu_init(vcpu);
@@ -495,6 +497,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        vcpu_set_on_unsupported_cpu(vcpu);
 
    kvm_tlbi_dvmbm_vcpu_load(vcpu);
+
+   if (kvm_arm_is_pvsched_enabled(&vcpu->arch))
+       kvm_update_pvsched_preempted(vcpu, 0);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -512,6 +517,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
    vcpu->cpu = -1;
 
    kvm_tlbi_dvmbm_vcpu_put(vcpu);
+
+   if (kvm_arm_is_pvsched_enabled(&vcpu->arch))
+       kvm_update_pvsched_preempted(vcpu, 1);
 }
 
 static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
@@ -1369,6 +1377,8 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 
    spin_unlock(&vcpu->arch.mp_state_lock);
 
+   kvm_arm_pvsched_vcpu_init(&vcpu->arch);
+
    return 0;
 }
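Note: the guest-side handshake implemented by has_kvm_pvsched() and
init_pvsched_vcpu_state() above condenses to the sequence sketched below.
This is illustrative only; unlike the patch, it also checks the return
value of the INIT call.

static bool pvsched_probe_and_register(void)
{
    struct arm_smccc_res res;

    /* 1. PV sched is probed via SMCCC 1.1+, so a conduit must exist. */
    if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE)
        return false;

    /* 2. Ask the hypervisor whether it implements PV_SCHED at all. */
    arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
                 ARM_SMCCC_HV_PV_SCHED_FEATURES, &res);
    if (res.a0 != SMCCC_RET_SUCCESS)
        return false;

    /* 3. Share this CPU's 64-byte state region with the host. */
    arm_smccc_1_1_invoke(ARM_SMCCC_HV_PV_SCHED_IPA_INIT,
                 virt_to_phys(this_cpu_ptr(&pvsched_vcpu_region)),
                 &res);
    return res.a0 == SMCCC_RET_SUCCESS;
}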
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 7fb4df0456dea53f9cdeccdd28c2ff47a8ce6ff3..4c9fc5df5142b1a8b531a57fe059609701dea393 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -332,6 +332,11 @@ int kvm_smccc_call_handler(struct kvm_vcpu *vcpu)
                   &smccc_feat->std_hyp_bmap))
                val[0] = SMCCC_RET_SUCCESS;
            break;
+#ifdef CONFIG_PARAVIRT_SCHED
+       case ARM_SMCCC_HV_PV_SCHED_FEATURES:
+           val[0] = SMCCC_RET_SUCCESS;
+           break;
+#endif /* CONFIG_PARAVIRT_SCHED */
        }
        break;
    case ARM_SMCCC_HV_PV_TIME_FEATURES:
@@ -342,6 +347,22 @@
        if (gpa != INVALID_GPA)
            val[0] = gpa;
        break;
+#ifdef CONFIG_PARAVIRT_SCHED
+   case ARM_SMCCC_HV_PV_SCHED_FEATURES:
+       val[0] = kvm_hypercall_pvsched_features(vcpu);
+       break;
+   case ARM_SMCCC_HV_PV_SCHED_IPA_INIT:
+       gpa = smccc_get_arg1(vcpu);
+       if (gpa != INVALID_GPA) {
+           vcpu->arch.pvsched.base = gpa;
+           val[0] = SMCCC_RET_SUCCESS;
+       }
+       break;
+   case ARM_SMCCC_HV_PV_SCHED_IPA_RELEASE:
+       vcpu->arch.pvsched.base = INVALID_GPA;
+       val[0] = SMCCC_RET_SUCCESS;
+       break;
+#endif /* CONFIG_PARAVIRT_SCHED */
    case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
        val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0;
        val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1;
diff --git a/arch/arm64/kvm/pvsched.c b/arch/arm64/kvm/pvsched.c
new file mode 100644
index 0000000000000000000000000000000000000000..ee7fed4f539ed3c2da289b18867ad120fe6bb5c2
--- /dev/null
+++ b/arch/arm64/kvm/pvsched.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright(c) 2019 Huawei Technologies Co., Ltd
+ * Author: Zengruan Ye
+ */
+
+#ifdef CONFIG_PARAVIRT_SCHED
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+
+#include <asm/pvsched-abi.h>
+
+#include <kvm/arm_hypercalls.h>
+
+void kvm_update_pvsched_preempted(struct kvm_vcpu *vcpu, u32 preempted)
+{
+    struct kvm *kvm = vcpu->kvm;
+    u64 base = vcpu->arch.pvsched.base;
+    u64 offset = offsetof(struct pvsched_vcpu_state, preempted);
+    int idx;
+
+    if (base == INVALID_GPA)
+        return;
+
+    /*
+     * This function is called from atomic context, so we need to
+     * disable page faults.
+     */
+    pagefault_disable();
+
+    idx = srcu_read_lock(&kvm->srcu);
+    kvm_put_guest(kvm, base + offset, cpu_to_le32(preempted));
+    srcu_read_unlock(&kvm->srcu, idx);
+
+    pagefault_enable();
+}
+
+long kvm_hypercall_pvsched_features(struct kvm_vcpu *vcpu)
+{
+    u32 feature = smccc_get_arg1(vcpu);
+    long val = SMCCC_RET_NOT_SUPPORTED;
+
+    switch (feature) {
+    case ARM_SMCCC_HV_PV_SCHED_FEATURES:
+    case ARM_SMCCC_HV_PV_SCHED_IPA_INIT:
+    case ARM_SMCCC_HV_PV_SCHED_IPA_RELEASE:
+        val = SMCCC_RET_SUCCESS;
+        break;
+    }
+
+    return val;
+}
+#endif /* CONFIG_PARAVIRT_SCHED */
+
diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 083f8565371616269fa0050036ec8a3631c44357..7acb9898fbe1fdbe4293c0ea2067f68be5e95137 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -577,5 +577,26 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
        method;							\
    })
 
+#ifdef CONFIG_PARAVIRT_SCHED
+/* Paravirtualised sched calls */
+#define ARM_SMCCC_HV_PV_SCHED_FEATURES				\
+   ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
+              ARM_SMCCC_SMC_64,				\
+              ARM_SMCCC_OWNER_STANDARD_HYP,		\
+              0x90)
+
+#define ARM_SMCCC_HV_PV_SCHED_IPA_INIT				\
+   ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
+              ARM_SMCCC_SMC_64,				\
+              ARM_SMCCC_OWNER_STANDARD_HYP,		\
+              0x91)
+
+#define ARM_SMCCC_HV_PV_SCHED_IPA_RELEASE			\
+   ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
+              ARM_SMCCC_SMC_64,				\
+              ARM_SMCCC_OWNER_STANDARD_HYP,		\
+              0x92)
+#endif /* CONFIG_PARAVIRT_SCHED */
+
 #endif /*__ASSEMBLY__*/
 #endif /*__LINUX_ARM_SMCCC_H*/
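Note: these definitions reproduce the IDs listed in the documentation.
ARM_SMCCC_CALL_VAL sets the fast-call bit (31) and the SMC64
calling-convention bit (30), places the standard hypervisor service owner
(5) in bits 29..24, and ORs in the function number. A standalone check of
the arithmetic (illustrative; the SMCCC_* macros below are local stand-ins
for the kernel's ARM_SMCCC_* constants):

#include <stdint.h>
#include <stdio.h>

#define SMCCC_FAST_CALL     (1u << 31)
#define SMCCC_SMC_64        (1u << 30)
#define SMCCC_OWNER_STD_HYP (5u << 24)  /* ARM_SMCCC_OWNER_STANDARD_HYP */

int main(void)
{
    /* 0x80000000 | 0x40000000 | 0x05000000 | 0x90 = 0xC5000090 */
    uint32_t id = SMCCC_FAST_CALL | SMCCC_SMC_64 |
              SMCCC_OWNER_STD_HYP | 0x90;

    printf("PV_SCHED_FEATURES = 0x%08X\n", id);
    return 0;
}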
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 624d4a38c358a08f2ca417523058bc1c6a319a8d..f94a1b8e34e0278f09e176935fe9522c13c04795 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -190,6 +190,7 @@ enum cpuhp_state {
    CPUHP_AP_DUMMY_TIMER_STARTING,
    CPUHP_AP_ARM_XEN_STARTING,
    CPUHP_AP_ARM_XEN_RUNSTATE_STARTING,
+   CPUHP_AP_ARM_KVM_PVSCHED_STARTING,
    CPUHP_AP_ARM_CORESIGHT_STARTING,
    CPUHP_AP_ARM_CORESIGHT_CTI_STARTING,
    CPUHP_AP_ARM64_ISNDEP_STARTING,